@@ -23,6 +23,7 @@ import (
2323 "github.com/sap/cap-operator/pkg/client/clientset/versioned"
2424 v1alpha1scheme "github.com/sap/cap-operator/pkg/client/clientset/versioned/scheme"
2525 crdInformers "github.com/sap/cap-operator/pkg/client/informers/externalversions"
26+ "golang.org/x/time/rate"
2627 istio "istio.io/client-go/pkg/clientset/versioned"
2728 istioscheme "istio.io/client-go/pkg/clientset/versioned/scheme"
2829 istioInformers "istio.io/client-go/pkg/informers/externalversions"
@@ -56,17 +57,35 @@ type Controller struct {
5657 eventRecorder events.EventRecorder
5758}
5859
60+ var (
61+ // Application and Domain resources are less frequently updated, so assume a default concurrency of 1.
62+ DefaultReconcile = 1
63+ DefaultConcurrentReconciles = map [int ]int {
64+ ResourceCAPApplicationVersion : 3 , // Moderate concurrency to handle multiple versions efficiently
65+ ResourceCAPTenant : 10 , // High concurrency to handle multiple tenants efficiently
66+ ResourceCAPTenantOperation : 10 , // High concurrency to handle multiple tenant operations efficiently
67+ }
68+ ResourceEnvSuffixMap = map [int ]string {
69+ ResourceCAPApplication : "CAP_APPLICATION" ,
70+ ResourceCAPApplicationVersion : "CAP_APPLICATION_VERSION" ,
71+ ResourceCAPTenant : "CAP_TENANT" ,
72+ ResourceCAPTenantOperation : "CAP_TENANT_OPERATION" ,
73+ ResourceDomain : "DOMAIN" ,
74+ ResourceClusterDomain : "CLUSTER_DOMAIN" ,
75+ }
76+ )
77+
5978func NewController (client kubernetes.Interface , crdClient versioned.Interface , istioClient istio.Interface , gardenerCertificateClient gardenerCert.Interface , certManagerCertificateClient certManager.Interface , gardenerDNSClient gardenerDNS.Interface , promClient promop.Interface ) * Controller {
6079 // Register metrics provider on the workqueue
6180 initializeMetrics ()
6281
6382 queues := map [int ]workqueue.TypedRateLimitingInterface [QueueItem ]{
6483 ResourceCAPApplication : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPApplication ]}),
6584 ResourceCAPApplicationVersion : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPApplicationVersion ]}),
66- ResourceCAPTenant : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPTenant ]}),
67- ResourceCAPTenantOperation : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPTenantOperation ]}),
68- ResourceClusterDomain : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceClusterDomain ]}),
85+ ResourceCAPTenant : workqueue .NewTypedRateLimitingQueueWithConfig (customRateLimiter (), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPTenant ]}),
86+ ResourceCAPTenantOperation : workqueue .NewTypedRateLimitingQueueWithConfig (customRateLimiter (), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceCAPTenantOperation ]}),
6987 ResourceDomain : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceDomain ]}),
88+ ResourceClusterDomain : workqueue .NewTypedRateLimitingQueueWithConfig (workqueue .DefaultTypedControllerRateLimiter [QueueItem ](), workqueue.TypedRateLimitingQueueConfig [QueueItem ]{Name : KindMap [ResourceClusterDomain ]}),
7089 }
7190
7291 // Use 30mins as the default Resync interval for kube / proprietary resources
@@ -89,8 +108,8 @@ func NewController(client kubernetes.Interface, crdClient versioned.Interface, i
89108 // no activity needed on our side so far
90109 }
91110
92- // Use 60 as the default Resync interval for our custom resources (CAP CROs)
93- crdInformerFactory := crdInformers .NewSharedInformerFactory (crdClient , 60 * time .Second )
111+ // Use 5 mins as the default Resync interval for our custom resources (CAP CROs)
112+ crdInformerFactory := crdInformers .NewSharedInformerFactory (crdClient , 5 * time .Minute )
94113
95114 // initialize event recorder
96115 scheme := runtime .NewScheme ()
@@ -122,6 +141,21 @@ func NewController(client kubernetes.Interface, crdClient versioned.Interface, i
122141 return c
123142}
124143
144+ // Custom Rate limiter for Tenant and TenantOperation queues to allow faster retries and higher throughput.
145+ func customRateLimiter () workqueue.TypedRateLimiter [QueueItem ] {
146+ return workqueue .NewTypedMaxOfRateLimiter (
147+ // Faster exponential backoff for transient errors
148+ workqueue .NewTypedItemExponentialFailureRateLimiter [QueueItem ](
149+ 10 * time .Millisecond , // base delay (was 5ms)
150+ 300 * time .Second , // max delay (was ~1000s)
151+ ),
152+ // Higher QPS for bulk processing
153+ & workqueue.TypedBucketRateLimiter [QueueItem ]{
154+ Limiter : rate .NewLimiter (rate .Limit (50 ), 200 ), // 50 QPS, 200 burst (was 10/100)
155+ },
156+ )
157+ }
158+
125159func throwInformerStartError (resources map [reflect.Type ]bool ) {
126160 for resource , ok := range resources {
127161 if ! ok {
@@ -179,15 +213,19 @@ func (c *Controller) Start(ctx context.Context) {
179213
180214 var wg sync.WaitGroup
181215 for k := range c .queues {
182- wg .Add (1 )
183- go func (key int ) {
184- defer wg .Done ()
185- err := c .processQueue (qCxt , key )
186- if err != nil {
187- klog .ErrorS (err , "worker queue ended with error" , "key" , key )
188- }
189- qCancel () // cancel context to inform other workers
190- }(k )
216+ concurrency := getConcurrencyForResource (k )
217+ klog .InfoS ("starting worker queue" , "resource" , getResourceKindFromKey (k ), "concurrency" , concurrency )
218+ for i := range concurrency {
219+ wg .Add (1 )
220+ go func (key , workerId int ) {
221+ defer wg .Done ()
222+ err := c .processQueue (qCxt , key , workerId )
223+ if err != nil {
224+ klog .ErrorS (err , "worker queue ended with error" , "key" , key )
225+ }
226+ qCancel () // cancel context to inform other workers
227+ }(k , i )
228+ }
191229 }
192230
193231 // start version cleanup routines
@@ -199,33 +237,41 @@ func (c *Controller) Start(ctx context.Context) {
199237 wg .Wait ()
200238}
201239
202- func (c * Controller ) processQueue (ctx context.Context , key int ) error {
203- klog .InfoS ("starting to process queue" , "resource" , getResourceKindFromKey (key ))
240+ func getConcurrencyForResource (key int ) int {
241+ concurrency , ok := DefaultConcurrentReconciles [key ]
242+ if ! ok {
243+ concurrency = DefaultReconcile // default concurrency
244+ }
245+ return concurrency
246+ }
247+
248+ func (c * Controller ) processQueue (ctx context.Context , key , workerId int ) error {
249+ klog .InfoS ("starting to process queue" , "resource" , getResourceKindFromKey (key ), "workerId" , workerId )
204250 for {
205251 select {
206252 case <- ctx .Done ():
207- klog .InfoS ("context done; ending processing of queue" , "resource" , getResourceKindFromKey (key ))
253+ klog .InfoS ("context done; ending processing of queue" , "resource" , getResourceKindFromKey (key ), "workerId" , workerId )
208254 return nil
209255 default : // fall through - to avoid blocking
210- err := c .processQueueItem (ctx , key )
256+ err := c .processQueueItem (ctx , key , workerId )
211257 if err != nil {
212258 return err
213259 }
214260 }
215261 }
216262}
217263
218- func (c * Controller ) processQueueItem (ctx context.Context , key int ) error {
264+ func (c * Controller ) processQueueItem (ctx context.Context , key , workerId int ) error {
219265 q , ok := c .queues [key ]
220266 if ! ok {
221267 return fmt .Errorf ("unknown queue; ending worker %d" , key )
222268 }
223269
224- klog .V (2 ).InfoS ("Processing queue item in work queue" , "resource" , getResourceKindFromKey (key ), "queue length" , q .Len ())
270+ klog .V (2 ).InfoS ("Processing queue item in work queue" , "resource" , getResourceKindFromKey (key ), "queue length" , q .Len (), "workerId" , workerId )
225271
226272 item , shutdown := q .Get ()
227273 if shutdown {
228- return fmt .Errorf ("queue (%d) shutdown" , key ) // stop processing when the queue has been shutdown
274+ return fmt .Errorf ("queue (%d, %d ) shutdown" , key , workerId ) // stop processing when the queue has been shutdown
229275 }
230276
231277 // [IMPORTANT] always mark the item as done (after processing it)
@@ -242,7 +288,7 @@ func (c *Controller) processQueueItem(ctx context.Context, key int) error {
242288 // Attempt to recover panics during reconciliation.
243289 defer c .recoverFromPanic (ctx , item , q )
244290
245- klog .InfoS ("Processing Resource" , "namespace" , item .ResourceKey .Namespace , "name" , item .ResourceKey .Name , "kind" , getResourceKindFromKey (key ), "attempt" , attempts )
291+ klog .InfoS ("Processing Resource" , "namespace" , item .ResourceKey .Namespace , "name" , item .ResourceKey .Name , "kind" , getResourceKindFromKey (key ), "attempt" , attempts , "workerId" , workerId )
246292
247293 switch item .Key {
248294 case ResourceCAPApplication :
@@ -263,7 +309,7 @@ func (c *Controller) processQueueItem(ctx context.Context, key int) error {
263309 }
264310 // Handle reconcile errors
265311 if err != nil {
266- klog .ErrorS (err , "queue processing error" , "resource" , getResourceKindFromKey (key ))
312+ klog .ErrorS (err , "queue processing error" , "resource" , getResourceKindFromKey (key ), "workerId" , workerId )
267313 ReconcileErrors .WithLabelValues (getResourceKindFromKey (item .Key ), item .ResourceKey .Namespace , item .ResourceKey .Name ).Inc ()
268314 if ! skipItem {
269315 // add back to queue for re-processing
0 commit comments