diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/DaVinciBackend.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/DaVinciBackend.java index df976ef9a39..41944102b36 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/DaVinciBackend.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/DaVinciBackend.java @@ -65,12 +65,12 @@ import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.service.ICProvider; import com.linkedin.venice.stats.TehutiUtils; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.LogContext; import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; -import java.io.Closeable; import java.util.Collections; import java.util.HashSet; import java.util.Map; @@ -100,7 +100,7 @@ * the shared behavior of this class. Regular clients participate in version swaps while version-specific * clients subscribe to a fixed version and ignore version swap events. */ -public class DaVinciBackend implements Closeable { +public class DaVinciBackend extends AbstractStatsCloseable { private static final Logger LOGGER = LogManager.getLogger(DaVinciBackend.class); // Client type tracking for version-specific vs regular clients @@ -189,16 +189,18 @@ public DaVinciBackend( configLoader.getVeniceClusterConfig().getClusterName()); // OTel per-store version gauge - storeVersionOtelStats = StoreVersionOtelStats - .create(metricsRepository, configLoader.getVeniceClusterConfig().getClusterName(), storeRepository); - - rocksDBMemoryStats = backendConfig.isDatabaseMemoryStatsEnabled() - ? 
new RocksDBMemoryStats( - metricsRepository, - "RocksDBMemoryStats", - backendConfig.getRocksDBServerConfig().isRocksDBPlainTableFormatEnabled(), - configLoader.getVeniceClusterConfig().getClusterName()) - : null; + storeVersionOtelStats = statsCloseables.register( + StoreVersionOtelStats + .create(metricsRepository, configLoader.getVeniceClusterConfig().getClusterName(), storeRepository)); + + rocksDBMemoryStats = statsCloseables.register( + backendConfig.isDatabaseMemoryStatsEnabled() + ? new RocksDBMemoryStats( + metricsRepository, + "RocksDBMemoryStats", + backendConfig.getRocksDBServerConfig().isRocksDBPlainTableFormatEnabled(), + configLoader.getVeniceClusterConfig().getClusterName()) + : null); /** * The constructor of {@link #storageService} will take care of unused store/store version cleanup. @@ -298,8 +300,11 @@ public DaVinciBackend( ingestionService.start(); if (BlobTransferUtils.isBlobTransferManagerEnabled(backendConfig)) { - aggVersionedBlobTransferStats = - new AggVersionedBlobTransferStats(metricsRepository, storeRepository, configLoader.getVeniceServerConfig()); + aggVersionedBlobTransferStats = statsCloseables.register( + new AggVersionedBlobTransferStats( + metricsRepository, + storeRepository, + configLoader.getVeniceServerConfig())); aggBlobTransferStats = new AggBlobTransferStats(aggVersionedBlobTransferStats, ingestionService.getHostLevelIngestionStats()); P2PBlobTransferConfig p2PBlobTransferConfig = new P2PBlobTransferConfig( @@ -434,9 +439,7 @@ public synchronized void close() { cacheBackend.ifPresent( objectCacheBackend -> storeRepository .unregisterStoreDataChangedListener(objectCacheBackend.getCacheInvalidatingStoreChangeListener())); - if (storeVersionOtelStats != null) { - storeVersionOtelStats.close(); - } + super.close(); ExecutorService storeBackendCloseExecutor = Executors.newCachedThreadPool( new DaemonThreadFactory( "DaVinciBackend-StoreBackend-Close", diff --git 
a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerDaVinciRecordTransformerImpl.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerDaVinciRecordTransformerImpl.java index edb42e93339..7ca54aa1da2 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerDaVinciRecordTransformerImpl.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerDaVinciRecordTransformerImpl.java @@ -27,6 +27,7 @@ import com.linkedin.venice.pubsub.api.PubSubPosition; import com.linkedin.venice.pubsub.api.PubSubSymbolicPosition; import com.linkedin.venice.pubsub.api.PubSubTopicPartition; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.ExceptionUtils; import com.linkedin.venice.utils.LogContext; @@ -64,7 +65,7 @@ import org.apache.logging.log4j.Logger; -public class VeniceChangelogConsumerDaVinciRecordTransformerImpl +public class VeniceChangelogConsumerDaVinciRecordTransformerImpl extends AbstractStatsCloseable implements StatefulVeniceChangelogConsumer, VeniceChangelogConsumer { private static final Logger LOGGER = LogManager.getLogger(VeniceChangelogConsumerDaVinciRecordTransformerImpl.class); private long START_TIMEOUT_IN_SECONDS = 60; @@ -184,10 +185,11 @@ public VeniceChangelogConsumerDaVinciRecordTransformerImpl( } if (changelogClientConfig.getInnerClientConfig().getMetricsRepository() != null) { - this.changeCaptureStats = new BasicConsumerStats( - changelogClientConfig.getInnerClientConfig().getMetricsRepository(), - "vcc-" + changelogClientConfig.getConsumerName(), - storeName); + this.changeCaptureStats = statsCloseables.register( + new BasicConsumerStats( + changelogClientConfig.getInnerClientConfig().getMetricsRepository(), + "vcc-" + changelogClientConfig.getConsumerName(), + storeName)); } else { 
changeCaptureStats = null; } @@ -439,12 +441,15 @@ public void resume() { this.resume(Collections.emptySet()); } + @Override public void close() { try { this.stop(); } catch (Exception e) { LOGGER.error("Close failed for VeniceChangelogConsumer", e); throw new RuntimeException(e); + } finally { + super.close(); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerImpl.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerImpl.java index 6a768e06377..c3634863211 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerImpl.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/VeniceChangelogConsumerImpl.java @@ -57,6 +57,7 @@ import com.linkedin.venice.serialization.avro.AvroSpecificStoreDeserializerCache; import com.linkedin.venice.serializer.FastSerializerDeserializerFactory; import com.linkedin.venice.serializer.RecordDeserializer; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.store.rocksdb.RocksDBUtils; import com.linkedin.venice.utils.ByteUtils; import com.linkedin.venice.utils.DaemonThreadFactory; @@ -102,7 +103,7 @@ import org.apache.logging.log4j.Logger; -public class VeniceChangelogConsumerImpl implements VeniceChangelogConsumer { +public class VeniceChangelogConsumerImpl extends AbstractStatsCloseable implements VeniceChangelogConsumer { private static final Logger LOGGER = LogManager.getLogger(VeniceChangelogConsumerImpl.class); private static final int MAX_SUBSCRIBE_RETRIES = 5; private static final String ROCKSDB_BUFFER_FOLDER = "rocksdb-chunk-buffer"; @@ -250,10 +251,11 @@ public VeniceChangelogConsumerImpl( } if (changelogClientConfig.getInnerClientConfig().getMetricsRepository() != null) { - this.changeCaptureStats = new BasicConsumerStats( - changelogClientConfig.getInnerClientConfig().getMetricsRepository(), - "vcc-" + 
changelogClientConfig.getConsumerName(), - storeName); + this.changeCaptureStats = statsCloseables.register( + new BasicConsumerStats( + changelogClientConfig.getInnerClientConfig().getMetricsRepository(), + "vcc-" + changelogClientConfig.getConsumerName(), + storeName)); } else { changeCaptureStats = null; } @@ -1169,18 +1171,22 @@ public void close() { LOGGER.info("Closing Changelog Consumer with name: {}", changelogClientConfig.getConsumerName()); subscriptionLock.writeLock().lock(); try { - this.unsubscribeAll(); - pubSubConsumer.close(); - heartbeatReporterThread.interrupt(); - seekExecutorService.shutdown(); - compressorFactory.close(); - - if (rocksDBStorageEngineFactory != null) { - rocksDBStorageEngineFactory.close(); - } + try { + this.unsubscribeAll(); + pubSubConsumer.close(); + heartbeatReporterThread.interrupt(); + seekExecutorService.shutdown(); + compressorFactory.close(); + + if (rocksDBStorageEngineFactory != null) { + rocksDBStorageEngineFactory.close(); + } - veniceChangelogConsumerClientFactory.deregisterClient(changelogClientConfig.getConsumerName()); - LOGGER.info("Closed Changelog Consumer with name: {}", changelogClientConfig.getConsumerName()); + veniceChangelogConsumerClientFactory.deregisterClient(changelogClientConfig.getConsumerName()); + LOGGER.info("Closed Changelog Consumer with name: {}", changelogClientConfig.getConsumerName()); + } finally { + super.close(); + } } finally { subscriptionLock.writeLock().unlock(); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/stats/BasicConsumerStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/stats/BasicConsumerStats.java index 959a4ec0628..7d87367bc6a 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/stats/BasicConsumerStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/consumer/stats/BasicConsumerStats.java @@ -74,7 +74,8 @@ public BasicConsumerStats(MetricsRepository 
metricsRepository, String consumerNa BasicConsumerTehutiMetricName.MAX_PARTITION_LAG, Collections.singletonList(new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); minimumConsumingVersionMetric = MetricEntityStateBase.create( BasicConsumerMetricEntity.CURRENT_CONSUMING_VERSION.getMetricEntity(), @@ -83,7 +84,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.MINIMUM_CONSUMING_VERSION, Collections.singletonList(new Gauge()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); maximumConsumingVersionMetric = MetricEntityStateBase.create( BasicConsumerMetricEntity.CURRENT_CONSUMING_VERSION.getMetricEntity(), @@ -92,7 +94,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.MAXIMUM_CONSUMING_VERSION, Collections.singletonList(new Gauge()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); recordsConsumedCountMetric = MetricEntityStateBase.create( BasicConsumerMetricEntity.RECORDS_CONSUMED_COUNT.getMetricEntity(), @@ -101,7 +104,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.RECORDS_CONSUMED, Arrays.asList(new Avg(), new Max(), new Rate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); pollSuccessCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.POLL_COUNT.getMetricEntity(), @@ -110,7 +114,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.POLL_SUCCESS_COUNT, Collections.singletonList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); pollFailCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.POLL_COUNT.getMetricEntity(), @@ -119,7 +124,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, 
String consumerNa BasicConsumerTehutiMetricName.POLL_FAIL_COUNT, Collections.singletonList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); versionSwapSuccessCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.VERSION_SWAP_COUNT.getMetricEntity(), @@ -128,7 +134,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.VERSION_SWAP_SUCCESS_COUNT, Collections.singletonList(new Total()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); versionSwapFailCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.VERSION_SWAP_COUNT.getMetricEntity(), @@ -137,7 +144,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.VERSION_SWAP_FAIL_COUNT, Collections.singletonList(new Total()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); chunkedRecordSuccessCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.CHUNKED_RECORD_COUNT.getMetricEntity(), @@ -146,7 +154,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.CHUNKED_RECORD_SUCCESS_COUNT, Collections.singletonList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); chunkedRecordFailCountMetric = MetricEntityStateOneEnum.create( BasicConsumerMetricEntity.CHUNKED_RECORD_COUNT.getMetricEntity(), @@ -155,7 +164,8 @@ public BasicConsumerStats(MetricsRepository metricsRepository, String consumerNa BasicConsumerTehutiMetricName.CHUNKED_RECORD_FAIL_COUNT, Collections.singletonList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); /* * Record default value for 
version swap metrics so the UP_DOWN_COUNTER in OTEL will emit a default 0. diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/helix/HelixParticipationService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/helix/HelixParticipationService.java index 882002db8e3..70dbf295128 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/helix/HelixParticipationService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/helix/HelixParticipationService.java @@ -34,6 +34,7 @@ import com.linkedin.venice.serialization.avro.AvroProtocolDefinition; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.stats.HelixMessageChannelStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.status.StatusMessageHandler; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.HelixUtils; @@ -90,6 +91,8 @@ public class HelixParticipationService extends AbstractVeniceService private VeniceOfflinePushMonitorAccessor veniceOfflinePushMonitorAccessor; private BlobTransferManager blobTransferManager; private final HeartbeatMonitoringService heartbeatMonitoringService; + /** Stats fields owned by this class; drained by {@link #stopInner()}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); // This is ONLY for testing purpose. 
public ThreadPoolExecutor getLeaderFollowerHelixStateTransitionThreadPool() { @@ -193,20 +196,22 @@ public boolean startInner() { config.getMaxLeaderFollowerStateTransitionThreadNumber(), "Venice-L/F-state-transition"); // register stats that tracks the thread pool - ParticipantStateTransitionStats stateTransitionStats = new ParticipantStateTransitionStats( - metricsRepository, - leaderFollowerHelixStateTransitionThreadPool, - "Venice_L/F_ST_thread_pool"); + ParticipantStateTransitionStats stateTransitionStats = statsCloseables.register( + new ParticipantStateTransitionStats( + metricsRepository, + leaderFollowerHelixStateTransitionThreadPool, + "Venice_L/F_ST_thread_pool")); if (config.getLeaderFollowerThreadPoolStrategy() .equals(LeaderFollowerPartitionStateModelFactory.LeaderFollowerThreadPoolStrategy.DUAL_POOL_STRATEGY)) { ThreadPoolExecutor futureVersionThreadPool = initHelixStateTransitionThreadPool( config.getMaxFutureVersionLeaderFollowerStateTransitionThreadNumber(), "venice-L/F-state-transition-future-version"); - ParticipantStateTransitionStats futureVersionStateTransitionStats = new ParticipantStateTransitionStats( - metricsRepository, - futureVersionThreadPool, - "Venice_L/F_ST_thread_pool_future_version"); + ParticipantStateTransitionStats futureVersionStateTransitionStats = statsCloseables.register( + new ParticipantStateTransitionStats( + metricsRepository, + futureVersionThreadPool, + "Venice_L/F_ST_thread_pool_future_version")); leaderFollowerParticipantModelFactory = new LeaderFollowerPartitionStateModelDualPoolFactory( ingestionBackend, veniceConfigLoader, @@ -301,6 +306,7 @@ public void stopInner() throws IOException { zkClient.close(); LOGGER.info("Closed ZkClient."); } + statsCloseables.close(); LOGGER.info("Finished stopping HelixParticipation service."); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AdaptiveThrottlerSignalService.java 
b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AdaptiveThrottlerSignalService.java index 0d19a32dfe5..c6a2a2bb87f 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AdaptiveThrottlerSignalService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AdaptiveThrottlerSignalService.java @@ -7,6 +7,7 @@ import com.linkedin.davinci.stats.ingestion.heartbeat.AggregatedHeartbeatLagEntry; import com.linkedin.davinci.stats.ingestion.heartbeat.HeartbeatMonitoringService; import com.linkedin.venice.service.AbstractVeniceService; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.throttle.VeniceAdaptiveThrottler; import com.linkedin.venice.utils.DaemonThreadFactory; import io.tehuti.Metric; @@ -49,6 +50,8 @@ public class AdaptiveThrottlerSignalService extends AbstractVeniceService { private boolean nonCurrentLeaderMaxHeartbeatLagSignal = false; private boolean nonCurrentFollowerMaxHeartbeatLagSignal = false; private final AdaptiveThrottlingServiceStats adaptiveThrottlingServiceStats; + /** Stats fields owned by this class; drained by {@link #stopInner()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); public AdaptiveThrottlerSignalService( VeniceServerConfig veniceServerConfig, @@ -63,8 +66,8 @@ public AdaptiveThrottlerSignalService( this.updateService = Executors.newSingleThreadScheduledExecutor( new DaemonThreadFactory("AdaptiveThrottlerSignalService", veniceServerConfig.getLogContext())); this.heartbeatMonitoringService = heartbeatMonitoringService; - this.adaptiveThrottlingServiceStats = - new AdaptiveThrottlingServiceStats(metricsRepository, veniceServerConfig.getClusterName()); + this.adaptiveThrottlingServiceStats = statsCloseables + .register(new AdaptiveThrottlingServiceStats(metricsRepository, veniceServerConfig.getClusterName())); } public void registerThrottler(VeniceAdaptiveThrottler adaptiveIngestionThrottler) { @@ -171,6 +174,7 @@ public boolean startInner() throws Exception { @Override public void stopInner() throws Exception { updateService.shutdownNow(); + statsCloseables.close(); } List getThrottlerList() { diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AggKafkaConsumerService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AggKafkaConsumerService.java index f8b5c165dff..ee2950f99d1 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AggKafkaConsumerService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/AggKafkaConsumerService.java @@ -20,6 +20,7 @@ import com.linkedin.venice.pubsub.manager.TopicManagerContext.PubSubPropertiesSupplier; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.RedundantExceptionFilter; import com.linkedin.venice.utils.SystemTime; @@ -83,6 +84,8 @@ public class AggKafkaConsumerService extends 
AbstractVeniceService { private final StuckConsumerRepairStats stuckConsumerStats; private final ThreadPoolExecutor crossTpProcessingPool; private final ThreadPoolStats crossTpProcessingStats; + /** Stats fields owned by this class; drained by {@link #stopInner()}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final static String STUCK_CONSUMER_MSG = "Didn't find any suspicious ingestion task, and please contact developers to investigate it further"; @@ -119,7 +122,8 @@ public AggKafkaConsumerService( this.kafkaClusterUrlResolver = serverConfig.getKafkaClusterUrlResolver(); this.metadataRepository = metadataRepository; if (serverConfig.isStuckConsumerRepairEnabled()) { - this.stuckConsumerStats = new StuckConsumerRepairStats(metricsRepository, serverConfig.getClusterName()); + this.stuckConsumerStats = + statsCloseables.register(new StuckConsumerRepairStats(metricsRepository, serverConfig.getClusterName())); this.stuckConsumerRepairExecutorService = Executors.newSingleThreadScheduledExecutor( new DaemonThreadFactory(this.getClass().getName() + "-StuckConsumerRepair", serverConfig.getLogContext())); int intervalInSeconds = serverConfig.getStuckConsumerRepairIntervalSecond(); @@ -156,7 +160,8 @@ public AggKafkaConsumerService( TimeUnit.MILLISECONDS, new LinkedBlockingQueue<>(), new DaemonThreadFactory("cross-tp-parallel-processing", serverConfig.getLogContext())); - this.crossTpProcessingStats = new ThreadPoolStats(metricsRepository, crossTpProcessingPool, "CrossTpProcessing"); + this.crossTpProcessingStats = + statsCloseables.register(new ThreadPoolStats(metricsRepository, crossTpProcessingPool, "CrossTpProcessing")); LOGGER.info("Cross-TP parallel processing enabled with shared thread pool size: {}", poolSize); } else { this.crossTpProcessingPool = null; @@ -195,6 +200,7 @@ public void stopInner() throws Exception { Thread.currentThread().interrupt(); } } + statsCloseables.close(); } protected static Runnable 
getStuckConsumerDetectionAndRepairRunnable( diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java index 2553d41db3f..6a8c1320990 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/KafkaStoreIngestionService.java @@ -86,6 +86,7 @@ import com.linkedin.venice.stats.ThreadPoolStats; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceIngestionFailureReason; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.system.store.ControllerClientBackedSystemSchemaInitializer; import com.linkedin.venice.system.store.MetaStoreWriter; import com.linkedin.venice.throttle.EventThrottler; @@ -213,6 +214,8 @@ public class KafkaStoreIngestionService extends AbstractVeniceService implements private final Map storeNameToInternalRecordTransformerConfig = new VeniceConcurrentHashMap<>(); private AggVersionedDaVinciRecordTransformerStats recordTransformerStats = null; + /** Stats fields owned by this class; drained by {@link #stopInner()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final Set blobTransferDisabledStores = VeniceConcurrentHashMap.newKeySet(); private final PubSubProducerAdapterFactory producerAdapterFactory; @@ -438,11 +441,12 @@ public void handleStoreDeleted(Store store) { metadataRepo, serverConfig.isUnregisterMetricForDeletedStoreEnabled(), SystemTime.INSTANCE); - AggVersionedDIVStats versionedDIVStats = new AggVersionedDIVStats( - metricsRepository, - metadataRepo, - serverConfig.isUnregisterMetricForDeletedStoreEnabled(), - serverConfig.getClusterName()); + AggVersionedDIVStats versionedDIVStats = statsCloseables.register( + new AggVersionedDIVStats( + metricsRepository, + metadataRepo, + serverConfig.isUnregisterMetricForDeletedStoreEnabled(), + serverConfig.getClusterName())); this.versionedIngestionStats = new AggVersionedIngestionStats(metricsRepository, metadataRepo, serverConfig); if (serverConfig.isDedicatedDrainerQueueEnabled()) { this.storeBufferService = @@ -507,10 +511,11 @@ public void handleStoreDeleted(Store store) { this.aaWCWorkLoadProcessingThreadPool = Executors.newFixedThreadPool( serverConfig.getAAWCWorkloadParallelProcessingThreadPoolSize(), new DaemonThreadFactory("AA_WC_PARALLEL_PROCESSING", serverConfig.getLogContext())); - new ThreadPoolStats( - metricsRepository, - (ThreadPoolExecutor) aaWCWorkLoadProcessingThreadPool, - "aa_wc_parallel_processing_thread_pool"); + statsCloseables.register( + new ThreadPoolStats( + metricsRepository, + (ThreadPoolExecutor) aaWCWorkLoadProcessingThreadPool, + "aa_wc_parallel_processing_thread_pool")); } else { this.aaWCWorkLoadProcessingThreadPool = null; } @@ -524,10 +529,11 @@ public void handleStoreDeleted(Store store) { this.aaWCIngestionStorageLookupThreadPool = Executors.newFixedThreadPool( serverConfig.getAaWCIngestionStorageLookupThreadPoolSize(), new DaemonThreadFactory("AA_WC_INGESTION_STORAGE_LOOKUP", serverConfig.getLogContext())); - new ThreadPoolStats( - 
metricsRepository, - (ThreadPoolExecutor) aaWCIngestionStorageLookupThreadPool, - "aa_wc_ingestion_storage_lookup_thread_pool"); + statsCloseables.register( + new ThreadPoolStats( + metricsRepository, + (ThreadPoolExecutor) aaWCIngestionStorageLookupThreadPool, + "aa_wc_ingestion_storage_lookup_thread_pool")); LOGGER.info( "Enabled a thread pool for AA/WC ingestion lookup with {} threads.", serverConfig.getAaWCIngestionStorageLookupThreadPoolSize()); @@ -781,6 +787,8 @@ public void stopInner() { Utils.closeQuietlyWithErrorLogged(storeBufferService); Utils.closeQuietlyWithErrorLogged(topicManagerRepository); topicLockManager.removeAllLocks(); + + statsCloseables.close(); } @Override @@ -1552,8 +1560,8 @@ synchronized public void registerRecordTransformerConfig( String storeName, DaVinciRecordTransformerConfig recordTransformerConfig) { if (recordTransformerStats == null) { - recordTransformerStats = - new AggVersionedDaVinciRecordTransformerStats(metricsRepository, metadataRepo, serverConfig); + recordTransformerStats = statsCloseables + .register(new AggVersionedDaVinciRecordTransformerStats(metricsRepository, metadataRepo, serverConfig)); } storeNameToInternalRecordTransformerConfig diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreBufferService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreBufferService.java index d011460f708..63f8d221d13 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreBufferService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/kafka/consumer/StoreBufferService.java @@ -22,6 +22,7 @@ import com.linkedin.venice.pubsub.api.PubSubTopic; import com.linkedin.venice.pubsub.api.PubSubTopicPartition; import com.linkedin.venice.stats.OpenTelemetryMetricsSetup; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import 
com.linkedin.venice.utils.LogContext; import com.linkedin.venice.utils.Utils; @@ -71,6 +72,8 @@ public class StoreBufferService extends AbstractStoreBufferService { private final RecordHandler leaderRecordHandler; private final StoreBufferServiceStats storeBufferServiceStats; + /** Stats fields owned by this class; drained by {@link #stopInner}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final LoadingCache hashCodeCache; private final boolean isSorted; @@ -147,17 +150,18 @@ private StoreBufferService( } this.isSorted = sorted; this.leaderRecordHandler = queueLeaderWrites ? this::queueLeaderRecord : StoreBufferService::processRecord; - this.storeBufferServiceStats = metricsRepository == null - ? Objects.requireNonNull(stats) - : new StoreBufferServiceStats( - Objects.requireNonNull(metricsRepository), - sorted ? "StoreBufferServiceSorted" : "StoreBufferServiceUnsorted", - clusterName, - sorted, - this::getTotalMemoryUsage, - this::getTotalRemainingMemory, - this::getMaxMemoryUsagePerDrainer, - this::getMinMemoryUsagePerDrainer); + this.storeBufferServiceStats = statsCloseables.register( + metricsRepository == null + ? Objects.requireNonNull(stats) + : new StoreBufferServiceStats( + Objects.requireNonNull(metricsRepository), + sorted ? "StoreBufferServiceSorted" : "StoreBufferServiceUnsorted", + clusterName, + sorted, + this::getTotalMemoryUsage, + this::getTotalRemainingMemory, + this::getMaxMemoryUsagePerDrainer, + this::getMinMemoryUsagePerDrainer)); /* * {@link #getDrainerIndexForConsumerRecord} hashes the topic name and partition to determine a drainer. 
Due to the * different naming conventions for RT (_rt) and Separate RT (_rt_sep), different drainers might be assigned while @@ -384,6 +388,7 @@ public void stopInner() throws Exception { this.executorService.shutdownNow(); this.executorService.awaitTermination(10, TimeUnit.SECONDS); } + statsCloseables.close(); } @Override diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/repository/NativeMetadataRepository.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/repository/NativeMetadataRepository.java index 4e74d184737..da3855af3e1 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/repository/NativeMetadataRepository.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/repository/NativeMetadataRepository.java @@ -25,6 +25,7 @@ import com.linkedin.venice.schema.rmd.RmdSchemaEntry; import com.linkedin.venice.schema.writecompute.DerivedSchemaEntry; import com.linkedin.venice.service.ICProvider; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.utils.RetryUtils; import com.linkedin.venice.utils.VeniceProperties; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; @@ -54,7 +55,7 @@ * stores' metadata. Callers are served by the cache and the cache is refreshed periodically by updating it with methods * provided by the implementers. 
*/ -public abstract class NativeMetadataRepository +public abstract class NativeMetadataRepository extends AbstractStatsCloseable implements SubscriptionBasedReadOnlyStoreRepository, ReadOnlySchemaRepository, ClusterInfoProvider { private static final long DEFAULT_REFRESH_INTERVAL_IN_SECONDS = 60; private static final Logger LOGGER = LogManager.getLogger(NativeMetadataRepository.class); @@ -88,8 +89,8 @@ protected NativeMetadataRepository(ClientConfig clientConfig, VeniceProperties b CLIENT_SYSTEM_STORE_REPOSITORY_REFRESH_INTERVAL_SECONDS, NativeMetadataRepository.DEFAULT_REFRESH_INTERVAL_IN_SECONDS); this.clientConfig = clientConfig; - this.nativeMetadataRepositoryStats = - new NativeMetadataRepositoryStats(clientConfig.getMetricsRepository(), "native_metadata_repository", clock); + this.nativeMetadataRepositoryStats = statsCloseables.register( + new NativeMetadataRepositoryStats(clientConfig.getMetricsRepository(), "native_metadata_repository", clock)); this.clock = clock; } @@ -373,12 +374,8 @@ public void refresh() { LOGGER.debug("Refresh finished for {}", getClass().getSimpleName()); } - /** - * TODO: we may need to rename this function to be 'close' since this resource should not used any more - * after calling this function. - */ @Override - public void clear() { + public void close() { scheduler.shutdown(); try { if (!scheduler.awaitTermination(60, TimeUnit.SECONDS)) { @@ -392,6 +389,12 @@ public void clear() { storeConfigMap.clear(); schemaMap.clear(); totalStoreReadQuota.set(0); + statsCloseables.close(); + } + + @Override + public void clear() { + close(); } /** @@ -425,7 +428,7 @@ protected Store putStore(Store newStore) { protected Store removeStore(String storeName) { // Remove the store name from the subscription. 
Store oldStore = subscribedStoreMap.remove(storeName); - nativeMetadataRepositoryStats.removeCacheTimestamp(storeName); + nativeMetadataRepositoryStats.handleStoreDeleted(storeName); if (oldStore != null) { totalStoreReadQuota.addAndGet(-oldStore.getReadQuotaInCU()); notifyStoreDeleted(oldStore); diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AbstractVeniceAggVersionedStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AbstractVeniceAggVersionedStats.java index 6be46a2a38f..eb563011aa0 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AbstractVeniceAggVersionedStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AbstractVeniceAggVersionedStats.java @@ -7,6 +7,7 @@ import com.linkedin.venice.meta.StoreDataChangedListener; import com.linkedin.venice.meta.Version; import com.linkedin.venice.stats.StatsSupplier; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; @@ -20,7 +21,7 @@ public abstract class AbstractVeniceAggVersionedStats> - implements StoreDataChangedListener { + extends AbstractStatsCloseable implements StoreDataChangedListener { private static final Logger LOGGER = LogManager.getLogger(AbstractVeniceAggVersionedStats.class); private final Supplier statsInitiator; @@ -216,4 +217,15 @@ protected void onVersionInfoUpdated(String storeName, int currentVersion, int fu protected void cleanupVersionResources(String storeName, int version) { // no-op by default } + + /** + * Unsubscribes from {@link #metadataRepository} first so no further listener callbacks can race + * with cleanup, then drains {@code statsCloseables}. Subclasses with additional cleanup should + * override and call {@code super.close()}. 
+ */ + @Override + public void close() { + metadataRepository.unregisterStoreDataChangedListener(this); + super.close(); + } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AdaptiveThrottlingServiceStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AdaptiveThrottlingServiceStats.java index 2a7d713091e..5b977224f4f 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AdaptiveThrottlingServiceStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AdaptiveThrottlingServiceStats.java @@ -57,7 +57,8 @@ public AdaptiveThrottlingServiceStats(MetricsRepository metricsRepository, Strin getTehutiName(type), Collections.singletonList(new Rate()), dimensionsMap, - attributes)); + attributes, + resources)); } this.rateMetrics = Collections.unmodifiableMap(metricMap); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedBlobTransferStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedBlobTransferStats.java index c92800f8383..7bcb55887f8 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedBlobTransferStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedBlobTransferStats.java @@ -3,6 +3,7 @@ import com.linkedin.davinci.config.VeniceServerConfig; import com.linkedin.venice.meta.ReadOnlyStoreRepository; import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.Time; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; @@ -95,7 +96,7 @@ public void handleStoreDeleted(String storeName) { try { super.handleStoreDeleted(storeName); } finally { - otelStatsMap.remove(storeName); + MetricEntityStateUtils.closeQuietly(otelStatsMap.remove(storeName)); } } @@ 
-207,4 +208,11 @@ public void recordBlobTransferBytesSent(String storeName, int version, long valu // OTel metrics getBlobTransferOtelStats(storeName).recordBytesSent(version, value); } + + @Override + public void close() { + // Unregister metadata listener first so handleStore* can't re-populate the map while we drain. + super.close(); + MetricEntityStateUtils.closeAndClear(otelStatsMap); + } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDIVStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDIVStats.java index 770c9f08e46..579adf51391 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDIVStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDIVStats.java @@ -2,7 +2,6 @@ import static com.linkedin.davinci.stats.OtelVersionedStatsUtils.classifyVersion; import static com.linkedin.venice.meta.Store.NON_EXISTING_VERSION; -import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; import com.linkedin.venice.exceptions.validation.CorruptDataException; import com.linkedin.venice.exceptions.validation.DataValidationException; @@ -15,15 +14,16 @@ import com.linkedin.venice.stats.dimensions.VeniceDIVResult; import com.linkedin.venice.stats.dimensions.VeniceDIVSeverity; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; import com.linkedin.venice.stats.metrics.MetricEntityStateTwoEnums; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import it.unimi.dsi.fastutil.ints.IntOpenHashSet; import it.unimi.dsi.fastutil.ints.IntSet; import java.util.Collections; -import 
java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicLong; import java.util.function.BiConsumer; @@ -50,19 +50,48 @@ public class AggVersionedDIVStats extends AbstractVeniceAggVersionedStats baseDimensionsMap; /** - * Per-store OTel metric state maps. Each map grows lazily via {@code computeIfAbsent} and is bounded - * by the number of stores the server is actively ingesting. Entries are removed when a store is - * deleted via {@link #handleStoreDeleted(String)}. These maps are OTel-only; Tehuti recording is - * handled by the parent class via {@code recordVersionedAndTotalStat}. + * Per-store entry bundling the four per-store OTel wrappers; one {@code remove()} in + * {@link #handleStoreDeleted} closes them atomically. Bounded by the number of stores ingested. + * Tehuti recording stays on the parent via {@code recordVersionedAndTotalStat}. */ - private final Map> messageCountPerStore = - new VeniceConcurrentHashMap<>(); - private final Map> offsetRewindCountPerStore = - new VeniceConcurrentHashMap<>(); - private final Map> producerFailureCountPerStore = - new VeniceConcurrentHashMap<>(); - private final Map> benignProducerFailureCountPerStore = - new VeniceConcurrentHashMap<>(); + private final Map perStoreEntryMap = new VeniceConcurrentHashMap<>(); + + /** Per-store state held by {@link #perStoreEntryMap}. 
*/ + private static final class PerStoreEntry extends AbstractStatsCloseable { + final MetricEntityStateTwoEnums messageCount; + final MetricEntityStateTwoEnums offsetRewindCount; + final MetricEntityStateOneEnum producerFailureCount; + final MetricEntityStateOneEnum benignProducerFailureCount; + + PerStoreEntry(VeniceOpenTelemetryMetricsRepository otelRepository, Map dims) { + this.messageCount = MetricEntityStateTwoEnums.create( + DIVOtelMetricEntity.MESSAGE_COUNT.getMetricEntity(), + otelRepository, + dims, + VersionRole.class, + VeniceDIVResult.class, + statsCloseables); + this.offsetRewindCount = MetricEntityStateTwoEnums.create( + DIVOtelMetricEntity.OFFSET_REWIND_COUNT.getMetricEntity(), + otelRepository, + dims, + VersionRole.class, + VeniceDIVSeverity.class, + statsCloseables); + this.producerFailureCount = MetricEntityStateOneEnum.create( + DIVOtelMetricEntity.PRODUCER_FAILURE_COUNT.getMetricEntity(), + otelRepository, + dims, + VersionRole.class, + statsCloseables); + this.benignProducerFailureCount = MetricEntityStateOneEnum.create( + DIVOtelMetricEntity.BENIGN_PRODUCER_FAILURE_COUNT.getMetricEntity(), + otelRepository, + dims, + VersionRole.class, + statsCloseables); + } + } /** * Per-store version info for classifying versions as CURRENT, FUTURE, or BACKUP. 
@@ -131,20 +160,12 @@ public void recordPotentiallyLossyLeaderOffsetRewind(String storeName, int versi public void recordLeaderProducerFailure(String storeName, int version) { recordVersionedAndTotalStat(storeName, version, DIVStats::recordLeaderProducerFailure); - recordOtelOneEnumMetric( - storeName, - version, - producerFailureCountPerStore, - DIVOtelMetricEntity.PRODUCER_FAILURE_COUNT); + recordOtelProducerFailureCount(storeName, version, false); } public void recordBenignLeaderProducerFailure(String storeName, int version) { recordVersionedAndTotalStat(storeName, version, DIVStats::recordBenignLeaderProducerFailure); - recordOtelOneEnumMetric( - storeName, - version, - benignProducerFailureCountPerStore, - DIVOtelMetricEntity.BENIGN_PRODUCER_FAILURE_COUNT); + recordOtelProducerFailureCount(storeName, version, true); } /** {@link AbstractVeniceAggVersionedStats#addStore(com.linkedin.venice.meta.Store)} @@ -162,10 +183,7 @@ public void handleStoreDeleted(String storeName) { try { super.handleStoreDeleted(storeName); } finally { - messageCountPerStore.remove(storeName); - offsetRewindCountPerStore.remove(storeName); - producerFailureCountPerStore.remove(storeName); - benignProducerFailureCountPerStore.remove(storeName); + MetricEntityStateUtils.closeQuietly(perStoreEntryMap.remove(storeName)); versionInfoMap.remove(storeName); } } @@ -225,10 +243,12 @@ private void resetTotalStats( // --- OTel recording helpers --- - private Map buildStoreDimensionsMap(String storeName) { - Map map = new HashMap<>(baseDimensionsMap); - map.put(VENICE_STORE_NAME, OpenTelemetryMetricsSetup.sanitizeStoreName(storeName)); - return Collections.unmodifiableMap(map); + private PerStoreEntry getOrCreateEntry(String storeName) { + return perStoreEntryMap.computeIfAbsent( + storeName, + k -> new PerStoreEntry( + otelRepository, + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k))); } private void recordOtelMessageCount(String storeName, int version, VeniceDIVResult 
result) { @@ -236,15 +256,7 @@ private void recordOtelMessageCount(String storeName, int version, VeniceDIVResu return; } VersionRole role = classifyVersion(version, versionInfoMap.get(storeName)); - messageCountPerStore.computeIfAbsent( - storeName, - k -> MetricEntityStateTwoEnums.create( - DIVOtelMetricEntity.MESSAGE_COUNT.getMetricEntity(), - otelRepository, - buildStoreDimensionsMap(k), - VersionRole.class, - VeniceDIVResult.class)) - .record(1, role, result); + getOrCreateEntry(storeName).messageCount.record(1, role, result); } private void recordOtelOffsetRewindCount(String storeName, int version, VeniceDIVSeverity severity) { @@ -252,31 +264,23 @@ private void recordOtelOffsetRewindCount(String storeName, int version, VeniceDI return; } VersionRole role = classifyVersion(version, versionInfoMap.get(storeName)); - offsetRewindCountPerStore.computeIfAbsent( - storeName, - k -> MetricEntityStateTwoEnums.create( - DIVOtelMetricEntity.OFFSET_REWIND_COUNT.getMetricEntity(), - otelRepository, - buildStoreDimensionsMap(k), - VersionRole.class, - VeniceDIVSeverity.class)) - .record(1, role, severity); + getOrCreateEntry(storeName).offsetRewindCount.record(1, role, severity); } - private void recordOtelOneEnumMetric( - String storeName, - int version, - Map> perStoreMap, - DIVOtelMetricEntity metricEntity) { + private void recordOtelProducerFailureCount(String storeName, int version, boolean benign) { if (!emitOtelMetrics) { return; } VersionRole role = classifyVersion(version, versionInfoMap.get(storeName)); - perStoreMap - .computeIfAbsent( - storeName, - k -> MetricEntityStateOneEnum - .create(metricEntity.getMetricEntity(), otelRepository, buildStoreDimensionsMap(k), VersionRole.class)) - .record(1, role); + PerStoreEntry entry = getOrCreateEntry(storeName); + (benign ? 
entry.benignProducerFailureCount : entry.producerFailureCount).record(1, role); + } + + @Override + public void close() { + // Unregister metadata listener first so handleStore* can't re-populate the maps while we drain. + super.close(); + MetricEntityStateUtils.closeAndClear(perStoreEntryMap); + versionInfoMap.clear(); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDaVinciRecordTransformerStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDaVinciRecordTransformerStats.java index 84e5000514f..c52646dfb17 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDaVinciRecordTransformerStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/AggVersionedDaVinciRecordTransformerStats.java @@ -10,10 +10,11 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceRecordTransformerOperation; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; -import java.util.HashMap; import java.util.Map; @@ -29,19 +30,29 @@ public class AggVersionedDaVinciRecordTransformerStats private final Map baseDimensionsMap; private final boolean emitOtelMetrics; - /** - * Per-store OTel metric state for latency. Bounded by the number of stores on this host. - * Entries created lazily inside {@link #recordOtelLatency}, removed in - * {@link #handleStoreDeleted(String)}. - */ - private final Map> latencyPerStore = - new VeniceConcurrentHashMap<>(); - - /** - * Per-store OTel metric state for error count. Same bounding and lifecycle as latencyPerStore. 
- */ - private final Map> errorCountPerStore = - new VeniceConcurrentHashMap<>(); + /** Per-store entry bundling latency + error-count wrappers; one {@code remove()} in {@link #handleStoreDeleted} closes them. */ + private final Map perStoreEntryMap = new VeniceConcurrentHashMap<>(); + + /** Per-store state held by {@link #perStoreEntryMap}. */ + private static final class PerStoreEntry extends AbstractStatsCloseable { + final MetricEntityStateOneEnum latency; + final MetricEntityStateOneEnum errorCount; + + PerStoreEntry(VeniceOpenTelemetryMetricsRepository otelRepository, Map dims) { + this.latency = MetricEntityStateOneEnum.create( + RECORD_TRANSFORMER_LATENCY.getMetricEntity(), + otelRepository, + dims, + VeniceRecordTransformerOperation.class, + statsCloseables); + this.errorCount = MetricEntityStateOneEnum.create( + RECORD_TRANSFORMER_ERROR_COUNT.getMetricEntity(), + otelRepository, + dims, + VeniceRecordTransformerOperation.class, + statsCloseables); + } + } public AggVersionedDaVinciRecordTransformerStats( MetricsRepository metricsRepository, @@ -66,8 +77,7 @@ public void handleStoreDeleted(String storeName) { try { super.handleStoreDeleted(storeName); } finally { - latencyPerStore.remove(storeName); - errorCountPerStore.remove(storeName); + MetricEntityStateUtils.closeQuietly(perStoreEntryMap.remove(storeName)); } } @@ -95,48 +105,38 @@ private void recordOtelLatency(String storeName, double value, VeniceRecordTrans if (!emitOtelMetrics) { return; } - latencyPerStore.computeIfAbsent(storeName, k -> createPerStoreMetric(k, RECORD_TRANSFORMER_LATENCY)) - .record(value, operation); + getOrCreateEntry(storeName).latency.record(value, operation); } private void recordOtelErrorCount(String storeName, VeniceRecordTransformerOperation operation) { if (!emitOtelMetrics) { return; } - errorCountPerStore.computeIfAbsent(storeName, k -> createPerStoreMetric(k, RECORD_TRANSFORMER_ERROR_COUNT)) - .record(1, operation); - } - - @VisibleForTesting - boolean 
hasLatencyMetricFor(String storeName) { - return latencyPerStore.containsKey(storeName); + getOrCreateEntry(storeName).errorCount.record(1, operation); } - @VisibleForTesting - boolean hasErrorCountMetricFor(String storeName) { - return errorCountPerStore.containsKey(storeName); + private PerStoreEntry getOrCreateEntry(String storeName) { + return perStoreEntryMap.computeIfAbsent( + storeName, + k -> new PerStoreEntry( + otelRepository, + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k))); } @VisibleForTesting - int latencyStoreCount() { - return latencyPerStore.size(); + boolean hasMetricsFor(String storeName) { + return perStoreEntryMap.get(storeName) != null; } @VisibleForTesting - int errorCountStoreCount() { - return errorCountPerStore.size(); + int storeCount() { + return perStoreEntryMap.size(); } - private MetricEntityStateOneEnum createPerStoreMetric( - String storeName, - DaVinciRecordTransformerOtelMetricEntity metricEntity) { - Map storeDimensionsMap = new HashMap<>(baseDimensionsMap); - storeDimensionsMap - .put(VeniceMetricsDimensions.VENICE_STORE_NAME, OpenTelemetryMetricsSetup.sanitizeStoreName(storeName)); - return MetricEntityStateOneEnum.create( - metricEntity.getMetricEntity(), - otelRepository, - storeDimensionsMap, - VeniceRecordTransformerOperation.class); + @Override + public void close() { + // Unregister metadata listener first so handleStore* can't re-populate the map while we drain. 
+ super.close(); + MetricEntityStateUtils.closeAndClear(perStoreEntryMap); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/BlobTransferOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/BlobTransferOtelStats.java index 1ddd86cce72..b31821bc6d9 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/BlobTransferOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/BlobTransferOtelStats.java @@ -11,6 +11,7 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.MetricEntity; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; import com.linkedin.venice.stats.metrics.MetricEntityStateTwoEnums; @@ -28,7 +29,7 @@ *

Note: Tehuti metrics are managed separately in {@link BlobTransferStats} and * {@link BlobTransferStatsReporter}. This class handles only OTel metrics. */ -public class BlobTransferOtelStats { +public class BlobTransferOtelStats extends AbstractStatsCloseable { private final boolean emitOtelMetrics; private volatile VersionInfo versionInfo = VersionInfo.NON_EXISTING; @@ -72,18 +73,20 @@ public BlobTransferOtelStats(MetricsRepository metricsRepository, String storeNa otelRepository, baseDimensionsMap, VersionRole.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + statsCloseables); timeMetric = createOneEnumMetric(TIME.getMetricEntity(), otelRepository, baseDimensionsMap); bytesReceivedMetric = createOneEnumMetric(BYTES_RECEIVED.getMetricEntity(), otelRepository, baseDimensionsMap); bytesSentMetric = createOneEnumMetric(BYTES_SENT.getMetricEntity(), otelRepository, baseDimensionsMap); } - private static MetricEntityStateOneEnum createOneEnumMetric( + private MetricEntityStateOneEnum createOneEnumMetric( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap) { - return MetricEntityStateOneEnum.create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class); + return MetricEntityStateOneEnum + .create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class, statsCloseables); } public boolean emitOtelMetrics() { diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/HeartbeatMonitoringServiceStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/HeartbeatMonitoringServiceStats.java index e722dc1caa6..bd366f068db 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/HeartbeatMonitoringServiceStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/HeartbeatMonitoringServiceStats.java @@ -69,7 +69,8 @@ public HeartbeatMonitoringServiceStats( 
TehutiMetricName.HEARTBEAT_MONITOR_SERVICE_EXCEPTION_COUNT, Collections.singletonList(new Count()), baseDimensionsMap, - VeniceHeartbeatComponent.class); + VeniceHeartbeatComponent.class, + resources); this.reporterHeartbeatMetrics = MetricEntityStateOneEnum.create( HEARTBEAT_MONITORING_HEARTBEAT_COUNT.getMetricEntity(), @@ -78,7 +79,8 @@ public HeartbeatMonitoringServiceStats( TehutiMetricName.HEARTBEAT_REPORTER, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceHeartbeatComponent.class); + VeniceHeartbeatComponent.class, + resources); this.loggerHeartbeatMetrics = MetricEntityStateOneEnum.create( HEARTBEAT_MONITORING_HEARTBEAT_COUNT.getMetricEntity(), @@ -87,7 +89,8 @@ public HeartbeatMonitoringServiceStats( TehutiMetricName.HEARTBEAT_LOGGER, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceHeartbeatComponent.class); + VeniceHeartbeatComponent.class, + resources); } public void recordHeartbeatExceptionCount(VeniceHeartbeatComponent component) { diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/KafkaConsumerServiceStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/KafkaConsumerServiceStats.java index 375d1cd43c4..af7027dbff3 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/KafkaConsumerServiceStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/KafkaConsumerServiceStats.java @@ -140,7 +140,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.BYTES_PER_POLL, Arrays.asList(new Min(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); pollRecordCountOtel = MetricEntityStateBase.create( POLL_RECORD_COUNT.getMetricEntity(), @@ -149,7 +150,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_POLL_RESULT_NUM, Arrays.asList(new Avg(), new Min()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); // Total-only OTel repository: null for per-store 
instances to avoid registering // unused OTel instruments. Total-only metrics are only recorded via aggStats.recordTotal*(). @@ -180,7 +182,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_POLL_REQUEST, Collections.singletonList(new LongAdderRateGauge(time)), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); TehutiSensorRegistrationFunction pollNonEmptyTehutiReg = totalStats == null ? this::registerSensorIfAbsent @@ -192,7 +195,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_POLL_NON_ZERO_RESULT_NUM, Collections.singletonList(new LongAdderRateGauge(time)), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); pollTimeOtel = MetricEntityStateBase.create( POLL_TIME.getMetricEntity(), @@ -201,7 +205,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_POLL_REQUEST_LATENCY, Arrays.asList(new Avg(), new Max()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); pollErrorCountOtel = MetricEntityStateBase.create( POLL_ERROR_COUNT.getMetricEntity(), @@ -210,7 +215,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_POLL_ERROR, Collections.singletonList(new OccurrenceRate()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); produceToWriteBufferTimeOtel = MetricEntityStateBase.create( PRODUCE_TO_WRITE_BUFFER_TIME.getMetricEntity(), @@ -219,7 +225,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.CONSUMER_RECORDS_PRODUCING_TO_WRITE_BUFFER_LATENCY, Arrays.asList(new Avg(), new Max()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); topicDetectedDeletedCountOtel = MetricEntityStateBase.create( TOPIC_DETECTED_DELETED_COUNT.getMetricEntity(), @@ -228,7 +235,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.DETECTED_DELETED_TOPIC_NUM, Collections.singletonList(new Total()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); 
orphanTopicPartitionCountOtel = MetricEntityStateBase.create( ORPHAN_TOPIC_PARTITION_COUNT.getMetricEntity(), @@ -237,7 +245,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.DETECTED_NO_RUNNING_INGESTION_TOPIC_PARTITION_NUM, Collections.singletonList(new Total()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); pollTimeSinceLastSuccessOtel = MetricEntityStateBase.create( POLL_TIME_SINCE_LAST_SUCCESS.getMetricEntity(), @@ -246,7 +255,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.IDLE_TIME, Collections.singletonList(new Max()), nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); // Shared OTel instrument for subscribe + update_assignment, each with its own Tehuti sensor subscribeActionTimeOtel = MetricEntityStateOneEnum.create( @@ -256,7 +266,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.DELEGATE_SUBSCRIBE_LATENCY, Arrays.asList(new Avg(), new Max()), nonStoreDimensionsMap, - VeniceConsumerPoolAction.class); + VeniceConsumerPoolAction.class, + resources); updateAssignmentActionTimeOtel = MetricEntityStateOneEnum.create( POOL_ACTION_TIME.getMetricEntity(), @@ -265,7 +276,8 @@ public KafkaConsumerServiceStats( TehutiMetricName.UPDATE_CURRENT_ASSIGNMENT_LATENCY, Arrays.asList(new Avg(), new Max()), nonStoreDimensionsMap, - VeniceConsumerPoolAction.class); + VeniceConsumerPoolAction.class, + resources); // Tehuti-only async gauge: OTel intentionally omitted because this reads from the same source // method (getMaxElapsedTimeMSSinceLastPollInConsumerPool) that also records to the @@ -282,7 +294,8 @@ public KafkaConsumerServiceStats( PARTITION_ASSIGNMENT_COUNT.getMetricEntity(), totalOnlyOtelRepo, nonStoreDimensionsMap, - nonStoreAttributes); + nonStoreAttributes, + resources); } // Recording methods diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/NativeMetadataRepositoryStats.java 
b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/NativeMetadataRepositoryStats.java index 0ce390d0226..249fd70d6f1 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/NativeMetadataRepositoryStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/NativeMetadataRepositoryStats.java @@ -6,13 +6,14 @@ import com.linkedin.venice.stats.OpenTelemetryMetricsSetup; import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateBase; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.opentelemetry.api.common.Attributes; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; import java.time.Clock; -import java.util.HashMap; import java.util.Map; import java.util.function.DoubleSupplier; @@ -25,20 +26,32 @@ * high watermark at query time via max aggregation. * *

Per-store OTel callbacks are registered lazily on first {@link #updateCacheTimestamp} call - * and read from the shared {@link #metadataCacheTimestampMapInMs}. When a store is removed, - * the callback returns {@code NaN} (timestamp absent from the map, store no longer tracked). OTel callbacks cannot be - * deregistered (SDK limitation), so the per-store entry stays registered until the process exits. + * and read from the shared {@link #metadataCacheTimestampMapInMs}. When a store is removed via + * {@link #handleStoreDeleted(String)} the OTel callback is deregistered (closed) and the per-store map entry is + * dropped so the SDK stops polling the gauge. */ public class NativeMetadataRepositoryStats extends AbstractVeniceStats { private final Map metadataCacheTimestampMapInMs = new VeniceConcurrentHashMap<>(); private final Clock clock; - // OTel: per-store ASYNC_DOUBLE_GAUGE for staleness. Effectively bounded by the number of - // distinct stores seen during the lifetime of the process, since callbacks cannot be deregistered - // (entries in this map are not removed when a store is unsubscribed). private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; - private final Map otelPerStore = new VeniceConcurrentHashMap<>(); + /** Per-store ASYNC_DOUBLE_GAUGE; entries are closed in {@link #handleStoreDeleted} so the SDK stops polling. */ + private final Map perStoreEntryMap = new VeniceConcurrentHashMap<>(); + + /** Per-store state held by {@link #perStoreEntryMap}. 
*/ + private static final class PerStoreEntry extends AbstractStatsCloseable { + final AsyncMetricEntityStateBase gauge; + + PerStoreEntry( + VeniceOpenTelemetryMetricsRepository otelRepository, + Map dims, + Attributes attrs, + DoubleSupplier callback) { + this.gauge = AsyncMetricEntityStateBase + .create(METADATA_CACHE_STALENESS.getMetricEntity(), otelRepository, dims, attrs, callback, statsCloseables); + } + } public NativeMetadataRepositoryStats(MetricsRepository metricsRepository, String name, Clock clock) { super(metricsRepository, name); @@ -85,27 +98,33 @@ public void updateCacheTimestamp(String storeName, String clusterName, long cach registerOtelGaugeIfAbsent(storeName, clusterName); } - public void removeCacheTimestamp(String storeName) { + /** Removes the Tehuti cache-timestamp entry and closes the per-store OTel async-gauge wrapper. */ + public void handleStoreDeleted(String storeName) { metadataCacheTimestampMapInMs.remove(storeName); - // OTel callback stays registered but returns NaN (timestamp absent from map, store no longer tracked) + MetricEntityStateUtils.closeQuietly(perStoreEntryMap.remove(storeName)); + } + + @Override + public void close() { + MetricEntityStateUtils.closeAndClear(perStoreEntryMap); + super.close(); } private void registerOtelGaugeIfAbsent(String storeName, String clusterName) { if (otelRepository == null) { return; } - otelPerStore.computeIfAbsent(storeName, k -> { - Map dims = new HashMap<>(baseDimensionsMap); + perStoreEntryMap.computeIfAbsent(storeName, k -> { + Map dims = + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k); dims.put(VeniceMetricsDimensions.VENICE_CLUSTER_NAME, clusterName); - dims.put(VeniceMetricsDimensions.VENICE_STORE_NAME, OpenTelemetryMetricsSetup.sanitizeStoreName(k)); Attributes attrs = otelRepository.createAttributes(METADATA_CACHE_STALENESS.getMetricEntity(), dims); - // DoubleSupplier callback: returns NaN when store is removed (no timestamp in map), + // DoubleSupplier 
callback returns NaN when store is removed (no timestamp in map), // consistent with the Tehuti high-watermark gauge behavior. - return AsyncMetricEntityStateBase - .create(METADATA_CACHE_STALENESS.getMetricEntity(), otelRepository, dims, attrs, (DoubleSupplier) () -> { - Long ts = metadataCacheTimestampMapInMs.get(k); - return ts == null ? Double.NaN : (double) (clock.millis() - ts); - }); + return new PerStoreEntry(otelRepository, dims, attrs, () -> { + Long ts = metadataCacheTimestampMapInMs.get(k); + return ts == null ? Double.NaN : (double) (clock.millis() - ts); + }); }); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStateTransitionStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStateTransitionStats.java index 94703b9a5d4..4b15c4b6303 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStateTransitionStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStateTransitionStats.java @@ -88,17 +88,23 @@ public ParticipantStateTransitionStats( otelRepository, baseDimensionsMap, VeniceHelixFromState.class, - VeniceHelixToState.class); + VeniceHelixToState.class, + resources); inProgressMetric = MetricEntityStateTwoEnums.create( IN_PROGRESS_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, VeniceHelixFromState.class, - VeniceHelixToState.class); + VeniceHelixToState.class, + resources); - steadyStateMetric = MetricEntityStateOneEnum - .create(STEADY_STATE_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, VeniceHelixSteadyState.class); + steadyStateMetric = MetricEntityStateOneEnum.create( + STEADY_STATE_COUNT.getMetricEntity(), + otelRepository, + baseDimensionsMap, + VeniceHelixSteadyState.class, + resources); } /** @@ -151,6 +157,17 @@ public void trackStateTransitionFailed(String fromState, String toState) { } private void recordInProgressOtel(long delta, String fromState, String 
toState) { + // Pre-check nulls so Enum.valueOf doesn't throw NPE (which would surface as a less specific + // failure than the IAE for an unknown state). Both null and unknown enum values funnel into + // recordFailureMetric for diagnosability. + if (fromState == null || toState == null) { + if (otelRepository != null) { + otelRepository.recordFailureMetric( + IN_PROGRESS_COUNT.getMetricEntity(), + new IllegalArgumentException("null state: fromState=" + fromState + ", toState=" + toState)); + } + return; + } try { VeniceHelixFromState from = VeniceHelixFromState.valueOf(fromState); VeniceHelixToState to = VeniceHelixToState.valueOf(toState); @@ -168,6 +185,14 @@ private void recordInProgressOtel(long delta, String fromState, String toState) } private void recordSteadyStateOtel(long delta, String state) { + if (state == null) { + if (otelRepository != null) { + otelRepository.recordFailureMetric( + STEADY_STATE_COUNT.getMetricEntity(), + new IllegalArgumentException("null state: state=null")); + } + return; + } try { VeniceHelixSteadyState steadyState = VeniceHelixSteadyState.valueOf(state); steadyStateMetric.record(delta, steadyState); diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStats.java index 5cbee3c837b..e0945534653 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStats.java @@ -1,7 +1,5 @@ package com.linkedin.davinci.stats; -import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; - import com.linkedin.venice.stats.AbstractVeniceStats; import com.linkedin.venice.stats.OpenTelemetryMetricsSetup; import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; @@ -21,7 +19,6 @@ import 
io.tehuti.metrics.stats.OccurrenceRate; import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; @@ -83,7 +80,8 @@ public ParticipantStoreConsumptionStats(MetricsRepository metricsRepository, Str TehutiMetricName.HEARTBEAT, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); this.failedInitializationMetric = MetricEntityStateBase.create( ParticipantStoreConsumptionOtelMetricEntity.FAILED_INITIALIZATION_COUNT.getMetricEntity(), @@ -92,13 +90,8 @@ public ParticipantStoreConsumptionStats(MetricsRepository metricsRepository, Str TehutiMetricName.FAILED_INITIALIZATION, Collections.singletonList(new Count()), baseDimensionsMap, - baseAttributes); - } - - private Map buildStoreDimensionsMap(String storeName) { - Map map = new HashMap<>(baseDimensionsMap); - map.put(VENICE_STORE_NAME, storeName); - return Collections.unmodifiableMap(map); + baseAttributes, + resources); } private Attributes buildStoreAttributes( @@ -113,7 +106,8 @@ private MetricEntityStateBase createPerStoreBaseMetric( List tehutiStats, String storeName) { MetricEntity metricEntity = otelMetric.getMetricEntity(); - Map storeDimensionsMap = buildStoreDimensionsMap(storeName); + Map storeDimensionsMap = + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, storeName); return MetricEntityStateBase.create( metricEntity, otelRepository, @@ -121,7 +115,8 @@ private MetricEntityStateBase createPerStoreBaseMetric( tehutiName, tehutiStats, storeDimensionsMap, - buildStoreAttributes(metricEntity, storeDimensionsMap)); + buildStoreAttributes(metricEntity, storeDimensionsMap), + resources); } /** @@ -150,8 +145,9 @@ public void recordKilledPushJobs(String storeName) { this::registerSensorIfAbsent, TehutiMetricName.KILLED_PUSH_JOBS, Collections.singletonList(new Count()), - buildStoreDimensionsMap(k), - VeniceResponseStatusCategory.class)) + 
OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k), + VeniceResponseStatusCategory.class, + resources)) .record(1, VeniceResponseStatusCategory.SUCCESS); } @@ -162,8 +158,9 @@ public void recordFailedKillPushJob(String storeName) { k -> MetricEntityStateOneEnum.create( ParticipantStoreConsumptionOtelMetricEntity.KILL_PUSH_JOB_COUNT.getMetricEntity(), otelRepository, - buildStoreDimensionsMap(k), - VeniceResponseStatusCategory.class)) + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k), + VeniceResponseStatusCategory.class, + resources)) .record(1, VeniceResponseStatusCategory.FAIL); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/RocksDBMemoryStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/RocksDBMemoryStats.java index fc6ff5e9138..91330766ab5 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/RocksDBMemoryStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/RocksDBMemoryStats.java @@ -249,7 +249,8 @@ private void registerAsyncGauge( Collections.singletonList(new AsyncGauge((ig, ig2) -> valueSupplier.getAsLong(), sensorName)), baseDimensionsMap, baseAttributes, - valueSupplier); + valueSupplier, + resources); } /** diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ServerMetadataServiceStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ServerMetadataServiceStats.java index eb904da8f57..b6fea62669f 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ServerMetadataServiceStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ServerMetadataServiceStats.java @@ -1,19 +1,18 @@ package com.linkedin.davinci.stats; -import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; - import com.linkedin.venice.exceptions.VeniceNoStoreException; import 
com.linkedin.venice.stats.AbstractVeniceStats; import com.linkedin.venice.stats.OpenTelemetryMetricsSetup; import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.Sensor; import io.tehuti.metrics.stats.Rate; -import java.util.HashMap; import java.util.Map; @@ -42,12 +41,26 @@ public class ServerMetadataServiceStats extends AbstractVeniceStats { private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; /** - * Per-store OTel metrics. When the requested store does not exist (e.g., {@link VeniceNoStoreException}), - * {@link OpenTelemetryMetricsSetup#UNKNOWN_STORE_NAME} is used as a sentinel so that all unknown-store failures - * share one metric entry. Cardinality is bounded by stores deployed to this server + the sentinel. + * Per-store entry map; process-lifetime, drained on {@link #close()}. Unknown stores share the + * {@link OpenTelemetryMetricsSetup#UNKNOWN_STORE_NAME} sentinel. */ - private final Map> perStoreMetrics = - new VeniceConcurrentHashMap<>(); + private final Map perStoreEntryMap = new VeniceConcurrentHashMap<>(); + + private static final class PerStoreEntry extends AbstractStatsCloseable { + final MetricEntityStateOneEnum wrapper; + + PerStoreEntry(VeniceOpenTelemetryMetricsRepository otelRepository, Map dims) { + // Uses the 4-arg (OTel-only) create overload intentionally: the 7-arg overload would bind + // Tehuti recording to every record() call, but we only want Tehuti on failure, not success. 
+ // Tehuti and OTel are therefore recorded in separate steps in recordRequestBasedMetadataFailureCount. + this.wrapper = MetricEntityStateOneEnum.create( + ServerMetadataOtelMetricEntity.METADATA_REQUEST_COUNT.getMetricEntity(), + otelRepository, + dims, + VeniceResponseStatusCategory.class, + statsCloseables); + } + } public ServerMetadataServiceStats(MetricsRepository metricsRepository, String clusterName) { super(metricsRepository, "ServerMetadataStats"); @@ -62,19 +75,12 @@ public ServerMetadataServiceStats(MetricsRepository metricsRepository, String cl this.baseDimensionsMap = otelData.getBaseDimensionsMap(); } - // Uses the 4-arg (OTel-only) create overload intentionally: the 7-arg overload would bind - // Tehuti recording to every record() call, but we only want Tehuti on failure, not success. - // Tehuti and OTel are therefore recorded in separate steps in recordRequestBasedMetadataFailureCount. - private MetricEntityStateOneEnum getStoreMetrics(String storeName) { - return perStoreMetrics.computeIfAbsent(storeName, k -> { - Map dimensionsMap = new HashMap<>(baseDimensionsMap); - dimensionsMap.put(VENICE_STORE_NAME, k); - return MetricEntityStateOneEnum.create( - ServerMetadataOtelMetricEntity.METADATA_REQUEST_COUNT.getMetricEntity(), - otelRepository, - dimensionsMap, - VeniceResponseStatusCategory.class); - }); + private PerStoreEntry getOrCreateEntry(String storeName) { + return perStoreEntryMap.computeIfAbsent( + storeName, + k -> new PerStoreEntry( + otelRepository, + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k))); } public void recordRequestBasedMetadataInvokeCount() { @@ -83,7 +89,7 @@ public void recordRequestBasedMetadataInvokeCount() { public void recordRequestBasedMetadataSuccessCount(String storeName) { if (emitOtelMetrics) { - getStoreMetrics(storeName).record(1, VeniceResponseStatusCategory.SUCCESS); + getOrCreateEntry(storeName).wrapper.record(1, VeniceResponseStatusCategory.SUCCESS); } } @@ -97,8 +103,14 @@ public 
void recordRequestBasedMetadataFailureCount(String storeName, Exception e if (emitOtelMetrics) { String metricStoreName = (e instanceof VeniceNoStoreException) ? OpenTelemetryMetricsSetup.UNKNOWN_STORE_NAME : storeName; - getStoreMetrics(metricStoreName).record(1, VeniceResponseStatusCategory.FAIL); + getOrCreateEntry(metricStoreName).wrapper.record(1, VeniceResponseStatusCategory.FAIL); } } + @Override + public void close() { + MetricEntityStateUtils.closeAndClear(perStoreEntryMap); + super.close(); + } + } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StorageEngineOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StorageEngineOtelStats.java index b5cd40e1da1..549bc81fb0e 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StorageEngineOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StorageEngineOtelStats.java @@ -14,12 +14,12 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceRecordType; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateOneEnum; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateTwoEnums; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; -import java.io.Closeable; import java.util.Map; @@ -37,7 +37,7 @@ *

Tehuti metrics are managed separately by * {@link AggVersionedStorageEngineStats.StorageEngineStatsReporter}. */ -public class StorageEngineOtelStats implements Closeable { +public class StorageEngineOtelStats extends AbstractStatsCloseable { private final boolean emitOtelMetrics; /** @@ -89,7 +89,8 @@ public StorageEngineOtelStats(MetricsRepository metricsRepository, String storeN VeniceRecordType.class, VersionRole.class, (recordType, role) -> getWrapperForRole(role), - (wrapper, recordType, role) -> diskUsage(wrapper, recordType)); + (wrapper, recordType, role) -> diskUsage(wrapper, recordType), + statsCloseables); this.keyCountMetric = AsyncMetricEntityStateOneEnum.create( KEY_COUNT_ESTIMATE.getMetricEntity(), @@ -97,11 +98,16 @@ public StorageEngineOtelStats(MetricsRepository metricsRepository, String storeN baseDimensionsMap, VersionRole.class, role -> getWrapperForRole(role), - (wrapper, role) -> wrapper.getKeyCountEstimate()); + (wrapper, role) -> wrapper.getKeyCountEstimate(), + statsCloseables); // RocksDB open failure count: COUNTER with VersionRole dimension - this.openFailureMetric = MetricEntityStateOneEnum - .create(ROCKSDB_OPEN_FAILURE_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, VersionRole.class); + this.openFailureMetric = MetricEntityStateOneEnum.create( + ROCKSDB_OPEN_FAILURE_COUNT.getMetricEntity(), + otelRepository, + baseDimensionsMap, + VersionRole.class, + statsCloseables); } else { this.diskUsageMetrics = null; this.keyCountMetric = null; @@ -188,13 +194,17 @@ private static long diskUsage(StorageEngineStatsWrapper wrapper, VeniceRecordTyp } /** - * Clears internal wrapper references. On subsequent collections each async-gauge's - * {@code liveStateResolver} will return {@code null} for every role and no data points will be - * emitted. The SDK instruments themselves are NOT deregistered — they remain registered and - * are polled until the SDK is shut down. 
+ * Closes per-store SDK instruments and releases internal wrapper references. The two async + * gauges have their callbacks deregistered (SDK stops polling); the sync counter releases its + * wrapper-side memory only (SDK aggregator persists until MeterProvider close). + * + * <p>
Per-store close only — these instruments span all versions of this store. Per-version + * cleanup goes through {@link #onVersionRemoved}, which removes the version's wrapper from + * {@link #wrappersByVersion} but does NOT close the per-store instruments. */ @Override public void close() { wrappersByVersion.clear(); + super.close(); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreBufferServiceStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreBufferServiceStats.java index 59cfa1050d9..18bc58ecdab 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreBufferServiceStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreBufferServiceStats.java @@ -19,7 +19,6 @@ import io.tehuti.metrics.stats.OccurrenceRate; import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.function.LongSupplier; @@ -35,6 +34,11 @@ enum TehutiMetricName implements TehutiMetricNameEnum { private final Map baseDimensionsMap; private final Attributes baseAttributes; + private final AsyncMetricEntityStateBase memoryUsedMetric; + private final AsyncMetricEntityStateBase memoryRemainingMetric; + private final AsyncMetricEntityStateBase memoryUsedPerWriterMaxMetric; + private final AsyncMetricEntityStateBase memoryUsedPerWriterMinMetric; + /** * Per-store latency metric states. Bounded by the number of active stores on this server (typically < 100). * All stores share a single Tehuti sensor (registered once via {@code registerSensorIfAbsent}); per-store @@ -72,31 +76,29 @@ public StoreBufferServiceStats( this.baseAttributes = otelData.getBaseAttributes(); // Memory metrics (#1-4): joint Tehuti+OTel AsyncGauge. - // Return values are intentionally discarded — the gauge callback is registered internally - // by the Tehuti sensor and OTel SDK during create(). 
No per-recording state is needed. - registerMemoryGauge( + memoryUsedMetric = registerMemoryGauge( StoreBufferServiceOtelMetricEntity.MEMORY_USED, TehutiMetricName.TOTAL_MEMORY_USAGE, totalMemoryUsageSupplier); - registerMemoryGauge( + memoryRemainingMetric = registerMemoryGauge( StoreBufferServiceOtelMetricEntity.MEMORY_REMAINING, TehutiMetricName.TOTAL_REMAINING_MEMORY, totalRemainingMemorySupplier); - registerMemoryGauge( + memoryUsedPerWriterMaxMetric = registerMemoryGauge( StoreBufferServiceOtelMetricEntity.MEMORY_USED_PER_WRITER_MAX, TehutiMetricName.MAX_MEMORY_USAGE_PER_WRITER, maxMemoryUsagePerDrainerSupplier); - registerMemoryGauge( + memoryUsedPerWriterMinMetric = registerMemoryGauge( StoreBufferServiceOtelMetricEntity.MEMORY_USED_PER_WRITER_MIN, TehutiMetricName.MIN_MEMORY_USAGE_PER_WRITER, minMemoryUsagePerDrainerSupplier); } - private void registerMemoryGauge( + private AsyncMetricEntityStateBase registerMemoryGauge( StoreBufferServiceOtelMetricEntity metricEntity, TehutiMetricName tehutiName, LongSupplier supplier) { - AsyncMetricEntityStateBase.create( + return AsyncMetricEntityStateBase.create( metricEntity.getMetricEntity(), otelRepository, this::registerSensorIfAbsent, @@ -104,7 +106,8 @@ private void registerMemoryGauge( Collections.singletonList(new AsyncGauge((ig, ig2) -> supplier.getAsLong(), tehutiName.getMetricName())), baseDimensionsMap, baseAttributes, - supplier); + supplier, + resources); } private MetricEntityStateBase createPerStoreState( @@ -112,12 +115,18 @@ private MetricEntityStateBase createPerStoreState( MetricEntity metricEntity, TehutiMetricNameEnum tehutiName, List tehutiStats) { - Map dims = new HashMap<>(baseDimensionsMap); - dims.put(VeniceMetricsDimensions.VENICE_STORE_NAME, storeName); - dims = Collections.unmodifiableMap(dims); + Map dims = + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, storeName); Attributes attrs = otelRepository != null ? 
otelRepository.createAttributes(metricEntity, dims) : null; - return MetricEntityStateBase - .create(metricEntity, otelRepository, this::registerSensorIfAbsent, tehutiName, tehutiStats, dims, attrs); + return MetricEntityStateBase.create( + metricEntity, + otelRepository, + this::registerSensorIfAbsent, + tehutiName, + tehutiStats, + dims, + attrs, + resources); } private MetricEntityStateBase getOrCreateLatencyState(String storeName) { @@ -147,4 +156,12 @@ public void recordInternalProcessingLatency(long latency, String storeName) { public void recordInternalProcessingError(String storeName) { getOrCreateErrorState(storeName).record(1); } + + /** Closes all OTel metric wrappers held by this stats instance, including per-store entries. */ + @Override + public void close() { + super.close(); + latencyPerStore.clear(); + errorPerStore.clear(); + } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreVersionOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreVersionOtelStats.java index 39ef8f7ab4d..43fc8283f0c 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreVersionOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StoreVersionOtelStats.java @@ -11,6 +11,7 @@ import com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateBase; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.opentelemetry.api.common.Attributes; import io.tehuti.metrics.MetricsRepository; @@ -33,14 +34,7 @@ *

<p>This class implements {@link StoreDataChangedListener} and should be registered once per process * on the metadata repository via {@link #register(ReadOnlyStoreRepository)}. Per-store state is * created lazily on first store change and bounded by the number of distinct store names ever - * observed by the process (entries are not removed on deletion — see cleanup limitation below). - * - * <p>
Cleanup limitation: the OTel SDK does support per-instrument deregistration via - * {@code ObservableLongGauge.close()}, but the current Venice wrapper - * ({@link AsyncMetricEntityStateBase}) doesn't surface the SDK instrument handle, so callbacks - * remain registered until the {@link MetricsRepository} is closed. On store deletion, version - * info is reset to {@link VersionInfo#NON_EXISTING} rather than removed — see - * {@link #handleStoreDeleted} for why removing the map entry is unsafe given the current wrapper. + * observed by the process. Entries are not removed on deletion — see {@link #handleStoreDeleted}. */ public class StoreVersionOtelStats implements StoreDataChangedListener, Closeable { private static final Logger LOGGER = LogManager.getLogger(StoreVersionOtelStats.class); @@ -51,9 +45,8 @@ public class StoreVersionOtelStats implements StoreDataChangedListener, Closeabl private final Map> perStoreVersions = new VeniceConcurrentHashMap<>(); /** - * Tracks the metadata repository this OTel listener is registered on. Set when - * {@link #register} succeeds; null otherwise (OTel disabled or never registered). Used by - * {@link #close} to deregister the OTel listener. + * The metadata repository this listener is registered on. Set when {@link #register} succeeds; + * null otherwise (OTel disabled or never registered). Used by {@link #close} to deregister. */ private ReadOnlyStoreRepository registeredMetadataRepository; @@ -124,7 +117,7 @@ public void handleStoreChanged(Store store) { // set() after updates existing entries with the latest data from the ZK event. 
perStoreVersions.computeIfAbsent(storeName, k -> { AtomicReference newRef = new AtomicReference<>(newInfo); - registerOtelGauge(k, newRef); + registerOtelGauges(k, newRef); return newRef; }).set(newInfo); } @@ -136,7 +129,7 @@ private void initializeStoreIfAbsent(Store store) { VersionInfo newInfo = computeVersionInfo(store); perStoreVersions.computeIfAbsent(storeName, k -> { AtomicReference newRef = new AtomicReference<>(newInfo); - registerOtelGauge(k, newRef); + registerOtelGauges(k, newRef); return newRef; }); } @@ -149,11 +142,11 @@ private static VersionInfo computeVersionInfo(Store store) { /** * Resets version info to {@link VersionInfo#NON_EXISTING} rather than removing the map entry. - * The async-gauge callback closes over the {@link AtomicReference}, which the Venice wrapper - * doesn't currently surface for de-registration. Removing the map entry would orphan the live - * callback (SDK keeps polling stale data); a subsequent re-create would register a second - * callback emitting under the same attributes. Resetting keeps one live callback pointed at - * the right state across delete→re-create cycles. + * The ASYNC_GAUGE callback closes over the {@link AtomicReference}, so removing the entry would + * orphan the SDK-side callback (still polling, with no clean way to deregister it through the + * current wrapper). Resetting keeps one live callback pointed at sentinel values + * ({@code NON_EXISTING_VERSION}) which dashboards can filter; map cardinality stays bounded by + * the host's store count across delete→re-create cycles. */ @Override public void handleStoreDeleted(String storeName) { @@ -166,24 +159,22 @@ public void handleStoreDeleted(String storeName) { } } - /** - * Registers two ASYNC_GAUGE callbacks: one for CURRENT and one for FUTURE. - * Only these two roles are tracked — backup version number is not tracked. 
- */ - private void registerOtelGauge(String storeName, AtomicReference versionInfoRef) { - Map storeDims = new HashMap<>(baseDimensionsMap); - storeDims.put(VeniceMetricsDimensions.VENICE_STORE_NAME, OpenTelemetryMetricsSetup.sanitizeStoreName(storeName)); + /** Registers ASYNC_GAUGE callbacks for both CURRENT and FUTURE roles for a single store. */ + private void registerOtelGauges(String storeName, AtomicReference versionInfoRef) { + Map storeDims = + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, storeName); registerRoleGauge(storeDims, VersionRole.CURRENT, () -> versionInfoRef.get().getCurrentVersion()); registerRoleGauge(storeDims, VersionRole.FUTURE, () -> versionInfoRef.get().getFutureVersion()); } - private void registerRoleGauge( + private AsyncMetricEntityStateBase registerRoleGauge( Map storeDims, VersionRole role, LongSupplier callback) { Map dims = new HashMap<>(storeDims); dims.put(VeniceMetricsDimensions.VENICE_VERSION_ROLE, role.getDimensionValue()); Attributes attrs = otelRepository.createAttributes(STORE_VERSION.getMetricEntity(), dims); - AsyncMetricEntityStateBase.create(STORE_VERSION.getMetricEntity(), otelRepository, dims, attrs, callback); + return AsyncMetricEntityStateBase + .create(STORE_VERSION.getMetricEntity(), otelRepository, dims, attrs, callback, CompositeCloseable.NONE); } } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StuckConsumerRepairStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StuckConsumerRepairStats.java index 73f0693d815..f8606389907 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StuckConsumerRepairStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/StuckConsumerRepairStats.java @@ -41,7 +41,8 @@ public StuckConsumerRepairStats(MetricsRepository metricsRepository, String clus TehutiMetricName.STUCK_CONSUMER_FOUND, Collections.singletonList(new OccurrenceRate()), 
baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); ingestionTaskRepairOtel = MetricEntityStateBase.create( STUCK_CONSUMER_TASK_REPAIRED_COUNT.getMetricEntity(), @@ -50,7 +51,8 @@ public StuckConsumerRepairStats(MetricsRepository metricsRepository, String clus TehutiMetricName.INGESTION_TASK_REPAIR, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); repairFailureOtel = MetricEntityStateBase.create( STUCK_CONSUMER_UNRESOLVED_COUNT.getMetricEntity(), @@ -59,7 +61,8 @@ public StuckConsumerRepairStats(MetricsRepository metricsRepository, String clus TehutiMetricName.REPAIR_FAILURE, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordStuckConsumerFound() { diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/IngestionOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/IngestionOtelStats.java index 8796c2fd7fa..68a905dd2ff 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/IngestionOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/IngestionOtelStats.java @@ -78,6 +78,7 @@ import com.linkedin.venice.stats.dimensions.VenicePartialUpdateOperation; import com.linkedin.venice.stats.dimensions.VeniceRecordType; import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateOneEnum; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateTwoEnums; import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.LiveStateResolverOneEnum; @@ -100,7 +101,7 @@ * OpenTelemetry metrics for ingestion statistics. * Note: Tehuti metrics are managed separately in {@link com.linkedin.davinci.stats.IngestionStatsReporter}. 
*/ -public class IngestionOtelStats { +public class IngestionOtelStats extends AbstractStatsCloseable { private final boolean emitOtelMetrics; private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; @@ -360,7 +361,8 @@ public IngestionOtelStats( baseDimensionsMap, VersionRole.class, VeniceIngestionSourceComponent.class, - VeniceIngestionDestinationComponent.class); + VeniceIngestionDestinationComponent.class, + statsCloseables); // Initialize RT region metric maps this.rtRecordsConsumedByRegion = new VeniceConcurrentHashMap<>(); @@ -482,17 +484,24 @@ public void removeIngestionTask(int version) { } /** - * Cleans up all per-version state for this store. Call this when the store is being deleted. - * After this call each async-gauge's {@code liveStateResolver} returns {@code null} for every - * role, so no data points are emitted for this store on subsequent collections. The SDK - * instruments themselves are not deregistered (OTel does not support it), so this object is - * retained until JVM shutdown — only relevant on store deletion or when no versions remain on - * this host. + * Cleans up all per-version state for this store and deregisters the SDK callbacks. Call this + * when the store is being deleted. After this call: + *

<ul> + * <li>Async gauges are deregistered (the SDK stops polling them).</li> + * <li>Sync wrapper memory is released; the SDK-side aggregator persists until MeterProvider + * close.</li> + * <li>Per-region wrappers are closed and their map cleared.</li> + * </ul>
*/ + @Override public void close() { ingestionTasksByVersion.clear(); pushTimeoutByVersion.clear(); idleTimeByVersion.clear(); + + super.close(); + + // Per-region maps: wrappers are closed via the registry above; clear the map references. rtRecordsConsumedByRegion.clear(); rtBytesConsumedByRegion.clear(); } @@ -516,14 +525,15 @@ public void recordIdleTime(int version, long idleTimeMs) { // Helper methods private MetricEntityStateOneEnum createOneEnumMetric(MetricEntity metricEntity) { - return MetricEntityStateOneEnum.create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class); + return MetricEntityStateOneEnum + .create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class, statsCloseables); } private & VeniceDimensionInterface> MetricEntityStateTwoEnums createTwoEnumMetric( MetricEntity metricEntity, Class enumClass) { return MetricEntityStateTwoEnums - .create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class, enumClass); + .create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class, enumClass, statsCloseables); } /** @@ -534,8 +544,14 @@ private AsyncMetricEntityStateOneEnum createAsyncByRole( MetricEntity metricEntity, LiveStateResolverOneEnum liveStateResolver, ValueResolverOneEnum valueResolver) { - return AsyncMetricEntityStateOneEnum - .create(metricEntity, otelRepository, baseDimensionsMap, VersionRole.class, liveStateResolver, valueResolver); + return AsyncMetricEntityStateOneEnum.create( + metricEntity, + otelRepository, + baseDimensionsMap, + VersionRole.class, + liveStateResolver, + valueResolver, + statsCloseables); } /** @@ -553,7 +569,8 @@ private AsyncMetricEntityStateTwoEnums createAsync VersionRole.class, ReplicaType.class, liveStateResolver, - valueResolver); + valueResolver, + statsCloseables); } public boolean emitOtelMetrics() { @@ -664,7 +681,8 @@ private MetricEntityStateTwoEnums getOrCreate otelRepository, dims, VersionRole.class, - VeniceRegionLocality.class); + 
VeniceRegionLocality.class, + statsCloseables); }); } @@ -678,7 +696,8 @@ private MetricEntityStateTwoEnums getOrCreate otelRepository, dims, VersionRole.class, - VeniceRegionLocality.class); + VeniceRegionLocality.class, + statsCloseables); }); } diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java index 842fc4ace59..88158ed0f0e 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatMonitoringService.java @@ -21,6 +21,7 @@ import com.linkedin.venice.stats.dimensions.VeniceHeartbeatComponent; import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.LogContext; import com.linkedin.venice.utils.RegionUtils; import com.linkedin.venice.utils.Utils; @@ -83,6 +84,8 @@ public class HeartbeatMonitoringService extends AbstractVeniceService { private final Map cleanupHeartbeatMap; private final HeartbeatVersionedStats versionStatsReporter; private final HeartbeatMonitoringServiceStats heartbeatMonitoringServiceStats; + /** Stats fields owned by this class; drained by {@link #stopInner}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final Duration maxWaitForVersionInfo; private final CompletableFuture customizedViewRepositoryFuture; private final String nodeId; @@ -119,8 +122,8 @@ public HeartbeatMonitoringService( leaderHeartbeatTimeStamps, followerHeartbeatTimeStamps, serverConfig.getClusterName()); - this.heartbeatMonitoringServiceStats = - Objects.requireNonNull(heartbeatMonitoringServiceStats, "heartbeatMonitoringServiceStats cannot be null"); + this.heartbeatMonitoringServiceStats = statsCloseables.register( + Objects.requireNonNull(heartbeatMonitoringServiceStats, "heartbeatMonitoringServiceStats cannot be null")); this.customizedViewRepositoryFuture = customizedViewRepositoryFuture; this.nodeId = Utils.getHelixNodeIdentifier(serverConfig.getListenerHostname(), serverConfig.getListenerPort()); this.lagMonitorCleanupCycle = serverConfig.getLagMonitorCleanupCycle(); @@ -390,6 +393,7 @@ public void stopInner() throws Exception { heartbeatReporterThreadIsRunning.set(false); reportingThread.interrupt(); lagCleanupAndLoggingThread.interrupt(); + statsCloseables.close(); } /** diff --git a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java index 1b0e6ac12cf..e0926381d37 100644 --- a/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java +++ b/clients/da-vinci-client/src/main/java/com/linkedin/davinci/stats/ingestion/heartbeat/HeartbeatOtelStats.java @@ -14,6 +14,7 @@ import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceRegionLocality; import com.linkedin.venice.stats.dimensions.VeniceStoreWriteType; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import 
com.linkedin.venice.stats.metrics.MetricEntityStateFiveEnums; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; @@ -25,7 +26,7 @@ * OpenTelemetry metrics for heartbeat monitoring. * Note: Tehuti metrics are managed separately in {@link HeartbeatStatReporter}. */ -public class HeartbeatOtelStats { +public class HeartbeatOtelStats extends AbstractStatsCloseable { private final boolean emitOtelMetrics; private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; @@ -135,7 +136,8 @@ private MetricEntityStateFiveEnumsNote: Tehuti metrics are managed separately in {@link HeartbeatStatReporter}. */ -public class RecordLevelDelayOtelStats { +public class RecordLevelDelayOtelStats extends AbstractStatsCloseable { private final boolean emitOtelMetrics; private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; @@ -149,7 +150,8 @@ private MetricEntityStateFiveEnums()); stats = new ParticipantStateTransitionStats(metricsRepository, executor, TEST_POOL_NAME); } @@ -224,11 +221,8 @@ public void testInvalidStateIncrementsMetricRecordFailure() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor dedicatedExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(dedicatedExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, dedicatedExecutor)) { exerciseAllRecordingPaths(disabledRepo); } } @@ -266,4 +260,42 @@ private static Attributes buildSteadyStateAttributes(String state) { .put(VENICE_HELIX_STATE.getDimensionNameInDefaultFormat(), state.toLowerCase()) .build(); } + + // --- Lifecycle test --- 
+ + @Test + public void testCloseIsIdempotentAndPostCloseRecordIsNoOp() { + // Record once so the in-progress wrapper is created on the SDK side, then close. + stats.trackStateTransitionStarted(OFFLINE_STATE, STANDBY_STATE); + stats.trackStateTransitionCompleted(OFFLINE_STATE, STANDBY_STATE); + + // Snapshot the data point count for IN_PROGRESS_COUNT prior to close — used to assert no + // emission after close. + Collection beforeClose = inMemoryMetricReader.collectAllMetrics(); + + // close() releases parent ThreadPoolStats resources via try-finally and the subclass's own + // CompositeCloseable. A second close() must be a silent no-op (idempotency via CompositeCloseable). + stats.close(); + stats.close(); + + // After close, recording is a defined no-op — must not throw and must not emit new OTel data. + stats.trackStateTransitionStarted(OFFLINE_STATE, STANDBY_STATE); + Collection afterClose = inMemoryMetricReader.collectAllMetrics(); + + // Post-close in-progress recording must not have produced an additional data point. 
+ long afterIncrement = afterClose.stream() + .filter(md -> md.getName().contains(IN_PROGRESS_COUNT.getMetricEntity().getMetricName())) + .flatMap(md -> md.getLongSumData().getPoints().stream()) + .mapToLong(LongPointData::getValue) + .sum(); + long beforeIncrement = beforeClose.stream() + .filter(md -> md.getName().contains(IN_PROGRESS_COUNT.getMetricEntity().getMetricName())) + .flatMap(md -> md.getLongSumData().getPoints().stream()) + .mapToLong(LongPointData::getValue) + .sum(); + assertEquals( + afterIncrement, + beforeIncrement, + "Post-close trackStateTransitionStarted must not emit new OTel data points"); + } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStatsTest.java index 55280734616..c8b13290f24 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ParticipantStoreConsumptionStatsTest.java @@ -121,21 +121,22 @@ public void testRecordKillPushJobFailedConsumptionWithUnknownStore() { } /** - * When OTel is enabled, empty store names are rejected by Venice's dimension validation in - * {@link com.linkedin.venice.stats.VeniceOpenTelemetryMetricsRepository}. - * Callers must sanitize to {@link com.linkedin.venice.stats.OpenTelemetryMetricsSetup#UNKNOWN_STORE_NAME} - * before passing to recording methods. + * Empty store names are sanitized to {@link com.linkedin.venice.stats.OpenTelemetryMetricsSetup#UNKNOWN_STORE_NAME} + * by the shared {@code buildStoreDimensionsMap} helper, so the recording succeeds and emits under the sentinel + * rather than failing dimension validation. 
*/ - @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "Dimension value cannot be null or empty.*") - public void testRecordWithEmptyStoreNameThrowsWhenOtelEnabled() { + @Test + public void testRecordWithEmptyStoreNameSanitizesToUnknown() { stats.recordKillPushJobFailedConsumption(""); + + validateCounter(OTEL_FAILED_CONSUMPTION, 1, buildStoreOnlyAttributes(UNKNOWN_STORE_NAME)); } /** When OTel is disabled, recording methods tolerate empty store names because dimension validation is skipped. */ @Test public void testRecordWithEmptyStoreNameSafeWhenOtelDisabled() { - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX).setEmitOtelMetrics(false).build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, null)) { ParticipantStoreConsumptionStats safeStats = new ParticipantStoreConsumptionStats(disabledRepo, TEST_CLUSTER_NAME); safeStats.recordKillPushJobFailedConsumption(""); @@ -329,8 +330,8 @@ public void testMultipleStoresAreIndependent() { @Test public void testNoNpeWhenOtelDisabled() { - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX).setEmitOtelMetrics(false).build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, null)) { assertAllMethodsSafe(disabledRepo); } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/RocksDBMemoryStatsOtelTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/RocksDBMemoryStatsOtelTest.java index 3e4301fda64..6b1824b3742 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/RocksDBMemoryStatsOtelTest.java +++ 
b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/RocksDBMemoryStatsOtelTest.java @@ -9,6 +9,7 @@ import com.linkedin.venice.stats.VeniceMetricsConfig; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.metrics.MetricConfig; @@ -35,16 +36,18 @@ static Cache createMockCache(long usage, long pinnedUsage) { private InMemoryMetricReader inMemoryMetricReader; private VeniceMetricsRepository metricsRepository; private Attributes expectedAttributes; + private AsyncGauge.AsyncGaugeExecutor asyncGaugeExecutor; @BeforeMethod public void setUp() { inMemoryMetricReader = InMemoryMetricReader.create(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .build()); + // Dedicated executor so tearDown()'s close() doesn't shut down Tehuti's JVM-wide static singleton. 
+ asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); expectedAttributes = Attributes.builder().put(VENICE_CLUSTER_NAME.getDimensionNameInDefaultFormat(), TEST_CLUSTER_NAME).build(); } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ServerMetadataServiceStatsOtelTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ServerMetadataServiceStatsOtelTest.java index b88de179528..27eb4e8cd13 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ServerMetadataServiceStatsOtelTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ServerMetadataServiceStatsOtelTest.java @@ -130,8 +130,8 @@ public void testMultipleStores() { @Test public void testNoNpeWhenOtelDisabled() { - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX).setEmitOtelMetrics(false).build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, null)) { assertAllMethodsSafeWithRepo(disabledRepo); } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StorageEngineOtelStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StorageEngineOtelStatsTest.java index aa35a118fde..25ed7ba1da0 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StorageEngineOtelStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StorageEngineOtelStatsTest.java @@ -14,6 +14,7 @@ import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.dimensions.VeniceRecordType; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import 
com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.metrics.data.LongPointData; import io.opentelemetry.sdk.metrics.data.MetricData; @@ -351,11 +352,8 @@ public void testSetStatsWrapperNullThrows() { @Test public void testNoNpeWhenOtelDisabled() { - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(false) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(METRIC_PREFIX, null)) { exerciseAllRecordingPaths(disabledRepo); } } @@ -365,6 +363,38 @@ public void testNoNpeWhenPlainMetricsRepository() { exerciseAllRecordingPaths(new MetricsRepository()); } + @Test + public void testCloseDeregistersAsyncGaugesAndDropsDataPoints() { + AggVersionedStorageEngineStats.StorageEngineStatsWrapper wrapper = new MockWrapper(1000, 200, 50); + stats.setStatsWrapper(1, wrapper); + + // Sanity: data points emit pre-close. + OpenTelemetryDataTestUtils.validateLongPointDataFromGauge( + inMemoryMetricReader, + 1000, + buildDiskUsageAttributes(VersionRole.CURRENT, VeniceRecordType.DATA), + DISK_USAGE_METRIC, + METRIC_PREFIX); + + stats.close(); + + // After close: SDK callbacks deregistered → no data point for either async gauge. 
+ assertNull( + OpenTelemetryDataTestUtils.getLongPointDataFromGaugeIfPresent( + inMemoryMetricReader.collectAllMetrics(), + DISK_USAGE_METRIC, + METRIC_PREFIX, + buildDiskUsageAttributes(VersionRole.CURRENT, VeniceRecordType.DATA)), + "Disk usage gauge should be deregistered after close()"); + assertNull( + OpenTelemetryDataTestUtils.getLongPointDataFromGaugeIfPresent( + inMemoryMetricReader.collectAllMetrics(), + KEY_COUNT_METRIC, + METRIC_PREFIX, + buildVersionRoleAttributes(VersionRole.CURRENT)), + "Key count gauge should be deregistered after close()"); + } + // --- Helper methods --- /** Exercises all recording and lifecycle methods on a stats instance to verify no NPE. */ diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreBufferServiceStatsOtelTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreBufferServiceStatsOtelTest.java index fec98e77dc1..cc0c0d3e15f 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreBufferServiceStatsOtelTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreBufferServiceStatsOtelTest.java @@ -11,6 +11,7 @@ import com.linkedin.venice.stats.VeniceMetricsConfig; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.Metric; @@ -38,13 +39,11 @@ public class StoreBufferServiceStatsOtelTest { public void setUp() throws IOException { inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - 
.setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); } @AfterMethod diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreVersionOtelStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreVersionOtelStatsTest.java index fae87e2b490..87e12aca41b 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreVersionOtelStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StoreVersionOtelStatsTest.java @@ -19,12 +19,11 @@ import com.linkedin.venice.meta.VersionImpl; import com.linkedin.venice.meta.VersionStatus; import com.linkedin.venice.server.VersionRole; -import com.linkedin.venice.stats.VeniceMetricsConfig; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; -import io.tehuti.metrics.MetricConfig; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; import java.util.Arrays; @@ -54,13 +53,11 @@ public class StoreVersionOtelStatsTest { public void setUp() { inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = 
MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); stats = new StoreVersionOtelStats(metricsRepository, TEST_CLUSTER_NAME); } @@ -111,17 +108,20 @@ public void testMultiStoreIsolation() { } @Test - public void testStoreDeletedResetsToNonExistingVersion() { + public void testStoreDeletedResetsToNonExisting() { Store store = createMockStore(TEST_STORE_NAME, 5, createVersion(5, VersionStatus.ONLINE)); stats.handleStoreChanged(store); validateGauge(5, TEST_STORE_NAME, VersionRole.CURRENT); - // Deletion resets versions to NON_EXISTING_VERSION (state kept for callback reuse) + // Deletion resets the AtomicReference to NON_EXISTING. The ASYNC_GAUGE callback keeps polling + // but emits NON_EXISTING_VERSION so dashboards can filter deleted stores. The per-store map + // entry is intentionally NOT removed — see Javadoc on handleStoreDeleted. stats.handleStoreDeleted(TEST_STORE_NAME); validateGauge(NON_EXISTING_VERSION, TEST_STORE_NAME, VersionRole.CURRENT); validateGauge(NON_EXISTING_VERSION, TEST_STORE_NAME, VersionRole.FUTURE); - // Re-creation reuses the existing callback — no duplicate registration + // A subsequent store change repopulates the same AtomicReference; the original callback + // observes the new value, so re-creation works without registering a fresh callback. 
Store reCreated = createMockStore(TEST_STORE_NAME, 8, createVersion(8, VersionStatus.ONLINE)); stats.handleStoreChanged(reCreated); validateGauge(8, TEST_STORE_NAME, VersionRole.CURRENT); @@ -130,11 +130,8 @@ public void testStoreDeletedResetsToNonExistingVersion() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor dedicatedExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(dedicatedExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, dedicatedExecutor)) { StoreVersionOtelStats disabledStats = new StoreVersionOtelStats(disabledRepo, TEST_CLUSTER_NAME); Store store = createMockStore(TEST_STORE_NAME, 1, createVersion(1, VersionStatus.ONLINE)); disabledStats.handleStoreChanged(store); @@ -300,11 +297,8 @@ public void testCreateFactoryDoesNotRegisterListenerWhenOtelDisabled() throws Ex // When OTel is disabled, registering the listener would only add no-op dispatch overhead // for every store create/change/delete event. Verify the optimization holds. 
AsyncGauge.AsyncGaugeExecutor dedicatedExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(dedicatedExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, dedicatedExecutor)) { ReadOnlyStoreRepository mockRepo = mock(ReadOnlyStoreRepository.class); StoreVersionOtelStats.create(disabledRepo, TEST_CLUSTER_NAME, mockRepo); verify(mockRepo, never()).registerStoreDataChangedListener(any()); @@ -332,11 +326,8 @@ public void testCloseIsNoOpWhenOtelDisabled() throws Exception { // When OTel is disabled, register() never registered the listener, so close() must not // call unregisterStoreDataChangedListener. AsyncGauge.AsyncGaugeExecutor dedicatedExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(dedicatedExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, dedicatedExecutor)) { ReadOnlyStoreRepository mockRepo = mock(ReadOnlyStoreRepository.class); StoreVersionOtelStats created = StoreVersionOtelStats.create(disabledRepo, TEST_CLUSTER_NAME, mockRepo); created.close(); diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StuckConsumerRepairStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StuckConsumerRepairStatsTest.java index d8441d5ff30..75de5795554 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StuckConsumerRepairStatsTest.java +++ 
b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/StuckConsumerRepairStatsTest.java @@ -5,12 +5,11 @@ import static org.testng.Assert.assertNotNull; import static org.testng.Assert.assertTrue; -import com.linkedin.venice.stats.VeniceMetricsConfig; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; -import io.tehuti.metrics.MetricConfig; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; import org.testng.annotations.AfterMethod; @@ -34,13 +33,11 @@ public class StuckConsumerRepairStatsTest { public void setUp() { inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); stats = new StuckConsumerRepairStats(metricsRepository, TEST_CLUSTER_NAME); } @@ -120,11 +117,8 @@ public void testTehutiSensorsRegisteredAndRecorded() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor localExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(localExecutor)) - .build())) { + 
try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, localExecutor)) { exerciseAllRecordingPaths(disabledRepo); } } diff --git a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java index a73cab1dec6..9c00d1d62e1 100644 --- a/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java +++ b/clients/da-vinci-client/src/test/java/com/linkedin/davinci/stats/ingestion/heartbeat/RecordLevelDelayOtelStatsTest.java @@ -484,14 +484,16 @@ public void testMultipleRegionsHaveIndependentMetrics() { } /** - * Verifies that close() clears the per-region metric state and that recording - * after close() re-creates metric state and continues to work. + * Verifies that {@code close()} is final: pre-close recordings emit but any post-close + * recording is silently dropped (the per-region wrappers register with the closed + * {@code CompositeCloseable} and are immediately retired). Version info is retained — only + * wrapper resources are released. */ @Test - public void testCloseAndReuse() { + public void testCloseDropsPostCloseRecordings() { recordLevelDelayOtelStats.updateVersionInfo(CURRENT_VERSION, FUTURE_VERSION); - // Record a metric, then close + // Pre-close recording emits. recordWithDefaultLabels( recordLevelDelayOtelStats, CURRENT_VERSION, @@ -501,11 +503,11 @@ public void testCloseAndReuse() { 100L); recordLevelDelayOtelStats.close(); - // Version info should still be intact (close only clears metric state, not version info) + // Version info survives close (only metric wrappers are released). 
assertEquals(recordLevelDelayOtelStats.getVersionInfo().getCurrentVersion(), CURRENT_VERSION); assertEquals(recordLevelDelayOtelStats.getVersionInfo().getFutureVersion(), FUTURE_VERSION); - // Recording after close should re-create metric state and work + // Post-close recording is silently dropped. recordWithDefaultLabels( recordLevelDelayOtelStats, CURRENT_VERSION, @@ -514,17 +516,16 @@ public void testCloseAndReuse() { ReplicaState.READY_TO_SERVE, 200L); - // The histogram should have both the pre-close and post-close values since OTel - // accumulates across the metric reader's collection cycle + // Histogram reflects only the pre-close value. validateRecordMetric( REGION_US_WEST, VersionRole.CURRENT, ReplicaType.LEADER, ReplicaState.READY_TO_SERVE, 100.0, - 200.0, - 300.0, - 2); + 100.0, + 100.0, + 1); } // ================================================================================== diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/RetriableAvroGenericStoreClient.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/RetriableAvroGenericStoreClient.java index af547f8c490..3b2c06de7c2 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/RetriableAvroGenericStoreClient.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/RetriableAvroGenericStoreClient.java @@ -9,6 +9,7 @@ import com.linkedin.venice.compute.ComputeRequestWrapper; import com.linkedin.venice.meta.RetryManager; import com.linkedin.venice.read.RequestType; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.BatchGetConfigUtils; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.ExceptionUtils; @@ -60,6 +61,8 @@ public class RetriableAvroGenericStoreClient extends DelegatingAvroStoreCl private final RetryManager multiKeyLongTailRetryManager; private final TreeMap batchGetLongTailRetryThresholdMap; private final 
TreeMap computeLongTailRetryThresholdMap; + /** Closeable resources owned by this class; drained by {@link #close()}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); public RetriableAvroGenericStoreClient( InternalAvroStoreClient delegate, @@ -70,23 +73,25 @@ public RetriableAvroGenericStoreClient( clientConfig.getLongTailRetryThresholdForSingleGetInMicroSeconds(); this.timeoutProcessor = timeoutProcessor; - this.singleKeyLongTailRetryManager = new RetryManager( - clientConfig.getClusterStats().getMetricsRepository(), - SINGLE_KEY_LONG_TAIL_RETRY_STATS_PREFIX + clientConfig.getStoreName(), - clientConfig.getLongTailRetryBudgetEnforcementWindowInMs(), - clientConfig.getRetryBudgetPercentage(), - retryManagerExecutorService, - clientConfig.getStoreName(), - RequestType.SINGLE_GET); - - this.multiKeyLongTailRetryManager = new RetryManager( - clientConfig.getClusterStats().getMetricsRepository(), - MULTI_KEY_LONG_TAIL_RETRY_STATS_PREFIX + clientConfig.getStoreName(), - clientConfig.getLongTailRetryBudgetEnforcementWindowInMs(), - clientConfig.getRetryBudgetPercentage(), - retryManagerExecutorService, - clientConfig.getStoreName(), - RequestType.MULTI_GET); + this.singleKeyLongTailRetryManager = statsCloseables.register( + new RetryManager( + clientConfig.getClusterStats().getMetricsRepository(), + SINGLE_KEY_LONG_TAIL_RETRY_STATS_PREFIX + clientConfig.getStoreName(), + clientConfig.getLongTailRetryBudgetEnforcementWindowInMs(), + clientConfig.getRetryBudgetPercentage(), + retryManagerExecutorService, + clientConfig.getStoreName(), + RequestType.SINGLE_GET)); + + this.multiKeyLongTailRetryManager = statsCloseables.register( + new RetryManager( + clientConfig.getClusterStats().getMetricsRepository(), + MULTI_KEY_LONG_TAIL_RETRY_STATS_PREFIX + clientConfig.getStoreName(), + clientConfig.getLongTailRetryBudgetEnforcementWindowInMs(), + clientConfig.getRetryBudgetPercentage(), + retryManagerExecutorService, + clientConfig.getStoreName(), + 
RequestType.MULTI_GET)); // Store the fixed threshold for batch get this.longTailRetryThresholdForBatchGetInMicroSeconds = @@ -300,6 +305,7 @@ protected void compute( @Override public void close() { retryManagerExecutorService.shutdownNow(); + statsCloseables.close(); super.close(); } diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/StatsAvroGenericStoreClient.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/StatsAvroGenericStoreClient.java index 3f52680ad2d..acac1ed98b6 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/StatsAvroGenericStoreClient.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/StatsAvroGenericStoreClient.java @@ -25,6 +25,7 @@ import com.linkedin.venice.stats.dimensions.HttpResponseStatusCodeCategory; import com.linkedin.venice.stats.dimensions.HttpResponseStatusEnum; import com.linkedin.venice.stats.dimensions.RejectionReason; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.LatencyUtils; import io.netty.handler.codec.http.HttpResponseStatus; import io.tehuti.metrics.MetricsRepository; @@ -39,6 +40,8 @@ * This class is in charge of all the metric emissions per request. */ public class StatsAvroGenericStoreClient extends DelegatingAvroStoreClient { + /** Stats fields owned by this class; drained by {@link #close()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final FastClientStats clientStatsForSingleGet; private final FastClientStats clientStatsForStreamingBatchGet; private final FastClientStats clientStatsForStreamingCompute; @@ -48,14 +51,31 @@ public class StatsAvroGenericStoreClient extends DelegatingAvroStoreClient public StatsAvroGenericStoreClient(InternalAvroStoreClient delegate, ClientConfig clientConfig) { super(delegate, clientConfig); - this.clientStatsForSingleGet = clientConfig.getStats(RequestType.SINGLE_GET); - this.clientStatsForStreamingBatchGet = clientConfig.getStats(RequestType.MULTI_GET_STREAMING); - this.clientStatsForStreamingCompute = clientConfig.getStats(RequestType.COMPUTE_STREAMING); - this.clusterStats = clientConfig.getClusterStats(); + this.clientStatsForSingleGet = statsCloseables.register(clientConfig.getStats(RequestType.SINGLE_GET)); + this.clientStatsForStreamingBatchGet = + statsCloseables.register(clientConfig.getStats(RequestType.MULTI_GET_STREAMING)); + this.clientStatsForStreamingCompute = + statsCloseables.register(clientConfig.getStats(RequestType.COMPUTE_STREAMING)); + this.clusterStats = statsCloseables.register(clientConfig.getClusterStats()); this.metricsRepository = clientConfig.getMetricsRepository(); + // {@link #clusterRouteStats} is intentionally NOT registered: it is a process-wide singleton retrieved via + // {@link ClusterRouteStats#getInstance(String)} and may be shared by other live clients of the same store. + // Closing it here would wipe the shared {@code perRouteStatMap} for those clients, causing recording from + // their existing {@link ClusterRouteStats.RouteStats} field references to silently no-op. The per-route SDK + // instruments leak only at process shutdown, which is acceptable given the singleton's process-lifetime + // semantics. 
this.clusterRouteStats = ClusterRouteStats.getInstance(clientConfig.getStoreName()); } + @Override + public void close() { + try { + super.close(); + } finally { + statsCloseables.close(); + } + } + @Override protected CompletableFuture get(GetRequestContext requestContext, K key) throws VeniceClientException { long startTimeInNS = System.nanoTime(); diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractClientRoutingStrategy.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractClientRoutingStrategy.java index a3728d47cb4..1c7229b888d 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractClientRoutingStrategy.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractClientRoutingStrategy.java @@ -2,11 +2,12 @@ import com.linkedin.venice.exceptions.VeniceUnsupportedOperationException; import com.linkedin.venice.fastclient.RequestContext; +import java.io.Closeable; import java.util.List; import java.util.Map; -public class AbstractClientRoutingStrategy { +public class AbstractClientRoutingStrategy implements Closeable { public String getReplicas(long requestId, int groupId, List replicas) { throw new VeniceUnsupportedOperationException("getReplicas"); } @@ -28,4 +29,13 @@ public boolean trackRequest(RequestContext requestContext) { // Do nothing by default return false; } + + /** + * Default no-op close. Subclasses that own {@link Closeable} stats (e.g. + * {@link HelixGroupRoutingStrategy#helixGroupStats}) must override to release them. 
+ */ + @Override + public void close() { + // no-op default + } } diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractStoreMetadata.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractStoreMetadata.java index 30543c94ea7..954c26080a8 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractStoreMetadata.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/AbstractStoreMetadata.java @@ -63,7 +63,10 @@ private AbstractClientRoutingStrategy getRoutingStrategy(ClientRoutingStrategyTy } public void setRoutingStrategy(ClientRoutingStrategyType strategyType) { + AbstractClientRoutingStrategy previous = this.routingStrategy; this.routingStrategy = getRoutingStrategy(strategyType); + // Close the previous strategy so any Closeable state it owns (e.g. HelixGroupStats) deregisters. + Utils.closeQuietlyWithErrorLogged(previous); LOGGER.info( "Switched to the following routing strategy: {} for store: {} and the new strategy: {}", strategyType, @@ -75,7 +78,9 @@ public void setRoutingStrategy(ClientRoutingStrategyType strategyType) { * For testing only. */ public void setRoutingStrategy(AbstractClientRoutingStrategy routingStrategy) { + AbstractClientRoutingStrategy previous = this.routingStrategy; this.routingStrategy = routingStrategy; + Utils.closeQuietlyWithErrorLogged(previous); } @Override @@ -129,6 +134,10 @@ public int getBatchGetLimit() { @Override public void close() throws IOException { Utils.closeQuietlyWithErrorLogged(instanceHealthMonitor); + // Close the routing strategy so any owned stats (e.g. HelixGroupRoutingStrategy.helixGroupStats) deregister + // their OTel async callbacks. The default {@link AbstractClientRoutingStrategy#close()} is a no-op for + // strategies that don't own Closeable state. 
+ Utils.closeQuietlyWithErrorLogged(routingStrategy); } public VeniceCompressor getCompressor( diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/HelixGroupRoutingStrategy.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/HelixGroupRoutingStrategy.java index b4062f1f941..bb8a166b589 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/HelixGroupRoutingStrategy.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/meta/HelixGroupRoutingStrategy.java @@ -5,7 +5,9 @@ import com.linkedin.venice.read.RequestType; import com.linkedin.venice.stats.routing.HelixGroupStats; import com.linkedin.venice.utils.LatencyUtils; +import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; import java.util.Collections; import java.util.List; import java.util.Map; @@ -109,4 +111,13 @@ public boolean trackRequest(RequestContext requestContext) { }); return true; } + + /** + * Closes the {@link HelixGroupStats} owned by this routing strategy so its OTel async callbacks deregister. + * Called from {@link AbstractStoreMetadata#close()} via the parent's {@link Closeable} contract. + */ + @Override + public void close() { + Utils.closeQuietlyWithErrorLogged(helixGroupStats); + } } diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterRouteStats.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterRouteStats.java index 9d983fe1d32..801aa132aae 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterRouteStats.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterRouteStats.java @@ -37,6 +37,15 @@ import org.apache.logging.log4j.Logger; +/** + * Process-wide singleton holder of {@link RouteStats} for a given store. 
Retrieved via + * {@link #getInstance(String)} and shared across every {@link com.linkedin.venice.fastclient.StatsAvroGenericStoreClient} + * for that store. The per-route map is therefore process-lifetime — it is intentionally never drained by client + * close paths, since closing it would wipe the shared map for any other live client of the same store. The + * {@link RouteStats#close()} method exists for parity with the wrapper-Closeable contract used elsewhere, but + * production callers are expected to leave per-route entries alive for the JVM lifetime; SDK instruments here are + * released at process shutdown via {@link com.linkedin.venice.stats.VeniceMetricsRepository#close()}. + */ public class ClusterRouteStats { private static final Logger LOGGER = LogManager.getLogger(ClusterRouteStats.class); @@ -144,7 +153,8 @@ public RouteStats( RouteTehutiMetricName.PENDING_REQUEST_COUNT, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); // Initialize OpenTelemetry metric for rejection ratio using ROUTE_REQUEST_REJECTION_RATIO this.rejectionRatio = MetricEntityStateOneEnum.create( @@ -154,7 +164,8 @@ public RouteStats( RouteTehutiMetricName.REJECTION_RATIO, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - RejectionReason.class); + RejectionReason.class, + resources); // Initialize OpenTelemetry metric for healthy request count using ROUTE_CALL_COUNT this.healthyRequestCount = MetricEntityStateThreeEnums.create( @@ -166,7 +177,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for quota exceeded request count using ROUTE_CALL_COUNT this.quotaExceededRequestCount = MetricEntityStateThreeEnums.create( @@ -178,7 +190,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, 
HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for internal server error request count using ROUTE_CALL_COUNT this.internalServerErrorRequestCount = MetricEntityStateThreeEnums.create( @@ -190,7 +203,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for leaked request count using ROUTE_CALL_COUNT this.leakedRequestCount = MetricEntityStateThreeEnums.create( @@ -202,7 +216,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for service unavailable request count using ROUTE_CALL_COUNT this.serviceUnavailableRequestCount = MetricEntityStateThreeEnums.create( @@ -214,7 +229,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for other error request count using ROUTE_CALL_COUNT this.otherErrorRequestCount = MetricEntityStateThreeEnums.create( @@ -226,7 +242,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // Initialize OpenTelemetry metric for response waiting time using ROUTE_CALL_TIME this.responseWaitingTime = MetricEntityStateThreeEnums.create( @@ -239,7 +256,8 @@ public RouteStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + 
resources); } public void recordRequest() { diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterStats.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterStats.java index e6b4219396b..6a933051a56 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterStats.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/ClusterStats.java @@ -77,7 +77,8 @@ public ClusterStats(MetricsRepository metricsRepository, String storeName) { ClusterTehutiMetricName.VERSION_UPDATE_FAILURE, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); this.currentVersionNumber = AsyncMetricEntityStateBase.create( STORE_VERSION_CURRENT.getMetricEntity(), @@ -90,7 +91,8 @@ public ClusterStats(MetricsRepository metricsRepository, String storeName) { ClusterTehutiMetricName.CURRENT_VERSION.getMetricName())), baseDimensionsMap, baseAttributes, - this.currentVersion::get); + this.currentVersion::get, + resources); // Initialize OTel metrics for instance error counts this.blockedInstanceErrorCount = MetricEntityStateOneEnum.create( @@ -100,7 +102,8 @@ public ClusterStats(MetricsRepository metricsRepository, String storeName) { ClusterTehutiMetricName.BLOCKED_INSTANCE_COUNT, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - InstanceErrorType.class); + InstanceErrorType.class, + resources); this.unhealthyInstanceErrorCount = MetricEntityStateOneEnum.create( INSTANCE_ERROR_COUNT.getMetricEntity(), @@ -109,7 +112,8 @@ public ClusterStats(MetricsRepository metricsRepository, String storeName) { ClusterTehutiMetricName.UNHEALTHY_INSTANCE_COUNT, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - InstanceErrorType.class); + InstanceErrorType.class, + resources); this.overloadedInstanceErrorCount = MetricEntityStateOneEnum.create( INSTANCE_ERROR_COUNT.getMetricEntity(), @@ -118,7 +122,8 @@ 
public ClusterStats(MetricsRepository metricsRepository, String storeName) { ClusterTehutiMetricName.OVERLOADED_INSTANCE_COUNT, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - InstanceErrorType.class); + InstanceErrorType.class, + resources); } public void recordBlockedInstanceCount(int count) { diff --git a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/FastClientStats.java b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/FastClientStats.java index 82aeeae5cbd..1041d2378d5 100644 --- a/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/FastClientStats.java +++ b/clients/venice-client/src/main/java/com/linkedin/venice/fastclient/stats/FastClientStats.java @@ -114,7 +114,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.NO_AVAILABLE_REPLICA_REQUEST_COUNT, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - RejectionReason.class); + RejectionReason.class, + resources); this.rejectedRequestCountByLoadController = MetricEntityStateOneEnum.create( FastClientMetricEntity.REQUEST_REJECTION_COUNT.getMetricEntity(), @@ -123,7 +124,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.REJECTED_REQUEST_COUNT_BY_LOAD_CONTROLLER, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - RejectionReason.class); + RejectionReason.class, + resources); this.rejectionRatio = MetricEntityStateOneEnum.create( FastClientMetricEntity.REQUEST_REJECTION_RATIO.getMetricEntity(), @@ -132,7 +134,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.REJECTION_RATIO, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - RejectionReason.class); + RejectionReason.class, + resources); this.longTailRetry = MetricEntityStateOneEnum.create( RETRY_CALL_COUNT.getMetricEntity(), @@ -141,7 +144,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.LONG_TAIL_RETRY_REQUEST, Collections.singletonList(new 
OccurrenceRate()), baseDimensionsMap, - RequestRetryType.class); + RequestRetryType.class, + resources); this.errorRetry = MetricEntityStateOneEnum.create( RETRY_CALL_COUNT.getMetricEntity(), otelRepository, @@ -149,7 +153,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.ERROR_RETRY_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - RequestRetryType.class); + RequestRetryType.class, + resources); this.retryRequestWin = MetricEntityStateBase.create( RETRY_REQUEST_WIN_COUNT.getMetricEntity(), @@ -158,7 +163,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.RETRY_REQUEST_WIN, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - getBaseAttributes()); + getBaseAttributes(), + resources); // OTel: fanout_size (MIN_MAX_COUNT_SUM_AGGREGATIONS) with dimensions: venice.store.name, venice.request.method, // venice.request.fanout_type @@ -169,7 +175,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.RETRY_FANOUT_SIZE, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - RequestFanoutType.class); + RequestFanoutType.class, + resources); this.originalFanoutSize = MetricEntityStateOneEnum.create( REQUEST_FANOUT_COUNT.getMetricEntity(), @@ -178,7 +185,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.FANOUT_SIZE, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - RequestFanoutType.class); + RequestFanoutType.class, + resources); Map metadataStalenessDims = null; Attributes metadataStalenessAttrs = null; @@ -212,7 +220,8 @@ private void buildFastClientOtelStats() { FastClientTehutiMetricName.METADATA_STALENESS_HIGH_WATERMARK_MS.getMetricName())), metadataStalenessDims, metadataStalenessAttrs, - () -> this.cacheTimeStampInMs == 0 ? 0 : (System.currentTimeMillis() - this.cacheTimeStampInMs)); + () -> this.cacheTimeStampInMs == 0 ? 
0 : (System.currentTimeMillis() - this.cacheTimeStampInMs), + resources); } @Override diff --git a/clients/venice-producer/src/main/java/com/linkedin/venice/producer/PartitionedProducerExecutor.java b/clients/venice-producer/src/main/java/com/linkedin/venice/producer/PartitionedProducerExecutor.java index 2d05a7e0849..2fb3e9396a3 100644 --- a/clients/venice-producer/src/main/java/com/linkedin/venice/producer/PartitionedProducerExecutor.java +++ b/clients/venice-producer/src/main/java/com/linkedin/venice/producer/PartitionedProducerExecutor.java @@ -1,6 +1,7 @@ package com.linkedin.venice.producer; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import io.tehuti.metrics.MetricsRepository; import java.util.ArrayList; @@ -52,7 +53,7 @@ *
  • workerCount>0, callbackThreadCount>0: Full async - parallel workers + callback isolation
  • * */ -public class PartitionedProducerExecutor { +public class PartitionedProducerExecutor extends AbstractStatsCloseable { private static final Logger LOGGER = LogManager.getLogger(PartitionedProducerExecutor.class); /** @@ -132,7 +133,8 @@ public PartitionedProducerExecutor( new BlockingRejectionHandler(workerName)); if (metricsRepository != null) { - new ThreadPoolStats(metricsRepository, workers[i], storeName + "_producer_worker_" + i); + statsCloseables + .register(new ThreadPoolStats(metricsRepository, workers[i], storeName + "_producer_worker_" + i)); } } LOGGER.info( @@ -158,7 +160,8 @@ public PartitionedProducerExecutor( new BlockingRejectionHandler(callbackPoolName)); if (metricsRepository != null) { - new ThreadPoolStats(metricsRepository, callbackExecutor, storeName + "_producer_callback_pool"); + statsCloseables + .register(new ThreadPoolStats(metricsRepository, callbackExecutor, storeName + "_producer_callback_pool")); } LOGGER.info( "Created callback executor for store {} with {} threads and queue capacity {}", @@ -295,6 +298,9 @@ public void shutdown() { if (callbackExecutor != null) { callbackExecutor.shutdown(); } + // Close every ThreadPoolStats registered at construction (3 ASYNC_GAUGE callbacks per pool) so the SDK + // stops polling them after the executor is shut down. 
+ statsCloseables.close(); } /** @@ -311,6 +317,7 @@ public void shutdownNow() { if (callbackExecutor != null) { callbackExecutor.shutdownNow(); } + statsCloseables.close(); } /** diff --git a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/BasicClientStats.java b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/BasicClientStats.java index e5c1cf0d626..2e8668482b3 100644 --- a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/BasicClientStats.java +++ b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/BasicClientStats.java @@ -174,7 +174,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.HEALTHY_REQUEST, Collections.singletonList(healthyRequestRate), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); unhealthyRequestMetricForDavinciClient = MetricEntityStateOneEnum.create( BasicClientMetricEntity.CALL_COUNT_DVC.getMetricEntity(), otelRepository, @@ -182,7 +183,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.UNHEALTHY_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // latency healthyLatencyMetricForDavinciClient = MetricEntityStateOneEnum.create( @@ -196,7 +198,8 @@ private void buildBasicClientOtelStats() { getName(), getFullMetricName(BasicClientTehutiMetricName.HEALTHY_REQUEST_LATENCY.getMetricName()))), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); unhealthyLatencyMetricForDavinciClient = MetricEntityStateOneEnum.create( BasicClientMetricEntity.CALL_TIME_DVC.getMetricEntity(), otelRepository, @@ -208,7 +211,8 @@ private void buildBasicClientOtelStats() { getName(), getFullMetricName(BasicClientTehutiMetricName.UNHEALTHY_REQUEST_LATENCY.getMetricName()))), baseDimensionsMap, 
- VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); healthyRequestMetric = null; unhealthyRequestMetric = null; @@ -222,7 +226,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.REQUEST_KEY_COUNT, Arrays.asList(requestKeyCountRate, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); successResponseKeyCount = MetricEntityStateBase.create( BasicClientMetricEntity.RESPONSE_KEY_COUNT_DVC.getMetricEntity(), otelRepository, @@ -230,7 +235,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.SUCCESS_REQUEST_KEY_COUNT, Arrays.asList(successRequestKeyCountRate, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { healthyRequestMetric = MetricEntityStateThreeEnums.create( BasicClientMetricEntity.CALL_COUNT.getMetricEntity(), @@ -241,7 +247,8 @@ private void buildBasicClientOtelStats() { baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); unhealthyRequestMetric = MetricEntityStateThreeEnums.create( BasicClientMetricEntity.CALL_COUNT.getMetricEntity(), otelRepository, @@ -251,7 +258,8 @@ private void buildBasicClientOtelStats() { baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); // latency healthyLatencyMetric = MetricEntityStateFourEnums.create( BasicClientMetricEntity.CALL_TIME.getMetricEntity(), @@ -267,7 +275,8 @@ private void buildBasicClientOtelStats() { HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, VeniceResponseStatusCategory.class, - VeniceRequestKeyCountBucket.class); + VeniceRequestKeyCountBucket.class, + resources); unhealthyLatencyMetric = MetricEntityStateFourEnums.create( 
BasicClientMetricEntity.CALL_TIME.getMetricEntity(), otelRepository, @@ -282,7 +291,8 @@ private void buildBasicClientOtelStats() { HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, VeniceResponseStatusCategory.class, - VeniceRequestKeyCountBucket.class); + VeniceRequestKeyCountBucket.class, + resources); healthyRequestMetricForDavinciClient = null; unhealthyRequestMetricForDavinciClient = null; healthyLatencyMetricForDavinciClient = null; @@ -295,7 +305,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.REQUEST_KEY_COUNT, Arrays.asList(requestKeyCountRate, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); successResponseKeyCount = MetricEntityStateBase.create( BasicClientMetricEntity.RESPONSE_KEY_COUNT.getMetricEntity(), otelRepository, @@ -303,7 +314,8 @@ private void buildBasicClientOtelStats() { BasicClientTehutiMetricName.SUCCESS_REQUEST_KEY_COUNT, Arrays.asList(successRequestKeyCountRate, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } } diff --git a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/ClientStats.java b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/ClientStats.java index e78eb4c71ba..bbf385bf71c 100644 --- a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/ClientStats.java +++ b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/stats/ClientStats.java @@ -108,7 +108,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.REQUEST_RETRY_COUNT, Collections.singletonList(requestRetryCountRate), baseDimensionsMap, - RequestRetryType.class); + RequestRetryType.class, + resources); successRequestDuplicateKeyCount = MetricEntityStateOneEnum.create( ClientMetricEntity.REQUEST_DUPLICATE_KEY_COUNT.getMetricEntity(), @@ -117,7 +118,8 @@ private void buildClientOtelStats() { SUCCESS_REQUEST_DUPLICATE_KEY_COUNT, 
Collections.singletonList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); /** * The time it took to serialize the request, to be sent to the router. This is done in a blocking fashion * on the caller's thread. @@ -129,7 +131,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.REQUEST_SERIALIZATION_TIME, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); /** * The time it took between sending the request to the router and beginning to process the response. @@ -141,7 +144,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.REQUEST_SUBMISSION_TO_RESPONSE_HANDLING_TIME, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); /** * The total time it took to process the response (deserialization). @@ -153,7 +157,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RESPONSE_DESERIALIZATION_TIME, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); // response decompression time responseDecompressionTime = MetricEntityStateBase.create( @@ -163,7 +168,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RESPONSE_DECOMPRESSION_TIME, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); /** * Metrics to track the latency of each proportion of results received. 
@@ -175,7 +181,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RESPONSE_TTFR, Collections.singletonList(new Avg()), baseDimensionsMap, - StreamProgress.class); + StreamProgress.class, + resources); // TT50PR batchStreamProgressTimeToReceiveP50thRecord = MetricEntityStateOneEnum.create( ClientMetricEntity.RESPONSE_BATCH_STREAM_PROGRESS_TIME.getMetricEntity(), @@ -184,7 +191,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RESPONSE_TT50PR, Collections.singletonList(new Avg()), baseDimensionsMap, - StreamProgress.class); + StreamProgress.class, + resources); // TT90PR batchStreamProgressTimeToReceiveP90thRecord = MetricEntityStateOneEnum.create( ClientMetricEntity.RESPONSE_BATCH_STREAM_PROGRESS_TIME.getMetricEntity(), @@ -193,7 +201,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RESPONSE_TT90PR, Collections.singletonList(new Avg()), baseDimensionsMap, - StreamProgress.class); + StreamProgress.class, + resources); /** * Metrics to track the timed-out requests. 
@@ -210,7 +219,8 @@ private void buildClientOtelStats() { APP_TIMED_OUT_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); appTimedOutRequestResultRatio = MetricEntityStateBase.create( ClientMetricEntity.REQUEST_TIMEOUT_PARTIAL_RESPONSE_RATIO.getMetricEntity(), @@ -219,7 +229,8 @@ private void buildClientOtelStats() { APP_TIMED_OUT_REQUEST_RESULT_RATIO, Arrays.asList(new Avg(), new Min(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); clientFutureTimeout = MetricEntityStateBase.create( ClientMetricEntity.REQUEST_TIMEOUT_REQUESTED_DURATION.getMetricEntity(), @@ -228,7 +239,8 @@ private void buildClientOtelStats() { CLIENT_FUTURE_TIMEOUT, Arrays.asList(new Avg(), new Min(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); /* Metrics relevant to track long tail retry efficacy for batch get*/ retryKeyCount = MetricEntityStateBase.create( @@ -238,7 +250,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RETRY_REQUEST_KEY_COUNT, Arrays.asList(retryRequestKeyCount, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); retrySuccessKeyCount = MetricEntityStateBase.create( ClientMetricEntity.RETRY_RESPONSE_KEY_COUNT.getMetricEntity(), @@ -247,7 +260,8 @@ private void buildClientOtelStats() { ClientTehutiMetricName.RETRY_REQUEST_SUCCESS_KEY_COUNT, Arrays.asList(retryRequestSuccessKeyCount, new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } @Override diff --git a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/store/StatTrackingStoreClient.java b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/store/StatTrackingStoreClient.java index f24fe6df025..4d93a410f1c 100644 --- a/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/store/StatTrackingStoreClient.java +++ 
b/clients/venice-thin-client/src/main/java/com/linkedin/venice/client/store/StatTrackingStoreClient.java @@ -30,6 +30,7 @@ import java.util.concurrent.ConcurrentLinkedQueue; import java.util.function.BiFunction; import org.apache.avro.Schema; +import org.apache.commons.io.IOUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -115,6 +116,20 @@ private void onClusterNameUpdated(String newClusterName) { LOGGER); } + @Override + public void close() { + try { + super.close(); + } finally { + IOUtils.closeQuietly(singleGetStats, LOGGER::error); + IOUtils.closeQuietly(multiGetStats, LOGGER::error); + IOUtils.closeQuietly(multiGetStreamingStats, LOGGER::error); + IOUtils.closeQuietly(schemaReaderStats, LOGGER::error); + IOUtils.closeQuietly(computeStats, LOGGER::error); + IOUtils.closeQuietly(computeStreamingStats, LOGGER::error); + } + } + @Override public CompletableFuture get(K key) { long startTimeInNS = System.nanoTime(); diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java index 82fb1905c36..5d162ca060a 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStats.java @@ -1,11 +1,13 @@ package com.linkedin.venice.stats; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; import java.util.Map; -public abstract class AbstractVeniceAggStats { +public abstract class AbstractVeniceAggStats implements Closeable { public final static String STORE_NAME_FOR_TOTAL_STAT = "total"; protected T totalStats; protected final Map storeStats = new VeniceConcurrentHashMap<>(); @@ -70,4 +72,12 
@@ public T getNullableStoreStats(String storeName) { public T getTotalStats() { return totalStats; } + + /** Closes {@link #totalStats} and every per-store entry in {@link #storeStats}. */ + @Override + public void close() { + MetricEntityStateUtils.closeQuietly(totalStats); + storeStats.values().forEach(MetricEntityStateUtils::closeQuietly); + storeStats.clear(); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceStats.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceStats.java index fbbd543c954..90eac02dcd6 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceStats.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceStats.java @@ -3,6 +3,7 @@ import static com.linkedin.venice.stats.AbstractVeniceAggStats.STORE_NAME_FOR_TOTAL_STAT; import com.linkedin.venice.stats.metrics.AsyncMetricEntityState.TehutiSensorRegistrationFunction; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.stats.metrics.MetricEntityState; import com.linkedin.venice.stats.metrics.MetricEntityStateBase; import com.linkedin.venice.utils.Time; @@ -20,11 +21,12 @@ import io.tehuti.metrics.stats.Percentiles; import io.tehuti.metrics.stats.Rate; import io.tehuti.metrics.stats.Total; +import java.io.Closeable; import java.util.Map; import java.util.function.Supplier; -public class AbstractVeniceStats { +public class AbstractVeniceStats implements Closeable { public static final String DELIMITER = "--"; private final MetricsRepository metricsRepository; @@ -34,6 +36,8 @@ public class AbstractVeniceStats { private final boolean isTehutiMetricsEnabled; /** A dummy sensor to return when Tehuti metrics are disabled */ private final Sensor noopSensor; + /** Wrappers and other Closeable resources owned by this stats instance; drained by {@link #close()}. 
*/ + protected final CompositeCloseable resources = new CompositeCloseable(); public AbstractVeniceStats(MetricsRepository metricsRepository, String name) { this.metricsRepository = metricsRepository; @@ -320,4 +324,10 @@ protected final MeasurableStat[] minAndMax() { protected final MeasurableStat[] avgAndTotal() { return new MeasurableStat[] { new Avg(), new Total() }; } + + /** Drains every {@link Closeable} registered into {@link #resources}. Idempotent. */ + @Override + public void close() { + resources.close(); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/OpenTelemetryMetricsSetup.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/OpenTelemetryMetricsSetup.java index 454652f966d..7c1bc7a9d2d 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/OpenTelemetryMetricsSetup.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/OpenTelemetryMetricsSetup.java @@ -1,5 +1,7 @@ package com.linkedin.venice.stats; +import static com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions.VENICE_STORE_NAME; + import com.linkedin.venice.read.RequestType; import com.linkedin.venice.stats.dimensions.RequestRetryType; import com.linkedin.venice.stats.dimensions.VeniceDimensionInterface; @@ -32,6 +34,21 @@ public static String sanitizeStoreName(String storeName) { return (trimmed == null || trimmed.isEmpty()) ? UNKNOWN_STORE_NAME : trimmed; } + /** + * Returns a mutable copy of {@code baseDimensionsMap} with the {@code VENICE_STORE_NAME} dimension + * set to the sanitized {@code storeName}. Used by per-store stats classes that maintain a + * {@code Map} keyed by store name and need to materialise per-store dimensions on + * first creation. The returned map is mutable so callers can layer additional per-call + * dimensions on top without copying again. 
+ */ + public static Map buildStoreDimensionsMap( + Map baseDimensionsMap, + String storeName) { + Map dims = new HashMap<>(baseDimensionsMap); + dims.put(VENICE_STORE_NAME, sanitizeStoreName(storeName)); + return dims; + } + /** * Result object containing the setup OpenTelemetry components. * diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/ThreadPoolStats.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/ThreadPoolStats.java index 12021bb60fa..304377f1c4e 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/ThreadPoolStats.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/ThreadPoolStats.java @@ -17,6 +17,9 @@ public class ThreadPoolStats extends AbstractVeniceStats { private final ThreadPoolExecutor threadPoolExecutor; + private final AsyncMetricEntityStateBase activeThreadCountMetric; + private final AsyncMetricEntityStateBase maxThreadCountMetric; + private final AsyncMetricEntityStateBase queueTaskCountMetric; private final MetricEntityStateBase queuedTasksCountMetric; public ThreadPoolStats(MetricsRepository metricsRepository, ThreadPoolExecutor threadPoolExecutor, String name) { @@ -36,26 +39,29 @@ public ThreadPoolStats(MetricsRepository metricsRepository, ThreadPoolExecutor t OpenTelemetryMetricsSetup.builder(metricsRepository).setThreadPoolName(name).build(); // OTel async gauges for thread pool metrics - AsyncMetricEntityStateBase.create( + activeThreadCountMetric = AsyncMetricEntityStateBase.create( ThreadPoolOtelMetricEntity.THREAD_POOL_THREAD_ACTIVE_COUNT.getMetricEntity(), otelData.getOtelRepository(), otelData.getBaseDimensionsMap(), otelData.getBaseAttributes(), - () -> this.threadPoolExecutor.getActiveCount()); + () -> this.threadPoolExecutor.getActiveCount(), + resources); - AsyncMetricEntityStateBase.create( + maxThreadCountMetric = AsyncMetricEntityStateBase.create( 
ThreadPoolOtelMetricEntity.THREAD_POOL_THREAD_MAX_COUNT.getMetricEntity(), otelData.getOtelRepository(), otelData.getBaseDimensionsMap(), otelData.getBaseAttributes(), - () -> this.threadPoolExecutor.getMaximumPoolSize()); + () -> this.threadPoolExecutor.getMaximumPoolSize(), + resources); - AsyncMetricEntityStateBase.create( + queueTaskCountMetric = AsyncMetricEntityStateBase.create( ThreadPoolOtelMetricEntity.THREAD_POOL_QUEUE_TASK_COUNT.getMetricEntity(), otelData.getOtelRepository(), otelData.getBaseDimensionsMap(), otelData.getBaseAttributes(), - () -> this.threadPoolExecutor.getQueue().size()); + () -> this.threadPoolExecutor.getQueue().size(), + resources); /** * If only registered as Gauge, the metric would show the queue size at the time of the metric collection, which is not @@ -71,7 +77,8 @@ public ThreadPoolStats(MetricsRepository metricsRepository, ThreadPoolExecutor t ThreadPoolTehutiMetricNameEnum.QUEUED_TASK_COUNT, Arrays.asList(new Avg(), new Max()), otelData.getBaseDimensionsMap(), - otelData.getBaseAttributes()); + otelData.getBaseAttributes(), + resources); } /** diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java index e073d52fd72..67e952104e3 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceMetricsRepository.java @@ -68,9 +68,13 @@ public VeniceMetricsRepository cloneWithNewMetricPrefix(String newMetricPrefix) @Override public void close() { - super.close(); - if (openTelemetryMetricsRepository != null) { - openTelemetryMetricsRepository.close(); + try { + super.close(); + } finally { + // Shut down OTel even if Tehuti close() throws, so the exporter thread cannot block JVM exit. 
+ if (openTelemetryMetricsRepository != null) { + openTelemetryMetricsRepository.close(); + } } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java index 50d6248eb7c..9c8c55d1927 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepository.java @@ -8,6 +8,7 @@ import com.linkedin.venice.exceptions.VeniceException; import com.linkedin.venice.stats.dimensions.VeniceDimensionInterface; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.stats.metrics.MetricEntity; import com.linkedin.venice.stats.metrics.MetricEntityStateGeneric; import com.linkedin.venice.stats.metrics.MetricType; @@ -49,6 +50,7 @@ import io.opentelemetry.sdk.metrics.export.PeriodicMetricReader; import io.opentelemetry.sdk.resources.Resource; import io.tehuti.utils.RedundantLogFilter; +import java.io.Closeable; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -63,18 +65,20 @@ import org.apache.logging.log4j.Logger; -public class VeniceOpenTelemetryMetricsRepository { +public class VeniceOpenTelemetryMetricsRepository implements Closeable { private static final Logger LOGGER = LogManager.getLogger(VeniceOpenTelemetryMetricsRepository.class); public static final RedundantLogFilter REDUNDANT_LOG_FILTER = RedundantLogFilter.getRedundantLogFilter(); public static final String DEFAULT_METRIC_PREFIX = "venice."; /** Custom metric prefix for internal infrastructure counters (yields {@code venice.internal.*}). 
*/ private static final String INTERNAL_METRIC_PREFIX = "internal"; + /** Bounded wait for SDK MeterProvider shutdown — long enough for periodic exporters to flush, short enough to not stall JVM exit. */ + private static final long SHUTDOWN_TIMEOUT_SECONDS = 10; private final VeniceMetricsConfig metricsConfig; /** OpenTelemetry instance: Either created or retrieved from GlobalOpenTelemetry set by the application */ private final OpenTelemetry openTelemetry; /** SdkMeterProvider that is used to create the OpenTelemetry instance */ - private SdkMeterProvider sdkMeterProvider = null; + private volatile SdkMeterProvider sdkMeterProvider = null; private final boolean emitOpenTelemetryMetrics; private final boolean emitTehutiMetrics; @@ -89,6 +93,12 @@ public class VeniceOpenTelemetryMetricsRepository { */ private MetricEntityStateGeneric recordFailureMetric; + /** + * Owns the lifecycle of internally-bootstrapped metric wrappers (e.g., {@link #recordFailureMetric}). + * Closed from {@link #close()} together with the SDK meter provider shutdown. 
+ */ + private final CompositeCloseable resources = new CompositeCloseable(); + public VeniceOpenTelemetryMetricsRepository(VeniceMetricsConfig metricsConfig) { this.metricsConfig = metricsConfig; emitOpenTelemetryMetrics = metricsConfig.emitOtelMetrics(); @@ -104,7 +114,7 @@ public VeniceOpenTelemetryMetricsRepository(VeniceMetricsConfig metricsConfig) { this.openTelemetry = initializeOpenTelemetry(metricsConfig); this.meter = openTelemetry.getMeter(transformMetricName(getMetricPrefix(), metricFormat)); this.recordFailureMetric = MetricEntityStateGeneric - .create(CommonMetricsEntity.METRIC_RECORD_FAILURE.getMetricEntity(), this, Collections.emptyMap()); + .create(CommonMetricsEntity.METRIC_RECORD_FAILURE.getMetricEntity(), this, Collections.emptyMap(), resources); } /** @@ -142,7 +152,7 @@ private VeniceOpenTelemetryMetricsRepository(VeniceOpenTelemetryMetricsRepositor // Create a new Meter with the new prefix this.meter = openTelemetry.getMeter(transformMetricName(getMetricPrefix(), metricFormat)); this.recordFailureMetric = MetricEntityStateGeneric - .create(CommonMetricsEntity.METRIC_RECORD_FAILURE.getMetricEntity(), this, Collections.emptyMap()); + .create(CommonMetricsEntity.METRIC_RECORD_FAILURE.getMetricEntity(), this, Collections.emptyMap(), resources); } LOGGER.info("Created child VeniceOpenTelemetryMetricsRepository with metric prefix: {}", newMetricPrefix); } @@ -692,10 +702,26 @@ public Attributes createAttributes( return attributesBuilder.build(); } + @Override public void close() { - if (sdkMeterProvider != null) { - sdkMeterProvider.shutdown(); - sdkMeterProvider = null; + try { + resources.close(); + } finally { + SdkMeterProvider provider = sdkMeterProvider; + if (provider != null) { + sdkMeterProvider = null; + // shutdown() is async; join with a bounded timeout so exporter failures surface here + // rather than racing past in-flight exports. 
+ CompletableResultCode shutdownResult = provider.shutdown(); + shutdownResult.join(SHUTDOWN_TIMEOUT_SECONDS, TimeUnit.SECONDS); + if (!shutdownResult.isSuccess()) { + LOGGER.warn( + "OTel SDK MeterProvider shutdown did not complete cleanly within {}s (done={}, success={})", + SHUTDOWN_TIMEOUT_SECONDS, + shutdownResult.isDone(), + shutdownResult.isSuccess()); + } + } } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AbstractStatsCloseable.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AbstractStatsCloseable.java new file mode 100644 index 00000000000..d354b1ec20a --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AbstractStatsCloseable.java @@ -0,0 +1,18 @@ +package com.linkedin.venice.stats.metrics; + +import java.io.Closeable; + + +/** + * Base class for stats holders that own a {@link CompositeCloseable} registry. Subclasses inherit + * the {@code statsCloseables} field and a {@link #close()} that drains it. Override {@code close()} + * to add extra cleanup; remember to call {@code super.close()} so the registry is drained. 
+ */ +public abstract class AbstractStatsCloseable implements Closeable { + protected final CompositeCloseable statsCloseables = new CompositeCloseable(); + + @Override + public void close() { + statsCloseables.close(); + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityState.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityState.java index 4f5c750ad5c..2c47a436dfa 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityState.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityState.java @@ -8,6 +8,7 @@ import io.tehuti.metrics.MeasurableStat; import io.tehuti.metrics.Sensor; import io.tehuti.metrics.stats.AsyncGauge; +import java.io.Closeable; import java.util.Arrays; import java.util.HashSet; import java.util.List; @@ -42,17 +43,17 @@ * different Tehuti sensor. * */ -public abstract class AsyncMetricEntityState { +public abstract class AsyncMetricEntityState implements Closeable { private final boolean emitOpenTelemetryMetrics; private final boolean emitTehutiMetrics; protected final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; protected final MetricEntity metricEntity; - /** Otel metric */ - protected Object otelMetric = null; - /** Respective tehuti metric */ - protected Sensor tehutiSensor = null; + /** OTel SDK instrument handle. Volatile so {@link #close()} nulling it is promptly visible to recording threads. */ + protected volatile Object otelMetric = null; + /** The Tehuti sensor. Volatile for the same reason as {@link #otelMetric}. 
*/ + protected volatile Sensor tehutiSensor = null; public AsyncMetricEntityState( MetricEntity metricEntity, @@ -398,4 +399,20 @@ public Sensor getTehutiSensor() { public Object getOtelMetric() { return otelMetric; } + + /** + * Releases this wrapper's OTel resources. For async wrappers this deregisters the SDK callback so + * it stops being polled; for sync wrappers it only releases the wrapper-side reference (the SDK + * aggregator persists until MeterProvider close). Idempotent and best-effort — SDK close + * exceptions are logged at WARN and swallowed. Post-close {@code record()} is a silent no-op. + * Not concurrent-safe with {@code record()}; callers must coordinate. + */ + @Override + public void close() { + // Snapshot before the helper call so a concurrent second close() sees null here and skips. + Object localInstrument = otelMetric; + otelMetric = null; + tehutiSensor = null; + MetricEntityStateUtils.closeOtelInstrumentQuietly(localInstrument, metricEntity); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateBase.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateBase.java index dec11673561..f04b86223c1 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateBase.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateBase.java @@ -71,31 +71,33 @@ private void validateBaseAttributes( if (emitOpenTelemetryMetrics()) { Validate.notNull( baseAttributes, - "Base attributes cannot be null for MetricEntityStateBase for metric: " + metricEntity.getMetricName()); + "Base attributes cannot be null for AsyncMetricEntityStateBase for metric: " + metricEntity.getMetricName()); } } // --- LongSupplier factory methods (for ASYNC_GAUGE) --- - /** Factory method for OTel-only ASYNC_GAUGE with LongSupplier callback */ + /** Factory method 
for OTel-only ASYNC_GAUGE with LongSupplier callback. */ public static AsyncMetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, Attributes baseAttributes, - @Nonnull LongSupplier asyncCallback) { - return new AsyncMetricEntityStateBase( - metricEntity, - otelRepository, - null, - null, - Collections.emptyList(), - baseDimensionsMap, - baseAttributes, - asyncCallback); + @Nonnull LongSupplier asyncCallback, + CompositeCloseable registry) { + return registry.register( + new AsyncMetricEntityStateBase( + metricEntity, + otelRepository, + null, + null, + Collections.emptyList(), + baseDimensionsMap, + baseAttributes, + asyncCallback)); } - /** Factory method for joint Tehuti+OTel ASYNC_GAUGE with LongSupplier callback */ + /** Factory method for joint Tehuti+OTel ASYNC_GAUGE with LongSupplier callback. */ public static AsyncMetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -104,39 +106,43 @@ public static AsyncMetricEntityStateBase create( List tehutiMetricStats, Map baseDimensionsMap, Attributes baseAttributes, - @Nonnull LongSupplier asyncCallback) { - return new AsyncMetricEntityStateBase( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - baseAttributes, - asyncCallback); + @Nonnull LongSupplier asyncCallback, + CompositeCloseable registry) { + return registry.register( + new AsyncMetricEntityStateBase( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + baseAttributes, + asyncCallback)); } // --- DoubleSupplier factory methods (for ASYNC_DOUBLE_GAUGE) --- - /** Factory method for OTel-only ASYNC_DOUBLE_GAUGE with DoubleSupplier callback */ + /** Factory method for OTel-only ASYNC_DOUBLE_GAUGE with DoubleSupplier callback. 
*/ public static AsyncMetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, Attributes baseAttributes, - @Nonnull DoubleSupplier asyncDoubleCallback) { - return new AsyncMetricEntityStateBase( - metricEntity, - otelRepository, - null, - null, - Collections.emptyList(), - baseDimensionsMap, - baseAttributes, - asyncDoubleCallback); + @Nonnull DoubleSupplier asyncDoubleCallback, + CompositeCloseable registry) { + return registry.register( + new AsyncMetricEntityStateBase( + metricEntity, + otelRepository, + null, + null, + Collections.emptyList(), + baseDimensionsMap, + baseAttributes, + asyncDoubleCallback)); } - /** Factory method for joint Tehuti+OTel ASYNC_DOUBLE_GAUGE with DoubleSupplier callback */ + /** Factory method for joint Tehuti+OTel ASYNC_DOUBLE_GAUGE with DoubleSupplier callback. */ public static AsyncMetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -145,15 +151,17 @@ public static AsyncMetricEntityStateBase create( List tehutiMetricStats, Map baseDimensionsMap, Attributes baseAttributes, - @Nonnull DoubleSupplier asyncDoubleCallback) { - return new AsyncMetricEntityStateBase( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - baseAttributes, - asyncDoubleCallback); + @Nonnull DoubleSupplier asyncDoubleCallback, + CompositeCloseable registry) { + return registry.register( + new AsyncMetricEntityStateBase( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + baseAttributes, + asyncDoubleCallback)); } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnum.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnum.java index 
52e002bb0bb..48b28b3f615 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnum.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnum.java @@ -6,6 +6,7 @@ import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.LiveStateResolverOneEnum; import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.ValueResolverOneEnum; import io.opentelemetry.api.common.Attributes; +import java.io.Closeable; import java.util.EnumMap; import java.util.Map; import java.util.function.ObjDoubleConsumer; @@ -36,18 +37,27 @@ * cost is {@code O(|E|)} {@code liveStateResolver} calls plus one {@code measurement.record(...)} * per emitted combo. */ -public class AsyncMetricEntityStateOneEnum & VeniceDimensionInterface> { +public class AsyncMetricEntityStateOneEnum & VeniceDimensionInterface> implements Closeable { private final boolean emitOpenTelemetryMetrics; - /** Precomputed per-enum attributes; {@code null} when OTel is disabled. */ - private final EnumMap attributesByEnum; - /** The single SDK instrument; retained so the SDK keeps the callback referenced. */ - private final Object instrument; + private final MetricEntity metricEntity; + /** + * Precomputed per-enum attributes; {@code null} when OTel is disabled or after {@link #close()}. + * Volatile so close()'s nulling is promptly visible. + */ + private volatile EnumMap attributesByEnum; + /** + * The single SDK instrument; retained so the SDK keeps the callback referenced. Nulled by + * {@link #close()}. Volatile so close()'s nulling is promptly visible. 
+ */ + private volatile Object instrument; private AsyncMetricEntityStateOneEnum( boolean emitOpenTelemetryMetrics, + MetricEntity metricEntity, EnumMap attributesByEnum, Object instrument) { this.emitOpenTelemetryMetrics = emitOpenTelemetryMetrics; + this.metricEntity = metricEntity; this.attributesByEnum = attributesByEnum; this.instrument = instrument; } @@ -66,6 +76,7 @@ private AsyncMetricEntityStateOneEnum( * * @param the state type returned by {@code liveStateResolver}. Can be any reference type * (wrapper, task, counter, etc.) — the infra never inspects it beyond null-check. + * @param registry closes the returned wrapper at shutdown; pass {@link CompositeCloseable#NONE} for tests. */ public static & VeniceDimensionInterface, S> AsyncMetricEntityStateOneEnum create( MetricEntity metricEntity, @@ -73,7 +84,8 @@ public static & VeniceDimensionInterface, S> AsyncMetricEntit Map baseDimensionsMap, Class enumTypeClass, LiveStateResolverOneEnum liveStateResolver, - ValueResolverOneEnum valueResolver) { + ValueResolverOneEnum valueResolver, + CompositeCloseable registry) { MetricType metricType = metricEntity.getMetricType(); if (metricType != MetricType.ASYNC_GAUGE && metricType != MetricType.ASYNC_DOUBLE_GAUGE) { throw new IllegalArgumentException( @@ -84,7 +96,7 @@ public static & VeniceDimensionInterface, S> AsyncMetricEntit // If OTel is disabled (or no repo supplied), short-circuit boolean emitOtel = otelRepository != null && otelRepository.emitOpenTelemetryMetrics(); if (!emitOtel) { - return new AsyncMetricEntityStateOneEnum<>(false, null, null); + return registry.register(new AsyncMetricEntityStateOneEnum<>(false, metricEntity, null, null)); } /* @@ -134,7 +146,7 @@ public static & VeniceDimensionInterface, S> AsyncMetricEntit (attrs, value) -> measurement.record((long) value, attrs))); } - return new AsyncMetricEntityStateOneEnum<>(true, attributesByEnum, instrument); + return registry.register(new AsyncMetricEntityStateOneEnum<>(true, metricEntity, 
attributesByEnum, instrument)); } /** @@ -177,4 +189,21 @@ public EnumMap getAttributesByEnum() { public Object getInstrument() { return instrument; } + + /** + * Deregisters the underlying SDK observable gauge and releases the cached per-enum {@link Attributes}. + * Closes the callback for ALL enum values — use {@code liveStateResolver} returning {@code null} + * for per-combo dormancy. Idempotent and best-effort. + */ + @Override + public void close() { + // Snapshot the volatile field before the helper call so a second concurrent close() cannot + // observe the field non-null in instanceof and then invoke close() on a now-null reference + // and emit a misleading "OTel SDK close threw" WARN. Idempotency is preserved: the second + // close sees null here and skips the SDK call. + Object localInstrument = instrument; + instrument = null; + attributesByEnum = null; + MetricEntityStateUtils.closeOtelInstrumentQuietly(localInstrument, metricEntity); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnums.java index 31bd8eed731..25eb207f790 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnums.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnums.java @@ -6,6 +6,7 @@ import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.LiveStateResolverTwoEnums; import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.ValueResolverTwoEnums; import io.opentelemetry.api.common.Attributes; +import java.io.Closeable; import java.util.EnumMap; import java.util.Map; import java.util.function.ObjDoubleConsumer; @@ -37,18 +38,28 @@ * is {@code O(|E1| × |E2|)} {@code liveStateResolver} calls plus one * {@code measurement.record(...)} per emitted pair. 
*/ -public class AsyncMetricEntityStateTwoEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface> { +public class AsyncMetricEntityStateTwoEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface> + implements Closeable { private final boolean emitOpenTelemetryMetrics; - /** Precomputed per-pair attributes; {@code null} when OTel is disabled. */ - private final EnumMap> attributesByEnum; - /** The single SDK instrument; retained so the SDK keeps the callback referenced. */ - private final Object instrument; + private final MetricEntity metricEntity; + /** + * Precomputed per-pair attributes; {@code null} when OTel is disabled or after {@link #close()}. + * Volatile so close()'s nulling is promptly visible. + */ + private volatile EnumMap> attributesByEnum; + /** + * The single SDK instrument; retained so the SDK keeps the callback referenced. Nulled by + * {@link #close()}. Volatile so close()'s nulling is promptly visible. + */ + private volatile Object instrument; private AsyncMetricEntityStateTwoEnums( boolean emitOpenTelemetryMetrics, + MetricEntity metricEntity, EnumMap> attributesByEnum, Object instrument) { this.emitOpenTelemetryMetrics = emitOpenTelemetryMetrics; + this.metricEntity = metricEntity; this.attributesByEnum = attributesByEnum; this.instrument = instrument; } @@ -65,6 +76,7 @@ private AsyncMetricEntityStateTwoEnums( * * @param the state type returned by {@code liveStateResolver}. Any reference type — the * infra never inspects it beyond null-check. + * @param registry closes the returned wrapper at shutdown; pass {@link CompositeCloseable#NONE} for tests. 
*/ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, S> AsyncMetricEntityStateTwoEnums create( MetricEntity metricEntity, @@ -73,7 +85,8 @@ public static & VeniceDimensionInterface, E2 extends Enum enumTypeClass1, Class enumTypeClass2, LiveStateResolverTwoEnums liveStateResolver, - ValueResolverTwoEnums valueResolver) { + ValueResolverTwoEnums valueResolver, + CompositeCloseable registry) { MetricType metricType = metricEntity.getMetricType(); if (metricType != MetricType.ASYNC_GAUGE && metricType != MetricType.ASYNC_DOUBLE_GAUGE) { throw new IllegalArgumentException( @@ -84,7 +97,7 @@ public static & VeniceDimensionInterface, E2 extends Enum(false, null, null); + return registry.register(new AsyncMetricEntityStateTwoEnums<>(false, metricEntity, null, null)); } /* @@ -140,7 +153,7 @@ public static & VeniceDimensionInterface, E2 extends Enum measurement.record((long) value, attrs))); } - return new AsyncMetricEntityStateTwoEnums<>(true, attributesByEnum, instrument); + return registry.register(new AsyncMetricEntityStateTwoEnums<>(true, metricEntity, attributesByEnum, instrument)); } /** @@ -187,4 +200,17 @@ public EnumMap> getAttributesByEnum() { public Object getInstrument() { return instrument; } + + /** + * Deregisters the underlying SDK observable gauge and releases the cached per-pair {@link Attributes}. + * Closes the callback for ALL enum-pair combinations. Idempotent and best-effort. + */ + @Override + public void close() { + // Snapshot before the helper call so a concurrent second close() sees null here and skips. 
+ Object localInstrument = instrument; + instrument = null; + attributesByEnum = null; + MetricEntityStateUtils.closeOtelInstrumentQuietly(localInstrument, metricEntity); + } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/CompositeCloseable.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/CompositeCloseable.java new file mode 100644 index 00000000000..2b2bbbd82a6 --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/CompositeCloseable.java @@ -0,0 +1,65 @@ +package com.linkedin.venice.stats.metrics; + +import com.google.common.annotations.VisibleForTesting; +import java.io.Closeable; +import java.util.ArrayList; +import java.util.List; + + +/** + * Registry of {@link Closeable} resources owned by a single component. {@link #close()} drains in + * reverse order via {@link MetricEntityStateUtils#closeQuietly}, is idempotent, and is final: + * post-close {@link #register} calls close the resource immediately instead of tracking it. + * Both methods are safe to call concurrently. {@link #NONE} is a no-op sentinel for test or + * ad-hoc callsites. + */ +public class CompositeCloseable implements Closeable { + /** No-op sentinel: registers nothing, closes nothing. Test / ad-hoc use only. */ + @VisibleForTesting + public static final CompositeCloseable NONE = new CompositeCloseable() { + @Override + public T register(T resource) { + return resource; + } + + @Override + public void close() { + /* no-op */ + } + }; + + private final List resources = new ArrayList<>(); + private boolean closed = false; + + /** Registers and returns the resource. Post-close registrations are closed immediately. 
*/ + public T register(T resource) { + if (resource == null) { + return null; + } + synchronized (resources) { + if (closed) { + MetricEntityStateUtils.closeQuietly(resource); + return resource; + } + resources.add(resource); + } + return resource; + } + + @Override + public void close() { + Closeable[] snapshot; + synchronized (resources) { + if (closed) { + return; + } + closed = true; + snapshot = resources.toArray(new Closeable[0]); + resources.clear(); + } + // Close outside the lock so a slow SDK close doesn't block concurrent register() calls. + for (int i = snapshot.length - 1; i >= 0; i--) { + MetricEntityStateUtils.closeQuietly(snapshot[i]); + } + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java index 8abba6b2eb5..ba6c6a0b387 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityState.java @@ -9,11 +9,10 @@ import io.opentelemetry.api.metrics.LongUpDownCounter; import io.opentelemetry.api.metrics.ObservableLongMeasurement; import io.tehuti.metrics.MeasurableStat; +import io.tehuti.metrics.Sensor; import java.util.List; import java.util.Map; import java.util.function.LongSupplier; -import java.util.function.ObjDoubleConsumer; -import java.util.function.ObjLongConsumer; /** @@ -27,9 +26,19 @@ */ public abstract class MetricEntityState extends AsyncMetricEntityState { private final boolean isObservableCounter; - /** define both long and double consumer to avoid unnecessary conversions **/ - private final ObjDoubleConsumer otelDoubleRecordingStrategy; - private final ObjLongConsumer otelLongRecordingStrategy; + /** Strategies take the captured OTel instrument as a parameter so a concurrent {@link #close()} cannot NPE the lambda. 
*/ + private final OtelDoubleRecorder otelDoubleRecordingStrategy; + private final OtelLongRecorder otelLongRecordingStrategy; + + @FunctionalInterface + private interface OtelDoubleRecorder { + void record(Object otelInstrument, MetricAttributesData holder, double value); + } + + @FunctionalInterface + private interface OtelLongRecorder { + void record(Object otelInstrument, MetricAttributesData holder, long value); + } public MetricEntityState( MetricEntity metricEntity, @@ -114,40 +123,41 @@ public final boolean isObservableCounter() { /** * Creates the double recording strategy for histogram types that need double precision. */ - private ObjDoubleConsumer createOtelDoubleRecordingStrategy(MetricType metricType) { + private OtelDoubleRecorder createOtelDoubleRecordingStrategy(MetricType metricType) { switch (metricType) { case HISTOGRAM: case MIN_MAX_COUNT_SUM_AGGREGATIONS: - return (holder, value) -> ((DoubleHistogram) otelMetric).record(value, holder.getAttributes()); + return (instrument, holder, value) -> ((DoubleHistogram) instrument).record(value, holder.getAttributes()); default: // For non-histogram types, delegate to long strategy - return (holder, value) -> otelLongRecordingStrategy.accept(holder, (long) value); + return (instrument, holder, value) -> otelLongRecordingStrategy.record(instrument, holder, (long) value); } } /** * Creates the long recording strategy for counter/gauge types - avoids unnecessary double conversion. 
*/ - private ObjLongConsumer createOtelLongRecordingStrategy(MetricType metricType) { + private OtelLongRecorder createOtelLongRecordingStrategy(MetricType metricType) { switch (metricType) { case ASYNC_COUNTER_FOR_HIGH_PERF_CASES: - return (holder, value) -> { + return (instrument, holder, value) -> { if (value >= 0) { holder.add(value); } }; case ASYNC_UP_DOWN_COUNTER_FOR_HIGH_PERF_CASES: - return (holder, value) -> holder.add(value); + return (instrument, holder, value) -> holder.add(value); case COUNTER: - return (holder, value) -> ((LongCounter) otelMetric).add(value, holder.getAttributes()); + return (instrument, holder, value) -> ((LongCounter) instrument).add(value, holder.getAttributes()); case UP_DOWN_COUNTER: - return (holder, value) -> ((LongUpDownCounter) otelMetric).add(value, holder.getAttributes()); + return (instrument, holder, value) -> ((LongUpDownCounter) instrument).add(value, holder.getAttributes()); case GAUGE: - return (holder, value) -> ((LongGauge) otelMetric).set(value, holder.getAttributes()); + return (instrument, holder, value) -> ((LongGauge) instrument).set(value, holder.getAttributes()); case HISTOGRAM: case MIN_MAX_COUNT_SUM_AGGREGATIONS: // Histograms use double, so convert here (rarely called via long path) - return (holder, value) -> ((DoubleHistogram) otelMetric).record((double) value, holder.getAttributes()); + return (instrument, holder, value) -> ((DoubleHistogram) instrument) + .record((double) value, holder.getAttributes()); default: throw new IllegalArgumentException("Unsupported metric type: " + metricType); } @@ -158,8 +168,12 @@ private ObjLongConsumer createOtelLongRecordingStrategy(Me * {@link #record(double, MetricAttributesData)} API, which records to both OTel and Tehuti. 
*/ void recordOtelMetric(double value, MetricAttributesData holder) { - if (otelMetric != null) { - otelDoubleRecordingStrategy.accept(holder, value); + if (holder == null) { + return; + } + Object localInstrument = otelMetric; + if (localInstrument != null) { + otelDoubleRecordingStrategy.record(localInstrument, holder, value); } } @@ -168,14 +182,19 @@ void recordOtelMetric(double value, MetricAttributesData holder) { * {@link #record(long, MetricAttributesData)} API, which records to both OTel and Tehuti. */ void recordOtelMetric(long value, MetricAttributesData holder) { - if (otelMetric != null) { - otelLongRecordingStrategy.accept(holder, value); + if (holder == null) { + return; + } + Object localInstrument = otelMetric; + if (localInstrument != null) { + otelLongRecordingStrategy.record(localInstrument, holder, value); } } void recordTehutiMetric(double value) { - if (tehutiSensor != null) { - tehutiSensor.record(value); + Sensor localSensor = tehutiSensor; + if (localSensor != null) { + localSensor.record(value); } } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateBase.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateBase.java index 5de5368eb15..21236034048 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateBase.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateBase.java @@ -57,16 +57,18 @@ private MetricEntityStateBase( } } - /** Factory method to keep the API consistent with other subclasses like {@link MetricEntityStateOneEnum} */ + /** Factory method to keep the API consistent with other subclasses like {@link MetricEntityStateOneEnum}. 
*/ public static MetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, - Attributes baseAttributes) { - return new MetricEntityStateBase(metricEntity, otelRepository, baseDimensionsMap, baseAttributes); + Attributes baseAttributes, + CompositeCloseable registry) { + return registry + .register(new MetricEntityStateBase(metricEntity, otelRepository, baseDimensionsMap, baseAttributes)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. */ public static MetricEntityStateBase create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -74,15 +76,17 @@ public static MetricEntityStateBase create( TehutiMetricNameEnum tehutiMetricNameEnum, List tehutiMetricStats, Map baseDimensionsMap, - Attributes baseAttributes) { - return new MetricEntityStateBase( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - baseAttributes); + Attributes baseAttributes, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateBase( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + baseAttributes)); } public void record(double value) { diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java index 060348f8953..60c57094d58 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnums.java @@ -88,7 +88,7 @@ private MetricEntityStateFiveEnums( 
registerObservableCounterIfNeeded(); } - /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E. */ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface, E5 extends Enum & VeniceDimensionInterface> MetricEntityStateFiveEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -97,19 +97,21 @@ public static & VeniceDimensionInterface, E2 extends Enum enumTypeClass2, Class enumTypeClass3, Class enumTypeClass4, - Class enumTypeClass5) { - return new MetricEntityStateFiveEnums<>( - metricEntity, - otelRepository, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3, - enumTypeClass4, - enumTypeClass5); + Class enumTypeClass5, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateFiveEnums<>( + metricEntity, + otelRepository, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. 
*/ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface, E5 extends Enum & VeniceDimensionInterface> MetricEntityStateFiveEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -121,19 +123,21 @@ public static & VeniceDimensionInterface, E2 extends Enum enumTypeClass2, Class enumTypeClass3, Class enumTypeClass4, - Class enumTypeClass5) { - return new MetricEntityStateFiveEnums<>( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3, - enumTypeClass4, - enumTypeClass5); + Class enumTypeClass5, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateFiveEnums<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4, + enumTypeClass5)); } /** diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnums.java index e908eade036..5bb7a6bdc43 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnums.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnums.java @@ -18,6 +18,7 @@ */ public class MetricEntityStateFourEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface> extends MetricEntityState { + /** Lazy cache of {@link MetricAttributesData}; nulled in {@link #close()}. 
*/ private final EnumMap>>> metricAttributesDataEnumMap; private final Class enumTypeClass1; @@ -82,7 +83,7 @@ private MetricEntityStateFourEnums( registerObservableCounterIfNeeded(); } - /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E. */ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface> MetricEntityStateFourEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -90,18 +91,20 @@ public static & VeniceDimensionInterface, E2 extends Enum enumTypeClass1, Class enumTypeClass2, Class enumTypeClass3, - Class enumTypeClass4) { - return new MetricEntityStateFourEnums<>( - metricEntity, - otelRepository, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3, - enumTypeClass4); + Class enumTypeClass4, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateFourEnums<>( + metricEntity, + otelRepository, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. 
*/ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface, E4 extends Enum & VeniceDimensionInterface> MetricEntityStateFourEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -112,18 +115,20 @@ public static & VeniceDimensionInterface, E2 extends Enum enumTypeClass1, Class enumTypeClass2, Class enumTypeClass3, - Class enumTypeClass4) { - return new MetricEntityStateFourEnums<>( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3, - enumTypeClass4); + Class enumTypeClass4, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateFourEnums<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3, + enumTypeClass4)); } /** @@ -150,7 +155,6 @@ private MetricAttributesData getMetricAttributesData(E1 dimension1, E2 dimension if (!emitOpenTelemetryMetrics()) { return null; } - return metricAttributesDataEnumMap.computeIfAbsent(dimension1, k -> { validateInputDimension(k); return new EnumMap<>(enumTypeClass2); @@ -195,15 +199,24 @@ public void record( @Override protected Iterable getAllMetricAttributesData() { - if (metricAttributesDataEnumMap == null) { - return null; - } - List allData = new ArrayList<>(); for (EnumMap>> level2Map: metricAttributesDataEnumMap.values()) { + if (level2Map == null) { + continue; + } for (EnumMap> level3Map: level2Map.values()) { + if (level3Map == null) { + continue; + } for (EnumMap level4Map: level3Map.values()) { - allData.addAll(level4Map.values()); + if (level4Map == null) { + continue; + } + for (MetricAttributesData holder: level4Map.values()) { + if (holder != null) { + allData.add(holder); + } + } } } } @@ -214,4 +227,5 @@ protected 
Iterable getAllMetricAttributesData() { public EnumMap>>> getMetricAttributesDataEnumMap() { return metricAttributesDataEnumMap; } + } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateGeneric.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateGeneric.java index 02e4b444940..0bf9a737fc2 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateGeneric.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateGeneric.java @@ -68,29 +68,32 @@ private void validateMetricType(MetricEntity metricEntity) { } } - /** Factory method to keep the API consistent with other subclasses like {@link MetricEntityStateOneEnum} */ + /** Factory method to keep the API consistent with other subclasses like {@link MetricEntityStateOneEnum}. */ public static MetricEntityStateGeneric create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, - Map baseDimensionsMap) { - return new MetricEntityStateGeneric(metricEntity, otelRepository, baseDimensionsMap); + Map baseDimensionsMap, + CompositeCloseable registry) { + return registry.register(new MetricEntityStateGeneric(metricEntity, otelRepository, baseDimensionsMap)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. 
*/ public static MetricEntityStateGeneric create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, TehutiSensorRegistrationFunction registerTehutiSensorFn, TehutiMetricNameEnum tehutiMetricNameEnum, List tehutiMetricStats, - Map baseDimensionsMap) { - return new MetricEntityStateGeneric( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap); + Map baseDimensionsMap, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateGeneric( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap)); } /** diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnum.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnum.java index 5eb13ff8e68..d4e76530d4c 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnum.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnum.java @@ -26,6 +26,7 @@ * */ public class MetricEntityStateOneEnum & VeniceDimensionInterface> extends MetricEntityState { + /** Lazy cache of {@link MetricAttributesData}. */ private final EnumMap metricAttributesDataEnumMap; private final Class enumTypeClass; @@ -60,16 +61,18 @@ private MetricEntityStateOneEnum( registerObservableCounterIfNeeded(); } - /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E. 
*/ public static & VeniceDimensionInterface> MetricEntityStateOneEnum create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, - Class enumTypeClass) { - return new MetricEntityStateOneEnum<>(metricEntity, otelRepository, baseDimensionsMap, enumTypeClass); + Class enumTypeClass, + CompositeCloseable registry) { + return registry + .register(new MetricEntityStateOneEnum<>(metricEntity, otelRepository, baseDimensionsMap, enumTypeClass)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. */ public static & VeniceDimensionInterface> MetricEntityStateOneEnum create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -77,15 +80,17 @@ public static & VeniceDimensionInterface> MetricEntityStateOn TehutiMetricNameEnum tehutiMetricNameEnum, List tehutiMetricStats, Map baseDimensionsMap, - Class enumTypeClass) { - return new MetricEntityStateOneEnum<>( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - enumTypeClass); + Class enumTypeClass, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateOneEnum<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass)); } /** @@ -122,7 +127,6 @@ private MetricAttributesData getMetricAttributesData(E dimension) { if (!emitOpenTelemetryMetrics()) { return null; } - return metricAttributesDataEnumMap.computeIfAbsent(dimension, k -> { validateInputDimension(k); Attributes attrs = createAttributes(k); @@ -148,10 +152,13 @@ public void record(long value, @Nonnull E dimension) { @Override protected Iterable getAllMetricAttributesData() { - if (metricAttributesDataEnumMap == null) { - return null; + List snapshot = new ArrayList<>(metricAttributesDataEnumMap.size()); + 
for (MetricAttributesData holder: metricAttributesDataEnumMap.values()) { + if (holder != null) { + snapshot.add(holder); + } } - return new ArrayList<>(metricAttributesDataEnumMap.values()); + return snapshot; } /** visible for testing */ diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnums.java index dc253341031..ac88f720dea 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnums.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnums.java @@ -19,6 +19,7 @@ */ public class MetricEntityStateThreeEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface> extends MetricEntityState { + /** Lazy cache of {@link MetricAttributesData}; nulled in {@link #close()}. */ private final EnumMap>> metricAttributesDataEnumMap; private final Class enumTypeClass1; @@ -71,24 +72,26 @@ private MetricEntityStateThreeEnums( registerObservableCounterIfNeeded(); } - /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E. 
*/ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface> MetricEntityStateThreeEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, Class enumTypeClass1, Class enumTypeClass2, - Class enumTypeClass3) { - return new MetricEntityStateThreeEnums<>( - metricEntity, - otelRepository, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3); + Class enumTypeClass3, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateThreeEnums<>( + metricEntity, + otelRepository, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. */ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface, E3 extends Enum & VeniceDimensionInterface> MetricEntityStateThreeEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -98,17 +101,19 @@ public static & VeniceDimensionInterface, E2 extends Enum baseDimensionsMap, Class enumTypeClass1, Class enumTypeClass2, - Class enumTypeClass3) { - return new MetricEntityStateThreeEnums<>( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2, - enumTypeClass3); + Class enumTypeClass3, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateThreeEnums<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2, + enumTypeClass3)); } /** @@ -134,7 +139,6 @@ private MetricAttributesData getMetricAttributesData(E1 dimension1, E2 dimension if (!emitOpenTelemetryMetrics()) { return null; } - return 
metricAttributesDataEnumMap.computeIfAbsent(dimension1, k -> { validateInputDimension(k); return new EnumMap<>(enumTypeClass2); @@ -175,14 +179,20 @@ public void record(long value, @Nonnull E1 dimension1, @Nonnull E2 dimension2, @ @Override protected Iterable getAllMetricAttributesData() { - if (metricAttributesDataEnumMap == null) { - return null; - } - List allData = new ArrayList<>(); for (EnumMap> level2Map: metricAttributesDataEnumMap.values()) { + if (level2Map == null) { + continue; + } for (EnumMap level3Map: level2Map.values()) { - allData.addAll(level3Map.values()); + if (level3Map == null) { + continue; + } + for (MetricAttributesData holder: level3Map.values()) { + if (holder != null) { + allData.add(holder); + } + } } } return allData; @@ -192,4 +202,5 @@ protected Iterable getAllMetricAttributesData() { public EnumMap>> getMetricAttributesDataEnumMap() { return metricAttributesDataEnumMap; } + } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnums.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnums.java index 92786311e7e..14021fdd56f 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnums.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnums.java @@ -18,6 +18,7 @@ */ public class MetricEntityStateTwoEnums & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface> extends MetricEntityState { + /** Lazy cache of {@link MetricAttributesData}. 
*/ private final EnumMap> metricAttributesDataEnumMap; private final Class enumTypeClass1; private final Class enumTypeClass2; @@ -64,22 +65,24 @@ private MetricEntityStateTwoEnums( registerObservableCounterIfNeeded(); } - /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E */ + /** Factory method with named parameters to ensure the passed in enumTypeClass are in the same order as E. */ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface> MetricEntityStateTwoEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, Map baseDimensionsMap, Class enumTypeClass1, - Class enumTypeClass2) { - return new MetricEntityStateTwoEnums<>( - metricEntity, - otelRepository, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2); + Class enumTypeClass2, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateTwoEnums<>( + metricEntity, + otelRepository, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2)); } - /** Overloaded Factory method for constructor with Tehuti parameters */ + /** Overloaded Factory method for constructor with Tehuti parameters. 
*/ public static & VeniceDimensionInterface, E2 extends Enum & VeniceDimensionInterface> MetricEntityStateTwoEnums create( MetricEntity metricEntity, VeniceOpenTelemetryMetricsRepository otelRepository, @@ -88,16 +91,18 @@ public static & VeniceDimensionInterface, E2 extends Enum tehutiMetricStats, Map baseDimensionsMap, Class enumTypeClass1, - Class enumTypeClass2) { - return new MetricEntityStateTwoEnums<>( - metricEntity, - otelRepository, - registerTehutiSensorFn, - tehutiMetricNameEnum, - tehutiMetricStats, - baseDimensionsMap, - enumTypeClass1, - enumTypeClass2); + Class enumTypeClass2, + CompositeCloseable registry) { + return registry.register( + new MetricEntityStateTwoEnums<>( + metricEntity, + otelRepository, + registerTehutiSensorFn, + tehutiMetricNameEnum, + tehutiMetricStats, + baseDimensionsMap, + enumTypeClass1, + enumTypeClass2)); } /** @@ -123,7 +128,6 @@ private MetricAttributesData getMetricAttributesData(E1 dimension1, E2 dimension if (!emitOpenTelemetryMetrics()) { return null; } - return metricAttributesDataEnumMap.computeIfAbsent(dimension1, k -> { validateInputDimension(k); return new EnumMap<>(enumTypeClass2); @@ -152,13 +156,16 @@ public void record(long value, @Nonnull E1 dimension1, @Nonnull E2 dimension2) { @Override protected Iterable getAllMetricAttributesData() { - if (metricAttributesDataEnumMap == null) { - return null; - } - List allData = new ArrayList<>(); for (EnumMap level2Map: metricAttributesDataEnumMap.values()) { - allData.addAll(level2Map.values()); + if (level2Map == null) { + continue; + } + for (MetricAttributesData holder: level2Map.values()) { + if (holder != null) { + allData.add(holder); + } + } } return allData; } @@ -167,4 +174,5 @@ protected Iterable getAllMetricAttributesData() { public EnumMap> getMetricAttributesDataEnumMap() { return metricAttributesDataEnumMap; } + } diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateUtils.java 
b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateUtils.java new file mode 100644 index 00000000000..72a1480263b --- /dev/null +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/metrics/MetricEntityStateUtils.java @@ -0,0 +1,51 @@ +package com.linkedin.venice.stats.metrics; + +import java.io.Closeable; +import java.util.Map; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + + +/** Static helpers for closing {@code MetricEntityState*} / {@code AsyncMetricEntityState*} wrappers. */ +public final class MetricEntityStateUtils { + private static final Logger LOGGER = LogManager.getLogger(MetricEntityStateUtils.class); + + private MetricEntityStateUtils() { + } + + /** + * Null-safe close. Catches every {@link Exception} so a misbehaving wrapper cannot abort an + * enclosing close loop (e.g., {@link CompositeCloseable#close()}) and skip later siblings. + */ + public static void closeQuietly(Closeable wrapper) { + if (wrapper == null) { + return; + } + try { + wrapper.close(); + } catch (Exception e) { + LOGGER.warn("Close threw for {}", wrapper.getClass().getSimpleName(), e); + } + } + + /** + * Closes the SDK-side OTel instrument if it is {@link AutoCloseable}. Sync instruments are not + * AutoCloseable on the SDK side; this is a no-op for them. Caller should pass the snapshot of + * the wrapper's volatile field so a concurrent second close() cannot deref a now-null reference. + */ + public static void closeOtelInstrumentQuietly(Object instrument, MetricEntity metricEntity) { + if (instrument instanceof AutoCloseable) { + try { + ((AutoCloseable) instrument).close(); + } catch (Exception e) { + LOGGER.warn("OTel SDK close threw for metric {}", metricEntity.getMetricName(), e); + } + } + } + + /** Closes every value in the map via {@link #closeQuietly} and clears the map. 
*/ + public static void closeAndClear(Map map) { + map.values().forEach(MetricEntityStateUtils::closeQuietly); + map.clear(); + } +} diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/routing/HelixGroupStats.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/routing/HelixGroupStats.java index acd834eec08..0c17cda32bb 100644 --- a/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/routing/HelixGroupStats.java +++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/stats/routing/HelixGroupStats.java @@ -26,7 +26,8 @@ public class HelixGroupStats extends AbstractVeniceStats { /** * Per-Helix-group OTel metric entity states and Tehuti metric references, keyed by group ID. Each map grows * lazily via {@code computeIfAbsent} and is bounded by the number of Helix groups configured for the store - * (typically 3–5). Entries are not evicted — the maps persist for the lifetime of this stats instance. + * (typically 3–5). Entries are not evicted — the maps persist for the lifetime of this stats instance, so + * every wrapper registers with the inherited {@code resources} owner and is closed in {@link #close()}. * {@code groupResponseWaitingTimeAvgMap} holds Tehuti {@link io.tehuti.metrics.Metric} references; the * remaining maps hold OTel {@link MetricEntityStateBase} instances. 
*/
@@ -81,7 +82,8 @@ public HelixGroupStats(MetricsRepository metricsRepository, String prefix) {
         HelixGroupTehutiMetricName.GROUP_COUNT,
         Collections.singletonList(new Avg()),
         baseDimensionsMap,
-        baseAttributes);
+        baseAttributes,
+        resources);
   }
 
   public void recordGroupNum(int groupNum) {
@@ -97,7 +99,8 @@ private MetricEntityStateBase buildHelixGroupCallCount(int groupId) {
         HelixGroupTehutiMetricDynamicName.of(HelixGroupTehutiMetricName.GROUP_REQUEST, groupId),
         Collections.singletonList(new OccurrenceRate()),
         otelSetup.baseDimensionsMap,
-        otelSetup.baseAttributes);
+        otelSetup.baseAttributes,
+        resources);
   }
 
   public void recordGroupRequest(int groupId) {
@@ -113,7 +116,8 @@ private MetricEntityStateBase buildHelixGroupPendingRequest(int groupId) {
         HelixGroupTehutiMetricDynamicName.of(HelixGroupTehutiMetricName.GROUP_PENDING_REQUEST, groupId),
         Collections.singletonList(new Avg()),
         otelSetup.baseDimensionsMap,
-        otelSetup.baseAttributes);
+        otelSetup.baseAttributes,
+        resources);
   }
 
   public void recordGroupPendingRequest(int groupId, int value) {
@@ -131,7 +135,8 @@ private MetricEntityStateBase buildHelixGroupResponseWaitingTime(int groupId, Me
         HelixGroupTehutiMetricDynamicName.of(HelixGroupTehutiMetricName.GROUP_RESPONSE_WAITING_TIME, groupId),
         Collections.singletonList(avgStat),
         otelSetup.baseDimensionsMap,
-        otelSetup.baseAttributes);
+        otelSetup.baseAttributes,
+        resources);
   }
 
   public void recordGroupResponseWaitingTime(int groupId, double responseWaitingTime) {
@@ -157,6 +162,23 @@ public double getGroupResponseWaitingTimeAvg(int groupId) {
     return avgLatency;
   }
 
+  /**
+   * Delegates to {@code super.close()} and then empties the four per-group maps. The maps are cleared in a
+   * {@code finally} block, so they are emptied even when {@code super.close()} throws; any such exception
+   * still propagates to the caller.
+   */
+  @Override
+  public void close() {
+    try {
+      super.close();
+    } finally {
+      groupRequestCountMap.clear();
+      groupPendingRequestMap.clear();
+      groupResponseWaitingTimeMap.clear();
+      groupResponseWaitingTimeAvgMap.clear();
+    }
+  }
+
   /**
    * Helper class to hold OpenTelemetry setup data for a specific helix group.
*/
diff --git a/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java b/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java
index 1f507f75e13..86cd5a39327 100644
--- a/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java
+++ b/internal/venice-client-common/src/main/java/com/linkedin/venice/utils/metrics/MetricsRepositoryUtils.java
@@ -4,6 +4,7 @@
 import com.linkedin.venice.stats.VeniceMetricsRepository;
 import com.linkedin.venice.stats.VeniceOpenTelemetryMetricNamingFormat;
 import com.linkedin.venice.stats.metrics.MetricEntity;
+import io.opentelemetry.sdk.metrics.export.MetricReader;
 import io.tehuti.metrics.MetricConfig;
 import io.tehuti.metrics.MetricsRepository;
 import io.tehuti.metrics.stats.AsyncGauge;
@@ -109,4 +110,48 @@ public static MetricConfig getMetricConfig(
             .build());
   }
 
+  /**
+   * Constructs an OTel-enabled repository. Pass a dedicated {@link AsyncGauge.AsyncGaugeExecutor} when the test calls
+   * {@code metricsRepository.close()} — otherwise the close shuts down Tehuti's JVM-wide static executor singleton.
+   *
+   * @param metricPrefix prefix handed to {@code setMetricPrefix}
+   * @param metricEntities entities handed to {@code setMetricEntities}; NOTE(review): raw type — confirm element type
+   * @param reader additional OTel metric reader handed to {@code setOtelAdditionalMetricsReader}
+   * @param executor when non-null, wrapped in a new {@link MetricConfig} and set as the Tehuti metric config;
+   *        when {@code null}, {@code setTehutiMetricConfig} is not called
+   * @return a new {@link VeniceMetricsRepository} built from the resulting config
+   */
+  public static VeniceMetricsRepository createOtelEnabledRepository(
+      String metricPrefix,
+      Collection metricEntities,
+      MetricReader reader,
+      AsyncGauge.AsyncGaugeExecutor executor) {
+    VeniceMetricsConfig.Builder builder = new VeniceMetricsConfig.Builder().setMetricPrefix(metricPrefix)
+        .setMetricEntities(metricEntities)
+        .setEmitOtelMetrics(true)
+        .setOtelAdditionalMetricsReader(reader);
+    if (executor != null) {
+      builder.setTehutiMetricConfig(new MetricConfig(executor));
+    }
+    return new VeniceMetricsRepository(builder.build());
+  }
+
+  /**
+   * OTel-disabled variant of {@link #createOtelEnabledRepository}. Same executor caveat applies.
+   *
+   * @param metricPrefix prefix handed to {@code setMetricPrefix}
+   * @param executor when non-null, wrapped in a new {@link MetricConfig} and set as the Tehuti metric config
+   * @return a new {@link VeniceMetricsRepository} built with {@code setEmitOtelMetrics(false)}
+   */
+  public static VeniceMetricsRepository createOtelDisabledRepository(
+      String metricPrefix,
+      AsyncGauge.AsyncGaugeExecutor executor) {
+    VeniceMetricsConfig.Builder builder =
+        new VeniceMetricsConfig.Builder().setMetricPrefix(metricPrefix).setEmitOtelMetrics(false);
+    if (executor != null) {
+      builder.setTehutiMetricConfig(new MetricConfig(executor));
+    }
+    return new VeniceMetricsRepository(builder.build());
+  }
+
 }
diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java
index 144025065bc..05a5f4054ca 100644
--- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java
+++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceMetricsRepositoryTest.java
@@ -13,10 +13,13 @@
 import com.linkedin.venice.stats.metrics.MetricUnit;
 import com.linkedin.venice.utils.DataProviderUtils;
 import io.tehuti.metrics.MetricConfig;
+import io.tehuti.metrics.MetricsReporter;
+import io.tehuti.metrics.TehutiMetric;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 import org.mockito.Mockito;
 import org.testng.annotations.Test;
@@ -109,6 +112,58 @@ public void testCloseMethod() {
     Mockito.verify(mockOpenTelemetryRepository).close();
   }
 
+  /**
+   * Regression test for the try-finally contract in {@link VeniceMetricsRepository#close()}: if Tehuti's
+   * {@code super.close()} throws (e.g., a misbehaving reporter), the OTel repository's {@code close()} MUST
+   * still run so the SDK MeterProvider is shut down. Pre-fix (without try-finally) this would leak the OTel
+   * exporter thread and could prevent JVM exit.
+ */ + @Test + public void testCloseRunsOtelCloseEvenIfSuperCloseThrows() { + VeniceMetricsConfig mockConfig = Mockito.mock(VeniceMetricsConfig.class); + VeniceOpenTelemetryMetricsRepository mockOpenTelemetryRepository = + Mockito.mock(VeniceOpenTelemetryMetricsRepository.class); + Mockito.when(mockConfig.getTehutiMetricConfig()).thenReturn(new MetricConfig()); + + VeniceMetricsRepository repository = new VeniceMetricsRepository(mockConfig, mockOpenTelemetryRepository); + // Add a Tehuti reporter that throws on close — this makes super.close() throw, exercising the try-finally. + repository.addReporter(new MetricsReporter() { + @Override + public void init(List metrics) { + } + + @Override + public void metricChange(TehutiMetric metric) { + } + + @Override + public void addMetric(TehutiMetric metric) { + } + + @Override + public void removeMetric(TehutiMetric metric) { + } + + @Override + public void close() { + throw new RuntimeException("simulated Tehuti close failure"); + } + + @Override + public void configure(java.util.Map configs) { + } + }); + + try { + repository.close(); + } catch (RuntimeException expected) { + // Expected — close() rethrows after the finally block runs. + } + + // The contract: OTel close MUST have been invoked even though super.close() threw. 
+ Mockito.verify(mockOpenTelemetryRepository).close(); + } + @Test(dataProvider = "True-and-False", dataProviderClass = DataProviderUtils.class) public void testGetVeniceMetricsRepositoryWithSingleThreadedConfig(boolean useSingleThreadedMetricsRepository) { VeniceMetricsRepository repository = VeniceMetricsRepository.getVeniceMetricsRepository( diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java index bb405b36f67..787d6969492 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/VeniceOpenTelemetryMetricsRepositoryTest.java @@ -23,6 +23,7 @@ import com.linkedin.venice.stats.metrics.AsyncMetricEntityState; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateBase; import com.linkedin.venice.stats.metrics.AsyncMetricEntityStateOneEnum; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.stats.metrics.MetricAttributesData; import com.linkedin.venice.stats.metrics.MetricEntity; import com.linkedin.venice.stats.metrics.MetricEntityStateBase; @@ -209,14 +210,24 @@ public void testCreateAndRecordMetricsForAllMetricTypes() { AsyncMetricEntityState metricEntityState; if (metricType == MetricType.ASYNC_DOUBLE_GAUGE) { - metricEntityState = AsyncMetricEntityStateBase - .create(metricEntity, metricsRepository, baseDimensionsMap, baseAttributes, (DoubleSupplier) () -> 10.0); + metricEntityState = AsyncMetricEntityStateBase.create( + metricEntity, + metricsRepository, + baseDimensionsMap, + baseAttributes, + (DoubleSupplier) () -> 10.0, + CompositeCloseable.NONE); } else if (metricType.isAsyncMetric()) { - metricEntityState = AsyncMetricEntityStateBase - .create(metricEntity, 
metricsRepository, baseDimensionsMap, baseAttributes, () -> 10); + metricEntityState = AsyncMetricEntityStateBase.create( + metricEntity, + metricsRepository, + baseDimensionsMap, + baseAttributes, + () -> 10, + CompositeCloseable.NONE); } else { - metricEntityState = - MetricEntityStateBase.create(metricEntity, metricsRepository, baseDimensionsMap, baseAttributes); + metricEntityState = MetricEntityStateBase + .create(metricEntity, metricsRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); } metricEntityState.setOtelMetric(instrument); @@ -618,7 +629,8 @@ public void testCloneWithNewMetricPrefixCopiesAllRequiredFields() throws Illegal "histogramMap", // Child has its own instrument maps "counterMap", "upDownCounterMap", - "gaugeMap")); + "gaugeMap", + "resources")); // Child has its own CompositeCloseable for owned wrappers // Fields that are expected to be null in child Set FIELDS_EXPECTED_NULL_IN_CHILD = new HashSet<>(Arrays.asList("sdkMeterProvider")); @@ -749,7 +761,8 @@ private MetricEntityStateThreeEnums storeBDimensions = new HashMap<>(); storeBDimensions.put(VeniceMetricsDimensions.VENICE_STORE_NAME, "store_B"); @@ -1039,7 +1054,8 @@ public void testMultipleCallbacksForSameAsyncCounter() { storeBDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); // Record data via both states stateA.record(100L, HttpResponseStatusEnum.OK, HttpResponseStatusCodeCategory.SUCCESS, RequestType.SINGLE_GET); @@ -1106,7 +1122,8 @@ public void testMultipleCallbacksForSameAsyncUpDownCounter() { storeADimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); Map storeBDimensions = new HashMap<>(); storeBDimensions.put(VeniceMetricsDimensions.VENICE_STORE_NAME, "store_B"); @@ -1117,7 +1134,8 @@ public void testMultipleCallbacksForSameAsyncUpDownCounter() { storeBDimensions, 
HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); // Record data via both states (including negative values for up-down counter) stateA.record(50L, HttpResponseStatusEnum.OK, HttpResponseStatusCodeCategory.SUCCESS, RequestType.SINGLE_GET); @@ -1249,7 +1267,8 @@ public void testAsyncMetricEntityStateOneEnumRegistersAllEnumCallbacks() { baseDimensionsMap, VersionRole.class, role -> role, - (state, role) -> (role.ordinal() + 1) * 10L); + (state, role) -> (role.ordinal() + 1) * 10L, + CompositeCloseable.NONE); assertNotNull(metricState); @@ -1302,8 +1321,13 @@ public void testAsyncDoubleGaugeEndToEndValueAccuracy() { Attributes baseAttributes = otelRepo.createAttributes(metricEntity, baseDimensionsMap); // Create with a known fractional value (0.75 = 75% usage) - AsyncMetricEntityStateBase state = AsyncMetricEntityStateBase - .create(metricEntity, otelRepo, baseDimensionsMap, baseAttributes, (DoubleSupplier) () -> 0.75); + AsyncMetricEntityStateBase state = AsyncMetricEntityStateBase.create( + metricEntity, + otelRepo, + baseDimensionsMap, + baseAttributes, + (DoubleSupplier) () -> 0.75, + CompositeCloseable.NONE); assertNotNull(state); // Validate the double gauge value is preserved (not truncated to 0) @@ -1460,7 +1484,8 @@ public void testMultipleCallbacksDeltaTemporalityIsolation() { storeADims, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); Map storeBDims = new HashMap<>(); storeBDims.put(VeniceMetricsDimensions.VENICE_STORE_NAME, "store_B"); @@ -1471,7 +1496,8 @@ public void testMultipleCallbacksDeltaTemporalityIsolation() { storeBDims, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); Attributes storeAAttrs = new OpenTelemetryDataTestUtils.OpenTelemetryAttributesBuilder().setStoreName("store_A") 
.setHttpStatus(HttpResponseStatusEnum.OK) @@ -1633,4 +1659,29 @@ public void testRecordFailureMetricBothOverloadsAccumulateIntoSameDataPoint() { } } + /** + * Regression test for the bounded-join shutdown contract in + * {@link VeniceOpenTelemetryMetricsRepository#close()}: close() must complete cleanly when the SDK shutdown + * succeeds. The 10s join timeout in the production code prevents an exporter that fails to flush from + * blocking JVM exit; this test ensures the happy path returns within the bounded window. (Failure paths — + * exporter timeout, async non-success — are difficult to simulate without mocking the SDK, but the happy + * path is the load-bearing one.) + */ + @Test(timeOut = 30_000) + public void testCloseCompletesCleanlyOnSuccessfulShutdown() { + Set dims = new HashSet<>(); + dims.add(VeniceMetricsDimensions.VENICE_REQUEST_METHOD); + MetricEntity entity = new MetricEntity("close_test_counter", MetricType.COUNTER, MetricUnit.NUMBER, "d", dims); + VeniceMetricsRepository repo = new VeniceMetricsRepository( + new VeniceMetricsConfig.Builder().setServiceName("svc") + .setMetricPrefix("test_close") + .setEmitOtelMetrics(true) + .setMetricEntities(Collections.singletonList(entity)) + .build()); + VeniceOpenTelemetryMetricsRepository otelRepo = repo.getOpenTelemetryMetricsRepository(); + assertNotNull(otelRepo, "OTel repository must exist for the bounded-join shutdown to apply"); + // close() must complete (not throw, not hang). Test timeout is 30s — well above the 10s join. 
+ otelRepo.close(); + repo.close(); + } } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnumTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnumTest.java index aafbafb4be3..c483f5ca08b 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnumTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateOneEnumTest.java @@ -4,6 +4,7 @@ import static com.linkedin.venice.stats.metrics.MetricType.ASYNC_GAUGE; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -19,7 +20,9 @@ import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.LiveStateResolverOneEnum; import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.ValueResolverOneEnum; import io.opentelemetry.api.common.Attributes; +import io.opentelemetry.api.metrics.ObservableDoubleGauge; import io.opentelemetry.api.metrics.ObservableDoubleMeasurement; +import io.opentelemetry.api.metrics.ObservableLongGauge; import io.opentelemetry.api.metrics.ObservableLongMeasurement; import java.util.EnumMap; import java.util.EnumSet; @@ -55,7 +58,8 @@ public void testCreateWithoutOtelRepo() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 1L); + (state, e) -> 1L, + CompositeCloseable.NONE); assertNotNull(metricState); assertFalse(metricState.emitOpenTelemetryMetrics()); @@ -71,7 +75,8 @@ public void testCreateRegistersExactlyOneObservableGauge() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 7L); + (state, e) -> 7L, + CompositeCloseable.NONE); assertTrue(metricState.emitOpenTelemetryMetrics()); // Exactly one SDK instrument 
registered, regardless of |E|. @@ -98,7 +103,8 @@ public void testCallbackEmitsOnlyWhenLiveStateResolverReturnsNonNull() { baseDimensionsMap, DimensionEnum1.class, liveStateResolver, - valueResolver); + valueResolver, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); @@ -131,7 +137,8 @@ public void testThrowingResolverForOneComboDoesNotSkipOthers() { baseDimensionsMap, DimensionEnum1.class, liveStateResolver, - (state, e) -> 7L); + (state, e) -> 7L, + CompositeCloseable.NONE); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); // If per-combo isolation is missing, the whole callback throws and this would propagate. @@ -157,7 +164,8 @@ public void testValueResolverNotInvokedForDormantCombos() { baseDimensionsMap, DimensionEnum1.class, e -> null, // always dormant - valueResolver); + valueResolver, + CompositeCloseable.NONE); callbackCaptor.getValue().accept(mock(ObservableLongMeasurement.class)); verifyNoInteractions(valueResolver); @@ -176,7 +184,8 @@ public void testCallbackIsInvokedFreshOnEachCollection() { baseDimensionsMap, DimensionEnum1.class, liveStateResolver, - (state, e) -> 1L); + (state, e) -> 1L, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); // First collection: nothing live, no records emitted. 
@@ -213,7 +222,8 @@ public void testAsyncGaugeEmitsTruncatedLong() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 42.9); + (state, e) -> 42.9, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); @@ -233,7 +243,8 @@ public void testAsyncDoubleGaugeEmitsDoubleDirectly() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 0.75); + (state, e) -> 0.75, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); // ASYNC_DOUBLE_GAUGE uses registerObservableDoubleGauge, not the Long variant. @@ -256,7 +267,8 @@ public void testRepoPresentButEmissionDisabledYieldsDisabledInstance() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 1L); + (state, e) -> 1L, + CompositeCloseable.NONE); assertFalse(metricState.emitOpenTelemetryMetrics()); assertNull(metricState.getAttributesByEnum()); @@ -279,7 +291,8 @@ public void testEachComboEmitsWithItsOwnPrecomputedAttributes() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> e.ordinal() * 10L); + (state, e) -> e.ordinal() * 10L, + CompositeCloseable.NONE); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); callbackCaptor.getValue().accept(measurement); @@ -315,7 +328,8 @@ public void testStateFromLiveStateResolverFlowsIntoValueResolver() { throw new AssertionError("state mismatch for " + e + ": got " + state + ", expected " + stateByEnum.get(e)); } return 1L; - }); + }, + CompositeCloseable.NONE); // If any combo crosses state, the valueResolver throws -> callback propagates -> test fails. 
callbackCaptor.getValue().accept(mock(ObservableLongMeasurement.class)); @@ -331,7 +345,99 @@ public void testCreateThrowsOnNonAsyncGaugeMetricType() { baseDimensionsMap, DimensionEnum1.class, e -> e, - (state, e) -> 1L); + (state, e) -> 1L, + CompositeCloseable.NONE); + } + + @Test + public void testCloseDeregistersInstrumentClearsAttributesAndIsIdempotent() throws Exception { + ObservableLongGauge mockGauge = mock(ObservableLongGauge.class); + when(mockOtelRepository.registerObservableLongGauge(eq(mockMetricEntity), any())).thenReturn(mockGauge); + + AsyncMetricEntityStateOneEnum state = AsyncMetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + e -> e, + (s, e) -> 1L, + CompositeCloseable.NONE); + assertNotNull(state.getInstrument()); + assertNotNull(state.getAttributesByEnum()); + assertEquals(state.getAttributesByEnum().size(), DimensionEnum1.values().length); + + state.close(); + // SDK callback deregistered exactly once. + verify(mockGauge, times(1)).close(); + // Wrapper-side state released. 
+ assertNull(state.getInstrument()); + assertNull(state.getAttributesByEnum()); + + state.close(); + verify(mockGauge, times(1)).close(); + } + + @Test + public void testCloseOnDoubleGaugeDeregistersInstrument() throws Exception { + when(mockMetricEntity.getMetricType()).thenReturn(MetricType.ASYNC_DOUBLE_GAUGE); + ObservableDoubleGauge mockGauge = mock(ObservableDoubleGauge.class); + when(mockOtelRepository.registerObservableDoubleGauge(eq(mockMetricEntity), any())).thenReturn(mockGauge); + + AsyncMetricEntityStateOneEnum state = AsyncMetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + e -> e, + (s, e) -> 1.0, + CompositeCloseable.NONE); + + state.close(); + verify(mockGauge, times(1)).close(); + assertNull(state.getInstrument()); + } + + @Test + public void testCloseWithOtelDisabledIsNoOp() { + when(mockOtelRepository.emitOpenTelemetryMetrics()).thenReturn(false); + + AsyncMetricEntityStateOneEnum state = AsyncMetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + e -> e, + (s, e) -> 1L, + CompositeCloseable.NONE); + assertFalse(state.emitOpenTelemetryMetrics()); + assertNull(state.getInstrument()); + assertNull(state.getAttributesByEnum()); + + // Must not throw on null instrument/attributes. 
+ state.close(); + state.close(); + } + + @Test + public void testCloseSwallowsExceptionFromMisbehavingSdkInstrument() throws Exception { + ObservableLongGauge mockGauge = mock(ObservableLongGauge.class); + doThrow(new RuntimeException("simulated SDK close failure")).when(mockGauge).close(); + when(mockOtelRepository.registerObservableLongGauge(eq(mockMetricEntity), any())).thenReturn(mockGauge); + when(mockMetricEntity.getMetricName()).thenReturn("test_metric"); + + AsyncMetricEntityStateOneEnum state = AsyncMetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + e -> e, + (s, e) -> 1L, + CompositeCloseable.NONE); + + // Best-effort: the exception is logged at WARN and swallowed. + state.close(); + verify(mockGauge, times(1)).close(); + assertNull(state.getInstrument()); } /** Captures the {@code Consumer} passed to {@code registerObservableLongGauge}. */ diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTest.java index 8e65bf7c38b..b9398d045c6 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTest.java @@ -10,9 +10,12 @@ import static org.mockito.Mockito.doCallRealMethod; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; @@ -99,8 +102,8 @@ public void 
testCreateMetricWithOtelDisabled() { when(mockOtelRepository.createInstrument(mockMetricEntity)).thenReturn(longCounter); // ASYNC_GAUGE (LongSupplier) without tehuti sensor - AsyncMetricEntityState metricEntityState = - AsyncMetricEntityStateBase.create(mockMetricEntity, null, baseDimensionsMap, baseAttributes, () -> 0L); + AsyncMetricEntityState metricEntityState = AsyncMetricEntityStateBase + .create(mockMetricEntity, null, baseDimensionsMap, baseAttributes, () -> 0L, CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); @@ -114,15 +117,21 @@ public void testCreateMetricWithOtelDisabled() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); Assert.assertNotNull(metricEntityState.getTehutiSensor()); // ASYNC_DOUBLE_GAUGE (DoubleSupplier) without tehuti sensor when(mockMetricEntity.getMetricType()).thenReturn(MetricType.ASYNC_DOUBLE_GAUGE); - metricEntityState = AsyncMetricEntityStateBase - .create(mockMetricEntity, null, baseDimensionsMap, baseAttributes, (DoubleSupplier) () -> 0.75); + metricEntityState = AsyncMetricEntityStateBase.create( + mockMetricEntity, + null, + baseDimensionsMap, + baseAttributes, + (DoubleSupplier) () -> 0.75, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); @@ -135,8 +144,13 @@ public void testCreateMetricWithOtelEnabled() { when(mockOtelRepository.registerObservableLongGauge(any(MetricEntity.class), any())).thenReturn(mockLongGauge); // ASYNC_GAUGE (LongSupplier) without tehuti sensor - AsyncMetricEntityState metricEntityState = AsyncMetricEntityStateBase - .create(mockMetricEntity, 
mockOtelRepository, baseDimensionsMap, baseAttributes, () -> 0L); + AsyncMetricEntityState metricEntityState = AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes, + () -> 0L, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); @@ -150,7 +164,8 @@ public void testCreateMetricWithOtelEnabled() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNotNull(metricEntityState.getTehutiSensor()); @@ -160,8 +175,13 @@ public void testCreateMetricWithOtelEnabled() { ObservableDoubleGauge mockDoubleGauge = mock(ObservableDoubleGauge.class); when(mockOtelRepository.registerObservableDoubleGauge(any(MetricEntity.class), any())).thenReturn(mockDoubleGauge); - metricEntityState = AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, (DoubleSupplier) () -> 0.75); + metricEntityState = AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes, + (DoubleSupplier) () -> 0.75, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); @@ -175,27 +195,102 @@ public void testCreateMetricWithOtelEnabled() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0.75, "test")), baseDimensionsMap, baseAttributes, - (DoubleSupplier) () -> 0.75); + (DoubleSupplier) () -> 0.75, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNotNull(metricEntityState.getTehutiSensor()); } + 
@Test + public void testCloseDeregistersAsyncInstrumentAndIsIdempotent() throws Exception { + when(mockMetricEntity.getMetricType()).thenReturn(ASYNC_GAUGE); + ObservableLongGauge mockLongGauge = mock(ObservableLongGauge.class); + when(mockOtelRepository.registerObservableLongGauge(any(MetricEntity.class), any())).thenReturn(mockLongGauge); + + AsyncMetricEntityState state = AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes, + () -> 0L, + CompositeCloseable.NONE); + Assert.assertNotNull(state.getOtelMetric()); + + state.close(); + // SDK callback deregistered exactly once. + verify(mockLongGauge, times(1)).close(); + // Wrapper-side reference released. + assertNull(state.getOtelMetric()); + assertNull(state.getTehutiSensor()); + + // Idempotent: a second close is a no-op (no additional close() call on the SDK handle). + state.close(); + verify(mockLongGauge, times(1)).close(); + } + + @Test + public void testCloseSwallowsExceptionFromMisbehavingSdkInstrument() throws Exception { + when(mockMetricEntity.getMetricType()).thenReturn(ASYNC_GAUGE); + ObservableLongGauge mockLongGauge = mock(ObservableLongGauge.class); + Mockito.doThrow(new RuntimeException("simulated SDK close failure")).when(mockLongGauge).close(); + when(mockOtelRepository.registerObservableLongGauge(any(MetricEntity.class), any())).thenReturn(mockLongGauge); + + AsyncMetricEntityState state = AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes, + () -> 0L, + CompositeCloseable.NONE); + + // Must not propagate; close is best-effort by contract. + state.close(); + assertNull(state.getOtelMetric()); + } + + @Test + public void testCloseOnSyncOtelMetricOnlyClearsWrapperReference() { + // Sync metric type (e.g. COUNTER) — the otelMetric is not AutoCloseable, so close() just nulls the field. 
+ when(mockMetricEntity.getMetricType()).thenReturn(MetricType.COUNTER); + LongCounter mockCounter = mock(LongCounter.class); + when(mockOtelRepository.createInstrument(any(MetricEntity.class))).thenReturn(mockCounter); + + AsyncMetricEntityState state = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); + Assert.assertNotNull(state.getOtelMetric()); + + state.close(); + // Wrapper-side reference released; SDK aggregator stays alive (we don't try to close LongCounter — it isn't + // AutoCloseable). + assertNull(state.getOtelMetric()); + } + @Test public void testValidateRequiredDimensions() { Map baseDimensionsMap = new HashMap<>(); // case 1: right values baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); Attributes baseAttributes1 = getBaseAttributes(baseDimensionsMap); - AsyncMetricEntityState metricEntityState = AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes1, () -> 0L); + AsyncMetricEntityState metricEntityState = AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes1, + () -> 0L, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseAttributes have different count than baseDimensionsMap Attributes baseAttributes2 = Attributes.builder().build(); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes2, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes2, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should have the same size and values")); @@ -206,8 +301,13 @@ public void testValidateRequiredDimensions() { baseAttributes3Map.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); 
Attributes baseAttributes3 = getBaseAttributes(baseAttributes3Map); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes3, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes3, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should contain all the keys and same values as in baseDimensionsMap")); @@ -219,8 +319,13 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); Attributes baseAttributes4 = getBaseAttributes(baseDimensionsMap); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes4, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes4, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -230,8 +335,13 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); Attributes baseAttributes5 = Attributes.builder().build(); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes5, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes5, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -242,8 +352,13 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); Attributes baseAttributes6 = getBaseAttributes(baseDimensionsMap); try { - AsyncMetricEntityStateBase - 
.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes6, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes6, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -256,8 +371,13 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes7, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes7, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should have the same size and values")); @@ -268,8 +388,13 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, null); try { - AsyncMetricEntityStateBase - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes8, () -> 0L); + AsyncMetricEntityStateBase.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + baseAttributes8, + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should contain all the keys and same values as in baseDimensionsMap")); @@ -280,7 +405,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); try { - AsyncMetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null, () -> 0L); + AsyncMetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null, () -> 0L, 
CompositeCloseable.NONE); } catch (IllegalArgumentException e) { fail("baseAttributes can be null when emitting OTel metrics is disabled"); } @@ -289,7 +415,8 @@ public void testValidateRequiredDimensions() { // case 10: baseAttributes is null but emitting OTel metrics is enabled. This should throw exception. try { - AsyncMetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null, () -> 0L); + AsyncMetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null, () -> 0L, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("Base attributes cannot be null")); @@ -314,7 +441,8 @@ public void testValidateMetric() { singletonList(new Count()), // No AsyncGauge in stats baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue( @@ -340,7 +468,8 @@ public void testValidateMetric() { Arrays.asList(new AsyncGauge((ignored, ignored2) -> 0, "test"), new Count()), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue( @@ -363,7 +492,8 @@ public void testValidateMetric() { Arrays.asList(new AsyncGauge((ignored, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 4: MetricType is ASYNC_DOUBLE_GAUGE, but tehuti has Count instead of AsyncGauge @@ -384,7 +514,8 @@ public void testValidateMetric() { singletonList(new Count()), baseDimensionsMap, baseAttributes, - (DoubleSupplier) () -> 0.5); + (DoubleSupplier) () -> 0.5, + CompositeCloseable.NONE); fail("Should throw for ASYNC_DOUBLE_GAUGE with non-AsyncGauge Tehuti stat"); } catch (IllegalArgumentException e) { assertTrue( @@ -407,7 +538,8 @@ public void testEmitTehutiMetricsEnabled() { singletonList(new AsyncGauge((ignored1, ignored2) 
-> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertTrue(metricEntityState.emitTehutiMetrics(), "Should emit Tehuti metrics when enabled"); assertNotNull(metricEntityState.getTehutiSensor(), "Tehuti sensor should be created when enabled"); @@ -425,7 +557,8 @@ public void testEmitTehutiMetricsDisabled() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertFalse(metricEntityState.emitTehutiMetrics(), "Should not emit Tehuti metrics when disabled"); Assert.assertNull(metricEntityState.getTehutiSensor(), "Tehuti sensor should not be created when disabled"); @@ -442,7 +575,8 @@ public void testEmitTehutiMetricsWithNullRepository() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertTrue(metricEntityState.emitTehutiMetrics(), "Should emit Tehuti metrics when repository is null"); assertNotNull( @@ -461,7 +595,8 @@ public void testEmitTehutiMetricsWithNullRegistrationFunction() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertFalse( metricEntityState.emitTehutiMetrics(), @@ -482,7 +617,8 @@ public void testEmitTehutiMetricsWithEmptyStats() { new ArrayList<>(), // Empty stats list baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + CompositeCloseable.NONE); assertFalse(metricEntityState.emitTehutiMetrics(), "Should not emit Tehuti metrics when stats are empty"); Assert.assertNull(metricEntityState.getTehutiSensor(), "Tehuti sensor should not be created when stats are empty"); @@ -502,7 +638,8 @@ public void testEmitTehutiMetricsIndependentOfOtel() { singletonList(new AsyncGauge((ignored1, ignored2) -> 0, "test")), baseDimensionsMap, baseAttributes, - () -> 0L); + () -> 0L, + 
CompositeCloseable.NONE); assertFalse(metricEntityState.emitOpenTelemetryMetrics(), "OTel metrics should be disabled"); assertTrue(metricEntityState.emitTehutiMetrics(), "Tehuti metrics should be enabled independently"); diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnumsTest.java index b8c70b57286..f3607c6a6f8 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnumsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/AsyncMetricEntityStateTwoEnumsTest.java @@ -5,6 +5,7 @@ import static com.linkedin.venice.stats.metrics.MetricType.ASYNC_GAUGE; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; @@ -21,6 +22,7 @@ import com.linkedin.venice.stats.metrics.AsyncMetricResolvers.ValueResolverTwoEnums; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.api.metrics.ObservableDoubleMeasurement; +import io.opentelemetry.api.metrics.ObservableLongGauge; import io.opentelemetry.api.metrics.ObservableLongMeasurement; import java.util.HashMap; import java.util.HashSet; @@ -61,7 +63,8 @@ public void testCreateWithoutOtelRepo() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 1L); + (state, e1, e2) -> 1L, + CompositeCloseable.NONE); assertNotNull(metricState); assertFalse(metricState.emitOpenTelemetryMetrics()); @@ -78,7 +81,8 @@ public void testCreateRegistersExactlyOneObservableGauge() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 1L); + (state, e1, e2) -> 1L, + CompositeCloseable.NONE); 
assertTrue(metricState.emitOpenTelemetryMetrics()); verify(mockOtelRepository, times(1)).registerObservableLongGauge(eq(mockMetricEntity), any()); @@ -111,7 +115,8 @@ public void testCallbackEmitsOnlyWhenLiveStateResolverReturnsNonNull() { DimensionEnum1.class, DimensionEnum2.class, liveStateResolver, - (state, e1, e2) -> 42L); + (state, e1, e2) -> 42L, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); @@ -142,7 +147,8 @@ public void testThrowingResolverForOnePairDoesNotSkipOthers() { DimensionEnum1.class, DimensionEnum2.class, liveStateResolver, - (state, e1, e2) -> 3L); + (state, e1, e2) -> 3L, + CompositeCloseable.NONE); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); callbackCaptor.getValue().accept(measurement); @@ -166,7 +172,8 @@ public void testValueResolverNotInvokedForDormantPairs() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> null, // always dormant - valueResolver); + valueResolver, + CompositeCloseable.NONE); callbackCaptor.getValue().accept(mock(ObservableLongMeasurement.class)); verifyNoInteractions(valueResolver); @@ -186,7 +193,8 @@ public void testCallbackIsInvokedFreshOnEachCollection() { DimensionEnum1.class, DimensionEnum2.class, liveStateResolver, - (state, e1, e2) -> 1L); + (state, e1, e2) -> 1L, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); // First collection: nothing live. 
@@ -219,7 +227,8 @@ public void testAsyncGaugeEmitsTruncatedLong() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 42.9); + (state, e1, e2) -> 42.9, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); @@ -241,7 +250,8 @@ public void testAsyncDoubleGaugeEmitsDoubleDirectly() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 0.75); + (state, e1, e2) -> 0.75, + CompositeCloseable.NONE); Consumer callback = callbackCaptor.getValue(); verify(mockOtelRepository, times(0)).registerObservableLongGauge(eq(mockMetricEntity), any()); @@ -265,7 +275,8 @@ public void testRepoPresentButEmissionDisabledYieldsDisabledInstance() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 1L); + (state, e1, e2) -> 1L, + CompositeCloseable.NONE); assertFalse(metricState.emitOpenTelemetryMetrics()); assertNull(metricState.getAttributesByEnum()); @@ -289,7 +300,8 @@ public void testEachPairEmitsWithItsOwnPrecomputedAttributes() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> e1.ordinal() * 100L + e2.ordinal()); + (state, e1, e2) -> e1.ordinal() * 100L + e2.ordinal(), + CompositeCloseable.NONE); ObservableLongMeasurement measurement = mock(ObservableLongMeasurement.class); callbackCaptor.getValue().accept(measurement); @@ -332,7 +344,8 @@ public void testStateFromLiveStateResolverFlowsIntoValueResolver() { "state mismatch for (" + e1 + "," + e2 + "): got " + state + ", expected " + expected); } return 1L; - }); + }, + CompositeCloseable.NONE); // If any pair crosses state, the valueResolver throws -> callback propagates -> test fails. 
callbackCaptor.getValue().accept(mock(ObservableLongMeasurement.class)); @@ -349,7 +362,76 @@ public void testCreateThrowsOnNonAsyncGaugeMetricType() { DimensionEnum1.class, DimensionEnum2.class, (e1, e2) -> e1, - (state, e1, e2) -> 1L); + (state, e1, e2) -> 1L, + CompositeCloseable.NONE); + } + + @Test + public void testCloseDeregistersInstrumentClearsAttributesAndIsIdempotent() throws Exception { + ObservableLongGauge mockGauge = mock(ObservableLongGauge.class); + when(mockOtelRepository.registerObservableLongGauge(eq(mockMetricEntity), any())).thenReturn(mockGauge); + + AsyncMetricEntityStateTwoEnums state = AsyncMetricEntityStateTwoEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + DimensionEnum2.class, + (e1, e2) -> e1, + (s, e1, e2) -> 1L, + CompositeCloseable.NONE); + assertNotNull(state.getInstrument()); + assertNotNull(state.getAttributesByEnum()); + + state.close(); + verify(mockGauge, times(1)).close(); + assertNull(state.getInstrument()); + assertNull(state.getAttributesByEnum()); + + state.close(); + verify(mockGauge, times(1)).close(); + } + + @Test + public void testCloseWithOtelDisabledIsNoOp() { + when(mockOtelRepository.emitOpenTelemetryMetrics()).thenReturn(false); + + AsyncMetricEntityStateTwoEnums state = AsyncMetricEntityStateTwoEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + DimensionEnum2.class, + (e1, e2) -> e1, + (s, e1, e2) -> 1L, + CompositeCloseable.NONE); + assertFalse(state.emitOpenTelemetryMetrics()); + + state.close(); + state.close(); + } + + @Test + public void testCloseSwallowsExceptionFromMisbehavingSdkInstrument() throws Exception { + ObservableLongGauge mockGauge = mock(ObservableLongGauge.class); + doThrow(new RuntimeException("simulated SDK close failure")).when(mockGauge).close(); + when(mockOtelRepository.registerObservableLongGauge(eq(mockMetricEntity), any())).thenReturn(mockGauge); + 
when(mockMetricEntity.getMetricName()).thenReturn("test_metric"); + + AsyncMetricEntityStateTwoEnums state = AsyncMetricEntityStateTwoEnums.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + DimensionEnum1.class, + DimensionEnum2.class, + (e1, e2) -> e1, + (s, e1, e2) -> 1L, + CompositeCloseable.NONE); + + // Best-effort: the exception is logged at WARN and swallowed. + state.close(); + verify(mockGauge, times(1)).close(); + assertNull(state.getInstrument()); } @SuppressWarnings("unchecked") diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/CompositeCloseableTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/CompositeCloseableTest.java new file mode 100644 index 00000000000..57caeb07284 --- /dev/null +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/CompositeCloseableTest.java @@ -0,0 +1,289 @@ +package com.linkedin.venice.stats.metrics; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.assertSame; +import static org.testng.Assert.assertTrue; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import org.testng.annotations.Test; + + +/** + * Unit tests for {@link CompositeCloseable}. The class is a load-bearing piece of the OTel + * runtime cleanup framework; ~100 production sites depend on the contract verified here. + */ +public class CompositeCloseableTest { + /** Records its index when close() is invoked, so tests can verify ordering. 
*/ + private static final class IndexRecordingCloseable implements Closeable { + private final List closeOrder; + private final int myIndex; + private boolean closed = false; + + IndexRecordingCloseable(List closeOrder, int myIndex) { + this.closeOrder = closeOrder; + this.myIndex = myIndex; + } + + @Override + public void close() { + closed = true; + closeOrder.add(myIndex); + } + + boolean isClosed() { + return closed; + } + } + + private static final class ThrowingCloseable implements Closeable { + private final RuntimeException toThrow; + boolean closeCalled = false; + + ThrowingCloseable(RuntimeException toThrow) { + this.toThrow = toThrow; + } + + @Override + public void close() { + closeCalled = true; + throw toThrow; + } + } + + private static final class CountingCloseable implements Closeable { + int closeCount = 0; + + @Override + public void close() { + closeCount++; + } + } + + @Test + public void testRegisterReturnsResourceForFluentAssignment() { + CompositeCloseable resources = new CompositeCloseable(); + CountingCloseable c = new CountingCloseable(); + Closeable returned = resources.register(c); + assertSame(returned, c, "register() must return the same instance for fluent assignment"); + } + + @Test + public void testRegisterNullIsSilentlyIgnored() { + CompositeCloseable resources = new CompositeCloseable(); + // Must not throw. + Closeable returned = resources.register(null); + assertNull(returned, "register(null) must return null"); + resources.close(); // Must not throw despite no resources. 
+ } + + @Test + public void testCloseInvokesAllRegisteredResourcesInReverseOrder() { + CompositeCloseable resources = new CompositeCloseable(); + List closeOrder = new ArrayList<>(); + IndexRecordingCloseable a = new IndexRecordingCloseable(closeOrder, 0); + IndexRecordingCloseable b = new IndexRecordingCloseable(closeOrder, 1); + IndexRecordingCloseable c = new IndexRecordingCloseable(closeOrder, 2); + resources.register(a); + resources.register(b); + resources.register(c); + + resources.close(); + + assertTrue(a.isClosed()); + assertTrue(b.isClosed()); + assertTrue(c.isClosed()); + assertEquals(closeOrder, Arrays.asList(2, 1, 0), "close() must run in reverse registration order"); + } + + @Test + public void testCloseIsIdempotent() { + CompositeCloseable resources = new CompositeCloseable(); + CountingCloseable c = new CountingCloseable(); + resources.register(c); + + resources.close(); + resources.close(); + resources.close(); + + assertEquals(c.closeCount, 1, "Resource must be closed exactly once across multiple close() calls"); + } + + @Test + public void testThrowingCloseableDoesNotAbortIteration() { + // Verifies the documented contract: "a misbehaving wrapper cannot block shutdown." + // closeQuietly catches Exception, so RuntimeException from one resource must NOT + // skip the close() of the others. + CompositeCloseable resources = new CompositeCloseable(); + CountingCloseable before = new CountingCloseable(); + ThrowingCloseable throwing = new ThrowingCloseable(new IllegalStateException("simulated bad close")); + CountingCloseable after = new CountingCloseable(); + resources.register(before); + resources.register(throwing); + resources.register(after); + + resources.close(); + + assertTrue(throwing.closeCalled, "Throwing resource's close() must have been invoked"); + // Reverse order: after closes first, then throwing (throws), then before must still close. 
+ assertEquals(after.closeCount, 1, "Sibling registered after the throwing resource must still close"); + assertEquals(before.closeCount, 1, "Sibling registered before the throwing resource must still close"); + } + + @Test + public void testThrowingCloseableHandlesRuntimeExceptionTypes() { + // Verifies catch is broad enough for the various runtime exceptions the OTel SDK can + // produce: NPE from a half-init instrument, ClassCastException from a wrong cast, etc. + CompositeCloseable resources = new CompositeCloseable(); + resources.register(new ThrowingCloseable(new NullPointerException("simulated NPE"))); + resources.register(new ThrowingCloseable(new ClassCastException("simulated CCE"))); + resources.register(new ThrowingCloseable(new IllegalStateException("simulated ISE"))); + + // Must not throw; all swallowed by closeQuietly. + resources.close(); + } + + @Test + public void testCloseableThatThrowsIOExceptionIsAlsoSwallowed() { + CompositeCloseable resources = new CompositeCloseable(); + resources.register(() -> { + throw new IOException("simulated IO failure"); + }); + // Must not propagate IOException either. + resources.close(); + } + + @Test + public void testRegisterAfterCloseImmediatelyClosesResource() { + // After close(), the registry is in a closed state. Subsequent register() must close + // the resource immediately rather than tracking it (preventing leaks from racy shutdown + // ordering, e.g., a per-store map populated mid-shutdown). 
+ CompositeCloseable resources = new CompositeCloseable(); + resources.close(); + + CountingCloseable late = new CountingCloseable(); + Closeable returned = resources.register(late); + + assertSame(returned, late, "register() after close still returns the resource for assignment"); + assertEquals(late.closeCount, 1, "Resource registered after close() must be closed immediately"); + } + + @Test + public void testRegisterAfterCloseDoesNotResurrect() { + // Calling close() again after a post-close register() must not re-close the resource. + CompositeCloseable resources = new CompositeCloseable(); + resources.close(); + CountingCloseable late = new CountingCloseable(); + resources.register(late); + + resources.close(); + + assertEquals(late.closeCount, 1, "Resource must not be closed twice on a closed registry"); + } + + @Test + public void testNoneSentinelDoesNotTrack() { + // CompositeCloseable.NONE must be a no-op: register() returns the resource without tracking, + // close() does nothing. Tests rely on this so passing NONE doesn't accumulate state across + // unrelated tests. + CountingCloseable c = new CountingCloseable(); + Closeable returned = CompositeCloseable.NONE.register(c); + assertSame(returned, c, "NONE.register() must return the resource"); + + CompositeCloseable.NONE.close(); + + assertEquals(c.closeCount, 0, "NONE must not track or close the registered resource"); + } + + @Test + public void testNoneIsThreadSafeAndIdempotent() { + // Sanity: NONE has no internal state to corrupt, so this is just a smoke test. + for (int i = 0; i < 100; i++) { + CompositeCloseable.NONE.register(new CountingCloseable()); + CompositeCloseable.NONE.close(); + } + } + + @Test + public void testConcurrentRegisterIsThreadSafe() throws InterruptedException { + // Even though the production usage pattern is "register at construction time" (single-threaded), + // the contract claims thread safety. Verify by stressing register() from multiple threads. 
+ CompositeCloseable resources = new CompositeCloseable(); + int threadCount = 8; + int perThread = 500; + AtomicInteger closeCallCount = new AtomicInteger(); + CountDownLatch start = new CountDownLatch(1); + ExecutorService pool = Executors.newFixedThreadPool(threadCount); + + try { + for (int t = 0; t < threadCount; t++) { + pool.submit(() -> { + start.await(); + for (int i = 0; i < perThread; i++) { + resources.register(closeCallCount::incrementAndGet); + } + return null; + }); + } + start.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(10, TimeUnit.SECONDS)); + + resources.close(); + + assertEquals( + closeCallCount.get(), + threadCount * perThread, + "Every concurrently-registered resource must be closed exactly once"); + } finally { + pool.shutdownNow(); + } + } + + @Test + public void testConcurrentRegisterAndCloseDoesNotLeak() throws InterruptedException { + // Race: one thread closes while another registers. The closed-flag guard ensures any + // resource that races past close() is closed immediately rather than leaking. + int rounds = 100; + for (int round = 0; round < rounds; round++) { + CompositeCloseable resources = new CompositeCloseable(); + AtomicInteger closeCallCount = new AtomicInteger(); + CountDownLatch ready = new CountDownLatch(2); + CountDownLatch fire = new CountDownLatch(1); + ExecutorService pool = Executors.newFixedThreadPool(2); + + pool.submit(() -> { + ready.countDown(); + fire.await(); + for (int i = 0; i < 100; i++) { + resources.register(closeCallCount::incrementAndGet); + } + return null; + }); + pool.submit(() -> { + ready.countDown(); + fire.await(); + resources.close(); + return null; + }); + + ready.await(); + fire.countDown(); + pool.shutdown(); + assertTrue(pool.awaitTermination(5, TimeUnit.SECONDS)); + + // Guarantee: any resource registered before close completed was closed; any registered + // after was closed immediately by the post-close guard. Net: closeCount == 100. 
+ assertEquals(closeCallCount.get(), 100, "All registered resources must be closed even under register/close race"); + } + } +} diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java index c233eca7cb0..b07858ef130 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFiveEnumsTest.java @@ -73,7 +73,8 @@ public void testConstructorWithoutOtelRepo() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertNull(metricEntityState.getMetricAttributesDataEnumMap()); for (MetricEntityStateTest.DimensionEnum1 enum1: MetricEntityStateTest.DimensionEnum1.values()) { @@ -100,7 +101,8 @@ public void testConstructorWithOtelRepo() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertEquals(metricEntityState.getMetricAttributesDataEnumMap().size(), 0); } @@ -115,7 +117,8 @@ public void testCreateAttributesEnumMapWithEmptyEnum() { MetricEntityStateTest.EmptyDimensionEnum.class, MetricEntityStateTest.EmptyDimensionEnum.class, MetricEntityStateTest.EmptyDimensionEnum.class, - MetricEntityStateTest.EmptyDimensionEnum.class); + MetricEntityStateTest.EmptyDimensionEnum.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = 
IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") @@ -129,7 +132,8 @@ public void testGetAttributesWithNullDimension() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(null, null, null, null, null); } @@ -143,7 +147,8 @@ public void testGetAttributesWithInvalidDimensionType() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); metricEntityState.getAttributes( MULTI_GET_STREAMING, MULTI_GET_STREAMING, @@ -172,7 +177,8 @@ public void testConstructorWithDuplicateClasses() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum1Duplicate.class, // duplicate MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*doesn't match with the required dimensions.*") @@ -188,7 +194,8 @@ public void testConstructorWithDuplicateBaseDimension() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); } @Test @@ -202,7 +209,8 @@ public void testGetAttributesWithValidDimension() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + 
MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); // getAttributes will work similarly for all cases as the attributes are either pre created // or on demand @@ -270,7 +278,8 @@ public void testGetAttributesWithDiagonalDimensionCombo() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); Attributes diag1 = metricEntityState.getAttributes( MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, @@ -311,7 +320,8 @@ public void testRecordWithValidDimension() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); metricEntityState.record( 100L, DIMENSION_ONE, @@ -429,7 +439,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -445,7 +456,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -462,7 +474,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, 
MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -480,7 +493,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -498,7 +512,8 @@ public void testGetAllMetricAttributesData() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); // Populate the 5-level EnumMap with 2 distinct entries spanning all enum levels state.record( 100L, @@ -533,7 +548,8 @@ public void testGetAllMetricAttributesData() { MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, MetricEntityStateTest.DimensionEnum4.class, - MetricEntityStateTest.DimensionEnum5.class); + MetricEntityStateTest.DimensionEnum5.class, + CompositeCloseable.NONE); assertNull(disabled.getAllMetricAttributesData()); } } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnumsTest.java index f52b83facab..4098a97e9ba 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnumsTest.java +++ 
b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateFourEnumsTest.java @@ -69,7 +69,8 @@ public void testConstructorWithoutOtelRepo() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertNull(metricEntityState.getMetricAttributesDataEnumMap()); for (MetricEntityStateTest.DimensionEnum1 enum1: MetricEntityStateTest.DimensionEnum1.values()) { @@ -93,7 +94,8 @@ public void testConstructorWithOtelRepo() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertEquals(metricEntityState.getMetricAttributesDataEnumMap().size(), 0); } @@ -107,7 +109,8 @@ public void testCreateAttributesEnumMapWithEmptyEnum() { MetricEntityStateTest.EmptyDimensionEnum.class, MetricEntityStateTest.EmptyDimensionEnum.class, MetricEntityStateTest.EmptyDimensionEnum.class, - MetricEntityStateTest.EmptyDimensionEnum.class); + MetricEntityStateTest.EmptyDimensionEnum.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") @@ -120,7 +123,8 @@ public void testGetAttributesWithNullDimension() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(null, null, null, null); } @@ -133,7 +137,8 @@ public void 
testGetAttributesWithInvalidDimensionType() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(MULTI_GET_STREAMING, MULTI_GET_STREAMING, MULTI_GET_STREAMING, MULTI_GET_STREAMING); } @@ -155,7 +160,8 @@ public void testConstructorWithDuplicateClasses() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum1Duplicate.class, // duplicate - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*doesn't match with the required dimensions.*") @@ -170,7 +176,8 @@ public void testConstructorWithDuplicateBaseDimension() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); } @Test @@ -183,7 +190,8 @@ public void testGetAttributesWithValidDimension() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); // getAttributes will work similarly for all cases as the attributes are either pre created // or on demand @@ -240,7 +248,8 @@ public void testRecordWithValidDimension() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); 
metricEntityState.record( 100L, DIMENSION_ONE, @@ -344,7 +353,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -359,7 +369,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -375,7 +386,8 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -392,10 +404,12 @@ public void testValidateRequiredDimensions() { MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, MetricEntityStateTest.DimensionEnum3.class, - MetricEntityStateTest.DimensionEnum4.class); + MetricEntityStateTest.DimensionEnum4.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); } } + } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateGenericTest.java 
b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateGenericTest.java index e870e2c6436..43ef9ed0808 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateGenericTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateGenericTest.java @@ -94,8 +94,8 @@ public void testRecordOtelMetricCounter() { LongCounter longCounter = mock(LongCounter.class); when(mockMetricEntity.getMetricType()).thenReturn(MetricType.COUNTER); - MetricEntityState metricEntityState = - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityState metricEntityState = MetricEntityStateGeneric + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); metricEntityState.setOtelMetric(longCounter); Attributes attributes = Attributes.builder().put("key", "value").build(); @@ -109,8 +109,8 @@ public void testRecordMetricsWithBothOtelAndTehuti() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); - MetricEntityStateGeneric metricEntityState = - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric metricEntityState = MetricEntityStateGeneric + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); metricEntityState.setOtelMetric(doubleHistogram); metricEntityState.setTehutiSensor(mockSensor); @@ -145,8 +145,8 @@ public void testGetAttributes() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); - MetricEntityStateGeneric metricEntityState = - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric metricEntityState = MetricEntityStateGeneric + 
.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); metricEntityState.setOtelMetric(doubleHistogram); metricEntityState.setTehutiSensor(mockSensor); @@ -190,13 +190,13 @@ public void testGetAttributes() { @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*does not support ASYNC_COUNTER_FOR_HIGH_PERF_CASES.*") public void testAsyncCounterNotSupported() { when(mockMetricEntity.getMetricType()).thenReturn(MetricType.ASYNC_COUNTER_FOR_HIGH_PERF_CASES); - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*does not support ASYNC_UP_DOWN_COUNTER_FOR_HIGH_PERF_CASES.*") public void testAsyncUpDownCounterNotSupported() { when(mockMetricEntity.getMetricType()).thenReturn(MetricType.ASYNC_UP_DOWN_COUNTER_FOR_HIGH_PERF_CASES); - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); } @Test @@ -204,8 +204,8 @@ public void testValidateRequiredDimensions() { Map baseDimensionsMap = new HashMap<>(); // case 1: right values baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); - MetricEntityStateGeneric metricEntityState = - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric metricEntityState = MetricEntityStateGeneric + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -214,7 +214,7 @@ public void testValidateRequiredDimensions() { 
baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains invalid dimension"), e.getMessage()); @@ -222,14 +222,15 @@ public void testValidateRequiredDimensions() { // case 3: baseDimensionsMap has less values baseDimensionsMap.clear(); - metricEntityState = MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + metricEntityState = MetricEntityStateGeneric + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 4: baseDimensionsMap has same count, but different dimensions baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains invalid dimension"), e.getMessage()); @@ -239,7 +240,7 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, null); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains a null or empty value for dimension"), e.getMessage()); @@ -249,7 +250,7 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); 
baseDimensionsMap.put(VENICE_REQUEST_METHOD, ""); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains a null or empty value for dimension"), e.getMessage()); @@ -261,7 +262,7 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_STORE_NAME, "store1"); baseDimensionsMap.put(VENICE_CLUSTER_NAME, "cluster1"); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains all or more dimensions than required"), e.getMessage()); @@ -270,7 +271,7 @@ public void testValidateRequiredDimensions() { // case 8: baseDimensionsMap has more keys baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); try { - MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap); + MetricEntityStateGeneric.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains all or more dimensions than required"), e.getMessage()); diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnumTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnumTest.java index 09ea3d4abf1..08e035fce4d 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnumTest.java +++ 
b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateOneEnumTest.java @@ -47,8 +47,12 @@ public void setUp() { @Test public void testConstructorWithoutOtelRepo() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, null, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + null, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertNull(metricEntityState.getMetricAttributesDataEnumMap()); assertNull(metricEntityState.getAttributes(MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE)); @@ -57,8 +61,12 @@ public void testConstructorWithoutOtelRepo() { @Test public void testConstructorWithOtelRepo() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); EnumMap metricAttributesDataEnumMap = metricEntityState.getMetricAttributesDataEnumMap(); @@ -71,20 +79,29 @@ public void testCreateAttributesEnumMapWithEmptyEnum() { mockMetricEntity, mockOtelRepository, baseDimensionsMap, - MetricEntityStateTest.EmptyDimensionEnum.class); + MetricEntityStateTest.EmptyDimensionEnum.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") public void testGetAttributesWithNullDimension() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, 
baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(null); } @Test(expectedExceptions = ClassCastException.class) public void testGetAttributesWithInvalidDimensionType() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(MULTI_GET_STREAMING); } @@ -93,14 +110,22 @@ public void testConstructorWithDuplicateBaseDimension() { Map baseDimensionsMap = new HashMap<>(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); baseDimensionsMap.put(DIMENSION_ONE.getDimensionName(), DIMENSION_ONE.getDimensionValue()); // duplicate - MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); } @Test public void testGetAttributesWithValidDimension() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); // getAttributes will 
work similarly for all cases as the attributes are either pre created // or on demand @@ -129,8 +154,12 @@ public void testGetAttributesWithValidDimension() { @Test public void testRecordWithValidDimension() { - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); // record attempt 1 metricEntityState.record(100L, MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE); @@ -183,8 +212,12 @@ public void testValidateRequiredDimensions() { Map baseDimensionsMap = new HashMap<>(); // case 1: right values baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); - MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum metricEntityState = MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -193,8 +226,12 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); try { - MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the 
required dimensions")); @@ -203,8 +240,12 @@ public void testValidateRequiredDimensions() { // case 3: baseDimensionsMap has less values baseDimensionsMap.clear(); try { - MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -214,8 +255,12 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_RETRY_ABORT_REASON, SLOW_ROUTE.getDimensionValue()); try { - MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -225,8 +270,12 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, null); try { - MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains a null or empty value for dimension")); @@ -236,11 +285,16 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, ""); try { - 
MetricEntityStateOneEnum - .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class); + MetricEntityStateOneEnum.create( + mockMetricEntity, + mockOtelRepository, + baseDimensionsMap, + MetricEntityStateTest.DimensionEnum1.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("contains a null or empty value for dimension")); } } + } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java index 32aabdfabfc..3b02f9ccf5e 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTest.java @@ -14,10 +14,12 @@ import static org.mockito.Mockito.doCallRealMethod; import static org.mockito.Mockito.doReturn; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; import static org.testng.Assert.assertNotNull; +import static org.testng.Assert.assertNull; import static org.testng.Assert.assertTrue; import static org.testng.Assert.fail; @@ -101,8 +103,8 @@ public void testCreateMetricWithOtelDisabled() { when(mockOtelRepository.createInstrument(mockMetricEntity)).thenReturn(longCounter); // without tehuti sensor - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, null, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, null, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); 
Assert.assertNull(metricEntityState.getTehutiSensor()); // No Tehuti sensors added @@ -110,7 +112,8 @@ public void testCreateMetricWithOtelDisabled() { Assert.assertNull(((MetricEntityStateBase) metricEntityState).getAttributes()); // without tehuti sensor with empty attributes - metricEntityState = MetricEntityStateBase.create(mockMetricEntity, null, baseDimensionsMap, null); + metricEntityState = + MetricEntityStateBase.create(mockMetricEntity, null, baseDimensionsMap, null, CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); // No Tehuti sensors added @@ -124,7 +127,8 @@ public void testCreateMetricWithOtelDisabled() { TestTehutiMetricNameEnum.TEST_METRIC, singletonList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNull(metricEntityState.getOtelMetric()); Assert.assertNotNull(metricEntityState.getTehutiSensor()); @@ -139,8 +143,8 @@ public void testCreateMetricWithOtelEnabled() { when(mockOtelRepository.createInstrument(mockMetricEntity)).thenReturn(longCounter); // without tehuti sensor - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); // No Tehuti sensors added @@ -148,7 +152,8 @@ public void testCreateMetricWithOtelEnabled() { // without tehuti sensor but with empty attributes try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null); + MetricEntityStateBase + 
.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, null, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { Assert.assertTrue(e.getMessage().contains("Base attributes cannot be null for MetricEntityStateBase")); @@ -163,7 +168,8 @@ public void testCreateMetricWithOtelEnabled() { TestTehutiMetricNameEnum.TEST_METRIC, singletonList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNotNull(metricEntityState.getTehutiSensor()); @@ -178,7 +184,8 @@ public void testCreateMetricWithOtelEnabled() { TestTehutiMetricNameEnum.TEST_METRIC, singletonList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + CompositeCloseable.NONE); Assert.assertNotNull(metricEntityState); Assert.assertNotNull(metricEntityState.getOtelMetric()); Assert.assertNull(metricEntityState.getTehutiSensor()); @@ -191,8 +198,8 @@ public void testRecordOtelMetricHistogram() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setOtelMetric(doubleHistogram); Attributes attributes = Attributes.builder().put("key", "value").build(); @@ -206,8 +213,8 @@ public void testRecordOtelMetricCounter() { LongCounter longCounter = mock(LongCounter.class); when(mockMetricEntity.getMetricType()).thenReturn(MetricType.COUNTER); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = 
MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setOtelMetric(longCounter); Attributes attributes = Attributes.builder().put("key", "value").build(); @@ -221,8 +228,8 @@ public void testRecordOtelMetricUpDownCounter() { LongUpDownCounter longUpDownCounter = mock(LongUpDownCounter.class); when(mockMetricEntity.getMetricType()).thenReturn(MetricType.UP_DOWN_COUNTER); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setOtelMetric(longUpDownCounter); Attributes attributes = Attributes.builder().put("key", "value").build(); @@ -241,8 +248,8 @@ public void testRecordOtelMetricGauge() { LongGauge longGauge = mock(LongGauge.class); when(mockMetricEntity.getMetricType()).thenReturn(MetricType.GAUGE); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setOtelMetric(longGauge); Attributes attributes = Attributes.builder().put("key", "value").build(); @@ -258,8 +265,8 @@ public void testRecordOtelMetricGauge() { @Test public void testRecordTehutiMetric() { - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setTehutiSensor(mockSensor); 
metricEntityState.recordTehutiMetric(15.0); verify(mockSensor, times(1)).record(15.0); @@ -270,8 +277,8 @@ public void testRecordMetricsWithBothOtelAndTehuti() { DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); metricEntityState.setOtelMetric(doubleHistogram); metricEntityState.setTehutiSensor(mockSensor); @@ -298,14 +305,15 @@ public void testValidateRequiredDimensions() { // case 1: right values baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); Attributes baseAttributes1 = getBaseAttributes(baseDimensionsMap); - MetricEntityState metricEntityState = - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes1); + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes1, CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseAttributes have different count than baseDimensionsMap Attributes baseAttributes2 = Attributes.builder().build(); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes2); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes2, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should have the same size and values")); @@ -316,7 +324,8 @@ public void testValidateRequiredDimensions() { baseAttributes3Map.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); Attributes baseAttributes3 = 
getBaseAttributes(baseAttributes3Map); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes3); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes3, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should contain all the keys and same values as in baseDimensionsMap")); @@ -328,7 +337,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); Attributes baseAttributes4 = getBaseAttributes(baseDimensionsMap); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes4); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes4, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -338,7 +348,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); Attributes baseAttributes5 = Attributes.builder().build(); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes5); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes5, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -349,7 +360,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.put(VENICE_REQUEST_RETRY_TYPE, ERROR_RETRY.getDimensionValue()); Attributes baseAttributes6 = getBaseAttributes(baseDimensionsMap); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes6); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes6, 
CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -362,7 +374,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, MULTI_GET_STREAMING.getDimensionValue()); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes7); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes7, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should have the same size and values")); @@ -373,7 +386,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap.clear(); baseDimensionsMap.put(VENICE_REQUEST_METHOD, null); try { - MetricEntityStateBase.create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes8); + MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes8, CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("should contain all the keys and same values as in baseDimensionsMap")); @@ -398,7 +412,8 @@ public void testValidateMetric() { TestTehutiMetricNameEnum.TEST_METRIC, singletonList(new AsyncGauge((ignored, ignored2) -> 0, "test")), baseDimensionsMap, - baseAttributes); // No async callback provided + baseAttributes, + CompositeCloseable.NONE); // No async callback provided fail(); } catch (IllegalArgumentException e) { assertTrue( @@ -421,7 +436,8 @@ public void testValidateMetric() { TestTehutiMetricNameEnum.TEST_METRIC, singletonList(new AsyncGauge((ignored, ignored2) -> 0, "test")), baseDimensionsMap, - baseAttributes); + baseAttributes, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue( @@ -431,6 +447,49 @@ public void testValidateMetric() { } } + @Test + public void 
testRecordAfterCloseIsNoOpOnBase() { + DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); + when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); + + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); + metricEntityState.setOtelMetric(doubleHistogram); + metricEntityState.setTehutiSensor(mockSensor); + + Attributes attributes = Attributes.builder().put("key", "value").build(); + metricEntityState.record(7.0, attributes); + verify(doubleHistogram, times(1)).record(7.0, attributes); + verify(mockSensor, times(1)).record(7.0); + + metricEntityState.close(); + assertNull(metricEntityState.getOtelMetric()); + + // Post-close: both Tehuti and OTel paths must be silent no-ops. + metricEntityState.record(8.0, attributes); + metricEntityState.recordOtelMetric(9.0, new MetricAttributesData(attributes)); + metricEntityState.recordTehutiMetric(10.0); + verify(doubleHistogram, never()).record(8.0, attributes); + verify(doubleHistogram, never()).record(9.0, attributes); + verify(mockSensor, never()).record(8.0); + verify(mockSensor, never()).record(10.0); + } + + @Test + public void testRecordOtelMetricWithNullHolderIsNoOp() { + DoubleHistogram doubleHistogram = mock(DoubleHistogram.class); + when(mockMetricEntity.getMetricType()).thenReturn(HISTOGRAM); + + MetricEntityState metricEntityState = MetricEntityStateBase + .create(mockMetricEntity, mockOtelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); + metricEntityState.setOtelMetric(doubleHistogram); + + // Holder null guard: must not throw or invoke the SDK. 
+ metricEntityState.recordOtelMetric(1.0, null); + metricEntityState.recordOtelMetric(2L, null); + verify(doubleHistogram, never()).record(any(double.class), any()); + } + private Attributes getBaseAttributes(Map inputMap) { AttributesBuilder builder = Attributes.builder(); for (Map.Entry entry: inputMap.entrySet()) { diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnumsTest.java index 75937b99685..3d59259187f 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnumsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateThreeEnumsTest.java @@ -75,7 +75,8 @@ public void testConstructorWithoutOtelRepo() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertNull(metricEntityState.getMetricAttributesDataEnumMap()); for (MetricEntityStateTest.DimensionEnum1 enum1: MetricEntityStateTest.DimensionEnum1.values()) { @@ -96,7 +97,8 @@ public void testConstructorWithOtelRepo() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertEquals(metricEntityState.getMetricAttributesDataEnumMap().size(), 0); } @@ -109,7 +111,8 @@ public void testCreateAttributesEnumMapWithEmptyEnum() { baseDimensionsMap, MetricEntityStateTest.EmptyDimensionEnum.class, MetricEntityStateTest.EmptyDimensionEnum.class, - MetricEntityStateTest.EmptyDimensionEnum.class); + 
MetricEntityStateTest.EmptyDimensionEnum.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") @@ -121,7 +124,8 @@ public void testGetAttributesWithNullDimension() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(null, null, null); } @@ -133,7 +137,8 @@ public void testGetAttributesWithInvalidDimensionType() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(MULTI_GET_STREAMING, MULTI_GET_STREAMING, MULTI_GET_STREAMING); } @@ -153,7 +158,8 @@ public void testConstructorWithDuplicateClasses() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum1Duplicate.class); // duplicate + MetricEntityStateTest.DimensionEnum1Duplicate.class, + CompositeCloseable.NONE); // duplicate } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*doesn't match with the required dimensions.*") @@ -167,7 +173,8 @@ public void testConstructorWithDuplicateBaseDimension() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); } @Test @@ -179,7 +186,8 @@ public void testGetAttributesWithValidDimension() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - 
MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); // getAttributes will work similarly for all cases as the attributes are either pre created // or on demand @@ -229,7 +237,8 @@ public void testRecordWithValidDimension() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); metricEntityState.record( 100L, DIMENSION_ONE, @@ -314,7 +323,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -328,7 +338,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -343,7 +354,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -359,7 +371,8 @@ public void testValidateRequiredDimensions() { baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, MetricEntityStateTest.DimensionEnum2.class, - 
MetricEntityStateTest.DimensionEnum3.class); + MetricEntityStateTest.DimensionEnum3.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -393,7 +406,8 @@ public void testConcurrentAccess() throws InterruptedException { baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); for (HttpResponseStatusEnum enum1: HttpResponseStatusEnum.values()) { for (HttpResponseStatusCodeCategory enum2: HttpResponseStatusCodeCategory.values()) { for (RequestType enum3: RequestType.values()) { @@ -413,7 +427,8 @@ public void testConcurrentAccess() throws InterruptedException { baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - RequestType.class); + RequestType.class, + CompositeCloseable.NONE); int writerThreads = 10; int readerThreads = 10; @@ -496,4 +511,5 @@ public void testConcurrentAccess() throws InterruptedException { } } } + } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnumsTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnumsTest.java index 45ed1156159..eadec8d605a 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnumsTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricEntityStateTwoEnumsTest.java @@ -60,7 +60,8 @@ public void testConstructorWithoutOtelRepo() { null, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); assertNull(metricEntityState.getMetricAttributesDataEnumMap()); for (MetricEntityStateTest.DimensionEnum1 
enum1: MetricEntityStateTest.DimensionEnum1.values()) { @@ -78,7 +79,8 @@ public void testConstructorWithOtelRepo() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); EnumMap> metricAttributesDataEnumMap = metricEntityState.getMetricAttributesDataEnumMap(); @@ -92,7 +94,8 @@ public void testCreateAttributesEnumMapWithEmptyEnum() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.EmptyDimensionEnum.class, - MetricEntityStateTest.EmptyDimensionEnum.class); + MetricEntityStateTest.EmptyDimensionEnum.class, + CompositeCloseable.NONE); } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = "The input Otel dimension cannot be null.*") @@ -103,7 +106,8 @@ public void testGetAttributesWithNullDimension() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(null, null); } @@ -114,7 +118,8 @@ public void testGetAttributesWithInvalidDimensionType() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); metricEntityState.getAttributes(MULTI_GET_STREAMING, MULTI_GET_STREAMING); } @@ -132,7 +137,8 @@ public void testConstructorWithDuplicateClasses() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum1Duplicate.class); // duplicate + MetricEntityStateTest.DimensionEnum1Duplicate.class, + CompositeCloseable.NONE); // duplicate } @Test(expectedExceptions = IllegalArgumentException.class, expectedExceptionsMessageRegExp = ".*doesn't match with the 
required dimensions.*") @@ -145,7 +151,8 @@ public void testConstructorWithDuplicateBaseDimension() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); } @Test @@ -156,7 +163,8 @@ public void testGetAttributesWithValidDimension() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); // getAttributes will work similarly for all cases as the attributes are either pre created // or on demand @@ -197,7 +205,8 @@ public void testRecordWithValidDimension() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); metricEntityState.record( 100L, MetricEntityStateTest.DimensionEnum1.DIMENSION_ONE, @@ -272,7 +281,8 @@ public void testValidateRequiredDimensions() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); assertNotNull(metricEntityState); // case 2: baseDimensionsMap has extra values @@ -285,7 +295,8 @@ public void testValidateRequiredDimensions() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -299,7 +310,8 @@ public void testValidateRequiredDimensions() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); 
+ MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); @@ -314,10 +326,12 @@ public void testValidateRequiredDimensions() { mockOtelRepository, baseDimensionsMap, MetricEntityStateTest.DimensionEnum1.class, - MetricEntityStateTest.DimensionEnum2.class); + MetricEntityStateTest.DimensionEnum2.class, + CompositeCloseable.NONE); fail(); } catch (IllegalArgumentException e) { assertTrue(e.getMessage().contains("doesn't match with the required dimensions")); } } + } diff --git a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricTypeTest.java b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricTypeTest.java index f3c64ec9cf2..403d977245e 100644 --- a/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricTypeTest.java +++ b/internal/venice-client-common/src/test/java/com/linkedin/venice/stats/metrics/MetricTypeTest.java @@ -82,8 +82,12 @@ public void testOTelRecordCounter() { InMemoryMetricReader inMemoryMetricReader = InMemoryMetricReader.create(); VeniceOpenTelemetryMetricsRepository otelMetricsRepository = createOtelRepo(metricEntityCounter, inMemoryMetricReader); - MetricEntityStateBase metricEntityStateBaseCounter = MetricEntityStateBase - .create(metricEntityCounter, otelMetricsRepository, getBaseDimensionsMap(), getBaseAttributes()); + MetricEntityStateBase metricEntityStateBaseCounter = MetricEntityStateBase.create( + metricEntityCounter, + otelMetricsRepository, + getBaseDimensionsMap(), + getBaseAttributes(), + CompositeCloseable.NONE); int[] values = { 10, 20, 30, 40, 50 }; for (int value: values) { metricEntityStateBaseCounter.record(value); @@ -112,8 +116,12 @@ public void testOTelRecordUpDownCounter() { InMemoryMetricReader inMemoryMetricReader = InMemoryMetricReader.create(); VeniceOpenTelemetryMetricsRepository 
otelMetricsRepository = createOtelRepo(metricEntityCounter, inMemoryMetricReader); - MetricEntityStateBase metricEntityStateBaseCounter = MetricEntityStateBase - .create(metricEntityCounter, otelMetricsRepository, getBaseDimensionsMap(), getBaseAttributes()); + MetricEntityStateBase metricEntityStateBaseCounter = MetricEntityStateBase.create( + metricEntityCounter, + otelMetricsRepository, + getBaseDimensionsMap(), + getBaseAttributes(), + CompositeCloseable.NONE); int value = 50; metricEntityStateBaseCounter.record(value); @@ -139,8 +147,12 @@ public void testOTelRecordHistogram(boolean isExponentialHistogram) { InMemoryMetricReader inMemoryMetricReader = InMemoryMetricReader.create(); VeniceOpenTelemetryMetricsRepository otelMetricsRepository = createOtelRepo(metricEntityHistogram, inMemoryMetricReader); - MetricEntityStateBase metricEntityStateBaseHistogram = MetricEntityStateBase - .create(metricEntityHistogram, otelMetricsRepository, getBaseDimensionsMap(), getBaseAttributes()); + MetricEntityStateBase metricEntityStateBaseHistogram = MetricEntityStateBase.create( + metricEntityHistogram, + otelMetricsRepository, + getBaseDimensionsMap(), + getBaseAttributes(), + CompositeCloseable.NONE); int[] values = { 10, 20, 30, 40, 50 }; for (int value: values) { metricEntityStateBaseHistogram.record(value); @@ -186,8 +198,12 @@ public void testOTelRecordGauge(boolean exportLastRecordedValueForSynchronousGau DefaultAggregationSelector.getDefault()); VeniceOpenTelemetryMetricsRepository otelMetricsRepository = createOtelRepo(metricEntityGauge, inMemoryMetricReader); - MetricEntityStateBase metricEntityStateBaseGauge = MetricEntityStateBase - .create(metricEntityGauge, otelMetricsRepository, getBaseDimensionsMap(), getBaseAttributes()); + MetricEntityStateBase metricEntityStateBaseGauge = MetricEntityStateBase.create( + metricEntityGauge, + otelMetricsRepository, + getBaseDimensionsMap(), + getBaseAttributes(), + CompositeCloseable.NONE); 
metricEntityStateBaseGauge.record(10L); metricEntityStateBaseGauge.record(20L); // validate the last recorded value is 20L: Note that the validate method calls collectAllMetrics() which is @@ -234,8 +250,13 @@ public void testOTelRecordAsyncGauge() { final long[] gaugeValue = { 100L }; LongSupplier supplier = () -> gaugeValue[0]; - AsyncMetricEntityStateBase - .create(metricEntityAsyncGauge, otelMetricsRepository, getBaseDimensionsMap(), getBaseAttributes(), supplier); + AsyncMetricEntityStateBase.create( + metricEntityAsyncGauge, + otelMetricsRepository, + getBaseDimensionsMap(), + getBaseAttributes(), + supplier, + CompositeCloseable.NONE); Collection metrics = inMemoryMetricReader.collectAllMetrics(); assertFalse(metrics.isEmpty(), "Metrics should not be empty"); diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/meta/RetryManager.java b/internal/venice-common/src/main/java/com/linkedin/venice/meta/RetryManager.java index 6485ec16e5c..2dd6d511db2 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/meta/RetryManager.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/meta/RetryManager.java @@ -3,7 +3,9 @@ import com.linkedin.venice.read.RequestType; import com.linkedin.venice.stats.RetryManagerStats; import com.linkedin.venice.throttle.TokenBucket; +import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; import java.time.Clock; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; @@ -14,7 +16,7 @@ import org.apache.logging.log4j.Logger; -public class RetryManager { +public class RetryManager implements Closeable { private static final Logger LOGGER = LogManager.getLogger(RetryManager.class); private static final int TOKEN_BUCKET_REFILL_INTERVAL_IN_SECONDS = 1; /** @@ -157,4 +159,10 @@ private void updateTokenBucket(long newQPS) { public TokenBucket getRetryTokenBucket() { return retryTokenBucket.get(); } + + /** 
Releases the OTel resources owned by {@link RetryManagerStats}. */ + @Override + public void close() { + Utils.closeQuietlyWithErrorLogged(retryManagerStats); + } } diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java index ed23d6e9fd5..1fadc27bbb0 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStats.java @@ -2,6 +2,7 @@ import com.linkedin.venice.meta.ReadOnlyStoreRepository; import com.linkedin.venice.meta.StoreDataChangedListener; +import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; @@ -43,16 +44,21 @@ public T getStoreStats(String storeName) { @Override public void handleStoreDeleted(String storeName) { + // Remove first so a subsequent getStoreStats(storeName) does not return a now-closed instance — + // computeIfAbsent inside the parent will recreate fresh stats if the store is re-created later. + T stats = super.storeStats.remove(storeName); + if (stats == null) { + return; + } if (isUnregisterMetricForDeletedStoreEnabled) { - T stats = super.storeStats.get(storeName); - if (stats != null) { - stats.unregisterAllSensors(); - } + stats.unregisterAllSensors(); } + // OTel close is unconditional — independent of the Tehuti unregister flag. 
+ Utils.closeQuietlyWithErrorLogged(stats); } private void registerStoreDataChangedListenerIfRequired(ReadOnlyStoreRepository metadataRepository) { - if (isUnregisterMetricForDeletedStoreEnabled) { + if (metadataRepository != null) { metadataRepository.registerStoreDataChangedListener(this); } } diff --git a/internal/venice-common/src/main/java/com/linkedin/venice/stats/RetryManagerStats.java b/internal/venice-common/src/main/java/com/linkedin/venice/stats/RetryManagerStats.java index 22a80d89785..591fd35a169 100644 --- a/internal/venice-common/src/main/java/com/linkedin/venice/stats/RetryManagerStats.java +++ b/internal/venice-common/src/main/java/com/linkedin/venice/stats/RetryManagerStats.java @@ -72,7 +72,8 @@ public RetryManagerStats( () -> { TokenBucket bucket = retryManager.getRetryTokenBucket(); return bucket == null ? -1 : (long) bucket.getAmortizedRefillPerSecond(); - }); + }, + resources); this.retriesRemaining = AsyncMetricEntityStateBase.create( RETRY_RATE_LIMIT_REMAINING_TOKENS.getMetricEntity(), @@ -88,7 +89,8 @@ public RetryManagerStats( () -> { TokenBucket bucket = retryManager.getRetryTokenBucket(); return bucket == null ? 
-1 : bucket.getStaleTokenCount(); - }); + }, + resources); this.rejectedRetry = MetricEntityStateBase.create( RETRY_RATE_LIMIT_REJECTION_COUNT.getMetricEntity(), @@ -97,7 +99,8 @@ public RetryManagerStats( RetryManagerTehutiMetricName.REJECTED_RETRY, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordRejectedRetry(int count) { diff --git a/internal/venice-common/src/test/java/com/linkedin/venice/meta/RetryManagerTest.java b/internal/venice-common/src/test/java/com/linkedin/venice/meta/RetryManagerTest.java index e054a012ebd..0ce4d5c3847 100644 --- a/internal/venice-common/src/test/java/com/linkedin/venice/meta/RetryManagerTest.java +++ b/internal/venice-common/src/test/java/com/linkedin/venice/meta/RetryManagerTest.java @@ -253,4 +253,29 @@ public void testRetryManagerOtelStats() { RETRY_RATE_LIMIT_REMAINING_TOKENS.getMetricEntity().getMetricName(), ClientType.FAST_CLIENT.getMetricsPrefix()); } + + /** + * Exercises {@link RetryManager#close()} — it must be idempotent and safe to call even when no + * {@link RetryManagerStats} was wired in (e.g., disabled retry manager). The close flows through + * {@code Utils.closeQuietlyWithErrorLogged}, which tolerates null. + */ + @Test(timeOut = TEST_TIMEOUT_IN_MS) + public void testCloseIsSafeWhenDisabled() throws Exception { + Clock mockClock = mock(Clock.class); + doReturn(System.currentTimeMillis()).when(mockClock).millis(); + MetricsRepository metricsRepository = MetricsRepositoryUtils.createSingleThreadedMetricsRepository(); + RetryManager retryManager = new RetryManager( + metricsRepository, + "test-retry-manager-close", + "test-store", + null, + 0, + 0.1d, + mockClock, + scheduler); + retryManager.close(); + // Second close must be a silent no-op. 
+ retryManager.close(); + metricsRepository.close(); + } } diff --git a/internal/venice-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStatsBranchCoverageTest.java b/internal/venice-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStatsBranchCoverageTest.java new file mode 100644 index 00000000000..8e42336d7db --- /dev/null +++ b/internal/venice-common/src/test/java/com/linkedin/venice/stats/AbstractVeniceAggStoreStatsBranchCoverageTest.java @@ -0,0 +1,97 @@ +package com.linkedin.venice.stats; + +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.spy; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.testng.Assert.assertNull; + +import com.linkedin.venice.meta.ReadOnlyStoreRepository; +import io.tehuti.metrics.MetricsRepository; +import org.testng.annotations.Test; + + +/** + * Branch-coverage tests for {@link AbstractVeniceAggStoreStats} — exercises the new + * {@code handleStoreDeleted} early-return / unconditional-OTel-close paths and the + * {@code registerStoreDataChangedListenerIfRequired} null-guard added in this PR. + */ +public class AbstractVeniceAggStoreStatsBranchCoverageTest { + private static final String CLUSTER = "test-cluster"; + private static final String STORE = "test-store"; + + /** Minimal concrete stats class so we can instantiate {@code AbstractVeniceAggStoreStats}. 
*/ + private static class TestStats extends AbstractVeniceStats { + TestStats(MetricsRepository metricsRepository, String name) { + super(metricsRepository, name); + } + } + + private static class TestAggStats extends AbstractVeniceAggStoreStats { + TestAggStats( + MetricsRepository metricsRepository, + ReadOnlyStoreRepository metadataRepository, + boolean isUnregisterEnabled) { + super(CLUSTER, metricsRepository, metadataRepository, isUnregisterEnabled); + } + } + + @Test + public void testRegisterListenerOnConstructionWhenRepoNonNull() { + MetricsRepository metrics = new MetricsRepository(); + ReadOnlyStoreRepository repo = mock(ReadOnlyStoreRepository.class); + TestAggStats stats = new TestAggStats(metrics, repo, true); + verify(repo).registerStoreDataChangedListener(stats); + } + + @Test + public void testRegisterListenerSkippedWhenRepoIsNull() { + MetricsRepository metrics = new MetricsRepository(); + // Null repo path — constructor must not NPE and must not attempt registration. + new TestAggStats(metrics, null, true); + // (No throw == pass for this branch.) + } + + @Test + public void testHandleStoreDeletedEarlyReturnsWhenStoreUnknown() { + MetricsRepository metrics = new MetricsRepository(); + ReadOnlyStoreRepository repo = mock(ReadOnlyStoreRepository.class); + TestAggStats stats = new TestAggStats(metrics, repo, true); + // Store was never registered; handleStoreDeleted must short-circuit. + stats.handleStoreDeleted("never-seen-store"); + // No assertion needed beyond "did not throw" — the branch is covered. 
+ } + + @Test + public void testHandleStoreDeletedUnregistersSensorsWhenEnabled() { + MetricsRepository metrics = new MetricsRepository(); + ReadOnlyStoreRepository repo = mock(ReadOnlyStoreRepository.class); + TestAggStats stats = new TestAggStats(metrics, repo, true); + TestStats storeStats = spy(new TestStats(metrics, STORE)); + stats.storeStats.put(STORE, storeStats); + + stats.handleStoreDeleted(STORE); + + verify(storeStats, times(1)).unregisterAllSensors(); + // Entry must be removed from the map so a re-creation of the same store gets fresh stats + // instead of the now-closed instance. + assertNull(stats.storeStats.get(STORE)); + } + + @Test + public void testHandleStoreDeletedSkipsUnregisterWhenDisabled() { + MetricsRepository metrics = new MetricsRepository(); + ReadOnlyStoreRepository repo = mock(ReadOnlyStoreRepository.class); + TestAggStats stats = new TestAggStats(metrics, repo, false); + TestStats storeStats = spy(new TestStats(metrics, STORE)); + stats.storeStats.put(STORE, storeStats); + + stats.handleStoreDeleted(STORE); + + // Tehuti unregister disabled — sensors stay; OTel close path still runs unconditionally. + verify(storeStats, never()).unregisterAllSensors(); + // Entry is removed regardless of the unregister flag. 
+ assertNull(stats.storeStats.get(STORE)); + } +} diff --git a/internal/venice-test-common/src/jmh/java/com/linkedin/venice/benchmark/VeniceOpenTelemetryPerfTest.java b/internal/venice-test-common/src/jmh/java/com/linkedin/venice/benchmark/VeniceOpenTelemetryPerfTest.java index 969749ce26b..84df034055e 100644 --- a/internal/venice-test-common/src/jmh/java/com/linkedin/venice/benchmark/VeniceOpenTelemetryPerfTest.java +++ b/internal/venice-test-common/src/jmh/java/com/linkedin/venice/benchmark/VeniceOpenTelemetryPerfTest.java @@ -12,6 +12,7 @@ import com.linkedin.venice.stats.dimensions.HttpResponseStatusEnum; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceResponseStatusCategory; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.stats.metrics.MetricEntity; import com.linkedin.venice.stats.metrics.MetricEntityStateBase; import com.linkedin.venice.stats.metrics.MetricEntityStateThreeEnums; @@ -179,7 +180,8 @@ public void testGeneratingAttributes() { baseMetricDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class)); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE)); } long endTimeInit = System.currentTimeMillis(); @@ -237,7 +239,8 @@ public void testTehutiVsOtelMetricRecordingPerf() { baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); // Create Tehuti-only metric MetricsRepository tehutiRepository = new MetricsRepository(); @@ -261,7 +264,8 @@ public String getMetricName() { baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); LOGGER.info("Starting benchmark with " + formatNumber(numLoops) + " loops..."); @@ 
-330,8 +334,8 @@ public void testAtomicLongVsOtelUpDownCounterPerf() { Utils.setOf(VeniceMetricsDimensions.VENICE_STORE_NAME, VeniceMetricsDimensions.VENICE_CLUSTER_NAME)); Attributes baseAttributes = otelRepository.createAttributes(upDownCounterMetric, baseDimensionsMap); - MetricEntityStateBase otelUpDownCounter = - MetricEntityStateBase.create(upDownCounterMetric, otelRepository, baseDimensionsMap, baseAttributes); + MetricEntityStateBase otelUpDownCounter = MetricEntityStateBase + .create(upDownCounterMetric, otelRepository, baseDimensionsMap, baseAttributes, CompositeCloseable.NONE); AtomicLong atomicCounter = new AtomicLong(0); LOGGER.info("Starting benchmark with " + formatNumber(numLoops) + " loops..."); @@ -405,7 +409,8 @@ public void testThreeEnumMetricWithAndWithoutLongAdder() { baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); // Create regular COUNTER metric (direct OTel recording) MetricEntity counterEntity = createThreeEnumMetricEntity("test_counter", MetricType.COUNTER, "Test counter"); @@ -416,7 +421,8 @@ public void testThreeEnumMetricWithAndWithoutLongAdder() { baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); LOGGER.info("Starting benchmark with " + formatNumber(numLoops) + " loops..."); @@ -493,7 +499,8 @@ public void testAsyncCounterVsLongAdderRateGaugeWithTehuti() { baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); // Create LongAdderRateGauge with Tehuti MetricsRepository tehutiRepository = new MetricsRepository(new MetricConfig()); @@ -598,7 +605,8 @@ public void testConcurrentAsyncCounterVsCounterPerf() throws InterruptedExceptio 
baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); // Create regular COUNTER metric MetricEntity counterEntity = @@ -610,7 +618,8 @@ public void testConcurrentAsyncCounterVsCounterPerf() throws InterruptedExceptio baseDimensions, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + CompositeCloseable.NONE); // Accumulators for timing (one per thread to avoid contention) long[] asyncTimes = new long[numThreads]; diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/DeferredVersionSwapService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/DeferredVersionSwapService.java index b1803202778..130d6b410bc 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/DeferredVersionSwapService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/DeferredVersionSwapService.java @@ -23,6 +23,7 @@ import com.linkedin.venice.pushmonitor.ExecutionStatus; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.LatencyUtils; import com.linkedin.venice.utils.LogContext; @@ -92,6 +93,8 @@ public class DeferredVersionSwapService extends AbstractVeniceService { private final Map clusterToExecutorMap = new ConcurrentHashMap<>(); private final Set storesBeingProcessed = ConcurrentHashMap.newKeySet(); private final Map clusterToThreadPoolStatsMap = new ConcurrentHashMap<>(); + /** Stats fields owned by this class; drained by {@link #stopInner()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private final MetricsRepository metricsRepository; private Map storeLifecycleHooksCache = new HashMap<>(); @@ -140,6 +143,9 @@ public void stopInner() throws Exception { Thread.currentThread().interrupt(); } }); + Utils.closeQuietlyWithErrorLogged(deferredVersionSwapStats); + // Close every per-cluster ThreadPoolStats registered above so its ASYNC_GAUGE callbacks deregister. + statsCloseables.close(); } private ThreadPoolExecutor getOrCreateExecutorForCluster(String cluster) { @@ -152,7 +158,8 @@ private ThreadPoolExecutor getOrCreateExecutorForCluster(String cluster) { veniceControllerMultiClusterConfig.getLogContext()); ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threadPoolSize, threadFactory); String statsName = "DeferredVersionSwap-" + cluster; - ThreadPoolStats threadPoolStats = new ThreadPoolStats(metricsRepository, executor, statsName); + ThreadPoolStats threadPoolStats = + statsCloseables.register(new ThreadPoolStats(metricsRepository, executor, statsName)); clusterToThreadPoolStatsMap.put(cluster, threadPoolStats); LOGGER.info("Created thread pool for cluster {} with {} threads", cluster, threadPoolSize); diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ErrorPartitionResetTask.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ErrorPartitionResetTask.java index 6481edf80d8..caac925c89d 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ErrorPartitionResetTask.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ErrorPartitionResetTask.java @@ -14,10 +14,10 @@ import com.linkedin.venice.pushmonitor.PartitionStatus; import com.linkedin.venice.pushmonitor.PushMonitor; import com.linkedin.venice.pushmonitor.ReplicaStatus; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import 
com.linkedin.venice.utils.HelixUtils; import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; -import java.io.Closeable; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -39,7 +39,7 @@ * shown Leader in EV but actually ERROR in offline push status. * */ -public class ErrorPartitionResetTask implements Runnable, Closeable { +public class ErrorPartitionResetTask extends AbstractStatsCloseable implements Runnable { private static final String TASK_ID_FORMAT = ErrorPartitionResetTask.class.getSimpleName() + " [cluster: %s] "; /** * Tracks auto reset attempts of applicable resources' error partitions. Automatically reset will only apply to @@ -78,7 +78,7 @@ public ErrorPartitionResetTask( this.pushMonitor = pushMonitor; this.errorPartitionAutoResetLimit = errorPartitionAutoResetLimit; this.processingCycleDelayMs = processingCycleDelayMs; - errorPartitionStats = new ErrorPartitionStats(metricsRepository, clusterName); + errorPartitionStats = statsCloseables.register(new ErrorPartitionStats(metricsRepository, clusterName)); } @Override @@ -226,5 +226,6 @@ private boolean checkPartitionRecovered(Partition partition, Map * All resources in this class is dedicated for one Venice cluster. 
*/ -public class HelixVeniceClusterResources implements VeniceResource { +public class HelixVeniceClusterResources extends AbstractStatsCloseable implements VeniceResource { private static final Logger LOGGER = LogManager.getLogger(HelixVeniceClusterResources.class); private final String clusterName; @@ -230,12 +231,13 @@ public HelixVeniceClusterResources( clusterName, config.getRefreshAttemptsForZkReconnect(), config.getRefreshIntervalForZkReconnectInMs()); - this.aggPartitionHealthStats = new AggPartitionHealthStats( - clusterName, - metricsRepository, - routingDataRepository, - storeMetadataRepository, - pushMonitor); + this.aggPartitionHealthStats = statsCloseables.register( + new AggPartitionHealthStats( + clusterName, + metricsRepository, + routingDataRepository, + storeMetadataRepository, + pushMonitor)); this.storeConfigAccessor = new ZkStoreConfigAccessor(zkClient, adapterSerializer, metaStoreWriter); this.accessController = accessController; if (config.getErrorPartitionAutoResetLimit() > 0) { @@ -273,7 +275,7 @@ public HelixVeniceClusterResources( this.logCompactionService = null; } - veniceAdminStats = new VeniceAdminStats(metricsRepository, "venice-admin-", clusterName); + veniceAdminStats = statsCloseables.register(new VeniceAdminStats(metricsRepository, "venice-admin-", clusterName)); this.storagePersonaRepository = new StoragePersonaRepository(clusterName, this.storeMetadataRepository, adapterSerializer, zkClient); /** @@ -358,6 +360,7 @@ public void clear() { customizedViewRepo.clear(); routersClusterManager.clear(); admin.clearInstanceMonitor(clusterName); + /* statsCloseables is NOT drained here — refresh() calls clear(); inherited close() drains it on true retirement. 
*/ } /** diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ProtocolVersionAutoDetectionService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ProtocolVersionAutoDetectionService.java index 0a2c8a8a5d3..50818a5ecfa 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/ProtocolVersionAutoDetectionService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/ProtocolVersionAutoDetectionService.java @@ -7,6 +7,7 @@ import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.utils.DaemonThreadFactory; import com.linkedin.venice.utils.LatencyUtils; +import com.linkedin.venice.utils.Utils; import java.util.Map; import java.util.Objects; import java.util.Optional; @@ -71,6 +72,7 @@ public boolean startInner() throws Exception { public void stopInner() throws Exception { stop.set(true); executor.shutdownNow(); + Utils.closeQuietlyWithErrorLogged(stats); LOGGER.info("Stopped {}", serviceName); } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java index 6a9941eda9b..5bedd5edfbd 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/StoreBackupVersionCleanupService.java @@ -18,6 +18,7 @@ import com.linkedin.venice.utils.ObjectMapperFactory; import com.linkedin.venice.utils.SystemTime; import com.linkedin.venice.utils.Time; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import java.io.IOException; @@ -133,6 +134,7 @@ public void stopInner() throws IOException { stop.set(true); httpAsyncClient.close(); 
cleanupThread.interrupt(); + clusterNameCleanupStatsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); } public static void setWaitTimeDeleteRepushSourceVersion(long waitTime) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerService.java index 1890c36c126..e77b165066f 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerService.java @@ -28,6 +28,7 @@ import com.linkedin.venice.serialization.avro.OptimizedKafkaValueSerializer; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.service.ICProvider; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.system.store.ControllerClientBackedSystemSchemaInitializer; import com.linkedin.venice.utils.pools.LandFillObjectPool; import io.tehuti.metrics.MetricsRepository; @@ -55,6 +56,10 @@ public class VeniceControllerService extends AbstractVeniceService { private final BiConsumer newSchemaEncountered; + private HeartbeatBasedCheckerStats heartbeatBasedCheckerStats; + /** Stats fields owned by this class; drained by {@link #stopInner()}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); + public VeniceControllerService( VeniceControllerMultiClusterConfig multiClusterConfigs, MetricsRepository metricsRepository, @@ -221,11 +226,12 @@ private LingeringStoreVersionChecker createLingeringStoreVersionChecker( MetricsRepository metricsRepository) { if (multiClusterConfigs.getBatchJobHeartbeatEnabled()) { LOGGER.info("Batch job heartbeat is enabled. 
Hence use the heartbeat-based batch job liveness checker."); + this.heartbeatBasedCheckerStats = statsCloseables.register(new HeartbeatBasedCheckerStats(metricsRepository)); return new HeartbeatBasedLingeringStoreVersionChecker( multiClusterConfigs.getBatchJobHeartbeatTimeout(), multiClusterConfigs.getBatchJobHeartbeatInitialBufferTime(), new DefaultLingeringStoreVersionChecker(), - new HeartbeatBasedCheckerStats(metricsRepository)); + heartbeatBasedCheckerStats); } { LOGGER.info("Batch job heartbeat is NOT enabled. Hence use the default batch job liveness checker."); @@ -271,6 +277,7 @@ public void stopInner() { // TODO merge or differentiate the difference between close() and stopVeniceController() explicitly. admin.stopVeniceController(); admin.close(); + statsCloseables.close(); LOGGER.info("Stopped Venice controller."); } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerStateModel.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerStateModel.java index c45e6eae0e6..3e9c70014e5 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerStateModel.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceControllerStateModel.java @@ -10,6 +10,7 @@ import com.linkedin.venice.ingestion.control.RealTimeTopicSwitcher; import com.linkedin.venice.meta.ValueSchemaCreatedListener; import com.linkedin.venice.utils.DaemonThreadFactory; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.locks.AutoCloseableLock; import io.tehuti.metrics.MetricsRepository; import java.util.List; @@ -409,6 +410,8 @@ private synchronized void clearResources() { clusterResources.stopDeadStoreStatsPreFetchTask(); clusterResources.stopErrorPartitionResetTask(); clusterResources.clear(); + // Drain registered stats closeables on true retirement (after the refresh-safe clear()). 
+ Utils.closeQuietlyWithErrorLogged(clusterResources); clusterResources = null; } } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java index 3a2835b338d..9e65bab5ccb 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/VeniceHelixAdmin.java @@ -9234,6 +9234,11 @@ public void close() { }); this.clusterToDegradedDcStatesRepo.clear(); D2ClientUtils.shutdownClient(this.d2Client); + // Close per-cluster stats accumulated in maps that are not bound to AbstractVeniceService instances. + this.disabledPartitionStatMap.values().forEach(Utils::closeQuietlyWithErrorLogged); + this.pushJobStatusStatsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); + this.addVersionLatencyStatsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); + this.logCompactionStatsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); long elapsedTime = System.currentTimeMillis() - closeStartTime; long remainingTime = Math.max(1, HELIX_MANAGER_DISCONNECT_TIMEOUT_MS - elapsedTime); diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/TopicCleanupService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/TopicCleanupService.java index 03590048593..46bc67d96c3 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/TopicCleanupService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/TopicCleanupService.java @@ -20,6 +20,7 @@ import com.linkedin.venice.utils.ExceptionUtils; import com.linkedin.venice.utils.LogContext; import com.linkedin.venice.utils.Time; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.VeniceProperties; import 
java.util.ArrayList; import java.util.Collections; @@ -152,6 +153,7 @@ public void stopInner() throws Exception { // N.B.: The two stop mechanisms below are decomposed for the sake of being able to test them separately. stopViaFlag(); stopViaInterrupt(); + Utils.closeQuietlyWithErrorLogged(topicCleanupServiceStats); } /** Package-private for tests. */ diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/consumer/AdminConsumptionTask.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/consumer/AdminConsumptionTask.java index 918d0e67714..d488e15dc0a 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/consumer/AdminConsumptionTask.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/kafka/consumer/AdminConsumptionTask.java @@ -333,6 +333,7 @@ void setAdminExecutionTaskExecutorService(ExecutorService executorService) { @Override public synchronized void close() throws IOException { isRunning.getAndSet(false); + Utils.closeQuietlyWithErrorLogged(stats); } @Override diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/lingeringjob/HeartbeatBasedCheckerStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/lingeringjob/HeartbeatBasedCheckerStats.java index 79e6505f9a5..70cea721ab7 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/lingeringjob/HeartbeatBasedCheckerStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/lingeringjob/HeartbeatBasedCheckerStats.java @@ -36,7 +36,8 @@ public HeartbeatBasedCheckerStats(MetricsRepository metricsRepository) { HeartbeatCheckerTehutiMetricNameEnum.CHECK_JOB_HAS_HEARTBEAT_FAILED, Collections.singletonList(new Count()), otelData.getBaseDimensionsMap(), - otelData.getBaseAttributes()); + otelData.getBaseAttributes(), + resources); timeoutHeartbeatCheckState = 
MetricEntityStateBase.create( HeartbeatCheckerOtelMetricEntity.BATCH_JOB_HEARTBEAT_TIMEOUT_COUNT.getMetricEntity(), @@ -45,7 +46,8 @@ public HeartbeatBasedCheckerStats(MetricsRepository metricsRepository) { HeartbeatCheckerTehutiMetricNameEnum.TIMEOUT_HEARTBEAT_CHECK, Collections.singletonList(new Count()), otelData.getBaseDimensionsMap(), - otelData.getBaseAttributes()); + otelData.getBaseAttributes(), + resources); noTimeoutHeartbeatCheckState = MetricEntityStateBase.create( HeartbeatCheckerOtelMetricEntity.BATCH_JOB_HEARTBEAT_ACTIVE_COUNT.getMetricEntity(), @@ -54,7 +56,8 @@ public HeartbeatBasedCheckerStats(MetricsRepository metricsRepository) { HeartbeatCheckerTehutiMetricNameEnum.NON_TIMEOUT_HEARTBEAT_CHECK, Collections.singletonList(new Count()), otelData.getBaseDimensionsMap(), - otelData.getBaseAttributes()); + otelData.getBaseAttributes(), + resources); } void recordCheckJobHasHeartbeatFailed() { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/server/AdminSparkServer.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/server/AdminSparkServer.java index 2ac7c962626..0f9969d2352 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/server/AdminSparkServer.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/server/AdminSparkServer.java @@ -749,6 +749,8 @@ public boolean startInner() throws Exception { @Override public void stopInner() { httpService.stop(); + statsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); + Utils.closeQuietlyWithErrorLogged(nonclusterSpecificStats); } int getPort() { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AddVersionLatencyStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AddVersionLatencyStats.java index 0bee438e599..13b8684d90f 100644 --- 
a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AddVersionLatencyStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AddVersionLatencyStats.java @@ -97,7 +97,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste + AddVersionLatencyTehutiMetricNameEnum.ADD_VERSION_RETIRE_OLD_VERSIONS_LATENCY.getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); resourceAssignmentWaitMetric = MetricEntityStateTwoEnums.create( AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -114,7 +115,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste .getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); failureHandlingMetric = MetricEntityStateTwoEnums.create( AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -130,7 +132,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste + AddVersionLatencyTehutiMetricNameEnum.ADD_VERSION_CREATION_FAILURE_LATENCY.getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); existingVersionHandlingMetric = MetricEntityStateTwoEnums.create( AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -147,7 +150,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste .getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); startOfPushMetric = MetricEntityStateTwoEnums.create( 
AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -163,7 +167,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste + AddVersionLatencyTehutiMetricNameEnum.ADD_VERSION_START_OF_PUSH_LATENCY.getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); batchTopicCreationMetric = MetricEntityStateTwoEnums.create( AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -179,7 +184,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste + AddVersionLatencyTehutiMetricNameEnum.ADD_VERSION_BATCH_TOPIC_CREATION_LATENCY.getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); helixResourceCreationMetric = MetricEntityStateTwoEnums.create( AddVersionLatencyOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME_PER_COMPONENT @@ -196,7 +202,8 @@ public AddVersionLatencyStats(MetricsRepository metricsRepository, String cluste .getMetricName())), baseDimensionsMap, AdminMessageType.class, - AdminMessageProcessingComponent.class); + AdminMessageProcessingComponent.class, + resources); } public void recordRetireOldVersionsLatency(long latency) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AdminConsumptionStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AdminConsumptionStats.java index 17ec8c7430f..5e2258f566b 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AdminConsumptionStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/AdminConsumptionStats.java @@ -63,10 +63,16 @@ public class AdminConsumptionStats extends 
AbstractVeniceStats { private final MetricEntityStateOneEnum addVersionProcessLatencyMetric; /** - * Tracks the duration of each batch processing cycle for admin messages. + * Tracks the duration of each batch processing cycle for admin messages. */ private final MetricEntityStateBase cycleTimeMetric; + /** OTel async gauges. Captured so they can be closed during shutdown. */ + private final AsyncMetricEntityStateBase pendingAdminMessagesCountMetric; + private final AsyncMetricEntityStateBase storesWithPendingAdminMessagesCountMetric; + private final AsyncMetricEntityStateBase adminConsumptionOffsetLagMetric; + private final AsyncMetricEntityStateBase maxAdminConsumptionOffsetLagMetric; + /** * A gauge reporting the total number of pending admin messages remaining in the internal queue at the end of each * consumption cycle. Pending messages could be caused by blocked admin operations or insufficient resources. @@ -110,7 +116,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.FAILED_ADMIN_MESSAGES, Arrays.asList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); retriableFailureCountMetric = MetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_RETRIABLE_FAILURE_COUNT.getMetricEntity(), @@ -119,7 +126,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.FAILED_RETRIABLE_ADMIN_MESSAGES, Arrays.asList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); divFailureCountMetric = MetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_DIV_FAILURE_COUNT.getMetricEntity(), @@ -128,7 +136,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_DIV_ERROR_REPORT_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - 
baseAttributes); + baseAttributes, + resources); futureSchemaCountMetric = MetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_FUTURE_SCHEMA_COUNT.getMetricEntity(), @@ -137,7 +146,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGES_WITH_FUTURE_PROTOCOL_VERSION_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); produceToBrokerTimeMetric = MetricEntityStateOneEnum.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_REPLICATION_TO_LOCAL_BROKER_TIME @@ -147,7 +157,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_MM_LATENCY_MS, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - AdminMessageType.class); + AdminMessageType.class, + resources); brokerToQueueTimeMetric = MetricEntityStateOneEnum.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_BROKER_TO_PROCESSING_QUEUE_TIME @@ -157,7 +168,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_DELEGATE_LATENCY_MS, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - AdminMessageType.class); + AdminMessageType.class, + resources); queueToStartProcessingTimeMetric = MetricEntityStateOneEnum.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_QUEUE_TO_START_PROCESSING_TIME @@ -167,7 +179,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_START_PROCESSING_LATENCY_MS, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - AdminMessageType.class); + AdminMessageType.class, + resources); processLatencyMetric = MetricEntityStateOneEnum.create( 
AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME.getMetricEntity(), @@ -176,7 +189,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_PROCESS_LATENCY_MS, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - AdminMessageType.class); + AdminMessageType.class, + resources); addVersionProcessLatencyMetric = MetricEntityStateOneEnum.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PHASE_START_TO_END_PROCESSING_TIME.getMetricEntity(), @@ -185,7 +199,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_MESSAGE_ADD_VERSION_PROCESS_LATENCY_MS, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - AdminMessageType.class); + AdminMessageType.class, + resources); cycleTimeMetric = MetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_BATCH_PROCESSING_CYCLE_TIME.getMetricEntity(), @@ -194,9 +209,10 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_CONSUMPTION_CYCLE_DURATION_MS, Arrays.asList(new Avg(), new Min(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); - AsyncMetricEntityStateBase.create( + pendingAdminMessagesCountMetric = AsyncMetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_MESSAGE_PENDING_COUNT.getMetricEntity(), otelRepository, this::registerSensorIfAbsent, @@ -207,9 +223,10 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.PENDING_ADMIN_MESSAGES_COUNT.getMetricName())), baseDimensionsMap, baseAttributes, - () -> (long) pendingAdminMessagesCountGauge); + () -> (long) pendingAdminMessagesCountGauge, + resources); - AsyncMetricEntityStateBase.create( + storesWithPendingAdminMessagesCountMetric 
= AsyncMetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_STORE_PENDING_COUNT.getMetricEntity(), otelRepository, this::registerSensorIfAbsent, @@ -220,9 +237,10 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.STORES_WITH_PENDING_ADMIN_MESSAGES_COUNT.getMetricName())), baseDimensionsMap, baseAttributes, - () -> (long) storesWithPendingAdminMessagesCountGauge); + () -> (long) storesWithPendingAdminMessagesCountGauge, + resources); - AsyncMetricEntityStateBase.create( + adminConsumptionOffsetLagMetric = AsyncMetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_CONSUMER_OFFSET_LAG.getMetricEntity(), otelRepository, this::registerSensorIfAbsent, @@ -233,9 +251,10 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.ADMIN_CONSUMPTION_OFFSET_LAG.getMetricName())), baseDimensionsMap, baseAttributes, - () -> this.adminConsumptionOffsetLag); + () -> this.adminConsumptionOffsetLag, + resources); - AsyncMetricEntityStateBase.create( + maxAdminConsumptionOffsetLagMetric = AsyncMetricEntityStateBase.create( AdminConsumptionOtelMetricEntity.ADMIN_CONSUMPTION_CONSUMER_CHECKPOINT_OFFSET_LAG.getMetricEntity(), otelRepository, this::registerSensorIfAbsent, @@ -246,7 +265,8 @@ public AdminConsumptionStats(MetricsRepository metricsRepository, String cluster AdminConsumptionTehutiMetricNameEnum.MAX_ADMIN_CONSUMPTION_OFFSET_LAG.getMetricName())), baseDimensionsMap, baseAttributes, - () -> this.maxAdminConsumptionOffsetLag); + () -> this.maxAdminConsumptionOffsetLag, + resources); // Tehuti-only adminMessageTotalLatencySensor = registerSensor("admin_message_total_latency_ms", new Avg(), new Max()); diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DeferredVersionSwapStats.java 
b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DeferredVersionSwapStats.java index 19c4bd8e2e3..1c17fd2c2b1 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DeferredVersionSwapStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DeferredVersionSwapStats.java @@ -49,7 +49,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { this::registerSensorIfAbsent, DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_ERROR, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); deferredVersionSwapThrowableMetric = MetricEntityStateGeneric.create( DeferredVersionSwapOtelMetricEntity.DEFERRED_VERSION_SWAP_PROCESSING_ERROR_COUNT.getMetricEntity(), @@ -57,7 +58,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { this::registerSensorIfAbsent, DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_THROWABLE, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); deferredVersionSwapFailedRollForwardMetric = MetricEntityStateGeneric.create( DeferredVersionSwapOtelMetricEntity.DEFERRED_VERSION_SWAP_ROLL_FORWARD_FAILURE_COUNT.getMetricEntity(), @@ -65,7 +67,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { this::registerSensorIfAbsent, DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_FAILED_ROLL_FORWARD, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); deferredVersionSwapStalledVersionSwapMetric = MetricEntityStateBase.create( DeferredVersionSwapOtelMetricEntity.DEFERRED_VERSION_SWAP_STALLED_COUNT.getMetricEntity(), @@ -74,7 +77,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_STALLED_VERSION_SWAP, Collections.singletonList(new Gauge()), baseDimensionsMap, 
- baseAttributes); + baseAttributes, + resources); deferredVersionSwapParentChildStatusMismatchMetric = MetricEntityStateGeneric.create( DeferredVersionSwapOtelMetricEntity.DEFERRED_VERSION_SWAP_PARENT_STATUS_MISMATCH_COUNT.getMetricEntity(), @@ -82,7 +86,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { this::registerSensorIfAbsent, DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_PARENT_CHILD_STATUS_MISMATCH, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); deferredVersionSwapChildStatusMismatchMetric = MetricEntityStateGeneric.create( DeferredVersionSwapOtelMetricEntity.DEFERRED_VERSION_SWAP_CHILD_STATUS_MISMATCH_COUNT.getMetricEntity(), @@ -90,7 +95,8 @@ public DeferredVersionSwapStats(MetricsRepository metricsRepository) { this::registerSensorIfAbsent, DeferredVersionSwapTehutiMetricNameEnum.DEFERRED_VERSION_SWAP_CHILD_STATUS_MISMATCH, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); } public void recordDeferredVersionSwapExceptionMetric(String clusterName) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DisabledPartitionStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DisabledPartitionStats.java index 22a54313491..e39d7d321ab 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DisabledPartitionStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/DisabledPartitionStats.java @@ -38,7 +38,8 @@ public DisabledPartitionStats(MetricsRepository metricsRepository, String name) this::registerSensorIfAbsent, DisabledPartitionTehutiMetricNameEnum.DISABLED_PARTITION_COUNT, Collections.singletonList(new Total()), - baseDimensionsMap); + baseDimensionsMap, + resources); } public void recordDisabledPartition(String storeName) { diff --git 
a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ErrorPartitionStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ErrorPartitionStats.java index 2eff5b3516a..5ec5a962531 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ErrorPartitionStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ErrorPartitionStats.java @@ -53,7 +53,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, ErrorPartitionTehutiMetricNameEnum.CURRENT_VERSION_ERROR_PARTITION_RESET_ATTEMPT, Collections.singletonList(new Total()), - baseDimensionsMap); + baseDimensionsMap, + resources); resetErrorMetric = MetricEntityStateGeneric.create( ErrorPartitionOtelMetricEntity.ERROR_PARTITION_RESET_ERROR_COUNT.getMetricEntity(), @@ -61,7 +62,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, ErrorPartitionTehutiMetricNameEnum.CURRENT_VERSION_ERROR_PARTITION_RESET_ATTEMPT_ERRORED, Collections.singletonList(new Count()), - baseDimensionsMap); + baseDimensionsMap, + resources); recoveredMetric = MetricEntityStateGeneric.create( ErrorPartitionOtelMetricEntity.ERROR_PARTITION_RESET_RECOVERED_PARTITION_COUNT.getMetricEntity(), @@ -69,7 +71,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, ErrorPartitionTehutiMetricNameEnum.CURRENT_VERSION_ERROR_PARTITION_RECOVERED_FROM_RESET, Collections.singletonList(new Total()), - baseDimensionsMap); + baseDimensionsMap, + resources); unrecoverableMetric = MetricEntityStateGeneric.create( ErrorPartitionOtelMetricEntity.ERROR_PARTITION_RESET_UNRECOVERABLE_PARTITION_COUNT.getMetricEntity(), @@ -77,7 +80,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, 
ErrorPartitionTehutiMetricNameEnum.CURRENT_VERSION_ERROR_PARTITION_UNRECOVERABLE_FROM_RESET, Collections.singletonList(new Total()), - baseDimensionsMap); + baseDimensionsMap, + resources); processingErrorMetric = MetricEntityStateBase.create( ErrorPartitionOtelMetricEntity.ERROR_PARTITION_PROCESSING_ERROR_COUNT.getMetricEntity(), @@ -86,7 +90,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { ErrorPartitionTehutiMetricNameEnum.ERROR_PARTITION_PROCESSING_ERROR, Collections.singletonList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); processingTimeMetric = MetricEntityStateBase.create( ErrorPartitionOtelMetricEntity.ERROR_PARTITION_PROCESSING_TIME.getMetricEntity(), @@ -95,7 +100,8 @@ public ErrorPartitionStats(MetricsRepository metricsRepository, String name) { ErrorPartitionTehutiMetricNameEnum.ERROR_PARTITION_PROCESSING_TIME, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordErrorPartitionResetAttempt(double value, String storeName) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/LogCompactionStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/LogCompactionStats.java index cf5f26cfc8c..5f1cb64f74c 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/LogCompactionStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/LogCompactionStats.java @@ -50,7 +50,8 @@ public LogCompactionStats(MetricsRepository metricsRepository, String clusterNam this::registerSensor, ControllerTehutiMetricNameEnum.REPUSH_CALL_COUNT, Collections.singletonList(new OccurrenceRate()), - baseDimensionsMap); + baseDimensionsMap, + resources); compactionEligibleMetric = MetricEntityStateGeneric.create( LogCompactionOtelMetricEntity.STORE_COMPACTION_ELIGIBLE_STATE.getMetricEntity(), @@ -58,7 
+59,8 @@ public LogCompactionStats(MetricsRepository metricsRepository, String clusterNam this::registerSensor, ControllerTehutiMetricNameEnum.COMPACTION_ELIGIBLE_STATE, Collections.singletonList(new Gauge()), - baseDimensionsMap); + baseDimensionsMap, + resources); storeNominatedForCompactionCountMetric = MetricEntityStateGeneric.create( LogCompactionOtelMetricEntity.STORE_COMPACTION_NOMINATED_COUNT.getMetricEntity(), @@ -66,7 +68,8 @@ public LogCompactionStats(MetricsRepository metricsRepository, String clusterNam this::registerSensor, ControllerTehutiMetricNameEnum.STORE_NOMINATED_FOR_COMPACTION_COUNT, Collections.singletonList(new OccurrenceRate()), - baseDimensionsMap); + baseDimensionsMap, + resources); storeCompactionTriggeredCountMetric = MetricEntityStateGeneric.create( LogCompactionOtelMetricEntity.STORE_COMPACTION_TRIGGERED_COUNT.getMetricEntity(), @@ -74,7 +77,8 @@ public LogCompactionStats(MetricsRepository metricsRepository, String clusterNam this::registerSensor, ControllerTehutiMetricNameEnum.STORE_COMPACTION_TRIGGERED_COUNT, Collections.singletonList(new OccurrenceRate()), - baseDimensionsMap); + baseDimensionsMap, + resources); } public void recordRepushStoreCall( diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java index d60817a092b..03ecdbe0a99 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PartitionHealthStats.java @@ -66,7 +66,8 @@ public PartitionHealthStats(MetricsRepository metricsRepository, String name, St PartitionHealthTehutiMetricNameEnum.UNDER_REPLICATED_PARTITION, Arrays.asList(new Max(), new Gauge()), otelData.getBaseDimensionsMap(), - otelData.getBaseAttributes()); + otelData.getBaseAttributes(), + resources); } 
public void recordUnderReplicatePartition(int num) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ProtocolVersionAutoDetectionStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ProtocolVersionAutoDetectionStats.java index b50fc5f469d..3c90273bc4b 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ProtocolVersionAutoDetectionStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/ProtocolVersionAutoDetectionStats.java @@ -44,7 +44,8 @@ public ProtocolVersionAutoDetectionStats(MetricsRepository metricsRepository, St ProtocolVersionAutoDetectionTehutiMetricNameEnum.PROTOCOL_VERSION_AUTO_DETECTION_ERROR, Collections.singletonList(new Gauge()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); detectionTimeMetric = MetricEntityStateBase.create( ProtocolVersionAutoDetectionOtelMetricEntity.PROTOCOL_VERSION_AUTO_DETECTION_TIME.getMetricEntity(), @@ -53,7 +54,8 @@ public ProtocolVersionAutoDetectionStats(MetricsRepository metricsRepository, St ProtocolVersionAutoDetectionTehutiMetricNameEnum.PROTOCOL_VERSION_AUTO_DETECTION_LATENCY, Collections.singletonList(new Avg()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordProtocolVersionAutoDetectionErrorSensor(int count) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PushJobStatusStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PushJobStatusStats.java index 8f26c73081a..a5590c759bc 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PushJobStatusStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/PushJobStatusStats.java @@ -49,7 +49,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { 
this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.BATCH_PUSH_JOB_SUCCESS, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); batchPushFailureDueToUserErrorMetric = MetricEntityStateGeneric.create( PushJobOtelMetricEntity.PUSH_JOB_COUNT.getMetricEntity(), @@ -57,7 +58,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.BATCH_PUSH_JOB_FAILED_USER_ERROR, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); batchPushFailureDueToNonUserErrorMetric = MetricEntityStateGeneric.create( PushJobOtelMetricEntity.PUSH_JOB_COUNT.getMetricEntity(), @@ -65,7 +67,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.BATCH_PUSH_JOB_FAILED_NON_USER_ERROR, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); incrementalPushSuccessMetric = MetricEntityStateGeneric.create( PushJobOtelMetricEntity.PUSH_JOB_COUNT.getMetricEntity(), @@ -73,7 +76,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.INCREMENTAL_PUSH_JOB_SUCCESS, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); incrementalPushFailureDueToUserErrorMetric = MetricEntityStateGeneric.create( PushJobOtelMetricEntity.PUSH_JOB_COUNT.getMetricEntity(), @@ -81,7 +85,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.INCREMENTAL_PUSH_JOB_FAILED_USER_ERROR, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); incrementalPushFailureDueToNonUserErrorMetric = 
MetricEntityStateGeneric.create( PushJobOtelMetricEntity.PUSH_JOB_COUNT.getMetricEntity(), @@ -89,7 +94,8 @@ public PushJobStatusStats(MetricsRepository metricsRepository, String name) { this::registerSensorIfAbsent, PushJobTehutiMetricNameEnum.INCREMENTAL_PUSH_JOB_FAILED_NON_USER_ERROR, Arrays.asList(new Count(), new CountSinceLastMeasurement()), - baseDimensionsMap); + baseDimensionsMap, + resources); } public void recordBatchPushSuccessSensor(String storeName) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SparkServerStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SparkServerStats.java index ec7eeae6428..ecd57f86f7d 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SparkServerStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SparkServerStats.java @@ -75,7 +75,8 @@ public SparkServerStats(MetricsRepository metricsRepository, String statsPrefix, ControllerTehutiMetricNameEnum.CURRENT_IN_FLIGHT_REQUEST, Collections.singletonList(new Total()), baseDimensionsMap, - ControllerRoute.class); + ControllerRoute.class, + resources); successfulRequestCountMetric = MetricEntityStateFourEnums.create( SparkServerOtelMetricEntity.CALL_COUNT.getMetricEntity(), @@ -87,7 +88,8 @@ public SparkServerStats(MetricsRepository metricsRepository, String statsPrefix, ControllerRoute.class, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); failedRequestCountMetric = MetricEntityStateFourEnums.create( SparkServerOtelMetricEntity.CALL_COUNT.getMetricEntity(), @@ -99,7 +101,8 @@ public SparkServerStats(MetricsRepository metricsRepository, String statsPrefix, ControllerRoute.class, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + 
VeniceResponseStatusCategory.class, + resources); successfulRequestLatencyHistogramMetric = MetricEntityStateFourEnums.create( SparkServerOtelMetricEntity.CALL_TIME.getMetricEntity(), @@ -114,7 +117,8 @@ public SparkServerStats(MetricsRepository metricsRepository, String statsPrefix, ControllerRoute.class, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); failedRequestLatencyHistogramMetric = MetricEntityStateFourEnums.create( SparkServerOtelMetricEntity.CALL_TIME.getMetricEntity(), @@ -128,7 +132,8 @@ public SparkServerStats(MetricsRepository metricsRepository, String statsPrefix, ControllerRoute.class, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); } diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/StoreBackupVersionCleanupServiceStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/StoreBackupVersionCleanupServiceStats.java index ec6efc87cf5..4406ccba8c7 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/StoreBackupVersionCleanupServiceStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/StoreBackupVersionCleanupServiceStats.java @@ -42,7 +42,8 @@ public StoreBackupVersionCleanupServiceStats(MetricsRepository metricsRepository BackupVersionCleanupTehutiMetricNameEnum.BACKUP_VERSION_CLEANUP_VERSION_MISMATCH, Arrays.asList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); rolledBackVersionDeletedMetric = MetricEntityStateBase.create( BackupVersionCleanupOtelMetricEntity.ROLLED_BACK_VERSION_DELETED_COUNT.getMetricEntity(), @@ -51,7 +52,8 @@ public StoreBackupVersionCleanupServiceStats(MetricsRepository metricsRepository 
BackupVersionCleanupTehutiMetricNameEnum.ROLLED_BACK_VERSION_DELETED, Arrays.asList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); rolledBackVersionDeleteErrorMetric = MetricEntityStateBase.create( BackupVersionCleanupOtelMetricEntity.ROLLED_BACK_VERSION_DELETE_ERROR_COUNT.getMetricEntity(), @@ -60,7 +62,8 @@ public StoreBackupVersionCleanupServiceStats(MetricsRepository metricsRepository BackupVersionCleanupTehutiMetricNameEnum.ROLLED_BACK_VERSION_DELETE_ERROR, Arrays.asList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordBackupVersionMismatch() { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SystemStoreHealthCheckStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SystemStoreHealthCheckStats.java index c3c174c7b92..e71b09e753e 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SystemStoreHealthCheckStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/SystemStoreHealthCheckStats.java @@ -35,6 +35,8 @@ public class SystemStoreHealthCheckStats extends AbstractVeniceStats { private final AtomicLong badMetaSystemStoreCounter = new AtomicLong(0); private final AtomicLong badPushStatusSystemStoreCounter = new AtomicLong(0); private final AtomicLong notRepairableSystemStoreCounter = new AtomicLong(0); + private final AsyncMetricEntityStateOneEnum unhealthyCountMetric; + private final AsyncMetricEntityStateBase unrepairableCountMetric; public SystemStoreHealthCheckStats(MetricsRepository metricsRepository, String name) { super(metricsRepository, name); @@ -65,7 +67,7 @@ public SystemStoreHealthCheckStats(MetricsRepository metricsRepository, String n // OTel async gauge. 
The liveStateResolver returns the backing AtomicLong for each mapped // VeniceSystemStoreType value (null for any future enum additions, which skips emission); the // valueResolver reads the current count. - AsyncMetricEntityStateOneEnum.create( + unhealthyCountMetric = AsyncMetricEntityStateOneEnum.create( SystemStoreHealthCheckOtelMetricEntity.SYSTEM_STORE_UNHEALTHY_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, @@ -85,14 +87,16 @@ public SystemStoreHealthCheckStats(MetricsRepository metricsRepository, String n return null; } }, - (counter, type) -> counter.get()); + (counter, type) -> counter.get(), + resources); - AsyncMetricEntityStateBase.create( + unrepairableCountMetric = AsyncMetricEntityStateBase.create( SystemStoreHealthCheckOtelMetricEntity.SYSTEM_STORE_UNREPAIRABLE_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes, - notRepairableSystemStoreCounter::get); + notRepairableSystemStoreCounter::get, + resources); } public AtomicLong getBadMetaSystemStoreCounter() { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/TopicCleanupServiceStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/TopicCleanupServiceStats.java index 75067b58ac6..6ff8e2d4416 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/TopicCleanupServiceStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/TopicCleanupServiceStats.java @@ -46,7 +46,8 @@ public TopicCleanupServiceStats(MetricsRepository metricsRepository) { TopicCleanupTehutiMetricNameEnum.DELETABLE_TOPICS_COUNT, Arrays.asList(new Gauge()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); topicsDeletedMetric = MetricEntityStateOneEnum.create( TopicCleanupOtelMetricEntity.TOPIC_CLEANUP_DELETED_COUNT.getMetricEntity(), @@ -55,7 +56,8 @@ public TopicCleanupServiceStats(MetricsRepository metricsRepository) { 
TopicCleanupTehutiMetricNameEnum.TOPICS_DELETED_RATE, Arrays.asList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); topicDeletionErrorMetric = MetricEntityStateOneEnum.create( TopicCleanupOtelMetricEntity.TOPIC_CLEANUP_DELETED_COUNT.getMetricEntity(), @@ -64,7 +66,8 @@ public TopicCleanupServiceStats(MetricsRepository metricsRepository) { TopicCleanupTehutiMetricNameEnum.TOPIC_DELETION_ERROR_RATE, Arrays.asList(new Rate()), baseDimensionsMap, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); } public void recordDeletableTopicsCount(int deletableTopicsCount) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/VeniceAdminStats.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/VeniceAdminStats.java index 33ef94d4a52..9326e900c0b 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/VeniceAdminStats.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/stats/VeniceAdminStats.java @@ -67,7 +67,8 @@ public VeniceAdminStats(MetricsRepository metricsRepository, String statsPrefix, VeniceAdminTehutiMetricNameEnum.UNEXPECTED_TOPIC_ABSENCE_DURING_INCREMENTAL_PUSH_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - PushType.class); + PushType.class, + resources); batchPushStartedMetric = MetricEntityStateOneEnum.create( VeniceAdminOtelMetricEntity.ADMIN_PUSH_STARTED_COUNT.getMetricEntity(), @@ -76,7 +77,8 @@ public VeniceAdminStats(MetricsRepository metricsRepository, String statsPrefix, VeniceAdminTehutiMetricNameEnum.SUCCESSFULLY_STARTED_USER_BATCH_PUSH_PARENT_ADMIN_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - PushType.class); + PushType.class, + resources); incrementalPushStartedMetric = MetricEntityStateOneEnum.create( VeniceAdminOtelMetricEntity.ADMIN_PUSH_STARTED_COUNT.getMetricEntity(), @@ -85,7 +87,8 
@@ public VeniceAdminStats(MetricsRepository metricsRepository, String statsPrefix, VeniceAdminTehutiMetricNameEnum.SUCCESSFUL_STARTED_USER_INCREMENTAL_PUSH_PARENT_ADMIN_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - PushType.class); + PushType.class, + resources); serializationFailureMetric = MetricEntityStateBase.create( VeniceAdminOtelMetricEntity.ADMIN_OPERATION_SERIALIZATION_FAILURE_COUNT.getMetricEntity(), @@ -94,7 +97,8 @@ public VeniceAdminStats(MetricsRepository metricsRepository, String statsPrefix, VeniceAdminTehutiMetricNameEnum.FAILED_SERIALIZING_ADMIN_OPERATION_MESSAGE_COUNT, Arrays.asList(new Count()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } public void recordUnexpectedTopicAbsenceCount(PushType pushType) { diff --git a/services/venice-controller/src/main/java/com/linkedin/venice/controller/systemstore/SystemStoreRepairService.java b/services/venice-controller/src/main/java/com/linkedin/venice/controller/systemstore/SystemStoreRepairService.java index 6e7d5b001ec..d67beff5389 100644 --- a/services/venice-controller/src/main/java/com/linkedin/venice/controller/systemstore/SystemStoreRepairService.java +++ b/services/venice-controller/src/main/java/com/linkedin/venice/controller/systemstore/SystemStoreRepairService.java @@ -6,6 +6,7 @@ import com.linkedin.venice.controller.VeniceParentHelixAdmin; import com.linkedin.venice.controller.stats.SystemStoreHealthCheckStats; import com.linkedin.venice.service.AbstractVeniceService; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import java.util.Map; @@ -80,6 +81,7 @@ public void stopInner() { } catch (InterruptedException e) { currentThread().interrupt(); } + clusterToSystemStoreHealthCheckStatsMap.values().forEach(Utils::closeQuietlyWithErrorLogged); LOGGER.info("SystemStoreRepairService is shutdown."); } diff --git 
a/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java b/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java index 2941355a0b5..0c16ea6761f 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/RouterServer.java @@ -89,6 +89,7 @@ import com.linkedin.venice.stats.VeniceJVMStats; import com.linkedin.venice.stats.VeniceMetricsRepository; import com.linkedin.venice.stats.ZkClientStatusStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.stats.metrics.MetricEntity; import com.linkedin.venice.stats.metrics.ModuleMetricEntityInterface; import com.linkedin.venice.throttle.EventThrottler; @@ -164,6 +165,8 @@ public class RouterServer extends AbstractVeniceService { private Optional hybridStoreQuotaRepository; private ReadOnlyStoreRepository metadataRepository; private RouterStats routerStats; + /** Stats fields owned by this class; drained by {@link #stopInner()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private HelixReadOnlyStoreViewConfigRepositoryAdapter storeConfigRepository; private PushStatusStoreReader pushStatusStoreReader; @@ -669,7 +672,8 @@ public boolean startInner() throws Exception { config.getLogContext(), responseAggregationQueueCapacity, LINKED_BLOCKING_QUEUE); - new ThreadPoolStats(metricsRepository, responseAggregationExecutor, "response_aggregation_thread_pool"); + statsCloseables.register( + new ThreadPoolStats(metricsRepository, responseAggregationExecutor, "response_aggregation_thread_pool")); LOGGER.info( "Response aggregation thread pool enabled with size: {}, queue capacity: {}", responseAggregationThreadPoolSize, @@ -763,7 +767,8 @@ public boolean startInner() throws Exception { config.getLogContext(), config.getResolveQueueCapacity(), LINKED_BLOCKING_QUEUE); - new ThreadPoolStats(metricsRepository, this.dnsResolveExecutor, "dns_resolution_thread_pool"); + statsCloseables + .register(new ThreadPoolStats(metricsRepository, this.dnsResolveExecutor, "dns_resolution_thread_pool")); int resolveThreads = config.getResolveThreads(); int maxConcurrentSslHandshakes = config.getMaxConcurrentSslHandshakes(); int clientResolutionRetryAttempts = config.getClientResolutionRetryAttempts(); @@ -986,6 +991,10 @@ public void stopInner() throws Exception { if (dnsResolveExecutor != null) { dnsResolveExecutor.shutdownNow(); } + Utils.closeQuietlyWithErrorLogged(helixGroupSelector); + Utils.closeQuietlyWithErrorLogged(routerStats); + Utils.closeQuietlyWithErrorLogged(scatterGatherMode); + statsCloseables.close(); } public HelixBaseRoutingRepository getRoutingDataRepository() { diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceDelegateMode.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceDelegateMode.java index ee10178f72a..b80b4c257a2 100644 --- 
a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceDelegateMode.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VeniceDelegateMode.java @@ -26,7 +26,9 @@ import com.linkedin.venice.router.stats.RouterStats; import com.linkedin.venice.router.throttle.RouterThrottler; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.DaemonThreadFactory; +import java.io.Closeable; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -61,7 +63,7 @@ * TODO: maybe we should improve DDS lib to catch all kinds of exception in {@link ScatterGatherMode#scatter} to avoid * this potential leaking issue. */ -public class VeniceDelegateMode extends ScatterGatherMode { +public class VeniceDelegateMode extends ScatterGatherMode implements Closeable { public static final Logger LOGGER = LogManager.getLogger(VeniceDelegateMode.class); /** * The following constant defines the threshold for selecting a host based on its average latency. @@ -113,6 +115,8 @@ public class VeniceDelegateMode extends ScatterGatherMode { private final RoutingComputationMode routingComputationMode; private final ThreadPoolExecutor parallelRoutingExecutor; private final int parallelRoutingChunkSize; + /** Stats fields owned by this class; drained by {@link #close()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); public VeniceDelegateMode( VeniceRouterConfig config, @@ -157,10 +161,11 @@ public VeniceDelegateMode( new LinkedBlockingQueue<>(), new DaemonThreadFactory("Venice-Parallel-Routing", config.getLogContext())); if (routeHttpRequestStats.getMetricsRepository() != null) { - new ThreadPoolStats( - routeHttpRequestStats.getMetricsRepository(), - parallelRoutingExecutor, - "ParallelRoutingExecutor"); + statsCloseables.register( + new ThreadPoolStats( + routeHttpRequestStats.getMetricsRepository(), + parallelRoutingExecutor, + "ParallelRoutingExecutor")); } LOGGER.info( "Venice router parallel routing enabled, with thread pool size: {}, chunk size: {}", @@ -887,4 +892,9 @@ protected void selectHostForPartition( populateHostMap(hostMap, selectedHost, partitionKeys); } } + + @Override + public void close() { + statsCloseables.close(); + } } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java index a57e1c58c34..1bc1db1bac3 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/VenicePathParser.java @@ -39,6 +39,7 @@ import com.linkedin.venice.router.streaming.VeniceChunkedWriteHandler; import com.linkedin.venice.router.utils.VeniceRouterUtils; import com.linkedin.venice.streaming.StreamingUtils; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.netty.channel.ChannelHandlerContext; import io.netty.handler.codec.http.HttpRequest; @@ -151,8 +152,8 @@ public VenicePathParser( @Override public void handleStoreDeleted(String storeNameString) { StoreName storeName = VenicePathParser.this.nameRepository.getStoreName(storeNameString); - 
routerSingleKeyRetryManagers.remove(storeName); - routerMultiKeyRetryManagers.remove(storeName); + Utils.closeQuietlyWithErrorLogged(routerSingleKeyRetryManagers.remove(storeName)); + Utils.closeQuietlyWithErrorLogged(routerMultiKeyRetryManagers.remove(storeName)); cleanDecompressorMaps(storeName, storeVersionName -> true); } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/api/routing/helix/HelixGroupSelector.java b/services/venice-router/src/main/java/com/linkedin/venice/router/api/routing/helix/HelixGroupSelector.java index 1831c57d7d5..f7b093fabbe 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/api/routing/helix/HelixGroupSelector.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/api/routing/helix/HelixGroupSelector.java @@ -4,7 +4,9 @@ import com.linkedin.venice.exceptions.VeniceException; import com.linkedin.venice.helix.HelixInstanceConfigRepository; import com.linkedin.venice.stats.routing.HelixGroupStats; +import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; import java.util.concurrent.TimeUnit; @@ -13,7 +15,7 @@ * will delegate all the related API calls to the corresponding objects. * Besides that, this class is also in charge of emitting metrics for each Helix Group. */ -public class HelixGroupSelector implements HelixGroupSelectionStrategy { +public class HelixGroupSelector implements HelixGroupSelectionStrategy, Closeable { /** * The timeout to reset group counter. * So far, there is no need to make it very tight, so we will hard-code it to be 10 seconds. 
@@ -65,4 +67,9 @@ public int selectGroup(long requestId, int groupNum) { public void finishRequest(long requestId, int groupId, double latency) { selectionStrategy.finishRequest(requestId, groupId, latency); } + + @Override + public void close() { + Utils.closeQuietlyWithErrorLogged(helixGroupStats); + } } diff --git a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java index 4252c4128d4..0c3fd93c178 100644 --- a/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java +++ b/services/venice-router/src/main/java/com/linkedin/venice/router/stats/RouterHttpRequestStats.java @@ -198,7 +198,8 @@ public RouterHttpRequestStats( getName(), getFullMetricName(RouterTehutiMetricNameEnum.REQUEST_SIZE.getMetricName()))), baseDimensionsMap, - MessageType.class); + MessageType.class, + resources); healthyRequestMetric = MetricEntityStateThreeEnums.create( CALL_COUNT.getMetricEntity(), @@ -209,7 +210,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); unhealthyRequestMetric = MetricEntityStateThreeEnums.create( CALL_COUNT.getMetricEntity(), @@ -220,7 +222,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); tardyRequestMetric = MetricEntityStateThreeEnums.create( CALL_COUNT.getMetricEntity(), @@ -231,7 +234,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); throttledRequestMetric = MetricEntityStateThreeEnums.create( 
CALL_COUNT.getMetricEntity(), @@ -242,7 +246,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); badRequestMetric = MetricEntityStateThreeEnums.create( CALL_COUNT.getMetricEntity(), @@ -253,7 +258,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); latencyTehutiSensor = registerSensorWithDetailedPercentiles("latency", new Avg(), new Max(0)); healthyLatencyMetric = createCallTimeMetric( @@ -280,7 +286,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.ERROR_RETRY, singletonList(new Count()), baseDimensionsMap, - RequestRetryType.class); + RequestRetryType.class, + resources); allowedRetryCountMetric = MetricEntityStateBase.create( ALLOWED_RETRY_COUNT.getMetricEntity(), otelRepository, @@ -288,7 +295,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.ALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); disallowedRetryCountMetric = MetricEntityStateBase.create( DISALLOWED_RETRY_COUNT.getMetricEntity(), @@ -297,7 +305,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.DISALLOWED_RETRY_REQUEST_COUNT, singletonList(new OccurrenceRate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); retryDelayMetric = MetricEntityStateBase.create( RETRY_DELAY.getMetricEntity(), @@ -306,7 +315,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.RETRY_DELAY, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); delayConstraintAbortedRetryCountMetric = MetricEntityStateOneEnum.create( ABORTED_RETRY_COUNT.getMetricEntity(), @@ -315,7 +325,8 @@ public 
RouterHttpRequestStats( RouterTehutiMetricNameEnum.DELAY_CONSTRAINT_ABORTED_RETRY_REQUEST, singletonList(new Count()), baseDimensionsMap, - RequestRetryAbortReason.class); + RequestRetryAbortReason.class, + resources); slowRouteAbortedRetryCountMetric = MetricEntityStateOneEnum.create( ABORTED_RETRY_COUNT.getMetricEntity(), @@ -324,7 +335,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.SLOW_ROUTE_ABORTED_RETRY_REQUEST, singletonList(new Count()), baseDimensionsMap, - RequestRetryAbortReason.class); + RequestRetryAbortReason.class, + resources); retryRouteLimitAbortedRetryCountMetric = MetricEntityStateOneEnum.create( ABORTED_RETRY_COUNT.getMetricEntity(), @@ -333,7 +345,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.RETRY_ROUTE_LIMIT_ABORTED_RETRY_REQUEST, singletonList(new Count()), baseDimensionsMap, - RequestRetryAbortReason.class); + RequestRetryAbortReason.class, + resources); noAvailableReplicaAbortedRetryCountMetric = MetricEntityStateOneEnum.create( ABORTED_RETRY_COUNT.getMetricEntity(), @@ -342,7 +355,8 @@ public RouterHttpRequestStats( RouterTehutiMetricNameEnum.NO_AVAILABLE_REPLICA_ABORTED_RETRY_REQUEST, singletonList(new Count()), baseDimensionsMap, - RequestRetryAbortReason.class); + RequestRetryAbortReason.class, + resources); keyCountMetric = MetricEntityStateThreeEnums.create( KEY_COUNT.getMetricEntity(), @@ -350,7 +364,8 @@ public RouterHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); errorRetryAttemptTriggeredByPendingRequestCheckSensor = registerSensor("error_retry_attempt_triggered_by_pending_request_check", new OccurrenceRate()); @@ -458,7 +473,8 @@ public RouterHttpRequestStats( getName(), getFullMetricName(RouterTehutiMetricNameEnum.KEY_SIZE_IN_BYTE.getMetricName()))), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { keySizeMetric = null; 
} @@ -474,7 +490,8 @@ public RouterHttpRequestStats( getName(), getFullMetricName(RouterTehutiMetricNameEnum.RESPONSE_SIZE.getMetricName()))), baseDimensionsMap, - MessageType.class); + MessageType.class, + resources); // Initialize the in-flight request counter currentInFlightRequest = new AtomicInteger(); @@ -496,7 +513,8 @@ private MetricEntityStateFourEnums { +public class RouterStats implements Closeable { private final STAT_TYPE statsForSingleGet; private final STAT_TYPE statsForMultiGet; private final STAT_TYPE statsForCompute; @@ -43,4 +45,22 @@ public STAT_TYPE getStatsByType(RequestType requestType) { } } + /** + * Closes each per-request-type stats instance if it is {@link Closeable}. STAT_TYPE is generic; + * non-Closeable parameterisations are no-ops. + */ + @Override + public void close() { + closeIfCloseable(statsForSingleGet); + closeIfCloseable(statsForMultiGet); + closeIfCloseable(statsForCompute); + closeIfCloseable(statsForMultiGetStreaming); + closeIfCloseable(statsForComputeStreaming); + } + + private static void closeIfCloseable(Object stats) { + if (stats instanceof Closeable) { + MetricEntityStateUtils.closeQuietly((Closeable) stats); + } + } } diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java index 2658efc069d..d4bb5fa7f92 100644 --- a/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/api/TestVeniceDelegateMode.java @@ -1257,4 +1257,24 @@ public void testSelectLeastLoadedHostWithSingleHost() throws RouterException { // Should select the only available host Assert.assertEquals(requests.iterator().next().getHosts().get(0), instance1); } + + /** + * Exercises {@link VeniceDelegateMode#close()}. 
The instance owns a {@code statsCloseables} + * registry that may be empty (parallel routing disabled) or hold a {@link ThreadPoolStats} + * wrapper (parallel routing enabled). Close must drain whatever's there without throwing in + * either case, and remain idempotent. + */ + @Test + public void testCloseIsIdempotent() { + VeniceRouterConfig config = mock(VeniceRouterConfig.class); + doReturn(LEAST_LOADED_ROUTING).when(config).getMultiKeyRoutingStrategy(); + doReturn(RoutingComputationMode.SEQUENTIAL).when(config).getRoutingComputationMode(); + VeniceDelegateMode scatterMode = new VeniceDelegateMode( + config, + mock(RouterStats.class), + mock(RouteHttpRequestStats.class), + mock(RouterStats.class)); + scatterMode.close(); + scatterMode.close(); + } } diff --git a/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterStatsTest.java b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterStatsTest.java new file mode 100644 index 00000000000..136e2687f8f --- /dev/null +++ b/services/venice-router/src/test/java/com/linkedin/venice/router/stats/RouterStatsTest.java @@ -0,0 +1,79 @@ +package com.linkedin.venice.router.stats; + +import static com.linkedin.venice.read.RequestType.COMPUTE; +import static com.linkedin.venice.read.RequestType.COMPUTE_STREAMING; +import static com.linkedin.venice.read.RequestType.MULTI_GET; +import static com.linkedin.venice.read.RequestType.MULTI_GET_STREAMING; +import static com.linkedin.venice.read.RequestType.SINGLE_GET; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; +import static org.testng.Assert.assertTrue; + +import com.linkedin.venice.read.RequestType; +import java.io.Closeable; +import java.util.EnumMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import org.testng.annotations.Test; + + +public class RouterStatsTest { + /** Closeable double that records how many times {@link #close()} was invoked. 
*/ + private static final class CloseableStub implements Closeable { + final AtomicInteger closeCount = new AtomicInteger(); + + @Override + public void close() { + closeCount.incrementAndGet(); + } + } + + @Test + public void testGetStatsByTypeReturnsPerRequestTypeInstance() { + Map stubs = new EnumMap<>(RequestType.class); + for (RequestType type: new RequestType[] { SINGLE_GET, MULTI_GET, COMPUTE, MULTI_GET_STREAMING, + COMPUTE_STREAMING }) { + stubs.put(type, new CloseableStub()); + } + RouterStats stats = new RouterStats<>(stubs::get); + for (RequestType type: stubs.keySet()) { + assertEquals(stats.getStatsByType(type), stubs.get(type)); + } + } + + @Test + public void testCloseClosesEveryCloseableStat() { + Map stubs = new EnumMap<>(RequestType.class); + for (RequestType type: new RequestType[] { SINGLE_GET, MULTI_GET, COMPUTE, MULTI_GET_STREAMING, + COMPUTE_STREAMING }) { + stubs.put(type, new CloseableStub()); + } + RouterStats stats = new RouterStats<>(stubs::get); + stats.close(); + for (CloseableStub stub: stubs.values()) { + assertEquals(stub.closeCount.get(), 1, "Each per-request-type Closeable should be closed exactly once"); + } + } + + /** Verifies {@code RouterStats#close()} is a silent no-op when {@code STAT_TYPE} is not {@link Closeable}. */ + @Test + public void testCloseSafeForNonCloseableStatType() { + RouterStats stats = new RouterStats<>(type -> "stats-for-" + type.name()); + // Should not throw despite STAT_TYPE not being Closeable. + stats.close(); + assertEquals(stats.getStatsByType(SINGLE_GET), "stats-for-SINGLE_GET"); + } + + @Test + public void testCloseIsIdempotent() { + CloseableStub stub = new CloseableStub(); + RouterStats stats = new RouterStats<>(type -> stub); + stats.close(); + stats.close(); + // Each request-type slot holds the same stub instance, so it's closed once per slot per close() call. 
+ // Across two close() calls that's 2 * 5 invocations — the value isn't what matters, only that no exception fires + // and the per-Closeable close was attempted. + assertTrue(stub.closeCount.get() >= 5, "close() should not throw and should keep delegating on repeat calls"); + assertFalse(stub.closeCount.get() < 5); + } +} diff --git a/services/venice-server/src/main/java/com/linkedin/venice/cleaner/BackupVersionOptimizationService.java b/services/venice-server/src/main/java/com/linkedin/venice/cleaner/BackupVersionOptimizationService.java index 128e6823042..28d40b29209 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/cleaner/BackupVersionOptimizationService.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/cleaner/BackupVersionOptimizationService.java @@ -193,6 +193,7 @@ public void stopInner() throws Exception { stop = true; executor.shutdownNow(); executor.awaitTermination(30, TimeUnit.SECONDS); + Utils.closeQuietlyWithErrorLogged(stats); } @Override diff --git a/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java b/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java index de57ea5ea0e..afcb9f88e82 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/listener/HttpChannelInitializer.java @@ -26,6 +26,7 @@ import com.linkedin.venice.stats.ServerConnectionStats; import com.linkedin.venice.stats.ServerLoadStats; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; import com.linkedin.venice.utils.ReflectUtils; import com.linkedin.venice.utils.SslUtils; import com.linkedin.venice.utils.Utils; @@ -38,6 +39,7 @@ import io.netty.handler.codec.http.HttpServerCodec; import io.netty.handler.timeout.IdleStateHandler; import io.tehuti.metrics.MetricsRepository; +import 
java.io.Closeable; import java.util.ArrayList; import java.util.List; import java.util.Optional; @@ -47,7 +49,9 @@ import org.apache.logging.log4j.Logger; -public class HttpChannelInitializer extends ChannelInitializer { +public class HttpChannelInitializer extends ChannelInitializer implements Closeable { + /** Stats fields owned by this class; drained by {@link #close()}. */ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); private static final Logger LOGGER = LogManager.getLogger(HttpChannelInitializer.class); private final StorageReadRequestHandler requestHandler; @@ -125,9 +129,10 @@ public HttpChannelInitializer( this.sslFactory = sslFactory; this.alpiniSslFactory = sslFactory.isPresent() ? SslUtils.toAlpiniSSLFactory(sslFactory.get()) : null; this.sslHandshakeExecutor = sslHandshakeExecutor; - this.sslHandshakesThreadPoolStats = sslHandshakeExecutor != null - ? new ThreadPoolStats(metricsRepository, sslHandshakeExecutor, "ssl_handshake_thread_pool") - : null; + this.sslHandshakesThreadPoolStats = statsCloseables.register( + sslHandshakeExecutor != null + ? 
new ThreadPoolStats(metricsRepository, sslHandshakeExecutor, "ssl_handshake_thread_pool") + : null); Class identityParserClass = ReflectUtils.loadClass(serverConfig.getIdentityParserClassName()); this.identityParser = ReflectUtils.callConstructor(identityParserClass, new Class[0], new Object[0]); @@ -150,7 +155,8 @@ public HttpChannelInitializer( if (serverConfig.isQuotaEnforcementEnabled()) { String nodeId = Utils.getHelixNodeIdentifier(serverConfig.getListenerHostname(), serverConfig.getListenerPort()); - this.quotaUsageStats = new AggServerQuotaUsageStats(serverConfig.getClusterName(), metricsRepository); + this.quotaUsageStats = + statsCloseables.register(new AggServerQuotaUsageStats(serverConfig.getClusterName(), metricsRepository)); this.quotaEnforcer = new ReadQuotaEnforcementHandler( serverConfig, storeMetadataRepository, @@ -173,18 +179,20 @@ public HttpChannelInitializer( this.http2PipelineInitializerBuilder = new VeniceHttp2PipelineInitializerBuilder(serverConfig); if (sslFactory.isPresent()) { + ServerConnectionStats serverConnectionStats = statsCloseables.register( + new ServerConnectionStats(metricsRepository, "server_connection_stats", serverConfig.getClusterName())); this.serverConnectionStatsHandler = new ServerConnectionStatsHandler( this.identityParser, - new ServerConnectionStats(metricsRepository, "server_connection_stats", serverConfig.getClusterName()), + serverConnectionStats, serverConfig.getRouterPrincipalName(), serverConfig.getLogContext()); } else { this.serverConnectionStatsHandler = null; } if (serverConfig.isLoadControllerEnabled()) { - this.loadControllerHandler = new ServerLoadControllerHandler( - serverConfig, - new ServerLoadStats(metricsRepository, "server_load", serverConfig.getClusterName())); + ServerLoadStats serverLoadStats = statsCloseables + .register(new ServerLoadStats(metricsRepository, "server_load", serverConfig.getClusterName())); + this.loadControllerHandler = new ServerLoadControllerHandler(serverConfig, 
serverLoadStats); LOGGER.info("Server load controller is enabled"); } else { this.loadControllerHandler = null; @@ -192,6 +200,11 @@ public HttpChannelInitializer( } } + @Override + public void close() { + statsCloseables.close(); + } + /* Test only */ diff --git a/services/venice-server/src/main/java/com/linkedin/venice/listener/ListenerService.java b/services/venice-server/src/main/java/com/linkedin/venice/listener/ListenerService.java index ce3ddd3a97b..4438e266cb7 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/listener/ListenerService.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/listener/ListenerService.java @@ -19,6 +19,8 @@ import com.linkedin.venice.security.SSLFactory; import com.linkedin.venice.service.AbstractVeniceService; import com.linkedin.venice.stats.ThreadPoolStats; +import com.linkedin.venice.stats.metrics.CompositeCloseable; +import com.linkedin.venice.utils.Utils; import com.linkedin.venice.utils.concurrent.ThreadPoolFactory; import io.grpc.ServerInterceptor; import io.netty.bootstrap.ServerBootstrap; @@ -60,6 +62,9 @@ public class ListenerService extends AbstractVeniceService { private final ThreadPoolExecutor computeExecutor; private final ThreadPoolExecutor grpcExecutor; private ThreadPoolExecutor sslHandshakeExecutor; + private final HttpChannelInitializer channelInitializer; + /** Stats fields owned by this class; drained by {@link #stopInner()}. 
*/ + private final CompositeCloseable statsCloseables = new CompositeCloseable(); // TODO: move netty config to a config file private static int nettyBacklogSize = 1000; @@ -89,13 +94,13 @@ public ListenerService( serverConfig.getRestServiceStorageThreadNum(), "StorageExecutionThread", serverConfig.getDatabaseLookupQueueCapacity()); - new ThreadPoolStats(metricsRepository, executor, "storage_execution_thread_pool"); + statsCloseables.register(new ThreadPoolStats(metricsRepository, executor, "storage_execution_thread_pool")); computeExecutor = createThreadPool( serverConfig.getServerComputeThreadNum(), "StorageComputeThread", serverConfig.getComputeQueueCapacity()); - new ThreadPoolStats(metricsRepository, computeExecutor, "storage_compute_thread_pool"); + statsCloseables.register(new ThreadPoolStats(metricsRepository, computeExecutor, "storage_compute_thread_pool")); if (sslFactory.isPresent() && serverConfig.getSslHandshakeThreadPoolSize() > 0) { this.sslHandshakeExecutor = createThreadPool( @@ -116,7 +121,7 @@ public ListenerService( compressorFactory, resourceReadUsageTracker); - HttpChannelInitializer channelInitializer = new HttpChannelInitializer( + channelInitializer = new HttpChannelInitializer( storeMetadataRepository, customizedViewRepository, metricsRepository, @@ -215,6 +220,8 @@ public void stopInner() throws Exception { LOGGER.info("Stopping gRPC service on port {}", grpcPort); grpcServer.stop(); } + Utils.closeQuietlyWithErrorLogged(channelInitializer); + statsCloseables.close(); } protected ThreadPoolExecutor createThreadPool(int threadCount, String threadNamePrefix, int capacity) { diff --git a/services/venice-server/src/main/java/com/linkedin/venice/listener/ServerReadMetadataRepository.java b/services/venice-server/src/main/java/com/linkedin/venice/listener/ServerReadMetadataRepository.java index 0e7dfac3c2f..d7e141b68ee 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/listener/ServerReadMetadataRepository.java +++ 
b/services/venice-server/src/main/java/com/linkedin/venice/listener/ServerReadMetadataRepository.java @@ -24,7 +24,9 @@ import com.linkedin.venice.systemstore.schemas.StoreProperties; import com.linkedin.venice.systemstore.schemas.StoreValueSchemas; import com.linkedin.venice.utils.HelixUtils; +import com.linkedin.venice.utils.Utils; import io.tehuti.metrics.MetricsRepository; +import java.io.Closeable; import java.io.PrintWriter; import java.io.StringWriter; import java.util.ArrayList; @@ -41,7 +43,7 @@ /** * A wrapper that holds reference for various repositories responsible for constructing metadata responses upon request. */ -public class ServerReadMetadataRepository implements ReadMetadataRetriever { +public class ServerReadMetadataRepository implements ReadMetadataRetriever, Closeable { private static final Logger LOGGER = LogManager.getLogger(ServerReadMetadataRepository.class); private final String serverCluster; private final boolean sslEnabled; @@ -91,6 +93,11 @@ public ServerReadMetadataRepository( helixInstanceFuture.ifPresent(future -> future.thenApply(helix -> this.helixInstanceConfigRepository = helix)); } + @Override + public void close() { + Utils.closeQuietlyWithErrorLogged(serverMetadataServiceStats); + } + /** * Return the metadata information for the given store. The data is retrieved from its respective repositories which * originate from the VeniceServer. 
diff --git a/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java b/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java index 860ac2d4f9f..83bb9dafd45 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/server/VeniceServer.java @@ -72,6 +72,7 @@ import com.linkedin.venice.stats.BackupVersionOptimizationServiceStats; import com.linkedin.venice.stats.DiskHealthStats; import com.linkedin.venice.stats.VeniceJVMStats; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; import com.linkedin.venice.system.store.ControllerClientBackedSystemSchemaInitializer; import com.linkedin.venice.utils.CollectionUtils; import com.linkedin.venice.utils.Utils; @@ -97,7 +98,7 @@ * operations and making sure that it is done in the right order based on their * dependencies. */ -public class VeniceServer { +public class VeniceServer extends AbstractStatsCloseable { private static final Logger LOGGER = LogManager.getLogger(VeniceServer.class); private final List serviceDiscoveryAnnouncers; @@ -129,7 +130,6 @@ public class VeniceServer { private VeniceJVMStats jvmStats; private ICProvider icProvider; StorageEngineBackedCompressorFactory compressorFactory; - private StoreVersionOtelStats storeVersionOtelStats; private HeartbeatMonitoringService heartbeatMonitoringService; private AdaptiveThrottlerSignalService adaptiveThrottlerSignalService; private ServerReadMetadataRepository serverReadMetadataRepository; @@ -332,18 +332,19 @@ private List createServices() { clusterConfig.getClusterName()); // OTel per-store version gauge - storeVersionOtelStats = - StoreVersionOtelStats.create(metricsRepository, clusterConfig.getClusterName(), metadataRepo); + statsCloseables + .register(StoreVersionOtelStats.create(metricsRepository, clusterConfig.getClusterName(), metadataRepo)); boolean plainTableEnabled = 
veniceConfigLoader.getVeniceServerConfig().getRocksDBServerConfig().isRocksDBPlainTableFormatEnabled(); - RocksDBMemoryStats rocksDBMemoryStats = veniceConfigLoader.getVeniceServerConfig().isDatabaseMemoryStatsEnabled() - ? new RocksDBMemoryStats( - metricsRepository, - "RocksDBMemoryStats", - plainTableEnabled, - clusterConfig.getClusterName()) - : null; + RocksDBMemoryStats rocksDBMemoryStats = statsCloseables.register( + veniceConfigLoader.getVeniceServerConfig().isDatabaseMemoryStatsEnabled() + ? new RocksDBMemoryStats( + metricsRepository, + "RocksDBMemoryStats", + plainTableEnabled, + clusterConfig.getClusterName()) + : null); // Create and add StorageService. storeRepository will be populated by StorageService storageService = new StorageService( @@ -361,7 +362,9 @@ private List createServices() { // Create stats for RocksDB storageService.getRocksDBAggregatedStatistics() - .ifPresent(stat -> new AggRocksDBStats(serverConfig.getClusterName(), metricsRepository, stat)); + .ifPresent( + stat -> statsCloseables + .register(new AggRocksDBStats(serverConfig.getClusterName(), metricsRepository, stat))); compressorFactory = new StorageEngineBackedCompressorFactory(storageMetadataService); @@ -449,11 +452,12 @@ private List createServices() { serverConfig.getLogContext()); services.add(diskHealthCheckService); // create stats for disk health check service - new DiskHealthStats( - metricsRepository, - diskHealthCheckService, - "disk_health_check_service", - clusterConfig.getClusterName()); + statsCloseables.register( + new DiskHealthStats( + metricsRepository, + diskHealthCheckService, + "disk_health_check_service", + clusterConfig.getClusterName())); final Optional resourceReadUsageTracker; if (serverConfig.isOptimizeDatabaseForBackupVersionEnabled()) { @@ -479,15 +483,16 @@ private List createServices() { new StoreValueSchemasCacheService(metadataRepo, schemaRepo, serverConfig.getLogContext()); services.add(storeValueSchemasCacheService); - 
serverReadMetadataRepository = new ServerReadMetadataRepository( - clusterConfig.getClusterName(), - metricsRepository, - metadataRepo, - schemaRepo, - veniceMetadataRepositoryBuilder.getStoreConfigRepo(), - Optional.of(customizedViewFuture), - Optional.of(helixInstanceFuture), - sslFactory.isPresent()); + serverReadMetadataRepository = statsCloseables.register( + new ServerReadMetadataRepository( + clusterConfig.getClusterName(), + metricsRepository, + metadataRepo, + schemaRepo, + veniceMetadataRepositoryBuilder.getStoreConfigRepo(), + Optional.of(customizedViewFuture), + Optional.of(helixInstanceFuture), + sslFactory.isPresent())); // create and add ListenerServer for handling GET requests ListenerService listenerService = createListenerService( @@ -511,7 +516,8 @@ private List createServices() { * Initialize Blob transfer manager for Service */ if (BlobTransferUtils.isBlobTransferManagerEnabled(serverConfig)) { - aggVersionedBlobTransferStats = new AggVersionedBlobTransferStats(metricsRepository, metadataRepo, serverConfig); + aggVersionedBlobTransferStats = + statsCloseables.register(new AggVersionedBlobTransferStats(metricsRepository, metadataRepo, serverConfig)); aggBlobTransferStats = new AggBlobTransferStats( aggVersionedBlobTransferStats, kafkaStoreIngestionService.getHostLevelIngestionStats()); @@ -765,14 +771,7 @@ public void shutdown() throws VeniceException { LOGGER.info("All services have been stopped"); compressorFactory.close(); - if (storeVersionOtelStats != null) { - try { - storeVersionOtelStats.close(); - } catch (Exception e) { - exceptions.add(e); - LOGGER.error("Exception while closing StoreVersionOtelStats", e); - } - } + statsCloseables.close(); try { metricsRepository.close(); diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStats.java index c268f95e515..d07b93895b5 100644 
--- a/services/venice-server/src/main/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStats.java @@ -5,13 +5,15 @@ import com.linkedin.venice.cleaner.BackupVersionOptimizationService; import com.linkedin.venice.stats.dimensions.VeniceMetricsDimensions; import com.linkedin.venice.stats.dimensions.VeniceOperationOutcome; +import com.linkedin.venice.stats.metrics.AbstractStatsCloseable; +import com.linkedin.venice.stats.metrics.AsyncMetricEntityState.TehutiSensorRegistrationFunction; import com.linkedin.venice.stats.metrics.MetricEntityStateOneEnum; +import com.linkedin.venice.stats.metrics.MetricEntityStateUtils; import com.linkedin.venice.stats.metrics.TehutiMetricNameEnum; import com.linkedin.venice.utils.concurrent.VeniceConcurrentHashMap; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.OccurrenceRate; import java.util.Collections; -import java.util.HashMap; import java.util.Map; @@ -31,17 +33,37 @@ enum TehutiMetricName implements TehutiMetricNameEnum { private final VeniceOpenTelemetryMetricsRepository otelRepository; private final Map baseDimensionsMap; - /** - * Per-store joint Tehuti+OTel metric state maps. Two maps because they bind to different Tehuti sensors - * (success vs error) while sharing the same OTel instrument ({@code REOPEN_COUNT}) differentiated - * by {@link VeniceOperationOutcome}. Tehuti sensor is registered once (first store) and shared by all - * subsequent stores via {@code registerSensorIfAbsent}. Bounded by the number of stores on this host. - * When OTel is disabled, {@code otelRepository} is null and OTel recording is a no-op. - */ - private final Map> successPerStore = - new VeniceConcurrentHashMap<>(); - private final Map> errorPerStore = - new VeniceConcurrentHashMap<>(); + /** Per-store success/error wrappers; one OTel instrument, two Tehuti sensors. Bounded by host store count. 
*/ + private final Map perStoreEntryMap = new VeniceConcurrentHashMap<>(); + + private static final class PerStoreEntry extends AbstractStatsCloseable { + final MetricEntityStateOneEnum success; + final MetricEntityStateOneEnum error; + + PerStoreEntry( + VeniceOpenTelemetryMetricsRepository otelRepository, + Map dims, + TehutiSensorRegistrationFunction registerTehutiSensorFn) { + this.success = MetricEntityStateOneEnum.create( + REOPEN_COUNT.getMetricEntity(), + otelRepository, + registerTehutiSensorFn, + TehutiMetricName.BACKUP_VERSION_DATABASE_OPTIMIZATION, + Collections.singletonList(new OccurrenceRate()), + dims, + VeniceOperationOutcome.class, + statsCloseables); + this.error = MetricEntityStateOneEnum.create( + REOPEN_COUNT.getMetricEntity(), + otelRepository, + registerTehutiSensorFn, + TehutiMetricName.BACKUP_VERSION_DATA_OPTIMIZATION_ERROR, + Collections.singletonList(new OccurrenceRate()), + dims, + VeniceOperationOutcome.class, + statsCloseables); + } + } public BackupVersionOptimizationServiceStats(MetricsRepository metricsRepository, String name, String clusterName) { super(metricsRepository, name); @@ -53,34 +75,25 @@ public BackupVersionOptimizationServiceStats(MetricsRepository metricsRepository } public void recordBackupVersionDatabaseOptimization(String storeName) { - getOrCreateMetric(successPerStore, storeName, TehutiMetricName.BACKUP_VERSION_DATABASE_OPTIMIZATION) - .record(1, VeniceOperationOutcome.SUCCESS); + getOrCreateEntry(storeName).success.record(1, VeniceOperationOutcome.SUCCESS); } public void recordBackupVersionDatabaseOptimizationError(String storeName) { - getOrCreateMetric(errorPerStore, storeName, TehutiMetricName.BACKUP_VERSION_DATA_OPTIMIZATION_ERROR) - .record(1, VeniceOperationOutcome.FAIL); + getOrCreateEntry(storeName).error.record(1, VeniceOperationOutcome.FAIL); } - private MetricEntityStateOneEnum getOrCreateMetric( - Map> perStoreMap, - String storeName, - TehutiMetricName tehutiName) { - return 
perStoreMap.computeIfAbsent(storeName, k -> createPerStoreMetric(k, tehutiName)); + private PerStoreEntry getOrCreateEntry(String storeName) { + return perStoreEntryMap.computeIfAbsent( + storeName, + k -> new PerStoreEntry( + otelRepository, + OpenTelemetryMetricsSetup.buildStoreDimensionsMap(baseDimensionsMap, k), + this::registerSensorIfAbsent)); } - private MetricEntityStateOneEnum createPerStoreMetric( - String storeName, - TehutiMetricName tehutiName) { - Map storeDims = new HashMap<>(baseDimensionsMap); - storeDims.put(VeniceMetricsDimensions.VENICE_STORE_NAME, OpenTelemetryMetricsSetup.sanitizeStoreName(storeName)); - return MetricEntityStateOneEnum.create( - REOPEN_COUNT.getMetricEntity(), - otelRepository, - this::registerSensorIfAbsent, - tehutiName, - Collections.singletonList(new OccurrenceRate()), - storeDims, - VeniceOperationOutcome.class); + @Override + public void close() { + MetricEntityStateUtils.closeAndClear(perStoreEntryMap); + super.close(); } } diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/DiskHealthStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/DiskHealthStats.java index 20f7e7afdcb..1273ad08d06 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/DiskHealthStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/DiskHealthStats.java @@ -28,6 +28,8 @@ enum TehutiMetricName implements TehutiMetricNameEnum { DISK_HEALTHY } + private final AsyncMetricEntityStateBase diskHealthMetric; + public DiskHealthStats( MetricsRepository metricsRepository, DiskHealthCheckService diskHealthCheckService, @@ -41,7 +43,7 @@ public DiskHealthStats( Attributes baseAttributes = otelData.getBaseAttributes(); LongSupplier healthCallback = () -> diskHealthCheckService.isDiskHealthy() ? 
1 : 0; - AsyncMetricEntityStateBase.create( + diskHealthMetric = AsyncMetricEntityStateBase.create( DISK_HEALTH_STATUS.getMetricEntity(), otelData.getOtelRepository(), this::registerSensorIfAbsent, @@ -50,6 +52,7 @@ public DiskHealthStats( new AsyncGauge((ig, ig2) -> healthCallback.getAsLong(), TehutiMetricName.DISK_HEALTHY.getMetricName())), baseDimensionsMap, baseAttributes, - healthCallback); + healthCallback, + resources); } } diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/RocksDBStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/RocksDBStats.java index bf39746c163..8f4325a5019 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/RocksDBStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/RocksDBStats.java @@ -227,7 +227,7 @@ public RocksDBStats(MetricsRepository metricsRepository, String name, String clu return null; } return stat; - }, (stat, level) -> stat.getTickerCount(GET_HIT_TICKER_BY_LEVEL.get(level))); + }, (stat, level) -> stat.getTickerCount(GET_HIT_TICKER_BY_LEVEL.get(level)), resources); // --- Block Cache Hit Ratio: Tehuti-only (OTel derivable: hit{data} / (hit{data} + sum(miss))) --- registerSensorIfAbsent(new AsyncGauge((ig, ig2) -> { @@ -264,7 +264,8 @@ public RocksDBStats(MetricsRepository metricsRepository, String name, String clu otelRepository, baseDimensionsMap, baseAttributes, - readAmpOtelCallback); + readAmpOtelCallback, + resources); } /** Registers a Tehuti-only AsyncGauge for a RocksDB TickerType counter. */ @@ -284,7 +285,7 @@ private void registerJointTickerMetric( LongSupplier callback = () -> rocksDBStat != null ? 
rocksDBStat.getTickerCount(tickerType) : -1; registerSensorIfAbsent(new AsyncGauge((ig, ig2) -> callback.getAsLong(), tehutiSensorName)); AsyncMetricEntityStateBase - .create(otelEntity.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes, callback); + .create(otelEntity.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes, callback, resources); } /** Registers an OTel-only AsyncMetricEntityStateOneEnum for per-component block cache metrics. */ @@ -305,7 +306,8 @@ private void registerComponentMetric( } return stat; }, - (stat, component) -> stat.getTickerCount(componentTickers.get(component))); + (stat, component) -> stat.getTickerCount(componentTickers.get(component)), + resources); } public void setRocksDBStat(Statistics stat) { diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerConnectionStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerConnectionStats.java index d9a1b0f395c..93a69eb4bad 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerConnectionStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerConnectionStats.java @@ -68,7 +68,8 @@ public ServerConnectionStats(MetricsRepository metricsRepository, String name, S CONNECTION_ACTIVE_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, - VeniceConnectionSource.class); + VeniceConnectionSource.class, + resources); routerRequestCountOtel = MetricEntityStateOneEnum.create( CONNECTION_REQUEST_COUNT.getMetricEntity(), @@ -77,7 +78,8 @@ public ServerConnectionStats(MetricsRepository metricsRepository, String name, S TehutiMetricName.ROUTER_CONNECTION_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceConnectionSource.class); + VeniceConnectionSource.class, + resources); clientRequestCountOtel = MetricEntityStateOneEnum.create( CONNECTION_REQUEST_COUNT.getMetricEntity(), @@ -86,7 +88,8 @@ public 
ServerConnectionStats(MetricsRepository metricsRepository, String name, S TehutiMetricName.CLIENT_CONNECTION_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceConnectionSource.class); + VeniceConnectionSource.class, + resources); // Tehuti only — OTel total derived at query time connectionRequestSensor = registerSensorIfAbsent(CONNECTION_REQUEST, new OccurrenceRate()); @@ -99,7 +102,8 @@ public ServerConnectionStats(MetricsRepository metricsRepository, String name, S TehutiMetricName.NEW_CONNECTION_SETUP_LATENCY, Arrays.asList(TehutiUtils.getPercentileStatWithAvgAndMax(getName(), NEW_CONNECTION_SETUP_LATENCY)), baseDimensionsMap, - VeniceConnectionSource.class); + VeniceConnectionSource.class, + resources); } public void incrementRouterConnectionCount() { diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerHttpRequestStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerHttpRequestStats.java index bc5f8b2553b..7b8b1794e93 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerHttpRequestStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerHttpRequestStats.java @@ -144,7 +144,8 @@ public ServerHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); errorRequestMetric = MetricEntityStateThreeEnums.create( READ_CALL_COUNT.getMetricEntity(), @@ -155,7 +156,8 @@ public ServerHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); successRequestRatioSensor = registerSensor( "success_request_ratio", @@ -172,7 +174,8 @@ public ServerHttpRequestStats( HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, 
VeniceResponseStatusCategory.class, - VeniceRequestKeyCountBucket.class); + VeniceRequestKeyCountBucket.class, + resources); errorRequestLatencyMetric = MetricEntityStateFourEnums.create( READ_CALL_TIME.getMetricEntity(), @@ -185,7 +188,8 @@ public ServerHttpRequestStats( HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, VeniceResponseStatusCategory.class, - VeniceRequestKeyCountBucket.class); + VeniceRequestKeyCountBucket.class, + resources); responseSizeMetric = MetricEntityStateThreeEnums.create( READ_RESPONSE_SIZE.getMetricEntity(), @@ -196,7 +200,8 @@ public ServerHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); storageEngineQueryTimeMetric = MetricEntityStateOneEnum.create( STORAGE_ENGINE_QUERY_TIME.getMetricEntity(), @@ -206,7 +211,8 @@ public ServerHttpRequestStats( Arrays.asList( TehutiUtils.get99PercentileStatWithAvgAndMax(getName(), getFullMetricName("storage_engine_query_latency"))), baseDimensionsMap, - VeniceChunkingStatus.class); + VeniceChunkingStatus.class, + resources); readComputeQueryTimeMetric = MetricEntityStateBase.create( STORAGE_ENGINE_READ_COMPUTE_EXECUTION_TIME.getMetricEntity(), @@ -217,7 +223,8 @@ public ServerHttpRequestStats( TehutiUtils .getPercentileStatWithAvgAndMax(getName(), getFullMetricName("storage_engine_read_compute_latency"))), computeBaseDimensionsMap, - computeBaseAttributes); + computeBaseAttributes, + resources); databaseLookupLatencyForSmallValueSensor = registerPerStoreAndTotal( "storage_engine_query_latency_for_small_value", @@ -249,7 +256,8 @@ public ServerHttpRequestStats( ServerTehutiMetricName.STORAGE_ENGINE_LARGE_VALUE_LOOKUP, largeValueLookupStats, baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); // Queue metrics: Tehuti total-only; per-store recording routes to total's Tehuti sensor. OTel records per-store. 
if (totalStats == null) { @@ -263,10 +271,15 @@ public ServerHttpRequestStats( getName(), getFullMetricName("storage_execution_handler_submission_wait_time"))), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { - queueWaitTimeMetric = MetricEntityStateBase - .create(STORAGE_ENGINE_QUEUE_WAIT_TIME.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes); + queueWaitTimeMetric = MetricEntityStateBase.create( + STORAGE_ENGINE_QUEUE_WAIT_TIME.getMetricEntity(), + otelRepository, + baseDimensionsMap, + baseAttributes, + resources); // Wire per-store recording to total's Tehuti sensor (replicates old registerOnlyTotalSensor behavior) queueWaitTimeMetric.setTehutiSensor(totalStats.queueWaitTimeMetric.getTehutiSensor()); } @@ -279,10 +292,15 @@ public ServerHttpRequestStats( ServerTehutiMetricName.STORAGE_EXECUTION_QUEUE_LEN, Arrays.asList(new Max(), new Avg()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { - queueSizeMetric = MetricEntityStateBase - .create(STORAGE_ENGINE_QUEUE_SIZE.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes); + queueSizeMetric = MetricEntityStateBase.create( + STORAGE_ENGINE_QUEUE_SIZE.getMetricEntity(), + otelRepository, + baseDimensionsMap, + baseAttributes, + resources); // Wire per-store recording to total's Tehuti sensor (replicates old registerOnlyTotalSensor behavior) queueSizeMetric.setTehutiSensor(totalStats.queueSizeMetric.getTehutiSensor()); } @@ -295,13 +313,18 @@ public ServerHttpRequestStats( ServerTehutiMetricName.REQUEST_KEY_COUNT, Arrays.asList(new Rate(), new OccurrenceRate(), new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { // Single-get key count is always 1. Tehuti skips it (no sensor), but OTel intentionally // records it so that KPS (keys per second) can be measured accurately from a single metric // across all request types without special-casing single-get. 
- requestKeyCountMetric = MetricEntityStateBase - .create(READ_REQUEST_KEY_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes); + requestKeyCountMetric = MetricEntityStateBase.create( + READ_REQUEST_KEY_COUNT.getMetricEntity(), + otelRepository, + baseDimensionsMap, + baseAttributes, + resources); } keyNotFoundMetric = MetricEntityStateBase.create( @@ -311,7 +334,8 @@ public ServerHttpRequestStats( ServerTehutiMetricName.KEY_NOT_FOUND, Arrays.asList(new Rate()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); requestSizeMetric = MetricEntityStateBase.create( READ_REQUEST_SIZE.getMetricEntity(), @@ -320,7 +344,8 @@ public ServerHttpRequestStats( ServerTehutiMetricName.REQUEST_SIZE_IN_BYTES, Arrays.asList(new Avg(), new Min(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); readComputeLatencyForSmallValueSensor = registerPerStoreAndTotal( "storage_engine_read_compute_latency_for_small_value", @@ -356,7 +381,8 @@ public ServerHttpRequestStats( getName(), getFullMetricName("storage_engine_read_compute_deserialization_latency"))), computeBaseDimensionsMap, - VeniceChunkingStatus.class); + VeniceChunkingStatus.class, + resources); serializationTimeMetric = MetricEntityStateBase.create( STORAGE_ENGINE_READ_COMPUTE_SERIALIZATION_TIME.getMetricEntity(), @@ -368,7 +394,8 @@ public ServerHttpRequestStats( getName(), getFullMetricName("storage_engine_read_compute_serialization_latency"))), computeBaseDimensionsMap, - computeBaseAttributes); + computeBaseAttributes, + resources); // All four compute op metrics share one OTel entity (STORAGE_ENGINE_READ_COMPUTE_EXECUTION_COUNT) // differentiated by the VeniceComputeOperationType dimension. 
Separate fields are needed @@ -430,7 +457,8 @@ public ServerHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); requestKeySizeMetric = MetricEntityStateBase.create( READ_REQUEST_KEY_SIZE.getMetricEntity(), @@ -439,7 +467,8 @@ public ServerHttpRequestStats( ServerTehutiMetricName.REQUEST_KEY_SIZE, Arrays.asList(keySizeStats), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } else { responseValueSizeMetric = MetricEntityStateThreeEnums.create( READ_RESPONSE_VALUE_SIZE.getMetricEntity(), @@ -447,10 +476,15 @@ public ServerHttpRequestStats( baseDimensionsMap, HttpResponseStatusEnum.class, HttpResponseStatusCodeCategory.class, - VeniceResponseStatusCategory.class); + VeniceResponseStatusCategory.class, + resources); - requestKeySizeMetric = MetricEntityStateBase - .create(READ_REQUEST_KEY_SIZE.getMetricEntity(), otelRepository, baseDimensionsMap, baseAttributes); + requestKeySizeMetric = MetricEntityStateBase.create( + READ_REQUEST_KEY_SIZE.getMetricEntity(), + otelRepository, + baseDimensionsMap, + baseAttributes, + resources); } misroutedStoreVersionSensor = registerPerStoreAndTotal( @@ -466,7 +500,8 @@ public ServerHttpRequestStats( ServerTehutiMetricName.FLUSH_LATENCY, Arrays.asList(TehutiUtils.getPercentileStat(getName(), getFullMetricName("flush_latency"))), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } /** Registers a Tehuti-only sensor that propagates per-store recordings to the total. 
*/ @@ -496,7 +531,8 @@ private MetricEntityStateOneEnum createComputeOpMetr tehutiName, Arrays.asList(new Avg(), new Total()), baseDims, - VeniceComputeOperationType.class); + VeniceComputeOperationType.class, + resources); } public void recordSuccessRequest( diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerLoadStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerLoadStats.java index fb3ebd644e9..5b921de4825 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerLoadStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerLoadStats.java @@ -57,7 +57,8 @@ public ServerLoadStats(MetricsRepository metricsRepository, String name, String TehutiMetricName.REJECTED_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceServerLoadRequestOutcome.class); + VeniceServerLoadRequestOutcome.class, + resources); acceptedRequestMetric = MetricEntityStateOneEnum.create( REQUEST_COUNT.getMetricEntity(), @@ -66,7 +67,8 @@ public ServerLoadStats(MetricsRepository metricsRepository, String name, String TehutiMetricName.ACCEPTED_REQUEST, Collections.singletonList(new OccurrenceRate()), baseDimensionsMap, - VeniceServerLoadRequestOutcome.class); + VeniceServerLoadRequestOutcome.class, + resources); rejectionRatioMetric = MetricEntityStateBase.create( REJECTION_RATIO.getMetricEntity(), @@ -75,7 +77,8 @@ public ServerLoadStats(MetricsRepository metricsRepository, String name, String TehutiMetricName.REJECTION_RATIO, Arrays.asList(new Avg(), new Max()), baseDimensionsMap, - baseAttributes); + baseAttributes, + resources); } /** Tehuti-only: total request count. OTel derives total from sum(ACCEPTED + REJECTED). 
*/ diff --git a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerReadQuotaUsageStats.java b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerReadQuotaUsageStats.java index 4cda8889357..8dccd406613 100644 --- a/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerReadQuotaUsageStats.java +++ b/services/venice-server/src/main/java/com/linkedin/venice/stats/ServerReadQuotaUsageStats.java @@ -95,6 +95,8 @@ static VersionRole classifyVersion(int version, QuotaVersionInfo versionInfo) { private final MetricEntityStateTwoEnums requestCount; /** OTel high-perf counter with QuotaRequestOutcome × VersionRole dimensions. */ private final MetricEntityStateTwoEnums keyCount; + /** Joint Tehuti+OTel async gauge for usage ratio. */ + private final AsyncMetricEntityStateBase usageRatioMetric; /** Tehuti-only sensors for rejected QPS/KPS (unversioned Rate) */ private final Sensor rejectedQPSSensor; @@ -168,7 +170,7 @@ public ServerReadQuotaUsageStats(MetricsRepository metricsRepository, String nam // --- AsyncDoubleGauge: usage ratio (joint Tehuti + OTel, no VersionRole dimension) --- // OTel records the raw ratio as a double (e.g., 0.75 = 75% usage). NaN (uninitialized) maps to 0.0. - AsyncMetricEntityStateBase.create( + usageRatioMetric = AsyncMetricEntityStateBase.create( ServerReadQuotaOtelMetricEntity.READ_QUOTA_USAGE_RATIO.getMetricEntity(), otelRepository, this::registerSensor, @@ -182,7 +184,8 @@ public ServerReadQuotaUsageStats(MetricsRepository metricsRepository, String nam (DoubleSupplier) () -> { Double ratio = getReadQuotaUsageRatio(); return ratio.isNaN() ? 
0.0 : ratio; - }); + }, + resources); // --- OTel high-perf counters: request.count and key.count with outcome+role dimensions --- requestCount = MetricEntityStateTwoEnums.create( @@ -190,13 +193,15 @@ public ServerReadQuotaUsageStats(MetricsRepository metricsRepository, String nam otelRepository, baseDimensionsMap, QuotaRequestOutcome.class, - VersionRole.class); + VersionRole.class, + resources); keyCount = MetricEntityStateTwoEnums.create( ServerReadQuotaOtelMetricEntity.READ_QUOTA_KEY_COUNT.getMetricEntity(), otelRepository, baseDimensionsMap, QuotaRequestOutcome.class, - VersionRole.class); + VersionRole.class, + resources); } /** diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStatsTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStatsTest.java index 3011fa1c329..11d1465285a 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStatsTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/BackupVersionOptimizationServiceStatsTest.java @@ -11,6 +11,7 @@ import com.linkedin.venice.stats.dimensions.VeniceOperationOutcome; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; import io.tehuti.metrics.MetricConfig; @@ -38,13 +39,11 @@ public class BackupVersionOptimizationServiceStatsTest { public void setUp() { inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - 
.setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); stats = new BackupVersionOptimizationServiceStats( metricsRepository, "BackupVersionOptimizationService", @@ -218,11 +217,8 @@ public void testClusterNameArgumentPropagatesToOtelAttributes() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor localExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(localExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, localExecutor)) { exerciseAllRecordingPaths(disabledRepo); } } diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/DiskHealthStatsTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/DiskHealthStatsTest.java index 5acea75a566..2fd8ae7a868 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/DiskHealthStatsTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/DiskHealthStatsTest.java @@ -9,9 +9,9 @@ import com.linkedin.davinci.storage.DiskHealthCheckService; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; -import io.tehuti.metrics.MetricConfig; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; import org.testng.annotations.AfterMethod; @@ -39,13 +39,11 @@ public class DiskHealthStatsTest { public void setUp() { 
inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); mockService = mock(DiskHealthCheckService.class); doReturn(true).when(mockService).isDiskHealthy(); new DiskHealthStats(metricsRepository, mockService, STATS_NAME, TEST_CLUSTER_NAME); @@ -99,11 +97,8 @@ public void testTehutiSensorReportsHealth() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor dedicatedExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(dedicatedExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, dedicatedExecutor)) { new DiskHealthStats(disabledRepo, mockService, STATS_NAME, TEST_CLUSTER_NAME); } } diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/RocksDBStatsOtelTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/RocksDBStatsOtelTest.java index 567132917a5..841ec4a9d69 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/RocksDBStatsOtelTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/RocksDBStatsOtelTest.java @@ -27,6 +27,7 @@ import 
com.linkedin.venice.stats.dimensions.VeniceRocksDBBlockCacheComponent; import com.linkedin.venice.stats.dimensions.VeniceRocksDBLevel; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.metrics.data.MetricData; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; @@ -59,13 +60,11 @@ public void setUp() throws IOException { // executor it was given. Without this, close() in tearDown / try-with-resources would shut // down the static singleton DEFAULT_ASYNC_GAUGE_EXECUTOR JVM-wide and break later tests. asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); mockStats = mock(Statistics.class); rocksDBStats = new RocksDBStats(metricsRepository, "rocksdb_stat", TEST_CLUSTER_NAME); rocksDBStats.setRocksDBStat(mockStats); diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerLoadStatsTest.java b/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerLoadStatsTest.java index d5aa8778cda..7a3a7f8c34a 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerLoadStatsTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerLoadStatsTest.java @@ -11,9 +11,9 @@ import com.linkedin.venice.stats.dimensions.VeniceServerLoadRequestOutcome; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; +import 
com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.testing.exporter.InMemoryMetricReader; -import io.tehuti.metrics.MetricConfig; import io.tehuti.metrics.MetricsRepository; import io.tehuti.metrics.stats.AsyncGauge; import org.testng.annotations.AfterMethod; @@ -37,13 +37,11 @@ public class ServerLoadStatsTest { public void setUp() { inMemoryMetricReader = InMemoryMetricReader.create(); asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .setTehutiMetricConfig(new MetricConfig(asyncGaugeExecutor)) - .build()); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); stats = new ServerLoadStats(metricsRepository, "server_load", TEST_CLUSTER_NAME); } @@ -221,11 +219,8 @@ public void testTehutiSensorsNoCrossContamination() { @Test public void testNoNpeWhenOtelDisabled() { AsyncGauge.AsyncGaugeExecutor localExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); - try (VeniceMetricsRepository disabledRepo = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setEmitOtelMetrics(false) - .setTehutiMetricConfig(new MetricConfig(localExecutor)) - .build())) { + try (VeniceMetricsRepository disabledRepo = + MetricsRepositoryUtils.createOtelDisabledRepository(TEST_METRIC_PREFIX, localExecutor)) { exerciseAllRecordingPaths(disabledRepo); } } diff --git a/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerReadQuotaUsageStatsOtelTest.java 
b/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerReadQuotaUsageStatsOtelTest.java index 6a51f2ead3f..dd777e763b7 100644 --- a/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerReadQuotaUsageStatsOtelTest.java +++ b/services/venice-server/src/test/java/com/linkedin/venice/stats/ServerReadQuotaUsageStatsOtelTest.java @@ -14,6 +14,7 @@ import com.linkedin.venice.stats.dimensions.QuotaRequestOutcome; import com.linkedin.venice.utils.OpenTelemetryDataTestUtils; import com.linkedin.venice.utils.TestMockTime; +import com.linkedin.venice.utils.metrics.MetricsRepositoryUtils; import io.opentelemetry.api.common.Attributes; import io.opentelemetry.sdk.metrics.data.LongPointData; import io.opentelemetry.sdk.metrics.data.MetricData; @@ -35,17 +36,19 @@ public class ServerReadQuotaUsageStatsOtelTest { private InMemoryMetricReader inMemoryMetricReader; private VeniceMetricsRepository metricsRepository; private TestMockTime mockTime; + private AsyncGauge.AsyncGaugeExecutor asyncGaugeExecutor; @BeforeMethod public void setUp() { mockTime = new TestMockTime(); inMemoryMetricReader = InMemoryMetricReader.create(); - metricsRepository = new VeniceMetricsRepository( - new VeniceMetricsConfig.Builder().setMetricPrefix(TEST_METRIC_PREFIX) - .setMetricEntities(SERVER_METRIC_ENTITIES) - .setEmitOtelMetrics(true) - .setOtelAdditionalMetricsReader(inMemoryMetricReader) - .build()); + // Dedicated executor so tearDown()'s close() doesn't shut down Tehuti's JVM-wide static singleton. + asyncGaugeExecutor = new AsyncGauge.AsyncGaugeExecutor.Builder().build(); + metricsRepository = MetricsRepositoryUtils.createOtelEnabledRepository( + TEST_METRIC_PREFIX, + SERVER_METRIC_ENTITIES, + inMemoryMetricReader, + asyncGaugeExecutor); } @AfterMethod