diff --git a/ddprof-lib/src/main/cpp/arguments.cpp b/ddprof-lib/src/main/cpp/arguments.cpp index 72b8aec22..0b310c34f 100644 --- a/ddprof-lib/src/main/cpp/arguments.cpp +++ b/ddprof-lib/src/main/cpp/arguments.cpp @@ -374,6 +374,15 @@ Error Arguments::parse(const char *args) { } } + CASE("ctxstorage") + if (value != NULL) { + if (strcmp(value, "otel") == 0) { + _context_storage = CTX_STORAGE_OTEL; + } else { + _context_storage = CTX_STORAGE_PROFILER; + } + } + DEFAULT() if (_unknown_arg == NULL) _unknown_arg = arg; diff --git a/ddprof-lib/src/main/cpp/arguments.h b/ddprof-lib/src/main/cpp/arguments.h index 3f2542705..aeb86d12f 100644 --- a/ddprof-lib/src/main/cpp/arguments.h +++ b/ddprof-lib/src/main/cpp/arguments.h @@ -92,6 +92,17 @@ enum Clock { CLK_MONOTONIC }; +/** + * Context storage mode for trace/span context. + * + * PROFILER: Use existing TLS-based storage (proven async-signal safe) + * OTEL: Use OTEL ring buffer storage (discoverable by external profilers, default) + */ +enum ContextStorageMode { + CTX_STORAGE_PROFILER, // TLS-based storage + CTX_STORAGE_OTEL // Default: OTEL ring buffer storage +}; + // Keep this in sync with JfrSync.java enum EventMask { EM_CPU = 1, @@ -189,6 +200,7 @@ class Arguments { bool _lightweight; bool _enable_method_cleanup; bool _remote_symbolication; // Enable remote symbolication for native frames + ContextStorageMode _context_storage; // Context storage mode (profiler TLS or OTEL buffer) Arguments(bool persistent = false) : _buf(NULL), @@ -223,7 +235,8 @@ class Arguments { _wallclock_sampler(ASGCT), _lightweight(false), _enable_method_cleanup(true), - _remote_symbolication(false) {} + _remote_symbolication(false), + _context_storage(CTX_STORAGE_OTEL) {} ~Arguments(); diff --git a/ddprof-lib/src/main/cpp/context_api.cpp b/ddprof-lib/src/main/cpp/context_api.cpp new file mode 100644 index 000000000..22f35901b --- /dev/null +++ b/ddprof-lib/src/main/cpp/context_api.cpp @@ -0,0 +1,169 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "context_api.h" +#include "context.h" +#include "otel_context.h" +#include "common.h" // For TEST_LOG +#include "os.h" // For OS::threadId() + +// Static member initialization +// Default to OTEL mode for tracer-only usage (when profiler is not started) +ContextStorageMode ContextApi::_mode = CTX_STORAGE_OTEL; +bool ContextApi::_initialized = false; + +bool ContextApi::initialize(const Arguments& args) { + if (__atomic_load_n(&_initialized, __ATOMIC_ACQUIRE)) { + TEST_LOG("ContextApi::initialize - already initialized, mode=%s", + __atomic_load_n(&_mode, __ATOMIC_ACQUIRE) == CTX_STORAGE_OTEL ? "OTEL" : "PROFILER"); + return true; + } + + ContextStorageMode mode = args._context_storage; + TEST_LOG("ContextApi::initialize - requested mode=%s", + mode == CTX_STORAGE_OTEL ? "OTEL" : "PROFILER"); + + if (mode == CTX_STORAGE_OTEL) { + if (!OtelContexts::initialize()) { + // Failed to initialize OTEL buffer, fall back to profiler mode + TEST_LOG("ContextApi::initialize - OTEL initialization failed, falling back to PROFILER mode"); + mode = CTX_STORAGE_PROFILER; + __atomic_store_n(&_mode, mode, __ATOMIC_RELEASE); + return false; + } + TEST_LOG("ContextApi::initialize - OTEL mode initialized successfully"); + } else { + TEST_LOG("ContextApi::initialize - PROFILER mode selected (uses TLS context_tls_v1)"); + } + // PROFILER mode uses existing TLS (context_tls_v1) - no explicit init needed + + __atomic_store_n(&_mode, mode, __ATOMIC_RELEASE); + __atomic_store_n(&_initialized, true, __ATOMIC_RELEASE); + return true; +} + +void ContextApi::shutdown() { + if (!__atomic_load_n(&_initialized, __ATOMIC_ACQUIRE)) { + return; + } + + // Always shutdown OTEL buffer if it exists, regardless of current mode. + // This ensures the buffer is properly cleaned up when switching modes. + // OtelContexts::shutdown() is safe to call even if OTEL was never initialized. + OtelContexts::shutdown(); + + __atomic_store_n(&_initialized, false, __ATOMIC_RELEASE); +} + +bool ContextApi::isInitialized() { + return __atomic_load_n(&_initialized, __ATOMIC_ACQUIRE); +} + +ContextStorageMode ContextApi::getMode() { + return __atomic_load_n(&_mode, __ATOMIC_ACQUIRE); +} + +void ContextApi::set(u64 span_id, u64 root_span_id) { + // Map Datadog format to storage + // In OTEL mode: trace_id = (0, root_span_id), span_id = span_id + setOtel(0, root_span_id, span_id); +} + +void ContextApi::setOtel(u64 trace_id_high, u64 trace_id_low, u64 span_id) { + // Use atomic load for mode check - may be called from signal handlers + ContextStorageMode mode = __atomic_load_n(&_mode, __ATOMIC_ACQUIRE); + + TEST_LOG("ContextApi::setOtel: tid=%d mode=%s trace_high=0x%llx trace_low=0x%llx span=0x%llx", + OS::threadId(), mode == CTX_STORAGE_OTEL ? "OTEL" : "PROFILER", + (unsigned long long)trace_id_high, (unsigned long long)trace_id_low, + (unsigned long long)span_id); + + if (mode == CTX_STORAGE_OTEL) { + OtelContexts::set(trace_id_high, trace_id_low, span_id); + } else { + // Profiler mode: use existing TLS + // Note: trace_id_high is ignored in profiler mode (only 64-bit root span ID) + Context& ctx = Contexts::get(); + + // Use checksum protocol for torn-read safety with proper memory ordering + // 1. Clear checksum to mark update in progress (release to ensure visibility) + __atomic_store_n(&ctx.checksum, 0ULL, __ATOMIC_RELEASE); + + // 2. Write data fields with relaxed atomics (ordering guaranteed by checksum barriers) + __atomic_store_n(&ctx.spanId, span_id, __ATOMIC_RELAXED); + __atomic_store_n(&ctx.rootSpanId, trace_id_low, __ATOMIC_RELAXED); + + // 3. Set final checksum with release semantics + // This ensures all prior writes are visible before checksum update + u64 newChecksum = Contexts::checksum(span_id, trace_id_low); + __atomic_store_n(&ctx.checksum, newChecksum, __ATOMIC_RELEASE); + } +} + +bool ContextApi::get(u64& span_id, u64& root_span_id) { + // Use atomic load for mode check - may be called from signal handlers + ContextStorageMode mode = __atomic_load_n(&_mode, __ATOMIC_ACQUIRE); + + if (mode == CTX_STORAGE_OTEL) { + u64 trace_high, trace_low; + if (OtelContexts::get(trace_high, trace_low, span_id)) { + root_span_id = trace_low; + return true; + } + return false; + } else { + // Profiler mode: use existing TLS + Context& ctx = Contexts::get(); + // Read with acquire to synchronize with release in set() + u64 checksum1 = __atomic_load_n(&ctx.checksum, __ATOMIC_ACQUIRE); + span_id = __atomic_load_n(&ctx.spanId, __ATOMIC_RELAXED); + root_span_id = __atomic_load_n(&ctx.rootSpanId, __ATOMIC_RELAXED); + // Validate checksum to detect torn reads + return checksum1 != 0 && checksum1 == Contexts::checksum(span_id, root_span_id); + } +} + +bool ContextApi::getByTid(int tid, u64& span_id, u64& root_span_id) { + // Use atomic load for mode check - may be called from signal handlers + ContextStorageMode mode = __atomic_load_n(&_mode, __ATOMIC_ACQUIRE); + + if (mode == CTX_STORAGE_OTEL) { + u64 trace_high, trace_low; + if (OtelContexts::getByTid(tid, trace_high, trace_low, span_id)) { + root_span_id = trace_low; + return true; + } + return false; + } else { + // Profiler mode: cannot read other thread's TLS + // This is a limitation - JVMTI wall-clock needs OTEL mode for remote reads + // Fall back to returning false (no context available) + span_id = 0; + root_span_id = 0; + return false; + } +} + +void ContextApi::clear() { + // Clear context based on storage mode + if (_mode == CTX_STORAGE_OTEL) { + // In OTEL mode, properly clear the V2 record (sets valid=0, pointer=nullptr) + OtelContexts::clear(); + } else { + // In PROFILER mode, clear by setting checksum to 0 + set(0, 0); + } +} diff --git a/ddprof-lib/src/main/cpp/context_api.h b/ddprof-lib/src/main/cpp/context_api.h new file mode 100644 index 000000000..24ed619f9 --- /dev/null +++ b/ddprof-lib/src/main/cpp/context_api.h @@ -0,0 +1,131 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _CONTEXT_API_H +#define _CONTEXT_API_H + +#include "arch.h" +#include "arguments.h" + +/** + * Unified context API for trace/span context storage. + * + * This class provides a mode-agnostic interface for reading and writing + * thread context. The actual storage is selected at initialization time + * based on the Arguments::_context_storage setting: + * + * - CTX_STORAGE_PROFILER: Uses existing TLS-based storage (context_tls_v1) + * - CTX_STORAGE_OTEL: Uses OTEL ring buffer storage (discoverable by external profilers) + * + * The abstraction allows signal handlers and JNI code to remain unchanged + * while the underlying storage mechanism can be switched via configuration. + */ +class ContextApi { +public: + /** + * Initialize context storage based on configuration. + * + * Must be called once during profiler startup. + * For OTEL mode, creates the discoverable ring buffer. + * + * @param args Profiler arguments containing _context_storage mode + * @return true if initialization succeeded + */ + static bool initialize(const Arguments& args); + + /** + * Shutdown context storage. + * + * Releases resources allocated during initialization. + * For OTEL mode, unmaps the ring buffer. + */ + static void shutdown(); + + /** + * Check if context storage is initialized. + * + * @return true if initialized + */ + static bool isInitialized(); + + /** + * Get the current storage mode. + * + * @return The active context storage mode + */ + static ContextStorageMode getMode(); + + /** + * Write context for the current thread. + * + * This is the primary method for setting trace context from the tracer. + * Maps Datadog's (spanId, rootSpanId) to OTEL's (trace_id_high, trace_id_low, span_id). + * + * In OTEL mode: trace_id_high=0, trace_id_low=rootSpanId, span_id=spanId + * + * @param span_id The span ID + * @param root_span_id The root span ID (trace ID low bits for OTEL) + */ + static void set(u64 span_id, u64 root_span_id); + + /** + * Write full OTEL context for the current thread. + * + * Supports full 128-bit trace IDs when in OTEL mode. + * In profiler mode, trace_id_high is ignored. + * + * @param trace_id_high Upper 64 bits of 128-bit trace ID (OTEL only) + * @param trace_id_low Lower 64 bits of 128-bit trace ID (rootSpanId) + * @param span_id The span ID + */ + static void setOtel(u64 trace_id_high, u64 trace_id_low, u64 span_id); + + /** + * Read context for the current thread. + * + * Used by signal handlers to get the current trace context. + * Returns false if the context is invalid (torn read or uninitialized). + * + * @param span_id Output: the span ID + * @param root_span_id Output: the root span ID + * @return true if context was successfully read + */ + static bool get(u64& span_id, u64& root_span_id); + + /** + * Read context for a specific thread by TID. + * + * Used by JVMTI wall-clock sampling where the sampling thread + * needs to read another thread's context. + * + * @param tid Thread ID to read context for + * @param span_id Output: the span ID + * @param root_span_id Output: the root span ID + * @return true if context was successfully read + */ + static bool getByTid(int tid, u64& span_id, u64& root_span_id); + + /** + * Clear context for the current thread. + */ + static void clear(); + +private: + static ContextStorageMode _mode; + static bool _initialized; +}; + +#endif /* _CONTEXT_API_H */ diff --git a/ddprof-lib/src/main/cpp/flightRecorder.cpp b/ddprof-lib/src/main/cpp/flightRecorder.cpp index 024472ecd..a49d34cbf 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.cpp +++ b/ddprof-lib/src/main/cpp/flightRecorder.cpp @@ -10,6 +10,7 @@ #include "buffers.h" #include "callTraceHashTable.h" #include "context.h" +#include "context_api.h" #include "counters.h" #include "dictionary.h" #include "flightRecorder.h" @@ -1450,6 +1451,21 @@ void Recording::writeContext(Buffer *buf, Context &context) { } } +void Recording::writeCurrentContext(Buffer *buf) { + u64 spanId = 0; + u64 rootSpanId = 0; + ContextApi::get(spanId, rootSpanId); + buf->putVar64(spanId); + buf->putVar64(rootSpanId); + + // Tags still come from TLS Context (even in OTEL mode, for compatibility) + Context &context = Contexts::get(); + for (size_t i = 0; i < Profiler::instance()->numContextAttributes(); i++) { + Tag tag = context.get_tag(i); + buf->putVar32(tag.value); + } +} + void Recording::writeEventSizePrefix(Buffer *buf, int start) { int size = buf->offset() - start; assert(size < MAX_JFR_EVENT_SIZE); @@ -1466,7 +1482,7 @@ void Recording::recordExecutionSample(Buffer *buf, int tid, u64 call_trace_id, buf->put8(static_cast(event->_thread_state)); buf->put8(static_cast(event->_execution_mode)); buf->putVar64(event->_weight); - writeContext(buf, Contexts::get()); + writeCurrentContext(buf); writeEventSizePrefix(buf, start); flushIfNeeded(buf); } @@ -1481,7 +1497,7 @@ void Recording::recordMethodSample(Buffer *buf, int tid, u64 call_trace_id, buf->put8(static_cast(event->_thread_state)); buf->put8(static_cast(event->_execution_mode)); buf->putVar64(event->_weight); - writeContext(buf, Contexts::get()); + writeCurrentContext(buf); writeEventSizePrefix(buf, start); flushIfNeeded(buf); } @@ -1526,7 +1542,7 @@ void Recording::recordQueueTime(Buffer *buf, int tid, QueueTimeEvent *event) { buf->putVar64(event->_scheduler); buf->putVar64(event->_queueType); buf->putVar64(event->_queueLength); - writeContext(buf, Contexts::get()); + writeCurrentContext(buf); writeEventSizePrefix(buf, start); flushIfNeeded(buf); } @@ -1541,7 +1557,7 @@ void Recording::recordAllocation(RecordingBuffer *buf, int tid, buf->putVar64(event->_id); buf->putVar64(event->_size); buf->putFloat(event->_weight); - writeContext(buf, Contexts::get()); + writeCurrentContext(buf); writeEventSizePrefix(buf, start); flushIfNeeded(buf); } @@ -1579,7 +1595,7 @@ void Recording::recordMonitorBlocked(Buffer *buf, int tid, u64 call_trace_id, buf->putVar64(event->_id); buf->put8(0); buf->putVar64(event->_address); - writeContext(buf, Contexts::get()); + writeCurrentContext(buf); writeEventSizePrefix(buf, start); flushIfNeeded(buf); } diff --git a/ddprof-lib/src/main/cpp/flightRecorder.h b/ddprof-lib/src/main/cpp/flightRecorder.h index 7691e7e53..8ec0c38bb 100644 --- a/ddprof-lib/src/main/cpp/flightRecorder.h +++ b/ddprof-lib/src/main/cpp/flightRecorder.h @@ -240,6 +240,7 @@ class Recording { void writeUnwindFailures(Buffer *buf); void writeContext(Buffer *buf, Context &context); + void writeCurrentContext(Buffer *buf); void recordExecutionSample(Buffer *buf, int tid, u64 call_trace_id, ExecutionEvent *event); diff --git a/ddprof-lib/src/main/cpp/javaApi.cpp b/ddprof-lib/src/main/cpp/javaApi.cpp index 00e19f9f9..3d89e5c84 100644 --- a/ddprof-lib/src/main/cpp/javaApi.cpp +++ b/ddprof-lib/src/main/cpp/javaApi.cpp @@ -18,6 +18,7 @@ #include "arch.h" #include "context.h" +#include "context_api.h" #include "counters.h" #include "common.h" #include "engine.h" @@ -34,6 +35,7 @@ #include #include #include +#include static void throwNew(JNIEnv *env, const char *exception_class, const char *message) { @@ -460,6 +462,21 @@ Java_com_datadoghq_profiler_OTelContext_setProcessCtx0(JNIEnv *env, JniString version_str(env, version); JniString tracer_version_str(env, tracer_version); + // Default TLS configuration for profiler context slots (0-9) + // New format: key names in index order (position = key index) + static const char* default_key_map[] = { + "tag.0", "tag.1", "tag.2", "tag.3", "tag.4", + "tag.5", "tag.6", "tag.7", "tag.8", "tag.9", + NULL + }; + + // Default TLS config: schema version "tlsdesc_v1_dev", max 512 bytes per record + static otel_tls_config default_tls_config = { + .schema_version = const_cast("tlsdesc_v1_dev"), + .max_record_size = 512, + .attribute_key_map = const_cast(default_key_map) + }; + otel_process_ctx_data data = { .deployment_environment_name = const_cast(env_str.c_str()), .host_name = const_cast(hostname_str.c_str()), @@ -469,12 +486,82 @@ Java_com_datadoghq_profiler_OTelContext_setProcessCtx0(JNIEnv *env, .telemetry_sdk_language = const_cast("java"), .telemetry_sdk_version = const_cast(tracer_version_str.c_str()), .telemetry_sdk_name = const_cast("dd-trace-java"), - .resources = NULL // TODO: Arbitrary tags not supported yet for Java + .resources = NULL, // TODO: Arbitrary tags not supported yet for Java + .tls_config = &default_tls_config }; otel_process_ctx_result result = otel_process_ctx_publish(&data); } +extern "C" DLLEXPORT void JNICALL +Java_com_datadoghq_profiler_OTelContext_setProcessCtxWithTls0(JNIEnv *env, + jclass unused, + jstring env_data, + jstring hostname, + jstring runtime_id, + jstring service, + jstring version, + jstring tracer_version, + jstring schema_version, + jint max_record_size, + jobjectArray attribute_key_map + ) { + JniString env_str(env, env_data); + JniString hostname_str(env, hostname); + JniString runtime_id_str(env, runtime_id); + JniString service_str(env, service); + JniString version_str(env, version); + JniString tracer_version_str(env, tracer_version); + JniString schema_version_str(env, schema_version); + + // Convert Java String[] to char** for attribute_key_map + char** key_map = NULL; + jsize key_map_len = 0; + std::vector key_map_strs; // Keep JniString objects alive + + if (attribute_key_map != NULL) { + key_map_len = env->GetArrayLength(attribute_key_map); + key_map = (char**)alloca((key_map_len + 1) * sizeof(char*)); + for (jsize i = 0; i < key_map_len; i++) { + jstring str = (jstring)env->GetObjectArrayElement(attribute_key_map, i); + if (str != NULL) { + JniString* js = new JniString(env, str); + key_map_strs.push_back(js); + key_map[i] = const_cast(js->c_str()); + } else { + key_map[i] = NULL; + } + } + key_map[key_map_len] = NULL; // NULL-terminate + } + + otel_tls_config tls_config = { + .schema_version = const_cast(schema_version_str.c_str()), + .max_record_size = max_record_size, + .attribute_key_map = key_map + }; + + otel_process_ctx_data data = { + .deployment_environment_name = const_cast(env_str.c_str()), + .host_name = const_cast(hostname_str.c_str()), + .service_instance_id = const_cast(runtime_id_str.c_str()), + .service_name = const_cast(service_str.c_str()), + .service_version = const_cast(version_str.c_str()), + .telemetry_sdk_language = const_cast("java"), + .telemetry_sdk_version = const_cast(tracer_version_str.c_str()), + .telemetry_sdk_name = const_cast("dd-trace-java"), + .resources = NULL, + .tls_config = &tls_config + }; + + otel_process_ctx_result result = otel_process_ctx_publish(&data); + + // Clean up JniString objects + for (JniString* js : key_map_strs) { + delete js; + } +} + extern "C" DLLEXPORT jobject JNICALL Java_com_datadoghq_profiler_OTelContext_readProcessCtx0(JNIEnv *env, jclass unused) { #ifndef OTEL_PROCESS_CTX_NO_READ @@ -549,13 +636,15 @@ Java_com_datadoghq_profiler_JavaProfiler_initializeContextTls0(JNIEnv* env, jcla extern "C" DLLEXPORT jlong JNICALL Java_com_datadoghq_profiler_ThreadContext_setContext0(JNIEnv* env, jclass unused, jlong spanId, jlong rootSpanId) { - Context& ctx = Contexts::get(); + // Use ContextApi for mode-agnostic context setting (handles TLS or OTEL storage) + ContextApi::set(spanId, rootSpanId); - ctx.spanId = spanId; - ctx.rootSpanId = rootSpanId; - ctx.checksum = Contexts::checksum(spanId, rootSpanId); - - return ctx.checksum; + // Return checksum for API compatibility + // In OTEL mode, return 0 as checksum is not used (OTEL uses in_use flag instead) + if (ContextApi::getMode() == CTX_STORAGE_OTEL) { + return 0; + } + return Contexts::checksum(spanId, rootSpanId); } extern "C" DLLEXPORT void JNICALL @@ -564,6 +653,35 @@ Java_com_datadoghq_profiler_ThreadContext_setContextSlot0(JNIEnv* env, jclass un ctx.tags[offset].value = (u32)value; } +extern "C" DLLEXPORT jboolean JNICALL +Java_com_datadoghq_profiler_ThreadContext_isOtelMode0(JNIEnv* env, jclass unused) { + return ContextApi::isInitialized() && ContextApi::getMode() == CTX_STORAGE_OTEL; +} + +extern "C" DLLEXPORT jlongArray JNICALL +Java_com_datadoghq_profiler_ThreadContext_getContext0(JNIEnv* env, jclass unused) { + u64 spanId = 0; + u64 rootSpanId = 0; + + // Read context via ContextApi (handles both OTEL and TLS modes) + // If read fails (torn read or write in progress), return zeros + if (!ContextApi::get(spanId, rootSpanId)) { + spanId = 0; + rootSpanId = 0; + } + + // Create result array [spanId, rootSpanId] + jlongArray result = env->NewLongArray(2); + if (result == nullptr) { + return nullptr; + } + + jlong values[2] = {(jlong)spanId, (jlong)rootSpanId}; + env->SetLongArrayRegion(result, 0, 2, values); + + return result; +} + // ---- test and debug utilities extern "C" DLLEXPORT void JNICALL Java_com_datadoghq_profiler_JavaProfiler_testlog(JNIEnv* env, jclass unused, jstring msg) { diff --git a/ddprof-lib/src/main/cpp/otel_context.cpp b/ddprof-lib/src/main/cpp/otel_context.cpp new file mode 100644 index 000000000..da35ba0de --- /dev/null +++ b/ddprof-lib/src/main/cpp/otel_context.cpp @@ -0,0 +1,267 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "otel_context.h" +#include "os.h" +#include "common.h" // For TEST_LOG + +#include +#include + +#ifdef __linux__ +#include +#ifndef PR_SET_VMA +#define PR_SET_VMA 0x53564d41 +#endif +#ifndef PR_SET_VMA_ANON_NAME +#define PR_SET_VMA_ANON_NAME 0 +#endif +#endif + +// Static member initialization +OtelContextHeader* OtelContexts::_buffer = nullptr; +size_t OtelContexts::_buffer_size = 0; +size_t OtelContexts::_capacity = 0; + +// V2 context record storage and pointer for external profiler discovery. +// Since OtelContextV2Record has a flexible array member, we allocate a fixed-size +// buffer that can hold the header plus attributes data. +// Thread-local buffer for per-thread V2 records (header + attrs_data space). +static thread_local alignas(4) u8 otel_context_v2_buffer[V2_DEFAULT_MAX_RECORD_SIZE] = {}; +static thread_local OtelContextV2Record* otel_context_v2_record = + reinterpret_cast(otel_context_v2_buffer); + +// External profiler discovery symbol - points to the active V2 record or nullptr. +DLLEXPORT thread_local OtelContextV2Record* custom_labels_current_set_v2 = nullptr; + +/** + * Helper to write a 64-bit value as big-endian bytes. + */ +static inline void write_be64(u8* dest, u64 value) { + dest[0] = (value >> 56) & 0xFF; + dest[1] = (value >> 48) & 0xFF; + dest[2] = (value >> 40) & 0xFF; + dest[3] = (value >> 32) & 0xFF; + dest[4] = (value >> 24) & 0xFF; + dest[5] = (value >> 16) & 0xFF; + dest[6] = (value >> 8) & 0xFF; + dest[7] = value & 0xFF; +} + +/** + * Helper to read a 64-bit value from big-endian bytes. + */ +static inline u64 read_be64(const u8* src) { + return ((u64)src[0] << 56) | ((u64)src[1] << 48) | + ((u64)src[2] << 40) | ((u64)src[3] << 32) | + ((u64)src[4] << 24) | ((u64)src[5] << 16) | + ((u64)src[6] << 8) | (u64)src[7]; +} + +/** + * Updates the V2 context record when context changes. + * Called internally when OtelContexts::set() is invoked. + * + * Record layout (tlsdesc_v1_dev schema): + * trace_id[16] - bytes 0-15: 128-bit trace ID (network order / big-endian) + * span_id[8] - bytes 16-23: 64-bit span ID (network order / big-endian) + * valid[1] - byte 24: non-zero if record is valid + * _padding[1] - byte 25: padding for alignment + * attrs_data_size[2] - bytes 26-27: size of attrs_data (little-endian u16) + * attrs_data[] - bytes 28+: [key_index:1][length:1][value:length]... + */ +static void updateV2Record(u64 trace_id_high, u64 trace_id_low, u64 span_id) { + // Clear valid flag first (atomic visibility) + otel_context_v2_record->valid = 0; + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + // Write trace_id (16 bytes, big-endian: high part first, then low part) + write_be64(otel_context_v2_record->trace_id, trace_id_high); + write_be64(otel_context_v2_record->trace_id + 8, trace_id_low); + + // Write span_id (8 bytes, big-endian) + write_be64(otel_context_v2_record->span_id, span_id); + + // No attributes for now + otel_context_v2_record->_padding = 0; + otel_context_v2_record->attrs_data_size = 0; + + // Memory fence before setting valid + __atomic_thread_fence(__ATOMIC_SEQ_CST); + + // Set valid flag and pointer + otel_context_v2_record->valid = 1; + custom_labels_current_set_v2 = otel_context_v2_record; + + TEST_LOG("updateV2Record: tid=%d ptr=%p trace_high=0x%llx trace_low=0x%llx span=0x%llx", + OS::threadId(), (void*)custom_labels_current_set_v2, + (unsigned long long)trace_id_high, (unsigned long long)trace_id_low, + (unsigned long long)span_id); +} + +/** + * Clears the V2 context record. + */ +static void clearV2Record() { + otel_context_v2_record->valid = 0; + custom_labels_current_set_v2 = nullptr; + TEST_LOG("clearV2Record: tid=%d cleared context", OS::threadId()); +} + +bool OtelContexts::initialize(size_t capacity) { + if (_buffer != nullptr) { + // Already initialized + return true; + } + + // Calculate buffer size: header + slots array + size_t slots_offset = sizeof(OtelContextHeader); + // Align slots to slot size for proper alignment + slots_offset = (slots_offset + sizeof(OtelContextSlot) - 1) & ~(sizeof(OtelContextSlot) - 1); + size_t total_size = slots_offset + capacity * sizeof(OtelContextSlot); + + // Create anonymous mmap + void* ptr = mmap(nullptr, total_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (ptr == MAP_FAILED) { + return false; + } + + // Zero-initialize the buffer + memset(ptr, 0, total_size); + + // Initialize header + OtelContextHeader* header = static_cast(ptr); + header->magic = OTEL_CONTEXT_MAGIC; + header->version = OTEL_CONTEXT_VERSION; + header->capacity = static_cast(capacity); + header->slot_size = static_cast(sizeof(OtelContextSlot)); + +#ifdef __linux__ + // Name the region for discovery via /proc//maps + // This creates an entry like: [anon:DD_OTEL_CTX] + // Note: PR_SET_VMA_ANON_NAME requires kernel 5.17+ + // Failure is not fatal - discovery will still work via magic number scanning + prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, total_size, OTEL_CONTEXT_MMAP_NAME); +#endif + + _buffer = header; + _buffer_size = total_size; + _capacity = capacity; + + return true; +} + +void OtelContexts::shutdown() { + if (_buffer == nullptr) { + return; + } + + munmap(_buffer, _buffer_size); + _buffer = nullptr; + _buffer_size = 0; + _capacity = 0; +} + +bool OtelContexts::isInitialized() { + return _buffer != nullptr; +} + +OtelContextSlot* OtelContexts::getSlot(int tid) { + if (_buffer == nullptr || _capacity == 0) { + return nullptr; + } + + // Calculate slot index using modulo + // Note: TIDs that differ by multiples of _capacity will share the same slot. + // With default capacity of 65536, this is acceptable for most workloads. + // For extremely high TID values or long-running systems with TID recycling, + // consider increasing capacity or implementing a TID-to-slot hash table. + size_t index = static_cast(tid) % _capacity; + + // Calculate slot address (slots start after header, properly aligned) + size_t slots_offset = sizeof(OtelContextHeader); + slots_offset = (slots_offset + sizeof(OtelContextSlot) - 1) & ~(sizeof(OtelContextSlot) - 1); + + char* slots_base = reinterpret_cast(_buffer) + slots_offset; + return reinterpret_cast(slots_base) + index; +} + +void OtelContexts::set(u64 trace_id_high, u64 trace_id_low, u64 span_id) { + // V2 TLS record is the primary storage - external profilers read from here + // via the custom_labels_current_set_v2 symbol + int tid = OS::threadId(); + TEST_LOG("OtelContexts::set: tid=%d trace_high=0x%llx trace_low=0x%llx span=0x%llx", + tid, (unsigned long long)trace_id_high, (unsigned long long)trace_id_low, + (unsigned long long)span_id); + updateV2Record(trace_id_high, trace_id_low, span_id); +} + +bool OtelContexts::get(u64& trace_id_high, u64& trace_id_low, u64& span_id) { + // Read from V2 TLS record (primary storage) + // This is a facade that presents the V2 record in the same API as before + + // Check if context is valid using acquire fence to synchronize with set() + __atomic_thread_fence(__ATOMIC_ACQUIRE); + + if (custom_labels_current_set_v2 == nullptr || !otel_context_v2_record->valid) { + TEST_LOG("OtelContexts::get() failed: ptr=%p valid=%d", + (void*)custom_labels_current_set_v2, (int)otel_context_v2_record->valid); + return false; + } + + // Read fields from V2 record (big-endian to native) + trace_id_high = read_be64(otel_context_v2_record->trace_id); + trace_id_low = read_be64(otel_context_v2_record->trace_id + 8); + span_id = read_be64(otel_context_v2_record->span_id); + + TEST_LOG("OtelContexts::get() returning trace_high=0x%llx, trace_low=0x%llx, span=0x%llx", + (unsigned long long)trace_id_high, (unsigned long long)trace_id_low, + (unsigned long long)span_id); + + // Double-check validity after read + __atomic_thread_fence(__ATOMIC_ACQUIRE); + if (!otel_context_v2_record->valid) { + return false; + } + + return true; +} + +bool OtelContexts::getByTid(int tid, u64& trace_id_high, u64& trace_id_low, u64& span_id) { + // V2 TLS records are per-thread and cannot be read cross-thread from within + // the process. External profilers use ptrace/process_vm_readv to read them. + // If cross-thread reads are needed internally, use PROFILER mode with ContextApi. + (void)tid; + trace_id_high = 0; + trace_id_low = 0; + span_id = 0; + return false; +} + +void OtelContexts::clear() { + // Clear the V2 record properly - set pointer to NULL and valid to 0 + // This matches the reference implementation behavior + clearV2Record(); +} + +OtelContextHeader* OtelContexts::getBuffer() { + return _buffer; +} + +size_t OtelContexts::getBufferSize() { + return _buffer_size; +} diff --git a/ddprof-lib/src/main/cpp/otel_context.h b/ddprof-lib/src/main/cpp/otel_context.h new file mode 100644 index 000000000..68a705714 --- /dev/null +++ b/ddprof-lib/src/main/cpp/otel_context.h @@ -0,0 +1,240 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef _OTEL_CONTEXT_H +#define _OTEL_CONTEXT_H + +#include "arch.h" +#include "vmEntry.h" // For DLLEXPORT +#include + +/** + * OTEL-compatible thread context storage. + * + * This module implements thread-level context storage that is discoverable + * by external profilers following the OTEL profiling context proposal. + * + * Discovery mechanism: + * - Linux: The mmap region is named via prctl(PR_SET_VMA_ANON_NAME) and + * can be discovered by scanning /proc//maps for [anon:DD_OTEL_CTX] + * + * Storage layout: + * - Header with magic number, version, capacity, and slot size + * - Array of slots indexed by TID % capacity + * + * Torn-read protection: + * - Uses in_use flag (0 = valid, 1 = writing) with memory barriers + * - Reader must check in_use before and after reading fields + */ + +// Name used for mmap discovery via /proc//maps +#define OTEL_CONTEXT_MMAP_NAME "DD_OTEL_CTX" + +// Magic number for buffer validation (ASCII "OTEL") +static const u32 OTEL_CONTEXT_MAGIC = 0x4F54454C; + +// Protocol version +static const u32 OTEL_CONTEXT_VERSION = 1; + +// Default capacity (number of thread slots) +static const size_t OTEL_CONTEXT_DEFAULT_CAPACITY = 65536; + +/** + * V2 TLS record format for custom-labels compatibility (OTEL profiling context). + * + * This record format is compatible with the custom-labels v2 specification + * (tlsdesc_v1_dev schema), allowing external profilers (like ddprof) to read + * thread context using the same code path as the Rust custom-labels library. + * + * Layout (28-byte header + variable-length attrs_data): + * trace_id[16] - 128-bit trace ID (bytes, network order) + * span_id[8] - 64-bit span ID (bytes, network order) + * valid[1] - Non-zero if record contains valid data + * _padding[1] - Padding for alignment + * attrs_data_size[2] - Size of attrs_data in bytes (little-endian u16) + * attrs_data[] - Attribute data: [key_index:1][length:1][value:length]... + * + * Total header size: 28 bytes (V2_HEADER_SIZE) + */ +static const size_t V2_HEADER_SIZE = 28; + +#pragma pack(push, 1) +struct OtelContextV2Record { + u8 trace_id[16]; // 128-bit trace ID (bytes, network order) + u8 span_id[8]; // 64-bit span ID (bytes, network order) + u8 valid; // Non-zero if valid (byte 24) + u8 _padding; // Padding for alignment (byte 25) + u16 attrs_data_size; // Size of attrs_data in bytes (bytes 26-27, little-endian) + u8 attrs_data[]; // Flexible array: [key_index:1][length:1][value:length]... +}; +#pragma pack(pop) + +// Maximum record size for allocation (configurable via process context) +static const size_t V2_DEFAULT_MAX_RECORD_SIZE = 512; + +/** + * V2 context record exported for external profiler discovery. + * + * External profilers search for symbol "custom_labels_current_set_v2" to find + * the thread-local context record. This pointer points to a OtelContextV2Record + * when context is active, or is NULL when no context is set. + * + * This symbol is only active when OTEL storage mode is enabled. + */ +DLLEXPORT extern thread_local OtelContextV2Record* custom_labels_current_set_v2; + +/** + * Per-thread context slot in the OTEL ring buffer. + * + * Layout follows OTEL proposal with 128-bit trace ID split into two 64-bit words + * for atomic access. Aligned to 32 bytes to minimize cache line contention. + */ +struct alignas(32) OtelContextSlot { + volatile u64 trace_id_high; // Upper 64 bits of 128-bit trace ID + volatile u64 trace_id_low; // Lower 64 bits of 128-bit trace ID + volatile u64 span_id; // 64-bit span ID + volatile u8 in_use; // 0 = valid, 1 = writing (torn-read protection) + u8 _padding[7]; // Align to 32 bytes +}; + +/** + * OTEL context buffer header. + * + * This header is placed at the start of the mmap region and allows + * external readers to validate and parse the buffer. + */ +struct OtelContextHeader { + u32 magic; // Must be OTEL_CONTEXT_MAGIC (0x4F54454C) + u32 version; // Protocol version (currently 1) + u32 capacity; // Number of slots in the buffer + u32 slot_size; // Size of each slot (sizeof(OtelContextSlot)) + // Slot array follows immediately after header +}; + +/** + * OTEL context storage manager. + * + * Provides thread-safe context storage that can be discovered and read + * by external profilers. Uses a ring buffer indexed by TID % capacity. + * + * Thread safety: + * - set() uses in_use flag with memory barriers for torn-read protection + * - get() and getByTid() return false if a write is in progress + */ +class OtelContexts { +public: + /** + * Initialize the OTEL context buffer. + * + * Creates an anonymous mmap region and names it for discovery. + * Should be called once during profiler startup when OTEL mode is enabled. + * + * @param capacity Number of thread slots (default: 65536) + * @return true if initialization succeeded, false otherwise + */ + static bool initialize(size_t capacity = OTEL_CONTEXT_DEFAULT_CAPACITY); + + /** + * Shutdown and release the OTEL context buffer. + * + * Unmaps the memory region. Should be called during profiler shutdown. + */ + static void shutdown(); + + /** + * Check if OTEL context storage is initialized. + * + * @return true if initialized, false otherwise + */ + static bool isInitialized(); + + /** + * Write context for the current thread. + * + * Uses the calling thread's TID to determine the slot. + * Thread-safe: uses in_use flag with memory barriers. + * + * @param trace_id_high Upper 64 bits of 128-bit trace ID + * @param trace_id_low Lower 64 bits of 128-bit trace ID (rootSpanId for Datadog) + * @param span_id 64-bit span ID + */ + static void set(u64 trace_id_high, u64 trace_id_low, u64 span_id); + + /** + * Read context for the current thread. + * + * Uses the calling thread's TID to determine the slot. + * Returns false if a write is in progress (torn read would occur). + * + * @param trace_id_high Output: upper 64 bits of trace ID + * @param trace_id_low Output: lower 64 bits of trace ID + * @param span_id Output: span ID + * @return true if read succeeded, false if write in progress + */ + static bool get(u64& trace_id_high, u64& trace_id_low, u64& span_id); + + /** + * Read context for a specific thread by TID. + * + * Used by wall-clock JVMTI sampling and external profilers. + * Returns false if a write is in progress (torn read would occur). + * + * @param tid Thread ID to read context for + * @param trace_id_high Output: upper 64 bits of trace ID + * @param trace_id_low Output: lower 64 bits of trace ID + * @param span_id Output: span ID + * @return true if read succeeded, false if write in progress + */ + static bool getByTid(int tid, u64& trace_id_high, u64& trace_id_low, u64& span_id); + + /** + * Clear context for the current thread. + * + * Sets all context fields to zero. + */ + static void clear(); + + /** + * Get the base address of the OTEL context buffer. + * + * Used for testing and external access. + * + * @return Pointer to the buffer header, or nullptr if not initialized + */ + static OtelContextHeader* getBuffer(); + + /** + * Get the size of the OTEL context buffer in bytes. + * + * @return Buffer size, or 0 if not initialized + */ + static size_t getBufferSize(); + +private: + static OtelContextHeader* _buffer; + static size_t _buffer_size; + static size_t _capacity; + + /** + * Get the slot pointer for a given TID. + * + * @param tid Thread ID + * @return Pointer to the slot, or nullptr if buffer not initialized + */ + static OtelContextSlot* getSlot(int tid); +}; + +#endif /* _OTEL_CONTEXT_H */ diff --git a/ddprof-lib/src/main/cpp/otel_process_ctx.cpp b/ddprof-lib/src/main/cpp/otel_process_ctx.cpp index c7ce0c4ce..2190f7f80 100644 --- a/ddprof-lib/src/main/cpp/otel_process_ctx.cpp +++ b/ddprof-lib/src/main/cpp/otel_process_ctx.cpp @@ -2,6 +2,7 @@ // This product includes software developed at Datadog (https://www.datadoghq.com/) Copyright 2025 Datadog, Inc. #include "otel_process_ctx.h" +#include "common.h" #ifndef _GNU_SOURCE #define _GNU_SOURCE @@ -37,7 +38,8 @@ static const otel_process_ctx_data empty_data = { .telemetry_sdk_language = NULL, .telemetry_sdk_version = NULL, .telemetry_sdk_name = NULL, - .resources = NULL + .resources = NULL, + .tls_config = NULL }; #if (defined(OTEL_PROCESS_CTX_NOOP) && OTEL_PROCESS_CTX_NOOP) || !defined(__linux__) @@ -66,19 +68,37 @@ static const otel_process_ctx_data empty_data = { #include #include +#include +#include +#include + +// memfd_create may not be available in older glibc, use syscall wrapper +#ifndef MFD_CLOEXEC + #define MFD_CLOEXEC 0x0001U +#endif + +static int otel_memfd_create(const char *name, unsigned int flags) { + return (int)syscall(__NR_memfd_create, name, flags); +} /** - * The process context data that's written into the published anonymous mapping. + * The process context data that's written into the published memory mapping. * * An outside-of-process reader will read this struct + otel_process_payload to get the data. + * This structure follows the OpenTelemetry Process Context v2 specification. + * + * Header layout (v2): + * - signature[8]: "OTEL_CTX" + * - version: uint32 = 2 + * - payload_size: uint32 (size of protobuf payload) + * - published_at_ns: uint64 (timestamp in nanoseconds since epoch, 0 = update in progress) + * - payload: pointer to protobuf-encoded Resource message */ typedef struct __attribute__((packed, aligned(8))) { - char otel_process_ctx_signature[8]; // Always "OTEL_CTX" - // TODO: Is version useful? Should we just get rid of it? - uint32_t otel_process_ctx_version; // Always > 0, incremented when the data structure changes - // TODO: Is size useful? Should we just get rid of it? - uint32_t otel_process_payload_size; // Always > 0, size of storage - // TODO: Should we just inline the data in the mapping itself? - char *otel_process_payload; // Always non-null, points to the storage for the data; expected to be a msgpack map of string key/value pairs, null-terminated + char otel_process_ctx_signature[8]; // Always "OTEL_CTX" + uint32_t otel_process_ctx_version; // Protocol version (currently 2) + uint32_t otel_process_payload_size; // Size of protobuf payload in bytes + uint64_t published_at_ns; // Timestamp in nanoseconds since epoch (0 = update in progress) + char *otel_process_payload; // Points to protobuf-encoded opentelemetry.proto.resource.v1.Resource } otel_process_ctx_mapping; /** @@ -94,8 +114,12 @@ typedef struct { // The actual mapping of the process context. Note that because we `madvise(..., MADV_DONTFORK)` this mapping is not // propagated to child processes and thus `mapping` is only valid on the process that published the context. otel_process_ctx_mapping *mapping; + // Size of the mapping in bytes + long mapping_size; // The process context payload. char *payload; + // Whether the mapping was created via memfd (true) or anonymous mmap (false) + bool is_memfd; } otel_process_ctx_state; /** @@ -112,42 +136,105 @@ static long size_for_mapping(void) { if (page_size_bytes < 4096) { return -1; } - return page_size_bytes * 2; + // Per PR #34: Use 1 page instead of 2 + return page_size_bytes; } -// The process context is designed to be read by an outside-of-process reader. Thus, for concurrency purposes the steps -// on this method are ordered in a way to avoid races, or if not possible to avoid, to allow the reader to detect if there was a race. +// Get current timestamp in nanoseconds since epoch +static uint64_t get_timestamp_ns(void) { + struct timespec ts; + if (clock_gettime(CLOCK_REALTIME, &ts) != 0) { + return 0; + } + return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; +} + +/** + * The process context is designed to be read by an outside-of-process reader. Thus, for concurrency purposes the steps + * on this method are ordered in a way to avoid races, or if not possible to avoid, to allow the reader to detect if there was a race. + * + * This implements the OpenTelemetry Process Context v2 publication protocol: + * 1. Try memfd_create first, fall back to anonymous mmap + * 2. Apply MADV_DONTFORK to prevent fork inheritance + * 3. Populate header fields (version, payload_size, payload pointer) + * 4. Issue memory barrier + * 5. Write signature last to ensure readers observe complete data + * 6. Set published_at_ns timestamp to signal data is ready + * 7. Name the mapping via prctl for discovery + */ otel_process_ctx_result otel_process_ctx_publish(const otel_process_ctx_data *data) { - // Step: Drop any previous context it if it exists + TEST_LOG("otel_process_ctx_publish: Starting publication, pid=%d", getpid()); + + // Step: Drop any previous context if it exists // No state should be around anywhere after this step. if (!otel_process_ctx_drop_current()) { + TEST_LOG("otel_process_ctx_publish: Failed to drop previous context"); return (otel_process_ctx_result) {.success = false, .error_message = "Failed to drop previous context (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } // Step: Determine size for mapping long mapping_size = size_for_mapping(); if (mapping_size == -1) { + TEST_LOG("otel_process_ctx_publish: Failed to get page size"); return (otel_process_ctx_result) {.success = false, .error_message = "Failed to get page size (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } + TEST_LOG("otel_process_ctx_publish: Mapping size=%ld bytes", mapping_size); - // Step: Prepare the payload to be published + // Step: Prepare the payload to be published (protobuf-encoded Resource message) // The payload SHOULD be ready and valid before trying to actually create the mapping. - if (!data) return (otel_process_ctx_result) {.success = false, .error_message = "otel_process_ctx_data is NULL (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; + if (!data) { + TEST_LOG("otel_process_ctx_publish: data is NULL"); + return (otel_process_ctx_result) {.success = false, .error_message = "otel_process_ctx_data is NULL (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; + } uint32_t payload_size = 0; otel_process_ctx_result result = otel_process_ctx_encode_payload(&published_state.payload, &payload_size, *data); - if (!result.success) return result; + if (!result.success) { + TEST_LOG("otel_process_ctx_publish: Failed to encode payload: %s", result.error_message); + return result; + } + TEST_LOG("otel_process_ctx_publish: Encoded payload size=%u bytes", payload_size); // Step: Create the mapping - published_state.publisher_pid = getpid(); // This allows us to detect in forks that we shouldn't touch the mapping - published_state.mapping = (otel_process_ctx_mapping *) - mmap(NULL, mapping_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + // Per v2 spec, prefer memfd_create("OTEL_CTX", ...) with fallback to anonymous mmap + published_state.publisher_pid = getpid(); + published_state.mapping_size = mapping_size; + published_state.is_memfd = false; + + int memfd = otel_memfd_create("OTEL_CTX", MFD_CLOEXEC); + TEST_LOG("otel_process_ctx_publish: memfd_create result=%d", memfd); + if (memfd >= 0) { + // memfd_create succeeded - use shared mapping + if (ftruncate(memfd, mapping_size) == 0) { + published_state.mapping = (otel_process_ctx_mapping *) + mmap(NULL, mapping_size, PROT_READ | PROT_WRITE, MAP_SHARED, memfd, 0); + if (published_state.mapping != MAP_FAILED) { + published_state.is_memfd = true; + TEST_LOG("otel_process_ctx_publish: memfd mapping successful at %p", published_state.mapping); + } else { + TEST_LOG("otel_process_ctx_publish: memfd mmap failed"); + } + } else { + TEST_LOG("otel_process_ctx_publish: ftruncate failed"); + } + close(memfd); + } + + // Fallback to anonymous mapping if memfd failed + if (!published_state.is_memfd) { + TEST_LOG("otel_process_ctx_publish: Falling back to anonymous mmap"); + published_state.mapping = (otel_process_ctx_mapping *) + mmap(NULL, mapping_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + } + if (published_state.mapping == MAP_FAILED) { + TEST_LOG("otel_process_ctx_publish: Failed to allocate mapping"); otel_process_ctx_drop_current(); return (otel_process_ctx_result) {.success = false, .error_message = "Failed to allocate mapping (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } + TEST_LOG("otel_process_ctx_publish: Mapping created at %p (is_memfd=%d)", published_state.mapping, published_state.is_memfd); // Step: Setup MADV_DONTFORK - // This ensures that the mapping is not propagated to child processes (they should call update/publish again). + // This ensures that the mapping is not propagated to child processes (they should call publish again). if (madvise(published_state.mapping, mapping_size, MADV_DONTFORK) == -1) { if (otel_process_ctx_drop_current()) { return (otel_process_ctx_result) {.success = false, .error_message = "Failed to setup MADV_DONTFORK (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; @@ -156,49 +243,64 @@ otel_process_ctx_result otel_process_ctx_publish(const otel_process_ctx_data *da } } - // Step: Populate the mapping - // The payload and any extra fields must come first and not be reordered with the signature by the compiler. + // Step: Populate the mapping header (v2 format) + // Per v2 spec: signature is written LAST to ensure readers never observe incomplete data + // Initialize with signature zeroed, published_at_ns = 0 (signals "not ready yet") *published_state.mapping = (otel_process_ctx_mapping) { - .otel_process_ctx_signature = {0}, // Set in "Step: Populate the signature into the mapping" below - .otel_process_ctx_version = 1, + .otel_process_ctx_signature = {0}, // Set in final step below + .otel_process_ctx_version = 2, // v2 protocol .otel_process_payload_size = payload_size, + .published_at_ns = 0, // Will be set after signature .otel_process_payload = published_state.payload }; - // Step: Synchronization - Mapping has been filled and is missing signature - // Make sure the initialization of the mapping + payload above does not get reordered with setting the signature below. Setting - // the signature is what tells an outside reader that the context is fully published. + // Step: Memory barrier before signature + // Ensures all header fields are visible before signature is written atomic_thread_fence(memory_order_seq_cst); - // Step: Populate the signature into the mapping - // The signature must come last and not be reordered with the fields above by the compiler. After this step, external readers - // can read the signature and know that the payload is ready to be read. - memcpy(published_state.mapping->otel_process_ctx_signature, "OTEL_CTX", sizeof(published_state.mapping->otel_process_ctx_signature)); + // Step: Write signature LAST (per v2 spec requirement) + // After this, external readers can see the signature and know header is valid + memcpy(published_state.mapping->otel_process_ctx_signature, "OTEL_CTX", + sizeof(published_state.mapping->otel_process_ctx_signature)); - // Step: Change permissions on the mapping to only read permission - // We've observed the combination of anonymous mapping + a given number of pages + read-only permission is not very common, - // so this is left as a hint for when running on older kernels and the naming the mapping feature below isn't available. - // For modern kernels, doing this is harmless so we do it unconditionally. - if (mprotect(published_state.mapping, mapping_size, PROT_READ) == -1) { - if (otel_process_ctx_drop_current()) { - return (otel_process_ctx_result) {.success = false, .error_message = "Failed to change permissions on mapping (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; - } else { - return (otel_process_ctx_result) {.success = false, .error_message = "Failed to drop previous context (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; - } - } + // Step: Memory barrier after signature + atomic_thread_fence(memory_order_seq_cst); - // Step: Name the mapping so outside readers can: - // * Find it by name - // * Hook on prctl to detect when new mappings are published - if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, published_state.mapping, mapping_size, "OTEL_CTX") == -1) { - // Naming an anonymous mapping is a Linux 5.17+ feature. On earlier versions, this method call can fail. Thus it's OK - // for this to fail because: - // 1. Things that hook on prctl are still able to see this call, even though it's not supported (TODO: Confirm this is actually the case) - // 2. As a fallback, on older kernels, it's possible to scan the mappings and look for the "OTEL_CTX" signature in the memory itself, - // after observing the mapping has the expected number of pages and permissions. + // Step: Set published_at_ns to signal data is ready + // Per v2 spec: non-zero timestamp indicates active, valid data + uint64_t timestamp = get_timestamp_ns(); + if (timestamp == 0) timestamp = 1; // Ensure non-zero (0 = update in progress) + __atomic_store_n(&published_state.mapping->published_at_ns, timestamp, __ATOMIC_RELEASE); + TEST_LOG("otel_process_ctx_publish: Set published_at_ns=%llu", (unsigned long long)timestamp); + + // NOTE: Per PR #34 spec update - mapping remains writable (rw-p or rw-s) + // This allows for in-place updates and matches the reference implementation. + // The mprotect to PROT_READ has been removed as the reader now accepts rw permissions. + TEST_LOG("otel_process_ctx_publish: Mapping kept writable for in-place updates"); + + // Step: Name the mapping for discovery + // On memfd, the mapping appears as /memfd:OTEL_CTX in /proc/pid/maps + // On anonymous mmap with prctl naming, it appears as [anon:OTEL_CTX] + if (!published_state.is_memfd) { + // Only need prctl naming for anonymous mappings; memfd is already named + int prctl_result = prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, published_state.mapping, mapping_size, "OTEL_CTX"); + TEST_LOG("otel_process_ctx_publish: prctl naming result=%d (is_memfd=false)", prctl_result); + if (prctl_result == -1) { + // Naming is a Linux 5.17+ feature. Failure is acceptable: + // 1. External tools can still detect via memfd name or magic number scanning + // 2. prctl hooks can still observe the call attempt + TEST_LOG("otel_process_ctx_publish: prctl naming failed (expected on kernels < 5.17)"); + } + } else { + TEST_LOG("otel_process_ctx_publish: Skipped prctl naming (using memfd)"); } - // All done! + TEST_LOG("otel_process_ctx_publish: Successfully published context at %p, signature='%.8s', version=%u, payload_size=%u, timestamp=%llu", + published_state.mapping, + published_state.mapping->otel_process_ctx_signature, + published_state.mapping->otel_process_ctx_version, + published_state.mapping->otel_process_payload_size, + (unsigned long long)published_state.mapping->published_at_ns); return (otel_process_ctx_result) {.success = true, .error_message = NULL}; } @@ -206,68 +308,483 @@ otel_process_ctx_result otel_process_ctx_publish(const otel_process_ctx_data *da bool otel_process_ctx_drop_current(void) { otel_process_ctx_state state = published_state; + if (state.mapping == NULL || state.mapping == MAP_FAILED) { + TEST_LOG("otel_process_ctx_drop_current: No active mapping to drop"); + return true; + } + + TEST_LOG("otel_process_ctx_drop_current: Dropping mapping at %p (publisher_pid=%d, current_pid=%d)", + state.mapping, state.publisher_pid, getpid()); + // Zero out the state and make sure no operations below are reordered with zeroing - published_state = (otel_process_ctx_state) {.publisher_pid = 0, .mapping = NULL, .payload = NULL}; + published_state = (otel_process_ctx_state) {.publisher_pid = 0, .mapping = NULL, .mapping_size = 0, .payload = NULL, .is_memfd = false}; atomic_thread_fence(memory_order_seq_cst); // The mapping only exists if it was created by the current process; if it was inherited by a fork it doesn't exist anymore // (due to the MADV_DONTFORK) and we don't need to do anything to it. if (state.mapping != NULL && state.mapping != MAP_FAILED && getpid() == state.publisher_pid) { - long mapping_size = size_for_mapping(); - if (mapping_size == -1 || munmap(state.mapping, mapping_size) == -1) return false; + if (state.mapping_size <= 0 || munmap(state.mapping, state.mapping_size) == -1) { + TEST_LOG("otel_process_ctx_drop_current: Failed to munmap"); + return false; + } + TEST_LOG("otel_process_ctx_drop_current: Successfully unmapped"); } // The payload may have been inherited from a parent. This is a regular malloc so we need to free it so we don't leak. - if (state.payload) free(state.payload); + if (state.payload) { + TEST_LOG("otel_process_ctx_drop_current: Freeing payload"); + free(state.payload); + } return true; } -static otel_process_ctx_result validate_and_calculate_payload_size(size_t *out_pairs_size, size_t *out_num_pairs, char **pairs) { +// ============================================================================= +// Minimal Protobuf Encoder for OpenTelemetry Resource message (v2 spec) +// ============================================================================= +// +// Encodes opentelemetry.proto.resource.v1.Resource message containing KeyValue attributes. +// Wire format reference: https://protobuf.dev/programming-guides/encoding/ +// +// Message hierarchy: +// Resource { repeated KeyValue attributes = 1; } +// KeyValue { string key = 1; AnyValue value = 2; } +// AnyValue { oneof value { string string_value = 1; ... } } + +// Write a varint (variable-length integer) to buffer, return bytes written +static size_t pb_write_varint(uint8_t *buf, uint64_t value) { + size_t bytes = 0; + while (value > 0x7F) { + buf[bytes++] = (uint8_t)((value & 0x7F) | 0x80); + value >>= 7; + } + buf[bytes++] = (uint8_t)(value & 0x7F); + return bytes; +} + +// Calculate varint size without writing +static size_t pb_varint_size(uint64_t value) { + size_t bytes = 1; + while (value > 0x7F) { + bytes++; + value >>= 7; + } + return bytes; +} + +// Write a length-delimited string field: [tag][length][bytes] +// Returns bytes written +static size_t pb_write_string_field(uint8_t *buf, uint32_t field_num, const char *str, size_t len) { + size_t pos = 0; + // Tag: (field_num << 3) | wire_type, wire_type 2 = length-delimited + uint32_t tag = (field_num << 3) | 2; + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, len); + memcpy(buf + pos, str, len); + pos += len; + return pos; +} + +// Calculate size of a string field +static size_t pb_string_field_size(uint32_t field_num, size_t len) { + uint32_t tag = (field_num << 3) | 2; + return pb_varint_size(tag) + pb_varint_size(len) + len; +} + +// Calculate size of AnyValue message containing a string_value (field 1) +static size_t pb_anyvalue_string_size(size_t value_len) { + // AnyValue { string string_value = 1; } + return pb_string_field_size(1, value_len); +} + +// Write AnyValue message containing a string_value (field 1) +static size_t pb_write_anyvalue_string(uint8_t *buf, const char *value, size_t value_len) { + return pb_write_string_field(buf, 1, value, value_len); +} + +// Forward declarations for functions used before their definitions +static size_t pb_keyvalue_size(size_t key_len, size_t value_len); +static size_t pb_write_keyvalue(uint8_t *buf, const char *key, size_t key_len, const char *value, size_t value_len); + +// ============================================================================= +// Int64 Value Encoding (AnyValue.int_value = field 3, wire type 0) +// ============================================================================= + +// Calculate size of AnyValue message containing an int_value (field 3) +static size_t pb_anyvalue_int_size(int64_t value) { + // AnyValue { int64 int_value = 3; } + // tag (1 byte: field 3, wire type 0) + varint-encoded value + return 1 + pb_varint_size((uint64_t)value); +} + +// Write AnyValue message containing an int_value (field 3) +static size_t pb_write_anyvalue_int(uint8_t *buf, int64_t value) { + size_t pos = 0; + buf[pos++] = (3 << 3) | 0; // field 3, wire type 0 (varint) + pos += pb_write_varint(buf + pos, (uint64_t)value); + return pos; +} + +// Calculate size of KeyValue message with int64 value +static size_t pb_keyvalue_int_size(size_t key_len, int64_t value) { + // KeyValue { string key = 1; AnyValue value = 2; } + size_t key_field_size = pb_string_field_size(1, key_len); + size_t anyvalue_size = pb_anyvalue_int_size(value); + // value field (field 2) is embedded message (wire type 2) + size_t value_field_size = pb_varint_size((2 << 3) | 2) + pb_varint_size(anyvalue_size) + anyvalue_size; + return key_field_size + value_field_size; +} + +// Write KeyValue message with int64 value +static size_t pb_write_keyvalue_int(uint8_t *buf, const char *key, size_t key_len, int64_t value) { + size_t pos = 0; + + // Field 1: string key + pos += pb_write_string_field(buf + pos, 1, key, key_len); + + // Field 2: AnyValue value (embedded message) + size_t anyvalue_size = pb_anyvalue_int_size(value); + uint32_t tag = (2 << 3) | 2; // field 2, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, anyvalue_size); + pos += pb_write_anyvalue_int(buf + pos, value); + + return pos; +} + +// ============================================================================= +// KvList Value Encoding (AnyValue.kvlist_value = field 6, wire type 2) +// KeyValueList { repeated KeyValue values = 1; } +// ============================================================================= + +// Calculate size of KeyValueList message (just the content, no outer tag/length) +static size_t pb_keyvaluelist_content_size(const char **pairs, size_t num_pairs) { + size_t size = 0; + for (size_t i = 0; i < num_pairs; i++) { + size_t key_len = strlen(pairs[i * 2]); + size_t value_len = strlen(pairs[i * 2 + 1]); + // Each KeyValue is in field 1 of KeyValueList + size_t kv_size = pb_keyvalue_size(key_len, value_len); + // tag + length prefix + content + size += pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; + } + return size; +} + +// Write KeyValueList message content (just the repeated KeyValue entries) +static size_t pb_write_keyvaluelist_content(uint8_t *buf, const char **pairs, size_t num_pairs) { + size_t pos = 0; + for (size_t i = 0; i < num_pairs; i++) { + const char *key = pairs[i * 2]; + const char *value = pairs[i * 2 + 1]; + size_t key_len = strlen(key); + size_t value_len = strlen(value); + + // Write as embedded message: field 1 (values), wire type 2 + size_t kv_size = pb_keyvalue_size(key_len, value_len); + uint32_t tag = (1 << 3) | 2; + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue(buf + pos, key, key_len, value, value_len); + } + return pos; +} + +// Calculate size of AnyValue message containing a kvlist_value (field 6) +static size_t pb_anyvalue_kvlist_size(const char **pairs, size_t num_pairs) { + // AnyValue { KeyValueList kvlist_value = 6; } + size_t kvlist_size = pb_keyvaluelist_content_size(pairs, num_pairs); + // tag (1 byte: field 6, wire type 2) + length varint + content + return 1 + pb_varint_size(kvlist_size) + kvlist_size; +} + +// Write AnyValue message containing a kvlist_value (field 6) +static size_t pb_write_anyvalue_kvlist(uint8_t *buf, const char **pairs, size_t num_pairs) { + size_t pos = 0; + buf[pos++] = (6 << 3) | 2; // field 6, wire type 2 (length-delimited) + size_t kvlist_size = pb_keyvaluelist_content_size(pairs, num_pairs); + pos += pb_write_varint(buf + pos, kvlist_size); + pos += pb_write_keyvaluelist_content(buf + pos, pairs, num_pairs); + return pos; +} + +// Calculate size of KeyValue message with kvlist value +static size_t pb_keyvalue_kvlist_size(size_t key_len, const char **pairs, size_t num_pairs) { + // KeyValue { string key = 1; AnyValue value = 2; } + size_t key_field_size = pb_string_field_size(1, key_len); + size_t anyvalue_size = pb_anyvalue_kvlist_size(pairs, num_pairs); + // value field (field 2) is embedded message (wire type 2) + size_t value_field_size = pb_varint_size((2 << 3) | 2) + pb_varint_size(anyvalue_size) + anyvalue_size; + return key_field_size + value_field_size; +} + +// Write KeyValue message with kvlist value +static size_t pb_write_keyvalue_kvlist(uint8_t *buf, const char *key, size_t key_len, const char **pairs, size_t num_pairs) { + size_t pos = 0; + + // Field 1: string key + pos += pb_write_string_field(buf + pos, 1, key, key_len); + + // Field 2: AnyValue value (embedded message) + size_t anyvalue_size = pb_anyvalue_kvlist_size(pairs, num_pairs); + uint32_t tag = (2 << 3) | 2; // field 2, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, anyvalue_size); + pos += pb_write_anyvalue_kvlist(buf + pos, pairs, num_pairs); + + return pos; +} + +// ============================================================================= +// Array Value Encoding (AnyValue.array_value = field 5, wire type 2) +// ArrayValue { repeated AnyValue values = 1; } +// ============================================================================= + +// Calculate size of ArrayValue message content (repeated AnyValue entries with string values) +static size_t pb_arrayvalue_strings_content_size(const char **strings, size_t count) { + size_t size = 0; + for (size_t i = 0; i < count; i++) { + size_t str_len = strlen(strings[i]); + // Each string is wrapped in AnyValue (field 1 = string_value) + size_t anyvalue_size = pb_anyvalue_string_size(str_len); + // tag + length prefix + content + size += pb_varint_size((1 << 3) | 2) + pb_varint_size(anyvalue_size) + anyvalue_size; + } + return size; +} + +// Write ArrayValue message content (repeated AnyValue entries with string values) +static size_t pb_write_arrayvalue_strings_content(uint8_t *buf, const char **strings, size_t count) { + size_t pos = 0; + for (size_t i = 0; i < count; i++) { + size_t str_len = strlen(strings[i]); + size_t anyvalue_size = pb_anyvalue_string_size(str_len); + + // Write as embedded AnyValue: ArrayValue.values = field 1, wire type 2 + uint32_t tag = (1 << 3) | 2; + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, anyvalue_size); + pos += pb_write_anyvalue_string(buf + pos, strings[i], str_len); + } + return pos; +} + +// Calculate size of AnyValue message containing an array_value (field 5) +static size_t pb_anyvalue_array_strings_size(const char **strings, size_t count) { + // AnyValue { ArrayValue array_value = 5; } + size_t array_size = pb_arrayvalue_strings_content_size(strings, count); + // tag (1 byte: field 5, wire type 2) + length varint + content + return 1 + pb_varint_size(array_size) + array_size; +} + +// Write AnyValue message containing an array_value (field 5) with strings +static size_t pb_write_anyvalue_array_strings(uint8_t *buf, const char **strings, size_t count) { + size_t pos = 0; + buf[pos++] = (5 << 3) | 2; // field 5, wire type 2 (length-delimited) + size_t array_size = pb_arrayvalue_strings_content_size(strings, count); + pos += pb_write_varint(buf + pos, array_size); + pos += pb_write_arrayvalue_strings_content(buf + pos, strings, count); + return pos; +} + +// Calculate size of KeyValue message with array value (array of strings) +static size_t pb_keyvalue_array_strings_size(size_t key_len, const char **strings, size_t count) { + // KeyValue { string key = 1; AnyValue value = 2; } + size_t key_field_size = pb_string_field_size(1, key_len); + size_t anyvalue_size = pb_anyvalue_array_strings_size(strings, count); + // value field (field 2) is embedded message (wire type 2) + size_t value_field_size = pb_varint_size((2 << 3) | 2) + pb_varint_size(anyvalue_size) + anyvalue_size; + return key_field_size + value_field_size; +} + +// Write KeyValue message with array value (array of strings) +static size_t pb_write_keyvalue_array_strings(uint8_t *buf, const char *key, size_t key_len, const char **strings, size_t count) { + size_t pos = 0; + + // Field 1: string key + pos += pb_write_string_field(buf + pos, 1, key, key_len); + + // Field 2: AnyValue value (embedded message with array_value) + size_t anyvalue_size = pb_anyvalue_array_strings_size(strings, count); + uint32_t tag = (2 << 3) | 2; // field 2, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, anyvalue_size); + pos += pb_write_anyvalue_array_strings(buf + pos, strings, count); + + return pos; +} + +// ============================================================================= +// String Value KeyValue (existing implementation renamed for clarity) +// ============================================================================= + +// Calculate size of KeyValue message +static size_t pb_keyvalue_size(size_t key_len, size_t value_len) { + // KeyValue { string key = 1; AnyValue value = 2; } + size_t key_field_size = pb_string_field_size(1, key_len); + size_t anyvalue_size = pb_anyvalue_string_size(value_len); + // value field (field 2) is embedded message (wire type 2) + size_t value_field_size = pb_varint_size((2 << 3) | 2) + pb_varint_size(anyvalue_size) + anyvalue_size; + return key_field_size + value_field_size; +} + +// Write KeyValue message +static size_t pb_write_keyvalue(uint8_t *buf, const char *key, size_t key_len, const char *value, size_t value_len) { + size_t pos = 0; + + // Field 1: string key + pos += pb_write_string_field(buf + pos, 1, key, key_len); + + // Field 2: AnyValue value (embedded message) + size_t anyvalue_size = pb_anyvalue_string_size(value_len); + uint32_t tag = (2 << 3) | 2; // field 2, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, anyvalue_size); + pos += pb_write_anyvalue_string(buf + pos, value, value_len); + + return pos; +} + +// Validate pairs array and calculate total payload size +static otel_process_ctx_result pb_validate_and_calculate_size(size_t *out_size, size_t *out_num_pairs, const char **pairs) { size_t num_entries = 0; for (size_t i = 0; pairs[i] != NULL; i++) num_entries++; + if (num_entries % 2 != 0) { - return (otel_process_ctx_result) {.success = false, .error_message = "Value in otel_process_ctx_data is NULL (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; + return (otel_process_ctx_result) {.success = false, .error_message = "Pairs array has odd number of entries (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } *out_num_pairs = num_entries / 2; - *out_pairs_size = 0; + // Calculate size for Resource message: repeated KeyValue attributes (field 1) + *out_size = 0; for (size_t i = 0; i < *out_num_pairs; i++) { size_t key_len = strlen(pairs[i * 2]); - if (key_len > INT16_MAX) { - return (otel_process_ctx_result) {.success = false, .error_message = "Length of key in otel_process_ctx_data exceeds INT16_MAX limit (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; - } size_t value_len = strlen(pairs[i * 2 + 1]); - if (value_len > INT16_MAX) { - return (otel_process_ctx_result) {.success = false, .error_message = "Length of value in otel_process_ctx_data exceeds INT16_MAX limit (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; + + if (key_len > INT16_MAX || value_len > INT16_MAX) { + return (otel_process_ctx_result) {.success = false, .error_message = "Key or value exceeds INT16_MAX limit (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } - *out_pairs_size += 1 + 2 + key_len; // str 16 for key - *out_pairs_size += 1 + 2 + value_len; // str 16 for value + + // Each KeyValue is an embedded message in field 1 of Resource + size_t kv_size = pb_keyvalue_size(key_len, value_len); + // Field tag + length prefix + message content + *out_size += pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; } return (otel_process_ctx_result) {.success = true, .error_message = NULL}; } -static void write_msgpack_string(char **ptr, const char *str) { - size_t len = strlen(str); - // Write str 16 header - *(*ptr)++ = 0xda; - *(*ptr)++ = (len >> 8) & 0xFF; // high byte of length - *(*ptr)++ = len & 0xFF; // low byte of length - memcpy(*ptr, str, len); - *ptr += len; +// Write all key-value pairs as Resource.attributes field (field 1, repeated KeyValue) +static size_t pb_write_attributes(uint8_t *buf, const char **pairs, size_t num_pairs) { + size_t pos = 0; + + for (size_t i = 0; i < num_pairs; i++) { + const char *key = pairs[i * 2]; + const char *value = pairs[i * 2 + 1]; + size_t key_len = strlen(key); + size_t value_len = strlen(value); + + // Write as embedded message: field 1 (attributes), wire type 2 + size_t kv_size = pb_keyvalue_size(key_len, value_len); + uint32_t tag = (1 << 3) | 2; + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue(buf + pos, key, key_len, value, value_len); + } + + return pos; } -// TODO: The serialization format is still under discussion and is not considered stable yet. -// Comments **very** welcome: Should we use JSON instead? Or protobuf? -// -// Encode the payload as a msgpack map of string key/value pairs. -// -// This method implements an extremely compact but limited msgpack encoder. This encoder supports only encoding a single -// flat key-value map where every key and value is a string. -// For extra compact code, it uses only a "map 16" encoding format with only "str 16" strings, rather than attempting to -// use some of the other encoding alternatives. +// Helper: Write a single KeyValue (string) as a Resource.attributes field (field 1) +static size_t pb_write_attribute_string(uint8_t *buf, const char *key, const char *value) { + size_t key_len = strlen(key); + size_t value_len = strlen(value); + size_t kv_size = pb_keyvalue_size(key_len, value_len); + + size_t pos = 0; + uint32_t tag = (1 << 3) | 2; // field 1, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue(buf + pos, key, key_len, value, value_len); + return pos; +} + +// Helper: Write a single KeyValue (int64) as a Resource.attributes field (field 1) +static size_t pb_write_attribute_int(uint8_t *buf, const char *key, int64_t value) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_int_size(key_len, value); + + size_t pos = 0; + uint32_t tag = (1 << 3) | 2; // field 1, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue_int(buf + pos, key, key_len, value); + return pos; +} + +// Helper: Write a single KeyValue (kvlist) as a Resource.attributes field (field 1) +static size_t pb_write_attribute_kvlist(uint8_t *buf, const char *key, const char **pairs, size_t num_pairs) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_kvlist_size(key_len, pairs, num_pairs); + + size_t pos = 0; + uint32_t tag = (1 << 3) | 2; // field 1, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue_kvlist(buf + pos, key, key_len, pairs, num_pairs); + return pos; +} + +// Helper: Calculate size of a single KeyValue (string) as a Resource.attributes field (field 1) +static size_t pb_attribute_string_size(const char *key, const char *value) { + size_t key_len = strlen(key); + size_t value_len = strlen(value); + size_t kv_size = pb_keyvalue_size(key_len, value_len); + return pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; +} + +// Helper: Calculate size of a single KeyValue (int64) as a Resource.attributes field (field 1) +static size_t pb_attribute_int_size(const char *key, int64_t value) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_int_size(key_len, value); + return pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; +} + +// Helper: Calculate size of a single KeyValue (kvlist) as a Resource.attributes field (field 1) +static size_t pb_attribute_kvlist_size(const char *key, const char **pairs, size_t num_pairs) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_kvlist_size(key_len, pairs, num_pairs); + return pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; +} + +// Helper: Calculate size of a single KeyValue (array of strings) as a Resource.attributes field (field 1) +static size_t pb_attribute_array_strings_size(const char *key, const char **strings, size_t count) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_array_strings_size(key_len, strings, count); + return pb_varint_size((1 << 3) | 2) + pb_varint_size(kv_size) + kv_size; +} + +// Helper: Write a single KeyValue (array of strings) as a Resource.attributes field (field 1) +static size_t pb_write_attribute_array_strings(uint8_t *buf, const char *key, const char **strings, size_t count) { + size_t key_len = strlen(key); + size_t kv_size = pb_keyvalue_array_strings_size(key_len, strings, count); + + size_t pos = 0; + uint32_t tag = (1 << 3) | 2; // field 1, wire type 2 + pos += pb_write_varint(buf + pos, tag); + pos += pb_write_varint(buf + pos, kv_size); + pos += pb_write_keyvalue_array_strings(buf + pos, key, key_len, strings, count); + return pos; +} + +/** + * Encode the payload as protobuf opentelemetry.proto.resource.v1.Resource message. + * + * This implements a minimal protobuf encoder supporting string, int64, and kvlist attributes. + * The Resource message contains repeated KeyValue in field 1 (attributes). + */ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint32_t *out_size, otel_process_ctx_data data) { + // Build array of key-value pairs using OpenTelemetry semantic convention keys const char *pairs[] = { "deployment.environment.name", data.deployment_environment_name, "host.name", data.host_name, @@ -280,48 +797,78 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 NULL }; + // Validate and calculate size for standard pairs size_t num_pairs = 0, pairs_size = 0; - otel_process_ctx_result validation_result = validate_and_calculate_payload_size(&pairs_size, &num_pairs, (char **) pairs); - if (!validation_result.success) return validation_result; + otel_process_ctx_result result = pb_validate_and_calculate_size(&pairs_size, &num_pairs, pairs); + if (!result.success) return result; - size_t resources_pairs_size = 0, resources_num_pairs = 0; + // Validate and calculate size for additional resources + size_t resources_num_pairs = 0, resources_size = 0; if (data.resources != NULL) { - validation_result = validate_and_calculate_payload_size(&resources_pairs_size, &resources_num_pairs, data.resources); - if (!validation_result.success) return validation_result; + result = pb_validate_and_calculate_size(&resources_size, &resources_num_pairs, (const char **)data.resources); + if (!result.success) return result; } - size_t total_pairs = num_pairs + resources_num_pairs; - size_t total_size = pairs_size + resources_pairs_size + 1 + 2; // map 16 header (1 byte + 2 bytes for count) + // Calculate size for TLS config if present + size_t tls_config_size = 0; + size_t tls_keymap_count = 0; + if (data.tls_config != NULL) { + // threadlocal.schema_version = schema_version string (e.g. "tlsdesc_v1_dev") + if (data.tls_config->schema_version != NULL) { + tls_config_size += pb_attribute_string_size("threadlocal.schema_version", data.tls_config->schema_version); + } + + // threadlocal.max_record_size = (int64) + tls_config_size += pb_attribute_int_size("threadlocal.max_record_size", data.tls_config->max_record_size); - if (total_pairs > INT16_MAX) { - return (otel_process_ctx_result) {.success = false, .error_message = "Total number of pairs exceeds INT16_MAX limit (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; + // threadlocal.attribute_key_map = (position = key index) + if (data.tls_config->attribute_key_map != NULL) { + // Count entries in attribute_key_map (each entry is a key name, position = index) + for (size_t i = 0; data.tls_config->attribute_key_map[i] != NULL; i++) { + tls_keymap_count++; + } + tls_config_size += pb_attribute_array_strings_size("threadlocal.attribute_key_map", + (const char **)data.tls_config->attribute_key_map, + tls_keymap_count); + } } - char *encoded = (char *) calloc(total_size, 1); + size_t total_size = pairs_size + resources_size + tls_config_size; + + // Allocate buffer for protobuf payload + uint8_t *encoded = (uint8_t *) calloc(total_size, 1); if (!encoded) { return (otel_process_ctx_result) {.success = false, .error_message = "Failed to allocate memory for payload (" __FILE__ ":" ADD_QUOTES(__LINE__) ")"}; } - char *ptr = encoded; - // Write map 16 header (0xde) followed by count - *ptr++ = 0xde; - *ptr++ = (total_pairs >> 8) & 0xFF; // high byte of count - *ptr++ = total_pairs & 0xFF; // low byte of count + // Write standard pairs + size_t pos = pb_write_attributes(encoded, pairs, num_pairs); - for (size_t i = 0; i < num_pairs; i++) { - write_msgpack_string(&ptr, pairs[i * 2]); // Write key - write_msgpack_string(&ptr, pairs[i * 2 + 1]); // Write value + // Write additional resources + if (data.resources != NULL) { + pos += pb_write_attributes(encoded + pos, (const char **)data.resources, resources_num_pairs); } - if (data.resources != NULL) { - for (size_t i = 0; i < resources_num_pairs; i++) { - write_msgpack_string(&ptr, data.resources[i * 2]); // Write key - write_msgpack_string(&ptr, data.resources[i * 2 + 1]); // Write value + // Write TLS config if present + if (data.tls_config != NULL) { + // threadlocal.schema_version = schema_version string (e.g. "tlsdesc_v1_dev") + if (data.tls_config->schema_version != NULL) { + pos += pb_write_attribute_string(encoded + pos, "threadlocal.schema_version", data.tls_config->schema_version); + } + + // threadlocal.max_record_size = + pos += pb_write_attribute_int(encoded + pos, "threadlocal.max_record_size", data.tls_config->max_record_size); + + // threadlocal.attribute_key_map = (position = key index) + if (data.tls_config->attribute_key_map != NULL && tls_keymap_count > 0) { + pos += pb_write_attribute_array_strings(encoded + pos, "threadlocal.attribute_key_map", + (const char **)data.tls_config->attribute_key_map, + tls_keymap_count); } } - *out = encoded; - *out_size = (uint32_t) total_size; + *out = (char *)encoded; + *out_size = (uint32_t)total_size; return (otel_process_ctx_result) {.success = true, .error_message = NULL}; } @@ -330,19 +877,10 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 #include #include #include - #include // Note: The below parsing code is only for otel_process_ctx_read and is only provided for debugging // and testing purposes. - // Named mappings are supported on Linux 5.17+ - static bool named_mapping_supported(void) { - struct utsname uts; - int major, minor; - if (uname(&uts) != 0 || sscanf(uts.release, "%d.%d", &major, &minor) != 2) return false; - return (major > 5) || (major == 5 && minor >= 17); - } - static void *parse_mapping_start(char *line) { char *endptr = NULL; unsigned long long start = strtoull(line, &endptr, 16); @@ -351,13 +889,16 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 } static bool is_otel_process_ctx_mapping(char *line) { - size_t name_len = sizeof("[anon:OTEL_CTX]") - 1; size_t line_len = strlen(line); - if (line_len < name_len) return false; if (line[line_len-1] == '\n') line[--line_len] = '\0'; - // Validate expected permission - if (strstr(line, " r--p ") == NULL) return false; + // Validate expected permissions (accept both old and new formats for backward compatibility) + // Per PR #34: new mappings stay writable (rw-p or rw-s) for in-place updates + // Accept both: r--p/r--s (old, read-only) and rw-p/rw-s (new, read-write) + if (strstr(line, " r--p ") == NULL && strstr(line, " r--s ") == NULL && + strstr(line, " rw-p ") == NULL && strstr(line, " rw-s ") == NULL) { + return false; + } // Validate expected context size int64_t start, end; @@ -365,24 +906,31 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 if (start == 0 || end == 0 || end <= start) return false; if ((end - start) != size_for_mapping()) return false; - if (named_mapping_supported()) { - // On Linux 5.17+, check if the line ends with [anon:OTEL_CTX] - return memcmp(line + (line_len - name_len), "[anon:OTEL_CTX]", name_len) == 0; - } else { - // On older kernels, parse the address to to find the OTEL_CTX signature - void *addr = parse_mapping_start(line); - if (addr == NULL) return false; + // Check for memfd mapping: /memfd:OTEL_CTX (deleted) or similar + if (strstr(line, "/memfd:OTEL_CTX") != NULL) { + return true; + } - // Read 8 bytes at the address using process_vm_readv (to avoid any issues with concurrency/races) - char buffer[8]; - struct iovec local[] = {{.iov_base = buffer, .iov_len = sizeof(buffer)}}; - struct iovec remote[] = {{.iov_base = addr, .iov_len = sizeof(buffer)}}; + // Check for named anonymous mapping: [anon:OTEL_CTX] + const char *anon_name = "[anon:OTEL_CTX]"; + size_t anon_name_len = strlen(anon_name); + if (line_len >= anon_name_len && memcmp(line + (line_len - anon_name_len), anon_name, anon_name_len) == 0) { + return true; + } - ssize_t bytes_read = process_vm_readv(getpid(), local, 1, remote, 1, 0); - if (bytes_read != sizeof(buffer)) return false; + // Fallback: scan for OTEL_CTX signature in memory (for older kernels) + void *addr = parse_mapping_start(line); + if (addr == NULL) return false; - return memcmp(buffer, "OTEL_CTX", sizeof(buffer)) == 0; - } + // Read 8 bytes at the address using process_vm_readv (to avoid any issues with concurrency/races) + char buffer[8]; + struct iovec local[] = {{.iov_base = buffer, .iov_len = sizeof(buffer)}}; + struct iovec remote[] = {{.iov_base = addr, .iov_len = sizeof(buffer)}}; + + ssize_t bytes_read = process_vm_readv(getpid(), local, 1, remote, 1, 0); + if (bytes_read != sizeof(buffer)) return false; + + return memcmp(buffer, "OTEL_CTX", sizeof(buffer)) == 0; } static otel_process_ctx_mapping *try_finding_mapping(void) { @@ -403,21 +951,165 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 return result; } - // Simplified msgpack decoder to match the exact encoder above. If the msgpack string doesn't match the encoder, this will - // return false. - static bool otel_process_ctx_decode_payload(char *payload, otel_process_ctx_data *data_out) { - char *ptr = payload; + // ============================================================================= + // Minimal Protobuf Decoder for OpenTelemetry Resource message (v2 spec) + // ============================================================================= + + // Read a varint from buffer, return bytes consumed (0 on error) + static size_t pb_read_varint(const uint8_t *buf, size_t buf_len, uint64_t *out_value) { + *out_value = 0; + size_t bytes = 0; + int shift = 0; + while (bytes < buf_len && bytes < 10) { // varints are at most 10 bytes + uint8_t b = buf[bytes++]; + *out_value |= (uint64_t)(b & 0x7F) << shift; + if ((b & 0x80) == 0) return bytes; + shift += 7; + } + return 0; // Error: varint too long or buffer overflow + } + + // Decode a length-delimited string field, return bytes consumed (0 on error) + // Allocates and null-terminates the string into *out_str + static size_t pb_read_string(const uint8_t *buf, size_t buf_len, char **out_str) { + uint64_t len; + size_t varint_size = pb_read_varint(buf, buf_len, &len); + if (varint_size == 0 || varint_size + len > buf_len) return 0; + + *out_str = (char *)calloc(len + 1, 1); + if (!*out_str) return 0; + memcpy(*out_str, buf + varint_size, len); + (*out_str)[len] = '\0'; + + return varint_size + len; + } + + // Skip a length-delimited field, return bytes consumed (0 on error) + static size_t pb_skip_length_delimited(const uint8_t *buf, size_t buf_len) { + uint64_t len; + size_t varint_size = pb_read_varint(buf, buf_len, &len); + if (varint_size == 0 || varint_size + len > buf_len) return 0; + return varint_size + len; + } + + // Decode AnyValue message expecting string_value (field 1) or int_value (field 3) + // For int_value, converts to string representation + // For kvlist_value (field 6), returns "" placeholder + // Returns the string (caller must free), or NULL on error + static char *pb_decode_anyvalue(const uint8_t *buf, size_t buf_len) { + size_t pos = 0; + while (pos < buf_len) { + uint64_t tag; + size_t tag_size = pb_read_varint(buf + pos, buf_len - pos, &tag); + if (tag_size == 0) return NULL; + pos += tag_size; + + uint32_t field_num = (uint32_t)(tag >> 3); + uint32_t wire_type = (uint32_t)(tag & 0x07); + + if (field_num == 1 && wire_type == 2) { + // string_value (field 1, wire type 2 = length-delimited) + char *value; + size_t field_size = pb_read_string(buf + pos, buf_len - pos, &value); + if (field_size == 0) return NULL; + return value; + } else if (field_num == 3 && wire_type == 0) { + // int_value (field 3, wire type 0 = varint) + uint64_t int_value; + size_t varint_size = pb_read_varint(buf + pos, buf_len - pos, &int_value); + if (varint_size == 0) return NULL; + // Convert int to string + char *str = (char *)calloc(32, 1); + if (!str) return NULL; + snprintf(str, 32, "%" PRId64, (int64_t)int_value); + return str; + } else if (field_num == 6 && wire_type == 2) { + // kvlist_value (field 6, wire type 2 = length-delimited) + // Skip the content but return a placeholder + size_t skip = pb_skip_length_delimited(buf + pos, buf_len - pos); + if (skip == 0) return NULL; + char *placeholder = (char *)calloc(16, 1); + if (!placeholder) return NULL; + strcpy(placeholder, ""); + return placeholder; + } else if (wire_type == 2) { + // Skip other length-delimited fields + size_t skip = pb_skip_length_delimited(buf + pos, buf_len - pos); + if (skip == 0) return NULL; + pos += skip; + } else if (wire_type == 0) { + // Skip varint fields + uint64_t dummy; + size_t skip = pb_read_varint(buf + pos, buf_len - pos, &dummy); + if (skip == 0) return NULL; + pos += skip; + } else { + // Unsupported wire type + return NULL; + } + } + return NULL; // value not found + } - // Check map 16 header (0xde) - if ((unsigned char)*ptr++ != 0xde) return false; + // Decode KeyValue message: field 1 = key (string), field 2 = value (AnyValue) + // Returns true on success, fills key and value (caller must free) + static bool pb_decode_keyvalue(const uint8_t *buf, size_t buf_len, char **out_key, char **out_value) { + *out_key = NULL; + *out_value = NULL; + + size_t pos = 0; + while (pos < buf_len) { + uint64_t tag; + size_t tag_size = pb_read_varint(buf + pos, buf_len - pos, &tag); + if (tag_size == 0) break; + pos += tag_size; + + uint32_t field_num = (uint32_t)(tag >> 3); + uint32_t wire_type = (uint32_t)(tag & 0x07); + + if (wire_type != 2) { + // Skip non-length-delimited fields + if (wire_type == 0) { + uint64_t dummy; + size_t skip = pb_read_varint(buf + pos, buf_len - pos, &dummy); + if (skip == 0) goto error; + pos += skip; + } else { + goto error; // Unsupported wire type + } + continue; + } + + // Read length + uint64_t field_len; + size_t len_size = pb_read_varint(buf + pos, buf_len - pos, &field_len); + if (len_size == 0 || pos + len_size + field_len > buf_len) goto error; + pos += len_size; + + if (field_num == 1) { + // key (string) + *out_key = (char *)calloc(field_len + 1, 1); + if (!*out_key) goto error; + memcpy(*out_key, buf + pos, field_len); + (*out_key)[field_len] = '\0'; + } else if (field_num == 2) { + // value (AnyValue message) + *out_value = pb_decode_anyvalue(buf + pos, field_len); + if (!*out_value) goto error; + } + pos += field_len; + } - // Read count (2 bytes, big endian) - uint16_t count = ((uint8_t)*ptr << 8) | (uint8_t)*(ptr + 1); - ptr += 2; + if (*out_key && *out_value) return true; - // We expect at least 8 pairs (the standard fields) - if (count < 8) return false; + error: + if (*out_key) { free(*out_key); *out_key = NULL; } + if (*out_value) { free(*out_value); *out_value = NULL; } + return false; + } + // Decode protobuf-encoded Resource message into otel_process_ctx_data + static bool otel_process_ctx_decode_payload(const uint8_t *payload, size_t payload_size, otel_process_ctx_data *data_out) { // Initialize output data data_out->deployment_environment_name = NULL; data_out->host_name = NULL; @@ -428,69 +1120,78 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 data_out->telemetry_sdk_version = NULL; data_out->telemetry_sdk_name = NULL; data_out->resources = NULL; + data_out->tls_config = NULL; // TLS config is not decoded back (write-only) - // Allocate resources array with space for all pairs as a simplification (2 entries per pair + 1 for NULL terminator) - data_out->resources = (char **) calloc(count * 2 + 1, sizeof(char *)); + // Allocate resources array (estimate max 64 extra attributes) + data_out->resources = (char **) calloc(128 + 1, sizeof(char *)); if (!data_out->resources) return false; - int resources_index = 0; - // Decode each key-value pair - for (int i = 0; i < count; i++) { - // Check str 16 header for key (0xda) - if ((unsigned char)*ptr++ != 0xda) return false; - - // Read key length (2 bytes, big endian) - uint16_t key_len = ((uint8_t)*ptr << 8) | (uint8_t)*(ptr + 1); - ptr += 2; - - // Get pointer to key (not null-terminated) - char *key_not_terminated = ptr; - ptr += key_len; - - // Check str 16 header for value (0xda) - if ((unsigned char)*ptr++ != 0xda) return false; - - // Read value length (2 bytes, big endian) - uint16_t value_len = ((uint8_t)*ptr << 8) | (uint8_t)*(ptr + 1); - ptr += 2; - - // Read value - char *value = (char *) calloc(value_len + 1, 1); - if (!value) return false; - memcpy(value, ptr, value_len); - value[value_len] = '\0'; - ptr += value_len; - - // Assign to appropriate field based on key - if (key_len == strlen("deployment.environment.name") && memcmp(key_not_terminated, "deployment.environment.name", strlen("deployment.environment.name")) == 0) { - data_out->deployment_environment_name = value; - } else if (key_len == strlen("host.name") && memcmp(key_not_terminated, "host.name", strlen("host.name")) == 0) { - data_out->host_name = value; - } else if (key_len == strlen("service.instance.id") && memcmp(key_not_terminated, "service.instance.id", strlen("service.instance.id")) == 0) { - data_out->service_instance_id = value; - } else if (key_len == strlen("service.name") && memcmp(key_not_terminated, "service.name", strlen("service.name")) == 0) { - data_out->service_name = value; - } else if (key_len == strlen("service.version") && memcmp(key_not_terminated, "service.version", strlen("service.version")) == 0) { - data_out->service_version = value; - } else if (key_len == strlen("telemetry.sdk.language") && memcmp(key_not_terminated, "telemetry.sdk.language", strlen("telemetry.sdk.language")) == 0) { - data_out->telemetry_sdk_language = value; - } else if (key_len == strlen("telemetry.sdk.version") && memcmp(key_not_terminated, "telemetry.sdk.version", strlen("telemetry.sdk.version")) == 0) { - data_out->telemetry_sdk_version = value; - } else if (key_len == strlen("telemetry.sdk.name") && memcmp(key_not_terminated, "telemetry.sdk.name", strlen("telemetry.sdk.name")) == 0) { - data_out->telemetry_sdk_name = value; - } else { - // Unknown key, put it into resources - char *key = (char *) calloc(key_len + 1, 1); - if (!key) { - free(value); - return false; + size_t pos = 0; + while (pos < payload_size) { + uint64_t tag; + size_t tag_size = pb_read_varint(payload + pos, payload_size - pos, &tag); + if (tag_size == 0) break; + pos += tag_size; + + uint32_t field_num = (uint32_t)(tag >> 3); + uint32_t wire_type = (uint32_t)(tag & 0x07); + + if (field_num == 1 && wire_type == 2) { + // attributes field (repeated KeyValue, field 1, wire type 2) + uint64_t kv_len; + size_t len_size = pb_read_varint(payload + pos, payload_size - pos, &kv_len); + if (len_size == 0 || pos + len_size + kv_len > payload_size) return false; + pos += len_size; + + char *key = NULL, *value = NULL; + if (!pb_decode_keyvalue(payload + pos, kv_len, &key, &value)) { + pos += kv_len; + continue; // Skip malformed KeyValue } - memcpy(key, key_not_terminated, key_len); - key[key_len] = '\0'; - - data_out->resources[resources_index++] = key; - data_out->resources[resources_index++] = value; + pos += kv_len; + + // Assign to appropriate field based on key + if (strcmp(key, "deployment.environment.name") == 0) { + free(key); data_out->deployment_environment_name = value; + } else if (strcmp(key, "host.name") == 0) { + free(key); data_out->host_name = value; + } else if (strcmp(key, "service.instance.id") == 0) { + free(key); data_out->service_instance_id = value; + } else if (strcmp(key, "service.name") == 0) { + free(key); data_out->service_name = value; + } else if (strcmp(key, "service.version") == 0) { + free(key); data_out->service_version = value; + } else if (strcmp(key, "telemetry.sdk.language") == 0) { + free(key); data_out->telemetry_sdk_language = value; + } else if (strcmp(key, "telemetry.sdk.version") == 0) { + free(key); data_out->telemetry_sdk_version = value; + } else if (strcmp(key, "telemetry.sdk.name") == 0) { + free(key); data_out->telemetry_sdk_name = value; + } else { + // Unknown key, put into resources + if (resources_index < 126) { // Leave room for NULL terminator + data_out->resources[resources_index++] = key; + data_out->resources[resources_index++] = value; + } else { + free(key); + free(value); + } + } + } else if (wire_type == 2) { + // Skip other length-delimited fields (e.g., dropped_attributes_count, entity_refs) + size_t skip = pb_skip_length_delimited(payload + pos, payload_size - pos); + if (skip == 0) return false; + pos += skip; + } else if (wire_type == 0) { + // Skip varint fields + uint64_t dummy; + size_t skip = pb_read_varint(payload + pos, payload_size - pos, &dummy); + if (skip == 0) return false; + pos += skip; + } else { + // Unsupported wire type + return false; } } @@ -518,6 +1219,8 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 for (int i = 0; data.resources[i] != NULL; i++) free(data.resources[i]); free(data.resources); } + // Note: tls_config is not decoded back from payload (write-only), so it's always NULL here + // But if we ever did decode it, we'd need to free attribute_key_map entries here } otel_process_ctx_read_result otel_process_ctx_read(void) { @@ -526,17 +1229,38 @@ static otel_process_ctx_result otel_process_ctx_encode_payload(char **out, uint3 return (otel_process_ctx_read_result) {.success = false, .error_message = "No OTEL_CTX mapping found (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; } - if (strncmp(mapping->otel_process_ctx_signature, "OTEL_CTX", sizeof(mapping->otel_process_ctx_signature)) != 0 || mapping->otel_process_ctx_version != 1) { - return (otel_process_ctx_read_result) {.success = false, .error_message = "Invalid OTEL_CTX signature or version (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; + // Validate signature + if (strncmp(mapping->otel_process_ctx_signature, "OTEL_CTX", sizeof(mapping->otel_process_ctx_signature)) != 0) { + return (otel_process_ctx_read_result) {.success = false, .error_message = "Invalid OTEL_CTX signature (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; + } + + // Check version (v2 required) + if (mapping->otel_process_ctx_version != 2) { + return (otel_process_ctx_read_result) {.success = false, .error_message = "Unsupported OTEL_CTX version (expected 2) (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; + } + + // Check published_at_ns (0 = update in progress, per v2 spec) + uint64_t timestamp = __atomic_load_n(&mapping->published_at_ns, __ATOMIC_ACQUIRE); + if (timestamp == 0) { + return (otel_process_ctx_read_result) {.success = false, .error_message = "Context update in progress (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; } otel_process_ctx_data data = empty_data; - if (!otel_process_ctx_decode_payload(mapping->otel_process_payload, &data)) { + // Decode protobuf payload + if (!otel_process_ctx_decode_payload((const uint8_t *)mapping->otel_process_payload, + mapping->otel_process_payload_size, &data)) { otel_process_ctx_read_data_drop(data); return (otel_process_ctx_read_result) {.success = false, .error_message = "Failed to decode payload (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; } + // Re-check timestamp to detect concurrent update (per v2 spec reading protocol) + uint64_t timestamp_after = __atomic_load_n(&mapping->published_at_ns, __ATOMIC_ACQUIRE); + if (timestamp_after != timestamp) { + otel_process_ctx_read_data_drop(data); + return (otel_process_ctx_read_result) {.success = false, .error_message = "Context changed during read (" __FILE__ ":" ADD_QUOTES(__LINE__) ")", .data = empty_data}; + } + return (otel_process_ctx_read_result) {.success = true, .error_message = NULL, .data = data}; } diff --git a/ddprof-lib/src/main/cpp/otel_process_ctx.h b/ddprof-lib/src/main/cpp/otel_process_ctx.h index 878949a5e..d9bcf2e18 100644 --- a/ddprof-lib/src/main/cpp/otel_process_ctx.h +++ b/ddprof-lib/src/main/cpp/otel_process_ctx.h @@ -1,12 +1,12 @@ // Unless explicitly stated otherwise all files in this repository are licensed under the Apache License (Version 2.0). -// This product includes software developed at Datadog (https://www.datadoghq.com/) Copyright 2025 Datadog, Inc. +// This product includes software developed at Datadog (https://www.datadoghq.com/) Copyright 2026 Datadog, Inc. #pragma once -#define OTEL_PROCESS_CTX_VERSION_MAJOR 0 +#define OTEL_PROCESS_CTX_VERSION_MAJOR 2 #define OTEL_PROCESS_CTX_VERSION_MINOR 0 -#define OTEL_PROCESS_CTX_VERSION_PATCH 7 -#define OTEL_PROCESS_CTX_VERSION_STRING "0.0.7" +#define OTEL_PROCESS_CTX_VERSION_PATCH 0 +#define OTEL_PROCESS_CTX_VERSION_STRING "2.0.0" #ifdef __cplusplus extern "C" { @@ -24,6 +24,24 @@ extern "C" { * On non-Linux OS's (or when OTEL_PROCESS_CTX_NOOP is defined) no-op versions of functions are supplied. */ +/** + * TLS context sharing configuration. + * + * When set in otel_process_ctx_data.tls_config, these fields are encoded as: + * - threadlocal.schema_version = schema_version (string, e.g. "tlsdesc_v1_dev") + * - threadlocal.max_record_size = max_record_size (int64) + * - threadlocal.attribute_key_map = attribute_key_map (array of strings, position = index) + * + * These fields allow external profilers to discover and decode thread-local context records. + */ +typedef struct { + char *schema_version; // TLS schema version string (e.g. "tlsdesc_v1_dev") + int max_record_size; // Maximum bytes per TLS record + // Key index to name mapping (NULL-terminated array of key names) + // Position in array = key index (e.g. ["method", "route", NULL] means index 0 = "method", index 1 = "route") + char **attribute_key_map; +} otel_tls_config; + /** * Data that can be published as a process context. * @@ -69,6 +87,9 @@ typedef struct { // Can be NULL if no resources are needed; if non-NULL, this array MUST be terminated with a NULL entry. // Every even entry is a key, every odd entry is a value (E.g. "key1", "value1", "key2", "value2", NULL). char **resources; + // TLS context sharing configuration (optional, set to NULL if not used) + // When set, additional threadlocal.* attributes are included in the process context. + otel_tls_config *tls_config; } otel_process_ctx_data; /** Number of entries in the `otel_process_ctx_data` struct. Can be used to easily detect when the struct is updated. */ diff --git a/ddprof-lib/src/main/cpp/profiler.cpp b/ddprof-lib/src/main/cpp/profiler.cpp index 0391a9907..4969690d4 100644 --- a/ddprof-lib/src/main/cpp/profiler.cpp +++ b/ddprof-lib/src/main/cpp/profiler.cpp @@ -7,6 +7,7 @@ #include "profiler.h" #include "asyncSampleMutex.h" #include "context.h" +#include "context_api.h" #include "criticalSection.h" #include "common.h" #include "counters.h" @@ -1406,6 +1407,9 @@ Error Profiler::start(Arguments &args, bool reset) { _libs->updateBuildIds(); } + // Initialize context storage (TLS or OTEL mode based on args) + ContextApi::initialize(args); + enableEngines(); switchLibraryTrap(_cstack == CSTACK_DWARF || _remote_symbolication); @@ -1509,6 +1513,9 @@ Error Profiler::stop() { // owned by library metadata, so we must keep library patches active until after serialization LibraryPatcher::unpatch_libraries(); + // Shutdown context storage (unmaps OTEL buffer if in OTEL mode) + ContextApi::shutdown(); + _state = IDLE; return Error::OK; } diff --git a/ddprof-lib/src/main/cpp/thread.cpp b/ddprof-lib/src/main/cpp/thread.cpp index d0ac5fa10..bf9d39485 100644 --- a/ddprof-lib/src/main/cpp/thread.cpp +++ b/ddprof-lib/src/main/cpp/thread.cpp @@ -9,6 +9,7 @@ static int g_tls_prime_signal = -1; pthread_key_t ProfiledThread::_tls_key; +volatile bool ProfiledThread::_tls_key_initialized = false; int ProfiledThread::_buffer_size = 0; volatile int ProfiledThread::_running_buffer_pos = 0; ProfiledThread** ProfiledThread::_buffer = nullptr; @@ -20,7 +21,11 @@ void ProfiledThread::initTLSKey() { pthread_once(&tls_initialized, doInitTLSKey); } -void ProfiledThread::doInitTLSKey() { pthread_key_create(&_tls_key, freeKey); } +void ProfiledThread::doInitTLSKey() { + pthread_key_create(&_tls_key, freeKey); + // Use release semantics to ensure the key is visible to other threads + __atomic_store_n(&_tls_key_initialized, true, __ATOMIC_RELEASE); +} inline void ProfiledThread::freeKey(void *key) { ProfiledThread *tls_ref = (ProfiledThread *)(key); @@ -267,8 +272,11 @@ ProfiledThread *ProfiledThread::current() { ProfiledThread *ProfiledThread::currentSignalSafe() { // Signal-safe: never allocate, just return existing TLS or null - pthread_key_t key = _tls_key; - return key != 0 ? (ProfiledThread *)pthread_getspecific(key) : nullptr; + // Use acquire semantics to synchronize with the release in doInitTLSKey() + if (!__atomic_load_n(&_tls_key_initialized, __ATOMIC_ACQUIRE)) { + return nullptr; + } + return (ProfiledThread *)pthread_getspecific(_tls_key); } bool ProfiledThread::isTlsPrimingAvailable() { diff --git a/ddprof-lib/src/main/cpp/thread.h b/ddprof-lib/src/main/cpp/thread.h index 7d416a1d9..2777e4ce9 100644 --- a/ddprof-lib/src/main/cpp/thread.h +++ b/ddprof-lib/src/main/cpp/thread.h @@ -30,6 +30,7 @@ class ProfiledThread : public ThreadLocalData { // Even with 5 levels cap we will need any highly recursing signal handlers static constexpr u32 CRASH_HANDLER_NESTING_LIMIT = 5; static pthread_key_t _tls_key; + static volatile bool _tls_key_initialized; // Tracks whether _tls_key is valid static int _buffer_size; static volatile int _running_buffer_pos; static ProfiledThread** _buffer; diff --git a/ddprof-lib/src/main/cpp/wallClock.cpp b/ddprof-lib/src/main/cpp/wallClock.cpp index 5f1c0e6da..8228c68bc 100644 --- a/ddprof-lib/src/main/cpp/wallClock.cpp +++ b/ddprof-lib/src/main/cpp/wallClock.cpp @@ -7,6 +7,7 @@ #include "wallClock.h" #include "stackFrame.h" #include "context.h" +#include "context_api.h" #include "debugSupport.h" #include "libraries.h" #include "log.h" @@ -68,11 +69,12 @@ void WallClockASGCT::signalHandler(int signo, siginfo_t *siginfo, void *ucontext u64 call_trace_id = 0; if (current != NULL && _collapsing) { StackFrame frame(ucontext); - Context &context = Contexts::get(); + u64 spanId = 0, rootSpanId = 0; + ContextApi::get(spanId, rootSpanId); call_trace_id = current->lookupWallclockCallTraceId( (u64)frame.pc(), (u64)frame.sp(), Profiler::instance()->recordingEpoch(), - context.spanId, context.rootSpanId); + spanId, rootSpanId); if (call_trace_id != 0) { Counters::increment(SKIPPED_WALLCLOCK_UNWINDS); } diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/OTelContext.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/OTelContext.java index d3b74a7cb..3be45f168 100644 --- a/ddprof-lib/src/main/java/com/datadoghq/profiler/OTelContext.java +++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/OTelContext.java @@ -40,6 +40,48 @@ private static final class SingletonHolder { static final OTelContext INSTANCE = new OTelContext(); } + /** + * Represents TLS context sharing configuration. + * + *

This configuration is used to expose thread-local storage context information + * to external profilers. The key map maps indices to attribute names, allowing + * external readers to decode compact TLS records. + */ + public static final class TlsConfig { + /** Default schema version for TLS context sharing (tlsdesc_v1_dev) */ + public static final String DEFAULT_SCHEMA_VERSION = "tlsdesc_v1_dev"; + + /** TLS schema version string (e.g. "tlsdesc_v1_dev") */ + public final String schemaVersion; + /** Maximum bytes per TLS record */ + public final int maxRecordSize; + /** Key names in index order (position = key index, e.g. ["method", "route"]) */ + public final String[] attributeKeyMap; + + /** + * Creates a TLS configuration with the default schema version. + * + * @param maxRecordSize maximum bytes per TLS record + * @param attributeKeyMap key names in index order (position = key index) + */ + public TlsConfig(int maxRecordSize, String[] attributeKeyMap) { + this(DEFAULT_SCHEMA_VERSION, maxRecordSize, attributeKeyMap); + } + + /** + * Creates a TLS configuration with a custom schema version. + * + * @param schemaVersion TLS schema version string (e.g. "tlsdesc_v1_dev") + * @param maxRecordSize maximum bytes per TLS record + * @param attributeKeyMap key names in index order (position = key index) + */ + public TlsConfig(String schemaVersion, int maxRecordSize, String[] attributeKeyMap) { + this.schemaVersion = schemaVersion; + this.maxRecordSize = maxRecordSize; + this.attributeKeyMap = attributeKeyMap; + } + } + /** * Represents the OpenTelemetry process context data. */ @@ -63,7 +105,7 @@ public ProcessContext(String deploymentEnvironmentName, String hostName, String this.telemetrySdkVersion = telemetrySdkVersion; this.telemetrySdkName = telemetrySdkName; } - + @Override public String toString() { return String.format("ProcessContext{deploymentEnvironmentName='%s', hostName='%s', serviceInstanceId='%s', serviceName='%s', serviceVersion='%s', telemetrySdkLanguage='%s', telemetrySdkVersion='%s', telemetrySdkName='%s'}", @@ -211,17 +253,45 @@ public ProcessContext readProcessContext() { * @see OpenTelemetry Deployment Attributes */ public void setProcessContext(String env, String hostname, String runtimeId, String service, String version, String tracerVersion) { + setProcessContext(env, hostname, runtimeId, service, version, tracerVersion, null); + } + + /** + * Sets the OpenTelemetry process context with optional TLS configuration. + * + *

This overload allows specifying TLS context sharing configuration in addition + * to the basic service metadata. The TLS config enables external profilers to + * discover and decode thread-local context records. + * + * @param env the deployment environment name + * @param hostname the hostname of the service + * @param runtimeId the unique identifier for this service instance + * @param service the logical name of the service + * @param version the version of the service + * @param tracerVersion the version of the tracer + * @param tlsConfig TLS context sharing configuration, or null to omit + * + * @see #setProcessContext(String, String, String, String, String, String) + */ + public void setProcessContext(String env, String hostname, String runtimeId, String service, String version, String tracerVersion, TlsConfig tlsConfig) { if (!libraryLoadResult.succeeded) { return; } try { lock.writeLock().lock(); - setProcessCtx0(env, hostname, runtimeId, service, version, tracerVersion); + if (tlsConfig != null) { + setProcessCtxWithTls0(env, hostname, runtimeId, service, version, tracerVersion, + tlsConfig.schemaVersion, tlsConfig.maxRecordSize, tlsConfig.attributeKeyMap); + } else { + setProcessCtx0(env, hostname, runtimeId, service, version, tracerVersion); + } } finally { lock.writeLock().unlock(); - } + } } private static native void setProcessCtx0(String env, String hostname, String runtimeId, String service, String version, String tracerVersion); + private static native void setProcessCtxWithTls0(String env, String hostname, String runtimeId, String service, String version, String tracerVersion, + String schemaVersion, int maxRecordSize, String[] attributeKeyMap); private static native ProcessContext readProcessCtx0(); } diff --git a/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java b/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java index b689df414..853dd6f28 100644 --- a/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java +++ b/ddprof-lib/src/main/java/com/datadoghq/profiler/ThreadContext.java @@ -1,5 +1,5 @@ /* - * Copyright 2025, Datadog, Inc + * Copyright 2025, 2026 Datadog, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,19 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +/** + * Thread-local context for trace/span identification. + * + *

Provides access to thread-local context storage used by the profiler to correlate + * samples with distributed traces. Supports two storage modes: + *

    + *
  • Profiler mode (default): Context stored in TLS via direct ByteBuffer mapping
  • + *
  • OTEL mode: Context stored in OTEL ring buffer accessible by external profilers
  • + *
+ * + *

The storage mode is determined at profiler startup via the {@code ctxstorage} option. + * Reading and writing context automatically routes to the correct storage via JNI. + */ public final class ThreadContext { /** * Knuth's multiplicative hash constant for 64-bit values. @@ -58,6 +71,13 @@ public static long computeContextChecksum(long spanId, long rootSpanId) { private final boolean useJNI; + /** + * True if OTEL context storage mode is active. + * In OTEL mode, context reads must go through JNI since the buffer + * is a ring buffer indexed by TID, not a direct TLS mapping. + */ + private final boolean otelMode; + /** * Creates a ThreadContext with native struct field offsets. * @@ -79,16 +99,57 @@ public ThreadContext(ByteBuffer buffer, int[] offsets) { this.customTagsOffset = offsets[3]; // For Java 17 and later the cost of downcall to JNI is negligible useJNI = Platform.isJavaVersionAtLeast(17); + // Check if OTEL mode is active - if so, reads must go through JNI + otelMode = isOtelMode0(); } + /** + * Cached context values from last JNI call in OTEL mode. + * Used to provide atomic reads of spanId and rootSpanId together. + * Thread-local by design (ThreadContext is per-thread). + */ + private long[] cachedOtelContext; + + /** + * Gets the span ID from the current thread's context. + * + *

In OTEL mode, reads from the OTEL ring buffer via JNI. + * In profiler mode, reads directly from the TLS ByteBuffer. + * + * @return the span ID, or 0 if not set + */ public long getSpanId() { + if (otelMode) { + refreshOtelContextCache(); + return cachedOtelContext != null ? cachedOtelContext[0] : 0; + } return buffer.getLong(spanIdOffset); } + /** + * Gets the root span ID from the current thread's context. + * + *

In OTEL mode, reads from the OTEL ring buffer via JNI. + * In profiler mode, reads directly from the TLS ByteBuffer. + * + * @return the root span ID, or 0 if not set + */ public long getRootSpanId() { + if (otelMode) { + refreshOtelContextCache(); + return cachedOtelContext != null ? cachedOtelContext[1] : 0; + } return buffer.getLong(rootSpanIdOffset); } + /** + * Refreshes the cached OTEL context from native storage. + * Called before reading spanId or rootSpanId in OTEL mode. + */ + private void refreshOtelContextCache() { + cachedOtelContext = getContext0(); + } + public long getChecksum() { return buffer.getLong(checksumOffset); } @@ -134,4 +195,21 @@ private long setContextSlotJava(int offset, int value) { private static native long setContext0(long spanId, long rootSpanId); private static native void setContextSlot0(int offset, int value); + + /** + * Checks if OTEL context storage mode is active. + * + * @return true if OTEL mode is active, false for default profiler mode + */ + private static native boolean isOtelMode0(); + + /** + * Reads context via the native ContextApi. + * + *

This method routes to the appropriate storage backend based on the + * active storage mode (OTEL ring buffer or TLS). + * + * @return array with [spanId, rootSpanId], or null on error + */ + private static native long[] getContext0(); } diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/context/OtelContextStorageModeTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/context/OtelContextStorageModeTest.java new file mode 100644 index 000000000..0c4131889 --- /dev/null +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/context/OtelContextStorageModeTest.java @@ -0,0 +1,192 @@ +/* + * Copyright 2026, Datadog, Inc + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.datadoghq.profiler.context; + +import com.datadoghq.profiler.JavaProfiler; +import com.datadoghq.profiler.Platform; +import com.datadoghq.profiler.ThreadContext; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assumptions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +import static org.junit.jupiter.api.Assertions.*; + +/** + * Tests for OTEL-compatible context storage mode. + * + *

The profiler supports two context storage modes controlled by the {@code ctxstorage} option: + *

    + *
  • {@code profiler}: Uses TLS-based storage with checksum validation
  • + *
  • {@code otel} (default): Uses OTEL-compatible ring buffer storage (Linux only)
  • + *
+ * + *

The OTEL mode creates a named mmap region that can be discovered by external + * profilers (like DDProf) via {@code /proc//maps}. + * + *

Note: The Java API (getThreadContext) reads from TLS, not the OTEL buffer. + * Full OTEL mode verification requires external profiler integration or a native + * JNI method to read from the OTEL buffer. + */ +public class OtelContextStorageModeTest { + + private static JavaProfiler profiler; + private boolean profilerStarted = false; + + @BeforeAll + public static void setup() throws IOException { + profiler = JavaProfiler.getInstance(); + } + + @AfterEach + public void cleanup() { + if (profilerStarted) { + profiler.stop(); + profilerStarted = false; + } + } + + /** + * Tests that the default (OTEL) mode works correctly. + * Context values written should be readable back. + */ + @Test + public void testDefaultOtelModeContext() throws Exception { + Path jfrFile = Files.createTempFile("otel-ctx-default", ".jfr"); + + profiler.execute(String.format("start,cpu=1ms,jfr,file=%s", jfrFile.toAbsolutePath())); + profilerStarted = true; + + // Clear any previous context + profiler.setContext(0, 0); + + // Write context + long spanId = 0x1234567890ABCDEFL; + long rootSpanId = 0xFEDCBA0987654321L; + profiler.setContext(spanId, rootSpanId); + + // Verify context is readable (routes through OTEL buffer by default) + ThreadContext ctx = profiler.getThreadContext(); + assertEquals(spanId, ctx.getSpanId(), "SpanId should match"); + assertEquals(rootSpanId, ctx.getRootSpanId(), "RootSpanId should match"); + } + + /** + * Tests that the profiler mode works correctly when explicitly specified. + * Context values written should be readable back via TLS. + */ + @Test + public void testExplicitProfilerModeContext() throws Exception { + Path jfrFile = Files.createTempFile("otel-ctx-profiler", ".jfr"); + + profiler.execute(String.format("start,cpu=1ms,ctxstorage=profiler,jfr,file=%s", jfrFile.toAbsolutePath())); + profilerStarted = true; + + // Clear any previous context + profiler.setContext(0, 0); + + // Write context + long spanId = 0x9999888877776666L; + long rootSpanId = 0x1111222233334444L; + profiler.setContext(spanId, rootSpanId); + + // Verify context is readable from TLS + ThreadContext ctx = profiler.getThreadContext(); + assertEquals(spanId, ctx.getSpanId(), "SpanId should match"); + assertEquals(rootSpanId, ctx.getRootSpanId(), "RootSpanId should match"); + } + + /** + * Tests that OTEL storage mode starts successfully and creates a discoverable buffer on Linux. + * The OTEL mode creates a named mmap region that external profilers can find. + */ + @Test + public void testOtelStorageModeStartsOnLinux() throws Exception { + Assumptions.assumeTrue(Platform.isLinux(), "OTEL storage mode only fully supported on Linux"); + + Path jfrFile = Files.createTempFile("otel-ctx-otel", ".jfr"); + + // Start profiler with OTEL context storage mode - should not throw + profiler.execute(String.format("start,cpu=1ms,ctxstorage=otel,jfr,file=%s", jfrFile.toAbsolutePath())); + profilerStarted = true; + + // Set context - this writes to the OTEL buffer + long spanId = 0xAAAABBBBCCCCDDDDL; + long rootSpanId = 0x1111222233334444L; + profiler.setContext(spanId, rootSpanId); + + // Verify context can be read back via getThreadContext() (routes through JNI in OTEL mode) + // This is the primary functional test - context must round-trip correctly + ThreadContext ctx = profiler.getThreadContext(); + assertEquals(spanId, ctx.getSpanId(), "SpanId should match in OTEL mode"); + assertEquals(rootSpanId, ctx.getRootSpanId(), "RootSpanId should match in OTEL mode"); + + // Verify mmap region naming in /proc/self/maps (informational) + // Note: PR_SET_VMA_ANON_NAME requires kernel 5.17+ and may not work in all environments + // The OTEL buffer still works for discovery via magic number scanning if naming fails + boolean hasNamedRegion = checkMapsContains("DD_OTEL_CTX"); + if (!hasNamedRegion) { + System.out.println("INFO: DD_OTEL_CTX mmap naming not available " + + "(requires kernel 5.17+ with PR_SET_VMA_ANON_NAME support)"); + } + } + + /** + * Tests that OTEL mode can be requested on any platform without crashing. + * On non-Linux systems, it falls back to profiler mode. + */ + @Test + public void testOtelModeStartsOnAnyPlatform() throws Exception { + Path jfrFile = Files.createTempFile("otel-ctx-any", ".jfr"); + + // Start profiler with OTEL context storage mode - should not throw on any platform + profiler.execute(String.format("start,cpu=1ms,ctxstorage=otel,jfr,file=%s", jfrFile.toAbsolutePath())); + profilerStarted = true; + + // Context operations should not crash + profiler.setContext(0x123L, 0x456L); + + // On all platforms, the profiler should be running + // (Context read verification is platform-specific due to TLS vs OTEL buffer) + } + + /** + * Checks if /proc/self/maps contains the specified string. + * Java 8 compatible implementation. + */ + private boolean checkMapsContains(String searchString) throws IOException { + Path mapsFile = Paths.get("/proc/self/maps"); + if (!Files.exists(mapsFile)) { + return false; + } + try (BufferedReader reader = Files.newBufferedReader(mapsFile, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + if (line.contains(searchString)) { + return true; + } + } + } + return false; + } +} diff --git a/ddprof-test/src/test/java/com/datadoghq/profiler/context/ProcessContextTest.java b/ddprof-test/src/test/java/com/datadoghq/profiler/context/ProcessContextTest.java index fac9421a1..4b9ada2f4 100644 --- a/ddprof-test/src/test/java/com/datadoghq/profiler/context/ProcessContextTest.java +++ b/ddprof-test/src/test/java/com/datadoghq/profiler/context/ProcessContextTest.java @@ -49,23 +49,38 @@ private static class OtelMappingInfo { } } + /** + * Finds the OTEL_CTX mapping in /proc/self/maps. + * Supports both memfd mappings (/memfd:OTEL_CTX) and named anonymous mappings ([anon:OTEL_CTX]). + */ private OtelMappingInfo findOtelMapping() throws IOException { Path mapsFile = Paths.get("/proc/self/maps"); if (!Files.exists(mapsFile)) { return null; } - - Pattern otelPattern = Pattern.compile("^([0-9a-f]+)-([0-9a-f]+)\\s+(\\S+)\\s+\\S+\\s+\\S+\\s+\\S+\\s*\\[anon:OTEL_CTX\\].*$"); - + + // Pattern for named anonymous mapping: [anon:OTEL_CTX] + Pattern anonPattern = Pattern.compile("^([0-9a-f]+)-([0-9a-f]+)\\s+(\\S+)\\s+\\S+\\s+\\S+\\s+\\S+\\s*\\[anon:OTEL_CTX\\].*$"); + // Pattern for memfd mapping: /memfd:OTEL_CTX (deleted) + Pattern memfdPattern = Pattern.compile("^([0-9a-f]+)-([0-9a-f]+)\\s+(\\S+)\\s+.*?/memfd:OTEL_CTX.*$"); + try (BufferedReader reader = Files.newBufferedReader(mapsFile)) { String line; while ((line = reader.readLine()) != null) { - Matcher matcher = otelPattern.matcher(line); - if (matcher.matches()) { + Matcher anonMatcher = anonPattern.matcher(line); + if (anonMatcher.matches()) { + return new OtelMappingInfo( + anonMatcher.group(1), + anonMatcher.group(2), + anonMatcher.group(3) + ); + } + Matcher memfdMatcher = memfdPattern.matcher(line); + if (memfdMatcher.matches()) { return new OtelMappingInfo( - matcher.group(1), - matcher.group(2), - matcher.group(3) + memfdMatcher.group(1), + memfdMatcher.group(2), + memfdMatcher.group(3) ); } } @@ -76,8 +91,10 @@ private OtelMappingInfo findOtelMapping() throws IOException { private void verifyMappingPermissions(OtelMappingInfo mapping) { assertTrue(mapping.permissions.contains("r"), "OTEL mapping should have read permission, got: " + mapping.permissions); - assertFalse(mapping.permissions.contains("w"), - "OTEL mapping should not have write permission, got: " + mapping.permissions); + // Per PR #34: mappings stay writable (rw-p or rw-s) for in-place updates + // Accept both read-only (old) and read-write (new) permissions + assertTrue(mapping.permissions.matches("r.-.") || mapping.permissions.matches("rw-."), + "OTEL mapping should have r--p, r--s, rw-p, or rw-s permissions, got: " + mapping.permissions); assertFalse(mapping.permissions.contains("x"), "OTEL mapping should not have execute permission, got: " + mapping.permissions); } @@ -107,4 +124,68 @@ public void testNativeReadBackFunctionality() { assertEquals(tracerVersion, readContext.telemetrySdkVersion, "Tracer version should match"); assertEquals("dd-trace-java", readContext.telemetrySdkName, "Tracer name should match"); } + + /** + * Tests that calling setProcessContext multiple times correctly updates the context. + * This verifies the v2 update protocol works correctly. + */ + @Test + public void testMultipleContextUpdates() { + Assumptions.assumeTrue(Platform.isLinux()); + + OTelContext context = OTelContext.getInstance(); + + // First context + context.setProcessContext("env1", "host1", "instance1", "service1", "1.0.0", "1.0.0"); + OTelContext.ProcessContext ctx1 = context.readProcessContext(); + assertNotNull(ctx1, "First context should be readable"); + assertEquals("env1", ctx1.deploymentEnvironmentName); + assertEquals("service1", ctx1.serviceName); + + // Update context + context.setProcessContext("env2", "host2", "instance2", "service2", "2.0.0", "2.0.0"); + OTelContext.ProcessContext ctx2 = context.readProcessContext(); + assertNotNull(ctx2, "Updated context should be readable"); + assertEquals("env2", ctx2.deploymentEnvironmentName); + assertEquals("service2", ctx2.serviceName); + assertEquals("2.0.0", ctx2.serviceVersion); + + // Update again + context.setProcessContext("env3", "host3", "instance3", "service3", "3.0.0", "3.0.0"); + OTelContext.ProcessContext ctx3 = context.readProcessContext(); + assertNotNull(ctx3, "Third context should be readable"); + assertEquals("env3", ctx3.deploymentEnvironmentName); + assertEquals("service3", ctx3.serviceName); + } + + /** + * Tests process context with TLS configuration. + * This verifies that TLS config is properly encoded into the process context. + */ + @Test + public void testProcessContextWithTlsConfig() throws IOException { + Assumptions.assumeTrue(Platform.isLinux()); + + OTelContext context = OTelContext.getInstance(); + + // Create TLS config with attribute key map + // New format: key names in index order (position = key index) + // Index 0 = "method", Index 1 = "route", Index 2 = "user" + String[] keyMap = {"method", "route", "user"}; + OTelContext.TlsConfig tlsConfig = new OTelContext.TlsConfig(512, keyMap); + + // Set process context with TLS config + context.setProcessContext("prod", "myhost", "instance-123", "myservice", "1.0.0", "3.5.0", tlsConfig); + + // Verify basic context is readable + OTelContext.ProcessContext ctx = context.readProcessContext(); + assertNotNull(ctx, "Context should be readable"); + assertEquals("prod", ctx.deploymentEnvironmentName); + assertEquals("myservice", ctx.serviceName); + + // Verify mapping exists (TLS config is encoded in payload but not read back) + OtelMappingInfo mapping = findOtelMapping(); + assertNotNull(mapping, "OTEL mapping should exist with TLS config"); + verifyMappingPermissions(mapping); + } } diff --git a/doc/OTelContextReference.md b/doc/OTelContextReference.md new file mode 100644 index 000000000..69cf130db --- /dev/null +++ b/doc/OTelContextReference.md @@ -0,0 +1,256 @@ +# OpenTelemetry Context Reference Implementation + +## Overview + +The reference implementation for the OpenTelemetry context sharing specification is maintained in the `ctx-sharing-demo` repository. This document provides quick instructions for setting up and using the reference implementation to validate Java profiler integration. + +## Repository Location + +The reference implementation is located at: +``` +~/dd/ctx-sharing-demo +``` + +## Quick Start + +### 1. Build the Reference Implementation + +```bash +cd ~/dd/ctx-sharing-demo +cargo build --release +``` + +This builds: +- `context-reader`: Tools for reading and validating OTEL context from running processes +- `custom-labels`: Rust library implementing the OTEL context specification +- `simple-writer`: Minimal C implementations for testing + +### 2. Validate Java Process Context + +Start a Java application that publishes OTEL context (e.g., demo-java): + +```bash +# In one terminal - run your Java app with OTEL context +java -javaagent:ddprof.jar -jar your-app.jar +``` + +In another terminal, validate the context: + +```bash +cd ~/dd/ctx-sharing-demo/context-reader + +# Find your Java process +jps -l + +# Validate context reading (replace with actual PID) +sudo ./target/release/validate +``` + +Expected output on success: +``` +VALIDATE OK: [v2] thread=12345, labels=[trace_id=..., span_id=..., ...] +``` + +### 3. Read Context Continuously + +To continuously read and display context from a running process: + +```bash +sudo ./target/release/tail +``` + +This will sample the process periodically and display any active tracing context. + +## Key Specification Files + +### Process Context Format + +**Header Structure** (`custom-labels/src/process_context/model.rs`): +```rust +pub const SIGNATURE: &[u8; 8] = b"OTEL_CTX"; +pub const PROCESS_CTX_VERSION: u32 = 2; +``` + +**Discovery** (`custom-labels/src/process_context/reader.rs`): +- Memfd: `/memfd:OTEL_CTX` in `/proc//maps` +- Anonymous: `[anon:OTEL_CTX]` in `/proc//maps` +- Signature scan: Search for `OTEL_CTX` signature bytes + +**Payload Encoding** (`custom-labels/src/process_context/encoding.rs`): +- Protobuf `opentelemetry.proto.resource.v1.Resource` message +- Required fields for TLS config: + - `threadlocal.schema_version` = `"tlsdesc_v1_dev"` (String) + - `threadlocal.max_record_size` = int64 + - `threadlocal.attribute_key_map` = Array of strings (position = key index) + +### TLS Record Format (V2) + +**Schema** (`custom-labels/src/customlabels_v2.h`): +```c +typedef struct { + uint8_t trace_id[16]; // bytes 0-15 (network order) + uint8_t span_id[8]; // bytes 16-23 (network order) + uint8_t valid; // byte 24 (non-zero if valid) + uint8_t _padding; // byte 25 (padding) + uint16_t attrs_data_size; // bytes 26-27 (little-endian) + uint8_t attrs_data[]; // bytes 28+ (attributes) +} custom_labels_v2_tl_record_t; +``` + +**Header size**: 28 bytes + +**Discovery Symbol** (`context-reader/src/v2_reader.rs`): +``` +custom_labels_current_set_v2 +``` + +Thread-local pointer to the current V2 record, or NULL if no context is set. + +## Simple C Writer Example + +For quick prototyping, see `simple-writer/process_context.c`: + +```bash +cd ~/dd/ctx-sharing-demo/simple-writer +make +./writer_v2 # Publishes process context and waits +``` + +In another terminal: +```bash +# Read the context back +./reader_v2 +``` + +## Key Implementation Points + +### Process Context + +1. **Mapping Permissions**: Use `rw-p` (anonymous) or `rw-s` (memfd) + - Do NOT use `mprotect()` to make read-only + - Writable mappings allow in-place updates (PR #34) + +2. **Mapping Size**: 1 page (new) or 2 pages (old, deprecated) + ```c + long page_size = sysconf(_SC_PAGESIZE); + size_t mapping_size = page_size; // Use 1 page + ``` + +3. **Update Protocol** (PR #34): + - Set `published_at_ns = 0` (signals update in progress) + - Memory fence + - Update payload pointer and size + - Memory fence + - Set `published_at_ns = ` (signals complete) + +### TLS Records + +1. **Trace/Span IDs**: Network byte order (big-endian) + ```c + // Write as big-endian bytes + record->trace_id[0] = (trace_id >> 56) & 0xFF; + record->trace_id[1] = (trace_id >> 48) & 0xFF; + // ... etc + ``` + +2. **Attributes**: `[key_index:1][length:1][value:length]` format + - Key index references position in `attribute_key_map` array + - Length and value follow immediately + +3. **Thread Safety**: Use atomic operations for `valid` flag + ```c + record->valid = 0; // Clear first + __atomic_thread_fence(__ATOMIC_SEQ_CST); + // ... write data ... + __atomic_thread_fence(__ATOMIC_SEQ_CST); + record->valid = 1; // Set last + ``` + +## Validation Modes + +### Ptrace Mode (Default) + +Attaches to the process using ptrace to read TLS: +```bash +sudo ./target/release/validate +``` + +More compatible but higher overhead. + +### eBPF Mode + +Uses eBPF probes for lower overhead: +```bash +sudo ./target/release/validate --mode ebpf +``` + +Requires kernel 5.2+ with BTF support. + +## Common Issues + +### "No process-context found" + +**Cause**: Process context mapping not discoverable + +**Check**: +```bash +cat /proc//maps | grep -E "OTEL_CTX|rw.s.*memfd" +``` + +**Expected**: Should see `/memfd:OTEL_CTX` or `[anon:OTEL_CTX]` mapping + +**Fix**: Ensure `otel_process_ctx_publish()` is called with valid TLS config + +### "No TLS readers could be initialized" + +**Cause**: Neither V1 nor V2 TLS symbols found in process + +**Check**: +```bash +nm -D /path/to/libjavaProfiler.so | grep custom_labels +``` + +**Expected**: Should see `custom_labels_current_set_v2` + +**Fix**: Ensure the profiler library is loaded and exports the TLS symbol + +### "Invalid signature" + +**Cause**: Signature mismatch or wrong struct layout + +**Check**: Verify header structure matches exactly: +- Signature at offset 0 (8 bytes = "OTEL_CTX") +- Version at offset 8 (4 bytes = 2) +- Use `__attribute__((packed))` in C/C++ + +### Permission Errors + +The validator requires root or CAP_SYS_PTRACE to read from other processes: + +```bash +# Option 1: Run as root +sudo ./target/release/validate + +# Option 2: Grant capabilities +sudo setcap cap_sys_ptrace+ep ./target/release/validate +./target/release/validate +``` + +## Additional Resources + +- **Specification Changes**: Check git log in `custom-labels/src/process_context/` +- **Test Cases**: See `custom-labels/src/process_context/tests.rs` and `custom-labels/src/v2/tests.rs` +- **Integration Examples**: + - Java: `demo-java/` + - C: `simple-writer/` + - Rust: `custom-labels/examples/` + +## Updating to Latest Specification + +```bash +cd ~/dd/ctx-sharing-demo +git pull +cargo build --release +``` + +Always rebuild both the reference implementation and the Java profiler after specification updates to ensure compatibility. diff --git a/doc/architecture/OtelContextStorage.md b/doc/architecture/OtelContextStorage.md new file mode 100644 index 000000000..a893010bc --- /dev/null +++ b/doc/architecture/OtelContextStorage.md @@ -0,0 +1,557 @@ +# OTEL-Compatible Context Storage Architecture + +## Overview + +The OTEL Context Storage system provides two distinct context sharing mechanisms: + +1. **Thread-Level Context**: Ring buffer storage for per-thread trace/span context (existing implementation) +2. **Process-Level Context**: Service metadata shared via memory-mapped regions (v2 specification compliant) + +This document covers both mechanisms. The process-level context follows the [OpenTelemetry Process Context v2 specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/oteps/profiles/4719-process-ctx.md). + +## Process Context (v2 Specification) + +### Header Structure + +The process context uses a memory-mapped region with the following v2-compliant header: + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Process Context Header (v2) │ +├─────────────────────────────────────────────────────────────────────────┤ +│ Offset │ Size │ Field │ Description │ +├─────────┼───────┼──────────────────┼────────────────────────────────────┤ +│ 0x00 │ 8 │ signature │ "OTEL_CTX" (written last) │ +│ 0x08 │ 4 │ version │ Protocol version = 2 │ +│ 0x0C │ 4 │ payload_size │ Size of protobuf payload │ +│ 0x10 │ 8 │ published_at_ns │ Timestamp (0 = update in progress) │ +│ 0x18 │ 8 │ payload │ Pointer to protobuf data │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### Payload Format + +The payload is encoded as a Protocol Buffers message following `opentelemetry.proto.resource.v1.Resource`: + +```protobuf +message Resource { + repeated KeyValue attributes = 1; // Service metadata +} + +message KeyValue { + string key = 1; + AnyValue value = 2; +} + +message AnyValue { + oneof value { + string string_value = 1; + // ... other types + } +} +``` + +### Memory Allocation Strategy + +Per v2 spec, the implementation prefers `memfd_create` with fallback to anonymous mmap: + +1. **memfd_create** (preferred): Creates `/memfd:OTEL_CTX` visible in `/proc/pid/maps` +2. **Anonymous mmap** (fallback): Creates `[anon:OTEL_CTX]` via `prctl(PR_SET_VMA_ANON_NAME)` + +Both methods apply `MADV_DONTFORK` to prevent context inheritance in child processes. + +### Publication Protocol + +1. Encode payload as protobuf Resource message +2. Create memory mapping (memfd or anonymous) +3. Apply `MADV_DONTFORK` +4. Write header fields (version=2, payload_size, payload pointer) +5. Memory barrier +6. Write signature "OTEL_CTX" (last) +7. Memory barrier +8. Set `published_at_ns` to current timestamp +9. Name mapping via `prctl` (for anonymous) or rely on memfd name + +**Note:** Per PR #34, mappings remain writable (rw-p or rw-s) to allow in-place updates. The mprotect to read-only has been removed. + +### Update Protocol + +Per v2 spec, updates use atomic timestamp signaling: + +1. Write `0` to `published_at_ns` (signals update in progress) +2. Memory barrier +3. Update payload and payload_size +4. Memory barrier +5. Write new timestamp to `published_at_ns` + +### Reading Protocol + +External profilers read the context by: + +1. Scan `/proc//maps` for `[anon:OTEL_CTX]` or `/memfd:OTEL_CTX` +2. Validate signature = "OTEL_CTX" +3. Check version = 2 +4. Read `published_at_ns` (if 0, update in progress - retry) +5. Read payload bytes +6. Re-read `published_at_ns` (if changed, data inconsistent - retry) +7. Decode protobuf payload + +--- + +## Thread-Level Context Storage + +The thread-level context system uses a feature-flagged approach where the storage mode is selected at profiler startup: +- **profiler mode**: Uses the existing TLS-based storage with checksum validation +- **otel mode** (default): Uses an OTEL-compatible ring buffer storage discoverable via `/proc//maps` + +## Core Design Principles + +1. **Feature-Flagged Storage**: Storage mode selected at startup, not runtime switchable +2. **External Discoverability**: OTEL buffer is discoverable by external profilers via named mmap regions +3. **Signal Handler Safety**: Both modes support safe reads from signal handlers +4. **Unified API**: `ContextApi` abstracts storage mode from callers +5. **Backward Compatibility**: Default behavior unchanged, OTEL mode is opt-in +6. **Platform Awareness**: OTEL mode fully supported on Linux, graceful fallback elsewhere + +## Architecture Overview + +### High-Level Component Diagram + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Java Layer │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ JavaProfiler.execute("start,cpu=1ms,ctxstorage=otel,...") │ +│ │ │ +│ ▼ │ +│ ThreadContext.put(spanId, rootSpanId) │ +│ │ │ +│ ▼ │ +│ JNI: setContext0(spanId, rootSpanId) │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ Native Layer │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ContextApi (Unified Interface) │ +│ │ │ +│ ├─ initialize(args) → Select mode based on ctxstorage option │ +│ ├─ set(spanId, rootSpanId) → Route to appropriate storage │ +│ ├─ get(spanId, rootSpanId) → Read from appropriate storage │ +│ └─ getByTid(tid, ...) → Read by thread ID (OTEL mode only) │ +│ │ │ +│ ├─────────────────────────┬─────────────────────────────────────┤ +│ ▼ ▼ │ +│ ┌─────────────────────┐ ┌─────────────────────────────────────────┐ │ +│ │ PROFILER Mode │ │ OTEL Mode │ │ +│ │ (TLS Storage) │ │ (Ring Buffer Storage) │ │ +│ │ │ │ │ │ +│ │ Context struct │ │ OtelContextBuffer (mmap) │ │ +│ │ ├─ spanId │ │ ├─ Header (magic, version, capacity) │ │ +│ │ ├─ rootSpanId │ │ └─ Slots[capacity] │ │ +│ │ ├─ checksum │ │ ├─ trace_id_high │ │ +│ │ └─ tags[10] │ │ ├─ trace_id_low │ │ +│ │ │ │ ├─ span_id │ │ +│ │ Torn-read safety: │ │ └─ in_use flag │ │ +│ │ Checksum protocol │ │ │ │ +│ └─────────────────────┘ │ Torn-read safety: in_use flag │ │ +│ │ │ │ +│ │ Discovery: /proc//maps │ │ +│ │ → [anon:DD_OTEL_CTX] │ │ +│ └─────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────────────┐ +│ External Profiler (DDProf) │ +├─────────────────────────────────────────────────────────────────────────┤ +│ │ +│ 1. Parse /proc//maps │ +│ 2. Find region named [anon:DD_OTEL_CTX] │ +│ 3. Validate header (magic=0x4F54454C, version=1) │ +│ 4. Read slot by TID: buffer->slots[tid % capacity] │ +│ 5. Check in_use flag for torn-read safety │ +│ │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### Storage Mode Selection Flow + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ Profiler Startup │ +└─────────────────────────────────────────────────────────────────────────┘ + │ + ▼ + ┌───────────────────────────────┐ + │ Parse ctxstorage option │ + │ (default: otel) │ + └───────────────────────────────┘ + │ + ┌───────────────┴───────────────┐ + ▼ ▼ + ┌───────────────────┐ ┌───────────────────┐ + │ ctxstorage=profiler│ │ ctxstorage=otel │ + └───────────────────┘ └───────────────────┘ + │ │ + ▼ ▼ + ┌───────────────────┐ ┌───────────────────┐ + │ Use existing TLS │ │ Create mmap buffer│ + │ (no extra init) │ │ with prctl naming │ + └───────────────────┘ └───────────────────┘ + │ │ + │ ┌───────────┴───────────┐ + │ ▼ ▼ + │ ┌───────────────────┐ ┌───────────────────┐ + │ │ mmap succeeded │ │ mmap failed │ + │ └───────────────────┘ └───────────────────┘ + │ │ │ + │ ▼ ▼ + │ ┌───────────────────┐ ┌───────────────────┐ + │ │ OTEL mode active │ │ Fallback to │ + │ └───────────────────┘ │ profiler mode │ + │ └───────────────────┘ + │ │ + └───────────────┬───────────────┘ + ▼ + ┌───────────────────────────────┐ + │ ContextApi ready for use │ + └───────────────────────────────┘ +``` + +## OTEL Ring Buffer Design + +### Memory Layout + +The OTEL buffer is a contiguous mmap region with a header followed by slot array: + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ OtelContextBuffer Layout │ +├─────────────────────────────────────────────────────────────────────────┤ +│ Offset │ Size │ Field │ Description │ +├────────────┼────────┼───────────────┼───────────────────────────────────┤ +│ 0x00 │ 4 │ magic │ 0x4F54454C ("OTEL" in ASCII) │ +│ 0x04 │ 4 │ version │ Protocol version (currently 1) │ +│ 0x08 │ 4 │ capacity │ Number of slots │ +│ 0x0C │ 4 │ slot_size │ sizeof(OtelContextSlot) = 32 │ +│ 0x10 │ 16 │ reserved │ Future use (padding to 32 bytes) │ +├────────────┼────────┼───────────────┼───────────────────────────────────┤ +│ 0x20 │ 32 │ slots[0] │ First context slot │ +│ 0x40 │ 32 │ slots[1] │ Second context slot │ +│ ... │ ... │ ... │ ... │ +│ N*32+0x20 │ 32 │ slots[N-1] │ Last context slot │ +└─────────────────────────────────────────────────────────────────────────┘ + +Total size: 32 (header) + 32 * capacity bytes +Default capacity: 65536 slots = 2MB + 32 bytes +``` + +### Slot Structure + +Each slot is 32 bytes, aligned to prevent false sharing between adjacent slots: + +``` +┌─────────────────────────────────────────────────────────────────────────┐ +│ OtelContextSlot (32 bytes) │ +├─────────────────────────────────────────────────────────────────────────┤ +│ Offset │ Size │ Field │ Description │ +├─────────┼──────┼────────────────┼───────────────────────────────────────┤ +│ 0x00 │ 8 │ trace_id_high │ Upper 64 bits of 128-bit trace ID │ +│ 0x08 │ 8 │ trace_id_low │ Lower 64 bits (maps to rootSpanId) │ +│ 0x10 │ 8 │ span_id │ 64-bit span ID │ +│ 0x18 │ 1 │ in_use │ 1 = write in progress, 0 = valid │ +│ 0x19 │ 7 │ padding │ Alignment to 32 bytes │ +└─────────────────────────────────────────────────────────────────────────┘ +``` + +### TID-to-Slot Mapping + +Slots are indexed by thread ID using simple modulo hashing: + +```cpp +slot_index = tid % capacity +slot_ptr = &buffer->slots[slot_index] +``` + +**Collision Handling**: With 65536 slots, TID collisions are rare. When they occur: +- Two threads with `tid1 % 65536 == tid2 % 65536` share a slot +- The `in_use` flag prevents torn reads but context may be from either thread +- This is acceptable for profiling (low probability, bounded impact) + +## Torn-Read Protection + +### OTEL Mode: in_use Flag Protocol + +The `in_use` flag provides torn-read safety using acquire/release semantics: + +**Writer (application thread):** +```cpp +void OtelContexts::set(u64 trace_id_high, u64 trace_id_low, u64 span_id) { + OtelContextSlot* slot = getSlot(OS::threadId()); + + // 1. Mark write in progress (release semantics) + __atomic_store_n(&slot->in_use, 1, __ATOMIC_RELEASE); + + // 2. Write data fields (relaxed - ordering from in_use barriers) + __atomic_store_n(&slot->trace_id_high, trace_id_high, __ATOMIC_RELAXED); + __atomic_store_n(&slot->trace_id_low, trace_id_low, __ATOMIC_RELAXED); + __atomic_store_n(&slot->span_id, span_id, __ATOMIC_RELAXED); + + // 3. Mark write complete (release semantics) + __atomic_store_n(&slot->in_use, 0, __ATOMIC_RELEASE); +} +``` + +**Reader (signal handler or external profiler):** +```cpp +bool OtelContexts::getByTid(int tid, u64& trace_high, u64& trace_low, u64& span) { + OtelContextSlot* slot = getSlot(tid); + + // 1. Check if write in progress (acquire semantics) + if (__atomic_load_n(&slot->in_use, __ATOMIC_ACQUIRE)) { + return false; // Write in progress, skip this sample + } + + // 2. Read data fields (relaxed - ordering from in_use acquire) + trace_high = __atomic_load_n(&slot->trace_id_high, __ATOMIC_RELAXED); + trace_low = __atomic_load_n(&slot->trace_id_low, __ATOMIC_RELAXED); + span = __atomic_load_n(&slot->span_id, __ATOMIC_RELAXED); + + // 3. Double-check (acquire semantics) + if (__atomic_load_n(&slot->in_use, __ATOMIC_ACQUIRE)) { + return false; // Write started during read, discard + } + + return true; +} +``` + +### Profiler Mode: Checksum Protocol + +The existing TLS mode uses a checksum for torn-read detection (see TLSContext.md for details): + +```cpp +// Writer +__atomic_store_n(&ctx.checksum, 0ULL, __ATOMIC_RELEASE); // Invalidate +__atomic_store_n(&ctx.spanId, span_id, __ATOMIC_RELAXED); +__atomic_store_n(&ctx.rootSpanId, root_span_id, __ATOMIC_RELAXED); +__atomic_store_n(&ctx.checksum, computed_checksum, __ATOMIC_RELEASE); + +// Reader +u64 checksum1 = __atomic_load_n(&ctx.checksum, __ATOMIC_ACQUIRE); +u64 span = __atomic_load_n(&ctx.spanId, __ATOMIC_RELAXED); +u64 root = __atomic_load_n(&ctx.rootSpanId, __ATOMIC_RELAXED); +bool valid = (checksum1 != 0) && (checksum1 == Contexts::checksum(span, root)); +``` + +## External Discovery Mechanism + +### Linux: Named Anonymous Mappings + +On Linux 5.17+, the mmap region is named using `prctl(PR_SET_VMA_ANON_NAME)`: + +```cpp +bool OtelContexts::initialize(size_t capacity) { + size_t size = sizeof(OtelContextHeader) + capacity * sizeof(OtelContextSlot); + + // Create anonymous mapping + void* ptr = mmap(nullptr, size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + + // Name the region for discovery (Linux 5.17+ with CONFIG_ANON_VMA_NAME) + prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ptr, size, "DD_OTEL_CTX"); + + // Initialize header + buffer->magic = 0x4F54454C; // "OTEL" + buffer->version = 1; + buffer->capacity = capacity; + buffer->slot_size = sizeof(OtelContextSlot); + + return true; +} +``` + +**External profiler discovery:** +```bash +# Find the OTEL context buffer in target process +grep "DD_OTEL_CTX" /proc//maps +# Output: 7f1234560000-7f1234760000 rw-p 00000000 00:00 0 [anon:DD_OTEL_CTX] +``` + +### Fallback: Magic Number Scanning + +If `prctl` naming is unavailable (older kernels, Docker/LinuxKit), external profilers can scan anonymous regions for the magic number: + +```cpp +// External profiler pseudocode +for (region in parse_proc_maps(pid)) { + if (region.is_anonymous && region.is_rw) { + u32 magic = read_u32(region.start); + if (magic == 0x4F54454C) { // "OTEL" + // Validate header + OtelContextHeader* hdr = (OtelContextHeader*)region.start; + if (hdr->version == 1 && hdr->slot_size == 32) { + // Found valid OTEL context buffer + } + } + } +} +``` + +## API Reference + +### ContextApi (Unified Interface) + +```cpp +// context_api.h + +enum ContextStorageMode { + CTX_STORAGE_PROFILER = 0, // TLS-based storage + CTX_STORAGE_OTEL = 1 // OTEL ring buffer storage (default) +}; + +class ContextApi { +public: + // Lifecycle (single-threaded, called from Profiler::start/stop) + static bool initialize(const Arguments& args); + static void shutdown(); + static bool isInitialized(); + static ContextStorageMode getMode(); + + // Context operations (thread-safe, signal-safe) + static void set(u64 span_id, u64 root_span_id); + static void setOtel(u64 trace_id_high, u64 trace_id_low, u64 span_id); + static bool get(u64& span_id, u64& root_span_id); + static bool getByTid(int tid, u64& span_id, u64& root_span_id); + static void clear(); +}; +``` + +### OtelContexts (OTEL-Specific Implementation) + +```cpp +// otel_context.h + +class OtelContexts { +public: + // Lifecycle + static bool initialize(size_t capacity = 65536); + static void shutdown(); + static bool isInitialized(); + + // Context operations + static void set(u64 trace_id_high, u64 trace_id_low, u64 span_id); + static bool get(u64& trace_id_high, u64& trace_id_low, u64& span_id); + static bool getByTid(int tid, u64& trace_id_high, u64& trace_id_low, u64& span_id); +}; +``` + +### Java API + +```java +// ThreadContext.java + +public class ThreadContext { + // Set context (routes through ContextApi) + public long put(long spanId, long rootSpanId); + + // Get context (mode-aware) + public long getSpanId(); + public long getRootSpanId(); + + // Check storage mode + public static boolean isOtelMode(); +} +``` + +## Configuration + +### Profiler Options + +| Option | Values | Default | Description | +|--------|--------|---------|-------------| +| `ctxstorage` | `profiler`, `otel` | `otel` | Context storage mode | + +### Usage Examples + +```bash +# Default (OTEL mode) +java -agentpath:libjavaProfiler.so=start,cpu=1ms,jfr,file=profile.jfr ... + +# Explicit profiler mode +java -agentpath:libjavaProfiler.so=start,cpu=1ms,ctxstorage=profiler,jfr,file=profile.jfr ... +``` + +```java +// Programmatic API (default OTEL mode) +JavaProfiler profiler = JavaProfiler.getInstance(); +profiler.execute("start,cpu=1ms,jfr,file=profile.jfr"); + +// Check mode +if (ThreadContext.isOtelMode()) { + System.out.println("OTEL context storage active"); +} + +// Explicitly use profiler mode +profiler.execute("start,cpu=1ms,ctxstorage=profiler,jfr,file=profile.jfr"); +``` + +## Platform Support + +| Platform | Profiler Mode | OTEL Mode | Notes | +|----------|---------------|-----------|-------| +| Linux x64 | ✓ | ✓ | Full support | +| Linux arm64 | ✓ | ✓ | Full support | +| Linux (musl) | ✓ | ✓ | Full support | +| macOS arm64 | ✓ | ✓* | *mmap naming unavailable | +| macOS x64 | ✓ | ✓* | *mmap naming unavailable | + +**Note**: On macOS, OTEL mode works but the mmap region cannot be named. External profilers must use magic number scanning for discovery. + +## Performance Characteristics + +| Operation | Profiler Mode | OTEL Mode | Notes | +|-----------|---------------|-----------|-------| +| Context write | ~10-20ns | ~15-25ns | OTEL slightly slower (TID lookup) | +| Context read (own thread) | ~5-10ns | ~10-15ns | OTEL has slot lookup overhead | +| Context read (by TID) | N/A | ~10-15ns | Only available in OTEL mode | +| Memory overhead | ~64 bytes/thread | ~2MB fixed | OTEL uses fixed-size buffer | + +## File Structure + +``` +ddprof-lib/src/main/cpp/ +├── context.h # Existing TLS context (profiler mode) +├── context.cpp +├── context_api.h # NEW: Unified context abstraction +├── context_api.cpp +├── otel_context.h # NEW: OTEL ring buffer implementation +├── otel_context.cpp +├── arguments.h # Modified: ctxstorage option +├── arguments.cpp +├── profiler.cpp # Modified: ContextApi initialization +├── javaApi.cpp # Modified: JNI routing through ContextApi +└── wallClock.cpp # Modified: Uses ContextApi + +ddprof-lib/src/main/java/com/datadoghq/profiler/ +├── ThreadContext.java # Modified: isOtelMode(), mode-aware getters + +ddprof-test/src/test/java/com/datadoghq/profiler/context/ +└── OtelContextStorageModeTest.java # NEW: OTEL mode tests +``` + +## Future Considerations + +1. **Full 128-bit Trace ID**: Currently `trace_id_high` is unused (set to 0). Future integration with OTEL tracers may populate the full 128-bit trace ID. + +2. **Tags Support in OTEL Mode**: The current OTEL mode does not support custom tags. This could be added by extending the slot structure. + +3. **Shared Buffer Discovery**: The named mmap region could be made `MAP_SHARED` to allow in-process discovery without `/proc` parsing. + +4. **Dynamic Capacity**: Currently capacity is fixed at initialization. Dynamic resizing could be added for long-running applications with many threads. diff --git a/utils/run-docker-tests.sh b/utils/run-docker-tests.sh index cb185223b..5591b51a5 100755 --- a/utils/run-docker-tests.sh +++ b/utils/run-docker-tests.sh @@ -252,7 +252,13 @@ RUN mkdir -p /gradle-cache WORKDIR /workspace EOF else - cat > "$DOCKERFILE_DIR/Dockerfile.base" <<'EOF' + # libclang-rt-dev is only available on x64, not arm64 + if [[ "$ARCH" == "x64" ]]; then + CLANG_RT_PKG="libclang-rt-dev" + else + CLANG_RT_PKG="" + fi + cat > "$DOCKERFILE_DIR/Dockerfile.base" <