Commit b00b75f

Authored by Artem Kuzmitckii; co-authored by Olatunji Ruwase and Logan Adams.
[AMD][ROCm] Improve support of AMD (#7448)
The patch delivers several fixes for build issues in the CUDA part of the DeepSpeed library. The percentage of passing unit tests improved (tested on RDNA hardware, gfx110x and gfx12x):

- Before: collected 5298 items / 15 skipped; 2773 failed, 862 passed, 1665 skipped, 13 errors
- After: collected 5851 items / 11 skipped; 4187 failed, 1373 passed, 292 skipped, 10 errors

Regarding testing of **fp_quantizer** (`DS_BUILD_FP_QUANTIZER`) via `tests/unit/ops/fp_quantizer/test_fp_quant.py`: this test depends on QPyTorch, which must be patched before running on AMD; please apply Tiiiger/QPyTorch#71.

---------

Signed-off-by: Artem Kuzmitckii <artem.kuzmitckii@amd.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
1 parent 0e77020 · commit b00b75f

9 files changed

Lines changed: 135 additions & 8 deletions

csrc/fp_quantizer/fp_quantize.cu

Lines changed: 1 addition & 1 deletion
```diff
@@ -4,7 +4,7 @@
 // DeepSpeed Team

 #include <stdexcept>
-#include "context.h"
+#include "fp_context.h"
 #include "fp_quantize.h"
 #include "memory_access_utils.h"
 #include "reduction_utils.h"
```

csrc/fp_quantizer/includes/fp_quantize.h

Lines changed: 10 additions & 2 deletions
```diff
@@ -9,10 +9,18 @@
 #include <stdint.h>

 #include <cuda_fp16.h>
-
-#ifdef BF16_AVAILABLE
+// Note: BF16 support on AMD but we have to exclude here cuda_bf16.h (which turn to
+// <hip/hip_bfloat16.h> after hipifying), because this header is pulled into .cpp translation units
+// that are compiled by a host-only compiler, which triggers build errors. Added forward declaration
+// instead, see code block below
+#if defined(BF16_AVAILABLE)
+#if !defined(__HIP_PLATFORM_AMD__)
 #include <cuda_bf16.h>
+#else
+struct __hip_bfloat16;
+#endif
 #endif
+
 #include <cuda_runtime_api.h>
 #include <stdio.h>
```
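
The same forward-declaration approach is used in `csrc/includes/ds_kernel_utils.h` below. It works because host-compiled `.cpp` translation units only ever touch `__hip_bfloat16` through pointers and references, so the incomplete type is enough. A minimal sketch of the idea (the function names are illustrative, not taken from the DeepSpeed sources):

```cpp
// Forward declaration only; <hip/hip_bfloat16.h> (or <cuda_bf16.h>) is never
// included in this host-only translation unit.
struct __hip_bfloat16;

// Declared in a shared header; defined in a .cu/.hip file that does include
// the full bfloat16 header and is compiled by the device compiler.
void launch_quantize_bf16(__hip_bfloat16* data, int num_elems);

// A host-compiled .cpp caller can pass the incomplete type around by pointer
// without ever needing its definition (or the GPU builtins it drags in).
void quantize_from_host(__hip_bfloat16* device_buffer, int num_elems)
{
    launch_quantize_bf16(device_buffer, num_elems);
}
```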

csrc/includes/conversion_utils.h

Lines changed: 68 additions & 0 deletions
```diff
@@ -363,42 +363,74 @@ DS_D_INLINE __nv_bfloat16 to(float val)
 template <>
 DS_D_INLINE __nv_bfloat16 to(int64_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __double2bfloat16(__ll2double_rn(val));
+#else
     return __ll2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(int32_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__int2float_rn(val));
+#else
     return __int2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(int16_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__int2float_rn(val));
+#else
     return __short2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(int8_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__int2float_rn(val));
+#else
     return __int2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(uint64_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __double2bfloat16(__ull2double_rn(val));
+#else
     return __ull2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(uint32_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__uint2float_rn(val));
+#else
     return __uint2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(uint16_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__uint2float_rn(val));
+#else
     return __ushort2bfloat16_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat16 to(uint8_t val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2bfloat16(__uint2float_rn(val));
+#else
     return __uint2bfloat16_rn(val);
+#endif
 }
 #endif

@@ -412,7 +444,11 @@ DS_D_INLINE __nv_bfloat162 to(float2 val)
 template <>
 DS_D_INLINE __nv_bfloat162 to(float val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __bfloat162bfloat162(__float2bfloat16(val));
+#else
     return __float2bfloat162_rn(val);
+#endif
 }
 template <>
 DS_D_INLINE __nv_bfloat162 to(__half2 val)
@@ -444,7 +480,11 @@ DS_D_INLINE int64_t to(__half val)
 template <>
 DS_D_INLINE int64_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2ll_rn(__bfloat162float(val));
+#else
     return __bfloat162ll_rn(val);
+#endif
 }
 #endif

@@ -471,7 +511,11 @@ DS_D_INLINE int32_t to(__half val)
 template <>
 DS_D_INLINE int32_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2int_rn(__bfloat162float(val));
+#else
     return __bfloat162int_rn(val);
+#endif
 }
 #endif

@@ -498,7 +542,11 @@ DS_D_INLINE int16_t to(__half val)
 template <>
 DS_D_INLINE int16_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2int_rn(__bfloat162float(val));
+#else
     return __bfloat162int_rn(val);
+#endif
 }
 #endif

@@ -525,7 +573,11 @@ DS_D_INLINE int8_t to(__half val)
 template <>
 DS_D_INLINE int8_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2int_rn(__bfloat162float(val));
+#else
     return __bfloat162int_rn(val);
+#endif
 }
 #endif

@@ -552,7 +604,11 @@ DS_D_INLINE uint64_t to(__half val)
 template <>
 DS_D_INLINE uint64_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2ull_rn(__bfloat162float(val));
+#else
     return __bfloat162ull_rn(val);
+#endif
 }
 #endif

@@ -579,7 +635,11 @@ DS_D_INLINE uint32_t to(__half val)
 template <>
 DS_D_INLINE uint32_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2uint_rn(__bfloat162float(val));
+#else
     return __bfloat162uint_rn(val);
+#endif
 }
 #endif

@@ -606,7 +666,11 @@ DS_D_INLINE uint16_t to(__half val)
 template <>
 DS_D_INLINE uint16_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2uint_rn(__bfloat162float(val));
+#else
     return __bfloat162uint_rn(val);
+#endif
 }
 #endif

@@ -633,7 +697,11 @@ DS_D_INLINE uint8_t to(__half val)
 template <>
 DS_D_INLINE uint8_t to(__nv_bfloat16 val)
 {
+#ifdef __HIP_PLATFORM_AMD__
+    return __float2uint_rn(__bfloat162float(val));
+#else
     return __bfloat162uint_rn(val);
+#endif
 }
 #endif
```
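
Callers are unaffected by these fallbacks: `conversion::to<>` keeps the same signature on CUDA and ROCm, and only the intrinsic behind it changes (on AMD the conversion routes through `float`, or `double` for 64-bit sources). A rough usage sketch, assuming a BF16-enabled build (`BF16_AVAILABLE` defined); the kernel itself is illustrative and not part of this patch:

```cpp
#include <cstdint>

#include "conversion_utils.h"

// Illustrative kernel: cast an int32 buffer to bfloat16 through the portable
// conversion::to<> entry point; the #ifdef blocks above pick the intrinsic path.
__global__ void cast_to_bf16(const int32_t* in, __nv_bfloat16* out, int n)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) { out[idx] = conversion::to<__nv_bfloat16>(in[idx]); }
}
```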

csrc/includes/ds_kernel_utils.h

Lines changed: 8 additions & 2 deletions
```diff
@@ -13,15 +13,21 @@ used throughout the codebase.
 #include <cuda.h>
 #include <cuda_fp16.h>

-#ifdef BF16_AVAILABLE
+// Note: BF16 support on AMD but we have to exclude here cuda_bf16.h (which turn to
+// <hip/hip_bfloat16.h> after hipifying), because this header is pulled into .cpp translation units
+// that are compiled by a host-only compiler, which triggers build errors. Added forward declaration
+// instead, see code block below
+#if defined(BF16_AVAILABLE) && !defined(__HIP_PLATFORM_AMD__)
 #include <cuda_bf16.h>
 #endif

 #define DS_HD_INLINE __host__ __device__ __forceinline__
 #define DS_D_INLINE __device__ __forceinline__

 #ifdef __HIP_PLATFORM_AMD__
-
+#if BF16_AVAILABLE
+struct __hip_bfloat16;
+#endif
 // constexpr variant of warpSize for templating
 constexpr int hw_warp_size = ROCM_WAVEFRONT_SIZE;
 #define HALF_PRECISION_AVAILABLE = 1
```

csrc/includes/reduction_utils.h

Lines changed: 34 additions & 1 deletion
```diff
@@ -9,6 +9,10 @@
 #include "ds_kernel_utils.h"
 #include "memory_access_utils.h"

+#if defined(BF16_AVAILABLE) && defined(__HIP_PLATFORM_AMD__)
+#include <hip/hip_bfloat16.h>
+#endif
+
 namespace cg = cooperative_groups;

 namespace reduce {
@@ -374,7 +378,11 @@ DS_D_INLINE __half init<ROpType::Max>()
 template <>
 DS_D_INLINE __nv_bfloat16 init<ROpType::Max>()
 {
+#ifdef __HIP_PLATFORM_AMD__
+    constexpr __hip_bfloat16_raw neg_inf = {0xFF80};
+#else
     constexpr __nv_bfloat16_raw neg_inf = {0xFF80};
+#endif
     return __nv_bfloat16(neg_inf);
 }
 #endif
@@ -573,6 +581,24 @@ DS_D_INLINE void _warp(cg::thread_block_tile<hw_warp_size>& warp, T* data)
     }
 }

+#if defined(__HIP_PLATFORM_AMD__)
+template <int reduce_width, typename T, ROpType... Ops>
+DS_D_INLINE void _warp_with_type_conversion(cg::thread_block_tile<hw_warp_size>& warp_arg, T* data)
+{
+    constexpr int elems = sizeof...(Ops);
+    if constexpr (!(std::is_integral<T>::value || std::is_floating_point<T>::value)) {
+        float temp_data[elems];
+#pragma unroll
+        for (int i = 0; i < elems; i++) { temp_data[i] = conversion::to<float>(data[i]); }
+        _warp<float, Ops...>(warp_arg, temp_data);
+#pragma unroll
+        for (int i = 0; i < elems; i++) { data[i] = conversion::to<T>(temp_data[i]); }
+    } else {
+        _warp<T, Ops...>(warp_arg, data);
+    }
+}
+#endif  // defined(__HIP_PLATFORM_AMD__)
+
 /*
 Implementation for primary block reduction that serves both `block` and
 `partitioned_block`.
@@ -600,7 +626,11 @@ DS_D_INLINE void _block(cg::thread_block& tb,
 #endif

     // Always perform warp-scope reduction
+#ifdef __HIP_PLATFORM_AMD__
+    _warp_with_type_conversion<hw_warp_size, T, Ops...>(warp_arg, data);
+#else
     _warp<T, Ops...>(warp_arg, data);
+#endif

     // If max_warps == 1 let's skip the runtime check
     if (total_warps != 1) {
@@ -624,8 +654,11 @@ DS_D_INLINE void _block(cg::thread_block& tb,
         } else {
             init<Ops...>(data);
         }
-
+#ifdef __HIP_PLATFORM_AMD__
+        _warp_with_type_conversion<total_warps, T, Ops...>(warp_arg, data);
+#else
         _warp<T, Ops..., total_warps>(warp_arg, data);
+#endif

 #pragma unroll
         for (int i = 0; i < elems; i++) {
```
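
The new `_warp_with_type_conversion` helper widens any non-arithmetic element type (such as the bfloat16 wrapper on ROCm, which the generic shuffle-based `_warp` path presumably cannot handle directly) to `float` around the reduction and narrows the result back, so the core reduction only ever operates on plain arithmetic types. A host-side analog of that dispatch, with hypothetical names, just to show the pattern in isolation:

```cpp
#include <cstdio>
#include <type_traits>

// Toy stand-in for a bfloat16-like wrapper: not an arithmetic type, but it
// converts to and from float, which is all the promotion path requires.
struct FakeBf16 {
    float v;
    FakeBf16() : v(0.f) {}
    explicit FakeBf16(float f) : v(f) {}
    explicit operator float() const { return v; }
};

// Core reduction that only handles plain arithmetic types (stand-in for _warp<>):
// broadcast the maximum to every slot.
template <typename T, int elems>
void arithmetic_only_max(T* data)
{
    T best = data[0];
    for (int i = 1; i < elems; i++) {
        if (data[i] > best) { best = data[i]; }
    }
    for (int i = 0; i < elems; i++) { data[i] = best; }
}

// Promotion wrapper mirroring _warp_with_type_conversion: non-arithmetic types
// are widened to float, reduced, then narrowed back; arithmetic types pass through.
template <typename T, int elems>
void reduce_with_promotion(T* data)
{
    if constexpr (!(std::is_integral<T>::value || std::is_floating_point<T>::value)) {
        float tmp[elems];
        for (int i = 0; i < elems; i++) { tmp[i] = static_cast<float>(data[i]); }
        arithmetic_only_max<float, elems>(tmp);
        for (int i = 0; i < elems; i++) { data[i] = static_cast<T>(tmp[i]); }
    } else {
        arithmetic_only_max<T, elems>(data);
    }
}

int main()
{
    FakeBf16 vals[4] = {FakeBf16(1.f), FakeBf16(7.f), FakeBf16(3.f), FakeBf16(5.f)};
    reduce_with_promotion<FakeBf16, 4>(vals);
    printf("max broadcast to all slots: %f\n", static_cast<float>(vals[0]));  // prints 7.0
    return 0;
}
```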

op_builder/builder.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -779,6 +779,7 @@ def nvcc_args(self):
                 '-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR,
                 '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR
             ]
+            self.enable_bf16 = True
         else:
             try:
                 nvcc_threads = int(os.getenv("DS_NVCC_THREADS", ""))
```

op_builder/transformer_inference.py

Lines changed: 11 additions & 0 deletions
```diff
@@ -75,3 +75,14 @@ def extra_ldflags(self):

     def include_paths(self):
         return ['csrc/transformer/inference/includes', 'csrc/includes']
+
+    def nvcc_args(self):
+        args = super().nvcc_args()
+        """BF16 is supported on AMD, but including `cuda_bf16.h` (`<hip/hip_bfloat16.h>` after hipification)
+        in host-only translation units (*.cpp files) fails because GPU-specific builtins are pulled in with the BF16 type.
+        This cannot be avoided via forward declarations for this transformer_inference extension,
+        since `pt_binding.cpp` code explicitly requires the BF16 header, so disable it for now.
+        """
+        if self.is_rocm_pytorch():
+            self.enable_bf16 = False
+        return args
```
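
The limitation described in the docstring is that host-only binding code uses the bfloat16 type by value, so it needs the complete definition, and the forward-declaration workaround from the headers above cannot help. An illustrative (not verbatim) host-side fragment of that kind:

```cpp
// Hypothetical fragment of the kind found in a host-compiled binding .cpp:
// the complete bfloat16 type is required here, so the header must be included,
// and on ROCm that include currently breaks in a host-only translation unit.
#include <cstddef>

#include <cuda_bf16.h>  // hipified to <hip/hip_bfloat16.h>

size_t bf16_buffer_bytes(int num_elems)
{
    // sizeof() needs the complete type; a forward declaration of the struct
    // would not compile here.
    return static_cast<size_t>(num_elems) * sizeof(__nv_bfloat16);
}
```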

tests/unit/ops/fp_quantizer/test_fp_quant.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -57,7 +57,7 @@ def test_fp_quant_meta(dtype):

         qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=group_size)
         qtorch_error = (qtorch_out - x).abs().sum() / x.numel()
-        ds_error = (x_dequantized - x).abs().sum() / x.numel()
+        ds_error = (x_dequantized - ds_x).abs().sum() / x.numel()

         assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}"

@@ -129,6 +129,6 @@ def test_fp_quant(dtype, q_bits):
         qtorch_out = qtorch_quantize(x, exp_bits=exp_bits, man_bits=man_bits, group_size=quant_config.group_size)

         qtorch_error = (qtorch_out - x).abs().sum() / x.numel()
-        ds_error = (x_dequantized - x).abs().sum() / x.numel()
+        ds_error = (x_dequantized - ds_x).abs().sum() / x.numel()

         assert 0.0004 > abs(qtorch_error.item() - ds_error.item()), f"failed on iteration {i}"
```
