From 3cc407a6ce893ad5e555d183bfbe6d8ef84e662a Mon Sep 17 00:00:00 2001
From: Gregory Roth <groth@nvidia.com>
Date: Mon, 8 Jun 2026 13:47:09 -0700
Subject: [PATCH 1/2] Skip precise-related SROA where native vectors supported

Scalarizing a precise native vector alloca caused a crash when an element was extracted from it, because the extraction used a GEP, which was not among the expected operations. Native vectors shouldn't be scalarized in this case anyway. The scalarization was done during DXIL's partial mem2reg to keep the precise indication applied where mem2reg'ing would have erased it. As part of that, the pass did an SROA because SROA is skipped elsewhere for the sake of this pass. Skipping it here allows the vectors to be represented natively as precise.

In addition, the marking of a vector as precise using a specialized call is modified to allow vectors. This prevents unnecessary extraction and replacement, which didn't cause any failures but wasn't necessary for 6.9+.

Finally, changes the way precise is indicated, since it relied on applying metadata to a dx.attribute.precise function that lacked a body. That prevented pass testing for precise, since the verifier objects to bodyless functions having metadata. It is simpler and more consistent to apply attributes to the function the way dxilnoop and similar functions do anyway.

Fixes #8528
---
 lib/HLSL/HLModule.cpp                         |  8 +++++--
 .../Scalar/DxilConditionalMem2Reg.cpp         |  5 ++++
 .../hlsl/types/longvec-precise.hlsl           | 23 +++++++++++++++++++
 3 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl
diff --git a/lib/HLSL/HLModule.cpp b/lib/HLSL/HLModule.cpp
index bab6e23a30..e172217c75 100644
--- a/lib/HLSL/HLModule.cpp
+++ b/lib/HLSL/HLModule.cpp
@@ -1034,7 +1034,10 @@ void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V,
                                                          BuilderTy &Builder,
                                                          llvm::Module &M) {
   Type *Ty = V->getType();
-  Type *EltTy = Ty->getScalarType();
+  Type *EltTy = Ty;
+  bool SupportsVectors = M.GetHLModule().GetShaderModel()->IsSM69Plus();
+  if (!SupportsVectors)
+    EltTy = Ty->getScalarType();
 
   // TODO: Only do this on basic types.
 
@@ -1050,7 +1053,8 @@ void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V,
       cast<Function>(M.getOrInsertFunction(preciseFuncName, preciseFuncTy));
   if (!HLModule::HasPreciseAttribute(preciseFunc))
     MarkPreciseAttribute(preciseFunc);
-  if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
+  if (!SupportsVectors && isa<FixedVectorType>(Ty)) {
+    FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
     for (unsigned i = 0; i < VT->getNumElements(); i++) {
       Value *Elt = Builder.CreateExtractElement(V, i);
       Builder.CreateCall(preciseFunc, {Elt});
diff --git a/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp b/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp
index c3fc16c40b..9ec7d6e18b 100644
--- a/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp
+++ b/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp
@@ -270,6 +270,11 @@ class DxilConditionalMem2Reg : public FunctionPass {
   static bool ScalarizePreciseVectorAlloca(Function &F) {
     BasicBlock *Entry = &*F.begin();
 
+    // No need to scalarize the vector if 6.9 native vector support is available
+    Module *M = F.getParent();
+    if (M->HasHLModule() && M->GetHLModule().GetShaderModel()->IsSM69Plus())
+      return false;
+
     SmallVector<AllocaInst *, 4> PreciseAllocaInsts;
     for (auto it = Entry->begin(); it != Entry->end();) {
       Instruction *I = &*(it++);
diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl
new file mode 100644
index 0000000000..17cc4d7902
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl
@@ -0,0 +1,23 @@
+// RUN: %dxc -T vs_6_9 %s | FileCheck %s
+// Tests a specific case of a precise native vector requiring extraction
+// and reinsertion during the alloca phase where conditionalmem2reg is concerned.
+// Serves as the source for the longvec-precise-*.ll pass tests
+
+precise float4 main (float4 pos : POSITION, float4 scale : SCL, float4 shift : OFF) : SV_Position {
+  precise float4 position = pos;
+  // Initial multiplication to avoid optimization just using the input scalar
+  // CHECK-NOT: fmul fast
+  // CHECK: [[pos:%.*]] = fmul <4 x float>
+  position = position * scale;
+
+  // CHECK: [[z:%.*]] = extractelement <4 x float> [[pos]], i32 2
+  // CHECK-NOT: fadd fast
+  // CHECK: [[sz:%.*]] = fadd float [[z]], 0x
+  // CHECK: [[spos:%.*]] = insertelement <4 x float> [[pos]], float [[sz]], i32 2
+  position.z += 0.01f;
+  // CHECK-NOT: fadd fast
+  // CHECK: fadd <4 x float> %{{.*}}, [[spos]]
+  position += shift;
+
+  return position;
+}

From 5d3328cc4ab1be84d500527071c0c11094c552cc Mon Sep 17 00:00:00 2001
From: Gregory Roth <groth@nvidia.com>
Date: Mon, 8 Jun 2026 17:06:18 -0700
Subject: [PATCH 2/2] Switch preserving dx.attribute.precise call to use
 attributes

The dx.attribute.precise() internal function call is used to mark variables as precise and is identified by the metadata associated with it. The function never makes it into final DXIL. It serves only as an intermediate mechanism to mark things as precise.

The LLVM assembler verifier rejects LLVM IR that has a function with no body but with attached metadata. In order to have pass tests that involve precise variables indicated by this temporary internal call, the function has to be marked some other way.

This changes the precise marking to a function attribute, a mechanism that is used in other places such as the temporary dxil noop call. In practice, it has little impact as variables themselves are still marked as precise using metadata where and when possible. Instead of sharing that mechanism, this gives the temporary function its own that satisfies the LLVM assembler.

Includes the pass tests for precise native vectors that this change makes possible.
---
 include/dxc/DXIL/DxilConstants.h              |   2 +
 lib/DXIL/DxilModule.cpp                       |   3 +
 lib/HLSL/HLModule.cpp                         |  12 +-
 .../passes/longvec-precise-mem2reg.ll         | 112 ++++++++++++++++++
 .../passes/longvec-precise-sroa.ll            | 111 +++++++++++++++++
 5 files changed, 232 insertions(+), 8 deletions(-)
 create mode 100644 tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll
 create mode 100644 tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll

diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h
index 4705b90c55..3988f2b627 100644
--- a/include/dxc/DXIL/DxilConstants.h
+++ b/include/dxc/DXIL/DxilConstants.h
@@ -2487,6 +2487,8 @@ extern const char *kHostLayoutTypePrefix;
 
 extern const char *kWaveOpsIncludeHelperLanesString;
 
+extern const char *kPreciseString;
+
 } // namespace DXIL
 
 } // namespace hlsl
diff --git a/lib/DXIL/DxilModule.cpp b/lib/DXIL/DxilModule.cpp
index b51729e63d..369d6805b0 100644
--- a/lib/DXIL/DxilModule.cpp
+++ b/lib/DXIL/DxilModule.cpp
@@ -87,6 +87,9 @@ const char *kDxLinAlgMatrixTypePrefix = "dx.types.LinAlgMatrix";
 const char *kHostLayoutTypePrefix = "hostlayout.";
 
 const char *kWaveOpsIncludeHelperLanesString = "waveops-include-helper-lanes";
+
+const char *kPreciseString = "dx.precise";
+
 } // namespace DXIL
 
 void SetDxilHook(Module &M);
diff --git a/lib/HLSL/HLModule.cpp b/lib/HLSL/HLModule.cpp
index e172217c75..b6a83e9755 100644
--- a/lib/HLSL/HLModule.cpp
+++ b/lib/HLSL/HLModule.cpp
@@ -1022,11 +1022,7 @@ void HLModule::ClearPreciseAttributeWithMetadata(Instruction *I) {
 }
 
 static void MarkPreciseAttribute(Function *F) {
-  LLVMContext &Ctx = F->getContext();
-  MDNode *preciseNode = MDNode::get(
-      Ctx, {MDString::get(Ctx, DxilMDHelper::kDxilPreciseAttributeMDName)});
-
-  F->setMetadata(DxilMDHelper::kDxilPreciseAttributeMDName, preciseNode);
+  F->addFnAttr(DXIL::kPreciseString);
 }
 
 template <typename BuilderTy>
@@ -1107,9 +1103,9 @@ void HLModule::MarkPreciseAttributeOnPtrWithFunctionCall(llvm::Value *Ptr,
 }
 
 bool HLModule::HasPreciseAttribute(Function *F) {
-  MDNode *preciseNode =
-      F->getMetadata(DxilMDHelper::kDxilPreciseAttributeMDName);
-  return preciseNode != nullptr;
+  AttributeSet Attributeset = F->getAttributes();
+  return Attributeset.hasAttribute(AttributeSet::FunctionIndex,
+                                   DXIL::kPreciseString);
 }
 
 static void AddDIGlobalVariable(DIBuilder &Builder, DIGlobalVariable *LocDIGV,
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll
new file mode 100644
index 0000000000..7ae223dc95
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll
@@ -0,0 +1,112 @@
+; RUN: %dxopt %s -hlsl-passes-resume -dxil-cond-mem2reg -S | FileCheck %s
+; Test that conditionalmem2reg does not scalarize precise native vectors
+; as it would pre-6.9 as part of keeping their allocas around to maintain
+; the precise information.
+
+; The checks are just confirming that the precise calls are preserved
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+define void @main(<4 x float>* noalias %arg, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3) #0 {
+entry:
+  %shift.addr = alloca <4 x float>, align 4, !dx.temp !9
+  %scale.addr = alloca <4 x float>, align 4, !dx.temp !9
+  %pos.addr = alloca <4 x float>, align 4, !dx.temp !9
+
+  ; Confirm that position is the only alloca that is preserved
+  ; CHECK-NOT: alloca
+  ; CHECK: %position = alloca <4 x float>, align 4, !dx.precise
+  ; CHECK-NOT: alloca
+  %position = alloca <4 x float>, align 4, !dx.precise !23
+  store <4 x float> %arg3, <4 x float>* %shift.addr, align 4, !tbaa !24
+  store <4 x float> %arg2, <4 x float>* %scale.addr, align 4, !tbaa !24
+  store <4 x float> %arg1, <4 x float>* %pos.addr, align 4, !tbaa !24
+  %tmp = bitcast <4 x float>* %position to i8*
+  call void @llvm.lifetime.start(i64 16, i8* %tmp) #0
+  %tmp4 = load <4 x float>, <4 x float>* %pos.addr, align 4, !tbaa !24
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %arg1)
+  ; CHECK-NEXT: store <4 x float> %arg1, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4)
+  store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !24
+  %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %tmp6 = load <4 x float>, <4 x float>* %scale.addr, align 4, !tbaa !24
+  %mul = fmul <4 x float> %tmp5, %tmp6
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %mul)
+  ; CHECK-NEXT: store <4 x float> %mul, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %mul)
+  store <4 x float> %mul, <4 x float>* %position, align 4, !tbaa !24
+  %tmp7 = load <4 x float>, <4 x float>* %position, align 4
+  %tmp8 = extractelement <4 x float> %tmp7, i32 2
+  %add = fadd float %tmp8, 0x3F847AE140000000
+  %tmp9 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2
+
+  ; CHECK: call void @dx.attribute.precise.float(float %add)
+  ; CHECK-NEXT: store float %add, float* %tmp9
+  call void @dx.attribute.precise.float(float %add)
+  store float %add, float* %tmp9
+  %tmp10 = load <4 x float>, <4 x float>* %shift.addr, align 4, !tbaa !24
+  %tmp11 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %add1 = fadd <4 x float> %tmp11, %tmp10
+
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1)
+  ; CHECK-NEXT: store <4 x float> %add1, <4 x float>* %position, align 4
+  call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1)
+  store <4 x float> %add1, <4 x float>* %position, align 4, !tbaa !24
+  %tmp12 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24
+  %tmp13 = bitcast <4 x float>* %position to i8*
+  call void @llvm.lifetime.end(i64 16, i8* %tmp13) #0
+  store <4 x float> %tmp12, <4 x float>* %arg
+  ret void
+}
+
+declare void @"dx.attribute.precise.<4 x float>"(<4 x float>) #1
+
+declare void @dx.attribute.precise.float(float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { "dx.precise" }
+
+!pauseresume = !{!1}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!19}
+!dx.fnprops = !{!20}
+!dx.options = !{!21, !22}
+
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!3 = !{i32 1, i32 9}
+!4 = !{i32 1, i32 10}
+!5 = !{!"vs", i32 6, i32 9}
+!6 = !{i32 1, void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !7}
+!7 = !{!8, !10, !13, !15, !17}
+!8 = !{i32 0, !9, !9}
+!9 = !{}
+!10 = !{i32 1, !11, !12}
+!11 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9}
+!12 = !{i32 0}
+!13 = !{i32 0, !14, !12}
+!14 = !{i32 4, !"POSITION", i32 7, i32 9}
+!15 = !{i32 0, !16, !12}
+!16 = !{i32 4, !"SCL", i32 7, i32 9}
+!17 = !{i32 0, !18, !12}
+!18 = !{i32 4, !"OFF", i32 7, i32 9}
+!19 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, null, null}
+!20 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, i32 1}
+!21 = !{i32 64}
+!22 = !{i32 -1}
+!23 = !{i32 1}
+!24 = !{!25, !25, i64 0}
+!25 = !{!"omnipotent char", !26, i64 0}
+!26 = !{!"Simple C/C++ TBAA"}
diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll
new file mode 100644
index 0000000000..29d9f3134a
--- /dev/null
+++ b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll
@@ -0,0 +1,111 @@
+; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s
+; Test that precise native vector allocas are marked with a vector overload call
+; to dx.attribute.precise() and not scalar extracted and re-inserted
+
+target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64"
+target triple = "dxil-ms-dx"
+
+%ConstantBuffer = type opaque
+
+@"$Globals" = external constant %ConstantBuffer
+
+; Function Attrs: nounwind
+define <4 x float> @main(<4 x float> %pos, <4 x float> %scale, <4 x float> %shift) #0 {
+bb:
+  %tmp = alloca <4 x float>, align 4, !dx.temp !10
+  %tmp1 = alloca <4 x float>, align 4, !dx.temp !10
+  %tmp2 = alloca <4 x float>, align 4, !dx.temp !10
+  %position = alloca <4 x float>, align 4, !dx.precise !24
+  store <4 x float> %shift, <4 x float>* %tmp, align 4, !tbaa !25
+  store <4 x float> %scale, <4 x float>* %tmp1, align 4, !tbaa !25
+  store <4 x float> %pos, <4 x float>* %tmp2, align 4, !tbaa !25
+  %tmp3 = bitcast <4 x float>* %position to i8* ; line:7 col:3
+  call void @llvm.lifetime.start(i64 16, i8* %tmp3) #0 ; line:7 col:3
+  %tmp4 = load <4 x float>, <4 x float>* %tmp2, align 4, !tbaa !25 ; line:7 col:29
+  ; CHECK: %tmp4 = load <4 x float>, <4 x float>* %tmp2
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4)
+
+  store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !25 ; line:7 col:18
+  %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:14
+  %tmp6 = load <4 x float>, <4 x float>* %tmp1, align 4, !tbaa !25 ; line:11 col:25
+  %tmp7 = fmul <4 x float> %tmp5, %tmp6 ; line:11 col:23
+
+  ; CHECK: %tmp7 = fmul <4 x float> %tmp5, %tmp6
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp7)
+
+  store <4 x float> %tmp7, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:12
+  %tmp8 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14
+  %tmp9 = extractelement <4 x float> %tmp8, i32 2 ; line:17 col:14
+  %tmp10 = fadd float %tmp9, 0x3F847AE140000000 ; line:17 col:14
+  %tmp11 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14
+  %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2 ; line:17 col:14
+
+  ; CHECK: %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @dx.attribute.precise.float(float %tmp10)
+
+  store float %tmp10, float* %tmp12 ; line:17 col:14
+  %tmp13 = load <4 x float>, <4 x float>* %tmp, align 4, !tbaa !25 ; line:20 col:15
+  %tmp14 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12
+  %tmp15 = fadd <4 x float> %tmp14, %tmp13 ; line:20 col:12
+
+  ; CHECK: %tmp15 = fadd <4 x float> %tmp14, %tmp13
+  ; CHECK-NOT: extractelement
+  ; CHECK-NOT: dx.attribute.precise.float
+  ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp15)
+
+  store <4 x float> %tmp15, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12
+  %tmp16 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:22 col:10
+  %tmp17 = bitcast <4 x float>* %position to i8* ; line:23 col:1
+  call void @llvm.lifetime.end(i64 16, i8* %tmp17) #0 ; line:23 col:1
+  ret <4 x float> %tmp16 ; line:22 col:3
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+attributes #0 = { nounwind }
+
+!pauseresume = !{!1}
+!dx.version = !{!3}
+!dx.valver = !{!4}
+!dx.shaderModel = !{!5}
+!dx.typeAnnotations = !{!6}
+!dx.entryPoints = !{!17}
+!dx.fnprops = !{!21}
+!dx.options = !{!22, !23}
+
+!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"}
+!3 = !{i32 1, i32 9}
+!4 = !{i32 1, i32 10}
+!5 = !{!"vs", i32 6, i32 9}
+!6 = !{i32 1, <4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !7}
+!7 = !{!8, !11, !13, !15}
+!8 = !{i32 1, !9, !10}
+!9 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9, i32 13, i32 4}
+!10 = !{}
+!11 = !{i32 0, !12, !10}
+!12 = !{i32 4, !"POSITION", i32 7, i32 9, i32 13, i32 4}
+!13 = !{i32 0, !14, !10}
+!14 = !{i32 4, !"SCL", i32 7, i32 9, i32 13, i32 4}
+!15 = !{i32 0, !16, !10}
+!16 = !{i32 4, !"OFF", i32 7, i32 9, i32 13, i32 4}
+!17 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, !18, null}
+!18 = !{null, null, !19, null}
+!19 = !{!20}
+!20 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null}
+!21 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, i32 1}
+!22 = !{i32 64}
+!23 = !{i32 -1}
+!24 = !{i32 1}
+!25 = !{!26, !26, i64 0}
+!26 = !{!"omnipotent char", !27, i64 0}
+!27 = !{!"Simple C/C++ TBAA"}