diff --git a/include/dxc/DXIL/DxilConstants.h b/include/dxc/DXIL/DxilConstants.h index 4705b90c55..3988f2b627 100644 --- a/include/dxc/DXIL/DxilConstants.h +++ b/include/dxc/DXIL/DxilConstants.h @@ -2487,6 +2487,8 @@ extern const char *kHostLayoutTypePrefix; extern const char *kWaveOpsIncludeHelperLanesString; +extern const char *kPreciseString; + } // namespace DXIL } // namespace hlsl diff --git a/lib/DXIL/DxilModule.cpp b/lib/DXIL/DxilModule.cpp index b51729e63d..369d6805b0 100644 --- a/lib/DXIL/DxilModule.cpp +++ b/lib/DXIL/DxilModule.cpp @@ -87,6 +87,9 @@ const char *kDxLinAlgMatrixTypePrefix = "dx.types.LinAlgMatrix"; const char *kHostLayoutTypePrefix = "hostlayout."; const char *kWaveOpsIncludeHelperLanesString = "waveops-include-helper-lanes"; + +const char *kPreciseString = "dx.precise"; + } // namespace DXIL void SetDxilHook(Module &M); diff --git a/lib/HLSL/HLModule.cpp b/lib/HLSL/HLModule.cpp index bab6e23a30..b6a83e9755 100644 --- a/lib/HLSL/HLModule.cpp +++ b/lib/HLSL/HLModule.cpp @@ -1022,11 +1022,7 @@ void HLModule::ClearPreciseAttributeWithMetadata(Instruction *I) { } static void MarkPreciseAttribute(Function *F) { - LLVMContext &Ctx = F->getContext(); - MDNode *preciseNode = MDNode::get( - Ctx, {MDString::get(Ctx, DxilMDHelper::kDxilPreciseAttributeMDName)}); - - F->setMetadata(DxilMDHelper::kDxilPreciseAttributeMDName, preciseNode); + F->addFnAttr(DXIL::kPreciseString); } template @@ -1034,7 +1030,10 @@ void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V, BuilderTy &Builder, llvm::Module &M) { Type *Ty = V->getType(); - Type *EltTy = Ty->getScalarType(); + Type *EltTy = Ty; + bool SupportsVectors = M.GetHLModule().GetShaderModel()->IsSM69Plus(); + if (!SupportsVectors) + EltTy = Ty->getScalarType(); // TODO: Only do this on basic types. 
@@ -1050,7 +1049,8 @@ void HLModule::MarkPreciseAttributeOnValWithFunctionCall(llvm::Value *V, cast(M.getOrInsertFunction(preciseFuncName, preciseFuncTy)); if (!HLModule::HasPreciseAttribute(preciseFunc)) MarkPreciseAttribute(preciseFunc); - if (FixedVectorType *VT = dyn_cast(Ty)) { + if (!SupportsVectors && isa(Ty)) { + FixedVectorType *VT = dyn_cast(Ty); for (unsigned i = 0; i < VT->getNumElements(); i++) { Value *Elt = Builder.CreateExtractElement(V, i); Builder.CreateCall(preciseFunc, {Elt}); @@ -1103,9 +1103,9 @@ void HLModule::MarkPreciseAttributeOnPtrWithFunctionCall(llvm::Value *Ptr, } bool HLModule::HasPreciseAttribute(Function *F) { - MDNode *preciseNode = - F->getMetadata(DxilMDHelper::kDxilPreciseAttributeMDName); - return preciseNode != nullptr; + AttributeSet Attributeset = F->getAttributes(); + return Attributeset.hasAttribute(AttributeSet::FunctionIndex, + DXIL::kPreciseString); } static void AddDIGlobalVariable(DIBuilder &Builder, DIGlobalVariable *LocDIGV, diff --git a/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp b/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp index c3fc16c40b..9ec7d6e18b 100644 --- a/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp +++ b/lib/Transforms/Scalar/DxilConditionalMem2Reg.cpp @@ -270,6 +270,11 @@ class DxilConditionalMem2Reg : public FunctionPass { static bool ScalarizePreciseVectorAlloca(Function &F) { BasicBlock *Entry = &*F.begin(); + // No need to scalarize the vector if 6.9 native vector support is available + Module *M = F.getParent(); + if (M->HasHLModule() && M->GetHLModule().GetShaderModel()->IsSM69Plus()) + return false; + SmallVector PreciseAllocaInsts; for (auto it = Entry->begin(); it != Entry->end();) { Instruction *I = &*(it++); diff --git a/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl new file mode 100644 index 0000000000..17cc4d7902 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/hlsl/types/longvec-precise.hlsl @@ 
-0,0 +1,23 @@ +// RUN: %dxc -T vs_6_9 %s | FileCheck %s +// Tests a specific case of a precise native vector requiring extraction +// and reinsertion during the alloca phase where conditionalmem2reg is concerned. +// Serves as the source for longvec-precise.ll and its specific pass tests + +precise float4 main (float4 pos : POSITION, float4 scale : SCL, float4 shift : OFF) : SV_Position { + precise float4 position = pos; + // Initial multiplication to avoid optimization just using the input scalar + // CHECK-NOT: fmul fast + // CHECK: [[pos:%.*]] = fmul <4 x float> + position = position * scale; + + // CHECK: [[z:%.*]] = extractelement <4 x float> [[pos]], i32 2 + // CHECK-NOT: fadd fast + // CHECK: [[sz:%.*]] = fadd float [[z]], 0x + // CHECK: [[spos:%.*]] = insertelement <4 x float> [[pos]], float [[sz]], i32 2 + position.z += 0.01f; + // CHECK-NOT: fadd fast + // CHECK: fadd <4 x float> %{{.*}}, [[spos]] + position += shift; + + return position; +} diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll new file mode 100644 index 0000000000..7ae223dc95 --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-mem2reg.ll @@ -0,0 +1,112 @@ +; RUN: %dxopt %s -hlsl-passes-resume -dxil-cond-mem2reg -S | FileCheck %s +; Test that conditionalmem2reg does not scalarize precise native vectors +; as it would pre-6.9 as part of keeping their allocas around to maintain +; the precise information. 
+ +; The checks are just confirming that the precise calls are preserved + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +define void @main(<4 x float>* noalias %arg, <4 x float> %arg1, <4 x float> %arg2, <4 x float> %arg3) #0 { +entry: + %shift.addr = alloca <4 x float>, align 4, !dx.temp !9 + %scale.addr = alloca <4 x float>, align 4, !dx.temp !9 + %pos.addr = alloca <4 x float>, align 4, !dx.temp !9 + + ; Confirm that position is the only alloca that is preserved + ; CHECK-NOT: alloca + ; CHECK: %position = alloca <4 x float>, align 4, !dx.precise + ; CHECK-NOT: alloca + %position = alloca <4 x float>, align 4, !dx.precise !23 + store <4 x float> %arg3, <4 x float>* %shift.addr, align 4, !tbaa !24 + store <4 x float> %arg2, <4 x float>* %scale.addr, align 4, !tbaa !24 + store <4 x float> %arg1, <4 x float>* %pos.addr, align 4, !tbaa !24 + %tmp = bitcast <4 x float>* %position to i8* + call void @llvm.lifetime.start(i64 16, i8* %tmp) #0 + %tmp4 = load <4 x float>, <4 x float>* %pos.addr, align 4, !tbaa !24 + + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %arg1) + ; CHECK-NEXT: store <4 x float> %arg1, <4 x float>* %position, align 4 + call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4) + store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !24 + %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24 + %tmp6 = load <4 x float>, <4 x float>* %scale.addr, align 4, !tbaa !24 + %mul = fmul <4 x float> %tmp5, %tmp6 + + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %mul) + ; CHECK-NEXT: store <4 x float> %mul, <4 x float>* %position, align 4 + call void @"dx.attribute.precise.<4 x float>"(<4 x float> 
%mul) + store <4 x float> %mul, <4 x float>* %position, align 4, !tbaa !24 + %tmp7 = load <4 x float>, <4 x float>* %position, align 4 + %tmp8 = extractelement <4 x float> %tmp7, i32 2 + %add = fadd float %tmp8, 0x3F847AE140000000 + %tmp9 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2 + + ; CHECK: call void @dx.attribute.precise.float(float %add) + ; CHECK-NEXT: store float %add, float* %tmp9 + call void @dx.attribute.precise.float(float %add) + store float %add, float* %tmp9 + %tmp10 = load <4 x float>, <4 x float>* %shift.addr, align 4, !tbaa !24 + %tmp11 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24 + %add1 = fadd <4 x float> %tmp11, %tmp10 + + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1) + ; CHECK-NEXT: store <4 x float> %add1, <4 x float>* %position, align 4 + call void @"dx.attribute.precise.<4 x float>"(<4 x float> %add1) + store <4 x float> %add1, <4 x float>* %position, align 4, !tbaa !24 + %tmp12 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !24 + %tmp13 = bitcast <4 x float>* %position to i8* + call void @llvm.lifetime.end(i64 16, i8* %tmp13) #0 + store <4 x float> %tmp12, <4 x float>* %arg + ret void +} + +declare void @"dx.attribute.precise.<4 x float>"(<4 x float>) #1 + +declare void @dx.attribute.precise.float(float) #1 + +attributes #0 = { nounwind } +attributes #1 = { "dx.precise" } + +!pauseresume = !{!1} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6} +!dx.entryPoints = !{!19} +!dx.fnprops = !{!20} +!dx.options = !{!21, !22} + +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!3 = !{i32 1, i32 9} +!4 = !{i32 1, i32 10} +!5 = !{!"vs", i32 6, i32 9} +!6 = !{i32 1, void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !7} +!7 = !{!8, !10, !13, !15, !17} +!8 = !{i32 0, !9, !9} +!9 = !{} +!10 = !{i32 1, !11, !12} +!11 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9} +!12 = !{i32 0} +!13 = !{i32 0, !14, !12} +!14 = 
!{i32 4, !"POSITION", i32 7, i32 9} +!15 = !{i32 0, !16, !12} +!16 = !{i32 4, !"SCL", i32 7, i32 9} +!17 = !{i32 0, !18, !12} +!18 = !{i32 4, !"OFF", i32 7, i32 9} +!19 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, null, null} +!20 = !{void (<4 x float>*, <4 x float>, <4 x float>, <4 x float>)* @main, i32 1} +!21 = !{i32 64} +!22 = !{i32 -1} +!23 = !{i32 1} +!24 = !{!25, !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} diff --git a/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll new file mode 100644 index 0000000000..29d9f3134a --- /dev/null +++ b/tools/clang/test/CodeGenDXIL/passes/longvec-precise-sroa.ll @@ -0,0 +1,111 @@ +; RUN: %dxopt %s -hlsl-passes-resume -scalarrepl-param-hlsl -S | FileCheck %s +; Test that precise native vector allocas are marked with a vector overload call +; to dx.attribute.precise() and not scalar extracted and re-inserted + +target datalayout = "e-m:e-p:32:32-i1:32-i8:32-i16:32-i32:32-i64:64-f16:32-f32:32-f64:64-n8:16:32:64" +target triple = "dxil-ms-dx" + +%ConstantBuffer = type opaque + +@"$Globals" = external constant %ConstantBuffer + +; Function Attrs: nounwind +define <4 x float> @main(<4 x float> %pos, <4 x float> %scale, <4 x float> %shift) #0 { +bb: + %tmp = alloca <4 x float>, align 4, !dx.temp !10 + %tmp1 = alloca <4 x float>, align 4, !dx.temp !10 + %tmp2 = alloca <4 x float>, align 4, !dx.temp !10 + %position = alloca <4 x float>, align 4, !dx.precise !24 + store <4 x float> %shift, <4 x float>* %tmp, align 4, !tbaa !25 + store <4 x float> %scale, <4 x float>* %tmp1, align 4, !tbaa !25 + store <4 x float> %pos, <4 x float>* %tmp2, align 4, !tbaa !25 + %tmp3 = bitcast <4 x float>* %position to i8* ; line:7 col:3 + call void @llvm.lifetime.start(i64 16, i8* %tmp3) #0 ; line:7 col:3 + %tmp4 = load <4 x float>, <4 x float>* %tmp2, align 4, !tbaa !25 ; line:7 col:29 + ; CHECK: %tmp4 
= load <4 x float>, <4 x float>* %tmp2 + ; CHECK-NOT: extractelement + ; CHECK-NOT: dx.attribute.precise.float + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp4) + + store <4 x float> %tmp4, <4 x float>* %position, align 4, !tbaa !25 ; line:7 col:18 + %tmp5 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:14 + %tmp6 = load <4 x float>, <4 x float>* %tmp1, align 4, !tbaa !25 ; line:11 col:25 + %tmp7 = fmul <4 x float> %tmp5, %tmp6 ; line:11 col:23 + + ; CHECK: %tmp7 = fmul <4 x float> %tmp5, %tmp6 + ; CHECK-NOT: extractelement + ; CHECK-NOT: dx.attribute.precise.float + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp7) + + store <4 x float> %tmp7, <4 x float>* %position, align 4, !tbaa !25 ; line:11 col:12 + %tmp8 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14 + %tmp9 = extractelement <4 x float> %tmp8, i32 2 ; line:17 col:14 + %tmp10 = fadd float %tmp9, 0x3F847AE140000000 ; line:17 col:14 + %tmp11 = load <4 x float>, <4 x float>* %position, align 4 ; line:17 col:14 + %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2 ; line:17 col:14 + + ; CHECK: %tmp12 = getelementptr <4 x float>, <4 x float>* %position, i32 0, i32 2 + ; CHECK-NOT: extractelement + ; CHECK-NOT: dx.attribute.precise.float + ; CHECK: call void @dx.attribute.precise.float(float %tmp10) + + store float %tmp10, float* %tmp12 ; line:17 col:14 + %tmp13 = load <4 x float>, <4 x float>* %tmp, align 4, !tbaa !25 ; line:20 col:15 + %tmp14 = load <4 x float>, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12 + %tmp15 = fadd <4 x float> %tmp14, %tmp13 ; line:20 col:12 + + ; CHECK: %tmp15 = fadd <4 x float> %tmp14, %tmp13 + ; CHECK-NOT: extractelement + ; CHECK-NOT: dx.attribute.precise.float + ; CHECK: call void @"dx.attribute.precise.<4 x float>"(<4 x float> %tmp15) + + store <4 x float> %tmp15, <4 x float>* %position, align 4, !tbaa !25 ; line:20 col:12 + %tmp16 = load <4 x 
float>, <4 x float>* %position, align 4, !tbaa !25 ; line:22 col:10 + %tmp17 = bitcast <4 x float>* %position to i8* ; line:23 col:1 + call void @llvm.lifetime.end(i64 16, i8* %tmp17) #0 ; line:23 col:1 + ret <4 x float> %tmp16 ; line:22 col:3 +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #0 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #0 + +attributes #0 = { nounwind } + +!pauseresume = !{!1} +!dx.version = !{!3} +!dx.valver = !{!4} +!dx.shaderModel = !{!5} +!dx.typeAnnotations = !{!6} +!dx.entryPoints = !{!17} +!dx.fnprops = !{!21} +!dx.options = !{!22, !23} + +!1 = !{!"hlsl-hlemit", !"hlsl-hlensure"} +!3 = !{i32 1, i32 9} +!4 = !{i32 1, i32 10} +!5 = !{!"vs", i32 6, i32 9} +!6 = !{i32 1, <4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !7} +!7 = !{!8, !11, !13, !15} +!8 = !{i32 1, !9, !10} +!9 = !{i32 8, i1 true, i32 4, !"SV_Position", i32 7, i32 9, i32 13, i32 4} +!10 = !{} +!11 = !{i32 0, !12, !10} +!12 = !{i32 4, !"POSITION", i32 7, i32 9, i32 13, i32 4} +!13 = !{i32 0, !14, !10} +!14 = !{i32 4, !"SCL", i32 7, i32 9, i32 13, i32 4} +!15 = !{i32 0, !16, !10} +!16 = !{i32 4, !"OFF", i32 7, i32 9, i32 13, i32 4} +!17 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, !"main", null, !18, null} +!18 = !{null, null, !19, null} +!19 = !{!20} +!20 = !{i32 0, %ConstantBuffer* @"$Globals", !"$Globals", i32 0, i32 -1, i32 1, i32 0, null} +!21 = !{<4 x float> (<4 x float>, <4 x float>, <4 x float>)* @main, i32 1} +!22 = !{i32 64} +!23 = !{i32 -1} +!24 = !{i32 1} +!25 = !{!26, !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C/C++ TBAA"}