From 748e7af8dd6e9b4683a6402a0ca6598fe23a9c1e Mon Sep 17 00:00:00 2001
From: Krish Gupta
Date: Tue, 9 Dec 2025 20:40:21 +0530
Subject: [PATCH 01/63] [flang][OpenMP] Fix firstprivate not working with lastprivate in DO SIMD (#170163)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a bug where firstprivate was ignored when the same variable had
both firstprivate and lastprivate clauses in a do simd construct.

What was broken:

```
integer :: a
a = 10
!$omp do simd firstprivate(a) lastprivate(a)
do i = 1, 1
  print *, a   ! Should print 10, but printed garbage/0
  a = 20
end do
!$omp end do simd
print *, a     ! Correctly prints 20
```

Inside the loop, `a` was never initialized from the firstprivate clause; it
held whatever uninitialized value happened to be in its private copy.

The fix: in genCompositeDoSimd(), we were using simdItemDSP to handle
privatization for the whole loop nest. That only looked at the SIMD clauses
and missed the firstprivate from the DO part. Changed it to use
wsloopItemDSP instead, which handles the DO clauses (firstprivate,
lastprivate) correctly. One line change in OpenMP.cpp.

Tests added:
- Lowering test to check MLIR generation
- Runtime test to verify the actual values are correct

Fixes #168306

---------

Co-authored-by: Krish Gupta
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 20 ++---
 ...-simd-firstprivate-lastprivate-runtime.f90 | 48 ++++++++++
 .../do-simd-firstprivate-lastprivate.f90      | 89 +++++++++++++++++++
 flang/test/Lower/OpenMP/order-clause.f90      |  8 +-
 flang/test/Lower/OpenMP/wsloop-simd.f90       |  9 +-
 5 files changed, 151 insertions(+), 23 deletions(-)
 create mode 100644 flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
 create mode 100644 flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 582e684442dfc..9c25c1955cb78 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3314,17 +3314,12 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps,
                  simdReductionSyms);

-  DataSharingProcessor wsloopItemDSP(
-      converter, semaCtx, doItem->clauses, eval,
-      /*shouldCollectPreDeterminedSymbols=*/false,
-      /*useDelayedPrivatization=*/true, symTable);
+  DataSharingProcessor wsloopItemDSP(converter, semaCtx, doItem->clauses, eval,
+                                     /*shouldCollectPreDeterminedSymbols=*/true,
+                                     /*useDelayedPrivatization=*/true,
+                                     symTable);
   wsloopItemDSP.processStep1(&wsloopClauseOps);

-  DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval,
-                                   /*shouldCollectPreDeterminedSymbols=*/true,
-                                   /*useDelayedPrivatization=*/true, symTable);
-  simdItemDSP.processStep1(&simdClauseOps, simdItem->id);
-
   // Pass the innermost leaf construct's clauses because that's where COLLAPSE
   // is placed by construct decomposition.
   mlir::omp::LoopNestOperands loopNestClauseOps;
@@ -3343,8 +3338,9 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   wsloopOp.setComposite(/*val=*/true);

   EntryBlockArgs simdArgs;
-  simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols();
-  simdArgs.priv.vars = simdClauseOps.privateVars;
+  // For composite 'do simd', privatization is handled by the wsloop.
+  // The simd does not create separate private storage for variables already
+  // privatized by the worksharing construct.
   simdArgs.reduction.syms = simdReductionSyms;
   simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
@@ -3354,7 +3350,7 @@ static mlir::omp::WsloopOp genCompositeDoSimd(
   genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, simdItem,
                 loopNestClauseOps, iv,
                 {{wsloopOp, wsloopArgs}, {simdOp, simdArgs}},
-                llvm::omp::Directive::OMPD_do_simd, simdItemDSP);
+                llvm::omp::Directive::OMPD_do_simd, wsloopItemDSP);

   return wsloopOp;
 }
diff --git a/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
new file mode 100644
index 0000000000000..4fef69188e0ee
--- /dev/null
+++ b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90
@@ -0,0 +1,48 @@
+! Test runtime behavior of DO SIMD with firstprivate and lastprivate on same variable
+! This is the reproducer from issue #168306
+
+! REQUIRES: openmp-runtime
+
+! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM
+! RUN: %flang -fopenmp %s -o %t && %t | FileCheck %s
+
+! LLVM-LABEL: define {{.*}} @_QQmain
+program main
+  integer :: a
+  integer :: i
+
+  a = 10
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    ! Inside loop: a should be 10 (from firstprivate initialization)
+    ! CHECK: main1 : a = 10
+    print *, "main1 : a = ", a
+    a = 20
+  end do
+  !$omp end do simd
+  ! After loop: a should be 20 (from lastprivate copy-out)
+  ! CHECK: main2 : a = 20
+  print *, "main2 : a = ", a
+
+  call sub
+  ! CHECK: pass
+  print *, 'pass'
+end program main
+
+subroutine sub
+  integer :: a
+  integer :: i
+
+  a = 10
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    ! Inside loop: a should be 10 (from firstprivate initialization)
+    ! CHECK: sub1 : a = 10
+    print *, "sub1 : a = ", a
+    a = 20
+  end do
+  !$omp end do simd
+  ! After loop: a should be 20 (from lastprivate copy-out)
+  ! CHECK: sub2 : a = 20
+  print *, "sub2 : a = ", a
+end subroutine sub
diff --git a/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90
new file mode 100644
index 0000000000000..429409926d47b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90
@@ -0,0 +1,89 @@
+! Test for DO SIMD with the same variable in both firstprivate and lastprivate clauses
+! This tests the fix for issue #168306
+
+! RUN: %flang_fc1 -fopenmp -mmlir --enable-delayed-privatization-staging=true -emit-hlfir %s -o - | FileCheck %s
+
+! Test case 1: Basic test with firstprivate + lastprivate on same variable
+! CHECK-LABEL: func.func @_QPdo_simd_first_last_same_var
+subroutine do_simd_first_last_same_var()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32
+  !$omp do simd firstprivate(a) lastprivate(a)
+  do i = 1, 1
+    ! CHECK: %[[FIRSTPRIV_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIV_A]]
+    ! CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]]
+    ! The private copy should be initialized from firstprivate (value 10)
+    ! and then modified to 20
+    a = 20
+  end do
+  !$omp end do simd
+  ! After the loop, 'a' should be 20 due to lastprivate
+end subroutine do_simd_first_last_same_var
+
+! Test case 2: Test with lastprivate and firstprivate in reverse order
+! CHECK-LABEL: func.func @_QPdo_simd_last_first_reverse
+subroutine do_simd_last_first_reverse()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    a = 20
+  end do
+  !$omp end do simd
+end subroutine do_simd_last_first_reverse
+
+! Test case 3: Multiple variables with mixed privatization
+! CHECK-LABEL: func.func @_QPdo_simd_multiple_vars
+subroutine do_simd_multiple_vars()
+  integer :: a, b, c
+  integer :: i
+  a = 10
+  b = 20
+  c = 30
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}private{{.*}} %{{.*}} -> %{{.*}} : !fir.ref<i32>, !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd firstprivate(a, b) lastprivate(a) private(c)
+  do i = 1, 5
+    a = a + 1
+    b = b + 1
+    c = i
+  end do
+  !$omp end do simd
+end subroutine do_simd_multiple_vars
+
+! Test case 4: Reproducer from issue #168306
+! CHECK-LABEL: func.func @_QPissue_168306_reproducer
+subroutine issue_168306_reproducer()
+  integer :: a
+  integer :: i
+  a = 10
+
+  ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
+  ! CHECK-NEXT: omp.simd
+  ! CHECK-NOT: private
+  !$omp do simd lastprivate(a) firstprivate(a)
+  do i = 1, 1
+    ! Inside the loop, 'a' should start at 10 (from firstprivate)
+    ! This is the key behavior that was broken
+    a = 20
+  end do
+  !$omp end do simd
+  ! After the loop, 'a' should be 20 (from lastprivate)
+end subroutine issue_168306_reproducer
diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90
index d5799079b3759..9da7d905ceeed 100644
--- a/flang/test/Lower/OpenMP/order-clause.f90
+++ b/flang/test/Lower/OpenMP/order-clause.f90
@@ -36,15 +36,15 @@ end subroutine do_order

 !CHECK-LABEL: func.func @_QPdo_simd_order() {
 subroutine do_simd_order
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
   !$omp do simd order(concurrent)
   do i = 1, 10
   end do
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
   !$omp do simd order(reproducible:concurrent)
   do i = 1, 10
   end do
-   !CHECK: omp.wsloop order(unconstrained:concurrent) {
+   !CHECK: omp.wsloop order(unconstrained:concurrent)
   !$omp do simd order(unconstrained:concurrent)
   do i = 1, 10
   end do
@@ -53,7 +53,7 @@ end subroutine do_simd_order
 !CHECK-LABEL: func.func @_QPdo_simd_order_parallel() {
 subroutine do_simd_order_parallel
   !CHECK: omp.parallel {
-   !CHECK: omp.wsloop order(reproducible:concurrent) {
+   !CHECK: omp.wsloop order(reproducible:concurrent)
   !$omp parallel do simd order(reproducible:concurrent)
   do i = 1, 10
   end do
diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90
index 03e35de04cace..b18bc29efb230 100644
--- a/flang/test/Lower/OpenMP/wsloop-simd.f90
+++ b/flang/test/Lower/OpenMP/wsloop-simd.f90
@@ -71,16 +71,13 @@ end subroutine do_simd_reduction
 subroutine do_simd_private()
   integer, allocatable :: tmp
   ! CHECK: omp.wsloop
+  ! CHECK-SAME: private(@[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref<i32>)
   ! CHECK-NEXT: omp.simd
-  ! CHECK-SAME: private(@[[PRIV_BOX_SYM:.*]] %{{.*}} -> %[[PRIV_BOX:.*]], @[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref<!fir.box<!fir.heap<i32>>>, !fir.ref<i32>)
   ! CHECK-NEXT: omp.loop_nest (%[[IVAR:.*]]) : i32
   !$omp do simd private(tmp)
   do i=1, 10
-    ! CHECK: %[[PRIV_BOX_DECL:.*]]:2 = hlfir.declare %[[PRIV_BOX]]
     ! CHECK: %[[PRIV_IVAR_DECL:.*]]:2 = hlfir.declare %[[PRIV_IVAR]]
     ! CHECK: hlfir.assign %[[IVAR]] to %[[PRIV_IVAR_DECL]]#0
-    ! CHECK: %[[PRIV_BOX_LOAD:.*]] = fir.load %[[PRIV_BOX_DECL]]
-    ! CHECK: hlfir.assign %{{.*}} to %[[PRIV_BOX_DECL]]#0
     ! CHECK: omp.yield
     tmp = tmp + 1
   end do
@@ -90,13 +87,11 @@ end subroutine do_simd_private
 subroutine do_simd_lastprivate_firstprivate()
   integer :: a
   ! CHECK: omp.wsloop
-  ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]] : !fir.ref<i32>)
+  ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
   ! CHECK-NEXT: omp.simd
-  ! CHECK-SAME: private(@[[PRIVATE_A_SYM:.*]] %{{.*}} -> %[[PRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref<i32>, !fir.ref<i32>)
   !$omp do simd lastprivate(a) firstprivate(a)
   do i = 1, 10
     ! CHECK: %[[FIRSTPRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIVATE_A]]
-    ! CHECK: %[[PRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_A]]
     ! CHECK: %[[PRIVATE_I_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_I]]
     a = a + 1
   end do

From b0bd8bdbd89701173db0d446757aad1ad166f08d Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 9 Dec 2025 14:56:02 +0100
Subject: [PATCH 02/63] [AtomicExpand] Use getSigned() for negative value

---
 llvm/lib/CodeGen/AtomicExpandPass.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d9bc042d6807e..d19862ad7c188 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -853,8 +853,8 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
   if (AddrAlign < MinWordSize) {
     PMV.AlignedAddr = Builder.CreateIntrinsic(
         Intrinsic::ptrmask, {PtrTy, IntTy},
-        {Addr, ConstantInt::get(IntTy, ~(uint64_t)(MinWordSize - 1))}, nullptr,
-        "AlignedAddr");
+        {Addr, ConstantInt::getSigned(IntTy, ~(uint64_t)(MinWordSize - 1))},
+        nullptr, "AlignedAddr");

     Value *AddrInt = Builder.CreatePtrToInt(Addr, IntTy);
     PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");

From 80fc9bc17715cb4e68ec233ae1db668ac321e777 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 9 Dec 2025 15:20:08 +0100
Subject: [PATCH 03/63] [Hexagon] Use getSigned() for signed value

---
 llvm/lib/Target/Hexagon/HexagonISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index bae9d705f5a7a..025e5b087abed 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2527,7 +2527,7 @@ HexagonTargetLowering::getBuildVectorConstInts(ArrayRef<SDValue> Values,
   // Make sure to always cast to IntTy.
   if (auto *CN = dyn_cast<ConstantSDNode>(V.getNode())) {
     const ConstantInt *CI = CN->getConstantIntValue();
-    Consts[i] = ConstantInt::get(IntTy, CI->getValue().getSExtValue());
+    Consts[i] = ConstantInt::getSigned(IntTy, CI->getValue().getSExtValue());
   } else if (auto *CN = dyn_cast<ConstantFPSDNode>(V.getNode())) {
     const ConstantFP *CF = CN->getConstantFPValue();
     APInt A = CF->getValueAPF().bitcastToAPInt();

From cf9ba401e3fb2a9c3c728f1a0f49e75db372e704 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 9 Dec 2025 14:38:28 +0100
Subject: [PATCH 04/63] [BypassSlowDivision] Explicitly create bit mask

Explicitly create the high bit mask using getBitsSetFrom() instead of
inverting an integer. This avoids relying on implicit truncation.

---
 llvm/lib/Transforms/Utils/BypassSlowDivision.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 9f6d89e97180f..66d8fea251cbd 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -335,10 +335,10 @@ Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
   else
     OrV = Op1 ? Op1 : Op2;

-  // BitMask is inverted to check if the operands are
-  // larger than the bypass type
-  uint64_t BitMask = ~BypassType->getBitMask();
-  Value *AndV = Builder.CreateAnd(OrV, BitMask);
+  // Check whether the operands are larger than the bypass type.
+  Value *AndV = Builder.CreateAnd(
+      OrV, APInt::getBitsSetFrom(OrV->getType()->getIntegerBitWidth(),
+                                 BypassType->getBitWidth()));

   // Compare operand values
   Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);

From 005ef5cda00d4a5a03c9b7ebd2113b5f2314da89 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Tue, 9 Dec 2025 10:14:32 -0500
Subject: [PATCH 05/63] [libc][CI] update macOS version in workflow configuration (#171228)

Upgrade the macOS version to the latest stable version in the GitHub
Actions workflow. We ran into a problem where the timed `os_sync` API
only becomes available in 14.4+.

---
 .github/workflows/libc-overlay-tests.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/libc-overlay-tests.yml b/.github/workflows/libc-overlay-tests.yml
index 807377564fa13..6bb01d502050e 100644
--- a/.github/workflows/libc-overlay-tests.yml
+++ b/.github/workflows/libc-overlay-tests.yml
@@ -16,7 +16,7 @@ jobs:
       # Set fail-fast to false to ensure that feedback is delivered for all matrix combinations.
      fail-fast: false
      matrix:
-       os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-14]
+       os: [ubuntu-24.04, ubuntu-24.04-arm, windows-2022, windows-2025, macos-15]
        include:
          # TODO: add linux gcc when it is fixed
          - os: ubuntu-24.04
            compiler:
              c_compiler: clang
              cpp_compiler: clang++
@@ -35,7 +35,7 @@ jobs:
            compiler:
              c_compiler: clang-cl
              cpp_compiler: clang-cl
-         - os: macos-14
+         - os: macos-15
            compiler:
              c_compiler: clang
              cpp_compiler: clang++

From 6960b633ee7633d0ed7e9853baea296dbe201ab2 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 9 Dec 2025 12:37:42 +0100
Subject: [PATCH 06/63] [LSR] Use getSigned() for negated immediate

---
 llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 63b228efe3b11..68cffd4c18688 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -5805,7 +5805,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
     // negated immediate.
     if (!ICmpScaledV)
       ICmpScaledV =
-          ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue());
+          ConstantInt::getSigned(IntTy, -(uint64_t)Offset.getFixedValue());
     else {
       Ops.push_back(SE.getUnknown(ICmpScaledV));
       ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue());

From 4afc92e43a1462df613e26997f64cb368d552648 Mon Sep 17 00:00:00 2001
From: Jessica Clarke
Date: Tue, 9 Dec 2025 15:26:09 +0000
Subject: [PATCH 07/63] [NFC][ELF] Remove pointless NEEDS_TLSGD_TO_IE (#171046)

NEEDS_TLSGD_TO_IE is only ever set when the symbol is preemptible, in
which case addTpOffsetGotEntry will just add the symbol to the GOT and
emit a symbolic tlsGotRel anyway, so there is no need to give it its own
special case. As well as simplifying the code upstream, this is useful
downstream for Morello, which doesn't really have a proper GD/IE-to-LE
relaxation, and so for GD-to-IE can benefit from being able to use the
optimisations addTpOffsetGotEntry has for non-preemptible symbols, rather
than having to reimplement them here.

---
 lld/ELF/Relocations.cpp | 9 ++-------
 lld/ELF/Symbols.h       | 4 ++--
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp
index ef19a2af0c4d2..59aa43036ce01 100644
--- a/lld/ELF/Relocations.cpp
+++ b/lld/ELF/Relocations.cpp
@@ -1295,7 +1295,7 @@ unsigned RelocScan::handleTlsRelocation(RelExpr expr, RelType type,
   // label, so TLSDESC=>IE will be categorized as R_RELAX_TLS_GD_TO_LE. We fix
  // the categorization in RISCV::relocateAlloc.
   if (sym.isPreemptible) {
-    sym.setFlags(NEEDS_TLSGD_TO_IE);
+    sym.setFlags(NEEDS_TLSIE);
     sec->addReloc({ctx.target->adjustTlsExpr(type, R_RELAX_TLS_GD_TO_IE),
                    type, offset, addend, &sym});
   } else {
@@ -1635,18 +1635,13 @@ void elf::postScanRelocations(Ctx &ctx) {
       else
         got->addConstant({R_ABS, ctx.target->tlsOffsetRel, offsetOff, 0, &sym});
     }
-    if (flags & NEEDS_TLSGD_TO_IE) {
-      got->addEntry(sym);
-      ctx.mainPart->relaDyn->addSymbolReloc(ctx.target->tlsGotRel, *got,
-                                            sym.getGotOffset(ctx), sym);
-    }
     if (flags & NEEDS_GOT_DTPREL) {
       got->addEntry(sym);
       got->addConstant(
           {R_ABS, ctx.target->tlsOffsetRel, sym.getGotOffset(ctx), 0, &sym});
     }

-    if ((flags & NEEDS_TLSIE) && !(flags & NEEDS_TLSGD_TO_IE))
+    if (flags & NEEDS_TLSIE)
       addTpOffsetGotEntry(ctx, sym);
   };

diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h
index a7d61f48ed3d5..034c8734addb8 100644
--- a/lld/ELF/Symbols.h
+++ b/lld/ELF/Symbols.h
@@ -48,7 +48,7 @@ enum {
   NEEDS_COPY = 1 << 3,
   NEEDS_TLSDESC = 1 << 4,
   NEEDS_TLSGD = 1 << 5,
-  NEEDS_TLSGD_TO_IE = 1 << 6,
+  // 1 << 6 unused
   NEEDS_GOT_DTPREL = 1 << 7,
   NEEDS_TLSIE = 1 << 8,
   NEEDS_GOT_AUTH = 1 << 9,
@@ -352,7 +352,7 @@ class Symbol {
   bool needsDynReloc() const {
     return flags.load(std::memory_order_relaxed) &
            (NEEDS_COPY | NEEDS_GOT | NEEDS_PLT | NEEDS_TLSDESC | NEEDS_TLSGD |
-            NEEDS_TLSGD_TO_IE | NEEDS_GOT_DTPREL | NEEDS_TLSIE);
+            NEEDS_GOT_DTPREL | NEEDS_TLSIE);
   }
   void allocateAux(Ctx &ctx) {
     assert(auxIdx == 0);

From 51d928f0cff426c70d368afd09afd14cf9a67dfe Mon Sep 17 00:00:00 2001
From: Gergely Bálint
Date: Tue, 9 Dec 2025 16:36:13 +0100
Subject: [PATCH 08/63] [BOLT] Fix pacret-synchronous-unwind.cpp test (#171395)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The test case builds a binary from C++ and checks for the number of
functions the PointerAuthCFIFixup pass runs on. This can change based on
the platform. To account for this, the patch changes the number to a
regex. The test failed when running on RHEL 9.

---
 .../runtime/AArch64/pacret-synchronous-unwind.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
index 1bfeeaed3715a..0f5e9a38da2ba 100644
--- a/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
+++ b/bolt/test/runtime/AArch64/pacret-synchronous-unwind.cpp
@@ -11,12 +11,15 @@
 // RUN:   -fno-asynchronous-unwind-tables \
 // RUN:   %s -o %t.exe -Wl,-q
 // RUN: llvm-bolt %t.exe -o %t.bolt | FileCheck %s --check-prefix=CHECK
-//
-// CHECK: PointerAuthCFIAnalyzer ran on 3 functions. Ignored
-// CHECK-NOT: 0 functions (0.00%) because of CFI inconsistencies
-// CHECK-SAME: 1 functions (33.33%) because of CFI inconsistencies
-// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports asynchronous
-// CHECK-SAME: unwind tables. For C compilers, see -fasynchronous-unwind-tables.
+
+// Number of functions with .cfi-negate-ra-state in the binary is
+// platform-dependent.
+// CHECK: BOLT-INFO: PointerAuthCFIAnalyzer ran on {{[0-9]+}} functions.
+// CHECK-SAME: Ignored {{[0-9]}} functions ({{[0-9.]+}}%) because of CFI
+// CHECK-SAME: inconsistencies
+// CHECK-NEXT: BOLT-WARNING: PointerAuthCFIAnalyzer only supports
+// CHECK-SAME: asynchronous unwind tables. For C compilers, see
+// CHECK-SAME: -fasynchronous-unwind-tables.
#include #include From b2ddb909cfd410ed22fee79511bd09e1ba3d9829 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Tue, 9 Dec 2025 16:37:32 +0100 Subject: [PATCH 09/63] [libc++] Don't try to be compatible with libstdc++ in __libcpp_refstring on iOS (#170816) iOS doesn't provide a libstdc++ dylib anymore, so we can remove the compatiblity check whether we can load the dylib. --- libcxx/src/include/refstring.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/src/include/refstring.h b/libcxx/src/include/refstring.h index 3e0ec7a97c7be..1c73c60f9ced1 100644 --- a/libcxx/src/include/refstring.h +++ b/libcxx/src/include/refstring.h @@ -15,7 +15,7 @@ #include #include -// MacOS and iOS used to ship with libstdc++, and still support old applications +// MacOS used to ship with libstdc++, and still support old applications // linking against libstdc++. The libc++ and libstdc++ exceptions are supposed // to be ABI compatible, such that they can be thrown from one library and caught // in the other. @@ -25,7 +25,7 @@ // string singleton before manipulating the reference count. This is done so that // if an exception is created with a zero-length string in libstdc++, libc++abi // won't try to delete the memory. -#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) || defined(__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__) +#if defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) # define _LIBCPP_CHECK_FOR_GCC_EMPTY_STRING_STORAGE # include # include From 6b58449b2c813588f9c251d4f0963762d6746881 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 9 Dec 2025 10:43:47 -0500 Subject: [PATCH 10/63] Update the NATVIS file ElaboratedType is no longer a thing. --- clang/utils/ClangVisualizers/clang.natvis | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis index 3ecd93902d1bb..0755f0ffcbf56 100644 --- a/clang/utils/ClangVisualizers/clang.natvis +++ b/clang/utils/ClangVisualizers/clang.natvis @@ -44,9 +44,6 @@ For later versions of Visual Studio, no setup is required--> {(clang::DecayedType *)this,na} {(clang::DecayedType *)this,view(left)na} {(clang::DecayedType *)this,view(right)na} - {(clang::ElaboratedType *)this,na} - {(clang::ElaboratedType *)this,view(left)na} - {(clang::ElaboratedType *)this,view(right)na} {*(clang::TemplateTypeParmType *)this} {*(clang::TemplateTypeParmType *)this,view(cpp)} {*(clang::SubstTemplateTypeParmType *)this} @@ -94,7 +91,6 @@ For later versions of Visual Studio, no setup is required--> (clang::IncompleteArrayType *)this *(clang::AttributedType *)this (clang::DecayedType *)this - (clang::ElaboratedType *)this (clang::TemplateTypeParmType *)this (clang::SubstTemplateTypeParmType *)this (clang::RecordType *)this @@ -428,16 +424,6 @@ For later versions of Visual Studio, no setup is required--> (clang::AdjustedType *)this - - {NamedType,view(left)} - {NamedType,view(right)} - {NamedType} - - (clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword - NNS - NamedType,view(cmn) - - {TTPDecl->Name,view(cpp)} Non-canonical: {*TTPDecl} From b3a5870c64c94c361b4144333066382aac0b6dc9 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 9 Dec 2025 15:58:06 +0000 Subject: [PATCH 11/63] [llvm][docs] Add a release note for LLDB "version -v" Added by #170772. 
---
 llvm/docs/ReleaseNotes.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index 22d5b4183fac0..8ec46c661974b 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -253,6 +253,10 @@ Changes to LLDB
   LLVM's PDB and CodeView support. You can switch back to the DIA reader with
   `settings set plugin.symbol-file.pdb.reader dia`. Note that support for the
   DIA reader will be removed in a future version of LLDB.
+* A `--verbose` option was added to the `version` command. When `--verbose` is used,
+  LLDB's build configuration is included in the command's output. This includes
+  all the supported targets, along with the presence of (or lack of) optional
+  features like XML parsing.

 Changes to BOLT
 ---------------------------------

From c66eb25459279ce7663bcd51eabaeeffafae0366 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Tue, 9 Dec 2025 16:59:24 +0100
Subject: [PATCH 12/63] [OCaml] Fix build

Fix a mistake introduced in
https://github.com/llvm/llvm-project/pull/163979: We should stick with
the deprecated LLVMGetGlobalContext() API in this file, as
getGlobalContextForCAPI() is a C++ API that is not available here.

---
 llvm/bindings/ocaml/llvm/llvm_ocaml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/bindings/ocaml/llvm/llvm_ocaml.c b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
index 53158c88be19c..a2d948033724c 100644
--- a/llvm/bindings/ocaml/llvm/llvm_ocaml.c
+++ b/llvm/bindings/ocaml/llvm/llvm_ocaml.c
@@ -240,7 +240,7 @@ value llvm_dispose_context(value C) {

 /* unit -> llcontext */
 value llvm_global_context(value Unit) {
-  return to_val(getGlobalContextForCAPI());
+  return to_val(LLVMGetGlobalContext());
 }

 /* llcontext -> string -> int */

From 7f2bbba60dce67c1eb8cc8b0633d04ca0adfff62 Mon Sep 17 00:00:00 2001
From: valadaptive <79560998+valadaptive@users.noreply.github.com>
Date: Tue, 9 Dec 2025 11:11:26 -0500
Subject: [PATCH 13/63] [AArch64][ARM] Optimize more `tbl`/`tbx` calls into `shufflevector` (#169748)

Resolves #169701.

This PR extends the existing InstCombine operation which folds `tbl1`
intrinsics to `shufflevector` if the mask operand is constant. Before this
change, it only handled 64-bit `tbl1` intrinsics with no out-of-bounds
indices. I've extended it to support both 64-bit and 128-bit vectors, and
it now handles the full range of `tbl1`-`tbl4` and `tbx1`-`tbx4`, as long
as at most two of the input operands are actually indexed into.

For the purposes of `tbl`, we need a dummy vector of zeroes if there are
any out-of-bounds indices, and for the purposes of `tbx`, we use the
"fallback" operand. Both of those take up an operand for the purposes of
`shufflevector`.

This works a lot like https://github.com/llvm/llvm-project/pull/169110,
with some added complexity because we need to handle multiple operands. I
raised a couple of questions in that PR that still need to be answered:

- Is it correct to check `isa<UndefValue>` for each mask index, and set
  the output mask index to -1 if so? This is later folded to a poison
  value, and I'm not sure about the subtle differences between poison and
  undef and when you can substitute one for the other. As I mentioned in
  #169110, the existing x86 pass (`simplifyX86vpermilvar`) already behaves
  this way when it comes to undef.
- How can I write an Alive2 proof for this? It's very hard to find good
  documentation or tutorials about Alive2.

As with #169110, most of the regression test cases were generated using
Claude. Everything else was written by me.
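To make the rewrite concrete, here is a small hand-written sketch (not one
of the committed test cases; the mask values are invented for illustration).
A `tbl2` call whose two table operands are the same value references only
one unique source, so the fold can express it as a single `shufflevector`,
with indices into the duplicated second operand remapped onto the first:

```llvm
; Both table registers are %a, so only one unique source is referenced.
%r = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(
    <16 x i8> %a, <16 x i8> %a,
    <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0,
               i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>)

; Indices 16-23 select from the second table register, which is also %a, so
; they are remapped to 0-7 and the whole call collapses to one shuffle.
%r2 = shufflevector <16 x i8> %a, <16 x i8> poison,
    <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0,
                i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
```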
---
 .../InstCombine/InstCombineCalls.cpp          | 136 +++++++--
 .../Transforms/InstCombine/AArch64/tbl.ll     | 261 ++++++++++++++++++
 .../Transforms/InstCombine/AArch64/tbl1.ll    |  65 -----
 llvm/test/Transforms/InstCombine/ARM/tbl.ll   | 215 +++++++++++++++
 llvm/test/Transforms/InstCombine/ARM/tbl1.ll  |  35 ---
 5 files changed, 589 insertions(+), 123 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/AArch64/tbl1.ll
 create mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl.ll
 delete mode 100644 llvm/test/Transforms/InstCombine/ARM/tbl1.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 8e7282a4ffefe..85602a5a7575a 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -737,42 +737,119 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   return nullptr;
 }

-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
-                               InstCombiner::BuilderTy &Builder) {
+/// Convert `tbl`/`tbx` intrinsics to shufflevector if the mask is constant, and
+/// at most two source operands are actually referenced.
+static Instruction *simplifyNeonTbl(IntrinsicInst &II, InstCombiner &IC,
+                                    bool IsExtension) {
   // Bail out if the mask is not a constant.
-  auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+  auto *C = dyn_cast<Constant>(II.getArgOperand(II.arg_size() - 1));
   if (!C)
     return nullptr;

-  auto *VecTy = cast<FixedVectorType>(II.getType());
-  unsigned NumElts = VecTy->getNumElements();
+  auto *RetTy = cast<FixedVectorType>(II.getType());
+  unsigned NumIndexes = RetTy->getNumElements();

-  // Only perform this transformation for <8 x i8> vector types.
-  if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+  // Only perform this transformation for <8 x i8> and <16 x i8> vector types.
+  if (!RetTy->getElementType()->isIntegerTy(8) ||
+      (NumIndexes != 8 && NumIndexes != 16))
     return nullptr;

-  int Indexes[8];
+  // For tbx instructions, the first argument is the "fallback" vector, which
+  // has the same length as the mask and return type.
+  unsigned int StartIndex = (unsigned)IsExtension;
+  auto *SourceTy =
+      cast<FixedVectorType>(II.getArgOperand(StartIndex)->getType());
+  // Note that the element count of each source vector does *not* need to be the
+  // same as the element count of the return type and mask! All source vectors
+  // must have the same element count as each other, though.
+  unsigned NumElementsPerSource = SourceTy->getNumElements();
+
+  // There are no tbl/tbx intrinsics for which the destination size exceeds the
+  // source size. However, our definitions of the intrinsics, at least in
+  // IntrinsicsAArch64.td, allow for arbitrary destination vector sizes, so it
+  // *could* technically happen.
+  if (NumIndexes > NumElementsPerSource)
+    return nullptr;
+
+  // The tbl/tbx intrinsics take several source operands followed by a mask
+  // operand.
+  unsigned int NumSourceOperands = II.arg_size() - 1 - (unsigned)IsExtension;

-  for (unsigned I = 0; I < NumElts; ++I) {
+  // Map input operands to shuffle indices. This also helpfully deduplicates the
+  // input arguments, in case the same value is passed as an argument multiple
+  // times.
+  SmallDenseMap<Value *, unsigned, 2> ValueToShuffleSlot;
+  Value *ShuffleOperands[2] = {PoisonValue::get(SourceTy),
+                               PoisonValue::get(SourceTy)};
+
+  int Indexes[16];
+  for (unsigned I = 0; I < NumIndexes; ++I) {
     Constant *COp = C->getAggregateElement(I);
-    if (!COp || !isa<ConstantInt>(COp))
+    if (!COp || (!isa<ConstantInt>(COp) && !isa<UndefValue>(COp)))
       return nullptr;

-    Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+    if (isa<UndefValue>(COp)) {
+      Indexes[I] = -1;
+      continue;
+    }
+
+    uint64_t Index = cast<ConstantInt>(COp)->getZExtValue();
+    // The index of the input argument that this index references (0 = first
+    // source argument, etc).
+    unsigned SourceOperandIndex = Index / NumElementsPerSource;
+    // The index of the element at that source operand.
+    unsigned SourceOperandElementIndex = Index % NumElementsPerSource;
+
+    Value *SourceOperand;
+    if (SourceOperandIndex >= NumSourceOperands) {
+      // This index is out of bounds. Map it to index into either the fallback
+      // vector (tbx) or vector of zeroes (tbl).
+      SourceOperandIndex = NumSourceOperands;
+      if (IsExtension) {
+        // For out-of-bounds indices in tbx, choose the `I`th element of the
+        // fallback.
+        SourceOperand = II.getArgOperand(0);
+        SourceOperandElementIndex = I;
+      } else {
+        // Otherwise, choose some element from the dummy vector of zeroes (we'll
+        // always choose the first).
+        SourceOperand = Constant::getNullValue(SourceTy);
+        SourceOperandElementIndex = 0;
+      }
+    } else {
+      SourceOperand = II.getArgOperand(SourceOperandIndex + StartIndex);
+    }
+
+    // The source operand may be the fallback vector, which may not have the
+    // same number of elements as the source vector. In that case, we *could*
+    // choose to extend its length with another shufflevector, but it's simpler
+    // to just bail instead.
+    if (cast<FixedVectorType>(SourceOperand->getType())->getNumElements() !=
+        NumElementsPerSource)
      return nullptr;

-    // Make sure the mask indices are in range.
-    if ((unsigned)Indexes[I] >= NumElts)
+    // We now know the source operand referenced by this index. Make it a
+    // shufflevector operand, if it isn't already.
+    unsigned NumSlots = ValueToShuffleSlot.size();
+    // This shuffle references more than two sources, and hence cannot be
+    // represented as a shufflevector.
+    if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand))
+ if (NumSlots == 2 && !ValueToShuffleSlot.contains(SourceOperand)) return nullptr; + + auto [It, Inserted] = + ValueToShuffleSlot.try_emplace(SourceOperand, NumSlots); + if (Inserted) + ShuffleOperands[It->getSecond()] = SourceOperand; + + unsigned RemappedIndex = + (It->getSecond() * NumElementsPerSource) + SourceOperandElementIndex; + Indexes[I] = RemappedIndex; } - auto *V1 = II.getArgOperand(0); - auto *V2 = Constant::getNullValue(V1->getType()); - return Builder.CreateShuffleVector(V1, V2, ArrayRef(Indexes)); + Value *Shuf = IC.Builder.CreateShuffleVector( + ShuffleOperands[0], ShuffleOperands[1], ArrayRef(Indexes, NumIndexes)); + return IC.replaceInstUsesWith(II, Shuf); } // Returns true iff the 2 intrinsics have the same operands, limiting the @@ -3167,10 +3244,23 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { return CallInst::Create(NewFn, CallArgs); } case Intrinsic::arm_neon_vtbl1: + case Intrinsic::arm_neon_vtbl2: + case Intrinsic::arm_neon_vtbl3: + case Intrinsic::arm_neon_vtbl4: case Intrinsic::aarch64_neon_tbl1: - if (Value *V = simplifyNeonTbl1(*II, Builder)) - return replaceInstUsesWith(*II, V); - break; + case Intrinsic::aarch64_neon_tbl2: + case Intrinsic::aarch64_neon_tbl3: + case Intrinsic::aarch64_neon_tbl4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/false); + case Intrinsic::arm_neon_vtbx1: + case Intrinsic::arm_neon_vtbx2: + case Intrinsic::arm_neon_vtbx3: + case Intrinsic::arm_neon_vtbx4: + case Intrinsic::aarch64_neon_tbx1: + case Intrinsic::aarch64_neon_tbx2: + case Intrinsic::aarch64_neon_tbx3: + case Intrinsic::aarch64_neon_tbx4: + return simplifyNeonTbl(*II, *this, /*IsExtension=*/true); case Intrinsic::arm_neon_vmulls: case Intrinsic::arm_neon_vmullu: diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll new file mode 100644 index 0000000000000..8a9ca6ce635a3 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/tbl.ll @@ -0,0 +1,261 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64" + +; We can turn a tbl/tbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic tbl1 with all in-bounds indices should optimize to shufflevector. +define <16 x i8> @tbl1_basic(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with both operands the same should optimize (1 unique source). +define <16 x i8> @tbl2_duplicate_operands(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing 2 unique operands should optimize. 
+define <16 x i8> @tbl3_two_sources(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 with alternating duplicate operands should optimize (2 unique sources). +define <16 x i8> @tbl4_duplicate_operands(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references first two operands should optimize. +define <16 x i8> @tbl4_unused_operands(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 where mask only references one operand should optimize. +define <16 x i8> @tbl4_single_operand_used(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). +define <16 x i8> @tbl1_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <16 x i8> @tbl2_duplicate_with_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> , <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <16 x i8> @tbl2_with_oob_bail(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl1 with all OOB indices should optimize to zero vector. +define <16 x i8> @tbl1_all_oob(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_oob( +; CHECK-NEXT: ret <16 x i8> zeroinitializer +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl3 referencing all 3 operands should NOT optimize. 
+define <16 x i8> @tbl3_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing 3 unique operands should NOT optimize. +define <16 x i8> @tbl4_three_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) { +; CHECK-LABEL: @tbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[A]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbl4 referencing all 4 unique operands should NOT optimize. +define <16 x i8> @tbl4_four_sources_bail(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) { +; CHECK-LABEL: @tbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> [[C:%.*]], <16 x i8> [[D:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d, <16 x i8> ) + ret <16 x i8> %tbl +} + +; tbx1 with no OOB should optimize. +define <16 x i8> @tbx1_no_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 where fallback == second source operand should optimize (deduplicated). +define <16 x i8> @tbx2_fallback_equals_second_source(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %b, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB where fallback == source should optimize (deduplicated). +define <16 x i8> @tbx1_oob_fallback_same_as_source(<16 x i8> %a) { +; CHECK-LABEL: @tbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[A:%.*]] = shufflevector <16 x i8> [[A1:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[A]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %a, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <16 x i8> @tbx2_with_oob_bail(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: @tbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TBX]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> %b, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with all OOB indices should optimize to fallback. 
+define <16 x i8> @tbx1_all_oob(<16 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_all_oob( +; CHECK-NEXT: ret <16 x i8> [[FALLBACK:%.*]] +; + %tbx = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> %fallback, <16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbx +} + +; tbx1 with OOB and mismatched fallback/source sizes should NOT optimize. +define <8 x i8> @tbx1_fallback_size_mismatch(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[FALLBACK:%.*]], <16 x i8> [[A:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbx1 with no OOB and mismatched fallback/source sizes should optimize. +define <8 x i8> @tbx1_fallback_size_mismatch_no_oob(<8 x i8> %fallback, <16 x i8> %a) { +; CHECK-LABEL: @tbx1_fallback_size_mismatch_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> %fallback, <16 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; tbl1 with non-i8 element type should NOT optimize. +define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_8x16( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TBL1]] +; +entry: + ; `tbl1.v8i16` is not really a thing, but it's good to check. + %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) + ret <8 x i16> %tbl1 +} + +; tbl1 with non-8/16 element count should NOT optimize. +define <12 x i8> @tbl1_16x8(<16 x i8> %vec) { +; CHECK-LABEL: @tbl1_16x8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TBL1:%.*]] = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> [[VEC:%.*]], <12 x i8> ) +; CHECK-NEXT: ret <12 x i8> [[TBL1]] +; +entry: + ; `tbl1.v12i8` is not really a thing, but it's good to check. + %tbl1 = call <12 x i8> @llvm.aarch64.neon.tbl1.v12i8(<16 x i8> %vec, <12 x i8> ) + ret <12 x i8> %tbl1 +} + +; Non-constant mask should NOT optimize. +define <16 x i8> @tbl1_non_constant_mask(<16 x i8> %a, <16 x i8> %mask) { +; CHECK-LABEL: @tbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A:%.*]], <16 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <16 x i8> [[TBL]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> %mask) + ret <16 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <16 x i8> @tbl1_poison_mask_elements(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[A:%.*]], <16 x i8> poison, <16 x i32> +; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> ) + ret <16 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. 
+define <16 x i8> @tbl1_all_poison_mask(<16 x i8> %a) { +; CHECK-LABEL: @tbl1_all_poison_mask( +; CHECK-NEXT: ret <16 x i8> poison +; + %tbl = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %a, <16 x i8> poison) + ret <16 x i8> %tbl +} diff --git a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll b/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll deleted file mode 100644 index 362cc0f6c4493..0000000000000 --- a/llvm/test/Transforms/InstCombine/AArch64/tbl1.ll +++ /dev/null @@ -1,65 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. - -define <8 x i8> @tbl1_8x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[VEC:%.*]], <16 x i8> poison, <8 x i32> -; CHECK-NEXT: ret <8 x i8> [[TMP0]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if a mask index is out of range. -define <8 x i8> @tbl1_8x8_out_of_range(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x8_out_of_range( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VEC:%.*]], <8 x i8> ) -; CHECK-NEXT: ret <8 x i8> [[TBL1]] -; -entry: - %tbl1 = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> %vec, <8 x i8> ) - ret <8 x i8> %tbl1 -} - -; Bail the optimization if the size of the return vector is not 8 elements. -define <16 x i8> @tbl1_16x8(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_16x8( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[VEC:%.*]], <16 x i8> ) -; CHECK-NEXT: ret <16 x i8> [[TBL1]] -; -entry: - %tbl1 = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> %vec, <16 x i8> ) - ret <16 x i8> %tbl1 -} - -; Bail the optimization if the elements of the return vector are not of type i8. -define <8 x i16> @tbl1_8x16(<16 x i8> %vec) { -; CHECK-LABEL: @tbl1_8x16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TBL1:%.*]] = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> [[VEC:%.*]], <8 x i16> ) -; CHECK-NEXT: ret <8 x i16> [[TBL1]] -; -entry: - %tbl1 = call <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8> %vec, <8 x i16> ) - ret <8 x i16> %tbl1 -} - -; The type <8 x i16> is not a valid return type for this intrinsic, -; but we want to test that the optimization won't trigger for vector -; elements of type different than i8. 
-declare <8 x i16> @llvm.aarch64.neon.tbl1.v8i16(<16 x i8>, <8 x i16>) - -declare <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8>, <8 x i8>) -declare <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8>, <16 x i8>) diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl.ll b/llvm/test/Transforms/InstCombine/ARM/tbl.ll new file mode 100644 index 0000000000000..d4d5ec284d0b7 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ARM/tbl.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8-arm-none-eabi" + +; We can turn a vtbl/vtbx intrinsic into a shufflevector instruction if the mask +; is constant and references 2 or fewer operands. + +; Basic vtbl1 with all in-bounds indices should optimize to shufflevector. +define <8 x i8> @vtbl1_basic(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_basic( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with both operands the same should be optimized (1 unique source). +define <8 x i8> @vtbl2_duplicate_operands(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing 2 unique operands should optimize. +define <8 x i8> @vtbl3_two_sources(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl3_two_sources( +; CHECK-NEXT: [[TBL:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 with alternating duplicate operands should optimize (2 unique sources). +define <8 x i8> @vtbl4_duplicate_operands(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl4_duplicate_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references first two operands should optimize. +define <8 x i8> @vtbl4_unused_operands(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_unused_operands( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 where mask only references one operand should optimize. +define <8 x i8> @vtbl4_single_operand_used(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_single_operand_used( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with some OOB indices should optimize (1 source + zero vector = 2 sources). 
+define <8 x i8> @vtbl1_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with duplicate operands and OOB should optimize (1 unique source + zero vector = 2 sources). +define <8 x i8> @vtbl2_duplicate_with_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl2_duplicate_with_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> , <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl2 with OOB indices should NOT optimize (2 sources + zero vector = 3 sources). +define <8 x i8> @vtbl2_with_oob_bail(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbl2_with_oob_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl1 with all OOB indices should optimize to zero vector. +define <8 x i8> @vtbl1_all_oob(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_oob( +; CHECK-NEXT: ret <8 x i8> zeroinitializer +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl3 referencing all 3 operands should NOT optimize. +define <8 x i8> @vtbl3_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl3_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing 3 unique operands should NOT optimize. +define <8 x i8> @vtbl4_three_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) { +; CHECK-LABEL: @vtbl4_three_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[A]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbl4 referencing all 4 unique operands should NOT optimize. +define <8 x i8> @vtbl4_four_sources_bail(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) { +; CHECK-LABEL: @vtbl4_four_sources_bail( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> [[C:%.*]], <8 x i8> [[D:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl4(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d, <8 x i8> ) + ret <8 x i8> %tbl +} + +; vtbx1 with no OOB should optimize. +define <8 x i8> @vtbx1_no_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_no_oob( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 where fallback == second source operand should optimize (deduplicated). 
+define <8 x i8> @vtbx2_fallback_equals_second_source(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_fallback_equals_second_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %b, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with OOB where fallback == source should optimize (deduplicated). +define <8 x i8> @vtbx1_oob_fallback_same_as_source(<8 x i8> %a) { +; CHECK-LABEL: @vtbx1_oob_fallback_same_as_source( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %a, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx2 with OOB should NOT optimize (2 sources + fallback = 3 sources). +define <8 x i8> @vtbx2_with_oob_bail(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: @vtbx2_with_oob_bail( +; CHECK-NEXT: [[TBX:%.*]] = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> [[FALLBACK:%.*]], <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> ) +; CHECK-NEXT: ret <8 x i8> [[TBX]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx2(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> %b, <8 x i8> ) + ret <8 x i8> %tbx +} + +; vtbx1 with all OOB indices should optimize to fallback. +define <8 x i8> @vtbx1_all_oob(<8 x i8> %fallback, <8 x i8> %a) { +; CHECK-LABEL: @vtbx1_all_oob( +; CHECK-NEXT: ret <8 x i8> [[FALLBACK:%.*]] +; + %tbx = call <8 x i8> @llvm.arm.neon.vtbx1(<8 x i8> %fallback, <8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbx +} + +; Non-constant mask should NOT optimize. +define <8 x i8> @vtbl1_non_constant_mask(<8 x i8> %a, <8 x i8> %mask) { +; CHECK-LABEL: @vtbl1_non_constant_mask( +; CHECK-NEXT: [[TBL:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[A:%.*]], <8 x i8> [[MASK:%.*]]) +; CHECK-NEXT: ret <8 x i8> [[TBL]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> %mask) + ret <8 x i8> %tbl +} + +; Mask with some poison elements should optimize, with poison propagating to output. +define <8 x i8> @vtbl1_poison_mask_elements(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_poison_mask_elements( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <8 x i32> +; CHECK-NEXT: ret <8 x i8> [[TMP1]] +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> ) + ret <8 x i8> %tbl +} + +; Mask with all poison elements should optimize to poison. +define <8 x i8> @vtbl1_all_poison_mask(<8 x i8> %a) { +; CHECK-LABEL: @vtbl1_all_poison_mask( +; CHECK-NEXT: ret <8 x i8> poison +; + %tbl = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %a, <8 x i8> poison) + ret <8 x i8> %tbl +} diff --git a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll b/llvm/test/Transforms/InstCombine/ARM/tbl1.ll deleted file mode 100644 index fbec1a2bb7a07..0000000000000 --- a/llvm/test/Transforms/InstCombine/ARM/tbl1.ll +++ /dev/null @@ -1,35 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" -target triple = "armv8-arm-none-eabi" - -; Turning a table lookup intrinsic into a shuffle vector instruction -; can be beneficial. If the mask used for the lookup is the constant -; vector {7,6,5,4,3,2,1,0}, then the back-end generates rev64 -; instructions instead. 
-
-define <8 x i8> @tbl1_8x8(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8(
-; CHECK-NEXT:    entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VEC:%.*]], <8 x i8> poison, <8 x i32> 
-; CHECK-NEXT:    ret <8 x i8> [[TMP0]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> )
-  ret <8 x i8> %vtbl1
-}
-
-; Bail the optimization if a mask index is out of range.
-define <8 x i8> @tbl1_8x8_out_of_range(<8 x i8> %vec) {
-; CHECK-LABEL: @tbl1_8x8_out_of_range(
-; CHECK-NEXT:    entry:
-; CHECK-NEXT:    [[VTBL1:%.*]] = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> [[VEC:%.*]], <8 x i8> )
-; CHECK-NEXT:    ret <8 x i8> [[VTBL1]]
-;
-entry:
-  %vtbl1 = call <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8> %vec, <8 x i8> )
-  ret <8 x i8> %vtbl1
-}
-
-declare <8 x i8> @llvm.arm.neon.vtbl1(<8 x i8>, <8 x i8>)

From 1a66474ca0f995868367fd4c22aa1259dcc6cf96 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Tue, 9 Dec 2025 16:15:07 +0000
Subject: [PATCH 14/63] [clang][FMV][AArch64] Remove O3 from failing test
 (#171457)

This fixes the buildbot failures from
https://github.com/llvm/llvm-project/pull/150267. I could not reproduce
them locally, but my intuition suggests that the -O3 option on the RUN
line behaves inconsistently on different hosts, judging from the error
logs. My intention was to run an integration test that would use LLVM's
globalopt pass, but that is not actually necessary; we already have unit
tests in place for it.
---
 .../CodeGen/AArch64/fmv-explicit-priority.c   | 208 +++++++-----------
 1 file changed, 75 insertions(+), 133 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fmv-explicit-priority.c b/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
index 1abf330ffee49..b3c544124afa0 100644
--- a/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
+++ b/clang/test/CodeGen/AArch64/fmv-explicit-priority.c
@@ -1,204 +1,146 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -O3 -fno-inline -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -o - %s | FileCheck %s
 
 __attribute__((target_version("lse;priority=30"))) int foo(void) { return 1; }
-__attribute__((target_version("sve2;priority=20"))) int foo(void) { return 2; }
+__attribute__((target_version("aes;priority=20"))) int foo(void) { return 2; }
 __attribute__((target_version("sve;priority=10"))) int foo(void) { return 3; }
 __attribute__((target_version("default"))) int foo(void) { return 0; }
 
-__attribute__((target_clones("lse+sve2;priority=3", "lse;priority=2", "sve;priority=1", "default")))
-int fmv_caller(void) { return foo(); }
-
-
-__attribute__((target_version("aes"))) int bar(void) { return 1; }
-__attribute__((target_version("sm4;priority=5"))) int bar(void) { return 2; }
-__attribute__((target_version("default"))) int bar(void) { return 0; }
-
-__attribute__((target("aes"))) int regular_caller_aes() { return bar(); }
-__attribute__((target("sm4"))) int regular_caller_sm4() { return bar(); }
+__attribute__((target_clones("sme;priority=3", "bti;priority=2", "mops;priority=1", "default"))) int bar(void) { return 0; }
 
 //.
-// CHECK: @__aarch64_cpu_features = external dso_local local_unnamed_addr global { i64 } +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } // CHECK: @foo = weak_odr ifunc i32 (), ptr @foo.resolver -// CHECK: @fmv_caller = weak_odr ifunc i32 (), ptr @fmv_caller.resolver // CHECK: @bar = weak_odr ifunc i32 (), ptr @bar.resolver //. -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@foo._Mlse // CHECK-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) -// CHECK-LABEL: define {{[^@]+}}@foo._Msve2 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@foo._Maes // CHECK-SAME: () #[[ATTR1:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) +// CHECK: Function Attrs: noinline nounwind optnone vscale_range(1,16) // CHECK-LABEL: define {{[^@]+}}@foo._Msve // CHECK-SAME: () #[[ATTR2:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@foo.default // CHECK-SAME: () #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) -// CHECK-LABEL: define {{[^@]+}}@fmv_caller._MlseMsve2 +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@bar._Msme // CHECK-SAME: () #[[ATTR4:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo._Mlse() -// CHECK-NEXT: ret i32 [[CALL]] +// CHECK-NEXT: ret i32 0 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) -// CHECK-LABEL: define {{[^@]+}}@fmv_caller._Mlse +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@bar._Mbti // CHECK-SAME: () #[[ATTR5:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo._Mlse() -// CHECK-NEXT: ret i32 [[CALL]] +// CHECK-NEXT: ret i32 0 // // -// CHECK: Function Attrs: noinline nounwind vscale_range(1,16) -// CHECK-LABEL: define {{[^@]+}}@fmv_caller._Msve +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@bar._Mmops // CHECK-SAME: () #[[ATTR6:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo() #[[ATTR13:[0-9]+]] -// CHECK-NEXT: ret i32 [[CALL]] -// -// -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) -// CHECK-LABEL: define {{[^@]+}}@fmv_caller.default -// CHECK-SAME: () #[[ATTR7:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @foo.default() -// CHECK-NEXT: ret i32 [[CALL]] -// -// -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) -// CHECK-LABEL: define {{[^@]+}}@bar._Maes -// CHECK-SAME: () #[[ATTR8:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// -// -// CHECK: Function Attrs: 
mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) -// CHECK-LABEL: define {{[^@]+}}@bar._Msm4 -// CHECK-SAME: () #[[ATTR9:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 +// CHECK-NEXT: ret i32 0 // // -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) +// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: define {{[^@]+}}@bar.default // CHECK-SAME: () #[[ATTR3]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 // // -// CHECK: Function Attrs: noinline nounwind -// CHECK-LABEL: define {{[^@]+}}@regular_caller_aes -// CHECK-SAME: () local_unnamed_addr #[[ATTR10:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @bar() #[[ATTR13]] -// CHECK-NEXT: ret i32 [[CALL]] -// -// -// CHECK: Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) -// CHECK-LABEL: define {{[^@]+}}@regular_caller_sm4 -// CHECK-SAME: () local_unnamed_addr #[[ATTR11:[0-9]+]] { -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = tail call i32 @bar._Msm4() -// CHECK-NEXT: ret i32 [[CALL]] -// -// // CHECK: Function Attrs: disable_sanitizer_instrumentation // CHECK-LABEL: define {{[^@]+}}@foo.resolver -// CHECK-SAME: () #[[ATTR12:[0-9]+]] comdat { +// CHECK-SAME: () #[[ATTR7:[0-9]+]] comdat { // CHECK-NEXT: resolver_entry: -// CHECK-NEXT: tail call void @__init_cpu_features_resolver() +// CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 128 -// CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[TMP1]], 0 -// CHECK-NEXT: br i1 [[DOTNOT]], label [[RESOLVER_ELSE:%.*]], label [[COMMON_RET:%.*]] -// CHECK: common.ret: -// CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi ptr [ @foo._Mlse, [[RESOLVER_ENTRY:%.*]] ], [ @foo._Msve2, [[RESOLVER_ELSE]] ], [ [[FOO__MSVE_FOO_DEFAULT:%.*]], [[RESOLVER_ELSE2:%.*]] ] -// CHECK-NEXT: ret ptr [[COMMON_RET_OP]] +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 128 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @foo._Mlse // CHECK: resolver_else: -// CHECK-NEXT: [[TMP2:%.*]] = and i64 [[TMP0]], 69793284352 -// CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 69793284352 -// CHECK-NEXT: br i1 [[TMP3]], label [[COMMON_RET]], label [[RESOLVER_ELSE2]] +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 33536 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 33536 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @foo._Maes // CHECK: resolver_else2: -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP0]], 1073807616 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 1073807616 -// CHECK-NEXT: [[FOO__MSVE_FOO_DEFAULT]] = select i1 [[TMP5]], ptr @foo._Msve, ptr @foo.default -// CHECK-NEXT: br label [[COMMON_RET]] +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 1073807616 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 1073807616 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label 
[[RESOLVER_ELSE4:%.*]] +// CHECK: resolver_return3: +// CHECK-NEXT: ret ptr @foo._Msve +// CHECK: resolver_else4: +// CHECK-NEXT: ret ptr @foo.default // // // CHECK: Function Attrs: disable_sanitizer_instrumentation -// CHECK-LABEL: define {{[^@]+}}@fmv_caller.resolver -// CHECK-SAME: () #[[ATTR12]] comdat { +// CHECK-LABEL: define {{[^@]+}}@bar.resolver +// CHECK-SAME: () #[[ATTR7]] comdat { // CHECK-NEXT: resolver_entry: -// CHECK-NEXT: tail call void @__init_cpu_features_resolver() +// CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 69793284480 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 69793284480 -// CHECK-NEXT: br i1 [[TMP2]], label [[COMMON_RET:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: common.ret: -// CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi ptr [ @fmv_caller._MlseMsve2, [[RESOLVER_ENTRY:%.*]] ], [ @fmv_caller._Mlse, [[RESOLVER_ELSE]] ], [ [[FMV_CALLER__MSVE_FMV_CALLER_DEFAULT:%.*]], [[RESOLVER_ELSE2:%.*]] ] -// CHECK-NEXT: ret ptr [[COMMON_RET_OP]] +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 4398180795136 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 4398180795136 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @bar._Msme // CHECK: resolver_else: -// CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP0]], 128 -// CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[TMP3]], 0 -// CHECK-NEXT: br i1 [[DOTNOT]], label [[RESOLVER_ELSE2]], label [[COMMON_RET]] +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 1125899906842624 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 1125899906842624 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @bar._Mbti // CHECK: resolver_else2: -// CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP0]], 1073807616 -// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 1073807616 -// CHECK-NEXT: [[FMV_CALLER__MSVE_FMV_CALLER_DEFAULT]] = select i1 [[TMP5]], ptr @fmv_caller._Msve, ptr @fmv_caller.default -// CHECK-NEXT: br label [[COMMON_RET]] -// -// -// CHECK: Function Attrs: disable_sanitizer_instrumentation -// CHECK-LABEL: define {{[^@]+}}@bar.resolver -// CHECK-SAME: () #[[ATTR12]] comdat { -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: tail call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 800 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 800 -// CHECK-NEXT: [[TMP3:%.*]] = and i64 [[TMP0]], 33536 -// CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 33536 -// CHECK-NEXT: [[BAR__MAES_BAR_DEFAULT:%.*]] = select i1 [[TMP4]], ptr @bar._Maes, ptr @bar.default -// CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[TMP2]], ptr @bar._Msm4, ptr [[BAR__MAES_BAR_DEFAULT]] -// CHECK-NEXT: ret ptr [[COMMON_RET_OP]] +// CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 576460752303423488 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 576460752303423488 +// CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] +// CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], 
label [[RESOLVER_ELSE4:%.*]] +// CHECK: resolver_return3: +// CHECK-NEXT: ret ptr @bar._Mmops +// CHECK: resolver_else4: +// CHECK-NEXT: ret ptr @bar.default // //. -// CHECK: attributes #[[ATTR0]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="P1,P2,P3,P4,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" } -// CHECK: attributes #[[ATTR1]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P2,P4,sve2" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve,+sve2" } -// CHECK: attributes #[[ATTR2]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P1,P3,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } -// CHECK: attributes #[[ATTR3]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR4]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P0,P1,lse,sve2" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+lse,+sve,+sve2" } -// CHECK: attributes #[[ATTR5]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features"="P1,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" } -// CHECK: attributes #[[ATTR6]] = { noinline nounwind vscale_range(1,16) "fmv-features"="P0,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } -// CHECK: attributes #[[ATTR7]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) vscale_range(1,16) "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #[[ATTR8]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="aes" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR9]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "fmv-features"="P0,P2,sm4" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4" } -// CHECK: attributes #[[ATTR10]] = { noinline nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" } -// CHECK: attributes #[[ATTR11]] = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4" } -// CHECK: attributes #[[ATTR12]] = { disable_sanitizer_instrumentation } -// CHECK: attributes #[[ATTR13]] = { nounwind } +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "fmv-features"="P1,P2,P3,P4,lse" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse" } +// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "fmv-features"="P2,P4,aes" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone vscale_range(1,16) 
"fmv-features"="P1,P3,sve" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" } +// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "fmv-features" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "fmv-features"="P0,P1,sme" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sme" } +// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "fmv-features"="P1,bti" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" } +// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "fmv-features"="P0,mops" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops" } +// CHECK: attributes #[[ATTR7]] = { disable_sanitizer_instrumentation } //. // CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} // CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} -// CHECK: [[META2:![0-9]+]] = !{[[META3:![0-9]+]], [[META3]], i64 0} -// CHECK: [[META3]] = !{!"int", [[META4:![0-9]+]], i64 0} -// CHECK: [[META4]] = !{!"omnipotent char", [[META5:![0-9]+]], i64 0} -// CHECK: [[META5]] = !{!"Simple C/C++ TBAA"} //. From c61a481a231025a90ef12e4f9a04bedb04eeeb99 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 9 Dec 2025 16:19:13 +0000 Subject: [PATCH 15/63] [VPlan] Use SCEV to prove non-aliasing for stores at different offsets. (#170347) Extend the logic add in https://github.com/llvm/llvm-project/pull/168771 to also allow sinking stores past stores in the same noalias set by checking if we can prove no-alias via the distance between accesses, checked via SCEV. PR: https://github.com/llvm/llvm-project/pull/170347 --- .../Transforms/Vectorize/VPlanTransforms.cpp | 96 ++++++++++++++++--- ...predicated-loads-with-predicated-stores.ll | 48 ++++------ 2 files changed, 99 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index eb078c783d5f7..852196e589c59 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -44,6 +44,7 @@ using namespace llvm; using namespace VPlanPatternMatch; +using namespace SCEVPatternMatch; bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( VPlan &Plan, @@ -139,14 +140,77 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes( return true; } -// Check if a memory operation doesn't alias with memory operations in blocks -// between FirstBB and LastBB using scoped noalias metadata. -// For load hoisting, we only check writes in one direction. -// For store sinking, we check both reads and writes bidirectionally. -static bool canHoistOrSinkWithNoAliasCheck( - const MemoryLocation &MemLoc, VPBasicBlock *FirstBB, VPBasicBlock *LastBB, - bool CheckReads, - const SmallPtrSetImpl *ExcludeRecipes = nullptr) { +/// Helper for extra no-alias checks via known-safe recipe and SCEV. 
+class SinkStoreInfo { + const SmallPtrSetImpl &ExcludeRecipes; + VPReplicateRecipe &GroupLeader; + ScalarEvolution &SE; + const Loop &L; + VPTypeAnalysis &TypeInfo; + + // Return true if \p A and \p B are known to not alias for all VFs in the + // plan, checked via the distance between the accesses + bool isNoAliasViaDistance(VPReplicateRecipe *A, VPReplicateRecipe *B) const { + if (A->getOpcode() != Instruction::Store || + B->getOpcode() != Instruction::Store) + return false; + + VPValue *AddrA = A->getOperand(1); + const SCEV *SCEVA = vputils::getSCEVExprForVPValue(AddrA, SE, &L); + VPValue *AddrB = B->getOperand(1); + const SCEV *SCEVB = vputils::getSCEVExprForVPValue(AddrB, SE, &L); + if (isa(SCEVA) || isa(SCEVB)) + return false; + + const APInt *Distance; + if (!match(SE.getMinusSCEV(SCEVA, SCEVB), m_scev_APInt(Distance))) + return false; + + const DataLayout &DL = SE.getDataLayout(); + Type *TyA = TypeInfo.inferScalarType(A->getOperand(0)); + uint64_t SizeA = DL.getTypeStoreSize(TyA); + Type *TyB = TypeInfo.inferScalarType(B->getOperand(0)); + uint64_t SizeB = DL.getTypeStoreSize(TyB); + + // Use the maximum store size to ensure no overlap from either direction. + // Currently only handles fixed sizes, as it is only used for + // replicating VPReplicateRecipes. + uint64_t MaxStoreSize = std::max(SizeA, SizeB); + + auto VFs = B->getParent()->getPlan()->vectorFactors(); + ElementCount MaxVF = *max_element(VFs, ElementCount::isKnownLT); + return Distance->abs().uge( + MaxVF.multiplyCoefficientBy(MaxStoreSize).getFixedValue()); + } + +public: + SinkStoreInfo(const SmallPtrSetImpl &ExcludeRecipes, + VPReplicateRecipe &GroupLeader, ScalarEvolution &SE, + const Loop &L, VPTypeAnalysis &TypeInfo) + : ExcludeRecipes(ExcludeRecipes), GroupLeader(GroupLeader), SE(SE), L(L), + TypeInfo(TypeInfo) {} + + /// Return true if \p R should be skipped during alias checking, either + /// because it's in the exclude set or because no-alias can be proven via + /// SCEV. + bool shouldSkip(VPRecipeBase &R) const { + auto *Store = dyn_cast(&R); + return ExcludeRecipes.contains(&R) || + (Store && isNoAliasViaDistance(Store, &GroupLeader)); + } +}; + +/// Check if a memory operation doesn't alias with memory operations in blocks +/// between \p FirstBB and \p LastBB using scoped noalias metadata. If +/// \p SinkInfo is std::nullopt, only recipes that may write to memory are +/// checked (for load hoisting). Otherwise recipes that both read and write +/// memory are checked, and SCEV is used to prove no-alias between the group +/// leader and other replicate recipes (for store sinking). +static bool +canHoistOrSinkWithNoAliasCheck(const MemoryLocation &MemLoc, + VPBasicBlock *FirstBB, VPBasicBlock *LastBB, + std::optional SinkInfo = {}) { + bool CheckReads = SinkInfo.has_value(); if (!MemLoc.AATags.Scope) return false; @@ -158,7 +222,7 @@ static bool canHoistOrSinkWithNoAliasCheck( "Expected at most one successor in block chain"); auto *VPBB = cast(Block); for (VPRecipeBase &R : *VPBB) { - if (ExcludeRecipes && ExcludeRecipes->contains(&R)) + if (SinkInfo && SinkInfo->shouldSkip(R)) continue; // Skip recipes that don't need checking. @@ -4273,8 +4337,7 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, // Check that the load doesn't alias with stores between first and last. 
auto LoadLoc = vputils::getMemoryLocation(*EarliestLoad); - if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB, - /*CheckReads=*/false)) + if (!LoadLoc || !canHoistOrSinkWithNoAliasCheck(*LoadLoc, FirstBB, LastBB)) continue; // Collect common metadata from all loads in the group. @@ -4301,7 +4364,9 @@ void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE, } static bool -canSinkStoreWithNoAliasCheck(ArrayRef StoresToSink) { +canSinkStoreWithNoAliasCheck(ArrayRef StoresToSink, + ScalarEvolution &SE, const Loop &L, + VPTypeAnalysis &TypeInfo) { auto StoreLoc = vputils::getMemoryLocation(*StoresToSink.front()); if (!StoreLoc || !StoreLoc->AATags.Scope) return false; @@ -4313,8 +4378,8 @@ canSinkStoreWithNoAliasCheck(ArrayRef StoresToSink) { VPBasicBlock *FirstBB = StoresToSink.front()->getParent(); VPBasicBlock *LastBB = StoresToSink.back()->getParent(); - return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, - /*CheckReads=*/true, &StoresToSinkSet); + SinkStoreInfo SinkInfo(StoresToSinkSet, *StoresToSink[0], SE, L, TypeInfo); + return canHoistOrSinkWithNoAliasCheck(*StoreLoc, FirstBB, LastBB, SinkInfo); } void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, @@ -4325,13 +4390,14 @@ void VPlanTransforms::sinkPredicatedStores(VPlan &Plan, ScalarEvolution &SE, return; VPDominatorTree VPDT(Plan); + VPTypeAnalysis TypeInfo(Plan); for (auto &Group : Groups) { sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) { return VPDT.properlyDominates(A, B); }); - if (!canSinkStoreWithNoAliasCheck(Group)) + if (!canSinkStoreWithNoAliasCheck(Group, SE, *L, TypeInfo)) continue; // Use the last (most dominated) store's location for the unconditional diff --git a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll index cdbe9bb555834..7450fcccbb484 100644 --- a/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/hoist-predicated-loads-with-predicated-stores.ll @@ -764,7 +764,7 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] ; CHECK-NEXT: [[INDEX:%.*]] = mul i64 [[INDEX1]], 16 ; CHECK-NEXT: [[IV:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX]], 16 @@ -781,42 +781,30 @@ define void @sink_multiple_store_groups_noalias_via_scev(ptr %dst, ptr %src) { ; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP22]], align 8, !alias.scope [[META78]] ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> poison, double [[TMP13]], i32 0 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = insertelement <2 x double> [[TMP15]], double [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = xor <2 x i1> [[TMP10]], splat (i1 true) ; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[WIDE_LOAD]], splat (double 8.000000e+00) -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i1> [[TMP16]], i32 0 -; CHECK-NEXT: br i1 [[TMP24]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] -; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]] -; 
CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP34]], i32 0 -; CHECK-NEXT: store double [[TMP19]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> [[TMP31]], ptr [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = select <2 x i1> [[TMP10]], <2 x double> [[WIDE_LOAD]], <2 x double> [[TMP34]] +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <2 x double> [[TMP20]], i32 0 +; CHECK-NEXT: store double [[TMP32]], ptr [[TMP18]], align 8, !alias.scope [[META81:![0-9]+]], !noalias [[META78]] +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP20]], i32 1 +; CHECK-NEXT: store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP18]], i64 16 +; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP24]], align 8, !alias.scope [[META81]], !noalias [[META78]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x i1> [[TMP16]], i32 1 -; CHECK-NEXT: br i1 [[TMP20]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3:.*]] +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE3]] ; CHECK: [[PRED_STORE_IF2]]: -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <2 x double> [[TMP34]], i32 1 -; CHECK-NEXT: store double [[TMP33]], ptr [[TMP21]], align 8, !alias.scope [[META81]], !noalias [[META78]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP21]], i64 16 +; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP35]], align 8, !alias.scope [[META81]], !noalias [[META78]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE3]] ; CHECK: [[PRED_STORE_CONTINUE3]]: -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 -; CHECK-NEXT: br i1 [[TMP23]], label %[[PRED_STORE_IF4:.*]], label %[[PRED_STORE_CONTINUE5:.*]] -; CHECK: [[PRED_STORE_IF4]]: -; CHECK-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store double [[TMP13]], ptr [[TMP31]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: [[TMP37:%.*]] = getelementptr i8, ptr [[TMP31]], i64 16 -; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP37]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE5]] -; CHECK: [[PRED_STORE_CONTINUE5]]: -; CHECK-NEXT: [[TMP25:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 -; CHECK-NEXT: br i1 [[TMP25]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]] -; CHECK: [[PRED_STORE_IF6]]: -; CHECK-NEXT: [[TMP32:%.*]] = getelementptr double, ptr [[DST]], i64 [[TMP17]] -; CHECK-NEXT: store double [[TMP14]], ptr [[TMP32]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[TMP32]], i64 16 -; CHECK-NEXT: store double 1.000000e+01, ptr [[TMP47]], align 8, !alias.scope [[META81]], !noalias [[META78]] -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE7]] -; 
CHECK: [[PRED_STORE_CONTINUE7]]:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], 2
 ; CHECK-NEXT:    [[TMP52:%.*]] = icmp eq i64 [[INDEX_NEXT]], 100
 ; CHECK-NEXT:    br i1 [[TMP52]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP83:![0-9]+]]

From 76cffd310ac45c43c95c879d1bfdc1580a3d591e Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Tue, 9 Dec 2025 16:33:29 +0000
Subject: [PATCH 16/63] [CI] Tweak wording for builds with passing tests and
 build errors (#171436)

"All tests passed" is too easily interpreted as meaning every possible
test was run and was fine. A lot of the time it means that all the tests
that didn't fail to build ran and were fine.

Maybe the wording is still too subtle, but at least it hints at the idea
that the tests run might be fewer than if the build had no compilation
errors.
---
 .ci/generate_test_report_lib.py      | 4 ++--
 .ci/generate_test_report_lib_test.py | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.ci/generate_test_report_lib.py b/.ci/generate_test_report_lib.py
index 9a4fc6030d5ec..5edde254eb73d 100644
--- a/.ci/generate_test_report_lib.py
+++ b/.ci/generate_test_report_lib.py
@@ -267,7 +267,7 @@ def plural(num_tests):
         report.extend(
             [
                 "",
-                "All tests passed but another part of the build **failed**. "
+                "All executed tests passed, but another part of the build **failed**. "
                 "Information about the build failure could not be automatically "
                 "obtained.",
                 "",
@@ -278,7 +278,7 @@ def plural(num_tests):
         report.extend(
             [
                 "",
-                "All tests passed but another part of the build **failed**. Click on "
+                "All executed tests passed, but another part of the build **failed**. Click on "
                 "a failure below to see the details.",
                 "",
             ]
diff --git a/.ci/generate_test_report_lib_test.py b/.ci/generate_test_report_lib_test.py
index b9e992e0f798b..06279d672f3c3 100644
--- a/.ci/generate_test_report_lib_test.py
+++ b/.ci/generate_test_report_lib_test.py
@@ -343,7 +343,7 @@ def test_no_failures_build_failed(self):
 
               * 1 test passed
 
-              All tests passed but another part of the build **failed**. Information about the build failure could not be automatically obtained.
+              All executed tests passed, but another part of the build **failed**. Information about the build failure could not be automatically obtained.
 
               Download the build's log file to see the details.
 
@@ -390,7 +390,7 @@ def test_no_failures_build_failed_ninja_log(self):
 
               * 1 test passed
 
-              All tests passed but another part of the build **failed**. Click on a failure below to see the details.
+              All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
test/4.stamp @@ -476,7 +476,7 @@ def test_no_failures_multiple_build_failed_ninja_log(self): * 1 test passed - All tests passed but another part of the build **failed**. Click on a failure below to see the details. + All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
touch test/2.stamp @@ -978,7 +978,7 @@ def test_generate_report_end_to_end(self): * 1 test passed - All tests passed but another part of the build **failed**. Click on a failure below to see the details. + All executed tests passed, but another part of the build **failed**. Click on a failure below to see the details.
test/4.stamp From a03318360649c8b6f30b28e1ee5a76af1f559c51 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 9 Dec 2025 08:37:48 -0800 Subject: [PATCH 17/63] [compiler-rt] Try bumping soft_rss_limit again (#171469) This is still failing on some of the bots. Try bumping the limit again to see if this fixes things. --- .../TestCases/Linux/soft_rss_limit_mb_test.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp index 958fe40d5db64..0c7257849dc31 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/soft_rss_limit_mb_test.cpp @@ -2,12 +2,12 @@ // RUN: %clangxx -O2 %s -o %t // // Run with limit should fail: -// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=1 %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1 -// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=1 %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_1 +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" // This run uses getrusage. We can only test getrusage when allocator_may_return_null=0 // because getrusage gives us max-rss, not current-rss. -// RUN: %env_tool_opts=soft_rss_limit_mb=250:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" +// RUN: %env_tool_opts=soft_rss_limit_mb=384:quarantine_size=1:allocator_may_return_null=0:can_use_proc_maps_statm=0 not %run %t 2>&1 | FileCheck %s -check-prefix=CHECK_MAY_RETURN_0 --implicit-check-not="returned null" // REQUIRES: stable-runtime // Ubsan does not intercept pthread_create. From 94ebcfd16dac67486bae624f74e1c5c789448bae Mon Sep 17 00:00:00 2001 From: Men-cotton Date: Wed, 10 Dec 2025 01:38:02 +0900 Subject: [PATCH 18/63] [mlir][vector] Fix crash in ReorderCastOpsOnBroadcast with non-vector result (#170985) Fixes a crash in `ReorderCastOpsOnBroadcast` by ensuring the cast result is a `VectorType` before applying the pattern. A regression test has been added to mlir/test/Dialect/Vector/vector-sink.mlir. 
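
For illustration, a minimal input of the crashing shape (mirroring the
regression test added below; the types come from the reproducer in the
issue) looks like this:

```mlir
// The broadcast result feeds a cast whose result type is an !llvm.array,
// not a vector. Before the fix, ReorderCastOpsOnBroadcast tried to reorder
// this cast above the broadcast and crashed; with the added type check the
// pattern simply bails out on such inputs.
func.func @repro(%arg0: i64) -> !llvm.array<26 x vector<7xi64>> {
  %0 = vector.broadcast %arg0 : i64 to vector<26x7xi64>
  %1 = builtin.unrealized_conversion_cast %0 : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>>
  return %1 : !llvm.array<26 x vector<7xi64>>
}
```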
Fixes: #126371
---
 .../Vector/Transforms/VectorTransforms.cpp    |  2 ++
 mlir/test/Dialect/Vector/vector-sink.mlir     | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index 726da1e9a3d14..ad16b80a732b3 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -453,6 +453,8 @@ struct ReorderCastOpsOnBroadcast
                                 PatternRewriter &rewriter) const override {
     if (op->getNumOperands() != 1)
       return failure();
+    if (!isa<VectorType>(op->getResult(0).getType()))
+      return failure();
     auto bcastOp = op->getOperand(0).getDefiningOp<vector::BroadcastOp>();
     if (!bcastOp)
       return failure();
diff --git a/mlir/test/Dialect/Vector/vector-sink.mlir b/mlir/test/Dialect/Vector/vector-sink.mlir
index beaba52af1841..69fba88a14048 100644
--- a/mlir/test/Dialect/Vector/vector-sink.mlir
+++ b/mlir/test/Dialect/Vector/vector-sink.mlir
@@ -382,6 +382,21 @@ func.func @broadcast_scalar_extsi_scalable(%a : i8) -> vector<2x[4]xi32> {
   return %r : vector<2x[4]xi32>
 }
 
+// -----
+
+// CHECK-LABEL: func.func @negative_broadcast_cast_non_vector_result
+// CHECK-SAME: (%[[ARG:.*]]: i64)
+// CHECK: %[[BCAST:.*]] = vector.broadcast %[[ARG]] : i64 to vector<26x7xi64>
+// CHECK: %[[CAST:.*]] = builtin.unrealized_conversion_cast %[[BCAST]] : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>>
+// CHECK: return %[[CAST]] : !llvm.array<26 x vector<7xi64>>
+/// This test ensures that the `ReorderCastOpsOnBroadcast` pattern does not
+/// attempt to reorder a cast operation that produces a non-vector result type.
+func.func @negative_broadcast_cast_non_vector_result(%arg0: i64) -> !llvm.array<26 x vector<7xi64>> {
+  %0 = vector.broadcast %arg0 : i64 to vector<26x7xi64>
+  %1 = builtin.unrealized_conversion_cast %0 : vector<26x7xi64> to !llvm.array<26 x vector<7xi64>>
+  return %1 : !llvm.array<26 x vector<7xi64>>
+}
+
 //===----------------------------------------------------------------------===//
 // [Pattern: ReorderElementwiseOpsOnTranspose]
 //===----------------------------------------------------------------------===//

From 5236af88e5ed0a3449b2292ef02be28b8722b172 Mon Sep 17 00:00:00 2001
From: Jianhui Li
Date: Tue, 9 Dec 2025 08:48:27 -0800
Subject: [PATCH 19/63] [MLIR][XeGPU] Extend propagation and sg_to_lane
 distribution pass to support broadcast with low-rank and scalar source input
 (#170409)

This PR extends XeGPU layout propagation and distribution for the
vector.broadcast operation. It relaxes the restriction of layout
propagation to allow low-rank and scalar source inputs, and adds a
pattern in sg-to-wi distribution to support the lowering.
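
As a sketch of the newly supported low-rank case (adapted from the
lowering tests added below; the concrete layout values are illustrative,
taken from the tests rather than fixed by the pass API):

```mlir
// A 1D reduction result is broadcast back to 2D. Propagation now assigns
// the 1D source a slice of the 2D result layout along the broadcasted
// dim 0, e.g. #xegpu.slice<LAYOUT, dims = [0]> where LAYOUT is the
// result's #xegpu.layout, instead of bailing out on the rank mismatch.
func.func @bcast_1d_to_2d(%v: vector<16x16xf16>, %acc: vector<16xf16>) -> vector<16x16xf16> {
  %red = vector.multi_reduction <add>, %v, %acc [0] : vector<16x16xf16> to vector<16xf16>
  %bc = vector.broadcast %red : vector<16xf16> to vector<16x16xf16>
  return %bc : vector<16x16xf16>
}
```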
---
 .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td       |  34 +++-
 .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td |   2 +-
 mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp    | 143 +++++++++++++++
 .../XeGPU/Transforms/XeGPUPropagateLayout.cpp |  42 +++--
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 163 +++++++++++++++++-
 mlir/test/Dialect/XeGPU/propagate-layout.mlir |  58 +++++++
 .../XeGPU/subgroup-distribute-unit.mlir       |  65 +++++++
 .../Dialect/XeGPU/subgroup-distribute.mlir    |  61 +++++++
 8 files changed, 550 insertions(+), 18 deletions(-)

diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
index 93c5187b00756..eae0bd4e68a84 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -223,6 +223,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
     InterfaceMethod<"Derive a new layout by dropping InstData",
                     "xegpu::DistributeLayoutAttr",
                     "dropInstData">,
+    InterfaceMethod<"Derive a new layout with sg_data, inst_data and lane_data set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimData",
+                    /*args=*/(ins "const llvm::SetVector": $unitDims)>,
+    InterfaceMethod<"Derive a new layout with sg_lane and lane_layout set to 1 for the specified unit dims",
+                    "xegpu::DistributeLayoutAttr",
+                    "setUnitDimLayout",
+                    /*args=*/(ins "const llvm::SetVector": $unitDims)>,
     InterfaceMethod<[{Delinearizes a linear ID into its multidimensional
                       indices based on the effective layout level.}],
                     "FailureOr>",
@@ -283,9 +291,14 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
       }
       return true;
     }]>,
-    InterfaceMethod,
+
+    InterfaceMethod
   ];
 }
@@ -487,6 +500,12 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
       return {};
     }
 
+    //set the layout for the specified unit dims: sg_data, inst_data and lane_data to 1
+    DistributeLayoutAttr setUnitDimData(SetVector unitDims);
+
+    //set the layout for the specified unit dims: sg_lane and lane_layout to 1
+    DistributeLayoutAttr setUnitDimLayout(SetVector unitDims);
+
     /// Delinearizes a linear ID into its multidimensional indices
     /// based on the effective level of the layout.
     FailureOr>
@@ -501,6 +520,9 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
+
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
   }];
 
@@ -649,6 +671,12 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
       return SliceAttr::get(getContext(), parent, attr.getDims());
     }
 
+    //set the layout for the specified unit dims: sg_data, inst_data and lane_data to 1
+    DistributeLayoutAttr setUnitDimData(SetVector unitDims);
+
+    //set the layout for the specified unit dims: sg_lane and lane_layout to 1
+    DistributeLayoutAttr setUnitDimLayout(SetVector unitDims);
+
     /// flatten a nested SliceAttr, e.g., for 2-level nested SliceAttr
     /// #xegpu.slice<#xegpu.slice<#xegpu.layout, dims = [0]>, dims = [0]>
     /// it will coalese two slice operations and return a simplified SliceAttr
@@ -670,7 +698,9 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
 
     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
-
+
+    /// Check if this is identical to some other layout.
+    bool isEqualTo(const xegpu::DistributeLayoutAttr &other);
   }];
 
   let assemblyFormat = "`<` qualified($parent) `,` `dims` `=` $dims `>`";
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index 4fe1087d18879..b54d620c3c0c3 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -405,7 +405,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
                        OptionalAttr: $transpose,
                        OptionalAttr: $l1_hint,
                        OptionalAttr: $l2_hint,
-                       OptionalAttr: $l3_hint,
+                       OptionalAttr: $l3_hint,
                        OptionalAttr:$layout);
 
   let results = (outs XeGPU_ValueType: $value);
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 7ab2e612ed890..1a19ab5fd970b 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -390,6 +390,86 @@ LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
   return genCoordinates(builder, loc, ids, layout, subShape, shape);
 }
 
+bool LayoutAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast(other))
+    return false;
+
+  return *this == dyn_cast(other);
+}
+
+// set the layout for unit dims: sg_data, inst_data and lane_data to 1
+DistributeLayoutAttr LayoutAttr::setUnitDimData(SetVector unitDims) {
+  auto sgDataOpt = getSgData();
+  auto instDataOpt = getInstData();
+  auto laneDataOpt = getLaneData();
+
+  SmallVector sgData;
+  SmallVector instData;
+  SmallVector laneData;
+
+  if (sgDataOpt) {
+    sgData = llvm::to_vector(sgDataOpt.asArrayRef());
+  }
+  if (instDataOpt) {
+    instData = llvm::to_vector(instDataOpt.asArrayRef());
+  }
+  if (laneDataOpt) {
+    laneData = llvm::to_vector(laneDataOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast(sgData.size()))
+      sgData[dim] = 1;
+    if (dim < static_cast(instData.size()))
+      instData[dim] = 1;
+    if (dim < static_cast(laneData.size()))
+      laneData[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(), getSgLayout(),
+      sgData.empty() ? DenseI32ArrayAttr()
+                     : DenseI32ArrayAttr::get(getContext(), sgData),
+      instData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), instData),
+      getLaneLayout(),
+      laneData.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), laneData),
+      getOrder());
+}
+
+// set the layout for the specified unit dims: sg_lane and lane_layout to 1
+DistributeLayoutAttr LayoutAttr::setUnitDimLayout(SetVector unitDims) {
+  auto sgLayoutOpt = getSgLayout();
+  auto laneLayoutOpt = getLaneLayout();
+
+  SmallVector sgLayout;
+  SmallVector laneLayout;
+
+  if (sgLayoutOpt) {
+    sgLayout = llvm::to_vector(sgLayoutOpt.asArrayRef());
+  }
+  if (laneLayoutOpt) {
+    laneLayout = llvm::to_vector(laneLayoutOpt.asArrayRef());
+  }
+
+  for (auto dim : unitDims) {
+    if (dim < static_cast(sgLayout.size()))
+      sgLayout[dim] = 1;
+    if (dim < static_cast(laneLayout.size()))
+      laneLayout[dim] = 1;
+  }
+
+  return LayoutAttr::get(
+      getContext(),
+      sgLayout.empty() ? DenseI32ArrayAttr()
+                       : DenseI32ArrayAttr::get(getContext(), sgLayout),
+      getSgData(), getInstData(),
+      laneLayout.empty() ? DenseI32ArrayAttr()
+                         : DenseI32ArrayAttr::get(getContext(), laneLayout),
+      getLaneData(), getOrder());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_SliceAttr
 //===----------------------------------------------------------------------===//
@@ -510,6 +590,69 @@ bool SliceAttr::isSliceOf(const xegpu::DistributeLayoutAttr &other) {
       [&](int64_t dim) { return thisDims.contains(dim); });
 }
 
+bool SliceAttr::isEqualTo(const xegpu::DistributeLayoutAttr &other) {
+  if (dyn_cast(other))
+    return false;
+
+  auto flattenedThis = flatten();
+  auto flattenedOther = dyn_cast(other).flatten();
+
+  return ((flattenedThis.getParent() == flattenedOther.getParent()) &&
+          (flattenedThis.getDims() == flattenedOther.getDims()));
+}
+
+// Helper function to adjust unit dimensions from sliced space to parent space
+static SetVector
+adjustUnitDimsWithSliceDims(const SetVector &unitDims,
+                            ArrayRef sliceDims) {
+  // Reconstruct parent's non-sliced dimensions
+
+  int64_t parentRank = sliceDims.size() + unitDims.size();
+  llvm::SmallDenseSet slicedDimsSet(sliceDims.begin(),
+                                    sliceDims.end());
+  SmallVector nonSlicedDims;
+  for (int64_t i = 0; i < parentRank; ++i) {
+    if (!slicedDimsSet.contains(i))
+      nonSlicedDims.push_back(i);
+  }
+
+  // Map unit dims from sliced space to parent space
+  SetVector adjustUnitDims;
+  for (auto dim : unitDims) {
+    if (dim < static_cast(nonSlicedDims.size())) {
+      adjustUnitDims.insert(nonSlicedDims[dim]);
+    }
+  }
+
+  return adjustUnitDims;
+}
+
+// set the layout for unit dims: sg_data, inst_data and lane_data to 1
+DistributeLayoutAttr SliceAttr::setUnitDimData(SetVector unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast(attr.getParent());
+
+  SetVector adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimData(adjustUnitDims),
+                        attr.getDims());
+}
+
+// set the layout for the specified unit dims: sg_lane and lane_layout to 1
+DistributeLayoutAttr SliceAttr::setUnitDimLayout(SetVector unitDims) {
+  SliceAttr attr = flatten();
+  ArrayRef sliceDims = attr.getDims().asArrayRef();
+  auto parent = dyn_cast(attr.getParent());
+
+  SetVector adjustUnitDims =
+      adjustUnitDimsWithSliceDims(unitDims, sliceDims);
+
+  return SliceAttr::get(getContext(), parent.setUnitDimLayout(adjustUnitDims),
+                        attr.getDims());
+}
+
 //===----------------------------------------------------------------------===//
 // XeGPU_RangeAttr
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
index 59a1ad9dbe189..dc9eb96c169b4 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUPropagateLayout.cpp
@@ -580,23 +580,39 @@ void LayoutInfoPropagation::visitVectorBroadCastOp(
   // Only consider vector to vector broadcasts for now.
   VectorType resultTy = broadcast.getResultVectorType();
   VectorType sourceTy = dyn_cast(broadcast.getSourceType());
-  if (!sourceTy) {
-    broadcast.emitWarning("Expecting source type to be a vector type.");
+  // Skip layout propagation for a non-vector source operand.
+  if (!sourceTy)
     return;
-  }
 
-  // Only consider nD -> nD broadcast.
+  // Handling broadcast from low-rank to high-rank (e.g., 1D to 2D) case.
if (sourceTy.getRank() != resultTy.getRank()) { - broadcast.emitWarning("Expecting source and result to have same rank."); + auto sourceDims = sourceTy.getShape(); + auto resultDims = resultTy.getShape(); + SmallVector bcastDims; + auto dimDiff = resultTy.getRank() - sourceTy.getRank(); + // adding the missing leading dims + for (int i = 0; i < dimDiff; i++) + bcastDims.push_back(i); + + // for the rest dims in the resultTy, if sourceTy dim is 1, then it's + // broadcasted dim + for (size_t i = 0; i < sourceDims.size(); i++) + if ((sourceDims[i] == 1) && (resultDims[i + dimDiff] != 1)) + bcastDims.push_back(i + dimDiff); + + // create a slice layout for the source + xegpu::SliceAttr sliceLayout = xegpu::SliceAttr::get( + broadcast->getContext(), + cast(resultLayout.get()), + DenseI64ArrayAttr::get(broadcast->getContext(), bcastDims)); + + propagateIfChanged(operands[0], operands[0]->meet(LayoutInfo(sliceLayout))); return; } + SetVector broadcastUnitDims = broadcast.computeBroadcastedUnitDims(); - if (broadcastUnitDims.size() != 1) { - broadcast.emitWarning("Expecting source type to be nD vector only with " - "one broadcasted dimension."); - return; - } - // Propagate the result layout to the source operand. + resultLayout = cast(resultLayout.get()) + .setUnitDimData(broadcastUnitDims); propagateIfChanged(operands[0], operands[0]->meet(resultLayout)); } @@ -917,7 +933,7 @@ void LayoutInfoPropagation::visitLoadGatherOp( } else { // The layout is strictly determined by the payload type. - auto payloadTy = dyn_cast(load.getValueType()); + VectorType payloadTy = load.getValueType(); if (!payloadTy) { load.emitWarning("Not propagating, non-vector payload supplied."); return; @@ -987,7 +1003,7 @@ void LayoutInfoPropagation::visitStoreScatterOp( // Currently, for 2D StoreScatterOp we expect that the height dimension of // the tensor descriptor is equal to the subgroup size. This is ensured by // the op verifier. - auto payloadTy = dyn_cast(storeScatter.getValueType()); + VectorType payloadTy = storeScatter.getValueType(); if (!payloadTy) { storeScatter.emitWarning("Not propagating, non-vector payload supplied."); return; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp index 0d1c5eeeff711..ca81c3cd7be42 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp @@ -99,7 +99,6 @@ getDistVecTypeBasedOnLaneLayout(xegpu::DistributeLayoutAttr layout, for (auto [i, dim] : llvm::enumerate(originalType.getShape())) { if (i < distributionStart) continue; - // Check if the dimension can be distributed evenly. if (dim % effectiveLaneLayout[i - distributionStart] != 0) return failure(); @@ -1424,6 +1423,166 @@ struct VectorMultiReductionDistribution : public gpu::WarpDistributionPattern { } }; +/// This pattern distributes the `vector.broadcast` operation across lanes in a +/// warp. The pattern supports three use cases: +/// +/// 1) Broadcast a low-rank vector to high-rank vector: The low-rank input +/// vector +/// must have a slice layout of the result. If the distributed source and +/// target vector types are identical, this lowers to a no-op; otherwise, it +/// remains a broadcast but operates on distributed vectors. +/// +/// 2) Broadcast a same-rank vector with identical layouts for source and +/// target: +/// The source vector must have unit dimensions, and lane_data must be unit +/// size for those unit dims. 
This always lowers to a no-op. +/// +/// 3) Broadcast a scalar with no layout: This always lowers to a broadcast from +/// scalar to distributed result type. +/// +/// Example 1 (lowering to a broadcast with distributed types): +/// ``` +/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { +/// %0 = "some_def"() {layout_result_0 = +/// #xegpu.slice<#xegpu.layout, +/// dims = [0]> } : () -> (vector<32xf32>) +/// %2 = vector.broadcast %0 {layout_result_0 = +/// #xegpu.layout} +/// : vector<32xf32> to vector<8x32xf32> +/// gpu.yield %1 : vector<8x32xf32> +/// } +/// ``` +/// is lowered to: +/// ``` +/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) { +/// %0 = "some_def"() {layout_result_0 = +/// #xegpu.slice<#xegpu.layout, +/// dims = [0]> } : () -> (vector<32xf32>) +/// gpu.yield %0 : vector<32xf32> +/// } +/// %2 = vector.broadcast %r#0 : vector<1xf32> to vector<8x1xf32> +/// +/// Example 2 (no-op): +/// ``` +/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x32xf32>) { +/// %0 = "some_def"() {layout_result_0 = +/// #xegpu.slice<#xegpu.layout, +/// dims = [1]> } : () -> (vector<8xf32>) +/// %1 = vector.shape_cast %0 +/// {layout_result_0 = #xegpu.layout}: vector<8xf32> to vector<8x1xf32> +/// %2 = vector.broadcast %1 +/// {layout_result_0 = #xegpu.layout}: vector<8x1xf32> to vector<8x32xf32> +/// gpu.yield %1 : vector<8x32xf32> +/// } +/// ``` +/// is lowered to: +/// ``` +/// %r:1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<8x1xf32>) { +/// %0 = "some_def"() {layout_result_0 = +/// #xegpu.slice<#xegpu.layout, +/// dims = [1]> } : () -> (vector<8xf32>) +/// %1 = vector.shape_cast %0 +/// {layout_result_0 = #xegpu.layout}: vector<8xf32> to vector<8x1xf32> +/// gpu.yield %1 : vector<8x1xf32> +/// } +/// // The broadcast is implicit through layout transformation (no-op) +/// "some_use"(%r#0) +/// ``` +struct VectorBroadcastDistribution : public gpu::WarpDistributionPattern { + using gpu::WarpDistributionPattern::WarpDistributionPattern; + LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp, + PatternRewriter &rewriter) const override { + OpOperand *yieldOperand = + getWarpResult(warpOp, llvm::IsaPred); + if (!yieldOperand) + return failure(); + auto broadcastOp = + cast(yieldOperand->get().getDefiningOp()); + unsigned operandIdx = yieldOperand->getOperandNumber(); + + VectorType sourceType = dyn_cast(broadcastOp.getSourceType()); + VectorType destType = + dyn_cast(broadcastOp.getResult().getType()); + + xegpu::DistributeLayoutAttr sourceLayout = + xegpu::getDistributeLayoutAttr(broadcastOp->getOpOperand(0)); + xegpu::DistributeLayoutAttr resultLayout = + xegpu::getDistributeLayoutAttr(broadcastOp.getResult()); + + FailureOr sourceDistType; + Type sourceElemOrDistType; + if (sourceType) { + + // Case 1 and 2: source is a vector type. + int64_t rankDiff = destType.getRank() - sourceType.getRank(); + if (rankDiff > 0) { + // Case 1: source is lower-rank than result. 
+ bool isSliceOf = sourceLayout.isSliceOf(resultLayout); + if (!isSliceOf) + return rewriter.notifyMatchFailure( + warpOp, + "Broadcast input layout must be a slice of result layout."); + } + // case 2: source and result have same rank + if (rankDiff == 0) { + SetVector broadcastUnitDims = + broadcastOp.computeBroadcastedUnitDims(); + resultLayout = resultLayout.setUnitDimData(broadcastUnitDims); + bool isEqualTo = sourceLayout.isEqualTo(resultLayout); + if (!isEqualTo) + return rewriter.notifyMatchFailure( + warpOp, "For same-rank broadcast, source must be identical to " + "adjusted result layouts with unit dims."); + sourceLayout = sourceLayout.setUnitDimLayout(broadcastUnitDims); + } + + sourceDistType = + getDistVecTypeBasedOnLaneLayout(sourceLayout, sourceType); + if (failed(sourceDistType)) { + return rewriter.notifyMatchFailure( + warpOp, "Failed to distribute the source vector type."); + } + sourceElemOrDistType = sourceDistType.value(); + + } else { + // Case 3: source is a scalar type. + if (sourceLayout) { + return rewriter.notifyMatchFailure( + warpOp, "Broadcast from scalar must not have a layout attribute."); + } + sourceElemOrDistType = broadcastOp.getSourceType(); + } + FailureOr destDistType = + getDistVecTypeBasedOnLaneLayout(resultLayout, destType); + if (failed(destDistType)) { + return rewriter.notifyMatchFailure( + warpOp, "Failed to distribute the dest vector type."); + } + + SmallVector newRetIndices; + auto newWarpOp = moveRegionToNewWarpOpAndAppendReturns( + rewriter, warpOp, {broadcastOp.getSource()}, sourceElemOrDistType, + newRetIndices); + + Value distributedSource = newWarpOp.getResult(newRetIndices[0]); + + Value newBroadcast = distributedSource; + + if (sourceElemOrDistType != destDistType.value()) { + rewriter.setInsertionPointAfter(newWarpOp); + newBroadcast = + vector::BroadcastOp::create(rewriter, newWarpOp.getLoc(), + destDistType.value(), distributedSource); + } + + rewriter.replaceAllUsesWith(newWarpOp.getResult(operandIdx), newBroadcast); + return success(); + } +}; + /// Distribute a `vector.shape_cast` op feeding into yield op of an enclosing /// `gpu.warp_execute_on_lane_0` region. struct VectorShapeCastDistribution : public gpu::WarpDistributionPattern { @@ -1865,7 +2024,7 @@ void xegpu::populateXeGPUSubgroupDistributePatterns( // patterns. Therefore, assign higher benefit. 
patterns .add( + VectorInsertStridedSliceDistribution, VectorBroadcastDistribution>( patterns.getContext(), /*pattern benefit=*/highPatternBenefit); } diff --git a/mlir/test/Dialect/XeGPU/propagate-layout.mlir b/mlir/test/Dialect/XeGPU/propagate-layout.mlir index f8b59b87a122b..48e77d867508b 100644 --- a/mlir/test/Dialect/XeGPU/propagate-layout.mlir +++ b/mlir/test/Dialect/XeGPU/propagate-layout.mlir @@ -640,3 +640,61 @@ func.func @vector_shape_cast_1d_to_2d_dim0_broadcasted(%arg0: !xegpu.tensor_desc return } } +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_1d_to_2d_broadcast_along_row( +// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>, +// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]+]]: !xegpu.tensor_desc<16x16xf16, #xegpu.layout>) { +// CHECK: %[[LOAD:.*]] = xegpu.load_nd %[[ARG0]] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} +// CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> +// CHECK-NEXT: %[[REDUCE:.*]] = vector.multi_reduction , %[[LOAD]], %{{[0-9a-zA-Z]+}} +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[REDUCE]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16> +func.func @vector_broadcast_1d_to_2d_broadcast_along_row(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [0] : vector<16x16xf16> to vector<16xf16> + %5 = vector.broadcast %4 : vector<16xf16> to vector<16x16xf16> + xegpu.store_nd %5, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_2d_to_2d_along_column( +// CHECK: %[[REDUCE:.*]] = vector.multi_reduction +// CHECK-SAME: {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>} [1] : vector<16x16xf16> to vector<16xf16> +// CHECK-NEXT: %[[SHAPECAST:.*]] = vector.shape_cast %[[REDUCE]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> +// CHECK-NEXT: vector.broadcast %[[SHAPECAST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> + +func.func @vector_broadcast_2d_to_2d_along_column(%arg0: !xegpu.tensor_desc<16x16xf16>, %arg1: !xegpu.tensor_desc<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant dense<0.0000> : vector<16xf16> + %3 = xegpu.load_nd %arg0 : !xegpu.tensor_desc<16x16xf16> -> vector<16x16xf16> + %4 = vector.multi_reduction , %3, %cst [1] : vector<16x16xf16> to vector<16xf16> + %5 = vector.shape_cast %4 : vector<16xf16> to vector<16x1xf16> + %6 = vector.broadcast %5 : vector<16x1xf16> to vector<16x16xf16> + xegpu.store_nd %6, %arg1 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} +} + +// ----- +gpu.module @test { +// CHECK-LABEL: func.func @vector_broadcast_scalar_to_vector( +// CHECK: %[[CST:.*]] = arith.constant 0.{{.*}} : f16 +// CHECK-NEXT: %[[BROADCAST:.*]] = vector.broadcast %[[CST]] +// CHECK-SAME: {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16> + +func.func @vector_broadcast_scalar_to_vector(%arg0: !xegpu.tensor_desc<16x16xf16>) { + %cst = arith.constant 0.0000 : f16 + %6 = vector.broadcast %cst : f16 to vector<16x16xf16> + xegpu.store_nd %6, 
%arg0 : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16> + return +} +} \ No newline at end of file diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir index 44ec21359593f..216f3d19cff94 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir +++ b/mlir/test/Dialect/XeGPU/subgroup-distribute-unit.mlir @@ -920,4 +920,69 @@ gpu.func @vector_insert_strided_slice_unsupported_offset(%laneid: index) { gpu.return } +// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane +// CHECK-SAME: (%[[ARG0:.*]]: index) { +// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<1xf16>) +// CHECK: %[[DEF:.*]] = "some_def"() +// CHECK: %[[BCAST_INNER:.*]] = vector.broadcast %[[DEF]] +// CHECK: gpu.yield %[[BCAST_INNER]], %[[DEF]] +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[R]]#1 : vector<1xf16> to vector<16x1xf16> +// CHECK: "some_use"(%[[BCAST]]) +gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%laneid: index) { + + %r = gpu.warp_execute_on_lane_0(%laneid)[16] -> (vector<16x1xf16>) { + + %1 = "some_def"() : () -> vector<16xf16> + + %2 = vector.broadcast %1 { + layout_operand_0 = #xegpu.slice<#xegpu.layout, dims = [0]>, + layout_result_0 = #xegpu.layout + } : vector<16xf16> to vector<16x16xf16> + + gpu.yield %2 : vector<16x16xf16> + } + "some_use"(%r) : (vector<16x1xf16>) -> () + gpu.return +} + +// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case +// CHECK-SAME: (%[[ARG0:.*]]: index) +// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, vector<16x1xf16>) +// CHECK: %[[DEF:.*]] = "some_def"() : () -> vector<16x1xf16> +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] +// CHECK-SAME: : vector<16x1xf16> to vector<16x16xf16> +// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, vector<16x1xf16> +// CHECK: "some_use"(%[[R]]#1) : (vector<16x1xf16>) -> () +gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: index) { + %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) { + %1 = "some_def"() : () -> vector<16x1xf16> + %2 = vector.broadcast %1 { + layout_operand_0 = #xegpu.layout, + layout_result_0 = #xegpu.layout + } : vector<16x1xf16> to vector<16x16xf16> + gpu.yield %2: vector<16x16xf16> + } + "some_use"(%0) : (vector<16x1xf16>) -> () + gpu.return +} + +// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector +// CHECK-SAME: (%[[ARG0:.*]]: index) +// CHECK: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%[[ARG0]])[16] -> (vector<16x1xf16>, f16) +// CHECK: %[[DEF:.*]] = "some_def"() +// CHECK: %[[BCAST:.*]] = vector.broadcast %[[DEF]] {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16> +// CHECK: gpu.yield %[[BCAST]], %[[DEF]] : vector<16x16xf16>, f16 +// CHECK: %[[RESULT:.*]] = vector.broadcast %[[R]]#1 : f16 to vector<16x1xf16> +// CHECK: "some_use"(%[[RESULT]]) +gpu.func +@vector_shape_cast_scalar_to_vector(%arg0: index) { + %0 = gpu.warp_execute_on_lane_0(%arg0)[16] -> (vector<16x1xf16>) { + %1 = "some_def"() : () -> f16 + %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16> + gpu.yield %2 : vector<16x16xf16> + } + "some_use"(%0) : (vector<16x1xf16>) -> () + gpu.return +} + } diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir index 22177f8f6a15f..e5e3d2a1c1ad5 100644 --- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir +++ 
b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir @@ -330,3 +330,64 @@ gpu.module @xevm_module{ gpu.return } } + +// ----- +// CHECK-LABEL: gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane({{.*}}) { +gpu.module @xevm_module{ + gpu.func @vector_broadcast_1d_to_2d_broadcast_within_lane(%arg0: memref<16x16xf16>, %arg1: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} dense<0.000000e+00> : vector<16xf16> + %tdesc0 = xegpu.create_nd_tdesc %arg0 : memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + %0 = xegpu.load_nd %tdesc0[%c0, %c0] <{layout = #xegpu.layout}> {layout_result_0 = #xegpu.layout} : !xegpu.tensor_desc<16x16xf16, #xegpu.layout> -> vector<16x16xf16> + %1 = vector.multi_reduction , %0, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [0]>} [0] : vector<16x16xf16> to vector<16xf16> + // CHECK: %[[BCAST:.*]] = vector.broadcast %{{.*}} : f16 to vector<16xf16> + %2 = vector.broadcast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x16xf16> + xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case({{.*}}) { +gpu.module @xevm_module{ + gpu.func @vector_broadcast_2d_to_2d_across_lane_lower_to_noop_case(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %mask = vector.constant_mask [16] {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: vector<16xi1> + %1 = xegpu.load %arg0[%c0], %mask {layout_result_0 = #xegpu.slice<#xegpu.layout, dims = [1]>}: memref<16xf16>, index, vector<16xi1> -> vector<16xf16> + + %11 = vector.shape_cast %1 {layout_result_0 = #xegpu.layout} : vector<16xf16> to vector<16x1xf16> + %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : vector<16x1xf16> to vector<16x16xf16> + // CHECK-NOT: vector.broadcast + // CHECK-NOT: vector.shape_cast + + %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + // CHECK: xegpu.store_nd {{.*}}, {{.*}}[{{.*}}, {{.*}}] + // CHECK-SAME: : vector<16xf16>, !xegpu.tensor_desc<16x16xf16> + + xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } +} + +// ----- +// CHECK-LABEL: gpu.func @vector_shape_cast_scalar_to_vector({{.*}}) { +gpu.module @xevm_module{ + gpu.func @vector_shape_cast_scalar_to_vector(%arg0: memref<16xf16>, %arg1: memref<16x16xf16>) { + %c0 = arith.constant 0 : index + %9 = gpu.block_id x + %10 = arith.index_cast %9 : index to i16 + %11 = arith.bitcast %10 : i16 to f16 + // CHECK: vector.broadcast {{.*}} : f16 to vector<16xf16> + %2 = vector.broadcast %11 {layout_result_0 = #xegpu.layout} : f16 to vector<16x16xf16> + %tdesc1 = xegpu.create_nd_tdesc %arg1 : memref<16x16xf16> + -> !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + xegpu.store_nd %2, %tdesc1[%c0, %c0] <{layout = #xegpu.layout}> : vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.layout> + gpu.return + } +} + + From 0c0ed398ecef2d3840b4ebc11b41d638339d299b Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 9 Dec 2025 08:50:37 -0800 Subject: [PATCH 20/63] [lldb] Don't read firstSubclass and nextSiblingClass from class_rw_t 
(#171213)

We're considering modifying the ObjC runtime's class_rw_t structure to remove the firstSubclass and nextSiblingClass fields in some cases. LLDB is currently reading those but not actually using them. Stop doing that to avoid issues if they are removed by the runtime.

rdar://166084122
--- .../ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp | 5 +----
.../ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h | 3 ---
2 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
index 954f269f8860b..ebde8892d8f62 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.cpp
@@ -111,7 +111,6 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) {
process->GetAddressByteSize());
lldb::offset_t cursor = 0;
- m_flags = extractor.GetU32_unchecked(&cursor);
m_version = extractor.GetU32_unchecked(&cursor);
m_ro_ptr = extractor.GetAddress_unchecked(&cursor);
@@ -119,18 +118,16 @@ bool ClassDescriptorV2::class_rw_t::Read(Process *process, lldb::addr_t addr) {
m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr);
m_method_list_ptr = extractor.GetAddress_unchecked(&cursor);
m_properties_ptr = extractor.GetAddress_unchecked(&cursor);
- m_firstSubclass = extractor.GetAddress_unchecked(&cursor);
- m_nextSiblingClass = extractor.GetAddress_unchecked(&cursor);
if (m_ro_ptr & 1) {
DataBufferHeap buffer(ptr_size, '\0');
process->ReadMemory(m_ro_ptr ^ 1, buffer.GetBytes(), ptr_size, error);
if (error.Fail())
return false;
- cursor = 0;
DataExtractor extractor(buffer.GetBytes(), ptr_size, process->GetByteOrder(), process->GetAddressByteSize());
+ lldb::offset_t cursor = 0;
m_ro_ptr = extractor.GetAddress_unchecked(&cursor);
if (ABISP abi_sp = process->GetABI())
m_ro_ptr = abi_sp->FixCodeAddress(m_ro_ptr);
diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
index 0fff9af438367..8d19b00f1551f 100644
--- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
+++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCClassDescriptorV2.h
@@ -133,9 +133,6 @@ class ClassDescriptorV2 : public ObjCLanguageRuntime::ClassDescriptor {
lldb::addr_t m_properties_ptr;
lldb::addr_t m_protocols_ptr;
- ObjCLanguageRuntime::ObjCISA m_firstSubclass;
- ObjCLanguageRuntime::ObjCISA m_nextSiblingClass;
- bool Read(Process *process, lldb::addr_t addr);
};

From 2e16f24957eea220d09bdc869d369c80904eecaf Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 9 Dec 2025 08:51:43 -0800
Subject: [PATCH 21/63] [RISCV] Add VMNoV0 register class with only the VMaskVTs. (#171231)

I plan to use this for inline assembly "vd" constraints with mask types in a follow-up patch. Due to the test changes I wanted to post this separately.
--- llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 1 + .../instruction-select/rvv/select.mir | 20 +++++++++---------- .../RISCV/rvv/pass-fast-math-flags-sdnode.ll | 2 +- .../RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir | 4 ++-- .../rvv/strided-vpload-vpstore-output.ll | 2 +- .../RISCV/rvv/vleff-vlseg2ff-output.ll | 4 ++-- .../CodeGen/RISCV/rvv/vmerge-peephole.mir | 4 ++-- .../CodeGen/RISCV/rvv/vmv.v.v-peephole.mir | 6 +++--- .../RISCV/rvv/vsetvli-insert-crossbb.mir | 4 ++-- 9 files changed, 24 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 11b7a0a3c691a..f354793eb0eac 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -813,6 +813,7 @@ def VMV0 : VReg; // The register class is added for inline assembly for vector mask types. def VM : VReg; +def VMNoV0 : VReg; defvar VTupM1N2VTs = [riscv_nxv8i8x2, riscv_nxv4i8x2, riscv_nxv2i8x2, riscv_nxv1i8x2]; defvar VTupM1N3VTs = [riscv_nxv8i8x3, riscv_nxv4i8x3, riscv_nxv2i8x3, riscv_nxv1i8x3]; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir index ada76a43639d7..b7cb295648b4e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rvv/select.mir @@ -11,7 +11,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv1i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -19,7 +19,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv1i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -40,7 +40,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv4i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -48,7 +48,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv4i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 3 /* e8 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -98,7 +98,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv64i8 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; 
RV32I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -106,7 +106,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv64i8 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF4_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF4 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF4_]] @@ -127,7 +127,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv2i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -135,7 +135,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv2i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_M1_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_M1 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 4 /* e16 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_M1_]] @@ -185,7 +185,7 @@ body: | bb.0.entry: ; RV32I-LABEL: name: select_nxv32i16 ; RV32I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV32I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV32I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV32I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] @@ -193,7 +193,7 @@ body: | ; ; RV64I-LABEL: name: select_nxv32i16 ; RV64I: [[DEF:%[0-9]+]]:vmv0 = IMPLICIT_DEF - ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vrnov0 = IMPLICIT_DEF + ; RV64I-NEXT: [[DEF1:%[0-9]+]]:vmnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[DEF2:%[0-9]+]]:vrnov0 = IMPLICIT_DEF ; RV64I-NEXT: [[PseudoVMERGE_VVM_MF2_:%[0-9]+]]:vrnov0 = PseudoVMERGE_VVM_MF2 [[DEF2]], [[DEF1]], [[DEF1]], [[DEF]], -1, 5 /* e32 */ ; RV64I-NEXT: $v8 = COPY [[PseudoVMERGE_VVM_MF2_]] diff --git a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll index 0654fe8bd8d66..3225d649f066e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/pass-fast-math-flags-sdnode.ll @@ -13,7 +13,7 @@ define @foo( %x, @llvm.vp.fmul.nxv1f64( %x, %y, %m, i32 %vl) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir index c73c2004834db..ece457a09dbdf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-peephole-vmerge-to-vmv.mir @@ -11,7 +11,7 @@ body: | ; CHECK: liveins: $x1, $v8, $v9 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: %false:vrnov0 = COPY $v8 - ; CHECK-NEXT: %true:vrnov0 = COPY $v9 + ; CHECK-NEXT: %true:vmnov0 = COPY $v9 ; CHECK-NEXT: %avl:gprnox0 = COPY $x1 ; CHECK-NEXT: %mask:vmv0 = PseudoVMSET_M_B8 %avl, 0 /* e8 */ ; CHECK-NEXT: $v0 = COPY %mask @@ -135,7 +135,7 @@ body: | ; CHECK-NEXT: {{ $}} 
; CHECK-NEXT: %false:vrnov0 = COPY $v8
; CHECK-NEXT: %mask:vmv0 = COPY $v0
- ; CHECK-NEXT: %true:vrnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */
+ ; CHECK-NEXT: %true:vmnov0 = PseudoVADD_VV_M1_MASK %false, $noreg, $noreg, %mask, 4, 5 /* e32 */, 1 /* ta, mu */
%false:vrnov0 = COPY $v8
%mask:vmv0 = COPY $v0
%true:vrnov0 = PseudoVADD_VV_M1_MASK $noreg, $noreg, $noreg, %mask, 4, 5 /* e32 */, 0 /* tu, mu */
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
index f087efcc5f57b..d3649ef4b6664 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload-vpstore-output.ll
@@ -15,7 +15,7 @@ define @strided_vpload_nxv1i8_i8(ptr %ptr, i8 signext %stride,
; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr = COPY $x11
; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr = COPY $x10
; CHECK-NEXT: [[COPY4:%[0-9]+]]:vmv0 = COPY [[COPY1]]
- ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vrnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
+ ; CHECK-NEXT: [[PseudoVLSE8_V_MF8_MASK:%[0-9]+]]:vmnov0 = PseudoVLSE8_V_MF8_MASK $noreg, [[COPY3]], [[COPY2]], [[COPY4]], [[COPY]], 3 /* e8 */, 1 /* ta, mu */ :: (load unknown-size, align 1)
; CHECK-NEXT: $v8 = COPY [[PseudoVLSE8_V_MF8_MASK]]
; CHECK-NEXT: PseudoRET implicit $v8
%load = call @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 %stride, %m, i32 %evl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 6b6276b838fba..7cbaceae858b3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -42,9 +42,9 @@ define i64 @test_vleff_nxv8i8_mask( %maskedoff, ptr %p, Date: Tue, 9 Dec 2025 08:52:55 -0800
Subject: [PATCH 22/63] [RISCV] Use VM and VMNoV0 for "vr" and "vd" inline asm constraints with mask type. (#171235)

The inline assembly handling in SelectionDAG uses the first type for the register class as the type at the input/output of the inline assembly. If this isn't the type for the surrounding DAG, it needs to be converted.

nxv8i8 is the first type for the VR and VRNoV0 register classes. So we currently generate insert/extract_subvector and bitcasts to convert to/from nxv8i8.

I believe some of the special casing we have for this in splitValueIntoRegisterParts and joinRegisterPartsIntoValue is causing us to also generate incorrect code for arguments with nxv16i4 types that should be any extended to nxv16i8. Instead we widen them to nxv32i4 and bitcast to nxv16i8.

This patch uses VM and VMNoV0 for masks, which have nxv64i1 as their first type. This means we will only emit an insert/extract_subvector without any bitcasts. This will allow me to fix splitValueIntoRegisterParts and joinRegisterPartsIntoValue to fix the nxv16i4 argument issue without breaking inline assembly.

I may need to add more register classes to cover fractional LMULs, but I'm not sure yet.
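As a rough illustration of the user-visible pattern (a hand-written sketch, not code from this patch; the header and constraint spellings are the standard RVV ones, and the function itself is hypothetical):

```c++
#include <riscv_vector.h>

// A scalable mask type such as vbool8_t lowers to nxv8i1. With this change,
// a mask-typed operand under the "vr"/"vd" constraints is matched to the
// VM/VMNoV0 register classes (whose first type is nxv64i1), so only an
// insert/extract_subvector is emitted around the asm, with no bitcasts
// through nxv8i8.
vbool8_t mask_identity(vbool8_t m) {
  vbool8_t r;
  // "vd" requests a vector register other than v0; "vr" allows any.
  __asm__("vmand.mm %0, %1, %1" : "=vd"(r) : "vr"(m));
  return r;
}
```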
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 32 +++++++++++---------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b74dba12e8959..7cbb9c0da4874 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -24323,14 +24323,15 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } } else if (Constraint == "vr") { + // Check VM first so that mask types will use that instead of VR. for (const auto *RC : - {&RISCV::VRRegClass, &RISCV::VRM2RegClass, &RISCV::VRM4RegClass, - &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, &RISCV::VRN3M1RegClass, - &RISCV::VRN4M1RegClass, &RISCV::VRN5M1RegClass, - &RISCV::VRN6M1RegClass, &RISCV::VRN7M1RegClass, - &RISCV::VRN8M1RegClass, &RISCV::VRN2M2RegClass, - &RISCV::VRN3M2RegClass, &RISCV::VRN4M2RegClass, - &RISCV::VRN2M4RegClass}) { + {&RISCV::VMRegClass, &RISCV::VRRegClass, &RISCV::VRM2RegClass, + &RISCV::VRM4RegClass, &RISCV::VRM8RegClass, &RISCV::VRN2M1RegClass, + &RISCV::VRN3M1RegClass, &RISCV::VRN4M1RegClass, + &RISCV::VRN5M1RegClass, &RISCV::VRN6M1RegClass, + &RISCV::VRN7M1RegClass, &RISCV::VRN8M1RegClass, + &RISCV::VRN2M2RegClass, &RISCV::VRN3M2RegClass, + &RISCV::VRN4M2RegClass, &RISCV::VRN2M4RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); @@ -24341,15 +24342,16 @@ RISCVTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, } } } else if (Constraint == "vd") { + // Check VMNoV0 first so that mask types will use that instead of VRNoV0. for (const auto *RC : - {&RISCV::VRNoV0RegClass, &RISCV::VRM2NoV0RegClass, - &RISCV::VRM4NoV0RegClass, &RISCV::VRM8NoV0RegClass, - &RISCV::VRN2M1NoV0RegClass, &RISCV::VRN3M1NoV0RegClass, - &RISCV::VRN4M1NoV0RegClass, &RISCV::VRN5M1NoV0RegClass, - &RISCV::VRN6M1NoV0RegClass, &RISCV::VRN7M1NoV0RegClass, - &RISCV::VRN8M1NoV0RegClass, &RISCV::VRN2M2NoV0RegClass, - &RISCV::VRN3M2NoV0RegClass, &RISCV::VRN4M2NoV0RegClass, - &RISCV::VRN2M4NoV0RegClass}) { + {&RISCV::VMNoV0RegClass, &RISCV::VRNoV0RegClass, + &RISCV::VRM2NoV0RegClass, &RISCV::VRM4NoV0RegClass, + &RISCV::VRM8NoV0RegClass, &RISCV::VRN2M1NoV0RegClass, + &RISCV::VRN3M1NoV0RegClass, &RISCV::VRN4M1NoV0RegClass, + &RISCV::VRN5M1NoV0RegClass, &RISCV::VRN6M1NoV0RegClass, + &RISCV::VRN7M1NoV0RegClass, &RISCV::VRN8M1NoV0RegClass, + &RISCV::VRN2M2NoV0RegClass, &RISCV::VRN3M2NoV0RegClass, + &RISCV::VRN4M2NoV0RegClass, &RISCV::VRN2M4NoV0RegClass}) { if (TRI->isTypeLegalForClass(*RC, VT.SimpleTy)) return std::make_pair(0U, RC); From b3b033bf7350e6a7cd581caa21f18471aad65c0f Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 9 Dec 2025 09:00:57 -0800 Subject: [PATCH 23/63] [CIR][NFC] Fix bad switch fallthroughs in emitStmt (#171224) This moves a couple of statement emitters that were incorrectly implemented in the middle of a switch statement where all cases in the final group are intended to fall through to a handler that emits an NYI error message. The placement of these implementations was causing some statement types that should have emitted the NYI error to instead go to a handler for a different statement type. 
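For readers skimming the diff, a minimal reduction of the hazard (hypothetical enumerator and helper names, not the real ones from CIRGenStmt.cpp):

```c++
#include <cstdio>

enum class Kind { A, B, C, D };

// 'handleB' was implemented in the middle of a group whose cases are all
// meant to fall through to the catch-all NYI handler, so Kind::A silently
// routes to handleB instead of reporting "not yet implemented".
int handle(Kind kind) {
  switch (kind) {
  case Kind::A: // intended to reach the NYI handler below
  case Kind::B:
    return std::puts("handleB"); // Kind::A now lands here too -- the bug
  case Kind::C:
  case Kind::D:
  default:
    return std::puts("not yet implemented");
  }
}
```

Hoisting the implemented cases above the fall-through group, as this patch does, restores the intended behavior.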
--- clang/lib/CIR/CodeGen/CIRGenStmt.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
index da7ab0691cb63..f13e7cb32c71e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp
@@ -159,6 +159,10 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
return emitCXXTryStmt(cast(*s));
case Stmt::CXXForRangeStmtClass:
return emitCXXForRangeStmt(cast(*s), attr);
+ case Stmt::CoroutineBodyStmtClass:
+ return emitCoroutineBody(cast(*s));
+ case Stmt::IndirectGotoStmtClass:
+ return emitIndirectGotoStmt(cast(*s));
case Stmt::OpenACCComputeConstructClass:
return emitOpenACCComputeConstruct(cast(*s));
case Stmt::OpenACCLoopConstructClass:
@@ -199,11 +203,7 @@ mlir::LogicalResult CIRGenFunction::emitStmt(const Stmt *s,
case Stmt::CaseStmtClass:
case Stmt::SEHLeaveStmtClass:
case Stmt::SYCLKernelCallStmtClass:
- case Stmt::CoroutineBodyStmtClass:
- return emitCoroutineBody(cast(*s));
case Stmt::CoreturnStmtClass:
- case Stmt::IndirectGotoStmtClass:
- return emitIndirectGotoStmt(cast(*s));
case Stmt::OMPParallelDirectiveClass:
case Stmt::OMPTaskwaitDirectiveClass:
case Stmt::OMPTaskyieldDirectiveClass:

From e6145e870977babfc599d8339675a2b8c56da730 Mon Sep 17 00:00:00 2001
From: Andy Kaylor
Date: Tue, 9 Dec 2025 09:01:16 -0800
Subject: [PATCH 24/63] [CIR][NFC] Add stubs for missing visitors in ScalarExprEmitter (#171222)

This adds stubs that issue NYI errors for any visitor that is present in the ClangIR incubator but missing in the upstream implementation. This will make it easier to find the correct locations to implement missing functionality.
--- clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 170 ++++++++++++++++++++-
1 file changed, 169 insertions(+), 1 deletion(-)

diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 9043ecab42f15..6820e2a403288 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -139,6 +139,11 @@ class ScalarExprEmitter : public StmtVisitor {
return {};
}
+ mlir::Value VisitConstantExpr(ConstantExpr *e) {
+ cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: constant expr");
+ return {};
+ }
+ mlir::Value VisitPackIndexingExpr(PackIndexingExpr *e) {
return Visit(e->getSelectedExpr());
}
@@ -159,6 +164,14 @@ class ScalarExprEmitter : public StmtVisitor {
mlir::Value VisitCoawaitExpr(CoawaitExpr *s) {
return cgf.emitCoawaitExpr(*s).getValue();
}
+ mlir::Value VisitCoyieldExpr(CoyieldExpr *e) {
+ cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: coyield");
+ return {};
+ }
+ mlir::Value VisitUnaryCoawait(const UnaryOperator *e) {
+ cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: unary coawait");
+ return {};
+ }
mlir::Value emitLoadOfLValue(LValue lv, SourceLocation loc) {
return cgf.emitLoadOfLValue(lv, loc).getValue();
}
@@ -198,6 +211,12 @@ class ScalarExprEmitter : public StmtVisitor {
cir::IntAttr::get(type, e->getValue()));
}
+ mlir::Value VisitFixedPointLiteral(const FixedPointLiteral *e) {
+ cgf.cgm.errorNYI(e->getSourceRange(),
+ "ScalarExprEmitter: fixed point literal");
+ return {};
+ }
+ mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) {
mlir::Type type = cgf.convertType(e->getType());
assert(mlir::isa(type) &&
@@ -229,6 +248,23 @@ class ScalarExprEmitter : public StmtVisitor {
mlir::Value VisitOffsetOfExpr(OffsetOfExpr *e);
+ mlir::Value VisitSizeOfPackExpr(SizeOfPackExpr *e) {
cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: size of pack"); + return {}; + } + mlir::Value VisitPseudoObjectExpr(PseudoObjectExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: pseudo object"); + return {}; + } + mlir::Value VisitSYCLUniqueStableNameExpr(SYCLUniqueStableNameExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: sycl unique stable name"); + return {}; + } + mlir::Value VisitEmbedExpr(EmbedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: embed"); + return {}; + } mlir::Value VisitOpaqueValueExpr(OpaqueValueExpr *e) { if (e->isGLValue()) return emitLoadOfLValue(cgf.getOrCreateOpaqueLValueMapping(e), @@ -238,6 +274,38 @@ class ScalarExprEmitter : public StmtVisitor { return cgf.getOrCreateOpaqueRValueMapping(e).getValue(); } + mlir::Value VisitObjCSelectorExpr(ObjCSelectorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc selector"); + return {}; + } + mlir::Value VisitObjCProtocolExpr(ObjCProtocolExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc protocol"); + return {}; + } + mlir::Value VisitObjCIVarRefExpr(ObjCIvarRefExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc ivar ref"); + return {}; + } + mlir::Value VisitObjCMessageExpr(ObjCMessageExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc message"); + return {}; + } + mlir::Value VisitObjCIsaExpr(ObjCIsaExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc isa"); + return {}; + } + mlir::Value VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc availability check"); + return {}; + } + + mlir::Value VisitMatrixSubscriptExpr(MatrixSubscriptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: matrix subscript"); + return {}; + } + mlir::Value VisitCastExpr(CastExpr *e); mlir::Value VisitCallExpr(const CallExpr *e); @@ -319,6 +387,18 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitInitListExpr(InitListExpr *e); + mlir::Value VisitArrayInitIndexExpr(ArrayInitIndexExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array init index"); + return {}; + } + + mlir::Value VisitImplicitValueInitExpr(const ImplicitValueInitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: implicit value init"); + return {}; + } + mlir::Value VisitExplicitCastExpr(ExplicitCastExpr *e) { return VisitCastExpr(e); } @@ -726,6 +806,16 @@ class ScalarExprEmitter : public StmtVisitor { return Visit(e->getSubExpr()); } + // C++ + mlir::Value VisitMaterializeTemporaryExpr(const MaterializeTemporaryExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: materialize temporary"); + return {}; + } + mlir::Value VisitSourceLocExpr(SourceLocExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: source loc"); + return {}; + } mlir::Value VisitCXXDefaultArgExpr(CXXDefaultArgExpr *dae) { CIRGenFunction::CXXDefaultArgExprScope scope(cgf, dae); return Visit(dae->getExpr()); @@ -745,11 +835,43 @@ class ScalarExprEmitter : public StmtVisitor { cgf.emitCXXDeleteExpr(e); return {}; } - + mlir::Value VisitTypeTraitExpr(const TypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: type trait"); + return {}; + } + mlir::Value + VisitConceptSpecializationExpr(const ConceptSpecializationExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: concept 
specialization"); + return {}; + } + mlir::Value VisitRequiresExpr(const RequiresExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: requires"); + return {}; + } + mlir::Value VisitArrayTypeTraitExpr(const ArrayTypeTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: array type trait"); + return {}; + } + mlir::Value VisitExpressionTraitExpr(const ExpressionTraitExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: expression trait"); + return {}; + } + mlir::Value VisitCXXPseudoDestructorExpr(const CXXPseudoDestructorExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: cxx pseudo destructor"); + return {}; + } mlir::Value VisitCXXThrowExpr(const CXXThrowExpr *e) { cgf.emitCXXThrowExpr(e); return {}; } + mlir::Value VisitCXXNoexceptExpr(CXXNoexceptExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: cxx noexcept"); + return {}; + } /// Emit a conversion from the specified type to the specified destination /// type, both of which are CIR scalar types. @@ -1213,6 +1335,52 @@ class ScalarExprEmitter : public StmtVisitor { return maybePromoteBoolResult(resOp.getResult(), resTy); } + mlir::Value VisitBinPtrMemD(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem d"); + return {}; + } + + mlir::Value VisitBinPtrMemI(const BinaryOperator *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: ptr mem i"); + return {}; + } + + // Other Operators. + mlir::Value VisitBlockExpr(const BlockExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: block"); + return {}; + } + + mlir::Value VisitChooseExpr(ChooseExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: choose"); + return {}; + } + + mlir::Value VisitObjCStringLiteral(const ObjCStringLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc string literal"); + return {}; + } + mlir::Value VisitObjCBoxedExpr(ObjCBoxedExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: objc boxed"); + return {}; + } + mlir::Value VisitObjCArrayLiteral(ObjCArrayLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc array literal"); + return {}; + } + mlir::Value VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *e) { + cgf.cgm.errorNYI(e->getSourceRange(), + "ScalarExprEmitter: objc dictionary literal"); + return {}; + } + + mlir::Value VisitAsTypeExpr(AsTypeExpr *e) { + cgf.cgm.errorNYI(e->getSourceRange(), "ScalarExprEmitter: as type"); + return {}; + } + mlir::Value VisitAtomicExpr(AtomicExpr *e) { return cgf.emitAtomicExpr(e).getValue(); } From 9b12f8fcaeb4e9f8a03de6e982e94e525a9a4dc6 Mon Sep 17 00:00:00 2001 From: nerix Date: Tue, 9 Dec 2025 18:06:26 +0100 Subject: [PATCH 25/63] [LLDB] Run MSVC STL smart pointer tests with PDB (#166946) Runs the `std::shared/unique_ptr` tests with PDB with two changes: - PDB uses the "full" name, so `std::string` is `std::basic_string, std::allocator>` - The type of the pointer inside the shared/unique_ptr isn't the `element_type` typedef --- .../shared_ptr/TestDataFormatterStdSharedPtr.py | 14 ++++++++++++-- .../unique_ptr/TestDataFormatterStdUniquePtr.py | 12 +++++++++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py index 
d71fbf8d5f81a..fa03fc14dfb83 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/shared_ptr/TestDataFormatterStdSharedPtr.py @@ -9,6 +9,8 @@ class TestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): """Test `frame variable` output for `std::shared_ptr` types.""" (_, process, _, bkpt) = lldbutil.run_to_source_breakpoint( @@ -62,7 +64,7 @@ def do_test(self): valobj = self.expect_var_path("sp_user", type="std::shared_ptr") self.assertRegex( valobj.summary, - "element_type @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0", + f"{'User' if self.getDebugInfo() == 'pdb' else 'element_type'} @ 0x0*[1-9a-f][0-9a-f]+( strong=1)? weak=0", ) self.assertNotEqual(valobj.child[0].unsigned, 0) @@ -77,7 +79,15 @@ def do_test(self): self.assertEqual(str(valobj), '(User) *pointer = (id = 30, name = "steph")') self.expect_var_path("sp_user->id", type="int", value="30") - self.expect_var_path("sp_user->name", type="std::string", summary='"steph"') + self.expect_var_path( + "sp_user->name", + type=( + "std::basic_string, std::allocator>" + if self.getDebugInfo() == "pdb" + else "std::string" + ), + summary='"steph"', + ) valobj = self.expect_var_path( "si", type="std::shared_ptr", summary="47 strong=2 weak=0" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py index 0b68b1b532bb0..1516db698798d 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/unique_ptr/TestDataFormatterStdUniquePtr.py @@ -9,6 +9,8 @@ class TestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def do_test(self): """Test `frame variable` output for `std::unique_ptr` types.""" @@ -84,7 +86,15 @@ def do_test(self): self.assertNotEqual(valobj.child[0].unsigned, 0) self.expect_var_path("up_user->id", type="int", value="30") - self.expect_var_path("up_user->name", type="std::string", summary='"steph"') + self.expect_var_path( + "up_user->name", + type=( + "std::basic_string, std::allocator>" + if self.getDebugInfo() == "pdb" + else "std::string" + ), + summary='"steph"', + ) self.runCmd("settings set target.experimental.use-DIL true") self.expect_var_path("ptr_node->value", value="1") From 1bada0af22d878b2ec0cfefd655c09b801b44918 Mon Sep 17 00:00:00 2001 From: Rajat Bajpai Date: Tue, 9 Dec 2025 22:39:11 +0530 Subject: [PATCH 26/63] [NVPTX] Add IR pass for FMA transformation in the llc pipeline (#154735) This change introduces a new IR pass in the llc pipeline for NVPTX that transforms sequences of FMUL followed by FADD or FSUB into a single FMA instruction. Currently, all FMA folding for NVPTX occurs at the DAGCombine stage, which is too late for any IR-level passes that might want to optimize or analyze FMAs. By moving this transformation earlier into the IR phase, we enable more opportunities for FMA folding, including across basic blocks. Additionally, this new pass relies on the contract instruction level fast-math flag to perform these transformations, rather than depending on the -fp-contract=fast or -enable-unsafe-fp-math options passed to llc. 
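For context, a sketch of where the `contract` flag this pass keys on typically comes from (a hedged example; the function name is hypothetical, and contraction can equally be enabled globally with `-ffp-contract=fast`):

```c++
// With contraction enabled, clang attaches the 'contract' fast-math flag to
// the fmul and fadd it emits for this expression. The new pass then folds
// the pair into a single @llvm.fma.f32 call at the IR level, before
// instruction selection, even when the fmul and fadd land in different
// basic blocks.
float mad(float a, float b, float c) {
#pragma clang fp contract(fast)
  return a * b + c;
}
```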
--- llvm/lib/Target/NVPTX/CMakeLists.txt | 1 + llvm/lib/Target/NVPTX/NVPTX.h | 6 + llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp | 167 +++++++++++++ llvm/lib/Target/NVPTX/NVPTXPassRegistry.def | 1 + llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp | 10 + llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll | 247 +++++++++++++++++++ 6 files changed, 432 insertions(+) create mode 100644 llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp create mode 100644 llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll diff --git a/llvm/lib/Target/NVPTX/CMakeLists.txt b/llvm/lib/Target/NVPTX/CMakeLists.txt index f9c24750c4836..6fe58c25c757d 100644 --- a/llvm/lib/Target/NVPTX/CMakeLists.txt +++ b/llvm/lib/Target/NVPTX/CMakeLists.txt @@ -18,6 +18,7 @@ set(NVPTXCodeGen_sources NVPTXAssignValidGlobalNames.cpp NVPTXAtomicLower.cpp NVPTXCtorDtorLowering.cpp + NVPTXIRPeephole.cpp NVPTXForwardParams.cpp NVPTXFrameLowering.cpp NVPTXGenericToNVVM.cpp diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 95fd05f2a926f..210624fbb235c 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -52,6 +52,7 @@ FunctionPass *createNVPTXLowerAllocaPass(); FunctionPass *createNVPTXLowerUnreachablePass(bool TrapUnreachable, bool NoTrapAfterNoreturn); FunctionPass *createNVPTXTagInvariantLoadsPass(); +FunctionPass *createNVPTXIRPeepholePass(); MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass *createNVPTXProxyRegErasurePass(); MachineFunctionPass *createNVPTXForwardParamsPass(); @@ -75,12 +76,17 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &); void initializeNVPTXExternalAAWrapperPass(PassRegistry &); void initializeNVPTXPeepholePass(PassRegistry &); void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &); +void initializeNVPTXIRPeepholePass(PassRegistry &); void initializeNVPTXPrologEpilogPassPass(PassRegistry &); struct NVVMIntrRangePass : PassInfoMixin { PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; +struct NVPTXIRPeepholePass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + struct NVVMReflectPass : PassInfoMixin { NVVMReflectPass() : SmVersion(0) {} NVVMReflectPass(unsigned SmVersion) : SmVersion(SmVersion) {} diff --git a/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp new file mode 100644 index 0000000000000..bd16c7213b1e7 --- /dev/null +++ b/llvm/lib/Target/NVPTX/NVPTXIRPeephole.cpp @@ -0,0 +1,167 @@ +//===------ NVPTXIRPeephole.cpp - NVPTX IR Peephole --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements IR-level peephole optimizations. These transformations +// run late in the NVPTX IR pass pipeline just before the instruction selection. +// +// Currently, it implements the following transformation(s): +// 1. FMA folding (float/double types): +// Transforms FMUL+FADD/FSUB sequences into FMA intrinsics when the +// 'contract' fast-math flag is present. 
Supported patterns: +// - fadd(fmul(a, b), c) => fma(a, b, c) +// - fadd(c, fmul(a, b)) => fma(a, b, c) +// - fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d)) +// - fsub(fmul(a, b), c) => fma(a, b, fneg(c)) +// - fsub(a, fmul(b, c)) => fma(fneg(b), c, a) +// - fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d))) +// +//===----------------------------------------------------------------------===// + +#include "NVPTXUtilities.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" + +#define DEBUG_TYPE "nvptx-ir-peephole" + +using namespace llvm; + +static bool tryFoldBinaryFMul(BinaryOperator *BI) { + Value *Op0 = BI->getOperand(0); + Value *Op1 = BI->getOperand(1); + + auto *FMul0 = dyn_cast(Op0); + auto *FMul1 = dyn_cast(Op1); + + BinaryOperator *FMul = nullptr; + Value *OtherOperand = nullptr; + bool IsFirstOperand = false; + + // Either Op0 or Op1 should be a valid FMul + if (FMul0 && FMul0->getOpcode() == Instruction::FMul && FMul0->hasOneUse() && + FMul0->hasAllowContract()) { + FMul = FMul0; + OtherOperand = Op1; + IsFirstOperand = true; + } else if (FMul1 && FMul1->getOpcode() == Instruction::FMul && + FMul1->hasOneUse() && FMul1->hasAllowContract()) { + FMul = FMul1; + OtherOperand = Op0; + IsFirstOperand = false; + } else { + return false; + } + + bool IsFSub = BI->getOpcode() == Instruction::FSub; + LLVM_DEBUG({ + const char *OpName = IsFSub ? "FSub" : "FAdd"; + dbgs() << "Found " << OpName << " with FMul (single use) as " + << (IsFirstOperand ? "first" : "second") << " operand: " << *BI + << "\n"; + }); + + Value *MulOp0 = FMul->getOperand(0); + Value *MulOp1 = FMul->getOperand(1); + IRBuilder<> Builder(BI); + Value *FMA = nullptr; + + if (!IsFSub) { + // fadd(fmul(a, b), c) => fma(a, b, c) + // fadd(c, fmul(a, b)) => fma(a, b, c) + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {MulOp0, MulOp1, OtherOperand}); + } else { + if (IsFirstOperand) { + // fsub(fmul(a, b), c) => fma(a, b, fneg(c)) + Value *NegOtherOp = + Builder.CreateFNegFMF(OtherOperand, BI->getFastMathFlags()); + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {MulOp0, MulOp1, NegOtherOp}); + } else { + // fsub(a, fmul(b, c)) => fma(fneg(b), c, a) + Value *NegMulOp0 = + Builder.CreateFNegFMF(MulOp0, FMul->getFastMathFlags()); + FMA = Builder.CreateIntrinsic(Intrinsic::fma, {BI->getType()}, + {NegMulOp0, MulOp1, OtherOperand}); + } + } + + // Combine fast-math flags from the original instructions + auto *FMAInst = cast(FMA); + FastMathFlags BinaryFMF = BI->getFastMathFlags(); + FastMathFlags FMulFMF = FMul->getFastMathFlags(); + FastMathFlags NewFMF = FastMathFlags::intersectRewrite(BinaryFMF, FMulFMF) | + FastMathFlags::unionValue(BinaryFMF, FMulFMF); + FMAInst->setFastMathFlags(NewFMF); + + LLVM_DEBUG({ + const char *OpName = IsFSub ? "FSub" : "FAdd"; + dbgs() << "Replacing " << OpName << " with FMA: " << *FMA << "\n"; + }); + BI->replaceAllUsesWith(FMA); + BI->eraseFromParent(); + FMul->eraseFromParent(); + return true; +} + +static bool foldFMA(Function &F) { + bool Changed = false; + + // Iterate and process float/double FAdd/FSub instructions with allow-contract + for (auto &I : llvm::make_early_inc_range(instructions(F))) { + if (auto *BI = dyn_cast(&I)) { + // Only FAdd and FSub are supported. + if (BI->getOpcode() != Instruction::FAdd && + BI->getOpcode() != Instruction::FSub) + continue; + + // At minimum, the instruction should have allow-contract. 
+ if (!BI->hasAllowContract())
+ continue;
+
+ // Only float and double are supported.
+ if (!BI->getType()->isFloatTy() && !BI->getType()->isDoubleTy())
+ continue;
+
+ if (tryFoldBinaryFMul(BI))
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+struct NVPTXIRPeephole : public FunctionPass {
+ static char ID;
+ NVPTXIRPeephole() : FunctionPass(ID) {}
+ bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+char NVPTXIRPeephole::ID = 0;
+INITIALIZE_PASS(NVPTXIRPeephole, "nvptx-ir-peephole", "NVPTX IR Peephole",
+ false, false)
+
+bool NVPTXIRPeephole::runOnFunction(Function &F) { return foldFMA(F); }
+
+FunctionPass *llvm::createNVPTXIRPeepholePass() {
+ return new NVPTXIRPeephole();
+}
+
+PreservedAnalyses NVPTXIRPeepholePass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (!foldFMA(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet();
+ return PA;
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index ee37c9826012c..7d645bff7110f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
+FUNCTION_PASS("nvptx-ir-peephole", NVPTXIRPeepholePass())
#undef FUNCTION_PASS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index a6837a482608c..74bae28044e66 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -51,6 +51,13 @@ static cl::opt
cl::desc("Disable load/store vectorizer"),
cl::init(false), cl::Hidden);
+// NVPTX IR Peephole is a new pass; this option lets us turn it off in case
+// we encounter some issues.
+static cl::opt
+ DisableNVPTXIRPeephole("disable-nvptx-ir-peephole",
+ cl::desc("Disable NVPTX IR Peephole"),
+ cl::init(false), cl::Hidden);
+
+// TODO: Remove this flag when we are confident with no regressions.
static cl::opt DisableRequireStructuredCFG( "disable-nvptx-require-structured-cfg", @@ -115,6 +122,7 @@ extern "C" LLVM_ABI LLVM_EXTERNAL_VISIBILITY void LLVMInitializeNVPTXTarget() { initializeNVPTXExternalAAWrapperPass(PR); initializeNVPTXPeepholePass(PR); initializeNVPTXTagInvariantLoadLegacyPassPass(PR); + initializeNVPTXIRPeepholePass(PR); initializeNVPTXPrologEpilogPassPass(PR); } @@ -379,6 +387,8 @@ void NVPTXPassConfig::addIRPasses() { addPass(createLoadStoreVectorizerPass()); addPass(createSROAPass()); addPass(createNVPTXTagInvariantLoadsPass()); + if (!DisableNVPTXIRPeephole) + addPass(createNVPTXIRPeepholePass()); } if (ST.hasPTXASUnreachableBug()) { diff --git a/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll new file mode 100644 index 0000000000000..6d9ad8d3ad436 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/nvptx-fold-fma.ll @@ -0,0 +1,247 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=nvptx-ir-peephole -S | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) +define float @test_fsub_fmul_c(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fsub_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul = fmul contract float %a, %b + %sub = fsub contract float %mul, %c + ret float %sub +} + + +; fsub(c, fmul(a, b)) => fma(-a, b, c) +define float @test_fsub_c_fmul(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fsub_c_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[A]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul = fmul contract float %a, %b + %sub = fsub contract float %c, %mul + ret float %sub +} + + +; fsub(fmul(a, b), fmul(c, d)) => fma(a, b, fneg(fmul(c, d))) +define float @test_fsub_fmul_fmul(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fsub_fmul_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[MUL2]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP1]]) +; CHECK-NEXT: ret float [[TMP2]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %sub = fsub contract float %mul1, %mul2 + ret float %sub +} + + +; fsub(fmul(a, b), fmul(c, d)) => fma(fneg(c), d, fmul(a, b))) +; fmul(a, b) has multiple uses. 
+define float @test_fsub_fmul_fmul_multiple_use(float %a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fsub_fmul_fmul_multiple_use( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL1:%.*]] = fmul contract float [[A]], [[B]] +; CHECK-NEXT: [[TMP1:%.*]] = fneg contract float [[C]] +; CHECK-NEXT: [[TMP2:%.*]] = call contract float @llvm.fma.f32(float [[TMP1]], float [[D]], float [[MUL1]]) +; CHECK-NEXT: [[ADD:%.*]] = fadd float [[TMP2]], [[MUL1]] +; CHECK-NEXT: ret float [[ADD]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %sub = fsub contract float %mul1, %mul2 + %add = fadd float %sub, %mul1 + ret float %add +} + + +; fsub(fmul(a, b), c) => fma(a, b, fneg(c)) where fsub and fmul are in different BBs +define float @test_fsub_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fsub_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = fneg contract float [[C_PHI]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[TMP0]]) +; CHECK-NEXT: ret float [[TMP1]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %sub = fsub contract float %mul, %c_phi + ret float %sub +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) +define float @test_fadd_fmul_c(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_fmul_c( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %mul, %c + ret float %add +} + + +; fadd(c, fmul(a, b)) => fma(a, b, c) +define float @test_fadd_c_fmul(float %a, float %b, float %c) { +; CHECK-LABEL: define float @test_fadd_c_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul = fmul contract float %a, %b + %add = fadd contract float %c, %mul + ret float %add +} + + +; fadd(fmul(a, b), fmul(c, d)) => fma(a, b, fmul(c, d)) +define float @test_fadd_fmul_fmul(float 
%a, float %b, float %c, float %d) { +; CHECK-LABEL: define float @test_fadd_fmul_fmul( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], float [[D:%.*]]) { +; CHECK-NEXT: [[MUL2:%.*]] = fmul contract float [[C]], [[D]] +; CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[MUL2]]) +; CHECK-NEXT: ret float [[TMP1]] +; + %mul1 = fmul contract float %a, %b + %mul2 = fmul contract float %c, %d + %add = fadd contract float %mul1, %mul2 + ret float %add +} + + +; fadd(fmul(a, b), c) => fma(a, b, c) where fadd and fmul are in different BBs +define float @test_fadd_fmul_different_BB(float %a, float %b, float %c, i32 %n) { +; CHECK-LABEL: define float @test_fadd_fmul_different_BB( +; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], float [[C:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[INIT:.*]]: +; CHECK-NEXT: [[CMP_ITER:%.*]] = icmp sgt i32 [[N]], 10 +; CHECK-NEXT: br i1 [[CMP_ITER]], label %[[ITERATION:.*]], label %[[EXIT:.*]] +; CHECK: [[ITERATION]]: +; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[INIT]] ], [ [[I_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[ACC:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT:%.*]], %[[ITERATION]] ] +; CHECK-NEXT: [[I_NEXT]] = add i32 [[I]], 1 +; CHECK-NEXT: [[ACC_NEXT]] = fadd contract float [[ACC]], 1.000000e+00 +; CHECK-NEXT: [[CMP_LOOP:%.*]] = icmp slt i32 [[I_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[CMP_LOOP]], label %[[ITERATION]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[C_PHI:%.*]] = phi float [ [[C]], %[[INIT]] ], [ [[ACC_NEXT]], %[[ITERATION]] ] +; CHECK-NEXT: [[TMP0:%.*]] = call contract float @llvm.fma.f32(float [[A]], float [[B]], float [[C_PHI]]) +; CHECK-NEXT: ret float [[TMP0]] +; +init: + %mul = fmul contract float %a, %b + %cmp_iter = icmp sgt i32 %n, 10 + br i1 %cmp_iter, label %iteration, label %exit + +iteration: + %i = phi i32 [ 0, %init ], [ %i_next, %iteration ] + %acc = phi float [ %c, %init ], [ %acc_next, %iteration ] + %i_next = add i32 %i, 1 + %acc_next = fadd contract float %acc, 1.0 + %cmp_loop = icmp slt i32 %i_next, %n + br i1 %cmp_loop, label %iteration, label %exit + +exit: + %c_phi = phi float [ %c, %init ], [ %acc_next, %iteration ] + %add = fadd contract float %mul, %c_phi + ret float %add +} + + +; These scenarios shouldn't work. 
+; fadd(fpext(fmul(a, b)), c) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_fpext_fmul_c(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fpext_fmul_c(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[EXT]], [[C]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+  %mul = fmul contract float %a, %b
+  %ext = fpext float %mul to double
+  %add = fadd contract double %ext, %c
+  ret double %add
+}
+
+
+; fadd(c, fpext(fmul(a, b))) => fma(fpext(a), fpext(b), c)
+define double @test_fadd_c_fpext_fmul(float %a, float %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_c_fpext_fmul(
+; CHECK-SAME: float [[A:%.*]], float [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[A]], [[B]]
+; CHECK-NEXT: [[EXT:%.*]] = fpext float [[MUL]] to double
+; CHECK-NEXT: [[ADD:%.*]] = fadd contract double [[C]], [[EXT]]
+; CHECK-NEXT: ret double [[ADD]]
+;
+  %mul = fmul contract float %a, %b
+  %ext = fpext float %mul to double
+  %add = fadd contract double %c, %ext
+  ret double %add
+}
+
+
+; Double precision tests
+; fsub(fmul(a, b), c) => fma(a, b, fneg(c))
+define double @test_fsub_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fsub_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = fneg contract double [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[TMP1]])
+; CHECK-NEXT: ret double [[TMP2]]
+;
+  %mul = fmul contract double %a, %b
+  %sub = fsub contract double %mul, %c
+  ret double %sub
+}
+
+
+; fadd(fmul(a, b), c) => fma(a, b, c)
+define double @test_fadd_fmul_c_double(double %a, double %b, double %c) {
+; CHECK-LABEL: define double @test_fadd_fmul_c_double(
+; CHECK-SAME: double [[A:%.*]], double [[B:%.*]], double [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call contract double @llvm.fma.f64(double [[A]], double [[B]], double [[C]])
+; CHECK-NEXT: ret double [[TMP1]]
+;
+  %mul = fmul contract double %a, %b
+  %add = fadd contract double %mul, %c
+  ret double %add
+}

From 4f79552d25c25606f5487076c109b2fe2a76a7e2 Mon Sep 17 00:00:00 2001
From: BaiXilin
Date: Tue, 9 Dec 2025 12:10:20 -0500
Subject: [PATCH 27/63] [x86][AVX-VNNI] Fix VPDPWXXD Argument Types (#169456)

Fixed the argument types of the following intrinsics to match the ISA:
- vpdpwssd_128, vpdpwssd_256, vpdpwssd_512
- vpdpwssds_128, vpdpwssds_256, vpdpwssds_512
- vpdpwsud_128, vpdpwsud_256, vpdpwsud_512
- vpdpwsuds_128, vpdpwsuds_256, vpdpwsuds_512
- vpdpwusd_128, vpdpwusd_256, vpdpwusd_512
- vpdpwusds_128, vpdpwusds_256, vpdpwusds_512
- vpdpwuud_128, vpdpwuud_256, vpdpwuud_512
- vpdpwuuds_128, vpdpwuuds_256, vpdpwuuds_512

Fixes #97271.
Note that this is the last PR for the issue.
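
For reference, a minimal IR-level sketch of the signature change (operand
types are taken from the updated tests in this patch; value names are
illustrative, and old-style calls are rewritten via AutoUpgrade):

```
; Old operand types; existing IR in this form is handled by AutoUpgrade:
%r0 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %s, <16 x i32> %a, <16 x i32> %b)
; New, ISA-accurate operand types (two v32i16 word sources per v16i32 accumulator):
%r1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %s, <32 x i16> %a, <32 x i16> %b)
```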
--- clang/include/clang/Basic/BuiltinsX86.td | 48 +-- clang/lib/Headers/avx10_2_512niintrin.h | 24 +- clang/lib/Headers/avx512vlvnniintrin.h | 17 +- clang/lib/Headers/avx512vnniintrin.h | 8 +- clang/lib/Headers/avxvnniint16intrin.h | 217 +++++----- clang/lib/Headers/avxvnniintrin.h | 12 +- .../test/CodeGen/X86/avx10_2_512ni-builtins.c | 36 +- clang/test/CodeGen/X86/avx10_2ni-builtins.c | 48 +-- .../test/CodeGen/X86/avx512vlvnni-builtins.c | 24 +- clang/test/CodeGen/X86/avx512vnni-builtins.c | 12 +- clang/test/CodeGen/X86/avxvnni-builtins.c | 16 +- .../test/CodeGen/X86/avxvnniint16-builtins.c | 24 +- llvm/include/llvm/IR/IntrinsicsX86.td | 60 +-- llvm/lib/IR/AutoUpgrade.cpp | 183 +++++++-- .../Instrumentation/MemorySanitizer.cpp | 102 ++++- .../CodeGen/X86/avx10.2-intrinsic-upgrade.ll | 96 +++++ .../CodeGen/X86/avx10_2_512ni-intrinsics.ll | 18 +- llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll | 24 +- .../CodeGen/X86/avx512vl_vnni-intrinsics.ll | 58 +-- .../X86/avx512vnni-intrinsics-upgrade.ll | 60 ++- .../test/CodeGen/X86/avx512vnni-intrinsics.ll | 28 +- .../X86/avx_vnni-intrinsics-upgrade.ll | 44 ++ llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll | 24 +- .../X86/avxvnniint16-intrinsics-upgrade.ll | 185 +++++++++ .../CodeGen/X86/avxvnniint16-intrinsics.ll | 72 ++-- .../CodeGen/X86/stack-folding-int-avxvnni.ll | 40 +- .../X86/stack-folding-int-avxvnniint16.ll | 24 +- .../X86/avx10_2_512ni-intrinsics.ll | 290 +++++++++----- .../X86/avx10_2ni-intrinsics.ll | 360 ++++++++++++----- .../X86/avx512vl_vnni-intrinsics-upgrade.ll | 72 ++-- .../X86/avx512vl_vnni-intrinsics.ll | 72 ++-- .../X86/avx512vnni-intrinsics-upgrade.ll | 36 +- .../X86/avx512vnni-intrinsics.ll | 36 +- .../X86/avx_vnni-intrinsics.ll | 24 +- .../X86/avxvnniint16-intrinsics.ll | 376 ++++++++++++------ 35 files changed, 1870 insertions(+), 900 deletions(-) create mode 100644 llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index feb3e9a4afe3b..71aee5038d518 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -1088,27 +1088,27 @@ let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<5 } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { - def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwssds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, short>)">; } let Features = 
"avx512vl,avx512vnni|avxvnni", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { - def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwssds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, short>)">; } let Features = "avx512vnni", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { - def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwssds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, short>)">; } let Features = "avxvnniint8|avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { @@ -4222,12 +4222,12 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avx10.2", Attributes = [NoThrow, RequiredVectorWidth<512>] in { - def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; - def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Vector<16, int>)">; + def vpdpwsud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwsuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, short>, _Vector<32, unsigned short>)">; + def vpdpwusd512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwusds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, short>)">; + def vpdpwuud512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; + def vpdpwuuds512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<32, unsigned short>, _Vector<32, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { @@ -4235,51 +4235,51 @@ let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<512> } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwsuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwsuds256 : X86Builtin<"_Vector<8, 
int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwsuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwusds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwusds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuud128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuud256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { - def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Vector<4, int>)">; + def vpdpwuuds128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<8, unsigned short>, _Vector<8, unsigned short>)">; } let Features = "avxvnniint16|avx10.2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { - def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Vector<8, int>)">; + def vpdpwuuds256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<16, unsigned short>, _Vector<16, unsigned short>)">; } let Features = "avx10.2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { diff --git a/clang/lib/Headers/avx10_2_512niintrin.h b/clang/lib/Headers/avx10_2_512niintrin.h index fdb57c7c9e27b..b2215b72c57bc 100644 --- a/clang/lib/Headers/avx10_2_512niintrin.h +++ b/clang/lib/Headers/avx10_2_512niintrin.h @@ -185,8 +185,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpbuuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsud512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -206,8 +206,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS 
_mm512_maskz_dpwsud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwsuds512((__v16si)__A, (__v32hi)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwsuds_epi32( @@ -227,8 +227,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwsuds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusd512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -248,8 +248,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusd_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwusds512((__v16si)__A, (__v32hu)__B, + (__v32hi)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwusds_epi32( @@ -269,8 +269,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwusds_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuud512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -290,8 +290,8 @@ static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_maskz_dpwuud_epi32( static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { - return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v16si)__B, - (__v16si)__C); + return (__m512i)__builtin_ia32_vpdpwuuds512((__v16si)__A, (__v32hu)__B, + (__v32hu)__C); } static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_mask_dpwuuds_epi32( diff --git a/clang/lib/Headers/avx512vlvnniintrin.h b/clang/lib/Headers/avx512vlvnniintrin.h index a1a0338a69e0d..4b8a199af32e5 100644 --- a/clang/lib/Headers/avx512vlvnniintrin.h +++ b/clang/lib/Headers/avx512vlvnniintrin.h @@ -80,8 +80,8 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v16hi)(A), (__v16hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -98,8 +98,9 @@ /// ENDFOR /// DST[MAX:256] := 0 /// \endcode -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v16hi)(A), \ + (__v16hi)(B))) /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with /// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed @@ -157,8 +158,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssd_epi32(S, A, B) \ + 
((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with /// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit @@ -175,8 +176,8 @@ /// ENDFOR /// DST[MAX:128] := 0 /// \endcode -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v8hi)(A), (__v8hi)(B))) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) diff --git a/clang/lib/Headers/avx512vnniintrin.h b/clang/lib/Headers/avx512vnniintrin.h index c386923360de6..2ce88efe4a04f 100644 --- a/clang/lib/Headers/avx512vnniintrin.h +++ b/clang/lib/Headers/avx512vnniintrin.h @@ -68,8 +68,8 @@ _mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS @@ -91,8 +91,8 @@ _mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v32hi)__A, + (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS diff --git a/clang/lib/Headers/avxvnniint16intrin.h b/clang/lib/Headers/avxvnniint16intrin.h index 805d249911c17..98d94ee3fcf3a 100644 --- a/clang/lib/Headers/avxvnniint16intrin.h +++ b/clang/lib/Headers/avxvnniint16intrin.h @@ -16,9 +16,10 @@ #define __AVXVNNIINT16INTRIN_H /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -40,19 +41,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwsud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 
+/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -74,20 +77,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) -/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile /// @@ -109,20 +113,22 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with #define _mm_dpwsuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v8hi)(__A), \ + (__v8hu)(__B))) /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. 
/// /// \headerfile /// @@ -144,19 +150,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwsuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v16hi)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -178,19 +186,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusd_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -212,20 +222,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusd_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. 
Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst. /// /// \headerfile /// @@ -233,7 +244,7 @@ /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 128-bit vector of [4 x int]. @@ -247,20 +258,21 @@ /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwusds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding signed 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding signed 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and +/// store the packed 32-bit results in \a dst. /// /// \headerfile /// @@ -268,7 +280,7 @@ /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUSDS instruction. /// /// \param __W /// A 256-bit vector of [8 x int]. @@ -282,19 +294,21 @@ /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) /// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwusds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hi)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. 
+/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -305,30 +319,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuud_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W, and store the packed 32-bit +/// results in \a dst. /// /// \headerfile /// @@ -339,31 +355,32 @@ /// This intrinsic corresponds to the \c VPDPWUUD instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuud_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. 
Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile /// @@ -371,34 +388,35 @@ /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// \param __A /// A 128-bit vector of [8 x unsigned short]. /// \param __B /// A 128-bit vector of [8 x unsigned short]. /// \returns -/// A 128-bit vector of [4 x unsigned int]. +/// A 128-bit vector of [4 x int]. /// /// \code{.operation} /// FOR j := 0 to 3 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:128] := 0 /// \endcode #define _mm_dpwuuds_epi32(__W, __A, __B) \ - ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A), \ - (__v4si)(__B))) + ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v8hu)(__A), \ + (__v8hu)(__B))) -/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with -/// corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate -/// signed 16-bit results. Sum these 2 results with the corresponding -/// 32-bit integer in \a __W with signed saturation, and store the packed -/// 32-bit results in \a dst. +/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A +/// with corresponding unsigned 16-bit integers in \a __B, producing 2 +/// intermediate signed 16-bit results. Sum these 2 results with the +/// corresponding 32-bit integer in \a __W with signed saturation, and store +/// the packed 32-bit results in \a dst. /// /// \headerfile /// @@ -406,27 +424,28 @@ /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B) /// \endcode /// -/// This intrinsic corresponds to the \c VPDPWSUDS instruction. +/// This intrinsic corresponds to the \c VPDPWUUDS instruction. /// /// \param __W -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. /// \param __A /// A 256-bit vector of [16 x unsigned short]. /// \param __B /// A 256-bit vector of [16 x unsigned short]. /// \returns -/// A 256-bit vector of [8 x unsigned int]. +/// A 256-bit vector of [8 x int]. 
/// /// \code{.operation} /// FOR j := 0 to 7 /// tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j]) -/// tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) +/// tmp2.dword := +/// ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1]) /// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2) /// ENDFOR /// dst[MAX:256] := 0 /// \endcode #define _mm256_dpwuuds_epi32(__W, __A, __B) \ - ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A), \ - (__v8si)(__B))) + ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v16hu)(__A), \ + (__v16hu)(__B))) #endif // __AVXVNNIINT16INTRIN_H diff --git a/clang/lib/Headers/avxvnniintrin.h b/clang/lib/Headers/avxvnniintrin.h index 3c4c44a930fe2..1d2e8c906effc 100644 --- a/clang/lib/Headers/avxvnniintrin.h +++ b/clang/lib/Headers/avxvnniintrin.h @@ -109,7 +109,8 @@ _mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -130,7 +131,8 @@ _mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v16hi)__A, + (__v16hi)__B); } /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with @@ -199,7 +201,8 @@ _mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with @@ -220,7 +223,8 @@ _mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v8hi)__A, + (__v8hi)__B); } #undef __DEFAULT_FN_ATTRS128 diff --git a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c index 728c9f5652dd9..1ba6e87653a74 100644 --- a/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2_512ni-builtins.c @@ -176,20 +176,20 @@ __m512i test_mm512_maskz_dpbuuds_epi32(__mmask16 __U, __m512i __W, __m512i __A, /* VNNI INT16 */ __m512i test_mm512_dpwsud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsud_epi32( -// CHECK: call 
<16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsud_epi32(__U, __A, __B, __C); @@ -197,20 +197,20 @@ __m512i test_mm512_maskz_dpwsud_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwsuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwsuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwsuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwsuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwsuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwsuds_epi32(__U, __A, __B, __C); @@ -218,20 +218,20 @@ __m512i test_mm512_maskz_dpwsuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwusd_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusd_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusd_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusd_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusd_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusd_epi32( 
-// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusd_epi32(__U, __A, __B, __C); @@ -239,20 +239,20 @@ __m512i test_mm512_maskz_dpwusd_epi32(__mmask16 __U, __m512i __A, __m512i __B, _ __m512i test_mm512_dpwusds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwusds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwusds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwusds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwusds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwusds_epi32(__U, __A, __B, __C); @@ -260,20 +260,20 @@ __m512i test_mm512_maskz_dpwusds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i test_mm512_dpwuud_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuud_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuud_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuud_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuud_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuud_epi32(__U, __A, __B, __C); @@ -281,20 +281,20 @@ __m512i test_mm512_maskz_dpwuud_epi32(__mmask16 __U, __m512i __A, __m512i __B, 
_ __m512i test_mm512_dpwuuds_epi32(__m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) return _mm512_dpwuuds_epi32(__A, __B, __C); } __m512i test_mm512_mask_dpwuuds_epi32(__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) { // CHECK-LABEL: @test_mm512_mask_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_dpwuuds_epi32(__A, __B, __C, __D); } __m512i test_mm512_maskz_dpwuuds_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) { // CHECK-LABEL: @test_mm512_maskz_dpwuuds_epi32( -// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}) +// CHECK: call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}) // CHECK: zeroinitializer // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_dpwuuds_epi32(__U, __A, __B, __C); diff --git a/clang/test/CodeGen/X86/avx10_2ni-builtins.c b/clang/test/CodeGen/X86/avx10_2ni-builtins.c index a250d91ae5989..be2719c33c52f 100644 --- a/clang/test/CodeGen/X86/avx10_2ni-builtins.c +++ b/clang/test/CodeGen/X86/avx10_2ni-builtins.c @@ -259,168 +259,168 @@ __m256i test_mm256_maskz_dpbuuds_epi32(__mmask8 __U, __m256i __W, __m256i __A, _ // VNNI INT16 __m128i test_mm_mask_dpwsud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) { // CHECK-LABEL: @test_mm_mask_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_mask_dpwsud_epi32(__A, __B, __C, __D); } __m128i test_mm_maskz_dpwsud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) { // CHECK-LABEL: @test_mm_maskz_dpwsud_epi32( -// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}) +// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm_maskz_dpwsud_epi32(__U, __A, __B, __C); } __m256i test_mm256_mask_dpwsud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) { // CHECK-LABEL: @test_mm256_mask_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}) // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_dpwsud_epi32(__A, __B, __C, __D); } __m256i test_mm256_maskz_dpwsud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) { // CHECK-LABEL: @test_mm256_maskz_dpwsud_epi32( -// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}) +// CHECK: call <8 x i32> 
@llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwsud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwsuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwsuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwsuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwsuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwsuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwsuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwsuds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwsuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwsuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwsuds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusd_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwusd_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwusd_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusd_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusd_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwusd_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwusd_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwusd_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusd_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwusd_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwusds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwusds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwusds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwusds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwusds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwusds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwusds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwusds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwusds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwusds_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuud_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwuud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwuud_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuud_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuud_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwuud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwuud_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwuud_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuud_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwuud_epi32(__U, __A, __B, __C);
 }
 
 __m128i test_mm_mask_dpwuuds_epi32(__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) {
 // CHECK-LABEL: @test_mm_mask_dpwuuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
 __m128i test_mm_maskz_dpwuuds_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) {
 // CHECK-LABEL: @test_mm_maskz_dpwuuds_epi32(
-// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+// CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
 // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
 
 __m256i test_mm256_mask_dpwuuds_epi32(__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) {
 // CHECK-LABEL: @test_mm256_mask_dpwuuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwuuds_epi32(__A, __B, __C, __D);
 }
 
 __m256i test_mm256_maskz_dpwuuds_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) {
 // CHECK-LABEL: @test_mm256_maskz_dpwuuds_epi32(
-// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+// CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
 // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwuuds_epi32(__U, __A, __B, __C);
 }
diff --git a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
index f63b5c6e73917..11dbd717a9f77 100644
--- a/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlvnni-builtins.c
@@ -47,41 +47,41 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m256i test_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -127,41 +127,41 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_mask_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m128i test_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_maskz_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}
   return _mm_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
diff --git a/clang/test/CodeGen/X86/avx512vnni-builtins.c b/clang/test/CodeGen/X86/avx512vnni-builtins.c
index afe80458e37cc..6b8465206eedb 100644
--- a/clang/test/CodeGen/X86/avx512vnni-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vnni-builtins.c
@@ -47,41 +47,41 @@ __m512i test_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) {
 
 __m512i test_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssd_epi32(__S, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssd_epi32(__U, __S, __A, __B);
 }
 
 __m512i test_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssd_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssd_epi32(__S, __A, __B);
 }
 
 __m512i test_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_dpwssds_epi32(__S, __U, __A, __B);
 }
 
 __m512i test_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_dpwssds_epi32(__U, __S, __A, __B);
 }
 
 __m512i test_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_dpwssds_epi32
-  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}})
+  // CHECK: call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}})
   return _mm512_dpwssds_epi32(__S, __A, __B);
 }
 
diff --git a/clang/test/CodeGen/X86/avxvnni-builtins.c b/clang/test/CodeGen/X86/avxvnni-builtins.c
index 7948e0d57d9bf..6557a26807eb2 100644
--- a/clang/test/CodeGen/X86/avxvnni-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnni-builtins.c
@@ -19,13 +19,13 @@ __m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -43,13 +43,13 @@ __m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_epi32(__S, __A, __B);
 }
 
@@ -67,13 +67,13 @@ __m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
 
 __m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssd_avx_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssd_avx_epi32(__S, __A, __B);
 }
 
 __m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_dpwssds_avx_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwssds_avx_epi32(__S, __A, __B);
 }
 
@@ -91,12 +91,12 @@ __m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
 
 __m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssd_avx_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssd_avx_epi32(__S, __A, __B);
 }
 
 __m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
   // CHECK-LABEL: test_mm_dpwssds_avx_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwssds_avx_epi32(__S, __A, __B);
 }
 
diff --git a/clang/test/CodeGen/X86/avxvnniint16-builtins.c b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
index 941da9aa223b5..f28fff2c0cfec 100644
--- a/clang/test/CodeGen/X86/avxvnniint16-builtins.c
+++ b/clang/test/CodeGen/X86/avxvnniint16-builtins.c
@@ -11,72 +11,72 @@
 __m128i test_mm_dpwsud_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwsud_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwsud_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwsud_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwsud_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwsud_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwsuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwsuds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwsuds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwsuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwsuds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwsuds_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwusd_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwusd_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwusd_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwusd_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwusd_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwusd_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwusds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwusds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwusds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwusds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwusds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwusds_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwuud_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwuud_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwuud_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwuud_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwuud_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwuud_epi32(__A, __B, __C);
 }
 
 __m128i test_mm_dpwuuds_epi32(__m128i __A, __m128i __B, __m128i __C) {
   // CHECK-LABEL: test_mm_dpwuuds_epi32
-  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}})
+  // CHECK: call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}})
   return _mm_dpwuuds_epi32(__A, __B, __C);
 }
 
 __m256i test_mm256_dpwuuds_epi32(__m256i __A, __m256i __B, __m256i __C) {
   // CHECK-LABEL: test_mm256_dpwuuds_epi32
-  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}})
+  // CHECK: call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_dpwuuds_epi32(__A, __B, __C);
 }
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1dd23f60c7e1e..ec80ba3e1ee81 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -1893,29 +1893,29 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_vpdpwssd_128 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd128">,
-      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                             llvm_v4i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+                             llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssd_256 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd256">,
-      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                             llvm_v8i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+                             llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssd_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwssd512">,
-      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                             llvm_v16i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+                             llvm_v32i16_ty], [IntrNoMem]>;
 
   def int_x86_avx512_vpdpwssds_128 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds128">,
-      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                             llvm_v4i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v8i16_ty,
+                             llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssds_256 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds256">,
-      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                             llvm_v8i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v16i16_ty,
+                             llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx512_vpdpwssds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwssds512">,
-      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                             llvm_v16i32_ty], [IntrNoMem]>;
+      DefaultAttrsIntrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v32i16_ty,
+                             llvm_v32i16_ty], [IntrNoMem]>;
 
   def int_x86_avx2_vpdpbssd_128 : ClangBuiltin<"__builtin_ia32_vpdpbssd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
@@ -1980,62 +1980,62 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx2_vpdpwsud_128 : ClangBuiltin<"__builtin_ia32_vpdpwsud128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwsud_256 : ClangBuiltin<"__builtin_ia32_vpdpwsud256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwsuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwsuds128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwsuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwsuds256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwusd_128 : ClangBuiltin<"__builtin_ia32_vpdpwusd128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwusd_256 : ClangBuiltin<"__builtin_ia32_vpdpwusd256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwusds_128 : ClangBuiltin<"__builtin_ia32_vpdpwusds128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwusds_256 : ClangBuiltin<"__builtin_ia32_vpdpwusds256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwuud_128 : ClangBuiltin<"__builtin_ia32_vpdpwuud128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwuud_256 : ClangBuiltin<"__builtin_ia32_vpdpwuud256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwuuds_128 : ClangBuiltin<"__builtin_ia32_vpdpwuuds128">,
       DefaultAttrsIntrinsic<[llvm_v4i32_ty],
-                            [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
+                            [llvm_v4i32_ty, llvm_v8i16_ty, llvm_v8i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx2_vpdpwuuds_256 : ClangBuiltin<"__builtin_ia32_vpdpwuuds256">,
       DefaultAttrsIntrinsic<[llvm_v8i32_ty],
-                            [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty],
+                            [llvm_v8i32_ty, llvm_v16i16_ty, llvm_v16i16_ty],
                             [IntrNoMem]>;
 }
 
@@ -5031,32 +5031,32 @@ let TargetPrefix = "x86" in {
   def int_x86_avx10_vpdpwsud_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwsud512">,
       DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwsuds_512 :
       ClangBuiltin<"__builtin_ia32_vpdpwsuds512">,
      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwusd_512 :
      ClangBuiltin<"__builtin_ia32_vpdpwusd512">,
      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwusds_512 :
      ClangBuiltin<"__builtin_ia32_vpdpwusds512">,
      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwuud_512 :
      ClangBuiltin<"__builtin_ia32_vpdpwuud512">,
      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
   def int_x86_avx10_vpdpwuuds_512 :
      ClangBuiltin<"__builtin_ia32_vpdpwuuds512">,
      DefaultAttrsIntrinsic<[llvm_v16i32_ty],
-                            [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty],
+                            [llvm_v16i32_ty, llvm_v32i16_ty, llvm_v32i16_ty],
                             [IntrNoMem]>;
 
   // VMPSADBW
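With the TableGen change above, the canonical IR signature of every word multiply-add intrinsic now takes i16-element multiplicands while the accumulator keeps its i32 element type. A minimal sketch of a new-style call at the 128-bit width (the function name here is illustrative only, not part of the patch):

```
; Sketch, assuming the post-patch declarations: the accumulator stays
; <4 x i32>, and both multiplicand operands are now honest <8 x i16> vectors.
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>)

define <4 x i32> @dot_accumulate_example(<4 x i32> %acc, <8 x i16> %a, <8 x i16> %b) {
  %r = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %acc, <8 x i16> %a, <8 x i16> %b)
  ret <4 x i32> %r
}
```

Old bitcode that still uses the i32-vector signatures is handled by the auto-upgrader, as the next file shows.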
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 487db134b0df3..e67f1ecd96bb1 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -125,6 +125,24 @@ static bool upgradeX86MultiplyAddBytes(Function *F, Intrinsic::ID IID,
   return true;
 }
 
+// Upgrade the declaration of multiply-and-add-words intrinsics whose input
+// argument types have changed from vectors of i32 to vectors of i16.
+static bool upgradeX86MultiplyAddWords(Function *F, Intrinsic::ID IID,
+                                       Function *&NewFn) {
+  // Check whether the input argument types are already vectors of i16.
+  Type *Arg1Type = F->getFunctionType()->getParamType(1);
+  Type *Arg2Type = F->getFunctionType()->getParamType(2);
+  if (Arg1Type->isVectorTy() &&
+      cast<FixedVectorType>(Arg1Type)->getElementType()->isIntegerTy(16) &&
+      Arg2Type->isVectorTy() &&
+      cast<FixedVectorType>(Arg2Type)->getElementType()->isIntegerTy(16))
+    return false;
+
+  rename(F);
+  NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID);
+  return true;
+}
+
 static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID,
                                     Function *&NewFn) {
   if (F->getReturnType()->getScalarType()->isBFloatTy())
@@ -590,43 +608,89 @@ static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name,
               .Default(Intrinsic::not_intrinsic);
       if (ID != Intrinsic::not_intrinsic)
         return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.starts_with("vpdpwssd.") ||
+               Name.starts_with("vpdpwssds.")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("vpdpwssd.128", Intrinsic::x86_avx512_vpdpwssd_128)
+               .Case("vpdpwssd.256", Intrinsic::x86_avx512_vpdpwssd_256)
+               .Case("vpdpwssd.512", Intrinsic::x86_avx512_vpdpwssd_512)
+               .Case("vpdpwssds.128", Intrinsic::x86_avx512_vpdpwssds_128)
+               .Case("vpdpwssds.256", Intrinsic::x86_avx512_vpdpwssds_256)
+               .Case("vpdpwssds.512", Intrinsic::x86_avx512_vpdpwssds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
     }
     return false; // No other 'x86.avx512.*'.
   }
 
-  if (Name.consume_front("avx2.vpdpb")) {
-    // Added in 21.1
-    ID = StringSwitch<Intrinsic::ID>(Name)
-             .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128)
-             .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256)
-             .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128)
-             .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256)
-             .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128)
-             .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256)
-             .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128)
-             .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256)
-             .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128)
-             .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256)
-             .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128)
-             .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256)
-             .Default(Intrinsic::not_intrinsic);
-    if (ID != Intrinsic::not_intrinsic)
-      return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+  if (Name.consume_front("avx2.")) {
+    if (Name.consume_front("vpdpb")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("ssd.128", Intrinsic::x86_avx2_vpdpbssd_128)
+               .Case("ssd.256", Intrinsic::x86_avx2_vpdpbssd_256)
+               .Case("ssds.128", Intrinsic::x86_avx2_vpdpbssds_128)
+               .Case("ssds.256", Intrinsic::x86_avx2_vpdpbssds_256)
+               .Case("sud.128", Intrinsic::x86_avx2_vpdpbsud_128)
+               .Case("sud.256", Intrinsic::x86_avx2_vpdpbsud_256)
+               .Case("suds.128", Intrinsic::x86_avx2_vpdpbsuds_128)
+               .Case("suds.256", Intrinsic::x86_avx2_vpdpbsuds_256)
+               .Case("uud.128", Intrinsic::x86_avx2_vpdpbuud_128)
+               .Case("uud.256", Intrinsic::x86_avx2_vpdpbuud_256)
+               .Case("uuds.128", Intrinsic::x86_avx2_vpdpbuuds_128)
+               .Case("uuds.256", Intrinsic::x86_avx2_vpdpbuuds_256)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.consume_front("vpdpw")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("sud.128", Intrinsic::x86_avx2_vpdpwsud_128)
+               .Case("sud.256", Intrinsic::x86_avx2_vpdpwsud_256)
+               .Case("suds.128", Intrinsic::x86_avx2_vpdpwsuds_128)
+               .Case("suds.256", Intrinsic::x86_avx2_vpdpwsuds_256)
+               .Case("usd.128", Intrinsic::x86_avx2_vpdpwusd_128)
+               .Case("usd.256", Intrinsic::x86_avx2_vpdpwusd_256)
+               .Case("usds.128", Intrinsic::x86_avx2_vpdpwusds_128)
+               .Case("usds.256", Intrinsic::x86_avx2_vpdpwusds_256)
+               .Case("uud.128", Intrinsic::x86_avx2_vpdpwuud_128)
+               .Case("uud.256", Intrinsic::x86_avx2_vpdpwuud_256)
+               .Case("uuds.128", Intrinsic::x86_avx2_vpdpwuuds_128)
+               .Case("uuds.256", Intrinsic::x86_avx2_vpdpwuuds_256)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
+    }
     return false; // No other 'x86.avx2.*'
   }
 
-  if (Name.consume_front("avx10.vpdpb")) {
-    // Added in 21.1
-    ID = StringSwitch<Intrinsic::ID>(Name)
-             .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512)
-             .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512)
-             .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512)
-             .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512)
-             .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512)
-             .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512)
-             .Default(Intrinsic::not_intrinsic);
-    if (ID != Intrinsic::not_intrinsic)
-      return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+  if (Name.consume_front("avx10.")) {
+    if (Name.consume_front("vpdpb")) {
+      // Added in 21.1
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("ssd.512", Intrinsic::x86_avx10_vpdpbssd_512)
+               .Case("ssds.512", Intrinsic::x86_avx10_vpdpbssds_512)
+               .Case("sud.512", Intrinsic::x86_avx10_vpdpbsud_512)
+               .Case("suds.512", Intrinsic::x86_avx10_vpdpbsuds_512)
+               .Case("uud.512", Intrinsic::x86_avx10_vpdpbuud_512)
+               .Case("uuds.512", Intrinsic::x86_avx10_vpdpbuuds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddBytes(F, ID, NewFn);
+    } else if (Name.consume_front("vpdpw")) {
+      ID = StringSwitch<Intrinsic::ID>(Name)
+               .Case("sud.512", Intrinsic::x86_avx10_vpdpwsud_512)
+               .Case("suds.512", Intrinsic::x86_avx10_vpdpwsuds_512)
+               .Case("usd.512", Intrinsic::x86_avx10_vpdpwusd_512)
+               .Case("usds.512", Intrinsic::x86_avx10_vpdpwusds_512)
+               .Case("uud.512", Intrinsic::x86_avx10_vpdpwuud_512)
+               .Case("uuds.512", Intrinsic::x86_avx10_vpdpwuuds_512)
+               .Default(Intrinsic::not_intrinsic);
+      if (ID != Intrinsic::not_intrinsic)
+        return upgradeX86MultiplyAddWords(F, ID, NewFn);
+    }
     return false; // No other 'x86.avx10.*'
   }
 
@@ -4315,6 +4379,32 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
 
     Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
                      CI->getArgOperand(2)};
+
+    // The input argument types were previously declared incorrectly as
+    // vectors of i32, but they should be vectors of i16. Insert bitcasts
+    // when encountering the old types.
+    if (Args[1]->getType()->isVectorTy() &&
+        cast<FixedVectorType>(Args[1]->getType())
+            ->getElementType()
+            ->isIntegerTy(32) &&
+        Args[2]->getType()->isVectorTy() &&
+        cast<FixedVectorType>(Args[2]->getType())
+            ->getElementType()
+            ->isIntegerTy(32)) {
+      Type *NewArgType = nullptr;
+      if (VecWidth == 128)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 8, false);
+      else if (VecWidth == 256)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 16, false);
+      else if (VecWidth == 512)
+        NewArgType = VectorType::get(Builder.getInt16Ty(), 32, false);
+      else
+        llvm_unreachable("Unexpected vector bit width");
+
+      Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+      Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+    }
+
     Rep = Builder.CreateIntrinsic(IID, Args);
     Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType())
                                : CI->getArgOperand(0);
@@ -5390,6 +5480,39 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) {
     NewCall = Builder.CreateCall(NewFn, Args);
     break;
   }
+  case Intrinsic::x86_avx512_vpdpwssd_128:
+  case Intrinsic::x86_avx512_vpdpwssd_256:
+  case Intrinsic::x86_avx512_vpdpwssd_512:
+  case Intrinsic::x86_avx512_vpdpwssds_128:
+  case Intrinsic::x86_avx512_vpdpwssds_256:
+  case Intrinsic::x86_avx512_vpdpwssds_512:
+  case Intrinsic::x86_avx2_vpdpwsud_128:
+  case Intrinsic::x86_avx2_vpdpwsud_256:
+  case Intrinsic::x86_avx10_vpdpwsud_512:
+  case Intrinsic::x86_avx2_vpdpwsuds_128:
+  case Intrinsic::x86_avx2_vpdpwsuds_256:
+  case Intrinsic::x86_avx10_vpdpwsuds_512:
+  case Intrinsic::x86_avx2_vpdpwusd_128:
+  case Intrinsic::x86_avx2_vpdpwusd_256:
+  case Intrinsic::x86_avx10_vpdpwusd_512:
+  case Intrinsic::x86_avx2_vpdpwusds_128:
+  case Intrinsic::x86_avx2_vpdpwusds_256:
+  case Intrinsic::x86_avx10_vpdpwusds_512:
+  case Intrinsic::x86_avx2_vpdpwuud_128:
+  case Intrinsic::x86_avx2_vpdpwuud_256:
+  case Intrinsic::x86_avx10_vpdpwuud_512:
+  case Intrinsic::x86_avx2_vpdpwuuds_128:
+  case Intrinsic::x86_avx2_vpdpwuuds_256:
+  case Intrinsic::x86_avx10_vpdpwuuds_512: {
+    unsigned NumElts = CI->getType()->getPrimitiveSizeInBits() / 16;
+    Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2)};
+    Type *NewArgType = VectorType::get(Builder.getInt16Ty(), NumElts, false);
+    Args[1] = Builder.CreateBitCast(Args[1], NewArgType);
+    Args[2] = Builder.CreateBitCast(Args[2], NewArgType);
+
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
+  }
   }
   assert(NewCall && "Should have either set this variable or returned through "
                     "the default case");
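The net effect of the upgrader on old bitcode is a pair of bitcasts in front of the rewritten call. A rough before/after sketch for a 256-bit vpdpwssd call (value names are illustrative, not taken from the patch):

```
; Before upgrade (old-style i32-vector multiplicands):
;   %r = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %acc, <8 x i32> %a, <8 x i32> %b)
; After upgrade, the multiplicands are bitcast to the new element type:
;   %a.cast = bitcast <8 x i32> %a to <16 x i16>
;   %b.cast = bitcast <8 x i32> %b to <16 x i16>
;   %r = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %acc, <16 x i16> %a.cast, <16 x i16> %b.cast)
```

Since the casts are pure reinterpretations of the 256 bits, the upgraded call computes the same result as the original.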
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index b38e305df0ce4..32ee16c89b4fe 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -5896,52 +5896,118 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     //
     // Multiply and Add Signed Word Integers
     //   < 4 x i32> @llvm.x86.avx512.vpdpwssd.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpwssd.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
     //   <16 x i32> @llvm.x86.avx512.vpdpwssd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
     //
     // Multiply and Add Signed Word Integers With Saturation
     //   < 4 x i32> @llvm.x86.avx512.vpdpwssds.128
-    //                  (< 4 x i32>, < 4 x i32>, < 4 x i32>)
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
     //   < 8 x i32> @llvm.x86.avx512.vpdpwssds.256
-    //                  (< 8 x i32>, < 8 x i32>, < 8 x i32>)
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
     //   <16 x i32> @llvm.x86.avx512.vpdpwssds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Signed and Unsigned Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwsud.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwsud.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwsud.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Signed and Unsigned Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwsuds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwsuds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwsuds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Signed Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwusd.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwusd.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwusd.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Signed Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwusds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwusds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwusds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Unsigned Word Integers
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwuud.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwuud.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwuud.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
+    //
+    // Multiply and Add Unsigned and Unsigned Word Integers With Saturation
+    //   < 4 x i32> @llvm.x86.avx2.vpdpwuuds.128
+    //                  (< 4 x i32>, < 8 x i16>, < 8 x i16>)
+    //   < 8 x i32> @llvm.x86.avx2.vpdpwuuds.256
+    //                  (< 8 x i32>, <16 x i16>, <16 x i16>)
+    //   <16 x i32> @llvm.x86.avx10.vpdpwuuds.512
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>)
     //
     // These intrinsics are auto-upgraded into non-masked forms:
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssd.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssd.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssd.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssd.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //
     //   <4 x i32> @llvm.x86.avx512.mask.vpdpwssds.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <4 x i32> @llvm.x86.avx512.maskz.vpdpwssds.128
-    //                  (<4 x i32>, <4 x i32>, <4 x i32>, i8)
+    //                  (<4 x i32>, <8 x i16>, <8 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.mask.vpdpwssds.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <8 x i32> @llvm.x86.avx512.maskz.vpdpwssds.256
-    //                  (<8 x i32>, <8 x i32>, <8 x i32>, i8)
+    //                  (<8 x i32>, <16 x i16>, <16 x i16>, i8)
     //   <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     //   <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512
-    //                  (<16 x i32>, <16 x i32>, <16 x i32>, i16)
+    //                  (<16 x i32>, <32 x i16>, <32 x i16>, i16)
     case Intrinsic::x86_avx512_vpdpwssd_128:
     case Intrinsic::x86_avx512_vpdpwssd_256:
     case Intrinsic::x86_avx512_vpdpwssd_512:
     case Intrinsic::x86_avx512_vpdpwssds_128:
     case Intrinsic::x86_avx512_vpdpwssds_256:
     case Intrinsic::x86_avx512_vpdpwssds_512:
+    case Intrinsic::x86_avx2_vpdpwsud_128:
+    case Intrinsic::x86_avx2_vpdpwsud_256:
+    case Intrinsic::x86_avx10_vpdpwsud_512:
+    case Intrinsic::x86_avx2_vpdpwsuds_128:
+    case Intrinsic::x86_avx2_vpdpwsuds_256:
+    case Intrinsic::x86_avx10_vpdpwsuds_512:
+    case Intrinsic::x86_avx2_vpdpwusd_128:
+    case Intrinsic::x86_avx2_vpdpwusd_256:
+    case Intrinsic::x86_avx10_vpdpwusd_512:
+    case Intrinsic::x86_avx2_vpdpwusds_128:
+    case Intrinsic::x86_avx2_vpdpwusds_256:
+    case Intrinsic::x86_avx10_vpdpwusds_512:
+    case Intrinsic::x86_avx2_vpdpwuud_128:
+    case Intrinsic::x86_avx2_vpdpwuud_256:
+    case Intrinsic::x86_avx10_vpdpwuud_512:
+    case Intrinsic::x86_avx2_vpdpwuuds_128:
+    case Intrinsic::x86_avx2_vpdpwuuds_256:
+    case Intrinsic::x86_avx10_vpdpwuuds_512:
       handleVectorPmaddIntrinsic(I, /*ReductionFactor=*/2, /*ZeroPurifies=*/true,
                                  /*EltSizeInBits=*/16);
       break;
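As the comment block notes, the masked and maskz forms are auto-upgraded into the non-masked intrinsics, so MemorySanitizer only needs to handle the plain forms. A rough sketch of the shape such an upgraded masked call takes (assumed, based on the mask/select patterns in the tests below; names are illustrative):

```
;   %d    = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %acc, <8 x i16> %a, <8 x i16> %b)
;   %m    = bitcast i8 %mask to <8 x i1>
;   %m.lo = shufflevector <8 x i1> %m, <8 x i1> %m, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
;   %r    = select <4 x i1> %m.lo, <4 x i32> %d, <4 x i32> %acc
```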
diff --git a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
index 76d84c1159ee4..860d60ff0d4e1 100644
--- a/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx10.2-intrinsic-upgrade.ll
@@ -97,3 +97,99 @@ define <16 x i32>@test_int_x86_avx10_vpdpbuuds_512(<16 x i32> %x0, <16 x i32> %x
   %res = call <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
   ret <16 x i32> %res
 }
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsud_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwsuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwsuds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusd_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwusd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwusds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwusds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwusds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuud_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuud_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwuud %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd2,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+
+define <16 x i32>@test_int_x86_avx10_vpdpwuuds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+; X86-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X86:       # %bb.0:
+; X86-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx10_vpdpwuuds_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpdpwuuds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x74,0x48,0xd3,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2)
+  ret <16 x i32> %res
+}
diff --git a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
index a2aad604f19bc..e9c6cb6a19ba4 100644
--- a/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2_512ni-intrinsics.ll
@@ -220,7 +220,7 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8
 
 ; VNNI INT16
 
-define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
+define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
@@ -231,12 +231,12 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr
 ; X64:       # %bb.0:
 ; X64-NEXT:    vpdpwsud (%rdi), %zmm1, %zmm0 # encoding: [0x62,0xf2,0x76,0x48,0xd2,0x07]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %__B = load <16 x i32>, ptr %pB
-  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %__B = load <32 x i16>, ptr %pB
+  %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_mask_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -248,13 +248,13 @@ define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %_
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x49,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W
   ret <16 x i32> %res
 }
 
-define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) {
+define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) {
 ; X86-LABEL: test_mm512_maskz_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04]
@@ -266,14 +266,14 @@ define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %_
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xc9,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B)
+  %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B)
   %bst = bitcast i16 %__U to <16 x i1>
   %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer
   ret <16 x i32> %res
 }
 
-declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>)
-declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>)
+declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>)
 
 define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) {
 ; X86-LABEL: test_mm512_dpwusd_epi32:
diff --git a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
index 1f270d539cdb4..bf7f9375570f9 100644
--- a/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx10_2ni-intrinsics.ll
@@ -334,7 +334,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>)
 
 ; VNNI INT16
 
-define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_mask_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -346,13 +346,13 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x76,0x09,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W
   ret <4 x i32> %res
 }
 
-define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) {
+define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) {
 ; X86-LABEL: test_mm_maskz_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -364,13 +364,13 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0x89,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B)
+  %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B)
   %bst = bitcast i4 %__U to <4 x i1>
   %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer
   ret <4 x i32> %res
 }
 
-define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_maskz_dpwsuds_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -382,13 +382,13 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsuds %ymm2, %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf2,0x76,0x29,0xd3,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W
   ret <8 x i32> %res
 }
 
-define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) {
+define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) {
 ; X86-LABEL: test_mm256_mask_dpwsud_epi32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04]
@@ -400,16 +400,16 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W,
 ; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
 ; X64-NEXT:    vpdpwsud %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x76,0xa9,0xd2,0xc2]
 ; X64-NEXT:    retq # encoding: [0xc3]
-  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B)
+  %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B)
   %bst = bitcast i8 %__U to <8 x i1>
   %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer
   ret <8 x i32> %res
 }
 
-declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>)
-declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>)
+declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>)
 
 define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) {
i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) { ; X86-LABEL: test_mm_mask_dpwusd_epi32: diff --git a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll index b8ebe2a4890a1..ddf0050dbd74a 100644 --- a/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl_vnni-intrinsics.ll @@ -178,18 +178,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpbusds_128(<4 x i32 ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -209,11 +209,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; X64-NEXT: vpdpwssd %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x52,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <16 x i16>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -221,18 +221,18 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %1 = 
call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -252,12 +252,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; X64-NEXT: vpdpwssd %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x52,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer @@ -266,18 +266,18 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ret { <4 x i32>, <4 x i32> } %res2 } -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %1 } -define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ptr %x2p, <8 x i32> %x4, i8 %x3) { +define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, ptr %x2p, <16 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_256: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] @@ -297,11 +297,11 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; X64-NEXT: vpdpwssds %ymm2, %ymm1, %ymm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x53,0xda] ; X64-NEXT: vmovdqa %ymm3, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <8 x i32>, ptr %x2p - %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %x2 = load <16 x i16>, ptr %x2p + %1 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x
i32> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x0 - %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x4) + %4 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %6 = select <8 x i1> %5, <8 x i32> %4, <8 x i32> zeroinitializer %res1 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %3, 0 @@ -309,9 +309,9 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ret { <8 x i32>, <8 x i32> } %res2 } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p) { +define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p) { ; X86-LABEL: test_int_x86_avx512_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] @@ -322,12 +322,12 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; X64: # %bb.0: ; X64-NEXT: vpdpwssds (%rdi), %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x53,0x07] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %1 } -define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ptr %x2p, <4 x i32> %x4, i8 %x3) { +define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, ptr %x2p, <8 x i16> %x4, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_128: ; X86: # %bb.0: ; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] @@ -347,12 +347,12 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; X64-NEXT: vpdpwssds %xmm2, %xmm1, %xmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x53,0xda] ; X64-NEXT: vmovdqa %xmm3, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <4 x i32>, ptr %x2p - %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %x2 = load <8 x i16>, ptr %x2p + %1 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x0 - %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x4) + %4 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x4) %5 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %6 = select <4 x i1> %extract1, <4 x i32> %4, <4 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll index 63ff88a7fa4ae..2aabfab1c8666 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics-upgrade.ll @@ -102,21 +102,39 @@ define { <16 x i32>, <16 x i32> }
@test_int_x86_avx512_maskz_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res3 } -declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { -; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: +define <16 x i32>@test_int_x86_avx512_vpdpwssd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x52,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: +; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] @@ -125,7 +143,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssd_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] @@ -141,21 +159,39 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res3 } -declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) -declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x
i32>) define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + %res = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) ret <16 x i32> %res } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +declare <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +declare <16 x i32> @llvm.x86.avx512.maskz.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) + +define <16 x i32>@test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: ; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x53,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_maskz_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: +; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] @@ -164,7 +200,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X86-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: +; X64-LABEL: test_int_x86_avx512_maskz_vpdpwssds_512: ; X64: # %bb.0: ; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] diff --git a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll index 60d0298e057f3..e97b8a5c5503f 100644 --- a/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-intrinsics.ll @@ -86,18 +86,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpbusds_512(<16 x ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_vpdpwssd_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 # encoding:
[0x62,0xf2,0x75,0x48,0x52,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssd_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -116,11 +116,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; X64-NEXT: vpdpwssd %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x52,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 @@ -128,18 +128,18 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ret { <16 x i32>, <16 x i32> } %res2 } -declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) { ; CHECK-LABEL: test_int_x86_avx512_ask_vpdpwssds_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssds %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x53,0xc2] ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) ret <16 x i32> %1 } -define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <16 x i32> %x1, ptr %x2p, <16 x i32> %x4, i16 %x3) { +define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x i32> %x0, <32 x i16> %x1, ptr %x2p, <32 x i16> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpdpwssds_512: ; X86: # %bb.0: ; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] @@ -158,11 +158,11 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; X64-NEXT: vpdpwssds %zmm2, %zmm1, %zmm3 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x53,0xda] ; X64-NEXT: vmovdqa64 %zmm3, %zmm1 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xcb] ; X64-NEXT: retq # encoding: [0xc3] - %x2 = load <16 x i32>, ptr %x2p - %1 = call <16 x i32> 
@llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + %x2 = load <32 x i16>, ptr %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x0 - %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) + %4 = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> %x0, <32 x i16> %x1, <32 x i16> %x4) %5 = bitcast i16 %x3 to <16 x i1> %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer %res1 = insertvalue { <16 x i32>, <16 x i32> } poison, <16 x i32> %3, 0 diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll index 0f4a4f27b9715..f359ecef8ceb3 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics-upgrade.ll @@ -45,3 +45,47 @@ define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 %res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) ret <8 x i32> %res } + +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} + +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) + +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %res +} + +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) + +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx_vpdpwssds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll index de8b2a41bf8c8..5748a426c76c3 100644 --- a/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx_vnni-intrinsics.ll @@ -68,9 +68,9 @@ define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <16 x 
i8> %x1, <1 ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2] @@ -80,13 +80,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2] @@ -96,13 +96,13 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256: ; AVXVNNI: # %bb.0: ; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2] @@ -112,13 +112,13 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + %res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <16 x i16> %x1, <16 x i16> %x2) ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) -define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) { ; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128: ; AVXVNNI: # %bb.0: ; 
AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2] @@ -128,6 +128,6 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; AVX512VNNI: # %bb.0: ; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2] ; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + %res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <8 x i16> %x1, <8 x i16> %x2) ret <4 x i32> %res } diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll new file mode 100644 index 0000000000000..abdc296ae1e1c --- /dev/null +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics-upgrade.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 +; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 + +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> 
@llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwsuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusd_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwusds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> 
@llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuud_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_128: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + ret <4 x i32> %ret +} +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX10-LABEL: test_int_x86_avx2_vpdpwuuds_256: +; AVX10: # %bb.0: +; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] +; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + ret <8 x i32> %ret +} +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) diff --git a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll index abdc296ae1e1c..7576b12645bd0 100644 --- a/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avxvnniint16-intrinsics.ll @@ -4,7 +4,7 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 ; RUN: llc < %s -verify-machineinstrs -mtriple=i686-unknown-unknown --show-mc-encoding -mattr=+avx10.2 | FileCheck %s --check-prefix=AVX10 -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> 
%C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd2,0xc2] @@ -14,12 +14,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd2,0xc2] @@ -29,12 +29,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x72,0xd3,0xc2] @@ -44,12 +44,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x76,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x76,0xd3,0xc2] @@ -59,12 +59,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwsuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x76,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x 
i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd2,0xc2] @@ -74,12 +74,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0xd2,0xc2] @@ -89,12 +89,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xd3,0xc2] @@ -104,12 +104,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwusds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: 
[0xc4,0xe2,0x75,0xd3,0xc2] @@ -119,12 +119,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwusds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd2,0xc2] @@ -134,12 +134,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd2,0xc2] @@ -149,12 +149,12 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuud %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd2,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x70,0xd3,0xc2] @@ -164,12 +164,12 @@ define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x74,0x08,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> 
@llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwuuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x74,0xd3,0xc2] @@ -179,7 +179,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; AVX10: # %bb.0: ; AVX10-NEXT: vpdpwuuds %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x74,0x28,0xd3,0xc2] ; AVX10-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll index cd576b19f8766..345fa0efa42df 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnni.ll @@ -4,16 +4,16 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-unknown" -declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <16 x i16>, <16 x i16>) declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <16 x i8>, <16 x i8>) declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -23,11 +23,11 @@ define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: 
stack_fold_vpdpwssd_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -37,11 +37,11 @@ define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -51,11 +51,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32 ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -65,11 +65,11 @@ define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, ; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) { +define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -79,11 +79,11 @@ define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> % ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) ret <4 x i32> %2 } -define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> 
%a2) { +define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <8 x i16> %a1, <8 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -93,11 +93,11 @@ define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1) + %2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <8 x i16> %a2, <8 x i16> %a1) ret <4 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -107,11 +107,11 @@ define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i3 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) ret <8 x i32> %2 } -define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) { +define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <16 x i16> %a1, <16 x i16> %a2) { ; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -121,7 +121,7 @@ define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1 ; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1) + %2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <16 x i16> %a2, <16 x i16> %a1) ret <8 x i32> %2 } diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll index 534352f322001..47537c8c4fb6e 100644 --- a/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll +++ b/llvm/test/CodeGen/X86/stack-folding-int-avxvnniint16.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -O3 -disable-peephole -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+avxvnniint16 | FileCheck %s -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 
x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) @@ -14,7 +14,7 @@ declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -26,11 +26,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd2,0x44,0x24,0xe8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsud_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -42,11 +42,11 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd2,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_128: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -58,11 +58,11 @@ define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x72,0xd3,0x44,0x24,0xe8] ; CHECK-NEXT: 
retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) { ; CHECK-LABEL: test_int_x86_avx2_vpdpwsuds_256: ; CHECK: # %bb.0: ; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -74,7 +74,7 @@ define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 ; CHECK-NEXT: # encoding: [0xc4,0xe2,0x76,0xd3,0x44,0x24,0xd8] ; CHECK-NEXT: retq # encoding: [0xc3] %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll index 8900085af030d..9d5cabca5fef8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2_512ni-intrinsics.ll @@ -497,12 +497,12 @@ declare <16 x i32> @llvm.x86.avx10.vpdpbuud.512(<16 x i32>, <64 x i8>, <64 x i8> declare <16 x i32> @llvm.x86.avx10.vpdpbuuds.512(<16 x i32>, <64 x i8>, <64 x i8>) -define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwsud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -510,87 +510,123 @@ define <16 x i32> @test_mm512_dpwsud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 
+; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) -; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = or <16 x i32> [[TMP21]], [[TMP4]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) +; CHECK-NEXT: store <16 x i32> [[TMP22]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwsuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwsuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: 
[[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = and <32 x i1> [[TMP19]], [[TMP20]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP19]], [[TMP22]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP23]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> [[TMP1]] ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], [[__W]] -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], [[TMP1]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> [[__W]] ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwsud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwsud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; 
CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP23:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = and <32 x i1> [[TMP20]], [[TMP21]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP22]], [[TMP21]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP20]], [[TMP23]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP24]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = or <16 x i32> [[TMP17]], [[TMP19]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> -; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[TMP18]], <16 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = xor <16 x i32> [[DPI]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[_MSPROP1]] +; CHECK-NEXT: [[TMP8:%.*]] = or <16 x i32> [[TMP7]], [[TMP18]] ; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i32> [[TMP8]], zeroinitializer ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP5]], <16 x i32> [[TMP9]], <16 x i32> [[TMP6]] ; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[DPI]], <16 x i32> zeroinitializer ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwsuds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr 
%pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwusd_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -598,33 +634,57 @@ define <16 x i32> @test_mm512_dpwusd_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x 
i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwusds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -636,23 +696,35 @@ define <16 x i32> @test_mm512_mask_dpwusds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x 
i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwusd_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -664,21 +736,21 @@ define <16 x i32> @test_mm512_maskz_dpwusd_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <16 x i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwusd.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> 
@llvm.x86.avx10.vpdpwusds.512(<16 x i32>, <32 x i16>, <32 x i16>) -define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr %pB) sanitize_memory { +define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <32 x i16> %__A, ptr %pB) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_dpwuud_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], ptr [[PB:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = load i64, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() ; CHECK-NEXT: [[_MSCMP:%.*]] = icmp ne i64 [[TMP1]], 0 ; CHECK-NEXT: br i1 [[_MSCMP]], label %[[BB4:.*]], label %[[BB5:.*]], !prof [[PROF1]] @@ -686,33 +758,57 @@ define <16 x i32> @test_mm512_dpwuud_epi32(<16 x i32> %__W, <16 x i32> %__A, ptr ; CHECK-NEXT: call void @__msan_warning_noreturn() #[[ATTR5]] ; CHECK-NEXT: unreachable ; CHECK: [[BB5]]: -; CHECK-NEXT: [[__B:%.*]] = load <16 x i32>, ptr [[PB]], align 64 +; CHECK-NEXT: [[__B:%.*]] = load <32 x i16>, ptr [[PB]], align 64 ; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint ptr [[PB]] to i64 ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr -; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP8]], align 64 -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[_MSLD]] -; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[_MSLD:%.*]] = load <32 x i16>, ptr [[TMP8]], align 64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[_MSLD]], zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = and <32 x i1> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = and <32 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP15:%.*]] = and <32 x i1> [[TMP9]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = or <32 x i1> [[TMP13]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = or <32 x i1> [[TMP16]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = sext <32 x i1> [[TMP17]] to <32 x i16> +; CHECK-NEXT: [[TMP19:%.*]] = bitcast <32 x i16> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i32> [[TMP19]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = sext <16 x i1> [[TMP20]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP21]], [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %__B = load <16 x i32>, ptr %pB - %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %__B = load <32 x i16>, ptr %pB + %res = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, 
<32 x i16> %__B) ret <16 x i32> %res } -define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %__U, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_mask_dpwuuds_epi32( -; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <16 x i32> [[__W:%.*]], i16 zeroext [[__U:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP1]] @@ -724,23 +820,35 @@ define <16 x i32> @test_mm512_mask_dpwuuds_epi32(<16 x i32> %__W, i16 zeroext %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> %__W ret <16 x i32> %res } -define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext 
%__U, <16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) sanitize_memory { +define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <16 x i32> @test_mm512_maskz_dpwuud_epi32( -; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <16 x i32> [[__A:%.*]], <16 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i16 zeroext [[__U:%.*]], <16 x i32> [[__W:%.*]], <32 x i16> [[__A:%.*]], <32 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <32 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 136), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <16 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <16 x i32> [[__A]], <16 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <32 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <32 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <32 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <32 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <32 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <32 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <32 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <32 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <32 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <32 x i1> [[TMP13]] to <32 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <32 x i16> [[TMP14]] to <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <16 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <16 x i1> [[TMP16]] to <16 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> [[__W]], <32 x i16> [[__A]], <32 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i16 [[__U]] to <16 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <16 x i1> [[BST]], <16 x i32> [[_MSPROP1]], <16 x i32> zeroinitializer @@ -752,14 +860,14 @@ define <16 x i32> @test_mm512_maskz_dpwuud_epi32(i16 zeroext %__U, <16 x i32> %_ ; CHECK-NEXT: store <16 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[RES]] ; - %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <16 x i32> %__A, <16 x i32> %__B) + %dpi = tail call <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32> %__W, <32 x i16> %__A, <32 x i16> %__B) %bst = bitcast i16 %__U to <16 x i1> %res = select <16 x i1> %bst, <16 x i32> %dpi, <16 x i32> zeroinitializer ret <16 x i32> %res } -declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <16 x i32>, <16 x i32>) -declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <16 x 
i32>, <16 x i32>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuud.512(<16 x i32>, <32 x i16>, <32 x i16>) +declare <16 x i32> @llvm.x86.avx10.vpdpwuuds.512(<16 x i32>, <32 x i16>, <32 x i16>) define { <32 x i16>, <32 x i16>, <32 x i16> } @test_mm512_mask_mpsadbw(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x3, i32 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll index def7ba3f10770..6a53a1595271e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx10_2ni-intrinsics.ll @@ -739,17 +739,29 @@ declare <8 x i32> @llvm.x86.avx2.vpdpbuud.256(<8 x i32>, <32 x i8>, <32 x i8>) declare <8 x i32> @llvm.x86.avx2.vpdpbuuds.256(<8 x i32>, <32 x i8>, <32 x i8>) -define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwsud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 
[[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -761,23 +773,35 @@ define <4 x i32> @test_mm_mask_dpwsud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwsuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -789,23 +813,35 @@ define <4 x i32> @test_mm_maskz_dpwsuds_epi32(i4 
zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwsuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -817,23 +853,35 @@ define <8 x i32> @test_mm256_maskz_dpwsuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; 
- %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwsud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -845,28 +893,40 @@ define <8 x i32> @test_mm256_mask_dpwsud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> 
@llvm.x86.avx2.vpdpwsud.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwusd_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 
x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -878,23 +938,35 @@ define <4 x i32> @test_mm_mask_dpwusd_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwusds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -906,23 +978,35 @@ define <4 x i32> @test_mm_maskz_dpwusds_epi32(i4 zeroext %__U, 
<4 x i32> %__W, < ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwusds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -934,23 +1018,35 @@ define <8 x i32> @test_mm256_maskz_dpwusds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = 
tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwusd_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -962,28 +1058,40 @@ define <8 x i32> @test_mm256_mask_dpwusd_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> 
%__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32>, <16 x i16>, <16 x i16>) -define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_mask_dpwuud_epi32( -; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[__W:%.*]], i4 zeroext [[__U:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = 
select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> [[TMP1]] @@ -995,23 +1103,35 @@ define <4 x i32> @test_mm_mask_dpwuud_epi32(<4 x i32> %__W, i4 zeroext %__U, <4 ; CHECK-NEXT: store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> %__W ret <4 x i32> %res } -define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) sanitize_memory { +define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_mm_maskz_dpwuuds_epi32( -; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <4 x i32> [[__A:%.*]], <4 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i4 zeroext [[__U:%.*]], <4 x i32> [[__W:%.*]], <8 x i16> [[__A:%.*]], <8 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 24), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i4, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <4 x i32> [[__A]], <4 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <8 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <8 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <8 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <8 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <8 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i1> [[TMP13]] to <8 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <8 x i16> [[TMP14]] to <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <4 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <4 x i1> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[__W]], <8 x i16> [[__A]], <8 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i4 [[TMP4]] to <4 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i4 [[__U]] to <4 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[BST]], <4 x i32> [[_MSPROP1]], <4 x i32> zeroinitializer @@ -1023,23 +1143,35 @@ define <4 x i32> @test_mm_maskz_dpwuuds_epi32(i4 zeroext %__U, <4 x i32> %__W, < ; CHECK-NEXT: 
store <4 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; - %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <4 x i32> %__A, <4 x i32> %__B) + %dpi = tail call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %__W, <8 x i16> %__A, <8 x i16> %__B) %bst = bitcast i4 %__U to <4 x i1> %res = select <4 x i1> %bst, <4 x i32> %dpi, <4 x i32> zeroinitializer ret <4 x i32> %res } -define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_maskz_dpwuuds_epi32( -; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[__W:%.*]], i8 zeroext [[__U:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP1]] @@ -1051,23 +1183,35 @@ define <8 x i32> @test_mm256_maskz_dpwuuds_epi32(<8 x i32> %__W, i8 zeroext %__U ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> 
@llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> %__W ret <8 x i32> %res } -define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) sanitize_memory { +define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, <16 x i16> %__A, <16 x i16> %__B) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_mm256_mask_dpwuud_epi32( -; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <8 x i32> [[__A:%.*]], <8 x i32> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-SAME: i8 zeroext [[__U:%.*]], <8 x i32> [[__W:%.*]], <16 x i16> [[__A:%.*]], <16 x i16> [[__B:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 8), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 40), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 72), align 8 ; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <8 x i32> [[__A]], <8 x i32> [[__B]]) +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <16 x i16> [[__A]], zeroinitializer +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <16 x i16> [[__B]], zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = and <16 x i1> [[TMP18]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP20]], [[TMP19]] +; CHECK-NEXT: [[TMP11:%.*]] = and <16 x i1> [[TMP18]], [[TMP21]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP22]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = or <16 x i1> [[TMP12]], [[TMP11]] +; CHECK-NEXT: [[TMP14:%.*]] = sext <16 x i1> [[TMP13]] to <16 x i16> +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <16 x i16> [[TMP14]] to <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = icmp ne <8 x i32> [[TMP15]], zeroinitializer +; CHECK-NEXT: [[TMP17:%.*]] = sext <8 x i1> [[TMP16]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP17]], [[TMP1]] +; CHECK-NEXT: [[DPI:%.*]] = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[__W]], <16 x i16> [[__A]], <16 x i16> [[__B]]) ; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[BST:%.*]] = bitcast i8 [[__U]] to <8 x i1> ; CHECK-NEXT: [[TMP6:%.*]] = select <8 x i1> [[BST]], <8 x i32> [[_MSPROP1]], <8 x i32> zeroinitializer @@ -1079,16 +1223,16 @@ define <8 x i32> @test_mm256_mask_dpwuud_epi32(i8 zeroext %__U, <8 x i32> %__W, ; CHECK-NEXT: store <8 x i32> [[_MSPROP_SELECT]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; - %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <8 x i32> %__A, <8 x i32> %__B) + %dpi = tail call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %__W, <16 x i16> 
%__A, <16 x i16> %__B) %bst = bitcast i8 %__U to <8 x i1> %res = select <8 x i1> %bst, <8 x i32> %dpi, <8 x i32> zeroinitializer ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <4 x i32>, <4 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <8 x i32>, <8 x i32>) -declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <8 x i32>, <8 x i32>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32>, <8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32>, <16 x i16>, <16 x i16>) +declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32>, <16 x i16>, <16 x i16>) define { <8 x i16>, <8 x i16>, <8 x i16> } @test_mask_mpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) sanitize_memory { diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll index 5e937485ff282..d0daea5e68fea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics-upgrade.ll @@ -528,10 +528,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -546,7 +546,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -574,10 +574,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] 
to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -592,7 +592,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -601,10 +601,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -619,7 +619,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -653,10 +653,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; 
CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -671,7 +671,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -699,10 +699,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -717,7 +717,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -728,10 +728,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 
x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -746,7 +746,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -783,10 +783,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -801,7 +801,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -829,10 +829,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; 
CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -847,7 +847,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -856,10 +856,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -874,7 +874,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -908,10 +908,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp 
ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -926,7 +926,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -954,10 +954,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -972,7 +972,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP2:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP3:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -983,10 +983,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP3]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; 
CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -1001,7 +1001,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP5:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP6:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll index 1d3046804b74f..f2b1e16e3d5c2 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vl_vnni-intrinsics.ll @@ -495,10 +495,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -513,7 +513,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -541,10 +541,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; 
CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -559,7 +559,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -568,10 +568,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -586,7 +586,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssd_256(<8 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -623,10 +623,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = 
bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP22]], zeroinitializer @@ -641,7 +641,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP22]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; @@ -669,10 +669,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -687,7 +687,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -698,10 +698,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: 
[[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -716,7 +716,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssd_128(<4 x i32> ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> @@ -757,10 +757,10 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP22]], zeroinitializer @@ -775,7 +775,7 @@ define <8 x i32>@test_int_x86_avx512_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP22]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[TMP4]] ; @@ -803,10 +803,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <8 x i32>, ptr [[TMP10]], align 32 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <8 x i32> [[_MSLD]] to <16 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <16 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <16 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: 
[[TMP35:%.*]] = icmp ne <16 x i16> [[TMP29]], zeroinitializer @@ -821,7 +821,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <8 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <8 x i1> [[TMP61]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP29]], <16 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[_MSPROP1]], <8 x i32> [[TMP2]] @@ -830,10 +830,10 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <8 x i1> [[TMP12]], <8 x i32> [[TMP17]], <8 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <8 x i1> [[TMP13]], <8 x i32> [[TMP11]], <8 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <8 x i32> [[TMP5]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <8 x i32> [[X4]] to <16 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <16 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <16 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <16 x i16> [[TMP37]], zeroinitializer @@ -848,7 +848,7 @@ define { <8 x i32>, <8 x i32> } @test_int_x86_avx512_mask_vpdpwssds_256(<8 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <8 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <8 x i1> [[TMP52]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <8 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP37]], <16 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <8 x i1> [[TMP21]], <8 x i32> [[_MSPROP3]], <8 x i32> zeroinitializer @@ -896,10 +896,10 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[TMP6]], 87960930222080 ; CHECK-NEXT: [[TMP8:%.*]] = inttoptr i64 [[TMP7]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 16 -; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP11:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP26:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP12:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP13:%.*]] = icmp ne <8 x i16> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = icmp ne <8 x i16> [[TMP12]], zeroinitializer ; 
CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i16> [[TMP26]], zeroinitializer @@ -914,7 +914,7 @@ define <4 x i32>@test_int_x86_avx512_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, ; CHECK-NEXT: [[TMP24:%.*]] = icmp ne <4 x i32> [[TMP23]], zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i1> [[TMP24]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP25]], [[TMP4]] -; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP26]], <8 x i16> [[TMP10]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; @@ -943,10 +943,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <4 x i32>, ptr [[TMP10]], align 16 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <4 x i32> [[_MSLD]] to <8 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <8 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <8 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <8 x i16> [[TMP29]], zeroinitializer @@ -961,7 +961,7 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <4 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <4 x i1> [[TMP61]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP29]], <8 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP2:%.*]] = shufflevector <8 x i1> [[TMP12]], <8 x i1> [[TMP12]], <4 x i32> @@ -972,10 +972,10 @@ define { <4 x i32>, <4 x i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <4 x i1> [[_MSPROP2]], <4 x i32> [[TMP17]], <4 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <4 x i1> [[EXTRACT]], <4 x i32> [[TMP11]], <4 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <4 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <4 x i32> [[X4]] to <8 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <8 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <8 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <8 x i16> [[TMP37]], zeroinitializer @@ -990,7 +990,7 @@ define { <4 x i32>, <4 x 
i32> } @test_int_x86_avx512_mask_vpdpwssds_128(<4 x i32 ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <4 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <4 x i1> [[TMP52]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP4:%.*]] = or <4 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP37]], <8 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i8 [[TMP4]] to <8 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8 [[X3]] to <8 x i1> ; CHECK-NEXT: [[_MSPROP5:%.*]] = shufflevector <8 x i1> [[TMP20]], <8 x i1> [[TMP20]], <4 x i32> diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll index 5c99f8a3a1fb6..4e7598b92abcf 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics-upgrade.ll @@ -270,10 +270,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -288,7 +288,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -316,10 +316,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; 
CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -334,7 +334,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -343,10 +343,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -361,7 +361,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -395,10 +395,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x 
i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -413,7 +413,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssds_512(<16 x i32> %x0, <16 x i32> % ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -441,10 +441,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -459,7 +459,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -468,10 +468,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] 
= bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -486,7 +486,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll index 236ff45c6cd08..0fd60149cc15e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx512vnni-intrinsics.ll @@ -251,10 +251,10 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -269,7 +269,7 @@ define <16 x i32>@test_int_x86_avx512_vpdpwssd_512(<16 x i32> %x0, <16 x i32> %x ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -297,10 +297,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] 
to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -315,7 +315,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -324,10 +324,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -342,7 +342,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssd_512(<16 x i ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer @@ -379,10 +379,10 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 128), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <16 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: 
[[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <16 x i32> [[TMP2]] to <32 x i16> +; CHECK-NEXT: [[TMP22:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <32 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <32 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <32 x i16> [[TMP22]], zeroinitializer @@ -397,7 +397,7 @@ define <16 x i32>@test_int_x86_avx512_ask_vpdpwssds_512(<16 x i32> %x0, <16 x i3 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <16 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <16 x i1> [[TMP19]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP22]], <32 x i16> [[TMP5]]) ; CHECK-NEXT: store <16 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <16 x i32> [[TMP4]] ; @@ -425,10 +425,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP9:%.*]] = xor i64 [[TMP8]], 87960930222080 ; CHECK-NEXT: [[TMP10:%.*]] = inttoptr i64 [[TMP9]] to ptr ; CHECK-NEXT: [[_MSLD:%.*]] = load <16 x i32>, ptr [[TMP10]], align 64 -; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP31:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP32:%.*]] = bitcast <16 x i32> [[_MSLD]] to <32 x i16> +; CHECK-NEXT: [[TMP30:%.*]] = bitcast <16 x i32> [[X2]] to <32 x i16> ; CHECK-NEXT: [[TMP33:%.*]] = icmp ne <32 x i16> [[TMP31]], zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = icmp ne <32 x i16> [[TMP32]], zeroinitializer ; CHECK-NEXT: [[TMP35:%.*]] = icmp ne <32 x i16> [[TMP29]], zeroinitializer @@ -443,7 +443,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne <16 x i32> [[TMP60]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = sext <16 x i1> [[TMP61]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <16 x i32> [[TMP62]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X2]]) +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP29]], <32 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP12:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP13:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[_MSPROP1]], <16 x i32> [[TMP2]] @@ -452,10 +452,10 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP17:%.*]] = or <16 x i32> [[TMP16]], [[TMP2]] ; CHECK-NEXT: [[_MSPROP_SELECT:%.*]] = select <16 x i1> [[TMP12]], <16 x i32> [[TMP17]], <16 x i32> [[TMP14]] ; CHECK-NEXT: [[TMP18:%.*]] = select <16 x i1> [[TMP13]], <16 x i32> [[TMP11]], <16 x i32> [[X0]] -; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> -; 
CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP39:%.*]] = bitcast <16 x i32> [[TMP3]] to <32 x i16> +; CHECK-NEXT: [[TMP37:%.*]] = bitcast <16 x i32> [[X1]] to <32 x i16> ; CHECK-NEXT: [[TMP40:%.*]] = bitcast <16 x i32> [[TMP5]] to <32 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[X4]] to <32 x i16> ; CHECK-NEXT: [[TMP41:%.*]] = icmp ne <32 x i16> [[TMP39]], zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = icmp ne <32 x i16> [[TMP40]], zeroinitializer ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne <32 x i16> [[TMP37]], zeroinitializer @@ -470,7 +470,7 @@ define { <16 x i32>, <16 x i32> } @test_int_x86_avx512_mask_vpdpwssds_512(<16 x ; CHECK-NEXT: [[TMP52:%.*]] = icmp ne <16 x i32> [[TMP51]], zeroinitializer ; CHECK-NEXT: [[TMP53:%.*]] = sext <16 x i1> [[TMP52]] to <16 x i32> ; CHECK-NEXT: [[_MSPROP3:%.*]] = or <16 x i32> [[TMP53]], [[TMP2]] -; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <16 x i32> [[X1]], <16 x i32> [[X4]]) +; CHECK-NEXT: [[TMP19:%.*]] = call <16 x i32> @llvm.x86.avx512.vpdpwssds.512(<16 x i32> [[X0]], <32 x i16> [[TMP37]], <32 x i16> [[TMP38]]) ; CHECK-NEXT: [[TMP20:%.*]] = bitcast i16 [[TMP4]] to <16 x i1> ; CHECK-NEXT: [[TMP21:%.*]] = bitcast i16 [[X3]] to <16 x i1> ; CHECK-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP21]], <16 x i32> [[_MSPROP3]], <16 x i32> zeroinitializer diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll index 0344fbd5ee2a9..0a6c5f7b089ba 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avx_vnni-intrinsics.ll @@ -143,10 +143,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -161,7 +161,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -178,10 +178,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr 
getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -196,7 +196,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; @@ -213,10 +213,10 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <8 x i32> [[TMP2]] to <16 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[X1]] to <16 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <8 x i32> [[TMP3]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i32> [[X2]] to <16 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <16 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <16 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i16> [[TMP4]], zeroinitializer @@ -231,7 +231,7 @@ define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <8 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <8 x i1> [[TMP19]] to <8 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <8 x i32> [[X1]], <8 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> [[X0]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RES]] ; @@ -248,10 +248,10 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP21:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> -; 
CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP6:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16> +; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[X1]] to <8 x i16> ; CHECK-NEXT: [[TMP7:%.*]] = bitcast <4 x i32> [[TMP3]] to <8 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[X2]] to <8 x i16> ; CHECK-NEXT: [[TMP8:%.*]] = icmp ne <8 x i16> [[TMP6]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne <8 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <8 x i16> [[TMP4]], zeroinitializer @@ -266,7 +266,7 @@ define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 ; CHECK-NEXT: [[TMP19:%.*]] = icmp ne <4 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i1> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP20]], [[TMP21]] -; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <4 x i32> [[X1]], <4 x i32> [[X2]]) +; CHECK-NEXT: [[RES:%.*]] = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> [[X0]], <8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RES]] ; diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll index 707b46bb8686e..fd9e0b953d002 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/avxvnniint16-intrinsics.ll @@ -22,218 +22,362 @@ target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" -define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwsud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], 
[[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsud_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> 
[[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwsuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) -; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <4 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) +; CHECK-NEXT: store <4 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } 
-declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwsuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwsuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 +; CHECK-NEXT: [[TMP18:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) -; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i32> [[TMP16]], [[TMP18]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) +; CHECK-NEXT: store <8 x i32> [[TMP17]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwsuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusd_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusd_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> 
[[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusd.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusd_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusd_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; 
CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusd.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwusds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwusds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; 
CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwusds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwusds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwusds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; 
CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwusds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuud_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuud_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuud.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuud_256(<8 x i32> %A, <16 x i16> %B, 
<16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuud_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <16 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]]) ; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <8 x i32> [[RET]] ; - %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) + %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) ret <8 x i32> %ret } -declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) +declare <8 x i32> @llvm.x86.avx2.vpdpwuud.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) -define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) sanitize_memory { +define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <4 x i32> @test_int_x86_avx2_vpdpwuuds_128( -; CHECK-SAME: <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <4 x i32> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 16), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 
16), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <8 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <8 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <8 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <8 x i16> [[C]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = and <8 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = and <8 x i1> [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = or <8 x i1> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = sext <8 x i1> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP14:%.*]] = bitcast <8 x i16> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <4 x i32> [[TMP14]], zeroinitializer +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i1> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[_MSPROP1:%.*]] = or <4 x i32> [[TMP16]], [[TMP1]] +; CHECK-NEXT: [[RET:%.*]] = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> [[A]], <8 x i16> [[B]], <8 x i16> [[C]]) ; CHECK-NEXT: store <4 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8 ; CHECK-NEXT: ret <4 x i32> [[RET]] ; - %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) + %ret = call <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) ret <4 x i32> %ret } -declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) +declare <4 x i32> @llvm.x86.avx2.vpdpwuuds.128(<4 x i32> %A, <8 x i16> %B, <8 x i16> %C) -define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C) sanitize_memory { +define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C) sanitize_memory { ; CHECK-LABEL: define <8 x i32> @test_int_x86_avx2_vpdpwuuds_256( -; CHECK-SAME: <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]]) #[[ATTR0]] { +; CHECK-SAME: <8 x i32> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP3:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <16 x i16>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr @__msan_param_tls, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 32), align 8 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr getelementptr (i8, ptr @__msan_param_tls, i64 64), align 8 ; CHECK-NEXT: call void @llvm.donothing() -; CHECK-NEXT: [[_MSPROP:%.*]] = or <8 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[_MSPROP]], [[TMP3]] -; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> [[C]]) +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <16 x i16> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp ne <16 x i16> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <16 x i16> [[B]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne 
<16 x i16> [[C]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = and <16 x i1> [[TMP4]], [[TMP5]]
+; CHECK-NEXT: [[TMP9:%.*]] = and <16 x i1> [[TMP6]], [[TMP5]]
+; CHECK-NEXT: [[TMP10:%.*]] = and <16 x i1> [[TMP4]], [[TMP7]]
+; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = sext <16 x i1> [[TMP12]] to <16 x i16>
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <16 x i16> [[TMP13]] to <8 x i32>
+; CHECK-NEXT: [[TMP15:%.*]] = icmp ne <8 x i32> [[TMP14]], zeroinitializer
+; CHECK-NEXT: [[TMP16:%.*]] = sext <8 x i1> [[TMP15]] to <8 x i32>
+; CHECK-NEXT: [[_MSPROP1:%.*]] = or <8 x i32> [[TMP16]], [[TMP1]]
+; CHECK-NEXT: [[RET:%.*]] = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> [[A]], <16 x i16> [[B]], <16 x i16> [[C]])
; CHECK-NEXT: store <8 x i32> [[_MSPROP1]], ptr @__msan_retval_tls, align 8
; CHECK-NEXT: ret <8 x i32> [[RET]]
;
- %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+ %ret = call <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)
 ret <8 x i32> %ret
}
-declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C)
+declare <8 x i32> @llvm.x86.avx2.vpdpwuuds.256(<8 x i32> %A, <16 x i16> %B, <16 x i16> %C)

From ef47462ce9f1ade501da011a18869ea2a653d2cb Mon Sep 17 00:00:00 2001
From: Alex Voicu
Date: Tue, 9 Dec 2025 17:11:28 +0000
Subject: [PATCH 28/63] [SPIRV] Start adding support for `int128` (#170798)

LLVM has pretty thorough support for `int128`, and it has started seeing some
use. Even though we already have support for the
`SPV_ALTERA_arbitrary_precision_integers` extension, the backend was oddly
capping integer width to 64 bits. This patch adds partial support for lowering
128-bit integers to `OpTypeInt 128`. Some work remains to be done around
legalisation support and validating constant uses (e.g. cases that get lowered
to `OpSpecConstantOp`).
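For reference, SPIR-V encodes an integer literal wider than 32 bits as a
sequence of 32-bit words, low word first, which is what the `addNumImm` change
below does for the 128-bit case. A minimal standalone sketch of that word
splitting (the helper name and the plain `uint64_t` halves are illustrative
only; the actual code operates on LLVM's `APInt` raw data):

```
#include <array>
#include <cstdint>

// Split a 128-bit value, given as two 64-bit halves, into the four
// 32-bit literal words SPIR-V expects, lowest word first.
static std::array<uint32_t, 4> splitInt128Literal(uint64_t Lo, uint64_t Hi) {
  return {static_cast<uint32_t>(Lo), static_cast<uint32_t>(Lo >> 32),
          static_cast<uint32_t>(Hi), static_cast<uint32_t>(Hi >> 32)};
}
```

This is why the new i128-switch-lower.ll test expects four literals per case:
e.g. `i128 55340232221128654848` (3 << 64) becomes the words `0 0 3 0`.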
--- .../SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp | 8 +++ llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp | 19 +++++- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 20 +++--- llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 3 +- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 15 +++++ llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 7 ++ .../i128-addsub.ll | 67 +++++++++++++++++++ .../i128-arith.ll | 27 ++++++++ .../i128-switch-lower.ll | 27 ++++++++ .../enable-all-extensions-but-one.ll | 1 + 10 files changed, 181 insertions(+), 13 deletions(-) create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll create mode 100644 llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp index 62f5e47c5ea3b..42de884840dc9 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVInstPrinter.cpp @@ -50,6 +50,14 @@ void SPIRVInstPrinter::printOpConstantVarOps(const MCInst *MI, unsigned IsBitwidth16 = MI->getFlags() & SPIRV::INST_PRINTER_WIDTH16; const unsigned NumVarOps = MI->getNumOperands() - StartIndex; + if (MI->getOpcode() == SPIRV::OpConstantI && NumVarOps > 2) { + // SPV_ALTERA_arbitrary_precision_integers allows for integer widths greater + // than 64, which will be encoded via multiple operands. + for (unsigned I = StartIndex; I != MI->getNumOperands(); ++I) + O << ' ' << MI->getOperand(I).getImm(); + return; + } + assert((NumVarOps == 1 || NumVarOps == 2) && "Unsupported number of bits for literal variable"); diff --git a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp index d2a8fddc5d8e4..42edad255ce82 100644 --- a/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVCommandLine.cpp @@ -13,9 +13,14 @@ #include "SPIRVCommandLine.h" #include "MCTargetDesc/SPIRVBaseInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/TargetParser/Triple.h" -#include + +#include #include +#include +#include +#include #define DEBUG_TYPE "spirv-commandline" @@ -176,7 +181,17 @@ bool SPIRVExtensionsParser::parse(cl::Option &O, StringRef ArgName, std::set &Vals) { SmallVector Tokens; ArgValue.split(Tokens, ",", -1, false); - std::sort(Tokens.begin(), Tokens.end()); + llvm::sort(Tokens, [](auto &&LHS, auto &&RHS) { + // We want to ensure that we handle "all" first, to ensure that any + // subsequent disablement actually behaves as expected i.e. given + // --spv-ext=all,-foo, we first enable all and then disable foo; this should + // be revisited and simplified. 
+ if (LHS == "all") + return true; + if (RHS == "all") + return false; + return !(RHS < LHS); + }); std::set EnabledExtensions; diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index 0fb44052527f0..5d96a67500dff 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -151,22 +151,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { } unsigned SPIRVGlobalRegistry::adjustOpTypeIntWidth(unsigned Width) const { - if (Width > 64) - report_fatal_error("Unsupported integer width!"); const SPIRVSubtarget &ST = cast(CurMF->getSubtarget()); if (ST.canUseExtension( SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers) || - ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)) + (Width == 4 && ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4))) return Width; if (Width <= 8) - Width = 8; + return 8; else if (Width <= 16) - Width = 16; + return 16; else if (Width <= 32) - Width = 32; - else - Width = 64; - return Width; + return 32; + else if (Width <= 64) + return 64; + else if (Width <= 128) + return 128; + reportFatalUsageError("Unsupported Integer width!"); } SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(unsigned Width, @@ -413,7 +413,7 @@ Register SPIRVGlobalRegistry::createConstInt(const ConstantInt *CI, MIB = MIRBuilder.buildInstr(SPIRV::OpConstantI) .addDef(Res) .addUse(getSPIRVTypeID(SpvType)); - addNumImm(APInt(BitWidth, CI->getZExtValue()), MIB); + addNumImm(CI->getValue(), MIB); } else { MIB = MIRBuilder.buildInstr(SPIRV::OpConstantNull) .addDef(Res) diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index 71df4cced434e..2078bfee2e767 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -48,6 +48,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); const LLT v16s64 = LLT::fixed_vector(16, 64); const LLT v16s32 = LLT::fixed_vector(16, 32); @@ -307,7 +308,7 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { typeInSet(1, allPtrsScalarsAndVectors))); getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) - .legalFor({s1}) + .legalFor({s1, s128}) .legalFor(allFloatAndIntScalarsAndPtrs) .legalFor(allowedVectorTypes) .moreElementsToNextPow2(0) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 73432279c3306..7717f18d14a34 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1436,6 +1436,21 @@ void addInstrRequirements(const MachineInstr &MI, Reqs.addCapability(SPIRV::Capability::Int16); else if (BitWidth == 8) Reqs.addCapability(SPIRV::Capability::Int8); + else if (BitWidth == 4 && + ST.canUseExtension(SPIRV::Extension::SPV_INTEL_int4)) { + Reqs.addExtension(SPIRV::Extension::SPV_INTEL_int4); + Reqs.addCapability(SPIRV::Capability::Int4TypeINTEL); + } else if (BitWidth != 32) { + if (!ST.canUseExtension( + SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers)) + reportFatalUsageError( + "OpTypeInt type with a width other than 8, 16, 32 or 64 bits " + "requires the following SPIR-V extension: " + "SPV_ALTERA_arbitrary_precision_integers"); + Reqs.addExtension( + SPIRV::Extension::SPV_ALTERA_arbitrary_precision_integers); + 
Reqs.addCapability(SPIRV::Capability::ArbitraryPrecisionIntegersALTERA); + } break; } case SPIRV::OpDot: { diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index d4dd897647cfc..c32ecfb3ef7ac 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -171,6 +171,13 @@ void addNumImm(const APInt &Imm, MachineInstrBuilder &MIB) { // Asm Printer needs this info to print 64-bit operands correctly MIB.getInstr()->setAsmPrinterFlag(SPIRV::ASM_PRINTER_WIDTH64); return; + } else if (Bitwidth <= 128) { + uint32_t LowBits = Imm.getRawData()[0] & 0xffffffff; + uint32_t MidBits0 = (Imm.getRawData()[0] >> 32) & 0xffffffff; + uint32_t MidBits1 = Imm.getRawData()[1] & 0xffffffff; + uint32_t HighBits = (Imm.getRawData()[1] >> 32) & 0xffffffff; + MIB.addImm(LowBits).addImm(MidBits0).addImm(MidBits1).addImm(HighBits); + return; } report_fatal_error("Unsupported constant bitwidth"); } diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll new file mode 100644 index 0000000000000..c90ffdd17996c --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-addsub.ll @@ -0,0 +1,67 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#TestAdd:]] "test_add" +; CHECK: OpName %[[#TestSub:]] "test_sub" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 +; CHECK: %[[#Const64Int128:]] = OpConstant %[[#Int128Ty]] 64 0 0 0 + +; CHECK: %[[#TestAdd]] = OpFunction +define spir_func void @test_add(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) { +entry: +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpIAdd %[[#Int128Ty]] + %tmp1 = zext i64 %AL to i128 + %tmp23 = zext i64 %AH to i128 + %tmp4 = shl i128 %tmp23, 64 + %tmp5 = or i128 %tmp4, %tmp1 + %tmp67 = zext i64 %BL to i128 + %tmp89 = zext i64 %BH to i128 + %tmp11 = shl i128 %tmp89, 64 + %tmp12 = or i128 %tmp11, %tmp67 + %tmp15 = add i128 %tmp12, %tmp5 + %tmp1617 = trunc i128 %tmp15 to i64 + store i64 %tmp1617, ptr %RL + %tmp21 = lshr i128 %tmp15, 64 + %tmp2122 = trunc i128 %tmp21 to i64 + store i64 %tmp2122, ptr %RH + ret void +; CHECK: OpFunctionEnd +} + +; CHECK: %[[#TestSub]] = OpFunction +define spir_func void @test_sub(i64 %AL, i64 %AH, i64 %BL, i64 %BH, ptr %RL, ptr %RH) { +entry: +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftLeftLogical %[[#Int128Ty]] {{%[0-9]+}} %[[#Const64Int128]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: 
{{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpISub %[[#Int128Ty]] + %tmp1 = zext i64 %AL to i128 + %tmp23 = zext i64 %AH to i128 + %tmp4 = shl i128 %tmp23, 64 + %tmp5 = or i128 %tmp4, %tmp1 + %tmp67 = zext i64 %BL to i128 + %tmp89 = zext i64 %BH to i128 + %tmp11 = shl i128 %tmp89, 64 + %tmp12 = or i128 %tmp11, %tmp67 + %tmp15 = sub i128 %tmp5, %tmp12 + %tmp1617 = trunc i128 %tmp15 to i64 + store i64 %tmp1617, ptr %RL + %tmp21 = lshr i128 %tmp15, 64 + %tmp2122 = trunc i128 %tmp21 to i64 + store i64 %tmp2122, ptr %RH + ret void +; CHECK: OpFunctionEnd +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll new file mode 100644 index 0000000000000..d1de5df499566 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-arith.ll @@ -0,0 +1,27 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#Foo:]] "foo" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 + +; CHECK: %[[#Foo]] = OpFunction +define i64 @foo(i64 %x, i64 %y, i32 %amt) { +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpSConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpBitwiseOr %[[#Int128Ty]] +; CHECK: {{.*}} = OpUConvert %[[#Int128Ty]] +; CHECK: {{.*}} = OpShiftRightLogical %[[#Int128Ty]] + %tmp0 = zext i64 %x to i128 + %tmp1 = sext i64 %y to i128 + %tmp2 = or i128 %tmp0, %tmp1 + %tmp7 = zext i32 13 to i128 + %tmp3 = lshr i128 %tmp2, %tmp7 + %tmp4 = trunc i128 %tmp3 to i64 + ret i64 %tmp4 +} diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll new file mode 100644 index 0000000000000..669e9362605a5 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_ALTERA_arbitrary_precision_integers/i128-switch-lower.ll @@ -0,0 +1,27 @@ +; RUN: not llc -O0 -mtriple=spirv64-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR + +; RUN: llc -O0 -verify-machineinstrs -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s +; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-ext=+SPV_ALTERA_arbitrary_precision_integers %s -o - -filetype=obj | spirv-val %} + +; CHECK-ERROR: LLVM ERROR: OpTypeInt type with a width other than 8, 16, 32 or 64 bits requires the following SPIR-V extension: SPV_ALTERA_arbitrary_precision_integers + +; CHECK: OpCapability ArbitraryPrecisionIntegersALTERA +; CHECK: OpExtension "SPV_ALTERA_arbitrary_precision_integers" +; CHECK: OpName %[[#Test:]] "test" +; CHECK: OpName %[[#Exit:]] "exit" +; CHECK: %[[#Int128Ty:]] = OpTypeInt 128 0 +; CHECK: %[[#UndefInt128:]] = OpUndef %[[#Int128Ty]] + +; CHECK: 
%[[#Test]] = OpFunction
+define void @test() {
+entry:
+; CHECK: OpSwitch %[[#UndefInt128]] %[[#Exit]] 0 0 3 0 %[[#Exit]] 0 0 5 0 %[[#Exit]] 0 0 4 0 %[[#Exit]] 0 0 8 0 %[[#Exit]]
+  switch i128 poison, label %exit [
+    i128 55340232221128654848, label %exit
+    i128 92233720368547758080, label %exit
+    i128 73786976294838206464, label %exit
+    i128 147573952589676412928, label %exit
+  ]
+exit:
+  unreachable
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
index 5ddfc85702540..ecf4807a1d5fc 100644
--- a/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
+++ b/llvm/test/CodeGen/SPIRV/extensions/enable-all-extensions-but-one.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=all,-SPV_ALTERA_arbitrary_precision_integers %s -o - | FileCheck %s
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=-SPV_ALTERA_arbitrary_precision_integers,all %s -o - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=KHR %s -o - | FileCheck %s
 ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown --spirv-ext=khr %s -o - | FileCheck %s

From 04a5ee6065fe6f2955fece5e8db3ae95cfbcce1c Mon Sep 17 00:00:00 2001
From: Rana Pratap Reddy <109514914+ranapratap55@users.noreply.github.com>
Date: Tue, 9 Dec 2025 22:46:33 +0530
Subject: [PATCH 29/63] [AMDGPU] Modifies builtin def to take _Float16('x') for both HIP/C++ and for OpenCL (#167652)

For the extended-image amdgcn_image_sample_*/gather4_* builtins, use 'x' in
the builtin defs so that they take _Float16 for both HIP/C++ and OpenCL.

---
 clang/include/clang/Basic/BuiltinsAMDGPU.def | 34 ++++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 0b78b460c0d6a..5b3074a493d4b 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -972,45 +972,45 @@ TARGET_BUILTIN(__builtin_amdgcn_image_sample_3d_v4f16_f32, "V4xifffQtV4ibii", "n
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f32_f32, "V4fifQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4eifQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1d_v4f16_f32, "V4xifQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_1darray_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_f32_f32, "fiffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts")
TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_2darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_3d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_lz_cube_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4eiffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1d_v4f16_f32, "V4xiffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_1darray_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_f32_f32, "fifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_f32_f32, "fiffffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_2darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_3d_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts") -TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts") +TARGET_BUILTIN(__builtin_amdgcn_image_sample_l_cube_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts") TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f32_f32, "V4fifffQtV4ibii", "nc", "extended-image-insts") 
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4eifffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1d_v4f16_f32, "V4xifffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f32_f32, "V4fiffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4eiffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_1darray_v4f16_f32, "V4xiffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_f32_f32, "fiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f32_f32, "V4fiffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4eiffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2d_v4f16_f32, "V4xiffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_f32_f32, "fifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f32_f32, "V4fifffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4eifffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_2darray_v4f16_f32, "V4xifffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f32_f32, "V4fifffffffffQtV4ibii", "nc", "extended-image-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4eifffffffffQtV4ibii", "nc", "extended-image-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_sample_d_3d_v4f16_f32, "V4xifffffffffQtV4ibii", "nc", "extended-image-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_gather4_lz_2d_v4f32_f32, "V4fiffQtV4ibii", "nc", "extended-image-insts")

 #undef BUILTIN

From fa607658a2bdbb5b47e6243b3871d4d6aab09335 Mon Sep 17 00:00:00 2001
From: cs25resch11005-bhuvan
Date: Tue, 9 Dec 2025 22:47:17 +0530
Subject: [PATCH 30/63] [CIR][CIRGen][Builtin][X86] Masked compress Intrinsics (#169582)

Added masked compress builtins in CIR.
Note: This is my first PR to LLVM.
Looking forward to corrections --------- Co-authored-by: bhuvan1527 --- clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 31 +++- .../CodeGenBuiltins/X86/avx512vl-builtins.c | 33 ++++ .../X86/avx512vlvbmi2-builtins.c | 171 ++++++++++++++++++ 3 files changed, 229 insertions(+), 6 deletions(-) create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index fb17e31bf36d6..855134ba2b249 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -151,6 +151,17 @@ computeFullLaneShuffleMask(CIRGenFunction &cgf, const mlir::Value vec, outIndices.resize(numElts); } +static mlir::Value emitX86CompressExpand(CIRGenBuilderTy &builder, + mlir::Location loc, mlir::Value source, + mlir::Value mask, + mlir::Value inputVector, + const std::string &id) { + auto resultTy = cast(mask.getType()); + mlir::Value maskValue = getMaskVecValue( + builder, loc, inputVector, cast(resultTy).getSize()); + return emitIntrinsicCallOp(builder, loc, id, resultTy, + mlir::ValueRange{source, mask, maskValue}); +} static mlir::Value emitX86MaskAddLogic(CIRGenBuilderTy &builder, mlir::Location loc, @@ -712,6 +723,10 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compressstoreqi128_mask: case X86::BI__builtin_ia32_compressstoreqi256_mask: case X86::BI__builtin_ia32_compressstoreqi512_mask: + cgm.errorNYI(expr->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; case X86::BI__builtin_ia32_expanddf128_mask: case X86::BI__builtin_ia32_expanddf256_mask: case X86::BI__builtin_ia32_expanddf512_mask: @@ -729,7 +744,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_expandhi512_mask: case X86::BI__builtin_ia32_expandqi128_mask: case X86::BI__builtin_ia32_expandqi256_mask: - case X86::BI__builtin_ia32_expandqi512_mask: + case X86::BI__builtin_ia32_expandqi512_mask: { + mlir::Location loc = getLoc(expr->getExprLoc()); + return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2], + "x86.avx512.mask.expand"); + } case X86::BI__builtin_ia32_compressdf128_mask: case X86::BI__builtin_ia32_compressdf256_mask: case X86::BI__builtin_ia32_compressdf512_mask: @@ -747,11 +766,11 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_compresshi512_mask: case X86::BI__builtin_ia32_compressqi128_mask: case X86::BI__builtin_ia32_compressqi256_mask: - case X86::BI__builtin_ia32_compressqi512_mask: - cgm.errorNYI(expr->getSourceRange(), - std::string("unimplemented X86 builtin call: ") + - getContext().BuiltinInfo.getName(builtinID)); - return {}; + case X86::BI__builtin_ia32_compressqi512_mask: { + mlir::Location loc = getLoc(expr->getExprLoc()); + return emitX86CompressExpand(builder, loc, ops[0], ops[1], ops[2], + "x86.avx512.mask.compress"); + } case X86::BI__builtin_ia32_gather3div2df: case X86::BI__builtin_ia32_gather3div2di: case X86::BI__builtin_ia32_gather3div4df: diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c index accf1f60d7c32..9ba3e19d41566 100644 --- a/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vl-builtins.c @@ -199,3 +199,36 @@ __m256i test_mm256_mask_i32gather_epi32(__m256i __v1_old, __mmask8 __mask, __m25 // OGCG: 
@llvm.x86.avx512.mask.gather3siv8.si return _mm256_mmask_i32gather_epi32(__v1_old, __mask, __index, __addr, 2); } + +__m128d test_mm_mask_expand_pd(__m128d __W, __mmask8 __U, __m128d __A) { + // CIR-LABEL: _mm_mask_expand_pd + // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int> + // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int> + + // LLVM-LABEL: test_mm_mask_expand_pd + // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> + + // OGCG-LABEL: test_mm_mask_expand_pd + // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> + + return _mm_mask_expand_pd(__W,__U,__A); +} + +__m128d test_mm_maskz_expand_pd(__mmask8 __U, __m128d __A) { + // CIR-LABEL: _mm_maskz_expand_pd + // CIR: %[[MASK:.*]] = cir.cast bitcast {{.*}} : !u8i -> !cir.vector<8 x !cir.int> + // CIR: %[[SHUF:.*]] = cir.vec.shuffle(%[[MASK]], %[[MASK]] : !cir.vector<8 x !cir.int>) [#cir.int<0> : !s32i, #cir.int<1> : !s32i] : !cir.vector<2 x !cir.int> + + // LLVM-LABEL: test_mm_maskz_expand_pd + // LLVM: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // LLVM: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> + + // OGCG-LABEL: test_mm_maskz_expand_pd + // OGCG: %[[BC:.*]] = bitcast i8 %{{.*}} to <8 x i1> + // OGCG: %[[SHUF:.*]] = shufflevector <8 x i1> %[[BC]], <8 x i1> %[[BC]], <2 x i32> + + return _mm_maskz_expand_pd(__U,__A); +} + diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c new file mode 100644 index 0000000000000..964971d71eb6c --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vlvbmi2-builtins.c @@ -0,0 +1,171 @@ + +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-cir -o %t.cir -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c++ -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vlvbmi2 -fclangir -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +#include + + +__m128i test_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_compress_epi16 + // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_compress_epi16 + // 
%[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_mask_compress_epi16(__S, __U, __D); +} + +__m128i test_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK8]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_compress_epi16 + // %[[MASK8:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.compress.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK8]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_maskz_compress_epi16(__U, __D); +} + +__m128i test_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_compress_epi8 + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_mask_compress_epi8(__S, __U, __D); +} + +__m128i test_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_compress_epi8 + // %[[ZERO:.+]] = cir.call @_mm_setzero_si128() : () -> !cir.vector<2 x !s64i> + // %[[CAST1:.+]] = cir.cast bitcast %[[ZERO]] : !cir.vector<2 x !s64i> -> !cir.vector<16 x !s8i> + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.compress" %{{.+}}, %[[CAST1]], %[[MASK16]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST2:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // 
LLVM-LABEL: test_mm_maskz_compress_epi8 + // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16 + // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8> + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]]) + // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_compress_epi8 + // store <2 x i64> zeroinitializer, ptr %{{.+}}, align 16 + // %[[CAST1:.+]] = bitcast <2 x i64> %{{.+}} to <16 x i8> + // %[[MASK16:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.compress.v16i8(<16 x i8> %{{.+}}, <16 x i8> %[[CAST1]], <16 x i1> %[[MASK16]]) + // %[[CAST2:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_maskz_compress_epi8(__U, __D); +} + +__m128i test_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK16]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_expand_epi16 + // %[[MASK16:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK16]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_mask_expand_epi16(__S, __U, __D); +} + +__m128i test_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u8i -> !cir.vector<8 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<8 x !s16i>, !cir.vector<8 x !s16i>, !cir.vector<8 x !cir.int>) -> !cir.vector<8 x !s16i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<8 x !s16i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_expand_epi16 + // %[[MASK:.+]] = bitcast i8 %{{.+}} to <8 x i1> + // %[[RES:.+]] = call <8 x i16> @llvm.x86.avx512.mask.expand.v8i16(<8 x i16> %{{.+}}, <8 x i16> %{{.+}}, <8 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <8 x i16> %[[RES]] to <2 x i64> + + return _mm_maskz_expand_epi16(__U, __D); +} + +__m128i test_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast 
%[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_mask_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_mask_expand_epi8(__S, __U, __D); +} + +__m128i test_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) { + // CIR-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = cir.cast bitcast %{{.+}} : !u16i -> !cir.vector<16 x !cir.int> + // %[[RES:.+]] = cir.call_llvm_intrinsic "x86.avx512.mask.expand" %{{.+}}, %{{.+}}, %[[MASK]]: (!cir.vector<16 x !s8i>, !cir.vector<16 x !s8i>, !cir.vector<16 x !cir.int>) -> !cir.vector<16 x !s8i> + // %[[CAST:.+]] = cir.cast bitcast %[[RES]] : !cir.vector<16 x !s8i> -> !cir.vector<2 x !s64i> + + // LLVM-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + // OGCG-LABEL: test_mm_maskz_expand_epi8 + // %[[MASK:.+]] = bitcast i16 %{{.+}} to <16 x i1> + // %[[RES:.+]] = call <16 x i8> @llvm.x86.avx512.mask.expand.v16i8(<16 x i8> %{{.+}}, <16 x i8> %{{.+}}, <16 x i1> %[[MASK]]) + // %[[CAST:.+]] = bitcast <16 x i8> %[[RES]] to <2 x i64> + + return _mm_maskz_expand_epi8(__U, __D); +} From ab8208f4f13a256e923342e54439d17022befa01 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 9 Dec 2025 17:25:55 +0000 Subject: [PATCH 31/63] [lldb][docs] Fix Visual Studio link in build doc Fixes warning: build.rst:107: WARNING: 'any' reference target not found: https://visualstudio.microsoft.com --- lldb/docs/resources/build.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/docs/resources/build.rst b/lldb/docs/resources/build.rst index 2eb167709dbda..9f76b3a6719c6 100644 --- a/lldb/docs/resources/build.rst +++ b/lldb/docs/resources/build.rst @@ -104,7 +104,7 @@ Build Requirements Please follow the steps below if you only want to **build** lldb. -1. Install `Visual Studio ` with the +1. Install `Visual Studio `_ with the "Desktop Development with C++" workload. Make sure that the latest Windows SDK and the Active Template Library (ATL) are installed. 2. Install `Git Bash `_ and add From 0aa8b8292314be29b02c8d93335c7b04d0fc2221 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 9 Dec 2025 17:27:29 +0000 Subject: [PATCH 32/63] [lldb][docs] Fix plaintext markers in command map Single backticks RST tries to resolve to a reference. Double means plaintext. 
Fixes these warnings: map.rst:803: WARNING: 'any' reference target not found: target.prefer-dynamic-value map.rst:814: WARNING: 'any' reference target not found: expr --- lldb/docs/use/map.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/docs/use/map.rst b/lldb/docs/use/map.rst index da566e7afe058..1221546db45da 100644 --- a/lldb/docs/use/map.rst +++ b/lldb/docs/use/map.rst @@ -802,7 +802,7 @@ Print the dynamic type of the result of an expression LLDB does this automatically if determining the dynamic type does not require running the target (in C++, running the target is never needed). This default is -controlled by the `target.prefer-dynamic-value` setting. If that is disabled, it +controlled by the ``target.prefer-dynamic-value`` setting. If that is disabled, it can be re-enabled on a per-command basis: .. code-block:: shell @@ -812,7 +812,7 @@ can be re-enabled on a per-command basis: (lldb) expr -d no-run-target -- someCPPObjectPtr Note that printing of the dynamic type of references is not possible with the -`expr` command. The workaround is to take the address of the reference and +``expr`` command. The workaround is to take the address of the reference and instruct lldb to print the children of the resulting pointer. .. code-block:: shell From 1dacdbe6669a5df92d7e2a4dcf11110eaaaa41ea Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada Date: Wed, 10 Dec 2025 02:29:23 +0900 Subject: [PATCH 33/63] [Clang] Export inline move constructors in dllexport-ed template instantiations on non-MSVC targets (#168170) Previously, even when MSVC compatibility was not requested, inline move constructors in dllexport-ed templates were not exported, which was seemingly unintended. On non-MSVC targets (MinGW, Cygwin, and PS), such move constructors should be exported consistently with copy constructors and with the behavior of modern MSVC. 
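For illustration, this is the shape of code the change affects on MinGW-like
targets (hypothetical class name; it mirrors the updated
mingw-template-dllexport.cpp test below):

```
// A minimal sketch, assuming a MinGW-style target where __declspec is
// honoured: explicitly instantiating a dllexport-ed template now also
// exports the inline move constructor, consistent with the copy
// constructor, which was already exported.
template <typename T>
class Holder {
public:
  Holder(const Holder &) {}     // exported before and after this patch
  Holder(Holder &&) noexcept {} // exported only after this patch
  void use() {}
};

template class __declspec(dllexport) Holder<int>;
```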
--- clang/include/clang/Basic/LangOptions.h | 2 ++ clang/lib/Sema/SemaDeclCXX.cpp | 1 + clang/test/CodeGenCXX/dllexport.cpp | 1 + clang/test/CodeGenCXX/dllimport.cpp | 10 ++++++---- .../CodeGenCXX/mingw-template-dllexport.cpp | 17 +++++++++++++++++ 5 files changed, 27 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 3f042f8ddb5a1..61ee0275283fc 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -623,6 +623,8 @@ class LangOptions : public LangOptionsBase { !ObjCSubscriptingLegacyRuntime; } + bool isCompatibleWithMSVC() const { return MSCompatibilityVersion > 0; } + bool isCompatibleWithMSVC(MSVCMajorVersion MajorVersion) const { return MSCompatibilityVersion >= MajorVersion * 100000U; } diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 3bc748969065a..1cadaf54b7bdd 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -6627,6 +6627,7 @@ void Sema::checkClassLevelDLLAttribute(CXXRecordDecl *Class) { auto *Ctor = dyn_cast(MD); if ((MD->isMoveAssignmentOperator() || (Ctor && Ctor->isMoveConstructor())) && + getLangOpts().isCompatibleWithMSVC() && !getLangOpts().isCompatibleWithMSVC(LangOptions::MSVC2015)) continue; diff --git a/clang/test/CodeGenCXX/dllexport.cpp b/clang/test/CodeGenCXX/dllexport.cpp index dfbb2762ac85c..2c9e7d36d2cbe 100644 --- a/clang/test/CodeGenCXX/dllexport.cpp +++ b/clang/test/CodeGenCXX/dllexport.cpp @@ -1130,5 +1130,6 @@ class __declspec(dllexport) ACE_Shared_Object { class __declspec(dllexport) ACE_Service_Object : public ACE_Shared_Object {}; // Implicit move constructor declaration. // MSVC2015-DAG: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q +// PS-DAG: define weak_odr dllexport void @_ZN18ACE_Service_ObjectC1EOS_ // The declarations should not be exported. 
// MSVC2013-NOT: define weak_odr dso_local dllexport {{.+}}ACE_Service_Object@@Q{{.+}}@$$Q diff --git a/clang/test/CodeGenCXX/dllimport.cpp b/clang/test/CodeGenCXX/dllimport.cpp index 363f97a8d58ee..ed1c72c5185d3 100644 --- a/clang/test/CodeGenCXX/dllimport.cpp +++ b/clang/test/CodeGenCXX/dllimport.cpp @@ -35,7 +35,7 @@ struct ExplicitSpec_NotImported {}; #define USEMEMFUNC(class, func) void (class::*UNIQ(use)())() { return &class::func; } #define USESTATICMEMFUNC(class, func) void (*UNIQ(use)())() { return &class::func; } #define USECLASS(class) void UNIQ(USE)() { class x; } -#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(class&) { return &class::operator=; } +#define USECOPYASSIGN(class) class& (class::*UNIQ(use)())(const class&) { return &class::operator=; } #define USEMOVEASSIGN(class) class& (class::*UNIQ(use)())(class&&) { return &class::operator=; } //===----------------------------------------------------------------------===// @@ -649,13 +649,15 @@ struct __declspec(dllimport) T { static int b; // MO1-DAG: @"?b@T@@2HA" = external dllimport global i32 - T& operator=(T&) = default; - // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@AAU0@@Z" + T& operator=(const T&) = default; + // MO1-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@ABU0@@Z" + // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSERKS_ T& operator=(T&&) = default; - // Note: Don't mark inline move operators dllimport because current MSVC versions don't export them. + // Note: Don't mark inline move operators dllimport because MSVC versions before 2015 don't export them. 
// M18-DAG: define linkonce_odr dso_local x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z" // M19-DAG: define available_externally dllimport x86_thiscallcc nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @"??4T@@QAEAAU0@$$QAU0@@Z" + // PS-DAG: declare dllimport nonnull align {{[0-9]+}} dereferenceable({{[0-9]+}}) ptr @_ZN1TaSEOS_ }; USEMEMFUNC(T, a) USESTATICMEMFUNC(T, StaticMethod) diff --git a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp index de112d6da53db..9f116c46853b6 100644 --- a/clang/test/CodeGenCXX/mingw-template-dllexport.cpp +++ b/clang/test/CodeGenCXX/mingw-template-dllexport.cpp @@ -10,11 +10,16 @@ template class c { +public: + c(const c &) {} + c(c &&) noexcept {} void f() {} }; template class __declspec(dllexport) c; +// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1ERKS0_ +// CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiEC1EOS0_ // CHECK: define {{.*}} dllexport {{.*}} @_ZN1cIiE1fEv extern template class __declspec(dllexport) c; @@ -27,6 +32,18 @@ template class __declspec(dllexport) c; // CHECK-NOT: define {{.*}} dllexport {{.*}} @_ZN1cIdE1fEv +extern template class __declspec(dllimport) c; + +// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1ERKS0_ +// CHECK: declare dllimport {{.*}} @_ZN1cIsEC1EOS0_ +// CHECK: declare dllimport {{.*}} @_ZN1cIsE1fEv + +void use_ctors(c &&x) { + c y{x}; + c z{static_cast &&>(x)}; + z.f(); +} + template struct outer { void f(); From 0a39d1ff9e4f8e8fb5138bdd21d88ed163aede3b Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 9 Dec 2025 17:31:58 +0000 Subject: [PATCH 34/63] [gn build] Port 1bada0af22d8 --- llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn index 7b0db8863c6f0..a6590e86e13b5 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/NVPTX/BUILD.gn @@ -44,6 +44,7 @@ static_library("LLVMNVPTXCodeGen") { "NVPTXForwardParams.cpp", "NVPTXFrameLowering.cpp", "NVPTXGenericToNVVM.cpp", + "NVPTXIRPeephole.cpp", "NVPTXISelDAGToDAG.cpp", "NVPTXISelLowering.cpp", "NVPTXImageOptimizer.cpp", From c0eac77f3cc94a1671297b0ef09e1875f7f74dda Mon Sep 17 00:00:00 2001 From: Anatoly Trosinenko Date: Tue, 9 Dec 2025 21:00:27 +0300 Subject: [PATCH 35/63] [ADT] BitVector: give `subsetOf(RHS)` name to `!test(RHS)` (NFC) (#170875) Define `LHS.subsetOf(RHS)` as a more descriptive name for `!LHS.test(RHS)` and update the existing callers to use that name. Co-authored-by: Jakub Kuderski --- bolt/lib/Passes/PAuthGadgetScanner.cpp | 2 +- llvm/include/llvm/ADT/BitVector.h | 5 +- llvm/include/llvm/ADT/SmallBitVector.h | 6 +- llvm/lib/Analysis/StackLifetime.cpp | 4 +- .../CodeGen/AsmPrinter/DwarfExpression.cpp | 2 +- llvm/lib/CodeGen/StackColoring.cpp | 4 +- .../lib/Target/Hexagon/HexagonBitSimplify.cpp | 3 +- llvm/lib/Target/Hexagon/HexagonGenInsert.cpp | 3 +- llvm/unittests/ADT/BitVectorTest.cpp | 90 +++++++++++++++++++ 9 files changed, 107 insertions(+), 12 deletions(-) diff --git a/bolt/lib/Passes/PAuthGadgetScanner.cpp b/bolt/lib/Passes/PAuthGadgetScanner.cpp index 01b350b2f11fe..d38a7fadb0767 100644 --- a/bolt/lib/Passes/PAuthGadgetScanner.cpp +++ b/bolt/lib/Passes/PAuthGadgetScanner.cpp @@ -547,7 +547,7 @@ class SrcSafetyAnalysis { // Being trusted is a strictly stronger property than being // safe-to-dereference. 
- assert(!Next.TrustedRegs.test(Next.SafeToDerefRegs) && + assert(Next.TrustedRegs.subsetOf(Next.SafeToDerefRegs) && "SafeToDerefRegs should contain all TrustedRegs"); return Next; diff --git a/llvm/include/llvm/ADT/BitVector.h b/llvm/include/llvm/ADT/BitVector.h index cc3f3a9226395..f4645c18a93f0 100644 --- a/llvm/include/llvm/ADT/BitVector.h +++ b/llvm/include/llvm/ADT/BitVector.h @@ -550,7 +550,7 @@ class BitVector { return *this; } - /// test - Check if (This - RHS) is zero. + /// test - Check if (This - RHS) is non-zero. /// This is the same as reset(RHS) and any(). bool test(const BitVector &RHS) const { unsigned ThisWords = Bits.size(); @@ -567,6 +567,9 @@ class BitVector { return false; } + /// subsetOf - Check if This is a subset of RHS. + bool subsetOf(const BitVector &RHS) const { return !test(RHS); } + template static BitVector &apply(F &&f, BitVector &Out, BitVector const &Arg, ArgTys const &...Args) { diff --git a/llvm/include/llvm/ADT/SmallBitVector.h b/llvm/include/llvm/ADT/SmallBitVector.h index 5b2a5221b791f..978dc3f073031 100644 --- a/llvm/include/llvm/ADT/SmallBitVector.h +++ b/llvm/include/llvm/ADT/SmallBitVector.h @@ -552,7 +552,8 @@ class SmallBitVector { return *this; } - /// Check if (This - RHS) is zero. This is the same as reset(RHS) and any(). + /// Check if (This - RHS) is non-zero. + /// This is the same as reset(RHS) and any(). bool test(const SmallBitVector &RHS) const { if (isSmall() && RHS.isSmall()) return (getSmallBits() & ~RHS.getSmallBits()) != 0; @@ -571,6 +572,9 @@ class SmallBitVector { return false; } + /// Check if This is a subset of RHS. + bool subsetOf(const SmallBitVector &RHS) const { return !test(RHS); } + SmallBitVector &operator|=(const SmallBitVector &RHS) { resize(std::max(size(), RHS.size())); if (isSmall() && RHS.isSmall()) diff --git a/llvm/lib/Analysis/StackLifetime.cpp b/llvm/lib/Analysis/StackLifetime.cpp index 1e20fca965ace..30e0316b882cc 100644 --- a/llvm/lib/Analysis/StackLifetime.cpp +++ b/llvm/lib/Analysis/StackLifetime.cpp @@ -173,7 +173,7 @@ void StackLifetime::calculateLocalLiveness() { BitsIn.resize(NumAllocas, true); // Update block LiveIn set, noting whether it has changed. - if (BitsIn.test(BlockInfo.LiveIn)) { + if (!BitsIn.subsetOf(BlockInfo.LiveIn)) { BlockInfo.LiveIn |= BitsIn; } @@ -198,7 +198,7 @@ void StackLifetime::calculateLocalLiveness() { } // Update block LiveOut set, noting whether it has changed. - if (BitsIn.test(BlockInfo.LiveOut)) { + if (!BitsIn.subsetOf(BlockInfo.LiveOut)) { Changed = true; BlockInfo.LiveOut |= BitsIn; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp index c7d45897c403b..d6b06b83207c7 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp @@ -164,7 +164,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI, // If this sub-register has a DWARF number and we haven't covered // its range, and its range covers the value, emit a DWARF piece for it. - if (Offset < MaxSize && CurSubReg.test(Coverage)) { + if (Offset < MaxSize && !CurSubReg.subsetOf(Coverage)) { // Emit a piece for any gap in the coverage. 
if (Offset > CurPos)
        DwarfRegs.push_back(Register::createSubRegister(
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index f7862641d94b9..4ec8b8b3646c1 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -815,13 +815,13 @@ void StackColoring::calculateLocalLiveness() {
       LocalLiveOut |= BlockInfo.Begin;
 
       // Update block LiveIn set, noting whether it has changed.
-      if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+      if (!LocalLiveIn.subsetOf(BlockInfo.LiveIn)) {
         changed = true;
         BlockInfo.LiveIn |= LocalLiveIn;
       }
 
       // Update block LiveOut set, noting whether it has changed.
-      if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+      if (!LocalLiveOut.subsetOf(BlockInfo.LiveOut)) {
         changed = true;
         BlockInfo.LiveOut |= LocalLiveOut;
       }
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 557a0a3f27819..848337457c997 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -137,8 +137,7 @@ namespace {
       return !Bits.any();
     }
     bool includes(const RegisterSet &Rs) const {
-      // A.test(B) <=> A-B != {}
-      return !Rs.Bits.test(Bits);
+      return Rs.Bits.subsetOf(Bits);
     }
     bool intersects(const RegisterSet &Rs) const {
       return Bits.anyCommon(Rs.Bits);
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index ff876f6595350..18fcd6a4873fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -153,8 +153,7 @@ namespace {
      return !BitVector::any();
    }
    bool includes(const RegisterSet &Rs) const {
-     // A.BitVector::test(B) <=> A-B != {}
-     return !Rs.BitVector::test(*this);
+     return Rs.BitVector::subsetOf(*this);
    }
    bool intersects(const RegisterSet &Rs) const {
      return BitVector::anyCommon(Rs);
diff --git a/llvm/unittests/ADT/BitVectorTest.cpp b/llvm/unittests/ADT/BitVectorTest.cpp
index e13523b8e10c3..d3200b7722ee3 100644
--- a/llvm/unittests/ADT/BitVectorTest.cpp
+++ b/llvm/unittests/ADT/BitVectorTest.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "gtest/gtest.h"
+#include <initializer_list>
 
 using namespace llvm;
 
@@ -835,19 +836,27 @@ TYPED_TEST(BitVectorTest, BinOps) {
   A.resize(65);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(B));
+  EXPECT_TRUE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.resize(64);
   A.set(64);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.set(63);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_FALSE(B.subsetOf(A));
 
   A.set(63);
   EXPECT_TRUE(A.anyCommon(B));
   EXPECT_TRUE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
 
   B.resize(70);
   B.set(64);
@@ -855,6 +864,87 @@ TYPED_TEST(BitVectorTest, BinOps) {
   A.resize(64);
   EXPECT_FALSE(A.anyCommon(B));
   EXPECT_FALSE(B.anyCommon(A));
+  EXPECT_FALSE(A.subsetOf(B));
+  EXPECT_FALSE(B.subsetOf(A));
+
+  B.set(63);
+  B.reset(64);
+  EXPECT_TRUE(A.anyCommon(B));
+  EXPECT_TRUE(B.anyCommon(A));
+  EXPECT_TRUE(A.subsetOf(B));
+  EXPECT_TRUE(B.subsetOf(A));
+}
+
+template <typename VecType>
+static inline VecType
+createBitVectorFromBits(uint32_t Size, std::initializer_list<int> SetBits) {
+  VecType V;
+  V.resize(Size);
+  for (int BitIndex : SetBits)
+    V.set(BitIndex);
+  return V;
+}
+
+TYPED_TEST(BitVectorTest, BinOpsLiteral) {
+  // More tests of binary operations with more focus on the semantics and
+  // less focus on 
mutability.
+
+  auto AnyCommon = [](uint32_t SizeLHS, std::initializer_list<int> SetBitsLHS,
+                      uint32_t SizeRHS, std::initializer_list<int> SetBitsRHS) {
+    auto LHS = createBitVectorFromBits<TypeParam>(SizeLHS, SetBitsLHS);
+    auto RHS = createBitVectorFromBits<TypeParam>(SizeRHS, SetBitsRHS);
+    return LHS.anyCommon(RHS);
+  };
+  auto SubsetOf = [](uint32_t SizeLHS, std::initializer_list<int> SetBitsLHS,
+                     uint32_t SizeRHS, std::initializer_list<int> SetBitsRHS) {
+    auto LHS = createBitVectorFromBits<TypeParam>(SizeLHS, SetBitsLHS);
+    auto RHS = createBitVectorFromBits<TypeParam>(SizeRHS, SetBitsRHS);
+    return LHS.subsetOf(RHS);
+  };
+
+  // clang-format off
+
+  // Test small-sized vectors.
+  EXPECT_TRUE (AnyCommon(10, {1, 2, 3}, 10, {3, 4, 5}));
+  EXPECT_FALSE(AnyCommon(10, {1, 2, 3}, 10, {4, 5}));
+
+  EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3, 4}));
+  EXPECT_TRUE (SubsetOf(10, {2, 3}, 10, {2, 3, 4}));
+  EXPECT_FALSE(SubsetOf(10, {1, 2, 3}, 10, {2, 3}));
+  EXPECT_TRUE (SubsetOf(10, {1, 2, 3}, 10, {1, 2, 3}));
+
+  // Test representations of empty sets of various sizes.
+  EXPECT_FALSE(AnyCommon(10, {}, 10, {}));
+  EXPECT_FALSE(AnyCommon(10, {}, 123, {}));
+  EXPECT_FALSE(AnyCommon(123, {}, 10, {}));
+  EXPECT_FALSE(AnyCommon(123, {}, 123, {}));
+  EXPECT_TRUE(SubsetOf(10, {}, 10, {}));
+  EXPECT_TRUE(SubsetOf(10, {}, 123, {}));
+  EXPECT_TRUE(SubsetOf(123, {}, 10, {}));
+  EXPECT_TRUE(SubsetOf(123, {}, 123, {}));
+
+  // Test handling of the remainder words.
+  EXPECT_FALSE(AnyCommon(10, {1, 2}, 123, {5, 70}));
+  EXPECT_TRUE (AnyCommon(10, {1, 2}, 123, {1, 70}));
+  EXPECT_FALSE(AnyCommon(123, {5, 70}, 10, {1, 2}));
+  EXPECT_TRUE (AnyCommon(123, {1, 70}, 10, {1, 2}));
+
+  EXPECT_FALSE(AnyCommon(10, {1, 2}, 123, {5}));
+  EXPECT_TRUE (AnyCommon(10, {1, 2}, 123, {1}));
+  EXPECT_FALSE(AnyCommon(123, {5}, 10, {1, 2}));
+  EXPECT_TRUE (AnyCommon(123, {1}, 10, {1, 2}));
+
+  EXPECT_FALSE(SubsetOf(10, {1, 2}, 123, {2, 70}));
+  EXPECT_TRUE (SubsetOf(10, {1, 2}, 123, {1, 2, 70}));
+  EXPECT_FALSE(SubsetOf(123, {2, 70}, 10, {1, 2}));
+  EXPECT_FALSE(SubsetOf(123, {1, 2, 70}, 10, {1, 2}));
+
+  EXPECT_FALSE(SubsetOf(10, {1, 2}, 123, {2}));
+  EXPECT_TRUE (SubsetOf(10, {1, 2}, 123, {1, 2}));
+  EXPECT_TRUE (SubsetOf(123, {2}, 10, {1, 2}));
+  EXPECT_TRUE (SubsetOf(123, {1, 2}, 10, {1, 2}));
+
+  // clang-format on
+}
+
 using RangeList = std::vector<std::pair<unsigned, unsigned>>;

From cd805a73737a951049a106de0f61b50e194d7241 Mon Sep 17 00:00:00 2001
From: nerix
Date: Tue, 9 Dec 2025 19:09:33 +0100
Subject: [PATCH 36/63] [LLDB] Run MSVC STL (forward-)list test with PDB (#166953)

Since PDB doesn't have template information, we need to get the element
type from somewhere else. I'm using the type of `_Myval` in a list node,
which holds the element type.
---
 .../Plugins/Language/CPlusPlus/GenericList.cpp | 18 ++++++++++++++++++
 .../TestDataFormatterGenericForwardList.py     |  2 ++
 .../list/TestDataFormatterGenericList.py       |  2 ++
 .../loop/TestDataFormatterGenericListLoop.py   |  1 +
 4 files changed, 23 insertions(+)

diff --git a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
index 5289027fbd8af..8c5ac31aef3f3 100644
--- a/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
+++ b/lldb/source/Plugins/Language/CPlusPlus/GenericList.cpp
@@ -203,6 +203,16 @@ class MsvcStlListFrontEnd : public AbstractListFrontEnd {
   ValueObject *m_tail = nullptr;
 };
 
+/// Gets the (forward-)list element type from the head node instead of the
+/// template arguments. This is needed with PDB as it doesn't have info about
+/// the template arguments. 
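+///
+/// A minimal sketch of the assumption (simplified, not the exact MSVC STL
+/// declaration): for a `std::list<int>` the node is laid out roughly as
+///   struct _List_node { _List_node *_Next, *_Prev; int _Myval; };
+/// so reading the type of `_Myval` recovers `int` even when PDB provides
+/// no template arguments.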
+CompilerType GetMsvcStlElementTypeFromHead(ValueObject &head) { + auto val_sp = head.GetChildMemberWithName("_Myval"); + if (val_sp) + return val_sp->GetCompilerType(); + return CompilerType(); +} + } // end anonymous namespace template @@ -530,6 +540,10 @@ lldb::ChildCacheState MsvcStlForwardListFrontEnd::Update() { m_backend.GetChildAtNamePath({"_Mypair", "_Myval2", "_Myhead"})) m_head = head_sp.get(); + // With PDB, we can't get the element type from the template arguments + if (!m_element_type && m_head) + m_element_type = GetMsvcStlElementTypeFromHead(*m_head); + return ChildCacheState::eRefetch; } @@ -606,6 +620,10 @@ lldb::ChildCacheState MsvcStlListFrontEnd::Update() { m_head = first.get(); m_tail = last.get(); + // With PDB, we can't get the element type from the template arguments + if (!m_element_type && m_head) + m_element_type = GetMsvcStlElementTypeFromHead(*m_head); + return lldb::ChildCacheState::eRefetch; } diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py index 45695c43b42a9..1db0c489bc7f9 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/forward_list/TestDataFormatterGenericForwardList.py @@ -9,6 +9,8 @@ class TestDataFormatterGenericForwardList(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): TestBase.setUp(self) self.line = line_number("main.cpp", "// break here") diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py index c0207e6ab5911..fbd021190214b 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/TestDataFormatterGenericList.py @@ -10,6 +10,8 @@ class GenericListDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True + def setUp(self): # Call super's setUp(). 
TestBase.setUp(self) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py index f6174dd786380..9c5daf760b31f 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/list/loop/TestDataFormatterGenericListLoop.py @@ -11,6 +11,7 @@ class GenericListDataFormatterTestCase(TestBase): + TEST_WITH_PDB_DEBUG_INFO = True NO_DEBUG_INFO_TESTCASE = True def do_test_with_run_command(self): From d796d73631a9c9cc8a0c6386dba20938b22d34c5 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 21 Aug 2025 11:35:12 -0700 Subject: [PATCH 37/63] [MLIR] Apply clang-tidy fixes for readability-identifier-naming in Inliner.cpp (NFC) --- mlir/lib/Transforms/Utils/Inliner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp index 26c965cfc0237..40950312d566f 100644 --- a/mlir/lib/Transforms/Utils/Inliner.cpp +++ b/mlir/lib/Transforms/Utils/Inliner.cpp @@ -613,8 +613,8 @@ Inliner::Impl::inlineCallsInSCC(InlinerInterfaceImpl &inlinerIface, LLVM_DEBUG({ LDBG() << "* Inliner: Initial calls in SCC are: {"; - for (unsigned i = 0, e = calls.size(); i < e; ++i) - LDBG() << " " << i << ". " << calls[i].call << ","; + for (unsigned I = 0, E = calls.size(); I < E; ++I) + LDBG() << " " << I << ". " << calls[I].call << ","; LDBG() << "}"; }); From 00bccfca7cc4aacbbef6127a411c22c0e08bc466 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 9 Dec 2025 18:16:03 +0000 Subject: [PATCH 38/63] [X86] bitcnt-big-integer.ll - add additional test coverage where the source values are bitcast from vectors (#171481) --- llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 1656 ++++++++++++++++++- 1 file changed, 1634 insertions(+), 22 deletions(-) diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll index 0fd555991ae29..749b3ddc96d0d 100644 --- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll +++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll @@ -52,6 +52,63 @@ define i32 @load_ctpop_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: popcntq %rcx, 
%rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctpop.i128(i128 %a0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i256(i256 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i256: ; CHECK: # %bb.0: @@ -183,6 +240,107 @@ define i32 @load_ctpop_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: xorl %esi, %esi +; SSE-NEXT: popcntq %rax, %rsi +; SSE-NEXT: xorl %eax, %eax +; SSE-NEXT: popcntq %rcx, %rax +; SSE-NEXT: addl %esi, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: popcntq %rdx, %rdx +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: addl %edx, %esi +; AVX2-NEXT: xorl %edx, %edx +; AVX2-NEXT: popcntq %rax, %rdx +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: popcntq %rcx, %rax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpextrq $1, %xmm0, %rax +; AVX512F-NEXT: vmovq %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: popcntq %rdx, %rdx +; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rax, %rdx +; AVX512F-NEXT: popcntq %rcx, %rax +; AVX512F-NEXT: addl %edx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax +; AVX512VL-NEXT: vmovq %xmm0, %rcx +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: addl %esi, %edx +; AVX512VL-NEXT: xorl %esi, %esi +; AVX512VL-NEXT: popcntq %rax, %rsi +; AVX512VL-NEXT: xorl %eax, %eax +; AVX512VL-NEXT: popcntq %rcx, %rax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rax +; AVX512POPCNT-NEXT: vmovq %xmm0, %rcx +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; 
AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: addl %esi, %edx +; AVX512POPCNT-NEXT: xorl %esi, %esi +; AVX512POPCNT-NEXT: popcntq %rax, %rsi +; AVX512POPCNT-NEXT: xorl %eax, %eax +; AVX512POPCNT-NEXT: popcntq %rcx, %rax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctpop.i256(i256 %a0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i512(i512 %a0) nounwind { ; CHECK-LABEL: test_ctpop_i512: ; CHECK: # %bb.0: @@ -404,6 +562,166 @@ define i32 @load_ctpop_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctpop_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctpop_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %r8 +; SSE-NEXT: movq %xmm3, %r9 +; SSE-NEXT: pextrq $1, %xmm3, %r10 +; SSE-NEXT: popcntq %r10, %r10 +; SSE-NEXT: popcntq %r9, %r9 +; SSE-NEXT: addl %r10d, %r9d +; SSE-NEXT: popcntq %rdi, %rdi +; SSE-NEXT: popcntq %r8, %r8 +; SSE-NEXT: addl %edi, %r8d +; SSE-NEXT: addl %r9d, %r8d +; SSE-NEXT: popcntq %rsi, %rsi +; SSE-NEXT: popcntq %rdx, %rdx +; SSE-NEXT: addl %esi, %edx +; SSE-NEXT: popcntq %rcx, %rcx +; SSE-NEXT: popcntq %rax, %rax +; SSE-NEXT: addl %ecx, %eax +; SSE-NEXT: addl %edx, %eax +; SSE-NEXT: addl %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctpop_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vpextrq $1, %xmm1, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm0 +; AVX2-NEXT: vpextrq $1, %xmm0, %r9 +; AVX2-NEXT: vmovq %xmm0, %r10 +; AVX2-NEXT: popcntq %r9, %r9 +; AVX2-NEXT: popcntq %r10, %r10 +; AVX2-NEXT: addl %r9d, %r10d +; AVX2-NEXT: popcntq %rdi, %rdi +; AVX2-NEXT: popcntq %r8, %r8 +; AVX2-NEXT: addl %edi, %r8d +; AVX2-NEXT: addl %r10d, %r8d +; AVX2-NEXT: popcntq %rsi, %rsi +; AVX2-NEXT: popcntq %rdx, %rdx +; AVX2-NEXT: addl %esi, %edx +; AVX2-NEXT: popcntq %rcx, %rcx +; AVX2-NEXT: popcntq %rax, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: addl %edx, %eax +; AVX2-NEXT: addl %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctpop_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vmovq %xmm1, %rax +; AVX512F-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rdi +; AVX512F-NEXT: vmovq %xmm1, %r8 +; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512F-NEXT: vpextrq $1, %xmm0, %r9 +; AVX512F-NEXT: vmovq %xmm0, %r10 +; AVX512F-NEXT: popcntq %r9, %r9 +; AVX512F-NEXT: popcntq %r10, %r10 +; AVX512F-NEXT: addl %r9d, %r10d +; AVX512F-NEXT: popcntq %rdi, %rdi +; AVX512F-NEXT: popcntq %r8, %r8 +; AVX512F-NEXT: addl %edi, %r8d +; AVX512F-NEXT: addl %r10d, %r8d +; AVX512F-NEXT: popcntq %rdx, %rdx 
+; AVX512F-NEXT: popcntq %rsi, %rsi +; AVX512F-NEXT: addl %edx, %esi +; AVX512F-NEXT: popcntq %rcx, %rcx +; AVX512F-NEXT: popcntq %rax, %rax +; AVX512F-NEXT: addl %ecx, %eax +; AVX512F-NEXT: addl %esi, %eax +; AVX512F-NEXT: addl %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctpop_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rax +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512VL-NEXT: vmovq %xmm1, %rdi +; AVX512VL-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %r9 +; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512VL-NEXT: popcntq %r10, %r10 +; AVX512VL-NEXT: popcntq %r9, %r9 +; AVX512VL-NEXT: addl %r10d, %r9d +; AVX512VL-NEXT: popcntq %r8, %r8 +; AVX512VL-NEXT: popcntq %rdi, %rdi +; AVX512VL-NEXT: addl %r8d, %edi +; AVX512VL-NEXT: addl %r9d, %edi +; AVX512VL-NEXT: popcntq %rdx, %rdx +; AVX512VL-NEXT: popcntq %rsi, %rsi +; AVX512VL-NEXT: addl %edx, %esi +; AVX512VL-NEXT: popcntq %rcx, %rcx +; AVX512VL-NEXT: popcntq %rax, %rax +; AVX512VL-NEXT: addl %ecx, %eax +; AVX512VL-NEXT: addl %esi, %eax +; AVX512VL-NEXT: addl %edi, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctpop_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rax +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: vextracti32x4 $2, %zmm0, %xmm1 +; AVX512POPCNT-NEXT: vmovq %xmm1, %rdi +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512POPCNT-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %r9 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %r10 +; AVX512POPCNT-NEXT: popcntq %r10, %r10 +; AVX512POPCNT-NEXT: popcntq %r9, %r9 +; AVX512POPCNT-NEXT: addl %r10d, %r9d +; AVX512POPCNT-NEXT: popcntq %r8, %r8 +; AVX512POPCNT-NEXT: popcntq %rdi, %rdi +; AVX512POPCNT-NEXT: addl %r8d, %edi +; AVX512POPCNT-NEXT: addl %r9d, %edi +; AVX512POPCNT-NEXT: popcntq %rdx, %rdx +; AVX512POPCNT-NEXT: popcntq %rsi, %rsi +; AVX512POPCNT-NEXT: addl %edx, %esi +; AVX512POPCNT-NEXT: popcntq %rcx, %rcx +; AVX512POPCNT-NEXT: popcntq %rax, %rax +; AVX512POPCNT-NEXT: addl %ecx, %eax +; AVX512POPCNT-NEXT: addl %esi, %eax +; AVX512POPCNT-NEXT: addl %edi, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctpop.i512(i512 %a0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctpop_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctpop_i1024: ; SSE: # %bb.0: @@ -969,6 +1287,75 @@ define i32 @load_ctlz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; 
SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_i256: ; SSE: # %bb.0: @@ -1125,6 +1512,135 @@ define i32 @load_ctlz_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i256: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rdx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rcx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: 
vector_ctlz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_i512: ; SSE: # %bb.0: @@ -1423,10 +1939,155 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX2-NEXT: popq %r15 ; AVX2-NEXT: retq ; -; AVX512F-LABEL: load_ctlz_i512: +; AVX512F-LABEL: load_ctlz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1} +; AVX512F-NEXT: vmovd %xmm1, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq 
(%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i512, ptr %p0 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + +define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_i512: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdi +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: movq %xmm3, %r8 +; SSE-NEXT: pextrq $1, %xmm3, %r9 +; SSE-NEXT: bsrq %r9, %r10 +; SSE-NEXT: xorl $63, %r10d +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: bsrq %rdi, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rsi, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: orl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: bsrq %rax, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %r8d, %edi +; SSE-NEXT: bsrq %rcx, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: movl $127, %eax +; SSE-NEXT: bsrq %rdx, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: 
xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_i512: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 ; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 @@ -1435,10 +2096,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512F-NEXT: vmovd %xmm1, %eax ; AVX512F-NEXT: retq ; -; AVX512VL-LABEL: load_ctlz_i512: +; AVX512VL-LABEL: vector_ctlz_i512: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1448,10 +2109,10 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_ctlz_i512: +; AVX512POPCNT-LABEL: vector_ctlz_i512: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0] -; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 ; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1 ; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 ; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 @@ -1460,7 +2121,7 @@ define i32 @load_ctlz_i512(ptr %p0) nounwind { ; AVX512POPCNT-NEXT: vmovd %xmm0, %eax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i512, ptr %p0 + %a0 = bitcast <16 x i32> %v0 to i512 %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 0) %res = trunc i512 %cnt to i32 ret i32 %res @@ -2312,6 +2973,74 @@ define i32 @load_ctlz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: bsrq %rcx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: lzcntq %rcx, %rdx +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed 
$eax killed $rax +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i128: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: lzcntq %rcx, %rdx +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edx, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i128: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: lzcntq %rcx, %rdx +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edx, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i128: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdx +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edx, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.ctlz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i256: ; SSE: # %bb.0: @@ -2463,6 +3192,134 @@ define i32 @load_ctlz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: bsrq %rsi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rcx, %rsi +; SSE-NEXT: xorl $63, %esi +; SSE-NEXT: bsrq %rax, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: vpextrq $1, %xmm0, %rcx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rdx +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: lzcntq %rsi, %rdi +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: lzcntq %rcx, %rdi +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovq %xmm0, %rax +; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512F-NEXT: vmovq %xmm0, %rdx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512F-NEXT: lzcntq %rsi, %rdi +; AVX512F-NEXT: lzcntq %rdx, 
%r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: lzcntq %rcx, %rdi +; AVX512F-NEXT: lzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rsi, %rdx +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512VL-NEXT: vmovq %xmm0, %rax +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512VL-NEXT: vmovq %xmm0, %rdx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512VL-NEXT: lzcntq %rsi, %rdi +; AVX512VL-NEXT: lzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: lzcntq %rcx, %rdi +; AVX512VL-NEXT: lzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rsi, %rdx +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX512POPCNT-NEXT: vmovq %xmm0, %rdx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512POPCNT-NEXT: lzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: lzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: lzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: lzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rsi, %rdx +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.ctlz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i512: ; SSE: # %bb.0: @@ -2796,6 +3653,147 @@ define i32 @load_ctlz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_ctlz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rsi +; SSE-NEXT: movq %xmm2, %rdx +; SSE-NEXT: movq %xmm3, %rdi +; SSE-NEXT: pextrq $1, %xmm3, %r8 +; SSE-NEXT: bsrq %r8, %r9 +; SSE-NEXT: xorl $63, %r9d +; SSE-NEXT: bsrq %rdi, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: orl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: bsrq %rsi, %r8 +; SSE-NEXT: xorl $63, %r8d +; SSE-NEXT: bsrq %rdx, %rdx +; SSE-NEXT: xorl $63, %edx +; SSE-NEXT: orl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %edx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: subl $-128, %edx +; SSE-NEXT: ptest %xmm3, %xmm3 +; SSE-NEXT: movq %xmm1, %r8 +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: bsrq %rax, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %r8, %r8 +; SSE-NEXT: xorl $63, 
%r8d +; SSE-NEXT: orl $64, %r8d +; SSE-NEXT: testq %rax, %rax +; SSE-NEXT: cmovnel %edi, %r8d +; SSE-NEXT: bsrq %rcx, %rdi +; SSE-NEXT: xorl $63, %edi +; SSE-NEXT: bsrq %rsi, %rax +; SSE-NEXT: xorl $63, %eax +; SSE-NEXT: orl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm1, %xmm1 +; SSE-NEXT: cmovnel %r8d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_ctlz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vmovq %xmm2, %r8 +; AVX2-NEXT: vpextrq $1, %xmm0, %rsi +; AVX2-NEXT: vmovq %xmm1, %rdi +; AVX2-NEXT: vpextrq $1, %xmm1, %r9 +; AVX2-NEXT: lzcntq %rax, %r10 +; AVX2-NEXT: lzcntq %r8, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %rax, %rax +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: lzcntq %r9, %r10 +; AVX2-NEXT: lzcntq %rdi, %rdi +; AVX2-NEXT: addl $64, %edi +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %edi +; AVX2-NEXT: subl $-128, %edi +; AVX2-NEXT: orq %rax, %r8 +; AVX2-NEXT: cmovnel %r11d, %edi +; AVX2-NEXT: xorl %eax, %eax +; AVX2-NEXT: lzcntq %rcx, %rax +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: lzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %eax, %r8d +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: lzcntq %rsi, %r9 +; AVX2-NEXT: lzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm1, %ymm1 +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_ctlz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_ctlz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_ctlz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0] +; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm0 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 +; AVX512POPCNT-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: 
vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.ctlz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_ctlz_undef_i1024: ; SSE: # %bb.0: @@ -3636,6 +4634,49 @@ define i32 @load_cttz_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rcx +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 0) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_i256: ; SSE: # %bb.0: @@ -3775,21 +4816,146 @@ define i32 @load_cttz_i256(ptr %p0) nounwind { ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; -; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT-LABEL: load_cttz_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 +; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] +; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = load i256, ptr %p0 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + +define i32 @vector_cttz_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: rep bsfq %rax, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %edi +; SSE-NEXT: movq %xmm1, %rdx +; SSE-NEXT: rep bsfq %rdx, %rsi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rcx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rdx, %rdx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; 
AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i256: ; AVX512POPCNT: # %bb.0: -; AVX512POPCNT-NEXT: vmovdqu (%rdi), %ymm0 -; AVX512POPCNT-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpandn %ymm1, %ymm0, %ymm1 -; AVX512POPCNT-NEXT: vpopcntq %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX512POPCNT-NEXT: vptestmq %ymm0, %ymm0, %k1 -; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} ymm0 = [256,256,256,256] -; AVX512POPCNT-NEXT: vpcompressq %ymm1, %ymm0 {%k1} -; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; 
AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax ; AVX512POPCNT-NEXT: vzeroupper ; AVX512POPCNT-NEXT: retq - %a0 = load i256, ptr %p0 + %a0 = bitcast <8 x i32> %v0 to i256 %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 0) %res = trunc i256 %cnt to i32 ret i32 %res @@ -4128,6 +5294,148 @@ define i32 @load_cttz_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rdx +; SSE-NEXT: movq %xmm3, %rcx +; SSE-NEXT: pextrq $1, %xmm2, %rax +; SSE-NEXT: pextrq $1, %xmm1, %rsi +; SSE-NEXT: movq %xmm1, %rdi +; SSE-NEXT: pextrq $1, %xmm0, %r8 +; SSE-NEXT: movq %xmm0, %r9 +; SSE-NEXT: rep bsfq %r9, %r10 +; SSE-NEXT: rep bsfq %r8, %r8 +; SSE-NEXT: addl $64, %r8d +; SSE-NEXT: testq %r9, %r9 +; SSE-NEXT: cmovnel %r10d, %r8d +; SSE-NEXT: rep bsfq %rdi, %r9 +; SSE-NEXT: rep bsfq %rsi, %rsi +; SSE-NEXT: addl $64, %esi +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r9d, %esi +; SSE-NEXT: movq %xmm2, %rdi +; SSE-NEXT: subl $-128, %esi +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %r8d, %esi +; SSE-NEXT: rep bsfq %rdi, %r8 +; SSE-NEXT: rep bsfq %rax, %r9 +; SSE-NEXT: addl $64, %r9d +; SSE-NEXT: testq %rdi, %rdi +; SSE-NEXT: cmovnel %r8d, %r9d +; SSE-NEXT: rep bsfq %rcx, %rdi +; SSE-NEXT: movl $64, %eax +; SSE-NEXT: rep bsfq %rdx, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %r9d, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, 
%zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512] +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 0) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_i1024: ; SSE: # %bb.0: @@ -4930,6 +6238,48 @@ define i32 @load_cttz_undef_i128(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i128(<4 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i128: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm0, %rax +; SSE-NEXT: movq %xmm0, %rcx +; SSE-NEXT: rep bsfq %rcx, %rdx +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i128: +; AVX2: # %bb.0: +; AVX2-NEXT: vpextrq $1, %xmm0, %rax +; AVX2-NEXT: vmovq %xmm0, %rcx +; AVX2-NEXT: tzcntq %rcx, %rdx +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edx, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: vector_cttz_undef_i128: +; AVX512: # %bb.0: +; AVX512-NEXT: vpextrq $1, %xmm0, %rax +; AVX512-NEXT: vmovq %xmm0, %rcx +; AVX512-NEXT: tzcntq %rcx, %rdx +; AVX512-NEXT: tzcntq %rax, %rax +; AVX512-NEXT: addl $64, %eax +; AVX512-NEXT: testq %rcx, %rcx +; AVX512-NEXT: cmovnel %edx, %eax +; AVX512-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512-NEXT: retq + %a0 = bitcast <4 x i32> %v0 to i128 + %cnt = call i128 @llvm.cttz.i128(i128 %a0, i1 -1) + %res = trunc i128 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i256(i256 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i256: ; SSE: # %bb.0: @@ -5084,6 +6434,130 @@ define i32 @load_cttz_undef_i256(ptr %p0) nounwind { ret i32 %res } +define i32 
@vector_cttz_undef_i256(<8 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i256: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm1, %rax +; SSE-NEXT: movq %xmm1, %rcx +; SSE-NEXT: pextrq $1, %xmm0, %rdx +; SSE-NEXT: movq %xmm0, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: rep bsfq %rcx, %rsi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rcx, %rcx +; SSE-NEXT: cmovnel %esi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i256: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rax +; AVX2-NEXT: vmovq %xmm1, %rcx +; AVX2-NEXT: vpextrq $1, %xmm0, %rdx +; AVX2-NEXT: vmovq %xmm0, %rsi +; AVX2-NEXT: tzcntq %rsi, %rdi +; AVX2-NEXT: tzcntq %rdx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rsi, %rsi +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rcx, %rdi +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rcx, %rcx +; AVX2-NEXT: cmovnel %edi, %eax +; AVX2-NEXT: subl $-128, %eax +; AVX2-NEXT: orq %rdx, %rsi +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i256: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpextrq $1, %xmm1, %rax +; AVX512F-NEXT: vmovq %xmm1, %rcx +; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512F-NEXT: vmovq %xmm0, %rsi +; AVX512F-NEXT: tzcntq %rsi, %rdi +; AVX512F-NEXT: tzcntq %rdx, %r8 +; AVX512F-NEXT: addl $64, %r8d +; AVX512F-NEXT: testq %rsi, %rsi +; AVX512F-NEXT: cmovnel %edi, %r8d +; AVX512F-NEXT: tzcntq %rcx, %rdi +; AVX512F-NEXT: tzcntq %rax, %rax +; AVX512F-NEXT: addl $64, %eax +; AVX512F-NEXT: testq %rcx, %rcx +; AVX512F-NEXT: cmovnel %edi, %eax +; AVX512F-NEXT: subl $-128, %eax +; AVX512F-NEXT: orq %rdx, %rsi +; AVX512F-NEXT: cmovnel %r8d, %eax +; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i256: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax +; AVX512VL-NEXT: vmovq %xmm1, %rcx +; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512VL-NEXT: vmovq %xmm0, %rsi +; AVX512VL-NEXT: tzcntq %rsi, %rdi +; AVX512VL-NEXT: tzcntq %rdx, %r8 +; AVX512VL-NEXT: addl $64, %r8d +; AVX512VL-NEXT: testq %rsi, %rsi +; AVX512VL-NEXT: cmovnel %edi, %r8d +; AVX512VL-NEXT: tzcntq %rcx, %rdi +; AVX512VL-NEXT: tzcntq %rax, %rax +; AVX512VL-NEXT: addl $64, %eax +; AVX512VL-NEXT: testq %rcx, %rcx +; AVX512VL-NEXT: cmovnel %edi, %eax +; AVX512VL-NEXT: subl $-128, %eax +; AVX512VL-NEXT: orq %rdx, %rsi +; AVX512VL-NEXT: cmovnel %r8d, %eax +; AVX512VL-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i256: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512POPCNT-NEXT: vpextrq $1, %xmm1, %rax +; AVX512POPCNT-NEXT: vmovq %xmm1, %rcx +; AVX512POPCNT-NEXT: vpextrq $1, %xmm0, %rdx +; AVX512POPCNT-NEXT: vmovq %xmm0, %rsi +; AVX512POPCNT-NEXT: tzcntq %rsi, %rdi +; AVX512POPCNT-NEXT: tzcntq %rdx, %r8 +; AVX512POPCNT-NEXT: addl $64, %r8d +; 
AVX512POPCNT-NEXT: testq %rsi, %rsi +; AVX512POPCNT-NEXT: cmovnel %edi, %r8d +; AVX512POPCNT-NEXT: tzcntq %rcx, %rdi +; AVX512POPCNT-NEXT: tzcntq %rax, %rax +; AVX512POPCNT-NEXT: addl $64, %eax +; AVX512POPCNT-NEXT: testq %rcx, %rcx +; AVX512POPCNT-NEXT: cmovnel %edi, %eax +; AVX512POPCNT-NEXT: subl $-128, %eax +; AVX512POPCNT-NEXT: orq %rdx, %rsi +; AVX512POPCNT-NEXT: cmovnel %r8d, %eax +; AVX512POPCNT-NEXT: # kill: def $eax killed $eax killed $rax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <8 x i32> %v0 to i256 + %cnt = call i256 @llvm.cttz.i256(i256 %a0, i1 -1) + %res = trunc i256 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i512(i512 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i512: ; SSE: # %bb.0: @@ -5409,6 +6883,144 @@ define i32 @load_cttz_undef_i512(ptr %p0) nounwind { ret i32 %res } +define i32 @vector_cttz_undef_i512(<16 x i32> %v0) nounwind { +; SSE-LABEL: vector_cttz_undef_i512: +; SSE: # %bb.0: +; SSE-NEXT: pextrq $1, %xmm3, %rax +; SSE-NEXT: pextrq $1, %xmm2, %rdx +; SSE-NEXT: pextrq $1, %xmm1, %rcx +; SSE-NEXT: movq %xmm1, %rsi +; SSE-NEXT: pextrq $1, %xmm0, %rdi +; SSE-NEXT: movq %xmm0, %r8 +; SSE-NEXT: rep bsfq %r8, %r9 +; SSE-NEXT: rep bsfq %rdi, %rdi +; SSE-NEXT: addl $64, %edi +; SSE-NEXT: testq %r8, %r8 +; SSE-NEXT: cmovnel %r9d, %edi +; SSE-NEXT: rep bsfq %rsi, %r8 +; SSE-NEXT: rep bsfq %rcx, %rcx +; SSE-NEXT: addl $64, %ecx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %r8d, %ecx +; SSE-NEXT: movq %xmm2, %rsi +; SSE-NEXT: subl $-128, %ecx +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %edi, %ecx +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rdx, %rdx +; SSE-NEXT: addl $64, %edx +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %edx +; SSE-NEXT: movq %xmm3, %rsi +; SSE-NEXT: rep bsfq %rsi, %rdi +; SSE-NEXT: rep bsfq %rax, %rax +; SSE-NEXT: addl $64, %eax +; SSE-NEXT: testq %rsi, %rsi +; SSE-NEXT: cmovnel %edi, %eax +; SSE-NEXT: subl $-128, %eax +; SSE-NEXT: ptest %xmm2, %xmm2 +; SSE-NEXT: cmovnel %edx, %eax +; SSE-NEXT: addl $256, %eax # imm = 0x100 +; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: ptest %xmm0, %xmm0 +; SSE-NEXT: cmovnel %ecx, %eax +; SSE-NEXT: # kill: def $eax killed $eax killed $rax +; SSE-NEXT: retq +; +; AVX2-LABEL: vector_cttz_undef_i512: +; AVX2: # %bb.0: +; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rax +; AVX2-NEXT: vpextrq $1, %xmm1, %rcx +; AVX2-NEXT: vmovq %xmm1, %rdx +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rsi +; AVX2-NEXT: vpextrq $1, %xmm0, %rdi +; AVX2-NEXT: vmovq %xmm1, %r8 +; AVX2-NEXT: vmovq %xmm0, %r9 +; AVX2-NEXT: tzcntq %r9, %r10 +; AVX2-NEXT: tzcntq %rdi, %r11 +; AVX2-NEXT: addl $64, %r11d +; AVX2-NEXT: testq %r9, %r9 +; AVX2-NEXT: cmovnel %r10d, %r11d +; AVX2-NEXT: xorl %r10d, %r10d +; AVX2-NEXT: tzcntq %r8, %r10 +; AVX2-NEXT: tzcntq %rsi, %rsi +; AVX2-NEXT: addl $64, %esi +; AVX2-NEXT: testq %r8, %r8 +; AVX2-NEXT: cmovnel %r10d, %esi +; AVX2-NEXT: subl $-128, %esi +; AVX2-NEXT: orq %rdi, %r9 +; AVX2-NEXT: cmovnel %r11d, %esi +; AVX2-NEXT: xorl %edi, %edi +; AVX2-NEXT: tzcntq %rdx, %rdi +; AVX2-NEXT: xorl %r8d, %r8d +; AVX2-NEXT: tzcntq %rcx, %r8 +; AVX2-NEXT: addl $64, %r8d +; AVX2-NEXT: testq %rdx, %rdx +; AVX2-NEXT: cmovnel %edi, %r8d +; AVX2-NEXT: vmovq %xmm2, %rdi +; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: tzcntq %rdi, %r9 +; AVX2-NEXT: tzcntq %rax, %rax +; AVX2-NEXT: addl $64, %eax +; AVX2-NEXT: testq %rdi, %rdi +; AVX2-NEXT: cmovnel %r9d, %eax +; AVX2-NEXT: subl $-128, %eax +; 
AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: cmovnel %r8d, %eax +; AVX2-NEXT: addl $256, %eax # imm = 0x100 +; AVX2-NEXT: vptest %ymm0, %ymm0 +; AVX2-NEXT: cmovnel %esi, %eax +; AVX2-NEXT: # kill: def $eax killed $eax killed $rax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: vector_cttz_undef_i512: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512F-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512F-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: vector_cttz_undef_i512: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,128,192,256,320,384,448,512] +; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1 +; AVX512VL-NEXT: vpsubq %zmm1, %zmm2, %zmm1 +; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vmovd %xmm0, %eax +; AVX512VL-NEXT: vzeroupper +; AVX512VL-NEXT: retq +; +; AVX512POPCNT-LABEL: vector_cttz_undef_i512: +; AVX512POPCNT: # %bb.0: +; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm1 = -1 +; AVX512POPCNT-NEXT: vpaddq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1 +; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1 +; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1 +; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z} +; AVX512POPCNT-NEXT: vmovd %xmm0, %eax +; AVX512POPCNT-NEXT: vzeroupper +; AVX512POPCNT-NEXT: retq + %a0 = bitcast <16 x i32> %v0 to i512 + %cnt = call i512 @llvm.cttz.i512(i512 %a0, i1 -1) + %res = trunc i512 %cnt to i32 + ret i32 %res +} + define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind { ; SSE-LABEL: test_cttz_undef_i1024: ; SSE: # %bb.0: From 93d2ef105703254769a8f182300b329dad5ed976 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 9 Dec 2025 20:20:54 +0200 Subject: [PATCH 39/63] [mlir][bytecode] Add support for deferred attribute/type parsing. (#170993) Add ability to defer parsing and re-enqueueing oneself. This enables changing CallSiteLoc parsing to not recurse as deeply: previously this could fail (especially on large inputs in debug mode the recursion could overflow). Add a default depth cutoff, this could be a parameter later if needed. --- mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 257 ++++++++++++++++---- mlir/unittests/Bytecode/BytecodeTest.cpp | 37 +++ 2 files changed, 241 insertions(+), 53 deletions(-) diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp index 1659437e1eb24..dd367b5922558 100644 --- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp +++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -830,6 +831,23 @@ namespace { /// This class provides support for reading attribute and type entries from the /// bytecode. Attribute and Type entries are read lazily on demand, so we use /// this reader to manage when to actually parse them from the bytecode. 
+///
+/// The parsing of attributes & types is generally recursive, which can lead
+/// to stack overflows for deeply nested structures, so we track a few extra
+/// pieces of information to avoid this:
+///
+/// - `depth`: The current depth while parsing nested attributes. We defer
+///   parsing deeply nested attributes to avoid potential stack overflows. The
+///   deferred parsing is achieved by reporting a failure when parsing a
+///   nested attribute/type and registering the index of the encountered
+///   attribute/type in the deferred parsing worklist. Hence, a failure with a
+///   deferred entry does not constitute a real failure; it does, however,
+///   require that callers return on the first failure rather than attempting
+///   additional parses.
+/// - `deferredWorklist`: A list of attribute/type indices that we could not
+///   parse because the depth limit was hit and that therefore need to be
+///   parsed (or reparsed) later. This moves the tracking of what still needs
+///   to be parsed to the heap.
 class AttrTypeReader {
   /// This class represents a single attribute or type entry.
   template <typename T>
@@ -863,12 +881,34 @@ class AttrTypeReader {
                            ArrayRef<uint8_t> sectionData,
                            ArrayRef<uint8_t> offsetSectionData);
 
+  LogicalResult readAttribute(uint64_t index, Attribute &result,
+                              uint64_t depth = 0) {
+    return readEntry(attributes, index, result, "attribute", depth);
+  }
+
+  LogicalResult readType(uint64_t index, Type &result, uint64_t depth = 0) {
+    return readEntry(types, index, result, "type", depth);
+  }
+
   /// Resolve the attribute or type at the given index. Returns nullptr on
   /// failure.
-  Attribute resolveAttribute(size_t index) {
-    return resolveEntry(attributes, index, "Attribute");
+  Attribute resolveAttribute(size_t index, uint64_t depth = 0) {
+    return resolveEntry(attributes, index, "Attribute", depth);
+  }
+  Type resolveType(size_t index, uint64_t depth = 0) {
+    return resolveEntry(types, index, "Type", depth);
+  }
+
+  Attribute getAttributeOrSentinel(size_t index) {
+    if (index >= attributes.size())
+      return nullptr;
+    return attributes[index].entry;
+  }
+  Type getTypeOrSentinel(size_t index) {
+    if (index >= types.size())
+      return nullptr;
+    return types[index].entry;
   }
-  Type resolveType(size_t index) { return resolveEntry(types, index, "Type"); }
 
   /// Parse a reference to an attribute or type using the given reader.
   LogicalResult parseAttribute(EncodingReader &reader, Attribute &result) {
@@ -909,23 +949,33 @@ class AttrTypeReader {
                      llvm::getTypeName<T>(), ", but got: ", baseResult);
   }
 
+  /// Add an index to the deferred worklist for re-parsing.
+  void addDeferredParsing(uint64_t index) { deferredWorklist.push_back(index); }
+
 private:
   /// Resolve the given entry at `index`.
   template <typename T>
-  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                 StringRef entryType);
+  T resolveEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                 StringRef entryType, uint64_t depth = 0);
 
-  /// Parse an entry using the given reader that was encoded using the textual
-  /// assembly format.
+  /// Read the entry at the given index, returning failure if the entry is not
+  /// yet resolved.
   template <typename T>
-  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
-                              StringRef entryType);
+  LogicalResult readEntry(SmallVectorImpl<Entry<T>> &entries, uint64_t index,
+                          T &result, StringRef entryType, uint64_t depth);
 
   /// Parse an entry using the given reader that was encoded using a custom
   /// bytecode format.
   template <typename T>
   LogicalResult parseCustomEntry(Entry<T> &entry, EncodingReader &reader,
-                                 StringRef entryType);
+                                 StringRef entryType, uint64_t index,
+                                 uint64_t depth);
+
+  /// Parse an entry using the given reader that was encoded using the textual
+  /// assembly format.
+  template <typename T>
+  LogicalResult parseAsmEntry(T &result, EncodingReader &reader,
+                              StringRef entryType);
 
   /// The string section reader used to resolve string references when parsing
   /// custom encoded attribute/type entries.
@@ -951,6 +1001,10 @@ class AttrTypeReader {
 
   /// Reference to the parser configuration.
   const ParserConfig &parserConfig;
+
+  /// Worklist for deferred attribute/type parsing. This is used to handle
+  /// deeply nested structures like CallSiteLoc iteratively.
+  std::vector<uint64_t> deferredWorklist;
 };
 
 class DialectReader : public DialectBytecodeReader {
@@ -959,10 +1013,11 @@ class DialectReader : public DialectBytecodeReader {
                 const StringSectionReader &stringReader,
                 const ResourceSectionReader &resourceReader,
                 const llvm::StringMap<BytecodeDialect *> &dialectsMap,
-                EncodingReader &reader, uint64_t &bytecodeVersion)
+                EncodingReader &reader, uint64_t &bytecodeVersion,
+                uint64_t depth = 0)
       : attrTypeReader(attrTypeReader), stringReader(stringReader),
         resourceReader(resourceReader), dialectsMap(dialectsMap),
-        reader(reader), bytecodeVersion(bytecodeVersion) {}
+        reader(reader), bytecodeVersion(bytecodeVersion), depth(depth) {}
 
   InFlightDiagnostic emitError(const Twine &msg) const override {
     return reader.emitError(msg);
@@ -998,14 +1053,40 @@ class DialectReader : public DialectBytecodeReader {
   // IR
   //===--------------------------------------------------------------------===//
 
+  /// The maximum depth to eagerly parse nested attributes/types before
+  /// deferring.
+  static constexpr uint64_t maxAttrTypeDepth = 5;
+
   LogicalResult readAttribute(Attribute &result) override {
-    return attrTypeReader.parseAttribute(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Attribute attr = attrTypeReader.getAttributeOrSentinel(index)) {
+        result = attr;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readAttribute(index, result, depth + 1);
   }
   LogicalResult readOptionalAttribute(Attribute &result) override {
     return attrTypeReader.parseOptionalAttribute(reader, result);
   }
   LogicalResult readType(Type &result) override {
-    return attrTypeReader.parseType(reader, result);
+    uint64_t index;
+    if (failed(reader.parseVarInt(index)))
+      return failure();
+    if (depth > maxAttrTypeDepth) {
+      if (Type type = attrTypeReader.getTypeOrSentinel(index)) {
+        result = type;
+        return success();
+      }
+      attrTypeReader.addDeferredParsing(index);
+      return failure();
+    }
+    return attrTypeReader.readType(index, result, depth + 1);
   }
 
   FailureOr<AsmDialectResourceHandle> readResourceHandle() override {
@@ -1095,6 +1176,7 @@ class DialectReader : public DialectBytecodeReader {
   const llvm::StringMap<BytecodeDialect *> &dialectsMap;
   EncodingReader &reader;
   uint64_t &bytecodeVersion;
+  uint64_t depth;
 };
 
 /// Wraps the properties section and handles reading properties out of it.
@@ -1239,68 +1321,110 @@ LogicalResult AttrTypeReader::initialize(
 template <typename T>
 T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                               StringRef entryType) {
+                               StringRef entryType, uint64_t depth) {
   if (index >= entries.size()) {
     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
     return {};
   }
 
-  // If the entry has already been resolved, there is nothing left to do.
-  Entry<T> &entry = entries[index];
-  if (entry.entry)
-    return entry.entry;
+  // Fast path: Try direct parsing without worklist overhead. This handles the
+  // common case where there are no deferred dependencies.
+  assert(deferredWorklist.empty());
+  T result;
+  if (succeeded(readEntry(entries, index, result, entryType, depth))) {
+    assert(deferredWorklist.empty());
+    return result;
+  }
+  if (deferredWorklist.empty()) {
+    // A failure with no deferred entries is a real error.
+    return T();
+  }
 
-  // Parse the entry.
-  EncodingReader reader(entry.data, fileLoc);
+  // Slow path: Use worklist to handle deferred dependencies. Use a deque to
+  // iteratively resolve entries with dependencies.
+  // - Pop from front to process
+  // - Push new dependencies to front (depth-first)
+  // - Move failed entries to back (retry after dependencies)
+  std::deque<uint64_t> worklist;
+  llvm::DenseSet<uint64_t> inWorklist;
 
-  // Parse based on how the entry was encoded.
-  if (entry.hasCustomEncoding) {
-    if (failed(parseCustomEntry(entry, reader, entryType)))
-      return T();
-  } else if (failed(parseAsmEntry(entry.entry, reader, entryType))) {
-    return T();
+  // Add the original index and any dependencies from the fast path attempt.
+  worklist.push_back(index);
+  inWorklist.insert(index);
+  for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+    if (inWorklist.insert(idx).second)
+      worklist.push_front(idx);
   }
 
-  if (!reader.empty()) {
-    reader.emitError("unexpected trailing bytes after " + entryType + " entry");
-    return T();
+  while (!worklist.empty()) {
+    size_t currentIndex = worklist.front();
+    worklist.pop_front();
+
+    // Clear the deferred worklist before parsing to capture any new entries.
+    deferredWorklist.clear();
+
+    T result;
+    if (succeeded(readEntry(entries, currentIndex, result, entryType, depth))) {
+      inWorklist.erase(currentIndex);
+      continue;
+    }
+
+    if (deferredWorklist.empty()) {
+      // Parsing failed with no deferred entries, which implies a real error.
+      return T();
+    }
+
+    // Move this entry to the back to retry after dependencies.
+    worklist.push_back(currentIndex);
+
+    // Add dependencies to the front (in reverse so they maintain order).
+    for (uint64_t idx : llvm::reverse(deferredWorklist)) {
+      if (inWorklist.insert(idx).second)
+        worklist.push_front(idx);
    }
+    deferredWorklist.clear();
   }
 
-  return entry.entry;
+  return entries[index].entry;
 }
 
 template <typename T>
-LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
-                                            StringRef entryType) {
-  StringRef asmStr;
-  if (failed(reader.parseNullTerminatedString(asmStr)))
-    return failure();
+LogicalResult AttrTypeReader::readEntry(SmallVectorImpl<Entry<T>> &entries,
+                                        uint64_t index, T &result,
+                                        StringRef entryType, uint64_t depth) {
+  if (index >= entries.size())
+    return emitError(fileLoc) << "invalid " << entryType << " index: " << index;
 
-  // Invoke the MLIR assembly parser to parse the entry text.
-  size_t numRead = 0;
-  MLIRContext *context = fileLoc->getContext();
-  if constexpr (std::is_same_v<T, Type>)
-    result =
-        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
-  else
-    result = ::parseAttribute(asmStr, context, Type(), &numRead,
-                              /*isKnownNullTerminated=*/true);
-  if (!result)
+  // If the entry has already been resolved, return it.
+  Entry<T> &entry = entries[index];
+  if (entry.entry) {
+    result = entry.entry;
+    return success();
+  }
+
+  // If the entry hasn't been resolved, try to parse it.
+  EncodingReader reader(entry.data, fileLoc);
+  LogicalResult parseResult =
+      entry.hasCustomEncoding
+          ? parseCustomEntry(entry, reader, entryType, index, depth)
+          : parseAsmEntry(entry.entry, reader, entryType);
+  if (failed(parseResult))
     return failure();
 
-  // Ensure there weren't dangling characters after the entry.
-  if (numRead != asmStr.size()) {
-    return reader.emitError("trailing characters found after ", entryType,
-                            " assembly format: ", asmStr.drop_front(numRead));
-  }
+  if (!reader.empty())
+    return reader.emitError("unexpected trailing bytes after " + entryType +
+                            " entry");
+
+  result = entry.entry;
   return success();
 }
 
 template <typename T>
 LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
                                                EncodingReader &reader,
-                                               StringRef entryType) {
+                                               StringRef entryType,
+                                               uint64_t index, uint64_t depth) {
   DialectReader dialectReader(*this, stringReader, resourceReader, dialectsMap,
-                              reader, bytecodeVersion);
+                              reader, bytecodeVersion, depth);
   if (failed(entry.dialect->load(dialectReader, fileLoc.getContext())))
     return failure();
@@ -1350,6 +1474,33 @@ LogicalResult AttrTypeReader::parseCustomEntry(Entry<T> &entry,
   return success(!!entry.entry);
 }
 
+template <typename T>
+LogicalResult AttrTypeReader::parseAsmEntry(T &result, EncodingReader &reader,
+                                            StringRef entryType) {
+  StringRef asmStr;
+  if (failed(reader.parseNullTerminatedString(asmStr)))
+    return failure();
+
+  // Invoke the MLIR assembly parser to parse the entry text.
+  size_t numRead = 0;
+  MLIRContext *context = fileLoc->getContext();
+  if constexpr (std::is_same_v<T, Type>)
+    result =
+        ::parseType(asmStr, context, &numRead, /*isKnownNullTerminated=*/true);
+  else
+    result = ::parseAttribute(asmStr, context, Type(), &numRead,
+                              /*isKnownNullTerminated=*/true);
+  if (!result)
+    return failure();
+
+  // Ensure there weren't dangling characters after the entry.
+  if (numRead != asmStr.size()) {
+    return reader.emitError("trailing characters found after ", entryType,
+                            " assembly format: ", asmStr.drop_front(numRead));
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Bytecode Reader
 //===----------------------------------------------------------------------===//
diff --git a/mlir/unittests/Bytecode/BytecodeTest.cpp b/mlir/unittests/Bytecode/BytecodeTest.cpp
index d7b442f6832d0..30e7ed9b6cb7e 100644
--- a/mlir/unittests/Bytecode/BytecodeTest.cpp
+++ b/mlir/unittests/Bytecode/BytecodeTest.cpp
@@ -15,6 +15,7 @@
 #include "mlir/IR/OwningOpRef.h"
 #include "mlir/Parser/Parser.h"
+#include "mlir/IR/BuiltinOps.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Alignment.h"
 #include "llvm/Support/Endian.h"
@@ -228,3 +229,39 @@ TEST(Bytecode, OpWithoutProperties) {
   EXPECT_TRUE(OperationEquivalence::computeHash(op.get()) ==
               OperationEquivalence::computeHash(roundtripped));
 }
+
+TEST(Bytecode, DeepCallSiteLoc) {
+  MLIRContext context;
+  ParserConfig config(&context);
+
+  // Create a deep CallSiteLoc chain to test iterative parsing.
+  Location baseLoc = FileLineColLoc::get(&context, "test.mlir", 1, 1);
+  Location loc = baseLoc;
+  constexpr int kDepth = 1000;
+  for (int i = 0; i < kDepth; ++i) {
+    loc = CallSiteLoc::get(loc, baseLoc);
+  }
+
+  // Create a simple module with the deep location.
+  Builder builder(&context);
+  OwningOpRef<ModuleOp> module =
+      ModuleOp::create(loc, /*attributes=*/std::nullopt);
+  ASSERT_TRUE(module);
+
+  // Write to bytecode.
+  std::string bytecode;
+  llvm::raw_string_ostream os(bytecode);
+  ASSERT_TRUE(succeeded(writeBytecodeToFile(module.get(), os)));
+
+  // Parse it back using the bytecode reader.
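+  // Reading the module back is what exercises the new deferred path: every
+  // CallSiteLoc nested past the reader's depth cutoff is re-enqueued on the
+  // deferred worklist instead of being parsed recursively, so this
+  // 1000-deep chain must round-trip without overflowing the stack.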
+  std::unique_ptr<Block> block = std::make_unique<Block>();
+  ASSERT_TRUE(succeeded(readBytecodeFile(
+      llvm::MemoryBufferRef(bytecode, "string-buffer"), block.get(), config)));
+
+  // Verify we got the roundtripped module.
+  ASSERT_FALSE(block->empty());
+  Operation *roundTripped = &block->front();
+
+  // Verify the location matches.
+  EXPECT_EQ(module.get()->getLoc(), roundTripped->getLoc());
+}

From cc25ac424a856554edb152c09f8217e95b59a7b6 Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Tue, 9 Dec 2025 12:45:45 -0600
Subject: [PATCH 40/63] [flang][OpenMP] Use DirId() instead of DirName().v,
 NFC (#171484)

---
 flang/lib/Parser/openmp-parsers.cpp         | 2 +-
 flang/lib/Semantics/check-omp-loop.cpp      | 4 ++--
 flang/lib/Semantics/check-omp-structure.cpp | 2 +-
 flang/lib/Semantics/resolve-directives.cpp  | 2 +-
 flang/lib/Semantics/rewrite-parse-tree.cpp  | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp
index 6b3bcd00c5bec..a0e106d70a5fd 100644
--- a/flang/lib/Parser/openmp-parsers.cpp
+++ b/flang/lib/Parser/openmp-parsers.cpp
@@ -1920,7 +1920,7 @@ struct OmpLoopConstructParser {
     auto loopItem{LoopNestParser{} || ompLoopConstruct};
 
     if (auto &&begin{OmpBeginDirectiveParser(dirs_).Parse(state)}) {
-      auto loopDir{begin->DirName().v};
+      auto loopDir{begin->DirId()};
       auto assoc{llvm::omp::getDirectiveAssociation(loopDir)};
       if (assoc == llvm::omp::Association::LoopNest) {
         if (auto &&item{attempt(loopItem).Parse(state)}) {
diff --git a/flang/lib/Semantics/check-omp-loop.cpp b/flang/lib/Semantics/check-omp-loop.cpp
index f25cf7eb33817..726dbe865834d 100644
--- a/flang/lib/Semantics/check-omp-loop.cpp
+++ b/flang/lib/Semantics/check-omp-loop.cpp
@@ -271,7 +271,7 @@ void OmpStructureChecker::CheckNestedBlock(const parser::OpenMPLoopConstruct &x,
     } else if (parser::Unwrap(stmt)) {
       ++nestedCount;
     } else if (auto *omp{parser::Unwrap(stmt)}) {
-      if (!IsLoopTransforming(omp->BeginDir().DirName().v)) {
+      if (!IsLoopTransforming(omp->BeginDir().DirId())) {
         context_.Say(omp->source,
             "Only loop-transforming OpenMP constructs are allowed inside OpenMP loop constructs"_err_en_US);
       }
@@ -324,7 +324,7 @@ void OmpStructureChecker::CheckFullUnroll(
   // since it won't contain a loop.
   if (const parser::OpenMPLoopConstruct *nested{x.GetNestedConstruct()}) {
     auto &nestedSpec{nested->BeginDir()};
-    if (nestedSpec.DirName().v == llvm::omp::Directive::OMPD_unroll) {
+    if (nestedSpec.DirId() == llvm::omp::Directive::OMPD_unroll) {
       bool isPartial{
           llvm::any_of(nestedSpec.Clauses().v, [](const parser::OmpClause &c) {
             return c.Id() == llvm::omp::Clause::OMPC_partial;
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index b6b0f86eb4cce..7776f0d1f21f9 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2800,7 +2800,7 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) {
 void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) {
   const parser::OmpBeginDirective &beginSpec{x.BeginDir()};
   const std::optional<parser::OmpEndDirective> &endSpec{x.EndDir()};
-  PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirName().v);
+  PushContextAndClauseSets(beginSpec.DirName().source, beginSpec.DirId());
 
   const auto &block{std::get<parser::Block>(x.t)};
   CheckNoBranching(
diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp
index 44be5ffd684a2..6211643b08970 100644
--- a/flang/lib/Semantics/resolve-directives.cpp
+++ b/flang/lib/Semantics/resolve-directives.cpp
@@ -2477,7 +2477,7 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPSectionConstruct &x) {
 
 bool OmpAttributeVisitor::Pre(const parser::OpenMPCriticalConstruct &x) {
   const parser::OmpBeginDirective &beginSpec{x.BeginDir()};
-  PushContext(beginSpec.DirName().source, beginSpec.DirName().v);
+  PushContext(beginSpec.DirName().source, beginSpec.DirId());
   GetContext().withinConstruct = true;
   return true;
 }
diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp
index 285eaac1e2c8f..60e3e6ab3f5f1 100644
--- a/flang/lib/Semantics/rewrite-parse-tree.cpp
+++ b/flang/lib/Semantics/rewrite-parse-tree.cpp
@@ -118,7 +118,7 @@ static bool ReturnsDataPointer(const Symbol &symbol) {
 }
 
 static bool LoopConstructIsSIMD(parser::OpenMPLoopConstruct *ompLoop) {
-  return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirName().v);
+  return llvm::omp::allSimdSet.test(ompLoop->BeginDir().DirId());
 }
 
 // Remove non-SIMD OpenMPConstructs once they are parsed.

From 719826d33df18a2d386c008b11521275e9ec1d74 Mon Sep 17 00:00:00 2001
From: nerix
Date: Tue, 9 Dec 2025 19:52:02 +0100
Subject: [PATCH 41/63] [LLDB] Run MSVC STL optional test with PDB (#171486)

Similar to the other PRs, this runs the `std::optional` test with PDB.
Since PDB does not record that the variables' types are typedefs, we
check for the full canonical type name when testing with PDB.
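For context, a hedged sketch of the scenario (illustrative source, not the
actual test program): with DWARF the debugger keeps the typedef names, while
PDB records only the canonical instantiated types, which is why the expected
substrings differ in the diff below.

```cpp
// Hypothetical reduction of the debuggee; typedef names and values are taken
// from the expected output in the test, everything else is illustrative.
#include <optional>
#include <string>
#include <vector>

typedef std::optional<std::vector<int>> optional_int_vect; // DWARF keeps this name
typedef std::optional<std::string> optional_string; // PDB shows the canonical type

int main() {
  optional_int_vect numbers(std::in_place, {1, 2, 3, 4}); // "Value = size=4"
  optional_string ostring("hello");                       // "Value = \"hello\""
  return numbers.has_value() && ostring.has_value() ? 0 : 1;
}
```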
---
 .../optional/TestDataFormatterGenericOptional.py | 14 ++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
index 7bb4f75de4e59..c88e83bb5b1f4 100644
--- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
+++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/optional/TestDataFormatterGenericOptional.py
@@ -5,6 +5,8 @@
 
 
 class GenericOptionalDataFormatterTestCase(TestBase):
+    TEST_WITH_PDB_DEBUG_INFO = True
+
     def do_test_with_run_command(self):
         """Test that file and class static variables display correctly."""
 
@@ -55,7 +57,11 @@ def cleanup():
         self.expect(
             "frame var numbers",
             substrs=[
-                "(optional_int_vect) numbers = Has Value=true {",
+                (
+                    "(std::optional<std::vector<int, std::allocator<int>>>) numbers = Has Value=true {"
+                    if self.getDebugInfo() == "pdb"
+                    else "(optional_int_vect) numbers = Has Value=true {"
+                ),
                 "Value = size=4 {",
                 "[0] = 1",
                 "[1] = 2",
@@ -69,7 +75,11 @@ def cleanup():
         self.expect(
             "frame var ostring",
             substrs=[
-                "(optional_string) ostring = Has Value=true {",
+                (
+                    "(std::optional<std::basic_string<char, std::char_traits<char>, std::allocator<char>>>) ostring = Has Value=true {"
+                    if self.getDebugInfo() == "pdb"
+                    else "(optional_string) ostring = Has Value=true {"
+                ),
                 'Value = "hello"',
                 "}",
             ],

From 03160c186e90463402786b611826a1702420e1c7 Mon Sep 17 00:00:00 2001
From: Folkert de Vries
Date: Tue, 9 Dec 2025 19:52:17 +0100
Subject: [PATCH 42/63] [X86] fix typo: `MCVTTP2SIS` -> `MCVTTP2UIS` (#171229)

This LLVM IR https://godbolt.org/z/5bM1vrMY1

```llvm
define <4 x i32> @masked(<2 x double> %a, <4 x i32> %src, i8 noundef zeroext %mask) unnamed_addr #0 {
  %r = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double> %a, <4 x i32> %src, i8 noundef %mask)
  ret <4 x i32> %r
}

define <4 x i32> @unmasked(<2 x double> %a) unnamed_addr #0 {
  %r = tail call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double> %a, <4 x i32> zeroinitializer, i8 noundef -1)
  ret <4 x i32> %r
}

declare <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128(<2 x double>, <4 x i32>, i8) unnamed_addr

attributes #0 = { mustprogress nofree norecurse nosync nounwind nonlazybind willreturn memory(none) uwtable "probe-stack"="inline-asm" "target-cpu"="x86-64" "target-features"="+avx10.2-512" }
```

produces

```asm
masked: # @masked
        kmovd k1, edi
        vcvttpd2dqs xmm1 {k1}, xmm0
        vmovaps xmm0, xmm1
        ret
unmasked: # @unmasked
        vcvttpd2udqs xmm0, xmm0
        ret
```

So, when a mask is used, somehow the signed version of this instruction is
selected. I suspect this is a typo.
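For readers unfamiliar with the table, each `X86_INTRINSIC_DATA` row pairs the
DAG node used for the unmasked lowering with the node used on the masked path,
so pairing the unsigned base node with the signed masked node makes only the
masked lowering come out signed. A reduced model of the mismatch (struct and
enum names simplified here, not LLVM's actual declarations):

```cpp
// Illustrative only: a toy model of the opcode pairing in X86IntrinsicsInfo.h.
#include <cstdio>

enum Node { CVTTP2UIS, MCVTTP2SIS, MCVTTP2UIS };

struct IntrinsicData {
  const char *name; // intrinsic being lowered
  Node opc0;        // node selected for the unmasked form
  Node opc1;        // node selected when a mask operand is present
};

// The bug: the unmasked path was unsigned but the masked path was signed.
constexpr IntrinsicData broken{"avx10_mask_vcvttpd2udqs_128", CVTTP2UIS, MCVTTP2SIS};
// The fix keeps both paths unsigned, so the masked form emits vcvttpd2udqs.
constexpr IntrinsicData fixed{"avx10_mask_vcvttpd2udqs_128", CVTTP2UIS, MCVTTP2UIS};

int main() {
  std::printf("masked node changed: %d -> %d\n", broken.opc1, fixed.opc1);
}
```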
--- llvm/lib/Target/X86/X86IntrinsicsInfo.h | 2 +- llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 99665b5872fe2..88ade87e1dca8 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -534,7 +534,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx10_mask_vcvttpd2qqs_round_512, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2SIS, X86ISD::CVTTP2SIS_SAE), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_128, CVTPD2DQ_MASK, - X86ISD::CVTTP2UIS, X86ISD::MCVTTP2SIS), + X86ISD::CVTTP2UIS, X86ISD::MCVTTP2UIS), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_256, INTR_TYPE_1OP_MASK, X86ISD::CVTTP2UIS, 0), X86_INTRINSIC_DATA(avx10_mask_vcvttpd2udqs_round_512, INTR_TYPE_1OP_MASK, diff --git a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll index 38d54cff6dc23..00db1fb07c78d 100644 --- a/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll @@ -652,14 +652,14 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128(<2 x double> %x0, <4 x i32 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8] +; X64-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8] ; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] ; ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2dqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc8] +; X86-NEXT: vcvttpd2udqs %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc8] ; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> %src, i8 %mask) @@ -670,13 +670,13 @@ define <4 x i32> @test_int_x86_maskz_vcvtt_pd2udqs_128_z(<2 x double> %x0, i8 %m ; X64-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0] +; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; ; X86-LABEL: test_int_x86_maskz_vcvtt_pd2udqs_128_z: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6d,0xc0] +; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0xfc,0x89,0x6c,0xc0] ; X86-NEXT: retl # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> zeroinitializer, i8 %mask) ret <4 x i32> %res @@ -686,13 +686,13 @@ define <4 x i32> @test_int_x86_mask_vcvtt_pd2udqs_128_undef(<2 x double> %x0, i8 ; X64-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: 
[0x62,0xf5,0xfc,0x09,0x6d,0xc0] +; X64-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0] ; X64-NEXT: retq # encoding: [0xc3] ; ; X86-LABEL: test_int_x86_mask_vcvtt_pd2udqs_128_undef: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vcvttpd2dqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6d,0xc0] +; X86-NEXT: vcvttpd2udqs %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0xfc,0x09,0x6c,0xc0] ; X86-NEXT: retl # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx10.mask.vcvttpd2udqs.128( <2 x double> %x0, <4 x i32> undef, i8 %mask) ret <4 x i32> %res From 5052b6ce1d3a1d1b36333f036525ec8cff6c9a10 Mon Sep 17 00:00:00 2001 From: Anshil Gandhi <95053726+gandhi56@users.noreply.github.com> Date: Tue, 9 Dec 2025 13:59:36 -0500 Subject: [PATCH 43/63] [AMDGPU] Scavenge a VGPR to eliminate a frame index (#166979) If the subtarget supports flat scratch SVS mode and there is no SGPR available to replace a frame index, convert a scratch instruction in SS form into SV form and replace the frame index with a scavenged VGPR. Resolves #155902 Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 32 +- .../flat-scratch-alloca-issue-155902.ll | 469 ++++++++++++++++++ 2 files changed, 498 insertions(+), 3 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index ad3828fba2187..66586e8bc234a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -2983,10 +2983,36 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass, MI, false, 0, !UseSGPR); - // TODO: for flat scratch another attempt can be made with a VGPR index - // if no SGPRs can be scavenged. - if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) + if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR)) { + int SVOpcode = AMDGPU::getFlatScratchInstSVfromSS(MI->getOpcode()); + if (ST.hasFlatScratchSVSMode() && SVOpcode != -1) { + Register TmpVGPR = RS->scavengeRegisterBackwards( + AMDGPU::VGPR_32RegClass, MI, false, 0, /*AllowSpill=*/true); + + // Materialize the frame register. + auto MIB = + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR); + if (FrameReg) + MIB.addReg(FrameReg); + else + MIB.addImm(Offset); + + // Add the offset to the frame register. + if (FrameReg && Offset) + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), FrameReg) + .addReg(FrameReg, RegState::Kill) + .addImm(Offset); + + BuildMI(*MBB, MI, DL, TII->get(SVOpcode)) + .add(MI->getOperand(0)) // $vdata + .addReg(TmpVGPR) // $vaddr + .addImm(0) // Offset + .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::cpol)); + MI->eraseFromParent(); + return true; + } report_fatal_error("Cannot scavenge register in FI elimination!"); + } if (!TmpSReg) { // Use frame register and restore it after. 
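In outline, the hunk above adds one more rung to the frame-index elimination
ladder before the fatal error. A minimal sketch of that control flow under
stated assumptions (`Scavenger`, `lowerFrameIndex`, and their members are
illustrative stand-ins, not the real AMDGPU API):

```cpp
// Toy model of the fallback order in SIRegisterInfo::eliminateFrameIndex.
#include <cstdio>
#include <optional>

struct Scavenger {
  bool sgprAvailable = false;        // no free SGPR: old code hit the fatal error
  bool hasFlatScratchSVSMode = true; // subtarget feature gate
  bool hasSVForm = true;             // an SS -> SV rewrite exists for this opcode
  std::optional<unsigned> scavengeSGPR() {
    return sgprAvailable ? std::optional<unsigned>(100) : std::nullopt;
  }
  unsigned scavengeVGPR(bool /*allowSpill*/) { return 2; } // spill guarantees success
};

unsigned lowerFrameIndex(Scavenger &rs) {
  if (auto sgpr = rs.scavengeSGPR()) // preferred path, unchanged
    return *sgpr;
  if (rs.hasFlatScratchSVSMode && rs.hasSVForm) {
    // New fallback: materialize the frame address in a VGPR (v_mov_b32, plus a
    // v_add_u32 when there is a nonzero offset) and rewrite the scratch
    // instruction from its SS form to the SV form so the VGPR carries it.
    return rs.scavengeVGPR(/*allowSpill=*/true);
  }
  std::printf("Cannot scavenge register in FI elimination!\n");
  return ~0u;
}

int main() {
  Scavenger rs;
  std::printf("address register: v%u\n", lowerFrameIndex(rs));
}
```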
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll new file mode 100644 index 0000000000000..26acb4604cbcb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-alloca-issue-155902.ll @@ -0,0 +1,469 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck %s --check-prefix=GFX950 + +; Ensure we don't crash with: "Cannot scavenge register in FI elimination!" +define amdgpu_kernel void @issue155902(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) { +; GFX950-LABEL: issue155902: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_mov_b32 s33, 0x4008 +; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; GFX950-NEXT: v_writelane_b32 v2, s33, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 +; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 +; GFX950-NEXT: s_load_dwordx2 s[96:97], s[2:3], 0x18 +; GFX950-NEXT: s_load_dwordx2 s[94:95], s[2:3], 0x20 +; GFX950-NEXT: s_load_dwordx2 s[92:93], s[2:3], 0x28 +; GFX950-NEXT: s_load_dwordx2 s[90:91], s[2:3], 0x30 +; GFX950-NEXT: s_load_dwordx2 s[88:89], s[2:3], 0x38 +; GFX950-NEXT: s_load_dwordx2 s[86:87], s[2:3], 0x40 +; GFX950-NEXT: s_load_dwordx2 s[84:85], s[2:3], 0x48 +; GFX950-NEXT: s_load_dwordx2 s[82:83], s[2:3], 0x50 +; GFX950-NEXT: s_load_dwordx2 s[80:81], s[2:3], 0x58 +; GFX950-NEXT: s_load_dwordx2 s[78:79], s[2:3], 0x60 +; GFX950-NEXT: s_load_dwordx2 s[76:77], s[2:3], 0x68 +; GFX950-NEXT: s_load_dwordx2 s[74:75], s[2:3], 0x70 +; GFX950-NEXT: s_load_dwordx2 s[72:73], s[2:3], 0x78 +; GFX950-NEXT: s_load_dwordx2 s[70:71], s[2:3], 0x80 +; GFX950-NEXT: s_load_dwordx2 s[68:69], s[2:3], 0x88 +; GFX950-NEXT: s_load_dwordx2 s[66:67], s[2:3], 0x90 +; GFX950-NEXT: s_load_dwordx2 s[64:65], s[2:3], 0x98 +; GFX950-NEXT: s_load_dwordx2 s[62:63], s[2:3], 0xa0 +; GFX950-NEXT: s_load_dwordx2 s[60:61], s[2:3], 0xa8 +; GFX950-NEXT: s_load_dwordx2 s[58:59], s[2:3], 0xb0 +; GFX950-NEXT: s_load_dwordx2 s[56:57], s[2:3], 0xb8 +; GFX950-NEXT: s_load_dwordx2 s[54:55], s[2:3], 0xc0 +; GFX950-NEXT: s_load_dwordx2 s[52:53], s[2:3], 0xc8 +; GFX950-NEXT: s_load_dwordx2 s[50:51], s[2:3], 0xd0 +; GFX950-NEXT: s_load_dwordx2 s[48:49], s[2:3], 0xd8 +; GFX950-NEXT: s_load_dwordx2 s[46:47], s[2:3], 0xe0 +; GFX950-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0xe8 +; GFX950-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0xf0 +; GFX950-NEXT: s_load_dwordx2 s[40:41], s[2:3], 0xf8 +; GFX950-NEXT: s_load_dwordx2 s[38:39], s[2:3], 0x100 +; GFX950-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x108 +; GFX950-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x110 +; GFX950-NEXT: s_load_dwordx2 s[30:31], s[2:3], 0x118 +; GFX950-NEXT: s_load_dwordx2 s[28:29], s[2:3], 0x120 +; GFX950-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x128 +; GFX950-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0x130 +; 
GFX950-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x138 +; GFX950-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x140 +; GFX950-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x148 +; GFX950-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x150 +; GFX950-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x158 +; GFX950-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x160 +; GFX950-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x168 +; GFX950-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x170 +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x178 +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: v_mov_b32_e32 v3, 0x4008 +; GFX950-NEXT: buffer_wbl2 sc0 sc1 +; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX950-NEXT: scratch_store_dwordx2 v3, v[0:1], off sc0 sc1 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s33 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_readlane_b32 s0, v2, 0 +; GFX950-NEXT: s_nop 4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[98:99] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[96:97] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[94:95] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[92:93] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[90:91] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[88:89] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[86:87] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[84:85] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[82:83] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[80:81] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[78:79] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[76:77] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[74:75] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[72:73] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[70:71] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[68:69] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[66:67] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[64:65] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[62:63] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[60:61] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[58:59] 
+; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[56:57] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[54:55] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[52:53] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[50:51] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[48:49] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[46:47] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[44:45] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[42:43] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[40:41] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[38:39] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[34:35] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[30:31] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[28:29] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[26:27] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[22:23] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[14:15] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[10:11] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: s_endpgm +bb: + %alloca.big = alloca [4096 x i32], align 4, addrspace(5) + %alloca304 = alloca [2 x i64], align 8, addrspace(5) + %alloca307 = alloca i64, align 8, addrspace(5) + store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8 + store i64 900, ptr addrspace(5) %alloca307, align 8 + store i64 %arg, ptr addrspace(5) %alloca307, align 8 + store i64 %arg1, ptr addrspace(5) %alloca307, align 8 + store i64 %arg2, ptr addrspace(5) %alloca307, align 8 + store i64 
%arg3, ptr addrspace(5) %alloca307, align 8 + store i64 %arg4, ptr addrspace(5) %alloca307, align 8 + store i64 %arg5, ptr addrspace(5) %alloca307, align 8 + store i64 %arg6, ptr addrspace(5) %alloca307, align 8 + store i64 %arg7, ptr addrspace(5) %alloca307, align 8 + store i64 %arg8, ptr addrspace(5) %alloca307, align 8 + store i64 %arg9, ptr addrspace(5) %alloca307, align 8 + store i64 %arg10, ptr addrspace(5) %alloca307, align 8 + store i64 %arg11, ptr addrspace(5) %alloca307, align 8 + store i64 %arg12, ptr addrspace(5) %alloca307, align 8 + store i64 %arg13, ptr addrspace(5) %alloca307, align 8 + store i64 %arg14, ptr addrspace(5) %alloca307, align 8 + store i64 %arg15, ptr addrspace(5) %alloca307, align 8 + store i64 %arg16, ptr addrspace(5) %alloca307, align 8 + store i64 %arg17, ptr addrspace(5) %alloca307, align 8 + store i64 %arg18, ptr addrspace(5) %alloca307, align 8 + store i64 %arg19, ptr addrspace(5) %alloca307, align 8 + store i64 %arg20, ptr addrspace(5) %alloca307, align 8 + store i64 %arg21, ptr addrspace(5) %alloca307, align 8 + store i64 %arg22, ptr addrspace(5) %alloca307, align 8 + store i64 %arg23, ptr addrspace(5) %alloca307, align 8 + store i64 %arg24, ptr addrspace(5) %alloca307, align 8 + store i64 %arg25, ptr addrspace(5) %alloca307, align 8 + store i64 %arg26, ptr addrspace(5) %alloca307, align 8 + store i64 %arg27, ptr addrspace(5) %alloca307, align 8 + store i64 %arg28, ptr addrspace(5) %alloca307, align 8 + store i64 %arg29, ptr addrspace(5) %alloca307, align 8 + store i64 %arg30, ptr addrspace(5) %alloca307, align 8 + store i64 %arg31, ptr addrspace(5) %alloca307, align 8 + store i64 %arg32, ptr addrspace(5) %alloca307, align 8 + store i64 %arg33, ptr addrspace(5) %alloca307, align 8 + store i64 %arg34, ptr addrspace(5) %alloca307, align 8 + store i64 %arg35, ptr addrspace(5) %alloca307, align 8 + store i64 %arg36, ptr addrspace(5) %alloca307, align 8 + store i64 %arg37, ptr addrspace(5) %alloca307, align 8 + store i64 %arg38, ptr addrspace(5) %alloca307, align 8 + store i64 %arg39, ptr addrspace(5) %alloca307, align 8 + store i64 %arg40, ptr addrspace(5) %alloca307, align 8 + store i64 %arg41, ptr addrspace(5) %alloca307, align 8 + store i64 %arg42, ptr addrspace(5) %alloca307, align 8 + store i64 %arg43, ptr addrspace(5) %alloca307, align 8 + store i64 %arg44, ptr addrspace(5) %alloca307, align 8 + store i64 %arg45, ptr addrspace(5) %alloca307, align 8 + store i64 %arg46, ptr addrspace(5) %alloca307, align 8 + store i64 %arg47, ptr addrspace(5) %alloca307, align 8 + store i64 %arg48, ptr addrspace(5) %alloca307, align 8 + store i64 %arg49, ptr addrspace(5) %alloca307, align 8 + ret void +} + +define amdgpu_kernel void @issue155902_fp(i64 %arg, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15, i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23, i64 %arg24, i64 %arg25, i64 %arg26, i64 %arg27, i64 %arg28, i64 %arg29, i64 %arg30, i64 %arg31, i64 %arg32, i64 %arg33, i64 %arg34, i64 %arg35, i64 %arg36, i64 %arg37, i64 %arg38, i64 %arg39, i64 %arg40, i64 %arg41, i64 %arg42, i64 %arg43, i64 %arg44, i64 %arg45, i64 %arg46, i64 %arg47, i64 %arg48, i64 %arg49) #0 { +; GFX950-LABEL: issue155902_fp: +; GFX950: ; %bb.0: ; %bb +; GFX950-NEXT: s_mov_b32 s33, 0 +; GFX950-NEXT: s_add_i32 s1, s33, 0x4008 +; GFX950-NEXT: s_mov_b32 s0, s1 +; GFX950-NEXT: ; implicit-def: $vgpr2 : SGPR spill to VGPR lane +; 
GFX950-NEXT: v_writelane_b32 v2, s0, 0 +; GFX950-NEXT: s_mov_b64 s[2:3], s[4:5] +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX950-NEXT: s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_writelane_b32 v2, s4, 1 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_writelane_b32 v2, s5, 2 +; GFX950-NEXT: s_load_dwordx2 vcc, s[2:3], 0x8 +; GFX950-NEXT: s_load_dwordx2 s[98:99], s[2:3], 0x10 +; GFX950-NEXT: s_load_dwordx2 s[96:97], s[2:3], 0x18 +; GFX950-NEXT: s_load_dwordx2 s[94:95], s[2:3], 0x20 +; GFX950-NEXT: s_load_dwordx2 s[92:93], s[2:3], 0x28 +; GFX950-NEXT: s_load_dwordx2 s[90:91], s[2:3], 0x30 +; GFX950-NEXT: s_load_dwordx2 s[88:89], s[2:3], 0x38 +; GFX950-NEXT: s_load_dwordx2 s[86:87], s[2:3], 0x40 +; GFX950-NEXT: s_load_dwordx2 s[84:85], s[2:3], 0x48 +; GFX950-NEXT: s_load_dwordx2 s[82:83], s[2:3], 0x50 +; GFX950-NEXT: s_load_dwordx2 s[80:81], s[2:3], 0x58 +; GFX950-NEXT: s_load_dwordx2 s[78:79], s[2:3], 0x60 +; GFX950-NEXT: s_load_dwordx2 s[76:77], s[2:3], 0x68 +; GFX950-NEXT: s_load_dwordx2 s[74:75], s[2:3], 0x70 +; GFX950-NEXT: s_load_dwordx2 s[72:73], s[2:3], 0x78 +; GFX950-NEXT: s_load_dwordx2 s[70:71], s[2:3], 0x80 +; GFX950-NEXT: s_load_dwordx2 s[68:69], s[2:3], 0x88 +; GFX950-NEXT: s_load_dwordx2 s[66:67], s[2:3], 0x90 +; GFX950-NEXT: s_load_dwordx2 s[64:65], s[2:3], 0x98 +; GFX950-NEXT: s_load_dwordx2 s[62:63], s[2:3], 0xa0 +; GFX950-NEXT: s_load_dwordx2 s[60:61], s[2:3], 0xa8 +; GFX950-NEXT: s_load_dwordx2 s[58:59], s[2:3], 0xb0 +; GFX950-NEXT: s_load_dwordx2 s[56:57], s[2:3], 0xb8 +; GFX950-NEXT: s_load_dwordx2 s[54:55], s[2:3], 0xc0 +; GFX950-NEXT: s_load_dwordx2 s[52:53], s[2:3], 0xc8 +; GFX950-NEXT: s_load_dwordx2 s[50:51], s[2:3], 0xd0 +; GFX950-NEXT: s_load_dwordx2 s[48:49], s[2:3], 0xd8 +; GFX950-NEXT: s_load_dwordx2 s[46:47], s[2:3], 0xe0 +; GFX950-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0xe8 +; GFX950-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0xf0 +; GFX950-NEXT: s_load_dwordx2 s[40:41], s[2:3], 0xf8 +; GFX950-NEXT: s_load_dwordx2 s[38:39], s[2:3], 0x100 +; GFX950-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x108 +; GFX950-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x110 +; GFX950-NEXT: s_load_dwordx2 s[30:31], s[2:3], 0x118 +; GFX950-NEXT: s_load_dwordx2 s[28:29], s[2:3], 0x120 +; GFX950-NEXT: s_load_dwordx2 s[26:27], s[2:3], 0x128 +; GFX950-NEXT: s_load_dwordx2 s[24:25], s[2:3], 0x130 +; GFX950-NEXT: s_load_dwordx2 s[22:23], s[2:3], 0x138 +; GFX950-NEXT: s_load_dwordx2 s[20:21], s[2:3], 0x140 +; GFX950-NEXT: s_load_dwordx2 s[18:19], s[2:3], 0x148 +; GFX950-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x150 +; GFX950-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x158 +; GFX950-NEXT: s_load_dwordx2 s[12:13], s[2:3], 0x160 +; GFX950-NEXT: s_load_dwordx2 s[10:11], s[2:3], 0x168 +; GFX950-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x170 +; GFX950-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x178 +; GFX950-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x180 +; GFX950-NEXT: s_nop 0 +; GFX950-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x188 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0 +; GFX950-NEXT: s_add_i32 s1, s33, 0x4008 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s1 offset:8 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], 0x384 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_readlane_b32 s0, v2, 1 +; GFX950-NEXT: v_readlane_b32 s1, v2, 2 +; GFX950-NEXT: s_nop 1 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[0:1] +; GFX950-NEXT: v_readlane_b32 s0, v2, 0 +; GFX950-NEXT: s_nop 4 +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX950-NEXT: v_mov_b64_e32 v[0:1], vcc +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[98:99] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[96:97] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[94:95] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[92:93] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[90:91] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[88:89] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[86:87] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[84:85] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[82:83] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[80:81] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[78:79] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[76:77] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[74:75] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[72:73] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[70:71] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[68:69] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[66:67] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[64:65] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[62:63] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[60:61] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[58:59] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[56:57] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[54:55] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[52:53] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[50:51] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[48:49] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[46:47] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[44:45] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[42:43] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[40:41] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[38:39] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], 
s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[36:37] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[34:35] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[30:31] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[28:29] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[26:27] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[24:25] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[22:23] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[20:21] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[18:19] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[16:17] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[14:15] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[12:13] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[10:11] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[6:7] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[4:5] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: v_mov_b64_e32 v[0:1], s[2:3] +; GFX950-NEXT: scratch_store_dwordx2 off, v[0:1], s0 offset:16 +; GFX950-NEXT: s_endpgm +bb: + %alloca.big = alloca [4096 x i32], align 4, addrspace(5) + %alloca304 = alloca [2 x i64], align 8, addrspace(5) + %alloca307 = alloca i64, align 8, addrspace(5) + store [2 x i64] zeroinitializer, ptr addrspace(5) %alloca304, align 8 + store i64 900, ptr addrspace(5) %alloca307, align 8 + store i64 %arg, ptr addrspace(5) %alloca307, align 8 + store i64 %arg1, ptr addrspace(5) %alloca307, align 8 + store i64 %arg2, ptr addrspace(5) %alloca307, align 8 + store i64 %arg3, ptr addrspace(5) %alloca307, align 8 + store i64 %arg4, ptr addrspace(5) %alloca307, align 8 + store i64 %arg5, ptr addrspace(5) %alloca307, align 8 + store i64 %arg6, ptr addrspace(5) %alloca307, align 8 + store i64 %arg7, ptr addrspace(5) %alloca307, align 8 + store i64 %arg8, ptr addrspace(5) %alloca307, align 8 + store i64 %arg9, ptr addrspace(5) %alloca307, align 8 + store i64 %arg10, ptr addrspace(5) %alloca307, align 8 + store i64 %arg11, ptr addrspace(5) %alloca307, align 8 + store i64 %arg12, ptr addrspace(5) %alloca307, align 8 + store i64 %arg13, ptr addrspace(5) %alloca307, align 8 + store i64 %arg14, ptr addrspace(5) %alloca307, align 8 + store i64 %arg15, ptr addrspace(5) %alloca307, align 8 + store i64 %arg16, ptr addrspace(5) %alloca307, align 8 + store i64 %arg17, ptr addrspace(5) %alloca307, align 8 + store i64 %arg18, ptr addrspace(5) %alloca307, align 8 + store i64 %arg19, ptr addrspace(5) %alloca307, align 8 + store i64 %arg20, ptr addrspace(5) %alloca307, align 8 + store i64 %arg21, ptr addrspace(5) %alloca307, align 8 + store i64 %arg22, ptr addrspace(5) %alloca307, align 8 + store i64 %arg23, ptr addrspace(5) 
%alloca307, align 8 + store i64 %arg24, ptr addrspace(5) %alloca307, align 8 + store i64 %arg25, ptr addrspace(5) %alloca307, align 8 + store i64 %arg26, ptr addrspace(5) %alloca307, align 8 + store i64 %arg27, ptr addrspace(5) %alloca307, align 8 + store i64 %arg28, ptr addrspace(5) %alloca307, align 8 + store i64 %arg29, ptr addrspace(5) %alloca307, align 8 + store i64 %arg30, ptr addrspace(5) %alloca307, align 8 + store i64 %arg31, ptr addrspace(5) %alloca307, align 8 + store i64 %arg32, ptr addrspace(5) %alloca307, align 8 + store i64 %arg33, ptr addrspace(5) %alloca307, align 8 + store i64 %arg34, ptr addrspace(5) %alloca307, align 8 + store i64 %arg35, ptr addrspace(5) %alloca307, align 8 + store i64 %arg36, ptr addrspace(5) %alloca307, align 8 + store i64 %arg37, ptr addrspace(5) %alloca307, align 8 + store i64 %arg38, ptr addrspace(5) %alloca307, align 8 + store i64 %arg39, ptr addrspace(5) %alloca307, align 8 + store i64 %arg40, ptr addrspace(5) %alloca307, align 8 + store i64 %arg41, ptr addrspace(5) %alloca307, align 8 + store i64 %arg42, ptr addrspace(5) %alloca307, align 8 + store i64 %arg43, ptr addrspace(5) %alloca307, align 8 + store i64 %arg44, ptr addrspace(5) %alloca307, align 8 + store i64 %arg45, ptr addrspace(5) %alloca307, align 8 + store i64 %arg46, ptr addrspace(5) %alloca307, align 8 + store i64 %arg47, ptr addrspace(5) %alloca307, align 8 + store i64 %arg48, ptr addrspace(5) %alloca307, align 8 + store i64 %arg49, ptr addrspace(5) %alloca307, align 8 + ret void +} + +attributes #0 = { "frame-pointer"="all" } From 24117f75ad9d7bbb439e8e1bd596fdcf0aa8d6e2 Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Tue, 9 Dec 2025 11:50:20 -0800 Subject: [PATCH 44/63] [clang-doc] Replace HTML generation with Mustache backend (#170199) Removes the legacy HTML backend and replaces it with the Mustache backend. 
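User-visible change: the separate `mustache` value of `--format` is removed, and `--format=html` now renders through the Mustache templates. Illustrative invocations below (other arguments elided; the flag and its values are taken from the `--format` option table this patch edits in ClangDocMain.cpp):

```
# Before this patch, Mustache-based HTML was opt-in:
clang-doc --format=mustache ...

# After this patch, the html format produces the Mustache-based output:
clang-doc --format=html ...
```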
--- clang-tools-extra/clang-doc/CMakeLists.txt | 1 - clang-tools-extra/clang-doc/Generators.cpp | 2 - clang-tools-extra/clang-doc/Generators.h | 1 - clang-tools-extra/clang-doc/HTMLGenerator.cpp | 1212 ++--------------- .../clang-doc/HTMLMustacheGenerator.cpp | 179 --- clang-tools-extra/clang-doc/support/Utils.cpp | 3 +- clang-tools-extra/clang-doc/support/Utils.h | 4 +- .../clang-doc/tool/ClangDocMain.cpp | 99 +- .../test/clang-doc/DR-131697.cpp | 1 - clang-tools-extra/test/clang-doc/assets.cpp | 16 - .../clang-doc/basic-project.mustache.test | 2 +- .../test/clang-doc/basic-project.test | 267 ---- .../test/clang-doc/comments-in-macros.cpp | 17 +- .../test/clang-doc/conversion_function.cpp | 7 +- clang-tools-extra/test/clang-doc/enum.cpp | 306 ++--- .../test/clang-doc/long-name.cpp | 2 +- .../test/clang-doc/mustache-index.cpp | 2 +- .../clang-doc/mustache-separate-namespace.cpp | 2 +- .../test/clang-doc/namespace.cpp | 327 ++--- .../test/clang-doc/test-path-abs.cpp | 7 - .../unittests/clang-doc/CMakeLists.txt | 1 - .../unittests/clang-doc/HTMLGeneratorTest.cpp | 467 +------ .../clang-doc/HTMLMustacheGeneratorTest.cpp | 66 - 23 files changed, 378 insertions(+), 2613 deletions(-) delete mode 100644 clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp delete mode 100644 clang-tools-extra/test/clang-doc/test-path-abs.cpp delete mode 100644 clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt index 5989e5fe60cf3..7a375d7cd0524 100644 --- a/clang-tools-extra/clang-doc/CMakeLists.txt +++ b/clang-tools-extra/clang-doc/CMakeLists.txt @@ -16,7 +16,6 @@ add_clang_library(clangDoc STATIC Representation.cpp Serialize.cpp YAMLGenerator.cpp - HTMLMustacheGenerator.cpp JSONGenerator.cpp DEPENDS diff --git a/clang-tools-extra/clang-doc/Generators.cpp b/clang-tools-extra/clang-doc/Generators.cpp index ba46609d0e7fc..d6c1cc948ce30 100644 --- a/clang-tools-extra/clang-doc/Generators.cpp +++ b/clang-tools-extra/clang-doc/Generators.cpp @@ -243,8 +243,6 @@ void Generator::addInfoToIndex(Index &Idx, const doc::Info *Info) { [[maybe_unused]] static int YAMLGeneratorAnchorDest = YAMLGeneratorAnchorSource; [[maybe_unused]] static int MDGeneratorAnchorDest = MDGeneratorAnchorSource; [[maybe_unused]] static int HTMLGeneratorAnchorDest = HTMLGeneratorAnchorSource; -[[maybe_unused]] static int MHTMLGeneratorAnchorDest = - MHTMLGeneratorAnchorSource; [[maybe_unused]] static int JSONGeneratorAnchorDest = JSONGeneratorAnchorSource; } // namespace doc } // namespace clang diff --git a/clang-tools-extra/clang-doc/Generators.h b/clang-tools-extra/clang-doc/Generators.h index 847722646b029..a50f1ac25eda9 100644 --- a/clang-tools-extra/clang-doc/Generators.h +++ b/clang-tools-extra/clang-doc/Generators.h @@ -137,7 +137,6 @@ struct MustacheGenerator : public Generator { extern volatile int YAMLGeneratorAnchorSource; extern volatile int MDGeneratorAnchorSource; extern volatile int HTMLGeneratorAnchorSource; -extern volatile int MHTMLGeneratorAnchorSource; extern volatile int JSONGeneratorAnchorSource; } // namespace doc diff --git a/clang-tools-extra/clang-doc/HTMLGenerator.cpp b/clang-tools-extra/clang-doc/HTMLGenerator.cpp index 7c8c16b8e8aca..6f58c3d00fa28 100644 --- a/clang-tools-extra/clang-doc/HTMLGenerator.cpp +++ b/clang-tools-extra/clang-doc/HTMLGenerator.cpp @@ -5,1145 +5,169 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
 //===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the implementation of the HTMLGenerator class,
+/// which is a Clang-Doc generator for HTML using Mustache templates.
+///
+//===----------------------------------------------------------------------===//
 
 #include "Generators.h"
 #include "Representation.h"
 #include "support/File.h"
-#include "clang/Basic/Version.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/JSON.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-#include <optional>
-#include <string>
 
 using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
 
 namespace clang {
 namespace doc {
-namespace {
-
-class HTMLTag {
-public:
-  // Any other tag can be added if required
-  enum TagType {
-    TAG_A,
-    TAG_DIV,
-    TAG_FOOTER,
-    TAG_H1,
-    TAG_H2,
-    TAG_H3,
-    TAG_HEADER,
-    TAG_LI,
-    TAG_LINK,
-    TAG_MAIN,
-    TAG_META,
-    TAG_OL,
-    TAG_P,
-    TAG_SCRIPT,
-    TAG_SPAN,
-    TAG_TITLE,
-    TAG_UL,
-    TAG_TABLE,
-    TAG_THEAD,
-    TAG_TBODY,
-    TAG_TR,
-    TAG_TD,
-    TAG_TH
-  };
-
-  HTMLTag() = default;
-  constexpr HTMLTag(TagType Value) : Value(Value) {}
-
-  operator TagType() const { return Value; }
-  operator bool() = delete;
-
-  bool isSelfClosing() const;
-  StringRef toString() const;
-
-private:
-  TagType Value;
-};
-
-enum NodeType {
-  NODE_TEXT,
-  NODE_TAG,
-};
-
-struct HTMLNode {
-  HTMLNode(NodeType Type) : Type(Type) {}
-  virtual ~HTMLNode() = default;
-
-  virtual void render(llvm::raw_ostream &OS, int IndentationLevel) = 0;
-  NodeType Type; // Type of node
-};
-
-struct TextNode : public HTMLNode {
-  TextNode(const Twine &Text)
-      : HTMLNode(NodeType::NODE_TEXT), Text(Text.str()) {}
-
-  std::string Text; // Content of node
-  void render(llvm::raw_ostream &OS, int IndentationLevel) override;
-};
-
-struct TagNode : public HTMLNode {
-  TagNode(HTMLTag Tag) : HTMLNode(NodeType::NODE_TAG), Tag(Tag) {}
-  TagNode(HTMLTag Tag, const Twine &Text) : TagNode(Tag) {
-    Children.emplace_back(std::make_unique<TextNode>(Text.str()));
-  }
-
-  HTMLTag Tag; // Name of HTML Tag (p, div, h1)
-  std::vector<std::unique_ptr<HTMLNode>> Children; // List of child nodes
-  std::vector<std::pair<std::string, std::string>>
-      Attributes; // List of key-value attributes for tag
-
-  void render(llvm::raw_ostream &OS, int IndentationLevel) override;
-};
-
-struct HTMLFile {
-  std::vector<std::unique_ptr<HTMLNode>> Children; // List of child nodes
-  void render(llvm::raw_ostream &OS) {
-    OS << "<!DOCTYPE html>\n";
-    for (const auto &C : Children) {
-      C->render(OS, 0);
-      OS << "\n";
-    }
-  }
-};
-
-} // namespace
-
-bool HTMLTag::isSelfClosing() const {
-  switch (Value) {
-  case HTMLTag::TAG_META:
-  case HTMLTag::TAG_LINK:
-    return true;
-  case HTMLTag::TAG_A:
-  case HTMLTag::TAG_DIV:
-  case HTMLTag::TAG_FOOTER:
-  case HTMLTag::TAG_H1:
-  case HTMLTag::TAG_H2:
-  case HTMLTag::TAG_H3:
-  case HTMLTag::TAG_HEADER:
-  case HTMLTag::TAG_LI:
-  case HTMLTag::TAG_MAIN:
-  case HTMLTag::TAG_OL:
-  case HTMLTag::TAG_P:
-  case HTMLTag::TAG_SCRIPT:
-  case HTMLTag::TAG_SPAN:
-  case HTMLTag::TAG_TITLE:
-  case HTMLTag::TAG_UL:
-  case HTMLTag::TAG_TABLE:
-  case HTMLTag::TAG_THEAD:
-  case HTMLTag::TAG_TBODY:
-  case HTMLTag::TAG_TR:
-  case HTMLTag::TAG_TD:
-  case HTMLTag::TAG_TH:
-    return false;
-  }
-  llvm_unreachable("Unhandled HTMLTag::TagType");
-}
-
-StringRef HTMLTag::toString() const {
-  switch (Value) {
-  case HTMLTag::TAG_A:
-    return "a";
-  case HTMLTag::TAG_DIV:
-    return "div";
-  case HTMLTag::TAG_FOOTER:
-    return "footer";
-  case HTMLTag::TAG_H1:
-    return "h1";
-  case HTMLTag::TAG_H2:
-    return "h2";
-  case HTMLTag::TAG_H3:
-    return "h3";
-  case HTMLTag::TAG_HEADER:
-    return "header";
-  case HTMLTag::TAG_LI:
-    return "li";
-  case HTMLTag::TAG_LINK:
-    return "link";
-  case HTMLTag::TAG_MAIN:
-    return "main";
-  case HTMLTag::TAG_META:
-    return "meta";
-  case HTMLTag::TAG_OL:
-    return "ol";
-  case HTMLTag::TAG_P:
-    return "p";
-  case HTMLTag::TAG_SCRIPT:
-    return "script";
-  case HTMLTag::TAG_SPAN:
-    return "span";
-  case HTMLTag::TAG_TITLE:
-    return "title";
-  case HTMLTag::TAG_UL:
-    return "ul";
-  case HTMLTag::TAG_TABLE:
-    return "table";
-  case HTMLTag::TAG_THEAD:
-    return "thead";
-  case HTMLTag::TAG_TBODY:
-    return "tbody";
-  case HTMLTag::TAG_TR:
-    return "tr";
-  case HTMLTag::TAG_TD:
-    return "td";
-  case HTMLTag::TAG_TH:
-    return "th";
-  }
-  llvm_unreachable("Unhandled HTMLTag::TagType");
-}
-
-void TextNode::render(llvm::raw_ostream &OS, int IndentationLevel) {
-  OS.indent(IndentationLevel * 2);
-  printHTMLEscaped(Text, OS);
-}
-
-void TagNode::render(llvm::raw_ostream &OS, int IndentationLevel) {
-  // Children nodes are rendered in the same line if all of them are text nodes
-  bool InlineChildren = true;
-  for (const auto &C : Children)
-    if (C->Type == NodeType::NODE_TAG) {
-      InlineChildren = false;
-      break;
-    }
-  OS.indent(IndentationLevel * 2);
-  OS << "<" << Tag.toString();
-  for (const auto &A : Attributes)
-    OS << " " << A.first << "=\"" << A.second << "\"";
-  if (Tag.isSelfClosing()) {
-    OS << "/>";
-    return;
-  }
-  OS << ">";
-  if (!InlineChildren)
-    OS << "\n";
-  bool NewLineRendered = true;
-  for (const auto &C : Children) {
-    int ChildrenIndentation =
-        InlineChildren || !NewLineRendered ? 0 : IndentationLevel + 1;
-    C->render(OS, ChildrenIndentation);
-    if (!InlineChildren && (C == Children.back() ||
-                            (C->Type != NodeType::NODE_TEXT ||
-                             (&C + 1)->get()->Type != NodeType::NODE_TEXT))) {
-      OS << "\n";
-      NewLineRendered = true;
-    } else
-      NewLineRendered = false;
-  }
-  if (!InlineChildren)
-    OS.indent(IndentationLevel * 2);
-  OS << "</" << Tag.toString() << ">";
-}
-
-template <typename Derived, typename Base,
-          typename = std::enable_if<std::is_base_of<Base, Derived>::value>>
-static void appendVector(std::vector<Derived> &&New,
-                         std::vector<Base> &Original) {
-  std::move(New.begin(), New.end(), std::back_inserter(Original));
-}
-
-// HTML generation
-
-static std::vector<std::unique_ptr<TagNode>>
-genStylesheetsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    auto LinkNode = std::make_unique<TagNode>(HTMLTag::TAG_LINK);
-    LinkNode->Attributes.emplace_back("rel", "stylesheet");
-    SmallString<128> StylesheetPath = computeRelativePath("", InfoPath);
-    llvm::sys::path::append(StylesheetPath,
-                            llvm::sys::path::filename(FilePath));
-    // Paths in HTML must be in posix-style
-    llvm::sys::path::native(StylesheetPath, llvm::sys::path::Style::posix);
-    LinkNode->Attributes.emplace_back("href", std::string(StylesheetPath));
-    Out.emplace_back(std::move(LinkNode));
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genJsScriptsHTML(StringRef InfoPath, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-
-  // index_json.js is part of every generated HTML file
-  SmallString<128> IndexJSONPath = computeRelativePath("", InfoPath);
-  auto IndexJSONNode = std::make_unique<TagNode>(HTMLTag::TAG_SCRIPT);
-  llvm::sys::path::append(IndexJSONPath, "index_json.js");
-  llvm::sys::path::native(IndexJSONPath, llvm::sys::path::Style::posix);
-  IndexJSONNode->Attributes.emplace_back("src", std::string(IndexJSONPath));
-  Out.emplace_back(std::move(IndexJSONNode));
-
-  for (const auto &FilePath : CDCtx.JsScripts) {
-    SmallString<128> ScriptPath = computeRelativePath("", InfoPath);
-    auto ScriptNode = std::make_unique<TagNode>(HTMLTag::TAG_SCRIPT);
-    llvm::sys::path::append(ScriptPath, llvm::sys::path::filename(FilePath));
-    // Paths in HTML must be in posix-style
-    llvm::sys::path::native(ScriptPath, llvm::sys::path::Style::posix);
-    ScriptNode->Attributes.emplace_back("src", std::string(ScriptPath));
-    Out.emplace_back(std::move(ScriptNode));
-  }
-  return Out;
-}
-
-static std::unique_ptr<TagNode> genLink(const Twine &Text, const Twine &Link) {
-  auto LinkNode = std::make_unique<TagNode>(HTMLTag::TAG_A, Text);
-  LinkNode->Attributes.emplace_back("href", Link.str());
-  return LinkNode;
-}
-
-static std::unique_ptr<HTMLNode>
-genReference(const Reference &Type, StringRef CurrentDirectory,
-             std::optional<StringRef> JumpToSection = std::nullopt) {
-  if (Type.Path.empty()) {
-    if (!JumpToSection)
-      return std::make_unique<TextNode>(Type.Name);
-    return genLink(Type.Name, "#" + *JumpToSection);
-  }
-  llvm::SmallString<64> Path = Type.getRelativeFilePath(CurrentDirectory);
-  llvm::sys::path::append(Path, Type.getFileBaseName() + ".html");
-
-  // Paths in HTML must be in posix-style
-  llvm::sys::path::native(Path, llvm::sys::path::Style::posix);
-  if (JumpToSection)
-    Path += ("#" + *JumpToSection).str();
-  return genLink(Type.Name, Path);
-}
-
-static std::vector<std::unique_ptr<HTMLNode>>
-genReferenceList(const llvm::SmallVectorImpl<Reference> &Refs,
-                 const StringRef &CurrentDirectory) {
-  std::vector<std::unique_ptr<HTMLNode>> Out;
-  for (const auto &R : Refs) {
-    if (&R != Refs.begin())
-      Out.emplace_back(std::make_unique<TextNode>(", "));
-    Out.emplace_back(genReference(R, CurrentDirectory));
-  }
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const EnumInfo &I, const ClangDocContext &CDCtx);
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx,
-        StringRef ParentInfoDir);
-static std::unique_ptr<TagNode> genHTML(const std::vector<CommentInfo> &C);
-
-static std::vector<std::unique_ptr<TagNode>>
-genEnumsBlock(const std::vector<EnumInfo> &Enums,
-              const ClangDocContext &CDCtx) {
-  if (Enums.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, "Enums"));
-  Out.back()->Attributes.emplace_back("id", "Enums");
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_DIV));
-  auto &DivBody = Out.back();
-  for (const auto &E : Enums) {
-    std::vector<std::unique_ptr<TagNode>> Nodes = genHTML(E, CDCtx);
-    appendVector(std::move(Nodes), DivBody->Children);
-  }
-  return Out;
-}
-
-static std::unique_ptr<TagNode>
-genEnumMembersBlock(const llvm::SmallVector<EnumValueInfo, 4> &Members) {
-  if (Members.empty())
-    return nullptr;
-
-  auto List = std::make_unique<TagNode>(HTMLTag::TAG_TBODY);
-
-  for (const auto &M : Members) {
-    auto TRNode = std::make_unique<TagNode>(HTMLTag::TAG_TR);
-    TRNode->Children.emplace_back(
-        std::make_unique<TagNode>(HTMLTag::TAG_TD, M.Name));
-    // Use user supplied value if it exists, otherwise use the value
-    if (!M.ValueExpr.empty()) {
-      TRNode->Children.emplace_back(
-          std::make_unique<TagNode>(HTMLTag::TAG_TD, M.ValueExpr));
-    } else {
-      TRNode->Children.emplace_back(
-          std::make_unique<TagNode>(HTMLTag::TAG_TD, M.Value));
-    }
-    if (!M.Description.empty()) {
-      auto TD = std::make_unique<TagNode>(HTMLTag::TAG_TD);
-      TD->Children.emplace_back(genHTML(M.Description));
-      TRNode->Children.emplace_back(std::move(TD));
-    }
-    List->Children.emplace_back(std::move(TRNode));
-  }
-  return List;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genFunctionsBlock(const std::vector<FunctionInfo> &Functions,
-                  const ClangDocContext &CDCtx, StringRef ParentInfoDir) {
-  if (Functions.empty())
-    return {};
-
-  std::vector<std::unique_ptr<TagNode>> Out;
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H2, "Functions"));
-  Out.back()->Attributes.emplace_back("id", "Functions");
"Functions"); - Out.emplace_back(std::make_unique(HTMLTag::TAG_DIV)); - auto &DivBody = Out.back(); - for (const auto &F : Functions) { - std::vector> Nodes = - genHTML(F, CDCtx, ParentInfoDir); - appendVector(std::move(Nodes), DivBody->Children); - } - return Out; -} - -static std::vector> -genRecordMembersBlock(const llvm::SmallVector &Members, - StringRef ParentInfoDir) { - if (Members.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, "Members")); - Out.back()->Attributes.emplace_back("id", "Members"); - Out.emplace_back(std::make_unique(HTMLTag::TAG_UL)); - auto &ULBody = Out.back(); - for (const auto &M : Members) { - StringRef Access = getAccessSpelling(M.Access); - auto LIBody = std::make_unique(HTMLTag::TAG_LI); - auto MemberDecl = std::make_unique(HTMLTag::TAG_DIV); - if (!Access.empty()) - MemberDecl->Children.emplace_back( - std::make_unique(Access + " ")); - if (M.IsStatic) - MemberDecl->Children.emplace_back(std::make_unique("static ")); - MemberDecl->Children.emplace_back(genReference(M.Type, ParentInfoDir)); - MemberDecl->Children.emplace_back(std::make_unique(" " + M.Name)); - if (!M.Description.empty()) - LIBody->Children.emplace_back(genHTML(M.Description)); - LIBody->Children.emplace_back(std::move(MemberDecl)); - ULBody->Children.emplace_back(std::move(LIBody)); - } - return Out; -} - -static std::vector> -genReferencesBlock(const std::vector &References, - llvm::StringRef Title, StringRef ParentPath) { - if (References.empty()) - return {}; - - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H2, Title)); - Out.back()->Attributes.emplace_back("id", std::string(Title)); - Out.emplace_back(std::make_unique(HTMLTag::TAG_UL)); - auto &ULBody = Out.back(); - for (const auto &R : References) { - auto LiNode = std::make_unique(HTMLTag::TAG_LI); - LiNode->Children.emplace_back(genReference(R, ParentPath)); - ULBody->Children.emplace_back(std::move(LiNode)); - } - return Out; -} -static std::unique_ptr writeSourceFileRef(const ClangDocContext &CDCtx, - const Location &L) { - - if (!L.IsFileInRootDir && !CDCtx.RepositoryUrl) - return std::make_unique( - HTMLTag::TAG_P, "Defined at line " + std::to_string(L.StartLineNumber) + - " of file " + L.Filename); - - SmallString<128> FileURL(CDCtx.RepositoryUrl.value_or("")); - llvm::sys::path::append( - FileURL, llvm::sys::path::Style::posix, - // If we're on Windows, the file name will be in the wrong format, and - // append won't convert the full path being appended to the correct - // format, so we need to do that here. - llvm::sys::path::convert_to_slash( - L.Filename, - // The style here is the current style of the path, not the one we're - // targeting. If the string is already in the posix style, it will do - // nothing. - llvm::sys::path::Style::windows)); - auto Node = std::make_unique(HTMLTag::TAG_P); - Node->Children.emplace_back(std::make_unique("Defined at line ")); - auto LocNumberNode = std::make_unique( - HTMLTag::TAG_A, std::to_string(L.StartLineNumber)); - // The links to a specific line in the source code use the github / - // googlesource notation so it won't work for all hosting pages. 
-  LocNumberNode->Attributes.emplace_back(
-      "href",
-      formatv("{0}#{1}{2}", FileURL, CDCtx.RepositoryLinePrefix.value_or(""),
-              L.StartLineNumber));
-  Node->Children.emplace_back(std::move(LocNumberNode));
-  Node->Children.emplace_back(std::make_unique<TextNode>(" of file "));
-  auto LocFileNode = std::make_unique<TagNode>(
-      HTMLTag::TAG_A, llvm::sys::path::filename(FileURL));
-  LocFileNode->Attributes.emplace_back("href", std::string(FileURL));
-  Node->Children.emplace_back(std::move(LocFileNode));
-  return Node;
-}
-
-static void maybeWriteSourceFileRef(std::vector<std::unique_ptr<TagNode>> &Out,
-                                    const ClangDocContext &CDCtx,
-                                    const std::optional<Location> &DefLoc) {
-  if (DefLoc)
-    Out.emplace_back(writeSourceFileRef(CDCtx, *DefLoc));
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList);
-
-// Generates a list of child nodes for the HTML head tag
-// It contains a meta node, link nodes to import CSS files, and script nodes to
-// import JS files
-static std::vector<std::unique_ptr<TagNode>>
-genFileHeadNodes(StringRef Title, StringRef InfoPath,
-                 const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  auto MetaNode = std::make_unique<TagNode>(HTMLTag::TAG_META);
-  MetaNode->Attributes.emplace_back("charset", "utf-8");
-  Out.emplace_back(std::move(MetaNode));
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_TITLE, Title));
-  std::vector<std::unique_ptr<TagNode>> StylesheetsNodes =
-      genStylesheetsHTML(InfoPath, CDCtx);
-  appendVector(std::move(StylesheetsNodes), Out);
-  std::vector<std::unique_ptr<TagNode>> JsNodes =
-      genJsScriptsHTML(InfoPath, CDCtx);
-  appendVector(std::move(JsNodes), Out);
-  return Out;
-}
-
-// Generates a header HTML node that can be used for any file
-// It contains the project name
-static std::unique_ptr<TagNode> genFileHeaderNode(StringRef ProjectName) {
-  auto HeaderNode = std::make_unique<TagNode>(HTMLTag::TAG_HEADER, ProjectName);
-  HeaderNode->Attributes.emplace_back("id", "project-title");
-  return HeaderNode;
-}
-
-// Generates a main HTML node that has all the main content of an info file
-// It contains both indexes and the info's documented information
-// This function should only be used for the info files (not for the file that
-// only has the general index)
-static std::unique_ptr<TagNode> genInfoFileMainNode(
-    StringRef InfoPath,
-    std::vector<std::unique_ptr<TagNode>> &MainContentInnerNodes,
-    const Index &InfoIndex) {
-  auto MainNode = std::make_unique<TagNode>(HTMLTag::TAG_MAIN);
-
-  auto LeftSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left");
-  LeftSidebarNode->Attributes.emplace_back("path", std::string(InfoPath));
-  LeftSidebarNode->Attributes.emplace_back(
-      "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left");
-
-  auto MainContentNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  MainContentNode->Attributes.emplace_back("id", "main-content");
-  MainContentNode->Attributes.emplace_back(
-      "class", "col-xs-12 col-sm-9 col-md-8 main-content");
-  appendVector(std::move(MainContentInnerNodes), MainContentNode->Children);
-
-  auto RightSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  RightSidebarNode->Attributes.emplace_back("id", "sidebar-right");
-  RightSidebarNode->Attributes.emplace_back(
-      "class", "col-xs-6 col-sm-6 col-md-2 sidebar sidebar-offcanvas-right");
-  std::vector<std::unique_ptr<TagNode>> InfoIndexHTML =
-      genHTML(InfoIndex, InfoPath, true);
-  appendVector(std::move(InfoIndexHTML), RightSidebarNode->Children);
-
-  MainNode->Children.emplace_back(std::move(LeftSidebarNode));
-  MainNode->Children.emplace_back(std::move(MainContentNode));
-  MainNode->Children.emplace_back(std::move(RightSidebarNode));
-
-  return MainNode;
-}
-
-// Generates a footer HTML node that can be used for any file
-// It contains clang-doc's version
-static std::unique_ptr<TagNode> genFileFooterNode() {
-  auto FooterNode = std::make_unique<TagNode>(HTMLTag::TAG_FOOTER);
-  auto SpanNode = std::make_unique<TagNode>(
-      HTMLTag::TAG_SPAN, clang::getClangToolFullVersion("clang-doc"));
-  SpanNode->Attributes.emplace_back("class", "no-break");
-  FooterNode->Children.emplace_back(std::move(SpanNode));
-  return FooterNode;
-}
-
-// Generates a complete HTMLFile for an Info
-static HTMLFile
-genInfoFile(StringRef Title, StringRef InfoPath,
-            std::vector<std::unique_ptr<TagNode>> &MainContentNodes,
-            const Index &InfoIndex, const ClangDocContext &CDCtx) {
-  HTMLFile F;
-
-  std::vector<std::unique_ptr<TagNode>> HeadNodes =
-      genFileHeadNodes(Title, InfoPath, CDCtx);
-  std::unique_ptr<TagNode> HeaderNode = genFileHeaderNode(CDCtx.ProjectName);
-  std::unique_ptr<TagNode> MainNode =
-      genInfoFileMainNode(InfoPath, MainContentNodes, InfoIndex);
-  std::unique_ptr<TagNode> FooterNode = genFileFooterNode();
-
-  appendVector(std::move(HeadNodes), F.Children);
-  F.Children.emplace_back(std::move(HeaderNode));
-  F.Children.emplace_back(std::move(MainNode));
-  F.Children.emplace_back(std::move(FooterNode));
-
-  return F;
-}
-
-template <typename T,
-          typename = std::enable_if<std::is_base_of<Info, T>::value>>
-static Index genInfoIndexItem(const std::vector<T> &Infos, StringRef Title) {
-  Index Idx(Title, Title);
-  for (const auto &C : Infos)
-    Idx.Children.emplace_back(C.extractName(),
-                              llvm::toHex(llvm::toStringRef(C.USR)));
-  return Idx;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const Index &Index, StringRef InfoPath, bool IsOutermostList) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  if (!Index.Name.empty()) {
-    Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_SPAN));
-    auto &SpanBody = Out.back();
-    if (!Index.JumpToSection)
-      SpanBody->Children.emplace_back(genReference(Index, InfoPath));
-    else
-      SpanBody->Children.emplace_back(
-          genReference(Index, InfoPath, Index.JumpToSection->str()));
-  }
-  if (Index.Children.empty())
-    return Out;
-  // Only the outermost list should use ol, the others should use ul
-  HTMLTag ListHTMLTag = IsOutermostList ? HTMLTag::TAG_OL : HTMLTag::TAG_UL;
-  Out.emplace_back(std::make_unique<TagNode>(ListHTMLTag));
-  const auto &UlBody = Out.back();
-  for (const auto &C : Index.Children) {
-    auto LiBody = std::make_unique<TagNode>(HTMLTag::TAG_LI);
-    std::vector<std::unique_ptr<TagNode>> Nodes = genHTML(C, InfoPath, false);
-    appendVector(std::move(Nodes), LiBody->Children);
-    UlBody->Children.emplace_back(std::move(LiBody));
-  }
-  return Out;
-}
-
-static std::unique_ptr<HTMLNode> genHTML(const CommentInfo &I) {
-  switch (I.Kind) {
-  case CommentKind::CK_FullComment: {
-    auto FullComment = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        FullComment->Children.emplace_back(std::move(Node));
-    }
-    return std::move(FullComment);
-  }
-
-  case CommentKind::CK_ParagraphComment: {
-    auto ParagraphComment = std::make_unique<TagNode>(HTMLTag::TAG_P);
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        ParagraphComment->Children.emplace_back(std::move(Node));
-    }
-    if (ParagraphComment->Children.empty())
-      return nullptr;
-    return std::move(ParagraphComment);
-  }
-
-  case CommentKind::CK_BlockCommandComment: {
-    auto BlockComment = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-    BlockComment->Children.emplace_back(
-        std::make_unique<TagNode>(HTMLTag::TAG_DIV, I.Name));
-    for (const auto &Child : I.Children) {
-      std::unique_ptr<HTMLNode> Node = genHTML(*Child);
-      if (Node)
-        BlockComment->Children.emplace_back(std::move(Node));
-    }
-    if (BlockComment->Children.empty())
-      return nullptr;
-    return std::move(BlockComment);
-  }
-
-  case CommentKind::CK_TextComment: {
-    if (I.Text.empty())
-      return nullptr;
-    return std::make_unique<TextNode>(I.Text);
-  }
-
-  // For now, return nullptr for unsupported comment kinds
-  case CommentKind::CK_InlineCommandComment:
-  case CommentKind::CK_HTMLStartTagComment:
-  case CommentKind::CK_HTMLEndTagComment:
-  case CommentKind::CK_ParamCommandComment:
-  case CommentKind::CK_TParamCommandComment:
-  case CommentKind::CK_VerbatimBlockComment:
-  case CommentKind::CK_VerbatimBlockLineComment:
-  case CommentKind::CK_VerbatimLineComment:
-  case CommentKind::CK_Unknown:
-    return nullptr;
-  }
-  llvm_unreachable("Unhandled CommentKind");
-}
-
-static std::unique_ptr<TagNode> genHTML(const std::vector<CommentInfo> &C) {
-  auto CommentBlock = std::make_unique<TagNode>(HTMLTag::TAG_DIV);
-  for (const auto &Child : C) {
-    if (std::unique_ptr<HTMLNode> Node = genHTML(Child))
-      CommentBlock->Children.emplace_back(std::move(Node));
-  }
-  return CommentBlock;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const EnumInfo &I, const ClangDocContext &CDCtx) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  std::string EnumType = I.Scoped ? "enum class " : "enum ";
-  // Determine if enum members have comments attached
-  bool HasComments = llvm::any_of(
-      I.Members, [](const EnumValueInfo &M) { return !M.Description.empty(); });
-  std::unique_ptr<TagNode> Table =
-      std::make_unique<TagNode>(HTMLTag::TAG_TABLE);
-  std::unique_ptr<TagNode> THead =
-      std::make_unique<TagNode>(HTMLTag::TAG_THEAD);
-  std::unique_ptr<TagNode> TRow = std::make_unique<TagNode>(HTMLTag::TAG_TR);
-  std::unique_ptr<TagNode> TD =
-      std::make_unique<TagNode>(HTMLTag::TAG_TH, EnumType + I.Name);
-  // Span 3 columns if enum has comments
-  TD->Attributes.emplace_back("colspan", HasComments ? "3" : "2");
"3" : "2"); - - Table->Attributes.emplace_back("id", llvm::toHex(llvm::toStringRef(I.USR))); - TRow->Children.emplace_back(std::move(TD)); - THead->Children.emplace_back(std::move(TRow)); - Table->Children.emplace_back(std::move(THead)); - - if (std::unique_ptr Node = genEnumMembersBlock(I.Members)) - Table->Children.emplace_back(std::move(Node)); - - Out.emplace_back(std::move(Table)); - - maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - return Out; -} - -static std::vector> -genHTML(const FunctionInfo &I, const ClangDocContext &CDCtx, - StringRef ParentInfoDir) { - std::vector> Out; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H3, I.Name)); - // USR is used as id for functions instead of name to disambiguate function - // overloads. - Out.back()->Attributes.emplace_back("id", - llvm::toHex(llvm::toStringRef(I.USR))); - - Out.emplace_back(std::make_unique(HTMLTag::TAG_P)); - auto &FunctionHeader = Out.back(); - - std::string Access = getAccessSpelling(I.Access).str(); - if (Access != "") - FunctionHeader->Children.emplace_back( - std::make_unique(Access + " ")); - if (I.IsStatic) - FunctionHeader->Children.emplace_back( - std::make_unique("static ")); - if (I.ReturnType.Type.Name != "") { - FunctionHeader->Children.emplace_back( - genReference(I.ReturnType.Type, ParentInfoDir)); - FunctionHeader->Children.emplace_back(std::make_unique(" ")); - } - FunctionHeader->Children.emplace_back( - std::make_unique(I.Name + "(")); - - for (const auto &P : I.Params) { - if (&P != I.Params.begin()) - FunctionHeader->Children.emplace_back(std::make_unique(", ")); - FunctionHeader->Children.emplace_back(genReference(P.Type, ParentInfoDir)); - FunctionHeader->Children.emplace_back( - std::make_unique(" " + P.Name)); - } - FunctionHeader->Children.emplace_back(std::make_unique(")")); - - maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc); - - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - return Out; -} - -static std::vector> -genHTML(const NamespaceInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx, - std::string &InfoTitle) { - std::vector> Out; - if (I.Name.str() == "") - InfoTitle = "Global Namespace"; - else - InfoTitle = ("namespace " + I.Name).str(); +static std::unique_ptr NamespaceTemplate = nullptr; - Out.emplace_back(std::make_unique(HTMLTag::TAG_H1, InfoTitle)); +static std::unique_ptr RecordTemplate = nullptr; - if (!I.Description.empty()) - Out.emplace_back(genHTML(I.Description)); - - llvm::SmallString<64> BasePath = I.getRelativeFilePath(""); - - std::vector> ChildNamespaces = - genReferencesBlock(I.Children.Namespaces, "Namespaces", BasePath); - appendVector(std::move(ChildNamespaces), Out); - std::vector> ChildRecords = - genReferencesBlock(I.Children.Records, "Records", BasePath); - appendVector(std::move(ChildRecords), Out); - - std::vector> ChildFunctions = - genFunctionsBlock(I.Children.Functions, CDCtx, BasePath); - appendVector(std::move(ChildFunctions), Out); - std::vector> ChildEnums = - genEnumsBlock(I.Children.Enums, CDCtx); - appendVector(std::move(ChildEnums), Out); - - if (!I.Children.Namespaces.empty()) - InfoIndex.Children.emplace_back("Namespaces", "Namespaces"); - if (!I.Children.Records.empty()) - InfoIndex.Children.emplace_back("Records", "Records"); - if (!I.Children.Functions.empty()) - InfoIndex.Children.emplace_back( - genInfoIndexItem(I.Children.Functions, "Functions")); - if (!I.Children.Enums.empty()) - InfoIndex.Children.emplace_back( - 
-        genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const RecordInfo &I, Index &InfoIndex, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  std::vector<std::unique_ptr<TagNode>> Out;
-  InfoTitle = (getTagType(I.TagType) + " " + I.Name).str();
-  Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_H1, InfoTitle));
-
-  maybeWriteSourceFileRef(Out, CDCtx, I.DefLoc);
-
-  if (!I.Description.empty())
-    Out.emplace_back(genHTML(I.Description));
-
-  std::vector<std::unique_ptr<HTMLNode>> Parents =
-      genReferenceList(I.Parents, I.Path);
-  std::vector<std::unique_ptr<HTMLNode>> VParents =
-      genReferenceList(I.VirtualParents, I.Path);
-  if (!Parents.empty() || !VParents.empty()) {
-    Out.emplace_back(std::make_unique<TagNode>(HTMLTag::TAG_P));
-    auto &PBody = Out.back();
-    PBody->Children.emplace_back(std::make_unique<TextNode>("Inherits from "));
-    if (Parents.empty())
-      appendVector(std::move(VParents), PBody->Children);
-    else if (VParents.empty())
-      appendVector(std::move(Parents), PBody->Children);
-    else {
-      appendVector(std::move(Parents), PBody->Children);
-      PBody->Children.emplace_back(std::make_unique<TextNode>(", "));
-      appendVector(std::move(VParents), PBody->Children);
-    }
-  }
-
-  std::vector<std::unique_ptr<TagNode>> Members =
-      genRecordMembersBlock(I.Members, I.Path);
-  appendVector(std::move(Members), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildRecords =
-      genReferencesBlock(I.Children.Records, "Records", I.Path);
-  appendVector(std::move(ChildRecords), Out);
-
-  std::vector<std::unique_ptr<TagNode>> ChildFunctions =
-      genFunctionsBlock(I.Children.Functions, CDCtx, I.Path);
-  appendVector(std::move(ChildFunctions), Out);
-  std::vector<std::unique_ptr<TagNode>> ChildEnums =
-      genEnumsBlock(I.Children.Enums, CDCtx);
-  appendVector(std::move(ChildEnums), Out);
-
-  if (!I.Members.empty())
-    InfoIndex.Children.emplace_back("Members", "Members");
-  if (!I.Children.Records.empty())
-    InfoIndex.Children.emplace_back("Records", "Records");
-  if (!I.Children.Functions.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Functions, "Functions"));
-  if (!I.Children.Enums.empty())
-    InfoIndex.Children.emplace_back(
-        genInfoIndexItem(I.Children.Enums, "Enums"));
-
-  return Out;
-}
-
-static std::vector<std::unique_ptr<TagNode>>
-genHTML(const TypedefInfo &I, const ClangDocContext &CDCtx,
-        std::string &InfoTitle) {
-  // TODO support typedefs in HTML.
-  return {};
-}
-
-/// Generator for HTML documentation.
-class HTMLGenerator : public Generator {
+class HTMLGenerator : public MustacheGenerator {
 public:
   static const char *Format;
-
+  Error createResources(ClangDocContext &CDCtx) override;
+  Error generateDocForInfo(Info *I, raw_ostream &OS,
+                           const ClangDocContext &CDCtx) override;
+  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
+  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
+                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
+                           StringRef RelativeRootPath) override;
+  // Populates templates with CSS stylesheets, JS scripts paths.
+  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
+                               SmallString<128> RelativeRootPath);
   llvm::Error generateDocumentation(
       StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
       const ClangDocContext &CDCtx, std::string DirName) override;
-  llvm::Error createResources(ClangDocContext &CDCtx) override;
-  llvm::Error generateDocForInfo(Info *I, llvm::raw_ostream &OS,
-                                 const ClangDocContext &CDCtx) override;
 };
 
-const char *HTMLGenerator::Format = "html";
-
-llvm::Error HTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  // Track which directories we already tried to create.
-  llvm::StringSet<> CreatedDirs;
+Error HTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
+  // Template files need to use the native path when they're opened,
+  // but have to be used in POSIX style when used in HTML.
+  auto ConvertToNative = [](std::string &&Path) -> std::string {
+    SmallString<128> PathBuf(Path);
+    llvm::sys::path::native(PathBuf);
+    return PathBuf.str().str();
+  };
 
-  // Collect all output by file name and create the nexessary directories.
-  llvm::StringMap<std::vector<doc::Info *>> FileToInfos;
-  for (const auto &Group : Infos) {
-    doc::Info *Info = Group.getValue().get();
+  std::string NamespaceFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
+  std::string ClassFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
+  std::string CommentFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
+  std::string FunctionFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
+  std::string EnumFilePath =
+      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
+  std::vector<std::pair<StringRef, StringRef>> Partials = {
+      {"Comments", CommentFilePath},
+      {"FunctionPartial", FunctionFilePath},
+      {"EnumPartial", EnumFilePath}};
+
+  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
+    return Err;
 
-    llvm::SmallString<128> Path;
-    llvm::sys::path::native(RootDir, Path);
-    llvm::sys::path::append(Path, Info->getRelativeFilePath(""));
-    if (!CreatedDirs.contains(Path)) {
-      if (std::error_code Err = llvm::sys::fs::create_directories(Path);
-          Err != std::error_code()) {
-        return llvm::createStringError(Err, "Failed to create directory '%s'.",
-                                       Path.c_str());
-      }
-      CreatedDirs.insert(Path);
-    }
+  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
+    return Err;
 
-    llvm::sys::path::append(Path, Info->getFileBaseName() + ".html");
-    FileToInfos[Path].push_back(Info);
-  }
+  return Error::success();
+}
 
-  for (const auto &Group : FileToInfos) {
-    std::error_code FileErr;
-    llvm::raw_fd_ostream InfoOS(Group.getKey(), FileErr,
-                                llvm::sys::fs::OF_Text);
-    if (FileErr) {
-      return llvm::createStringError(FileErr, "Error opening file '%s'",
-                                     Group.getKey().str().c_str());
-    }
+Error HTMLGenerator::setupTemplateResources(const ClangDocContext &CDCtx,
+                                            json::Value &V,
+                                            SmallString<128> RelativeRootPath) {
+  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
+  json::Value StylesheetArr = Array();
+  sys::path::native(RelativeRootPath, sys::path::Style::posix);
 
-    // TODO: https://github.com/llvm/llvm-project/issues/59073
-    // If there are multiple Infos for this file name (for example, template
-    // specializations), this will generate multiple complete web pages (with
-    // <DOCTYPE> and <title>, etc.) concatenated together. This generator needs
-    // some refactoring to be able to output the headers separately from the
-    // contents.
- for (const auto &Info : Group.getValue()) { - if (llvm::Error Err = generateDocForInfo(Info, InfoOS, CDCtx)) { - return Err; - } - } + auto *SSA = StylesheetArr.getAsArray(); + SSA->reserve(CDCtx.UserStylesheets.size()); + for (const auto &FilePath : CDCtx.UserStylesheets) { + SmallString<128> StylesheetPath = RelativeRootPath; + sys::path::append(StylesheetPath, sys::path::Style::posix, + sys::path::filename(FilePath)); + SSA->emplace_back(StylesheetPath); + } + V.getAsObject()->insert({"Stylesheets", StylesheetArr}); + + json::Value ScriptArr = Array(); + auto *SCA = ScriptArr.getAsArray(); + SCA->reserve(CDCtx.JsScripts.size()); + for (auto Script : CDCtx.JsScripts) { + SmallString<128> JsPath = RelativeRootPath; + sys::path::append(JsPath, sys::path::Style::posix, + sys::path::filename(Script)); + SCA->emplace_back(JsPath); + } + V.getAsObject()->insert({"Scripts", ScriptArr}); + return Error::success(); +} + +Error HTMLGenerator::generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS, + const ClangDocContext &CDCtx, + StringRef ObjTypeStr, + StringRef RelativeRootPath) { + if (ObjTypeStr == "namespace") { + if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath)) + return Err; + assert(NamespaceTemplate && "NamespaceTemplate is nullptr."); + NamespaceTemplate->render(JSON, OS); + } else if (ObjTypeStr == "record") { + if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath)) + return Err; + assert(RecordTemplate && "RecordTemplate is nullptr."); + RecordTemplate->render(JSON, OS); } - - return llvm::Error::success(); + return Error::success(); } -llvm::Error HTMLGenerator::generateDocForInfo(Info *I, llvm::raw_ostream &OS, - const ClangDocContext &CDCtx) { - std::string InfoTitle; - std::vector<std::unique_ptr<TagNode>> MainContentNodes; - Index InfoIndex; +Error HTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS, + const ClangDocContext &CDCtx) { switch (I->IT) { - case InfoType::IT_namespace: - MainContentNodes = genHTML(*static_cast<clang::doc::NamespaceInfo *>(I), - InfoIndex, CDCtx, InfoTitle); - break; - case InfoType::IT_record: - MainContentNodes = genHTML(*static_cast<clang::doc::RecordInfo *>(I), - InfoIndex, CDCtx, InfoTitle); - break; case InfoType::IT_enum: - MainContentNodes = genHTML(*static_cast<clang::doc::EnumInfo *>(I), CDCtx); - break; case InfoType::IT_function: - MainContentNodes = - genHTML(*static_cast<clang::doc::FunctionInfo *>(I), CDCtx, ""); - break; case InfoType::IT_typedef: - MainContentNodes = - genHTML(*static_cast<clang::doc::TypedefInfo *>(I), CDCtx, InfoTitle); - break; - case InfoType::IT_concept: - case InfoType::IT_variable: - case InfoType::IT_friend: - break; - case InfoType::IT_default: - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "unexpected info type"); - } - - HTMLFile F = genInfoFile(InfoTitle, I->getRelativeFilePath(""), - MainContentNodes, InfoIndex, CDCtx); - F.render(OS); - - return llvm::Error::success(); -} - -static std::string getRefType(InfoType IT) { - switch (IT) { - case InfoType::IT_default: - return "default"; case InfoType::IT_namespace: - return "namespace"; case InfoType::IT_record: - return "record"; - case InfoType::IT_function: - return "function"; - case InfoType::IT_enum: - return "enum"; - case InfoType::IT_typedef: - return "typedef"; case InfoType::IT_concept: - return "concept"; case InfoType::IT_variable: - return "variable"; case InfoType::IT_friend: - return "friend"; - } - llvm_unreachable("Unknown InfoType"); -} - -static llvm::Error 
serializeIndex(ClangDocContext &CDCtx) { - std::error_code OK; - std::error_code FileErr; - llvm::SmallString<128> FilePath; - llvm::sys::path::native(CDCtx.OutDirectory, FilePath); - llvm::sys::path::append(FilePath, "index_json.js"); - llvm::raw_fd_ostream OS(FilePath, FileErr, llvm::sys::fs::OF_Text); - if (FileErr != OK) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "error creating index file: " + - FileErr.message()); - } - llvm::SmallString<128> RootPath(CDCtx.OutDirectory); - if (llvm::sys::path::is_relative(RootPath)) { - llvm::sys::fs::make_absolute(RootPath); + break; + case InfoType::IT_default: + return createStringError(inconvertibleErrorCode(), "unexpected InfoType"); } - // Replace the escaped characters with a forward slash. It shouldn't matter - // when rendering the webpage in a web browser. This helps to prevent the - // JavaScript from escaping characters incorrectly, and introducing bad paths - // in the URLs. - std::string RootPathEscaped = RootPath.str().str(); - llvm::replace(RootPathEscaped, '\\', '/'); - OS << "var RootPath = \"" << RootPathEscaped << "\";\n"; - - llvm::SmallString<128> Base(CDCtx.Base); - std::string BaseEscaped = Base.str().str(); - llvm::replace(BaseEscaped, '\\', '/'); - OS << "var Base = \"" << BaseEscaped << "\";\n"; - - CDCtx.Idx.sort(); - llvm::json::OStream J(OS, 2); - std::function<void(Index)> IndexToJSON = [&](const Index &I) { - J.object([&] { - J.attribute("USR", toHex(llvm::toStringRef(I.USR))); - J.attribute("Name", I.Name); - J.attribute("RefType", getRefType(I.RefType)); - J.attribute("Path", I.getRelativeFilePath("")); - J.attributeArray("Children", [&] { - for (const Index &C : I.Children) - IndexToJSON(C); - }); - }); - }; - OS << "async function LoadIndex() {\nreturn"; - IndexToJSON(CDCtx.Idx); - OS << ";\n}"; - return llvm::Error::success(); + return Error::success(); } -// Generates a main HTML node that has the main content of the file that shows -// only the general index -// It contains the general index with links to all the generated files -static std::unique_ptr<TagNode> genIndexFileMainNode() { - auto MainNode = std::make_unique<TagNode>(HTMLTag::TAG_MAIN); - - auto LeftSidebarNode = std::make_unique<TagNode>(HTMLTag::TAG_DIV); - LeftSidebarNode->Attributes.emplace_back("id", "sidebar-left"); - LeftSidebarNode->Attributes.emplace_back("path", ""); - LeftSidebarNode->Attributes.emplace_back( - "class", "col-xs-6 col-sm-3 col-md-2 sidebar sidebar-offcanvas-left"); - LeftSidebarNode->Attributes.emplace_back("style", "flex: 0 100%;"); - - MainNode->Children.emplace_back(std::move(LeftSidebarNode)); - - return MainNode; +Error HTMLGenerator::createResources(ClangDocContext &CDCtx) { + std::string ResourcePath(CDCtx.OutDirectory + "/html"); + for (const auto &FilePath : CDCtx.UserStylesheets) + if (Error Err = copyFile(FilePath, ResourcePath)) + return Err; + for (const auto &FilePath : CDCtx.JsScripts) + if (Error Err = copyFile(FilePath, ResourcePath)) + return Err; + return Error::success(); } -static llvm::Error genIndex(const ClangDocContext &CDCtx) { - std::error_code FileErr, OK; - llvm::SmallString<128> IndexPath; - llvm::sys::path::native(CDCtx.OutDirectory, IndexPath); - llvm::sys::path::append(IndexPath, "index.html"); - llvm::raw_fd_ostream IndexOS(IndexPath, FileErr, llvm::sys::fs::OF_Text); - if (FileErr != OK) { - return llvm::createStringError(llvm::inconvertibleErrorCode(), - "error creating main index: " + - FileErr.message()); - } - - HTMLFile F; - - 
std::vector<std::unique_ptr<TagNode>> HeadNodes = - genFileHeadNodes("Index", "", CDCtx); - std::unique_ptr<TagNode> HeaderNode = genFileHeaderNode(CDCtx.ProjectName); - std::unique_ptr<TagNode> MainNode = genIndexFileMainNode(); - std::unique_ptr<TagNode> FooterNode = genFileFooterNode(); - - appendVector(std::move(HeadNodes), F.Children); - F.Children.emplace_back(std::move(HeaderNode)); - F.Children.emplace_back(std::move(MainNode)); - F.Children.emplace_back(std::move(FooterNode)); - - F.render(IndexOS); - - return llvm::Error::success(); +Error HTMLGenerator::generateDocumentation( + StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos, + const ClangDocContext &CDCtx, std::string DirName) { + return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos), + CDCtx, "html"); } -llvm::Error HTMLGenerator::createResources(ClangDocContext &CDCtx) { - auto Err = serializeIndex(CDCtx); - if (Err) - return Err; - Err = genIndex(CDCtx); - if (Err) - return Err; - - for (const auto &FilePath : CDCtx.UserStylesheets) { - Err = copyFile(FilePath, CDCtx.OutDirectory); - if (Err) - return Err; - } - for (const auto &FilePath : CDCtx.JsScripts) { - Err = copyFile(FilePath, CDCtx.OutDirectory); - if (Err) - return Err; - } - return llvm::Error::success(); -} +const char *HTMLGenerator::Format = "html"; -static GeneratorRegistry::Add<HTMLGenerator> HTML(HTMLGenerator::Format, - "Generator for HTML output."); +static GeneratorRegistry::Add<HTMLGenerator> + HTML(HTMLGenerator::Format, "Generator for mustache HTML output."); // This anchor is used to force the linker to link in the generated object // file and thus register the generator. diff --git a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp b/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp deleted file mode 100644 index d33b77feb84be..0000000000000 --- a/clang-tools-extra/clang-doc/HTMLMustacheGenerator.cpp +++ /dev/null @@ -1,179 +0,0 @@ -///===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -/// -/// \file -/// This file contains the implementation of the MustacheHTMLGenerator class, -/// which is Clang-Doc generator for HTML using Mustache templates. 
-///
-//===----------------------------------------------------------------------===//
-
-#include "Generators.h"
-#include "Representation.h"
-#include "support/File.h"
-#include "clang/Basic/Diagnostic.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/Path.h"
-
-using namespace llvm;
-using namespace llvm::json;
-using namespace llvm::mustache;
-
-namespace clang {
-namespace doc {
-
-static std::unique_ptr<MustacheTemplateFile> NamespaceTemplate = nullptr;
-
-static std::unique_ptr<MustacheTemplateFile> RecordTemplate = nullptr;
-
-class MustacheHTMLGenerator : public MustacheGenerator {
-public:
-  static const char *Format;
-  Error createResources(ClangDocContext &CDCtx) override;
-  Error generateDocForInfo(Info *I, raw_ostream &OS,
-                           const ClangDocContext &CDCtx) override;
-  Error setupTemplateFiles(const ClangDocContext &CDCtx) override;
-  Error generateDocForJSON(json::Value &JSON, raw_fd_ostream &OS,
-                           const ClangDocContext &CDCtx, StringRef ObjTypeStr,
-                           StringRef RelativeRootPath) override;
-  // Populates templates with CSS stylesheets, JS scripts paths.
-  Error setupTemplateResources(const ClangDocContext &CDCtx, json::Value &V,
-                               SmallString<128> RelativeRootPath);
-  llvm::Error generateDocumentation(
-      StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-      const ClangDocContext &CDCtx, std::string DirName) override;
-};
-
-Error MustacheHTMLGenerator::setupTemplateFiles(const ClangDocContext &CDCtx) {
-  // Template files need to use the native path when they're opened,
-  // but have to be used in POSIX style when used in HTML.
-  auto ConvertToNative = [](std::string &&Path) -> std::string {
-    SmallString<128> PathBuf(Path);
-    llvm::sys::path::native(PathBuf);
-    return PathBuf.str().str();
-  };
-
-  std::string NamespaceFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("namespace-template"));
-  std::string ClassFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("class-template"));
-  std::string CommentFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("comment-template"));
-  std::string FunctionFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("function-template"));
-  std::string EnumFilePath =
-      ConvertToNative(CDCtx.MustacheTemplates.lookup("enum-template"));
-  std::vector<std::pair<StringRef, StringRef>> Partials = {
-      {"Comments", CommentFilePath},
-      {"FunctionPartial", FunctionFilePath},
-      {"EnumPartial", EnumFilePath}};
-
-  if (Error Err = setupTemplate(NamespaceTemplate, NamespaceFilePath, Partials))
-    return Err;
-
-  if (Error Err = setupTemplate(RecordTemplate, ClassFilePath, Partials))
-    return Err;
-
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::setupTemplateResources(
-    const ClangDocContext &CDCtx, json::Value &V,
-    SmallString<128> RelativeRootPath) {
-  V.getAsObject()->insert({"ProjectName", CDCtx.ProjectName});
-  json::Value StylesheetArr = Array();
-  sys::path::native(RelativeRootPath, sys::path::Style::posix);
-
-  auto *SSA = StylesheetArr.getAsArray();
-  SSA->reserve(CDCtx.UserStylesheets.size());
-  for (const auto &FilePath : CDCtx.UserStylesheets) {
-    SmallString<128> StylesheetPath = RelativeRootPath;
-    sys::path::append(StylesheetPath, sys::path::Style::posix,
-                      sys::path::filename(FilePath));
-    SSA->emplace_back(StylesheetPath);
-  }
-  V.getAsObject()->insert({"Stylesheets", StylesheetArr});
-
-  json::Value ScriptArr = Array();
-  auto *SCA = ScriptArr.getAsArray();
-  SCA->reserve(CDCtx.JsScripts.size());
-  for (auto Script : CDCtx.JsScripts) {
-    SmallString<128> JsPath = RelativeRootPath;
-    sys::path::append(JsPath, sys::path::Style::posix,
-                      sys::path::filename(Script));
-    SCA->emplace_back(JsPath);
-  }
-  V.getAsObject()->insert({"Scripts", ScriptArr});
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForJSON(json::Value &JSON,
-                                                raw_fd_ostream &OS,
-                                                const ClangDocContext &CDCtx,
-                                                StringRef ObjTypeStr,
-                                                StringRef RelativeRootPath) {
-  if (ObjTypeStr == "namespace") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(NamespaceTemplate && "NamespaceTemplate is nullptr.");
-    NamespaceTemplate->render(JSON, OS);
-  } else if (ObjTypeStr == "record") {
-    if (auto Err = setupTemplateResources(CDCtx, JSON, RelativeRootPath))
-      return Err;
-    assert(RecordTemplate && "RecordTemplate is nullptr.");
-    RecordTemplate->render(JSON, OS);
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocForInfo(Info *I, raw_ostream &OS,
-                                                const ClangDocContext &CDCtx) {
-  switch (I->IT) {
-  case InfoType::IT_enum:
-  case InfoType::IT_function:
-  case InfoType::IT_typedef:
-  case InfoType::IT_namespace:
-  case InfoType::IT_record:
-  case InfoType::IT_concept:
-  case InfoType::IT_variable:
-  case InfoType::IT_friend:
-    break;
-  case InfoType::IT_default:
-    return createStringError(inconvertibleErrorCode(), "unexpected InfoType");
-  }
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::createResources(ClangDocContext &CDCtx) {
-  std::string ResourcePath(CDCtx.OutDirectory + "/html");
-  for (const auto &FilePath : CDCtx.UserStylesheets)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  for (const auto &FilePath : CDCtx.JsScripts)
-    if (Error Err = copyFile(FilePath, ResourcePath))
-      return Err;
-  return Error::success();
-}
-
-Error MustacheHTMLGenerator::generateDocumentation(
-    StringRef RootDir, llvm::StringMap<std::unique_ptr<doc::Info>> Infos,
-    const ClangDocContext &CDCtx, std::string DirName) {
-  return MustacheGenerator::generateDocumentation(RootDir, std::move(Infos),
-                                                  CDCtx, "html");
-}
-
-const char *MustacheHTMLGenerator::Format = "mustache";
-
-static GeneratorRegistry::Add<MustacheHTMLGenerator>
-    MHTML(MustacheHTMLGenerator::Format, "Generator for mustache HTML output.");
-
-// This anchor is used to force the linker to link in the generated object
-// file and thus register the generator.
-volatile int MHTMLGeneratorAnchorSource = 0;
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/clang-doc/support/Utils.cpp b/clang-tools-extra/clang-doc/support/Utils.cpp
index 897a7ad0adb79..f410bfcf956d4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.cpp
+++ b/clang-tools-extra/clang-doc/support/Utils.cpp
@@ -28,8 +28,7 @@ SmallString<128> appendPathPosix(StringRef Base, StringRef Path) {
   return Default;
 }
 
-void getMustacheHtmlFiles(StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx) {
+void getHtmlFiles(StringRef AssetsPath, clang::doc::ClangDocContext &CDCtx) {
   assert(!AssetsPath.empty());
   assert(sys::fs::is_directory(AssetsPath));
diff --git a/clang-tools-extra/clang-doc/support/Utils.h b/clang-tools-extra/clang-doc/support/Utils.h
index 8161c37503f81..f4ed9ec42dce4 100644
--- a/clang-tools-extra/clang-doc/support/Utils.h
+++ b/clang-tools-extra/clang-doc/support/Utils.h
@@ -20,7 +20,7 @@ llvm::SmallString<128> appendPathNative(llvm::StringRef Base,
 llvm::SmallString<128> appendPathPosix(llvm::StringRef Base,
                                        llvm::StringRef Path);
 
-void getMustacheHtmlFiles(llvm::StringRef AssetsPath,
-                          clang::doc::ClangDocContext &CDCtx);
+void getHtmlFiles(llvm::StringRef AssetsPath,
+                  clang::doc::ClangDocContext &CDCtx);
 
 #endif
diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
index cb98c200442a6..ee4c449718871 100644
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -111,21 +111,20 @@ Turn on time profiler. Generates clang-doc-tracing.json)"),
                   llvm::cl::init(false), llvm::cl::cat(ClangDocCategory));
 
-enum OutputFormatTy { md, yaml, html, mustache, json };
-
-static llvm::cl::opt<OutputFormatTy> FormatEnum(
-    "format", llvm::cl::desc("Format for outputted docs."),
-    llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
-                                "Documentation in YAML format."),
-                     clEnumValN(OutputFormatTy::md, "md",
-                                "Documentation in MD format."),
-                     clEnumValN(OutputFormatTy::html, "html",
-                                "Documentation in HTML format."),
-                     clEnumValN(OutputFormatTy::mustache, "mustache",
-                                "Documentation in mustache HTML format"),
-                     clEnumValN(OutputFormatTy::json, "json",
-                                "Documentation in JSON format")),
-    llvm::cl::init(OutputFormatTy::yaml), llvm::cl::cat(ClangDocCategory));
+enum OutputFormatTy { md, yaml, html, json };
+
+static llvm::cl::opt<OutputFormatTy>
+    FormatEnum("format", llvm::cl::desc("Format for outputted docs."),
+               llvm::cl::values(clEnumValN(OutputFormatTy::yaml, "yaml",
+                                           "Documentation in YAML format."),
+                                clEnumValN(OutputFormatTy::md, "md",
+                                           "Documentation in MD format."),
+                                clEnumValN(OutputFormatTy::html, "html",
+                                           "Documentation in HTML format."),
+                                clEnumValN(OutputFormatTy::json, "json",
+                                           "Documentation in JSON format")),
+               llvm::cl::init(OutputFormatTy::yaml),
+               llvm::cl::cat(ClangDocCategory));
 
 static llvm::ExitOnError ExitOnErr;
 
@@ -137,8 +136,6 @@ static std::string getFormatString() {
     return "md";
   case OutputFormatTy::html:
     return "html";
-  case OutputFormatTy::mustache:
-    return "mustache";
   case OutputFormatTy::json:
     return "json";
   }
@@ -175,61 +172,12 @@ static llvm::Error getAssetFiles(clang::doc::ClangDocContext &CDCtx) {
   return llvm::Error::success();
 }
 
-static llvm::Error getDefaultAssetFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
-  void *MainAddr = (void *)(intptr_t)getExecutablePath;
-  std::string ClangDocPath = getExecutablePath(Argv0, MainAddr);
-  llvm::SmallString<128> NativeClangDocPath;
-  llvm::sys::path::native(ClangDocPath, NativeClangDocPath);
-
-  llvm::SmallString<128> AssetsPath;
-  AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
-  llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
-  llvm::SmallString<128> DefaultStylesheet =
-      appendPathNative(AssetsPath, "clang-doc-default-stylesheet.css");
-  llvm::SmallString<128> IndexJS = appendPathNative(AssetsPath, "index.js");
-
-  if (!llvm::sys::fs::is_regular_file(IndexJS))
-    return llvm::createStringError(llvm::inconvertibleErrorCode(),
-                                   "default index.js file missing at " +
-                                       IndexJS + "\n");
-
-  if (!llvm::sys::fs::is_regular_file(DefaultStylesheet))
-    return llvm::createStringError(
-        llvm::inconvertibleErrorCode(),
-        "default clang-doc-default-stylesheet.css file missing at " +
-            DefaultStylesheet + "\n");
-
-  CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(),
-                               std::string(DefaultStylesheet));
-  CDCtx.JsScripts.emplace_back(IndexJS.str());
-
-  return llvm::Error::success();
-}
-
-static llvm::Error getHtmlAssetFiles(const char *Argv0,
-                                     clang::doc::ClangDocContext &CDCtx) {
-  if (!UserAssetPath.empty() &&
-      !llvm::sys::fs::is_directory(std::string(UserAssetPath))) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Warning,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
-  if (llvm::sys::fs::is_directory(std::string(UserAssetPath)))
-    return getAssetFiles(CDCtx);
-  return getDefaultAssetFiles(Argv0, CDCtx);
-}
-
-static llvm::Error getMustacheHtmlFiles(const char *Argv0,
-                                        clang::doc::ClangDocContext &CDCtx) {
+static llvm::Error getHtmlFiles(const char *Argv0,
+                                clang::doc::ClangDocContext &CDCtx) {
   bool IsDir = llvm::sys::fs::is_directory(UserAssetPath);
-  if (!UserAssetPath.empty() && !IsDir) {
-    unsigned ID = CDCtx.Diags.getCustomDiagID(
-        DiagnosticsEngine::Note,
-        "Asset path supply is not a directory: %0 falling back to default");
-    CDCtx.Diags.Report(ID) << UserAssetPath;
-  }
+  if (!UserAssetPath.empty() && !IsDir)
+    llvm::outs() << "Asset path supply is not a directory: " << UserAssetPath
+                 << " falling back to default\n";
   if (IsDir) {
     if (auto Err = getAssetFiles(CDCtx))
      return Err;
@@ -243,7 +191,7 @@ static llvm::Error getMustacheHtmlFiles(const char *Argv0,
   AssetsPath = llvm::sys::path::parent_path(NativeClangDocPath);
   llvm::sys::path::append(AssetsPath, "..", "share", "clang-doc");
 
-  getMustacheHtmlFiles(AssetsPath, CDCtx);
+  getHtmlFiles(AssetsPath, CDCtx);
 
   return llvm::Error::success();
 }
@@ -337,11 +285,8 @@ Example usage for a project using a compile commands database:
       SourceRoot, RepositoryUrl, RepositoryCodeLinePrefix, BaseDirectory,
       {UserStylesheets.begin(), UserStylesheets.end()}, Diags, FTimeTrace);
 
-  if (Format == "html") {
-    ExitOnErr(getHtmlAssetFiles(argv[0], CDCtx));
-  } else if (Format == "mustache") {
-    ExitOnErr(getMustacheHtmlFiles(argv[0], CDCtx));
-  }
+  if (Format == "html")
+    ExitOnErr(getHtmlFiles(argv[0], CDCtx));
 
   llvm::timeTraceProfilerBegin("Executor Launch", "total runtime");
   // Mapping phase
diff --git a/clang-tools-extra/test/clang-doc/DR-131697.cpp b/clang-tools-extra/test/clang-doc/DR-131697.cpp
index 06168e6642f62..9025bbf910813 100644
--- a/clang-tools-extra/test/clang-doc/DR-131697.cpp
+++ b/clang-tools-extra/test/clang-doc/DR-131697.cpp
@@ -1,7 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: split-file %s %t
 // RUN: clang-doc -format=html %t/compile_commands.json %t/main.cpp
-// RUN: clang-doc -format=mustache %t/compile_commands.json %t/main.cpp
 
 //--- main.cpp
 
diff --git a/clang-tools-extra/test/clang-doc/assets.cpp b/clang-tools-extra/test/clang-doc/assets.cpp
index 9acb64a10b4fe..853dfe53d09f0 100644
--- a/clang-tools-extra/test/clang-doc/assets.cpp
+++ b/clang-tools-extra/test/clang-doc/assets.cpp
@@ -1,24 +1,8 @@
 // RUN: rm -rf %t && mkdir %t
 // RUN: clang-doc --format=html --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: clang-doc --format=mustache --output=%t --asset=%S/Inputs/test-assets --executor=standalone %s --base base_dir
-// RUN: FileCheck %s -input-file=%t/index.html -check-prefix=INDEX
-// RUN: FileCheck %s -input-file=%t/test.css -check-prefix=CSS
-// RUN: FileCheck %s -input-file=%t/test.js -check-prefix=JS
-
 // RUN: FileCheck %s -input-file=%t/html/test.css -check-prefix=CSS
 // RUN: FileCheck %s -input-file=%t/html/test.js -check-prefix=JS
 
-// INDEX:
-// INDEX-NEXT:
-// INDEX-NEXT: Index
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-// INDEX-NEXT:
-
 // CSS: body {
 // CSS-NEXT: padding: 0;
 // CSS-NEXT: }
diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
index 9f7de6e689313..282ca73384c3f 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test
@@ -1,7 +1,7 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
-// RUN: clang-doc --format=mustache --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
+// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV5Shape.html -check-prefix=HTML-SHAPE
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV10Calculator.html -check-prefix=HTML-CALC
 // RUN: FileCheck %s -input-file=%t/docs/html/GlobalNamespace/_ZTV9Rectangle.html -check-prefix=HTML-RECTANGLE
diff --git a/clang-tools-extra/test/clang-doc/basic-project.test b/clang-tools-extra/test/clang-doc/basic-project.test
index 9c1ed29973d79..9220dc6974508 100644
--- a/clang-tools-extra/test/clang-doc/basic-project.test
+++ b/clang-tools-extra/test/clang-doc/basic-project.test
@@ -1,31 +1,6 @@
 // RUN: rm -rf %t && mkdir -p %t/docs %t/build
 // RUN: sed 's|$test_dir|%/S|g' %S/Inputs/basic-project/database_template.json > %t/build/compile_commands.json
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefix=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefix=HTML-SHAPE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefix=HTML-CALC
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefix=HTML-RECTANGLE
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefix=HTML-CIRCLE
-
-// RUN: clang-doc --format=html --output=%t/docs-with-prefix --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com --repository-line-prefix=L
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-LINE-PREFIX
-// RUN: FileCheck %s -input-file=%t/docs-with-prefix/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-LINE-PREFIX
-
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-NO-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-NO-REPOSITORY
-
-// RUN: clang-doc --format=html --output=%t/docs --executor=all-TUs %t/build/compile_commands.json --repository=https://repository.com
-// RUN: FileCheck %s -input-file=%t/docs/index_json.js -check-prefixes=JSON-INDEX
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Shape.html -check-prefixes=HTML-SHAPE,SHAPE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Calculator.html -check-prefixes=HTML-CALC,CALC-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Rectangle.html -check-prefixes=HTML-RECTANGLE,RECTANGLE-REPOSITORY
-// RUN: FileCheck %s -input-file=%t/docs/GlobalNamespace/Circle.html -check-prefixes=HTML-CIRCLE,CIRCLE-REPOSITORY
-
 // RUN: clang-doc --format=md --output=%t/docs --executor=all-TUs %t/build/compile_commands.json
 // RUN: FileCheck %s -input-file=%t/docs/all_files.md -check-prefixes=MD-ALL-FILES
 // RUN: FileCheck %s -input-file=%t/docs/index.md -check-prefixes=MD-INDEX
@@ -81,248 +56,6 @@
 // JSON-INDEX-NEXT: };
 // JSON-INDEX-NEXT: }
 
-// HTML-SHAPE: class Shape
-// SHAPE-NO-REPOSITORY: Defined at line 8 of file .{{.}}include{{.}}Shape.h
-// SHAPE-REPOSITORY:
-// SHAPE-REPOSITORY-NEXT: Defined at line
-// SHAPE-REPOSITORY-NEXT: 8
-// SHAPE-LINE-PREFIX: 8
-// SHAPE-REPOSITORY-NEXT: of file
-// SHAPE-REPOSITORY-NEXT: Shape.h
-// SHAPE-REPOSITORY-NEXT:
-// HTML-SHAPE: brief
-// HTML-SHAPE: Abstract base class for shapes.
-// HTML-SHAPE: Provides a common interface for different types of shapes.
-// HTML-SHAPE: Functions
-// HTML-SHAPE: area
-// HTML-SHAPE: public double area()
-// HTML-SHAPE: brief
-// HTML-SHAPE: Calculates the area of the shape.
-// HTML-SHAPE: perimeter
-// HTML-SHAPE: public double perimeter()
-// HTML-SHAPE: brief
-// HTML-SHAPE: Calculates the perimeter of the shape.
-// HTML-SHAPE: return
-// HTML-SHAPE: double The perimeter of the shape.
-// HTML-SHAPE: ~Shape
-// HTML-SHAPE: public void ~Shape()
-
-// SHAPE-NO-REPOSITORY: Defined at line 13 of file .{{.}}include{{.}}Shape.h
-// SHAPE-REPOSITORY: Defined at line
-// SHAPE-REPOSITORY-NEXT: 13
-// SHAPE-LINE-PREFIX: 13
-// SHAPE-REPOSITORY-NEXT: of file
-// SHAPE-REPOSITORY-NEXT: Shape.h
-
-// HTML-SHAPE: brief
-// HTML-SHAPE: Virtual destructor.
-
-// HTML-CALC: class Calculator
-// CALC-NO-REPOSITORY: Defined at line 8 of file .{{.}}include{{.}}Calculator.h
-// CALC-REPOSITORY:
-// CALC-REPOSITORY-NEXT: Defined at line
-// CALC-REPOSITORY-NEXT: 8
-// CALC-LINE-PREFIX: 8
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.h
-// CALC-REPOSITORY-NEXT:
-// HTML-CALC: brief
-// HTML-CALC: A simple calculator class.
-// HTML-CALC: Provides basic arithmetic operations.
-
-// HTML-CALC: Members
-// HTML-CALC: brief
-// HTML-CALC: Holds a public value.
-// HTML-CALC: public int public_val
-// HTML-CALC: brief
-// HTML-CALC: A static value.
-// HTML-CALC: public static const int static_val
-
-// HTML-CALC: Functions
-// HTML-CALC: add
-// HTML-CALC: public int add(int a, int b)
-// CALC-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Calculator.cpp
-// CALC-REPOSITORY: Defined at line
-// CALC-REPOSITORY-NEXT: 3
-// CALC-LINE-PREFIX: 3
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.cpp
-
-// HTML-CALC: brief
-// HTML-CALC: Adds two integers.
-// HTML-CALC: return
-// HTML-CALC: int The sum of a and b.
-// HTML-CALC: subtract
-// HTML-CALC: public int subtract(int a, int b)
-// CALC-NO-REPOSITORY: Defined at line 7 of file .{{.}}src{{.}}Calculator.cpp
-// CALC-REPOSITORY: Defined at line
-// CALC-REPOSITORY-NEXT: 7
-// CALC-LINE-PREFIX: 7
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.cpp
-
-// HTML-CALC: brief
-// HTML-CALC: Subtracts the second integer from the first.
-// HTML-CALC: return
-// HTML-CALC: int The result of a - b.
-// HTML-CALC: multiply
-// HTML-CALC: public int multiply(int a, int b)
-// CALC-NO-REPOSITORY: Defined at line 11 of file .{{.}}src{{.}}Calculator.cpp
-// CALC-REPOSITORY: Defined at line
-// CALC-REPOSITORY-NEXT: 11
-// CALC-LINE-PREFIX: 11
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.cpp
-
-// HTML-CALC: brief
-// HTML-CALC: Multiplies two integers.
-// HTML-CALC: return
-// HTML-CALC: int The product of a and b.
-// HTML-CALC: divide
-// HTML-CALC: public double divide(int a, int b)
-// CALC-NO-REPOSITORY: Defined at line 15 of file .{{.}}src{{.}}Calculator.cpp
-// CALC-REPOSITORY: Defined at line
-// CALC-REPOSITORY-NEXT: 15
-// CALC-LINE-PREFIX: 15
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.cpp
-
-// HTML-CALC: brief
-// HTML-CALC: Divides the first integer by the second.
-// HTML-CALC: return
-// HTML-CALC: double The result of a / b.
-// HTML-CALC: throw
-// HTML-CALC: if b is zero.
-
-// HTML-CALC: public static int mod(int a, int b)
-// CALC-NO-REPOSITORY: Defined at line 54 of file .{{.}}include{{.}}Calculator.h
-// CALC-REPOSITORY: Defined at line
-// CALC-REPOSITORY-NEXT: 54
-// CALC-LINE-PREFIX: 54
-// CALC-REPOSITORY-NEXT: of file
-// CALC-REPOSITORY-NEXT: Calculator.h
-// HTML-CALC: brief
-// HTML-CALC: Performs the mod operation on integers.
-// HTML-CALC: return
-// HTML-CALC: The result of a % b.
-
-// HTML-RECTANGLE: class Rectangle
-// RECTANGLE-NO-REPOSITORY: Defined at line 10 of file .{{.}}include{{.}}Rectangle.h
-// RECTANGLE-REPOSITORY:
-// RECTANGLE-REPOSITORY-NEXT: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: 10
-// RECTANGLE-LINE-PREFIX: 10
-// RECTANGLE-REPOSITORY-NEXT: of file
-// RECTANGLE-REPOSITORY-NEXT: Rectangle.h
-// RECTANGLE-REPOSITORY-NEXT:
-
-// HTML-RECTANGLE: Represents a rectangle with a given width and height.
-// HTML-RECTANGLE:
-// HTML-RECTANGLE: Inherits from
-// HTML-RECTANGLE: Shape
-// HTML-RECTANGLE:
-// HTML-RECTANGLE: Members
-// HTML-RECTANGLE: Width of the rectangle.
-// HTML-RECTANGLE: private double width_
-// HTML-RECTANGLE: Height of the rectangle.
-// HTML-RECTANGLE: private double height_
-// HTML-RECTANGLE: Functions
-// HTML-RECTANGLE: Rectangle
-// HTML-RECTANGLE: public void Rectangle(double width, double height)
-// RECTANGLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Rectangle.cpp
-// RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: 3
-// RECTANGLE-LINE-PREFIX: 3
-// RECTANGLE-REPOSITORY-NEXT: of file
-// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp
-
-// HTML-RECTANGLE: brief
-// HTML-RECTANGLE: Constructs a new Rectangle object.
-// HTML-RECTANGLE: area
-// HTML-RECTANGLE: public double area()
-// RECTANGLE-NO-REPOSITORY: Defined at line 6 of file .{{.}}src{{.}}Rectangle.cpp
-// RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: 6
-// RECTANGLE-LINE-PREFIX: 6
-// RECTANGLE-REPOSITORY-NEXT: of file
-// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp
-
-// HTML-RECTANGLE: brief
-// HTML-RECTANGLE: Calculates the area of the rectangle.
-// HTML-RECTANGLE: return
-// HTML-RECTANGLE: double The area of the rectangle.
-// HTML-RECTANGLE: perimeter
-// HTML-RECTANGLE: public double perimeter()
-// RECTANGLE-NO-REPOSITORY: Defined at line 10 of file .{{.}}src{{.}}Rectangle.cpp
-// RECTANGLE-REPOSITORY: Defined at line
-// RECTANGLE-REPOSITORY-NEXT: 10
-// RECTANGLE-LINE-PREFIX: 10
-// RECTANGLE-REPOSITORY-NEXT: of file
-// RECTANGLE-REPOSITORY-NEXT: Rectangle.cpp
-// HTML-RECTANGLE: brief
-// HTML-RECTANGLE: Calculates the perimeter of the rectangle.
-// HTML-RECTANGLE: return
-// HTML-RECTANGLE: double The perimeter of the rectangle.
-
-// HTML-CIRCLE: class Circle
-// CIRCLE-NO-REPOSITORY: Defined at line 10 of file .{{.}}include{{.}}Circle.h
-// CIRCLE-REPOSITORY:
-// CIRCLE-REPOSITORY-NEXT: Defined at line
-// CIRCLE-REPOSITORY-NEXT: 10
-// CIRCLE-LINE-PREFIX: 10
-// CIRCLE-REPOSITORY-NEXT: of file
-// CIRCLE-REPOSITORY-NEXT: Circle.h
-// CIRCLE-REPOSITORY-NEXT:
-
-// HTML-CIRCLE: brief
-// HTML-CIRCLE: Circle class derived from Shape.
-// HTML-CIRCLE: Represents a circle with a given radius.
-// HTML-CIRCLE:
-// HTML-CIRCLE: Inherits from
-// HTML-CIRCLE: Shape
-// HTML-CIRCLE:
-// HTML-CIRCLE: Members
-// HTML-CIRCLE: Radius of the circle.
-// HTML-CIRCLE: private double radius_
-// HTML-CIRCLE: Functions
-// HTML-CIRCLE: Circle
-// HTML-CIRCLE: public void Circle(double radius)
-// CIRCLE-NO-REPOSITORY: Defined at line 3 of file .{{.}}src{{.}}Circle.cpp
-// CIRCLE-REPOSITORY: Defined at line
-// CIRCLE-REPOSITORY-NEXT: 3
-// CIRCLE-LINE-PREFIX: 3
-// CIRCLE-REPOSITORY-NEXT: of file
-// CIRCLE-REPOSITORY-NEXT: Circle.cpp
-
-// HTML-CIRCLE: brief
-// HTML-CIRCLE: Constructs a new Circle object.
-// HTML-CIRCLE: area
-// HTML-CIRCLE: public double area()
-// CIRCLE-NO-REPOSITORY: Defined at line 5 of file .{{.}}src{{.}}Circle.cpp
-// CIRCLE-REPOSITORY: Defined at line
-// CIRCLE-REPOSITORY-NEXT: 5
-// CIRCLE-LINE-PREFIX: 5
-// CIRCLE-REPOSITORY-NEXT: of file
-// CIRCLE-REPOSITORY-NEXT: Circle.cpp
-
-// HTML-CIRCLE: brief
-// HTML-CIRCLE: Calculates the area of the circle.
-// HTML-CIRCLE: return
-// HTML-CIRCLE: double The area of the circle.
-// HTML-CIRCLE: perimeter
-// HTML-CIRCLE: public double perimeter()
-// CIRCLE-NO-REPOSITORY: Defined at line 9 of file .{{.}}src{{.}}Circle.cpp
-// CIRCLE-REPOSITORY: Defined at line
-// CIRCLE-REPOSITORY-NEXT: 9
-// CIRCLE-LINE-PREFIX: 9
-// CIRCLE-REPOSITORY-NEXT: of file
-// CIRCLE-REPOSITORY-NEXT: Circle.cpp
-
-// HTML-CIRCLE: brief
-// HTML-CIRCLE: Calculates the perimeter of the circle.
-// HTML-CIRCLE: return
-// HTML-CIRCLE: double The perimeter of the circle.
-
 // MD-CALC: # class Calculator
 // MD-CALC: *Defined at .{{[\/]}}include{{[\/]}}Calculator.h#8*
 // MD-CALC: **brief** A simple calculator class.
diff --git a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
index 0c70fadb7f9ac..bc0ec46b72a05 100644
--- a/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
+++ b/clang-tools-extra/test/clang-doc/comments-in-macros.cpp
@@ -6,8 +6,8 @@
 // RUN: FileCheck %s < %t/GlobalNamespace/MyClass.md --check-prefix=MD-MYCLASS
 
 // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/MyClass.html --check-prefix=HTML-MYCLASS
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7MyClass.html --check-prefix=HTML-MYCLASS
 
 #define DECLARE_METHODS \
   /**
@@ -21,15 +21,18 @@
 // MD-MYCLASS: *public int Add(int a, int b)*
 // MD-MYCLASS: **brief** Declare a method to calculate the sum of two numbers
 
-// HTML-MYCLASS: public int Add(int a, int b)
-// HTML-MYCLASS: brief
-// HTML-MYCLASS: Declare a method to calculate the sum of two numbers
+
+// HTML-MYCLASS: int Add (int a, int b)
+// HTML-MYCLASS:
+// HTML-MYCLASS:
+// HTML-MYCLASS: Declare a method to calculate the sum of two numbers
+// HTML-MYCLASS:
 
 class MyClass {
 public:
-// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE+2]]*
-// HTML-MYCLASS-LINE: Defined at line [[@LINE+1]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp
+// MD-MYCLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp#[[@LINE-2]]*
+// HTML-MYCLASS-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}comments-in-macros.cpp
 DECLARE_METHODS
 };
diff --git a/clang-tools-extra/test/clang-doc/conversion_function.cpp b/clang-tools-extra/test/clang-doc/conversion_function.cpp
index 0200a578219ee..63df5d6f50d39 100644
--- a/clang-tools-extra/test/clang-doc/conversion_function.cpp
+++ b/clang-tools-extra/test/clang-doc/conversion_function.cpp
@@ -4,7 +4,7 @@
 // RUN: find %t/ -regex ".*/[0-9A-F]*.yaml" -exec cat {} ";" | FileCheck %s --check-prefix=CHECK-YAML
 
 // RUN: clang-doc --format=html --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/MyStruct.html --check-prefix=CHECK-HTML
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV8MyStruct.html --check-prefix=CHECK-HTML
 
 template <typename T>
 struct MyStruct {
@@ -14,5 +14,6 @@ struct MyStruct {
 // Output correct conversion names.
 // CHECK-YAML: Name: 'operator T'
-// CHECK-HTML: operator T
-// CHECK-HTML: public T operator T()
+// CHECK-HTML:
+// CHECK-HTML: T operator T ()
+// CHECK-HTML:
diff --git a/clang-tools-extra/test/clang-doc/enum.cpp b/clang-tools-extra/test/clang-doc/enum.cpp
index 3ba834e0b2e70..bb0d51fc3b36c 100644
--- a/clang-tools-extra/test/clang-doc/enum.cpp
+++ b/clang-tools-extra/test/clang-doc/enum.cpp
@@ -1,19 +1,12 @@
 // RUN: rm -rf %t && mkdir -p %t
 // RUN: clang-doc --format=html --doxygen --output=%t --executor=standalone %s
 // RUN: clang-doc --format=md --doxygen --output=%t --executor=standalone %s
-// RUN: clang-doc --format=mustache --doxygen --output=%t --executor=standalone %s
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/index.html --check-prefix=HTML-INDEX
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL-LINE
-// RUN: FileCheck %s < %t/GlobalNamespace/Animals.html --check-prefix=HTML-ANIMAL
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
-// RUN: FileCheck %s < %t/Vehicles/index.html --check-prefix=HTML-VEHICLES
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=MUSTACHE-INDEX
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL-LINE
-// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=MUSTACHE-ANIMAL
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES-LINE
-// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=MUSTACHE-VEHICLES
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=HTML-INDEX
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL-LINE
+// RUN: FileCheck %s < %t/html/GlobalNamespace/_ZTV7Animals.html --check-prefix=HTML-ANIMAL
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES-LINE
+// RUN: FileCheck %s < %t/html/Vehicles/index.html --check-prefix=HTML-VEHICLES
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX-LINE
 // RUN: FileCheck %s < %t/GlobalNamespace/index.md --check-prefix=MD-INDEX
 // RUN: FileCheck %s < %t/GlobalNamespace/Animals.md --check-prefix=MD-ANIMAL-LINE
@@ -28,8 +21,7 @@
 */
 enum Color {
   // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-INDEX-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
-  // MUSTACHE-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  // HTML-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
   Red,   ///< Comment 1
   Green, ///< Comment 2
   Blue   ///< Comment 3
@@ -43,48 +35,36 @@ enum Color {
 // MD-INDEX: | Blue |
 // MD-INDEX: **brief** For specifying RGB colors
 
-// HTML-INDEX: enum Color
-// HTML-INDEX: Red
-// HTML-INDEX: 0
-// HTML-INDEX: Comment 1
-// HTML-INDEX: Green
-// HTML-INDEX: 1
-// HTML-INDEX: Comment 2
-// HTML-INDEX: Blue
-// HTML-INDEX: 2
-// HTML-INDEX: Comment 3
-
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: enum Color
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: Name
-// MUSTACHE-INDEX: Value
-// MUSTACHE-INDEX: Red
-// MUSTACHE-INDEX: 0
-// MUSTACHE-INDEX: Green
-// MUSTACHE-INDEX: 1
-// MUSTACHE-INDEX: Blue
-// MUSTACHE-INDEX: 2
+// HTML-INDEX:
+// HTML-INDEX: enum Color
+// HTML-INDEX:
+// HTML-INDEX: Name
+// HTML-INDEX: Value
+// HTML-INDEX: Red
+// HTML-INDEX: 0
+// HTML-INDEX: Green
+// HTML-INDEX: 1
+// HTML-INDEX: Blue
+// HTML-INDEX: 2
 
 /**
  * @brief Shape Types
 */
 enum class Shapes {
   // MD-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-INDEX-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
-  // MUSTACHE-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  // HTML-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
   /// Comment 1
   Circle,
@@ -100,86 +80,60 @@ enum class Shapes {
 // MD-INDEX: | Triangle |
 // MD-INDEX: **brief** Shape Types
 
-// HTML-INDEX: enum class Shapes
-// HTML-INDEX: Circle
-// HTML-INDEX: 0
-// HTML-INDEX: Comment 1
-// HTML-INDEX: Rectangle
-// HTML-INDEX: 1
-// HTML-INDEX: Comment 2
-// HTML-INDEX: Triangle
-// HTML-INDEX: 2
-// HTML-INDEX: Comment 3
-
 // COM: FIXME: Serialize "enum class" in template
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: enum Shapes
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: Name
-// MUSTACHE-INDEX: Value
-// MUSTACHE-INDEX: Circle
-// MUSTACHE-INDEX: 0
-// MUSTACHE-INDEX: Rectangle
-// MUSTACHE-INDEX: 1
-// MUSTACHE-INDEX: Triangle
-// MUSTACHE-INDEX: 2
+// HTML-INDEX:
+// HTML-INDEX: enum Shapes
+// HTML-INDEX:
+// HTML-INDEX: Name
+// HTML-INDEX: Value
+// HTML-INDEX: Circle
+// HTML-INDEX: 0
+// HTML-INDEX: Rectangle
+// HTML-INDEX: 1
+// HTML-INDEX: Triangle
+// HTML-INDEX: 2
 
 // COM: FIXME: Add enums declared inside of classes to class template
 class Animals {
   // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
   // HTML-ANIMAL-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
-  // MUSTACHE-ANIMAL-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 public:
   /**
    * @brief specify what animal the class is
    */
   enum AnimalType {
     // MD-ANIMAL-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-    // HTML-ANIMAL-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
-    // MUSTACHE-ANIMAL-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+    // HTML-ANIMAL-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
     Dog,    ///< Man's best friend
     Cat,    ///< Man's other best friend
     Iguana  ///< A lizard
   };
 };
 
-// HTML-ANIMAL: class Animals
-// HTML-ANIMAL: Enums
-// HTML-ANIMAL: enum AnimalType
-// HTML-ANIMAL: Dog
-// HTML-ANIMAL: 0
-// HTML-ANIMAL: Man's best friend
-// HTML-ANIMAL: Cat
-// HTML-ANIMAL: 1
-// HTML-ANIMAL: Man's other best friend
-// HTML-ANIMAL: Iguana
-// HTML-ANIMAL: 2
-// HTML-ANIMAL: A lizard
-
-// MUSTACHE-ANIMAL-NOT: class Animals
-// MUSTACHE-ANIMAL-NOT: Enums
-// MUSTACHE-ANIMAL-NOT: enum AnimalType
-// MUSTACHE-ANIMAL-NOT: Dog
-// MUSTACHE-ANIMAL-NOT: 0
-// MUSTACHE-ANIMAL-NOT: Man's best friend
-// MUSTACHE-ANIMAL-NOT: Cat
-// MUSTACHE-ANIMAL-NOT: 1
-// MUSTACHE-ANIMAL-NOT: Man's other best friend
-// MUSTACHE-ANIMAL-NOT: Iguana
-// MUSTACHE-ANIMAL-NOT: 2
-// MUSTACHE-ANIMAL-NOT: A lizard
+// HTML-ANIMAL-NOT: class Animals
+// HTML-ANIMAL-NOT: Enums
+// HTML-ANIMAL-NOT: enum AnimalType
+// HTML-ANIMAL-NOT: Dog
+// HTML-ANIMAL-NOT: 0
+// HTML-ANIMAL-NOT: Man's best friend
+// HTML-ANIMAL-NOT: Cat
+// HTML-ANIMAL-NOT: 1
+// HTML-ANIMAL-NOT: Man's other best friend
+// HTML-ANIMAL-NOT: Iguana
+// HTML-ANIMAL-NOT: 2
+// HTML-ANIMAL-NOT: A lizard
 
 // MD-ANIMAL: # class Animals
 // MD-ANIMAL: ## Enums
@@ -196,8 +150,7 @@ namespace Vehicles {
 */
 enum Car {
   // MD-VEHICLES-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp#[[@LINE-1]]*
-  // HTML-VEHICLES-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
-  // MUSTACHE-VEHICLES-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
+  // HTML-VEHICLES-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}enum.cpp
 
   Sedan,     ///< Comment 1
   SUV,       ///< Comment 2
@@ -216,48 +169,33 @@ enum Car {
 // MD-VEHICLES: | Hatchback |
 // MD-VEHICLES: **brief** specify type of car
 
-// HTML-VEHICLES: namespace Vehicles
-// HTML-VEHICLES: enum Car
-// HTML-VEHICLES: Sedan
-// HTML-VEHICLES: 0
-// HTML-VEHICLES: Comment 1
-// HTML-VEHICLES: SUV
-// HTML-VEHICLES: 1
-// HTML-VEHICLES: Comment 2
-// HTML-VEHICLES: Pickup
-// HTML-VEHICLES: 2
-// HTML-VEHICLES: Comment 3
-// HTML-VEHICLES: Hatchback
-// HTML-VEHICLES: 3
-// HTML-VEHICLES: Comment 4
-
-// MUSTACHE-VEHICLES:
-// MUSTACHE-VEHICLES: enum Car
-// MUSTACHE-VEHICLES:
-// MUSTACHE-VEHICLES: Name
-// MUSTACHE-VEHICLES: Value
-// MUSTACHE-VEHICLES: Sedan
-// MUSTACHE-VEHICLES: 0
-// MUSTACHE-VEHICLES: SUV
-// MUSTACHE-VEHICLES: 1
-// MUSTACHE-VEHICLES: Pickup
-// MUSTACHE-VEHICLES: 2
-// MUSTACHE-VEHICLES: Hatchback
-// MUSTACHE-VEHICLES: 3
+// HTML-VEHICLES:
+// HTML-VEHICLES: enum Car
+// HTML-VEHICLES:
+// HTML-VEHICLES: Name
+// HTML-VEHICLES: Value
+// HTML-VEHICLES: Sedan
+// HTML-VEHICLES: 0
+// HTML-VEHICLES: SUV
+// HTML-VEHICLES: 1
+// HTML-VEHICLES: Pickup
+// HTML-VEHICLES: 2
+// HTML-VEHICLES: Hatchback
+// HTML-VEHICLES: 3
 
 enum ColorUserSpecified {
   RedUserSpecified = 'A',
   GreenUserSpecified,
   BlueUserSpecified = 'C'
 };
 
@@ -271,34 +209,26 @@ enum ColorUserSpecified {
 // MD-INDEX: | GreenUserSpecified |
 // MD-INDEX: | BlueUserSpecified |
 
-// HTML-INDEX: enum ColorUserSpecified
-// HTML-INDEX: RedUserSpecified
-// HTML-INDEX: 'A'
-// HTML-INDEX: GreenUserSpecified
-// HTML-INDEX: 2
-// HTML-INDEX: BlueUserSpecified
-// HTML-INDEX: 'C'
-
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: enum ColorUserSpecified
-// MUSTACHE-INDEX:
-// MUSTACHE-INDEX: Name
-// MUSTACHE-INDEX: Value
-// MUSTACHE-INDEX: RedUserSpecified
-// MUSTACHE-INDEX: 'A'
-// MUSTACHE-INDEX: GreenUserSpecified
-// MUSTACHE-INDEX: 2
-// MUSTACHE-INDEX: BlueUserSpecified
-// MUSTACHE-INDEX: 'C'
+// HTML-INDEX:
+// HTML-INDEX: enum ColorUserSpecified
+// HTML-INDEX:
+// HTML-INDEX: Name
+// HTML-INDEX: Value
+// HTML-INDEX: RedUserSpecified
+// HTML-INDEX: 'A'
+// HTML-INDEX: GreenUserSpecified
+// HTML-INDEX: 2
+// HTML-INDEX: BlueUserSpecified
+// HTML-INDEX: 'C'
diff --git a/clang-tools-extra/test/clang-doc/long-name.cpp b/clang-tools-extra/test/clang-doc/long-name.cpp index 77e50b1553ad5..e4a5e29f973d5 100644 --- a/clang-tools-extra/test/clang-doc/long-name.cpp +++ b/clang-tools-extra/test/clang-doc/long-name.cpp @@ -1,7 +1,7 @@ // FIXME: This test seems to break on windows, so disable it for now. // UNSUPPORTED: system-windows // RUN: rm -rf %t && mkdir -p %t -// RUN: clang-doc --output=%t --format=mustache --executor=standalone %s +// RUN: clang-doc --output=%t --format=html --executor=standalone %s // RUN: ls %t/json/GlobalNamespace | FileCheck %s -check-prefix=CHECK-JSON // RUN: ls %t/html/GlobalNamespace | FileCheck %s -check-prefix=CHECK-HTML diff --git a/clang-tools-extra/test/clang-doc/mustache-index.cpp b/clang-tools-extra/test/clang-doc/mustache-index.cpp index 709cc82bf85bb..0aa6e21c37cac 100644 --- a/clang-tools-extra/test/clang-doc/mustache-index.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-index.cpp @@ -1,5 +1,5 @@ // RUN: rm -rf %t && mkdir -p %t -// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s +// RUN: clang-doc --format=html --output=%t --executor=standalone %s // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html enum Color { diff --git a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp index dfc81df134596..add8a221feb40 100644 --- a/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp +++ b/clang-tools-extra/test/clang-doc/mustache-separate-namespace.cpp @@ -1,5 +1,5 @@ // RUN: rm -rf %t && mkdir -p %t -// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s +// RUN: clang-doc --format=html --output=%t --executor=standalone %s // RUN: FileCheck %s < %t/html/MyNamespace/index.html // RUN: FileCheck %s < %t/html/GlobalNamespace/index.html --check-prefix=CHECK-GLOBAL diff --git a/clang-tools-extra/test/clang-doc/namespace.cpp b/clang-tools-extra/test/clang-doc/namespace.cpp index adf7ab7d946ab..029f9974e775e 100644 --- a/clang-tools-extra/test/clang-doc/namespace.cpp +++ b/clang-tools-extra/test/clang-doc/namespace.cpp @@ -1,24 +1,6 @@ // RUN: rm -rf %t && mkdir -p %t // RUN: clang-doc --format=html --output=%t --executor=standalone %s // RUN: clang-doc --format=md --output=%t --executor=standalone %s -// RUN: clang-doc --format=mustache --output=%t --executor=standalone %s -// RUN: FileCheck %s < %t/index_json.js -check-prefix=JSON-INDEX -// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS-LINE -// RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.html -check-prefix=HTML-ANON-CLASS -// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE -// RUN: FileCheck %s < %t/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX -// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS-LINE -// RUN: FileCheck %s < %t/AnotherNamespace/ClassInAnotherNamespace.html -check-prefix=HTML-ANOTHER-CLASS -// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE -// RUN: FileCheck %s < %t/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX -// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS-LINE -// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/ClassInNestedNamespace.html -check-prefix=HTML-NESTED-CLASS -// RUN: FileCheck %s < 
%t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE -// RUN: FileCheck %s < %t/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX -// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE -// RUN: FileCheck %s < %t/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX -// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS-LINE -// RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.html -check-prefix=HTML-PRIMARY-CLASS // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS-LINE // RUN: FileCheck %s < %t/@nonymous_namespace/AnonClass.md -check-prefix=MD-ANON-CLASS // RUN: FileCheck %s < %t/@nonymous_namespace/index.md -check-prefix=MD-ANON-INDEX-LINE @@ -35,26 +17,26 @@ // RUN: FileCheck %s < %t/PrimaryNamespace/index.md -check-prefix=MD-PRIMARY-INDEX // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS-LINE // RUN: FileCheck %s < %t/PrimaryNamespace/ClassInPrimaryNamespace.md -check-prefix=MD-PRIMARY-CLASS -// RUN: FileCheck %s < %t/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX +// RUN: FileCheck %s < %t/html/GlobalNamespace/index.html -check-prefix=HTML-GLOBAL-INDEX // RUN: FileCheck %s < %t/GlobalNamespace/index.md -check-prefix=MD-GLOBAL-INDEX // RUN: FileCheck %s < %t/all_files.md -check-prefix=MD-ALL-FILES // RUN: FileCheck %s < %t/index.md -check-prefix=MD-INDEX -// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS-LINE -// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=MUSTACHE-ANON-CLASS -// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX-LINE -// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=MUSTACHE-ANON-INDEX -// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS-LINE -// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=MUSTACHE-ANOTHER-CLASS -// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX-LINE -// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=MUSTACHE-ANOTHER-INDEX -// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS-LINE -// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=MUSTACHE-NESTED-CLASS -// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX-LINE -// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=MUSTACHE-NESTED-INDEX -// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX-LINE -// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=MUSTACHE-PRIMARY-INDEX -// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS-LINE -// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=MUSTACHE-PRIMARY-CLASS +// RUN: 
FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS-LINE +// RUN: FileCheck %s < %t/html/@nonymous_namespace/_ZTVN12_GLOBAL__N_19AnonClassE.html -check-prefix=HTML-ANON-CLASS +// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX-LINE +// RUN: FileCheck %s < %t/html/@nonymous_namespace/index.html -check-prefix=HTML-ANON-INDEX +// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS-LINE +// RUN: FileCheck %s < %t/html/AnotherNamespace/_ZTVN16AnotherNamespace23ClassInAnotherNamespaceE.html -check-prefix=HTML-ANOTHER-CLASS +// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX-LINE +// RUN: FileCheck %s < %t/html/AnotherNamespace/index.html -check-prefix=HTML-ANOTHER-INDEX +// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS-LINE +// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/_ZTVN16PrimaryNamespace15NestedNamespace22ClassInNestedNamespaceE.html -check-prefix=HTML-NESTED-CLASS +// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX-LINE +// RUN: FileCheck %s < %t/html/PrimaryNamespace/NestedNamespace/index.html -check-prefix=HTML-NESTED-INDEX +// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX-LINE +// RUN: FileCheck %s < %t/html/PrimaryNamespace/index.html -check-prefix=HTML-PRIMARY-INDEX +// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS-LINE +// RUN: FileCheck %s < %t/html/PrimaryNamespace/_ZTVN16PrimaryNamespace23ClassInPrimaryNamespaceE.html -check-prefix=HTML-PRIMARY-CLASS // COM: FIXME: Add global functions to the namespace template // COM: FIXME: Add namespaces to the namespace template @@ -63,17 +45,14 @@ namespace { void anonFunction() {} // MD-ANON-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* -// HTML-ANON-INDEX-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

-// MUSTACHE-ANON-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

+// HTML-ANON-INDEX-LINE-NOT:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

class AnonClass {}; // MD-ANON-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-ANON-CLASS-LINE:

Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

-// MUSTACHE-ANON-CLASS-LINE:

Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

// MD-ANON-CLASS: # class AnonClass -// HTML-ANON-CLASS:

class AnonClass

-// MUSTACHE-ANON-CLASS:

class AnonClass

+// HTML-ANON-CLASS:

class AnonClass

} // namespace // MD-ANON-INDEX: # namespace @nonymous_namespace @@ -84,69 +63,51 @@ class AnonClass {}; // MD-ANON-INDEX: ### anonFunction // MD-ANON-INDEX: *void anonFunction()* -// HTML-ANON-INDEX:

namespace @nonymous_namespace

-// HTML-ANON-INDEX:

Anonymous Namespace

-// HTML-ANON-INDEX:

Records

-// HTML-ANON-INDEX: AnonClass -// HTML-ANON-INDEX:

Functions

-// HTML-ANON-INDEX:

anonFunction

-// HTML-ANON-INDEX:

void anonFunction()

- -// MUSTACHE-ANON-INDEX:

@nonymous_namespace

-// MUSTACHE-ANON-INDEX:

Inner Classes

-// MUSTACHE-ANON-INDEX:
    -// MUSTACHE-ANON-INDEX:
  • -// MUSTACHE-ANON-INDEX: -// MUSTACHE-ANON-INDEX:
    class AnonClass
    -// MUSTACHE-ANON-INDEX:
    -// MUSTACHE-ANON-INDEX:
  • -// MUSTACHE-ANON-INDEX-NOT:

    Functions

    -// MUSTACHE-ANON-INDEX-NOT:

    anonFunction

    -// MUSTACHE-ANON-INDEX-NOT:

    void anonFunction()

    +// HTML-ANON-INDEX:

    @nonymous_namespace

    +// HTML-ANON-INDEX:

    Inner Classes

    +// HTML-ANON-INDEX:
      +// HTML-ANON-INDEX:
    • +// HTML-ANON-INDEX: +// HTML-ANON-INDEX:
      class AnonClass
      +// HTML-ANON-INDEX:
      +// HTML-ANON-INDEX:
    • +// HTML-ANON-INDEX-NOT:

      Functions

      +// HTML-ANON-INDEX-NOT:

      anonFunction

      +// HTML-ANON-INDEX-NOT:

      void anonFunction()

      // Primary Namespace namespace PrimaryNamespace { // Function in PrimaryNamespace void functionInPrimaryNamespace() {} // MD-PRIMARY-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* -// HTML-PRIMARY-INDEX-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-PRIMARY-INDEX-LINE-NOT:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      +// HTML-PRIMARY-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // Class in PrimaryNamespace class ClassInPrimaryNamespace {}; // MD-PRIMARY-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-PRIMARY-CLASS-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-PRIMARY-CLASS-LINE:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // MD-PRIMARY-CLASS: # class ClassInPrimaryNamespace // MD-PRIMARY-CLASS: Class in PrimaryNamespace -// HTML-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

      -// HTML-PRIMARY-CLASS:

      Class in PrimaryNamespace

      - -// MUSTACHE-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

      +// HTML-PRIMARY-CLASS:

      class ClassInPrimaryNamespace

      // Nested namespace namespace NestedNamespace { // Function in NestedNamespace void functionInNestedNamespace() {} // MD-NESTED-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* -// HTML-NESTED-INDEX-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-NESTED-INDEX-LINE-NOT:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      +// HTML-NESTED-INDEX-LINE-NOT:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // Class in NestedNamespace class ClassInNestedNamespace {}; // MD-NESTED-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]* // HTML-NESTED-CLASS-LINE:

      Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      -// MUSTACHE-NESTED-CLASS-LINE:

      Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

      // MD-NESTED-CLASS: # class ClassInNestedNamespace // MD-NESTED-CLASS: Class in NestedNamespace -// HTML-NESTED-CLASS:

      class ClassInNestedNamespace

      -// HTML-NESTED-CLASS:

      Class in NestedNamespace

      - -// MUSTACHE-NESTED-CLASS:

      class ClassInNestedNamespace

      +// HTML-NESTED-CLASS:

      class ClassInNestedNamespace

      } // namespace NestedNamespace // MD-NESTED-INDEX: # namespace NestedNamespace @@ -158,28 +119,19 @@ class ClassInNestedNamespace {}; // MD-NESTED-INDEX: *void functionInNestedNamespace()* // MD-NESTED-INDEX: Function in NestedNamespace -// HTML-NESTED-INDEX:

      namespace NestedNamespace

      -// HTML-NESTED-INDEX:

      Nested namespace

      -// HTML-NESTED-INDEX:

      Records

      -// HTML-NESTED-INDEX: ClassInNestedNamespace -// HTML-NESTED-INDEX:

      Functions

      -// HTML-NESTED-INDEX:

      functionInNestedNamespace

      -// HTML-NESTED-INDEX:

      void functionInNestedNamespace()

      -// HTML-NESTED-INDEX:

      Function in NestedNamespace

      - -// MUSTACHE-NESTED-INDEX:

      NestedNamespace

      -// MUSTACHE-NESTED-INDEX:

      Inner Classes

      -// MUSTACHE-NESTED-INDEX: -// MUSTACHE-NESTED-INDEX-NOT:

      Functions

      -// MUSTACHE-NESTED-INDEX-NOT:

      functionInNestedNamespace

      -// MUSTACHE-NESTED-INDEX-NOT:

      void functionInNestedNamespace()

      -// MUSTACHE-NESTED-INDEX-NOT:

      Function in NestedNamespace

      +// HTML-NESTED-INDEX:

      NestedNamespace

      +// HTML-NESTED-INDEX:

      Inner Classes

      +// HTML-NESTED-INDEX: +// HTML-NESTED-INDEX-NOT:

      Functions

      +// HTML-NESTED-INDEX-NOT:

      functionInNestedNamespace

      +// HTML-NESTED-INDEX-NOT:

      void functionInNestedNamespace()

      +// HTML-NESTED-INDEX-NOT:

      Function in NestedNamespace

      } // namespace PrimaryNamespace // MD-PRIMARY-INDEX: # namespace PrimaryNamespace @@ -193,54 +145,38 @@ class ClassInNestedNamespace {}; // MD-PRIMARY-INDEX: *void functionInPrimaryNamespace()* // MD-PRIMARY-INDEX: Function in PrimaryNamespace -// HTML-PRIMARY-INDEX:

      namespace PrimaryNamespace

      -// HTML-PRIMARY-INDEX:

      Primary Namespace

      -// HTML-PRIMARY-INDEX:

      Namespaces

      -// HTML-PRIMARY-INDEX: NestedNamespace -// HTML-PRIMARY-INDEX:

      Records

      -// HTML-PRIMARY-INDEX: ClassInPrimaryNamespace -// HTML-PRIMARY-INDEX:

      Functions

      -// HTML-PRIMARY-INDEX:

      functionInPrimaryNamespace

      -// HTML-PRIMARY-INDEX:

      void functionInPrimaryNamespace()

      -// HTML-PRIMARY-INDEX:

      Function in PrimaryNamespace

      - -// MUSTACHE-PRIMARY-INDEX:

      PrimaryNamespace

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      Namespaces

      -// MUSTACHE-PRIMARY-INDEX-NOT: NestedNamespace -// MUSTACHE-PRIMARY-INDEX

      Inner Classes

      -// MUSTACHE-PRIMARY-INDEX -// MUSTACHE-PRIMARY-INDEX-NOT:

      Functions

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      functionInPrimaryNamespace

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      void functionInPrimaryNamespace()

      -// MUSTACHE-PRIMARY-INDEX-NOT:

      Function in PrimaryNamespace

      +// HTML-PRIMARY-INDEX:

      PrimaryNamespace

      +// HTML-PRIMARY-INDEX-NOT:

      Namespaces

      +// HTML-PRIMARY-INDEX-NOT: NestedNamespace +// HTML-PRIMARY-INDEX

      Inner Classes

      +// HTML-PRIMARY-INDEX +// HTML-PRIMARY-INDEX-NOT:

      Functions

      +// HTML-PRIMARY-INDEX-NOT:

      functionInPrimaryNamespace

      +// HTML-PRIMARY-INDEX-NOT:

      void functionInPrimaryNamespace()

      +// HTML-PRIMARY-INDEX-NOT:

      Function in PrimaryNamespace

// AnotherNamespace
namespace AnotherNamespace {

// Function in AnotherNamespace
void functionInAnotherNamespace() {}
// MD-ANOTHER-INDEX-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
-// HTML-ANOTHER-INDEX-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp
-// MUSTACHE-ANOTHER-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp
+// HTML-ANOTHER-INDEX-LINE-NOT: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

// Class in AnotherNamespace
class ClassInAnotherNamespace {};
// MD-ANOTHER-CLASS-LINE: *Defined at {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp#[[@LINE-1]]*
// HTML-ANOTHER-CLASS-LINE: Defined at line [[@LINE-2]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp
-// MUSTACHE-ANOTHER-CLASS-LINE: Defined at line [[@LINE-3]] of file {{.*}}clang-tools-extra{{[\/]}}test{{[\/]}}clang-doc{{[\/]}}namespace.cpp

// MD-ANOTHER-CLASS: # class ClassInAnotherNamespace
// MD-ANOTHER-CLASS: Class in AnotherNamespace
-// HTML-ANOTHER-CLASS: class ClassInAnotherNamespace
-// HTML-ANOTHER-CLASS: Class in AnotherNamespace
-
-// MUSTACHE-ANOTHER-CLASS: class ClassInAnotherNamespace
+// HTML-ANOTHER-CLASS: class ClassInAnotherNamespace

} // namespace AnotherNamespace

@@ -253,120 +189,27 @@ class ClassInAnotherNamespace {};
// MD-ANOTHER-INDEX: *void functionInAnotherNamespace()*
// MD-ANOTHER-INDEX: Function in AnotherNamespace
-// HTML-ANOTHER-INDEX: namespace AnotherNamespace
-// HTML-ANOTHER-INDEX: AnotherNamespace
-// HTML-ANOTHER-INDEX: Records
-// HTML-ANOTHER-INDEX: ClassInAnotherNamespace
-// HTML-ANOTHER-INDEX: Functions
-// HTML-ANOTHER-INDEX: functionInAnotherNamespace
-// HTML-ANOTHER-INDEX: void functionInAnotherNamespace()
-// HTML-ANOTHER-INDEX: Function in AnotherNamespace
-
-// MUSTACHE-ANOTHER-INDEX: AnotherNamespace
-// MUSTACHE-ANOTHER-INDEX: Inner Classes
-// MUSTACHE-ANOTHER-INDEX:
-// MUSTACHE-ANOTHER-INDEX-NOT: Functions
-// MUSTACHE-ANOTHER-INDEX-NOT: functionInAnotherNamespace
-// MUSTACHE-ANOTHER-INDEX-NOT: void functionInAnotherNamespace()
-// MUSTACHE-ANOTHER-INDEX-NOT: Function in AnotherNamespace
-
-// JSON-INDEX: async function LoadIndex() {
-// JSON-INDEX-NEXT: return{
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "",
-// JSON-INDEX-NEXT: "RefType": "default",
-// JSON-INDEX-NEXT: "Path": "",
-// JSON-INDEX-NEXT: "Children": [
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "@nonymous_namespace",
-// JSON-INDEX-NEXT: "RefType": "namespace",
-// JSON-INDEX-NEXT: "Path": "@nonymous_namespace",
-// JSON-INDEX-NEXT: "Children": [
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "AnonClass",
-// JSON-INDEX-NEXT: "RefType": "record",
-// JSON-INDEX-NEXT: "Path": "@nonymous_namespace",
-// JSON-INDEX-NEXT: "Children": []
-// JSON-INDEX-NEXT: }
-// JSON-INDEX-NEXT: ]
-// JSON-INDEX-NEXT: },
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "AnotherNamespace",
-// JSON-INDEX-NEXT: "RefType": "namespace",
-// JSON-INDEX-NEXT: "Path": "AnotherNamespace",
-// JSON-INDEX-NEXT: "Children": [
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "ClassInAnotherNamespace",
-// JSON-INDEX-NEXT: "RefType": "record",
-// JSON-INDEX-NEXT: "Path": "AnotherNamespace",
-// JSON-INDEX-NEXT: "Children": []
-// JSON-INDEX-NEXT: }
-// JSON-INDEX-NEXT: ]
-// JSON-INDEX-NEXT: },
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "GlobalNamespace",
-// JSON-INDEX-NEXT: "RefType": "namespace",
-// JSON-INDEX-NEXT: "Path": "GlobalNamespace",
-// JSON-INDEX-NEXT: "Children": []
-// JSON-INDEX-NEXT: },
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "PrimaryNamespace",
-// JSON-INDEX-NEXT: "RefType": "namespace",
-// JSON-INDEX-NEXT: "Path": "PrimaryNamespace",
-// JSON-INDEX-NEXT: "Children": [
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "ClassInPrimaryNamespace",
-// JSON-INDEX-NEXT: "RefType": "record",
-// JSON-INDEX-NEXT: "Path": "PrimaryNamespace",
-// JSON-INDEX-NEXT: "Children": []
-// JSON-INDEX-NEXT: },
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "NestedNamespace",
-// JSON-INDEX-NEXT: "RefType": "namespace",
-// JSON-INDEX-NEXT: "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace",
-// JSON-INDEX-NEXT: "Children": [
-// JSON-INDEX-NEXT: {
-// JSON-INDEX-NEXT: "USR": "{{([0-9A-F]{40})}}",
-// JSON-INDEX-NEXT: "Name": "ClassInNestedNamespace",
-// JSON-INDEX-NEXT: "RefType": "record",
-// JSON-INDEX-NEXT: "Path": "PrimaryNamespace{{[\/]+}}NestedNamespace",
-// JSON-INDEX-NEXT: "Children": []
-// JSON-INDEX-NEXT: }
-// JSON-INDEX-NEXT: ]
-// JSON-INDEX-NEXT: }
-// JSON-INDEX-NEXT: ]
-// JSON-INDEX-NEXT: }
-// JSON-INDEX-NEXT: ]
-// JSON-INDEX-NEXT: };
-// JSON-INDEX-NEXT: }
-
-// HTML-GLOBAL-INDEX:
-// HTML-GLOBAL-INDEX: Global Namespace
-// HTML-GLOBAL-INDEX: Namespaces
-// HTML-GLOBAL-INDEX: @nonymous_namespace
-// HTML-GLOBAL-INDEX: AnotherNamespace
-// HTML-GLOBAL-INDEX: PrimaryNamespace
-
-// MUSTACHE-GLOBAL-INDEX:
-// MUSTACHE-GLOBAL-INDEX: Global Namespace
-// MUSTACHE-GLOBAL-INDEX: Namespaces
-// MUSTACHE-GLOBAL-INDEX: @nonymous_namespace
-// MUSTACHE-GLOBAL-INDEX: AnotherNamespace
-// MUSTACHE-GLOBAL-INDEX: PrimaryNamespace
+// HTML-ANOTHER-INDEX: AnotherNamespace
+// HTML-ANOTHER-INDEX: Inner Classes
+// HTML-ANOTHER-INDEX:
+// HTML-ANOTHER-INDEX-NOT: Functions
+// HTML-ANOTHER-INDEX-NOT: functionInAnotherNamespace
+// HTML-ANOTHER-INDEX-NOT: void functionInAnotherNamespace()
+// HTML-ANOTHER-INDEX-NOT: Function in AnotherNamespace
+
+// COM: FIXME: Add namespaces to namespace template
+// HTML-GLOBAL-INDEX-NOT:
+// HTML-GLOBAL-INDEX-NOT: Global Namespace
+// HTML-GLOBAL-INDEX-NOT: Namespaces
+// HTML-GLOBAL-INDEX-NOT: @nonymous_namespace
+// HTML-GLOBAL-INDEX-NOT: AnotherNamespace
+// HTML-GLOBAL-INDEX-NOT: PrimaryNamespace

// MD-GLOBAL-INDEX: # Global Namespace
// MD-GLOBAL-INDEX: ## Namespaces
diff --git a/clang-tools-extra/test/clang-doc/test-path-abs.cpp b/clang-tools-extra/test/clang-doc/test-path-abs.cpp
deleted file mode 100644
index 8875a3a73ab7e..0000000000000
--- a/clang-tools-extra/test/clang-doc/test-path-abs.cpp
+++ /dev/null
@@ -1,7 +0,0 @@
-// RUN: rm -rf %t && mkdir -p %t
-// RUN: clang-doc --format=html --executor=standalone %s --output=%t --base base_dir
-// RUN: FileCheck %s -input-file=%t/index_json.js -check-prefix=JSON-INDEX
-
-// JSON-INDEX: var RootPath = "{{.*}}test-path-abs.cpp.tmp";
-// JSON-INDEX-NEXT: var Base = "base_dir";
-
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 18166acf9bbca..01b34ec9a791e 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,7 +26,6 @@ add_extra_unittest(ClangDocTests
   ClangDocTest.cpp
   GeneratorTest.cpp
   HTMLGeneratorTest.cpp
-  HTMLMustacheGeneratorTest.cpp
   MDGeneratorTest.cpp
   MergeTest.cpp
   SerializeTest.cpp
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
index 2fe443d9db5c5..cf510afe214dd 100644
--- a/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/HTMLGeneratorTest.cpp
@@ -9,17 +9,22 @@
 #include "ClangDocTest.h"
 #include "Generators.h"
 #include "Representation.h"
+#include "config.h"
+#include "support/Utils.h"
 #include "clang/Basic/Version.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gmock/gmock.h"
 #include "gtest/gtest.h"
 
-namespace clang {
-namespace doc {
+using namespace llvm;
+using namespace testing;
+using namespace clang;
+using namespace clang::doc;
 
-static const std::string ClangDocVersion =
-    clang::getClangToolFullVersion("clang-doc");
+static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
 
 static std::unique_ptr<Generator> getHTMLGenerator() {
-  auto G = doc::findGeneratorByName("html");
+  auto G = findGeneratorByName("html");
   if (!G)
     return nullptr;
   return std::move(G.get());
@@ -50,454 +55,10 @@ class HTMLGeneratorTest : public ClangDocContextTest {
   }
 };
 
-TEST_F(HTMLGeneratorTest, emitNamespaceHTML) {
-  NamespaceInfo I;
-  I.Name = "Namespace";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.Children.Namespaces.emplace_back(EmptySID, "ChildNamespace",
-                                     InfoType::IT_namespace,
-                                     "Namespace::ChildNamespace", "Namespace");
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "Namespace::ChildStruct", "Namespace");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Access = AccessSpecifier::AS_none;
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({"user-provided-stylesheet.css"});
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-
-namespace Namespace
-
-
-
-test-project
-
-
-namespace Namespace
-
-Namespaces
-
-Records
-
-Functions
-
-OneFunction
-
-OneFunction()
-
-Enums
-
-enum OneEnum
-
-)raw" +
-                         ClangDocVersion + R"raw(
-
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitRecordHTML) {
-  RecordInfo I;
-  I.Name = "r";
-  I.Path = "X/Y/Z";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.Members.emplace_back(TypeInfo("int"), "X", AccessSpecifier::AS_private);
-  I.TagType = TagTypeKind::Class;
-  I.Parents.emplace_back(EmptySID, "F", InfoType::IT_record, "F", PathTo);
-  I.VirtualParents.emplace_back(EmptySID, "G", InfoType::IT_record);
-
-  I.Children.Records.emplace_back(EmptySID, "ChildStruct", InfoType::IT_record,
-                                  "X::Y::Z::r::ChildStruct", "X/Y/Z/r");
-  I.Children.Functions.emplace_back();
-  I.Children.Functions.back().Name = "OneFunction";
-  I.Children.Enums.emplace_back();
-  I.Children.Enums.back().Name = "OneEnum";
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "http://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-
-class r
-
-
-
-test-project
-
-
-class r
-
-Defined at line 10 of file test.cpp
-
-Inherits from F, G
-
-Members
-private int X
-
-Records
-
-Functions
-
-OneFunction
-
-public OneFunction()
-
-Enums
-
-enum OneEnum
-
-)raw" +
-                         ClangDocVersion + R"raw(
-
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitFunctionHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "dir/test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Access = AccessSpecifier::AS_none;
-
-  SmallString<16> PathTo;
-  llvm::sys::path::native("path/to", PathTo);
-  I.ReturnType = TypeInfo(
-      Reference(EmptySID, "float", InfoType::IT_default, "float", PathTo));
-  I.Params.emplace_back(TypeInfo("int", PathTo), "P");
-  I.IsMethod = true;
-  I.Parent = Reference(EmptySID, "Parent", InfoType::IT_record);
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "https://www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-
-
-
-test-project
-
-
-f
-
-float f(int P)
-
-Defined at line 10 of file test.cpp
-
-)raw" +
-                         ClangDocVersion + R"raw(
-
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitEnumHTML) {
-  EnumInfo I;
-  I.Name = "e";
-  I.Namespace.emplace_back(EmptySID, "A", InfoType::IT_namespace);
-
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.Loc.emplace_back(12, 12, "test.cpp");
-
-  I.Members.emplace_back("X");
-  I.Scoped = true;
-
+TEST_F(HTMLGeneratorTest, createResources) {
   auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
-  ClangDocContext CDCtx = getClangDocContext({}, "www.repository.com");
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-
-
-
-test-project
-
-
-enum class e
-X 0
-
-Defined at line 10 of file test.cpp
-
-)raw" +
-                         ClangDocVersion + R"raw(
-
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
-}
-
-TEST_F(HTMLGeneratorTest, emitCommentHTML) {
-  FunctionInfo I;
-  I.Name = "f";
-  I.DefLoc = Location(10, 10, "test.cpp", true);
-  I.ReturnType = TypeInfo("void");
-  I.Params.emplace_back(TypeInfo("int"), "I");
-  I.Params.emplace_back(TypeInfo("int"), "J");
-  I.Access = AccessSpecifier::AS_none;
-
-  CommentInfo Top;
-  Top.Kind = CommentKind::CK_FullComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *BlankLine = Top.Children.back().get();
-  BlankLine->Kind = CommentKind::CK_ParagraphComment;
-  BlankLine->Children.emplace_back(std::make_unique<CommentInfo>());
-  BlankLine->Children.back()->Kind = CommentKind::CK_TextComment;
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Brief = Top.Children.back().get();
-  Brief->Kind = CommentKind::CK_ParagraphComment;
-  Brief->Children.emplace_back(std::make_unique<CommentInfo>());
-  Brief->Children.back()->Kind = CommentKind::CK_TextComment;
-  Brief->Children.back()->Name = "ParagraphComment";
-  Brief->Children.back()->Text = " Brief description.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Extended = Top.Children.back().get();
-  Extended->Kind = CommentKind::CK_ParagraphComment;
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " Extended description that";
-  Extended->Children.emplace_back(std::make_unique<CommentInfo>());
-  Extended->Children.back()->Kind = CommentKind::CK_TextComment;
-  Extended->Children.back()->Text = " continues onto the next line.";
-
-  Top.Children.emplace_back(std::make_unique<CommentInfo>());
-  CommentInfo *Entities = Top.Children.back().get();
-  Entities->Kind = CommentKind::CK_ParagraphComment;
-  Entities->Children.emplace_back(std::make_unique<CommentInfo>());
-  Entities->Children.back()->Kind = CommentKind::CK_TextComment;
-  Entities->Children.back()->Name = "ParagraphComment";
-  Entities->Children.back()->Text =
-      " Comment with html entities: &, <, >, \", \'.";
-
-  I.Description.emplace_back(std::move(Top));
-
-  auto G = getHTMLGenerator();
-  assert(G);
-  std::string Buffer;
-  llvm::raw_string_ostream Actual(Buffer);
+  ASSERT_THAT(G, NotNull()) << "Could not find HTMLGenerator";
   ClangDocContext CDCtx = getClangDocContext();
-  auto Err = G->generateDocForInfo(&I, Actual, CDCtx);
-  assert(!Err);
-  std::string Expected = R"raw(
-
-
-
-test-project
-
-
-f
-
-void f(int I, int J)
-
-Defined at line 10 of file test.cpp
-
-Brief description.
-
-Extended description that continues onto the next line.
-
-Comment with html entities: &, <, >, ", '.
-
-)raw" +
-                         ClangDocVersion + R"raw(
-
-)raw";
-
-  EXPECT_EQ(Expected, Actual.str());
+  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
+      << "Empty UserStylesheets or JsScripts should fail!";
 }
-
-} // namespace doc
-} // namespace clang
diff --git a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp b/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
deleted file mode 100644
index 7bbd299d7b59c..0000000000000
--- a/clang-tools-extra/unittests/clang-doc/HTMLMustacheGeneratorTest.cpp
+++ /dev/null
@@ -1,66 +0,0 @@
-//===-- clang-doc/HTMLMustacheGeneratorTest.cpp ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangDocTest.h"
-#include "Generators.h"
-#include "Representation.h"
-#include "config.h"
-#include "support/Utils.h"
-#include "clang/Basic/Version.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gmock/gmock.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace testing;
-using namespace clang;
-using namespace clang::doc;
-
-// FIXME: Don't enable unit tests that can read files. Remove once we can use
-// lit to test these properties.
-#define ENABLE_LOCAL_TEST 0
-
-static const std::string ClangDocVersion = getClangToolFullVersion("clang-doc");
-
-static std::unique_ptr<Generator> getHTMLMustacheGenerator() {
-  auto G = findGeneratorByName("mustache");
-  if (!G)
-    return nullptr;
-  return std::move(G.get());
-}
-
-class HTMLMustacheGeneratorTest : public ClangDocContextTest {
-protected:
-  ClangDocContext
-  getClangDocContext(std::vector<std::string> UserStylesheets = {},
-                     StringRef RepositoryUrl = "",
-                     StringRef RepositoryLinePrefix = "", StringRef Base = "") {
-    ClangDocContext CDCtx{nullptr,
-                          "test-project",
-                          false,
-                          "",
-                          "",
-                          RepositoryUrl,
-                          RepositoryLinePrefix,
-                          Base,
-                          UserStylesheets,
-                          Diags,
-                          false};
-    CDCtx.UserStylesheets.insert(CDCtx.UserStylesheets.begin(), "");
-    CDCtx.JsScripts.emplace_back("");
-    return CDCtx;
-  }
-};
-
-TEST_F(HTMLMustacheGeneratorTest, createResources) {
-  auto G = getHTMLMustacheGenerator();
-  ASSERT_THAT(G, NotNull()) << "Could not find HTMLMustacheGenerator";
-  ClangDocContext CDCtx = getClangDocContext();
-  EXPECT_THAT_ERROR(G->createResources(CDCtx), Failed())
-      << "Empty UserStylesheets or JsScripts should fail!";
-}

From 4bff9fdb908b1f9a3710e4570d249e24bd2aae4d Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Tue, 9 Dec 2025 19:50:45 +0000
Subject: [PATCH 45/63] [gn build] Port 24117f75ad9d

---
 llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn            | 1 -
 .../gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn         | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
index 5815537318b3a..280d72f4d36b5 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-doc/BUILD.gn
@@ -22,7 +22,6 @@ static_library("clang-doc") {
     "ClangDoc.cpp",
     "Generators.cpp",
     "HTMLGenerator.cpp",
-    "HTMLMustacheGenerator.cpp",
     "JSONGenerator.cpp",
     "MDGenerator.cpp",
     "Mapper.cpp",
diff --git a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
index 4cbd3b96d8ff4..427a64e7a8b10 100644
--- a/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang-tools-extra/unittests/clang-doc/BUILD.gn
@@ -40,7 +40,6 @@ unittest("ClangDocTests") {
     "ClangDocTest.cpp",
     "GeneratorTest.cpp",
     "HTMLGeneratorTest.cpp",
-    "HTMLMustacheGeneratorTest.cpp",
    "JSONGeneratorTest.cpp",
    "MDGeneratorTest.cpp",
    "MergeTest.cpp",

From dc92bd03c965a200724a51ec6d5e89f2d449f1dd Mon Sep 17 00:00:00 2001
From: Andrew Haberlandt
Date: Tue, 9 Dec 2025 12:05:43 -0800
Subject: [PATCH 46/63] [sanitizer_common] [Darwin] Replace pty with pipe on
 posix_spawn path for spawning symbolizer (#170809)

Due to a legacy incompatibility with `atos`, we were allocating a pty
whenever we spawned the symbolizer. This is no longer necessary and we can
use a regular ol' pipe.

This PR is split into two commits:
- The first removes the pty allocation and replaces it with a pipe. This
  relocates the `CreateTwoHighNumberedPipes` call to be common to the
  `posix_spawn` and `StartSubprocess` path.
- The second commit adds the `child_stdin_fd_` field to
  `SymbolizerProcess`, storing the read end of the stdin pipe. By holding
  on to this fd for the lifetime of the symbolizer, we are able to avoid
  getting SIGPIPE (which would occur when we write to a pipe whose
  read-end had been closed due to the death of the symbolizer).

This will be very close to solving #120915, but this PR is intentionally
not touching the non-posix_spawn path.

rdar://165894284
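For reference, the shape of the new spawn path, as a simplified standalone
sketch using the same POSIX calls (error handling and the sanitizer-internal
wrappers such as `internal_close` are omitted; `spawn_with_pipes` is an
illustrative name, not a function from this patch):

```c
#include <spawn.h>
#include <unistd.h>

// Spawn argv[0] with its stdin/stdout wired to two plain pipes instead of a
// pty. The parent keeps infd[0] (child stdout) and outfd[1] (child stdin),
// and also holds on to outfd[0] so that a dead child cannot turn a later
// write into a fatal SIGPIPE.
static pid_t spawn_with_pipes(const char *argv[], char *const envp[],
                              int infd[2], int outfd[2]) {
  if (pipe(infd) != 0 || pipe(outfd) != 0)  // child stdout / child stdin
    return -1;

  posix_spawn_file_actions_t acts;
  posix_spawn_file_actions_init(&acts);
  posix_spawn_file_actions_adddup2(&acts, outfd[0], STDIN_FILENO);
  posix_spawn_file_actions_adddup2(&acts, infd[1], STDOUT_FILENO);
  posix_spawn_file_actions_addclose(&acts, outfd[0]);
  posix_spawn_file_actions_addclose(&acts, infd[1]);

  pid_t pid = -1;
  int res = posix_spawn(&pid, argv[0], &acts, /*attrp=*/NULL,
                        (char *const *)argv, envp);
  posix_spawn_file_actions_destroy(&acts);
  return res == 0 ? pid : -1;
}
```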
---
 .../lib/sanitizer_common/sanitizer_mac.cpp         | 97 ++++++-------------
 .../lib/sanitizer_common/sanitizer_posix.h         |  3 +-
 .../sanitizer_symbolizer_internal.h                |  6 +-
 .../sanitizer_symbolizer_libcdep.cpp               | 12 ++-
 .../sanitizer_symbolizer_posix_libcdep.cpp         | 37 +++----
 5 files changed, 65 insertions(+), 90 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index a6f757173728b..3f8de8dd064a5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -281,53 +281,43 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp,
                           (size_t)newlen);
 }
 
-static fd_t internal_spawn_impl(const char *argv[], const char *envp[],
-                                pid_t *pid) {
-  fd_t primary_fd = kInvalidFd;
-  fd_t secondary_fd = kInvalidFd;
-
+bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid,
+                    fd_t fd_stdin, fd_t fd_stdout) {
+  // NOTE: Caller ensures that fd_stdin and fd_stdout are not 0, 1, or 2, since
+  // this can break communication.
+  //
+  // NOTE: Caller is responsible for closing fd_stdin after the process has
+  // died.
+
+  int res;
   auto fd_closer = at_scope_exit([&] {
-    internal_close(primary_fd);
-    internal_close(secondary_fd);
+    // NOTE: We intentionally do not close fd_stdin since this can
+    // cause us to receive a fatal SIGPIPE if the process dies.
+    internal_close(fd_stdout);
   });
 
-  // We need a new pseudoterminal to avoid buffering problems. The 'atos' tool
-  // in particular detects when it's talking to a pipe and forgets to flush the
-  // output stream after sending a response.
-  primary_fd = posix_openpt(O_RDWR);
-  if (primary_fd == kInvalidFd)
-    return kInvalidFd;
-
-  int res = grantpt(primary_fd) || unlockpt(primary_fd);
-  if (res != 0) return kInvalidFd;
-
-  // Use TIOCPTYGNAME instead of ptsname() to avoid threading problems.
-  char secondary_pty_name[128];
-  res = ioctl(primary_fd, TIOCPTYGNAME, secondary_pty_name);
-  if (res == -1) return kInvalidFd;
-
-  secondary_fd = internal_open(secondary_pty_name, O_RDWR);
-  if (secondary_fd == kInvalidFd)
-    return kInvalidFd;
-
   // File descriptor actions
   posix_spawn_file_actions_t acts;
   res = posix_spawn_file_actions_init(&acts);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   auto acts_cleanup = at_scope_exit([&] {
     posix_spawn_file_actions_destroy(&acts);
   });
 
-  res = posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDIN_FILENO) ||
-        posix_spawn_file_actions_adddup2(&acts, secondary_fd, STDOUT_FILENO) ||
-        posix_spawn_file_actions_addclose(&acts, secondary_fd);
-  if (res != 0) return kInvalidFd;
+  res = posix_spawn_file_actions_adddup2(&acts, fd_stdin, STDIN_FILENO) ||
+        posix_spawn_file_actions_adddup2(&acts, fd_stdout, STDOUT_FILENO) ||
+        posix_spawn_file_actions_addclose(&acts, fd_stdin) ||
+        posix_spawn_file_actions_addclose(&acts, fd_stdout);
+  if (res != 0)
+    return false;
 
   // Spawn attributes
   posix_spawnattr_t attrs;
   res = posix_spawnattr_init(&attrs);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   auto attrs_cleanup = at_scope_exit([&] {
     posix_spawnattr_destroy(&attrs);
@@ -336,50 +326,17 @@ static fd_t internal_spawn_impl(const char *argv[], const char *envp[],
   // In the spawned process, close all file descriptors that are not explicitly
   // described by the file actions object. This is Darwin-specific extension.
   res = posix_spawnattr_setflags(&attrs, POSIX_SPAWN_CLOEXEC_DEFAULT);
-  if (res != 0) return kInvalidFd;
+  if (res != 0)
+    return false;
 
   // posix_spawn
   char **argv_casted = const_cast<char **>(argv);
   char **envp_casted = const_cast<char **>(envp);
   res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, envp_casted);
-  if (res != 0) return kInvalidFd;
-
-  // Disable echo in the new terminal, disable CR.
-  struct termios termflags;
-  tcgetattr(primary_fd, &termflags);
-  termflags.c_oflag &= ~ONLCR;
-  termflags.c_lflag &= ~ECHO;
-  tcsetattr(primary_fd, TCSANOW, &termflags);
-
-  // On success, do not close primary_fd on scope exit.
-  fd_t fd = primary_fd;
-  primary_fd = kInvalidFd;
-
-  return fd;
-}
-
-fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid) {
-  // The client program may close its stdin and/or stdout and/or stderr thus
-  // allowing open/posix_openpt to reuse file descriptors 0, 1 or 2. In this
-  // case the communication is broken if either the parent or the child tries to
-  // close or duplicate these descriptors. We temporarily reserve these
-  // descriptors here to prevent this.
-  fd_t low_fds[3];
-  size_t count = 0;
-
-  for (; count < 3; count++) {
-    low_fds[count] = posix_openpt(O_RDWR);
-    if (low_fds[count] >= STDERR_FILENO)
-      break;
-  }
-
-  fd_t fd = internal_spawn_impl(argv, envp, pid);
-
-  for (; count > 0; count--) {
-    internal_close(low_fds[count]);
-  }
+  if (res != 0)
+    return false;
 
-  return fd;
+  return true;
 }
 
 uptr internal_rename(const char *oldpath, const char *newpath) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
index b5491c540dc08..063408b8360c1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h
@@ -67,7 +67,8 @@ uptr internal_ptrace(int request, int pid, void *addr, void *data);
 uptr internal_waitpid(int pid, int *status, int options);
 
 int internal_fork();
-fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid);
+bool internal_spawn(const char* argv[], const char* envp[], pid_t* pid,
+                    fd_t stdin, fd_t stdout);
 
 int internal_sysctl(const int *name, unsigned int namelen, void *oldp,
                     uptr *oldlenp, const void *newp, uptr newlen);
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
index 2345aee985541..6442a2980bf2f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h
@@ -83,7 +83,7 @@ class SymbolizerProcess {
   const char *SendCommand(const char *command);
 
 protected:
-  ~SymbolizerProcess() {}
+  ~SymbolizerProcess();
 
   /// The maximum number of arguments required to invoke a tool process.
   static const unsigned kArgVMax = 16;
@@ -114,6 +114,10 @@ class SymbolizerProcess {
   fd_t input_fd_;
   fd_t output_fd_;
 
+  // We hold on to the child's stdin fd (the read end of the pipe)
+  // so that when we write to it, we don't get a SIGPIPE
+  fd_t child_stdin_fd_;
+
   InternalMmapVector<char> buffer_;
 
   static const uptr kMaxTimesRestarted = 5;
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
index 565701c85d978..cc31d3d8056f9 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_libcdep.cpp
@@ -476,10 +476,11 @@ const char *LLVMSymbolizer::FormatAndSendCommand(const char *command_prefix,
   return symbolizer_process_->SendCommand(buffer_);
 }
 
-SymbolizerProcess::SymbolizerProcess(const char *path, bool use_posix_spawn)
+SymbolizerProcess::SymbolizerProcess(const char* path, bool use_posix_spawn)
     : path_(path),
       input_fd_(kInvalidFd),
       output_fd_(kInvalidFd),
+      child_stdin_fd_(kInvalidFd),
       times_restarted_(0),
       failed_to_start_(false),
       reported_invalid_path_(false),
@@ -488,6 +489,11 @@ SymbolizerProcess::SymbolizerProcess(const char* path, bool use_posix_spawn)
   CHECK_NE(path_[0], '\0');
 }
 
+SymbolizerProcess::~SymbolizerProcess() {
+  if (child_stdin_fd_ != kInvalidFd)
+    CloseFile(child_stdin_fd_);
+}
+
 static bool IsSameModule(const char *path) {
   if (const char *ProcessName = GetProcessName()) {
     if (const char *SymbolizerName = StripModuleName(path)) {
@@ -533,6 +539,10 @@ bool SymbolizerProcess::Restart() {
     CloseFile(input_fd_);
   if (output_fd_ != kInvalidFd)
     CloseFile(output_fd_);
+  if (child_stdin_fd_ != kInvalidFd) {
+    CloseFile(child_stdin_fd_);
+    child_stdin_fd_ = kInvalidFd;  // Don't free in destructor
+  }
   return StartSymbolizerSubprocess();
 }
 
diff --git
a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index 7eb0c9756d64a..29c73e3e1cac1 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -156,30 +156,30 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
     Printf("\n");
   }
 
+  fd_t infd[2] = {}, outfd[2] = {};
+  if (!CreateTwoHighNumberedPipes(infd, outfd)) {
+    Report(
+        "WARNING: Can't create a socket pair to start "
+        "external symbolizer (errno: %d)\n",
+        errno);
+    return false;
+  }
+
   if (use_posix_spawn_) {
 #  if SANITIZER_APPLE
-    fd_t fd = internal_spawn(argv, const_cast<const char **>(GetEnvP()), &pid);
-    if (fd == kInvalidFd) {
+    bool success = internal_spawn(argv, const_cast<const char **>(GetEnvP()),
+                                  &pid, outfd[0], infd[1]);
+    if (!success) {
       Report("WARNING: failed to spawn external symbolizer (errno: %d)\n",
             errno);
+      internal_close(infd[0]);
+      internal_close(outfd[1]);
       return false;
     }
-
-    input_fd_ = fd;
-    output_fd_ = fd;
 #  else  // SANITIZER_APPLE
     UNIMPLEMENTED();
 #  endif  // SANITIZER_APPLE
   } else {
-    fd_t infd[2] = {}, outfd[2] = {};
-    if (!CreateTwoHighNumberedPipes(infd, outfd)) {
-      Report(
-          "WARNING: Can't create a socket pair to start "
-          "external symbolizer (errno: %d)\n",
-          errno);
-      return false;
-    }
-
     pid = StartSubprocess(path_, argv, GetEnvP(), /* stdin */ outfd[0],
                           /* stdout */ infd[1]);
     if (pid < 0) {
@@ -187,11 +187,14 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
       internal_close(outfd[1]);
       return false;
     }
-
-    input_fd_ = infd[0];
-    output_fd_ = outfd[1];
   }
 
+  input_fd_ = infd[0];
+  output_fd_ = outfd[1];
+
+  // We intentionally hold on to the read-end so that we don't get a SIGPIPE
+  child_stdin_fd_ = outfd[0];
+
   CHECK_GT(pid, 0);
 
   // Check that symbolizer subprocess started successfully.

From 7a5e2c9358eabff3d9eb66141590ac453a2e0e08 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 9 Dec 2025 20:06:13 +0000
Subject: [PATCH 47/63] [LV] Add test with threshold=0 and metadata forcing
 vectorization.

Test case for the mis-compile mentioned in
https://github.com/llvm/llvm-project/pull/166247#issuecomment-3631471588

The issue is that we don't generate a runtime check even though it is
required to vectorize.
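For illustration, a caller like the following (hypothetical, not part of the
test below) makes `%src` overlap the range written through `%dst`, which is
exactly the situation the missing runtime check is supposed to send down the
scalar loop:

```llvm
; With %p as both the load source and the store destination, the vector
; body's !noalias assumption does not hold, so skipping the overlap check
; is unsound.
%r = call i16 @runtime_checks_needed(ptr %p, ptr %p)
```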
---
 ...ime-check-threshold-with-force-metadata.ll | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll

diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
new file mode 100644
index 0000000000000..b7d36fe7928e5
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
+; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
+
+; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
+; no runtime check is generated even though one is needed and !noalias
+; annotations are added.
+define i16 @runtime_checks_needed(ptr %src, ptr %dst) {
+; LIMIT0-LABEL: define i16 @runtime_checks_needed(
+; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; LIMIT0-NEXT: [[ENTRY:.*:]]
+; LIMIT0-NEXT: br label %[[VECTOR_PH:.*]]
+; LIMIT0: [[VECTOR_PH]]:
+; LIMIT0-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+; LIMIT0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; LIMIT0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; LIMIT0-NEXT: br label %[[VECTOR_BODY:.*]]
+; LIMIT0: [[VECTOR_BODY]]:
+; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; LIMIT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+; LIMIT0-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; LIMIT0-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; LIMIT0-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; LIMIT0: [[MIDDLE_BLOCK]]:
+; LIMIT0-NEXT: br label %[[EXIT:.*]]
+; LIMIT0: [[EXIT]]:
+; LIMIT0-NEXT: ret i16 [[TMP0]]
+;
+; LIMIT1-LABEL: define i16 @runtime_checks_needed(
+; LIMIT1-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) {
+; LIMIT1-NEXT: [[ENTRY:.*:]]
+; LIMIT1-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
+; LIMIT1: [[VECTOR_MEMCHECK]]:
+; LIMIT1-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 2000
+; LIMIT1-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 2
+; LIMIT1-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
+; LIMIT1-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
+; LIMIT1-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; LIMIT1-NEXT: br i1 [[FOUND_CONFLICT]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; LIMIT1: [[VECTOR_PH]]:
+; LIMIT1-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]]
+; LIMIT1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; LIMIT1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; LIMIT1-NEXT: br label %[[VECTOR_BODY:.*]]
+; LIMIT1: [[VECTOR_BODY]]:
+; LIMIT1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; LIMIT1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]]
+; LIMIT1-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
+; LIMIT1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; LIMIT1-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000
+; LIMIT1-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; LIMIT1: [[MIDDLE_BLOCK]]:
+; LIMIT1-NEXT: br label %[[EXIT:.*]]
+; LIMIT1: [[SCALAR_PH]]:
+; LIMIT1-NEXT: br label %[[LOOP:.*]]
+; LIMIT1: [[LOOP]]:
+; LIMIT1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; LIMIT1-NEXT: [[L:%.*]] = load i16, ptr [[SRC]], align 1
+; LIMIT1-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[IV]]
+; LIMIT1-NEXT: store i16 [[L]], ptr [[GEP_DST]], align 1
+; LIMIT1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; LIMIT1-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000
+; LIMIT1-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
+; LIMIT1: [[EXIT]]:
+; LIMIT1-NEXT: [[L_LCSSA:%.*]] = phi i16 [ [[L]], %[[LOOP]] ], [ [[TMP0]], %[[MIDDLE_BLOCK]] ]
+; LIMIT1-NEXT: ret i16 [[L_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %l = load i16, ptr %src, align 1
+  %gep.dst = getelementptr inbounds i16, ptr %dst, i64 %iv
+  store i16 %l, ptr %gep.dst, align 1
+  %iv.next = add nsw nuw i64 %iv, 1
+  %ec = icmp eq i64 %iv.next, 1000
+  br i1 %ec, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret i16 %l
+}
+
+!0 = distinct !{!0, !2, !3}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+;.
+; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]}
+; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]}
+; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]}
+; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
+; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]}
+; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; LIMIT1: [[META2]] = distinct !{[[META2]], !"LVerDomain"}
+; LIMIT1: [[META3]] = !{[[META4:![0-9]+]]}
+; LIMIT1: [[META4]] = distinct !{[[META4]], [[META2]]}
+; LIMIT1: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; LIMIT1: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; LIMIT1: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; LIMIT1: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]]}
+;.

From b3d05e680709dbe7988c25c144dabdadc0c4cf27 Mon Sep 17 00:00:00 2001
From: Fateme Hosseini
Date: Tue, 9 Dec 2025 14:07:52 -0600
Subject: [PATCH 48/63] [Hexagon] Add HVX V81 builtins (#170680)

Expose the HVXV81 abs, conversion, comparison, log2, negate and mixed
subtract intrinsics so Clang can emit the new instructions.
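As a usage sketch (my illustration, not part of the patch): following Clang's
usual Hexagon convention, each `def` below is exposed as a
`__builtin_HEXAGON_<name>` builtin, so with HVX enabled one could write, for
example:

```c
/* Hypothetical snippet; assumes a v81 target with -mhvx (64-byte mode). */
typedef int HVX_Vector __attribute__((__vector_size__(64)));

HVX_Vector abs_qf16_from_hf(HVX_Vector x) {
  /* qf16 absolute value of a half-float vector. */
  return __builtin_HEXAGON_V6_vabs_qf16_hf(x);
}
```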
---
 clang/include/clang/Basic/BuiltinsHexagon.td | 66 ++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/clang/include/clang/Basic/BuiltinsHexagon.td b/clang/include/clang/Basic/BuiltinsHexagon.td
index cf18359e7bf60..00f84cd72a051 100644
--- a/clang/include/clang/Basic/BuiltinsHexagon.td
+++ b/clang/include/clang/Basic/BuiltinsHexagon.td
@@ -2146,3 +2146,69 @@ let Features = HVXV79.Features in {
   def V6_vsub_hf_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>, _Vector<16, int>)">;
   def V6_vsub_hf_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>, _Vector<32, int>)">;
 }
+
+// V81 HVX Instructions.
+let Features = HVXV81.Features in {
+  def V6_vabs_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vabs_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vabs_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_valign4 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, int)">;
+  def V6_valign4_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>, int)">;
+  def V6_vconv_bf_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<32, int>)">;
+  def V6_vconv_bf_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<64, int>)">;
+  def V6_vconv_f8_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_f8_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_h_hf_rnd : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_h_hf_rnd_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_f8 : HexagonBuiltin<"_Vector<32, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_f8_128B : HexagonBuiltin<"_Vector<64, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vconv_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vconv_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_veqhf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqhf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqhf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf : HexagonBuiltin<"_Vector<64, bool>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_and : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_and_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_or : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_or_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_veqsf_xor : HexagonBuiltin<"_Vector<64, bool>(_Vector<64, bool>, _Vector<16, int>, _Vector<16, int>)">;
+  def V6_veqsf_xor_128B : HexagonBuiltin<"_Vector<128, bool>(_Vector<128, bool>, _Vector<32, int>, _Vector<32, int>)">;
+  def V6_vilog2_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vilog2_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vilog2_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_hf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_hf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf16_qf16 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf16_qf16_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_qf32 : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_qf32_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vneg_qf32_sf : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>)">;
+  def V6_vneg_qf32_sf_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>)">;
+  def V6_vsub_hf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_hf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+  def V6_vsub_sf_mix : HexagonBuiltin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>)">;
+  def V6_vsub_sf_mix_128B : HexagonBuiltin<"_Vector<32, int>(_Vector<32, int>, _Vector<32, int>)">;
+}

From 0eb00eff475dd3950d8a1e7db14f3905d67d119e Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Tue, 9 Dec 2025 12:15:56 -0800
Subject: [PATCH 49/63] [alpha.webkit.RetainPtrCtorAdoptChecker] Don't treat
 assignment to an +1 out argument as a leak (#161633)

Make RetainPtrCtorAdoptChecker recognize an assignment to an +1 out
argument so that it won't emit a memory leak warning.
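The pattern this covers is the one added to the test below:

```cpp
void get_cf_obj(CFTypeRef* CF_RETURNS_RETAINED result) {
  // The +1 reference is handed to the caller through the annotated out
  // argument, so the checker no longer reports it as a leak.
  *result = CFArrayCreateMutable(kCFAllocatorDefault, 1);
}
```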
---
 .../WebKit/RetainPtrCtorAdoptChecker.cpp | 36 +++++++++++++++----
 .../WebKit/retain-ptr-ctor-adopt-use.mm  |  8 +++++
 2 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index 955b8d19a820c..07ef699a5d883 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -355,15 +355,37 @@ class RetainPtrCtorAdoptChecker
   void visitBinaryOperator(const BinaryOperator *BO) const {
     if (!BO->isAssignmentOp())
       return;
-    if (!isa<DeclRefExpr>(BO->getLHS()))
-      return;
+    auto *LHS = BO->getLHS();
     auto *RHS = BO->getRHS()->IgnoreParenCasts();
-    const Expr *Inner = nullptr;
-    if (isAllocInit(RHS, &Inner)) {
-      CreateOrCopyFnCall.insert(RHS);
-      if (Inner)
-        CreateOrCopyFnCall.insert(Inner);
+    if (isa<DeclRefExpr>(LHS)) {
+      const Expr *Inner = nullptr;
+      if (isAllocInit(RHS, &Inner)) {
+        CreateOrCopyFnCall.insert(RHS);
+        if (Inner)
+          CreateOrCopyFnCall.insert(Inner);
+      }
+      return;
     }
+    auto *UO = dyn_cast<UnaryOperator>(LHS);
+    if (!UO)
+      return;
+    auto OpCode = UO->getOpcode();
+    if (OpCode != UO_Deref)
+      return;
+    auto *DerefTarget = UO->getSubExpr();
+    if (!DerefTarget)
+      return;
+    DerefTarget = DerefTarget->IgnoreParenCasts();
+    auto *DRE = dyn_cast<DeclRefExpr>(DerefTarget);
+    if (!DRE)
+      return;
+    auto *Decl = DRE->getDecl();
+    if (!Decl)
+      return;
+    if (!isa<ParmVarDecl>(Decl) || !isCreateOrCopy(RHS))
+      return;
+    if (Decl->hasAttr<CFReturnsRetainedAttr>())
+      CreateOrCopyFnCall.insert(RHS);
   }
 
   void visitReturnStmt(const ReturnStmt *RS, const Decl *DeclWithIssue) const {
diff --git a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
index 45705615f3196..427affdbbd601 100644
--- a/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
+++ b/clang/test/Analysis/Checkers/WebKit/retain-ptr-ctor-adopt-use.mm
@@ -190,6 +190,14 @@ void adopt_retainptr() {
   auto bar = adoptNS([allocSomeObj() init]);
 }
 
+CFTypeRef make_cf_obj() CF_RETURNS_RETAINED {
+  return CFArrayCreateMutable(kCFAllocatorDefault, 1);
+}
+
+void get_cf_obj(CFTypeRef* CF_RETURNS_RETAINED result) {
+  *result = CFArrayCreateMutable(kCFAllocatorDefault, 1);
+}
+
 RetainPtr return_arg(CFArrayRef arg) {
   return arg;
 }

From f9326ffb7ede55229a81907cc03aaa8c523520ab Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Tue, 9 Dec 2025 12:28:22 -0800
Subject: [PATCH 50/63] [WebKit checkers] Treat a weak property / variable as
 safe (#163689)

Treat a weak Objective-C property, ivar, member variable, and local
variable as safe.
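For example, both of these now pass the checkers without diagnostics
(condensed from the tests added below; `provideStr` is declared in those
tests):

```objc
@interface ObjectWithWeakProperty : NSObject
@property(nonatomic, weak) NSString *weak_prop; // weak property: safe
@end

int useWeakLocal() {
  __weak NSString *weakStr = provideStr();      // weak local: safe
  return [weakStr length];
}
```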
---
 .../WebKit/RawPtrRefLambdaCapturesChecker.cpp |  2 ++
 .../WebKit/RawPtrRefLocalVarsChecker.cpp      |  2 ++
 .../WebKit/RawPtrRefMemberChecker.cpp         |  7 ++++-
 .../unretained-lambda-captures-weak-arc.mm    | 22 +++++++++++++
 .../WebKit/unretained-lambda-captures-weak.mm | 22 +++++++++++++
 .../WebKit/unretained-local-vars-weak-arc.mm  | 13 ++++++++
 .../WebKit/unretained-local-vars-weak.mm      | 13 ++++++++
 .../WebKit/unretained-members-weak-arc.mm     | 29 +++++++++++++++++
 .../WebKit/unretained-members-weak.mm         | 31 +++++++++++++++++++
 9 files changed, 140 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
 create mode 100644 clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
index f60d1936b7584..f3fadeaefc491 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp
@@ -587,6 +587,8 @@ class UnretainedLambdaCapturesChecker : public RawPtrRefLambdaCapturesChecker {
   }
 
   std::optional<bool> isUnsafePtr(QualType QT) const final {
+    if (QT.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(QT);
   }
 
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
index c13df47920f72..f2235e7c25ab2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLocalVarsChecker.cpp
@@ -433,6 +433,8 @@ class UnretainedLocalVarsChecker final : public RawPtrRefLocalVarsChecker {
     RTC = RetainTypeChecker();
   }
   std::optional<bool> isUnsafePtr(const QualType T) const final {
+    if (T.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(T);
   }
   bool isSafePtr(const CXXRecordDecl *Record) const final {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
index ace639ce7ab18..0e23ae34ea212 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefMemberChecker.cpp
@@ -231,8 +231,11 @@ class RawPtrRefMemberChecker
     // "assign" property doesn't retain even under ARC so treat it as unsafe.
     bool ignoreARC =
         !PD->isReadOnly() && PD->getSetterKind() == ObjCPropertyDecl::Assign;
+    bool IsWeak =
+        PD->getPropertyAttributes() & ObjCPropertyAttribute::kind_weak;
+    bool HasSafeAttr = PD->isRetaining() || IsWeak;
     auto IsUnsafePtr = isUnsafePtr(QT, ignoreARC);
-    return {IsUnsafePtr && *IsUnsafePtr && !PD->isRetaining(), PropType};
+    return {IsUnsafePtr && *IsUnsafePtr && !HasSafeAttr, PropType};
   }
 
   bool shouldSkipDecl(const RecordDecl *RD) const {
@@ -363,6 +366,8 @@ class NoUnretainedMemberChecker final : public RawPtrRefMemberChecker {
   }
 
   std::optional<bool> isUnsafePtr(QualType QT, bool ignoreARC) const final {
+    if (QT.hasStrongOrWeakObjCLifetime())
+      return false;
     return RTC->isUnretained(QT, ignoreARC);
   }
 
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
new file mode 100644
index 0000000000000..a52bc7c9a5572
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak-arc.mm
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+void someFunction();
+template <typename Callback> void call(Callback callback) {
+  someFunction();
+  callback();
+}
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+void foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  auto lambda = [weakStr, weakObj]() {
+    return [weakStr length] + [weakObj value];
+  };
+  call(lambda);
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
new file mode 100644
index 0000000000000..7439d7f8bb93b
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-lambda-captures-weak.mm
@@ -0,0 +1,22 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLambdaCapturesChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+void someFunction();
+template <typename Callback> void call(Callback callback) {
+  someFunction();
+  callback();
+}
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+void foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  auto lambda = [weakStr, weakObj]() {
+    return [weakStr length] + [weakObj value];
+  };
+  call(lambda);
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
new file mode 100644
index 0000000000000..8c709b5921227
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak-arc.mm
@@ -0,0 +1,13 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+int foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  return [weakStr length] + [weakObj value];
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
new file mode 100644
index 0000000000000..3ac4ff9d1e4cb
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-local-vars-weak.mm
@@ -0,0 +1,13 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.UnretainedLocalVarsChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+NSString *provideStr();
+SomeObj *provideSomeObj();
+
+int foo() {
+  __weak NSString *weakStr = provideStr();
+  __weak SomeObj *weakObj = provideSomeObj();
+  return [weakStr length] + [weakObj value];
+}
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
new file mode 100644
index 0000000000000..c0aaac09e68d8
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak-arc.mm
@@ -0,0 +1,29 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -fobjc-arc -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+struct Foo {
+  __weak NSString *weakPtr = nullptr;
+  Foo();
+  ~Foo();
+  void bar();
+};
+
+@interface ObjectWithWeakProperty : NSObject
+@property(nonatomic, weak) NSString *weak_prop;
+@end
+
+@implementation ObjectWithWeakProperty
+@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthesisObjectWithWeakProperty : NSObject
+@property(nonatomic, readonly, weak) NSString *weak_prop;
+@end
+
+@implementation NoSynthesisObjectWithWeakProperty
+- (NSString *)weak_prop {
+  return nil;
+}
+@end
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm
new file mode 100644
index 0000000000000..422cf6189446d
--- /dev/null
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-members-weak.mm
@@ -0,0 +1,31 @@
+// RUN: %clang_analyze_cc1 -analyzer-checker=alpha.webkit.NoUnretainedMemberChecker -fobjc-runtime-has-weak -fobjc-weak -verify %s
+// expected-no-diagnostics
+
+#include "objc-mock-types.h"
+
+struct Foo {
+  __weak NSString *weakPtr = nullptr;
+  Foo();
+  ~Foo();
+  void bar();
+};
+
+@interface ObjectWithWeakProperty : NSObject
+@property(nonatomic, weak) NSString *weak_prop;
+@end
+
+@implementation ObjectWithWeakProperty
+@end
+
+NS_REQUIRES_PROPERTY_DEFINITIONS
+@interface NoSynthesisObjectWithWeakProperty : NSObject
+@property(nonatomic, readonly, weak) NSString *weak_prop;
+@end
+
+@implementation NoSynthesisObjectWithWeakProperty {
+  __weak NSNumber *weak_ivar;
+}
+- (NSString *)weak_prop {
+  return nil;
+}
+@end

From 06f0758282bfa5457ea779e66c79fdff34e58320 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Tue, 9 Dec 2025 12:29:11 -0800
Subject: [PATCH 51/63] [alpha.webkit.UnretainedCallArgsChecker] Recognize
 [allocObj() init] pattern (#161019)

Generalize the check for recognizing [[Obj alloc] init] to also recognize
[allocObj() init]. We do this by utilizing the isAllocInit function in
RetainPtrCtorAdoptChecker.
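The newly accepted shape, as exercised by the test below:

```cpp
SomeObj *allocObj(); // +1: treated like +[SomeObj alloc] because the
                     // function name starts with "alloc"

void createObject() {
  adoptNS([allocObj() init]); // now recognized; no unsafe-receiver warning
}
```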
---
 .../Checkers/WebKit/ASTUtils.cpp              | 45 +++++++++++++++++++
 .../StaticAnalyzer/Checkers/WebKit/ASTUtils.h |  4 ++
 .../WebKit/RawPtrRefCallArgsChecker.cpp       |  9 +---
 .../WebKit/RetainPtrCtorAdoptChecker.cpp      | 44 ------------------
 .../Checkers/WebKit/unretained-call-args.mm   |  3 ++
 5 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
index 84adbf318e9f8..e46bba7be4aea 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.cpp
@@ -321,6 +321,51 @@ bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E) {
   return result && *result;
 }
 
+bool isAllocInit(const Expr *E, const Expr **InnerExpr) {
+  auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E);
+  if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) {
+    if (unsigned ExprCount = POE->getNumSemanticExprs()) {
+      auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts();
+      ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr);
+      if (InnerExpr)
+        *InnerExpr = ObjCMsgExpr;
+    }
+  }
+  if (!ObjCMsgExpr)
+    return false;
+  auto Selector = ObjCMsgExpr->getSelector();
+  auto NameForFirstSlot = Selector.getNameForSlot(0);
+  if (NameForFirstSlot.starts_with("alloc") ||
+      NameForFirstSlot.starts_with("copy") ||
+      NameForFirstSlot.starts_with("mutableCopy"))
+    return true;
+  if (!NameForFirstSlot.starts_with("init") &&
+      !NameForFirstSlot.starts_with("_init"))
+    return false;
+  if (!ObjCMsgExpr->isInstanceMessage())
+    return false;
+  auto *Receiver = ObjCMsgExpr->getInstanceReceiver();
+  if (!Receiver)
+    return false;
+  Receiver = Receiver->IgnoreParenCasts();
+  if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) {
+    if (InnerExpr)
+      *InnerExpr = Inner;
+    auto InnerSelector = Inner->getSelector();
+    return InnerSelector.getNameForSlot(0).starts_with("alloc");
+  } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) {
+    if (InnerExpr)
+      *InnerExpr = CE;
+    if (auto *Callee = CE->getDirectCallee()) {
+      if (Callee->getDeclName().isIdentifier()) {
+        auto CalleeName = Callee->getName();
+        return CalleeName.starts_with("alloc");
+      }
+    }
+  }
+  return false;
+}
+
 class EnsureFunctionVisitor
     : public ConstStmtVisitor<EnsureFunctionVisitor, bool> {
 public:
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
index 9fff456b7e8b8..d0a3e471365e2 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/ASTUtils.h
@@ -77,6 +77,10 @@ bool isConstOwnerPtrMemberExpr(const clang::Expr *E);
 /// supports CheckedPtr.
 bool isExprToGetCheckedPtrCapableMember(const clang::Expr *E);
 
+/// \returns true if \p E is a [[alloc] init] pattern expression.
+/// Sets \p InnerExpr to the inner function call or selector invocation.
+bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr);
+
 /// \returns true if E is a CXXMemberCallExpr which returns a const smart
 /// pointer type.
 class EnsureFunctionAnalysis {
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
index 791e70998477f..dcc14a0aecdf7 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefCallArgsChecker.cpp
@@ -177,16 +177,11 @@ class RawPtrRefCallArgsChecker
     if (BR->getSourceManager().isInSystemHeader(E->getExprLoc()))
       return;
 
-    auto Selector = E->getSelector();
     if (auto *Receiver = E->getInstanceReceiver()) {
       std::optional<bool> IsUnsafe = isUnsafePtr(E->getReceiverType());
       if (IsUnsafe && *IsUnsafe && !isPtrOriginSafe(Receiver)) {
-        if (auto *InnerMsg = dyn_cast<ObjCMessageExpr>(Receiver)) {
-          auto InnerSelector = InnerMsg->getSelector();
-          if (InnerSelector.getNameForSlot(0) == "alloc" &&
-              Selector.getNameForSlot(0).starts_with("init"))
-            return;
-        }
+        if (isAllocInit(E))
+          return;
         reportBugOnReceiver(Receiver, D);
       }
     }
diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
index 07ef699a5d883..2af9067f8f808 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RetainPtrCtorAdoptChecker.cpp
@@ -445,50 +445,6 @@ class RetainPtrCtorAdoptChecker
     return std::nullopt;
   }
 
-  bool isAllocInit(const Expr *E, const Expr **InnerExpr = nullptr) const {
-    auto *ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(E);
-    if (auto *POE = dyn_cast<PseudoObjectExpr>(E)) {
-      if (unsigned ExprCount = POE->getNumSemanticExprs()) {
-        auto *Expr = POE->getSemanticExpr(ExprCount - 1)->IgnoreParenCasts();
-        ObjCMsgExpr = dyn_cast<ObjCMessageExpr>(Expr);
-        if (InnerExpr)
-          *InnerExpr = ObjCMsgExpr;
-      }
-    }
-    if (!ObjCMsgExpr)
-      return false;
-    auto Selector = ObjCMsgExpr->getSelector();
-    auto NameForFirstSlot = Selector.getNameForSlot(0);
-    if (NameForFirstSlot == "alloc" || NameForFirstSlot.starts_with("copy") ||
-        NameForFirstSlot.starts_with("mutableCopy"))
-      return true;
-    if (!NameForFirstSlot.starts_with("init") &&
-        !NameForFirstSlot.starts_with("_init"))
-      return false;
-    if (!ObjCMsgExpr->isInstanceMessage())
-      return false;
-    auto *Receiver = ObjCMsgExpr->getInstanceReceiver();
-    if (!Receiver)
-      return false;
-    Receiver = Receiver->IgnoreParenCasts();
-    if (auto *Inner = dyn_cast<ObjCMessageExpr>(Receiver)) {
-      if (InnerExpr)
-        *InnerExpr = Inner;
-      auto InnerSelector = Inner->getSelector();
-      return InnerSelector.getNameForSlot(0) == "alloc";
-    } else if (auto *CE = dyn_cast<CallExpr>(Receiver)) {
-      if (InnerExpr)
-        *InnerExpr = CE;
-      if (auto *Callee = CE->getDirectCallee()) {
-        if (Callee->getDeclName().isIdentifier()) {
-          auto CalleeName = Callee->getName();
-          return CalleeName.starts_with("alloc");
-        }
-      }
-    }
-    return false;
-  }
-
   bool isCreateOrCopy(const Expr *E) const {
     auto *CE = dyn_cast<CallExpr>(E);
     if (!CE)
diff --git a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
index 8bef24f93ceed..cfc214fae33e5 100644
--- a/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
+++ b/clang/test/Analysis/Checkers/WebKit/unretained-call-args.mm
@@ -625,6 +625,8 @@ void foo() {
 
 } // namespace template_function
 
+SomeObj *allocObj();
+
 @interface TestObject : NSObject
 - (void)doWork:(NSString *)msg, ...;
 - (void)doWorkOnSelf;
@@ -647,6 +649,7 @@ - (void)doWorkOnSelf {
   [self doWork:__null];
   [self doWork:nil];
   [NSApp run];
+  adoptNS([allocObj() init]);
 }
 
 - (SomeObj *)getSomeObj {
From 8a115b6934a90441d77ea54af73e7aaaa1394b38 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 9 Dec 2025 20:37:20 +0000
Subject: [PATCH 52/63] [LV] Mark checks as never succeeding for high cost
 cutoff.

When GeneratedRTChecks::create bails out due to exceeding the cost
threshold, no runtime checks are generated and we must not proceed
assuming checks have been generated.

Mark the checks as never succeeding, to make sure we don't try to
vectorize assuming the runtime checks hold.

This fixes a case where we previously incorrectly vectorized assuming
runtime checks had been generated when forcing vectorization via
metadata.

Fixes the mis-compile mentioned in
https://github.com/llvm/llvm-project/pull/166247#issuecomment-3631471588
---
 .../Transforms/Vectorize/LoopVectorize.cpp    |  6 ++-
 ...ime-check-threshold-with-force-metadata.ll | 39 +++++++------------
 2 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 15d0fa41bd902..79cdae25e38da 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1827,8 +1827,12 @@ class GeneratedRTChecks {
     // profile info.
     CostTooHigh =
         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
-    if (CostTooHigh)
+    if (CostTooHigh) {
+      // Mark runtime checks as never succeeding when they exceed the threshold.
+      MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
+      SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext());
       return;
+    }
 
     BasicBlock *LoopHeader = L->getHeader();
     BasicBlock *Preheader = L->getLoopPreheader();
diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
index b7d36fe7928e5..5376eb86882b7 100644
--- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
+++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll
@@ -2,29 +2,23 @@
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s
 ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s
 
-; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0;
-; no runtime check is generated even though one is needed and !noalias
-; annotations are added.
+; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0;
+; no runtime check is generated and the loop should not be vectorized.
 
define i16 @runtime_checks_needed(ptr %src, ptr %dst) { ; LIMIT0-LABEL: define i16 @runtime_checks_needed( ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { -; LIMIT0-NEXT: [[ENTRY:.*:]] -; LIMIT0-NEXT: br label %[[VECTOR_PH:.*]] -; LIMIT0: [[VECTOR_PH]]: -; LIMIT0-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]] -; LIMIT0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 -; LIMIT0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer -; LIMIT0-NEXT: br label %[[VECTOR_BODY:.*]] -; LIMIT0: [[VECTOR_BODY]]: -; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; LIMIT0-NEXT: [[ENTRY:.*]]: +; LIMIT0-NEXT: br label %[[LOOP:.*]] +; LIMIT0: [[LOOP]]: +; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ] +; LIMIT0-NEXT: [[L:%.*]] = load i16, ptr [[SRC]], align 1 ; LIMIT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] -; LIMIT0-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] -; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; LIMIT0-NEXT: store i16 [[L]], ptr [[TMP1]], align 1 +; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1 ; LIMIT0-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; LIMIT0-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; LIMIT0: [[MIDDLE_BLOCK]]: -; LIMIT0-NEXT: br label %[[EXIT:.*]] +; LIMIT0-NEXT: br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; LIMIT0: [[EXIT]]: +; LIMIT0-NEXT: [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ] ; LIMIT0-NEXT: ret i16 [[TMP0]] ; ; LIMIT1-LABEL: define i16 @runtime_checks_needed( @@ -88,14 +82,9 @@ exit: !3 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]} -; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} -; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"} -; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]} -; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]} -; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} -; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} -; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} +; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2} +; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true} ;. ; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]} ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} From af3c3ecb181994146df20c82c3046727b62bc269 Mon Sep 17 00:00:00 2001 From: Philip Ginsbach-Chen Date: Tue, 9 Dec 2025 20:41:44 +0000 Subject: [PATCH 53/63] [AArch64] recognise trn1/trn2 with flipped operands (#169858) This PR is very similar to #167235, but applied to `trn` rather than `zip`. There are two further differences: - The `@combine_v8i16_8first` and `@combine_v8i16_8firstundef` test cases in `arm64-zip.ll` didn't have equivalents in `arm64-trn.ll`, so this PR adds new test cases `@vtrni8_8first`, `@vtrni8_9first`, `@vtrni8_89first_undef`. - `AArch64TTIImpl::getShuffleCost` calls `isZIPMask`, but not `isTRNMask`. It relies on `Kind == TTI::SK_Transpose` instead (which in turn is based on `ShuffleVectorInst::isTransposeMask` through `improveShuffleKindFromMask`). 
Therefore, this PR does not itself influence the slp-vectorizer.

In a follow-up PR, I intend to override
`AArch64TTIImpl::improveShuffleKindFromMask` to ensure we get
`ShuffleKind::SK_Transpose` based on the new `isTRNMask`. In fact, that
follow-up change is the actual motivation for this PR, as it will
result in
```C++
int8x16_t g(int8_t x) {
    return (int8x16_t) { 0, x, 1, x, 2, x, 3, x, 4, x, 5, x, 6, x, 7, x };
}
```
from #137447 being optimised by the slp-vectorizer.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  16 +-
 .../Target/AArch64/AArch64PerfectShuffle.h    |  52 +++--
 .../GISel/AArch64PostLegalizerLowering.cpp    |   7 +-
 llvm/test/CodeGen/AArch64/arm64-trn.ll        |  57 ++++++
 .../AArch64/fixed-vector-deinterleave.ll      |   8 +-
 llvm/test/CodeGen/AArch64/reduce-shuffle.ll   | 185 +++++++++---------
 6 files changed, 202 insertions(+), 123 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e140aabb9bbeb..30eb19036ddda 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14965,9 +14965,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
   }
-  if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
+  if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
+    return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
+                       OperandOrder == 0 ? V2 : V1);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
@@ -16679,7 +16680,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
       isREVMask(M, EltSize, NumElts, 16) ||
       isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
       isSingletonEXTMask(M, VT, DummyUnsigned) ||
-      isTRNMask(M, NumElts, DummyUnsigned) ||
+      isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
       isUZPMask(M, NumElts, DummyUnsigned) ||
       isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
       isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31798,10 +31799,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
                        OperandOrder == 0 ? Op1 : Op2,
                        OperandOrder == 0 ? Op2 : Op1));
 
-  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+                OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return convertFromScalableVector(
-        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+    SDValue TRN =
+        DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
+                    OperandOrder == 0 ? Op2 : Op1);
+    return convertFromScalableVector(DAG, VT, TRN);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index ef8786d0ad0e1..c7d6b31291197 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6699,33 +6699,53 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
 }
 
 /// Return true for trn1 or trn2 masks of the form:
-/// <0, 8, 2, 10, 4, 12, 6, 14> or
-/// <1, 9, 3, 11, 5, 13, 7, 15>
+/// <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or
+/// <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
+/// <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or
+/// <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1)
 inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
-                      unsigned &WhichResultOut) {
+                      unsigned &WhichResultOut, unsigned &OperandOrderOut) {
   if (NumElts % 2 != 0)
     return false;
-  // Check the first non-undef element for trn1 vs trn2.
-  unsigned WhichResult = 2;
+
+  // "Result" corresponds to "WhichResultOut", selecting between trn1 and trn2.
+  // "Order" corresponds to "OperandOrderOut", selecting the order of operands
+  // for the instruction (flipped or not).
+  bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+  bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+  bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+  bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
+
   // Check all elements match.
   for (unsigned i = 0; i != NumElts; i += 2) {
     if (M[i] >= 0) {
-      WhichResult = ((unsigned)M[i] == i ? 0 : 1);
-      break;
+      unsigned EvenElt = (unsigned)M[i];
+      if (EvenElt != i)
+        Result0Order0 = false;
+      if (EvenElt != i + 1)
+        Result1Order0 = false;
+      if (EvenElt != NumElts + i)
+        Result0Order1 = false;
+      if (EvenElt != NumElts + i + 1)
+        Result1Order1 = false;
     }
     if (M[i + 1] >= 0) {
-      WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
-      break;
+      unsigned OddElt = (unsigned)M[i + 1];
+      if (OddElt != NumElts + i)
+        Result0Order0 = false;
+      if (OddElt != NumElts + i + 1)
+        Result1Order0 = false;
+      if (OddElt != i)
+        Result0Order1 = false;
+      if (OddElt != i + 1)
+        Result1Order1 = false;
     }
   }
-  if (WhichResult == 2)
+
+  if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1)
     return false;
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
-      return false;
-  }
-  WhichResultOut = WhichResult;
+
+  WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1;
+  OperandOrderOut = (Result0Order0 || Result1Order0) ? 0 : 1;
   return true;
 }
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..221a7bcd881bb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
               ShuffleVectorPseudo &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
   unsigned WhichResult;
+  unsigned OperandOrder;
   ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
   Register Dst = MI.getOperand(0).getReg();
   unsigned NumElts = MRI.getType(Dst).getNumElements();
-  if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+  if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
     return false;
   unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
-  Register V1 = MI.getOperand(1).getReg();
-  Register V2 = MI.getOperand(2).getReg();
+  Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
+  Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
   MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index fe245d01a7a6d..120c2d13a7ab7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -246,6 +246,63 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
   ret <4 x float> %tmp5
 }
 
+define <8 x i8> @vtrni8_trn1_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn1_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn1_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_trn2_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn2_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn2_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_both_flipped_with_poison_values(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32>
+  %tmp2 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32>
+  %tmp3 = add <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
+}
+
 ; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
 define <8 x i8> @vtrni8_undef(ptr %A, ptr 
%B) nounwind { diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll index 282e0503dd7be..8e75d69be5062 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec ; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16: ; CHECK-SD: // %bb.0: ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-SD-NEXT: dup v2.2s, v0.s[1] -; CHECK-SD-NEXT: mov v1.16b, v2.16b -; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h -; CHECK-SD-NEXT: mov v1.h[0], v0.h[1] +; CHECK-SD-NEXT: dup v1.2s, v0.s[1] +; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: trn2 v1.4h, v0.4h, v1.4h ; CHECK-SD-NEXT: fmov d0, d2 -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16: diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index 072f6f4e8f73e..39beffcf85783 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -36,93 +36,93 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s ; CHECK-NEXT: trn1 v6.4s, v3.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s -; CHECK-NEXT: zip1 v18.4s, v2.4s, v1.4s +; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip1 v17.4s, v2.4s, v1.4s +; CHECK-NEXT: trn2 v18.4s, v2.4s, v1.4s ; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8 -; CHECK-NEXT: mov v1.s[0], v2.s[1] -; CHECK-NEXT: ext v2.16b, v2.16b, v16.16b, #12 -; CHECK-NEXT: mov v17.d[1], v6.d[1] -; CHECK-NEXT: mov v7.d[1], v6.d[1] +; CHECK-NEXT: zip2 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: ext v2.16b, v2.16b, v7.16b, #12 +; CHECK-NEXT: mov v16.d[1], v6.d[1] +; CHECK-NEXT: mov v18.d[1], v5.d[1] ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v18.d[1], v3.d[1] -; CHECK-NEXT: mov v1.d[1], v5.d[1] +; CHECK-NEXT: mov v17.d[1], v3.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v17.4s -; CHECK-NEXT: add v3.4s, v1.4s, v18.4s -; CHECK-NEXT: sub v1.4s, v18.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v16.4s +; CHECK-NEXT: add v3.4s, v18.4s, v17.4s +; CHECK-NEXT: sub v6.4s, v17.4s, v18.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v3.4s -; CHECK-NEXT: sub v6.4s, v1.4s, v2.4s -; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v1.4s +; CHECK-NEXT: add v1.4s, v1.4s, v6.4s ; CHECK-NEXT: mov v4.d[1], v0.d[1] ; CHECK-NEXT: mov v5.d[1], v3.d[1] -; CHECK-NEXT: rev64 v2.4s, v6.4s +; CHECK-NEXT: rev64 v6.4s, v2.4s ; CHECK-NEXT: rev64 v7.4s, v1.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s ; CHECK-NEXT: add v0.4s, v0.4s, v5.4s -; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v6.4s ; CHECK-NEXT: sub v5.4s, v1.4s, v7.4s -; CHECK-NEXT: addp v4.4s, v3.4s, v6.4s +; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s ; CHECK-NEXT: addp v1.4s, v0.4s, v1.4s ; CHECK-NEXT: rev64 v6.4s, v0.4s ; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: ext v16.16b, v4.16b, v2.16b, #4 +; CHECK-NEXT: ext v16.16b, 
v2.16b, v4.16b, #4 ; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #4 ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s ; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s -; CHECK-NEXT: mov v7.16b, v2.16b -; CHECK-NEXT: zip2 v6.4s, v16.4s, v4.4s +; CHECK-NEXT: mov v7.16b, v4.16b +; CHECK-NEXT: zip2 v6.4s, v16.4s, v2.4s ; CHECK-NEXT: mov v16.16b, v5.16b ; CHECK-NEXT: zip2 v17.4s, v17.4s, v1.4s ; CHECK-NEXT: ext v18.16b, v0.16b, v1.16b, #4 -; CHECK-NEXT: mov v7.s[2], v4.s[3] +; CHECK-NEXT: mov v7.s[2], v2.s[3] ; CHECK-NEXT: mov v21.16b, v3.16b ; CHECK-NEXT: mov v16.s[2], v1.s[3] ; CHECK-NEXT: ext v5.16b, v5.16b, v17.16b, #12 ; CHECK-NEXT: zip1 v17.4s, v1.4s, v1.4s -; CHECK-NEXT: ext v2.16b, v2.16b, v6.16b, #12 +; CHECK-NEXT: ext v4.16b, v4.16b, v6.16b, #12 ; CHECK-NEXT: ext v18.16b, v18.16b, v18.16b, #4 ; CHECK-NEXT: mov v19.16b, v7.16b -; CHECK-NEXT: ext v6.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: mov v21.s[2], v4.s[1] +; CHECK-NEXT: ext v6.16b, v3.16b, v2.16b, #8 +; CHECK-NEXT: mov v21.s[2], v2.s[1] ; CHECK-NEXT: mov v20.16b, v16.16b -; CHECK-NEXT: mov v19.s[1], v4.s[2] +; CHECK-NEXT: mov v19.s[1], v2.s[2] ; CHECK-NEXT: trn2 v0.4s, v17.4s, v0.4s ; CHECK-NEXT: sub v16.4s, v16.4s, v5.4s ; CHECK-NEXT: mov v17.16b, v18.16b ; CHECK-NEXT: ext v3.16b, v6.16b, v3.16b, #4 -; CHECK-NEXT: sub v7.4s, v7.4s, v2.4s +; CHECK-NEXT: sub v7.4s, v7.4s, v4.4s ; CHECK-NEXT: mov v20.s[1], v1.s[2] ; CHECK-NEXT: mov v17.s[0], v1.s[1] ; CHECK-NEXT: mov v1.16b, v21.16b -; CHECK-NEXT: add v2.4s, v19.4s, v2.4s -; CHECK-NEXT: uzp2 v3.4s, v6.4s, v3.4s +; CHECK-NEXT: add v4.4s, v19.4s, v4.4s ; CHECK-NEXT: add v5.4s, v20.4s, v5.4s -; CHECK-NEXT: mov v1.s[1], v4.s[0] -; CHECK-NEXT: sub v4.4s, v0.4s, v18.4s -; CHECK-NEXT: mov v2.d[1], v7.d[1] +; CHECK-NEXT: mov v1.s[1], v2.s[0] +; CHECK-NEXT: uzp2 v2.4s, v6.4s, v3.4s +; CHECK-NEXT: sub v3.4s, v0.4s, v18.4s ; CHECK-NEXT: add v0.4s, v0.4s, v17.4s +; CHECK-NEXT: mov v4.d[1], v7.d[1] ; CHECK-NEXT: mov v5.d[1], v16.d[1] -; CHECK-NEXT: sub v6.4s, v21.4s, v3.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: cmlt v4.8h, v2.8h, #0 -; CHECK-NEXT: cmlt v3.8h, v5.8h, #0 +; CHECK-NEXT: sub v6.4s, v21.4s, v2.4s +; CHECK-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-NEXT: mov v0.d[1], v3.d[1] +; CHECK-NEXT: cmlt v3.8h, v4.8h, #0 +; CHECK-NEXT: cmlt v2.8h, v5.8h, #0 ; CHECK-NEXT: mov v1.d[1], v6.d[1] -; CHECK-NEXT: add v2.4s, v4.4s, v2.4s ; CHECK-NEXT: cmlt v6.8h, v0.8h, #0 -; CHECK-NEXT: add v5.4s, v3.4s, v5.4s -; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b +; CHECK-NEXT: add v4.4s, v3.4s, v4.4s +; CHECK-NEXT: add v5.4s, v2.4s, v5.4s ; CHECK-NEXT: cmlt v7.8h, v1.8h, #0 ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s -; CHECK-NEXT: eor v3.16b, v5.16b, v3.16b +; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b +; CHECK-NEXT: eor v2.16b, v5.16b, v2.16b ; CHECK-NEXT: add v1.4s, v7.4s, v1.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v6.16b -; CHECK-NEXT: add v2.4s, v3.4s, v2.4s +; CHECK-NEXT: add v2.4s, v2.4s, v3.4s ; CHECK-NEXT: eor v1.16b, v1.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s @@ -255,77 +255,76 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h ; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h ; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s -; CHECK-NEXT: trn1 v18.4s, v1.4s, v0.4s +; CHECK-NEXT: trn1 v6.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s ; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.16b, v3.16b -; CHECK-NEXT: zip1 v6.4s, v2.4s, v3.4s -; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s +; 
CHECK-NEXT: zip1 v7.4s, v2.4s, v3.4s +; CHECK-NEXT: trn2 v16.4s, v2.4s, v3.4s +; CHECK-NEXT: ext v18.16b, v3.16b, v3.16b, #12 ; CHECK-NEXT: ext v17.16b, v1.16b, v4.16b, #8 -; CHECK-NEXT: mov v7.s[0], v2.s[1] -; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: zip2 v1.4s, v3.4s, v2.4s ; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: mov v16.d[1], v4.d[1] ; CHECK-NEXT: zip2 v3.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v16.d[1], v18.d[1] -; CHECK-NEXT: mov v6.d[1], v17.d[1] -; CHECK-NEXT: mov v7.d[1], v4.d[1] -; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; CHECK-NEXT: ext v2.16b, v2.16b, v18.16b, #12 +; CHECK-NEXT: mov v7.d[1], v17.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] ; CHECK-NEXT: mov v5.d[1], v0.d[1] -; CHECK-NEXT: mov v3.d[1], v18.d[1] -; CHECK-NEXT: add v2.4s, v7.4s, v6.4s -; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: add v4.4s, v5.4s, v16.4s -; CHECK-NEXT: rev64 v5.4s, v2.4s -; CHECK-NEXT: rev64 v0.4s, v4.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v7.4s -; CHECK-NEXT: mov v5.d[1], v2.d[1] -; CHECK-NEXT: add v6.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: mov v0.d[1], v4.d[1] -; CHECK-NEXT: add v4.4s, v4.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s -; CHECK-NEXT: zip1 v2.4s, v4.4s, v6.4s -; CHECK-NEXT: uzp2 v3.4s, v4.4s, v6.4s -; CHECK-NEXT: zip2 v16.4s, v4.4s, v6.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s -; CHECK-NEXT: trn1 v7.4s, v0.4s, v1.4s -; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s -; CHECK-NEXT: trn2 v2.4s, v4.4s, v2.4s -; CHECK-NEXT: uzp2 v3.4s, v3.4s, v4.4s -; CHECK-NEXT: mov v4.s[1], v6.s[1] -; CHECK-NEXT: ext v0.16b, v0.16b, v5.16b, #8 -; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v3.d[1], v1.d[1] -; CHECK-NEXT: mov v4.d[1], v5.d[1] +; CHECK-NEXT: mov v3.d[1], v6.d[1] ; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s -; CHECK-NEXT: add v1.4s, v4.4s, v2.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: add v4.4s, v16.4s, v7.4s +; CHECK-NEXT: sub v6.4s, v7.4s, v16.4s +; CHECK-NEXT: add v1.4s, v5.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s +; CHECK-NEXT: rev64 v5.4s, v4.4s +; CHECK-NEXT: rev64 v0.4s, v1.4s +; CHECK-NEXT: add v3.4s, v2.4s, v6.4s +; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s +; CHECK-NEXT: mov v5.d[1], v4.d[1] +; CHECK-NEXT: mov v0.d[1], v1.d[1] +; CHECK-NEXT: add v1.4s, v1.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s +; CHECK-NEXT: zip1 v4.4s, v1.4s, v3.4s +; CHECK-NEXT: uzp2 v5.4s, v1.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v1.4s, v3.4s +; CHECK-NEXT: zip1 v6.4s, v0.4s, v2.4s +; CHECK-NEXT: trn1 v16.4s, v0.4s, v2.4s +; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s +; CHECK-NEXT: trn2 v4.4s, v1.4s, v4.4s +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v1.4s +; CHECK-NEXT: mov v1.s[1], v3.s[1] +; CHECK-NEXT: ext v0.16b, v0.16b, v6.16b, #8 +; CHECK-NEXT: mov v7.d[1], v16.d[1] +; CHECK-NEXT: mov v5.d[1], v2.d[1] +; CHECK-NEXT: mov v1.d[1], v6.d[1] +; CHECK-NEXT: mov v4.d[1], v0.d[1] +; CHECK-NEXT: add v0.4s, v7.4s, v5.4s +; CHECK-NEXT: sub v3.4s, v5.4s, v7.4s +; CHECK-NEXT: add v2.4s, v1.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v4.4s, v1.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 ; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s ; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 -; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s -; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 +; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s ; CHECK-NEXT: zip1 v0.4s, 
v0.4s, v3.4s -; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 -; CHECK-NEXT: ext v19.16b, v5.16b, v2.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s ; CHECK-NEXT: add v2.4s, v16.4s, v7.4s ; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 -; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s ; CHECK-NEXT: add v3.4s, v1.4s, v3.4s ; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 -; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b ; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s @@ -480,7 +479,7 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: sub v3.4s, v3.4s, v7.4s ; CHECK-NEXT: uzp2 v4.4s, v1.4s, v0.4s ; CHECK-NEXT: uzp1 v7.4s, v1.4s, v0.4s -; CHECK-NEXT: mov v6.s[3], v5.s[2] +; CHECK-NEXT: trn1 v6.4s, v6.4s, v5.4s ; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8 ; CHECK-NEXT: zip2 v17.4s, v2.4s, v3.4s ; CHECK-NEXT: zip1 v2.4s, v2.4s, v3.4s From 0895163097b66f944b0e0a59960eab1deaf36684 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 9 Dec 2025 12:43:01 -0800 Subject: [PATCH 54/63] [bazel] Port 24117f75ad9d7bbb439e8e1bd596fdcf0aa8d6e2 (#171497) This patch removed some source files that were explicitly enumerated in the bazel files. Remove them so that the build passes. --- .../clang-tools-extra/clang-doc/BUILD.bazel | 2 -- 1 file changed, 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel index 45dada5884cb4..179658cadb0e2 100644 --- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-doc/BUILD.bazel @@ -28,7 +28,6 @@ cc_library( exclude = [ "Generators.cpp", "HTMLGenerator.cpp", - "HTMLMustacheGenerator.cpp", "MDGenerator.cpp", "YAMLGenerator.cpp", ], @@ -53,7 +52,6 @@ cc_library( srcs = [ "Generators.cpp", "HTMLGenerator.cpp", - "HTMLMustacheGenerator.cpp", "MDGenerator.cpp", "YAMLGenerator.cpp", ], From 019a2947719b979a4192ad4baa96e155e240f145 Mon Sep 17 00:00:00 2001 From: Medha Tiwari <75640645+medhatiwari@users.noreply.github.com> Date: Wed, 10 Dec 2025 02:19:42 +0530 Subject: [PATCH 55/63] [CIR][X86] Implement xsave/xrstor builtins Fixes part of #167752 (#170877) Handle xsave/xrstor family of X86 builtins in ClangIR Part of #167752 --------- Signed-off-by: Medha Tiwari --- clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 71 ++++++- .../CIR/CodeGenBuiltins/X86/xsave-builtins.c | 194 ++++++++++++++++++ 2 files changed, 264 insertions(+), 1 deletion(-) create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp index 855134ba2b249..62836ce0f7537 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -544,9 +544,78 @@ mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, case X86::BI__builtin_ia32_xsaves: case X86::BI__builtin_ia32_xsaves64: case 
X86::BI__builtin_ia32_xsetbv: - case X86::BI_xsetbv: + case X86::BI_xsetbv: { + mlir::Location loc = getLoc(expr->getExprLoc()); + StringRef intrinsicName; + switch (builtinID) { + default: + llvm_unreachable("Unexpected builtin"); + case X86::BI__builtin_ia32_xsave: + intrinsicName = "x86.xsave"; + break; + case X86::BI__builtin_ia32_xsave64: + intrinsicName = "x86.xsave64"; + break; + case X86::BI__builtin_ia32_xrstor: + intrinsicName = "x86.xrstor"; + break; + case X86::BI__builtin_ia32_xrstor64: + intrinsicName = "x86.xrstor64"; + break; + case X86::BI__builtin_ia32_xsaveopt: + intrinsicName = "x86.xsaveopt"; + break; + case X86::BI__builtin_ia32_xsaveopt64: + intrinsicName = "x86.xsaveopt64"; + break; + case X86::BI__builtin_ia32_xrstors: + intrinsicName = "x86.xrstors"; + break; + case X86::BI__builtin_ia32_xrstors64: + intrinsicName = "x86.xrstors64"; + break; + case X86::BI__builtin_ia32_xsavec: + intrinsicName = "x86.xsavec"; + break; + case X86::BI__builtin_ia32_xsavec64: + intrinsicName = "x86.xsavec64"; + break; + case X86::BI__builtin_ia32_xsaves: + intrinsicName = "x86.xsaves"; + break; + case X86::BI__builtin_ia32_xsaves64: + intrinsicName = "x86.xsaves64"; + break; + case X86::BI__builtin_ia32_xsetbv: + case X86::BI_xsetbv: + intrinsicName = "x86.xsetbv"; + break; + } + + // The xsave family of instructions take a 64-bit mask that specifies + // which processor state components to save/restore. The hardware expects + // this mask split into two 32-bit registers: EDX (high 32 bits) and + // EAX (low 32 bits). + mlir::Type i32Ty = builder.getSInt32Ty(); + + // Mhi = (uint32_t)(ops[1] >> 32) - extract high 32 bits via right shift + cir::ConstantOp shift32 = builder.getSInt64(32, loc); + mlir::Value mhi = builder.createShift(loc, ops[1], shift32.getResult(), + /*isShiftLeft=*/false); + mhi = builder.createIntCast(mhi, i32Ty); + + // Mlo = (uint32_t)ops[1] - extract low 32 bits by truncation + mlir::Value mlo = builder.createIntCast(ops[1], i32Ty); + + return emitIntrinsicCallOp(builder, loc, intrinsicName, voidTy, + mlir::ValueRange{ops[0], mhi, mlo}); + } case X86::BI__builtin_ia32_xgetbv: case X86::BI_xgetbv: + // xgetbv reads the extended control register specified by ops[0] (ECX) + // and returns the 64-bit value + return emitIntrinsicCallOp(builder, getLoc(expr->getExprLoc()), + "x86.xgetbv", builder.getUInt64Ty(), ops[0]); case X86::BI__builtin_ia32_storedqudi128_mask: case X86::BI__builtin_ia32_storedqusi128_mask: case X86::BI__builtin_ia32_storedquhi128_mask: diff --git a/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c new file mode 100644 index 0000000000000..23d6edc6c8c6e --- /dev/null +++ b/clang/test/CIR/CodeGenBuiltins/X86/xsave-builtins.c @@ -0,0 +1,194 @@ +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-cir -o %t.cir -Wall -Werror +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -fclangir -emit-llvm -o %t.ll -Wall -Werror +// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s + +// RUN: %clang_cc1 -x c -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +xsave -target-feature +xsaveopt -target-feature +xsavec -target-feature +xsaves -emit-llvm -o - -Wall -Werror | FileCheck %s 
-check-prefix=OGCG + +void test_xsave(void *p, unsigned long long m) { + // CIR-LABEL: test_xsave + // CIR: [[P:%.*]] = cir.load {{.*}} : !cir.ptr>, !cir.ptr + // CIR: [[M:%.*]] = cir.load {{.*}} : !cir.ptr, !u64i + // CIR: [[CONST:%.*]] = cir.const #cir.int<32> : !s64i + // CIR: [[SHIFT:%.*]] = cir.shift(right, [[M]] : !u64i, [[CONST]] : !s64i) -> !u64i + // CIR: [[CAST1:%.*]] = cir.cast integral [[SHIFT]] : !u64i -> !s32i + // CIR: [[CAST2:%.*]] = cir.cast integral [[M]] : !u64i -> !s32i + // CIR: cir.call_llvm_intrinsic "x86.xsave" [[P]], [[CAST1]], [[CAST2]] + + // LLVM-LABEL: test_xsave + // LLVM: [[LP:%.*]] = load ptr, ptr + // LLVM: [[LM:%.*]] = load i64, ptr + // LLVM: [[LSHIFT:%.*]] = lshr i64 [[LM]], 32 + // LLVM: [[LCAST1:%.*]] = trunc i64 [[LSHIFT]] to i32 + // LLVM: [[LCAST2:%.*]] = trunc i64 [[LM]] to i32 + // LLVM: call void @llvm.x86.xsave(ptr [[LP]], i32 [[LCAST1]], i32 [[LCAST2]]) + + // OGCG-LABEL: test_xsave + // OGCG: [[OP:%.*]] = load ptr, ptr + // OGCG: [[OM:%.*]] = load i64, ptr + // OGCG: [[OSHIFT:%.*]] = lshr i64 [[OM]], 32 + // OGCG: [[OCAST1:%.*]] = trunc i64 [[OSHIFT]] to i32 + // OGCG: [[OCAST2:%.*]] = trunc i64 [[OM]] to i32 + // OGCG: call void @llvm.x86.xsave(ptr [[OP]], i32 [[OCAST1]], i32 [[OCAST2]]) + __builtin_ia32_xsave(p, m); +} + +// The following tests use the same pattern as test_xsave (load, shift, cast, cast, intrinsic call). +// Only the intrinsic name differs, so we just check the intrinsic call. + +void test_xsave64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsave64 + // CIR: cir.call_llvm_intrinsic "x86.xsave64" + + // LLVM-LABEL: test_xsave64 + // LLVM: call void @llvm.x86.xsave64 + + // OGCG-LABEL: test_xsave64 + // OGCG: call void @llvm.x86.xsave64 + __builtin_ia32_xsave64(p, m); +} + +void test_xrstor(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstor + // CIR: cir.call_llvm_intrinsic "x86.xrstor" + + // LLVM-LABEL: test_xrstor + // LLVM: call void @llvm.x86.xrstor + + // OGCG-LABEL: test_xrstor + // OGCG: call void @llvm.x86.xrstor + __builtin_ia32_xrstor(p, m); +} + +void test_xrstor64(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstor64 + // CIR: cir.call_llvm_intrinsic "x86.xrstor64" + + // LLVM-LABEL: test_xrstor64 + // LLVM: call void @llvm.x86.xrstor64 + + // OGCG-LABEL: test_xrstor64 + // OGCG: call void @llvm.x86.xrstor64 + __builtin_ia32_xrstor64(p, m); +} + +void test_xsaveopt(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaveopt + // CIR: cir.call_llvm_intrinsic "x86.xsaveopt" + + // LLVM-LABEL: test_xsaveopt + // LLVM: call void @llvm.x86.xsaveopt + + // OGCG-LABEL: test_xsaveopt + // OGCG: call void @llvm.x86.xsaveopt + __builtin_ia32_xsaveopt(p, m); +} + +void test_xsaveopt64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaveopt64 + // CIR: cir.call_llvm_intrinsic "x86.xsaveopt64" + + // LLVM-LABEL: test_xsaveopt64 + // LLVM: call void @llvm.x86.xsaveopt64 + + // OGCG-LABEL: test_xsaveopt64 + // OGCG: call void @llvm.x86.xsaveopt64 + __builtin_ia32_xsaveopt64(p, m); +} + +void test_xsavec(void *p, unsigned long long m) { + // CIR-LABEL: test_xsavec + // CIR: cir.call_llvm_intrinsic "x86.xsavec" + + // LLVM-LABEL: test_xsavec + // LLVM: call void @llvm.x86.xsavec + + // OGCG-LABEL: test_xsavec + // OGCG: call void @llvm.x86.xsavec + __builtin_ia32_xsavec(p, m); +} + +void test_xsavec64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsavec64 + // CIR: cir.call_llvm_intrinsic "x86.xsavec64" + + // LLVM-LABEL: test_xsavec64 + // LLVM: call void @llvm.x86.xsavec64 + + // 
OGCG-LABEL: test_xsavec64 + // OGCG: call void @llvm.x86.xsavec64 + __builtin_ia32_xsavec64(p, m); +} + +void test_xsaves(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaves + // CIR: cir.call_llvm_intrinsic "x86.xsaves" + + // LLVM-LABEL: test_xsaves + // LLVM: call void @llvm.x86.xsaves + + // OGCG-LABEL: test_xsaves + // OGCG: call void @llvm.x86.xsaves + __builtin_ia32_xsaves(p, m); +} + +void test_xsaves64(void *p, unsigned long long m) { + // CIR-LABEL: test_xsaves64 + // CIR: cir.call_llvm_intrinsic "x86.xsaves64" + + // LLVM-LABEL: test_xsaves64 + // LLVM: call void @llvm.x86.xsaves64 + + // OGCG-LABEL: test_xsaves64 + // OGCG: call void @llvm.x86.xsaves64 + __builtin_ia32_xsaves64(p, m); +} + +void test_xrstors(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstors + // CIR: cir.call_llvm_intrinsic "x86.xrstors" + + // LLVM-LABEL: test_xrstors + // LLVM: call void @llvm.x86.xrstors + + // OGCG-LABEL: test_xrstors + // OGCG: call void @llvm.x86.xrstors + __builtin_ia32_xrstors(p, m); +} + +void test_xrstors64(void *p, unsigned long long m) { + // CIR-LABEL: test_xrstors64 + // CIR: cir.call_llvm_intrinsic "x86.xrstors64" + + // LLVM-LABEL: test_xrstors64 + // LLVM: call void @llvm.x86.xrstors64 + + // OGCG-LABEL: test_xrstors64 + // OGCG: call void @llvm.x86.xrstors64 + __builtin_ia32_xrstors64(p, m); +} + +unsigned long long test_xgetbv(unsigned int a) { + // CIR-LABEL: test_xgetbv + // CIR: cir.call_llvm_intrinsic "x86.xgetbv" + + // LLVM-LABEL: test_xgetbv + // LLVM: call i64 @llvm.x86.xgetbv + + // OGCG-LABEL: test_xgetbv + // OGCG: call i64 @llvm.x86.xgetbv + return __builtin_ia32_xgetbv(a); +} + +void test_xsetbv(unsigned int a, unsigned long long m) { + // CIR-LABEL: test_xsetbv + // CIR: cir.call_llvm_intrinsic "x86.xsetbv" + + // LLVM-LABEL: test_xsetbv + // LLVM: call void @llvm.x86.xsetbv + + // OGCG-LABEL: test_xsetbv + // OGCG: call void @llvm.x86.xsetbv + __builtin_ia32_xsetbv(a, m); +} + From 87bf5ee23863bc0b467ee44b2184b2c134a98464 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 9 Dec 2025 12:55:47 -0800 Subject: [PATCH 56/63] [CIR] Add basic support for data member pointers (#170939) This adds the minimum support for C++ data member pointer variables. 
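
For reference, a minimal sketch of the C++ construct this enables and the CIR constants it is expected to produce (the record and member names below are illustrative, not taken from the patch's tests):

```
struct Point {
  int x;
  int y;
};

// Expected to lower to a cir.const of #cir.data_member<1> (the field index of y).
int Point::*field = &Point::y;

// A null pointer-to-data-member is modelled as #cir.data_member<null>.
int Point::*none = nullptr;
```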
---
 clang/include/clang/CIR/Dialect/IR/CIRAttrs.h |  1 +
 .../include/clang/CIR/Dialect/IR/CIRAttrs.td  | 51 +++++++++++
 .../CIR/Dialect/IR/CIRTypeConstraints.td      | 10 ++-
 .../include/clang/CIR/Dialect/IR/CIRTypes.td  | 32 ++++++-
 clang/include/clang/CIR/MissingFeatures.h     |  5 ++
 clang/lib/CIR/CodeGen/CIRGenBuilder.h         |  5 ++
 clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp    |  7 +-
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 20 +++++
 clang/lib/CIR/CodeGen/CIRGenModule.h          |  2 +
 clang/lib/CIR/CodeGen/CIRGenTypes.cpp         | 15 ++++
 clang/lib/CIR/Dialect/IR/CIRAttrs.cpp         | 32 +++++++
 clang/lib/CIR/Dialect/IR/CIRDialect.cpp       |  6 ++
 clang/lib/CIR/Dialect/IR/CIRTypes.cpp         | 20 +++++
 .../lib/CIR/Dialect/Transforms/CMakeLists.txt |  3 +
 .../Transforms/TargetLowering/CIRCXXABI.cpp   | 20 +++++
 .../Transforms/TargetLowering/CIRCXXABI.h     | 55 ++++++++++++
 .../Transforms/TargetLowering/CMakeLists.txt  | 20 +++++
 .../TargetLowering/LowerItaniumCXXABI.cpp     | 90 +++++++++++++++++++
 .../Transforms/TargetLowering/LowerModule.cpp | 87 ++++++++++++++++++
 .../Transforms/TargetLowering/LowerModule.h   | 55 ++++++++++++
 .../CIR/Lowering/DirectToLLVM/CMakeLists.txt  |  5 ++
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 34 ++++++-
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  2 +
 .../CIR/CodeGen/pointer-to-data-member.cpp    | 32 +++++++
 clang/test/CIR/IR/invalid-data-member.cir     | 27 ++++++
 clang/utils/TableGen/CIRLoweringEmitter.cpp   |  7 +-
 26 files changed, 631 insertions(+), 12 deletions(-)
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h
 create mode 100644 clang/test/CIR/CodeGen/pointer-to-data-member.cpp
 create mode 100644 clang/test/CIR/IR/invalid-data-member.cir

diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
index 03a6a97dc8c2e..858d4d6350bed 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.h
@@ -35,6 +35,7 @@ namespace cir {
 class ArrayType;
 class BoolType;
 class ComplexType;
+class DataMemberType;
 class IntType;
 class MethodType;
 class PointerType;
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
index 98d4636dafc29..c0279a0b20670 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRAttrs.td
@@ -447,6 +447,57 @@ def CIR_ConstPtrAttr : CIR_Attr<"ConstPtr", "ptr", [TypedAttrInterface]> {
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberAttr
+//===----------------------------------------------------------------------===//
+
+def CIR_DataMemberAttr : CIR_Attr<"DataMember", "data_member", [
+  TypedAttrInterface
+]> {
+  let summary = "Holds a constant data member pointer value";
+  let parameters = (ins AttributeSelfTypeParameter<
+                            "", "cir::DataMemberType">:$type,
+                        OptionalParameter<
+                            "std::optional<unsigned>">:$member_index);
+  let description = [{
+    A data member attribute is a literal attribute that represents a constant
+    pointer-to-data-member value. 
+
+    The `member_index` parameter represents the index of the pointed-to member
+    within its containing record. It is an optional parameter; lack of this
+    parameter indicates a null pointer-to-data-member value.
+
+    Example:
+    ```
+    #ptr = #cir.data_member<1> : !cir.data_member<!s32i in !rec_Point>
+
+    #null = #cir.data_member<null> : !cir.data_member<!s32i in !rec_Point>
+    ```
+  }];
+
+  let builders = [
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type), [{
+      return $_get(type.getContext(), type, std::nullopt);
+    }]>,
+    AttrBuilderWithInferredContext<(ins "cir::DataMemberType":$type,
+                                        "unsigned":$member_index), [{
+      return $_get(type.getContext(), type, member_index);
+    }]>,
+  ];
+
+  let genVerifyDecl = 1;
+
+  let assemblyFormat = [{
+    `<` ($member_index^):(`null`)? `>`
+  }];
+
+  let extraClassDeclaration = [{
+    bool isNullPtr() const {
+      return !getMemberIndex().has_value();
+    }
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // GlobalViewAttr
 //===----------------------------------------------------------------------===//
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
index ddca98eac93ab..89762249ed0c4 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypeConstraints.td
@@ -309,6 +309,13 @@ def CIR_AnyFloatOrVecOfFloatType
   let cppFunctionName = "isFPOrVectorOfFPType";
 }
 
+//===----------------------------------------------------------------------===//
+// Data member type predicates
+//===----------------------------------------------------------------------===//
+
+def CIR_AnyDataMemberType : CIR_TypeBase<"::cir::DataMemberType",
+    "data member type">;
+
 //===----------------------------------------------------------------------===//
 // VPtr type predicates
 //===----------------------------------------------------------------------===//
@@ -322,7 +329,8 @@ def CIR_PtrToVPtr : CIR_PtrToType<CIR_AnyVPtrType>;
//===----------------------------------------------------------------------===//
 
 defvar CIR_ScalarTypes = [
-  CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType
+  CIR_AnyBoolType, CIR_AnyIntType, CIR_AnyFloatType, CIR_AnyPtrType,
+  CIR_AnyDataMemberType, CIR_AnyVPtrType
 ];
 
 def CIR_AnyScalarType : AnyTypeOf<CIR_ScalarTypes, "cir scalar type"> {
diff --git a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
index 3e062add6633a..59b97f0c6d39a 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIRTypes.td
@@ -305,6 +305,36 @@ def CIR_PointerType : CIR_Type<"Pointer", "ptr", [
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// CIR_DataMemberType
+//===----------------------------------------------------------------------===//
+
+def CIR_DataMemberType : CIR_Type<"DataMember", "data_member",
+    [DeclareTypeInterfaceMethods<DataLayoutTypeInterface>]
+> {
+  let summary = "CIR type that represents a pointer-to-data-member in C++";
+  let description = [{
+    `cir.data_member` models a pointer-to-data-member in C++. Values of this
+    type are essentially offsets of the pointed-to member within one of its
+    containing records. 
+  }];
+
+  let parameters = (ins "mlir::Type":$member_ty,
+                        "cir::RecordType":$class_ty);
+
+  let builders = [
+    TypeBuilderWithInferredContext<(ins
+      "mlir::Type":$member_ty, "cir::RecordType":$class_ty
+    ), [{
+      return $_get(member_ty.getContext(), member_ty, class_ty);
+    }]>,
+  ];
+
+  let assemblyFormat = [{
+    `<` $member_ty `in` $class_ty `>`
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // CIR_VPtrType
 //===----------------------------------------------------------------------===//
@@ -693,7 +723,7 @@ def CIRRecordType : Type<
 def CIR_AnyType : AnyTypeOf<[
   CIR_VoidType, CIR_BoolType, CIR_ArrayType, CIR_VectorType, CIR_IntType,
   CIR_AnyFloatType, CIR_PointerType, CIR_FuncType, CIR_RecordType,
-  CIR_ComplexType, CIR_VPtrType
+  CIR_ComplexType, CIR_VPtrType, CIR_DataMemberType
 ]>;
 
 #endif // CLANG_CIR_DIALECT_IR_CIRTYPES_TD
diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 826a4b13f5c0c..b2d94709016fa 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -189,6 +189,10 @@ struct MissingFeatures {
   static bool globalCtorLexOrder() { return false; }
   static bool globalCtorAssociatedData() { return false; }
 
+  // LowerModule handling
+  static bool lowerModuleCodeGenOpts() { return false; }
+  static bool lowerModuleLangOpts() { return false; }
+
   // Misc
   static bool aarch64SIMDIntrinsics() { return false; }
   static bool aarch64SMEIntrinsics() { return false; }
@@ -292,6 +296,7 @@ struct MissingFeatures {
   static bool lowerModeOptLevel() { return false; }
   static bool loweringPrepareX86CXXABI() { return false; }
   static bool loweringPrepareAArch64XXABI() { return false; }
+  static bool makeTripleAlwaysPresent() { return false; }
   static bool maybeHandleStaticInExternC() { return false; }
   static bool mergeAllConstants() { return false; }
   static bool metaDataNode() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
index 85b38120169fd..bf13eeeaea60a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h
+++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h
@@ -189,6 +189,11 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy {
     return getType<cir::RecordType>(nameAttr, kind);
   }
 
+  cir::DataMemberAttr getDataMemberAttr(cir::DataMemberType ty,
+                                        unsigned memberIndex) {
+    return cir::DataMemberAttr::get(ty, memberIndex);
+  }
+
   // Return true if the value is a null constant such as null pointer, (+0.0)
   // for floating-point or zero initializer
   bool isNullValue(mlir::Attribute attr) const {
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
index 6820e2a403288..25ce1ba26da09 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp
@@ -731,11 +731,8 @@ class ScalarExprEmitter : public StmtVisitor<ScalarExprEmitter, mlir::Value> {
   }
 
   mlir::Value VisitUnaryAddrOf(const UnaryOperator *e) {
-    if (llvm::isa<MemberPointerType>(e->getType())) {
-      cgf.cgm.errorNYI(e->getSourceRange(), "Address of member pointer");
-      return builder.getNullPtr(cgf.convertType(e->getType()),
-                                cgf.getLoc(e->getExprLoc()));
-    }
+    if (llvm::isa<MemberPointerType>(e->getType()))
+      return cgf.cgm.emitMemberPointerConstant(e);
 
     return cgf.emitLValue(e->getSubExpr()).getPointer();
   }
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index e1894c040dd53..41a5d9db83e2b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -1464,6 +1464,26 @@ void CIRGenModule::emitExplicitCastExprType(const ExplicitCastExpr *e,
                                             "emitExplicitCastExprType");
 }
 
+mlir::Value CIRGenModule::emitMemberPointerConstant(const UnaryOperator *e) {
+  assert(!cir::MissingFeatures::cxxABI());
+
+  mlir::Location loc = getLoc(e->getSourceRange());
+
+  const auto *decl = cast<DeclRefExpr>(e->getSubExpr())->getDecl();
+
+  // A member function pointer.
+  if (isa<CXXMethodDecl>(decl)) {
+    errorNYI(e->getSourceRange(), "emitMemberPointerConstant: method pointer");
+    return {};
+  }
+
+  // Otherwise, a member data pointer.
+  auto ty = mlir::cast<cir::DataMemberType>(convertType(e->getType()));
+  const auto *fieldDecl = cast<FieldDecl>(decl);
+  return cir::ConstantOp::create(
+      builder, loc, builder.getDataMemberAttr(ty, fieldDecl->getFieldIndex()));
+}
+
 void CIRGenModule::emitDeclContext(const DeclContext *dc) {
   for (Decl *decl : dc->decls()) {
     // Unlike other DeclContexts, the contents of an ObjCImplDecl at TU scope
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 59eb5f8938129..9c0961579718d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -497,6 +497,8 @@ class CIRGenModule : public CIRGenTypeCache {
   /// the given type. This is usually, but not always, an LLVM null constant.
   mlir::TypedAttr emitNullConstantForBase(const CXXRecordDecl *record);
 
+  mlir::Value emitMemberPointerConstant(const UnaryOperator *e);
+
   llvm::StringRef getMangledName(clang::GlobalDecl gd);
 
   void emitTentativeDefinition(const VarDecl *d);
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
index 24b106b4bcee7..7f000ece8a494 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.cpp
@@ -482,6 +482,21 @@ mlir::Type CIRGenTypes::convertType(QualType type) {
     break;
   }
 
+  case Type::MemberPointer: {
+    const auto *mpt = cast<MemberPointerType>(ty);
+
+    mlir::Type memberTy = convertType(mpt->getPointeeType());
+    auto clsTy = mlir::cast<cir::RecordType>(
+        convertType(QualType(mpt->getQualifier().getAsType(), 0)));
+    if (mpt->isMemberDataPointer()) {
+      resultType = cir::DataMemberType::get(memberTy, clsTy);
+    } else {
+      assert(!cir::MissingFeatures::methodType());
+      cgm.errorNYI(SourceLocation(), "MethodType");
+    }
+    break;
+  }
+
   case Type::FunctionNoProto:
   case Type::FunctionProto:
     resultType = convertFunctionTypeInternal(type);
diff --git a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
index ee296f171e0d9..59d7765198f9e 100644
--- a/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRAttrs.cpp
@@ -269,6 +269,38 @@ ConstComplexAttr::verify(function_ref<InFlightDiagnostic()> emitError,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberAttr definitions
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+DataMemberAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                       cir::DataMemberType ty,
+                       std::optional<unsigned> memberIndex) {
+  // DataMemberAttr without a given index represents a null value.
+  if (!memberIndex.has_value())
+    return success();
+
+  cir::RecordType recTy = ty.getClassTy();
+  if (recTy.isIncomplete())
+    return emitError()
+           << "incomplete 'cir.record' cannot be used to build a non-null "
+              "data member pointer";
+
+  unsigned memberIndexValue = memberIndex.value();
+  if (memberIndexValue >= recTy.getNumElements())
+    return emitError()
+           << "member index of a #cir.data_member attribute is out of range";
+
+  mlir::Type memberTy = recTy.getMembers()[memberIndexValue];
+  if (memberTy != ty.getMemberTy())
+    return emitError()
+           << "member type of a #cir.data_member attribute must match the "
+              "attribute type";
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // CIR ConstArrayAttr
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
index ec8cae62d6bc8..38a2cecbb8617 100644
--- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp
@@ -357,6 +357,12 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType,
     return success();
   }
 
+  if (isa<cir::DataMemberAttr>(attrType)) {
+    // More detailed type verifications are already done in
+    // DataMemberAttr::verify. Don't need to repeat here.
+    return success();
+  }
+
   if (isa<cir::ZeroAttr>(attrType)) {
     if (isa<cir::RecordType, cir::ArrayType, cir::VectorType, cir::ComplexType>(
             opType))
diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
index bb87056048ec5..9a37a4f4e3996 100644
--- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
+++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp
@@ -750,6 +750,26 @@ BoolType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
   return 1;
 }
 
+//===----------------------------------------------------------------------===//
+// DataMemberType Definitions
+//===----------------------------------------------------------------------===//
+
+llvm::TypeSize
+DataMemberType::getTypeSizeInBits(const ::mlir::DataLayout &dataLayout,
+                                  ::mlir::DataLayoutEntryListRef params) const {
+  // FIXME: consider size differences under different ABIs
+  assert(!MissingFeatures::cxxABI());
+  return llvm::TypeSize::getFixed(64);
+}
+
+uint64_t
+DataMemberType::getABIAlignment(const ::mlir::DataLayout &dataLayout,
+                                ::mlir::DataLayoutEntryListRef params) const {
+  // FIXME: consider alignment differences under different ABIs
+  assert(!MissingFeatures::cxxABI());
+  return 8;
+}
+
 //===----------------------------------------------------------------------===//
 // VPtrType Definitions
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
index 3fc5b06b74e4d..e3b7106c1d6b9 100644
--- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(TargetLowering)
+
 add_clang_library(MLIRCIRTransforms
   CIRCanonicalize.cpp
   CIRSimplify.cpp
@@ -21,4 +23,5 @@ add_clang_library(MLIRCIRTransforms
 
   MLIRCIR
   MLIRCIRInterfaces
+  MLIRCIRTargetLowering
 )
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
new file mode 100644
index 0000000000000..86cf7ebdc8f50
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp
@@ -0,0 +1,20 @@
+
+ if (!memberIndex.has_value()) + return success(); + + cir::RecordType recTy = ty.getClassTy(); + if (recTy.isIncomplete()) + return emitError() + << "incomplete 'cir.record' cannot be used to build a non-null " + "data member pointer"; + + unsigned memberIndexValue = memberIndex.value(); + if (memberIndexValue >= recTy.getNumElements()) + return emitError() + << "member index of a #cir.data_member attribute is out of range"; + + mlir::Type memberTy = recTy.getMembers()[memberIndexValue]; + if (memberTy != ty.getMemberTy()) + return emitError() + << "member type of a #cir.data_member attribute must match the " + "attribute type"; + + return success(); +} + //===----------------------------------------------------------------------===// // CIR ConstArrayAttr //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index ec8cae62d6bc8..38a2cecbb8617 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -357,6 +357,12 @@ static LogicalResult checkConstantTypes(mlir::Operation *op, mlir::Type opType, return success(); } + if (isa(attrType)) { + // More detailed type verifications are already done in + // DataMemberAttr::verify. Don't need to repeat here. + return success(); + } + if (isa(attrType)) { if (isa( opType)) diff --git a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp index bb87056048ec5..9a37a4f4e3996 100644 --- a/clang/lib/CIR/Dialect/IR/CIRTypes.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRTypes.cpp @@ -750,6 +750,26 @@ BoolType::getABIAlignment(const ::mlir::DataLayout &dataLayout, return 1; } +//===----------------------------------------------------------------------===// +// DataMemberType Definitions +//===----------------------------------------------------------------------===// + +llvm::TypeSize +DataMemberType::getTypeSizeInBits(const ::mlir::DataLayout &dataLayout, + ::mlir::DataLayoutEntryListRef params) const { + // FIXME: consider size differences under different ABIs + assert(!MissingFeatures::cxxABI()); + return llvm::TypeSize::getFixed(64); +} + +uint64_t +DataMemberType::getABIAlignment(const ::mlir::DataLayout &dataLayout, + ::mlir::DataLayoutEntryListRef params) const { + // FIXME: consider alignment differences under different ABIs + assert(!MissingFeatures::cxxABI()); + return 8; +} + //===----------------------------------------------------------------------===// // VPtrType Definitions //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt index 3fc5b06b74e4d..e3b7106c1d6b9 100644 --- a/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt +++ b/clang/lib/CIR/Dialect/Transforms/CMakeLists.txt @@ -1,3 +1,5 @@ +add_subdirectory(TargetLowering) + add_clang_library(MLIRCIRTransforms CIRCanonicalize.cpp CIRSimplify.cpp @@ -21,4 +23,5 @@ add_clang_library(MLIRCIRTransforms MLIRCIR MLIRCIRInterfaces + MLIRCIRTargetLowering ) diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp new file mode 100644 index 0000000000000..86cf7ebdc8f50 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.cpp @@ -0,0 +1,20 @@ +//===- CIRCXXABI.cpp ------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CGCXXABI.cpp. The queries are +// adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "CIRCXXABI.h" + +namespace cir { + +CIRCXXABI::~CIRCXXABI() {} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h new file mode 100644 index 0000000000000..003cd78eb3f26 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h @@ -0,0 +1,55 @@ +//===----- CIRCXXABI.h - Interface to C++ ABIs for CIR Dialect --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics the CodeGen/CGCXXABI.h class. The main difference +// is that this is adapted to operate on the CIR dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H +#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H + +#include "mlir/Transforms/DialectConversion.h" +#include "clang/CIR/Dialect/IR/CIRTypes.h" + +namespace cir { + +// Forward declarations. +class LowerModule; + +class CIRCXXABI { + friend class LowerModule; + +protected: + LowerModule &lm; + + CIRCXXABI(LowerModule &lm) : lm(lm) {} + +public: + virtual ~CIRCXXABI(); + + /// Lower the given data member pointer type to its ABI type. The returned + /// type is also a CIR type. + virtual mlir::Type + lowerDataMemberType(cir::DataMemberType type, + const mlir::TypeConverter &typeConverter) const = 0; + + /// Lower the given data member pointer constant to a constant of the ABI + /// type. The returned constant is represented as an attribute as well. + virtual mlir::TypedAttr + lowerDataMemberConstant(cir::DataMemberAttr attr, + const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const = 0; +}; + +/// Creates an Itanium-family ABI. 
+std::unique_ptr createItaniumCXXABI(LowerModule &lm); + +} // namespace cir + +#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt new file mode 100644 index 0000000000000..158c42e729536 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt @@ -0,0 +1,20 @@ +add_clang_library(MLIRCIRTargetLowering + CIRCXXABI.cpp + LowerModule.cpp + LowerItaniumCXXABI.cpp + + DEPENDS + clangBasic + + LINK_COMPONENTS + TargetParser + + LINK_LIBS PUBLIC + + clangBasic + MLIRIR + MLIRPass + MLIRDLTIDialect + MLIRCIR + MLIRCIRInterfaces +) diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp new file mode 100644 index 0000000000000..7089990343dc0 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerItaniumCXXABI.cpp @@ -0,0 +1,90 @@ +//===---- LowerItaniumCXXABI.cpp - Emit CIR code Itanium-specific code ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This provides CIR lowering logic targeting the Itanium C++ ABI. The class in +// this file generates records that follow the Itanium C++ ABI, which is +// documented at: +// https://itanium-cxx-abi.github.io/cxx-abi/abi.html +// https://itanium-cxx-abi.github.io/cxx-abi/abi-eh.html +// +// It also supports the closely-related ARM ABI, documented at: +// https://developer.arm.com/documentation/ihi0041/g/ +// +// This file partially mimics clang/lib/CodeGen/ItaniumCXXABI.cpp. The queries +// are adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "CIRCXXABI.h" +#include "LowerModule.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "llvm/Support/ErrorHandling.h" + +namespace cir { + +namespace { + +class LowerItaniumCXXABI : public CIRCXXABI { +public: + LowerItaniumCXXABI(LowerModule &lm) : CIRCXXABI(lm) {} + + /// Lower the given data member pointer type to its ABI type. The returned + /// type is also a CIR type. + virtual mlir::Type + lowerDataMemberType(cir::DataMemberType type, + const mlir::TypeConverter &typeConverter) const override; + + mlir::TypedAttr lowerDataMemberConstant( + cir::DataMemberAttr attr, const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const override; +}; + +} // namespace + +std::unique_ptr createItaniumCXXABI(LowerModule &lm) { + return std::make_unique(lm); +} + +static cir::IntType getPtrDiffCIRTy(LowerModule &lm) { + const clang::TargetInfo &target = lm.getTarget(); + clang::TargetInfo::IntType ptrdiffTy = + target.getPtrDiffType(clang::LangAS::Default); + return cir::IntType::get(lm.getMLIRContext(), target.getTypeWidth(ptrdiffTy), + target.isTypeSigned(ptrdiffTy)); +} + +mlir::Type LowerItaniumCXXABI::lowerDataMemberType( + cir::DataMemberType type, const mlir::TypeConverter &typeConverter) const { + // Itanium C++ ABI 2.3.1: + // A data member pointer is represented as the data member's offset in bytes + // from the address point of an object of the base type, as a ptrdiff_t. 
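+  // For example, on the 64-bit targets exercised by the tests in this patch,
+  // ptrdiff_t is a signed 64-bit integer, so a data member pointer lowers to
+  // a signed 64-bit cir.int and ultimately to LLVM's i64.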
+ return getPtrDiffCIRTy(lm); +} + +mlir::TypedAttr LowerItaniumCXXABI::lowerDataMemberConstant( + cir::DataMemberAttr attr, const mlir::DataLayout &layout, + const mlir::TypeConverter &typeConverter) const { + uint64_t memberOffset; + if (attr.isNullPtr()) { + // Itanium C++ ABI 2.3: + // A NULL pointer is represented as -1. + memberOffset = -1ull; + } else { + // Itanium C++ ABI 2.3: + // A pointer to data member is an offset from the base address of + // the class object containing it, represented as a ptrdiff_t + unsigned memberIndex = attr.getMemberIndex().value(); + memberOffset = + attr.getType().getClassTy().getElementOffset(layout, memberIndex); + } + + mlir::Type abiTy = lowerDataMemberType(attr.getType(), typeConverter); + return cir::IntAttr::get(abiTy, memberOffset); +} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp new file mode 100644 index 0000000000000..7576e20ac8f54 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.cpp @@ -0,0 +1,87 @@ +//===--- LowerModule.cpp - Lower CIR Module to a Target -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CodeGenModule.cpp. The queries +// are adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#include "LowerModule.h" +#include "CIRCXXABI.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/PatternMatch.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/Basic/TargetOptions.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/Support/ErrorHandling.h" + +namespace cir { + +static std::unique_ptr createCXXABI(LowerModule &lm) { + switch (lm.getCXXABIKind()) { + case clang::TargetCXXABI::AppleARM64: + case clang::TargetCXXABI::Fuchsia: + case clang::TargetCXXABI::GenericAArch64: + case clang::TargetCXXABI::GenericARM: + case clang::TargetCXXABI::iOS: + case clang::TargetCXXABI::WatchOS: + case clang::TargetCXXABI::GenericMIPS: + case clang::TargetCXXABI::GenericItanium: + case clang::TargetCXXABI::WebAssembly: + case clang::TargetCXXABI::XL: + return createItaniumCXXABI(lm); + case clang::TargetCXXABI::Microsoft: + llvm_unreachable("Windows ABI NYI"); + } + + llvm_unreachable("invalid C++ ABI kind"); +} + +LowerModule::LowerModule(clang::LangOptions langOpts, + clang::CodeGenOptions codeGenOpts, + mlir::ModuleOp &module, + std::unique_ptr target, + mlir::PatternRewriter &rewriter) + : module(module), target(std::move(target)), abi(createCXXABI(*this)), + rewriter(rewriter) {} + +// TODO: not to create it every time +std::unique_ptr +createLowerModule(mlir::ModuleOp module, mlir::PatternRewriter &rewriter) { + // Fetch target information. + llvm::Triple triple(mlir::cast( + module->getAttr(cir::CIRDialect::getTripleAttrName())) + .getValue()); + clang::TargetOptions targetOptions; + targetOptions.Triple = triple.str(); + auto targetInfo = clang::targets::AllocateTarget(triple, targetOptions); + + // FIXME(cir): This just uses the default language options. We need to account + // for custom options. + // Create context. 
+ assert(!cir::MissingFeatures::lowerModuleLangOpts()); + clang::LangOptions langOpts; + + // FIXME(cir): This just uses the default code generation options. We need to + // account for custom options. + assert(!cir::MissingFeatures::lowerModuleCodeGenOpts()); + clang::CodeGenOptions codeGenOpts; + + if (auto optInfo = mlir::cast_if_present( + module->getAttr(cir::CIRDialect::getOptInfoAttrName()))) { + codeGenOpts.OptimizationLevel = optInfo.getLevel(); + codeGenOpts.OptimizeSize = optInfo.getSize(); + } + + return std::make_unique(std::move(langOpts), + std::move(codeGenOpts), module, + std::move(targetInfo), rewriter); +} + +} // namespace cir diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h new file mode 100644 index 0000000000000..440e307f571e9 --- /dev/null +++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/LowerModule.h @@ -0,0 +1,55 @@ +//===--- LowerModule.h - Abstracts CIR's module lowering --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file partially mimics clang/lib/CodeGen/CodeGenModule.h. The queries are +// adapted to operate on the CIR dialect, however. +// +//===----------------------------------------------------------------------===// + +#ifndef CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H +#define CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H + +#include "CIRCXXABI.h" +#include "mlir/IR/BuiltinOps.h" +#include "clang/Basic/CodeGenOptions.h" +#include "clang/Basic/LangOptions.h" +#include "clang/Basic/TargetInfo.h" +#include "clang/CIR/Dialect/IR/CIRDialect.h" +#include "clang/CIR/MissingFeatures.h" +#include + +namespace cir { + +class LowerModule { + mlir::ModuleOp module; + const std::unique_ptr target; + std::unique_ptr abi; + [[maybe_unused]] mlir::PatternRewriter &rewriter; + +public: + LowerModule(clang::LangOptions langOpts, clang::CodeGenOptions codeGenOpts, + mlir::ModuleOp &module, std::unique_ptr target, + mlir::PatternRewriter &rewriter); + ~LowerModule() = default; + + clang::TargetCXXABI::Kind getCXXABIKind() const { + assert(!cir::MissingFeatures::lowerModuleLangOpts()); + return target->getCXXABI().getKind(); + } + + CIRCXXABI &getCXXABI() const { return *abi; } + const clang::TargetInfo &getTarget() const { return *target; } + mlir::MLIRContext *getMLIRContext() { return module.getContext(); } +}; + +std::unique_ptr createLowerModule(mlir::ModuleOp module, + mlir::PatternRewriter &rewriter); + +} // namespace cir + +#endif // CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_LOWERMODULE_H diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt index 7baff3412a84e..2525e02ae8f85 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt +++ b/clang/lib/CIR/Lowering/DirectToLLVM/CMakeLists.txt @@ -18,7 +18,12 @@ add_clang_library(clangCIRLoweringDirectToLLVM clangCIRLoweringCommon ${dialect_libs} MLIRCIR + MLIRCIRTargetLowering MLIRBuiltinToLLVMIRTranslation MLIRLLVMToLLVMIRTranslation MLIRIR ) + +target_include_directories(clangCIRLoweringDirectToLLVM PRIVATE + ${CLANG_SOURCE_DIR}/lib/CIR/Dialect/Transforms/TargetLowering + ) diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp 
b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index cc911cfc7d778..88ca8033b48ea 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1755,6 +1755,14 @@ mlir::LogicalResult CIRToLLVMConstantOpLowering::matchAndRewrite( return mlir::success(); } attr = op.getValue(); + } else if (mlir::isa(op.getType())) { + assert(lowerMod && "lower module is not available"); + auto dataMember = mlir::cast(op.getValue()); + mlir::DataLayout layout(op->getParentOfType()); + mlir::TypedAttr abiValue = lowerMod->getCXXABI().lowerDataMemberConstant( + dataMember, layout, *typeConverter); + rewriter.replaceOpWithNewOp(op, abiValue); + return mlir::success(); } else if (const auto arrTy = mlir::dyn_cast(op.getType())) { const auto constArr = mlir::dyn_cast(op.getValue()); if (!constArr && !isa(op.getValue())) @@ -2839,8 +2847,20 @@ mlir::LogicalResult CIRToLLVMSelectOpLowering::matchAndRewrite( return mlir::success(); } +std::unique_ptr prepareLowerModule(mlir::ModuleOp module) { + mlir::PatternRewriter rewriter{module->getContext()}; + // If the triple is not present, e.g. CIR modules parsed from text, we + // cannot init LowerModule properly. This happens in some lowering tests, + // but it should not happen in real compilation. + assert(!cir::MissingFeatures::makeTripleAlwaysPresent()); + if (!module->hasAttr(cir::CIRDialect::getTripleAttrName())) + return {}; + return cir::createLowerModule(module, rewriter); +} + static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, - mlir::DataLayout &dataLayout) { + mlir::DataLayout &dataLayout, + cir::LowerModule *lowerModule) { converter.addConversion([&](cir::PointerType type) -> mlir::Type { unsigned addrSpace = type.getAddrSpace() ? 
type.getAddrSpace().getValue().getUInt() : 0; @@ -2850,6 +2870,13 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, assert(!cir::MissingFeatures::addressSpace()); return mlir::LLVM::LLVMPointerType::get(type.getContext()); }); + converter.addConversion( + [&, lowerModule](cir::DataMemberType type) -> mlir::Type { + assert(lowerModule && "CXXABI is not available"); + mlir::Type abiType = + lowerModule->getCXXABI().lowerDataMemberType(type, converter); + return converter.convertType(abiType); + }); converter.addConversion([&](cir::ArrayType type) -> mlir::Type { mlir::Type ty = convertTypeForMemory(converter, dataLayout, type.getElementType()); @@ -3118,7 +3145,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { mlir::ModuleOp module = getOperation(); mlir::DataLayout dl(module); mlir::LLVMTypeConverter converter(&getContext()); - prepareTypeConverter(converter, dl); + std::unique_ptr lowerModule = prepareLowerModule(module); + prepareTypeConverter(converter, dl, lowerModule.get()); mlir::RewritePatternSet patterns(&getContext()); @@ -3126,7 +3154,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { #define GET_LLVM_LOWERING_PATTERNS_LIST #include "clang/CIR/Dialect/IR/CIRLowering.inc" #undef GET_LLVM_LOWERING_PATTERNS_LIST - >(converter, patterns.getContext(), dl); + >(converter, patterns.getContext(), lowerModule.get(), dl); processCIRAttrs(module); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 0591de545b81d..d32f8603ee0be 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -12,6 +12,8 @@ #ifndef CLANG_CIR_LOWERTOLLVM_H #define CLANG_CIR_LOWERTOLLVM_H +#include "LowerModule.h" + #include "mlir/Dialect/LLVMIR/LLVMAttrs.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Transforms/DialectConversion.h" diff --git a/clang/test/CIR/CodeGen/pointer-to-data-member.cpp b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp new file mode 100644 index 0000000000000..b116d21f01170 --- /dev/null +++ b/clang/test/CIR/CodeGen/pointer-to-data-member.cpp @@ -0,0 +1,32 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-cir %s -o %t.cir +// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -fclangir -Wno-unused-value -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t-cir.ll %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++17 -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s + +struct Point { + int x; + int y; + int z; +}; + +auto test1() -> int Point::* { + return &Point::y; +} + +// CIR: cir.func {{.*}} @_Z5test1v() -> !cir.data_member { +// CIR: %[[RETVAL:.*]] = cir.alloca !cir.data_member, !cir.ptr>, ["__retval"] +// CIR: %[[MEMBER:.*]] = cir.const #cir.data_member<1> : !cir.data_member +// CIR: cir.store %[[MEMBER]], %[[RETVAL]] : !cir.data_member, !cir.ptr> +// CIR: %[[RET:.*]] = cir.load %[[RETVAL]] : !cir.ptr>, !cir.data_member +// CIR: cir.return %[[RET]] : !cir.data_member + +// LLVM: define {{.*}} i64 @_Z5test1v() +// LLVM: %[[RETVAL:.*]] = alloca i64 +// LLVM: store i64 4, ptr %[[RETVAL]] +// LLVM: %[[RET:.*]] = load i64, ptr %[[RETVAL]] +// LLVM: ret i64 %[[RET]] + +// OGCG: define {{.*}} i64 @_Z5test1v() +// OGCG: ret i64 4 diff --git a/clang/test/CIR/IR/invalid-data-member.cir 
b/clang/test/CIR/IR/invalid-data-member.cir new file mode 100644 index 0000000000000..2941777404973 --- /dev/null +++ b/clang/test/CIR/IR/invalid-data-member.cir @@ -0,0 +1,27 @@ +// RUN: cir-opt %s -verify-diagnostics -split-input-file + +// ----- + +!u16i = !cir.int +!u32i = !cir.int +!struct1 = !cir.record + +// expected-error@+1 {{member type of a #cir.data_member attribute must match the attribute type}} +#invalid_member_ty = #cir.data_member<0> : !cir.data_member + +// ----- + +!u16i = !cir.int +!incomplete_struct = !cir.record + +// expected-error@+1 {{incomplete 'cir.record' cannot be used to build a non-null data member pointer}} +#incomplete_cls_member = #cir.data_member<0> : !cir.data_member + +// ----- + +!u16i = !cir.int +!u32i = !cir.int +!struct1 = !cir.record + +// expected-error@+1 {{member index of a #cir.data_member attribute is out of range}} +#invalid_member_ty = #cir.data_member<2> : !cir.data_member diff --git a/clang/utils/TableGen/CIRLoweringEmitter.cpp b/clang/utils/TableGen/CIRLoweringEmitter.cpp index 80dc209c69a7b..c81b8941f9a39 100644 --- a/clang/utils/TableGen/CIRLoweringEmitter.cpp +++ b/clang/utils/TableGen/CIRLoweringEmitter.cpp @@ -60,6 +60,7 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName, Code << "class " << PatternName << " : public mlir::OpConversionPattern {\n"; + Code << " [[maybe_unused]] cir::LowerModule *lowerMod;\n"; Code << " [[maybe_unused]] mlir::DataLayout const &dataLayout;\n"; Code << "\n"; @@ -69,10 +70,12 @@ void GenerateLLVMLoweringPattern(llvm::StringRef OpName, Code << " " << PatternName << "(mlir::TypeConverter const " - "&typeConverter, mlir::MLIRContext *context, mlir::DataLayout const " + "&typeConverter, mlir::MLIRContext *context, " + "cir::LowerModule *lowerMod, mlir::DataLayout const " "&dataLayout)\n"; Code << " : OpConversionPattern(typeConverter, context), dataLayout(dataLayout)"; + << ">(typeConverter, context), lowerMod(lowerMod), " + "dataLayout(dataLayout)"; if (IsRecursive) { Code << " {\n"; Code << " setHasBoundedRewriteRecursion();\n"; From 8f3c8dabc64517132c9c438ff467e75f39fbf8a6 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 9 Dec 2025 13:09:19 -0800 Subject: [PATCH 57/63] [MLIR][Bytecode] Use consistent types for resolveEntry (#171502) uint64_t and size_t are not the same across all platforms. This was causing build failures when building this file for wasm: llvm-project/mlir/lib/Bytecode/Reader/BytecodeReader.cpp:1323:19: error: out-of-line definition of 'resolveEntry' does not match any declaration in '(anonymous namespace)::AttrTypeReader' 1323 | T AttrTypeReader::resolveEntry(SmallVectorImpl> &entries, size_t index, | ^~~~~~~~~~~~ third_party/llvm/llvm-project/mlir/lib/Bytecode/Reader/BytecodeReader.cpp:851:7: note: AttrTypeReader defined here 851 | class AttrTypeReader { | ^~~~~~~~~~~~~~ 1 error generated. Use uint64_t everywhere to ensure portability. 
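
For reference, a minimal sketch of the failure mode (hypothetical `Reader`
class, not the actual bytecode reader): the declaration and the out-of-line
definition only match on targets where the two typedefs resolve to the same
underlying type.

```
#include <cstddef>
#include <cstdint>

class Reader {
  // Declared with uint64_t...
  template <typename T> T resolveEntry(uint64_t index);
};

// ...but defined with size_t. On LP64 Linux both are unsigned long, so this
// compiles; on wasm32 (or any target where size_t is not the same type as
// uint64_t) it is rejected as an out-of-line definition that matches no
// declaration.
template <typename T> T Reader::resolveEntry(size_t index) {
  return T{};
}
```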
---
 mlir/lib/Bytecode/Reader/BytecodeReader.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
index dd367b5922558..0ac5fc5358ea5 100644
--- a/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
+++ b/mlir/lib/Bytecode/Reader/BytecodeReader.cpp
@@ -1320,8 +1320,9 @@ LogicalResult AttrTypeReader::initialize(
 }

 template <typename T>
-T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries, size_t index,
-                               StringRef entryType, uint64_t depth) {
+T AttrTypeReader::resolveEntry(SmallVectorImpl<Entry<T>> &entries,
+                               uint64_t index, StringRef entryType,
+                               uint64_t depth) {
   if (index >= entries.size()) {
     emitError(fileLoc) << "invalid " << entryType << " index: " << index;
     return {};

From 926cbddc185e035a4266f25203e81eec8960f114 Mon Sep 17 00:00:00 2001
From: Andrew Haberlandt
Date: Tue, 9 Dec 2025 13:22:54 -0800
Subject: [PATCH 58/63] [sanitizer_common] child_stdin_fd_ should only be set
 on posix_spawn path (#171508)

#170809 added the child_stdin_fd_ field on SymbolizerProcess to allow the
parent process to hold on to the read end of the child's stdin pipe. This
was to avoid SIGPIPE.

However, the `StartSubprocess` path still closes the stdin fd in the parent
here:
https://github.com/llvm/llvm-project/blob/7f5ed91684c808444ede24eb01ad9af73b5806e5/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp#L525-L535

This could cause a double-close of this fd (problematic in the case of fd
reuse).

This moves `child_stdin_fd_` so that it is only initialized on the
posix_spawn path. This should ensure #170809 only truly affects Darwin.
---
 .../sanitizer_symbolizer_posix_libcdep.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
index 29c73e3e1cac1..ab6aee7c9fba7 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp
@@ -176,6 +176,10 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
       internal_close(outfd[1]);
       return false;
     }
+
+    // We intentionally hold on to the read-end so that we don't get a SIGPIPE
+    child_stdin_fd_ = outfd[0];
+
 #  else  // SANITIZER_APPLE
   UNIMPLEMENTED();
 #  endif  // SANITIZER_APPLE
@@ -192,9 +196,6 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() {
   input_fd_ = infd[0];
   output_fd_ = outfd[1];

-  // We intentionally hold on to the read-end so that we don't get a SIGPIPE
-  child_stdin_fd_ = outfd[0];
-
   CHECK_GT(pid, 0);

   // Check that symbolizer subprocess started successfully.

From 64ee4bf73411d1462fec0c2c2b5c44aaaa3a1903 Mon Sep 17 00:00:00 2001
From: Christopher Ferris
Date: Tue, 9 Dec 2025 13:30:46 -0800
Subject: [PATCH 59/63] [scudo] Refactor initialization of TSDs. (#169738)

Instead of getting a lock and then checking/modifying the Initialized
variable, make it an atomic. Doing this, we can remove one of the mutexes
in shared TSDs and avoid any potential lock contention in both shared TSDs
and exclusive TSDs if multiple threads do allocation operations at the same
time.

Add two new tests that make sure no crashes occur if multiple threads try
to do allocations at the same time.
--- .../scudo/standalone/tests/combined_test.cpp | 88 +++++++++++++++++++ .../lib/scudo/standalone/tsd_exclusive.h | 19 ++-- compiler-rt/lib/scudo/standalone/tsd_shared.h | 50 ++++++----- 3 files changed, 127 insertions(+), 30 deletions(-) diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index 1d4208b6a2aa0..b70b9c9269fed 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -18,6 +18,7 @@ #include "size_class_map.h" #include +#include #include #include #include @@ -1396,6 +1397,7 @@ TEST(ScudoCombinedTest, FullUsableSizeMTE) { VerifyExactUsableSize(*Allocator); VerifyIterateOverUsableSize(*Allocator); } + // Verify that no special quarantine blocks appear in iterateOverChunks. TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { using AllocatorT = TestAllocator; @@ -1426,3 +1428,89 @@ TEST(ScudoCombinedTest, QuarantineIterateOverChunks) { << std::hex << Base << " Size " << std::dec << Size; } } + +struct InitSizeClassConfig { + static const scudo::uptr NumBits = 1; + static const scudo::uptr MinSizeLog = 10; + static const scudo::uptr MidSizeLog = 10; + static const scudo::uptr MaxSizeLog = 13; + static const scudo::u16 MaxNumCachedHint = 8; + static const scudo::uptr MaxBytesCachedLog = 12; + static const scudo::uptr SizeDelta = 0; +}; + +struct TestInitSizeConfig { + static const bool MaySupportMemoryTagging = false; + static const bool QuarantineDisabled = true; + + struct Primary { + using SizeClassMap = scudo::FixedSizeClassMap; + static const scudo::uptr RegionSizeLog = 21U; + static const scudo::s32 MinReleaseToOsIntervalMs = INT32_MIN; + static const scudo::s32 MaxReleaseToOsIntervalMs = INT32_MAX; + typedef scudo::uptr CompactPtrT; + static const scudo::uptr CompactPtrScale = 0; + static const bool EnableRandomOffset = true; + static const scudo::uptr MapSizeIncrement = 1UL << 18; + static const scudo::uptr GroupSizeLog = 18; + }; + template + using PrimaryT = scudo::SizeClassAllocator64; + + struct Secondary { + template + using CacheT = scudo::MapAllocatorNoCache; + }; + + template using SecondaryT = scudo::MapAllocator; +}; + +struct TestInitSizeTSDSharedConfig : public TestInitSizeConfig { + template using TSDRegistryT = scudo::TSDRegistrySharedT; +}; + +struct TestInitSizeTSDExclusiveConfig : public TestInitSizeConfig { + template using TSDRegistryT = scudo::TSDRegistryExT; +}; + +template void RunStress() { + auto Allocator = std::unique_ptr(new AllocatorT()); + + // This test is designed to try and have many threads trying to initialize + // the TSD at the same time. Make sure this doesn't crash. + std::atomic_bool StartRunning = false; + std::vector threads; + for (size_t I = 0; I < 16; I++) { + threads.emplace_back(new std::thread([&Allocator, &StartRunning]() { + while (!StartRunning.load()) + ; + + void *Ptr = Allocator->allocate(10, Origin); + EXPECT_TRUE(Ptr != nullptr); + // Make sure this value is not optimized away. + asm volatile("" : : "r,m"(Ptr) : "memory"); + Allocator->deallocate(Ptr, Origin); + })); + } + + StartRunning = true; + + for (auto *thread : threads) { + thread->join(); + delete thread; + } +} + +TEST(ScudoCombinedTest, StressThreadInitTSDShared) { + using AllocatorT = scudo::Allocator; + // Run the stress test a few times. 
+ for (size_t I = 0; I < 10; I++) + RunStress(); +} + +TEST(ScudoCombinedTest, StressThreadInitTSDExclusive) { + using AllocatorT = scudo::Allocator; + // Run the stress test a few times. + for (size_t I = 0; I < 10; I++) + RunStress(); +} diff --git a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h index a58ba6505089f..75921f2be3ffe 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_exclusive.h +++ b/compiler-rt/lib/scudo/standalone/tsd_exclusive.h @@ -52,17 +52,20 @@ template struct TSDRegistryExT { bool UnlockRequired; }; - void init(Allocator *Instance) REQUIRES(Mutex) { - DCHECK(!Initialized); + void init(Allocator *Instance) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + // If more than one thread is initializing at the exact same moment, the + // threads that lose don't need to do anything. + if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0)) + return; Instance->init(); CHECK_EQ(pthread_key_create(&PThreadKey, teardownThread), 0); FallbackTSD.init(Instance); - Initialized = true; + atomic_store_relaxed(&Initialized, 1); } - void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) + void initOnceMaybe(Allocator *Instance) { + if (LIKELY(atomic_load_relaxed(&Initialized) != 0)) return; init(Instance); // Sets Initialized. } @@ -81,7 +84,7 @@ template struct TSDRegistryExT { FallbackTSD = {}; State = {}; ScopedLock L(Mutex); - Initialized = false; + atomic_store_relaxed(&Initialized, 0); } void drainCaches(Allocator *Instance) { @@ -158,7 +161,7 @@ template struct TSDRegistryExT { } pthread_key_t PThreadKey = {}; - bool Initialized GUARDED_BY(Mutex) = false; + atomic_u8 Initialized = {}; atomic_u8 Disabled = {}; TSD FallbackTSD; HybridMutex Mutex; diff --git a/compiler-rt/lib/scudo/standalone/tsd_shared.h b/compiler-rt/lib/scudo/standalone/tsd_shared.h index 404e984e1f5e9..425a028c955aa 100644 --- a/compiler-rt/lib/scudo/standalone/tsd_shared.h +++ b/compiler-rt/lib/scudo/standalone/tsd_shared.h @@ -47,20 +47,24 @@ struct TSDRegistrySharedT { TSD *CurrentTSD; }; - void init(Allocator *Instance) REQUIRES(Mutex) { - DCHECK(!Initialized); + void init(Allocator *Instance) EXCLUDES(Mutex) { + ScopedLock L(Mutex); + // If more than one thread is initializing at the exact same moment, the + // threads that lose don't need to do anything. + if (UNLIKELY(atomic_load_relaxed(&Initialized) != 0)) + return; + Instance->init(); for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].init(Instance); const u32 NumberOfCPUs = getNumberOfCPUs(); setNumberOfTSDs((NumberOfCPUs == 0) ? DefaultTSDCount : Min(NumberOfCPUs, DefaultTSDCount)); - Initialized = true; + atomic_store_relaxed(&Initialized, 1); } - void initOnceMaybe(Allocator *Instance) EXCLUDES(Mutex) { - ScopedLock L(Mutex); - if (LIKELY(Initialized)) + void initOnceMaybe(Allocator *Instance) { + if (LIKELY(atomic_load_relaxed(&Initialized) != 0)) return; init(Instance); // Sets Initialized. 
} @@ -72,11 +76,11 @@ struct TSDRegistrySharedT { } setCurrentTSD(nullptr); ScopedLock L(Mutex); - Initialized = false; + atomic_store_relaxed(&Initialized, 0); } void drainCaches(Allocator *Instance) { - ScopedLock L(MutexTSDs); + ScopedLock L(Mutex); for (uptr I = 0; I < NumberOfTSDs; ++I) { TSDs[I].lock(); Instance->drainCache(&TSDs[I]); @@ -93,7 +97,6 @@ struct TSDRegistrySharedT { void disable() NO_THREAD_SAFETY_ANALYSIS { Mutex.lock(); - MutexTSDs.lock(); for (u32 I = 0; I < TSDsArraySize; I++) TSDs[I].lock(); } @@ -101,13 +104,14 @@ struct TSDRegistrySharedT { void enable() NO_THREAD_SAFETY_ANALYSIS { for (s32 I = static_cast(TSDsArraySize - 1); I >= 0; I--) TSDs[I].unlock(); - MutexTSDs.unlock(); Mutex.unlock(); } bool setOption(Option O, sptr Value) { - if (O == Option::MaxTSDsCount) + if (O == Option::MaxTSDsCount) { + ScopedLock L(Mutex); return setNumberOfTSDs(static_cast(Value)); + } if (O == Option::ThreadDisableMemInit) setDisableMemInit(Value); // Not supported by the TSD Registry, but not an error either. @@ -116,8 +120,8 @@ struct TSDRegistrySharedT { bool getDisableMemInit() const { return *getTlsPtr() & 1; } - void getStats(ScopedString *Str) EXCLUDES(MutexTSDs) { - ScopedLock L(MutexTSDs); + void getStats(ScopedString *Str) EXCLUDES(Mutex) { + ScopedLock L(Mutex); Str->append("Stats: SharedTSDs: %u available; total %u\n", NumberOfTSDs, TSDsArraySize); @@ -171,8 +175,7 @@ struct TSDRegistrySharedT { return reinterpret_cast *>(*getTlsPtr() & ~1ULL); } - bool setNumberOfTSDs(u32 N) EXCLUDES(MutexTSDs) { - ScopedLock L(MutexTSDs); + bool setNumberOfTSDs(u32 N) REQUIRES(Mutex) { if (N < NumberOfTSDs) return false; if (N > TSDsArraySize) @@ -213,14 +216,14 @@ struct TSDRegistrySharedT { // TSDs is an array of locks which is not supported for marking thread-safety // capability. NOINLINE TSD *getTSDAndLockSlow(TSD *CurrentTSD) - EXCLUDES(MutexTSDs) { + EXCLUDES(Mutex) { // Use the Precedence of the current TSD as our random seed. Since we are // in the slow path, it means that tryLock failed, and as a result it's // very likely that said Precedence is non-zero. const u32 R = static_cast(CurrentTSD->getPrecedence()); u32 N, Inc; { - ScopedLock L(MutexTSDs); + ScopedLock L(Mutex); N = NumberOfTSDs; DCHECK_NE(NumberOfCoPrimes, 0U); Inc = CoPrimes[R % NumberOfCoPrimes]; @@ -257,12 +260,15 @@ struct TSDRegistrySharedT { } atomic_u32 CurrentIndex = {}; - u32 NumberOfTSDs GUARDED_BY(MutexTSDs) = 0; - u32 NumberOfCoPrimes GUARDED_BY(MutexTSDs) = 0; - u32 CoPrimes[TSDsArraySize] GUARDED_BY(MutexTSDs) = {}; - bool Initialized GUARDED_BY(Mutex) = false; + u32 NumberOfTSDs GUARDED_BY(Mutex) = 0; + u32 NumberOfCoPrimes GUARDED_BY(Mutex) = 0; + u32 CoPrimes[TSDsArraySize] GUARDED_BY(Mutex) = {}; + atomic_u8 Initialized = {}; + // Used for global initialization and TSDs access. + // Acquiring the global initialization should only lock once in normal + // operation, which is why using it for TSDs access should not cause + // any interference. HybridMutex Mutex; - HybridMutex MutexTSDs; TSD TSDs[TSDsArraySize]; }; From f29d06029f1ccae7359440a5d33d06406d10df31 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 9 Dec 2025 21:32:09 +0000 Subject: [PATCH 60/63] Revert "[LV] Mark checks as never succeeding for high cost cutoff." This reverts commit 8a115b6934a90441d77ea54af73e7aaaa1394b38. This broke premerge. 
https://lab.llvm.org/staging/#/builders/192/builds/13326 /home/gha/llvm-project/clang/test/Frontend/optimization-remark-options.c:10:11: remark: loop not vectorized: cannot prove it is safe to reorder floating-point operations; allow reordering by specifying '#pragma clang loop vectorize(enable)' before the loop or by providing the compiler option '-ffast-math' --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +-- ...ime-check-threshold-with-force-metadata.ll | 39 ++++++++++++------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 79cdae25e38da..15d0fa41bd902 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1827,12 +1827,8 @@ class GeneratedRTChecks { // profile info. CostTooHigh = LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold; - if (CostTooHigh) { - // Mark runtime checks as never succeeding when they exceed the threshold. - MemRuntimeCheckCond = ConstantInt::getTrue(L->getHeader()->getContext()); - SCEVCheckCond = ConstantInt::getTrue(L->getHeader()->getContext()); + if (CostTooHigh) return; - } BasicBlock *LoopHeader = L->getHeader(); BasicBlock *Preheader = L->getLoopPreheader(); diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll index 5376eb86882b7..b7d36fe7928e5 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check-threshold-with-force-metadata.ll @@ -2,23 +2,29 @@ ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=0 -S %s | FileCheck --check-prefix=LIMIT0 %s ; RUN: opt -p loop-vectorize -vectorize-memory-check-threshold=1 -S %s | FileCheck --check-prefix=LIMIT1 %s -; Make sure we do not incorrectly vectorize with -vectorize-memory-check-threshold=0; -; no runtime check is generated and the loop should not be vectorized. +; FIXME: Currently this miscompiles with -vectorize-memory-check-threshold=0; +; no runtime check is generated even though one is needed and !noalias +; annotations are added. 
define i16 @runtime_checks_needed(ptr %src, ptr %dst) { ; LIMIT0-LABEL: define i16 @runtime_checks_needed( ; LIMIT0-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]]) { -; LIMIT0-NEXT: [[ENTRY:.*]]: -; LIMIT0-NEXT: br label %[[LOOP:.*]] -; LIMIT0: [[LOOP]]: -; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[LOOP]] ] -; LIMIT0-NEXT: [[L:%.*]] = load i16, ptr [[SRC]], align 1 +; LIMIT0-NEXT: [[ENTRY:.*:]] +; LIMIT0-NEXT: br label %[[VECTOR_PH:.*]] +; LIMIT0: [[VECTOR_PH]]: +; LIMIT0-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 1, !alias.scope [[META0:![0-9]+]] +; LIMIT0-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 +; LIMIT0-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; LIMIT0-NEXT: br label %[[VECTOR_BODY:.*]] +; LIMIT0: [[VECTOR_BODY]]: +; LIMIT0-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; LIMIT0-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[DST]], i64 [[INDEX]] -; LIMIT0-NEXT: store i16 [[L]], ptr [[TMP1]], align 1 -; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], 1 +; LIMIT0-NEXT: store <2 x i16> [[BROADCAST_SPLAT]], ptr [[TMP1]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; LIMIT0-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; LIMIT0-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000 -; LIMIT0-NEXT: br i1 [[TMP2]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] +; LIMIT0-NEXT: br i1 [[TMP2]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; LIMIT0: [[MIDDLE_BLOCK]]: +; LIMIT0-NEXT: br label %[[EXIT:.*]] ; LIMIT0: [[EXIT]]: -; LIMIT0-NEXT: [[TMP0:%.*]] = phi i16 [ [[L]], %[[LOOP]] ] ; LIMIT0-NEXT: ret i16 [[TMP0]] ; ; LIMIT1-LABEL: define i16 @runtime_checks_needed( @@ -82,9 +88,14 @@ exit: !3 = !{!"llvm.loop.vectorize.enable", i1 true} ;. -; LIMIT0: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} -; LIMIT0: [[META1]] = !{!"llvm.loop.vectorize.width", i32 2} -; LIMIT0: [[META2]] = !{!"llvm.loop.vectorize.enable", i1 true} +; LIMIT0: [[META0]] = !{[[META1:![0-9]+]]} +; LIMIT0: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; LIMIT0: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; LIMIT0: [[META3]] = !{[[META4:![0-9]+]]} +; LIMIT0: [[META4]] = distinct !{[[META4]], [[META2]]} +; LIMIT0: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} +; LIMIT0: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} +; LIMIT0: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} ;. 
; LIMIT1: [[META0]] = !{[[META1:![0-9]+]]} ; LIMIT1: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} From d86bc19ad79896b956c828a8f9c2c0d94d131466 Mon Sep 17 00:00:00 2001 From: Erick Velez Date: Tue, 9 Dec 2025 13:37:00 -0800 Subject: [PATCH 61/63] [clang-doc] Do not serialize empty text comments (#169087) --- clang-tools-extra/clang-doc/JSONGenerator.cpp | 43 +++++++- .../clang-doc/basic-project.mustache.test | 97 ------------------- .../test/clang-doc/json/class.cpp | 3 - 3 files changed, 40 insertions(+), 103 deletions(-) diff --git a/clang-tools-extra/clang-doc/JSONGenerator.cpp b/clang-tools-extra/clang-doc/JSONGenerator.cpp index 97c599a3f605c..77aa8794561e4 100644 --- a/clang-tools-extra/clang-doc/JSONGenerator.cpp +++ b/clang-tools-extra/clang-doc/JSONGenerator.cpp @@ -84,8 +84,23 @@ serializeLocation(const Location &Loc, return LocationObj; } +/// Insert comments into a key in the Description object. +/// +/// \param Comment Either an Object or Array, depending on the comment type +/// \param Key The type (Brief, Code, etc.) of comment to be inserted static void insertComment(Object &Description, json::Value &Comment, StringRef Key) { + // The comment has a Children array for the actual text, with meta attributes + // alongside it in the Object. + if (auto *Obj = Comment.getAsObject()) { + if (auto *Children = Obj->getArray("Children"); Children->empty()) + return; + } + // The comment is just an array of text comments. + else if (auto *Array = Comment.getAsArray(); Array->empty()) { + return; + } + auto DescriptionIt = Description.find(Key); if (DescriptionIt == Description.end()) { @@ -98,10 +113,28 @@ static void insertComment(Object &Description, json::Value &Comment, } } +/// Takes the nested "Children" array from a comment Object. 
+/// +/// \return a json::Array of comments, possible json::Value::Kind::Null static json::Value extractTextComments(Object *ParagraphComment) { if (!ParagraphComment) - return json::Object(); - return *ParagraphComment->get("Children"); + return json::Value(nullptr); + json::Value *Children = ParagraphComment->get("Children"); + if (!Children) + return json::Value(nullptr); + auto ChildrenArray = *Children->getAsArray(); + auto ChildrenIt = ChildrenArray.begin(); + while (ChildrenIt != ChildrenArray.end()) { + auto *ChildObj = ChildrenIt->getAsObject(); + assert(ChildObj && "Invalid JSON object in Comment"); + auto TextComment = ChildObj->getString("TextComment"); + if (!TextComment || TextComment->empty()) { + ChildrenIt = ChildrenArray.erase(ChildrenIt); + continue; + } + ++ChildrenIt; + } + return ChildrenArray; } static json::Value extractVerbatimComments(json::Array VerbatimLines) { @@ -131,7 +164,8 @@ static Object serializeComment(const CommentInfo &I, Object &Description) { switch (I.Kind) { case CommentKind::CK_TextComment: { - Obj.insert({commentKindToString(I.Kind), I.Text}); + if (!I.Text.empty()) + Obj.insert({commentKindToString(I.Kind), I.Text}); return Obj; } @@ -265,6 +299,9 @@ serializeCommonAttributes(const Info &I, json::Object &Obj, if (auto *ParagraphComment = Comment.getAsObject(); ParagraphComment->get("ParagraphComment")) { auto TextCommentsArray = extractTextComments(ParagraphComment); + if (TextCommentsArray.kind() == json::Value::Null || + TextCommentsArray.getAsArray()->empty()) + continue; insertComment(Description, TextCommentsArray, "ParagraphComments"); } } diff --git a/clang-tools-extra/test/clang-doc/basic-project.mustache.test b/clang-tools-extra/test/clang-doc/basic-project.mustache.test index 282ca73384c3f..b985a39265de7 100644 --- a/clang-tools-extra/test/clang-doc/basic-project.mustache.test +++ b/clang-tools-extra/test/clang-doc/basic-project.mustache.test @@ -65,9 +65,6 @@ HTML-SHAPE:
      HTML-SHAPE:

      Abstract base class for shapes.

      HTML-SHAPE:
      HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      -HTML-SHAPE:
      HTML-SHAPE:

      Provides a common interface for different types of shapes.

      HTML-SHAPE:
      HTML-SHAPE:
      @@ -83,12 +80,6 @@ HTML-SHAPE:
      HTML-SHAPE:
      HTML-SHAPE:

      Calculates the area of the shape.

      HTML-SHAPE:
      -HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      -HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      HTML-SHAPE:

      Returns

      HTML-SHAPE:

      double The area of the shape.

      HTML-SHAPE:
      @@ -101,12 +92,6 @@ HTML-SHAPE:
      HTML-SHAPE:
      HTML-SHAPE:

      Calculates the perimeter of the shape.

      HTML-SHAPE:
      -HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      -HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      HTML-SHAPE:

      Returns

      HTML-SHAPE:

      double The perimeter of the shape.

      HTML-SHAPE:
      @@ -119,9 +104,6 @@ HTML-SHAPE:
      HTML-SHAPE:
      HTML-SHAPE:

      Virtual destructor.

      HTML-SHAPE:
      -HTML-SHAPE:
      -HTML-SHAPE:

      -HTML-SHAPE:
      HTML-SHAPE:
      HTML-SHAPE:
      HTML-SHAPE:
      @@ -210,9 +192,6 @@ HTML-CALC:
      HTML-CALC:

      A simple calculator class.

      HTML-CALC:
      HTML-CALC:
      -HTML-CALC:

      -HTML-CALC:
      -HTML-CALC:
      HTML-CALC:

      Provides basic arithmetic operations.

      HTML-CALC:
      HTML-CALC:
@@ -239,12 +218,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Adds two integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -264,12 +237,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Subtracts the second integer from the first.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -289,12 +256,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Multiplies two integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -314,12 +275,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Divides the first integer by the second.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -329,7 +284,6 @@ HTML-CALC: b Second integer. HTML-CALC:
HTML-CALC:

Returns

HTML-CALC:

double The result of a / b.

-HTML-CALC:

HTML-CALC:

Throws

HTML-CALC:
HTML-CALC: std::invalid_argument if b is zero. @@ -344,12 +298,6 @@ HTML-CALC:
HTML-CALC:
HTML-CALC:

Performs the mod operation on integers.

HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
-HTML-CALC:
-HTML-CALC:

-HTML-CALC:
HTML-CALC:

Parameters

HTML-CALC:
HTML-CALC: a First integer. @@ -431,9 +379,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:

Rectangle class derived from Shape.

HTML-RECTANGLE:
HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
HTML-RECTANGLE:

Represents a rectangle with a given width and height.

HTML-RECTANGLE:
HTML-RECTANGLE:
@@ -449,12 +394,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Constructs a new Rectangle object.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Parameters

HTML-RECTANGLE:
HTML-RECTANGLE: width Width of the rectangle. @@ -472,12 +411,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Calculates the area of the rectangle.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Returns

HTML-RECTANGLE:

double The area of the rectangle.

HTML-RECTANGLE:
@@ -490,12 +423,6 @@ HTML-RECTANGLE:
HTML-RECTANGLE:
HTML-RECTANGLE:

Calculates the perimeter of the rectangle.

HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
-HTML-RECTANGLE:
-HTML-RECTANGLE:

-HTML-RECTANGLE:
HTML-RECTANGLE:

Returns

HTML-RECTANGLE:

double The perimeter of the rectangle.

HTML-RECTANGLE:
@@ -570,9 +497,6 @@ HTML-CIRCLE:
HTML-CIRCLE:

Circle class derived from Shape.

HTML-CIRCLE:
HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
HTML-CIRCLE:

Represents a circle with a given radius.

HTML-CIRCLE:
HTML-CIRCLE:
@@ -588,12 +512,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Constructs a new Circle object.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Parameters

HTML-CIRCLE:
HTML-CIRCLE: radius Radius of the circle. @@ -608,12 +526,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Calculates the area of the circle.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Returns

HTML-CIRCLE:

double The area of the circle.

HTML-CIRCLE:
@@ -626,15 +538,6 @@ HTML-CIRCLE:
HTML-CIRCLE:
HTML-CIRCLE:

Calculates the perimeter of the circle.

HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
-HTML-CIRCLE:
-HTML-CIRCLE:

-HTML-CIRCLE:
HTML-CIRCLE:

Returns

HTML-CIRCLE:

double The perimeter of the circle.

HTML-CIRCLE:

Code

diff --git a/clang-tools-extra/test/clang-doc/json/class.cpp b/clang-tools-extra/test/clang-doc/json/class.cpp
index 91160585bef1a..8bf9402adf054 100644
--- a/clang-tools-extra/test/clang-doc/json/class.cpp
+++ b/clang-tools-extra/test/clang-doc/json/class.cpp
@@ -47,9 +47,6 @@ struct MyClass {
 // CHECK-NEXT:      },
 // CHECK-NEXT:      {
 // CHECK-NEXT:        "TextComment": " It has some nice methods and fields."
-// CHECK-NEXT:      },
-// CHECK-NEXT:      {
-// CHECK-NEXT:        "TextComment": ""
 // CHECK-NEXT:      }
 // CHECK:           "DocumentationFileName": "_ZTV7MyClass",
 // CHECK:           "Enums": [

From 3310c0be583451e5770b6afbedd926ff3781356f Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra
Date: Tue, 9 Dec 2025 21:38:58 +0000
Subject: [PATCH 62/63] [VPlan] Strip TODO to consolidate
 (ActiveLaneMask|Widen)PHI (#171392)

They cannot be consolidated, as WidenPHI is not a header PHI, while
ActiveLaneMaskPHI is.
---
 llvm/lib/Transforms/Vectorize/VPlan.h          | 2 --
 llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 --
 2 files changed, 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fd02493fa2c78..afb654ed882f4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -3600,8 +3600,6 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {

 /// A recipe for generating the active lane mask for the vector loop that is
 /// used to predicate the vector operations.
-/// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
-/// remove VPActiveLaneMaskPHIRecipe.
 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
 public:
   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 519a104b9484f..b0c8564ad231a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -4407,8 +4407,6 @@ void VPWidenPHIRecipe::printRecipe(raw_ostream &O, const Twine &Indent,
 }
 #endif

-// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
-// remove VPActiveLaneMaskPHIRecipe.
 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
   BasicBlock *VectorPH =
       State.CFG.VPBB2IRBB.at(getParent()->getCFGPredecessor(0));

From 0f318d9b93c097734177a402c3c3507659d0e782 Mon Sep 17 00:00:00 2001
From: Ron Lieberman
Date: Wed, 10 Dec 2025 02:15:47 -0600
Subject: [PATCH 63/63] Revert "[flang][OpenMP] Fix firstprivate not working
 with lastprivate in DO SIMD (#170163)"

This reverts commit 748e7af8dd6e9b4683a6402a0ca6598fe23a9c1e.
--- flang/lib/Lower/OpenMP/OpenMP.cpp | 22 +++-- ...-simd-firstprivate-lastprivate-runtime.f90 | 48 ---------- .../do-simd-firstprivate-lastprivate.f90 | 89 ------------------- flang/test/Lower/OpenMP/order-clause.f90 | 8 +- flang/test/Lower/OpenMP/wsloop-simd.f90 | 9 +- revert_patches.txt | 3 + 6 files changed, 27 insertions(+), 152 deletions(-) delete mode 100644 flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 delete mode 100644 flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index b9ca5cedfe8ec..134702d6e4434 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3338,14 +3338,19 @@ static mlir::omp::WsloopOp genCompositeDoSimd( genSimdClauses(converter, semaCtx, simdItem->clauses, loc, simdClauseOps, simdReductionSyms); - - DataSharingProcessor wsloopItemDSP(converter, semaCtx, doItem->clauses, eval, - /*shouldCollectPreDeterminedSymbols=*/true, - /*useDelayedPrivatization=*/true, - symTable); + DataSharingProcessor wsloopItemDSP( + converter, semaCtx, doItem->clauses, eval, + /*shouldCollectPreDeterminedSymbols=*/false, + /*useDelayedPrivatization=*/true, symTable); wsloopItemDSP.processStep1(); wsloopItemDSP.processStep2(&wsloopClauseOps); + DataSharingProcessor simdItemDSP(converter, semaCtx, simdItem->clauses, eval, + /*shouldCollectPreDeterminedSymbols=*/true, + /*useDelayedPrivatization=*/true, symTable); + simdItemDSP.processStep1(); + simdItemDSP.processStep2(&simdClauseOps, simdItem->id); + // Pass the innermost leaf construct's clauses because that's where COLLAPSE // is placed by construct decomposition. mlir::omp::LoopNestOperands loopNestClauseOps; @@ -3364,9 +3369,8 @@ static mlir::omp::WsloopOp genCompositeDoSimd( wsloopOp.setComposite(/*val=*/true); EntryBlockArgs simdArgs; - // For composite 'do simd', privatization is handled by the wsloop. - // The simd does not create separate private storage for variables already - // privatized by the worksharing construct. + simdArgs.priv.syms = simdItemDSP.getDelayedPrivSymbols(); + simdArgs.priv.vars = simdClauseOps.privateVars; simdArgs.reduction.syms = simdReductionSyms; simdArgs.reduction.vars = simdClauseOps.reductionVars; auto simdOp = @@ -3376,7 +3380,7 @@ static mlir::omp::WsloopOp genCompositeDoSimd( genLoopNestOp(converter, symTable, semaCtx, eval, loc, queue, simdItem, loopNestClauseOps, iv, {{wsloopOp, wsloopArgs}, {simdOp, simdArgs}}, - llvm::omp::Directive::OMPD_do_simd, wsloopItemDSP); + llvm::omp::Directive::OMPD_do_simd, simdItemDSP); return wsloopOp; } diff --git a/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 b/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 deleted file mode 100644 index 4fef69188e0ee..0000000000000 --- a/flang/test/Integration/OpenMP/do-simd-firstprivate-lastprivate-runtime.f90 +++ /dev/null @@ -1,48 +0,0 @@ -! Test runtime behavior of DO SIMD with firstprivate and lastprivate on same variable -! This is the reproducer from issue #168306 - -! REQUIRES: openmp-runtime - -! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s --check-prefix=LLVM -! RUN: %flang -fopenmp %s -o %t && %t | FileCheck %s - -! LLVM-LABEL: define {{.*}} @_QQmain -program main - integer :: a - integer :: i - - a = 10 - !$omp do simd lastprivate(a) firstprivate(a) - do i = 1, 1 - ! Inside loop: a should be 10 (from firstprivate initialization) - ! 
CHECK: main1 : a = 10 - print *, "main1 : a = ", a - a = 20 - end do - !$omp end do simd - ! After loop: a should be 20 (from lastprivate copy-out) - ! CHECK: main2 : a = 20 - print *, "main2 : a = ", a - - call sub - ! CHECK: pass - print *, 'pass' -end program main - -subroutine sub - integer :: a - integer :: i - - a = 10 - !$omp do simd lastprivate(a) firstprivate(a) - do i = 1, 1 - ! Inside loop: a should be 10 (from firstprivate initialization) - ! CHECK: sub1 : a = 10 - print *, "sub1 : a = ", a - a = 20 - end do - !$omp end do simd - ! After loop: a should be 20 (from lastprivate copy-out) - ! CHECK: sub2 : a = 20 - print *, "sub2 : a = ", a -end subroutine sub diff --git a/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 b/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 deleted file mode 100644 index 429409926d47b..0000000000000 --- a/flang/test/Lower/OpenMP/do-simd-firstprivate-lastprivate.f90 +++ /dev/null @@ -1,89 +0,0 @@ -! Test for DO SIMD with the same variable in both firstprivate and lastprivate clauses -! This tests the fix for issue #168306 - -! RUN: %flang_fc1 -fopenmp -mmlir --enable-delayed-privatization-staging=true -emit-hlfir %s -o - | FileCheck %s - -! Test case 1: Basic test with firstprivate + lastprivate on same variable -! CHECK-LABEL: func.func @_QPdo_simd_first_last_same_var -subroutine do_simd_first_last_same_var() - integer :: a - integer :: i - a = 10 - - ! CHECK: omp.wsloop - ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) - ! CHECK-NEXT: omp.simd - ! CHECK-NOT: private - ! CHECK-NEXT: omp.loop_nest (%[[IV:.*]]) : i32 - !$omp do simd firstprivate(a) lastprivate(a) - do i = 1, 1 - ! CHECK: %[[FIRSTPRIV_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIV_A]] - ! CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] - ! The private copy should be initialized from firstprivate (value 10) - ! and then modified to 20 - a = 20 - end do - !$omp end do simd - ! After the loop, 'a' should be 20 due to lastprivate -end subroutine do_simd_first_last_same_var - -! Test case 2: Test with lastprivate and firstprivate in reverse order -! CHECK-LABEL: func.func @_QPdo_simd_last_first_reverse -subroutine do_simd_last_first_reverse() - integer :: a - integer :: i - a = 10 - - ! CHECK: omp.wsloop - ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) - ! CHECK-NEXT: omp.simd - ! CHECK-NOT: private - !$omp do simd lastprivate(a) firstprivate(a) - do i = 1, 1 - a = 20 - end do - !$omp end do simd -end subroutine do_simd_last_first_reverse - -! Test case 3: Multiple variables with mixed privatization -! CHECK-LABEL: func.func @_QPdo_simd_multiple_vars -subroutine do_simd_multiple_vars() - integer :: a, b, c - integer :: i - a = 10 - b = 20 - c = 30 - - ! CHECK: omp.wsloop - ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}firstprivate{{.*}} %{{.*}} -> %{{.*}}, @{{.*}}private{{.*}} %{{.*}} -> %{{.*}} : !fir.ref, !fir.ref, !fir.ref) - ! CHECK-NEXT: omp.simd - ! CHECK-NOT: private - !$omp do simd firstprivate(a, b) lastprivate(a) private(c) - do i = 1, 5 - a = a + 1 - b = b + 1 - c = i - end do - !$omp end do simd -end subroutine do_simd_multiple_vars - -! Test case 4: Reproducer from issue #168306 -! CHECK-LABEL: func.func @_QPissue_168306_reproducer -subroutine issue_168306_reproducer() - integer :: a - integer :: i - a = 10 - - ! 
CHECK: omp.wsloop - ! CHECK-SAME: private(@{{.*}}firstprivate{{.*}} %{{.*}} -> %[[FIRSTPRIV_A:.*]], @{{.*}}private{{.*}} %{{.*}} -> %[[PRIV_I:.*]] : !fir.ref, !fir.ref) - ! CHECK-NEXT: omp.simd - ! CHECK-NOT: private - !$omp do simd lastprivate(a) firstprivate(a) - do i = 1, 1 - ! Inside the loop, 'a' should start at 10 (from firstprivate) - ! This is the key behavior that was broken - a = 20 - end do - !$omp end do simd - ! After the loop, 'a' should be 20 (from lastprivate) -end subroutine issue_168306_reproducer diff --git a/flang/test/Lower/OpenMP/order-clause.f90 b/flang/test/Lower/OpenMP/order-clause.f90 index bf007c765b0c1..1f678e02708da 100644 --- a/flang/test/Lower/OpenMP/order-clause.f90 +++ b/flang/test/Lower/OpenMP/order-clause.f90 @@ -36,15 +36,15 @@ end subroutine do_order !CHECK-LABEL: func.func @_QPdo_simd_order() { subroutine do_simd_order - !CHECK: omp.wsloop order(reproducible:concurrent) + !CHECK: omp.wsloop order(reproducible:concurrent) { !$omp do simd order(concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(reproducible:concurrent) + !CHECK: omp.wsloop order(reproducible:concurrent) { !$omp do simd order(reproducible:concurrent) do i = 1, 10 end do - !CHECK: omp.wsloop order(unconstrained:concurrent) + !CHECK: omp.wsloop order(unconstrained:concurrent) { !$omp do simd order(unconstrained:concurrent) do i = 1, 10 end do @@ -53,7 +53,7 @@ end subroutine do_simd_order !CHECK-LABEL: func.func @_QPdo_simd_order_parallel() { subroutine do_simd_order_parallel !CHECK: omp.parallel { - !CHECK: omp.wsloop order(reproducible:concurrent) + !CHECK: omp.wsloop order(reproducible:concurrent) { !$omp parallel do simd order(reproducible:concurrent) do i = 1, 10 end do diff --git a/flang/test/Lower/OpenMP/wsloop-simd.f90 b/flang/test/Lower/OpenMP/wsloop-simd.f90 index b18bc29efb230..03e35de04cace 100644 --- a/flang/test/Lower/OpenMP/wsloop-simd.f90 +++ b/flang/test/Lower/OpenMP/wsloop-simd.f90 @@ -71,13 +71,16 @@ end subroutine do_simd_reduction subroutine do_simd_private() integer, allocatable :: tmp ! CHECK: omp.wsloop - ! CHECK-SAME: private(@[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref) ! CHECK-NEXT: omp.simd + ! CHECK-SAME: private(@[[PRIV_BOX_SYM:.*]] %{{.*}} -> %[[PRIV_BOX:.*]], @[[PRIV_IVAR_SYM:.*]] %{{.*}} -> %[[PRIV_IVAR:.*]] : !fir.ref>>, !fir.ref) ! CHECK-NEXT: omp.loop_nest (%[[IVAR:.*]]) : i32 !$omp do simd private(tmp) do i=1, 10 + ! CHECK: %[[PRIV_BOX_DECL:.*]]:2 = hlfir.declare %[[PRIV_BOX]] ! CHECK: %[[PRIV_IVAR_DECL:.*]]:2 = hlfir.declare %[[PRIV_IVAR]] ! CHECK: hlfir.assign %[[IVAR]] to %[[PRIV_IVAR_DECL]]#0 + ! CHECK: %[[PRIV_BOX_LOAD:.*]] = fir.load %[[PRIV_BOX_DECL]] + ! CHECK: hlfir.assign %{{.*}} to %[[PRIV_BOX_DECL]]#0 ! CHECK: omp.yield tmp = tmp + 1 end do @@ -87,11 +90,13 @@ end subroutine do_simd_private subroutine do_simd_lastprivate_firstprivate() integer :: a ! CHECK: omp.wsloop - ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref, !fir.ref) + ! CHECK-SAME: private(@[[FIRSTPRIVATE_A_SYM:.*]] %{{.*}} -> %[[FIRSTPRIVATE_A:.*]] : !fir.ref) ! CHECK-NEXT: omp.simd + ! CHECK-SAME: private(@[[PRIVATE_A_SYM:.*]] %{{.*}} -> %[[PRIVATE_A:.*]], @[[PRIVATE_I_SYM:.*]] %{{.*}} -> %[[PRIVATE_I:.*]] : !fir.ref, !fir.ref) !$omp do simd lastprivate(a) firstprivate(a) do i = 1, 10 ! CHECK: %[[FIRSTPRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[FIRSTPRIVATE_A]] + ! CHECK: %[[PRIVATE_A_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_A]] ! 
CHECK: %[[PRIVATE_I_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_I]] a = a + 1 end do diff --git a/revert_patches.txt b/revert_patches.txt index 57ede019ece2b..68c3dfbdd28fa 100644 --- a/revert_patches.txt +++ b/revert_patches.txt @@ -11,3 +11,6 @@ complicated build, deferring needs more work to land [Flang] Move builtin .mod generation into runtimes (Reapply #137828) --- +olcf fails smoke-fort +[flang][OpenMP] Fix firstprivate not working with lastprivate in DO SIMD (#170163) +---