# Examples

First, let's take a look at an example that doesn't actually use any
intrinsics but instead relies on LLVM's auto-vectorization to produce
optimized vectorized code for AVX2 as well as for the default platform.

```rust
fn main() {
    let mut dst = [0];
    add_quickly(&[1], &[2], &mut dst);
    assert_eq!(dst[0], 3);
}

fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // Note that this `unsafe` block is safe because we're testing
        // that the `avx2` feature is indeed available on our CPU.
        if is_x86_feature_detected!("avx2") {
            return unsafe { add_quickly_avx2(a, b, c) };
        }
    }

    add_quickly_fallback(a, b, c)
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
    add_quickly_fallback(a, b, c) // the function below is inlined here
}

fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
    for ((a, b), c) in a.iter().zip(b).zip(c) {
        *c = *a + *b;
    }
}
```
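If you know at build time that AVX2 will be available, for example because the
crate is compiled with `RUSTFLAGS="-C target-feature=+avx2"`, the runtime check
can be replaced with compile-time dispatch. Here's a minimal sketch of that
variant, reusing the functions above (the `add_quickly_static` name is our
own):

```rust
#[allow(unreachable_code)] // the fallback call is unreachable in AVX2 builds
fn add_quickly_static(a: &[u8], b: &[u8], c: &mut [u8]) {
    // This `cfg` is evaluated at compile time, so builds without static
    // AVX2 support contain no trace of this block at all.
    #[cfg(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "avx2"
    ))]
    {
        // Safe: a binary built with AVX2 statically enabled can only run
        // correctly on CPUs that support AVX2 in the first place.
        return unsafe { add_quickly_avx2(a, b, c) };
    }

    add_quickly_fallback(a, b, c)
}
```

The tradeoff is that such a binary won't run on older CPUs at all, whereas the
`is_x86_feature_detected!` version adapts at runtime.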

Next, let's take a look at an example of using intrinsics manually. Here we'll
use SSE4.1 features to implement hex encoding.

```rust
fn main() {
    let mut dst = [0; 32];
    hex_encode(b"\x01\x02\x03", &mut dst);
    assert_eq!(&dst[..6], b"010203");

    let mut src = [0; 16];
    for i in 0..16 {
        src[i] = (i + 1) as u8;
    }
    hex_encode(&src, &mut dst);
    assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
}

pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
    let len = src.len().checked_mul(2).unwrap();
    assert!(dst.len() >= len);

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("sse4.1") {
            return unsafe { hex_encode_sse41(src, dst) };
        }
    }

    hex_encode_fallback(src, dst)
}

// translated from
// <https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp>
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    unsafe {
        let ascii_zero = _mm_set1_epi8(b'0' as i8);
        let nines = _mm_set1_epi8(9);
        let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
        let and4bits = _mm_set1_epi8(0xf);

        let mut i = 0_isize;
        while src.len() >= 16 {
            let invec = _mm_loadu_si128(src.as_ptr() as *const _);

            let masked1 = _mm_and_si128(invec, and4bits);
            // `_mm_srli_epi64` shifts whole 64-bit lanes, which smears bits
            // across byte boundaries, but the `and4bits` mask keeps only each
            // byte's low 4 bits, yielding each byte's high nibble
            let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

            // 0xff for the elements greater than 9, 0x00 otherwise
            let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
            let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

            // add '0' or the offset depending on the masks
            let masked1 = _mm_add_epi8(
                masked1,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
            );
            let masked2 = _mm_add_epi8(
                masked2,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
            );

            // interleave masked1 and masked2 bytes
            let res1 = _mm_unpacklo_epi8(masked2, masked1);
            let res2 = _mm_unpackhi_epi8(masked2, masked1);

            _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
            _mm_storeu_si128(
                dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
                res2,
            );
            src = &src[16..];
            i += 16;
        }

        // hex-encode any remaining (< 16) bytes with the scalar fallback
        let i = i as usize;
        hex_encode_fallback(src, &mut dst[i * 2..]);
    }
}

fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
        TABLE[byte as usize]
    }

    for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
        slots[0] = hex((*byte >> 4) & 0xf);
        slots[1] = hex(*byte & 0xf);
    }
}
```
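
Because the SSE4.1 path and the scalar fallback must agree byte-for-byte, a
cheap sanity check is to compare the two over lengths that do and don't fill
whole 16-byte chunks. Here's a minimal sketch of such a test (the test name is
our own, and on CPUs without SSE4.1 the comparison trivially pits the fallback
against itself):

```rust
#[test]
fn hex_encode_matches_fallback() {
    // lengths 0..64 cover the empty case, partial chunks, and several
    // full 16-byte SIMD iterations followed by a scalar tail
    for len in 0..64usize {
        let src: Vec<u8> = (0..len).map(|i| i as u8).collect();
        let mut simd = vec![0; len * 2];
        let mut scalar = vec![0; len * 2];
        hex_encode(&src, &mut simd); // dispatches to SSE4.1 if available
        hex_encode_fallback(&src, &mut scalar); // always scalar
        assert_eq!(simd, scalar, "mismatch at len {len}");
    }
}
```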