Skip to content

Commit 063f7d7

Browse files
committed
bootstrap: add an initial stdarch test step
1 parent 5b61449 commit 063f7d7

19 files changed

Lines changed: 871 additions & 101 deletions

library/std/src/lib.rs

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,11 @@ pub mod task {
652652
pub use core::task::*;
653653
}
654654

655-
#[doc = include_str!("../../stdarch/crates/core_arch/src/core_arch_docs.md")]
655+
#[doc = concat!(
656+
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_prefix.md"),
657+
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_other_architectures_std.md"),
658+
include_str!("../../stdarch/crates/core_arch/src/core_arch_docs_examples.md"),
659+
)]
656660
#[stable(feature = "simd_arch", since = "1.27.0")]
657661
pub mod arch {
658662
#[stable(feature = "simd_arch", since = "1.27.0")]

library/stdarch/Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,11 @@ incremental = true
1818
debug = 1
1919
opt-level = 3
2020
incremental = true
21+
22+
# The "dist" profile is used by bootstrap when building stdarch docs as part
23+
# of the distribution pipeline. Keep it aligned with the library workspace so
24+
# `cargo doc --profile=dist` works when stdarch is built as its own workspace.
25+
[profile.dist]
26+
inherits = "release"
27+
codegen-units = 1
28+
debug = 1

library/stdarch/crates/core_arch/src/core_arch_docs.md

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -199,23 +199,23 @@ others at:
199199
* [`loongarch64`]
200200
* [`s390x`]
201201

202-
[`x86`]: ../../core/arch/x86/index.html
203-
[`x86_64`]: ../../core/arch/x86_64/index.html
204-
[`arm`]: ../../core/arch/arm/index.html
205-
[`aarch64`]: ../../core/arch/aarch64/index.html
206-
[`amdgpu`]: ../../core/arch/amdgpu/index.html
207-
[`hexagon`]: ../../core/arch/hexagon/index.html
208-
[`riscv32`]: ../../core/arch/riscv32/index.html
209-
[`riscv64`]: ../../core/arch/riscv64/index.html
210-
[`mips`]: ../../core/arch/mips/index.html
211-
[`mips64`]: ../../core/arch/mips64/index.html
212-
[`powerpc`]: ../../core/arch/powerpc/index.html
213-
[`powerpc64`]: ../../core/arch/powerpc64/index.html
214-
[`nvptx`]: ../../core/arch/nvptx/index.html
215-
[`wasm32`]: ../../core/arch/wasm32/index.html
216-
[`loongarch32`]: ../../core/arch/loongarch32/index.html
217-
[`loongarch64`]: ../../core/arch/loongarch64/index.html
218-
[`s390x`]: ../../core/arch/s390x/index.html
202+
[`x86`]: crate::arch::x86
203+
[`x86_64`]: crate::arch::x86_64
204+
[`arm`]: crate::arch::arm
205+
[`aarch64`]: crate::arch::aarch64
206+
[`amdgpu`]: crate::arch::amdgpu
207+
[`hexagon`]: crate::arch::hexagon
208+
[`riscv32`]: crate::arch::riscv32
209+
[`riscv64`]: crate::arch::riscv64
210+
[`mips`]: crate::arch::mips
211+
[`mips64`]: crate::arch::mips64
212+
[`powerpc`]: crate::arch::powerpc
213+
[`powerpc64`]: crate::arch::powerpc64
214+
[`nvptx`]: crate::arch::nvptx
215+
[`wasm32`]: crate::arch::wasm32
216+
[`loongarch32`]: crate::arch::loongarch32
217+
[`loongarch64`]: crate::arch::loongarch64
218+
[`s390x`]: crate::arch::s390x
219219

220220
# Examples
221221

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Examples
2+
3+
First, let's take a look at not actually using any intrinsics but instead
4+
using LLVM's auto-vectorization to produce optimized vectorized code for
5+
AVX2 and also for the default platform.
6+
7+
```rust
8+
fn main() {
9+
let mut dst = [0];
10+
add_quickly(&[1], &[2], &mut dst);
11+
assert_eq!(dst[0], 3);
12+
}
13+
14+
fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
15+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
16+
{
17+
// Note that this `unsafe` block is safe because we're testing
18+
// that the `avx2` feature is indeed available on our CPU.
19+
if is_x86_feature_detected!("avx2") {
20+
return unsafe { add_quickly_avx2(a, b, c) };
21+
}
22+
}
23+
24+
add_quickly_fallback(a, b, c)
25+
}
26+
27+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
28+
#[target_feature(enable = "avx2")]
29+
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
30+
add_quickly_fallback(a, b, c) // the function below is inlined here
31+
}
32+
33+
fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
34+
for ((a, b), c) in a.iter().zip(b).zip(c) {
35+
*c = *a + *b;
36+
}
37+
}
38+
```
39+
40+
Next up, let's take a look at an example of manually using intrinsics. Here
41+
we'll be using SSE4.1 features to implement hex encoding.
42+
43+
```
44+
fn main() {
45+
let mut dst = [0; 32];
46+
hex_encode(b"\x01\x02\x03", &mut dst);
47+
assert_eq!(&dst[..6], b"010203");
48+
49+
let mut src = [0; 16];
50+
for i in 0..16 {
51+
src[i] = (i + 1) as u8;
52+
}
53+
hex_encode(&src, &mut dst);
54+
assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
55+
}
56+
57+
pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
58+
let len = src.len().checked_mul(2).unwrap();
59+
assert!(dst.len() >= len);
60+
61+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
62+
{
63+
if is_x86_feature_detected!("sse4.1") {
64+
return unsafe { hex_encode_sse41(src, dst) };
65+
}
66+
}
67+
68+
hex_encode_fallback(src, dst)
69+
}
70+
71+
// translated from
72+
// <https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp>
73+
#[target_feature(enable = "sse4.1")]
74+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
75+
unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
76+
#[cfg(target_arch = "x86")]
77+
use std::arch::x86::*;
78+
#[cfg(target_arch = "x86_64")]
79+
use std::arch::x86_64::*;
80+
81+
unsafe {
82+
let ascii_zero = _mm_set1_epi8(b'0' as i8);
83+
let nines = _mm_set1_epi8(9);
84+
let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
85+
let and4bits = _mm_set1_epi8(0xf);
86+
87+
let mut i = 0_isize;
88+
while src.len() >= 16 {
89+
let invec = _mm_loadu_si128(src.as_ptr() as *const _);
90+
91+
let masked1 = _mm_and_si128(invec, and4bits);
92+
let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);
93+
94+
// return 0xff corresponding to the elements > 9, or 0x00 otherwise
95+
let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
96+
let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);
97+
98+
// add '0' or the offset depending on the masks
99+
let masked1 = _mm_add_epi8(
100+
masked1,
101+
_mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
102+
);
103+
let masked2 = _mm_add_epi8(
104+
masked2,
105+
_mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
106+
);
107+
108+
// interleave masked1 and masked2 bytes
109+
let res1 = _mm_unpacklo_epi8(masked2, masked1);
110+
let res2 = _mm_unpackhi_epi8(masked2, masked1);
111+
112+
_mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
113+
_mm_storeu_si128(
114+
dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
115+
res2,
116+
);
117+
src = &src[16..];
118+
i += 16;
119+
}
120+
121+
let i = i as usize;
122+
hex_encode_fallback(src, &mut dst[i * 2..]);
123+
}
124+
}
125+
126+
fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
127+
fn hex(byte: u8) -> u8 {
128+
static TABLE: &[u8] = b"0123456789abcdef";
129+
TABLE[byte as usize]
130+
}
131+
132+
for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
133+
slots[0] = hex((*byte >> 4) & 0xf);
134+
slots[1] = hex(*byte & 0xf);
135+
}
136+
}
137+
```
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Other architectures
2+
3+
This documentation is only for one particular architecture; you can find
4+
others at:
5+
6+
* [`x86`]
7+
* [`x86_64`]
8+
* [`arm`]
9+
* [`aarch64`]
10+
* [`amdgpu`]
11+
* [`hexagon`]
12+
* [`riscv32`]
13+
* [`riscv64`]
14+
* [`mips`]
15+
* [`mips64`]
16+
* [`powerpc`]
17+
* [`powerpc64`]
18+
* [`nvptx`]
19+
* [`wasm32`]
20+
* [`loongarch32`]
21+
* [`loongarch64`]
22+
* [`s390x`]
23+
24+
[`x86`]: core::arch::x86
25+
[`x86_64`]: core::arch::x86_64
26+
[`arm`]: core::arch::arm
27+
[`aarch64`]: core::arch::aarch64
28+
[`amdgpu`]: core::arch::amdgpu
29+
[`hexagon`]: core::arch::hexagon
30+
[`riscv32`]: core::arch::riscv32
31+
[`riscv64`]: core::arch::riscv64
32+
[`mips`]: core::arch::mips
33+
[`mips64`]: core::arch::mips64
34+
[`powerpc`]: core::arch::powerpc
35+
[`powerpc64`]: core::arch::powerpc64
36+
[`nvptx`]: core::arch::nvptx
37+
[`wasm32`]: core::arch::wasm32
38+
[`loongarch32`]: core::arch::loongarch32
39+
[`loongarch64`]: core::arch::loongarch64
40+
[`s390x`]: core::arch::s390x
41+

0 commit comments

Comments
 (0)