# Examples

First, let's take a look at an example that doesn't actually use any
intrinsics but instead relies on LLVM's auto-vectorization to produce
optimized vectorized code for AVX2 as well as for the default platform.

```rust
fn main() {
    let mut dst = [0];
    add_quickly(&[1], &[2], &mut dst);
    assert_eq!(dst[0], 3);
}

fn add_quickly(a: &[u8], b: &[u8], c: &mut [u8]) {
    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        // Note that this `unsafe` block is safe because we're testing
        // that the `avx2` feature is indeed available on our CPU.
        if is_x86_feature_detected!("avx2") {
            return unsafe { add_quickly_avx2(a, b, c) };
        }
    }

    add_quickly_fallback(a, b, c)
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn add_quickly_avx2(a: &[u8], b: &[u8], c: &mut [u8]) {
    add_quickly_fallback(a, b, c) // the function below is inlined here
}

fn add_quickly_fallback(a: &[u8], b: &[u8], c: &mut [u8]) {
    for ((a, b), c) in a.iter().zip(b).zip(c) {
        *c = *a + *b;
    }
}
```
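If you know at build time that AVX2 will be available, for example because the
crate is compiled with `RUSTFLAGS="-C target-feature=+avx2"`, the runtime check
can be replaced with compile-time dispatch. Here's a minimal sketch of that
variant, reusing the functions above (the `add_quickly_static` name is our
own):

```rust
#[allow(unreachable_code)] // the fallback call is unreachable in AVX2 builds
fn add_quickly_static(a: &[u8], b: &[u8], c: &mut [u8]) {
    // This `cfg` is evaluated at compile time, so builds without static
    // AVX2 support contain no trace of this block at all.
    #[cfg(all(
        any(target_arch = "x86", target_arch = "x86_64"),
        target_feature = "avx2"
    ))]
    {
        // Safe: a binary built with AVX2 statically enabled can only run
        // correctly on CPUs that support AVX2 in the first place.
        return unsafe { add_quickly_avx2(a, b, c) };
    }

    add_quickly_fallback(a, b, c)
}
```

The tradeoff is that such a binary won't run on older CPUs at all, whereas the
`is_x86_feature_detected!` version adapts at runtime.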

Next, let's take a look at an example of using intrinsics manually. Here we'll
use SSE4.1 features to implement hex encoding.

```rust
fn main() {
    let mut dst = [0; 32];
    hex_encode(b"\x01\x02\x03", &mut dst);
    assert_eq!(&dst[..6], b"010203");

    let mut src = [0; 16];
    for i in 0..16 {
        src[i] = (i + 1) as u8;
    }
    hex_encode(&src, &mut dst);
    assert_eq!(&dst, b"0102030405060708090a0b0c0d0e0f10");
}

pub fn hex_encode(src: &[u8], dst: &mut [u8]) {
    let len = src.len().checked_mul(2).unwrap();
    assert!(dst.len() >= len);

    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
    {
        if is_x86_feature_detected!("sse4.1") {
            return unsafe { hex_encode_sse41(src, dst) };
        }
    }

    hex_encode_fallback(src, dst)
}

// translated from
// <https://github.com/Matherunner/bin2hex-sse/blob/master/base16_sse4.cpp>
#[target_feature(enable = "sse4.1")]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
unsafe fn hex_encode_sse41(mut src: &[u8], dst: &mut [u8]) {
    #[cfg(target_arch = "x86")]
    use std::arch::x86::*;
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;

    unsafe {
        let ascii_zero = _mm_set1_epi8(b'0' as i8);
        let nines = _mm_set1_epi8(9);
        let ascii_a = _mm_set1_epi8((b'a' - 9 - 1) as i8);
        let and4bits = _mm_set1_epi8(0xf);

        let mut i = 0_isize;
        while src.len() >= 16 {
            let invec = _mm_loadu_si128(src.as_ptr() as *const _);

            let masked1 = _mm_and_si128(invec, and4bits);
            // `_mm_srli_epi64` shifts whole 64-bit lanes, which smears bits
            // across byte boundaries, but the `and4bits` mask keeps only each
            // byte's low 4 bits, yielding each byte's high nibble
            let masked2 = _mm_and_si128(_mm_srli_epi64(invec, 4), and4bits);

            // 0xff for the elements greater than 9, 0x00 otherwise
            let cmpmask1 = _mm_cmpgt_epi8(masked1, nines);
            let cmpmask2 = _mm_cmpgt_epi8(masked2, nines);

            // add '0' or the offset depending on the masks
            let masked1 = _mm_add_epi8(
                masked1,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask1),
            );
            let masked2 = _mm_add_epi8(
                masked2,
                _mm_blendv_epi8(ascii_zero, ascii_a, cmpmask2),
            );

            // interleave masked1 and masked2 bytes
            let res1 = _mm_unpacklo_epi8(masked2, masked1);
            let res2 = _mm_unpackhi_epi8(masked2, masked1);

            _mm_storeu_si128(dst.as_mut_ptr().offset(i * 2) as *mut _, res1);
            _mm_storeu_si128(
                dst.as_mut_ptr().offset(i * 2 + 16) as *mut _,
                res2,
            );
            src = &src[16..];
            i += 16;
        }

        // hex-encode any remaining (< 16) bytes with the scalar fallback
        let i = i as usize;
        hex_encode_fallback(src, &mut dst[i * 2..]);
    }
}

fn hex_encode_fallback(src: &[u8], dst: &mut [u8]) {
    fn hex(byte: u8) -> u8 {
        static TABLE: &[u8] = b"0123456789abcdef";
        TABLE[byte as usize]
    }

    for (byte, slots) in src.iter().zip(dst.chunks_mut(2)) {
        slots[0] = hex((*byte >> 4) & 0xf);
        slots[1] = hex(*byte & 0xf);
    }
}
```
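
Because the SSE4.1 path and the scalar fallback must agree byte-for-byte, a
cheap sanity check is to compare the two over lengths that do and don't fill
whole 16-byte chunks. Here's a minimal sketch of such a test (the test name is
our own, and on CPUs without SSE4.1 the comparison trivially pits the fallback
against itself):

```rust
#[test]
fn hex_encode_matches_fallback() {
    // lengths 0..64 cover the empty case, partial chunks, and several
    // full 16-byte SIMD iterations followed by a scalar tail
    for len in 0..64usize {
        let src: Vec<u8> = (0..len).map(|i| i as u8).collect();
        let mut simd = vec![0; len * 2];
        let mut scalar = vec![0; len * 2];
        hex_encode(&src, &mut simd); // dispatches to SSE4.1 if available
        hex_encode_fallback(&src, &mut scalar); // always scalar
        assert_eq!(simd, scalar, "mismatch at len {len}");
    }
}
```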