diff --git a/ci/run.sh b/ci/run.sh
index 3c9dc024..9d163233 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -12,12 +12,16 @@ else
     $run --release
     $run --features c
     $run --features c --release
+    $run --features asm
+    $run --features asm --release
 fi
 
 cargo build --target $1
 cargo build --target $1 --release
 cargo build --target $1 --features c
 cargo build --target $1 --release --features c
+cargo build --target $1 --features asm
+cargo build --target $1 --release --features asm
 
 PREFIX=$(echo $1 | sed -e 's/unknown-//')-
 case $1 in
diff --git a/src/mem/memcpy.rs b/src/mem/memcpy.rs
new file mode 100644
index 00000000..8fada9bc
--- /dev/null
+++ b/src/mem/memcpy.rs
@@ -0,0 +1,41 @@
+use super::c_int;
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    let mut i = 0;
+    while i < n {
+        *dest.offset(i as isize) = *src.offset(i as isize);
+        i += 1;
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
+    if src < dest as *const u8 {
+        // copy from end
+        let mut i = n;
+        while i != 0 {
+            i -= 1;
+            *dest.offset(i as isize) = *src.offset(i as isize);
+        }
+    } else {
+        // copy from beginning
+        let mut i = 0;
+        while i < n {
+            *dest.offset(i as isize) = *src.offset(i as isize);
+            i += 1;
+        }
+    }
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
+    let mut i = 0;
+    while i < n {
+        *s.offset(i as isize) = c as u8;
+        i += 1;
+    }
+    s
+}
diff --git a/src/mem.rs b/src/mem/mod.rs
similarity index 84%
rename from src/mem.rs
rename to src/mem/mod.rs
index 24552ed8..aa9d4b61 100644
--- a/src/mem.rs
+++ b/src/mem/mod.rs
@@ -9,45 +9,10 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
 use core::mem;
 use core::ops::{BitOr, Shl};
 
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *dest.offset(i as isize) = *src.offset(i as isize);
-        i += 1;
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
-    if src < dest as *const u8 {
-        // copy from end
-        let mut i = n;
-        while i != 0 {
-            i -= 1;
-            *dest.offset(i as isize) = *src.offset(i as isize);
-        }
-    } else {
-        // copy from beginning
-        let mut i = 0;
-        while i < n {
-            *dest.offset(i as isize) = *src.offset(i as isize);
-            i += 1;
-        }
-    }
-    dest
-}
-
-#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
-pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
-    let mut i = 0;
-    while i < n {
-        *s.offset(i as isize) = c as u8;
-        i += 1;
-    }
-    s
-}
+// memcpy/memmove/memset have optimized implementations on some architectures
+#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
+mod memcpy;
+pub use self::memcpy::*;
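+// Illustration only: an optimized backend for another target could be hooked
+// up the same way with a second `path` attribute on the module above, e.g. a
+// hypothetical `aarch64.rs` selected by
+// `#[cfg_attr(all(feature = "asm", target_arch = "aarch64"), path = "aarch64.rs")]`.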
 
 #[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
 pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
diff --git a/src/mem/x86_64.rs b/src/mem/x86_64.rs
new file mode 100644
index 00000000..1ecffce4
--- /dev/null
+++ b/src/mem/x86_64.rs
@@ -0,0 +1,79 @@
+use super::c_int;
+
+// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
+// been enhanced to perform better than a simple qword loop, making them ideal
+// for implementing memcpy/memset. Note that "rep cmps" has received no such
+// enhancement, so it is not used to implement memcmp.
+//
+// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
+// further enhanced to automatically select the best microarchitectural
+// implementation based on length and alignment. See the following features from
+// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
+//  - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
+//  - FSRM - Fast Short REP MOV (Ice Lake and later)
+//  - Fast Zero-Length MOVSB (On no current hardware)
+//  - Fast Short STOSB (On no current hardware)
+// However, to avoid run-time feature detection, we don't use these byte-based
+// instructions for most of the copying, preferring the qword variants.
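+//
+// As a rough illustration of the split used by the functions below: for
+// `count = 20`, `qword_count = 20 >> 3 = 2`, so the qword-sized "rep"
+// instruction moves 16 bytes, and `byte_count = 20 & 0b111 = 4`, so the
+// byte-sized "rep" instruction picks up the remaining 4 bytes.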
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "rep movsq [rdi], [rsi]",
+        "mov ecx, {byte_count:e}",
+        "rep movsb [rdi], [rsi]",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => _,
+        inout("rsi") src => _,
+        options(nostack, preserves_flags)
+    );
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
+    let delta = (dest as usize).wrapping_sub(src as usize);
+    if delta >= count {
+        // We can copy forwards because either dest is far enough ahead of src,
+        // or src is ahead of dest (and delta overflowed).
+        return self::memcpy(dest, src, count);
+    }
+    // copy backwards
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "std",
+        "rep movsq [rdi], [rsi]",
+        "mov ecx, {byte_count:e}",
+        "add rdi, 7",
+        "add rsi, 7",
+        "rep movsb [rdi], [rsi]",
+        "cld",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _,
+        inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
+        options(nostack)
+    );
+    dest
+}
+
+#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
+pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
+    let qword_count = count >> 3;
+    let byte_count = count & 0b111;
+    asm!(
+        "rep stosq [rdi], rax",
+        "mov ecx, {byte_count:e}",
+        "rep stosb [rdi], al",
+        byte_count = in(reg) byte_count,
+        inout("rcx") qword_count => _,
+        inout("rdi") dest => _,
+        in("rax") (c as u8 as u64) * 0x0101010101010101,
+        options(nostack, preserves_flags)
+    );
+    dest
+}
diff --git a/testcrate/benches/mem.rs b/testcrate/benches/mem.rs
new file mode 100644
index 00000000..57d57508
--- /dev/null
+++ b/testcrate/benches/mem.rs
@@ -0,0 +1,162 @@
+#![feature(test)]
+
+extern crate test;
+use test::{black_box, Bencher};
+
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+fn memcpy_builtin(b: &mut Bencher, n: usize) {
+    let v1 = vec![1u8; n];
+    let mut v2 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1);
+        let dst: &mut [u8] = black_box(&mut v2);
+        dst.copy_from_slice(src);
+    })
+}
+
+fn memcpy_rust(b: &mut Bencher, n: usize) {
+    let v1 = vec![1u8; n];
+    let mut v2 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let src: &[u8] = black_box(&v1);
+        let dst: &mut [u8] = black_box(&mut v2);
+        unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
+    })
+}
+
+fn memset_builtin(b: &mut Bencher, n: usize) {
+    let mut v1 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1);
+        let val: u8 = black_box(27);
+        for b in dst {
+            *b = val;
+        }
+    })
+}
+
+fn memset_rust(b: &mut Bencher, n: usize) {
+    let mut v1 = vec![0u8; n];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: &mut [u8] = black_box(&mut v1);
+        let val = black_box(27);
+        unsafe { memset(dst.as_mut_ptr(), val, n) }
+    })
+}
+
+fn memcmp_builtin(b: &mut Bencher, n: usize) {
+    let v1 = vec![0u8; n];
+    let mut v2 = vec![0u8; n];
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        s1.cmp(s2)
+    })
+}
+
+fn memcmp_rust(b: &mut Bencher, n: usize) {
+    let v1 = vec![0u8; n];
+    let mut v2 = vec![0u8; n];
+    v2[n - 1] = 1;
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s1: &[u8] = black_box(&v1);
+        let s2: &[u8] = black_box(&v2);
+        unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
+    })
+}
+
+fn memmove_builtin(b: &mut Bencher, n: usize) {
+    let mut v = vec![0u8; n + n / 2];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let s: &mut [u8] = black_box(&mut v);
+        s.copy_within(0..n, n / 2);
+    })
+}
+
+fn memmove_rust(b: &mut Bencher, n: usize) {
+    let mut v = vec![0u8; n + n / 2];
+    b.bytes = n as u64;
+    b.iter(|| {
+        let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr();
+        let src: *const u8 = black_box(&v).as_ptr();
+        unsafe { memmove(dst, src, n) };
+    })
+}
+
+#[bench]
+fn memcpy_builtin_4096(b: &mut Bencher) {
+    memcpy_builtin(b, 4096)
+}
+#[bench]
+fn memcpy_rust_4096(b: &mut Bencher) {
+    memcpy_rust(b, 4096)
+}
+#[bench]
+fn memcpy_builtin_1048576(b: &mut Bencher) {
+    memcpy_builtin(b, 1048576)
+}
+#[bench]
+fn memcpy_rust_1048576(b: &mut Bencher) {
+    memcpy_rust(b, 1048576)
+}
+
+#[bench]
+fn memset_builtin_4096(b: &mut Bencher) {
+    memset_builtin(b, 4096)
+}
+#[bench]
+fn memset_rust_4096(b: &mut Bencher) {
+    memset_rust(b, 4096)
+}
+#[bench]
+fn memset_builtin_1048576(b: &mut Bencher) {
+    memset_builtin(b, 1048576)
+}
+#[bench]
+fn memset_rust_1048576(b: &mut Bencher) {
+    memset_rust(b, 1048576)
+}
+
+#[bench]
+fn memcmp_builtin_4096(b: &mut Bencher) {
+    memcmp_builtin(b, 4096)
+}
+#[bench]
+fn memcmp_rust_4096(b: &mut Bencher) {
+    memcmp_rust(b, 4096)
+}
+#[bench]
+fn memcmp_builtin_1048576(b: &mut Bencher) {
+    memcmp_builtin(b, 1048576)
+}
+#[bench]
+fn memcmp_rust_1048576(b: &mut Bencher) {
+    memcmp_rust(b, 1048576)
+}
+
+#[bench]
+fn memmove_builtin_4096(b: &mut Bencher) {
+    memmove_builtin(b, 4096)
+}
+#[bench]
+fn memmove_rust_4096(b: &mut Bencher) {
+    memmove_rust(b, 4096)
+}
+#[bench]
+fn memmove_builtin_1048576(b: &mut Bencher) {
+    memmove_builtin(b, 1048576)
+}
+#[bench]
+fn memmove_rust_1048576(b: &mut Bencher) {
+    memmove_rust(b, 1048576)
+}
diff --git a/testcrate/tests/mem.rs b/testcrate/tests/mem.rs
new file mode 100644
index 00000000..a5596b28
--- /dev/null
+++ b/testcrate/tests/mem.rs
@@ -0,0 +1,133 @@
+extern crate compiler_builtins;
+use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};
+
+#[test]
+fn memcpy_3() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(9);
+        let dst = arr.as_mut_ptr().offset(1);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 9, 10, 11, 4, 5, 6, 7, 8, 9, 10, 11]);
+    }
+    arr = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        let dst = arr.as_mut_ptr().offset(9);
+        assert_eq!(memcpy(dst, src, 3), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3]);
+    }
+}
+
+#[test]
+fn memcpy_10() {
+    let arr: [u8; 18] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+    let mut dst: [u8; 12] = [0; 12];
+    unsafe {
+        let src = arr.as_ptr().offset(1);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 0]);
+    }
+    unsafe {
+        let src = arr.as_ptr().offset(8);
+        assert_eq!(memcpy(dst.as_mut_ptr(), src, 10), dst.as_mut_ptr());
+        assert_eq!(dst, [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 0, 0]);
+    }
+}
+
+#[test]
+fn memcpy_big() {
+    // Make the arrays cross 3 pages
+    const SIZE: usize = 8193;
+    let src: [u8; SIZE] = [22; SIZE];
+    struct Dst {
+        start: usize,
+        buf: [u8; SIZE],
+        end: usize,
+    }
+
+    let mut dst = Dst {
+        start: 0,
+        buf: [0; SIZE],
+        end: 0,
+    };
+    unsafe {
+        assert_eq!(
+            memcpy(dst.buf.as_mut_ptr(), src.as_ptr(), SIZE),
+            dst.buf.as_mut_ptr()
+        );
+        assert_eq!(dst.start, 0);
+        assert_eq!(dst.buf, [22; SIZE]);
+        assert_eq!(dst.end, 0);
+    }
+}
+
+#[test]
+fn memmove_forward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(6);
+        let dst = arr.as_mut_ptr().offset(3);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 6, 7, 8, 9, 10, 8, 9, 10, 11]);
+    }
+}
+
+#[test]
+fn memmove_backward() {
+    let mut arr: [u8; 12] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11];
+    unsafe {
+        let src = arr.as_ptr().offset(3);
+        let dst = arr.as_mut_ptr().offset(6);
+        assert_eq!(memmove(dst, src, 5), dst);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 5, 3, 4, 5, 6, 7, 11]);
+    }
+}
+
+#[test]
+fn memset_zero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(5);
+        assert_eq!(memset(ptr, 0, 2), ptr);
+        assert_eq!(arr, [0, 1, 2, 3, 4, 0, 0, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2000, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [0, 0, 0, 0, 0, 0, 0, 0]);
+    }
+}
+
+#[test]
+fn memset_nonzero() {
+    let mut arr: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        let ptr = arr.as_mut_ptr().offset(2);
+        assert_eq!(memset(ptr, 22, 3), ptr);
+        assert_eq!(arr, [0, 1, 22, 22, 22, 5, 6, 7]);
+
+        // Only the LSB matters for a memset
+        assert_eq!(memset(arr.as_mut_ptr(), 0x2009, 8), arr.as_mut_ptr());
+        assert_eq!(arr, [9, 9, 9, 9, 9, 9, 9, 9]);
+    }
+}
+
+#[test]
+fn memcmp_eq() {
+    let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    unsafe {
+        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8), 0);
+        assert_eq!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 3), 0);
+    }
+}
+
+#[test]
+fn memcmp_ne() {
+    let arr1: [u8; 8] = [0, 1, 2, 3, 4, 5, 6, 7];
+    let arr2: [u8; 8] = [0, 1, 2, 3, 4, 5, 7, 7];
+    unsafe {
+        assert!(memcmp(arr1.as_ptr(), arr2.as_ptr(), 8) < 0);
+        assert!(memcmp(arr2.as_ptr(), arr1.as_ptr(), 8) > 0);
+    }
+}