Use REP MOVSQ/STOSQ on x86_64 #365

Merged (10 commits) on Oct 24, 2020
ci/run.sh: 4 additions, 0 deletions
@@ -12,12 +12,16 @@ else
$run --release
$run --features c
$run --features c --release
$run --features asm
$run --features asm --release
fi

cargo build --target $1
cargo build --target $1 --release
cargo build --target $1 --features c
cargo build --target $1 --release --features c
cargo build --target $1 --features asm
cargo build --target $1 --release --features asm

PREFIX=$(echo $1 | sed -e 's/unknown-//')-
case $1 in
src/mem/memcpy.rs: 41 additions, 0 deletions
@@ -0,0 +1,41 @@
use super::c_int;

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
if src < dest as *const u8 {
// copy from end
let mut i = n;
while i != 0 {
i -= 1;
*dest.offset(i as isize) = *src.offset(i as isize);
}
} else {
// copy from beginning
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
let mut i = 0;
while i < n {
*s.offset(i as isize) = c as u8;
i += 1;
}
s
}
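
The fallback memmove above chooses its copy direction so that overlapping buffers are handled correctly: when the destination starts inside the source range, a front-to-back loop would overwrite source bytes before they are read, so the copy runs from the end instead. A minimal safe-Rust illustration of that point (not part of this PR; shift_right_by_one is a hypothetical helper):

// Illustrative only: why an overlapping copy must run back-to-front
// when the destination region starts inside the source region.
fn shift_right_by_one(buf: &mut [u8], n: usize) {
    // Copies buf[0..n] into buf[1..=n]; the ranges overlap and dest > src.
    for i in (0..n).rev() {
        buf[i + 1] = buf[i]; // back-to-front, like memmove's "copy from end"
    }
}

fn main() {
    let mut buf = [1u8, 2, 3, 4, 0];
    shift_right_by_one(&mut buf, 4);
    assert_eq!(buf, [1, 1, 2, 3, 4]);
    // A front-to-back loop would instead smear buf[0] over the whole range,
    // producing [1, 1, 1, 1, 1].
}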
src/mem.rs → src/mem/mod.rs: 4 additions, 39 deletions
@@ -9,45 +9,10 @@ use core::intrinsics::{atomic_load_unordered, atomic_store_unordered, exact_div}
use core::mem;
use core::ops::{BitOr, Shl};

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, n: usize) -> *mut u8 {
if src < dest as *const u8 {
// copy from end
let mut i = n;
while i != 0 {
i -= 1;
*dest.offset(i as isize) = *src.offset(i as isize);
}
} else {
// copy from beginning
let mut i = 0;
while i < n {
*dest.offset(i as isize) = *src.offset(i as isize);
i += 1;
}
}
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(s: *mut u8, c: c_int, n: usize) -> *mut u8 {
let mut i = 0;
while i < n {
*s.offset(i as isize) = c as u8;
i += 1;
}
s
}
// memcpy/memmove/memset have optimized implementations on some architectures
#[cfg_attr(all(feature = "asm", target_arch = "x86_64"), path = "x86_64.rs")]
mod memcpy;
pub use self::memcpy::*;

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
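
The key mechanism in the new mod.rs is Rust's #[path] attribute driven by cfg_attr: when both feature = "asm" and target_arch = "x86_64" hold, the memcpy module is compiled from x86_64.rs; otherwise the attribute does not apply and the module falls back to the plain memcpy.rs shown above. A generic sketch of the same pattern, using hypothetical file names fast.rs and portable.rs, purely for illustration:

// Illustrative only: compile-time selection of a module's source file.
// fast.rs and portable.rs are hypothetical; both must define the same items.
#[cfg_attr(target_arch = "x86_64", path = "fast.rs")]
#[cfg_attr(not(target_arch = "x86_64"), path = "portable.rs")]
mod imp;
pub use self::imp::*;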
src/mem/x86_64.rs: 79 additions, 0 deletions
@@ -0,0 +1,79 @@
use super::c_int;

// On most modern Intel and AMD processors, "rep movsq" and "rep stosq" have
// been enhanced to perform better than a simple qword loop, making them ideal
// for implementing memcpy/memset. Note that "rep cmps" has received no such
// enhancement, so it is not used to implement memcmp.
//
// On certain recent Intel processors, "rep movsb" and "rep stosb" have been
// further enhanced to automatically select the best microarchitectural
// implementation based on length and alignment. See the following features from
// the "Intel® 64 and IA-32 Architectures Optimization Reference Manual":
// - ERMSB - Enhanced REP MOVSB and STOSB (Ivy Bridge and later)
// - FSRM - Fast Short REP MOV (Ice Lake and later)
// - Fast Zero-Length MOVSB (On no current hardware)
// - Fast Short STOSB (On no current hardware)
// However, to avoid run-time feature detection, we don't use these byte-based
// instructions for most of the copying, preferring the qword variants.

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memcpy(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
"rep movsq [rdi], [rsi]",
"mov ecx, {byte_count:e}",
"rep movsb [rdi], [rsi]",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
inout("rsi") src => _,
options(nostack, preserves_flags)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memmove(dest: *mut u8, src: *const u8, count: usize) -> *mut u8 {
let delta = (dest as usize).wrapping_sub(src as usize);
if delta >= count {
// We can copy forwards because either dest is far enough ahead of src,
// or src is ahead of dest (and delta overflowed).
return self::memcpy(dest, src, count);
}
// copy backwards
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
"std",
"rep movsq [rdi], [rsi]",
"mov ecx, {byte_count:e}",
"add rdi, 7",
"add rsi, 7",
"rep movsb [rdi], [rsi]",
"cld",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest.offset(count as isize).wrapping_sub(8) => _,
inout("rsi") src.offset(count as isize).wrapping_sub(8) => _,
options(nostack)
);
dest
}

#[cfg_attr(all(feature = "mem", not(feature = "mangled-names")), no_mangle)]
pub unsafe extern "C" fn memset(dest: *mut u8, c: c_int, count: usize) -> *mut u8 {
let qword_count = count >> 3;
let byte_count = count & 0b111;
asm!(
"rep stosq [rdi], rax",
"mov ecx, {byte_count:e}",
"rep stosb [rdi], al",
byte_count = in(reg) byte_count,
inout("rcx") qword_count => _,
inout("rdi") dest => _,
in("rax") (c as u8 as u64) * 0x0101010101010101,
options(nostack, preserves_flags)
);
dest
}
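
Three small pieces of arithmetic carry most of the weight in x86_64.rs: the byte count is split into a qword count (count >> 3) plus a 0..=7 byte remainder (count & 0b111); memset broadcasts the fill byte into every lane of rax by multiplying by 0x0101010101010101; and memmove's single wrapping subtraction decides whether a forward copy is safe or the regions overlap with dest ahead of src. A plain-Rust sketch of those checks (illustrative only, not part of the diff):

// Illustrative only: the scalar arithmetic behind the asm! blocks above.
fn main() {
    // 1. Splitting a 100-byte operation: 12 qwords (96 bytes) + 4 trailing bytes.
    let count = 100usize;
    assert_eq!((count >> 3, count & 0b111), (12, 4));

    // 2. Broadcasting the fill byte for rep stosq: 0xAB becomes
    //    0xABABABABABABABAB, one copy per byte lane of rax.
    let c: u8 = 0xAB;
    assert_eq!((c as u64) * 0x0101_0101_0101_0101, 0xABAB_ABAB_ABAB_ABAB);

    // 3. memmove's direction test: a forward copy is safe unless dest lands
    //    strictly inside [src, src + count), i.e. the wrapped delta is < count.
    let (src, dest, count) = (1000usize, 1004usize, 16usize);
    let backward_copy_needed = dest.wrapping_sub(src) < count;
    assert!(backward_copy_needed); // dest is 4 bytes ahead of src: copy from the end
}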
testcrate/benches/mem.rs: 162 additions, 0 deletions
@@ -0,0 +1,162 @@
#![feature(test)]

extern crate test;
use test::{black_box, Bencher};

extern crate compiler_builtins;
use compiler_builtins::mem::{memcmp, memcpy, memmove, memset};

fn memcpy_builtin(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
dst.copy_from_slice(src);
})
}

fn memcpy_rust(b: &mut Bencher, n: usize) {
let v1 = vec![1u8; n];
let mut v2 = vec![0u8; n];
b.bytes = n as u64;
b.iter(|| {
let src: &[u8] = black_box(&v1);
let dst: &mut [u8] = black_box(&mut v2);
unsafe { memcpy(dst.as_mut_ptr(), src.as_ptr(), n) }
})
}

fn memset_builtin(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let val: u8 = black_box(27);
for b in dst {
*b = val;
}
})
}

fn memset_rust(b: &mut Bencher, n: usize) {
let mut v1 = vec![0u8; n];
b.bytes = n as u64;
b.iter(|| {
let dst: &mut [u8] = black_box(&mut v1);
let val = black_box(27);
unsafe { memset(dst.as_mut_ptr(), val, n) }
})
}

fn memcmp_builtin(b: &mut Bencher, n: usize) {
let v1 = vec![0u8; n];
let mut v2 = vec![0u8; n];
v2[n - 1] = 1;
b.bytes = n as u64;
b.iter(|| {
let s1: &[u8] = black_box(&v1);
let s2: &[u8] = black_box(&v2);
s1.cmp(s2)
})
}

fn memcmp_rust(b: &mut Bencher, n: usize) {
let v1 = vec![0u8; n];
let mut v2 = vec![0u8; n];
v2[n - 1] = 1;
b.bytes = n as u64;
b.iter(|| {
let s1: &[u8] = black_box(&v1);
let s2: &[u8] = black_box(&v2);
unsafe { memcmp(s1.as_ptr(), s2.as_ptr(), n) }
})
}

fn memmove_builtin(b: &mut Bencher, n: usize) {
let mut v = vec![0u8; n + n / 2];
b.bytes = n as u64;
b.iter(|| {
let s: &mut [u8] = black_box(&mut v);
s.copy_within(0..n, n / 2);
})
}

fn memmove_rust(b: &mut Bencher, n: usize) {
let mut v = vec![0u8; n + n / 2];
b.bytes = n as u64;
b.iter(|| {
let dst: *mut u8 = black_box(&mut v[n / 2..]).as_mut_ptr();
let src: *const u8 = black_box(&v).as_ptr();
unsafe { memmove(dst, src, n) };
})
}

#[bench]
fn memcpy_builtin_4096(b: &mut Bencher) {
memcpy_builtin(b, 4096)
}
#[bench]
fn memcpy_rust_4096(b: &mut Bencher) {
memcpy_rust(b, 4096)
}
#[bench]
fn memcpy_builtin_1048576(b: &mut Bencher) {
memcpy_builtin(b, 1048576)
}
#[bench]
fn memcpy_rust_1048576(b: &mut Bencher) {
memcpy_rust(b, 1048576)
}

#[bench]
fn memset_builtin_4096(b: &mut Bencher) {
memset_builtin(b, 4096)
}
#[bench]
fn memset_rust_4096(b: &mut Bencher) {
memset_rust(b, 4096)
}
#[bench]
fn memset_builtin_1048576(b: &mut Bencher) {
memset_builtin(b, 1048576)
}
#[bench]
fn memset_rust_1048576(b: &mut Bencher) {
memset_rust(b, 1048576)
}

#[bench]
fn memcmp_builtin_4096(b: &mut Bencher) {
memcmp_builtin(b, 4096)
}
#[bench]
fn memcmp_rust_4096(b: &mut Bencher) {
memcmp_rust(b, 4096)
}
#[bench]
fn memcmp_builtin_1048576(b: &mut Bencher) {
memcmp_builtin(b, 1048576)
}
#[bench]
fn memcmp_rust_1048576(b: &mut Bencher) {
memcmp_rust(b, 1048576)
}

#[bench]
fn memmove_builtin_4096(b: &mut Bencher) {
memmove_builtin(b, 4096)
}
#[bench]
fn memmove_rust_4096(b: &mut Bencher) {
memmove_rust(b, 4096)
}
#[bench]
fn memmove_builtin_1048576(b: &mut Bencher) {
memmove_builtin(b, 1048576)
}
#[bench]
fn memmove_rust_1048576(b: &mut Bencher) {
memmove_rust(b, 1048576)
}
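
These benchmarks rely on the unstable test crate via #![feature(test)], so they require a nightly toolchain; assuming the standard Cargo layout implied by the testcrate/benches/mem.rs path, running something like cargo +nightly bench inside the testcrate package (with or without the asm feature, depending on which implementation is being measured) should report throughput for the builtin and Rust variants side by side.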