-
Notifications
You must be signed in to change notification settings - Fork 12.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add tests for the generated assembly of mask related simd instructions. #121953
Merged
bors
merged 1 commit into
rust-lang:master
from
jhorstmann:assembly-tests-for-masked-simd-instructions
Mar 12, 2024
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
//@ revisions: x86 x86-avx2 x86-avx512 aarch64 | ||
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86] needs-llvm-components: x86 | ||
//@ [x86-avx2] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86-avx2] compile-flags: -C target-feature=+avx2 | ||
//@ [x86-avx2] needs-llvm-components: x86 | ||
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq | ||
//@ [x86-avx512] needs-llvm-components: x86 | ||
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu | ||
//@ [aarch64] needs-llvm-components: aarch64 | ||
//@ [aarch64] min-llvm-version: 18.0 | ||
//@ assembly-output: emit-asm | ||
//@ compile-flags: --crate-type=lib -O | ||
|
||
#![feature(no_core, lang_items, repr_simd, intrinsics)] | ||
#![no_core] | ||
#![allow(non_camel_case_types)] | ||
|
||
// Because we don't have core yet (the crate is `#![no_core]`), the `sized` and `copy` | ||
// lang items have to be declared here by hand. | ||
#[lang = "sized"] | ||
pub trait Sized {} | ||
|
||
#[lang = "copy"] | ||
trait Copy {} | ||
|
||
// Mask vector types used by the tests below, named m<lane bits>x<lane count>; | ||
// every lane is a signed integer of that width. | ||
#[repr(simd)] | ||
pub struct m8x16([i8; 16]); | ||
|
||
#[repr(simd)] | ||
pub struct m8x64([i8; 64]); | ||
|
||
#[repr(simd)] | ||
pub struct m32x4([i32; 4]); | ||
|
||
#[repr(simd)] | ||
pub struct m64x2([i64; 2]); | ||
|
||
#[repr(simd)] | ||
pub struct m64x4([i64; 4]); | ||
|
||
// Declared locally because `core` is not available in this crate. | ||
extern "rust-intrinsic" { | ||
// Converts the mask vector `mask` into an integer bitmask of type `B`; | ||
// the tests below return u8, u16 or u64 depending on the lane count. | ||
fn simd_bitmask<V, B>(mask: V) -> B; | ||
} | ||
|
||
// Checks that a 16 x i8 mask is turned into a u16 bitmask by a single move-mask | ||
// instruction on x86 targets, without a preceding shift. | ||
// CHECK-LABEL: bitmask_m8x16 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn bitmask_m8x16(mask: m8x16) -> u16 { | ||
// The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary. | ||
// Note that x86 has no byte shift; where a shift is needed llvm uses a word shift (psllw) | ||
// to move the least significant bit of each byte into the right position, which is why | ||
// the negative checks below look for the word shift. | ||
// | ||
// x86-NOT: psllw | ||
// x86: pmovmskb eax, xmm0 | ||
// | ||
// x86-avx2-NOT: vpsllw | ||
// x86-avx2: vpmovmskb eax, xmm0 | ||
// | ||
// x86-avx512-NOT: vpsllw | ||
// x86-avx512: vpmovmskb eax, xmm0 | ||
// | ||
// aarch64: adrp | ||
// aarch64-NEXT: cmlt | ||
// aarch64-NEXT: ldr | ||
// aarch64-NEXT: and | ||
// aarch64-NEXT: ext | ||
// aarch64-NEXT: zip1 | ||
// aarch64-NEXT: addv | ||
// aarch64-NEXT: fmov | ||
simd_bitmask(mask) | ||
} | ||
|
||
// Checks that a 64 x i8 mask is converted to a u64 bitmask through a mask register, | ||
// without a preceding shift. Only the avx512 revision has checks for this function. | ||
// CHECK-LABEL: bitmask_m8x64 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn bitmask_m8x64(mask: m8x64) -> u64 { | ||
// The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary. | ||
// Note that x86 has no byte shift, llvm uses a word shift to move the least significant bit | ||
// of each byte into the right position. | ||
// | ||
// The parameter is a 512 bit vector which in the C abi is only valid for avx512 targets. | ||
// | ||
// x86-avx512-NOT: vpsllw | ||
// x86-avx512: vpmovb2m k0, zmm0 | ||
// x86-avx512: kmovq rax, k0 | ||
simd_bitmask(mask) | ||
} | ||
|
||
// Checks that a 4 x i32 mask is turned into a u8 bitmask by a single move-mask | ||
// instruction on x86 targets, without a preceding shift. | ||
// CHECK-LABEL: bitmask_m32x4 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn bitmask_m32x4(mask: m32x4) -> u8 { | ||
// The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary. | ||
// The lanes are 32 bit wide, so an unwanted shift would be a doubleword shift (pslld); | ||
// that is the instruction the negative checks have to look for. | ||
// | ||
// x86-NOT: pslld | ||
// x86: movmskps eax, xmm0 | ||
// | ||
// x86-avx2-NOT: vpslld | ||
// x86-avx2: vmovmskps eax, xmm0 | ||
// | ||
// x86-avx512-NOT: vpslld | ||
// x86-avx512: vmovmskps eax, xmm0 | ||
// | ||
// aarch64: adrp | ||
// aarch64-NEXT: cmlt | ||
// aarch64-NEXT: ldr | ||
// aarch64-NEXT: and | ||
// aarch64-NEXT: addv | ||
// aarch64-NEXT: fmov | ||
// aarch64-NEXT: and | ||
simd_bitmask(mask) | ||
} | ||
|
||
// Checks that a 2 x i64 mask is turned into a u8 bitmask by a single move-mask | ||
// instruction on x86 targets, without a preceding quadword shift. | ||
// CHECK-LABEL: bitmask_m64x2 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn bitmask_m64x2(mask: m64x2) -> u8 { | ||
// The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary. | ||
// | ||
// x86-NOT: psllq | ||
// x86: movmskpd eax, xmm0 | ||
// | ||
// x86-avx2-NOT: vpsllq | ||
// x86-avx2: vmovmskpd eax, xmm0 | ||
// | ||
// x86-avx512-NOT: vpsllq | ||
// x86-avx512: vmovmskpd eax, xmm0 | ||
// | ||
// aarch64: adrp | ||
// aarch64-NEXT: cmlt | ||
// aarch64-NEXT: ldr | ||
// aarch64-NEXT: and | ||
// aarch64-NEXT: addp | ||
// aarch64-NEXT: fmov | ||
// aarch64-NEXT: and | ||
simd_bitmask(mask) | ||
} | ||
|
||
// Checks that a 4 x i64 mask held in a 256 bit register is turned into a u8 bitmask | ||
// by a single move-mask instruction, without a preceding quadword shift. | ||
// Only the avx2 and avx512 revisions have checks for this function. | ||
// CHECK-LABEL: bitmask_m64x4 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn bitmask_m64x4(mask: m64x4) -> u8 { | ||
// The simd_bitmask intrinsic already uses the most significant bit, so no shift is necessary. | ||
// | ||
// The parameter is a 256 bit vector which in the C abi is only valid for avx/avx512 targets. | ||
// | ||
// x86-avx2-NOT: vpsllq | ||
// x86-avx2: vmovmskpd eax, ymm0 | ||
// | ||
// x86-avx512-NOT: vpsllq | ||
// x86-avx512: vmovmskpd eax, ymm0 | ||
simd_bitmask(mask) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
//@ revisions: x86-avx512 | ||
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq | ||
//@ [x86-avx512] needs-llvm-components: x86 | ||
//@ [x86-avx512] min-llvm-version: 18.0 | ||
//@ assembly-output: emit-asm | ||
//@ compile-flags: --crate-type=lib -O | ||
|
||
#![feature(no_core, lang_items, repr_simd, intrinsics)] | ||
#![no_core] | ||
#![allow(non_camel_case_types)] | ||
|
||
// Because we don't have core yet (the crate is `#![no_core]`), the `sized` and `copy` | ||
// lang items have to be declared here by hand. | ||
#[lang = "sized"] | ||
pub trait Sized {} | ||
|
||
#[lang = "copy"] | ||
trait Copy {} | ||
|
||
// Vector types for the gather test: four f64 values, a four lane i64 mask, | ||
// and four `*const f64` pointers. | ||
#[repr(simd)] | ||
pub struct f64x4([f64; 4]); | ||
|
||
#[repr(simd)] | ||
pub struct m64x4([i64; 4]); | ||
|
||
#[repr(simd)] | ||
pub struct pf64x4([*const f64; 4]); | ||
|
||
// Declared locally because `core` is not available in this crate. | ||
extern "rust-intrinsic" { | ||
// Argument order is (values, pointers, mask): lanes enabled by `mask` are read through | ||
// the matching pointer, all other lanes are taken from `values`. | ||
fn simd_gather<V, P, M>(values: V, pointer: P, mask: M) -> V; | ||
} | ||
|
||
// Checks that a masked gather of four f64 values lowers to a single vgatherqpd on avx512, | ||
// with the mask first moved into a k register and the result register zeroed. | ||
// CHECK-LABEL: gather_f64x4 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn gather_f64x4(mask: m64x4, ptrs: pf64x4) -> f64x4 { | ||
// FIXME: This should also get checked to generate a gather instruction for avx2. | ||
// Currently llvm scalarizes this code, see https://github.com/llvm/llvm-project/issues/59789 | ||
// | ||
// x86-avx512: vpsllq ymm0, ymm0, 63 | ||
// x86-avx512-NEXT: vpmovq2m k1, ymm0 | ||
// x86-avx512-NEXT: vpxor xmm0, xmm0, xmm0 | ||
// x86-avx512-NEXT: vgatherqpd ymm0 {k1}, ymmword ptr [1*ymm1] | ||
// The intrinsic takes (values, pointers, mask) in that order; the fallback values are zero. | ||
simd_gather(f64x4([0_f64, 0_f64, 0_f64, 0_f64]), ptrs, mask) | ||
workingjubilee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
//@ revisions: x86-avx2 x86-avx512 | ||
//@ [x86-avx2] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86-avx2] compile-flags: -C target-feature=+avx2 | ||
//@ [x86-avx2] needs-llvm-components: x86 | ||
//@ [x86-avx512] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86-avx512] compile-flags: -C target-feature=+avx512f,+avx512vl,+avx512bw,+avx512dq | ||
//@ [x86-avx512] needs-llvm-components: x86 | ||
//@ assembly-output: emit-asm | ||
//@ compile-flags: --crate-type=lib -O | ||
|
||
#![feature(no_core, lang_items, repr_simd, intrinsics)] | ||
#![no_core] | ||
#![allow(non_camel_case_types)] | ||
|
||
// Because we don't have core yet (the crate is `#![no_core]`), the `sized` and `copy` | ||
// lang items have to be declared here by hand. | ||
#[lang = "sized"] | ||
pub trait Sized {} | ||
|
||
#[lang = "copy"] | ||
trait Copy {} | ||
|
||
// Value vectors (i8x16, f32x8, f64x4) paired with mask vectors of the same lane width | ||
// and lane count (m8x16, m32x8, m64x4). | ||
#[repr(simd)] | ||
pub struct i8x16([i8; 16]); | ||
|
||
#[repr(simd)] | ||
pub struct m8x16([i8; 16]); | ||
|
||
#[repr(simd)] | ||
pub struct f32x8([f32; 8]); | ||
|
||
#[repr(simd)] | ||
pub struct m32x8([i32; 8]); | ||
|
||
#[repr(simd)] | ||
pub struct f64x4([f64; 4]); | ||
|
||
#[repr(simd)] | ||
pub struct m64x4([i64; 4]); | ||
|
||
// Declared locally because `core` is not available in this crate. | ||
extern "rust-intrinsic" { | ||
// Loads the lanes enabled by `mask` from memory starting at `pointer`; | ||
// the remaining lanes are taken from `values`. | ||
fn simd_masked_load<M, P, T>(mask: M, pointer: P, values: T) -> T; | ||
} | ||
|
||
// Checks the lowering of a masked load of 16 bytes: scalarized per-bit branches on avx2, | ||
// a single masked vmovdqu8 on avx512. | ||
// CHECK-LABEL: load_i8x16 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 { | ||
// Since avx2 supports no masked loads for bytes, the code tests each individual bit | ||
// and jumps to code that inserts individual bytes. | ||
// x86-avx2: vpsllw xmm0, xmm0, 7 | ||
// x86-avx2-NEXT: vpmovmskb eax, xmm0 | ||
// x86-avx2-NEXT: vpxor xmm0, xmm0, xmm0 | ||
// x86-avx2-NEXT: test al, 1 | ||
// x86-avx2-NEXT: jne | ||
// x86-avx2-NEXT: test al, 2 | ||
// x86-avx2-NEXT: jne | ||
// x86-avx2-DAG: movzx [[REG:[a-z]+]], byte ptr [rdi] | ||
// x86-avx2-NEXT: vmovd xmm0, [[REG]] | ||
// x86-avx2-DAG: vpinsrb xmm0, xmm0, byte ptr [rdi + 1], 1 | ||
// | ||
// x86-avx512: vpsllw xmm0, xmm0, 7 | ||
// x86-avx512-NEXT: vpmovb2m k1, xmm0 | ||
// x86-avx512-NEXT: vmovdqu8 xmm0 {k1} {z}, xmmword ptr [rdi] | ||
simd_masked_load(mask, pointer, i8x16([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])) | ||
} | ||
workingjubilee marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
// Checks that a masked load of eight f32 values lowers to vmaskmovps on avx2 and to a | ||
// zero-masked vmovups on avx512, each preceded by one shift that moves the mask bit | ||
// into the sign position. | ||
// CHECK-LABEL: load_f32x8 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn load_f32x8(mask: m32x8, pointer: *const f32) -> f32x8 { | ||
// x86-avx2: vpslld ymm0, ymm0, 31 | ||
// x86-avx2-NEXT: vmaskmovps ymm0, ymm0, ymmword ptr [rdi] | ||
// | ||
// x86-avx512: vpslld ymm0, ymm0, 31 | ||
// x86-avx512-NEXT: vpmovd2m k1, ymm0 | ||
// x86-avx512-NEXT: vmovups ymm0 {k1} {z}, ymmword ptr [rdi] | ||
simd_masked_load(mask, pointer, f32x8([0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32])) | ||
} | ||
|
||
// Checks that a masked load of four f64 values lowers to vmaskmovpd on avx2 and to a | ||
// zero-masked vmovupd on avx512, each preceded by one shift that moves the mask bit | ||
// into the sign position. | ||
// CHECK-LABEL: load_f64x4 | ||
#[no_mangle] | ||
pub unsafe extern "C" fn load_f64x4(mask: m64x4, pointer: *const f64) -> f64x4 { | ||
// x86-avx2: vpsllq ymm0, ymm0, 63 | ||
// x86-avx2-NEXT: vmaskmovpd ymm0, ymm0, ymmword ptr [rdi] | ||
// | ||
// x86-avx512: vpsllq ymm0, ymm0, 63 | ||
// x86-avx512-NEXT: vpmovq2m k1, ymm0 | ||
// x86-avx512-NEXT: vmovupd ymm0 {k1} {z}, ymmword ptr [rdi] | ||
simd_masked_load(mask, pointer, f64x4([0_f64, 0_f64, 0_f64, 0_f64])) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Verify that simd mask reductions emit only the single shift that moves the mask bit into the sign position, and do not introduce additional bit shift operations. | ||
//@ revisions: x86 aarch64 | ||
//@ [x86] compile-flags: --target=x86_64-unknown-linux-gnu -C llvm-args=-x86-asm-syntax=intel | ||
//@ [x86] needs-llvm-components: x86 | ||
//@ [aarch64] compile-flags: --target=aarch64-unknown-linux-gnu | ||
//@ [aarch64] needs-llvm-components: aarch64 | ||
//@ [aarch64] min-llvm-version: 18.0 | ||
//@ assembly-output: emit-asm | ||
//@ compile-flags: --crate-type=lib -O | ||
|
||
#![feature(no_core, lang_items, repr_simd, intrinsics)] | ||
#![no_core] | ||
#![allow(non_camel_case_types)] | ||
|
||
// Because we don't have core yet (the crate is `#![no_core]`), the `sized` and `copy` | ||
// lang items have to be declared here by hand. | ||
#[lang = "sized"] | ||
pub trait Sized {} | ||
|
||
#[lang = "copy"] | ||
trait Copy {} | ||
|
||
// Mask vector with 16 lanes of i8, the input type of both reductions below. | ||
#[repr(simd)] | ||
pub struct mask8x16([i8; 16]); | ||
|
||
// Declared locally because `core` is not available in this crate. | ||
extern "rust-intrinsic" { | ||
// Reduces a mask vector to a bool that is true when every lane is set. | ||
fn simd_reduce_all<T>(x: T) -> bool; | ||
// Reduces a mask vector to a bool that is true when at least one lane is set. | ||
fn simd_reduce_any<T>(x: T) -> bool; | ||
} | ||
|
||
// Pins the exact instruction sequence for an "all lanes set" reduction of a 16 x i8 mask: | ||
// one shift, one move-mask (or min reduction on aarch64), then the comparison. | ||
// CHECK-LABEL: mask_reduce_all: | ||
#[no_mangle] | ||
pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool { | ||
// x86: psllw xmm0, 7 | ||
// x86-NEXT: pmovmskb eax, xmm0 | ||
// x86-NEXT: {{cmp ax, -1|xor eax, 65535}} | ||
// x86-NEXT: sete al | ||
// | ||
// aarch64: shl v0.16b, v0.16b, #7 | ||
// aarch64-NEXT: cmlt v0.16b, v0.16b, #0 | ||
// aarch64-NEXT: uminv b0, v0.16b | ||
// aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0 | ||
// aarch64-NEXT: and w0, [[REG]], #0x1 | ||
simd_reduce_all(m) | ||
} | ||
|
||
// Pins the exact instruction sequence for an "any lane set" reduction of a 16 x i8 mask: | ||
// one shift, one move-mask (or max reduction on aarch64), then the test for non-zero. | ||
// CHECK-LABEL: mask_reduce_any: | ||
#[no_mangle] | ||
pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool { | ||
// x86: psllw xmm0, 7 | ||
// x86-NEXT: pmovmskb eax, xmm0 | ||
// x86-NEXT: test eax, eax | ||
// x86-NEXT: setne al | ||
// | ||
// aarch64: shl v0.16b, v0.16b, #7 | ||
// aarch64-NEXT: cmlt v0.16b, v0.16b, #0 | ||
// aarch64-NEXT: umaxv b0, v0.16b | ||
// aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0 | ||
// aarch64-NEXT: and w0, [[REG]], #0x1 | ||
simd_reduce_any(m) | ||
} |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks ok. Full listing including the constants:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice.