diff --git a/compiler/rustc_codegen_llvm/src/intrinsic.rs b/compiler/rustc_codegen_llvm/src/intrinsic.rs index 467e02d55e332..ab86362646b96 100644 --- a/compiler/rustc_codegen_llvm/src/intrinsic.rs +++ b/compiler/rustc_codegen_llvm/src/intrinsic.rs @@ -1034,6 +1034,60 @@ fn generic_simd_intrinsic<'ll, 'tcx>( }}; } + /// Returns the bitwidth of the `$ty` argument if it is an `Int` type. + macro_rules! require_int_ty { + ($ty: expr, $diag: expr) => { + match $ty { + ty::Int(i) => i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()), + _ => { + return_error!($diag); + } + } + }; + } + + /// Returns the bitwidth of the `$ty` argument if it is an `Int` or `Uint` type. + macro_rules! require_int_or_uint_ty { + ($ty: expr, $diag: expr) => { + match $ty { + ty::Int(i) => i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()), + ty::Uint(i) => { + i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()) + } + _ => { + return_error!($diag); + } + } + }; + } + + /// Converts a vector mask, where each element has a bit width equal to the data elements it is used with, + /// down to an i1 based mask that can be used by llvm intrinsics. + /// + /// The rust simd semantics are that each element should either consist of all ones or all zeroes, + /// but this information is not available to llvm. Truncating the vector effectively uses the lowest bit, + /// but codegen for several targets is better if we consider the highest bit by shifting. + /// + /// For x86 SSE/AVX targets this is beneficial since most instructions with mask parameters only consider the highest bit. + /// So even though on llvm level we have an additional shift, in the final assembly there is no shift or truncate and + /// instead the mask can be used as is. + /// + /// For aarch64 and other targets there is a benefit because a mask from the sign bit can be more + /// efficiently converted to an all ones / all zeroes mask by comparing whether each element is negative. + fn vector_mask_to_bitmask<'a, 'll, 'tcx>( + bx: &mut Builder<'a, 'll, 'tcx>, + i_xn: &'ll Value, + in_elem_bitwidth: u64, + in_len: u64, + ) -> &'ll Value { + // Shift the MSB to the right by "in_elem_bitwidth - 1" into the first bit position. 
+        let shift_idx = bx.cx.const_int(bx.type_ix(in_elem_bitwidth), (in_elem_bitwidth - 1) as _);
+        let shift_indices = vec![shift_idx; in_len as _];
+        let i_xn_msb = bx.lshr(i_xn, bx.const_vector(shift_indices.as_slice()));
+        // Truncate vector to an <in_len x i1>
+        bx.trunc(i_xn_msb, bx.type_vector(bx.type_i1(), in_len))
+    }
+
     let tcx = bx.tcx();
     let sig =
         tcx.normalize_erasing_late_bound_regions(ty::ParamEnv::reveal_all(), callee_ty.fn_sig(tcx));
@@ -1294,14 +1348,11 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
             m_len == v_len,
             InvalidMonomorphization::MismatchedLengths { span, name, m_len, v_len }
         );
-        match m_elem_ty.kind() {
-            ty::Int(_) => {}
-            _ => return_error!(InvalidMonomorphization::MaskType { span, name, ty: m_elem_ty }),
-        }
-        // truncate the mask to a vector of i1s
-        let i1 = bx.type_i1();
-        let i1xn = bx.type_vector(i1, m_len as u64);
-        let m_i1s = bx.trunc(args[0].immediate(), i1xn);
+        let in_elem_bitwidth = require_int_ty!(
+            m_elem_ty.kind(),
+            InvalidMonomorphization::MaskType { span, name, ty: m_elem_ty }
+        );
+        let m_i1s = vector_mask_to_bitmask(bx, args[0].immediate(), in_elem_bitwidth, m_len);
         return Ok(bx.select(m_i1s, args[1].immediate(), args[2].immediate()));
     }

@@ -1319,32 +1370,12 @@ fn generic_simd_intrinsic<'ll, 'tcx>(
         let expected_bytes = expected_int_bits / 8 + ((expected_int_bits % 8 > 0) as u64);

         // Integer vector <i{in_bitwidth} x in_len>:
-        let (i_xn, in_elem_bitwidth) = match in_elem.kind() {
-            ty::Int(i) => (
-                args[0].immediate(),
-                i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()),
-            ),
-            ty::Uint(i) => (
-                args[0].immediate(),
-                i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()),
-            ),
-            _ => return_error!(InvalidMonomorphization::VectorArgument {
-                span,
-                name,
-                in_ty,
-                in_elem
-            }),
-        };
+        let in_elem_bitwidth = require_int_or_uint_ty!(
+            in_elem.kind(),
+            InvalidMonomorphization::VectorArgument { span, name, in_ty, in_elem }
+        );

-        // Shift the MSB to the right by "in_elem_bitwidth - 1" into the first bit position.
- let shift_indices = - vec![ - bx.cx.const_int(bx.type_ix(in_elem_bitwidth), (in_elem_bitwidth - 1) as _); - in_len as _ - ]; - let i_xn_msb = bx.lshr(i_xn, bx.const_vector(shift_indices.as_slice())); - // Truncate vector to an - let i1xn = bx.trunc(i_xn_msb, bx.type_vector(bx.type_i1(), in_len)); + let i1xn = vector_mask_to_bitmask(bx, args[0].immediate(), in_elem_bitwidth, in_len); // Bitcast to iN: let i_ = bx.bitcast(i1xn, bx.type_ix(in_len)); @@ -1562,28 +1593,23 @@ fn generic_simd_intrinsic<'ll, 'tcx>( } ); - match element_ty2.kind() { - ty::Int(_) => (), - _ => { - return_error!(InvalidMonomorphization::ThirdArgElementType { - span, - name, - expected_element: element_ty2, - third_arg: arg_tys[2] - }); + let mask_elem_bitwidth = require_int_ty!( + element_ty2.kind(), + InvalidMonomorphization::ThirdArgElementType { + span, + name, + expected_element: element_ty2, + third_arg: arg_tys[2] } - } + ); // Alignment of T, must be a constant integer value: let alignment_ty = bx.type_i32(); let alignment = bx.const_i32(bx.align_of(in_elem).bytes() as i32); // Truncate the mask vector to a vector of i1s: - let (mask, mask_ty) = { - let i1 = bx.type_i1(); - let i1xn = bx.type_vector(i1, in_len); - (bx.trunc(args[2].immediate(), i1xn), i1xn) - }; + let mask = vector_mask_to_bitmask(bx, args[2].immediate(), mask_elem_bitwidth, in_len); + let mask_ty = bx.type_vector(bx.type_i1(), in_len); // Type of the vector of pointers: let llvm_pointer_vec_ty = llvm_vector_ty(bx, element_ty1, in_len); @@ -1668,8 +1694,8 @@ fn generic_simd_intrinsic<'ll, 'tcx>( } ); - require!( - matches!(mask_elem.kind(), ty::Int(_)), + let m_elem_bitwidth = require_int_ty!( + mask_elem.kind(), InvalidMonomorphization::ThirdArgElementType { span, name, @@ -1678,17 +1704,13 @@ fn generic_simd_intrinsic<'ll, 'tcx>( } ); + let mask = vector_mask_to_bitmask(bx, args[0].immediate(), m_elem_bitwidth, mask_len); + let mask_ty = bx.type_vector(bx.type_i1(), mask_len); + // Alignment of T, must be a constant integer value: let alignment_ty = bx.type_i32(); let alignment = bx.const_i32(bx.align_of(values_elem).bytes() as i32); - // Truncate the mask vector to a vector of i1s: - let (mask, mask_ty) = { - let i1 = bx.type_i1(); - let i1xn = bx.type_vector(i1, mask_len); - (bx.trunc(args[0].immediate(), i1xn), i1xn) - }; - let llvm_pointer = bx.type_ptr(); // Type of the vector of elements: @@ -1760,8 +1782,8 @@ fn generic_simd_intrinsic<'ll, 'tcx>( } ); - require!( - matches!(mask_elem.kind(), ty::Int(_)), + let m_elem_bitwidth = require_int_ty!( + mask_elem.kind(), InvalidMonomorphization::ThirdArgElementType { span, name, @@ -1770,17 +1792,13 @@ fn generic_simd_intrinsic<'ll, 'tcx>( } ); + let mask = vector_mask_to_bitmask(bx, args[0].immediate(), m_elem_bitwidth, mask_len); + let mask_ty = bx.type_vector(bx.type_i1(), mask_len); + // Alignment of T, must be a constant integer value: let alignment_ty = bx.type_i32(); let alignment = bx.const_i32(bx.align_of(values_elem).bytes() as i32); - // Truncate the mask vector to a vector of i1s: - let (mask, mask_ty) = { - let i1 = bx.type_i1(); - let i1xn = bx.type_vector(i1, in_len); - (bx.trunc(args[0].immediate(), i1xn), i1xn) - }; - let ret_t = bx.type_void(); let llvm_pointer = bx.type_ptr(); @@ -1859,28 +1877,23 @@ fn generic_simd_intrinsic<'ll, 'tcx>( ); // The element type of the third argument must be a signed integer type of any width: - match element_ty2.kind() { - ty::Int(_) => (), - _ => { - return_error!(InvalidMonomorphization::ThirdArgElementType { - span, - name, - 
expected_element: element_ty2, - third_arg: arg_tys[2] - }); + let mask_elem_bitwidth = require_int_ty!( + element_ty2.kind(), + InvalidMonomorphization::ThirdArgElementType { + span, + name, + expected_element: element_ty2, + third_arg: arg_tys[2] } - } + ); // Alignment of T, must be a constant integer value: let alignment_ty = bx.type_i32(); let alignment = bx.const_i32(bx.align_of(in_elem).bytes() as i32); // Truncate the mask vector to a vector of i1s: - let (mask, mask_ty) = { - let i1 = bx.type_i1(); - let i1xn = bx.type_vector(i1, in_len); - (bx.trunc(args[2].immediate(), i1xn), i1xn) - }; + let mask = vector_mask_to_bitmask(bx, args[2].immediate(), mask_elem_bitwidth, in_len); + let mask_ty = bx.type_vector(bx.type_i1(), in_len); let ret_t = bx.type_void(); @@ -2018,8 +2031,13 @@ fn generic_simd_intrinsic<'ll, 'tcx>( ); args[0].immediate() } else { - match in_elem.kind() { - ty::Int(_) | ty::Uint(_) => {} + let bitwidth = match in_elem.kind() { + ty::Int(i) => { + i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()) + } + ty::Uint(i) => { + i.bit_width().unwrap_or_else(|| bx.data_layout().pointer_size.bits()) + } _ => return_error!(InvalidMonomorphization::UnsupportedSymbol { span, name, @@ -2028,12 +2046,9 @@ fn generic_simd_intrinsic<'ll, 'tcx>( in_elem, ret_ty }), - } + }; - // boolean reductions operate on vectors of i1s: - let i1 = bx.type_i1(); - let i1xn = bx.type_vector(i1, in_len as u64); - bx.trunc(args[0].immediate(), i1xn) + vector_mask_to_bitmask(bx, args[0].immediate(), bitwidth, in_len as _) }; return match in_elem.kind() { ty::Int(_) | ty::Uint(_) => { diff --git a/tests/assembly/simd-intrinsic-gather.rs b/tests/assembly/simd-intrinsic-gather.rs index ef6b597c25f1e..fe987b8d37ecf 100644 --- a/tests/assembly/simd-intrinsic-gather.rs +++ b/tests/assembly/simd-intrinsic-gather.rs @@ -36,8 +36,8 @@ pub unsafe extern "C" fn gather_f64x4(mask: m64x4, ptrs: pf64x4) -> f64x4 { // FIXME: This should also get checked to generate a gather instruction for avx2. // Currently llvm scalarizes this code, see https://github.com/llvm/llvm-project/issues/59789 // - // x86-avx512: vpsllq ymm0, ymm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, ymm0 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, ymm0 // x86-avx512-NEXT: vpxor xmm0, xmm0, xmm0 // x86-avx512-NEXT: vgatherqpd ymm0 {k1}, ymmword ptr [1*ymm1] simd_gather(f64x4([0_f64, 0_f64, 0_f64, 0_f64]), ptrs, mask) diff --git a/tests/assembly/simd-intrinsic-mask-load.rs b/tests/assembly/simd-intrinsic-mask-load.rs index 49d231c45f858..a29c3ae66c3dd 100644 --- a/tests/assembly/simd-intrinsic-mask-load.rs +++ b/tests/assembly/simd-intrinsic-mask-load.rs @@ -46,9 +46,9 @@ extern "rust-intrinsic" { pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 { // Since avx2 supports no masked loads for bytes, the code tests each individual bit // and jumps to code that inserts individual bytes. 
- // x86-avx2: vpsllw xmm0, xmm0, 7 - // x86-avx2-NEXT: vpmovmskb eax, xmm0 - // x86-avx2-NEXT: vpxor xmm0, xmm0 + // x86-avx2-NOT: vpsllw + // x86-avx2-DAG: vpmovmskb eax + // x86-avx2-DAG: vpxor // x86-avx2-NEXT: test al, 1 // x86-avx2-NEXT: jne // x86-avx2-NEXT: test al, 2 @@ -57,8 +57,8 @@ pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 { // x86-avx2-NEXT: vmovd xmm0, [[REG]] // x86-avx2-DAG: vpinsrb xmm0, xmm0, byte ptr [rdi + 1], 1 // - // x86-avx512: vpsllw xmm0, xmm0, 7 - // x86-avx512-NEXT: vpmovb2m k1, xmm0 + // x86-avx512-NOT: vpsllw + // x86-avx512: vpmovb2m k1, xmm0 // x86-avx512-NEXT: vmovdqu8 xmm0 {k1} {z}, xmmword ptr [rdi] simd_masked_load(mask, pointer, i8x16([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])) } @@ -66,11 +66,11 @@ pub unsafe extern "C" fn load_i8x16(mask: m8x16, pointer: *const i8) -> i8x16 { // CHECK-LABEL: load_f32x8 #[no_mangle] pub unsafe extern "C" fn load_f32x8(mask: m32x8, pointer: *const f32) -> f32x8 { - // x86-avx2: vpslld ymm0, ymm0, 31 - // x86-avx2-NEXT: vmaskmovps ymm0, ymm0, ymmword ptr [rdi] + // x86-avx2-NOT: vpslld + // x86-avx2: vmaskmovps ymm0, ymm0, ymmword ptr [rdi] // - // x86-avx512: vpslld ymm0, ymm0, 31 - // x86-avx512-NEXT: vpmovd2m k1, ymm0 + // x86-avx512-NOT: vpslld + // x86-avx512: vpmovd2m k1, ymm0 // x86-avx512-NEXT: vmovups ymm0 {k1} {z}, ymmword ptr [rdi] simd_masked_load(mask, pointer, f32x8([0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32, 0_f32])) } @@ -78,11 +78,10 @@ pub unsafe extern "C" fn load_f32x8(mask: m32x8, pointer: *const f32) -> f32x8 { // CHECK-LABEL: load_f64x4 #[no_mangle] pub unsafe extern "C" fn load_f64x4(mask: m64x4, pointer: *const f64) -> f64x4 { - // x86-avx2: vpsllq ymm0, ymm0, 63 - // x86-avx2-NEXT: vmaskmovpd ymm0, ymm0, ymmword ptr [rdi] + // x86-avx2-NOT: vpsllq + // x86-avx2: vmaskmovpd ymm0, ymm0, ymmword ptr [rdi] // - // x86-avx512: vpsllq ymm0, ymm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, ymm0 - // x86-avx512-NEXT: vmovupd ymm0 {k1} {z}, ymmword ptr [rdi] + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, ymm0 simd_masked_load(mask, pointer, f64x4([0_f64, 0_f64, 0_f64, 0_f64])) } diff --git a/tests/assembly/simd-intrinsic-mask-reduce.rs b/tests/assembly/simd-intrinsic-mask-reduce.rs index 763401755fad2..77d854e9bd2ae 100644 --- a/tests/assembly/simd-intrinsic-mask-reduce.rs +++ b/tests/assembly/simd-intrinsic-mask-reduce.rs @@ -30,29 +30,30 @@ extern "rust-intrinsic" { // CHECK-LABEL: mask_reduce_all: #[no_mangle] pub unsafe extern "C" fn mask_reduce_all(m: mask8x16) -> bool { - // x86: psllw xmm0, 7 - // x86-NEXT: pmovmskb eax, xmm0 - // x86-NEXT: {{cmp ax, -1|xor eax, 65535}} + // x86-NOT: psllw + // x86: pmovmskb eax, xmm0 + // x86-NEXT: {{cmp ax, -1|cmp eax, 65535|xor eax, 65535}} // x86-NEXT: sete al // - // aarch64: shl v0.16b, v0.16b, #7 - // aarch64-NEXT: cmlt v0.16b, v0.16b, #0 - // aarch64-NEXT: uminv b0, v0.16b - // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0 - // aarch64-NEXT: and w0, [[REG]], #0x1 + // aarch64-NOT: shl + // aarch64: cmge v0.16b, v0.16b, #0 + // aarch64-DAG: mov [[REG1:[a-z0-9]+]], #1 + // aarch64-DAG: umaxv b0, v0.16b + // aarch64-NEXT: fmov [[REG2:[a-z0-9]+]], s0 + // aarch64-NEXT: bic w0, [[REG1]], [[REG2]] simd_reduce_all(m) } // CHECK-LABEL: mask_reduce_any: #[no_mangle] pub unsafe extern "C" fn mask_reduce_any(m: mask8x16) -> bool { - // x86: psllw xmm0, 7 - // x86-NEXT: pmovmskb + // x86-NOT: psllw + // x86: pmovmskb // x86-NEXT: test eax, eax // x86-NEXT: setne al // - // aarch64: shl v0.16b, v0.16b, #7 - // 
aarch64-NEXT: cmlt v0.16b, v0.16b, #0 + // aarch64-NOT: shl + // aarch64: cmlt v0.16b, v0.16b, #0 // aarch64-NEXT: umaxv b0, v0.16b // aarch64-NEXT: fmov [[REG:[a-z0-9]+]], s0 // aarch64-NEXT: and w0, [[REG]], #0x1 diff --git a/tests/assembly/simd-intrinsic-mask-store.rs b/tests/assembly/simd-intrinsic-mask-store.rs index a6611e1c23d5c..9609048989a5b 100644 --- a/tests/assembly/simd-intrinsic-mask-store.rs +++ b/tests/assembly/simd-intrinsic-mask-store.rs @@ -46,8 +46,8 @@ extern "rust-intrinsic" { pub unsafe extern "C" fn store_i8x16(mask: m8x16, pointer: *mut i8, value: i8x16) { // Since avx2 supports no masked stores for bytes, the code tests each individual bit // and jumps to code that extracts individual bytes to memory. - // x86-avx2: vpsllw xmm0, xmm0, 7 - // x86-avx2-NEXT: vpmovmskb eax, xmm0 + // x86-avx2-NOT: vpsllw + // x86-avx2: vpmovmskb eax, xmm0 // x86-avx2-NEXT: test al, 1 // x86-avx2-NEXT: jne // x86-avx2-NEXT: test al, 2 @@ -55,8 +55,8 @@ pub unsafe extern "C" fn store_i8x16(mask: m8x16, pointer: *mut i8, value: i8x16 // x86-avx2-DAG: vpextrb byte ptr [rdi + 1], xmm1, 1 // x86-avx2-DAG: vpextrb byte ptr [rdi], xmm1, 0 // - // x86-avx512: vpsllw xmm0, xmm0, 7 - // x86-avx512-NEXT: vpmovb2m k1, xmm0 + // x86-avx512-NOT: vpsllw + // x86-avx512: vpmovb2m k1, xmm0 // x86-avx512-NEXT: vmovdqu8 xmmword ptr [rdi] {k1}, xmm1 simd_masked_store(mask, pointer, value) } @@ -64,11 +64,11 @@ pub unsafe extern "C" fn store_i8x16(mask: m8x16, pointer: *mut i8, value: i8x16 // CHECK-LABEL: store_f32x8 #[no_mangle] pub unsafe extern "C" fn store_f32x8(mask: m32x8, pointer: *mut f32, value: f32x8) { - // x86-avx2: vpslld ymm0, ymm0, 31 - // x86-avx2-NEXT: vmaskmovps ymmword ptr [rdi], ymm0, ymm1 + // x86-avx2-NOT: vpslld + // x86-avx2: vmaskmovps ymmword ptr [rdi], ymm0, ymm1 // - // x86-avx512: vpslld ymm0, ymm0, 31 - // x86-avx512-NEXT: vpmovd2m k1, ymm0 + // x86-avx512-NOT: vpslld + // x86-avx512: vpmovd2m k1, ymm0 // x86-avx512-NEXT: vmovups ymmword ptr [rdi] {k1}, ymm1 simd_masked_store(mask, pointer, value) } @@ -76,11 +76,11 @@ pub unsafe extern "C" fn store_f32x8(mask: m32x8, pointer: *mut f32, value: f32x // CHECK-LABEL: store_f64x4 #[no_mangle] pub unsafe extern "C" fn store_f64x4(mask: m64x4, pointer: *mut f64, value: f64x4) { - // x86-avx2: vpsllq ymm0, ymm0, 63 - // x86-avx2-NEXT: vmaskmovpd ymmword ptr [rdi], ymm0, ymm1 + // x86-avx2-NOT: vpsllq + // x86-avx2: vmaskmovpd ymmword ptr [rdi], ymm0, ymm1 // - // x86-avx512: vpsllq ymm0, ymm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, ymm0 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, ymm0 // x86-avx512-NEXT: vmovupd ymmword ptr [rdi] {k1}, ymm1 simd_masked_store(mask, pointer, value) } diff --git a/tests/assembly/simd-intrinsic-scatter.rs b/tests/assembly/simd-intrinsic-scatter.rs index 6ffefb0801aec..2f725e66e99f5 100644 --- a/tests/assembly/simd-intrinsic-scatter.rs +++ b/tests/assembly/simd-intrinsic-scatter.rs @@ -33,8 +33,8 @@ extern "rust-intrinsic" { // CHECK-LABEL: scatter_f64x4 #[no_mangle] pub unsafe extern "C" fn scatter_f64x4(values: f64x4, ptrs: pf64x4, mask: m64x4) { - // x86-avx512: vpsllq ymm2, ymm2, 63 - // x86-avx512-NEXT: vpmovq2m k1, ymm2 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, ymm2 // x86-avx512-NEXT: vscatterqpd ymmword ptr [1*ymm1] {k1}, ymm0 simd_scatter(values, ptrs, mask) } diff --git a/tests/assembly/simd-intrinsic-select.rs b/tests/assembly/simd-intrinsic-select.rs index 3f36402e3d0d5..1929adcbb1d0e 100644 --- a/tests/assembly/simd-intrinsic-select.rs +++ 
b/tests/assembly/simd-intrinsic-select.rs @@ -59,15 +59,15 @@ extern "rust-intrinsic" { // CHECK-LABEL: select_i8x16 #[no_mangle] pub unsafe extern "C" fn select_i8x16(mask: m8x16, a: i8x16, b: i8x16) -> i8x16 { - // x86-avx2: vpsllw xmm0, xmm0, 7 - // x86-avx2-NEXT: vpblendvb xmm0, xmm2, xmm1, xmm0 + // x86-avx2-NOT: vpsllw + // x86-avx2: vpblendvb xmm0, xmm2, xmm1, xmm0 // - // x86-avx512: vpsllw xmm0, xmm0, 7 - // x86-avx512-NEXT: vpmovb2m k1, xmm0 + // x86-avx512-NOT: vpsllw + // x86-avx512: vpmovb2m k1, xmm0 // x86-avx512-NEXT: vpblendmb xmm0 {k1}, xmm2, xmm1 // - // aarch64: shl v0.16b, v0.16b, #7 - // aarch64-NEXT: cmlt v0.16b, v0.16b, #0 + // aarch64-NOT: shl + // aarch64: cmlt v0.16b, v0.16b, #0 // aarch64-NEXT: bsl v0.16b, v1.16b, v2.16b simd_select(mask, a, b) } @@ -75,15 +75,15 @@ pub unsafe extern "C" fn select_i8x16(mask: m8x16, a: i8x16, b: i8x16) -> i8x16 // CHECK-LABEL: select_f32x4 #[no_mangle] pub unsafe extern "C" fn select_f32x4(mask: m32x4, a: f32x4, b: f32x4) -> f32x4 { - // x86-avx2: vpslld xmm0, xmm0, 31 - // x86-avx2-NEXT: vblendvps xmm0, xmm2, xmm1, xmm0 + // x86-avx2-NOT: vpslld + // x86-avx2: vblendvps xmm0, xmm2, xmm1, xmm0 // - // x86-avx512: vpslld xmm0, xmm0, 31 - // x86-avx512-NEXT: vpmovd2m k1, xmm0 + // x86-avx512-NOT: vpslld + // x86-avx512: vpmovd2m k1, xmm0 // x86-avx512-NEXT: vblendmps xmm0 {k1}, xmm2, xmm1 // - // aarch64: shl v0.4s, v0.4s, #31 - // aarch64-NEXT: cmlt v0.4s, v0.4s, #0 + // aarch64-NOT: shl + // aarch64: cmlt v0.4s, v0.4s, #0 // aarch64-NEXT: bsl v0.16b, v1.16b, v2.16b simd_select(mask, a, b) } @@ -91,15 +91,15 @@ pub unsafe extern "C" fn select_f32x4(mask: m32x4, a: f32x4, b: f32x4) -> f32x4 // CHECK-LABEL: select_f64x2 #[no_mangle] pub unsafe extern "C" fn select_f64x2(mask: m64x2, a: f64x2, b: f64x2) -> f64x2 { - // x86-avx2: vpsllq xmm0, xmm0, 63 - // x86-avx2-NEXT: vblendvpd xmm0, xmm2, xmm1, xmm0 + // x86-avx2-NOT: vpsllq + // x86-avx2: vblendvpd xmm0, xmm2, xmm1, xmm0 // - // x86-avx512: vpsllq xmm0, xmm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, xmm0 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, xmm0 // x86-avx512-NEXT: vblendmpd xmm0 {k1}, xmm2, xmm1 // - // aarch64: shl v0.2d, v0.2d, #63 - // aarch64-NEXT: cmlt v0.2d, v0.2d, #0 + // aarch64-NOT: shl + // aarch64: cmlt v0.2d, v0.2d, #0 // aarch64-NEXT: bsl v0.16b, v1.16b, v2.16b simd_select(mask, a, b) } @@ -109,11 +109,11 @@ pub unsafe extern "C" fn select_f64x2(mask: m64x2, a: f64x2, b: f64x2) -> f64x2 pub unsafe extern "C" fn select_f64x4(mask: m64x4, a: f64x4, b: f64x4) -> f64x4 { // The parameter is a 256 bit vector which in the C abi is only valid for avx targets. // - // x86-avx2: vpsllq ymm0, ymm0, 63 - // x86-avx2-NEXT: vblendvpd ymm0, ymm2, ymm1, ymm0 + // x86-avx2-NOT: vpsllq + // x86-avx2: vblendvpd ymm0, ymm2, ymm1, ymm0 // - // x86-avx512: vpsllq ymm0, ymm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, ymm0 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, ymm0 // x86-avx512-NEXT: vblendmpd ymm0 {k1}, ymm2, ymm1 simd_select(mask, a, b) } @@ -123,8 +123,8 @@ pub unsafe extern "C" fn select_f64x4(mask: m64x4, a: f64x4, b: f64x4) -> f64x4 pub unsafe extern "C" fn select_f64x8(mask: m64x8, a: f64x8, b: f64x8) -> f64x8 { // The parameter is a 256 bit vector which in the C abi is only valid for avx512 targets. 
// - // x86-avx512: vpsllq zmm0, zmm0, 63 - // x86-avx512-NEXT: vpmovq2m k1, zmm0 + // x86-avx512-NOT: vpsllq + // x86-avx512: vpmovq2m k1, zmm0 // x86-avx512-NEXT: vblendmpd zmm0 {k1}, zmm2, zmm1 simd_select(mask, a, b) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-gather.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-gather.rs index 863a9606c7e99..9829758fe315f 100644 --- a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-gather.rs +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-gather.rs @@ -23,7 +23,9 @@ extern "rust-intrinsic" { #[no_mangle] pub unsafe fn gather_f32x2(pointers: Vec2<*const f32>, mask: Vec2, values: Vec2) -> Vec2 { - // CHECK: call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> {{.*}}, <2 x float> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> [[B]], <2 x float> {{.*}}) simd_gather(values, pointers, mask) } @@ -31,6 +33,8 @@ pub unsafe fn gather_f32x2(pointers: Vec2<*const f32>, mask: Vec2, #[no_mangle] pub unsafe fn gather_pf32x2(pointers: Vec2<*const *const f32>, mask: Vec2, values: Vec2<*const f32>) -> Vec2<*const f32> { - // CHECK: call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> {{.*}}, <2 x ptr> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call <2 x ptr> @llvm.masked.gather.v2p0.v2p0(<2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> [[B]], <2 x ptr> {{.*}}) simd_gather(values, pointers, mask) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-load.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-load.rs index b41c42810aafd..0b9a213203365 100644 --- a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-load.rs +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-load.rs @@ -21,7 +21,9 @@ extern "rust-intrinsic" { #[no_mangle] pub unsafe fn load_f32x2(mask: Vec2, pointer: *const f32, values: Vec2) -> Vec2 { - // CHECK: call <2 x float> @llvm.masked.load.v2f32.p0(ptr {{.*}}, i32 4, <2 x i1> {{.*}}, <2 x float> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call <2 x float> @llvm.masked.load.v2f32.p0(ptr {{.*}}, i32 4, <2 x i1> [[B]], <2 x float> {{.*}}) simd_masked_load(mask, pointer, values) } @@ -29,6 +31,8 @@ pub unsafe fn load_f32x2(mask: Vec2, pointer: *const f32, #[no_mangle] pub unsafe fn load_pf32x4(mask: Vec4, pointer: *const *const f32, values: Vec4<*const f32>) -> Vec4<*const f32> { - // CHECK: call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr {{.*}}, i32 {{.*}}, <4 x i1> {{.*}}, <4 x ptr> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <4 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <4 x i32> [[A]] to <4 x i1> + // CHECK: call <4 x ptr> @llvm.masked.load.v4p0.p0(ptr {{.*}}, i32 {{.*}}, <4 x i1> [[B]], <4 x ptr> {{.*}}) simd_masked_load(mask, pointer, values) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-store.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-store.rs index 066392bcde682..407eaecc1bf6d 100644 --- a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-store.rs +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-masked-store.rs @@ -20,13 +20,17 @@ extern "rust-intrinsic" { // CHECK-LABEL: 
@store_f32x2 #[no_mangle] pub unsafe fn store_f32x2(mask: Vec2, pointer: *mut f32, values: Vec2) { - // CHECK: call void @llvm.masked.store.v2f32.p0(<2 x float> {{.*}}, ptr {{.*}}, i32 4, <2 x i1> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call void @llvm.masked.store.v2f32.p0(<2 x float> {{.*}}, ptr {{.*}}, i32 4, <2 x i1> [[B]]) simd_masked_store(mask, pointer, values) } // CHECK-LABEL: @store_pf32x4 #[no_mangle] pub unsafe fn store_pf32x4(mask: Vec4, pointer: *mut *const f32, values: Vec4<*const f32>) { - // CHECK: call void @llvm.masked.store.v4p0.p0(<4 x ptr> {{.*}}, ptr {{.*}}, i32 {{.*}}, <4 x i1> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <4 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <4 x i32> [[A]] to <4 x i1> + // CHECK: call void @llvm.masked.store.v4p0.p0(<4 x ptr> {{.*}}, ptr {{.*}}, i32 {{.*}}, <4 x i1> [[B]]) simd_masked_store(mask, pointer, values) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-scatter.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-scatter.rs index e85bd61c7f83b..c2ae2c8b9b035 100644 --- a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-scatter.rs +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-scatter.rs @@ -23,7 +23,9 @@ extern "rust-intrinsic" { #[no_mangle] pub unsafe fn scatter_f32x2(pointers: Vec2<*mut f32>, mask: Vec2, values: Vec2) { - // CHECK: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> {{.*}}, <2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> {{.*}}, <2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> [[B]] simd_scatter(values, pointers, mask) } @@ -32,6 +34,8 @@ pub unsafe fn scatter_f32x2(pointers: Vec2<*mut f32>, mask: Vec2, #[no_mangle] pub unsafe fn scatter_pf32x2(pointers: Vec2<*mut *const f32>, mask: Vec2, values: Vec2<*const f32>) { - // CHECK: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> {{.*}}, <2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> {{.*}}) + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> {{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: call void @llvm.masked.scatter.v2p0.v2p0(<2 x ptr> {{.*}}, <2 x ptr> {{.*}}, i32 {{.*}}, <2 x i1> [[B]] simd_scatter(values, pointers, mask) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs index 05d2bf627ef12..06344a58cf8e7 100644 --- a/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-generic-select.rs @@ -6,32 +6,48 @@ #[allow(non_camel_case_types)] #[repr(simd)] -#[derive(Copy, Clone, PartialEq, Debug)] +#[derive(Copy, Clone)] pub struct f32x4(pub f32, pub f32, pub f32, pub f32); #[repr(simd)] -#[derive(Copy, Clone, PartialEq, Debug)] +#[derive(Copy, Clone)] pub struct f32x8(f32, f32, f32, f32, f32, f32, f32, f32); #[repr(simd)] -#[derive(Copy, Clone, PartialEq, Debug)] -pub struct b8x4(pub i8, pub i8, pub i8, pub i8); +#[derive(Copy, Clone)] +pub struct m8x4(pub i8, pub i8, pub i8, pub i8); + +#[repr(simd)] +#[derive(Copy, Clone)] +pub struct m32x4(pub i32, pub i32, pub i32, pub i32); extern "rust-intrinsic" { fn simd_select(x: T, a: U, b: U) -> U; fn simd_select_bitmask(x: T, a: U, b: U) -> U; } -// CHECK-LABEL: @select +// CHECK-LABEL: @select_m8 +#[no_mangle] +pub unsafe fn select_m8(m: m8x4, a: f32x4, b: 
f32x4) -> f32x4 { + // CHECK: [[A:%[0-9]+]] = lshr <4 x i8> %{{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <4 x i8> [[A]] to <4 x i1> + // CHECK: select <4 x i1> [[B]] + simd_select(m, a, b) +} + +// CHECK-LABEL: @select_m32 #[no_mangle] -pub unsafe fn select(m: b8x4, a: f32x4, b: f32x4) -> f32x4 { - // CHECK: select <4 x i1> +pub unsafe fn select_m32(m: m32x4, a: f32x4, b: f32x4) -> f32x4 { + // CHECK: [[A:%[0-9]+]] = lshr <4 x i32> %{{.*}}, + // CHECK: [[B:%[0-9]+]] = trunc <4 x i32> [[A]] to <4 x i1> + // CHECK: select <4 x i1> [[B]] simd_select(m, a, b) } // CHECK-LABEL: @select_bitmask #[no_mangle] pub unsafe fn select_bitmask(m: i8, a: f32x8, b: f32x8) -> f32x8 { - // CHECK: select <8 x i1> + // CHECK: [[A:%[0-9]+]] = bitcast i8 {{.*}} to <8 x i1> + // CHECK: select <8 x i1> [[A]] simd_select_bitmask(m, a, b) } diff --git a/tests/codegen/simd-intrinsic/simd-intrinsic-mask-reduce.rs b/tests/codegen/simd-intrinsic/simd-intrinsic-mask-reduce.rs new file mode 100644 index 0000000000000..d50194d8fa2a4 --- /dev/null +++ b/tests/codegen/simd-intrinsic/simd-intrinsic-mask-reduce.rs @@ -0,0 +1,65 @@ +//@ compile-flags: -C no-prepopulate-passes +// + +#![crate_type = "lib"] +#![feature(repr_simd, intrinsics)] +#![allow(non_camel_case_types)] + +#[repr(simd)] +#[derive(Copy, Clone)] +pub struct mask32x2(i32, i32); + +#[repr(simd)] +#[derive(Copy, Clone)] +pub struct mask8x16(i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8); + +extern "rust-intrinsic" { + fn simd_reduce_all(x: T) -> bool; + fn simd_reduce_any(x: T) -> bool; +} + +// NOTE(eddyb) `%{{x|1}}` is used because on some targets (e.g. WASM) +// SIMD vectors are passed directly, resulting in `%x` being a vector, +// while on others they're passed indirectly, resulting in `%x` being +// a pointer to a vector, and `%1` a vector loaded from that pointer. +// This is controlled by the target spec option `simd_types_indirect`. + +// CHECK-LABEL: @reduce_any_32x2 +#[no_mangle] +pub unsafe fn reduce_any_32x2(x: mask32x2) -> bool { + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> %{{x|1}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: [[C:%[0-9]+]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> [[B]]) + // CHECK: %{{[0-9]+}} = zext i1 [[C]] to i8 + simd_reduce_any(x) +} + +// CHECK-LABEL: @reduce_all_32x2 +#[no_mangle] +pub unsafe fn reduce_all_32x2(x: mask32x2) -> bool { + // CHECK: [[A:%[0-9]+]] = lshr <2 x i32> %{{x|1}}, + // CHECK: [[B:%[0-9]+]] = trunc <2 x i32> [[A]] to <2 x i1> + // CHECK: [[C:%[0-9]+]] = call i1 @llvm.vector.reduce.and.v2i1(<2 x i1> [[B]]) + // CHECK: %{{[0-9]+}} = zext i1 [[C]] to i8 + simd_reduce_all(x) +} + +// CHECK-LABEL: @reduce_any_8x16 +#[no_mangle] +pub unsafe fn reduce_any_8x16(x: mask8x16) -> bool { + // CHECK: [[A:%[0-9]+]] = lshr <16 x i8> %{{x|1}}, + // CHECK: [[B:%[0-9]+]] = trunc <16 x i8> [[A]] to <16 x i1> + // CHECK: [[C:%[0-9]+]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[B]]) + // CHECK: %{{[0-9]+}} = zext i1 [[C]] to i8 + simd_reduce_any(x) +} + +// CHECK-LABEL: @reduce_all_8x16 +#[no_mangle] +pub unsafe fn reduce_all_8x16(x: mask8x16) -> bool { + // CHECK: [[A:%[0-9]+]] = lshr <16 x i8> %{{x|1}}, + // CHECK: [[B:%[0-9]+]] = trunc <16 x i8> [[A]] to <16 x i1> + // CHECK: [[C:%[0-9]+]] = call i1 @llvm.vector.reduce.and.v16i1(<16 x i1> [[B]]) + // CHECK: %{{[0-9]+}} = zext i1 [[C]] to i8 + simd_reduce_all(x) +}
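
A note on the conversion strategy the codegen tests above check for: the new `vector_mask_to_bitmask` helper takes the sign bit of every lane (`lshr` by the lane bit width minus one, then `trunc` to `i1`) instead of truncating the lowest bit as before. The standalone sketch below (plain Rust, not rustc internals; the function names and the `i32` lane type are made up for illustration) shows why this is sound: for the only values a Rust SIMD mask lane is allowed to hold, all zeroes or all ones, both conversions yield the same `i1`.

```rust
/// Old strategy: keep only the lowest bit, i.e. `trunc <N x i32> ... to <N x i1>`.
fn lane_to_i1_via_low_bit(lane: i32) -> bool {
    ((lane as u32) & 1) == 1
}

/// New strategy: move the sign bit into bit 0 first, i.e. `lshr` by 31 followed by `trunc`.
fn lane_to_i1_via_sign_bit(lane: i32) -> bool {
    (((lane as u32) >> (i32::BITS - 1)) & 1) == 1
}

fn main() {
    // Valid mask lanes are all zeroes (0) or all ones (-1); both strategies agree on them.
    for lane in [0_i32, -1_i32] {
        assert_eq!(lane_to_i1_via_low_bit(lane), lane_to_i1_via_sign_bit(lane));
    }
    // The strategies only differ on lane values a mask must not contain anyway, e.g. 1.
    assert_ne!(lane_to_i1_via_low_bit(1), lane_to_i1_via_sign_bit(1));
    println!("sign-bit and low-bit conversions agree on valid mask lanes");
}
```

The payoff shows up in the assembly checks above: with the sign-bit form, x86 mask consumers such as `vpmovb2m` and `vpblendvb` and AArch64's compare-against-zero (`cmlt`/`cmge`) can consume the mask directly, which is why the shift instructions are now asserted absent via the `-NOT:` lines.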