Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Not using WASM bitmask / all_true instructions #351

Closed
stepantubanov opened this issue May 26, 2023 · 4 comments
Closed

Not using WASM bitmask / all_true instructions #351

stepantubanov opened this issue May 26, 2023 · 4 comments
Labels
C-bug Category: Bug

Comments

@stepantubanov
Copy link

Hi,

Not sure if this belongs here or in the compiler repo.
Example code (https://godbolt.org/z/4fqcdGqq6):

#![feature(portable_simd)]

use std::simd::{self, SimdPartialEq, ToBitMask};

/// Compares each of the 16 input bytes against zero and returns the
/// comparison result packed into an integer bitmask, one bit per lane.
///
/// Reproduction case for this issue: on wasm the comparison mask is turned
/// into an integer by extracting every lane individually, where a single
/// `i8x16.bitmask` instruction was expected.
pub fn to_bitmask(v: &[u8; 16]) -> usize {
    // Copy the 16 bytes into a SIMD vector.
    let data = simd::u8x16::from_array(*v);
    // Vector with every lane set to 0, used as the comparison operand.
    let zero = simd::u8x16::splat(0);
    // Lane-wise equality: a lane of the mask is true where the byte is 0.
    let mask = data.simd_eq(zero);
    // Pack the per-lane booleans into an integer and widen it to `usize`.
    mask.to_bitmask() as usize
}

/// Returns `true` when every one of the 16 input bytes is zero.
///
/// Reproduction case for this issue: on wasm the `all()` reduction is
/// lowered to per-lane extraction, where a single `i8x16.all_true`
/// instruction was expected.
pub fn all_zeros(v: &[u8; 16]) -> bool {
    // Copy the 16 bytes into a SIMD vector.
    let data = simd::u8x16::from_array(*v);
    // Vector with every lane set to 0, used as the comparison operand.
    let zero = simd::u8x16::splat(0);
    // Lane-wise equality: a lane of the mask is true where the byte is 0.
    let mask = data.simd_eq(zero);
    // Reduce the mask: true only if all lanes compared equal to zero.
    mask.all()
}

Ideally it should've been compiled using i8x16.bitmask and i8x16.all_true instructions.

https://github.com/WebAssembly/simd/blob/main/proposals/simd/SIMD.md#all-lanes-true

Instead, the compiler generated really slow code that extracts each lane individually.
Full wasm output:

to_bitmask wasm
example::to_bitmask:
        global.get      __stack_pointer
        i32.const       16
        i32.sub 
        drop
        local.get       0
        v128.load       0:p2align=0
        v128.const      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        i8x16.eq
        local.tee       1
        i8x16.extract_lane_u    0
        i32.const       1
        i32.and 
        local.get       1
        i8x16.extract_lane_u    1
        i32.const       1
        i32.and 
        i32.const       1
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    2
        i32.const       1
        i32.and 
        i32.const       2
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    3
        i32.const       1
        i32.and 
        i32.const       3
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    4
        i32.const       1
        i32.and 
        i32.const       4
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    5
        i32.const       1
        i32.and 
        i32.const       5
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    6
        i32.const       1
        i32.and 
        i32.const       6
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    7
        i32.const       1
        i32.and 
        i32.const       7
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    8
        i32.const       1
        i32.and 
        i32.const       8
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    9
        i32.const       1
        i32.and 
        i32.const       9
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    10
        i32.const       1
        i32.and 
        i32.const       10
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    11
        i32.const       1
        i32.and 
        i32.const       11
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    12
        i32.const       1
        i32.and 
        i32.const       12
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    13
        i32.const       1
        i32.and 
        i32.const       13
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    14
        i32.const       1
        i32.and 
        i32.const       14
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    15
        i32.const       15
        i32.shl 
        i32.or  
        i32.const       65535
        i32.and 
        end_function
all_zeros wasm
example::all_zeros:
        global.get      __stack_pointer
        i32.const       16
        i32.sub 
        drop
        local.get       0
        v128.load       0:p2align=0
        v128.const      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
        i8x16.ne
        local.tee       1
        i8x16.extract_lane_u    0
        i32.const       1
        i32.and 
        local.get       1
        i8x16.extract_lane_u    1
        i32.const       1
        i32.and 
        i32.const       1
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    2
        i32.const       1
        i32.and 
        i32.const       2
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    3
        i32.const       1
        i32.and 
        i32.const       3
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    4
        i32.const       1
        i32.and 
        i32.const       4
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    5
        i32.const       1
        i32.and 
        i32.const       5
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    6
        i32.const       1
        i32.and 
        i32.const       6
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    7
        i32.const       1
        i32.and 
        i32.const       7
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    8
        i32.const       1
        i32.and 
        i32.const       8
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    9
        i32.const       1
        i32.and 
        i32.const       9
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    10
        i32.const       1
        i32.and 
        i32.const       10
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    11
        i32.const       1
        i32.and 
        i32.const       11
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    12
        i32.const       1
        i32.and 
        i32.const       12
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    13
        i32.const       1
        i32.and 
        i32.const       13
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    14
        i32.const       1
        i32.and 
        i32.const       14
        i32.shl 
        i32.or  
        local.get       1
        i8x16.extract_lane_u    15
        i32.const       15
        i32.shl 
        i32.or  
        i32.const       65535
        i32.and 
        i32.eqz
        end_function

Meta

rustc --version --verbose:

rustc 1.71.0-nightly (521f4dae1 2023-05-19)
@stepantubanov stepantubanov added the C-bug Category: Bug label May 26, 2023
@calebzulawski
Copy link
Member

This is the correct place to file this issue, thanks!

In this case, LLVM is lowering to wasm poorly. I'm working on a change that should fix all()/any() first, then I'll investigate to_bitmask.

@calebzulawski
Copy link
Member

This is fixed in llvm/llvm-project@8392bf6; I'm not sure how long it will take for this to make it to nightly.

@calebzulawski
Copy link
Member

Resolved via llvm/llvm-project@8392bf6 and rust-lang/rust#114048

@stepantubanov
Copy link
Author

@calebzulawski Awesome, thank you! 🚀

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
C-bug Category: Bug
Projects
None yet
Development

No branches or pull requests

2 participants