bytecodealliance · afonso360 · Jun 6, 2023 · May 29, 2023 · May 29, 2023 · May 29, 2023
@@ -1573,6 +1573,12 @@
 
 ;; UImm5 Helpers
 
+;; Extractor that matches a `Value` defined by a `splat` whose scalar operand is
+;; accepted by `uimm5_from_value`, i.e. the same `UImm5` replicated on all lanes.
+;; TODO: Try matching vconst here as well
+(decl replicated_uimm5 (UImm5) Value)
+(extractor (replicated_uimm5 n)
+  (def_inst (splat (uimm5_from_value n))))
+
 ;; Helper to go directly from a `Value`, when it's an `iconst`, to an `UImm5`.
 (decl uimm5_from_value (UImm5) Value)
 (extractor (uimm5_from_value n)

@@ -654,17 +654,39 @@ fn riscv64_get_operands<F: Fn(VReg) -> VReg>(inst: &Inst, collector: &mut Operan
 
             collector.reg_use(vs1);
             collector.reg_use(vs2);
-            collector.reg_def(vd);
+
+            // If the operation forbids source/destination overlap, then we must
+            // register it as an early_def. This encodes the constraint that
+            // these must not overlap.
+            if op.forbids_src_dst_overlaps() {
+                collector.reg_early_def(vd);
+            } else {
+                collector.reg_def(vd);
+            }
+
             vec_mask_operands(mask, collector);
         }
         &Inst::VecAluRRImm5 {
-            vd, vs2, ref mask, ..
+            op,
+            vd,
+            vs2,
+            ref mask,
+            ..
         } => {
             debug_assert_eq!(vd.to_reg().class(), RegClass::Vector);
             debug_assert_eq!(vs2.class(), RegClass::Vector);
 
             collector.reg_use(vs2);
-            collector.reg_def(vd);
+
+            // If the operation forbids source/destination overlap, then we must
+            // register it as an early_def. This encodes the constraint that
+            // these must not overlap.
+            if op.forbids_src_dst_overlaps() {
+                collector.reg_early_def(vd);
+            } else {
+                collector.reg_def(vd);
+            }
+
             vec_mask_operands(mask, collector);
         }
         &Inst::VecAluRR {

@@ -296,6 +296,7 @@ impl VecAluOpRRR {
             VecAluOpRRR::VssubuVV | VecAluOpRRR::VssubuVX => 0b100010,
             VecAluOpRRR::VssubVV | VecAluOpRRR::VssubVX => 0b100011,
             VecAluOpRRR::VfsgnjnVV => 0b001001,
+            VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX => 0b001100,
             VecAluOpRRR::VmsltVX => 0b011011,
         }
     }
@@ -318,7 +319,8 @@ impl VecAluOpRRR {
             | VecAluOpRRR::VminVV
             | VecAluOpRRR::VmaxuVV
             | VecAluOpRRR::VmaxVV
-            | VecAluOpRRR::VmergeVVM => VecOpCategory::OPIVV,
+            | VecAluOpRRR::VmergeVVM
+            | VecAluOpRRR::VrgatherVV => VecOpCategory::OPIVV,
             VecAluOpRRR::VmulVV
             | VecAluOpRRR::VmulhVV
             | VecAluOpRRR::VmulhuVV
@@ -343,7 +345,8 @@ impl VecAluOpRRR {
             | VecAluOpRRR::VmaxVX
             | VecAluOpRRR::VslidedownVX
             | VecAluOpRRR::VmergeVXM
-            | VecAluOpRRR::VmsltVX => VecOpCategory::OPIVX,
+            | VecAluOpRRR::VmsltVX
+            | VecAluOpRRR::VrgatherVX => VecOpCategory::OPIVX,
             VecAluOpRRR::VfaddVV
             | VecAluOpRRR::VfsubVV
             | VecAluOpRRR::VfmulVV
@@ -368,6 +371,14 @@ impl VecAluOpRRR {
             _ => unreachable!(),
         }
     }
+
+    /// Returns `true` when this op does not allow its destination register to
+    /// overlap a source register.
+    ///
+    /// Only `VrgatherVV` and `VrgatherVX` report `true`; every other op reports `false`.
+    pub fn forbids_src_dst_overlaps(&self) -> bool {
+        matches!(self, VecAluOpRRR::VrgatherVV | VecAluOpRRR::VrgatherVX)
+    }
 }
 
 impl fmt::Display for VecAluOpRRR {
@@ -408,6 +419,7 @@ impl VecAluOpRRImm5 {
             VecAluOpRRImm5::VmergeVIM => 0b010111,
             VecAluOpRRImm5::VsadduVI => 0b100000,
             VecAluOpRRImm5::VsaddVI => 0b100001,
+            VecAluOpRRImm5::VrgatherVI => 0b001100,
         }
     }
 
@@ -424,7 +436,8 @@ impl VecAluOpRRImm5 {
             | VecAluOpRRImm5::VslidedownVI
             | VecAluOpRRImm5::VmergeVIM
             | VecAluOpRRImm5::VsadduVI
-            | VecAluOpRRImm5::VsaddVI => VecOpCategory::OPIVI,
+            | VecAluOpRRImm5::VsaddVI
+            | VecAluOpRRImm5::VrgatherVI => VecOpCategory::OPIVI,
         }
     }
 
@@ -433,7 +446,8 @@ impl VecAluOpRRImm5 {
             VecAluOpRRImm5::VsllVI
             | VecAluOpRRImm5::VsrlVI
             | VecAluOpRRImm5::VsraVI
-            | VecAluOpRRImm5::VslidedownVI => true,
+            | VecAluOpRRImm5::VslidedownVI
+            | VecAluOpRRImm5::VrgatherVI => true,
             VecAluOpRRImm5::VaddVI
             | VecAluOpRRImm5::VrsubVI
             | VecAluOpRRImm5::VandVI
@@ -444,6 +458,14 @@ impl VecAluOpRRImm5 {
             | VecAluOpRRImm5::VsaddVI => false,
         }
     }
+
+    /// Returns `true` when this op does not allow its destination register to
+    /// overlap a source register.
+    ///
+    /// Only `VrgatherVI` reports `true`; every other op reports `false`.
+    pub fn forbids_src_dst_overlaps(&self) -> bool {
+        matches!(self, VecAluOpRRImm5::VrgatherVI)
+    }
 }
 
 impl fmt::Display for VecAluOpRRImm5 {

@@ -117,6 +117,7 @@
   (VmergeVVM)
   (VredmaxuVS)
   (VredminuVS)
+  (VrgatherVV)
 
   ;; Vector-Scalar Opcodes
   (VaddVX)
@@ -145,6 +146,7 @@
   (VfrdivVF)
   (VmergeVXM)
   (VfmergeVFM)
+  (VrgatherVX)
   (VmsltVX)
 ))
 
@@ -163,6 +165,7 @@
   (VxorVI)
   (VslidedownVI)
   (VmergeVIM)
+  (VrgatherVI)
 ))
 
 ;; Imm only ALU Ops
@@ -718,6 +721,25 @@
 (rule (rv_vredmaxu_vs vs2 vs1 mask vstate)
   (vec_alu_rrr (VecAluOpRRR.VredmaxuVS) vs2 vs1 mask vstate))
 
+;; Helper for emitting the `vrgather.vv` instruction.
+;;
+;; Each destination element is looked up in `vs2` using the index held in the
+;; matching element of `vs1`; an index of VLMAX or more yields 0:
+;;
+;; vd[i] = (vs1[i] >= VLMAX) ? 0 : vs2[vs1[i]]
+(decl rv_vrgather_vv (VReg VReg VecOpMasking VState) VReg)
+(rule (rv_vrgather_vv vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VrgatherVV) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vrgather.vx` instruction.
+;;
+;; Every destination element is looked up in `vs2` using the single index held
+;; in an X register (the argument named `vs1` in this helper); an index of
+;; VLMAX or more yields 0:
+;;
+;; vd[i] = (x[rs1] >= VLMAX) ? 0 : vs2[x[rs1]]
+(decl rv_vrgather_vx (VReg XReg VecOpMasking VState) VReg)
+(rule (rv_vrgather_vx vs2 vs1 mask vstate)
+  (vec_alu_rrr (VecAluOpRRR.VrgatherVX) vs2 vs1 mask vstate))
+
+;; Helper for emitting the `vrgather.vi` instruction.
+;;
+;; Gathers from `vs2` using the `UImm5` immediate `imm` as the index.
+;; NOTE(review): presumably vd[i] = (uimm >= VLMAX) ? 0 : vs2[uimm], mirroring
+;; the `.vx` form -- confirm against the RVV spec.
+(decl rv_vrgather_vi (VReg UImm5 VecOpMasking VState) VReg)
+(rule (rv_vrgather_vi vs2 imm mask vstate)
+  (vec_alu_rr_uimm5 (VecAluOpRRImm5.VrgatherVI) vs2 imm mask vstate))
+
 ;; Helper for emitting the `vmslt.vx` (Vector Mask Set Less Than) instruction.
 (decl rv_vmslt_vx (VReg XReg VecOpMasking VState) VReg)
 (rule (rv_vmslt_vx vs2 vs1 mask vstate)

@@ -1407,3 +1407,14 @@
         ;; use the original type as a VState and avoid a state change.
         (x_mask XReg (rv_vmv_xs mask (vstate_from_type $I64X2))))
     (gen_andi x_mask (ty_lane_mask ty))))
+
+;;;; Rules for `swizzle` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+;; General case: the table `x` and the per-lane indices `y` are both vectors, so
+;; lower to `vrgather.vv`.
+(rule 0 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x y)))
+  (rv_vrgather_vv x y (unmasked) ty))
+
+;; The index vector is a `splat` of the scalar `y`: hand `y` to `vrgather.vx` as
+;; its index operand rather than using the splatted vector.
+;; NOTE(review): `y` can be narrower than the X register (e.g. `i8`); confirm
+;; whether its upper register bits must be zero-extended before it is used as an
+;; index here.
+(rule 1 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x (splat y))))
+  (rv_vrgather_vx x y (unmasked) ty))
+
+;; The index vector matches `replicated_uimm5`: encode the index as the
+;; immediate of `vrgather.vi`.
+(rule 2 (lower (has_type (ty_vec_fits_in_register ty) (swizzle x (replicated_uimm5 y))))
+  (rv_vrgather_vi x y (unmasked) ty))
@@ -0,0 +1,121 @@
+test compile precise-output
+set unwind_info=false
+target riscv64 has_v
+
+function %swizzle_i8x16(i8x16, i8x16) -> i8x16 {
+block0(v0: i8x16, v1: i8x16):
+    v2 = swizzle v0, v1
+    return v2
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vle8.v v3,32(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vrgather.vv v6,v1,v3 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v6,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   addi t6, s0, 0x20
+;   .byte 0x87, 0x81, 0x0f, 0x02
+;   .byte 0x57, 0x83, 0x11, 0x32
+;   .byte 0x27, 0x03, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %swizzle_splat_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = swizzle v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vrgather.vx v5,v1,a0 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v5,0(a1) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0xd7, 0x42, 0x15, 0x32
+;   .byte 0xa7, 0x82, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
+function %swizzle_splat_const_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 2
+    v2 = splat.i8x16 v1
+    v3 = swizzle v0, v2
+    return v3
+}
+
+; VCode:
+;   add sp,-16
+;   sd ra,8(sp)
+;   sd fp,0(sp)
+;   mv fp,sp
+; block0:
+;   vle8.v v1,16(fp) #avl=16, #vtype=(e8, m1, ta, ma)
+;   vrgather.vi v4,v1,2 #avl=16, #vtype=(e8, m1, ta, ma)
+;   vse8.v v4,0(a0) #avl=16, #vtype=(e8, m1, ta, ma)
+;   ld ra,8(sp)
+;   ld fp,0(sp)
+;   add sp,+16
+;   ret
+;
+; Disassembled:
+; block0: ; offset 0x0
+;   addi sp, sp, -0x10
+;   sd ra, 8(sp)
+;   sd s0, 0(sp)
+;   ori s0, sp, 0
+; block1: ; offset 0x10
+;   .byte 0x57, 0x70, 0x08, 0xcc
+;   addi t6, s0, 0x10
+;   .byte 0x87, 0x80, 0x0f, 0x02
+;   .byte 0x57, 0x32, 0x11, 0x32
+;   .byte 0x27, 0x02, 0x05, 0x02
+;   ld ra, 8(sp)
+;   ld s0, 0(sp)
+;   addi sp, sp, 0x10
+;   ret
+
@@ -5,6 +5,7 @@ target s390x
 set enable_simd
 target x86_64 has_sse3 has_ssse3 has_sse41
 target x86_64 has_sse3 has_ssse3 has_sse41 has_avx
+target riscv64gc has_v
 
 function %swizzle_i8x16(i8x16, i8x16) -> i8x16 {
 block0(v0: i8x16, v1: i8x16):
@@ -13,3 +14,22 @@ block0(v0: i8x16, v1: i8x16):
 }
 ; run: %swizzle_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], [0 9 15 1 6 13 7 11 10 8 100 12 4 2 3 5]) == [1 10 16 2 7 14 8 12 11 9 0 13 5 3 4 6]
 
+function %swizzle_splat_i8x16(i8x16, i8) -> i8x16 {
+block0(v0: i8x16, v1: i8):
+    v2 = splat.i8x16 v1
+    v3 = swizzle v0, v2
+    return v3
+}
+; All lanes share one index: index 5 selects the input lane holding 6.
+; run: %swizzle_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 5) == [6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]
+; Index 99 is past the 16 input lanes, so every output lane is expected to be 0.
+; run: %swizzle_splat_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16], 99) == [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
+
+
+function %swizzle_splat_const_i8x16(i8x16) -> i8x16 {
+block0(v0: i8x16):
+    v1 = iconst.i8 2
+    v2 = splat.i8x16 v1
+    v3 = swizzle v0, v2
+    return v3
+}
+; The index is the constant 2 splatted to all lanes, so every lane gets the input lane holding 3.
+; run: %swizzle_splat_const_i8x16([1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]) == [3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
+