From 87779fd823bb0d619c26449db91c7504ce7513c5 Mon Sep 17 00:00:00 2001
From: Chia
Date: Fri, 29 Dec 2023 17:15:47 +0900
Subject: [PATCH] [RISCV][ISel] Remove redundant min/max in saturating
 truncation (#75145)

This patch closes #73424, a missed-optimization case similar to #68466 on
X86.

## Source Code
```
define void @trunc_sat_i8i16(ptr %x, ptr %y) {
  %1 = load <8 x i16>, ptr %x, align 16
  %2 = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %1, <8 x i16> <i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128, i16 -128>)
  %3 = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %2, <8 x i16> <i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127, i16 127>)
  %4 = trunc <8 x i16> %3 to <8 x i8>
  store <8 x i8> %4, ptr %y, align 8
  ret void
}
```

## Before this patch:
```
trunc_sat_i8i16: # @trunc_maxmin_id_i8i16
  vsetivli zero, 8, e16, m1, ta, ma
  vle16.v v8, (a0)
  li a0, -128
  vmax.vx v8, v8, a0
  li a0, 127
  vmin.vx v8, v8, a0
  vsetvli zero, zero, e8, mf2, ta, ma
  vnsrl.wi v8, v8, 0
  vse8.v v8, (a1)
  ret
```

## After this patch:
```
trunc_sat_i8i16: # @trunc_maxmin_id_i8i16
  vsetivli zero, 8, e8, mf2, ta, ma
  vle16.v v8, (a0)
  csrwi vxrm, 0
  vnclip.wi v8, v8, 0
  vse8.v v8, (a1)
  ret
```
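
## Unsigned example

The same patterns also fold the unsigned form, `umin` plus truncation, into a
single `vnclipu`. A minimal sketch, mirroring the new `trunc_sat_u8u16_min`
test added by this patch (the clamp bound is assumed to be a 255 splat, the
unsigned i8 maximum):
```
define void @trunc_sat_u8u16_min(ptr %x, ptr %y) {
  %1 = load <4 x i16>, ptr %x, align 16
  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
  %3 = trunc <4 x i16> %2 to <4 x i8>
  store <4 x i8> %3, ptr %y, align 8
  ret void
}
```
Per the CHECK lines in the new test, this now lowers to:
```
trunc_sat_u8u16_min:
  vsetivli zero, 4, e8, mf4, ta, ma
  vle16.v v8, (a0)
  csrwi vxrm, 0
  vnclipu.wi v8, v8, 0
  vse8.v v8, (a1)
  ret
```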
---
 .../Target/RISCV/RISCVInstrInfoVVLPatterns.td |  58 +++
 .../CodeGen/RISCV/rvv/fpclamptosat_vec.ll     | 170 +++-----
 llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll | 394 ++++++++++++++++++
 3 files changed, 500 insertions(+), 122 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll

diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
index 33bdc3366aa3e3..5b50a4a78c018b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td
@@ -2338,6 +2338,64 @@ defm : VPatBinaryVL_VV_VX_VI;
 defm : VPatBinaryVL_VV_VX;
 defm : VPatBinaryVL_VV_VX;
 
+// 12.5. Vector Narrowing Fixed-Point Clip Instructions
+class VPatTruncSatClipMaxMinBase<string inst,
+                                 VTypeInfo vti,
+                                 VTypeInfo wti,
+                                 SDPatternOperator op1,
+                                 int op1_value,
+                                 SDPatternOperator op2,
+                                 int op2_value> :
+  Pat<(vti.Vector (riscv_trunc_vector_vl
+        (wti.Vector (op1
+          (wti.Vector (op2
+            (wti.Vector wti.RegClass:$rs1),
+            (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op2_value, (XLenVT srcvalue))),
+            (wti.Vector undef),(wti.Mask V0), VLOpFrag)),
+          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), op1_value, (XLenVT srcvalue))),
+          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+        (vti.Mask V0), VLOpFrag)),
+      (!cast<Instruction>(inst#"_WI_"#vti.LMul.MX#"_MASK")
+        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+
+class VPatTruncSatClipUMin<VTypeInfo vti,
+                           VTypeInfo wti,
+                           int uminval> :
+  Pat<(vti.Vector (riscv_trunc_vector_vl
+        (wti.Vector (riscv_umin_vl
+          (wti.Vector wti.RegClass:$rs1),
+          (wti.Vector (riscv_vmv_v_x_vl (wti.Vector undef), uminval, (XLenVT srcvalue))),
+          (wti.Vector undef), (wti.Mask V0), VLOpFrag)),
+        (vti.Mask V0), VLOpFrag)),
+      (!cast<Instruction>("PseudoVNCLIPU_WI_"#vti.LMul.MX#"_MASK")
+        (vti.Vector (IMPLICIT_DEF)), wti.RegClass:$rs1, 0,
+        (vti.Mask V0), 0, GPR:$vl, vti.Log2SEW, TA_MA)>;
+
+multiclass VPatTruncSatClipMaxMin<string inst, VTypeInfo vti, VTypeInfo wti,
+    SDPatternOperator op1, int op1_value, SDPatternOperator op2, int op2_value> {
+  def : VPatTruncSatClipMaxMinBase<inst, vti, wti, op1, op1_value, op2, op2_value>;
+  def : VPatTruncSatClipMaxMinBase<inst, vti, wti, op2, op2_value, op1, op1_value>;
+}
+
+multiclass VPatTruncSatClip<VTypeInfo vti, VTypeInfo wti> {
+  defvar sew = vti.SEW;
+  defvar uminval = !sub(!shl(1, sew), 1);
+  defvar sminval = !sub(!shl(1, !sub(sew, 1)), 1);
+  defvar smaxval = !sub(0, !shl(1, !sub(sew, 1)));
+
+  let Predicates = !listconcat(GetVTypePredicates<vti>.Predicates,
+                               GetVTypePredicates<wti>.Predicates) in {
+    defm : VPatTruncSatClipMaxMin<"PseudoVNCLIP", vti, wti, riscv_smin_vl,
+                                  sminval, riscv_smax_vl, smaxval>;
+    def : VPatTruncSatClipUMin<vti, wti, uminval>;
+  }
+
+}
+
+foreach vtiToWti = AllWidenableIntVectors in
+  defm : VPatTruncSatClip<vtiToWti.Vti, vtiToWti.Wti>;
+
 // 13. Vector Floating-Point Instructions
 // 13.2. Vector Single-Width Floating-Point Add/Subtract Instructions
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
index 7497051027fa37..e1ebf2afda657e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fpclamptosat_vec.ll
@@ -39,12 +39,9 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a1
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i64>
@@ -79,11 +76,9 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <2 x double> %x to <2 x i64>
@@ -198,13 +193,8 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT:    vmin.vx v8, v10, a1
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
-; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <4 x float> %x to <4 x i64>
@@ -257,12 +247,8 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
-; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <4 x float> %x to <4 x i64>
@@ -510,12 +496,9 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v10, a1
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -682,11 +665,9 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -925,13 +906,9 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v9, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v9, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
@@ -966,11 +943,9 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vfncvt.rtz.xu.f.w v9, v8
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v8, v9, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v9, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <2 x double> %x to <2 x i32>
@@ -1087,13 +1062,9 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
@@ -1146,11 +1117,9 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <4 x float> %x to <4 x i32>
@@ -1525,13 +1494,9 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v10, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -1808,11 +1773,9 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -3385,12 +3348,9 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a1
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i64>
@@ -3423,11 +3383,9 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <2 x double> %x to <2 x i64>
@@ -3539,13 +3497,8 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfwcvt.rtz.x.f.v v10, v8
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT:    vmin.vx v8, v10, a1
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
-; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <4 x float> %x to <4 x i64>
@@ -3596,12 +3549,8 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfwcvt.rtz.xu.f.v v10, v8
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
-; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <4 x float> %x to <4 x i64>
@@ -3846,12 +3795,9 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    lui a0, 524288
-; CHECK-V-NEXT:    addiw a1, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v10, a1
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -4016,11 +3962,9 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, a0, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 3
-; CHECK-V-NEXT:    li a0, -1
-; CHECK-V-NEXT:    srli a0, a0, 32
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 2
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -4256,13 +4200,9 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vfncvt.rtz.x.f.w v9, v8
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v9, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v9, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
@@ -4295,11 +4235,9 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-V-NEXT:    vfncvt.rtz.xu.f.w v9, v8
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v8, v9, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v9, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <2 x double> %x to <2 x i32>
@@ -4413,13 +4351,9 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.x.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v8, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
@@ -4470,11 +4404,9 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) {
 ; CHECK-V:       # %bb.0: # %entry
 ; CHECK-V-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-V-NEXT:    vfcvt.rtz.xu.f.v v8, v8
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v8, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v8, 0
 ; CHECK-V-NEXT:    ret
 entry:
   %conv = fptoui <4 x float> %x to <4 x i32>
@@ -4846,13 +4778,9 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-V-NEXT:    lui a0, 8
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vmin.vx v8, v10, a0
-; CHECK-V-NEXT:    lui a0, 1048568
-; CHECK-V-NEXT:    vmax.vx v10, v8, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclip.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
@@ -5125,11 +5053,9 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) {
 ; CHECK-V-NEXT:    addi a0, sp, 16
 ; CHECK-V-NEXT:    vl2r.v v10, (a0) # Unknown-size Folded Reload
 ; CHECK-V-NEXT:    vslideup.vi v10, v8, 7
-; CHECK-V-NEXT:    lui a0, 16
-; CHECK-V-NEXT:    addi a0, a0, -1
-; CHECK-V-NEXT:    vminu.vx v10, v10, a0
 ; CHECK-V-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
-; CHECK-V-NEXT:    vnsrl.wi v8, v10, 0
+; CHECK-V-NEXT:    csrwi vxrm, 0
+; CHECK-V-NEXT:    vnclipu.wi v8, v10, 0
 ; CHECK-V-NEXT:    csrr a0, vlenb
 ; CHECK-V-NEXT:    slli a0, a0, 1
 ; CHECK-V-NEXT:    add sp, sp, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll
new file mode 100644
index 00000000000000..e12c9e515a9fd4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/trunc-sat-clip.ll
@@ -0,0 +1,394 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s
+
+declare <4 x i16> @llvm.smax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.smin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.smin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.smax.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.smin.v4i64(<4 x i64>, <4 x i64>)
+
+declare <4 x i16> @llvm.umax.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i16> @llvm.umin.v4i16(<4 x i16>, <4 x i16>)
+declare <4 x i32> @llvm.umax.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.umin.v4i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.umax.v4i64(<4 x i64>, <4 x i64>)
+declare <4 x i64> @llvm.umin.v4i64(<4 x i64>, <4 x i64>)
+
+define void @trunc_sat_i8i16_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %1, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>)
+  %3 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %2, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i16_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %1, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+  %3 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %2, <4 x i16> <i16 -128, i16 -128, i16 -128, i16 -128>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_i8i16_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i8i16_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    li a0, -127
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 128
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.smax.v4i16(<4 x i16> %1, <4 x i16> <i16 -127, i16 -127, i16 -127, i16 -127>)
+  %3 = tail call <4 x i16> @llvm.smin.v4i16(<4 x i16> %2, <4 x i16> <i16 128, i16 128, i16 128, i16 128>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u16_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+  %3 = trunc <4 x i16> %2 to <4 x i8>
+  store <4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u16_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    li a0, 127
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 127, i16 127, i16 127, i16 127>)
+  %3 = trunc <4 x i16> %2 to <4 x i8>
+  store <4 x i8> %3, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u16_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.umax.v4i16(<4 x i16> %1, <4 x i16> <i16 0, i16 0, i16 0, i16 0>)
+  %3 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %2, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+define void @trunc_sat_u8u16_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u8u16_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i16>, ptr %x, align 16
+  %2 = tail call <4 x i16> @llvm.umin.v4i16(<4 x i16> %1, <4 x i16> <i16 255, i16 255, i16 255, i16 255>)
+  %3 = tail call <4 x i16> @llvm.umax.v4i16(<4 x i16> %2, <4 x i16> <i16 0, i16 0, i16 0, i16 0>)
+  %4 = trunc <4 x i16> %3 to <4 x i8>
+  store <4 x i8> %4, ptr %y, align 8
+  ret void
+}
+
+
+define void @trunc_sat_i16i32_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 1048568
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 -32767, i32 -32767, i32 -32767, i32 -32767>)
+  %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 32768, i32 32768, i32 32768, i32 32768>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_i16i32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %1, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+  %3 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %2, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_i16i32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i16i32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %1, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+  %3 = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %2, <4 x i32> <i32 -32768, i32 -32768, i32 -32768, i32 -32768>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u32_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    lui a0, 8
+; CHECK-NEXT:    addi a0, a0, -1
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vnsrl.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 32767, i32 32767, i32 32767, i32 32767>)
+  %3 = trunc <4 x i32> %2 to <4 x i16>
+  store <4 x i16> %3, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u32_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %3 = trunc <4 x i32> %2 to <4 x i16>
+  store <4 x i16> %3, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u32_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %3 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %2, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+define void @trunc_sat_u16u32_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u16u32_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v8, v8, 0
+; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i32>, ptr %x, align 32
+  %2 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %1, <4 x i32> <i32 65535, i32 65535, i32 65535, i32 65535>)
+  %3 = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %2, <4 x i32> <i32 0, i32 0, i32 0, i32 0>)
+  %4 = trunc <4 x i32> %3 to <4 x i16>
+  store <4 x i16> %4, ptr %y, align 16
+  ret void
+}
+
+
+define void @trunc_sat_i32i64_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    addiw a0, a0, 1
+; CHECK-NEXT:    vmax.vx v8, v8, a0
+; CHECK-NEXT:    li a0, 1
+; CHECK-NEXT:    slli a0, a0, 31
+; CHECK-NEXT:    vmin.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -2147483647, i64 -2147483647, i64 -2147483647, i64 -2147483647>)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 2147483648, i64 2147483648, i64 2147483648, i64 2147483648>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 32
+  ret void
+}
+
+define void @trunc_sat_i32i64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %1, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>)
+  %3 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %2, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 32
+  ret void
+}
+
+define void @trunc_sat_i32i64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_i32i64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclip.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.smin.v4i64(<4 x i64> %1, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+  %3 = tail call <4 x i64> @llvm.smax.v4i64(<4 x i64> %2, <4 x i64> <i64 -2147483648, i64 -2147483648, i64 -2147483648, i64 -2147483648>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 32
+  ret void
+}
+
+
+define void @trunc_sat_u32u64_notopt(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_notopt:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    lui a0, 524288
+; CHECK-NEXT:    addiw a0, a0, -1
+; CHECK-NEXT:    vminu.vx v8, v8, a0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vnsrl.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>)
+  %3 = trunc <4 x i64> %2 to <4 x i32>
+  store <4 x i32> %3, ptr %y, align 32
+  ret void
+}
+
+define void @trunc_sat_u32u64_min(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_min:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+  %3 = trunc <4 x i64> %2 to <4 x i32>
+  store <4 x i32> %3, ptr %y, align 32
+  ret void
+}
+
+
+define void @trunc_sat_u32u64_maxmin(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_maxmin:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %1, <4 x i64> <i64 0, i64 0, i64 0, i64 0>)
+  %3 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %2, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 32
+  ret void
+}
+
+define void @trunc_sat_u32u64_minmax(ptr %x, ptr %y) {
+; CHECK-LABEL: trunc_sat_u32u64_minmax:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    csrwi vxrm, 0
+; CHECK-NEXT:    vnclipu.wi v10, v8, 0
+; CHECK-NEXT:    vse32.v v10, (a1)
+; CHECK-NEXT:    ret
+  %1 = load <4 x i64>, ptr %x, align 64
+  %2 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %1, <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>)
+  %3 = tail call <4 x i64> @llvm.umax.v4i64(<4 x i64> %2, <4 x i64> <i64 0, i64 0, i64 0, i64 0>)
+  %4 = trunc <4 x i64> %3 to <4 x i32>
+  store <4 x i32> %4, ptr %y, align 32
+  ret void
+}