From f471f6ff2f02a5081f89e0daf18c2ee9f3dc103d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Wed, 1 Nov 2023 10:33:22 +0000
Subject: [PATCH] [X86] combineTruncateWithSat - relax minimum truncation size for PACKSS/PACKUS

truncateVectorWithPACK handling of sub-128-bit result types was improved
some time ago, so remove the old 64-bit limit.

Fixes #68466
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |   6 +-
 llvm/test/CodeGen/X86/fpclamptosat_vec.ll     | 259 ++++--------------
 .../CodeGen/X86/masked_store_trunc_ssat.ll    | 143 +++-------
 llvm/test/CodeGen/X86/vector-trunc-packus.ll  | 180 +++---------
 llvm/test/CodeGen/X86/vector-trunc-ssat.ll    | 170 +++---------
 5 files changed, 159 insertions(+), 599 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 18f6a695e4502e..9a3e1e9bd3233c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -49604,14 +49604,12 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
                       (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) &&
                       !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256);
 
-  if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 &&
-      VT.getSizeInBits() >= 64 &&
+  if (!PreferAVX512 && VT.getVectorNumElements() > 1 &&
+      isPowerOf2_32(VT.getVectorNumElements()) &&
       (SVT == MVT::i8 || SVT == MVT::i16) &&
       (InSVT == MVT::i16 || InSVT == MVT::i32)) {
     if (SDValue USatVal = detectSSatPattern(In, VT, true)) {
       // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
-      // Only do this when the result is at least 64 bits or we'll leaving
-      // dangling PACKSSDW nodes.
       if (SVT == MVT::i8 && InSVT == MVT::i32) {
         EVT MidVT = VT.changeVectorElementType(MVT::i16);
         SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 017fe14366bd67..78ccc983d1637a 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -1092,38 +1092,14 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i16:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: stest_f64i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: stest_f64i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: stest_f64i16:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 32767, i32 32767>
@@ -1198,24 +1174,11 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) nounwind {
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: ustest_f64i16:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ustest_f64i16:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ustest_f64i16:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 65535, i32 65535>
@@ -1652,40 +1615,16 @@ define <2 x i8> @stest_f64i8(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i8:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = <127,127,u,u>
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = <4294967168,4294967168,u,u>
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
 ; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: packsswb %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: stest_f64i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: stest_f64i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: stest_f64i8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 127, i32 127>
@@ -1748,39 +1687,16 @@ define <2 x i8> @ustest_f64i8(<2 x double> %x) nounwind {
 ; SSE-LABEL: ustest_f64i8:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = <255,255,u,u>
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: packuswb %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: ustest_f64i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ustest_f64i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdb %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ustest_f64i8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %0 = icmp slt <2 x i32> %conv, <i32 255, i32 255>
@@ -1795,37 +1711,16 @@ define <4 x i8> @stest_f32i8(<4 x float> %x) nounwind {
 ; SSE-LABEL: stest_f32i8:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm0, %xmm2
-; SSE-NEXT: pandn %xmm1, %xmm0
-; SSE-NEXT: por %xmm2, %xmm0
 ; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: packsswb %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: stest_f32i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: stest_f32i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512-NEXT: vpmovsdb %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: stest_f32i8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
   %0 = icmp slt <4 x i32> %conv, <i32 127, i32 127, i32 127, i32 127>
@@ -1888,37 +1783,16 @@ define <4 x i8> @ustest_f32i8(<4 x float> %x) nounwind {
 ; SSE-LABEL: ustest_f32i8:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttps2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: movdqa %xmm2, %xmm0
-; SSE-NEXT: pcmpgtd %xmm1, %xmm0
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: packuswb %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: ustest_f32i8:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ustest_f32i8:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttps2dq %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovusdb %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ustest_f32i8:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttps2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <4 x float> %x to <4 x i32>
   %0 = icmp slt <4 x i32> %conv, <i32 255, i32 255, i32 255, i32 255>
@@ -3863,38 +3737,14 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) nounwind {
 ; SSE-LABEL: stest_f64i16_mm:
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0
-; SSE-NEXT: movdqa {{.*#+}} xmm1 = <32767,32767,u,u>
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE-NEXT: pand %xmm2, %xmm0
-; SSE-NEXT: pandn %xmm1, %xmm2
-; SSE-NEXT: por %xmm0, %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm0 = <4294934528,4294934528,u,u>
-; SSE-NEXT: movdqa %xmm2, %xmm1
-; SSE-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE-NEXT: pand %xmm1, %xmm2
-; SSE-NEXT: pandn %xmm0, %xmm1
-; SSE-NEXT: por %xmm2, %xmm1
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
+; SSE-NEXT: packssdw %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: stest_f64i16_mm:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [32767,32767,32767,32767]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294934528,4294934528,4294934528,4294934528]
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: stest_f64i16_mm:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: stest_f64i16_mm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 32767, i32 32767>)
@@ -3966,24 +3816,11 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) nounwind {
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,2,2,3,4,5,6,7]
 ; SSE-NEXT: retq
 ;
-; AVX2-LABEL: ustest_f64i16_mm:
-; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: ustest_f64i16_mm:
-; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm0
-; AVX512-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm0
-; AVX512-NEXT: retq
+; AVX-LABEL: ustest_f64i16_mm:
+; AVX: # %bb.0: # %entry
+; AVX-NEXT: vcvttpd2dq %xmm0, %xmm0
+; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 entry:
   %conv = fptosi <2 x double> %x to <2 x i32>
   %spec.store.select = call <2 x i32> @llvm.smin.v2i32(<2 x i32> %conv, <2 x i32> <i32 65535, i32 65535>)
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 38abaf8ff11c6c..bd1e6d320b69e1 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -5166,25 +5166,13 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-LABEL: truncstore_v4i32_v4i8:
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pxor %xmm2, %xmm2
-; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [127,127,127,127]
-; SSE2-NEXT: movdqa %xmm3, %xmm4
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm4
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: pandn %xmm3, %xmm4
-; SSE2-NEXT: por %xmm0, %xmm4
-; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-NEXT: movdqa %xmm4, %xmm3
-; SSE2-NEXT: pcmpgtd %xmm0, %xmm3
-; SSE2-NEXT: pand %xmm3, %xmm4
-; SSE2-NEXT: pandn %xmm0, %xmm3
-; SSE2-NEXT: por %xmm4, %xmm3
-; SSE2-NEXT: packssdw %xmm3, %xmm3
-; SSE2-NEXT: packsswb %xmm3, %xmm3
+; SSE2-NEXT: packssdw %xmm0, %xmm0
+; SSE2-NEXT: packsswb %xmm0, %xmm0
 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT: movmskps %xmm2, %ecx
 ; SSE2-NEXT: xorl $15, %ecx
 ; SSE2-NEXT: testb $1, %cl
-; SSE2-NEXT: movd %xmm3, %eax
+; SSE2-NEXT: movd %xmm0, %eax
 ; SSE2-NEXT: jne .LBB14_1
 ; SSE2-NEXT: # %bb.2: # %else
 ; SSE2-NEXT: testb $2, %cl
@@ -5219,8 +5207,6 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-LABEL: truncstore_v4i32_v4i8:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: pxor %xmm2, %xmm2
-; SSE4-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE4-NEXT: packssdw %xmm0, %xmm0
 ; SSE4-NEXT: packsswb %xmm0, %xmm0
 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2
@@ -5255,92 +5241,49 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT: pextrb $3, %xmm0, 3(%rdi)
 ; SSE4-NEXT: retq
 ;
-; AVX1-LABEL: truncstore_v4i32_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovmskps %xmm1, %eax
-; AVX1-NEXT: xorl $15, %eax
-; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: jne .LBB14_1
-; AVX1-NEXT: # %bb.2: # %else
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: jne .LBB14_3
-; AVX1-NEXT: .LBB14_4: # %else2
-; AVX1-NEXT: testb $4, %al
-; AVX1-NEXT: jne .LBB14_5
-; AVX1-NEXT: .LBB14_6: # %else4
-; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: jne .LBB14_7
-; AVX1-NEXT: .LBB14_8: # %else6
-; AVX1-NEXT: retq
-; AVX1-NEXT: .LBB14_1: # %cond.store
-; AVX1-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX1-NEXT: testb $2, %al
-; AVX1-NEXT: je .LBB14_4
-; AVX1-NEXT: .LBB14_3: # %cond.store1
-; AVX1-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX1-NEXT: testb $4, %al
-; AVX1-NEXT: je .LBB14_6
-; AVX1-NEXT: .LBB14_5: # %cond.store3
-; AVX1-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; AVX1-NEXT: testb $8, %al
-; AVX1-NEXT: je .LBB14_8
-; AVX1-NEXT: .LBB14_7: # %cond.store5
-; AVX1-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: truncstore_v4i32_v4i8:
-; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [127,127,127,127]
-; AVX2-NEXT: vpminsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-NEXT: vpmaxsd %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vmovmskps %xmm1, %eax
-; AVX2-NEXT: xorl $15, %eax
-; AVX2-NEXT: testb $1, %al
-; AVX2-NEXT: jne .LBB14_1
-; AVX2-NEXT: # %bb.2: # %else
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: jne .LBB14_3
-; AVX2-NEXT: .LBB14_4: # %else2
-; AVX2-NEXT: testb $4, %al
-; AVX2-NEXT: jne .LBB14_5
-; AVX2-NEXT: .LBB14_6: # %else4
-; AVX2-NEXT: testb $8, %al
-; AVX2-NEXT: jne .LBB14_7
-; AVX2-NEXT: .LBB14_8: # %else6
-; AVX2-NEXT: retq
-; AVX2-NEXT: .LBB14_1: # %cond.store
-; AVX2-NEXT: vpextrb $0, %xmm0, (%rdi)
-; AVX2-NEXT: testb $2, %al
-; AVX2-NEXT: je .LBB14_4
-; AVX2-NEXT: .LBB14_3: # %cond.store1
-; AVX2-NEXT: vpextrb $1, %xmm0, 1(%rdi)
-; AVX2-NEXT: testb $4, %al
-; AVX2-NEXT: je .LBB14_6
-; AVX2-NEXT: .LBB14_5: # %cond.store3
-; AVX2-NEXT: vpextrb $2, %xmm0, 2(%rdi)
-; AVX2-NEXT: testb $8, %al
-; AVX2-NEXT: je .LBB14_8
-; AVX2-NEXT: .LBB14_7: # %cond.store5
-; AVX2-NEXT: vpextrb $3, %xmm0, 3(%rdi)
-; AVX2-NEXT: retq
+; AVX-LABEL: truncstore_v4i32_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; AVX-NEXT: vmovmskps %xmm1, %eax
+; AVX-NEXT: xorl $15, %eax
+; AVX-NEXT: testb $1, %al
+; AVX-NEXT: jne .LBB14_1
+; AVX-NEXT: # %bb.2: # %else
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: jne .LBB14_3
+; AVX-NEXT: .LBB14_4: # %else2
+; AVX-NEXT: testb $4, %al
+; AVX-NEXT: jne .LBB14_5
+; AVX-NEXT: .LBB14_6: # %else4
+; AVX-NEXT: testb $8, %al
+; AVX-NEXT: jne .LBB14_7
+; AVX-NEXT: .LBB14_8: # %else6
+; AVX-NEXT: retq
+; AVX-NEXT: .LBB14_1: # %cond.store
+; AVX-NEXT: vpextrb $0, %xmm0, (%rdi)
+; AVX-NEXT: testb $2, %al
+; AVX-NEXT: je .LBB14_4
+; AVX-NEXT: .LBB14_3: # %cond.store1
+; AVX-NEXT: vpextrb $1, %xmm0, 1(%rdi)
+; AVX-NEXT: testb $4, %al
+; AVX-NEXT: je .LBB14_6
+; AVX-NEXT: .LBB14_5: # %cond.store3
+; AVX-NEXT: vpextrb $2, %xmm0, 2(%rdi)
+; AVX-NEXT: testb $8, %al
+; AVX-NEXT: je .LBB14_8
+; AVX-NEXT: .LBB14_7: # %cond.store5
+; AVX-NEXT: vpextrb $3, %xmm0, 3(%rdi)
+; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: truncstore_v4i32_v4i8:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: kmovw %k0, %eax
 ; AVX512F-NEXT: testb $1, %al
 ; AVX512F-NEXT: jne .LBB14_1
@@ -5376,11 +5319,11 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX512BW-LABEL: truncstore_v4i32_v4i8:
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
 ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0
 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0
 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1
-; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1}
 ; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
index a94104a002d5ce..f93f5682df826b 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -4042,94 +4042,28 @@ define <16 x i8> @trunc_packus_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="2
 }
 
 define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"="256" {
-; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: packuswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v4i32_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmaxsd %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm0, %xmm0
-; SSE41-NEXT: packuswb %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v4i32_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512VL-NEXT: vpmovusdb %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; SSE-LABEL: trunc_packus_v4i32_v4i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX512BW-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: trunc_packus_v4i32_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 ;
-; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BWVL-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BWVL-NEXT: vpmovusdb %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc_packus_v4i32_v4i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
 ;
 ; SKX-LABEL: trunc_packus_v4i32_v4i8:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; SKX-NEXT: vpmovusdb %xmm0, %xmm0
+; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; SKX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: retq
   %1 = icmp slt <4 x i32> %a0, <i32 255, i32 255, i32 255, i32 255>
   %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 255, i32 255, i32 255, i32 255>
@@ -4140,71 +4074,25 @@ define <4 x i8> @trunc_packus_v4i32_v4i8(<4 x i32> %a0) "min-legal-vector-width"
 }
 
 define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-SSSE3-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pxor %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: packuswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_packus_v4i32_v4i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm1
-; SSE41-NEXT: pmaxsd %xmm0, %xmm1
-; SSE41-NEXT: packusdw %xmm1, %xmm1
-; SSE41-NEXT: packuswb %xmm1, %xmm1
-; SSE41-NEXT: movd %xmm1, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-SLOW-NEXT: retq
+; SSE-LABEL: trunc_packus_v4i32_v4i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: packuswb %xmm0, %xmm0
+; SSE-NEXT: movd %xmm0, (%rdi)
+; SSE-NEXT: retq
 ;
-; AVX2-FAST-LABEL: trunc_packus_v4i32_v4i8_store:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [255,255,255,255]
-; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: trunc_packus_v4i32_v4i8_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_packus_v4i32_v4i8_store:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpmovusdb %zmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_packus_v4i32_v4i8_store:
@@ -4216,11 +4104,9 @@ define void @trunc_packus_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
 ;
 ; AVX512BW-LABEL: trunc_packus_v4i32_v4i8_store:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512BW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX512BW-NEXT: vpmovusdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: trunc_packus_v4i32_v4i8_store:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index 2f3fdeb74dc473..14f724fc3b8c79 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3777,86 +3777,28 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
 }
 
 define <4 x i8> @trunc_ssat_v4i32_v4i8(<4 x i32> %a0) {
-; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-SSSE3-NEXT: movdqa %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: packssdw %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: packsswb %xmm0, %xmm0
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v4i32_v4i8:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: packssdw %xmm0, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: retq
-;
-; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12]
-; AVX2-FAST-NEXT: retq
-;
-; AVX512F-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0
-; AVX512F-NEXT: vzeroupper
-; AVX512F-NEXT: retq
-;
-; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vpmovsdb %xmm0, %xmm0
-; AVX512VL-NEXT: retq
+; SSE-LABEL: trunc_ssat_v4i32_v4i8:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: retq
 ;
-; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0
-; AVX512BW-NEXT: vzeroupper
-; AVX512BW-NEXT: retq
+; AVX-LABEL: trunc_ssat_v4i32_v4i8:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
 ;
-; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8:
-; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsdb %xmm0, %xmm0
-; AVX512BWVL-NEXT: retq
+; AVX512-LABEL: trunc_ssat_v4i32_v4i8:
+; AVX512: # %bb.0:
+; AVX512-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: retq
 ;
 ; SKX-LABEL: trunc_ssat_v4i32_v4i8:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpmovsdb %xmm0, %xmm0
+; SKX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; SKX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; SKX-NEXT: retq
   %1 = icmp slt <4 x i32> %a0, <i32 127, i32 127, i32 127, i32 127>
   %2 = select <4 x i1> %1, <4 x i32> %a0, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
@@ -3867,70 +3809,25 @@
 }
 
 define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
-; SSE2-SSSE3-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSE2-SSSE3: # %bb.0:
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [127,127,127,127]
-; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0
-; SSE2-SSSE3-NEXT: pandn %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: por %xmm0, %xmm2
-; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm0 = [4294967168,4294967168,4294967168,4294967168]
-; SSE2-SSSE3-NEXT: movdqa %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: pcmpgtd %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: pand %xmm1, %xmm2
-; SSE2-SSSE3-NEXT: pandn %xmm0, %xmm1
-; SSE2-SSSE3-NEXT: por %xmm2, %xmm1
-; SSE2-SSSE3-NEXT: packssdw %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: packsswb %xmm1, %xmm1
-; SSE2-SSSE3-NEXT: movd %xmm1, (%rdi)
-; SSE2-SSSE3-NEXT: retq
-;
-; SSE41-LABEL: trunc_ssat_v4i32_v4i8_store:
-; SSE41: # %bb.0:
-; SSE41-NEXT: pminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: pmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE41-NEXT: packssdw %xmm0, %xmm0
-; SSE41-NEXT: packsswb %xmm0, %xmm0
-; SSE41-NEXT: movd %xmm0, (%rdi)
-; SSE41-NEXT: retq
-;
-; AVX1-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX1: # %bb.0:
-; AVX1-NEXT: vpminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpmaxsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vmovd %xmm0, (%rdi)
-; AVX1-NEXT: retq
-;
-; AVX2-SLOW-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-SLOW-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-SLOW-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-SLOW-NEXT: retq
+; SSE-LABEL: trunc_ssat_v4i32_v4i8_store:
+; SSE: # %bb.0:
+; SSE-NEXT: packssdw %xmm0, %xmm0
+; SSE-NEXT: packsswb %xmm0, %xmm0
+; SSE-NEXT: movd %xmm0, (%rdi)
+; SSE-NEXT: retq
 ;
-; AVX2-FAST-LABEL: trunc_ssat_v4i32_v4i8_store:
-; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [127,127,127,127]
-; AVX2-FAST-NEXT: vpminsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967168,4294967168,4294967168,4294967168]
-; AVX2-FAST-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
-; AVX2-FAST-NEXT: retq
+; AVX-LABEL: trunc_ssat_v4i32_v4i8_store:
+; AVX: # %bb.0:
+; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
+; AVX-NEXT: vmovd %xmm0, (%rdi)
+; AVX-NEXT: retq
 ;
 ; AVX512F-LABEL: trunc_ssat_v4i32_v4i8_store:
 ; AVX512F: # %bb.0:
-; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: vpmovsdb %zmm0, %xmm0
+; AVX512F-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT: vmovd %xmm0, (%rdi)
-; AVX512F-NEXT: vzeroupper
 ; AVX512F-NEXT: retq
 ;
 ; AVX512VL-LABEL: trunc_ssat_v4i32_v4i8_store:
@@ -3940,10 +3837,9 @@ define void @trunc_ssat_v4i32_v4i8_store(<4 x i32> %a0, ptr%p1) {
 ;
 ; AVX512BW-LABEL: trunc_ssat_v4i32_v4i8_store:
 ; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512BW-NEXT: vpmovsdb %zmm0, %xmm0
+; AVX512BW-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
+; AVX512BW-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX512BW-NEXT: vmovd %xmm0, (%rdi)
-; AVX512BW-NEXT: vzeroupper
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512BWVL-LABEL: trunc_ssat_v4i32_v4i8_store:
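Editor's note - an illustrative sketch, not part of the patch: the combine is about a signed-saturation clamp feeding a truncate, which detectSSatPattern() recognizes and which truncateVectorWithPACK can now lower even when the result vector is narrower than 64 bits. A minimal LLVM IR example (hypothetical function name) mirroring the updated trunc_ssat_v4i32_v4i8 test; with this patch a plain SSE2 target emits packssdw+packsswb for it rather than the pcmpgtd/pand/pandn/por clamp sequences visible in the removed CHECK lines above:

  define <4 x i8> @ssat_trunc_sketch(<4 x i32> %a) {
    ; Clamp to the signed i8 range [-128, 127] with smin/smax-style selects.
    %lt = icmp slt <4 x i32> %a, <i32 127, i32 127, i32 127, i32 127>
    %min = select <4 x i1> %lt, <4 x i32> %a, <4 x i32> <i32 127, i32 127, i32 127, i32 127>
    %gt = icmp sgt <4 x i32> %min, <i32 -128, i32 -128, i32 -128, i32 -128>
    %max = select <4 x i1> %gt, <4 x i32> %min, <4 x i32> <i32 -128, i32 -128, i32 -128, i32 -128>
    ; The clamp plus this trunc is the saturation pattern the combine matches.
    %t = trunc <4 x i32> %max to <4 x i8>
    ret <4 x i8> %t
  }

The unsigned-saturation tests (trunc_packus_*, ustest_*) follow the same shape with a [0, 255] clamp and lower to packssdw+packuswb instead.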