diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index d8c4e7b69c15382..42a73cd7d839862 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43742,17 +43742,6 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
   if (VT.is512BitVector())
     return SDValue();
 
-  // PreAVX512, without mask-registers, attempt to sign-extend bool vectors to
-  // allow us to use BLENDV.
-  if (!Subtarget.hasAVX512() && BitWidth == 1) {
-    EVT CondVT = VT.changeVectorElementTypeToInteger();
-    if (SDValue ExtCond = combineToExtendBoolVectorInReg(
-            ISD::SIGN_EXTEND, SDLoc(N), CondVT, Cond, DAG, DCI, Subtarget)) {
-      return DAG.getNode(X86ISD::BLENDV, SDLoc(N), VT, ExtCond,
-                         N->getOperand(1), N->getOperand(2));
-    }
-  }
-
   // Don't optimize before the condition has been transformed to a legal type
   // and don't ever optimize vector selects that map to AVX512 mask-registers.
   if (BitWidth < 8 || BitWidth > 64)
@@ -44235,7 +44224,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   // If this an avx512 target we can improve the use of zero masking by
   // swapping the operands and inverting the condition.
   if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
-      Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+      Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
       ISD::isBuildVectorAllZeros(LHS.getNode()) &&
       !ISD::isBuildVectorAllZeros(RHS.getNode())) {
     // Invert the cond to not(cond) : xor(op,allones)=not(op)
@@ -44244,6 +44233,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
   }
 
+  // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might
+  // get split by legalization.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST &&
+      CondVT.getVectorElementType() == MVT::i1 && Cond.hasOneUse() &&
+      TLI.isTypeLegal(VT.getScalarType())) {
+    EVT ExtCondVT = VT.changeVectorElementTypeToInteger();
+    if (SDValue ExtCond = combineToExtendBoolVectorInReg(
+            ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) {
+      ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond);
+      return DAG.getSelect(DL, VT, ExtCond, LHS, RHS);
+    }
+  }
+
   // Early exit check
   if (!TLI.isTypeLegal(VT))
     return SDValue();
diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll
index 470d3eab16ea56f..68348e0c1a52e96 100644
--- a/llvm/test/CodeGen/X86/vector-bo-select.ll
+++ b/llvm/test/CodeGen/X86/vector-bo-select.ll
@@ -540,51 +540,16 @@ define <8 x float> @fadd_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
 define <8 x double> @fadd_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
 ; AVX2-LABEL: fadd_v8f64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm6, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
 ; AVX2-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vaddpd %ymm3, %ymm1, %ymm1
@@ -609,8 +574,7 @@ define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vsubps %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -636,52 +600,15 @@ define <8 x float> @fsub_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
 define <8 x double> @fsub_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
 ; AVX2-LABEL: fsub_v8f64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vsubpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vsubpd %ymm3, %ymm1, %ymm1
@@ -734,51 +661,16 @@ define <8 x float> @fmul_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
 define <8 x double> @fmul_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
 ; AVX2-LABEL: fmul_v8f64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm6, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
 ; AVX2-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
@@ -831,51 +723,16 @@ define <8 x float> @fdiv_v8f32_cast_cond(i8 noundef zeroext %pb, <8 x float> nou
 define <8 x double> @fdiv_v8f64_cast_cond(i8 noundef zeroext %pb, <8 x double> noundef %x, <8 x double> noundef %y) {
 ; AVX2-LABEL: fdiv_v8f64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm6, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
 ; AVX2-NEXT:    vdivpd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vdivpd %ymm3, %ymm1, %ymm1
@@ -964,8 +821,7 @@ define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -992,52 +848,15 @@ define <8 x i32> @add_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @add_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: add_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddq %ymm3, %ymm1, %ymm1
@@ -1191,8 +1010,7 @@ define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1218,52 +1036,15 @@ define <8 x i32> @sub_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @sub_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: sub_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubq %ymm3, %ymm1, %ymm1
@@ -1386,51 +1167,16 @@ define <8 x i32> @mul_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @mul_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: mul_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [1,1,1,1]
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm3, %ymm6, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vblendvpd %ymm4, %ymm2, %ymm6, %ymm2
 ; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm4
 ; AVX2-NEXT:    vpmuludq %ymm2, %ymm4, %ymm4
@@ -1606,8 +1352,7 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1634,52 +1379,15 @@ define <8 x i32> @shl_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @shl_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: shl_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsllvq %ymm3, %ymm1, %ymm1
@@ -1833,8 +1541,7 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -1861,52 +1568,15 @@ define <8 x i32> @lshr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @lshr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: lshr_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsrlvq %ymm3, %ymm1, %ymm1
@@ -2060,8 +1730,7 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm2
-; AVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT:    vblendvps %ymm2, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm2, %ymm1
 ; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -2088,52 +1757,15 @@ define <8 x i32> @ashr_v8i32_cast_cond(i8 noundef zeroext %pb, <8 x i32> noundef
 define <8 x i64> @ashr_v8i64_cast_cond(i8 noundef zeroext %pb, <8 x i64> noundef %x, <8 x i64> noundef %y) {
 ; AVX2-LABEL: ashr_v8i64_cast_cond:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    movl %edi, %ecx
-; AVX2-NEXT:    andb $1, %cl
-; AVX2-NEXT:    movzbl %cl, %ecx
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $2, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $3, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $4, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $5, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $10, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    movl %edi, %eax
-; AVX2-NEXT:    shrb $6, %al
-; AVX2-NEXT:    andb $1, %al
-; AVX2-NEXT:    movzbl %al, %eax
-; AVX2-NEXT:    vpinsrb $12, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    shrb $7, %dil
-; AVX2-NEXT:    movzbl %dil, %eax
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm5, %xmm5
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
-; AVX2-NEXT:    vpslld $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpsrad $31, %xmm5, %xmm5
-; AVX2-NEXT:    vpmovsxdq %xmm5, %ymm5
+; AVX2-NEXT:    vmovd %edi, %xmm4
+; AVX2-NEXT:    vpbroadcastb %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [16,32,64,128]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm6
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vpand %ymm3, %ymm5, %ymm3
-; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpslld $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpsrad $31, %xmm4, %xmm4
-; AVX2-NEXT:    vpmovsxdq %xmm4, %ymm4
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm5 = [1,2,4,8]
+; AVX2-NEXT:    vpand %ymm5, %ymm4, %ymm4
+; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm4, %ymm4
 ; AVX2-NEXT:    vpand %ymm2, %ymm4, %ymm2
 ; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
 ; AVX2-NEXT:    vpsrlvq %ymm2, %ymm4, %ymm5