; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -passes=instcombine -S | FileCheck %s

;
; UNDEF Elts
;
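; Note (summary of the autogenerated checks below): the multiplies simplify to
; zeroinitializer when both inputs are undef, and likewise in the undef_zero
; tests where one input is undef and the other is all zero.
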
define <2 x i64> @undef_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_128(
; CHECK-NEXT: ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_256(
; CHECK-NEXT: ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuludq_512(
; CHECK-NEXT: ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}

define <2 x i64> @undef_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_128(
; CHECK-NEXT: ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> undef, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_256(
; CHECK-NEXT: ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_pmuldq_512(
; CHECK-NEXT: ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> undef, <16 x i32> undef)
  ret <8 x i64> %1
}

define <2 x i64> @undef_zero_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_128(
; CHECK-NEXT: ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> undef, <4 x i32> zeroinitializer)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_256(
; CHECK-NEXT: ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> undef)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuludq_512(
; CHECK-NEXT: ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> undef, <16 x i32> zeroinitializer)
  ret <8 x i64> %1
}

define <2 x i64> @undef_zero_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_128(
; CHECK-NEXT: ret <2 x i64> zeroinitializer
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> zeroinitializer, <4 x i32> undef)
  ret <2 x i64> %1
}

define <4 x i64> @undef_zero_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_256(
; CHECK-NEXT: ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> undef, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @undef_zero_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @undef_zero_pmuldq_512(
; CHECK-NEXT: ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> undef)
  ret <8 x i64> %1
}

;
; Constant Folding
;
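; Worked example for the fold_pmuludq_128 / fold_pmuldq_128 checks below
; (only source lanes 0 and 2 feed the two result elements):
;   pmuludq (zero-extend): result[0] = 4294967295 * 2147483647 = 9223372030412324865
;                          result[1] = 4294967295 * 1          = 4294967295
;   pmuldq (sign-extend):  result[1] = (-1) * (-2) = 2, and the undef lane 0
;                          constant-folds to 0, matching the CHECK lines.
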
define <2 x i64> @fold_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_128(
; CHECK-NEXT: ret <2 x i64> <i64 9223372030412324865, i64 4294967295>
;
  %1 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 2147483647, i32 1, i32 1, i32 3>)
  ret <2 x i64> %1
}

define <4 x i64> @fold_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_256(
; CHECK-NEXT: ret <4 x i64> zeroinitializer
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> zeroinitializer, <8 x i32> zeroinitializer)
  ret <4 x i64> %1
}

define <8 x i64> @fold_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuludq_512(
; CHECK-NEXT: ret <8 x i64> <i64 0, i64 0, i64 255, i64 131070, i64 0, i64 -281474976645121, i64 140737488289792, i64 281470681743360>
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> <i32 0, i32 0, i32 undef, i32 0, i32 1, i32 1, i32 2, i32 2, i32 undef, i32 undef, i32 -1, i32 -1, i32 65536, i32 -1, i32 -65536, i32 undef>, <16 x i32> <i32 undef, i32 undef, i32 undef, i32 1, i32 255, i32 -256, i32 65535, i32 -65536, i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <8 x i64> %1
}

define <2 x i64> @fold_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_128(
; CHECK-NEXT: ret <2 x i64> <i64 0, i64 2>
;
  %1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> <i32 undef, i32 -1, i32 -1, i32 -1>, <4 x i32> <i32 undef, i32 1, i32 -2, i32 3>)
  ret <2 x i64> %1
}

define <4 x i64> @fold_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_256(
; CHECK-NEXT: ret <4 x i64> <i64 0, i64 4294836225, i64 140737488289792, i64 -140737488355328>
;
  %1 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> <i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>, <8 x i32> <i32 0, i32 -1, i32 -65535, i32 -65535, i32 2147483647, i32 2147483648, i32 65536, i32 -65535>)
  ret <4 x i64> %1
}

define <8 x i64> @fold_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @fold_pmuldq_512(
; CHECK-NEXT: ret <8 x i64> zeroinitializer
;
  %1 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> zeroinitializer, <16 x i32> <i32 undef, i32 -1, i32 -3, i32 -1, i32 8, i32 10, i32 -256, i32 65536, i32 undef, i32 1, i32 -65535, i32 128, i32 65536, i32 2147483647, i32 -2147483648, i32 65536>)
  ret <8 x i64> %1
}

;
; PMULUDQ/PMULDQ - only the even elements (0, 2, 4, 6) of the vXi32 inputs are required.
;
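; As the CHECK lines below show, the intrinsic calls are rewritten as generic IR:
; PMULUDQ becomes bitcast + 'and' with 4294967295 (keep the low 32 bits of each
; 64-bit lane) + 'mul nuw', while PMULDQ becomes bitcast + 'shl'/'ashr exact' by
; 32 (sign-extend the low 32 bits) + 'mul nsw'. In the 128-bit PMULUDQ test the
; trailing splat only demands result element 0, so the mask constant for the
; undemanded lane is a poison placeholder and unused shuffle indices are undef.
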
define <2 x i64> @test_demanded_elts_pmuludq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_128(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 undef, i32 undef>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP3]], <i64 4294967295, i64 poison>
; CHECK-NEXT: [[TMP6:%.*]] = and <2 x i64> [[TMP4]], <i64 4294967295, i64 poison>
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <2 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: ret <2 x i64> [[TMP8]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %1, <4 x i32> %2)
  %4 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
  ret <2 x i64> %4
}

define <4 x i64> @test_demanded_elts_pmuludq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_256(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <4 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: ret <4 x i64> [[TMP7]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %1, <8 x i32> %2)
  ret <4 x i64> %3
}

define <8 x i64> @test_demanded_elts_pmuludq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuludq_512(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = and <8 x i64> [[TMP3]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT: [[TMP6:%.*]] = and <8 x i64> [[TMP4]], <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
; CHECK-NEXT: [[TMP7:%.*]] = mul nuw <8 x i64> [[TMP5]], [[TMP6]]
; CHECK-NEXT: ret <8 x i64> [[TMP7]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32> %1, <16 x i32> %2)
  ret <8 x i64> %3
}

define <2 x i64> @test_demanded_elts_pmuldq_128(<4 x i32> %a0, <4 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_128(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = shl <2 x i64> [[TMP3]], <i64 32, i64 32>
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <2 x i64> [[TMP5]], <i64 32, i64 32>
; CHECK-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP4]], <i64 32, i64 32>
; CHECK-NEXT: [[TMP8:%.*]] = ashr exact <2 x i64> [[TMP7]], <i64 32, i64 32>
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <2 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT: ret <2 x i64> [[TMP9]]
;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  %3 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %1, <4 x i32> %2)
  ret <2 x i64> %3
}

define <4 x i64> @test_demanded_elts_pmuldq_256(<8 x i32> %a0, <8 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_256(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A0:%.*]], <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <8 x i32> [[TMP1]] to <4 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = shl <4 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <4 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP8:%.*]] = ashr exact <4 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
; CHECK-NEXT: ret <4 x i64> [[TMP10]]
;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7>
  %3 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %1, <8 x i32> %2)
  %4 = shufflevector <4 x i64> %3, <4 x i64> undef, <4 x i32> <i32 0, i32 0, i32 3, i32 3>
  ret <4 x i64> %4
}

define <8 x i64> @test_demanded_elts_pmuldq_512(<16 x i32> %a0, <16 x i32> %a1) {
; CHECK-LABEL: @test_demanded_elts_pmuldq_512(
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[A0:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i32> [[TMP1]] to <8 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x i32> [[TMP2]] to <8 x i64>
; CHECK-NEXT: [[TMP5:%.*]] = shl <8 x i64> [[TMP3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP6:%.*]] = ashr exact <8 x i64> [[TMP5]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP7:%.*]] = shl <8 x i64> [[TMP4]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP8:%.*]] = ashr exact <8 x i64> [[TMP7]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <8 x i64> [[TMP6]], [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
; CHECK-NEXT: ret <8 x i64> [[TMP10]]
;
  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>
  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  %3 = call <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32> %1, <16 x i32> %2)
  %4 = shufflevector <8 x i64> %3, <8 x i64> undef, <8 x i32> <i32 0, i32 0, i32 3, i32 3, i32 4, i32 4, i32 7, i32 7>
  ret <8 x i64> %4
}

declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone

declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone

declare <8 x i64> @llvm.x86.avx512.pmulu.dq.512(<16 x i32>, <16 x i32>) nounwind readnone
declare <8 x i64> @llvm.x86.avx512.pmul.dq.512(<16 x i32>, <16 x i32>) nounwind readnone