llvm-project/llvm/test/CodeGen/X86/pseudo_cmov_lower.ll

; RUN: llc < %s -mtriple=i386-linux-gnu -o - | FileCheck %s 

; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo1:
; CHECK: js
; CHECK-NOT: js
define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {
entry:
  ; Two back-to-back i32 selects keyed on the same sign test of %v1.
  %is.neg = icmp slt i32 %v1, 0
  %pick.first = select i1 %is.neg, i32 %v2, i32 %v3
  %pick.second = select i1 %is.neg, i32 %v1, i32 %v2
  ; Combine both results so that neither select is dead.
  %diff = sub i32 %pick.second, %pick.first
  ret i32 %diff
}

; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. This makes
; sure the code for the lowering for opposite conditions gets tested.
; CHECK-LABEL: foo11:
; CHECK: js
; CHECK-NOT: js
; CHECK-NOT: jns
define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {
entry:
  ; First select: keyed on "%v1 is negative" (slt 0).
  %is.neg = icmp slt i32 %v1, 0
  %pick.first = select i1 %is.neg, i32 %v2, i32 %v3
  ; Second select: keyed on the exactly opposite predicate (sge 0).
  %is.nonneg = icmp sge i32 %v1, 0
  %pick.second = select i1 %is.nonneg, i32 %v1, i32 %v2
  ; Combine both results so that neither select is dead.
  %diff = sub i32 %pick.second, %pick.first
  ret i32 %diff
}

; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo2:
; CHECK: js
; CHECK-NOT: js
define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {
entry:
  ; Two back-to-back i8 selects keyed on the same sign test of %v1.
  %is.neg = icmp slt i8 %v1, 0
  %pick.first = select i1 %is.neg, i8 %v2, i8 %v3
  %pick.second = select i1 %is.neg, i8 %v1, i8 %v2
  ; Sign-extend both i8 results to i32 and subtract so both selects stay live.
  %wide.first = sext i8 %pick.first to i32
  %wide.second = sext i8 %pick.second to i32
  %diff = sub i32 %wide.first, %wide.second
  ret i32 %diff
}

; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo3:
; CHECK: js
; CHECK-NOT: js
define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {
entry:
  ; Two back-to-back i16 selects keyed on the same sign test of %v1.
  %is.neg = icmp slt i16 %v1, 0
  %pick.first = select i1 %is.neg, i16 %v2, i16 %v3
  %pick.second = select i1 %is.neg, i16 %v1, i16 %v2
  ; Sign-extend both i16 results to i32 and subtract so both selects stay live.
  %wide.first = sext i16 %pick.first to i32
  %wide.second = sext i16 %pick.second to i32
  %diff = sub i32 %wide.first, %wide.second
  ret i32 %diff
}

; This test checks that only a single js gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo4:
; CHECK: js
; CHECK-NOT: js
define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {
entry:
  ; Two back-to-back float selects keyed on the same sign test of %v1.
  %is.neg = icmp slt i32 %v1, 0
  %lhs = select i1 %is.neg, float %v2, float %v3
  %rhs = select i1 %is.neg, float %v3, float %v4
  ; Combine both results so that neither select is dead.
  %diff = fsub float %lhs, %rhs
  ret float %diff
}

; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo5:
; CHECK: je
; CHECK-NOT: je
define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {
entry:
  ; Two back-to-back double selects keyed on the same "%v1 == 0" test.
  %is.zero = icmp eq i32 %v1, 0
  %lhs = select i1 %is.zero, double %v2, double %v3
  %rhs = select i1 %is.zero, double %v3, double %v4
  ; Combine both results so that neither select is dead.
  %diff = fsub double %lhs, %rhs
  ret double %diff
}

; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo6:
; CHECK: je
; CHECK-NOT: je
define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {
entry:
  ; Two back-to-back <4 x float> selects keyed on the same "%v1 == 0" test.
  %is.zero = icmp eq i32 %v1, 0
  %lhs = select i1 %is.zero, <4 x float> %v2, <4 x float> %v3
  %rhs = select i1 %is.zero, <4 x float> %v3, <4 x float> %v4
  ; Combine both results so that neither select is dead.
  %diff = fsub <4 x float> %lhs, %rhs
  ret <4 x float> %diff
}

; This test checks that only a single je gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR.
; CHECK-LABEL: foo7:
; CHECK: je
; CHECK-NOT: je
define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {
entry:
  ; Two back-to-back <2 x double> selects keyed on the same "%v1 == 0" test.
  %is.zero = icmp eq i32 %v1, 0
  %lhs = select i1 %is.zero, <2 x double> %v2, <2 x double> %v3
  %rhs = select i1 %is.zero, <2 x double> %v3, <2 x double> %v4
  ; Combine both results so that neither select is dead.
  %diff = fsub <2 x double> %lhs, %rhs
  ret <2 x double> %diff
}

; This test checks that only a single ja gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. This combines
; all the supported types together into one long string of selects based
; on the same condition.
; CHECK-LABEL: foo8:
; CHECK: ja
; CHECK-NOT: ja
define void @foo8(i32 %v1,
                  i8 %v2, i8 %v3,
                  i16 %v12, i16 %v13,
                  i32 %v22, i32 %v23,
                  float %v32, float %v33,
                  double %v42, double %v43,
                  <4 x float> %v52, <4 x float> %v53,
                  <2 x double> %v62, <2 x double> %v63,
                  <8 x float> %v72, <8 x float> %v73,
                  <4 x double> %v82, <4 x double> %v83,
                  <16 x float> %v92, <16 x float> %v93,
                  <8 x double> %v102, <8 x double> %v103,
                  i8 * %dst) nounwind {
entry:
  ; One select per type, all keyed on the same "%v1 > 31" (unsigned) test.
  ; Every result is stored to its own, non-overlapping slot of %dst:
  ;   i16 @2, i32 @4, float @8, double @16, <4 x float> @32,
  ;   <2 x double> @48, <8 x float> @64, <4 x double> @128,
  ;   <16 x float> @192, <8 x double> @256.
  ; NOTE(review): the i8 arguments %v2/%v3 are not referenced in this body.
  %add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2
  %a11 = bitcast i8* %add.ptr11 to i16*

  %add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4
  %a21 = bitcast i8* %add.ptr21 to i32*

  %add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8
  %a31 = bitcast i8* %add.ptr31 to float*

  %add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16
  %a41 = bitcast i8* %add.ptr41 to double*

  %add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32
  %a51 = bitcast i8* %add.ptr51 to <4 x float>*

  %add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48
  %a61 = bitcast i8* %add.ptr61 to <2 x double>*

  %add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64
  %a71 = bitcast i8* %add.ptr71 to <8 x float>*

  %add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128
  %a81 = bitcast i8* %add.ptr81 to <4 x double>*

  ; The 64-byte slots must not alias the 32-byte slots at offsets 64 and 128:
  ; a later, wider store to the same address would make the earlier store dead
  ; and allow the <8 x float> / <4 x double> selects to be dropped.
  %add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 192
  %a91 = bitcast i8* %add.ptr91 to <16 x float>*

  %add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 256
  %a101 = bitcast i8* %add.ptr101 to <8 x double>*

  ; These operations are necessary, because select of two single use loads
  ; ends up getting optimized into a select of two leas, followed by a
  ; single load of the selected address.
  %t13 = xor i16 %v13, 11
  %t23 = xor i32 %v23, 1234
  %t33 = fadd float %v33, %v32
  %t43 = fadd double %v43, %v42
  %t53 = fadd <4 x float> %v53, %v52
  %t63 = fadd <2 x double> %v63, %v62
  %t73 = fsub <8 x float> %v73, %v72
  %t83 = fsub <4 x double> %v83, %v82
  %t93 = fsub <16 x float> %v93, %v92
  %t103 = fsub <8 x double> %v103, %v102

  ; A single shared condition feeds the whole run of selects.
  %cmp = icmp ugt i32 %v1, 31
  %t11 = select i1 %cmp, i16 %v12, i16 %t13
  %t21 = select i1 %cmp, i32 %v22, i32 %t23
  %t31 = select i1 %cmp, float %v32, float %t33
  %t41 = select i1 %cmp, double %v42, double %t43
  %t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53
  %t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63
  %t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73
  %t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83
  %t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93
  %t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103

  ; Store every selected value so that none of the selects is dead.
  store i16 %t11, i16* %a11, align 2
  store i32 %t21, i32* %a21, align 4
  store float %t31, float* %a31, align 4
  store double %t41, double* %a41, align 8
  store <4 x float> %t51, <4 x float>* %a51, align 16
  store <2 x double> %t61, <2 x double>* %a61, align 16
  store <8 x float> %t71, <8 x float>* %a71, align 32
  store <4 x double> %t81, <4 x double>* %a81, align 32
  store <16 x float> %t91, <16 x float>* %a91, align 32
  store <8 x double> %t101, <8 x double>* %a101, align 32

  ret void
}

; This test checks that only a single ja gets generated in the final code
; for lowering the CMOV pseudos that get created for this IR. All of the
; selects are based on the same condition.
; Contrary to my expectations, this doesn't exercise the code for
; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1.  Instead the selects all
; get lowered into vector length number of selects, which all eventually turn
; into a huge number of CMOV_GR8, which are all contiguous, so the optimization
; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get
; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*I1
; pseudo-opcodes to be generated, this test should be replaced with one that
; tests those opcodes.
;
; CHECK-LABEL: foo9:
; CHECK: ja
; CHECK-NOT: ja
define void @foo9(i32 %v1,
                  <8 x i1> %v12, <8 x i1> %v13,
                  <16 x i1> %v22, <16 x i1> %v23,
                  <32 x i1> %v32, <32 x i1> %v33,
                  <64 x i1> %v42, <64 x i1> %v43,
                  i8 * %dst) nounwind {
entry:
  ; Destination slots inside %dst, one per mask width (byte offsets 0, 4, 8, 16).
  %off.m8 = getelementptr inbounds i8, i8* %dst, i32 0
  %slot.m8 = bitcast i8* %off.m8 to <8 x i1>*

  %off.m16 = getelementptr inbounds i8, i8* %dst, i32 4
  %slot.m16 = bitcast i8* %off.m16 to <16 x i1>*

  %off.m32 = getelementptr inbounds i8, i8* %dst, i32 8
  %slot.m32 = bitcast i8* %off.m32 to <32 x i1>*

  %off.m64 = getelementptr inbounds i8, i8* %dst, i32 16
  %slot.m64 = bitcast i8* %off.m64 to <64 x i1>*

  ; The xors below are required: a select of two single-use loads would be
  ; optimized into a select of two leas followed by a single load of the
  ; selected address.
  %mix.m8 = xor <8 x i1> %v13, %v12
  %mix.m16 = xor <16 x i1> %v23, %v22
  %mix.m32 = xor <32 x i1> %v33, %v32
  %mix.m64 = xor <64 x i1> %v43, %v42

  ; A single shared "%v1 > 31" (unsigned) condition feeds all four selects.
  %above = icmp ugt i32 %v1, 31
  %res.m8 = select i1 %above, <8 x i1> %v12, <8 x i1> %mix.m8
  %res.m16 = select i1 %above, <16 x i1> %v22, <16 x i1> %mix.m16
  %res.m32 = select i1 %above, <32 x i1> %v32, <32 x i1> %mix.m32
  %res.m64 = select i1 %above, <64 x i1> %v42, <64 x i1> %mix.m64

  ; Store every selected mask so that none of the selects is dead.
  store <8 x i1> %res.m8, <8 x i1>* %slot.m8, align 16
  store <16 x i1> %res.m16, <16 x i1>* %slot.m16, align 4
  store <32 x i1> %res.m32, <32 x i1>* %slot.m32, align 8
  store <64 x i1> %res.m64, <64 x i1>* %slot.m64, align 16

  ret void
}
[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions. This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo instructions with the same (or exactly opposite) conditions get lowered using a single new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points) when contiguous CMOVs are being lowered. Patch by: kevin.b.smith@intel.com Differential Revision: http://reviews.llvm.org/D11428 llvm-svn: 244202 2015-08-06 16:45:34 +08:00			`; RUN: llc < %s -mtriple=i386-linux-gnu -o - \| FileCheck %s`

			`; This test checks that only a single js gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo1:`
			`; CHECK: js`
			`; CHECK-NOT: js`
			`define i32 @foo1(i32 %v1, i32 %v2, i32 %v3) nounwind {`
			`entry:`
			`%cmp = icmp slt i32 %v1, 0`
			`%v2.v3 = select i1 %cmp, i32 %v2, i32 %v3`
			`%v1.v2 = select i1 %cmp, i32 %v1, i32 %v2`
			`%sub = sub i32 %v1.v2, %v2.v3`
			`ret i32 %sub`
			`}`

			`; This test checks that only a single js gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR. This makes`
			`; sure the code for the lowering for opposite conditions gets tested.`
			`; CHECK-LABEL: foo11:`
			`; CHECK: js`
			`; CHECK-NOT: js`
			`; CHECK-NOT: jns`
			`define i32 @foo11(i32 %v1, i32 %v2, i32 %v3) nounwind {`
			`entry:`
			`%cmp1 = icmp slt i32 %v1, 0`
			`%v2.v3 = select i1 %cmp1, i32 %v2, i32 %v3`
			`%cmp2 = icmp sge i32 %v1, 0`
			`%v1.v2 = select i1 %cmp2, i32 %v1, i32 %v2`
			`%sub = sub i32 %v1.v2, %v2.v3`
			`ret i32 %sub`
			`}`

			`; This test checks that only a single js gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo2:`
			`; CHECK: js`
			`; CHECK-NOT: js`
			`define i32 @foo2(i8 %v1, i8 %v2, i8 %v3) nounwind {`
			`entry:`
			`%cmp = icmp slt i8 %v1, 0`
			`%v2.v3 = select i1 %cmp, i8 %v2, i8 %v3`
			`%v1.v2 = select i1 %cmp, i8 %v1, i8 %v2`
			`%t1 = sext i8 %v2.v3 to i32`
			`%t2 = sext i8 %v1.v2 to i32`
			`%sub = sub i32 %t1, %t2`
			`ret i32 %sub`
			`}`

			`; This test checks that only a single js gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo3:`
			`; CHECK: js`
			`; CHECK-NOT: js`
			`define i32 @foo3(i16 %v1, i16 %v2, i16 %v3) nounwind {`
			`entry:`
			`%cmp = icmp slt i16 %v1, 0`
			`%v2.v3 = select i1 %cmp, i16 %v2, i16 %v3`
			`%v1.v2 = select i1 %cmp, i16 %v1, i16 %v2`
			`%t1 = sext i16 %v2.v3 to i32`
			`%t2 = sext i16 %v1.v2 to i32`
			`%sub = sub i32 %t1, %t2`
			`ret i32 %sub`
			`}`

			`; This test checks that only a single js gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo4:`
[X86] Don't bother avoiding illegal FCMOVs if we don't have the cmov subtarget feature. We'll be forced to emit branches so we might as well use the most direct condition. 2020-02-21 16:29:18 +08:00			`; CHECK: js`
			`; CHECK-NOT: js`
[X86] Improve EmitLoweredSelect for contiguous CMOV pseudo instructions. This change improves EmitLoweredSelect() so that multiple contiguous CMOV pseudo instructions with the same (or exactly opposite) conditions get lowered using a single new basic-block. This eliminates unnecessary extra basic-blocks (and CFG merge points) when contiguous CMOVs are being lowered. Patch by: kevin.b.smith@intel.com Differential Revision: http://reviews.llvm.org/D11428 llvm-svn: 244202 2015-08-06 16:45:34 +08:00			`define float @foo4(i32 %v1, float %v2, float %v3, float %v4) nounwind {`
			`entry:`
			`%cmp = icmp slt i32 %v1, 0`
			`%t1 = select i1 %cmp, float %v2, float %v3`
			`%t2 = select i1 %cmp, float %v3, float %v4`
			`%sub = fsub float %t1, %t2`
			`ret float %sub`
			`}`

			`; This test checks that only a single je gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo5:`
			`; CHECK: je`
			`; CHECK-NOT: je`
			`define double @foo5(i32 %v1, double %v2, double %v3, double %v4) nounwind {`
			`entry:`
			`%cmp = icmp eq i32 %v1, 0`
			`%t1 = select i1 %cmp, double %v2, double %v3`
			`%t2 = select i1 %cmp, double %v3, double %v4`
			`%sub = fsub double %t1, %t2`
			`ret double %sub`
			`}`

			`; This test checks that only a single je gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo6:`
			`; CHECK: je`
			`; CHECK-NOT: je`
			`define <4 x float> @foo6(i32 %v1, <4 x float> %v2, <4 x float> %v3, <4 x float> %v4) nounwind {`
			`entry:`
			`%cmp = icmp eq i32 %v1, 0`
			`%t1 = select i1 %cmp, <4 x float> %v2, <4 x float> %v3`
			`%t2 = select i1 %cmp, <4 x float> %v3, <4 x float> %v4`
			`%sub = fsub <4 x float> %t1, %t2`
			`ret <4 x float> %sub`
			`}`

			`; This test checks that only a single je gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; CHECK-LABEL: foo7:`
			`; CHECK: je`
			`; CHECK-NOT: je`
			`define <2 x double> @foo7(i32 %v1, <2 x double> %v2, <2 x double> %v3, <2 x double> %v4) nounwind {`
			`entry:`
			`%cmp = icmp eq i32 %v1, 0`
			`%t1 = select i1 %cmp, <2 x double> %v2, <2 x double> %v3`
			`%t2 = select i1 %cmp, <2 x double> %v3, <2 x double> %v4`
			`%sub = fsub <2 x double> %t1, %t2`
			`ret <2 x double> %sub`
			`}`

			`; This test checks that only a single ja gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR. This combines`
			`; all the supported types together into one long string of selects based`
			`; on the same condition.`
			`; CHECK-LABEL: foo8:`
			`; CHECK: ja`
			`; CHECK-NOT: ja`
			`define void @foo8(i32 %v1,`
			`i8 %v2, i8 %v3,`
			`i16 %v12, i16 %v13,`
			`i32 %v22, i32 %v23,`
			`float %v32, float %v33,`
			`double %v42, double %v43,`
			`<4 x float> %v52, <4 x float> %v53,`
			`<2 x double> %v62, <2 x double> %v63,`
			`<8 x float> %v72, <8 x float> %v73,`
			`<4 x double> %v82, <4 x double> %v83,`
			`<16 x float> %v92, <16 x float> %v93,`
			`<8 x double> %v102, <8 x double> %v103,`
			`i8 * %dst) nounwind {`
			`entry:`
			`%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 2`
			`%a11 = bitcast i8* %add.ptr11 to i16*`

			`%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4`
			`%a21 = bitcast i8* %add.ptr21 to i32*`

			`%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8`
			`%a31 = bitcast i8* %add.ptr31 to float*`

			`%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16`
			`%a41 = bitcast i8* %add.ptr41 to double*`

			`%add.ptr51 = getelementptr inbounds i8, i8* %dst, i32 32`
			`%a51 = bitcast i8* %add.ptr51 to <4 x float>*`

			`%add.ptr61 = getelementptr inbounds i8, i8* %dst, i32 48`
			`%a61 = bitcast i8* %add.ptr61 to <2 x double>*`

			`%add.ptr71 = getelementptr inbounds i8, i8* %dst, i32 64`
			`%a71 = bitcast i8* %add.ptr71 to <8 x float>*`

			`%add.ptr81 = getelementptr inbounds i8, i8* %dst, i32 128`
			`%a81 = bitcast i8* %add.ptr81 to <4 x double>*`

			`%add.ptr91 = getelementptr inbounds i8, i8* %dst, i32 64`
			`%a91 = bitcast i8* %add.ptr91 to <16 x float>*`

			`%add.ptr101 = getelementptr inbounds i8, i8* %dst, i32 128`
			`%a101 = bitcast i8* %add.ptr101 to <8 x double>*`

			`; These operations are necessary, because select of two single use loads`
			`; ends up getting optimized into a select of two leas, followed by a`
			`; single load of the selected address.`
			`%t13 = xor i16 %v13, 11`
			`%t23 = xor i32 %v23, 1234`
			`%t33 = fadd float %v33, %v32`
			`%t43 = fadd double %v43, %v42`
			`%t53 = fadd <4 x float> %v53, %v52`
			`%t63 = fadd <2 x double> %v63, %v62`
			`%t73 = fsub <8 x float> %v73, %v72`
			`%t83 = fsub <4 x double> %v83, %v82`
			`%t93 = fsub <16 x float> %v93, %v92`
			`%t103 = fsub <8 x double> %v103, %v102`

			`%cmp = icmp ugt i32 %v1, 31`
			`%t11 = select i1 %cmp, i16 %v12, i16 %t13`
			`%t21 = select i1 %cmp, i32 %v22, i32 %t23`
			`%t31 = select i1 %cmp, float %v32, float %t33`
			`%t41 = select i1 %cmp, double %v42, double %t43`
			`%t51 = select i1 %cmp, <4 x float> %v52, <4 x float> %t53`
			`%t61 = select i1 %cmp, <2 x double> %v62, <2 x double> %t63`
			`%t71 = select i1 %cmp, <8 x float> %v72, <8 x float> %t73`
			`%t81 = select i1 %cmp, <4 x double> %v82, <4 x double> %t83`
			`%t91 = select i1 %cmp, <16 x float> %v92, <16 x float> %t93`
			`%t101 = select i1 %cmp, <8 x double> %v102, <8 x double> %t103`

			`store i16 %t11, i16* %a11, align 2`
			`store i32 %t21, i32* %a21, align 4`
			`store float %t31, float* %a31, align 4`
			`store double %t41, double* %a41, align 8`
			`store <4 x float> %t51, <4 x float>* %a51, align 16`
			`store <2 x double> %t61, <2 x double>* %a61, align 16`
			`store <8 x float> %t71, <8 x float>* %a71, align 32`
			`store <4 x double> %t81, <4 x double>* %a81, align 32`
			`store <16 x float> %t91, <16 x float>* %a91, align 32`
			`store <8 x double> %t101, <8 x double>* %a101, align 32`

			`ret void`
			`}`

			`; This test checks that only a single ja gets generated in the final code`
			`; for lowering the CMOV pseudos that get created for this IR.`
			`; on the same condition.`
			`; Contrary to my expectations, this doesn't exercise the code for`
			`; CMOV_V8I1, CMOV_V16I1, CMOV_V32I1, or CMOV_V64I1. Instead the selects all`
			`; get lowered into vector length number of selects, which all eventually turn`
			`; into a huge number of CMOV_GR8, which are all contiguous, so the optimization`
			`; kicks in as long as CMOV_GR8 is supported. I couldn't find a way to get`
			`; CMOV_V*I1 pseudo-opcodes to get generated. If a way exists to get CMOV_V*I1`
			`; pseudo-opcodes to be generated, this test should be replaced with one that`
			`; tests those opcodes.`
			`;`
			`; CHECK-LABEL: foo9:`
			`; CHECK: ja`
			`; CHECK-NOT: ja`
			`define void @foo9(i32 %v1,`
			`<8 x i1> %v12, <8 x i1> %v13,`
			`<16 x i1> %v22, <16 x i1> %v23,`
			`<32 x i1> %v32, <32 x i1> %v33,`
			`<64 x i1> %v42, <64 x i1> %v43,`
			`i8 * %dst) nounwind {`
			`entry:`
			`%add.ptr11 = getelementptr inbounds i8, i8* %dst, i32 0`
			`%a11 = bitcast i8* %add.ptr11 to <8 x i1>*`

			`%add.ptr21 = getelementptr inbounds i8, i8* %dst, i32 4`
			`%a21 = bitcast i8* %add.ptr21 to <16 x i1>*`

			`%add.ptr31 = getelementptr inbounds i8, i8* %dst, i32 8`
			`%a31 = bitcast i8* %add.ptr31 to <32 x i1>*`

			`%add.ptr41 = getelementptr inbounds i8, i8* %dst, i32 16`
			`%a41 = bitcast i8* %add.ptr41 to <64 x i1>*`

			`; These operations are necessary, because select of two single use loads`
			`; ends up getting optimized into a select of two leas, followed by a`
			`; single load of the selected address.`
			`%t13 = xor <8 x i1> %v13, %v12`
			`%t23 = xor <16 x i1> %v23, %v22`
			`%t33 = xor <32 x i1> %v33, %v32`
			`%t43 = xor <64 x i1> %v43, %v42`

			`%cmp = icmp ugt i32 %v1, 31`
			`%t11 = select i1 %cmp, <8 x i1> %v12, <8 x i1> %t13`
			`%t21 = select i1 %cmp, <16 x i1> %v22, <16 x i1> %t23`
			`%t31 = select i1 %cmp, <32 x i1> %v32, <32 x i1> %t33`
			`%t41 = select i1 %cmp, <64 x i1> %v42, <64 x i1> %t43`

			`store <8 x i1> %t11, <8 x i1>* %a11, align 16`
			`store <16 x i1> %t21, <16 x i1>* %a21, align 4`
			`store <32 x i1> %t31, <32 x i1>* %a31, align 8`
			`store <64 x i1> %t41, <64 x i1>* %a41, align 16`

			`ret void`
			`}`