; llvm-project/llvm/test/CodeGen/X86/gather-addresses.ll

; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN
; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=WIN
; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s | FileCheck %s --check-prefix=LIN32
; rdar://7398554

; When doing vector gather-scatter index calculation with 32-bit indices,
; use an efficient mov/shift sequence rather than shuffling each individual
; element out of the index vector.

; The label directives below are spelled with the run-specific prefixes. Every
; RUN line in this file passes a single --check-prefix, so a label directive
; spelled with the default "CHECK" prefix is ignored by FileCheck and would
; leave the expectations unanchored to @foo.

; Expectations for the x86_64-linux run.
; NOTE(review): the last two movq expectations do not address memory through
; %rdi like the gather loads before them; nothing bounds the search after the
; label, so they may be matching output emitted after @foo -- confirm against
; actual llc output before tightening this test.
; LIN-LABEL: foo:
; LIN: movdqa	(%rsi), %xmm0
; LIN: pand 	(%rdx), %xmm0
; LIN: pextrq	$1, %xmm0, %r[[REG4:.+]]
; LIN: movq 	%xmm0, %r[[REG2:.+]]
; LIN: movslq	%e[[REG2]], %r[[REG1:.+]]
; LIN: sarq    $32, %r[[REG2]]
; LIN: movslq	%e[[REG4]], %r[[REG3:.+]]
; LIN: sarq    $32, %r[[REG4]]
; LIN: movsd    (%rdi,%r[[REG3]],8), %xmm1
; LIN: movhpd   (%rdi,%r[[REG4]],8), %xmm1
; LIN: movq     %rdi, %xmm1
; LIN: movq     %r[[REG3]], %xmm0

; Expectations for the x86_64-win32 run.
; NOTE(review): as in the block above, the final movdqa/movq expectations do
; not look like part of the gather sequence -- confirm against llc output.
; WIN-LABEL: foo:
; WIN: movdqa	(%rdx), %xmm0
; WIN: pand 	(%r8), %xmm0
; WIN: pextrq	$1, %xmm0, %r[[REG4:.+]]
; WIN: movq 	%xmm0, %r[[REG2:.+]]
; WIN: movslq	%e[[REG2]], %r[[REG1:.+]]
; WIN: sarq    $32, %r[[REG2]]
; WIN: movslq	%e[[REG4]], %r[[REG3:.+]]
; WIN: sarq    $32, %r[[REG4]]
; WIN: movsd    (%rcx,%r[[REG3]],8), %xmm1
; WIN: movhpd   (%rcx,%r[[REG4]],8), %xmm1
; WIN: movdqa   (%r[[REG2]]), %xmm0
; WIN: movq     %r[[REG2]], %xmm1

; Gathers four doubles from %p, indexed by the lanes of (*%i & *%h), and
; returns them as a <4 x double>.
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
  ; Load both <4 x i32> operands and AND them to form the index vector.
  %a = load <4 x i32>, <4 x i32>* %i
  %b = load <4 x i32>, <4 x i32>* %h
  %j = and <4 x i32> %a, %b
  ; Pull every lane out of the index vector as a scalar i32.
  %d0 = extractelement <4 x i32> %j, i32 0
  %d1 = extractelement <4 x i32> %j, i32 1
  %d2 = extractelement <4 x i32> %j, i32 2
  %d3 = extractelement <4 x i32> %j, i32 3
  ; Use each 32-bit index to address a double relative to %p.
  %q0 = getelementptr double, double* %p, i32 %d0
  %q1 = getelementptr double, double* %p, i32 %d1
  %q2 = getelementptr double, double* %p, i32 %d2
  %q3 = getelementptr double, double* %p, i32 %d3
  ; Perform the four scalar loads of the gather.
  %r0 = load double, double* %q0
  %r1 = load double, double* %q1
  %r2 = load double, double* %q2
  %r3 = load double, double* %q3
  ; Pack the loaded values into the result vector, lane by lane.
  %v0 = insertelement <4 x double> undef, double %r0, i32 0
  %v1 = insertelement <4 x double> %v0, double %r1, i32 1
  %v2 = insertelement <4 x double> %v1, double %r2, i32 2
  %v3 = insertelement <4 x double> %v2, double %r3, i32 3
  ret <4 x double> %v3
}

; Check that the sequence previously used above, which bounces the vector
; through memory (a store to the stack followed by scalar accesses), still
; works for x86-32. Note that in this case it is not used for index
; calculation, since the indexes are 32-bit, not 64-bit.
;
; The label directive is spelled with the "LIN32" prefix because that is the
; only prefix with expectations for this function, and no RUN line in this
; file enables the default "CHECK" prefix; a label spelled with the default
; prefix is ignored by FileCheck.
; LIN32-LABEL: old:
; LIN32: movaps	%xmm0, (%esp)
; LIN32-DAG: {{(mov|and)}}l	(%esp),
; LIN32-DAG: {{(mov|and)}}l	4(%esp),
; LIN32-DAG: {{(mov|and)}}l	8(%esp),
; LIN32-DAG: {{(mov|and)}}l	12(%esp),

; Zero-extends each lane of (*%i & *%h) to i64, masks it with %f, and returns
; the four results as a <4 x i64>. %p is not used by the body.
define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {
  ; Load both <4 x i32> operands and AND them lane-wise.
  %a = load <4 x i32>, <4 x i32>* %i
  %b = load <4 x i32>, <4 x i32>* %h
  %j = and <4 x i32> %a, %b
  ; Pull every lane out of the vector as a scalar i32.
  %d0 = extractelement <4 x i32> %j, i32 0
  %d1 = extractelement <4 x i32> %j, i32 1
  %d2 = extractelement <4 x i32> %j, i32 2
  %d3 = extractelement <4 x i32> %j, i32 3
  ; Widen each lane to 64 bits (the extract -> zext pattern under test).
  %q0 = zext i32 %d0 to i64
  %q1 = zext i32 %d1 to i64
  %q2 = zext i32 %d2 to i64
  %q3 = zext i32 %d3 to i64
  ; Mask every widened lane with the scalar argument %f.
  %r0 = and i64 %q0, %f
  %r1 = and i64 %q1, %f
  %r2 = and i64 %q2, %f
  %r3 = and i64 %q3, %f
  ; Pack the masked values into the result vector, lane by lane.
  %v0 = insertelement <4 x i64> undef, i64 %r0, i32 0
  %v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1
  %v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2
  %v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3
  ret <4 x i64> %v3
}
Enable MI Sched for x86. This changes the SelectionDAG scheduling preference to source order. Soon, the SelectionDAG scheduler can be bypassed saving a nice chunk of compile time. Performance differences that result from this change are often a consequence of register coalescing. The register coalescer is far from perfect. Bugs can be filed for deficiencies. On x86 SandyBridge/Haswell, the source order schedule is often preserved, particularly for small blocks. Register pressure is generally improved over the SD scheduler's ILP mode. However, we are still able to handle large blocks that require latency hiding, unlike the SD scheduler's BURR mode. MI scheduler also attempts to discover the critical path in single-block loops and adjust heuristics accordingly. The MI scheduler relies on the new machine model. This is currently unimplemented for AVX, so we may not be generating the best code yet. Unit tests are updated so they don't depend on SD scheduling heuristics. llvm-svn: 192750 2013-10-16 07:33:07 +08:00			`; RUN: llc -mtriple=x86_64-linux -mcpu=nehalem < %s \| FileCheck %s --check-prefix=LIN`
			`; RUN: llc -mtriple=x86_64-win32 -mcpu=nehalem < %s \| FileCheck %s --check-prefix=WIN`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; RUN: llc -mtriple=i686-win32 -mcpu=nehalem < %s \| FileCheck %s --check-prefix=LIN32`
Add an rdar number to this test. llvm-svn: 98654 2010-03-17 03:08:20 +08:00			`; rdar://7398554`
Recognize code for doing vector gather/scatter index calculations with 32-bit indices. Instead of shuffling each element out of the index vector, when all indices are needed, just store the input vector to the stack and load the elements out. llvm-svn: 98588 2010-03-16 07:23:03 +08:00
			`; When doing vector gather-scatter index calculation with 32-bit indices,`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; use an efficient mov/shift sequence rather than shuffling each individual`
Recognize code for doing vector gather/scatter index calculations with 32-bit indices. Instead of shuffling each element out of the index vector, when all indices are needed, just store the input vector to the stack and load the elements out. llvm-svn: 98588 2010-03-16 07:23:03 +08:00			`; element out of the index vector.`

[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; CHECK-LABEL: foo:`
			`; LIN: movdqa (%rsi), %xmm0`
			`; LIN: pand (%rdx), %xmm0`
			`; LIN: pextrq $1, %xmm0, %r[[REG4:.+]]`
[X86][SSE2] Fix asm string for movq (Move Quadword) instruction. Replace "mov{d\|q}" with "movq". Differential Revision: https://reviews.llvm.org/D32220 llvm-svn: 301386 2017-04-26 15:08:44 +08:00			`; LIN: movq %xmm0, %r[[REG2:.+]]`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; LIN: movslq %e[[REG2]], %r[[REG1:.+]]`
			`; LIN: sarq $32, %r[[REG2]]`
			`; LIN: movslq %e[[REG4]], %r[[REG3:.+]]`
			`; LIN: sarq $32, %r[[REG4]]`
This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target. The SandyBridge architects have provided us with a more accurate information about each instruction latency, number of uOPs and used ports and I used it to replace the existing estimated SNB instructions scheduling and to add missing scheduling information. Please note that the patch extensively affects the X86 MC instr scheduling for SNB. Also note that this patch will be followed by additional patches for the remaining target architectures HSW, IVB, BDW, SKL and SKX. The updated and extended information about each instruction includes the following details: •static latency of the instruction •number of uOps from which the instruction consists of •all ports used by the instruction's' uOPs For example, the following code dictates that instructions, ADC64mr, ADC8mr, SBB64mr, SBB8mr have a static latency of 9 cycles. Each of these instructions is decoded into 6 micro operations which use ports 4, ports 2 or 3 and port 0 and ports 0 or 1 or 5: def SBWriteResGroup94 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { let Latency = 9; let NumMicroOps = 6; let ResourceCycles = [1,2,2,1]; } def: InstRW<[SBWriteResGroup94], (instregex "ADC64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "ADC8mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB8mr")>; Note that apart for the header, most of the X86SchedSandyBridge.td file was generated by a script. Reviewers: zvi, chandlerc, RKSimon, m_zuckerman, craig.topper, igorb Differential Revision: https://reviews.llvm.org/D35019#inline-304691 llvm-svn: 307529 2017-07-10 17:53:16 +08:00			`; LIN: movsd (%rdi,%r[[REG3]],8), %xmm1`
			`; LIN: movhpd (%rdi,%r[[REG4]],8), %xmm1`
			`; LIN: movq %rdi, %xmm1`
			`; LIN: movq %r[[REG3]], %xmm0`
Reverting commit 306414 on behalf of @gadi.haber llvm-svn: 306532 2017-06-28 19:23:31 +08:00
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; WIN: movdqa (%rdx), %xmm0`
			`; WIN: pand (%r8), %xmm0`
			`; WIN: pextrq $1, %xmm0, %r[[REG4:.+]]`
[X86][SSE2] Fix asm string for movq (Move Quadword) instruction. Replace "mov{d\|q}" with "movq". Differential Revision: https://reviews.llvm.org/D32220 llvm-svn: 301386 2017-04-26 15:08:44 +08:00			`; WIN: movq %xmm0, %r[[REG2:.+]]`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`; WIN: movslq %e[[REG2]], %r[[REG1:.+]]`
			`; WIN: sarq $32, %r[[REG2]]`
			`; WIN: movslq %e[[REG4]], %r[[REG3:.+]]`
			`; WIN: sarq $32, %r[[REG4]]`
This patch completely replaces the scheduling information for the SandyBridge architecture target by modifying the file X86SchedSandyBridge.td located under the X86 Target. The SandyBridge architects have provided us with a more accurate information about each instruction latency, number of uOPs and used ports and I used it to replace the existing estimated SNB instructions scheduling and to add missing scheduling information. Please note that the patch extensively affects the X86 MC instr scheduling for SNB. Also note that this patch will be followed by additional patches for the remaining target architectures HSW, IVB, BDW, SKL and SKX. The updated and extended information about each instruction includes the following details: •static latency of the instruction •number of uOps from which the instruction consists of •all ports used by the instruction's' uOPs For example, the following code dictates that instructions, ADC64mr, ADC8mr, SBB64mr, SBB8mr have a static latency of 9 cycles. Each of these instructions is decoded into 6 micro operations which use ports 4, ports 2 or 3 and port 0 and ports 0 or 1 or 5: def SBWriteResGroup94 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { let Latency = 9; let NumMicroOps = 6; let ResourceCycles = [1,2,2,1]; } def: InstRW<[SBWriteResGroup94], (instregex "ADC64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "ADC8mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB64mr")>; def: InstRW<[SBWriteResGroup94], (instregex "SBB8mr")>; Note that apart for the header, most of the X86SchedSandyBridge.td file was generated by a script. Reviewers: zvi, chandlerc, RKSimon, m_zuckerman, craig.topper, igorb Differential Revision: https://reviews.llvm.org/D35019#inline-304691 llvm-svn: 307529 2017-07-10 17:53:16 +08:00			`; WIN: movsd (%rcx,%r[[REG3]],8), %xmm1`
			`; WIN: movhpd (%rcx,%r[[REG4]],8), %xmm1`
			`; WIN: movdqa (%r[[REG2]]), %xmm0`
			`; WIN: movq %r[[REG2]], %xmm1`
Recognize code for doing vector gather/scatter index calculations with 32-bit indices. Instead of shuffling each element out of the index vector, when all indices are needed, just store the input vector to the stack and load the elements out. llvm-svn: 98588 2010-03-16 07:23:03 +08:00
			`define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%a = load <4 x i32>, <4 x i32>* %i`
			`%b = load <4 x i32>, <4 x i32>* %h`
Recognize code for doing vector gather/scatter index calculations with 32-bit indices. Instead of shuffling each element out of the index vector, when all indices are needed, just store the input vector to the stack and load the elements out. llvm-svn: 98588 2010-03-16 07:23:03 +08:00			`%j = and <4 x i32> %a, %b`
			`%d0 = extractelement <4 x i32> %j, i32 0`
			`%d1 = extractelement <4 x i32> %j, i32 1`
			`%d2 = extractelement <4 x i32> %j, i32 2`
			`%d3 = extractelement <4 x i32> %j, i32 3`
[opaque pointer type] Add textual IR support for explicit type parameter to getelementptr instruction One of several parallel first steps to remove the target type of pointers, replacing them with a single opaque pointer type. This adds an explicit type parameter to the gep instruction so that when the first parameter becomes an opaque pointer type, the type to gep through is still available to the instructions. * This doesn't modify gep operators, only instructions (operators will be handled separately) * Textual IR changes only. Bitcode (including upgrade) and changing the in-memory representation will be in separate changes. * geps of vectors are transformed as: getelementptr <4 x float> %x, ... ->getelementptr float, <4 x float> %x, ... Then, once the opaque pointer type is introduced, this will ultimately look like: getelementptr float, <4 x ptr> %x with the unambiguous interpretation that it is a vector of pointers to float. * address spaces remain on the pointer, not the type: getelementptr float addrspace(1)* %x ->getelementptr float, float addrspace(1)* %x Then, eventually: getelementptr float, ptr addrspace(1) %x Importantly, the massive amount of test case churn has been automated by same crappy python code. I had to manually update a few test cases that wouldn't fit the script's model (r228970,r229196,r229197,r229198). The python script just massages stdin and writes the result to stdout, I then wrapped that in a shell script to handle replacing files, then using the usual find+xargs to migrate all the files. 
update.py: import fileinput import sys import re ibrep = re.compile(r"(^.?[^%\w]getelementptr inbounds )(((?:<\d x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") normrep = re.compile( r"(^.?[^%\w]getelementptr )(((?:<\d* x )?)(.?)(\| addrspace\(\d\)) \(\|>)(?:$\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$))") def conv(match, line): if not match: return line line = match.groups()[0] if len(match.groups()[5]) == 0: line += match.groups()[2] line += match.groups()[3] line += ", " line += match.groups()[1] line += "\n" return line for line in sys.stdin: if line.find("getelementptr ") == line.find("getelementptr inbounds"): if line.find("getelementptr inbounds") != line.find("getelementptr inbounds ("): line = conv(re.match(ibrep, line), line) elif line.find("getelementptr ") != line.find("getelementptr ("): line = conv(re.match(normrep, line), line) sys.stdout.write(line) apply.sh: for name in "$@" do python3 `dirname "$0"`/update.py < "$name" > "$name.tmp" && mv "$name.tmp" "$name" rm -f "$name.tmp" done The actual commands: From llvm/src: find test/ -name .ll \| xargs ./apply.sh From llvm/src/tools/clang: find test/ -name .mm -o -name .m -o -name .cpp -o -name .c \| xargs -I '{}' ../../apply.sh "{}" From llvm/src/tools/polly: find test/ -name *.ll \| xargs ./apply.sh After that, check-all (with llvm, clang, clang-tools-extra, lld, compiler-rt, and polly all checked out). The extra 'rm' in the apply.sh script is due to a few files in clang's test suite using interesting unicode stuff that my python script was throwing exceptions on. None of those files needed to be migrated, so it seemed sufficient to ignore those cases. 
Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7636 llvm-svn: 230786 2015-02-28 03:29:02 +08:00			`%q0 = getelementptr double, double* %p, i32 %d0`
			`%q1 = getelementptr double, double* %p, i32 %d1`
			`%q2 = getelementptr double, double* %p, i32 %d2`
			`%q3 = getelementptr double, double* %p, i32 %d3`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%r0 = load double, double* %q0`
			`%r1 = load double, double* %q1`
			`%r2 = load double, double* %q2`
			`%r3 = load double, double* %q3`
Recognize code for doing vector gather/scatter index calculations with 32-bit indices. Instead of shuffling each element out of the index vector, when all indices are needed, just store the input vector to the stack and load the elements out. llvm-svn: 98588 2010-03-16 07:23:03 +08:00			`%v0 = insertelement <4 x double> undef, double %r0, i32 0`
			`%v1 = insertelement <4 x double> %v0, double %r1, i32 1`
			`%v2 = insertelement <4 x double> %v1, double %r2, i32 2`
			`%v3 = insertelement <4 x double> %v2, double %r3, i32 3`
			`ret <4 x double> %v3`
			`}`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00
			`; Check that the sequence previously used above, which bounces the vector off the`
			`; cache works for x86-32. Note that in this case it will not be used for index`
			`; calculation, since indexes are 32-bit, not 64.`
			`; CHECK-LABEL: old:`
			`; LIN32: movaps %xmm0, (%esp)`
			`; LIN32-DAG: {{(mov\|and)}}l (%esp),`
			`; LIN32-DAG: {{(mov\|and)}}l 4(%esp),`
			`; LIN32-DAG: {{(mov\|and)}}l 8(%esp),`
			`; LIN32-DAG: {{(mov\|and)}}l 12(%esp),`
			`define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%a = load <4 x i32>, <4 x i32>* %i`
			`%b = load <4 x i32>, <4 x i32>* %h`
[X86] Improve a dag-combine that handles a vector extract -> zext sequence. The current DAG combine turns a sequence of extracts from <4 x i32> followed by zexts into a store followed by scalar loads. According to measurements by Martin Krastev (see PR 21269) for x86-64, a sequence of an extract, movs and shifts gives better performance. However, for 32-bit x86, the previous sequence still seems better. Differential Revision: http://reviews.llvm.org/D6501 llvm-svn: 223360 2014-12-04 21:49:51 +08:00			`%j = and <4 x i32> %a, %b`
			`%d0 = extractelement <4 x i32> %j, i32 0`
			`%d1 = extractelement <4 x i32> %j, i32 1`
			`%d2 = extractelement <4 x i32> %j, i32 2`
			`%d3 = extractelement <4 x i32> %j, i32 3`
			`%q0 = zext i32 %d0 to i64`
			`%q1 = zext i32 %d1 to i64`
			`%q2 = zext i32 %d2 to i64`
			`%q3 = zext i32 %d3 to i64`
			`%r0 = and i64 %q0, %f`
			`%r1 = and i64 %q1, %f`
			`%r2 = and i64 %q2, %f`
			`%r3 = and i64 %q3, %f`
			`%v0 = insertelement <4 x i64> undef, i64 %r0, i32 0`
			`%v1 = insertelement <4 x i64> %v0, i64 %r1, i32 1`
			`%v2 = insertelement <4 x i64> %v1, i64 %r2, i32 2`
			`%v3 = insertelement <4 x i64> %v2, i64 %r3, i32 3`
			`ret <4 x i64> %v3`
			`}`