; llvm-project/llvm/test/CodeGen/X86/add-nsw-sext.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s

; The fundamental problem: an add separated from other arithmetic by a sext can't
; be combined with the later instructions. However, if the first add is 'nsw',
; then we can promote the sext ahead of that add to allow optimizations.

; Computes sext(%i + 5) + 7, where the 32-bit add carries the 'nsw' flag.
; The expected assembly sign-extends %i first (movslq) and then performs a
; single 64-bit add of 12, i.e. the two constants 5 and 7 are merged.
define i64 @add_nsw_consts(i32 %i) {
; CHECK-LABEL: add_nsw_consts:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    addq $12, %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; Widen the 32-bit sum to 64 bits.
  %ext = sext i32 %add to i64
  ; Second constant add, done in the wide type.
  %idx = add i64 %ext, 7
  ret i64 %idx
}

; An x86 bonus: If we promote the sext ahead of the 'add nsw',
; we allow LEA formation and eliminate an add instruction.

; Computes %x + sext(%i + 5), where the 32-bit add carries the 'nsw' flag.
; The expected assembly sign-extends %i (movslq) and then forms the whole sum
; with one LEA: base %rsi, index %rax, displacement 5.
define i64 @add_nsw_sext_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_add:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq 5(%rsi,%rax), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; Widen the 32-bit sum to 64 bits.
  %ext = sext i32 %add to i64
  ; Add the widened value to the 64-bit argument.
  %idx = add i64 %x, %ext
  ret i64 %idx
}

; Throw in a scale (left shift) because an LEA can do that too.
; Use a negative constant (LEA displacement) to verify that's handled correctly.

; Computes %x + (sext(%i + -5) << 3), where the 32-bit add carries the 'nsw'
; flag. The expected assembly sign-extends %i (movslq) and then uses one LEA
; with scale 8 (the shift by 3) and displacement -40 (-5 * 8).
define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext_lsh_add:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, -5
  ; Widen the 32-bit sum to 64 bits.
  %ext = sext i32 %add to i64
  ; Multiply the widened value by 8.
  %shl = shl i64 %ext, 3
  ; Add the scaled value to the 64-bit argument.
  %idx = add i64 %x, %shl
  ret i64 %idx
}

; Don't promote the sext if it has no users that could be combined with the
; add (here its result is only returned). The wider add instruction needs an
; extra byte to encode.

; Returns sext(%i + 5) directly; the %x argument is not used by the body.
; The expected assembly keeps the add in 32 bits (addl) and sign-extends the
; result afterwards (movslq), i.e. the add is not widened to 64 bits.
define i64 @add_nsw_sext(i32 %i, i64 %x) {
; CHECK-LABEL: add_nsw_sext:
; CHECK:       # BB#0:
; CHECK-NEXT:    addl $5, %edi
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; The only use of the widened value is the return below.
  %ext = sext i32 %add to i64
  ret i64 %ext
}

; The typical use case: a 64-bit system where an 'int' is used as an index into an array.

; Returns the address of element sext(%i + 5) of the i8 array at %x.
; The expected assembly sign-extends %i (movslq) and computes the address with
; one LEA: base %rsi, index %rax (no scale), displacement 5.
define i8* @gep8(i32 %i, i8* %x) {
; CHECK-LABEL: gep8:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq 5(%rsi,%rax), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; Widen the 32-bit index to 64 bits.
  %ext = sext i32 %add to i64
  ; Index the i8 array with the widened value.
  %idx = getelementptr i8, i8* %x, i64 %ext
  ret i8* %idx
}

; Returns the address of element sext(%i + -5) of the i16 array at %x.
; The expected assembly sign-extends %i (movslq) and computes the address with
; one LEA: scale 2 (element size) and displacement -10 (-5 * 2).
define i16* @gep16(i32 %i, i16* %x) {
; CHECK-LABEL: gep16:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq -10(%rsi,%rax,2), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, -5
  ; Widen the 32-bit index to 64 bits.
  %ext = sext i32 %add to i64
  ; Index the i16 array with the widened value.
  %idx = getelementptr i16, i16* %x, i64 %ext
  ret i16* %idx
}

; Returns the address of element sext(%i + 5) of the i32 array at %x.
; The expected assembly sign-extends %i (movslq) and computes the address with
; one LEA: scale 4 (element size) and displacement 20 (5 * 4).
define i32* @gep32(i32 %i, i32* %x) {
; CHECK-LABEL: gep32:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq 20(%rsi,%rax,4), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; Widen the 32-bit index to 64 bits.
  %ext = sext i32 %add to i64
  ; Index the i32 array with the widened value.
  %idx = getelementptr i32, i32* %x, i64 %ext
  ret i32* %idx
}

; Returns the address of element sext(%i + -5) of the i64 array at %x.
; The expected assembly sign-extends %i (movslq) and computes the address with
; one LEA: scale 8 (element size) and displacement -40 (-5 * 8).
define i64* @gep64(i32 %i, i64* %x) {
; CHECK-LABEL: gep64:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    leaq -40(%rsi,%rax,8), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, -5
  ; Widen the 32-bit index to 64 bits.
  %ext = sext i32 %add to i64
  ; Index the i64 array with the widened value.
  %idx = getelementptr i64, i64* %x, i64 %ext
  ret i64* %idx
}

; LEA can't scale by 16, but the adds can still be combined into an LEA.

; Returns the address of element sext(%i + 5) of the i128 array at %x.
; The expected assembly sign-extends %i (movslq), multiplies it by the 16-byte
; element size with a separate shift (shlq $4), and then uses an unscaled LEA
; with displacement 80 (5 * 16).
define i128* @gep128(i32 %i, i128* %x) {
; CHECK-LABEL: gep128:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %edi, %rax
; CHECK-NEXT:    shlq $4, %rax
; CHECK-NEXT:    leaq 80(%rsi,%rax), %rax
; CHECK-NEXT:    retq

  %add = add nsw i32 %i, 5
  ; Widen the 32-bit index to 64 bits.
  %ext = sext i32 %add to i64
  ; Index the i128 array with the widened value.
  %idx = getelementptr i128, i128* %x, i64 %ext
  ret i128* %idx
}

; A bigger win can be achieved when there is more than one use of the
; sign extended value. In this case, we can eliminate sign extension
; instructions plus use more efficient addressing modes for memory ops.

; IR for a[i] = a[i+1] + a[i+2] with a 32-bit index %i into the i32 array %a.
; The expected assembly contains a single sign extension of %i (movslq); the
; +1 and +2 index offsets appear as displacements 4 and 8 in the two loads'
; addressing modes, and the store uses the same base/index with no displacement.
define void @PR20134(i32* %a, i32 %i) {
; CHECK-LABEL: PR20134:
; CHECK:       # BB#0:
; CHECK-NEXT:    movslq %esi, %rax
; CHECK-NEXT:    movl 4(%rdi,%rax,4), %ecx
; CHECK-NEXT:    addl 8(%rdi,%rax,4), %ecx
; CHECK-NEXT:    movl %ecx, (%rdi,%rax,4)
; CHECK-NEXT:    retq

  %add1 = add nsw i32 %i, 1
  %idx1 = sext i32 %add1 to i64
  %gep1 = getelementptr i32, i32* %a, i64 %idx1
  ; load1 = a[i + 1]
  %load1 = load i32, i32* %gep1, align 4

  ; Second index: i + 2, again a 32-bit 'add nsw' followed by a sext.
  %add2 = add nsw i32 %i, 2
  %idx2 = sext i32 %add2 to i64
  %gep2 = getelementptr i32, i32* %a, i64 %idx2
  ; load2 = a[i + 2]
  %load2 = load i32, i32* %gep2, align 4

  ; Sum of the two loaded elements (this add has no 'nsw' flag).
  %add3 = add i32 %load1, %load2
  ; The destination index is %i itself, sign-extended without any add.
  %idx3 = sext i32 %i to i64
  %gep3 = getelementptr i32, i32* %a, i64 %idx3
  ; a[i] = load1 + load2
  store i32 %add3, i32* %gep3, align 4
  ret void
}
Make utils/update_llc_test_checks.py note that the assertions are autogenerated. Also update existing test cases which appear to be generated by it and weren't modified (other than addition of the header) by rerunning it. llvm-svn: 253917 2015-11-24 05:33:58 +08:00			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; RUN: llc < %s -mtriple=x86_64-unknown-unknown \| FileCheck %s`

			`; The fundamental problem: an add separated from other arithmetic by a sext can't`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; be combined with the later instructions. However, if the first add is 'nsw',`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; then we can promote the sext ahead of that add to allow optimizations.`

			`define i64 @add_nsw_consts(i32 %i) {`
			`; CHECK-LABEL: add_nsw_consts:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: addq $12, %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`%idx = add i64 %ext, 7`
			`ret i64 %idx`
			`}`

			`; An x86 bonus: If we promote the sext ahead of the 'add nsw',`
			`; we allow LEA formation and eliminate an add instruction.`

			`define i64 @add_nsw_sext_add(i32 %i, i64 %x) {`
			`; CHECK-LABEL: add_nsw_sext_add:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
DAGCombiner: Don't unnecessarily swap operands in ReassociateOps In the case where op = add, y = base_ptr, and x = offset, this transform: (op y, (op x, c1)) -> (op (op x, y), c1) breaks the canonical form of add by putting the base pointer in the second operand and the offset in the first. This fix is important for the R600 target, because for some address spaces the base pointer and the offset are stored in separate register classes. The old pattern caused the ISel code for matching addressing modes to put the base pointer and offset in the wrong register classes, which required no-trivial code transformations to fix. llvm-svn: 262148 2016-02-28 03:57:45 +08:00			`; CHECK-NEXT: leaq 5(%rsi,%rax), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`%idx = add i64 %x, %ext`
			`ret i64 %idx`
			`}`

			`; Throw in a scale (left shift) because an LEA can do that too.`
			`; Use a negative constant (LEA displacement) to verify that's handled correctly.`

			`define i64 @add_nsw_sext_lsh_add(i32 %i, i64 %x) {`
			`; CHECK-LABEL: add_nsw_sext_lsh_add:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, -5`
			`%ext = sext i32 %add to i64`
			`%shl = shl i64 %ext, 3`
			`%idx = add i64 %x, %shl`
			`ret i64 %idx`
			`}`

			`; Don't promote the sext if it has no users. The wider add instruction needs an`
			`; extra byte to encode.`

			`define i64 @add_nsw_sext(i32 %i, i64 %x) {`
			`; CHECK-LABEL: add_nsw_sext:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: addl $5, %edi`
			`; CHECK-NEXT: movslq %edi, %rax`
			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`ret i64 %ext`
			`}`

			`; The typical use case: a 64-bit system where an 'int' is used as an index into an array.`

			`define i8* @gep8(i32 %i, i8* %x) {`
			`; CHECK-LABEL: gep8:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
DAGCombiner: Don't unnecessarily swap operands in ReassociateOps In the case where op = add, y = base_ptr, and x = offset, this transform: (op y, (op x, c1)) -> (op (op x, y), c1) breaks the canonical form of add by putting the base pointer in the second operand and the offset in the first. This fix is important for the R600 target, because for some address spaces the base pointer and the offset are stored in separate register classes. The old pattern caused the ISel code for matching addressing modes to put the base pointer and offset in the wrong register classes, which required no-trivial code transformations to fix. llvm-svn: 262148 2016-02-28 03:57:45 +08:00			`; CHECK-NEXT: leaq 5(%rsi,%rax), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`%idx = getelementptr i8, i8* %x, i64 %ext`
			`ret i8* %idx`
			`}`

			`define i16* @gep16(i32 %i, i16* %x) {`
			`; CHECK-LABEL: gep16:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: leaq -10(%rsi,%rax,2), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, -5`
			`%ext = sext i32 %add to i64`
			`%idx = getelementptr i16, i16* %x, i64 %ext`
			`ret i16* %idx`
			`}`

			`define i32* @gep32(i32 %i, i32* %x) {`
			`; CHECK-LABEL: gep32:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: leaq 20(%rsi,%rax,4), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`%idx = getelementptr i32, i32* %x, i64 %ext`
			`ret i32* %idx`
			`}`

			`define i64* @gep64(i32 %i, i64* %x) {`
			`; CHECK-LABEL: gep64:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: leaq -40(%rsi,%rax,8), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, -5`
			`%ext = sext i32 %add to i64`
			`%idx = getelementptr i64, i64* %x, i64 %ext`
			`ret i64* %idx`
			`}`

			`; LEA can't scale by 16, but the adds can still be combined into an LEA.`

			`define i128* @gep128(i32 %i, i128* %x) {`
			`; CHECK-LABEL: gep128:`
			`; CHECK: # BB#0:`
			`; CHECK-NEXT: movslq %edi, %rax`
			`; CHECK-NEXT: shlq $4, %rax`
DAGCombiner: Don't unnecessarily swap operands in ReassociateOps In the case where op = add, y = base_ptr, and x = offset, this transform: (op y, (op x, c1)) -> (op (op x, y), c1) breaks the canonical form of add by putting the base pointer in the second operand and the offset in the first. This fix is important for the R600 target, because for some address spaces the base pointer and the offset are stored in separate register classes. The old pattern caused the ISel code for matching addressing modes to put the base pointer and offset in the wrong register classes, which required no-trivial code transformations to fix. llvm-svn: 262148 2016-02-28 03:57:45 +08:00			`; CHECK-NEXT: leaq 80(%rsi,%rax), %rax`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add = add nsw i32 %i, 5`
			`%ext = sext i32 %add to i64`
			`%idx = getelementptr i128, i128* %x, i64 %ext`
			`ret i128* %idx`
			`}`

			`; A bigger win can be achieved when there is more than one use of the`
			`; sign extended value. In this case, we can eliminate sign extension`
			`; instructions plus use more efficient addressing modes for memory ops.`

			`define void @PR20134(i32* %a, i32 %i) {`
			`; CHECK-LABEL: PR20134:`
			`; CHECK: # BB#0:`
[x86] promote 'add nsw' to a wider type to allow more combines The motivation for this patch starts with PR20134: https://llvm.org/bugs/show_bug.cgi?id=20134 void foo(int *a, int i) { a[i] = a[i+1] + a[i+2]; } It seems better to produce this (14 bytes): movslq %esi, %rsi movl 0x4(%rdi,%rsi,4), %eax addl 0x8(%rdi,%rsi,4), %eax movl %eax, (%rdi,%rsi,4) Rather than this (22 bytes): leal 0x1(%rsi), %eax cltq leal 0x2(%rsi), %ecx movslq %ecx, %rcx movl (%rdi,%rcx,4), %ecx addl (%rdi,%rax,4), %ecx movslq %esi, %rax movl %ecx, (%rdi,%rax,4) The most basic problem (the first test case in the patch combines constants) should also be fixed in InstCombine, but it gets more complicated after that because we need to consider architecture and micro-architecture. For example, AArch64 may not see any benefit from the more general transform because the ISA solves the sexting in hardware. Some x86 chips may not want to replace 2 ADD insts with 1 LEA, and there's an attribute for that: FeatureSlowLEA. But I suspect that doesn't go far enough or maybe it's not getting used when it should; I'm also not sure if FeatureSlowLEA should also mean "slow complex addressing mode". I see no perf differences on test-suite with this change running on AMD Jaguar, but I see small code size improvements when building clang and the LLVM tools with the patched compiler. A more general solution to the sext(add nsw(x, C)) problem that works for multiple targets is available in CodeGenPrepare, but it may take quite a bit more work to get that to fire on all of the test cases that this patch takes care of. Differential Revision: http://reviews.llvm.org/D13757 llvm-svn: 250560 2015-10-17 06:14:12 +08:00			`; CHECK-NEXT: movslq %esi, %rax`
			`; CHECK-NEXT: movl 4(%rdi,%rax,4), %ecx`
			`; CHECK-NEXT: addl 8(%rdi,%rax,4), %ecx`
			`; CHECK-NEXT: movl %ecx, (%rdi,%rax,4)`
add x86 codegen tests for 'add nsw' followed by 'sext' llvm-svn: 250332 2015-10-15 05:47:03 +08:00			`; CHECK-NEXT: retq`

			`%add1 = add nsw i32 %i, 1`
			`%idx1 = sext i32 %add1 to i64`
			`%gep1 = getelementptr i32, i32* %a, i64 %idx1`
			`%load1 = load i32, i32* %gep1, align 4`

			`%add2 = add nsw i32 %i, 2`
			`%idx2 = sext i32 %add2 to i64`
			`%gep2 = getelementptr i32, i32* %a, i64 %idx2`
			`%load2 = load i32, i32* %gep2, align 4`

			`%add3 = add i32 %load1, %load2`
			`%idx3 = sext i32 %i to i64`
			`%gep3 = getelementptr i32, i32* %a, i64 %idx3`
			`store i32 %add3, i32* %gep3, align 4`
			`ret void`
			`}`