; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 | FileCheck %s

; SHLD/SHRD are VectorPath (microcoded) instructions known to have poor
; latency on certain architectures. While generating them is acceptable when
; optimizing for size, optimizing for speed on these platforms should use
; alternative sequences composed of ADD, ADC, SHR, SHL, OR, AND, and LEA,
; which are DirectPath instructions. Besides their lower latency, these
; sequences also increase decode bandwidth, since a third DirectPath
; instruction can be decoded simultaneously.
; The AMD K7, K8, K10, K12, K15, and K16 processor families are known to have
; SHLD/SHRD instructions with very poor latency, and their optimization guides
; recommend alternative instruction sequences, so the fold of
; (or (x << c), (y >> (64 - c))) into SHLD/SHRD is disabled for these
; processors unless we are optimizing for size. Disabling the fold might also
; benefit some Intel processors, but in the absence of specific
; recommendations for them the peephole is left enabled for Intel.
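; (For reference, in AT&T syntax "shld $c, %src, %dst" computes
; dst = (dst << c) | (src >> (64 - c)), which is exactly the pattern the
; functions below exercise.)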
; Verify that for the architectures that are known to have poor latency
; double precision shift instructions we generate an alternative sequence
; of instructions with lower latencies instead of the shld instruction.
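; For example, for lshift1 below the expected lowering on bdver1 is the
; DirectPath pair
;   shrq $63, %rsi
;   leaq (%rsi,%rdi,2), %rax
; rather than a microcoded SHLD form (shown here only as an illustration of
; what is being avoided):
;   shldq $1, %rsi, %rdi
;   movq  %rdi, %rax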
;uint64_t lshift1(uint64_t a, uint64_t b)
;{
; return (a << 1) | (b >> 63);
;}

; The bitwise 'or' of the shifted values is matched into an LEA (the original
; motivation was PR18007, https://llvm.org/bugs/show_bug.cgi?id=18007, though
; that case is not yet handled). The matcher treats the 'or' as an 'add'
; whenever the operands have no common bits set, using the same logic as
; ValueTracking's haveNoCommonBitsSet(); this is more general than the earlier
; computeKnownBits()-based check via
; isBaseWithConstantOffset() -> MaskedValueIsZero(). An example of the better
; LEA matching:
;   leal (%rdi,%rdi), %eax
;   andl $1, %esi
;   orl  %esi, %eax
; becomes:
;   andl $1, %esi
;   leal (%rsi,%rdi,2), %eax
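; For instance, in lshift1 below %shl = (a << 1) always has bit 0 clear and
; %shr = (b >> 63) is either 0 or 1, so the two values share no set bits; the
; 'or' is therefore equivalent to an 'add' and folds into the LEA addressing
; mode as leaq (%rsi,%rdi,2), %rax, i.e. (b >> 63) + 2*a.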
; CHECK-LABEL: lshift1:
; CHECK: shrq $63, %rsi
; CHECK-NEXT: leaq (%rsi,%rdi,2), %rax

define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 1
  %shr = lshr i64 %b, 63
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift2(uint64_t a, uint64_t b)
;{
; return (a << 2) | (b >> 62);
;}

; CHECK-LABEL: lshift2:
; CHECK: shrq $62, %rsi
; CHECK-NEXT: leaq (%rsi,%rdi,4), %rax

define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 2
  %shr = lshr i64 %b, 62
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift7(uint64_t a, uint64_t b)
;{
; return (a << 7) | (b >> 57);
;}

; CHECK-LABEL: lshift7:
; CHECK: shlq $7, {{.*}}
; CHECK-NEXT: shrq $57, {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 7
  %shr = lshr i64 %b, 57
  %or = or i64 %shr, %shl
  ret i64 %or
}

;uint64_t lshift63(uint64_t a, uint64_t b)
;{
; return (a << 63) | (b >> 1);
;}

; CHECK-LABEL: lshift63:
; CHECK: shlq $63, {{.*}}
; CHECK-NEXT: shrq {{.*}}
; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}

define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
entry:
  %shl = shl i64 %a, 63
  %shr = lshr i64 %b, 1
  %or = or i64 %shr, %shl
  ret i64 %or
}