From 5015a89aa50b20c1a5539f00cb375eca661da7e2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Fri, 15 Aug 2014 17:17:07 +0000
Subject: [PATCH] R600/SI: Implement isLegalAddressingMode

The default assumes that a 16-bit signed offset is used.
LDS instruction use a 16-bit unsigned offset, so it wasn't
being used in some cases where it was assumed a negative offset
could be used.

More should be done here, but first isLegalAddressingMode needs
to gain an addressing mode argument. For now, copy most of the rest
of the default implementation with the immediate offset change.

llvm-svn: 215732
---
 llvm/lib/Target/R600/SIISelLowering.cpp       | 43 +++++++++++++
 llvm/lib/Target/R600/SIISelLowering.h         |  4 ++
 ...ds-negative-offset-addressing-mode-loop.ll | 60 +++++++++++++++++++
 3 files changed, 107 insertions(+)
 create mode 100644 llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll

diff --git a/llvm/lib/Target/R600/SIISelLowering.cpp b/llvm/lib/Target/R600/SIISelLowering.cpp
index f9be144461a0..911c5e55949e 100644
--- a/llvm/lib/Target/R600/SIISelLowering.cpp
+++ b/llvm/lib/Target/R600/SIISelLowering.cpp
@@ -242,6 +242,49 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 // TargetLowering queries
 //===----------------------------------------------------------------------===//
 
+// FIXME: This really needs an address space argument. The immediate offset
+// size is different for different sets of memory instruction sets.
+
+// The single offset DS instructions have a 16-bit unsigned byte offset.
+//
+// MUBUF / MTBUF have a 12-bit unsigned byte offset, and additionally can do r +
+// r + i with addr64. 32-bit has more addressing mode options. Depending on the
+// resource constant, it can also do (i64 r0) + (i32 r1) * (i14 i).
+//
+// SMRD instructions have an 8-bit, dword offset.
+//
+bool SITargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                             Type *Ty) const {
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // Allow a 16-bit unsigned immediate field, since this is what DS instructions
+  // use.
+  if (!isUInt<16>(AM.BaseOffs))
+    return false;
+
+  // Only support r+r,
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default: // Don't allow n * r
+    return false;
+  }
+
+  return true;
+}
+
 bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT  VT,
                                                       unsigned AddrSpace,
                                                       unsigned Align,
diff --git a/llvm/lib/Target/R600/SIISelLowering.h b/llvm/lib/Target/R600/SIISelLowering.h
index cbc77a5cd789..2a9aa49491fa 100644
--- a/llvm/lib/Target/R600/SIISelLowering.h
+++ b/llvm/lib/Target/R600/SIISelLowering.h
@@ -59,6 +59,10 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
 public:
   SITargetLowering(TargetMachine &tm);
+
+  bool isLegalAddressingMode(const AddrMode &AM,
+                             Type *Ty) const override;
+
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *IsFast) const override;
diff --git a/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll b/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
new file mode 100644
index 000000000000..d768c0e4ba9b
--- /dev/null
+++ b/llvm/test/CodeGen/R600/ds-negative-offset-addressing-mode-loop.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+declare i32 @llvm.r600.read.tidig.x() #0
+declare void @llvm.AMDGPU.barrier.local() #1
+
+; Function Attrs: nounwind
+; SI-LABEL: @signed_ds_offset_addressing_loop
+; SI: BB0_1:
+; SI: V_ADD_I32_e32 [[VADDR:v[0-9]+]],
+; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x0
+; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x4
+; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x80
+; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x84
+; SI-DAG: DS_READ_B32 v{{[0-9]+}}, [[VADDR]], 0x100
+; SI: S_ENDPGM
+define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+entry:
+  %x.i = tail call i32 @llvm.r600.read.tidig.x() #0
+  %mul = shl nsw i32 %x.i, 1
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %sum.03 = phi float [ 0.000000e+00, %entry ], [ %add13, %for.body ]
+  %offset.02 = phi i32 [ %mul, %entry ], [ %add14, %for.body ]
+  %k.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.AMDGPU.barrier.local() #1
+  %arrayidx = getelementptr inbounds float addrspace(3)* %lptr, i32 %offset.02
+  %tmp = load float addrspace(3)* %arrayidx, align 4
+  %add1 = add nsw i32 %offset.02, 1
+  %arrayidx2 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add1
+  %tmp1 = load float addrspace(3)* %arrayidx2, align 4
+  %add3 = add nsw i32 %offset.02, 32
+  %arrayidx4 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add3
+  %tmp2 = load float addrspace(3)* %arrayidx4, align 4
+  %add5 = add nsw i32 %offset.02, 33
+  %arrayidx6 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add5
+  %tmp3 = load float addrspace(3)* %arrayidx6, align 4
+  %add7 = add nsw i32 %offset.02, 64
+  %arrayidx8 = getelementptr inbounds float addrspace(3)* %lptr, i32 %add7
+  %tmp4 = load float addrspace(3)* %arrayidx8, align 4
+  %add9 = fadd float %tmp, %tmp1
+  %add10 = fadd float %add9, %tmp2
+  %add11 = fadd float %add10, %tmp3
+  %add12 = fadd float %add11, %tmp4
+  %add13 = fadd float %sum.03, %add12
+  %inc = add nsw i32 %k.01, 1
+  %add14 = add nsw i32 %offset.02, 97
+  %exitcond = icmp eq i32 %inc, 8
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  %tmp5 = sext i32 %x.i to i64
+  %arrayidx15 = getelementptr inbounds float addrspace(1)* %out, i64 %tmp5
+  store float %add13, float addrspace(1)* %arrayidx15, align 4
+  ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { noduplicate nounwind }
+attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }