From 29bc5dd19407c4d7cad1c059dea26ee216ddc7ca Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Wed, 5 Feb 2020 10:27:43 -0800
Subject: [PATCH] [BPF] implement isTruncateFree and isZExtFree in
 BPFTargetLowering

Currently, the isTruncateFree() and isZExtFree() callbacks return false
as they are not implemented in the BPF backend. This may cause suboptimal
code generation. For example, if the load in the context of zero extension
has more than one use, the pattern zextload{i8,i16,i32} will
not be generated. Rather, the load will be matched first and
then the result is zero extended.

For example, in the test together with this commit, we have
   I1: %0 = load i32, i32* %data_end1, align 4, !tbaa !2
   I2: %conv = zext i32 %0 to i64
   ...
   I3: %2 = load i32, i32* %data, align 4, !tbaa !7
   I4: %conv2 = zext i32 %2 to i64
   ...
   I5: %4 = trunc i64 %sub.ptr.lhs.cast to i32
   I6: %conv13 = sub i32 %4, %2
   ...

I1 and I2 will match a single zextloadi32 DAG node, where SUBREG_TO_REG is
used to convert a 32-bit register to a 64-bit one. During code generation,
SUBREG_TO_REG is a no-op.

The %2 in I3 is used in both I4 and I6. If isTruncateFree() is false,
the current implementation will generate a SLL_ri and SRL_ri
for the zext part during lowering.

This patch implements isTruncateFree() in the BPF backend, so that for the
above example, I3 and I4 will generate a zextloadi32 DAG node, with
SUBREG_TO_REG generated during lowering to Machine IR.

isZExtFree() is also implemented as it should help code gen as well.

This patch also enables the change in https://reviews.llvm.org/D73985
since it won't kick in generates MOV_32_64 machine instruction.

Differential Revision: https://reviews.llvm.org/D74101
---
 llvm/lib/Target/BPF/BPFISelLowering.cpp       | 32 ++++++++
 llvm/lib/Target/BPF/BPFISelLowering.h         | 10 +++
 .../BPF/CORE/offset-reloc-fieldinfo-1.ll      | 10 ++-
 .../BPF/CORE/offset-reloc-fieldinfo-2.ll      | 23 +++---
 llvm/test/CodeGen/BPF/is_trunc_free.ll        | 80 +++++++++++++++++++
 llvm/test/CodeGen/BPF/is_zext_free.ll         | 26 ++++++
 6 files changed, 168 insertions(+), 13 deletions(-)
 create mode 100644 llvm/test/CodeGen/BPF/is_trunc_free.ll
 create mode 100644 llvm/test/CodeGen/BPF/is_zext_free.ll

diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 56e0288f26c9..799a60c6cc92 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -171,6 +171,38 @@ bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) cons
   return false;
 }
 
+bool BPFTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  // Free only when an integer Ty1 is strictly wider than an integer Ty2.
+  if (!(Ty1->isIntegerTy() && Ty2->isIntegerTy()))
+    return false;
+  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
+  return SrcBits > static_cast<unsigned>(Ty2->getPrimitiveSizeInBits());
+}
+
+bool BPFTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  // Free only when an integer VT1 is strictly wider than an integer VT2.
+  if (!(VT1.isInteger() && VT2.isInteger()))
+    return false;
+  const unsigned SrcBits = VT1.getSizeInBits();
+  return SrcBits > static_cast<unsigned>(VT2.getSizeInBits());
+}
+
+bool BPFTargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  // Free only for i32 -> i64, and only when getHasAlu32() reports alu32 mode.
+  if (!(getHasAlu32() && Ty1->isIntegerTy() && Ty2->isIntegerTy()))
+    return false;
+  const unsigned SrcBits = Ty1->getPrimitiveSizeInBits();
+  return SrcBits == 32 && Ty2->getPrimitiveSizeInBits() == 64;
+}
+
+bool BPFTargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // Free only for a 32-bit integer VT1 extended to a 64-bit VT2 under alu32.
+  if (!(getHasAlu32() && VT1.isInteger() && VT2.isInteger()))
+    return false;
+  const unsigned SrcBits = VT1.getSizeInBits();
+  return SrcBits == 32 && VT2.getSizeInBits() == 64;
+}
+
 std::pair<unsigned, const TargetRegisterClass *>
 BPFTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                 StringRef Constraint,
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index 0e9ec1398aeb..cc752dda87b0 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -122,6 +122,16 @@ private:
     return false;
   }
 
+  // isTruncateFree - Return true if it's free to truncate a value of
+  // type Ty1 to type Ty2. e.g. On BPF in alu32 mode, it's free to truncate
+  // an i64 value in register R1 to i32 by referencing its sub-register W1.
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+  // isZExtFree - Zero-extending a 32-bit ALU result to 64 bits is free.
+  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+  bool isZExtFree(EVT VT1, EVT VT2) const override;
+
   unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
                          bool isSigned) const;
 
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
index d7e48d390416..d871cb35311b 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
-; RUN: llc -march=bpfel -mattr=+alu32 -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK,CHECK64 %s
+; RUN: llc -march=bpfel -mattr=+alu32 -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK,CHECK32 %s
 ; Source code:
 ;   struct s {
 ;     int a;
@@ -74,8 +74,10 @@ entry:
 ; CHECK:             r{{[0-9]+}} = 4
 ; CHECK:             r{{[0-9]+}} = 4
 ; CHECK:             r{{[0-9]+}} <<= 51
-; CHECK:             r{{[0-9]+}} s>>= 60
-; CHECK:             r{{[0-9]+}} >>= 60
+; CHECK64:           r{{[0-9]+}} s>>= 60
+; CHECK64:           r{{[0-9]+}} >>= 60
+; CHECK32:           r{{[0-9]+}} >>= 60
+; CHECK32:           r{{[0-9]+}} s>>= 60
 ; CHECK:             r{{[0-9]+}} = 1
 
 ; CHECK:             .byte   115                     # string offset=1
diff --git a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
index 01af9d8a697b..45d5ae1e1f30 100644
--- a/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
+++ b/llvm/test/CodeGen/BPF/CORE/offset-reloc-fieldinfo-2.ll
@@ -116,8 +116,10 @@ sw.epilog:                                        ; preds = %entry, %sw.bb9, %sw
 ; CHECK:             r{{[0-9]+}} = 4
 ; CHECK-EL:          r{{[0-9]+}} <<= 51
 ; CHECK-EB:          r{{[0-9]+}} <<= 41
-; CHECK:             r{{[0-9]+}} s>>= 60
-; CHECK:             r{{[0-9]+}} >>= 60
+; CHECK64:           r{{[0-9]+}} s>>= 60
+; CHECK64:           r{{[0-9]+}} >>= 60
+; CHECK32:           r{{[0-9]+}} >>= 60
+; CHECK32:           r{{[0-9]+}} s>>= 60
 ; CHECK:             r{{[0-9]+}} = 1
 
 ; CHECK:             .long   1                       # BTF_KIND_STRUCT(id = 2)
@@ -127,8 +129,11 @@ sw.epilog:                                        ; preds = %entry, %sw.bb9, %sw
 
 ; CHECK:             .long   16                      # FieldReloc
 ; CHECK-NEXT:        .long   30                      # Field reloc section string offset=30
-; CHECK32:           .long   6
-; CHECK64:           .long   7
+; CHECK-NEXT:        .long   8
+; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   36
+; CHECK-NEXT:        .long   1
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   36
@@ -136,11 +141,11 @@ sw.epilog:                                        ; preds = %entry, %sw.bb9, %sw
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   36
-; CHECK-NEXT:        .long   1
-; CHECK64:           .long   .Ltmp{{[0-9]+}}
-; CHECK64:           .long   2
-; CHECK64:           .long   36
-; CHECK64:           .long   0
+; CHECK-NEXT:        .long   0
+; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
+; CHECK-NEXT:        .long   2
+; CHECK-NEXT:        .long   36
+; CHECK-NEXT:        .long   0
 ; CHECK-NEXT:        .long   .Ltmp{{[0-9]+}}
 ; CHECK-NEXT:        .long   2
 ; CHECK-NEXT:        .long   36
diff --git a/llvm/test/CodeGen/BPF/is_trunc_free.ll b/llvm/test/CodeGen/BPF/is_trunc_free.ll
new file mode 100644
index 000000000000..cc6bef1a92a9
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/is_trunc_free.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=bpfel -mattr=+alu32 < %s | FileCheck %s
+; Source:
+;   struct env_t {
+;     unsigned data;
+;     unsigned data_end;
+;   };
+;   extern int work(struct env_t *skb, unsigned offset);
+;   int test(struct env_t *skb)
+;   {
+;     void *cursor, *data_end;
+;     struct env_t *srh, *ip;
+;
+;     data_end = (void *)(long)skb->data_end;
+;     cursor = (void *)(long)skb->data;
+;
+;     ip = cursor; cursor += sizeof(*ip);
+;     if ((void *)ip + sizeof(*ip) > data_end)
+;       return 0;
+;
+;     srh = cursor; cursor += sizeof(*srh);
+;     if ((void *)srh + sizeof(*srh) > data_end)
+;       return 0;
+;
+;     return work(skb, (char *)srh - (char *)(long)skb->data);
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -emit-llvm -S test.c
+
+%struct.env_t = type { i32, i32 }
+
+; Function Attrs: nounwind
+define dso_local i32 @test(%struct.env_t* %skb) local_unnamed_addr #0 { ; IR for test() from the Source comment above
+entry:
+  %data_end1 = getelementptr inbounds %struct.env_t, %struct.env_t* %skb, i64 0, i32 1 ; &skb->data_end (field index 1)
+  %0 = load i32, i32* %data_end1, align 4, !tbaa !2 ; 32-bit load whose only use is the zext below
+  %conv = zext i32 %0 to i64
+  %1 = inttoptr i64 %conv to i8*
+  %data = getelementptr inbounds %struct.env_t, %struct.env_t* %skb, i64 0, i32 0 ; &skb->data (field index 0)
+  %2 = load i32, i32* %data, align 4, !tbaa !7 ; 32-bit load with two uses: the zext below and the sub in if.end10
+  %conv2 = zext i32 %2 to i64
+  %3 = inttoptr i64 %conv2 to i8*
+  %add.ptr = getelementptr i8, i8* %3, i64 8 ; data pointer advanced by 8 bytes
+  %cmp = icmp ugt i8* %add.ptr, %1
+  %add.ptr6 = getelementptr i8, i8* %3, i64 16 ; data pointer advanced by 16 bytes
+  %cmp7 = icmp ugt i8* %add.ptr6, %1
+  %or.cond = or i1 %cmp, %cmp7 ; both bounds checks folded into a single branch condition
+  br i1 %or.cond, label %cleanup, label %if.end10
+
+if.end10:                                         ; preds = %entry
+  %sub.ptr.lhs.cast = ptrtoint i8* %add.ptr to i64
+  %4 = trunc i64 %sub.ptr.lhs.cast to i32 ; i64 -> i32 truncate of the advanced pointer value
+  %conv13 = sub i32 %4, %2 ; second use of the loaded %2, as a 32-bit operand
+  %call = tail call i32 @work(%struct.env_t* nonnull %skb, i32 %conv13) #2
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %if.end10
+  %retval.0 = phi i32 [ %call, %if.end10 ], [ 0, %entry ] ; result of work(), or 0 when a bounds check failed
+  ret i32 %retval.0
+}
+
+; CHECK: w{{[0-9]+}} = *(u32 *)(r{{[0-9]+}} + 0)
+; CHECK-NOT: w{{[0-9]+}} = w{{[0-9]+}}
+
+declare dso_local i32 @work(%struct.env_t*, i32) local_unnamed_addr #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git 016d3ce1f4b07ee3056f7c10fedb24c441c4870f)"}
+!2 = !{!3, !4, i64 4}
+!3 = !{!"env_t", !4, i64 0, !4, i64 4}
+!4 = !{!"int", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
+!7 = !{!3, !4, i64 0}
diff --git a/llvm/test/CodeGen/BPF/is_zext_free.ll b/llvm/test/CodeGen/BPF/is_zext_free.ll
new file mode 100644
index 000000000000..2a62dbefd62d
--- /dev/null
+++ b/llvm/test/CodeGen/BPF/is_zext_free.ll
@@ -0,0 +1,26 @@
+; RUN: llc -march=bpfel -mattr=+alu32 < %s | FileCheck %s
+; Source:
+;   unsigned test(unsigned long x, unsigned long y) {
+;     return x & y;
+;   }
+; Compilation flag:
+;   clang -target bpf -O2 -emit-llvm -S test.c
+
+; Function Attrs: norecurse nounwind readnone
+define dso_local i32 @test(i64 %x, i64 %y) local_unnamed_addr #0 { ; IR for test() from the Source comment above
+entry:
+  %and = and i64 %y, %x ; AND performed on the full 64-bit arguments
+  %conv = trunc i64 %and to i32 ; only the low 32 bits of the result are returned
+  ret i32 %conv
+}
+
+; CHECK: r[[REG1:[0-9]+]] = r{{[0-9]+}}
+; CHECK: w[[REG1]] &= w{{[0-9]+}}
+
+attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{!"clang version 11.0.0 (https://github.com/llvm/llvm-project.git b3ab5b2e7ffe9964ddf75a92fd7a444fe5aaa426)"}