[Intrinsics] define funnel shift IR intrinsics + DAG builder support
As discussed here:
http://lists.llvm.org/pipermail/llvm-dev/2018-May/123292.html
http://lists.llvm.org/pipermail/llvm-dev/2018-July/124400.html

We want to add rotate intrinsics because the IR expansion of that pattern is 4+ instructions, and we can lose pieces of the pattern before it gets to the backend. Generalizing the operation by allowing 2 different input values (plus the 3rd shift/rotate amount) gives us a "funnel shift" operation which may also be a single hardware instruction.

Initially, I thought we needed to define new DAG nodes for these ops, and I spent time working on that (much larger patch), but then I concluded that we don't need it. At least as a first step, we have all of the backend support necessary to match these ops...because it was required. And shepherding these through the IR optimizer is the primary concern, so the IR intrinsics are likely all that we'll ever need.

There was also a question about converting the intrinsics to the existing ROTL/ROTR DAG nodes (along with improving the oversized shift documentation). Again, I don't think that's strictly necessary (as the test results here prove). That can be an efficiency improvement as a small follow-up patch.

So all we're left with is documentation, definition of the IR intrinsics, and DAG builder support.

Differential Revision: https://reviews.llvm.org/D49242

llvm-svn: 337221
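To make the "4+ instructions" point concrete, the usual UB-free rotate-left idiom at the source level is shown below; once lowered to IR it becomes the and/sub/and/shl/lshr/or chain that optimizers must keep intact to re-form a rotate. This is an editor's sketch for illustration, not part of the patch:

    #include <cstdint>

    // Portable rotate-left idiom. After lowering to IR this is a chain of
    // and/sub/and/shl/lshr/or instructions, i.e. the pattern that a single
    // @llvm.fshl call (with both value operands equal) now represents.
    static uint32_t rotl32(uint32_t x, uint32_t z) {
      return (x << (z & 31)) | (x >> ((32u - z) & 31));
    }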
This commit is contained in:
parent c4846a551e
commit c71adc8040
@@ -11880,6 +11880,98 @@ then the result is the size in bits of the type of ``src`` if

.. _int_overflow:

'``llvm.fshl.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

This is an overloaded intrinsic. You can use ``llvm.fshl`` on any
integer bit width or any vector of integer elements. Not all targets
support all bit widths or vector types, however.

::

      declare i8 @llvm.fshl.i8 (i8 %a, i8 %b, i8 %c)
      declare i67 @llvm.fshl.i67(i67 %a, i67 %b, i67 %c)
      declare <2 x i32> @llvm.fshl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)

Overview:
"""""""""

The '``llvm.fshl``' family of intrinsic functions performs a funnel shift left:
the first two values are concatenated as { %a : %b } (%a is the most significant
bits of the wide value), the combined value is shifted left, and the most
significant bits are extracted to produce a result that is the same size as the
original arguments. If the first 2 arguments are identical, this is equivalent
to a rotate left operation. For vector types, the operation occurs for each
element of the vector. The shift argument is treated as an unsigned amount
modulo the element size of the arguments.

Arguments:
""""""""""

The first two arguments are the values to be concatenated. The third
argument is the shift amount. The arguments may be any integer type or a
vector with integer element type. All arguments and the return value must
have the same type.

Example:
""""""""

.. code-block:: text

      %r = call i8 @llvm.fshl.i8(i8 %x, i8 %y, i8 %z)  ; %r = i8: msb_extract((concat(x, y) << (z % 8)), 8)
      %r = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)  ; %r = i8: 128 (0b10000000)
      %r = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)  ; %r = i8: 120 (0b01111000)
      %r = call i8 @llvm.fshl.i8(i8 0, i8 255, i8 8)   ; %r = i8: 0   (0b00000000)
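As a quick sanity check of the semantics above, here is a minimal C++ model of the ``i8`` case. The helper name ``fshl8`` is invented for this sketch; nothing below is part of the LLVM sources.

.. code-block:: c++

      #include <cassert>
      #include <cstdint>

      // Reference model of @llvm.fshl.i8: concatenate {a:b}, shift the 16-bit
      // value left by c % 8, and keep the most significant 8 bits.
      static uint8_t fshl8(uint8_t a, uint8_t b, uint8_t c) {
        unsigned s = c % 8;
        if (s == 0)
          return a; // a zero shift amount returns the first argument
        return (uint8_t)((a << s) | (b >> (8 - s)));
      }

      int main() {
        assert(fshl8(255, 0, 15) == 128); // matches the documented examples
        assert(fshl8(15, 15, 11) == 120);
        assert(fshl8(0, 255, 8) == 0);
        return 0;
      }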
'``llvm.fshr.*``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

This is an overloaded intrinsic. You can use ``llvm.fshr`` on any
integer bit width or any vector of integer elements. Not all targets
support all bit widths or vector types, however.

::

      declare i8 @llvm.fshr.i8 (i8 %a, i8 %b, i8 %c)
      declare i67 @llvm.fshr.i67(i67 %a, i67 %b, i67 %c)
      declare <2 x i32> @llvm.fshr.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)

Overview:
"""""""""

The '``llvm.fshr``' family of intrinsic functions performs a funnel shift right:
the first two values are concatenated as { %a : %b } (%a is the most significant
bits of the wide value), the combined value is shifted right, and the least
significant bits are extracted to produce a result that is the same size as the
original arguments. If the first 2 arguments are identical, this is equivalent
to a rotate right operation. For vector types, the operation occurs for each
element of the vector. The shift argument is treated as an unsigned amount
modulo the element size of the arguments.

Arguments:
""""""""""

The first two arguments are the values to be concatenated. The third
argument is the shift amount. The arguments may be any integer type or a
vector with integer element type. All arguments and the return value must
have the same type.

Example:
""""""""

.. code-block:: text

      %r = call i8 @llvm.fshr.i8(i8 %x, i8 %y, i8 %z)  ; %r = i8: lsb_extract((concat(x, y) >> (z % 8)), 8)
      %r = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)  ; %r = i8: 254 (0b11111110)
      %r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)  ; %r = i8: 225 (0b11100001)
      %r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)   ; %r = i8: 255 (0b11111111)
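The mirrored model for the right funnel shift, again an editor's sketch rather than LLVM code; it also checks the rotate equivalence mentioned above when both value operands are the same.

.. code-block:: c++

      #include <cassert>
      #include <cstdint>

      // Reference model of @llvm.fshr.i8: concatenate {a:b}, shift the 16-bit
      // value right by c % 8, and keep the least significant 8 bits.
      static uint8_t fshr8(uint8_t a, uint8_t b, uint8_t c) {
        unsigned s = c % 8;
        if (s == 0)
          return b; // a zero shift amount returns the second argument
        return (uint8_t)((a << (8 - s)) | (b >> s));
      }

      int main() {
        assert(fshr8(255, 0, 15) == 254); // matches the documented examples
        assert(fshr8(15, 15, 11) == 225);
        assert(fshr8(0, 255, 8) == 255);
        // Identical value operands make fshr a rotate right.
        assert(fshr8(0x96, 0x96, 3) == (uint8_t)((0x96 >> 3) | (0x96 << 5)));
        return 0;
      }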
Arithmetic with Overflow Intrinsics
-----------------------------------
@@ -577,6 +577,10 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable] in {
  def int_ctlz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
  def int_cttz : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, llvm_i1_ty]>;
  def int_bitreverse : Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>]>;
  def int_fshl : Intrinsic<[llvm_anyint_ty],
      [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
  def int_fshr : Intrinsic<[llvm_anyint_ty],
      [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
}

//===------------------------ Debugger Intrinsics -------------------------===//
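As a rough illustration of how a frontend or pass could emit a call to the new overloaded intrinsic once this definition lands, the sketch below uses Intrinsic::getDeclaration and IRBuilder. It assumes an existing Module and IRBuilder and is not taken from this patch:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Emit %r = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z).
    // The overload is resolved by passing the concrete operand type, since all
    // three operands and the result share LLVMMatchType<0>.
    static Value *emitFunnelShiftLeft(Module &M, IRBuilder<> &B, Value *X,
                                      Value *Y, Value *Z) {
      Function *Decl =
          Intrinsic::getDeclaration(&M, Intrinsic::fshl, X->getType());
      return B.CreateCall(Decl, {X, Y, Z}, "r");
    }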
@@ -5656,6 +5656,43 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
    setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
    return nullptr;
  }
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    bool IsFSHL = Intrinsic == Intrinsic::fshl;
    SDValue X = getValue(I.getArgOperand(0));
    SDValue Y = getValue(I.getArgOperand(1));
    SDValue Z = getValue(I.getArgOperand(2));
    EVT VT = X.getValueType();

    // TODO: When X == Y, this is rotate. Create the node directly if legal.

    // Get the shift amount and inverse shift amount, modulo the bit-width.
    SDValue BitWidthC = DAG.getConstant(VT.getScalarSizeInBits(), sdl, VT);
    SDValue ShAmt = DAG.getNode(ISD::UREM, sdl, VT, Z, BitWidthC);
    SDValue NegZ = DAG.getNode(ISD::SUB, sdl, VT, BitWidthC, Z);
    SDValue InvShAmt = DAG.getNode(ISD::UREM, sdl, VT, NegZ, BitWidthC);

    // fshl: (X << (Z % BW)) | (Y >> ((BW - Z) % BW))
    // fshr: (X << ((BW - Z) % BW)) | (Y >> (Z % BW))
    SDValue ShX = DAG.getNode(ISD::SHL, sdl, VT, X, IsFSHL ? ShAmt : InvShAmt);
    SDValue ShY = DAG.getNode(ISD::SRL, sdl, VT, Y, IsFSHL ? InvShAmt : ShAmt);
    SDValue Res = DAG.getNode(ISD::OR, sdl, VT, ShX, ShY);

    // If (Z % BW == 0), then (BW - Z) % BW is also zero, so the result would
    // be X | Y. If X == Y (rotate), that's fine. If not, we have to select.
    if (X != Y) {
      SDValue Zero = DAG.getConstant(0, sdl, VT);
      EVT CCVT = MVT::i1;
      if (VT.isVector())
        CCVT = EVT::getVectorVT(*Context, CCVT, VT.getVectorNumElements());

      // For fshl, 0 shift returns the 1st arg (X).
      // For fshr, 0 shift returns the 2nd arg (Y).
      SDValue IsZeroShift = DAG.getSetCC(sdl, CCVT, ShAmt, Zero, ISD::SETEQ);
      Res = DAG.getSelect(sdl, VT, IsZeroShift, IsFSHL ? X : Y, Res);
    }
    setValue(&I, Res);
    return nullptr;
  }
  case Intrinsic::stacksave: {
    SDValue Op = getRoot();
    Res = DAG.getNode(
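To see why the builder adds a select when the two value operands differ, the small standalone C++ model below mirrors the i32 expansion above with plain integers standing in for SDValues. It is an editor's sketch, not compiler code:

    #include <cassert>
    #include <cstdint>

    // Models the DAG expansion for fshl on i32:
    //   (X << (Z % BW)) | (Y >> ((BW - Z) % BW)), plus a select for Z % BW == 0.
    static uint32_t expandFshl32(uint32_t X, uint32_t Y, uint32_t Z) {
      const uint32_t BW = 32;
      uint32_t ShAmt = Z % BW;
      uint32_t InvShAmt = (BW - Z) % BW;
      uint32_t Res = (X << ShAmt) | (Y >> InvShAmt);
      // Without the select, ShAmt == 0 would yield X | Y instead of X.
      return ShAmt == 0 ? X : Res;
    }

    int main() {
      assert(expandFshl32(0xAABBCCDD, 0x11223344, 8) == 0xBBCCDD11);
      assert(expandFshl32(0xAABBCCDD, 0x11223344, 32) == 0xAABBCCDD); // zero-shift case
      return 0;
    }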
@@ -0,0 +1,217 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
|
||||
|
||||
declare i8 @llvm.fshl.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshl.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshl.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshl.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
declare i8 @llvm.fshr.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshr.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshr.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshr.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
; When first 2 operands match, it's a rotate.
|
||||
|
||||
define i8 @rotl_i8_const_shift(i8 %x) {
|
||||
; CHECK-LABEL: rotl_i8_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ubfx w8, w0, #5, #3
|
||||
; CHECK-NEXT: bfi w8, w0, #3, #29
|
||||
; CHECK-NEXT: mov w0, w8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i64 @rotl_i64_const_shift(i64 %x) {
|
||||
; CHECK-LABEL: rotl_i64_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ror x0, x0, #61
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; When first 2 operands match, it's a rotate (by variable amount).
|
||||
|
||||
define i16 @rotl_i16(i16 %x, i16 %z) {
|
||||
; CHECK-LABEL: rotl_i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w10, wzr, #0x10
|
||||
; CHECK-NEXT: sub w10, w10, w1
|
||||
; CHECK-NEXT: and w8, w0, #0xffff
|
||||
; CHECK-NEXT: and w9, w1, #0xf
|
||||
; CHECK-NEXT: and w10, w10, #0xf
|
||||
; CHECK-NEXT: lsl w9, w0, w9
|
||||
; CHECK-NEXT: lsr w8, w8, w10
|
||||
; CHECK-NEXT: orr w0, w9, w8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i32 @rotl_i32(i32 %x, i32 %z) {
|
||||
; CHECK-LABEL: rotl_i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w8, wzr, #0x20
|
||||
; CHECK-NEXT: sub w8, w8, w1
|
||||
; CHECK-NEXT: ror w0, w0, w8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) {
|
||||
; CHECK-LABEL: rotl_v4i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v2.4s, #31
|
||||
; CHECK-NEXT: movi v3.4s, #32
|
||||
; CHECK-NEXT: and v4.16b, v1.16b, v2.16b
|
||||
; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
|
||||
; CHECK-NEXT: neg v1.4s, v1.4s
|
||||
; CHECK-NEXT: ushl v3.4s, v0.4s, v4.4s
|
||||
; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: orr v0.16b, v3.16b, v0.16b
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.
|
||||
|
||||
define <4 x i32> @rotl_v4i32_rotl_const_shift(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotl_v4i32_rotl_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ushr v1.4s, v0.4s, #29
|
||||
; CHECK-NEXT: shl v0.4s, v0.4s, #3
|
||||
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Repeat everything for funnel shift right.
|
||||
|
||||
; When first 2 operands match, it's a rotate.
|
||||
|
||||
define i8 @rotr_i8_const_shift(i8 %x) {
|
||||
; CHECK-LABEL: rotr_i8_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ubfx w8, w0, #3, #5
|
||||
; CHECK-NEXT: bfi w8, w0, #5, #27
|
||||
; CHECK-NEXT: mov w0, w8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i32 @rotr_i32_const_shift(i32 %x) {
|
||||
; CHECK-LABEL: rotr_i32_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ror w0, w0, #3
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; When first 2 operands match, it's a rotate (by variable amount).
|
||||
|
||||
define i16 @rotr_i16(i16 %x, i16 %z) {
|
||||
; CHECK-LABEL: rotr_i16:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: and w8, w0, #0xffff
|
||||
; CHECK-NEXT: and w9, w1, #0xf
|
||||
; CHECK-NEXT: orr w10, wzr, #0x10
|
||||
; CHECK-NEXT: lsr w8, w8, w9
|
||||
; CHECK-NEXT: sub w9, w10, w1
|
||||
; CHECK-NEXT: and w9, w9, #0xf
|
||||
; CHECK-NEXT: lsl w9, w0, w9
|
||||
; CHECK-NEXT: orr w0, w9, w8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i64 @rotr_i64(i64 %x, i64 %z) {
|
||||
; CHECK-LABEL: rotr_i64:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w9, wzr, #0x40
|
||||
; CHECK-NEXT: sub w9, w9, w1
|
||||
; CHECK-NEXT: lsr x8, x0, x1
|
||||
; CHECK-NEXT: lsl x9, x0, x9
|
||||
; CHECK-NEXT: orr x0, x9, x8
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
|
||||
; CHECK-LABEL: rotr_v4i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: movi v2.4s, #31
|
||||
; CHECK-NEXT: movi v3.4s, #32
|
||||
; CHECK-NEXT: and v4.16b, v1.16b, v2.16b
|
||||
; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s
|
||||
; CHECK-NEXT: neg v3.4s, v4.4s
|
||||
; CHECK-NEXT: and v1.16b, v1.16b, v2.16b
|
||||
; CHECK-NEXT: ushl v2.4s, v0.4s, v3.4s
|
||||
; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s
|
||||
; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.
|
||||
|
||||
define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotr_v4i32_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ushr v1.4s, v0.4s, #3
|
||||
; CHECK-NEXT: shl v0.4s, v0.4s, #29
|
||||
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define i32 @rotl_i32_shift_by_bitwidth(i32 %x) {
|
||||
; CHECK-LABEL: rotl_i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define i32 @rotr_i32_shift_by_bitwidth(i32 %x) {
|
||||
; CHECK-LABEL: rotr_i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define <4 x i32> @rotl_v4i32_shift_by_bitwidth(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotl_v4i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define <4 x i32> @rotr_v4i32_shift_by_bitwidth(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotr_v4i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
|
@@ -0,0 +1,311 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s
|
||||
|
||||
declare i8 @llvm.fshl.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshl.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshl.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshl.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
declare i8 @llvm.fshr.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshr.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshr.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshr.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
; General case - all operands can be variables.
|
||||
|
||||
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
|
||||
; CHECK-LABEL: fshl_i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w9, wzr, #0x20
|
||||
; CHECK-NEXT: sub w9, w9, w2
|
||||
; CHECK-NEXT: lsl w8, w0, w2
|
||||
; CHECK-NEXT: lsr w9, w1, w9
|
||||
; CHECK-NEXT: orr w8, w8, w9
|
||||
; CHECK-NEXT: tst w2, #0x1f
|
||||
; CHECK-NEXT: csel w0, w0, w8, eq
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Verify that weird types are minimally supported.
|
||||
declare i37 @llvm.fshl.i37(i37, i37, i37)
|
||||
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
|
||||
; CHECK-LABEL: fshl_i37:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov x11, #31883
|
||||
; CHECK-NEXT: mov w10, #37
|
||||
; CHECK-NEXT: movk x11, #3542, lsl #16
|
||||
; CHECK-NEXT: movk x11, #51366, lsl #32
|
||||
; CHECK-NEXT: sub x12, x10, x2
|
||||
; CHECK-NEXT: and x8, x2, #0x1fffffffff
|
||||
; CHECK-NEXT: movk x11, #56679, lsl #48
|
||||
; CHECK-NEXT: and x12, x12, #0x1fffffffff
|
||||
; CHECK-NEXT: umulh x13, x8, x11
|
||||
; CHECK-NEXT: umulh x11, x12, x11
|
||||
; CHECK-NEXT: lsr x13, x13, #5
|
||||
; CHECK-NEXT: lsr x11, x11, #5
|
||||
; CHECK-NEXT: and x9, x1, #0x1fffffffff
|
||||
; CHECK-NEXT: msub x8, x13, x10, x8
|
||||
; CHECK-NEXT: msub x10, x11, x10, x12
|
||||
; CHECK-NEXT: lsl x13, x0, x8
|
||||
; CHECK-NEXT: lsr x9, x9, x10
|
||||
; CHECK-NEXT: orr x9, x13, x9
|
||||
; CHECK-NEXT: cmp x8, #0 // =0
|
||||
; CHECK-NEXT: csel x0, x0, x9, eq
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
|
||||
ret i37 %f
|
||||
}
|
||||
|
||||
; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
|
||||
|
||||
declare i7 @llvm.fshl.i7(i7, i7, i7)
|
||||
define i7 @fshl_i7_const_fold() {
|
||||
; CHECK-LABEL: fshl_i7_const_fold:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov w0, #67
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
|
||||
ret i7 %f
|
||||
}
|
||||
|
||||
define i8 @fshl_i8_const_fold_overshift_1() {
|
||||
; CHECK-LABEL: fshl_i8_const_fold_overshift_1:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0x80
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i8 @fshl_i8_const_fold_overshift_2() {
|
||||
; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0x78
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i8 @fshl_i8_const_fold_overshift_3() {
|
||||
; CHECK-LABEL: fshl_i8_const_fold_overshift_3:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov w0, wzr
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshl.i8(i8 0, i8 225, i8 8)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
; With constant shift amount, this is 'extr'.
|
||||
|
||||
define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr w0, w0, w1, #23
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Check modulo math on shift amount.
|
||||
|
||||
define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_const_overshift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr w0, w0, w1, #23
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; 64-bit should also work.
|
||||
|
||||
define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
|
||||
; CHECK-LABEL: fshl_i64_const_overshift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr x0, x0, x1, #23
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; This should work without any node-specific logic.
|
||||
|
||||
define i8 @fshl_i8_const_fold() {
|
||||
; CHECK-LABEL: fshl_i8_const_fold:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0x80
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
; Repeat everything for funnel shift right.
|
||||
|
||||
; General case - all operands can be variables.
|
||||
|
||||
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
|
||||
; CHECK-LABEL: fshr_i32:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w9, wzr, #0x20
|
||||
; CHECK-NEXT: sub w9, w9, w2
|
||||
; CHECK-NEXT: lsr w8, w1, w2
|
||||
; CHECK-NEXT: lsl w9, w0, w9
|
||||
; CHECK-NEXT: orr w8, w9, w8
|
||||
; CHECK-NEXT: tst w2, #0x1f
|
||||
; CHECK-NEXT: csel w0, w1, w8, eq
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Verify that weird types are minimally supported.
|
||||
declare i37 @llvm.fshr.i37(i37, i37, i37)
|
||||
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
|
||||
; CHECK-LABEL: fshr_i37:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov x11, #31883
|
||||
; CHECK-NEXT: mov w10, #37
|
||||
; CHECK-NEXT: movk x11, #3542, lsl #16
|
||||
; CHECK-NEXT: movk x11, #51366, lsl #32
|
||||
; CHECK-NEXT: sub x12, x10, x2
|
||||
; CHECK-NEXT: and x9, x2, #0x1fffffffff
|
||||
; CHECK-NEXT: movk x11, #56679, lsl #48
|
||||
; CHECK-NEXT: and x12, x12, #0x1fffffffff
|
||||
; CHECK-NEXT: umulh x13, x9, x11
|
||||
; CHECK-NEXT: umulh x11, x12, x11
|
||||
; CHECK-NEXT: lsr x13, x13, #5
|
||||
; CHECK-NEXT: lsr x11, x11, #5
|
||||
; CHECK-NEXT: and x8, x1, #0x1fffffffff
|
||||
; CHECK-NEXT: msub x9, x13, x10, x9
|
||||
; CHECK-NEXT: msub x10, x11, x10, x12
|
||||
; CHECK-NEXT: lsr x8, x8, x9
|
||||
; CHECK-NEXT: lsl x10, x0, x10
|
||||
; CHECK-NEXT: orr x8, x10, x8
|
||||
; CHECK-NEXT: cmp x9, #0 // =0
|
||||
; CHECK-NEXT: csel x0, x1, x8, eq
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
|
||||
ret i37 %f
|
||||
}
|
||||
|
||||
; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
|
||||
|
||||
declare i7 @llvm.fshr.i7(i7, i7, i7)
|
||||
define i7 @fshr_i7_const_fold() {
|
||||
; CHECK-LABEL: fshr_i7_const_fold:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0x1f
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
|
||||
ret i7 %f
|
||||
}
|
||||
|
||||
define i8 @fshr_i8_const_fold_overshift_1() {
|
||||
; CHECK-LABEL: fshr_i8_const_fold_overshift_1:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0xfe
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i8 @fshr_i8_const_fold_overshift_2() {
|
||||
; CHECK-LABEL: fshr_i8_const_fold_overshift_2:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov w0, #225
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i8 @fshr_i8_const_fold_overshift_3() {
|
||||
; CHECK-LABEL: fshr_i8_const_fold_overshift_3:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0xff
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
; With constant shift amount, this is 'extr'.
|
||||
|
||||
define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_const_shift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr w0, w0, w1, #9
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Check modulo math on shift amount. 41-32=9.
|
||||
|
||||
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_const_overshift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr w0, w0, w1, #9
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; 64-bit should also work. 105-64 = 41.
|
||||
|
||||
define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
|
||||
; CHECK-LABEL: fshr_i64_const_overshift:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: extr x0, x0, x1, #41
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; This should work without any node-specific logic.
|
||||
|
||||
define i8 @fshr_i8_const_fold() {
|
||||
; CHECK-LABEL: fshr_i8_const_fold:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: orr w0, wzr, #0xfe
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov w0, w1
|
||||
; CHECK-NEXT: ret
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: fshr_v4i32_shift_by_bitwidth:
|
||||
; CHECK: // %bb.0:
|
||||
; CHECK-NEXT: mov v0.16b, v1.16b
|
||||
; CHECK-NEXT: ret
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
|
@@ -0,0 +1,214 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s
|
||||
|
||||
declare i8 @llvm.fshl.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshl.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshl.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshl.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
declare i8 @llvm.fshr.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshr.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshr.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshr.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
; When first 2 operands match, it's a rotate.
|
||||
|
||||
define i8 @rotl_i8_const_shift(i8 %x) {
|
||||
; CHECK-LABEL: rotl_i8_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 3, 27, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 3, 0, 28
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i64 @rotl_i64_const_shift(i64 %x) {
|
||||
; CHECK-LABEL: rotl_i64_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rotldi 3, 3, 3
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; When first 2 operands match, it's a rotate (by variable amount).
|
||||
|
||||
define i16 @rotl_i16(i16 %x, i16 %z) {
|
||||
; CHECK-LABEL: rotl_i16:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: subfic 5, 4, 16
|
||||
; CHECK-NEXT: clrlwi 6, 3, 16
|
||||
; CHECK-NEXT: rlwinm 4, 4, 0, 28, 31
|
||||
; CHECK-NEXT: clrlwi 5, 5, 28
|
||||
; CHECK-NEXT: slw 3, 3, 4
|
||||
; CHECK-NEXT: srw 4, 6, 5
|
||||
; CHECK-NEXT: or 3, 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i32 @rotl_i32(i32 %x, i32 %z) {
|
||||
; CHECK-LABEL: rotl_i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 4, 0, 27, 31
|
||||
; CHECK-NEXT: rlwnm 3, 3, 4, 0, 31
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) {
|
||||
; CHECK-LABEL: rotl_v4i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI4_0@toc@ha
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI4_0@toc@l
|
||||
; CHECK-NEXT: lvx 4, 0, 3
|
||||
; CHECK-NEXT: vsubuwm 4, 4, 3
|
||||
; CHECK-NEXT: vslw 3, 2, 3
|
||||
; CHECK-NEXT: vsrw 2, 2, 4
|
||||
; CHECK-NEXT: xxlor 34, 35, 34
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.
|
||||
|
||||
define <4 x i32> @rotl_v4i32_const_shift(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotl_v4i32_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 13
|
||||
; CHECK-NEXT: vspltisw 5, 3
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: vslw 4, 2, 5
|
||||
; CHECK-NEXT: vsrw 2, 2, 3
|
||||
; CHECK-NEXT: xxlor 34, 36, 34
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Repeat everything for funnel shift right.
|
||||
|
||||
define i8 @rotr_i8_const_shift(i8 %x) {
|
||||
; CHECK-LABEL: rotr_i8_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 3, 29, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 5, 0, 26
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i32 @rotr_i32_const_shift(i32 %x) {
|
||||
; CHECK-LABEL: rotr_i32_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 3, 3, 29, 0, 31
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; When first 2 operands match, it's a rotate (by variable amount).
|
||||
|
||||
define i16 @rotr_i16(i16 %x, i16 %z) {
|
||||
; CHECK-LABEL: rotr_i16:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: subfic 5, 4, 16
|
||||
; CHECK-NEXT: clrlwi 6, 3, 16
|
||||
; CHECK-NEXT: rlwinm 4, 4, 0, 28, 31
|
||||
; CHECK-NEXT: clrlwi 5, 5, 28
|
||||
; CHECK-NEXT: srw 4, 6, 4
|
||||
; CHECK-NEXT: slw 3, 3, 5
|
||||
; CHECK-NEXT: or 3, 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i64 @rotr_i64(i64 %x, i64 %z) {
|
||||
; CHECK-LABEL: rotr_i64:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: subfic 4, 4, 64
|
||||
; CHECK-NEXT: rlwinm 4, 4, 0, 26, 31
|
||||
; CHECK-NEXT: rotld 3, 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) {
|
||||
; CHECK-LABEL: rotr_v4i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: addis 3, 2, .LCPI10_0@toc@ha
|
||||
; CHECK-NEXT: addi 3, 3, .LCPI10_0@toc@l
|
||||
; CHECK-NEXT: lvx 4, 0, 3
|
||||
; CHECK-NEXT: vsubuwm 4, 4, 3
|
||||
; CHECK-NEXT: vsrw 3, 2, 3
|
||||
; CHECK-NEXT: vslw 2, 2, 4
|
||||
; CHECK-NEXT: xxlor 34, 34, 35
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.
|
||||
|
||||
define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotr_v4i32_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vspltisw 3, -16
|
||||
; CHECK-NEXT: vspltisw 4, 13
|
||||
; CHECK-NEXT: vspltisw 5, 3
|
||||
; CHECK-NEXT: vsubuwm 3, 4, 3
|
||||
; CHECK-NEXT: vsrw 4, 2, 5
|
||||
; CHECK-NEXT: vslw 2, 2, 3
|
||||
; CHECK-NEXT: xxlor 34, 34, 36
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define i32 @rotl_i32_shift_by_bitwidth(i32 %x) {
|
||||
; CHECK-LABEL: rotl_i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define i32 @rotr_i32_shift_by_bitwidth(i32 %x) {
|
||||
; CHECK-LABEL: rotr_i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define <4 x i32> @rotl_v4i32_shift_by_bitwidth(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotl_v4i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define <4 x i32> @rotr_v4i32_shift_by_bitwidth(<4 x i32> %x) {
|
||||
; CHECK-LABEL: rotr_v4i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
|
@@ -0,0 +1,271 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=powerpc64le-- | FileCheck %s
|
||||
|
||||
declare i8 @llvm.fshl.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshl.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshl.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshl.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
declare i8 @llvm.fshr.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshr.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshr.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshr.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
; General case - all operands can be variables.
|
||||
|
||||
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
|
||||
; CHECK-LABEL: fshl_i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: subfic 6, 5, 32
|
||||
; CHECK-NEXT: andi. 5, 5, 31
|
||||
; CHECK-NEXT: clrlwi 6, 6, 27
|
||||
; CHECK-NEXT: slw 5, 3, 5
|
||||
; CHECK-NEXT: srw 4, 4, 6
|
||||
; CHECK-NEXT: or 4, 5, 4
|
||||
; CHECK-NEXT: isel 3, 3, 4, 2
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Verify that weird types are minimally supported.
|
||||
declare i37 @llvm.fshl.i37(i37, i37, i37)
|
||||
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
|
||||
; CHECK-LABEL: fshl_i37:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: lis 6, -8857
|
||||
; CHECK-NEXT: subfic 7, 5, 37
|
||||
; CHECK-NEXT: clrldi 5, 5, 27
|
||||
; CHECK-NEXT: clrldi 4, 4, 27
|
||||
; CHECK-NEXT: ori 6, 6, 51366
|
||||
; CHECK-NEXT: clrldi 7, 7, 27
|
||||
; CHECK-NEXT: sldi 6, 6, 32
|
||||
; CHECK-NEXT: oris 6, 6, 3542
|
||||
; CHECK-NEXT: ori 6, 6, 31883
|
||||
; CHECK-NEXT: mulhdu 8, 7, 6
|
||||
; CHECK-NEXT: mulhdu 6, 5, 6
|
||||
; CHECK-NEXT: rldicl 8, 8, 59, 5
|
||||
; CHECK-NEXT: rldicl 6, 6, 59, 5
|
||||
; CHECK-NEXT: mulli 8, 8, 37
|
||||
; CHECK-NEXT: mulli 6, 6, 37
|
||||
; CHECK-NEXT: sub 7, 7, 8
|
||||
; CHECK-NEXT: subf. 5, 6, 5
|
||||
; CHECK-NEXT: srd 4, 4, 7
|
||||
; CHECK-NEXT: sld 5, 3, 5
|
||||
; CHECK-NEXT: or 4, 5, 4
|
||||
; CHECK-NEXT: isel 3, 3, 4, 2
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
|
||||
ret i37 %f
|
||||
}
|
||||
|
||||
; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
|
||||
|
||||
declare i7 @llvm.fshl.i7(i7, i7, i7)
|
||||
define i7 @fshl_i7_const_fold() {
|
||||
; CHECK-LABEL: fshl_i7_const_fold:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 3, 67
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
|
||||
ret i7 %f
|
||||
}
|
||||
|
||||
; With constant shift amount, this is rotate + insert (missing extended mnemonics).
|
||||
|
||||
define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 4, 9, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 9, 0, 22
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Check modulo math on shift amount.
|
||||
|
||||
define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_const_overshift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 4, 9, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 9, 0, 22
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; 64-bit should also work.
|
||||
|
||||
define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
|
||||
; CHECK-LABEL: fshl_i64_const_overshift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rotldi 4, 4, 41
|
||||
; CHECK-NEXT: rldimi 4, 3, 41, 0
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; This should work without any node-specific logic.
|
||||
|
||||
define i8 @fshl_i8_const_fold() {
|
||||
; CHECK-LABEL: fshl_i8_const_fold:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 3, 128
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
; Repeat everything for funnel shift right.
|
||||
|
||||
; General case - all operands can be variables.
|
||||
|
||||
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
|
||||
; CHECK-LABEL: fshr_i32:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: subfic 6, 5, 32
|
||||
; CHECK-NEXT: andi. 5, 5, 31
|
||||
; CHECK-NEXT: clrlwi 6, 6, 27
|
||||
; CHECK-NEXT: srw 5, 4, 5
|
||||
; CHECK-NEXT: slw 3, 3, 6
|
||||
; CHECK-NEXT: or 3, 3, 5
|
||||
; CHECK-NEXT: isel 3, 4, 3, 2
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Verify that weird types are minimally supported.
|
||||
declare i37 @llvm.fshr.i37(i37, i37, i37)
|
||||
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
|
||||
; CHECK-LABEL: fshr_i37:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: lis 6, -8857
|
||||
; CHECK-NEXT: subfic 7, 5, 37
|
||||
; CHECK-NEXT: clrldi 5, 5, 27
|
||||
; CHECK-NEXT: clrldi 9, 4, 27
|
||||
; CHECK-NEXT: ori 6, 6, 51366
|
||||
; CHECK-NEXT: clrldi 7, 7, 27
|
||||
; CHECK-NEXT: sldi 6, 6, 32
|
||||
; CHECK-NEXT: oris 6, 6, 3542
|
||||
; CHECK-NEXT: ori 6, 6, 31883
|
||||
; CHECK-NEXT: mulhdu 8, 5, 6
|
||||
; CHECK-NEXT: mulhdu 6, 7, 6
|
||||
; CHECK-NEXT: rldicl 8, 8, 59, 5
|
||||
; CHECK-NEXT: rldicl 6, 6, 59, 5
|
||||
; CHECK-NEXT: mulli 8, 8, 37
|
||||
; CHECK-NEXT: mulli 6, 6, 37
|
||||
; CHECK-NEXT: subf. 5, 8, 5
|
||||
; CHECK-NEXT: sub 6, 7, 6
|
||||
; CHECK-NEXT: srd 5, 9, 5
|
||||
; CHECK-NEXT: sld 3, 3, 6
|
||||
; CHECK-NEXT: or 3, 3, 5
|
||||
; CHECK-NEXT: isel 3, 4, 3, 2
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
|
||||
ret i37 %f
|
||||
}
|
||||
|
||||
; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
|
||||
|
||||
declare i7 @llvm.fshr.i7(i7, i7, i7)
|
||||
define i7 @fshr_i7_const_fold() {
|
||||
; CHECK-LABEL: fshr_i7_const_fold:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 3, 31
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
|
||||
ret i7 %f
|
||||
}
|
||||
|
||||
; With constant shift amount, this is rotate + insert (missing extended mnemonics).
|
||||
|
||||
define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_const_shift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 4, 23, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 23, 0, 8
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Check modulo math on shift amount. 41-32=9.
|
||||
|
||||
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_const_overshift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rlwinm 4, 4, 23, 0, 31
|
||||
; CHECK-NEXT: rlwimi 4, 3, 23, 0, 8
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; 64-bit should also work. 105-64 = 41.
|
||||
|
||||
define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
|
||||
; CHECK-LABEL: fshr_i64_const_overshift:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: rotldi 4, 4, 23
|
||||
; CHECK-NEXT: rldimi 4, 3, 23, 0
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; This should work without any node-specific logic.
|
||||
|
||||
define i8 @fshr_i8_const_fold() {
|
||||
; CHECK-LABEL: fshr_i8_const_fold:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: li 3, 254
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
|
||||
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: mr 3, 4
|
||||
; CHECK-NEXT: blr
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
|
||||
; CHECK-LABEL: fshr_v4i32_shift_by_bitwidth:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: vmr 2, 3
|
||||
; CHECK-NEXT: blr
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
|
@@ -0,0 +1,387 @@
|
|||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | FileCheck %s --check-prefixes=ANY,X32-SSE2
|
||||
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=ANY,X64-AVX2
|
||||
|
||||
declare i8 @llvm.fshl.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshl.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshl.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshl.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
declare i8 @llvm.fshr.i8(i8, i8, i8)
|
||||
declare i16 @llvm.fshr.i16(i16, i16, i16)
|
||||
declare i32 @llvm.fshr.i32(i32, i32, i32)
|
||||
declare i64 @llvm.fshr.i64(i64, i64, i64)
|
||||
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
|
||||
|
||||
; When first 2 operands match, it's a rotate.
|
||||
|
||||
define i8 @rotl_i8_const_shift(i8 %x) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_i8_const_shift:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %al
|
||||
; X32-SSE2-NEXT: rolb $3, %al
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_i8_const_shift:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: rolb $3, %dil
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i8 @llvm.fshl.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i64 @rotl_i64_const_shift(i64 %x) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_i64_const_shift:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X32-SSE2-NEXT: movl %ecx, %eax
|
||||
; X32-SSE2-NEXT: shldl $3, %edx, %eax
|
||||
; X32-SSE2-NEXT: shldl $3, %ecx, %edx
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_i64_const_shift:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: rolq $3, %rdi
|
||||
; X64-AVX2-NEXT: movq %rdi, %rax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 3)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
define i16 @rotl_i16(i16 %x, i16 %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_i16:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X32-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %eax
|
||||
; X32-SSE2-NEXT: rolw %cl, %ax
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_i16:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: movl %esi, %ecx
|
||||
; X64-AVX2-NEXT: rolw %cl, %di
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i16 @llvm.fshl.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i32 @rotl_i32(i32 %x, i32 %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_i32:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-SSE2-NEXT: roll %cl, %eax
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_i32:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: movl %esi, %ecx
|
||||
; X64-AVX2-NEXT: roll %cl, %edi
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 %z)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotl_v4i32(<4 x i32> %x, <4 x i32> %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_v4i32:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
|
||||
; X32-SSE2-NEXT: psubd %xmm1, %xmm3
|
||||
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [31,31,31,31]
|
||||
; X32-SSE2-NEXT: pand %xmm4, %xmm3
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm3[2,3,3,3,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm5
|
||||
; X32-SSE2-NEXT: psrld %xmm2, %xmm5
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,1,1,1,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm2
|
||||
; X32-SSE2-NEXT: psrld %xmm6, %xmm2
|
||||
; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,3,3,3,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm6
|
||||
; X32-SSE2-NEXT: psrld %xmm5, %xmm6
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,1,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm5
|
||||
; X32-SSE2-NEXT: psrld %xmm3, %xmm5
|
||||
; X32-SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
|
||||
; X32-SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm5[0,3]
|
||||
; X32-SSE2-NEXT: pand %xmm4, %xmm1
|
||||
; X32-SSE2-NEXT: pslld $23, %xmm1
|
||||
; X32-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm1
|
||||
; X32-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; X32-SSE2-NEXT: pmuludq %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
|
||||
; X32-SSE2-NEXT: pmuludq %xmm3, %xmm1
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
|
||||
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; X32-SSE2-NEXT: orps %xmm0, %xmm2
|
||||
; X32-SSE2-NEXT: movaps %xmm2, %xmm0
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_v4i32:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
|
||||
; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
|
||||
; X64-AVX2-NEXT: vpsllvd %xmm3, %xmm0, %xmm3
|
||||
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
|
||||
; X64-AVX2-NEXT: vpsubd %xmm1, %xmm4, %xmm1
|
||||
; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
|
||||
; X64-AVX2-NEXT: vpsrlvd %xmm1, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: vpor %xmm0, %xmm3, %xmm0
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.
|
||||
|
||||
define <4 x i32> @rotl_v4i32_const_shift(<4 x i32> %x) nounwind {
|
||||
; X32-SSE2-LABEL: rotl_v4i32_const_shift:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: psrld $29, %xmm1
|
||||
; X32-SSE2-NEXT: pslld $3, %xmm0
|
||||
; X32-SSE2-NEXT: por %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotl_v4i32_const_shift:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: vpsrld $29, %xmm0, %xmm1
|
||||
; X64-AVX2-NEXT: vpslld $3, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Repeat everything for funnel shift right.
|
||||
|
||||
define i8 @rotr_i8_const_shift(i8 %x) nounwind {
|
||||
; X32-SSE2-LABEL: rotr_i8_const_shift:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %al
|
||||
; X32-SSE2-NEXT: rolb $5, %al
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotr_i8_const_shift:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: rolb $5, %dil
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i8 @llvm.fshr.i8(i8 %x, i8 %x, i8 3)
|
||||
ret i8 %f
|
||||
}
|
||||
|
||||
define i32 @rotr_i32_const_shift(i32 %x) nounwind {
|
||||
; X32-SSE2-LABEL: rotr_i32_const_shift:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-SSE2-NEXT: roll $29, %eax
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotr_i32_const_shift:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: roll $29, %edi
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 3)
|
||||
ret i32 %f
|
||||
}
|
||||
|
||||
; When first 2 operands match, it's a rotate (by variable amount).
|
||||
|
||||
define i16 @rotr_i16(i16 %x, i16 %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotr_i16:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movb {{[0-9]+}}(%esp), %cl
|
||||
; X32-SSE2-NEXT: movzwl {{[0-9]+}}(%esp), %eax
|
||||
; X32-SSE2-NEXT: rorw %cl, %ax
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotr_i16:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: movl %esi, %ecx
|
||||
; X64-AVX2-NEXT: rorw %cl, %di
|
||||
; X64-AVX2-NEXT: movl %edi, %eax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i16 @llvm.fshr.i16(i16 %x, i16 %x, i16 %z)
|
||||
ret i16 %f
|
||||
}
|
||||
|
||||
define i64 @rotr_i64(i64 %x, i64 %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotr_i64:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: pushl %ebp
|
||||
; X32-SSE2-NEXT: pushl %ebx
|
||||
; X32-SSE2-NEXT: pushl %edi
|
||||
; X32-SSE2-NEXT: pushl %esi
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; X32-SSE2-NEXT: movl %eax, %ecx
|
||||
; X32-SSE2-NEXT: andl $63, %ecx
|
||||
; X32-SSE2-NEXT: movl %edx, %edi
|
||||
; X32-SSE2-NEXT: shrl %cl, %edi
|
||||
; X32-SSE2-NEXT: movl %esi, %ebx
|
||||
; X32-SSE2-NEXT: shrdl %cl, %edx, %ebx
|
||||
; X32-SSE2-NEXT: xorl %ebp, %ebp
|
||||
; X32-SSE2-NEXT: testb $32, %cl
|
||||
; X32-SSE2-NEXT: cmovnel %edi, %ebx
|
||||
; X32-SSE2-NEXT: cmovnel %ebp, %edi
|
||||
; X32-SSE2-NEXT: movl $64, %ecx
|
||||
; X32-SSE2-NEXT: subl %eax, %ecx
|
||||
; X32-SSE2-NEXT: andl $63, %ecx
|
||||
; X32-SSE2-NEXT: movl %esi, %eax
|
||||
; X32-SSE2-NEXT: shll %cl, %eax
|
||||
; X32-SSE2-NEXT: shldl %cl, %esi, %edx
|
||||
; X32-SSE2-NEXT: testb $32, %cl
|
||||
; X32-SSE2-NEXT: cmovnel %eax, %edx
|
||||
; X32-SSE2-NEXT: cmovnel %ebp, %eax
|
||||
; X32-SSE2-NEXT: orl %ebx, %eax
|
||||
; X32-SSE2-NEXT: orl %edi, %edx
|
||||
; X32-SSE2-NEXT: popl %esi
|
||||
; X32-SSE2-NEXT: popl %edi
|
||||
; X32-SSE2-NEXT: popl %ebx
|
||||
; X32-SSE2-NEXT: popl %ebp
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotr_i64:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: movl %esi, %ecx
|
||||
; X64-AVX2-NEXT: rorq %cl, %rdi
|
||||
; X64-AVX2-NEXT: movq %rdi, %rax
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
|
||||
ret i64 %f
|
||||
}
|
||||
|
||||
; Vector rotate.
|
||||
|
||||
define <4 x i32> @rotr_v4i32(<4 x i32> %x, <4 x i32> %z) nounwind {
|
||||
; X32-SSE2-LABEL: rotr_v4i32:
|
||||
; X32-SSE2: # %bb.0:
|
||||
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [31,31,31,31]
|
||||
; X32-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
|
||||
; X32-SSE2-NEXT: psubd %xmm1, %xmm3
|
||||
; X32-SSE2-NEXT: movdqa %xmm1, %xmm4
|
||||
; X32-SSE2-NEXT: pand %xmm2, %xmm4
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,3,3,3,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm5
|
||||
; X32-SSE2-NEXT: psrld %xmm1, %xmm5
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[0,1,1,1,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: psrld %xmm6, %xmm1
|
||||
; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,0,1]
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm6
|
||||
; X32-SSE2-NEXT: psrld %xmm5, %xmm6
|
||||
; X32-SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
|
||||
; X32-SSE2-NEXT: movdqa %xmm0, %xmm5
|
||||
; X32-SSE2-NEXT: psrld %xmm4, %xmm5
|
||||
; X32-SSE2-NEXT: punpckhqdq {{.*#+}} xmm5 = xmm5[1],xmm6[1]
|
||||
; X32-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3]
|
||||
; X32-SSE2-NEXT: pand %xmm2, %xmm3
|
||||
; X32-SSE2-NEXT: pslld $23, %xmm3
|
||||
; X32-SSE2-NEXT: paddd {{\.LCPI.*}}, %xmm3
|
||||
; X32-SSE2-NEXT: cvttps2dq %xmm3, %xmm2
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
||||
; X32-SSE2-NEXT: pmuludq %xmm2, %xmm0
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
|
||||
; X32-SSE2-NEXT: pmuludq %xmm3, %xmm2
|
||||
; X32-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
|
||||
; X32-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; X32-SSE2-NEXT: orps %xmm0, %xmm1
|
||||
; X32-SSE2-NEXT: movaps %xmm1, %xmm0
|
||||
; X32-SSE2-NEXT: retl
|
||||
;
|
||||
; X64-AVX2-LABEL: rotr_v4i32:
|
||||
; X64-AVX2: # %bb.0:
|
||||
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
|
||||
; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm3
|
||||
; X64-AVX2-NEXT: vpsrlvd %xmm3, %xmm0, %xmm3
|
||||
; X64-AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [32,32,32,32]
|
||||
; X64-AVX2-NEXT: vpsubd %xmm1, %xmm4, %xmm1
|
||||
; X64-AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
|
||||
; X64-AVX2-NEXT: vpsllvd %xmm1, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
|
||||
; X64-AVX2-NEXT: retq
|
||||
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %z)
|
||||
ret <4 x i32> %f
|
||||
}
|
||||
|
||||
; Vector rotate by constant splat amount.

define <4 x i32> @rotr_v4i32_const_shift(<4 x i32> %x) nounwind {
; X32-SSE2-LABEL: rotr_v4i32_const_shift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movdqa %xmm0, %xmm1
; X32-SSE2-NEXT: psrld $3, %xmm1
; X32-SSE2-NEXT: pslld $29, %xmm0
; X32-SSE2-NEXT: por %xmm1, %xmm0
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: rotr_v4i32_const_shift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vpsrld $3, %xmm0, %xmm1
; X64-AVX2-NEXT: vpslld $29, %xmm0, %xmm0
; X64-AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; X64-AVX2-NEXT: retq
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 3, i32 3, i32 3, i32 3>)
ret <4 x i32> %f
}

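; Rotate amounts are taken modulo the bitwidth, so rotating by exactly 32 bits
; (scalar or per-element) should fold away to a plain copy of the input.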
define i32 @rotl_i32_shift_by_bitwidth(i32 %x) nounwind {
; X32-SSE2-LABEL: rotl_i32_shift_by_bitwidth:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: rotl_i32_shift_by_bitwidth:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %x, i32 32)
ret i32 %f
}

define i32 @rotr_i32_shift_by_bitwidth(i32 %x) nounwind {
; X32-SSE2-LABEL: rotr_i32_shift_by_bitwidth:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: rotr_i32_shift_by_bitwidth:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %x, i32 32)
ret i32 %f
}

define <4 x i32> @rotl_v4i32_shift_by_bitwidth(<4 x i32> %x) nounwind {
; ANY-LABEL: rotl_v4i32_shift_by_bitwidth:
; ANY: # %bb.0:
; ANY-NEXT: ret{{[l|q]}}
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}

define <4 x i32> @rotr_v4i32_shift_by_bitwidth(<4 x i32> %x) nounwind {
; ANY-LABEL: rotr_v4i32_shift_by_bitwidth:
; ANY: # %bb.0:
; ANY-NEXT: ret{{[l|q]}}
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}

@ -0,0 +1,526 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-- -mattr=sse2 | FileCheck %s --check-prefixes=ANY,X32-SSE2
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2 | FileCheck %s --check-prefixes=ANY,X64-AVX2

declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

declare i8 @llvm.fshr.i8(i8, i8, i8)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

; General case - all operands can be variables - x86 has shld, but that's not matched.
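; (The generic lowering below computes (x << (z & 31)) | (y >> (32 - z)) and then
; uses a cmov to return x unchanged when the masked shift amount is zero.)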

define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshl_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl $32, %ecx
; X32-SSE2-NEXT: subl %edx, %ecx
; X32-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
; X32-SSE2-NEXT: shrl %cl, %edi
; X32-SSE2-NEXT: andl $31, %edx
; X32-SSE2-NEXT: movl %esi, %eax
; X32-SSE2-NEXT: movl %edx, %ecx
; X32-SSE2-NEXT: shll %cl, %eax
; X32-SSE2-NEXT: orl %edi, %eax
; X32-SSE2-NEXT: testl %edx, %edx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl $32, %ecx
; X64-AVX2-NEXT: subl %edx, %ecx
; X64-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
; X64-AVX2-NEXT: shrl %cl, %esi
; X64-AVX2-NEXT: andl $31, %edx
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: shll %cl, %eax
; X64-AVX2-NEXT: orl %esi, %eax
; X64-AVX2-NEXT: testl %edx, %edx
; X64-AVX2-NEXT: cmovel %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}

; Verify that weird types are minimally supported.
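; (For a non-power-of-2 width like i37 the shift amount has to be reduced modulo 37,
; which is why the 32-bit lowering below calls __umoddi3 and the 64-bit lowering
; computes the remainder with a multiply-by-reciprocal sequence.)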
declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X32-SSE2-LABEL: fshl_i37:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %ebp
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: subl $8, %esp
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: andl $31, %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl %eax, %ebp
; X32-SSE2-NEXT: andl $31, %ebp
; X32-SSE2-NEXT: movl $37, %ecx
; X32-SSE2-NEXT: subl %ebx, %ecx
; X32-SSE2-NEXT: movl $0, %edx
; X32-SSE2-NEXT: sbbl %eax, %edx
; X32-SSE2-NEXT: andl $31, %edx
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
; X32-SSE2-NEXT: pushl %edx
; X32-SSE2-NEXT: pushl %ecx
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
; X32-SSE2-NEXT: movl %eax, (%esp) # 4-byte Spill
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shrdl %cl, %esi, %edi
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
; X32-SSE2-NEXT: pushl %ebp
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
; X32-SSE2-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl %edx, %ebp
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: shll %cl, %ebp
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X32-SSE2-NEXT: shldl %cl, %edx, %ebx
; X32-SSE2-NEXT: testb $32, %al
; X32-SSE2-NEXT: cmovnel %ebp, %ebx
; X32-SSE2-NEXT: movl $0, %edx
; X32-SSE2-NEXT: cmovnel %edx, %ebp
; X32-SSE2-NEXT: movl (%esp), %ecx # 4-byte Reload
; X32-SSE2-NEXT: shrl %cl, %esi
; X32-SSE2-NEXT: testb $32, %cl
; X32-SSE2-NEXT: cmovnel %esi, %edi
; X32-SSE2-NEXT: cmovnel %edx, %esi
; X32-SSE2-NEXT: orl %ebx, %esi
; X32-SSE2-NEXT: orl %ebp, %edi
; X32-SSE2-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %edi
; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl %edi, %eax
; X32-SSE2-NEXT: movl %esi, %edx
; X32-SSE2-NEXT: addl $8, %esp
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
; X32-SSE2-NEXT: popl %ebx
; X32-SSE2-NEXT: popl %ebp
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i37:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: pushq %rbx
; X64-AVX2-NEXT: movq %rdx, %r9
; X64-AVX2-NEXT: movabsq $137438953471, %r10 # imm = 0x1FFFFFFFFF
; X64-AVX2-NEXT: andq %r10, %rsi
; X64-AVX2-NEXT: movl $37, %r8d
; X64-AVX2-NEXT: subq %rdx, %r8
; X64-AVX2-NEXT: andq %r10, %r9
; X64-AVX2-NEXT: movabsq $-2492803253203993461, %r11 # imm = 0xDD67C8A60DD67C8B
; X64-AVX2-NEXT: movq %r9, %rax
; X64-AVX2-NEXT: mulq %r11
; X64-AVX2-NEXT: shrq $5, %rdx
; X64-AVX2-NEXT: imulq $37, %rdx, %rax
; X64-AVX2-NEXT: subq %rax, %r9
; X64-AVX2-NEXT: movq %rdi, %rbx
; X64-AVX2-NEXT: movl %r9d, %ecx
; X64-AVX2-NEXT: shlq %cl, %rbx
; X64-AVX2-NEXT: movq %r8, %rax
; X64-AVX2-NEXT: andq %r10, %rax
; X64-AVX2-NEXT: mulq %r11
; X64-AVX2-NEXT: shrl $5, %edx
; X64-AVX2-NEXT: imull $37, %edx, %eax
; X64-AVX2-NEXT: subl %eax, %r8d
; X64-AVX2-NEXT: movl %r8d, %ecx
; X64-AVX2-NEXT: shrq %cl, %rsi
; X64-AVX2-NEXT: orq %rbx, %rsi
; X64-AVX2-NEXT: testq %r9, %r9
; X64-AVX2-NEXT: cmoveq %rdi, %rsi
; X64-AVX2-NEXT: movq %rsi, %rax
; X64-AVX2-NEXT: popq %rbx
; X64-AVX2-NEXT: retq
%f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}

; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
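; i.e. fshl(112, 127, 2) = ((112 << 2) | (127 >> 5)) & 127 = 64 | 3 = 67, the
; constant checked below.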

declare i7 @llvm.fshl.i7(i7, i7, i7)
define i7 @fshl_i7_const_fold() {
; ANY-LABEL: fshl_i7_const_fold:
; ANY: # %bb.0:
; ANY-NEXT: movb $67, %al
; ANY-NEXT: ret{{[l|q]}}
%f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
ret i7 %f
}

; With constant shift amount, this is 'shld' with constant operand.

define i32 @fshl_i32_const_shift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshl_i32_const_shift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32_const_shift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldl $9, %esi, %edi
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}

; Check modulo math on shift amount.
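; 41 mod 32 = 9, so this should fold to the same 'shld' by 9 as the test above.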

define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshl_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $9, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32_const_overshift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldl $9, %esi, %edi
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}

; 64-bit should also work.
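; 105 mod 64 = 41, so the 64-bit version folds to 'shldq $41'.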

define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X32-SSE2-LABEL: fshl_i64_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: shldl $9, %ecx, %edx
; X32-SSE2-NEXT: shrdl $23, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i64_const_overshift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldq $41, %rsi, %rdi
; X64-AVX2-NEXT: movq %rdi, %rax
; X64-AVX2-NEXT: retq
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}

; This should work without any node-specific logic.

define i8 @fshl_i8_const_fold() nounwind {
; ANY-LABEL: fshl_i8_const_fold:
; ANY: # %bb.0:
; ANY-NEXT: movb $-128, %al
; ANY-NEXT: ret{{[l|q]}}
%f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
ret i8 %f
}

; Repeat everything for funnel shift right.

; General case - all operands can be variables - x86 has 'shrd', but this doesn't match.

define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) nounwind {
; X32-SSE2-LABEL: fshr_i32:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl $32, %ebx
; X32-SSE2-NEXT: subl %edx, %ebx
; X32-SSE2-NEXT: andl $31, %edx
; X32-SSE2-NEXT: movl %esi, %edi
; X32-SSE2-NEXT: movl %edx, %ecx
; X32-SSE2-NEXT: shrl %cl, %edi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl %ebx, %ecx
; X32-SSE2-NEXT: shll %cl, %eax
; X32-SSE2-NEXT: orl %edi, %eax
; X32-SSE2-NEXT: testl %edx, %edx
; X32-SSE2-NEXT: cmovel %esi, %eax
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
; X32-SSE2-NEXT: popl %ebx
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl $32, %r8d
; X64-AVX2-NEXT: subl %edx, %r8d
; X64-AVX2-NEXT: andl $31, %edx
; X64-AVX2-NEXT: movl %esi, %eax
; X64-AVX2-NEXT: movl %edx, %ecx
; X64-AVX2-NEXT: shrl %cl, %eax
; X64-AVX2-NEXT: movl %r8d, %ecx
; X64-AVX2-NEXT: shll %cl, %edi
; X64-AVX2-NEXT: orl %eax, %edi
; X64-AVX2-NEXT: testl %edx, %edx
; X64-AVX2-NEXT: cmovel %esi, %edi
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
ret i32 %f
}

; Verify that weird types are minimally supported.
declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
; X32-SSE2-LABEL: fshr_i37:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: pushl %ebp
; X32-SSE2-NEXT: pushl %ebx
; X32-SSE2-NEXT: pushl %edi
; X32-SSE2-NEXT: pushl %esi
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: andl $31, %esi
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: andl $31, %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edi
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: pushl %ebp
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
; X32-SSE2-NEXT: movl %eax, %ebx
; X32-SSE2-NEXT: movl %edx, (%esp) # 4-byte Spill
; X32-SSE2-NEXT: movl $37, %eax
; X32-SSE2-NEXT: subl %ebp, %eax
; X32-SSE2-NEXT: movl $0, %edx
; X32-SSE2-NEXT: sbbl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: andl $31, %edx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X32-SSE2-NEXT: movl %ebx, %ecx
; X32-SSE2-NEXT: shrdl %cl, %esi, %ebp
; X32-SSE2-NEXT: pushl $0
; X32-SSE2-NEXT: pushl $37
; X32-SSE2-NEXT: pushl %edx
; X32-SSE2-NEXT: pushl %eax
; X32-SSE2-NEXT: calll __umoddi3
; X32-SSE2-NEXT: addl $16, %esp
; X32-SSE2-NEXT: movl %eax, %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: movl %edx, %eax
; X32-SSE2-NEXT: shll %cl, %eax
; X32-SSE2-NEXT: shldl %cl, %edx, %edi
; X32-SSE2-NEXT: testb $32, %cl
; X32-SSE2-NEXT: cmovnel %eax, %edi
; X32-SSE2-NEXT: movl $0, %edx
; X32-SSE2-NEXT: cmovnel %edx, %eax
; X32-SSE2-NEXT: movl %ebx, %ecx
; X32-SSE2-NEXT: shrl %cl, %esi
; X32-SSE2-NEXT: testb $32, %bl
; X32-SSE2-NEXT: cmovnel %esi, %ebp
; X32-SSE2-NEXT: cmovnel %edx, %esi
; X32-SSE2-NEXT: orl %edi, %esi
; X32-SSE2-NEXT: orl %eax, %ebp
; X32-SSE2-NEXT: orl %ebx, (%esp) # 4-byte Folded Spill
; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %ebp
; X32-SSE2-NEXT: cmovel {{[0-9]+}}(%esp), %esi
; X32-SSE2-NEXT: movl %ebp, %eax
; X32-SSE2-NEXT: movl %esi, %edx
; X32-SSE2-NEXT: addl $4, %esp
; X32-SSE2-NEXT: popl %esi
; X32-SSE2-NEXT: popl %edi
; X32-SSE2-NEXT: popl %ebx
; X32-SSE2-NEXT: popl %ebp
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i37:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: pushq %rbx
; X64-AVX2-NEXT: movq %rdx, %r9
; X64-AVX2-NEXT: movabsq $137438953471, %r11 # imm = 0x1FFFFFFFFF
; X64-AVX2-NEXT: movq %rsi, %r10
; X64-AVX2-NEXT: andq %r11, %r10
; X64-AVX2-NEXT: movl $37, %r8d
; X64-AVX2-NEXT: subq %rdx, %r8
; X64-AVX2-NEXT: andq %r11, %r9
; X64-AVX2-NEXT: movabsq $-2492803253203993461, %rbx # imm = 0xDD67C8A60DD67C8B
; X64-AVX2-NEXT: movq %r9, %rax
; X64-AVX2-NEXT: mulq %rbx
; X64-AVX2-NEXT: shrq $5, %rdx
; X64-AVX2-NEXT: imulq $37, %rdx, %rax
; X64-AVX2-NEXT: subq %rax, %r9
; X64-AVX2-NEXT: movl %r9d, %ecx
; X64-AVX2-NEXT: shrq %cl, %r10
; X64-AVX2-NEXT: movq %r8, %rax
; X64-AVX2-NEXT: andq %r11, %rax
; X64-AVX2-NEXT: mulq %rbx
; X64-AVX2-NEXT: shrl $5, %edx
; X64-AVX2-NEXT: imull $37, %edx, %eax
; X64-AVX2-NEXT: subl %eax, %r8d
; X64-AVX2-NEXT: movl %r8d, %ecx
; X64-AVX2-NEXT: shlq %cl, %rdi
; X64-AVX2-NEXT: orq %r10, %rdi
; X64-AVX2-NEXT: testq %r9, %r9
; X64-AVX2-NEXT: cmoveq %rsi, %rdi
; X64-AVX2-NEXT: movq %rdi, %rax
; X64-AVX2-NEXT: popq %rbx
; X64-AVX2-NEXT: retq
%f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
ret i37 %f
}

; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
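; i.e. fshr(112, 127, 2) = ((127 >> 2) | (112 << 5)) & 127 = 31 | 0 = 31, the
; constant checked below.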

declare i7 @llvm.fshr.i7(i7, i7, i7)
define i7 @fshr_i7_const_fold() nounwind {
; ANY-LABEL: fshr_i7_const_fold:
; ANY: # %bb.0:
; ANY-NEXT: movb $31, %al
; ANY-NEXT: ret{{[l|q]}}
%f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
ret i7 %f
}

; With constant shift amount, this is 'shrd' or 'shld'.

define i32 @fshr_i32_const_shift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_shift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_shift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldl $23, %esi, %edi
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
ret i32 %f
}

; Check modulo math on shift amount. 41-32=9, but right-shift became left, so 32-9=23.

define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_const_overshift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldl $23, %esi, %edi
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
ret i32 %f
}

; 64-bit should also work. 105-64 = 41, but right-shift became left, so 64-41=23.

define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) nounwind {
; X32-SSE2-LABEL: fshr_i64_const_overshift:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %edx
; X32-SSE2-NEXT: shrdl $9, %ecx, %eax
; X32-SSE2-NEXT: shldl $23, %ecx, %edx
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i64_const_overshift:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: shldq $23, %rsi, %rdi
; X64-AVX2-NEXT: movq %rdi, %rax
; X64-AVX2-NEXT: retq
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
ret i64 %f
}

; This should work without any node-specific logic.

define i8 @fshr_i8_const_fold() nounwind {
; ANY-LABEL: fshr_i8_const_fold:
; ANY: # %bb.0:
; ANY-NEXT: movb $-2, %al
; ANY-NEXT: ret{{[l|q]}}
%f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
ret i8 %f
}

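; Shift amounts are reduced modulo the bitwidth, so shifting by exactly 32 (or a
; splat of 32) should fold to returning %x for fshl and %y for fshr.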
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshl_i32_shift_by_bitwidth:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshl_i32_shift_by_bitwidth:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %edi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}

define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) nounwind {
; X32-SSE2-LABEL: fshr_i32_shift_by_bitwidth:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_i32_shift_by_bitwidth:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: movl %esi, %eax
; X64-AVX2-NEXT: retq
%f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
ret i32 %f
}

define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounwind {
; ANY-LABEL: fshl_v4i32_shift_by_bitwidth:
; ANY: # %bb.0:
; ANY-NEXT: ret{{[l|q]}}
%f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}

define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounwind {
; X32-SSE2-LABEL: fshr_v4i32_shift_by_bitwidth:
; X32-SSE2: # %bb.0:
; X32-SSE2-NEXT: movaps %xmm1, %xmm0
; X32-SSE2-NEXT: retl
;
; X64-AVX2-LABEL: fshr_v4i32_shift_by_bitwidth:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovaps %xmm1, %xmm0
; X64-AVX2-NEXT: retq
%f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
ret <4 x i32> %f
}