[ARM] Add MVE vector load/store instructions.

This adds the rest of the vector memory access instructions. It includes contiguous loads/stores, with an ordinary addressing mode such as [r0,#offset] (plus writeback variants); gather loads and scatter stores with a scalar base address register and a vector of offsets from it (written [r0,q1] or similar); and gather/scatters with a vector of base addresses (written [q0,#offset], again with writeback). Additionally, some of the loads can widen each loaded value into a larger vector lane, and the corresponding stores narrow them again. To implement these, we also have to add the addressing modes they need. Also, in AsmParser, the `isMem` query function now has subqueries `isGPRMem` and `isMVEMem`, according to which kind of base register is used by a given memory access operand. I've also had to add an extra check in `checkTargetMatchPredicate` in the AsmParser, without which our last-minute check of `rGPR` register operands against SP and PC was failing an assertion because Tablegen had inserted an immediate 0 in place of one of a pair of tied register operands. (This matches the way the corresponding check for `MCK_rGPR` in `validateTargetOperandClass` is guarded.) Apparently the MVE load instructions were the first to have ever triggered this assertion, but I think only because they were the first to have a combination of the usual Arm pre/post writeback system and the `rGPR` class in particular. Reviewers: dmgreen, samparker, SjoerdMeijer, t.p.northover Subscribers: javed.absar, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D62680 llvm-svn: 364291
2019-06-25 11:24:18 +00:00 · 2019-06-25 11:24:18 +00:00 · e6824160dd
parent 49b3778e32
commit e6824160dd
15 changed files with 4371 additions and 60 deletions
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@ -2477,11 +2477,14 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
      NumBits = 8;
      Scale = 2;
      break;
+    case ARMII::AddrModeT2_i7:
+    case ARMII::AddrModeT2_i7s2:
    case ARMII::AddrModeT2_i7s4:
      ImmIdx = FrameRegIdx+1;
      InstrOffs = MI.getOperand(ImmIdx).getImm();
      NumBits = 7;
-      Scale = 4;
+      Scale = (AddrMode == ARMII::AddrModeT2_i7s2 ? 2 :
+               AddrMode == ARMII::AddrModeT2_i7s4 ? 4 : 1);
      break;
    default:
      llvm_unreachable("Unsupported addressing mode!");
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@ -110,6 +110,8 @@ def AddrMode_i12    : AddrMode<16>;
 def AddrMode5FP16   : AddrMode<17>;
 def AddrModeT2_ldrex : AddrMode<18>;
 def AddrModeT2_i7s4 : AddrMode<19>;
+def AddrModeT2_i7s2 : AddrMode<20>;
+def AddrModeT2_i7   : AddrMode<21>;

 // Load / store index mode.
 class IndexMode<bits<2> val> {
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@ -1234,6 +1234,15 @@ def addr_offset_none : MemOperand,
  let MIOperandInfo = (ops GPR:$base);
 }

+// t_addr_offset_none := reg [r0-r7]
+def MemNoOffsetTAsmOperand : AsmOperandClass { let Name = "MemNoOffsetT"; }
+def t_addr_offset_none : MemOperand {
+  let PrintMethod = "printAddrMode7Operand";
+  let DecoderMethod = "DecodetGPRRegisterClass";
+  let ParserMatchClass = MemNoOffsetTAsmOperand;
+  let MIOperandInfo = (ops tGPR:$base);
+}
+
 def nohash_imm : Operand<i32> {
  let PrintMethod = "printNoHashImmediate";
 }
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@ -153,6 +153,127 @@ def VecList4Q : RegisterOperand<QQQQPR, "printMVEVectorListFourQ"> {
  let PrintMethod = "printMVEVectorList<4>";
 }

+// taddrmode_imm7  := reg[r0-r7] +/- (imm7 << shift)
+class TMemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
+  let Name = "TMemImm7Shift"#shift#"Offset";
+  let PredicateMethod = "isMemImm7ShiftedOffset<"#shift#",ARM::tGPRRegClassID>";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
+
+class taddrmode_imm7<int shift> : MemOperand {
+  let ParserMatchClass = TMemImm7ShiftOffsetAsmOperand<shift>;
+  // They are printed the same way as the T2 imm8 version
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  // This can also be the same as the T2 version.
+  let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
+  let DecoderMethod = "DecodeTAddrModeImm7<"#shift#">";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm7  := reg +/- (imm7)
+class MemImm7ShiftOffsetAsmOperand<int shift> : AsmOperandClass {
+  let Name = "MemImm7Shift"#shift#"Offset";
+  let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
+                        ",ARM::GPRnopcRegClassID>";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemImm7Shift0OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<0>;
+def MemImm7Shift1OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<1>;
+def MemImm7Shift2OffsetAsmOperand : MemImm7ShiftOffsetAsmOperand<2>;
+class T2AddrMode_Imm7<int shift> : MemOperand,
+      ComplexPattern<i32, 2, "SelectT2AddrModeImm7<"#shift#">", []> {
+  let EncoderMethod = "getT2AddrModeImmOpValue<7,"#shift#">";
+  let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 0>";
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetAsmOperand");
+  let MIOperandInfo = (ops GPRnopc:$base, i32imm:$offsimm);
+}
+
+class t2addrmode_imm7<int shift> : T2AddrMode_Imm7<shift> {
+  // They are printed the same way as the imm8 version
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+class MemImm7ShiftOffsetWBAsmOperand<int shift> : AsmOperandClass {
+  let Name = "MemImm7Shift"#shift#"OffsetWB";
+  let PredicateMethod = "isMemImm7ShiftedOffset<" # shift #
+                        ",ARM::rGPRRegClassID>";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemImm7Shift0OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<0>;
+def MemImm7Shift1OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<1>;
+def MemImm7Shift2OffsetWBAsmOperand : MemImm7ShiftOffsetWBAsmOperand<2>;
+
+class t2addrmode_imm7_pre<int shift> : T2AddrMode_Imm7<shift> {
+  // They are printed the same way as the imm8 version
+  let PrintMethod = "printT2AddrModeImm8Operand<true>";
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("MemImm7Shift"#shift#"OffsetWBAsmOperand");
+  let DecoderMethod = "DecodeT2AddrModeImm7<"#shift#", 1>";
+  let MIOperandInfo = (ops rGPR:$base, i32imm:$offsim);
+}
+
+class t2am_imm7shiftOffsetAsmOperand<int shift>
+  : AsmOperandClass { let Name = "Imm7Shift"#shift; }
+def t2am_imm7shift0OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<0>;
+def t2am_imm7shift1OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<1>;
+def t2am_imm7shift2OffsetAsmOperand : t2am_imm7shiftOffsetAsmOperand<2>;
+
+class t2am_imm7_offset<int shift> : MemOperand {
+  // They are printed the same way as the imm8 version
+  let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("t2am_imm7shift"#shift#"OffsetAsmOperand");
+  let EncoderMethod = "getT2ScaledImmOpValue<7,"#shift#">";
+  let DecoderMethod = "DecodeT2Imm7<"#shift#">";
+}
+
+// Operands for gather/scatter loads of the form [Rbase, Qoffsets]
+class MemRegRQOffsetAsmOperand<int shift> : AsmOperandClass {
+  let Name = "MemRegRQS"#shift#"Offset";
+  let PredicateMethod = "isMemRegRQOffset<"#shift#">";
+  let RenderMethod = "addMemRegRQOffsetOperands";
+}
+
+def MemRegRQS0OffsetAsmOperand : MemRegRQOffsetAsmOperand<0>;
+def MemRegRQS1OffsetAsmOperand : MemRegRQOffsetAsmOperand<1>;
+def MemRegRQS2OffsetAsmOperand : MemRegRQOffsetAsmOperand<2>;
+def MemRegRQS3OffsetAsmOperand : MemRegRQOffsetAsmOperand<3>;
+
+// mve_addr_rq_shift  := reg + vreg{ << UXTW #shift}
+class mve_addr_rq_shift<int shift> : MemOperand {
+  let EncoderMethod = "getMveAddrModeRQOpValue";
+  let PrintMethod = "printMveAddrModeRQOperand<"#shift#">";
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("MemRegRQS"#shift#"OffsetAsmOperand");
+  let DecoderMethod = "DecodeMveAddrModeRQ";
+  let MIOperandInfo = (ops GPRnopc:$base, MQPR:$offsreg);
+}
+
+class MemRegQOffsetAsmOperand<int shift> : AsmOperandClass {
+  let Name = "MemRegQS"#shift#"Offset";
+  let PredicateMethod = "isMemRegQOffset<"#shift#">";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
+
+def MemRegQS2OffsetAsmOperand : MemRegQOffsetAsmOperand<2>;
+def MemRegQS3OffsetAsmOperand : MemRegQOffsetAsmOperand<3>;
+
+// mve_addr_q_shift  := vreg {+ #imm7s2/4}
+class mve_addr_q_shift<int shift> : MemOperand {
+  let EncoderMethod = "getMveAddrModeQOpValue<"#shift#">";
+  // Can be printed same way as other reg + imm operands
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+  let ParserMatchClass =
+    !cast<AsmOperandClass>("MemRegQS"#shift#"OffsetAsmOperand");
+  let DecoderMethod = "DecodeMveAddrModeQ<"#shift#">";
+  let MIOperandInfo = (ops MQPR:$base, i32imm:$imm);
+}
+
+// --------- Start of base classes for the instructions themselves
+
 class MVE_MI<dag oops, dag iops, InstrItinClass itin, string asm,
             string ops, string cstr, list<dag> pattern>
  : Thumb2XI<oops, iops, AddrModeNone, 4, itin, !strconcat(asm, "\t", ops), cstr,
@ -3245,6 +3366,359 @@ foreach wb = [MVE_vldst24_writeback<

 // end of MVE interleaving load/store

+// start of MVE predicable load/store
+
+// A parameter class for the direction of transfer.
+class MVE_ldst_direction<bit b, dag Oo, dag Io, string c=""> {
+  bit load = b;
+  dag Oops = Oo;
+  dag Iops = Io;
+  string cstr = c;
+}
+def MVE_ld: MVE_ldst_direction<1, (outs MQPR:$Qd), (ins), ",@earlyclobber $Qd">;
+def MVE_st: MVE_ldst_direction<0, (outs), (ins MQPR:$Qd)>;
+
+// A parameter class for the size of memory access in a load.
+class MVE_memsz<bits<2> e, int s, AddrMode m, string mn, list<string> types> {
+  bits<2> encoding = e;         // opcode bit(s) for encoding
+  int shift = s;                // shift applied to immediate load offset
+  AddrMode AM = m;
+
+  // For instruction aliases: define the complete list of type
+  // suffixes at this size, and the canonical ones for loads and
+  // stores.
+  string MnemonicLetter = mn;
+  int TypeBits = !shl(8, s);
+  string CanonLoadSuffix = ".u" # TypeBits;
+  string CanonStoreSuffix = "." # TypeBits;
+  list<string> suffixes = !foreach(letter, types, "." # letter # TypeBits);
+}
+
+// Instances of MVE_memsz.
+//
+// (memD doesn't need an AddrMode, because those are only for
+// contiguous loads, and memD is only used by gather/scatters.)
+def MVE_memB: MVE_memsz<0b00, 0, AddrModeT2_i7,   "b", ["", "u", "s"]>;
+def MVE_memH: MVE_memsz<0b01, 1, AddrModeT2_i7s2, "h", ["", "u", "s", "f"]>;
+def MVE_memW: MVE_memsz<0b10, 2, AddrModeT2_i7s4, "w", ["", "u", "s", "f"]>;
+def MVE_memD: MVE_memsz<0b11, 3, ?,               "d", ["", "u", "s", "f"]>;
+
+// This is the base class for all the MVE loads and stores other than
+// the interleaving ones. All the non-interleaving loads/stores share
+// the characteristic that they operate on just one vector register,
+// so they are VPT-predicable.
+//
+// The predication operand is vpred_n, for both loads and stores. For
+// store instructions, the reason is obvious: if there is no output
+// register, there can't be a need for an input parameter giving the
+// output register's previous value. Load instructions also don't need
+// that input parameter, because unlike MVE data processing
+// instructions, predicated loads are defined to set the inactive
+// lanes of the output register to zero, instead of preserving their
+// input values.
+class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
+                       dag oops, dag iops, string asm, string suffix,
+                       string ops, string cstr, list<dag> pattern=[]>
+ : MVE_p<oops, iops, NoItinerary, asm, suffix, ops, vpred_n, cstr, pattern> {
+  bits<3> Qd;
+
+  let Inst{28} = U;
+  let Inst{25} = 0b0;
+  let Inst{24} = P;
+  let Inst{22} = 0b0;
+  let Inst{21} = W;
+  let Inst{20} = dir.load;
+  let Inst{15-13} = Qd{2-0};
+  let Inst{12} = opc;
+  let Inst{11-9} = 0b111;
+
+  let mayLoad = dir.load;
+  let mayStore = !eq(dir.load,0);
+}
+
+// Contiguous load and store instructions. These come in two main
+// categories: same-size loads/stores in which 128 bits of vector
+// register is transferred to or from 128 bits of memory in the most
+// obvious way, and widening loads / narrowing stores, in which the
+// size of memory accessed is less than the size of a vector register,
+// so the load instructions sign- or zero-extend each memory value
+// into a wider vector lane, and the store instructions truncate
+// correspondingly.
+//
+// The instruction mnemonics for these two classes look reasonably
+// similar, but the actual encodings are different enough to need two
+// separate base classes.
+
+// Contiguous, same size
+class MVE_VLDRSTR_cs<MVE_ldst_direction dir, MVE_memsz memsz, bit P, bit W,
+                     dag oops, dag iops, string asm, string suffix,
+                     IndexMode im, string ops, string cstr>
+  : MVE_VLDRSTR_base<dir, 0, P, W, 1, oops, iops, asm, suffix, ops, cstr> {
+  bits<12> addr;
+  let Inst{23} = addr{7};
+  let Inst{19-16} = addr{11-8};
+  let Inst{8-7} = memsz.encoding;
+  let Inst{6-0} = addr{6-0};
+}
+
+// Contiguous, widening/narrowing
+class MVE_VLDRSTR_cw<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
+                     bit P, bit W, bits<2> size, dag oops, dag iops,
+                     string asm, string suffix, IndexMode im,
+                     string ops, string cstr>
+  : MVE_VLDRSTR_base<dir, U, P, W, 0, oops, iops, asm, suffix, ops, cstr> {
+  bits<11> addr;
+  let Inst{23} = addr{7};
+  let Inst{19} = memsz.encoding{0}; // enough to tell 16- from 32-bit
+  let Inst{18-16} = addr{10-8};
+  let Inst{8-7} = size;
+  let Inst{6-0} = addr{6-0};
+
+  let IM = im;
+}
+
+// Multiclass wrapper on each of the _cw and _cs base classes, to
+// generate three writeback modes (none, preindex, postindex).
+
+multiclass MVE_VLDRSTR_cw_m<MVE_ldst_direction dir, MVE_memsz memsz,
+                            string asm, string suffix, bit U, bits<2> size> {
+  let AM = memsz.AM in {
+    def "" : MVE_VLDRSTR_cw<
+        dir, memsz, U, 1, 0, size,
+        dir.Oops, !con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
+        asm, suffix, IndexModeNone, "$Qd, $addr", "">;
+
+    def _pre : MVE_VLDRSTR_cw<
+        dir, memsz, U, 1, 1, size,
+        !con((outs tGPR:$wb), dir.Oops),
+        !con(dir.Iops, (ins taddrmode_imm7<memsz.shift>:$addr)),
+        asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
+      let DecoderMethod = "DecodeMVE_MEM_1_pre<"#memsz.shift#">";
+    }
+
+    def _post : MVE_VLDRSTR_cw<
+        dir, memsz, U, 0, 1, size,
+        !con((outs tGPR:$wb), dir.Oops),
+        !con(dir.Iops, (ins t_addr_offset_none:$Rn,
+                            t2am_imm7_offset<memsz.shift>:$addr)),
+        asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
+      bits<4> Rn;
+      let Inst{18-16} = Rn{2-0};
+    }
+  }
+}
+
+multiclass MVE_VLDRSTR_cs_m<MVE_ldst_direction dir, MVE_memsz memsz,
+                            string asm, string suffix> {
+  let AM = memsz.AM in {
+    def "" : MVE_VLDRSTR_cs<
+        dir, memsz, 1, 0,
+        dir.Oops, !con(dir.Iops, (ins t2addrmode_imm7<memsz.shift>:$addr)),
+        asm, suffix, IndexModeNone, "$Qd, $addr", "">;
+
+    def _pre : MVE_VLDRSTR_cs<
+        dir, memsz, 1, 1,
+        !con((outs rGPR:$wb), dir.Oops),
+        !con(dir.Iops, (ins t2addrmode_imm7_pre<memsz.shift>:$addr)),
+        asm, suffix, IndexModePre, "$Qd, $addr!", "$addr.base = $wb"> {
+      let DecoderMethod = "DecodeMVE_MEM_2_pre<"#memsz.shift#">";
+    }
+
+    def _post : MVE_VLDRSTR_cs<
+        dir, memsz, 0, 1,
+        !con((outs rGPR:$wb), dir.Oops),
+        // We need an !if here to select the base register class,
+        // because it's legal to write back to SP in a load of this
+        // type, but not in a store.
+        !con(dir.Iops, (ins !if(dir.load, t2_addr_offset_none,
+                                          t2_nosp_addr_offset_none):$Rn,
+                            t2am_imm7_offset<memsz.shift>:$addr)),
+        asm, suffix, IndexModePost, "$Qd, $Rn$addr", "$Rn.base = $wb"> {
+      bits<4> Rn;
+      let Inst{19-16} = Rn{3-0};
+    }
+  }
+}
+
+// Now actually declare all the contiguous load/stores, via those
+// multiclasses. The instruction ids coming out of this are the bare
+// names shown in the defm, with _pre or _post appended for writeback,
+// e.g. MVE_VLDRBS16, MVE_VSTRB16_pre, MVE_VSTRHU16_post.
+
+defm MVE_VLDRBS16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s16", 0, 0b01>;
+defm MVE_VLDRBS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "s32", 0, 0b10>;
+defm MVE_VLDRBU16: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u16", 1, 0b01>;
+defm MVE_VLDRBU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memB, "vldrb", "u32", 1, 0b10>;
+defm MVE_VLDRHS32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "s32", 0, 0b10>;
+defm MVE_VLDRHU32: MVE_VLDRSTR_cw_m<MVE_ld, MVE_memH, "vldrh", "u32", 1, 0b10>;
+
+defm MVE_VLDRBU8:  MVE_VLDRSTR_cs_m<MVE_ld, MVE_memB, "vldrb", "u8">;
+defm MVE_VLDRHU16: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memH, "vldrh", "u16">;
+defm MVE_VLDRWU32: MVE_VLDRSTR_cs_m<MVE_ld, MVE_memW, "vldrw", "u32">;
+
+defm MVE_VSTRB16:  MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "16",  0, 0b01>;
+defm MVE_VSTRB32:  MVE_VLDRSTR_cw_m<MVE_st, MVE_memB, "vstrb", "32",  0, 0b10>;
+defm MVE_VSTRH32:  MVE_VLDRSTR_cw_m<MVE_st, MVE_memH, "vstrh", "32",  0, 0b10>;
+
+defm MVE_VSTRBU8 : MVE_VLDRSTR_cs_m<MVE_st, MVE_memB, "vstrb", "8">;
+defm MVE_VSTRHU16: MVE_VLDRSTR_cs_m<MVE_st, MVE_memH, "vstrh", "16">;
+defm MVE_VSTRWU32: MVE_VLDRSTR_cs_m<MVE_st, MVE_memW, "vstrw", "32">;
+
+// Gather loads / scatter stores whose address operand is of the form
+// [Rn,Qm], i.e. a single GPR as the common base address, plus a
+// vector of offset from it. ('Load/store this sequence of elements of
+// the same array.')
+//
+// Like the contiguous family, these loads and stores can widen the
+// loaded values / truncate the stored ones, or they can just
+// load/store the same size of memory and vector lane. But unlike the
+// contiguous family, there's no particular difference in encoding
+// between those two cases.
+//
+// This family also comes with the option to scale the offset values
+// in Qm by the size of the loaded memory (i.e. to treat them as array
+// indices), or not to scale them (to treat them as plain byte offsets
+// in memory, so that perhaps the loaded values are unaligned). The
+// scaled instructions' address operand in assembly looks like
+// [Rn,Qm,UXTW #2] or similar.
+
+// Base class.
+class MVE_VLDRSTR_rq<MVE_ldst_direction dir, MVE_memsz memsz, bit U,
+                     bits<2> size, bit os, string asm, string suffix, int shift>
+  : MVE_VLDRSTR_base<dir, U, 0b0, 0b0, 0, dir.Oops,
+                     !con(dir.Iops, (ins mve_addr_rq_shift<shift>:$addr)),
+                     asm, suffix, "$Qd, $addr", dir.cstr> {
+  bits<7> addr;
+  let Inst{23} = 0b1;
+  let Inst{19-16} = addr{6-3};
+  let Inst{8-7} = size;
+  let Inst{6} = memsz.encoding{1};
+  let Inst{5} = 0;
+  let Inst{4} = memsz.encoding{0};
+  let Inst{3-1} = addr{2-0};
+  let Inst{0} = os;
+}
+
+// Multiclass that defines the scaled and unscaled versions of an
+// instruction, when the memory size is wider than a byte. The scaled
+// version gets the default name like MVE_VLDRBU16_rq; the unscaled /
+// potentially unaligned version gets a "_u" suffix, e.g.
+// MVE_VLDRBU16_rq_u.
+multiclass MVE_VLDRSTR_rq_w<MVE_ldst_direction dir, MVE_memsz memsz,
+                            string asm, string suffix, bit U, bits<2> size> {
+  def _u : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
+  def "" : MVE_VLDRSTR_rq<dir, memsz, U, size, 1, asm, suffix, memsz.shift>;
+}
+
+// Subclass of MVE_VLDRSTR_rq with the same API as that multiclass,
+// for use when the memory size is one byte, so there's no 'scaled'
+// version of the instruction at all. (This is encoded as if it were
+// unscaled, but named in the default way with no _u suffix.)
+class MVE_VLDRSTR_rq_b<MVE_ldst_direction dir, MVE_memsz memsz,
+                       string asm, string suffix, bit U, bits<2> size>
+  : MVE_VLDRSTR_rq<dir, memsz, U, size, 0, asm, suffix, 0>;
+
+// Actually define all the loads and stores in this family.
+
+def  MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u8",  1,0b00>;
+def  MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u16", 1,0b01>;
+def  MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s16", 0,0b01>;
+def  MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","u32", 1,0b10>;
+def  MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b<MVE_ld, MVE_memB, "vldrb","s32", 0,0b10>;
+
+defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u16", 1,0b01>;
+defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","u32", 1,0b10>;
+defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memH, "vldrh","s32", 0,0b10>;
+defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memW, "vldrw","u32", 1,0b10>;
+defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w<MVE_ld, MVE_memD, "vldrd","u64", 1,0b11>;
+
+def  MVE_VSTRB8_rq  : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","8",   0,0b00>;
+def  MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","16",  0,0b01>;
+def  MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b<MVE_st, MVE_memB, "vstrb","32",  0,0b10>;
+
+defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","16",  0,0b01>;
+defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memH, "vstrh","32",  0,0b10>;
+defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memW, "vstrw","32",  0,0b10>;
+defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w<MVE_st, MVE_memD, "vstrd","64",  0,0b11>;
+
+// Gather loads / scatter stores whose address operand is of the form
+// [Qm,#imm], i.e. a vector containing a full base address for each
+// loaded item, plus an immediate offset applied consistently to all
+// of them. ('Load/store the same field from this vector of pointers
+// to a structure type.')
+//
+// This family requires the vector lane size to be at least 32 bits
+// (so there's room for an address in each lane at all). It has no
+// widening/narrowing variants. But it does support preindex
+// writeback, in which the address vector is updated to hold the
+// addresses actually loaded from.
+
+// Base class.
+class MVE_VLDRSTR_qi<MVE_ldst_direction dir, MVE_memsz memsz, bit W, dag wbops,
+                     string asm, string wbAsm, string suffix, string cstr = "">
+  : MVE_VLDRSTR_base<dir, 1, 1, W, 1, !con(wbops, dir.Oops),
+                     !con(dir.Iops, (ins mve_addr_q_shift<memsz.shift>:$addr)),
+                     asm, suffix, "$Qd, $addr" # wbAsm, cstr # dir.cstr> {
+  bits<11> addr;
+  let Inst{23} = addr{7};
+  let Inst{19-17} = addr{10-8};
+  let Inst{16} = 0;
+  let Inst{8} = memsz.encoding{0}; // enough to distinguish 32- from 64-bit
+  let Inst{7} = 0;
+  let Inst{6-0} = addr{6-0};
+}
+
+// Multiclass that generates the non-writeback and writeback variants.
+multiclass MVE_VLDRSTR_qi_m<MVE_ldst_direction dir, MVE_memsz memsz,
+                            string asm, string suffix> {
+  def ""   : MVE_VLDRSTR_qi<dir, memsz, 0, (outs),          asm, "",  suffix>;
+  def _pre : MVE_VLDRSTR_qi<dir, memsz, 1, (outs MQPR:$wb), asm, "!", suffix,
+                            "$addr.base = $wb"> {
+    let DecoderMethod="DecodeMVE_MEM_3_pre<"#memsz.shift#">";
+  }
+}
+
+// Actual instruction definitions.
+defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memW, "vldrw", "u32">;
+defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m<MVE_ld, MVE_memD, "vldrd", "u64">;
+defm MVE_VSTRW32_qi:  MVE_VLDRSTR_qi_m<MVE_st, MVE_memW, "vstrw", "32">;
+defm MVE_VSTRD64_qi:  MVE_VLDRSTR_qi_m<MVE_st, MVE_memD, "vstrd", "64">;
+
+// Define aliases for all the instructions where memory size and
+// vector lane size are the same. These are mnemonic aliases, so they
+// apply consistently across all of the above families - contiguous
+// loads, and both the rq and qi types of gather/scatter.
+//
+// Rationale: As long as you're loading (for example) 16-bit memory
+// values into 16-bit vector lanes, you can think of them as signed or
+// unsigned integers, fp16 or just raw 16-bit blobs and it makes no
+// difference. So we permit all of vldrh.16, vldrh.u16, vldrh.s16,
+// vldrh.f16 and treat them all as equivalent to the canonical
+// spelling (which happens to be .u16 for loads, and just .16 for
+// stores).
+
+foreach vpt_cond = ["", "t", "e"] in
+foreach memsz = [MVE_memB, MVE_memH, MVE_memW, MVE_memD] in
+foreach suffix = memsz.suffixes in {
+
+  // These foreaches are conceptually ifs, implemented by iterating a
+  // dummy variable over a list with 0 or 1 elements depending on the
+  // condition. The idea is to iterate over _nearly_ all the suffixes
+  // in memsz.suffixes, but omit the one we want all the others to alias.
+
+  foreach _ = !if(!ne(suffix, memsz.CanonLoadSuffix), [1], []<int>) in
+  def : MnemonicAlias<
+    "vldr" # memsz.MnemonicLetter # vpt_cond # suffix,
+    "vldr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonLoadSuffix>;
+
+  foreach _ = !if(!ne(suffix, memsz.CanonStoreSuffix), [1], []<int>) in
+  def : MnemonicAlias<
+    "vstr" # memsz.MnemonicLetter # vpt_cond # suffix,
+    "vstr" # memsz.MnemonicLetter # vpt_cond # memsz.CanonStoreSuffix>;
+}
+
+// end of MVE predicable load/store
+
 class MVE_VPT<string suffix, bits<2> size, dag iops, string asm, list<dag> pattern=[]>
  : MVE_MI<(outs ), iops, NoItinerary, !strconcat("vpt", "${Mk}", ".", suffix), asm, "", pattern> {
  bits<3> fc;
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@ -212,31 +212,40 @@ def t2adrlabel : Operand<i32> {
 }

 // t2addrmode_posimm8  := reg + imm8
-def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
+def MemPosImm8OffsetAsmOperand : AsmOperandClass {
+  let Name="MemPosImm8Offset";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
 def t2addrmode_posimm8 : MemOperand {
  let PrintMethod = "printT2AddrModeImm8Operand<false>";
-  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
  let DecoderMethod = "DecodeT2AddrModeImm8";
  let ParserMatchClass = MemPosImm8OffsetAsmOperand;
  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }

 // t2addrmode_negimm8  := reg - imm8
-def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
+def MemNegImm8OffsetAsmOperand : AsmOperandClass {
+  let Name="MemNegImm8Offset";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
 def t2addrmode_negimm8 : MemOperand,
                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
  let PrintMethod = "printT2AddrModeImm8Operand<false>";
-  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
  let DecoderMethod = "DecodeT2AddrModeImm8";
  let ParserMatchClass = MemNegImm8OffsetAsmOperand;
  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }

 // t2addrmode_imm8  := reg +/- imm8
-def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
+def MemImm8OffsetAsmOperand : AsmOperandClass {
+  let Name = "MemImm8Offset";
+  let RenderMethod = "addMemImmOffsetOperands";
+}
 class T2AddrMode_Imm8 : MemOperand,
                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
-  let EncoderMethod = "getT2AddrModeImm8OpValue";
+  let EncoderMethod = "getT2AddrModeImmOpValue<8,0>";
  let DecoderMethod = "DecodeT2AddrModeImm8";
  let ParserMatchClass = MemImm8OffsetAsmOperand;
  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@ -1025,7 +1025,7 @@ public:
      if (!CE) return false;
      Val = CE->getValue();
    }
-    else if (isMem()) {
+    else if (isGPRMem()) {
      if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
      if(Memory.BaseRegNum != ARM::PC) return false;
      Val = Memory.OffsetImm->getValue();
@ -1059,7 +1059,14 @@ public:
    int64_t Value = CE->getValue();
    return ((Value & 3) == 0) && Value >= N && Value <= M;
  }
-
+  template<int64_t N, int64_t M>
+  bool isImmediateS2() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int64_t Value = CE->getValue();
+    return ((Value & 1) == 0) && Value >= N && Value <= M;
+  }
  bool isFBits16() const {
    return isImmediate<0, 17>();
  }
@ -1072,6 +1079,18 @@ public:
  bool isImm7s4() const {
    return isImmediateS4<-508, 508>();
  }
+  bool isImm7Shift0() const {
+    return isImmediate<-127, 127>();
+  }
+  bool isImm7Shift1() const {
+    return isImmediateS2<-255, 255>();
+  }
+  bool isImm7Shift2() const {
+    return isImmediateS4<-511, 511>();
+  }
+  bool isImm7() const {
+    return isImmediate<-127, 127>();
+  }
  bool isImm0_1020s4() const {
    return isImmediateS4<0, 1020>();
  }
@ -1253,6 +1272,22 @@ public:
  bool isInstSyncBarrierOpt() const { return Kind == k_InstSyncBarrierOpt; }
  bool isTraceSyncBarrierOpt() const { return Kind == k_TraceSyncBarrierOpt; }
  bool isMem() const override {
+      return isGPRMem() || isMVEMem();
+  }
+  bool isMVEMem() const {
+    if (Kind != k_Memory)
+      return false;
+    if (Memory.BaseRegNum &&
+        !ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Memory.BaseRegNum) &&
+        !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(Memory.BaseRegNum))
+      return false;
+    if (Memory.OffsetRegNum &&
+        !ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+            Memory.OffsetRegNum))
+      return false;
+    return true;
+  }
+  bool isGPRMem() const {
    if (Kind != k_Memory)
      return false;
    if (Memory.BaseRegNum &&
@ -1332,14 +1367,14 @@ public:
    return isPostIdxRegShifted() && PostIdxReg.ShiftTy == ARM_AM::no_shift;
  }
  bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
-    if (!isMem())
+    if (!isGPRMem())
      return false;
    // No offset of any kind.
    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
     (alignOK || Memory.Alignment == Alignment);
  }
  bool isMemNoOffsetT2(bool alignOK = false, unsigned Alignment = 0) const {
-    if (!isMem())
+    if (!isGPRMem())
      return false;

    if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
@ -1351,7 +1386,7 @@ public:
     (alignOK || Memory.Alignment == Alignment);
  }
  bool isMemNoOffsetT2NoSp(bool alignOK = false, unsigned Alignment = 0) const {
-    if (!isMem())
+    if (!isGPRMem())
      return false;

    if (!ARMMCRegisterClasses[ARM::rGPRRegClassID].contains(
@ -1362,8 +1397,20 @@ public:
    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
     (alignOK || Memory.Alignment == Alignment);
  }
+  bool isMemNoOffsetT(bool alignOK = false, unsigned Alignment = 0) const {
+    if (!isGPRMem())
+      return false;
+
+    if (!ARMMCRegisterClasses[ARM::tGPRRegClassID].contains(
+            Memory.BaseRegNum))
+      return false;
+
+    // No offset of any kind.
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+     (alignOK || Memory.Alignment == Alignment);
+  }
  bool isMemPCRelImm12() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Base register must be PC.
    if (Memory.BaseRegNum != ARM::PC)
@ -1450,7 +1497,7 @@ public:
  }

  bool isAddrMode2() const {
-    if (!isMem() || Memory.Alignment != 0) return false;
+    if (!isGPRMem() || Memory.Alignment != 0) return false;
    // Check for register offset.
    if (Memory.OffsetRegNum) return true;
    // Immediate offset in range [-4095, 4095].
@ -1475,7 +1522,7 @@ public:
    // and we reject it.
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;
-    if (!isMem() || Memory.Alignment != 0) return false;
+    if (!isGPRMem() || Memory.Alignment != 0) return false;
    // No shifts are legal for AM3.
    if (Memory.ShiftType != ARM_AM::no_shift) return false;
    // Check for register offset.
@ -1509,7 +1556,7 @@ public:
    // and we reject it.
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;
-    if (!isMem() || Memory.Alignment != 0) return false;
+    if (!isGPRMem() || Memory.Alignment != 0) return false;
    // Check for register offset.
    if (Memory.OffsetRegNum) return false;
    // Immediate offset in range [-1020, 1020] and a multiple of 4.
@ -1525,7 +1572,7 @@ public:
    // and we reject it.
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;
-    if (!isMem() || Memory.Alignment != 0) return false;
+    if (!isGPRMem() || Memory.Alignment != 0) return false;
    // Check for register offset.
    if (Memory.OffsetRegNum) return false;
    // Immediate offset in range [-510, 510] and a multiple of 2.
@ -1536,14 +1583,14 @@ public:
  }

  bool isMemTBB() const {
-    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+    if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
      return false;
    return true;
  }

  bool isMemTBH() const {
-    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+    if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
        Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
        Memory.Alignment != 0 )
      return false;
@ -1551,13 +1598,13 @@ public:
  }

  bool isMemRegOffset() const {
-    if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+    if (!isGPRMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
      return false;
    return true;
  }

  bool isT2MemRegOffset() const {
-    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+    if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
        Memory.Alignment != 0 || Memory.BaseRegNum == ARM::PC)
      return false;
    // Only lsl #{0, 1, 2, 3} allowed.
@ -1571,7 +1618,7 @@ public:
  bool isMemThumbRR() const {
    // Thumb reg+reg addressing is simple. Just two registers, a base and
    // an offset. No shifts, negations or any other complicating factors.
-    if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
+    if (!isGPRMem() || !Memory.OffsetRegNum || Memory.isNegative ||
        Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
      return false;
    return isARMLowRegister(Memory.BaseRegNum) &&
@ -1579,7 +1626,7 @@ public:
  }

  bool isMemThumbRIs4() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 ||
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
      return false;
    // Immediate offset, multiple of 4 in range [0, 124].
@ -1589,7 +1636,7 @@ public:
  }

  bool isMemThumbRIs2() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 ||
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
      return false;
    // Immediate offset, multiple of 4 in range [0, 62].
@ -1599,7 +1646,7 @@ public:
  }

  bool isMemThumbRIs1() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 ||
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
        !isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
      return false;
    // Immediate offset in range [0, 31].
@ -1609,7 +1656,7 @@ public:
  }

  bool isMemThumbSPI() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 ||
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 ||
        Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
      return false;
    // Immediate offset, multiple of 4 in range [0, 1020].
@ -1624,7 +1671,7 @@ public:
    // and we reject it.
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Immediate offset a multiple of 4 in range [-1020, 1020].
    if (!Memory.OffsetImm) return true;
@ -1639,7 +1686,7 @@ public:
    // and we reject it.
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
        !ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
            Memory.BaseRegNum))
      return false;
@ -1650,7 +1697,7 @@ public:
    return (Val >= -508 && Val <= 508 && (Val & 3) == 0) || Val == INT32_MIN;
  }
  bool isMemImm0_1020s4Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Immediate offset a multiple of 4 in range [0, 1020].
    if (!Memory.OffsetImm) return true;
@ -1659,7 +1706,7 @@ public:
  }

  bool isMemImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Base reg of PC isn't allowed for these encodings.
    if (Memory.BaseRegNum == ARM::PC) return false;
@ -1670,8 +1717,81 @@ public:
           (Val > -256 && Val < 256);
  }

+  template<unsigned Bits, unsigned RegClassID>
+  bool isMemImm7ShiftedOffset() const {
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0 ||
+        !ARMMCRegisterClasses[RegClassID].contains(Memory.BaseRegNum))
+      return false;
+
+    // Expect an immediate offset equal to an element of the range
+    // [-127, 127], shifted left by Bits.
+
+    if (!Memory.OffsetImm) return true;
+    int64_t Val = Memory.OffsetImm->getValue();
+
+    // INT32_MIN is a special-case value (indicating the encoding with
+    // zero offset and the subtract bit set)
+    if (Val == INT32_MIN)
+      return true;
+
+    unsigned Divisor = 1U << Bits;
+
+    // Check that the low bits are zero
+    if (Val % Divisor != 0)
+      return false;
+
+    // Check that the remaining offset is within range.
+    Val /= Divisor;
+    return (Val >= -127 && Val <= 127);
+  }
+
+  template <int shift> bool isMemRegRQOffset() const {
+    if (!isMVEMem() || Memory.OffsetImm != 0 || Memory.Alignment != 0)
+      return false;
+
+    if (!ARMMCRegisterClasses[ARM::GPRnopcRegClassID].contains(
+            Memory.BaseRegNum))
+      return false;
+    if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+            Memory.OffsetRegNum))
+      return false;
+
+    if (shift == 0 && Memory.ShiftType != ARM_AM::no_shift)
+      return false;
+
+    if (shift > 0 &&
+        (Memory.ShiftType != ARM_AM::uxtw || Memory.ShiftImm != shift))
+      return false;
+
+    return true;
+  }
+
+  template <int shift> bool isMemRegQOffset() const {
+    if (!isMVEMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+      return false;
+
+    if (!ARMMCRegisterClasses[ARM::MQPRRegClassID].contains(
+            Memory.BaseRegNum))
+      return false;
+
+    if(!Memory.OffsetImm) return true;
+    static_assert(shift < 56,
+                  "Such that we dont shift by a value higher than 62");
+    int64_t Val = Memory.OffsetImm->getValue();
+
+    // The value must be a multiple of (1 << shift)
+    if ((Val & ((1U << shift) - 1)) != 0)
+      return false;
+
+    // And be in the right range, depending on the amount that it is shifted
+    // by.  Shift 0, is equal to 7 unsigned bits, the sign bit is set
+    // separately.
+    int64_t Range = (1U << (7+shift)) - 1;
+    return (Val == INT32_MIN) || (Val > -Range && Val < Range);
+  }
+
  bool isMemPosImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Immediate offset in range [0, 255].
    if (!Memory.OffsetImm) return true;
@ -1680,7 +1800,7 @@ public:
  }

  bool isMemNegImm8Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Base reg of PC isn't allowed for these encodings.
    if (Memory.BaseRegNum == ARM::PC) return false;
@ -1692,7 +1812,7 @@ public:
  }

  bool isMemUImm12Offset() const {
-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Immediate offset in range [0, 4095].
    if (!Memory.OffsetImm) return true;
@ -1708,7 +1828,7 @@ public:
    if (isImm() && !isa<MCConstantExpr>(getImm()))
      return true;

-    if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+    if (!isGPRMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
      return false;
    // Immediate offset in range [-4095, 4095].
    if (!Memory.OffsetImm) return true;
@ -2423,6 +2543,34 @@ public:
    Inst.addOperand(MCOperand::createImm(CE->getValue()));
  }

+  void addImm7Shift0Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE != nullptr && "Invalid operand type!");
+    Inst.addOperand(MCOperand::createImm(CE->getValue()));
+  }
+
+  void addImm7Shift1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE != nullptr && "Invalid operand type!");
+    Inst.addOperand(MCOperand::createImm(CE->getValue()));
+  }
+
+  void addImm7Shift2Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE != nullptr && "Invalid operand type!");
+    Inst.addOperand(MCOperand::createImm(CE->getValue()));
+  }
+
+  void addImm7Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    assert(CE != nullptr && "Invalid operand type!");
+    Inst.addOperand(MCOperand::createImm(CE->getValue()));
+  }
+
  void addImm0_1020s4Operands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    // The immediate is scaled by four in the encoding and is stored
@ -2532,7 +2680,7 @@ public:
      return;
    }

-    assert(isMem()  && "Unknown value type!");
+    assert(isGPRMem()  && "Unknown value type!");
    assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
    Inst.addOperand(MCOperand::createImm(Memory.OffsetImm->getValue()));
  }
@ -2567,6 +2715,11 @@ public:
    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
  }

+  void addMemNoOffsetTOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+  }
+
  void addMemPCRelImm12Operands(MCInst &Inst, unsigned N) const {
    assert(N == 1 && "Invalid number of operands!");
    int32_t Imm = Memory.OffsetImm->getValue();
@ -2808,19 +2961,17 @@ public:
    Inst.addOperand(MCOperand::createImm(Val));
  }

-  void addMemImm8OffsetOperands(MCInst &Inst, unsigned N) const {
+  void addMemImmOffsetOperands(MCInst &Inst, unsigned N) const {
    assert(N == 2 && "Invalid number of operands!");
    int64_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
    Inst.addOperand(MCOperand::createImm(Val));
  }

-  void addMemPosImm8OffsetOperands(MCInst &Inst, unsigned N) const {
-    addMemImm8OffsetOperands(Inst, N);
-  }
-
-  void addMemNegImm8OffsetOperands(MCInst &Inst, unsigned N) const {
-    addMemImm8OffsetOperands(Inst, N);
+  void addMemRegRQOffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(Memory.BaseRegNum));
+    Inst.addOperand(MCOperand::createReg(Memory.OffsetRegNum));
  }

  void addMemUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
@ -5657,6 +5808,8 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
    St = ARM_AM::ror;
  else if (ShiftName == "rrx" || ShiftName == "RRX")
    St = ARM_AM::rrx;
+  else if (ShiftName == "uxtw" || ShiftName == "UXTW")
+    St = ARM_AM::uxtw;
  else
    return Error(Loc, "illegal shift operator");
  Parser.Lex(); // Eat shift type token.
@ -6518,7 +6671,7 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,

  if (!Op2.isReg())
    return;
-  if (!Op3.isMem())
+  if (!Op3.isGPRMem())
    return;

  const MCRegisterClass &GPR = MRI->getRegClass(ARM::GPRRegClassID);
@ -7291,6 +7444,54 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
                   "destination register and base register can't be identical");
    return false;
  }
+
+  case ARM::MVE_VLDRBU8_rq:
+  case ARM::MVE_VLDRBU16_rq:
+  case ARM::MVE_VLDRBS16_rq:
+  case ARM::MVE_VLDRBU32_rq:
+  case ARM::MVE_VLDRBS32_rq:
+  case ARM::MVE_VLDRHU16_rq:
+  case ARM::MVE_VLDRHU16_rq_u:
+  case ARM::MVE_VLDRHU32_rq:
+  case ARM::MVE_VLDRHU32_rq_u:
+  case ARM::MVE_VLDRHS32_rq:
+  case ARM::MVE_VLDRHS32_rq_u:
+  case ARM::MVE_VLDRWU32_rq:
+  case ARM::MVE_VLDRWU32_rq_u:
+  case ARM::MVE_VLDRDU64_rq:
+  case ARM::MVE_VLDRDU64_rq_u:
+  case ARM::MVE_VLDRWU32_qi:
+  case ARM::MVE_VLDRWU32_qi_pre:
+  case ARM::MVE_VLDRDU64_qi:
+  case ARM::MVE_VLDRDU64_qi_pre: {
+    // Qd must be different from Qm.
+    unsigned QdIdx = 0, QmIdx = 2;
+    bool QmIsPointer = false;
+    switch (Opcode) {
+    case ARM::MVE_VLDRWU32_qi:
+    case ARM::MVE_VLDRDU64_qi:
+      QmIdx = 1;
+      QmIsPointer = true;
+      break;
+    case ARM::MVE_VLDRWU32_qi_pre:
+    case ARM::MVE_VLDRDU64_qi_pre:
+      QdIdx = 1;
+      QmIsPointer = true;
+      break;
+    }
+
+    const unsigned Qd = MRI->getEncodingValue(Inst.getOperand(QdIdx).getReg());
+    const unsigned Qm = MRI->getEncodingValue(Inst.getOperand(QmIdx).getReg());
+
+    if (Qd == Qm) {
+      return Error(Operands[3]->getStartLoc(),
+                   Twine("destination vector register and vector ") +
+                   (QmIsPointer ? "pointer" : "offset") +
+                   " register can't be identical");
+    }
+    return false;
+  }
+
  case ARM::SBFX:
  case ARM::t2SBFX:
  case ARM::UBFX:
@ -10042,9 +10243,23 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
  for (unsigned I = 0; I < MCID.NumOperands; ++I)
    if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
      // rGPRRegClass excludes PC, and also excluded SP before ARMv8
-      if ((Inst.getOperand(I).getReg() == ARM::SP) && !hasV8Ops())
+      const auto &Op = Inst.getOperand(I);
+      if (!Op.isReg()) {
+        // This can happen in awkward cases with tied operands, e.g. a
+        // writeback load/store with a complex addressing mode in
+        // which there's an output operand corresponding to the
+        // updated written-back base register: the Tablegen-generated
+        // AsmMatcher will have written a placeholder operand to that
+        // slot in the form of an immediate 0, because it can't
+        // generate the register part of the complex addressing-mode
+        // operand ahead of time.
+        continue;
+      }
+
+      unsigned Reg = Op.getReg();
+      if ((Reg == ARM::SP) && !hasV8Ops())
        return Match_RequiresV8;
-      else if (Inst.getOperand(I).getReg() == ARM::PC)
+      else if (Reg == ARM::PC)
        return Match_InvalidOperand;
    }

--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@ -332,6 +332,11 @@ static DecodeStatus DecodeTBLInstruction(MCInst &Inst, unsigned Insn,
                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
                               uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeCoprocessor(MCInst &Inst, unsigned Insn,
                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeMemBarrierOption(MCInst &Inst, unsigned Insn,
@ -428,8 +433,17 @@ static DecodeStatus DecodeT2AddrModeImm0_1020s4(MCInst &Inst,unsigned Val,
                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
                               uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
                               uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
+template<int shift, int WriteBack>
+static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
+                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeThumbAddSPImm(MCInst &Inst, uint16_t Val,
                               uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeThumbAddSPReg(MCInst &Inst, uint16_t Insn,
@ -509,6 +523,15 @@ template<bool Writeback>
 static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Insn,
                                          uint64_t Address,
                                          const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
+template<int shift>
+static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder);
 template<unsigned MinLog, unsigned MaxLog>
 static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
                                          uint64_t Address,
@ -4138,6 +4161,21 @@ static DecodeStatus DecodeT2Imm8(MCInst &Inst, unsigned Val,
  return MCDisassembler::Success;
 }

+template<int shift>
+static DecodeStatus DecodeT2Imm7(MCInst &Inst, unsigned Val,
+                         uint64_t Address, const void *Decoder) {
+  int imm = Val & 0x7F;
+  if (Val == 0)
+    imm = INT32_MIN;
+  else if (!(Val & 0x80))
+    imm *= -1;
+  if (imm != INT32_MIN)
+    imm <<= shift;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
                                 uint64_t Address, const void *Decoder) {
  DecodeStatus S = MCDisassembler::Success;
@ -4184,6 +4222,42 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
  return S;
 }

+template<int shift>
+static DecodeStatus DecodeTAddrModeImm7(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 8, 3);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+
+  if (!Check(S, DecodetGPRRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm7<shift>(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template<int shift, int WriteBack>
+static DecodeStatus DecodeT2AddrModeImm7(MCInst &Inst, unsigned Val,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Val, 8, 4);
+  unsigned imm = fieldFromInstruction(Val, 0, 8);
+  if (WriteBack) {
+    if (!Check(S, DecoderGPRRegisterClass(Inst, Rn, Address, Decoder)))
+      return MCDisassembler::Fail;
+  } else if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2Imm7<shift>(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
 static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
                                    uint64_t Address, const void *Decoder) {
  DecodeStatus S = MCDisassembler::Success;
@ -4331,6 +4405,43 @@ static DecodeStatus DecodePostIdxReg(MCInst &Inst, unsigned Insn,
  return S;
 }

+static DecodeStatus DecodeMveAddrModeRQ(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Rn = fieldFromInstruction(Insn, 3, 4);
+  unsigned Qm = fieldFromInstruction(Insn, 0, 3);
+
+  if (!Check(S, DecodeGPRnopcRegisterClass(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template<int shift>
+static DecodeStatus DecodeMveAddrModeQ(MCInst &Inst, unsigned Insn,
+                             uint64_t Address, const void *Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+  unsigned Qm = fieldFromInstruction(Insn, 8, 3);
+  int imm = fieldFromInstruction(Insn, 0, 7);
+
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qm, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  if(!fieldFromInstruction(Insn, 7, 1)) {
+    if (imm == 0)
+      imm = INT32_MIN;                 // indicate -0
+    else
+      imm *= -1;
+  }
+  if (imm != INT32_MIN)
+    imm <<= shift;
+  Inst.addOperand(MCOperand::createImm(imm));
+
+  return S;
+}
+
 static DecodeStatus DecodeThumbBLXOffset(MCInst &Inst, unsigned Val,
                                 uint64_t Address, const void *Decoder) {
  // Val is passed in as S:J1:J2:imm10H:imm10L:'0'
@ -6175,6 +6286,52 @@ static DecodeStatus DecodeVSTRVLDR_SYSREG(MCInst &Inst, unsigned Val,
  return S;
 }

+static inline DecodeStatus DecodeMVE_MEM_pre(
+  MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder,
+  unsigned Rn, OperandDecoder RnDecoder, OperandDecoder AddrDecoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Qd = fieldFromInstruction(Val, 13, 3);
+  unsigned addr = fieldFromInstruction(Val, 0, 7) |
+                  (fieldFromInstruction(Val, 23, 1) << 7) | (Rn << 8);
+
+  if (!Check(S, RnDecoder(Inst, Rn, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeMQPRRegisterClass(Inst, Qd, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, AddrDecoder(Inst, addr, Address, Decoder)))
+    return MCDisassembler::Fail;
+
+  return S;
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_1_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 16, 3),
+                           DecodetGPRRegisterClass,
+                           DecodeTAddrModeImm7<shift>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_2_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 16, 4),
+                           DecoderGPRRegisterClass,
+                           DecodeT2AddrModeImm7<shift,1>);
+}
+
+template <int shift>
+static DecodeStatus DecodeMVE_MEM_3_pre(MCInst &Inst, unsigned Val,
+                                        uint64_t Address, const void *Decoder) {
+  return DecodeMVE_MEM_pre(Inst, Val, Address, Decoder,
+                           fieldFromInstruction(Val, 17, 3),
+                           DecodeMQPRRegisterClass,
+                           DecodeMveAddrModeQ<shift>);
+}
+
 template<unsigned MinLog, unsigned MaxLog>
 static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
                                          uint64_t Address,
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAddressingModes.h
@ -30,7 +30,8 @@ namespace ARM_AM {
    lsl,
    lsr,
    ror,
-    rrx
+    rrx,
+    uxtw
  };

  enum AddrOpc {
@ -48,6 +49,7 @@ namespace ARM_AM {
    case ARM_AM::lsr: return "lsr";
    case ARM_AM::ror: return "ror";
    case ARM_AM::rrx: return "rrx";
+    case ARM_AM::uxtw: return "uxtw";
    }
  }

--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@ -203,6 +203,8 @@ namespace ARMII {
    AddrMode5FP16   = 17,  // i8 * 2
    AddrModeT2_ldrex = 18, // i8 * 4, with unscaled offset in MCInst
    AddrModeT2_i7s4 = 19, // i7 * 4
+    AddrModeT2_i7s2 = 20, // i7 * 2
+    AddrModeT2_i7   = 21, // i7 * 1
  };

  inline static const char *AddrModeToString(AddrMode addrmode) {
@ -227,6 +229,8 @@ namespace ARMII {
    case AddrMode_i12:    return "AddrMode_i12";
    case AddrModeT2_ldrex:return "AddrModeT2_ldrex";
    case AddrModeT2_i7s4: return "AddrModeT2_i7s4";
+    case AddrModeT2_i7s2: return "AddrModeT2_i7s2";
+    case AddrModeT2_i7:   return "AddrModeT2_i7";
    }
  }

--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@ -603,6 +603,40 @@ void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, unsigned OpNum,
    << markup(">");
 }

+template<int shift>
+void ARMInstPrinter::printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+  O << ", ";
+  printRegName(O, MO2.getReg());
+
+  if (shift > 0)
+    printRegImmShift(O, ARM_AM::uxtw, shift, UseMarkup);
+
+  O << "]" << markup(">");
+}
+
+void ARMInstPrinter::printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
+                                               const MCSubtargetInfo &STI,
+                                               raw_ostream &O) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum + 1);
+
+  O << markup("<mem:") << "[";
+  printRegName(O, MO1.getReg());
+
+  int64_t Imm = MO2.getImm();
+  if (Imm != 0)
+    O << ", " << markup("<imm:") << '#' << Imm << markup(">");
+
+  O << "]" << markup(">");
+}
+
 void ARMInstPrinter::printLdStmModeOperand(const MCInst *MI, unsigned OpNum,
                                           const MCSubtargetInfo &STI,
                                           raw_ostream &O) {
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@ -255,6 +255,11 @@ public:
                                raw_ostream &O);
  void printVPTMask(const MCInst *MI, unsigned OpNum,
                    const MCSubtargetInfo &STI, raw_ostream &O);
+  template<int shift>
+  void printMveAddrModeRQOperand(const MCInst *MI, unsigned OpNum,
+                                 const MCSubtargetInfo &STI, raw_ostream &O);
+  void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
+                                const MCSubtargetInfo &STI, raw_ostream &O);
  void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
                               const MCSubtargetInfo &STI, raw_ostream &O);

--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@ -208,6 +208,19 @@ public:
                                 SmallVectorImpl<MCFixup> &Fixups,
                                 const MCSubtargetInfo &STI) const;

+  /// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg'
+  /// operand.
+  uint32_t getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  /// getMveAddrModeQOpValue - Return encoding info for 'reg +/- imm7<<{shift}'
+  /// operand.
+  template<int shift>
+  uint32_t getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
  /// getLdStSORegOpValue - Return encoding info for 'reg +/- reg shop imm'
  /// operand as needed by load/store instructions.
  uint32_t getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
@ -238,8 +251,10 @@ public:
    case ARM_AM::asr: return 2;
    case ARM_AM::ror:
    case ARM_AM::rrx: return 3;
+    case ARM_AM::uxtw:
+    default:
+      llvm_unreachable("Invalid ShiftOpc!");
    }
-    llvm_unreachable("Invalid ShiftOpc!");
  }

  /// getAddrMode2OffsetOpValue - Return encoding for am2offset operands.
@ -338,7 +353,8 @@ public:
  unsigned getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
    SmallVectorImpl<MCFixup> &Fixups,
    const MCSubtargetInfo &STI) const;
-  unsigned getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
+  template<unsigned Bits, unsigned Shift>
+  unsigned getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum,
    SmallVectorImpl<MCFixup> &Fixups,
    const MCSubtargetInfo &STI) const;
  unsigned getT2AddrModeImm8OffsetOpValue(const MCInst &MI, unsigned OpNum,
@ -1040,6 +1056,58 @@ getT2ScaledImmOpValue(const MCInst &MI, unsigned OpIdx,
    Binary |= (1U << Bits);
  return Binary;
 }
+
+/// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg'
+/// operand.
+uint32_t ARMMCCodeEmitter::
+getMveAddrModeRQOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+    // {6-3} Rn
+    // {2-0} Qm
+    const MCOperand &M0 = MI.getOperand(OpIdx);
+    const MCOperand &M1 = MI.getOperand(OpIdx + 1);
+
+    unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(M0.getReg());
+    unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M1.getReg());
+
+    assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits");
+
+    return (Rn << 3) | Qm;
+}
+
+/// getMveAddrModeRQOpValue - Return encoding info for 'reg, vreg'
+/// operand.
+template<int shift>
+uint32_t ARMMCCodeEmitter::
+getMveAddrModeQOpValue(const MCInst &MI, unsigned OpIdx,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
+    // {10-8} Qm
+    // {7-0} Imm
+    const MCOperand &M0 = MI.getOperand(OpIdx);
+    const MCOperand &M1 = MI.getOperand(OpIdx + 1);
+
+    unsigned Qm = CTX.getRegisterInfo()->getEncodingValue(M0.getReg());
+    int32_t Imm = M1.getImm();
+
+    bool isAdd = Imm >= 0;
+
+    Imm >>= shift;
+
+    if (!isAdd)
+      Imm = -(uint32_t)Imm;
+
+    Imm &= 0x7f;
+
+    if (isAdd)
+      Imm |= 0x80;
+
+    assert(Qm < 8 && "Qm is supposed to be encodable in 3 bits");
+
+    return (Qm << 8) | Imm;
+}
+
 /// getT2AddrModeImm8s4OpValue - Return encoding info for
 /// 'reg +/- imm8<<2' operand.
 uint32_t ARMMCCodeEmitter::
@ -1540,25 +1608,26 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
  return Value;
 }

+template<unsigned Bits, unsigned Shift>
 unsigned ARMMCCodeEmitter::
-getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
-                         SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const {
+getT2AddrModeImmOpValue(const MCInst &MI, unsigned OpNum,
+                        SmallVectorImpl<MCFixup> &Fixups,
+                        const MCSubtargetInfo &STI) const {
  const MCOperand &MO1 = MI.getOperand(OpNum);
  const MCOperand &MO2 = MI.getOperand(OpNum+1);

  // FIXME: Needs fixup support.
  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());

-  // Even though the immediate is 8 bits long, we need 9 bits in order
+  // If the immediate is B bits long, we need B+1 bits in order
  // to represent the (inverse of the) sign bit.
-  Value <<= 9;
-  int32_t tmp = (int32_t)MO2.getImm();
+  Value <<= (Bits + 1);
+  int32_t tmp = (int32_t)MO2.getImm() >> Shift;
  if (tmp < 0)
    tmp = abs(tmp);
  else
-    Value |= 256; // Set the ADD bit
-  Value |= tmp & 255;
+    Value |= (1U << Bits); // Set the ADD bit
+  Value |= tmp & ((1U << Bits) - 1);
  return Value;
 }

--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@ -610,17 +610,23 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
        Offset = -Offset;
        isSub = true;
      }
-    } else if (AddrMode == ARMII::AddrModeT2_i7s4) {
+    } else if (AddrMode == ARMII::AddrModeT2_i7s4 ||
+               AddrMode == ARMII::AddrModeT2_i7s2 ||
+               AddrMode == ARMII::AddrModeT2_i7) {
      Offset += MI.getOperand(FrameRegIdx + 1).getImm();
-      NumBits = 9; // 7 bits scaled by 4
-      unsigned OffsetMask = 0x3;
+      unsigned OffsetMask;
+      switch (AddrMode) {
+      case ARMII::AddrModeT2_i7s4: NumBits = 9; OffsetMask = 0x3; break;
+      case ARMII::AddrModeT2_i7s2: NumBits = 8; OffsetMask = 0x1; break;
+      default:                     NumBits = 7; OffsetMask = 0x0; break;
+      }
      // MCInst operand expects already scaled value.
      Scale = 1;
      assert((Offset & OffsetMask) == 0 && "Can't encode this offset!");
      (void)OffsetMask; // squash unused-variable warning at -NDEBUG
    } else if (AddrMode == ARMII::AddrModeT2_i8s4) {
      Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
-      NumBits = 10; // 8 bits scaled by 4
+      NumBits = 8 + 2;
      // MCInst operand expects already scaled value.
      Scale = 1;
      assert((Offset & 3) == 0 && "Can't encode this offset!");
--- a/llvm/test/MC/ARM/mve-load-store.s
+++ b/llvm/test/MC/ARM/mve-load-store.s
--- a/llvm/test/MC/Disassembler/ARM/mve-load-store.txt
+++ b/llvm/test/MC/Disassembler/ARM/mve-load-store.txt