[SKX] Extended non-temporal load/store instructions for AVX512VL subsets.

Added the avx512_movnt_vl multiclass to handle the 256-bit and 128-bit forms of the instructions.
Added encoding and lowering tests.

Reviewed by Elena Demikhovsky <elena.demikhovsky@intel.com>

llvm-svn: 215536
This commit is contained in:
Robert Khasanov 2014-08-13 10:46:00 +00:00
parent d97a634f12
commit ed8829703f
7 changed files with 390 additions and 35 deletions

View File

@ -1954,8 +1954,6 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
llvm_i32_ty], [IntrNoMem, Commutative]>;
def int_x86_avx2_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa256">,
Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}
//===----------------------------------------------------------------------===//
@ -3219,6 +3217,8 @@ let TargetPrefix = "x86" in {
Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty,
llvm_v8i64_ty, llvm_i8_ty],
[IntrNoMem]>;
def int_x86_avx512_movntdqa : GCCBuiltin<"__builtin_ia32_movntdqa512">,
Intrinsic<[llvm_v8i64_ty], [llvm_ptr_ty], [IntrReadMem]>;
}
//===----------------------------------------------------------------------===//

View File

@ -2090,43 +2090,73 @@ def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
//===----------------------------------------------------------------------===//
// AVX-512 - Non-temporals
//===----------------------------------------------------------------------===//
let SchedRW = [WriteLoad] in {
def VMOVNTDQAZrm : AVX512PI<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src), "vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR512:$dst, (int_x86_avx512_movntdqa addr:$src))],
SSEPackedInt>, EVEX, T8PD, EVEX_V512,
EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZrm : AVX5128I<0x2A, MRMSrcMem, (outs VR512:$dst),
(ins i512mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR512:$dst,
(int_x86_avx512_movntdqa addr:$src))]>,
EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
let Predicates = [HasAVX512, HasVLX] in {
def VMOVNTDQAZ256rm : AVX512PI<0x2A, MRMSrcMem, (outs VR256X:$dst),
(ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}", [],
SSEPackedInt>, EVEX, T8PD, EVEX_V256,
EVEX_CD8<64, CD8VF>;
// Prefer non-temporal over temporal versions
let AddedComplexity = 400, SchedRW = [WriteStore] in {
def VMOVNTPSZmr : AVX512PSI<0x2B, MRMDestMem, (outs),
(ins f512mem:$dst, VR512:$src),
"vmovntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v16f32 VR512:$src),
addr:$dst)],
IIC_SSE_MOVNT>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VF>;
def VMOVNTPDZmr : AVX512PDI<0x2B, MRMDestMem, (outs),
(ins f512mem:$dst, VR512:$src),
"vmovntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f64 VR512:$src),
addr:$dst)],
IIC_SSE_MOVNT>,
EVEX, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
def VMOVNTDQZmr : AVX512BI<0xE7, MRMDestMem, (outs),
(ins i512mem:$dst, VR512:$src),
"vmovntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8i64 VR512:$src),
addr:$dst)],
IIC_SSE_MOVNT>,
EVEX, EVEX_V512, EVEX_CD8<64, CD8VF>;
def VMOVNTDQAZ128rm : AVX512PI<0x2A, MRMSrcMem, (outs VR128X:$dst),
(ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}", [],
SSEPackedInt>, EVEX, T8PD, EVEX_V128,
EVEX_CD8<64, CD8VF>;
}
}
// avx512_movnt - Defines one non-temporal store record, suffixed "mr".
//
//   opc       - opcode byte passed to AVX512PI.
//   OpcodeStr - mnemonic; the operand string is appended to it below.
//   st_frag   - store pattern fragment used to build the selection pattern.
//   OpVT      - value type of the stored register operand in the pattern.
//   RC        - register class of the $src operand.
//   memop     - memory operand type of the $dst operand.
//   d         - execution domain forwarded to AVX512PI.
//   itin      - itinerary class, defaulting to IIC_SSE_MOVNT.
multiclass avx512_movnt<bits<8> opc, string OpcodeStr, PatFrag st_frag,
ValueType OpVT, RegisterClass RC, X86MemOperand memop,
Domain d, InstrItinClass itin = IIC_SSE_MOVNT> {
// AddedComplexity = 400 makes the selector prefer the non-temporal store
// over the ordinary (temporal) store patterns.
let SchedRW = [WriteStore], mayStore = 1,
AddedComplexity = 400 in
// Memory-destination form: no outputs, stores RC:$src to memop:$dst.
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins memop:$dst, RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(st_frag (OpVT RC:$src), addr:$dst)], d, itin>, EVEX;
}
// avx512_movnt_vl - Instantiates avx512_movnt for the 512-, 256- and 128-bit
// vector lengths (sub-names Z, Z256 and Z128).
//
// The value type and memory operand of each variant are looked up by name,
// built by string pasting:
//   value type  = "v" ## <element count> ## elty ## elsz
//                 (e.g. elty "f", elsz "32", vsz512 "16" gives "v16f32")
//   mem operand = elty ## "512mem" / "256mem" / "128mem"
//                 (e.g. elty "f" gives "f512mem")
//
//   elty   - element kind letter used in both names above.
//   elsz   - element size in bits, as a string.
//   vsz512 / vsz256 / vsz128 - element count for each vector length.
//   prd    - predicate required by every variant.
multiclass avx512_movnt_vl<bits<8> opc, string OpcodeStr, PatFrag st_frag,
string elty, string elsz, string vsz512,
string vsz256, string vsz128, Domain d,
Predicate prd, InstrItinClass itin = IIC_SSE_MOVNT> {
// 512-bit variant: needs only the base predicate.
let Predicates = [prd] in
defm Z : avx512_movnt<opc, OpcodeStr, st_frag,
!cast<ValueType>("v"##vsz512##elty##elsz), VR512,
!cast<X86MemOperand>(elty##"512mem"), d, itin>,
EVEX_V512;
// 256- and 128-bit variants additionally require HasVLX.
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_movnt<opc, OpcodeStr, st_frag,
!cast<ValueType>("v"##vsz256##elty##elsz), VR256X,
!cast<X86MemOperand>(elty##"256mem"), d, itin>,
EVEX_V256;
defm Z128 : avx512_movnt<opc, OpcodeStr, st_frag,
!cast<ValueType>("v"##vsz128##elty##elsz), VR128X,
!cast<X86MemOperand>(elty##"128mem"), d, itin>,
EVEX_V128;
}
}
// Non-temporal store instantiations. Each defm expands to Z, Z256 and Z128
// variants through avx512_movnt_vl; all three use alignednontemporalstore as
// the store fragment and HasAVX512 as the base predicate.

// vmovntdq (opcode 0xE7): 64-bit integer elements, i.e. v8i64 / v4i64 / v2i64.
defm VMOVNTDQ : avx512_movnt_vl<0xE7, "vmovntdq", alignednontemporalstore,
"i", "64", "8", "4", "2", SSEPackedInt,
HasAVX512>, PD, EVEX_CD8<64, CD8VF>;
// vmovntpd (opcode 0x2B, VEX_W set): 64-bit FP elements, v8f64 / v4f64 / v2f64.
defm VMOVNTPD : avx512_movnt_vl<0x2B, "vmovntpd", alignednontemporalstore,
"f", "64", "8", "4", "2", SSEPackedDouble,
HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>;
// vmovntps (opcode 0x2B, PS prefix): 32-bit FP elements, v16f32 / v8f32 / v4f32.
defm VMOVNTPS : avx512_movnt_vl<0x2B, "vmovntps", alignednontemporalstore,
"f", "32", "16", "8", "4", SSEPackedSingle,
HasAVX512>, PS, EVEX_CD8<32, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - Integer arithmetic
//

View File

@ -727,6 +727,7 @@ def HasDQI : Predicate<"Subtarget->hasDQI()">;
def HasBWI : Predicate<"Subtarget->hasBWI()">;
// True when the subtarget reports VLX support; also usable as an assembler
// predicate tied to FeatureVLX.
def HasVLX : Predicate<"Subtarget->hasVLX()">,
AssemblerPredicate<"FeatureVLX", "AVX-512 VLX ISA">;
// Negation of the HasVLX codegen condition. Unlike HasVLX it carries no
// AssemblerPredicate, so it only affects instruction selection.
def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;

View File

@ -3697,6 +3697,7 @@ let Predicates = [UseSSE1] in {
let AddedComplexity = 400 in { // Prefer non-temporal versions
let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
@ -3737,6 +3738,7 @@ def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
IIC_SSE_MOVNT>, VEX, VEX_L;
}
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",

View File

@ -0,0 +1,34 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx --show-mc-encoding | FileCheck %s
; 256-bit lowering test: three stores tagged with !nontemporal metadata
; (<8 x float>, <4 x i64>, <4 x double>) through the same pointer %B.
; Each store is expected to become a vmovntps / vmovntdq / vmovntpd on a ymm
; register whose encoding starts with byte 0x62.
; NOTE(review): the fadd/add before each store presumably keeps the stored
; value in a register of the matching type - confirm.
define void @f256(<8 x float> %A, <8 x float> %AA, i8* %B, <4 x double> %C, <4 x double> %CC, i32 %D, <4 x i64> %E, <4 x i64> %EE) {
; CHECK: vmovntps %ymm{{.*}} ## encoding: [0x62
%cast = bitcast i8* %B to <8 x float>*
%A2 = fadd <8 x float> %A, %AA
store <8 x float> %A2, <8 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %ymm{{.*}} ## encoding: [0x62
%cast1 = bitcast i8* %B to <4 x i64>*
%E2 = add <4 x i64> %E, %EE
store <4 x i64> %E2, <4 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %ymm{{.*}} ## encoding: [0x62
%cast2 = bitcast i8* %B to <4 x double>*
%C2 = fadd <4 x double> %C, %CC
store <4 x double> %C2, <4 x double>* %cast2, align 64, !nontemporal !0
ret void
}
; 128-bit lowering test: three stores tagged with !nontemporal metadata
; (<4 x float>, <2 x i64>, <2 x double>) through the same pointer %B.
; Each store is expected to become a vmovntps / vmovntdq / vmovntpd on an xmm
; register whose encoding starts with byte 0x62.
; NOTE(review): the fadd/add before each store presumably keeps the stored
; value in a register of the matching type - confirm.
define void @f128(<4 x float> %A, <4 x float> %AA, i8* %B, <2 x double> %C, <2 x double> %CC, i32 %D, <2 x i64> %E, <2 x i64> %EE) {
; CHECK: vmovntps %xmm{{.*}} ## encoding: [0x62
%cast = bitcast i8* %B to <4 x float>*
%A2 = fadd <4 x float> %A, %AA
store <4 x float> %A2, <4 x float>* %cast, align 64, !nontemporal !0
; CHECK: vmovntdq %xmm{{.*}} ## encoding: [0x62
%cast1 = bitcast i8* %B to <2 x i64>*
%E2 = add <2 x i64> %E, %EE
store <2 x i64> %E2, <2 x i64>* %cast1, align 64, !nontemporal !0
; CHECK: vmovntpd %xmm{{.*}} ## encoding: [0x62
%cast2 = bitcast i8* %B to <2 x double>*
%C2 = fadd <2 x double> %C, %CC
store <2 x double> %C2, <2 x double>* %cast2, align 64, !nontemporal !0
ret void
}
!0 = metadata !{i32 1}

View File

@ -665,6 +665,102 @@
// CHECK: encoding: [0x62,0xf1,0xfe,0x48,0x6f,0xb2,0xc0,0xdf,0xff,0xff]
vmovdqu64 -8256(%rdx), %zmm6
// vmovntdq, 512-bit store form (zmm source, memory destination).
// The displacement cases bracket the one-byte range: 8128 and -8192 encode
// as a single byte (0x7f / 0x80, the offset divided by 64), while 8192 and
// -8256 need a four-byte displacement.
// CHECK: vmovntdq %zmm24, (%rcx)
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x01]
vmovntdq %zmm24, (%rcx)
// CHECK: vmovntdq %zmm24, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0x21,0x7d,0x48,0xe7,0x84,0xf0,0x23,0x01,0x00,0x00]
vmovntdq %zmm24, 291(%rax,%r14,8)
// CHECK: vmovntdq %zmm24, 8128(%rdx)
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x7f]
vmovntdq %zmm24, 8128(%rdx)
// CHECK: vmovntdq %zmm24, 8192(%rdx)
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0x00,0x20,0x00,0x00]
vmovntdq %zmm24, 8192(%rdx)
// CHECK: vmovntdq %zmm24, -8192(%rdx)
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x42,0x80]
vmovntdq %zmm24, -8192(%rdx)
// CHECK: vmovntdq %zmm24, -8256(%rdx)
// CHECK: encoding: [0x62,0x61,0x7d,0x48,0xe7,0x82,0xc0,0xdf,0xff,0xff]
vmovntdq %zmm24, -8256(%rdx)
// vmovntdqa, 512-bit load form (memory source, zmm destination).
// The displacement cases bracket the one-byte range: 8128 and -8192 encode
// as a single byte (0x7f / 0x80, the offset divided by 64), while 8192 and
// -8256 need a four-byte displacement.
// CHECK: vmovntdqa (%rcx), %zmm17
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x09]
vmovntdqa (%rcx), %zmm17
// CHECK: vmovntdqa 291(%rax,%r14,8), %zmm17
// CHECK: encoding: [0x62,0xa2,0x7d,0x48,0x2a,0x8c,0xf0,0x23,0x01,0x00,0x00]
vmovntdqa 291(%rax,%r14,8), %zmm17
// CHECK: vmovntdqa 8128(%rdx), %zmm17
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x7f]
vmovntdqa 8128(%rdx), %zmm17
// CHECK: vmovntdqa 8192(%rdx), %zmm17
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0x00,0x20,0x00,0x00]
vmovntdqa 8192(%rdx), %zmm17
// CHECK: vmovntdqa -8192(%rdx), %zmm17
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x4a,0x80]
vmovntdqa -8192(%rdx), %zmm17
// CHECK: vmovntdqa -8256(%rdx), %zmm17
// CHECK: encoding: [0x62,0xe2,0x7d,0x48,0x2a,0x8a,0xc0,0xdf,0xff,0xff]
vmovntdqa -8256(%rdx), %zmm17
// vmovntpd, 512-bit store form (zmm source, memory destination).
// The displacement cases bracket the one-byte range: 8128 and -8192 encode
// as a single byte (0x7f / 0x80, the offset divided by 64), while 8192 and
// -8256 need a four-byte displacement.
// CHECK: vmovntpd %zmm17, (%rcx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x09]
vmovntpd %zmm17, (%rcx)
// CHECK: vmovntpd %zmm17, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0xa1,0xfd,0x48,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00]
vmovntpd %zmm17, 291(%rax,%r14,8)
// CHECK: vmovntpd %zmm17, 8128(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x7f]
vmovntpd %zmm17, 8128(%rdx)
// CHECK: vmovntpd %zmm17, 8192(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0x00,0x20,0x00,0x00]
vmovntpd %zmm17, 8192(%rdx)
// CHECK: vmovntpd %zmm17, -8192(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x4a,0x80]
vmovntpd %zmm17, -8192(%rdx)
// CHECK: vmovntpd %zmm17, -8256(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x48,0x2b,0x8a,0xc0,0xdf,0xff,0xff]
vmovntpd %zmm17, -8256(%rdx)
// vmovntps, 512-bit store form (zmm source, memory destination).
// The displacement cases bracket the one-byte range: 8128 and -8192 encode
// as a single byte (0x7f / 0x80, the offset divided by 64), while 8192 and
// -8256 need a four-byte displacement.
// CHECK: vmovntps %zmm5, (%rcx)
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x29]
vmovntps %zmm5, (%rcx)
// CHECK: vmovntps %zmm5, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0xb1,0x7c,0x48,0x2b,0xac,0xf0,0x23,0x01,0x00,0x00]
vmovntps %zmm5, 291(%rax,%r14,8)
// CHECK: vmovntps %zmm5, 8128(%rdx)
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x7f]
vmovntps %zmm5, 8128(%rdx)
// CHECK: vmovntps %zmm5, 8192(%rdx)
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0x00,0x20,0x00,0x00]
vmovntps %zmm5, 8192(%rdx)
// CHECK: vmovntps %zmm5, -8192(%rdx)
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0x6a,0x80]
vmovntps %zmm5, -8192(%rdx)
// CHECK: vmovntps %zmm5, -8256(%rdx)
// CHECK: encoding: [0x62,0xf1,0x7c,0x48,0x2b,0xaa,0xc0,0xdf,0xff,0xff]
vmovntps %zmm5, -8256(%rdx)
// CHECK: vmovupd %zmm9, %zmm27
// CHECK: encoding: [0x62,0x41,0xfd,0x48,0x10,0xd9]
vmovupd %zmm9, %zmm27

View File

@ -432,6 +432,198 @@
// CHECK: encoding: [0x62,0x61,0xfe,0x28,0x6f,0xaa,0xe0,0xef,0xff,0xff]
vmovdqu64 -4128(%rdx), %ymm29
// vmovntdq, 128-bit store form (xmm source, memory destination).
// 2032 and -2048 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 16); 2048 and -2064 need a four-byte displacement.
// CHECK: vmovntdq %xmm22, (%rcx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x31]
vmovntdq %xmm22, (%rcx)
// CHECK: vmovntdq %xmm22, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0xa1,0x7d,0x08,0xe7,0xb4,0xf0,0x23,0x01,0x00,0x00]
vmovntdq %xmm22, 291(%rax,%r14,8)
// CHECK: vmovntdq %xmm22, 2032(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x7f]
vmovntdq %xmm22, 2032(%rdx)
// CHECK: vmovntdq %xmm22, 2048(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0x00,0x08,0x00,0x00]
vmovntdq %xmm22, 2048(%rdx)
// CHECK: vmovntdq %xmm22, -2048(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0x72,0x80]
vmovntdq %xmm22, -2048(%rdx)
// CHECK: vmovntdq %xmm22, -2064(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x08,0xe7,0xb2,0xf0,0xf7,0xff,0xff]
vmovntdq %xmm22, -2064(%rdx)
// vmovntdq, 256-bit store form (ymm source, memory destination).
// 4064 and -4096 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 32); 4096 and -4128 need a four-byte displacement.
// CHECK: vmovntdq %ymm19, (%rcx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x19]
vmovntdq %ymm19, (%rcx)
// CHECK: vmovntdq %ymm19, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0xa1,0x7d,0x28,0xe7,0x9c,0xf0,0x23,0x01,0x00,0x00]
vmovntdq %ymm19, 291(%rax,%r14,8)
// CHECK: vmovntdq %ymm19, 4064(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x7f]
vmovntdq %ymm19, 4064(%rdx)
// CHECK: vmovntdq %ymm19, 4096(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0x00,0x10,0x00,0x00]
vmovntdq %ymm19, 4096(%rdx)
// CHECK: vmovntdq %ymm19, -4096(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x5a,0x80]
vmovntdq %ymm19, -4096(%rdx)
// CHECK: vmovntdq %ymm19, -4128(%rdx)
// CHECK: encoding: [0x62,0xe1,0x7d,0x28,0xe7,0x9a,0xe0,0xef,0xff,0xff]
vmovntdq %ymm19, -4128(%rdx)
// vmovntdqa, 128-bit load form (memory source, xmm destination).
// 2032 and -2048 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 16); 2048 and -2064 need a four-byte displacement.
// CHECK: vmovntdqa (%rcx), %xmm24
// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x01]
vmovntdqa (%rcx), %xmm24
// CHECK: vmovntdqa 291(%rax,%r14,8), %xmm24
// CHECK: encoding: [0x62,0x22,0x7d,0x08,0x2a,0x84,0xf0,0x23,0x01,0x00,0x00]
vmovntdqa 291(%rax,%r14,8), %xmm24
// CHECK: vmovntdqa 2032(%rdx), %xmm24
// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x7f]
vmovntdqa 2032(%rdx), %xmm24
// CHECK: vmovntdqa 2048(%rdx), %xmm24
// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0x00,0x08,0x00,0x00]
vmovntdqa 2048(%rdx), %xmm24
// CHECK: vmovntdqa -2048(%rdx), %xmm24
// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x42,0x80]
vmovntdqa -2048(%rdx), %xmm24
// CHECK: vmovntdqa -2064(%rdx), %xmm24
// CHECK: encoding: [0x62,0x62,0x7d,0x08,0x2a,0x82,0xf0,0xf7,0xff,0xff]
vmovntdqa -2064(%rdx), %xmm24
// vmovntdqa, 256-bit load form (memory source, ymm destination).
// 4064 and -4096 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 32); 4096 and -4128 need a four-byte displacement.
// CHECK: vmovntdqa (%rcx), %ymm28
// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x21]
vmovntdqa (%rcx), %ymm28
// CHECK: vmovntdqa 291(%rax,%r14,8), %ymm28
// CHECK: encoding: [0x62,0x22,0x7d,0x28,0x2a,0xa4,0xf0,0x23,0x01,0x00,0x00]
vmovntdqa 291(%rax,%r14,8), %ymm28
// CHECK: vmovntdqa 4064(%rdx), %ymm28
// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x7f]
vmovntdqa 4064(%rdx), %ymm28
// CHECK: vmovntdqa 4096(%rdx), %ymm28
// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0x00,0x10,0x00,0x00]
vmovntdqa 4096(%rdx), %ymm28
// CHECK: vmovntdqa -4096(%rdx), %ymm28
// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0x62,0x80]
vmovntdqa -4096(%rdx), %ymm28
// CHECK: vmovntdqa -4128(%rdx), %ymm28
// CHECK: encoding: [0x62,0x62,0x7d,0x28,0x2a,0xa2,0xe0,0xef,0xff,0xff]
vmovntdqa -4128(%rdx), %ymm28
// vmovntpd, 128-bit store form (xmm source, memory destination).
// 2032 and -2048 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 16); 2048 and -2064 need a four-byte displacement.
// CHECK: vmovntpd %xmm17, (%rcx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x09]
vmovntpd %xmm17, (%rcx)
// CHECK: vmovntpd %xmm17, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0xa1,0xfd,0x08,0x2b,0x8c,0xf0,0x23,0x01,0x00,0x00]
vmovntpd %xmm17, 291(%rax,%r14,8)
// CHECK: vmovntpd %xmm17, 2032(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x7f]
vmovntpd %xmm17, 2032(%rdx)
// CHECK: vmovntpd %xmm17, 2048(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0x00,0x08,0x00,0x00]
vmovntpd %xmm17, 2048(%rdx)
// CHECK: vmovntpd %xmm17, -2048(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x4a,0x80]
vmovntpd %xmm17, -2048(%rdx)
// CHECK: vmovntpd %xmm17, -2064(%rdx)
// CHECK: encoding: [0x62,0xe1,0xfd,0x08,0x2b,0x8a,0xf0,0xf7,0xff,0xff]
vmovntpd %xmm17, -2064(%rdx)
// vmovntpd, 256-bit store form (ymm source, memory destination).
// 4064 and -4096 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 32); 4096 and -4128 need a four-byte displacement.
// CHECK: vmovntpd %ymm27, (%rcx)
// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x19]
vmovntpd %ymm27, (%rcx)
// CHECK: vmovntpd %ymm27, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0x21,0xfd,0x28,0x2b,0x9c,0xf0,0x23,0x01,0x00,0x00]
vmovntpd %ymm27, 291(%rax,%r14,8)
// CHECK: vmovntpd %ymm27, 4064(%rdx)
// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x7f]
vmovntpd %ymm27, 4064(%rdx)
// CHECK: vmovntpd %ymm27, 4096(%rdx)
// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0x00,0x10,0x00,0x00]
vmovntpd %ymm27, 4096(%rdx)
// CHECK: vmovntpd %ymm27, -4096(%rdx)
// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x5a,0x80]
vmovntpd %ymm27, -4096(%rdx)
// CHECK: vmovntpd %ymm27, -4128(%rdx)
// CHECK: encoding: [0x62,0x61,0xfd,0x28,0x2b,0x9a,0xe0,0xef,0xff,0xff]
vmovntpd %ymm27, -4128(%rdx)
// vmovntps, 128-bit store form (xmm source, memory destination).
// 2032 and -2048 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 16); 2048 and -2064 need a four-byte displacement.
// CHECK: vmovntps %xmm26, (%rcx)
// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x11]
vmovntps %xmm26, (%rcx)
// CHECK: vmovntps %xmm26, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0x21,0x7c,0x08,0x2b,0x94,0xf0,0x23,0x01,0x00,0x00]
vmovntps %xmm26, 291(%rax,%r14,8)
// CHECK: vmovntps %xmm26, 2032(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x7f]
vmovntps %xmm26, 2032(%rdx)
// CHECK: vmovntps %xmm26, 2048(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0x00,0x08,0x00,0x00]
vmovntps %xmm26, 2048(%rdx)
// CHECK: vmovntps %xmm26, -2048(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x52,0x80]
vmovntps %xmm26, -2048(%rdx)
// CHECK: vmovntps %xmm26, -2064(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x08,0x2b,0x92,0xf0,0xf7,0xff,0xff]
vmovntps %xmm26, -2064(%rdx)
// vmovntps, 256-bit store form (ymm source, memory destination).
// 4064 and -4096 encode as a one-byte displacement (0x7f / 0x80, the offset
// divided by 32); 4096 and -4128 need a four-byte displacement.
// CHECK: vmovntps %ymm28, (%rcx)
// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x21]
vmovntps %ymm28, (%rcx)
// CHECK: vmovntps %ymm28, 291(%rax,%r14,8)
// CHECK: encoding: [0x62,0x21,0x7c,0x28,0x2b,0xa4,0xf0,0x23,0x01,0x00,0x00]
vmovntps %ymm28, 291(%rax,%r14,8)
// CHECK: vmovntps %ymm28, 4064(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x7f]
vmovntps %ymm28, 4064(%rdx)
// CHECK: vmovntps %ymm28, 4096(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0x00,0x10,0x00,0x00]
vmovntps %ymm28, 4096(%rdx)
// CHECK: vmovntps %ymm28, -4096(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0x62,0x80]
vmovntps %ymm28, -4096(%rdx)
// CHECK: vmovntps %ymm28, -4128(%rdx)
// CHECK: encoding: [0x62,0x61,0x7c,0x28,0x2b,0xa2,0xe0,0xef,0xff,0xff]
vmovntps %ymm28, -4128(%rdx)
// CHECK: vmovupd %xmm22, %xmm24
// CHECK: encoding: [0x62,0x21,0xfd,0x08,0x10,0xc6]
vmovupd %xmm22, %xmm24