[X86][SSE] Add shufps+shufps test for fold through commutation

As mentioned on D73023, lowerShuffleWithSHUFPS should be able to commute the shufps inputs to fold the second arg as it will then permute the shufps result anyway.
2020-01-24 11:16:16 +00:00 · 2020-01-24 11:16:16 +00:00 · e37cdbeeab
parent d4b092b341
commit e37cdbeeab
1 changed files with 28 additions and 0 deletions
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -2467,3 +2467,31 @@ define <4 x float> @shuffle_mem_v4f32_4523(<4 x float> %a, <4 x float>* %pb) {
  %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
  ret <4 x float> %shuffle
 }
+
+define  <4 x float> @shuffle_mem_v4f32_0624(<4 x float> %a0, <4 x float>* %a1) { ; shufps+shufps fold-through-commutation test, the vector loaded from %a1 is the FIRST shuffle operand and %a0 the second
+; SSE-LABEL: shuffle_mem_v4f32_0624:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movaps (%rdi), %xmm1
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[2,0]
+; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2,1,3]
+; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1OR2-LABEL: shuffle_mem_v4f32_0624:
+; AVX1OR2:       # %bb.0:
+; AVX1OR2-NEXT:    vmovaps (%rdi), %xmm1
+; AVX1OR2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,0]
+; AVX1OR2-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1OR2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_mem_v4f32_0624:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovaps (%rdi), %xmm2
+; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [0,6,2,4]
+; AVX512VL-NEXT:    vpermi2ps %xmm0, %xmm2, %xmm1
+; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+  %1 = load <4 x float>, <4 x float>* %a1 ; vector operand read from memory through %a1
+  %2 = shufflevector <4 x float> %1, <4 x float> %a0, <4 x i32> <i32 0, i32 6, i32 2, i32 4> ; mask indices 0-3 pick lanes of %1 (loaded) and 4-7 pick lanes of %a0, so %2 = <%1[0], %a0[2], %1[2], %a0[0]>
+  ret <4 x float> %2
+}