From 0c2a12cb3e5050ddab2b80fee3dc24b1794904d6 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 9 Apr 2018 17:07:40 +0000
Subject: [PATCH] [X86] Revert the SLM part of r328914.

While it appears to be correct information based on Intel's optimization
manual and Agner's data, it causes perf regressions on a couple of the
benchmarks in our internal list.

llvm-svn: 329593
---
 llvm/lib/Target/X86/X86ScheduleSLM.td   |  4 ++-
 llvm/test/CodeGen/X86/slow-pmulld.ll    | 42 +++++++++++++------------
 llvm/test/CodeGen/X86/sse41-schedule.ll |  4 +--
 3 files changed, 27 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 73eb257ad5ce..a712a188aa0a 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -145,7 +145,9 @@ defm : SLMWriteResPair<WriteVecShift, [SLM_FPC_RR],  1>;
 defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RR],  1>;
 defm : SLMWriteResPair<WriteVecALU,   [SLM_FPC_RR],  1>;
 defm : SLMWriteResPair<WriteVecIMul,  [SLM_FPC_RR],  4>;
-defm : SLMWriteResPair<WritePMULLD,   [SLM_FPC_RR], 11, [11]>;
+// FIXME: The below is closer to correct, but caused some perf regressions.
+//defm : SLMWriteResPair<WritePMULLD,  [SLM_FPC_RR], 11, [11]>;
+defm : SLMWriteResPair<WritePMULLD,   [SLM_FPC_RR],  4>;
 defm : SLMWriteResPair<WriteShuffle,  [SLM_FPC_RR],  1>;
 defm : SLMWriteResPair<WriteBlend,    [SLM_FPC_RR],  1>;
 defm : SLMWriteResPair<WriteMPSAD,    [SLM_FPC_RR],  3>;
diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll
index 59b36e93274d..007531fca7df 100644
--- a/llvm/test/CodeGen/X86/slow-pmulld.ll
+++ b/llvm/test/CodeGen/X86/slow-pmulld.ll
@@ -1215,32 +1215,34 @@ define <8 x i32> @test_mul_v8i32_v8i16_minsize(<8 x i16> %A) minsize {
 define <16 x i32> @test_mul_v16i32_v16i16_minsize(<16 x i16> %A) minsize {
 ; SLM32-LABEL: test_mul_v16i32_v16i16_minsize:
 ; SLM32:       # %bb.0:
-; SLM32-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
-; SLM32-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SLM32-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM32-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SLM32-NEXT:    pmulld %xmm5, %xmm0
-; SLM32-NEXT:    pmulld %xmm5, %xmm2
-; SLM32-NEXT:    pmulld %xmm5, %xmm1
-; SLM32-NEXT:    pmulld %xmm5, %xmm3
+; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM32-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM32-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLM32-NEXT:    pmulld %xmm1, %xmm4
+; SLM32-NEXT:    pmulld %xmm1, %xmm0
+; SLM32-NEXT:    pmulld %xmm1, %xmm2
+; SLM32-NEXT:    pmulld %xmm1, %xmm3
+; SLM32-NEXT:    movdqa %xmm4, %xmm1
 ; SLM32-NEXT:    retl
 ;
 ; SLM64-LABEL: test_mul_v16i32_v16i16_minsize:
 ; SLM64:       # %bb.0:
-; SLM64-NEXT:    movdqa {{.*#+}} xmm5 = [18778,18778,18778,18778]
-; SLM64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; SLM64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
-; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
 ; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SLM64-NEXT:    pmulld %xmm5, %xmm0
-; SLM64-NEXT:    pmulld %xmm5, %xmm2
-; SLM64-NEXT:    pmulld %xmm5, %xmm1
-; SLM64-NEXT:    pmulld %xmm5, %xmm3
+; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; SLM64-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SLM64-NEXT:    movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778]
+; SLM64-NEXT:    pmulld %xmm1, %xmm4
+; SLM64-NEXT:    pmulld %xmm1, %xmm0
+; SLM64-NEXT:    pmulld %xmm1, %xmm2
+; SLM64-NEXT:    pmulld %xmm1, %xmm3
+; SLM64-NEXT:    movdqa %xmm4, %xmm1
 ; SLM64-NEXT:    retq
 ;
 ; SLOW32-LABEL: test_mul_v16i32_v16i16_minsize:
diff --git a/llvm/test/CodeGen/X86/sse41-schedule.ll b/llvm/test/CodeGen/X86/sse41-schedule.ll
index 39052ba15fca..96da898e3967 100644
--- a/llvm/test/CodeGen/X86/sse41-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse41-schedule.ll
@@ -4817,8 +4817,8 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; SLM-LABEL: test_pmulld:
 ; SLM:       # %bb.0:
-; SLM-NEXT:    pmulld %xmm1, %xmm0 # sched: [11:11.00]
-; SLM-NEXT:    pmulld (%rdi), %xmm0 # sched: [14:11.00]
+; SLM-NEXT:    pmulld %xmm1, %xmm0 # sched: [4:1.00]
+; SLM-NEXT:    pmulld (%rdi), %xmm0 # sched: [7:1.00]
 ; SLM-NEXT:    retq # sched: [4:1.00]
 ;
 ; SANDY-SSE-LABEL: test_pmulld: