Trying to be more flexible in register use so the compiler can do reloads

* app/composite/gimp-composite-sse2.c: Trying to be more flexible in
  register use so the compiler can do reloads without running out of
  registers when using optimisation modes other than 2.  Avoid the
  message "error: can't find a register in class `GENERAL_REGS' while
  reloading `asm'".

* app/composite/gimp-composite-x86.h: Use more newlines in asm()
  macros to ensure that gcc gets the instruction count correct.  This
  is partially complete as of this commit.
parent faa0a60cb1
commit f41b1f24f6

 ChangeLog | 16 ++++++++++++++++
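
The GENERAL_REGS failure is easiest to see in miniature: when an asm statement takes several "m" operands, gcc may need scratch general registers to materialise the addresses, and if too many hard registers are pinned, reload has nowhere to go at optimisation levels whose register pressure differs from -O2. A minimal sketch of the shape the commit moves toward; this is illustrative only, not GIMP source, and the function and variable names are invented:

#include <stdint.h>

/* Illustrative only: saturating byte-wise add of a pair of rgba8 pixels.
 * Only the MMX register actually written is clobbered, so %eax..%edx all
 * remain available for gcc to reload the "m" operands with. */
void
add_rgba8_pair (const uint64_t *a, const uint64_t *b, uint64_t *d)
{
#if defined(__GNUC__) && defined(__MMX__)
  asm volatile ("movq      %1, %%mm2\n"
                "\tpaddusb %2, %%mm2\n"   /* saturating per-byte add */
                "\tmovq    %%mm2, %0\n"
                : "=m" (*d)
                : "m" (*a), "m" (*b)
                : "%mm2");                /* tight clobber list */
#else
  *d = *a + *b;  /* placeholder only; NOT byte-wise saturating */
#endif
}
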
ChangeLog
@@ -1,3 +1,19 @@
+2005-05-17  Helvetix Victorinox  <helvetix@gimp.org>
+
+        * app/composite/gimp-composite-sse2.c:
+        Trying to be more flexible in register use so the compiler can do
+        reloads without running out of registers when using optimisation
+        modes other than 2.
+
+        Avoid the message "error: can't find a register in class
+        `GENERAL_REGS' while reloading `asm'"
+
+        * app/composite/gimp-composite-x86.h
+        Use more newlines in asm() macros to ensure that gcc gets the
+        instruction count correct.  This is partially complete as of this
+        commit.
+
+
 2005-05-17  Sven Neumann  <sven@gimp.org>
 
         * configure.in: bumped version number to 2.3.1.

app/composite/gimp-composite-mmx.c
@@ -99,16 +99,16 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 
   for (; n_pixels >= 2; n_pixels -= 2)
     {
-      asm volatile ("  movq    %1,%%mm2\n"
-                    "\tmovq    %2,%%mm3\n"
-                    "\tmovq    %%mm2,%%mm4\n"
-                    "\tpaddusb %%mm3,%%mm4\n"
-                    "\tmovq    %%mm0,%%mm1\n"
-                    "\tpandn   %%mm4,%%mm1\n"
+      asm volatile ("  movq    %1, %%mm2\n"
+                    "\tmovq    %2, %%mm3\n"
+                    "\tmovq    %%mm2, %%mm4\n"
+                    "\tpaddusb %%mm3, %%mm4\n"
+                    "\tmovq    %%mm0, %%mm1\n"
+                    "\tpandn   %%mm4, %%mm1\n"
                     "\t" pminub(mm3, mm2, mm4) "\n"
-                    "\tpand    %%mm0,%%mm2\n"
-                    "\tpor     %%mm2,%%mm1\n"
-                    "\tmovq    %%mm1,%0\n"
+                    "\tpand    %%mm0, %%mm2\n"
+                    "\tpor     %%mm2, %%mm1\n"
+                    "\tmovq    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
@@ -119,16 +119,16 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
 
   if (n_pixels > 0)
     {
-      asm volatile ("  movd    %1,%%mm2\n"
-                    "\tmovd    %2,%%mm3\n"
-                    "\tmovq    %%mm2,%%mm4\n"
-                    "\tpaddusb %%mm3,%%mm4\n"
-                    "\tmovq    %%mm0,%%mm1\n"
-                    "\tpandn   %%mm4,%%mm1\n"
+      asm volatile ("  movd    %1, %%mm2\n"
+                    "\tmovd    %2, %%mm3\n"
+                    "\tmovq    %%mm2, %%mm4\n"
+                    "\tpaddusb %%mm3, %%mm4\n"
+                    "\tmovq    %%mm0, %%mm1\n"
+                    "\tpandn   %%mm4, %%mm1\n"
                     "\t" pminub(mm3, mm2, mm4) "\n"
-                    "\tpand    %%mm0,%%mm2\n"
-                    "\tpor     %%mm2,%%mm1\n"
-                    "\tmovd    %%mm1,%0\n"
+                    "\tpand    %%mm0, %%mm2\n"
+                    "\tpor     %%mm2, %%mm1\n"
+                    "\tmovd    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
@@ -271,7 +271,7 @@ gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovq    %%mm2, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm2", "%mm3", "%mm4");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
       a++;
       b++;
       d++;
@@ -318,7 +318,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovq    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
       a++;
       b++;
       d++;
@@ -341,7 +341,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovd    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
     }
 
   asm("emms");
@@ -591,7 +591,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovq    %%mm1,%0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
       a++;
       b++;
       d++;
@@ -622,7 +622,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovd    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
     }
 
   asm("emms");
@@ -636,9 +636,9 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
   uint64 *b = (uint64 *) _op->B;
   gulong n_pixels = _op->n_pixels;
 
-  asm volatile ("movq    %0,%%mm0\n"
-                "pxor    %%mm6,%%mm6\n"
-                "movq    %1,%%mm7\n"
+  asm volatile ("movq    %0, %%mm0\n"
+                "pxor    %%mm6, %%mm6\n"
+                "movq    %1, %%mm7\n"
                 : /* empty */
                 : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
                 : "%mm0", "%mm6", "%mm7");
@@ -669,7 +669,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovq    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
       a++;
       b++;
       d++;
@@ -699,7 +699,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovd    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+                    : "%mm1", "%mm2", "%mm3", "%mm4");
     }
 
   asm("emms");
@@ -1048,7 +1048,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovq    %%mm1,%0\n"
                     : "=m" (*d)
                     : "m" (*a)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
+                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
       a++;
       d++;
     }
@@ -1066,7 +1066,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
                     "\tmovd    %%mm1,%0\n"
                     : "=m" (*d)
                     : "m" (*a)
-                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
+                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
     }
 
   asm("emms");
@@ -1267,8 +1267,8 @@ gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
     {
      asm volatile ("  movq    %0,%%mm2\n"
                    "\tmovq    %1,%%mm3\n"
-                   "\tmovq    %%mm3,%0\n"
-                   "\tmovq    %%mm2,%1\n"
+                   "\tmovntq  %%mm3,%0\n"
+                   "\tmovntq  %%mm2,%1\n"
                    : "+m" (*a), "+m" (*b)
                    :
                    : "%mm2", "%mm3");
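
The swap hunk above also switches the stores from movq to movntq, a non-temporal store that bypasses the cache, which suits buffers that will not be re-read soon. The same idea in intrinsics form; this is a sketch with an invented function name, while GIMP itself keeps the raw asm:

#include <mmintrin.h>   /* __m64, _mm_empty */
#include <xmmintrin.h>  /* _mm_stream_pi, _mm_sfence */

/* Swap two rgba8 pixel streams, two pixels (one __m64) at a time,
 * using non-temporal stores as the patched asm does with movntq.
 * Compile for x86 with at least -msse. */
void
swap_rgba8 (__m64 *a, __m64 *b, unsigned long n_pixels)
{
  for (; n_pixels >= 2; n_pixels -= 2)
    {
      __m64 ta = *a;
      __m64 tb = *b;
      _mm_stream_pi (a++, tb);  /* movntq: store around the cache */
      _mm_stream_pi (b++, ta);
    }
  _mm_sfence ();                /* order the streaming stores */
  _mm_empty ();                 /* emms, as the MMX code paths do */
}
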
app/composite/gimp-composite-sse.c
@@ -242,10 +242,10 @@ gimp_composite_darken_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
 
   for (; n_pixels >= 2; n_pixels -= 2)
     {
-      asm volatile ("  movq    %1, %%mm2\n"
-                    "\tmovq    %2, %%mm3\n"
+      asm volatile ("  movq    %1,%%mm2\n"
+                    "\tmovq    %2,%%mm3\n"
                     "\t" pminub(mm3, mm2, mm4) "\n"
-                    "\tmovntq  %%mm2, %0\n"
+                    "\tmovntq  %%mm2,%0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm1", "%mm2", "%mm3", "%mm4");
@@ -292,10 +292,10 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tpminub  %%mm3, %%mm2\n"
                     "\tpand    %%mm0, %%mm2\n"
                     "\tpor     %%mm2, %%mm1\n"
-                    "\tmovq    %%mm1, %0\n"
+                    "\tmovntq  %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4");
+                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
       a++;
       b++;
       d++;
@@ -318,206 +318,12 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tmovd    %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
-                    : "%mm1", "%mm2", "%mm3", "%mm4");
+                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
     }
 
   asm("emms");
 }
 
-#if 0
-void
-xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
-{
-  GimpCompositeContext op = *_op;
-
-  asm volatile ("  movq    %0, %%mm0\n"
-                "\tmovq    %1, %%mm7\n"
-                :
-                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
-                : "%mm0", "%mm7");
-
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
-    {
-      asm volatile ("  movq    %1,%%mm0\n"
-                    "\tmovq    %2,%%mm1\n"
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm5,%%mm5\n"
-                    "\tpunpcklbw %%mm5,%%mm3\n"
-                    "\tpaddw   %%mm7,%%mm3\n" /* mm3 = B+1 */
-
-                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
-
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm6,%%mm6\n"
-                    "\tpunpckhbw %%mm6,%%mm3\n"
-                    "\tpaddw   %%mm7,%%mm3\n" /* mm3 = B+1 */
-
-                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
-
-                    "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
-
-                    "\tpminub  %%mm0,%%mm1\n"
-                    "\tmovq    %3,%%mm3\n"
-                    "\tmovq    %%mm3,%%mm2\n"
-
-                    "\tpandn   %%mm5,%%mm3\n"
-
-                    "\tpand    %%mm2,%%mm1\n"
-                    "\tpor     %%mm1,%%mm3\n"
-
-                    "\tmovq    %%mm3,%0\n"
-                    : "=m" (*op.D)
-                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
-                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
-    }
-
-  if (op.n_pixels)
-    {
-      asm volatile ("  movd    %1,%%mm0\n"
-                    "\tmovd    %2,%%mm1\n"
-
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm5,%%mm5\n"
-                    "\tpunpcklbw %%mm5,%%mm3\n"
-                    "\tpaddw   %%mm7,%%mm3\n" /* mm3 = B+1 */
-
-                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
-
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm6,%%mm6\n"
-                    "\tpunpckhbw %%mm6,%%mm3\n"
-                    "\tpaddw   %%mm7,%%mm3\n" /* mm3 = B+1 */
-
-                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
-
-                    "\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
-
-                    "\tpminub  %%mm0,%%mm1\n"
-                    "\tmovq    %3,%%mm3\n"
-                    "\tmovq    %%mm3,%%mm2\n"
-
-                    "\tpandn   %%mm5,%%mm3\n"
-
-                    "\tpand    %%mm2,%%mm1\n"
-                    "\tpor     %%mm1,%%mm3\n"
-
-                    "\tmovd    %%mm3,%0\n"
-                    : "=m" (*op.D)
-                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
-                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-    }
-
-  asm("emms");
-}
-#endif
-
-#if 0
-void
-xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
-{
-  GimpCompositeContext op = *_op;
-
-  for (; op.n_pixels >= 2; op.n_pixels -= 2)
-    {
-      asm volatile ("  movq    %1,%%mm0\n"
-                    "\tmovq    %2,%%mm1\n"
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpcklbw %%mm2,%%mm3\n"
-                    "\tpunpcklbw %%mm0,%%mm2\n"
-
-                    "\tmovq    %3,%%mm4\n"
-                    "\tpsubw   %%mm3,%%mm4\n"
-
-                    "\t" pdivwuqX(mm2,mm4,mm5) "\n"
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpckhbw %%mm2,%%mm3\n"
-                    "\tpunpckhbw %%mm0,%%mm2\n"
-
-                    "\tmovq    %3,%%mm4\n"
-                    "\tpsubw   %%mm3,%%mm4\n"
-
-                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"
-
-                    "\tpackuswb %%mm6,%%mm5\n"
-
-                    "\tmovq    %4,%%mm6\n"
-                    "\tmovq    %%mm1,%%mm7\n"
-                    "\t" pminub(mm0,mm7,mm2) "\n"
-                    "\tpand    %%mm6,%%mm7\n"
-                    "\tpandn   %%mm5,%%mm6\n"
-
-                    "\tpor     %%mm6,%%mm7\n"
-
-                    "\tmovq    %%mm7,%0\n"
-                    : "=m" (*op.D)
-                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
-                    : "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
-      op.A += 8;
-      op.B += 8;
-      op.D += 8;
-    }
-
-  if (op.n_pixels)
-    {
-      asm volatile ("  movd    %1,%%mm0\n"
-                    "\tmovq    %2,%%mm1\n"
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpcklbw %%mm2,%%mm3\n"
-                    "\tpunpcklbw %%mm0,%%mm2\n"
-
-                    "\tmovq    %3,%%mm4\n"
-                    "\tpsubw   %%mm3,%%mm4\n"
-
-                    "\t" pdivwuqX(mm2,mm4,mm5) "\n"
-
-                    "\tmovq    %%mm1,%%mm3\n"
-                    "\tpxor    %%mm2,%%mm2\n"
-                    "\tpunpckhbw %%mm2,%%mm3\n"
-                    "\tpunpckhbw %%mm0,%%mm2\n"
-
-                    "\tmovq    %3,%%mm4\n"
-                    "\tpsubw   %%mm3,%%mm4\n"
-
-                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"
-
-                    "\tpackuswb %%mm6,%%mm5\n"
-
-                    "\tmovq    %4,%%mm6\n"
-                    "\tmovq    %%mm1,%%mm7\n"
-                    "\tpminub  %%mm0,%%mm7\n"
-                    "\tpand    %%mm6,%%mm7\n"
-                    "\tpandn   %%mm5,%%mm6\n"
-
-                    "\tpor     %%mm6,%%mm7\n"
-
-                    "\tmovd    %%mm7,%2\n"
-                    : "=m" (*op.D)
-                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
-                    : "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
-    }
-
-  asm("emms");
-}
-#endif
 
 void
 gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
@@ -562,7 +368,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tpand    %%mm0,%%mm2\n"
 
                     "\tpor     %%mm2,%%mm1\n"
-                    "\tmovq    %%mm1,%0\n"
+                    "\tmovntq  %%mm1,%0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm1", "%mm2", "%mm3", "%mm4");
@@ -641,7 +447,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tmovq    %%mm0, %%mm1\n"
                     "\tpandn   %%mm4, %%mm1\n"
                     "\tpor     %%mm2, %%mm1\n"
-                    "\tmovq    %%mm1, %0\n"
+                    "\tmovntq  %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm1", "%mm2", "%mm3", "%mm4");
@@ -702,7 +508,7 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tpminub  %%mm2, %%mm3\n"
                     "\tpand    %%mm0, %%mm3\n"
                     "\tpor     %%mm3, %%mm1\n"
-                    "\tmovq    %%mm1, %0\n"
+                    "\tmovntq  %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -765,7 +571,7 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
                     "\tpand    %%mm0, %%mm2\n"
                     "\tpor     %%mm2, %%mm1\n"
 
-                    "\tmovq    %%mm1, %0\n"
+                    "\tmovntq  %%mm1, %0\n"
                     : "=m" (*d)
                     : "m" (*a), "m" (*b)
                     : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -1009,7 +815,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
 
                     "\tpackuswb %%mm4,%%mm1\n"
 
-                    "\tmovq    %%mm1,%0\n"
+                    "\tmovntq  %%mm1,%0\n"
                     : "=m" (*d)
                     : "m" (*a)
                     : "%mm1", "%mm2", "%mm4", "%mm5", "%mm7");
@@ -1044,9 +850,13 @@ gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
   uint64 *b = (uint64 *) _op->B;
   gulong n_pixels = _op->n_pixels;
 
-  asm volatile ("movq    %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
-  asm volatile ("movq    %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7");
-  asm volatile ("pxor    %mm6, %mm6");
+  asm volatile ("pxor    %%mm6,%%mm6\n"
+                "movq    %0,%%mm0\n"
+                "movq    %1,%%mm7\n"
+                : /* empty */
+                : "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
+                : "%mm0", "%mm6", "%mm7");
 
 
   for (; n_pixels >= 2; n_pixels -= 2)
     {
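
One detail in the screen hunk deserves a note: the old code initialised %mm6 with an asm that declared no outputs, inputs, or clobbers, so gcc had no idea the register was written. Folding the three statements into one asm with a complete clobber list both documents the register usage and gives the optimiser accurate information. A reduced sketch with invented constant names:

#include <stdint.h>

static const uint64_t alpha_mask = 0xff000000ff000000ULL; /* invented values */
static const uint64_t w128       = 0x0080008000800080ULL;

void
load_constants (void)
{
#if defined(__GNUC__) && defined(__MMX__)
  /* Every register the statement writes is declared, unlike the old
   * bare asm volatile ("pxor %mm6, %mm6"). */
  asm volatile ("pxor    %%mm6,%%mm6\n"
                "movq    %0,%%mm0\n"
                "movq    %1,%%mm7\n"
                : /* no outputs */
                : "m" (alpha_mask), "m" (w128)
                : "%mm0", "%mm6", "%mm7");
#endif
}
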
app/composite/gimp-composite-sse2.c
@@ -605,46 +605,32 @@ gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op)
   GimpCompositeContext op = *_op;
 
   /*
-   * Inhale one whole i686 cache line at once.  64 bytes, 16 rgba8
-   * pixels, 4 128 bit xmm registers.
+   * Inhale one whole i686 cache line at once.  128 bytes == 32 rgba8
+   * pixels == 8 128 bit xmm registers.
    */
   for (; op.n_pixels >= 16; op.n_pixels -= 16)
     {
-      asm volatile ("  movdqu %0,%%xmm0\n"
-                    "\tmovdqu %1,%%xmm1\n"
-                    "\tmovdqu %2,%%xmm2\n"
-                    "\tmovdqu %3,%%xmm3\n"
-                    "\tmovdqu %4,%%xmm4\n"
-                    "\tmovdqu %5,%%xmm5\n"
-                    "\tmovdqu %6,%%xmm6\n"
-                    "\tmovdqu %7,%%xmm7\n"
-                    :
-                    : "m" (op.A[0]), "m" (op.B[0]),
-                      "m" (op.A[1]), "m" (op.B[1]),
-                      "m" (op.A[2]), "m" (op.B[2]),
-                      "m" (op.A[3]), "m" (op.B[3])
-                    : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
-                    );
+      asm volatile ("  movdqu %0,%%xmm0\n" : : "m" (op.A[0]) : "%xmm0");
+      asm volatile ("  movdqu %0,%%xmm1\n" : : "m" (op.B[0]) : "%xmm1");
+      asm volatile ("  movdqu %0,%%xmm2\n" : : "m" (op.A[1]) : "%xmm2");
+      asm volatile ("  movdqu %0,%%xmm3\n" : : "m" (op.B[1]) : "%xmm3");
+      asm volatile ("  movdqu %0,%%xmm4\n" : : "m" (op.A[2]) : "%xmm4");
+      asm volatile ("  movdqu %0,%%xmm5\n" : : "m" (op.B[2]) : "%xmm5");
+      asm volatile ("  movdqu %0,%%xmm6\n" : : "m" (op.A[3]) : "%xmm6");
+      asm volatile ("  movdqu %0,%%xmm7\n" : : "m" (op.B[3]) : "%xmm7");
 
-      asm volatile ("\tmovdqu %%xmm0,%1\n"
-                    "\tmovdqu %%xmm1,%0\n"
-                    "\tmovdqu %%xmm2,%3\n"
-                    "\tmovdqu %%xmm3,%2\n"
-                    "\tmovdqu %%xmm4,%5\n"
-                    "\tmovdqu %%xmm5,%4\n"
-                    "\tmovdqu %%xmm6,%7\n"
-                    "\tmovdqu %%xmm7,%6\n"
-                    : "=m" (op.A[0]), "=m" (op.B[0]),
-                      "=m" (op.A[1]), "=m" (op.B[1]),
-                      "=m" (op.A[2]), "=m" (op.B[2]),
-                      "=m" (op.A[3]), "=m" (op.B[3])
-                    : /* empty */
-                    );
+      asm volatile ("\tmovdqu %%xmm0,%0\n" : "=m" (op.A[0]));
+      asm volatile ("\tmovdqu %%xmm1,%0\n" : "=m" (op.B[0]));
+      asm volatile ("\tmovdqu %%xmm2,%0\n" : "=m" (op.A[1]));
+      asm volatile ("\tmovdqu %%xmm3,%0\n" : "=m" (op.B[1]));
+      asm volatile ("\tmovdqu %%xmm4,%0\n" : "=m" (op.A[2]));
+      asm volatile ("\tmovdqu %%xmm5,%0\n" : "=m" (op.B[2]));
+      asm volatile ("\tmovdqu %%xmm6,%0\n" : "=m" (op.A[3]));
+      asm volatile ("\tmovdqu %%xmm7,%0\n" : "=m" (op.B[3]));
       op.A += 64;
       op.B += 64;
     }
 
 
   for (; op.n_pixels >= 4; op.n_pixels -= 4)
     {
       asm volatile ("  movdqu %0,%%xmm2\n"
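
The swap rewrite shows the commit's central tactic for the reload failure: one asm that needs eight "m" operands live at once can demand more address registers than reload has available, while eight single-instruction asm statements each need only one. A reduced sketch of the pattern, with an invented function name; note that the "m" constraints under-describe the 16-byte access, exactly as in the original code:

#include <string.h>

/* Swap one 16-byte block between a and b, one movdqu per asm statement.
 * The pattern relies on gcc keeping the volatile statements in order and
 * not using %xmm0/%xmm1 for its own purposes in between, a caveat that
 * applies to the real code too. */
void
swap_block (unsigned char *a, unsigned char *b)
{
#if defined(__GNUC__) && defined(__SSE2__)
  asm volatile ("movdqu %0,%%xmm0" : : "m" (*a) : "%xmm0");
  asm volatile ("movdqu %0,%%xmm1" : : "m" (*b) : "%xmm1");
  asm volatile ("movdqu %%xmm1,%0" : "=m" (*a));
  asm volatile ("movdqu %%xmm0,%0" : "=m" (*b));
#else
  unsigned char t[16];          /* plain fallback */
  memcpy (t, a, 16);
  memcpy (a, b, 16);
  memcpy (b, t, 16);
#endif
}
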
app/composite/gimp-composite-x86.h
@@ -25,14 +25,14 @@
  * Convert the low 8bit byte of the src to 16bit words in dst.
  */
 #define mmx_low_bytes_to_words(src,dst,zero) \
-         "\tmovq      %%"#src", %%"#dst"; " \
+         "\tmovq      %%"#src", %%"#dst"\n" \
          "\tpunpcklbw %%"#zero", %%"#dst"\n"
 
 /*
  * Convert the high 8bit byte of the src to 16bit words in dst.
  */
 #define mmx_high_bytes_to_words(src,dst,zero) \
-         "\tmovq      %%"#src", %%"#dst"; " \
+         "\tmovq      %%"#src", %%"#dst"\n" \
          "\tpunpckhbw %%"#zero", %%"#dst"\n"
 
 #define xmm_low_bytes_to_words(src,dst,zero) \
@@ -65,18 +65,18 @@
  * (high-order bit of each word is cleared)
  * Clobbers eax, ecx edx
  */
-#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
-                                          "movd %%" #divisor  ",%%ecx; " \
-                                          "xorl %%edx,%%edx; "           \
-                                          "divw %%cx; "                  \
-                                          "roll $16, %%eax; "            \
-                                          "roll $16, %%ecx; "            \
-                                          "xorl %%edx,%%edx; "           \
-                                          "divw %%cx; "                  \
-                                          "btr $15, %%eax; "             \
-                                          "roll $16, %%eax; "            \
-                                          "btr $15, %%eax; "             \
-                                          "movd %%eax,%%" #quotient ";"
+#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax\n" \
+                                          "movd %%" #divisor  ",%%ecx\n" \
+                                          "xorl %%edx,%%edx\n"           \
+                                          "divw %%cx\n"                  \
+                                          "roll $16, %%eax\n"            \
+                                          "roll $16, %%ecx\n"            \
+                                          "xorl %%edx,%%edx\n"           \
+                                          "divw %%cx\n"                  \
+                                          "btr $15, %%eax\n"             \
+                                          "roll $16, %%eax\n"            \
+                                          "btr $15, %%eax\n"             \
+                                          "movd %%eax,%%" #quotient "\n"
 
 
 
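
The pdivwX change is the x86.h half of the commit message: gcc estimates the size of an asm statement by counting separators in the template text, and it counts newlines, while "; " separators are apparently not counted on this target, which is presumably why the macros switch to one instruction per "\n". An underestimate matters for decisions such as branch displacements. A toy macro in the same style, with an invented name:

/* Toy example, not from the patch: two instructions, one per "\n",
 * so gcc's asm-length estimate sees two statements rather than one. */
#define clear_pair(r0,r1)                  \
        "\tpxor %%" #r0 ",%%" #r0 "\n"     \
        "\tpxor %%" #r1 ",%%" #r1 "\n"

#if defined(__GNUC__) && defined(__MMX__)
void
clear_mm0_mm1 (void)
{
  asm volatile (clear_pair(mm0,mm1) : : : "%mm0", "%mm1");
}
#endif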