Trying to be more flexible in register use so the compiler can do reloads

* app/composite/gimp-composite-sse2.c:
Trying to be more flexible in register use so the compiler can do
reloads without running out of registers at optimisation levels
other than -O2.

This avoids the gcc error "can't find a register in class
`GENERAL_REGS' while reloading `asm'".

* app/composite/gimp-composite-x86.h
Use more newlines in asm() macros to ensure that gcc gets the
instruction count correct.  This is partially complete as of this
commit.
Helvetix Victorinox 2005-05-17 17:24:26 +00:00
parent faa0a60cb1
commit f41b1f24f6
5 changed files with 98 additions and 286 deletions
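
As an editor's sketch of the two conventions described in the commit message (not code from this commit; the helper below is hypothetical): every instruction in the asm() template ends with "\n" so gcc counts the instructions correctly, and the operands are plain "m" memory references with an MMX-only clobber list, so no GENERAL_REGS are tied up and the compiler stays free to do reloads.

    #include <stdint.h>

    /* Hypothetical helper, not part of the commit: saturating add of two
     * rgba8 pixels.  One instruction per "\n"-terminated line; only MMX
     * registers are clobbered, leaving GENERAL_REGS free for reloads. */
    static void
    add_saturate_2px (uint64_t *d, const uint64_t *a, const uint64_t *b)
    {
      asm volatile ("  movq    %1, %%mm2\n"
                    "\tmovq    %2, %%mm3\n"
                    "\tpaddusb %%mm3, %%mm2\n"
                    "\tmovq    %%mm2, %0\n"
                    : "=m" (*d)
                    : "m" (*a), "m" (*b)
                    : "%mm2", "%mm3");
    }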

@@ -1,3 +1,19 @@
2005-05-17 Helvetix Victorinox <helvetix@gimp.org>
* app/composite/gimp-composite-sse2.c:
Trying to be more flexible in register use so the compiler can do
reloads without running out of registers when using optimisation
modes other than 2.
Avoid the message "error: can't find a register in class
`GENERAL_REGS' while reloading `asm'"
* app/composite/gimp-composite-x86.h
Use more newlines in asm() macros to ensure that gcc gets the
instruction count correct. This is partially complete as of this
commit.
2005-05-17 Sven Neumann <sven@gimp.org>
* configure.in: bumped version number to 2.3.1.

@@ -99,16 +99,16 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %1,%%mm2\n"
"\tmovq %2,%%mm3\n"
"\tmovq %%mm2,%%mm4\n"
"\tpaddusb %%mm3,%%mm4\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\t" pminub(mm3, mm2, mm4) "\n"
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovq %%mm1,%0\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
@@ -119,16 +119,16 @@ gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
if (n_pixels > 0)
{
asm volatile (" movd %1,%%mm2\n"
"\tmovd %2,%%mm3\n"
"\tmovq %%mm2,%%mm4\n"
"\tpaddusb %%mm3,%%mm4\n"
"\tmovq %%mm0,%%mm1\n"
"\tpandn %%mm4,%%mm1\n"
asm volatile (" movd %1, %%mm2\n"
"\tmovd %2, %%mm3\n"
"\tmovq %%mm2, %%mm4\n"
"\tpaddusb %%mm3, %%mm4\n"
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\t" pminub(mm3, mm2, mm4) "\n"
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovd %%mm1,%0\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm0", "%mm1", "%mm2", "%mm3", "%mm4");
@@ -271,7 +271,7 @@ gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm2, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm2", "%mm3", "%mm4");
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -318,7 +318,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -341,7 +341,7 @@ gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -591,7 +591,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -622,7 +622,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -636,9 +636,9 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0\n"
"pxor %%mm6,%%mm6\n"
"movq %1,%%mm7\n"
asm volatile ("movq %0, %%mm0\n"
"pxor %%mm6, %%mm6\n"
"movq %1, %%mm7\n"
: /* empty */
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm0", "%mm6", "%mm7");
@@ -669,7 +669,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
a++;
b++;
d++;
@@ -699,7 +699,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
: "%mm1", "%mm2", "%mm3", "%mm4");
}
asm("emms");
@@ -1048,7 +1048,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
a++;
d++;
}
@@ -1066,7 +1066,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
"\tmovd %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm7");
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
}
asm("emms");
@@ -1267,8 +1267,8 @@ gimp_composite_swap_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
asm volatile (" movq %0,%%mm2\n"
"\tmovq %1,%%mm3\n"
"\tmovq %%mm3,%0\n"
"\tmovq %%mm2,%1\n"
"\tmovntq %%mm3,%0\n"
"\tmovntq %%mm2,%1\n"
: "+m" (*a), "+m" (*b)
:
: "%mm2", "%mm3");

@@ -242,10 +242,10 @@ gimp_composite_darken_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
for (; n_pixels >= 2; n_pixels -= 2)
{
asm volatile (" movq %1, %%mm2\n"
"\tmovq %2, %%mm3\n"
asm volatile (" movq %1,%%mm2\n"
"\tmovq %2,%%mm3\n"
"\t" pminub(mm3, mm2, mm4) "\n"
"\tmovntq %%mm2, %0\n"
"\tmovntq %%mm2,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
@@ -292,10 +292,10 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpminub %%mm3, %%mm2\n"
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
a++;
b++;
d++;
@@ -318,206 +318,12 @@ gimp_composite_difference_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tmovd %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#if 0
void
xxxgimp_composite_divide_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
asm volatile (" movq %0, %%mm0\n"
"\tmovq %1, %%mm7\n"
:
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w1_64)
: "%mm0", "%mm7");
for (; op.n_pixels >= 2; op.n_pixels -= 2)
{
asm volatile (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm6,%%mm6\n"
"\tpunpckhbw %%mm6,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
"\tpminub %%mm0,%%mm1\n"
"\tmovq %3,%%mm3\n"
"\tmovq %%mm3,%%mm2\n"
"\tpandn %%mm5,%%mm3\n"
"\tpand %%mm2,%%mm1\n"
"\tpor %%mm1,%%mm3\n"
"\tmovq %%mm3,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
op.A += 8;
op.B += 8;
op.D += 8;
}
if (op.n_pixels)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovd %2,%%mm1\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm5,%%mm5\n"
"\tpunpcklbw %%mm5,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm6,%%mm6\n"
"\tpunpckhbw %%mm6,%%mm3\n"
"\tpaddw %%mm7,%%mm3\n" /* mm3 = B+1 */
"\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
"\tpackuswb %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
"\tpminub %%mm0,%%mm1\n"
"\tmovq %3,%%mm3\n"
"\tmovq %%mm3,%%mm2\n"
"\tpandn %%mm5,%%mm3\n"
"\tpand %%mm2,%%mm1\n"
"\tpor %%mm1,%%mm3\n"
"\tmovd %%mm3,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#endif
#if 0
void
xxxgimp_composite_dodge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
{
GimpCompositeContext op = *_op;
for (; op.n_pixels >= 2; op.n_pixels -= 2)
{
asm volatile (" movq %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm2,%%mm3\n"
"\tpunpcklbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm2,%%mm3\n"
"\tpunpckhbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
"\tpackuswb %%mm6,%%mm5\n"
"\tmovq %4,%%mm6\n"
"\tmovq %%mm1,%%mm7\n"
"\t" pminub(mm0,mm7,mm2) "\n"
"\tpand %%mm6,%%mm7\n"
"\tpandn %%mm5,%%mm6\n"
"\tpor %%mm6,%%mm7\n"
"\tmovq %%mm7,%0\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
op.A += 8;
op.B += 8;
op.D += 8;
}
if (op.n_pixels)
{
asm volatile (" movd %1,%%mm0\n"
"\tmovq %2,%%mm1\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpcklbw %%mm2,%%mm3\n"
"\tpunpcklbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm5) "\n"
"\tmovq %%mm1,%%mm3\n"
"\tpxor %%mm2,%%mm2\n"
"\tpunpckhbw %%mm2,%%mm3\n"
"\tpunpckhbw %%mm0,%%mm2\n"
"\tmovq %3,%%mm4\n"
"\tpsubw %%mm3,%%mm4\n"
"\t" pdivwuqX(mm2,mm4,mm6) "\n"
"\tpackuswb %%mm6,%%mm5\n"
"\tmovq %4,%%mm6\n"
"\tmovq %%mm1,%%mm7\n"
"\tpminub %%mm0,%%mm7\n"
"\tpand %%mm6,%%mm7\n"
"\tpandn %%mm5,%%mm6\n"
"\tpor %%mm6,%%mm7\n"
"\tmovd %%mm7,%2\n"
: "=m" (*op.D)
: "m" (*op.A), "m" (*op.B), "m" (*rgba8_w256_64), "m" (*rgba8_alpha_mask_64)
: "%eax", "%ecx", "%edx", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
}
asm("emms");
}
#endif
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
@@ -562,7 +368,7 @@ gimp_composite_grain_extract_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpand %%mm0,%%mm2\n"
"\tpor %%mm2,%%mm1\n"
"\tmovq %%mm1,%0\n"
"\tmovntq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
@@ -641,7 +447,7 @@ gimp_composite_grain_merge_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tmovq %%mm0, %%mm1\n"
"\tpandn %%mm4, %%mm1\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4");
@@ -702,7 +508,7 @@ gimp_composite_lighten_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpminub %%mm2, %%mm3\n"
"\tpand %%mm0, %%mm3\n"
"\tpor %%mm3, %%mm1\n"
"\tmovq %%mm1, %0\n"
"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -765,7 +571,7 @@ gimp_composite_multiply_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpand %%mm0, %%mm2\n"
"\tpor %%mm2, %%mm1\n"
"\tmovq %%mm1, %0\n"
"\tmovntq %%mm1, %0\n"
: "=m" (*d)
: "m" (*a), "m" (*b)
: "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
@@ -1009,7 +815,7 @@ gimp_composite_scale_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
"\tpackuswb %%mm4,%%mm1\n"
"\tmovq %%mm1,%0\n"
"\tmovntq %%mm1,%0\n"
: "=m" (*d)
: "m" (*a)
: "%mm1", "%mm2", "%mm4", "%mm5", "%mm7");
@@ -1044,9 +850,13 @@ gimp_composite_screen_rgba8_rgba8_rgba8_sse (GimpCompositeContext *_op)
uint64 *b = (uint64 *) _op->B;
gulong n_pixels = _op->n_pixels;
asm volatile ("movq %0,%%mm0" : : "m" (*rgba8_alpha_mask_64) : "%mm0");
asm volatile ("movq %0,%%mm7" : : "m" (*rgba8_w128_64) : "%mm7");
asm volatile ("pxor %mm6, %mm6");
asm volatile ("pxor %%mm6,%%mm6\n"
"movq %0,%%mm0\n"
"movq %1,%%mm7\n"
: /* empty */
: "m" (*rgba8_alpha_mask_64), "m" (*rgba8_w128_64)
: "%mm0", "%mm6", "%mm7");
for (; n_pixels >= 2; n_pixels -= 2)
{

@@ -605,46 +605,32 @@ gimp_composite_swap_rgba8_rgba8_rgba8_sse2 (GimpCompositeContext *_op)
GimpCompositeContext op = *_op;
/*
* Inhale one whole i686 cache line at once. 64 bytes, 16 rgba8
* pixels, 4 128 bit xmm registers.
* Inhale one whole i686 cache line at once. 128 bytes == 32 rgba8
* pixels == 8 128 bit xmm registers.
*/
for (; op.n_pixels >= 16; op.n_pixels -= 16)
{
asm volatile (" movdqu %0,%%xmm0\n"
"\tmovdqu %1,%%xmm1\n"
"\tmovdqu %2,%%xmm2\n"
"\tmovdqu %3,%%xmm3\n"
"\tmovdqu %4,%%xmm4\n"
"\tmovdqu %5,%%xmm5\n"
"\tmovdqu %6,%%xmm6\n"
"\tmovdqu %7,%%xmm7\n"
:
: "m" (op.A[0]), "m" (op.B[0]),
"m" (op.A[1]), "m" (op.B[1]),
"m" (op.A[2]), "m" (op.B[2]),
"m" (op.A[3]), "m" (op.B[3])
: "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7"
);
asm volatile (" movdqu %0,%%xmm0\n" : :"m" (op.A[0]) : "%xmm0");
asm volatile (" movdqu %0,%%xmm1\n" : :"m" (op.B[0]) : "%xmm1");
asm volatile (" movdqu %0,%%xmm2\n" : :"m" (op.A[1]) : "%xmm2");
asm volatile (" movdqu %0,%%xmm3\n" : :"m" (op.B[1]) : "%xmm3");
asm volatile (" movdqu %0,%%xmm4\n" : :"m" (op.A[2]) : "%xmm4");
asm volatile (" movdqu %0,%%xmm5\n" : :"m" (op.B[2]) : "%xmm5");
asm volatile (" movdqu %0,%%xmm6\n" : :"m" (op.A[3]) : "%xmm6");
asm volatile (" movdqu %0,%%xmm7\n" : :"m" (op.B[3]) : "%xmm7");
asm volatile ("\tmovdqu %%xmm0,%1\n"
"\tmovdqu %%xmm1,%0\n"
"\tmovdqu %%xmm2,%3\n"
"\tmovdqu %%xmm3,%2\n"
"\tmovdqu %%xmm4,%5\n"
"\tmovdqu %%xmm5,%4\n"
"\tmovdqu %%xmm6,%7\n"
"\tmovdqu %%xmm7,%6\n"
: "=m" (op.A[0]), "=m" (op.B[0]),
"=m" (op.A[1]), "=m" (op.B[1]),
"=m" (op.A[2]), "=m" (op.B[2]),
"=m" (op.A[3]), "=m" (op.B[3])
: /* empty */
);
asm volatile ("\tmovdqu %%xmm0,%0\n" : "=m" (op.A[0]));
asm volatile ("\tmovdqu %%xmm1,%0\n" : "=m" (op.B[0]));
asm volatile ("\tmovdqu %%xmm2,%0\n" : "=m" (op.A[1]));
asm volatile ("\tmovdqu %%xmm3,%0\n" : "=m" (op.B[1]));
asm volatile ("\tmovdqu %%xmm4,%0\n" : "=m" (op.A[2]));
asm volatile ("\tmovdqu %%xmm5,%0\n" : "=m" (op.B[2]));
asm volatile ("\tmovdqu %%xmm6,%0\n" : "=m" (op.A[3]));
asm volatile ("\tmovdqu %%xmm7,%0\n" : "=m" (op.B[3]));
op.A += 64;
op.B += 64;
}
for (; op.n_pixels >= 4; op.n_pixels -= 4)
{
asm volatile (" movdqu %0,%%xmm2\n"

@@ -25,14 +25,14 @@
* Convert the low 8bit byte of the src to 16bit words in dst.
*/
#define mmx_low_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"; " \
"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpcklbw %%"#zero", %%"#dst"\n"
/*
* Convert the high 8bit byte of the src to 16bit words in dst.
*/
#define mmx_high_bytes_to_words(src,dst,zero) \
"\tmovq %%"#src", %%"#dst"; " \
"\tmovq %%"#src", %%"#dst"\n" \
"\tpunpckhbw %%"#zero", %%"#dst"\n"
#define xmm_low_bytes_to_words(src,dst,zero) \
@@ -65,18 +65,18 @@
* (high-order bit of each word is cleared)
* Clobbers eax, ecx edx
*/
#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
"movd %%" #divisor ",%%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"roll $16, %%eax; " \
"roll $16, %%ecx; " \
"xorl %%edx,%%edx; " \
"divw %%cx; " \
"btr $15, %%eax; " \
"roll $16, %%eax; " \
"btr $15, %%eax; " \
"movd %%eax,%%" #quotient ";"
#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax\n" \
"movd %%" #divisor ",%%ecx\n" \
"xorl %%edx,%%edx\n" \
"divw %%cx\n" \
"roll $16, %%eax\n" \
"roll $16, %%ecx\n" \
"xorl %%edx,%%edx\n" \
"divw %%cx\n" \
"btr $15, %%eax\n" \
"roll $16, %%eax\n" \
"btr $15, %%eax\n" \
"movd %%eax,%%" #quotient "\n"