need to test for GIMP_COMPOSITE_OPTION_NOEXTENSIONS.

2006-06-02 Sven Neumann <sven@gimp.org> * app/composite/gimp-composite.c (gimp_composite_use_cpu_accel): need to test for GIMP_COMPOSITE_OPTION_NOEXTENSIONS. * libgimp/gimp.c (gimp_config): call gimp_set_use_cpu_accel() from here, not in gimp_main(). * plug-ins/common/sel_gauss.c: applied patch from Loren Merritt that adds MMX code to boost the plug-in speed (bug #342860).
2006-06-02 11:36:16 +00:00 · 2006-06-02 11:36:16 +00:00 · fd972c23dc
parent 7777e97eba
commit fd972c23dc
4 changed files with 294 additions and 9 deletions
--- a/11
+++ b/11
@ -1,3 +1,14 @@
+2006-06-02  Sven Neumann  <sven@gimp.org>
+
+	* app/composite/gimp-composite.c (gimp_composite_use_cpu_accel):
+	need to test for GIMP_COMPOSITE_OPTION_NOEXTENSIONS.
+
+	* libgimp/gimp.c (gimp_config): call gimp_set_use_cpu_accel() from
+	here, not in gimp_main().
+
+	* plug-ins/common/sel_gauss.c: applied patch from Loren Merritt
+	that adds MMX code to boost the plug-in speed (bug #342860).
+
 2006-06-02  Sven Neumann  <sven@gimp.org>

 	Moved the CPU detection code to libgimpbase (see bug #342860):
--- a/app/composite/gimp-composite.c
+++ b/app/composite/gimp-composite.c
@ -335,8 +335,8 @@ gimp_composite_init (gboolean  be_verbose,
      gimp_composite_options.bits = strtoul(p, NULL, 16);
    }

-  if (!use_cpu_accel)
-    gimp_composite_options.bits |=  GIMP_COMPOSITE_OPTION_NOEXTENSIONS;
+  if (! use_cpu_accel)
+    gimp_composite_options.bits |= GIMP_COMPOSITE_OPTION_NOEXTENSIONS;

  if (be_verbose)
    g_printerr ("gimp_composite: use=%s, verbose=%s\n",
@ -388,5 +388,5 @@ gimp_composite_init (gboolean  be_verbose,
 gboolean
 gimp_composite_use_cpu_accel (void)
 {
-  return ((gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_USE) != 0);
+  return ! (gimp_composite_options.bits & GIMP_COMPOSITE_OPTION_NOEXTENSIONS);
 }
--- a/libgimp/gimp.c
+++ b/libgimp/gimp.c
@ -391,9 +391,6 @@ gimp_main (const GimpPlugInInfo *info,
    gimp_base_init (&vtable);
  }

-  gimp_cpu_accel_set_use (gimp_use_cpu_accel ());
-
-
  /* initialize i18n support */

  setlocale (LC_ALL, "");
@ -1726,6 +1723,8 @@ gimp_config (GPConfig *config)
  if (config->app_name)
    g_set_application_name (config->app_name);

+  gimp_cpu_accel_set_use (gimp_use_cpu_accel ());
+
  if (_shm_ID != -1)
    {
 #if defined(USE_SYSV_SHM)
--- a/plug-ins/common/sel_gauss.c
+++ b/plug-ins/common/sel_gauss.c
@ -317,6 +317,271 @@ init_matrix (gdouble  radius,
    mat[dx] = c1 * exp ((dx * dx)/ c2);
 }

+
+#if defined(ARCH_X86) && defined(USE_MMX) && defined(__GNUC__)
+#define HAVE_ACCEL 1
+
+static ALWAYS_INLINE void
+matrixmult_mmx (const guchar  *src,
+                guchar        *dest,
+                gint           width,
+                gint           height,
+                const gdouble *mat,
+                gint           numrad,
+                gint           bytes,
+                gboolean       has_alpha,
+                gint           maxdelta,
+                gboolean       preview_mode)
+{
+  const gint       rowstride = width * bytes;
+  const long long  maxdelta4 = maxdelta * 0x0001000100010001ULL;
+  gushort         *imat;
+  gdouble          fsum, fscale;
+  gint             i, j, x, y, d;
+
+  g_assert (has_alpha ? (bytes == 4) : (bytes == 3 || bytes == 1));
+
+  imat = g_new (gushort, 2 * numrad + 3);
+
+  fsum = 0.0;
+  for (y = 1 - numrad; y < numrad; y++)
+    fsum += mat[ABS(y)];
+
+  /* Ensure that one pixel's product fits in 16bits,
+   * and that the sum fits in 32bits.
+   */
+  fscale = MIN (0x100 / mat[0], 0x1000 / fsum);
+  for (y = 0; y < numrad; y++)
+    imat[numrad - y] = imat[numrad + y] = mat[y] * fscale;
+
+  for (y = numrad; y < numrad + 3; y++)
+    imat[numrad + y] = 0;
+
+  for (y = 0; y < height; y++)
+    {
+      asm volatile (
+        "pxor  %%mm7, %%mm7 \n\t":
+      );
+
+      for (x = 0; x < width; x++)
+        {
+          guint r, g, b, fr, fg, fb;
+          gint  offset;
+          gint  dix;
+
+          r = g = b = fr = fg = fb = 0;
+
+          dix = bytes * (width * y + x);
+
+          if (has_alpha)
+            {
+              *(guint*) &dest[dix] = *(guint*) &src[dix];
+
+              if (!src[dix + 3])
+                continue;
+            }
+
+          asm volatile (
+            "movd         %0, %%mm6 \n\t"
+            "punpcklbw %%mm7, %%mm6 \n\t" /* center pixel */
+            :: "m"(src[dix])
+          );
+
+          offset = rowstride * (y - numrad) + bytes * (x - numrad);
+
+          if (bytes == 1)
+            {
+              asm volatile (
+                "pshufw $0, %%mm6, %%mm6 \n\t": /* center pixel x4 */
+              );
+              for (j = 1 - numrad; j < numrad; j++)
+                {
+                  const guchar *src_b;
+                  guint         rowsum  = 0;
+                  guint         rowfact = 0;
+
+                  offset += rowstride;
+
+                  if (y + j < 0 || y + j >= height)
+                    continue;
+
+                  src_b = src + offset - 3;
+
+                  asm volatile (
+                    "pxor  %%mm5, %%mm5 \n\t" /* row fact */
+                    "pxor  %%mm4, %%mm4 \n\t" /* row sum  */
+                    :
+                  );
+
+                  for (i = 1 - numrad; i < numrad; i += 4)
+                    {
+                      src_b += 4;
+                      if (x + i < 0 || x + i >= width)
+                        continue;
+
+                      asm volatile (
+                        "movd         %0, %%mm0 \n\t"
+                        "movq      %%mm6, %%mm1 \n\t"
+                        "punpcklbw %%mm7, %%mm0 \n\t" /* one pixel      */
+                        "psubusw   %%mm0, %%mm1 \n\t" /* diff           */
+                        "movq      %%mm0, %%mm2 \n\t"
+                        "psubusw   %%mm6, %%mm2 \n\t"
+                        "por       %%mm2, %%mm1 \n\t" /* abs diff       */
+                        "pcmpgtw      %1, %%mm1 \n\t" /* threshold      */
+                        "pandn        %2, %%mm1 \n\t" /* weight         */
+                        "pmullw    %%mm1, %%mm0 \n\t" /* pixel * weight */
+                        "paddusw   %%mm1, %%mm5 \n\t" /* fact           */
+                        "movq      %%mm0, %%mm2 \n\t"
+                        "punpcklwd %%mm7, %%mm0 \n\t"
+                        "punpckhwd %%mm7, %%mm2 \n\t"
+                        "paddd     %%mm0, %%mm4 \n\t"
+                        "paddd     %%mm2, %%mm4 \n\t" /* sum            */
+                        :: "m"(*src_b), "m"(maxdelta4), "m"(imat[numrad + i])
+                      );
+                    }
+
+                  asm volatile (
+                    "pshufw $0xb1, %%mm5, %%mm3 \n\t"
+                    "paddusw       %%mm3, %%mm5 \n\t"
+                    "pshufw $0x0e, %%mm4, %%mm2 \n\t"
+                    "pshufw $0x0e, %%mm5, %%mm3 \n\t"
+                    "paddd         %%mm2, %%mm4 \n\t"
+                    "paddusw       %%mm3, %%mm5 \n\t"
+                    "movd          %%mm4, %0    \n\t"
+                    "movd          %%mm5, %1    \n\t"
+                    :"=g"(rowsum), "=g"(rowfact)
+                  );
+                  d = imat[numrad + j];
+                  r += d * rowsum;
+                  fr += d * (gushort) rowfact;
+                }
+
+              dest[dix] = r / fr;
+            }
+          else
+            {
+              for (j = 1 - numrad; j < numrad; j++)
+                {
+                  const guchar *src_b;
+                  gushort       rf[4];
+                  guint         rr, rg, rb;
+
+                  offset += rowstride;
+                  if (y + j < 0 || y + j >= height)
+                    continue;
+
+                  src_b = src + offset;
+
+                  asm volatile (
+                    "pxor  %%mm5, %%mm5 \n\t" /* row fact   */
+                    "pxor  %%mm4, %%mm4 \n\t" /* row sum RG */
+                    "pxor  %%mm3, %%mm3 \n\t" /* row sum B  */
+                    :
+                  );
+
+                  for (i = 1 - numrad; i < numrad; i++)
+                    {
+                      src_b += bytes;
+                      if (x + i < 0 || x + i >= width)
+                        continue;
+
+                      if (has_alpha)
+                        asm volatile (
+                          "movd         %0, %%mm0 \n\t"
+                          "movq      %%mm6, %%mm1 \n\t"
+                          "punpcklbw %%mm7, %%mm0 \n\t" /* one pixel       */
+                          "psubusw   %%mm0, %%mm1 \n\t" /* diff            */
+                          "movq      %%mm0, %%mm2 \n\t"
+                          "psubusw   %%mm6, %%mm2 \n\t"
+                          "por       %%mm2, %%mm1 \n\t" /* abs diff        */
+                          "pcmpgtw      %1, %%mm1 \n\t" /* threshold       */
+                          "pshufw   $0, %2, %%mm2 \n\t" /* weight          */
+                          "pandn     %%mm2, %%mm1 \n\t"
+                          "pshufw $0xff, %%mm0, %%mm2 \n\t" /* alpha       */
+                          "psllw        $8, %%mm2 \n\t"
+                          "pmulhuw   %%mm2, %%mm1 \n\t" /* weight *= alpha */
+                          "pmullw    %%mm1, %%mm0 \n\t" /* pixel * weight  */
+                          "paddusw   %%mm1, %%mm5 \n\t" /* fact            */
+                          "movq      %%mm0, %%mm2 \n\t"
+                          "punpcklwd %%mm7, %%mm0 \n\t" /* RG              */
+                          "punpckhwd %%mm7, %%mm2 \n\t" /* B               */
+                          "paddd     %%mm0, %%mm4 \n\t"
+                          "paddd     %%mm2, %%mm3 \n\t"
+                          :: "m"(*src_b), "m"(maxdelta4), "m"(imat[numrad + i])
+                        );
+                      else
+                        asm volatile (
+                          "movd         %0, %%mm0 \n\t"
+                          "movq      %%mm6, %%mm1 \n\t"
+                          "punpcklbw %%mm7, %%mm0 \n\t" /* one pixel       */
+                          "psubusw   %%mm0, %%mm1 \n\t" /* diff            */
+                          "movq      %%mm0, %%mm2 \n\t"
+                          "psubusw   %%mm6, %%mm2 \n\t"
+                          "por       %%mm2, %%mm1 \n\t" /* abs diff        */
+                          "pcmpgtw      %1, %%mm1 \n\t" /* threshold       */
+                          "pshufw   $0, %2, %%mm2 \n\t" /* weight          */
+                          "pandn     %%mm2, %%mm1 \n\t"
+                          "pmullw    %%mm1, %%mm0 \n\t" /* pixel * weight  */
+                          "paddusw   %%mm1, %%mm5 \n\t" /* fact            */
+                          "movq      %%mm0, %%mm2 \n\t"
+                          "punpcklwd %%mm7, %%mm0 \n\t" /* RG              */
+                          "punpckhwd %%mm7, %%mm2 \n\t" /* B               */
+                          "paddd     %%mm0, %%mm4 \n\t"
+                          "paddd     %%mm2, %%mm3 \n\t"
+                          :: "m"(*src_b), "m"(maxdelta4), "m"(imat[numrad + i])
+                        );
+                    }
+
+                  asm volatile (
+                    "movd    %%mm4, %0 \n\t"
+                    "movd    %%mm3, %2 \n\t"
+                    "psrlq  $32, %%mm4 \n\t"
+                    "movq    %%mm5, %3 \n\t"
+                    "movd    %%mm4, %1 \n\t"
+                    :"=g"(rr), "=g"(rg), "=g"(rb), "=m"(*rf)
+                    ::"memory"
+                  );
+                  d = imat[numrad + j];
+                  r += d * rr;
+                  g += d * rg;
+                  b += d * rb;
+                  fr += d * rf[0];
+                  fg += d * rf[1];
+                  fb += d * rf[2];
+                }
+
+              if (has_alpha)
+                {
+                  if (fr)
+                    dest[dix+0] = r / fr;
+                  if (fg)
+                    dest[dix+1] = g / fg;
+                  if (fb)
+                    dest[dix+2] = b / fb;
+                }
+              else
+                {
+                  dest[dix+0] = r / fr;
+                  dest[dix+1] = g / fg;
+                  dest[dix+2] = b / fb;
+                }
+            }
+        }
+
+      if (!(y % 10) && !preview_mode)
+        {
+          asm volatile ("emms");
+          gimp_progress_update ((double)y / (double)height);
+        }
+    }
+
+  asm volatile ("emms");
+
+  g_free (imat);
+}
+#endif /* ARCH_X86 && USE_MMX && __GNUC__ */
+
+
 static ALWAYS_INLINE void
 matrixmult_int (const guchar  *src,
                guchar        *dest,
@ -331,10 +596,20 @@ matrixmult_int (const guchar  *src,
 {
  const gint  nb        = bytes - (has_alpha ? 1 : 0);
  const gint  rowstride = width * bytes;
-  gushort    *imat      = g_new (gushort, 2 * numrad);
+  gushort    *imat;
+  gdouble     fsum, fscale;
+  gint        i, j, b, x, y, d;

-  gdouble  fsum, fscale;
-  gint     i, j, b, x, y, d;
+#ifdef HAVE_ACCEL
+  GimpCpuAccelFlags cpu = gimp_cpu_accel_get_support ();
+
+  if ((has_alpha ? (bytes == 4) : (bytes == 3 || bytes == 1))
+      && (cpu & (GIMP_CPU_ACCEL_X86_MMXEXT | GIMP_CPU_ACCEL_X86_SSE)))
+    return matrixmult_mmx (src, dest, width, height, mat, numrad,
+                           bytes, has_alpha, maxdelta, preview_mode);
+#endif
+
+  imat = g_new (gushort, 2 * numrad);

  fsum = 0.0;
  for (y = 1 - numrad; y < numrad; y++)