From 45b2be990fed79bfb4c039c743d1a54d0e46f3da Mon Sep 17 00:00:00 2001
From: Helvetix Victorinox <Helvetix@src.gnome.org>
Date: Tue, 8 Jul 2003 23:15:16 +0000
Subject: [PATCH] I hate cvs.

Re-adding app/composite
---
 app/composite/Makefile.am                     |   39 +
 app/composite/gimp-composite-generic.c        | 1153 ++++++++
 app/composite/gimp-composite-generic.h        |   29 +
 app/composite/gimp-composite-mmx.c            | 2441 +++++++++++++++++
 app/composite/gimp-composite-mmx.h            |   51 +
 app/composite/gimp-composite-util.h           |   30 +
 app/composite/gimp-composite.c                |  172 ++
 app/composite/gimp-composite.h                |  182 ++
 app/composite/gimp-composite.html             |   82 +
 app/composite/make-gimp-composite-dispatch.py |  460 ++++
 app/composite/ns.py                           |  185 ++
 app/composite/tester.c                        |  466 ++++
 12 files changed, 5290 insertions(+)
 create mode 100644 app/composite/Makefile.am
 create mode 100644 app/composite/gimp-composite-generic.c
 create mode 100644 app/composite/gimp-composite-generic.h
 create mode 100644 app/composite/gimp-composite-mmx.c
 create mode 100644 app/composite/gimp-composite-mmx.h
 create mode 100644 app/composite/gimp-composite-util.h
 create mode 100644 app/composite/gimp-composite.c
 create mode 100644 app/composite/gimp-composite.h
 create mode 100644 app/composite/gimp-composite.html
 create mode 100755 app/composite/make-gimp-composite-dispatch.py
 create mode 100755 app/composite/ns.py
 create mode 100644 app/composite/tester.c

diff --git a/app/composite/Makefile.am b/app/composite/Makefile.am
new file mode 100644
index 0000000000..09e9a79c17
--- /dev/null
+++ b/app/composite/Makefile.am
@@ -0,0 +1,39 @@
+## Process this file with automake to produce Makefile.in
+
+noinst_LIBRARIES = libgimpcomposite.a
+
+libgimpcomposite_a_sources = \
+	gimp-composite.c \
+	gimp-composite-generic.c \
+	gimp-composite-generic.h \
+	gimp-composite.h \
+	gimp-composite-mmx.c \
+	gimp-composite-mmx.h \
+	gimp-composite-util.h
+
+libgimpcomposite_a_built_sources = gimp-composite-dispatch.c
+
+libgimpcomposite_a_SOURCES = $(libgimpcomposite_a_built_sources) $(libgimpcomposite_a_sources)
+
+INCLUDES = \
+	-I$(top_srcdir)/app	\
+	-I$(top_srcdir)/app/composite	\
+	$(GLIB_CFLAGS)		\
+	-I$(includedir)
+
+AM_CPPFLAGS = \
+	-DG_LOG_DOMAIN=\"Gimp-Compositing\"	\
+	@GIMP_THREAD_FLAGS@ 			\
+	@GIMP_MP_FLAGS@
+
+AM_CCASFLAGS = \
+	-I$(top_builddir)	\
+	-I$(top_srcdir) 	\
+	-I$(top_srcdir)/app
+
+EXTRA_DIST = makefile.msc
+
+gimp-composite.c: gimp-composite-dispatch.c
+
+gimp-composite-dispatch.c: gimp-composite-generic.o make-gimp-composite-dispatch.py
+	./make-gimp-composite-dispatch.py gimp-composite-generic.o > gimp-composite-dispatch.c
diff --git a/app/composite/gimp-composite-generic.c b/app/composite/gimp-composite-generic.c
new file mode 100644
index 0000000000..69ffed84d9
--- /dev/null
+++ b/app/composite/gimp-composite-generic.c
@@ -0,0 +1,1153 @@
+/* The GIMP -- an image manipulation program
+ * Copyright (C) 1995 Spencer Kimball and Peter Mattis
+ *
+ * -*- mode: c tab-width: 2; -*-
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/*
+ * This file is supposed to contain the generic (read: C) implementation
+ * of the pixel-fiddling paint-functions.
+ */
+
+#include <string.h>
+
+#include "glib/grand.h"
+#include "glib/gtypes.h"
+
+#include "libgimpcolor/gimpcolortypes.h"
+#include "libgimpcolor/gimpcolorspace.h"
+
+#include "gimp-composite.h"
+
+#define OPAQUE_OPACITY 255
+#define TRANSPARENT_OPACITY 0
+
+#define INT_MULT(a,b,t)  ((t) = (a) * (b) + 0x80, ((((t) >> 8) + (t)) >> 8))
+
+/* This version of INT_MULT3 is very fast, but suffers from some
+   slight roundoff errors.  It returns the correct result 99.987
+   percent of the time */
+#define INT_MULT3(a,b,c,t)  ((t) = (a) * (b) * (c)+ 0x7F5B, ((((t) >> 7) + (t)) >> 16))
+/*
+  This version of INT_MULT3 always gives the correct result, but runs at
+  approximately one third the speed. */
+/*  #define INT_MULT3(a,b,c,t) (((a) * (b) * (c)+ 32512) / 65025.0)
+ */
+
+#define INT_BLEND(a,b,alpha,tmp)  (INT_MULT((a)-(b), alpha, tmp) + (b))
+
+#define RANDOM_TABLE_SIZE  4096
+
+/* A drawable has an alpha channel if it contains either 4 or 2 bytes of data
+ * aka GRAYA and RGBA and thus the macro below works. This will have
+ * to change if we support bigger formats. We'll do it so for now because
+ * masking is always cheaper than passing parameters over the stack.      */
+/* FIXME: Move to a global place */
+
+#define HAS_ALPHA(bytes) (~bytes & 1)
+
+
+static guchar add_lut[511];
+static gint32 random_table[RANDOM_TABLE_SIZE];
+
+/*
+ *
+ * Pixel format type conversion
+ *
+ * XXX This implementation will not work for >8 bit colours.
+ * XXX This implementation is totally wrong.
+ */
+void
+gimp_composite_convert_any_any_any_generic(GimpCompositeContext *ctx)
+{
+  int i;
+  int j;
+  char *D = ctx->D;
+  char *A = ctx->A;
+  int bpp_A = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  int bpp_D = gimp_composite_pixel_bpp[ctx->pixelformat_D];
+
+  for (i = 0; i < ctx->n_pixels; i++) {
+    for (j = 0; j < bpp_A; j++) {
+      D[j] = A[j];
+    }
+    D[j] = GIMP_COMPOSITE_ALPHA_OPAQUE;
+    A += bpp_A;
+    D += bpp_D;
+  }
+}
+
+void
+gimp_composite_color_any_any_any_generic(guchar * dest, const guchar * color, guint w, guint bytes)
+{
+  /* dest % bytes and color % bytes must be 0 or we will crash 
+     when bytes = 2 or 4.
+     Is this safe to assume?  Lets find out.
+     This is 4-7X as fast as the simple version.
+   */
+
+#if defined(sparc) || defined(__sparc__)
+  guchar c0, c1, c2, c3;
+#else
+  guchar c0, c1, c2;
+  guint32 *longd, longc;
+  guint16 *shortd, shortc;
+#endif
+
+  switch (bytes)
+    {
+    case 1:
+      memset(dest, *color, w);
+      break;
+
+    case 2:
+#if defined(sparc) || defined(__sparc__)
+      c0 = color[0];
+      c1 = color[1];
+      while (w--)
+        {
+          dest[0] = c0;
+          dest[1] = c1;
+          dest += 2;
+        }
+#else
+      shortc = ((guint16 *) color)[0];
+      shortd = (guint16 *) dest;
+      while (w--)
+        {
+          *shortd = shortc;
+          shortd++;
+        }
+#endif /* sparc || __sparc__ */
+      break;
+
+    case 3:
+      c0 = color[0];
+      c1 = color[1];
+      c2 = color[2];
+      while (w--)
+        {
+          dest[0] = c0;
+          dest[1] = c1;
+          dest[2] = c2;
+          dest += 3;
+        }
+      break;
+
+    case 4:
+#if defined(sparc) || defined(__sparc__)
+      c0 = color[0];
+      c1 = color[1];
+      c2 = color[2];
+      c3 = color[3];
+      while (w--)
+        {
+          dest[0] = c0;
+          dest[1] = c1;
+          dest[2] = c2;
+          dest[3] = c3;
+          dest += 4;
+        }
+#else
+      longc = ((guint32 *) color)[0];
+      longd = (guint32 *) dest;
+      while (w--)
+        {
+          *longd = longc;
+          longd++;
+        }
+#endif /* sparc || __sparc__ */
+      break;
+
+    default:
+      while (w--)
+        {
+          memcpy(dest, color, bytes);
+          dest += bytes;
+        }
+    }
+}
+
+void
+gimp_composite_blend_any_any_any_generic(GimpCompositeContext *ctx)
+{
+  guchar *src1 = ctx->A;
+  guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guchar blend = ctx->blend.blend;
+  guint bytes = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint w = ctx->n_pixels;
+  guint b;
+  const guchar blend2 = (255 - blend);
+
+  while (w--)
+    {
+      for (b = 0; b < bytes; b++)
+        dest[b] = (src1[b] * blend2 + src2[b] * blend) / 255;
+
+      src1 += bytes;
+      src2 += bytes;
+      dest += bytes;
+    }
+}
+
+
+#if 0
+void
+gimp_composite_shade_generic(const guchar *src, guchar *dest, const guchar *col, guchar blend, guint w, guint bytes, guint has_alpha)
+{
+  const guchar blend2 = (255 - blend);
+  const guint alpha = (has_alpha) ? bytes - 1 : bytes;
+  guint b;
+
+  while (w--)
+    {
+      for (b = 0; b < alpha; b++)
+        dest[b] = (src[b] * blend2 + col[b] * blend) / 255;
+
+      if (has_alpha)
+        dest[alpha] = src[alpha];       /* alpha channel */
+
+      src += bytes;
+      dest += bytes;
+    }
+}
+#endif
+
+void
+gimp_composite_darken_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  guchar s1, s2;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          s1 = src1[b];
+          s2 = src2[b];
+          dest[b] = (s1 < s2) ? s1 : s2;
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+void
+gimp_composite_lighten_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  guchar s1, s2;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          s1 = src1[b];
+          s2 = src2[b];
+          dest[b] = (s1 < s2) ? s2 : s1;
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_hue_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  guint r1, g1, b1;
+  guint r2, g2, b2;
+
+  /*  assumes inputs are only 4 byte RGBA pixels  */
+  while (length--)
+    {
+      r1 = src1[0];
+      g1 = src1[1];
+      b1 = src1[2];
+      r2 = src2[0];
+      g2 = src2[1];
+      b2 = src2[2];
+      gimp_rgb_to_hsv_int(&r1, &g1, &b1);
+      gimp_rgb_to_hsv_int(&r2, &g2, &b2);
+
+      r1 = r2;
+
+      /*  set the destination  */
+      gimp_hsv_to_rgb_int(&r1, &g1, &b1);
+
+      dest[0] = r1;
+      dest[1] = g1;
+      dest[2] = b1;
+
+      if (has_alpha1 && has_alpha2)
+        dest[3] = MIN(src1[3], src2[3]);
+      else if (has_alpha2)
+        dest[3] = src2[3];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_saturation_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  guint r1, g1, b1;
+  guint r2, g2, b2;
+
+  /*  assumes inputs are only 4 byte RGBA pixels  */
+  while (length--)
+    {
+      r1 = src1[0];
+      g1 = src1[1];
+      b1 = src1[2];
+      r2 = src2[0];
+      g2 = src2[1];
+      b2 = src2[2];
+      gimp_rgb_to_hsv_int(&r1, &g1, &b1);
+      gimp_rgb_to_hsv_int(&r2, &g2, &b2);
+
+      g1 = g2;
+
+      /*  set the destination  */
+      gimp_hsv_to_rgb_int(&r1, &g1, &b1);
+
+      dest[0] = r1;
+      dest[1] = g1;
+      dest[2] = b1;
+
+      if (has_alpha1 && has_alpha2)
+        dest[3] = MIN(src1[3], src2[3]);
+      else if (has_alpha2)
+        dest[3] = src2[3];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_value_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  guint r1, g1, b1;
+  guint r2, g2, b2;
+
+  /*  assumes inputs are only 4 byte RGBA pixels  */
+  while (length--)
+    {
+      r1 = src1[0];
+      g1 = src1[1];
+      b1 = src1[2];
+      r2 = src2[0];
+      g2 = src2[1];
+      b2 = src2[2];
+      gimp_rgb_to_hsv_int(&r1, &g1, &b1);
+      gimp_rgb_to_hsv_int(&r2, &g2, &b2);
+
+      b1 = b2;
+
+      /*  set the destination  */
+      gimp_hsv_to_rgb_int(&r1, &g1, &b1);
+
+      dest[0] = r1;
+      dest[1] = g1;
+      dest[2] = b1;
+
+      if (has_alpha1 && has_alpha2)
+        dest[3] = MIN(src1[3], src2[3]);
+      else if (has_alpha2)
+        dest[3] = src2[3];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_color_only_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  guint r1, g1, b1;
+  guint r2, g2, b2;
+
+  /*  assumes inputs are only 4 byte RGBA pixels  */
+  while (length--)
+    {
+      r1 = src1[0];
+      g1 = src1[1];
+      b1 = src1[2];
+      r2 = src2[0];
+      g2 = src2[1];
+      b2 = src2[2];
+      gimp_rgb_to_hls_int(&r1, &g1, &b1);
+      gimp_rgb_to_hls_int(&r2, &g2, &b2);
+
+      /*  transfer hue and saturation to the source pixel  */
+      r1 = r2;
+      b1 = b2;
+
+      /*  set the destination  */
+      gimp_hls_to_rgb_int(&r1, &g1, &b1);
+
+      dest[0] = r1;
+      dest[1] = g1;
+      dest[2] = b1;
+
+      if (has_alpha1 && has_alpha2)
+        dest[3] = MIN(src1[3], src2[3]);
+      else if (has_alpha2)
+        dest[3] = src2[3];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+void
+gimp_composite_multiply_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmp;
+
+  if (has_alpha1 && has_alpha2) {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          dest[b] = INT_MULT(src1[b], src2[b], tmp);
+
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+
+        src1 += bytes1;
+        src2 += bytes2;
+        dest += bytes2;
+      }
+  } else if (has_alpha2) {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          dest[b] = INT_MULT(src1[b], src2[b], tmp);
+        
+        dest[alpha] = src2[alpha];
+        
+        src1 += bytes1;
+        src2 += bytes2;
+        dest += bytes2;
+      }
+  } else {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          dest[b] = INT_MULT(src1[b], src2[b], tmp);
+        
+        src1 += bytes1;
+        src2 += bytes2;
+        dest += bytes2;
+      }
+  }
+}
+
+
+void
+gimp_composite_divide_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, result;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          result = ((src1[b] * 256) / (1 + src2[b]));
+          dest[b] = MIN(result, 255);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_screen_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmp;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        dest[b] = 255 - INT_MULT((255 - src1[b]), (255 - src2[b]), tmp);
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_overlay_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmp;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          dest[b] = INT_MULT(src1[b], src1[b] + INT_MULT(2 * src2[b], 255 - src1[b], tmp), tmp);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_dodge_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmp;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          tmp = src1[b] << 8;
+          tmp /= 256 - src2[b];
+          dest[b] = (guchar) CLAMP(tmp, 0, 255);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_burn_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+
+  /* FIXME: Is the burn effect supposed to be dependent on the sign of this
+   * temporary variable? */
+  gint tmp;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          tmp = (255 - src1[b]) << 8;
+          tmp /= src2[b] + 1;
+          dest[b] = (guchar) CLAMP(255 - tmp, 0, 255);
+        }
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_hardlight_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmp;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          if (src2[b] > 128)
+            {
+              tmp = ((gint) 255 - src1[b]) * ((gint) 255 - ((src2[b] - 128) << 1));
+              dest[b] = (guchar) CLAMP(255 - (tmp >> 8), 0, 255);
+            }
+          else
+            {
+              tmp = (gint) src1[b] * ((gint) src2[b] << 1);
+              dest[b] = (guchar) CLAMP(tmp >> 8, 0, 255);
+            }
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_softlight_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = gimp_composite_pixel_alphap[ctx->pixelformat_A];
+  const guint has_alpha2 = gimp_composite_pixel_alphap[ctx->pixelformat_B];
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b, tmpS, tmpM, tmp1, tmp2, tmp3;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          /* Mix multiply and screen */
+          tmpM = INT_MULT(src1[b], src2[b], tmpM);
+          tmpS = 255 - INT_MULT((255 - src1[b]), (255 - src2[b]), tmp1);
+          dest[b] = INT_MULT((255 - src1[b]), tmpM, tmp2) + INT_MULT(src1[b], tmpS, tmp3);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_grain_extract_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = gimp_composite_pixel_alphap[ctx->pixelformat_A];
+  const guint has_alpha2 = gimp_composite_pixel_alphap[ctx->pixelformat_B];
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  gint diff;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          diff = src1[b] - src2[b] + 128;
+          dest[b] = (guchar) CLAMP(diff, 0, 255);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_grain_merge_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = HAS_ALPHA(bytes1);
+  const guint has_alpha2 = HAS_ALPHA(bytes2);
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  gint sum;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          /* Add, re-center and clip. */
+          sum = src1[b] + src2[b] - 128;
+          dest[b] = (guchar) CLAMP(sum, 0, 255);
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+void
+gimp_composite_addition_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *A = ctx->A;
+  const guchar *B = ctx->B;
+  guchar *D = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = gimp_composite_pixel_alphap[ctx->pixelformat_A];
+  const guint has_alpha2 = gimp_composite_pixel_alphap[ctx->pixelformat_B];
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+
+  if (has_alpha1 && has_alpha2) {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          D[b] = add_lut[A[b] + B[b]];
+        D[alpha] = MIN(A[alpha], B[alpha]);
+        A += bytes1;
+        B += bytes2;
+        D += bytes2;
+      }
+  }  else if (has_alpha2) {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          D[b] = add_lut[A[b] + B[b]];
+        D[alpha] = B[alpha];
+        A += bytes1;
+        B += bytes2;
+        D += bytes2;
+      }
+  } else {
+    while (length--)
+      {
+        for (b = 0; b < alpha; b++)
+          D[b] = add_lut[A[b] + B[b]];
+        A += bytes1;
+        B += bytes2;
+        D += bytes2;
+      }
+  }
+}
+
+
+void
+gimp_composite_subtract_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = gimp_composite_pixel_alphap[ctx->pixelformat_A];
+  const guint has_alpha2 = gimp_composite_pixel_alphap[ctx->pixelformat_B];
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  gint diff;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          diff = src1[b] - src2[b];
+          dest[b] = (diff < 0) ? 0 : diff;
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_difference_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  const guchar *src2 = ctx->B;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  guint bytes2 = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  const guint has_alpha1 = gimp_composite_pixel_alphap[ctx->pixelformat_A];
+  const guint has_alpha2 = gimp_composite_pixel_alphap[ctx->pixelformat_B];
+  const guint alpha = (has_alpha1 || has_alpha2) ? MAX(bytes1, bytes2) - 1 : bytes1;
+  guint b;
+  gint diff;
+
+  while (length--)
+    {
+      for (b = 0; b < alpha; b++)
+        {
+          diff = src1[b] - src2[b];
+          dest[b] = (diff < 0) ? -diff : diff;
+        }
+
+      if (has_alpha1 && has_alpha2)
+        dest[alpha] = MIN(src1[alpha], src2[alpha]);
+      else if (has_alpha2)
+        dest[alpha] = src2[alpha];
+
+      src1 += bytes1;
+      src2 += bytes2;
+      dest += bytes2;
+    }
+}
+
+
+void
+gimp_composite_dissolve_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  GRand *gr;
+  gint alpha;
+  gint b;
+  gint combined_opacity;
+  gint db;                      /* dest bpp; assigned below, after D may gain an alpha channel */
+  gint length = ctx->n_pixels;
+  gint opacity = ctx->dissolve.opacity;
+  gint sb = gimp_composite_pixel_bpp[ctx->pixelformat_B];
+  gint x = ctx->dissolve.x;
+  gint y = ctx->dissolve.y;
+  guchar *mask = ctx->M;
+  gint32 rand_val;
+  guchar *dest = ctx->D;
+  guchar *src = ctx->B;
+  guint has_alpha = gimp_composite_pixel_alphap[ctx->pixelformat_B]; /* alphap[] is the has-alpha predicate; alpha[] maps to the alpha format */
+
+  /*
+   * if destination does not have an alpha channel, add one to it.
+   */
+  if (!gimp_composite_pixel_alphap[ctx->pixelformat_D]) {
+    ctx->pixelformat_D = gimp_composite_pixel_alpha[ctx->pixelformat_D];
+    /*gimp_composite_convert_any_any_any_generic(ctx);*/
+  }
+  db = gimp_composite_pixel_bpp[ctx->pixelformat_D]; /* dest bpp must track D's (possibly promoted) format, not B's */
+  gr = g_rand_new_with_seed(random_table[y % RANDOM_TABLE_SIZE]);
+
+  for (b = 0; b < x; b ++)
+    g_rand_int (gr);
+
+  alpha = db - 1;
+
+  /*
+   * XXX NB: The mask is assumed to be a linear array of bytes, no
+   * accounting for the mask being of a particular pixel format.
+   */
+  while (length--)
+    {
+      /*  preserve the intensity values  */
+      for (b = 0; b < alpha; b++)
+        dest[b] = src[b];
+
+      /*  dissolve if random value is > opacity  */
+      rand_val = g_rand_int_range(gr, 0, 256);
+
+      if (mask) {
+        if (has_alpha)
+          combined_opacity = opacity * src[alpha] * (*mask) / (255 * 255);
+        else
+          combined_opacity = opacity * (*mask) / 255;
+ 
+        mask++;
+      } else {
+        if (has_alpha)
+          combined_opacity = opacity * src[alpha] / 255;
+        else
+          combined_opacity = opacity;
+      }
+ 
+      dest[alpha] = (rand_val > combined_opacity) ? 0 : OPAQUE_OPACITY;
+
+      dest += db;
+      src += sb;
+    }
+
+  g_rand_free(gr);
+
+  ctx->combine = gimp_composite_pixel_alphap[ctx->pixelformat_A] ? COMBINE_INTEN_A_INTEN_A : COMBINE_INTEN_INTEN_A;
+}
+
+void
+gimp_composite_replace_any_any_any_generic(GimpCompositeContext *ctx)
+{
+  ctx->D = ctx->B;
+  ctx->combine = REPLACE_INTEN;
+}
+
+
+void
+gimp_composite_swap_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  guint length;
+  guchar *src = ctx->A;
+  guchar *dest = ctx->B;
+  guint bytes1 = gimp_composite_pixel_bpp[ctx->pixelformat_A];
+  length = ctx->n_pixels * bytes1;
+
+  while (length--)
+    {
+      guchar t = *src;  /* plain temp swap: the XOR trick zeroes the byte when src == dest */
+      *src = *dest;
+      *dest = t;
+      src++;
+      dest++;
+    }
+}
+
+void
+gimp_composite_normal_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  ctx->D = ctx->B;
+}
+
+
+void
+gimp_composite_normal_rgba8_any_any_generic(GimpCompositeContext * ctx)
+{
+  ctx->D = ctx->B;
+}
+
+
+void
+gimp_composite_erase_rgba8_any_any_generic(GimpCompositeContext *ctx)
+{
+  ctx->D = ctx->B;
+  ctx->combine = (gimp_composite_pixel_alphap[ctx->pixelformat_A] && gimp_composite_pixel_alphap[ctx->pixelformat_B]) ? ERASE_INTEN : 0;
+}
+
+void
+gimp_composite_anti_erase_any_any_any_generic(GimpCompositeContext *ctx)
+{
+  ctx->D = ctx->B;
+  ctx->combine = (gimp_composite_pixel_alphap[ctx->pixelformat_A] && gimp_composite_pixel_alphap[ctx->pixelformat_B]) ? ANTI_ERASE_INTEN : 0;
+}
+
+void
+gimp_composite_color_erase_any_any_any_generic(GimpCompositeContext *ctx)
+{
+  ctx->D = ctx->B;
+  ctx->combine = (gimp_composite_pixel_alphap[ctx->pixelformat_A] && gimp_composite_pixel_alphap[ctx->pixelformat_B]) ? COLOR_ERASE_INTEN : 0;
+}
+
+
+void
+gimp_composite_scale_any_any_any_generic(GimpCompositeContext * ctx)
+{
+  const guchar *src1 = ctx->A;
+  guchar *dest = ctx->D;
+  guint length = ctx->n_pixels;
+  guint bytes1 = (ctx->pixelformat_A == GIMP_PIXELFORMAT_V8) ? 1
+    : (ctx->pixelformat_A == GIMP_PIXELFORMAT_VA8) ? 2
+    : (ctx->pixelformat_A == GIMP_PIXELFORMAT_RGB8) ? 3 : (ctx->pixelformat_A == GIMP_PIXELFORMAT_RGBA8) ? 4 : 0;
+  gint tmp;
+
+  length = ctx->n_pixels * bytes1;
+
+  while (length--)
+    {
+      *dest++ = (guchar) INT_MULT(*src1, ctx->scale.scale, tmp);
+      src1++;
+    }
+}
+
+void
+gimp_composite_generic_init()
+{
+  guint i;
+  GRand *gr;
+#define RANDOM_SEED 314159265
+
+  /*  generate a table of random seeds  */
+  gr = g_rand_new_with_seed(RANDOM_SEED);
+
+  for (i = 0; i < RANDOM_TABLE_SIZE; i++)
+    random_table[i] = g_rand_int(gr);
+
+  for (i = 0; i < 256; i++)
+    add_lut[i] = i;
+
+  for (i = 256; i <= 510; i++)
+    add_lut[i] = 255;
+}
diff --git a/app/composite/gimp-composite-generic.h b/app/composite/gimp-composite-generic.h
new file mode 100644
index 0000000000..ac9baac73d
--- /dev/null
+++ b/app/composite/gimp-composite-generic.h
@@ -0,0 +1,29 @@
+
+extern void gimp_composite_color_generic (GimpCompositeContext *);
+extern void gimp_composite_blend_pixels (GimpCompositeContext *);
+extern void gimp_composite_shade_generic (GimpCompositeContext *);
+extern void gimp_composite_darken_generic(GimpCompositeContext *);
+extern void gimp_composite_lighten_generic(GimpCompositeContext *);
+extern void gimp_composite_hue_only_generic (GimpCompositeContext *);
+extern void gimp_composite_saturation_generic (GimpCompositeContext *);
+extern void gimp_composite_value_generic(GimpCompositeContext *);
+extern void gimp_composite_color_only_generic(GimpCompositeContext *);
+extern void gimp_composite_multiply_generic(GimpCompositeContext *);
+extern void gimp_composite_divide_generic(GimpCompositeContext *);
+extern void gimp_composite_screen_generic(GimpCompositeContext *);
+extern void gimp_composite_overlay_generic(GimpCompositeContext *);
+extern void gimp_composite_dodge_generic(GimpCompositeContext *);
+extern void gimp_composite_burn_generic (GimpCompositeContext *);
+extern void gimp_composite_hardlight_generic(GimpCompositeContext *);
+extern void gimp_composite_softlight_generic(GimpCompositeContext *);
+extern void gimp_composite_grain_extract_generic(GimpCompositeContext *);
+extern void gimp_composite_grain_merge_generic(GimpCompositeContext *);
+extern void gimp_composite_addition_generic(GimpCompositeContext *);
+extern void gimp_composite_subtract_generic(GimpCompositeContext *);
+extern void gimp_composite_difference_generic(GimpCompositeContext *);
+extern void gimp_composite_dissolve_generic(GimpCompositeContext *);
+extern void gimp_composite_replace_generic(GimpCompositeContext *);
+extern void gimp_composite_generic_init(GimpCompositeContext *);
+extern void gimp_composite_swap_generic (GimpCompositeContext *);
+extern void gimp_composite_scale_generic (GimpCompositeContext *);
+
diff --git a/app/composite/gimp-composite-mmx.c b/app/composite/gimp-composite-mmx.c
new file mode 100644
index 0000000000..d43b7143e3
--- /dev/null
+++ b/app/composite/gimp-composite-mmx.c
@@ -0,0 +1,2441 @@
+/* The GIMP -- an image manipulation program
+ * Copyright (C) 1995 Spencer Kimball and Peter Mattis
+ *
+ * -*- mode: c tab-width: 2; -*-
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Much of the content of this file are derivative works of David
+ * Monniaux which are Copyright (C) 1999, 2001 David Monniaux
+ * Tip-o-the-hat to David for pioneering this effort.
+ *
+ * All of these functions use the mmx registers and expect them to
+ * remain intact across multiple asm() constructs.  This may not work
+ * in the future, if the compiler allocates mmx registers for its own
+ * use. XXX
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+
+#include "gimp-composite.h"
+#include "gimp-composite-mmx.h"
+
+/* Force the pure-MMX emulations of pminub/pmaxub; the one-instruction
+ * forms require the SSE media extensions to the MMX instruction set. */
+#undef USE_SSE
+
+#ifdef USE_SSE
+/* dst = min/max(src, dst) per unsigned byte; tmp unused in SSE forms. */
+#define pminub(src,dst,tmp)  "pminub " "%%" #src ", %%" #dst
+#define pmaxub(src,dst,tmp)  "pmaxub " "%%" #src ", %%" #dst
+#else
+/* dst = min(src, dst) per unsigned byte; clobbers tmp.
+ * Built from saturating subtraction: dst - max(dst - src, 0). */
+#define pminub(src,dst,tmp)  "movq %%" #dst ", %%" #tmp ";" "psubusb %%" #src ", %%" #tmp ";" "psubb %%" #tmp ", %%" #dst
+
+/* b = max(a, b) per unsigned byte; clobbers tmp. */
+#define pmaxub(a,b,tmp)      "movq %%" #a ", %%" #tmp ";" "psubusb %%" #b ", %%" #tmp ";" "paddb %%" #tmp ", %%" #b
+#endif
+
+
+/*
+ *  "\t" pdivwX(mm4,mm5,mm7) "\n"
+ * "\tpsrlq     $32,%%mm4\n"
+ * "\tpsrlq     $32,%%mm5\n"
+ * "\t" pdivwX(mm4,mm5,mm5) "\n"
+ * "\tpsllq     $32,%%mm5\n"
+ * "\tpor       %%mm5,%%mm7\n"
+ */
+/*
+ * Clobbers eax, ecx, and edx.
+ */
+/*
+ * Double-word divide.  Adjusted for subsequent unsigned packing
+ * (high-order bit of each word is cleared)
+ */
+/* quotient = dividend / divisor, word-wise over the low 32 bits of
+ * each mmx operand, via the x86 divw instruction (hence the eax/ecx/
+ * edx clobbers noted above).  The btr $15 steps clear the high bit of
+ * each result word so a later *unsigned* pack cannot saturate. */
+#define pdivwX(dividend,divisor,quotient) "movd %%" #dividend ",%%eax; " \
+                                          "movd %%" #divisor  ",%%ecx; " \
+                                          "xorl %%edx,%%edx; "           \
+                                          "divw %%cx; "                  \
+                                          "roll $16, %%eax; "            \
+                                          "roll $16, %%ecx; "            \
+                                          "xorl %%edx,%%edx; "           \
+                                          "divw %%cx; "                  \
+                                          "btr $15, %%eax; "             \
+                                          "roll $16, %%eax; "            \
+                                          "btr $15, %%eax; "             \
+                                          "movd %%eax,%%" #quotient ";"
+
+/*
+ * Quadword divide.  No adjustment for subsequent unsigned packing
+ * (high-order bit of each word is left alone)
+ */
+/* Full-quadword word-wise divide: low 32 bits go to quotient, then
+ * both operands are shifted down 32 and the high half is computed
+ * into divisor and OR-ed into quotient's upper half.  Both mmx input
+ * operands are destroyed; eax/ecx/edx are clobbered. */
+#define pdivwqX(dividend,divisor,quotient) "movd   %%" #dividend ",%%eax; " \
+                                          "movd   %%" #divisor  ",%%ecx; " \
+                                          "xorl   %%edx,%%edx; "           \
+                                          "divw   %%cx; "                  \
+                                          "roll   $16, %%eax; "            \
+                                          "roll   $16, %%ecx; "            \
+                                          "xorl   %%edx,%%edx; "           \
+                                          "divw   %%cx; "                  \
+                                          "roll   $16, %%eax; "            \
+                                          "movd   %%eax,%%" #quotient "; " \
+                                          "psrlq $32,%%" #dividend ";"     \
+                                          "psrlq $32,%%" #divisor ";"      \
+                                          "movd   %%" #dividend ",%%eax; " \
+                                          "movd   %%" #divisor  ",%%ecx; " \
+                                          "xorl   %%edx,%%edx; "           \
+                                          "divw   %%cx; "                  \
+                                          "roll   $16, %%eax; "            \
+                                          "roll   $16, %%ecx; "            \
+                                          "xorl   %%edx,%%edx; "           \
+                                          "divw   %%cx; "                  \
+                                          "roll   $16, %%eax; "            \
+                                          "movd   %%eax,%%" #divisor ";"   \
+                                          "psllq  $32,%%" #divisor ";"     \
+                                          "por    %%" #divisor ",%%" #quotient ";"
+   
+/*
+ * Quadword divide.  Adjusted for subsequent unsigned packing
+ * (high-order bit of each word is cleared)
+ */
+/* BUGFIX(review): the second pdivwX invocation previously targeted
+ * `quotient`; its trailing movd zero-extends into the whole register
+ * and so destroyed the low-half result computed by the first call
+ * (leaving the high-half quotient duplicated into both halves).  The
+ * high half is now computed into `divisor` and OR-ed into place,
+ * mirroring the structure of pdivwqX above.  Both mmx input operands
+ * are destroyed; eax/ecx/edx are clobbered. */
+#define pdivwuqX(dividend,divisor,quotient) \
+                                          pdivwX(dividend,divisor,quotient) \
+                                            "psrlq  $32,%%" #dividend ";"   \
+                                            "psrlq  $32,%%" #divisor ";"    \
+                                          pdivwX(dividend,divisor,divisor)  \
+                                            "psllq  $32,%%" #divisor ";"    \
+                                            "por    %%" #divisor ",%%" #quotient ";"
+
+/*
+ * MMX equivalent of the INT_MULT() macro, word-wise across a quadword:
+ *
+ *   opr2 = INT_MULT(opr1, opr2)
+ *
+ * i.e. t = opr1 * opr2 + w128;  opr2 = (t + (t >> 8)) >> 8,
+ * where w128 holds 0x0080 in every word (rounding term).
+ * opr1 is clobbered; the result is left in opr2.
+ */
+#define pmulwX(opr1,opr2,w128) \
+                  "\tpmullw    %%"#opr2", %%"#opr1"; " \
+                  "\tpaddw     %%"#w128", %%"#opr1"; " \
+                  "\tmovq      %%"#opr1", %%"#opr2"; " \
+                  "\tpsrlw     $8,        %%"#opr2"; " \
+                  "\tpaddw     %%"#opr1", %%"#opr2"; " \
+                  "\tpsrlw     $8,        %%"#opr2"\n"
+ 
+
+ 
+
+/* Emit an asm() statement, echoing its text through debug() first. */
+#define ASM(x) debug(#x); asm(x)
+
+/* Debug hook; currently compiled out entirely. */
+#define DEBUG(x) 
+
+
+/*
+ * Debugging aid: dump all eight mmx registers to stdout as pairs of
+ * 32-bit hex halves.  Must run while the mmx state is live (i.e.
+ * before emms); does not restore or modify any mmx register.
+ */
+void
+debug_display_mmx()
+{
+/* mask32(x): low 32 bits of x */
+#define mask32(x) ((x)& (unsigned long long) 0xFFFFFFFF)
+/* print64(reg): store mmx register `reg` to a local and print it */
+#define print64(reg) { unsigned long long reg; asm("movq %%" #reg ",%0" : "=m" (reg)); printf(#reg"=%08llx %08llx", mask32(reg>>32), mask32(reg)); }
+  printf("--------------------------------------------\n");
+  print64(mm0); printf("  "); print64(mm1); printf("\n");
+  print64(mm2); printf("  "); print64(mm3); printf("\n");
+  print64(mm4); printf("  "); print64(mm5); printf("\n");
+  print64(mm6); printf("  "); print64(mm7); printf("\n");
+  printf("--------------------------------------------\n");
+}
+
+
+/* 64-bit packed constants used as "m" operands by the asm below.
+ * NOTE(review): declared as two-element unsigned long arrays on the
+ * assumption that unsigned long is 32 bits (8 bytes total) — only
+ * true on ILP32 targets; confirm before building elsewhere.  The
+ * rgba8_* masks place alpha in the top byte of each 32-bit pixel. */
+unsigned long rgba8_alpha_mask[2] = { 0xFF000000, 0xFF000000 };
+unsigned long rgba8_b1[2] = { 0x01010101, 0x01010101 };
+unsigned long rgba8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
+unsigned long rgba8_w1[2] = { 0x00010001, 0x00010001 };
+unsigned long rgba8_w128[2] = { 0x00800080, 0x00800080 };
+unsigned long rgba8_w256[2] = { 0x01000100, 0x01000100 };
+unsigned long rgba8_w255[2] = { 0X00FF00FF, 0X00FF00FF };
+
+/* va8: value+alpha pixels — presumably alpha occupies the high byte
+ * of each 16-bit pair, per the 0xFF00 mask pattern; verify. */
+unsigned long va8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00 };
+unsigned long va8_b255[2] = { 0xFFFFFFFF, 0xFFFFFFFF };
+unsigned long va8_w1[2] = { 0x00010001, 0x00010001 };
+unsigned long va8_w255[2] = { 0X00FF00FF, 0X00FF00FF };
+/*
+ * D = A + B with per-byte saturation; D's alpha byte is forced to
+ * min(alpha(A), alpha(B)) via the mm0 alpha mask.  Processes two
+ * RGBA8 pixels per asm iteration, then at most one trailing pixel.
+ * NOTE(review): 32-bit x86 only (addl on pointer operands); mm0 is
+ * loaded once and assumed to survive across separate asm statements.
+ */
+void
+gimp_composite_addition_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  /* mm0 = 0xFF000000 x2 — selects the alpha bytes */
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+
+  /* two pixels (8 bytes) per iteration */
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl $8, %1\n"
+                  "\tmovq    %%mm2, %%mm4\n"
+                  "\tpaddusb %%mm3, %%mm4\n"
+                  "\tmovq    %%mm0, %%mm1\n"
+                  "\tpandn   %%mm4, %%mm1\n"
+                  "\t" pminub(mm3, mm2, mm4) "\n"
+                  "\tpand    %%mm0, %%mm2\n"
+                  "\tpor     %%mm2, %%mm1\n"
+                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+  }
+
+  /* at most one odd pixel remains */
+  if (op.n_pixels) {
+    asm("  movd    (%0), %%mm2;\n"
+        "\tmovd    (%1), %%mm3;\n"
+        "\tmovq    %%mm2, %%mm4\n"
+        "\tpaddusb %%mm3, %%mm4\n"
+        "\tmovq    %%mm0, %%mm1\n"
+        "\tpandn   %%mm4, %%mm1\n"
+        "\t" pminub(mm3, mm2, mm4) "\n"
+        "\tpand    %%mm0, %%mm2\n"
+        "\tpor     %%mm2, %%mm1\n"
+        "\tmovd    %%mm1, (%2);\n"
+        : /* empty */
+        : "r" (op.A), "r" (op.B), "r" (op.D)
+        : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+  }
+
+  asm("emms");
+}
+
+/*
+ * "Burn" blend for RGBA8 operands, per channel roughly
+ * D = 255 - ((255 - A) * 256) / (B + 1), with D's alpha forced to
+ * min(alpha(A), alpha(B)).  Two pixels per MMX iteration (low and
+ * high word halves divided separately via pdivwqX), then one
+ * trailing pixel.
+ * NOTE(review): the first asm's clobber list omits mm0/mm5/mm6/mm7
+ * and the eax/ecx/edx registers that pdivwqX uses — confirm the
+ * compiler cannot schedule around this before relying on it.
+ */
+void gimp_composite_burn_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq   %0,%%mm1"
+      :
+      : "m" (rgba8_alpha_mask)
+      : "%mm1");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq      (%0),%%mm0; addl $8,%0\n"
+                  "\tmovq      (%1),%%mm1; addl $8,%1\n"
+
+                  "\tmovq      %3,%%mm2\n"
+                  "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
+                  "\tpxor      %%mm4,%%mm4\n"
+                  "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpcklbw %%mm5,%%mm3\n"
+                  "\tmovq      %4,%%mm5\n"
+                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
+
+                  "\t" pdivwqX(mm4,mm5,mm7) "\n"
+
+                  "\tmovq      %3,%%mm2\n"
+                  "\tpsubb   %%mm0,%%mm2\n" /* mm2 = 255 - A */
+                  "\tpxor      %%mm4,%%mm4\n"
+                  "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpckhbw %%mm5,%%mm3\n"
+                  "\tmovq      %4,%%mm5\n"
+                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
+                  "\t" pdivwqX(mm4,mm5,mm6) "\n"
+
+                  "\tmovq      %5,%%mm4\n"
+                  "\tmovq      %%mm4,%%mm5\n"
+                  "\tpsubusw     %%mm6,%%mm4\n"
+                  "\tpsubusw     %%mm7,%%mm5\n"
+                  
+                  "\tpackuswb  %%mm4,%%mm5\n"
+
+                  "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
+
+                  "\tmovq      %6,%%mm7\n"
+                  "\tpand      %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
+
+                  "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
+                  "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
+
+                  "\tmovq      %%mm7,(%2); addl $8,%2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : "m" (rgba8_b255), "m" (rgba8_w1), "m" (rgba8_w255), "m" (rgba8_alpha_mask)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  /* trailing single pixel, same computation on one dword */
+  if (op.n_pixels) {
+    asm volatile ("  movd      (%0),%%mm0\n"
+                  "\tmovd      (%1),%%mm1\n"
+
+                  "\tmovq      %3,%%mm2\n"
+                  "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
+                  "\tpxor      %%mm4,%%mm4\n"
+                  "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpcklbw %%mm5,%%mm3\n"
+                  "\tmovq      %4,%%mm5\n"
+                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
+
+                  "\t" pdivwqX(mm4,mm5,mm7) "\n"
+
+                  "\tmovq      %3,%%mm2\n"
+                  "\tpsubb   %%mm0,%%mm2\n" /* mm2 = 255 - A */
+                  "\tpxor      %%mm4,%%mm4\n"
+                  "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpckhbw %%mm5,%%mm3\n"
+                  "\tmovq      %4,%%mm5\n"
+                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
+                  "\t" pdivwqX(mm4,mm5,mm6) "\n"
+
+                  "\tmovq      %5,%%mm4\n"
+                  "\tmovq      %%mm4,%%mm5\n"
+                  "\tpsubusw     %%mm6,%%mm4\n"
+                  "\tpsubusw     %%mm7,%%mm5\n"
+                  
+                  "\tpackuswb  %%mm4,%%mm5\n"
+
+                  "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */
+
+                  "\tmovq      %6,%%mm7\n"
+                  "\tpand      %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */
+
+                  "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
+                  "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */
+
+                  "\tmovd      %%mm7,(%2)\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D), "m" (rgba8_b255), "m" (rgba8_w1), "m" (rgba8_w255), "m" (rgba8_alpha_mask)
+                  : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+  }
+
+  asm("emms");
+}
+
+/*
+ * UNFINISHED STUB (hence the xxx prefix): loads A and B into mm2/mm3
+ * but performs no blend; it stores mm1, whose contents are whatever
+ * the alpha-mask load below left behind.  Not wired up for real use.
+ */
+void
+xxxgimp_composite_coloronly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"
+
+
+                  "\tmovq    %%mm1, (%2); addl  $8, %2\n"
+                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+
+                  "\tmovd    %%mm1, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  asm("emms");
+
+}
+
+/*
+ * D = min(A, B) per unsigned byte (darken-only blend) for RGBA8
+ * operands, using the pminub() emulation macro.  Two pixels per MMX
+ * iteration, then at most one trailing pixel.
+ * NOTE(review): 32-bit x86 only (addl pointer arithmetic).
+ */
+void
+gimp_composite_darken_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"
+                  "\t" pminub(mm3, mm2, mm4) "\n"
+                  "\tmovq    %%mm2, (%2); addl  $8, %2\n"
+                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+                  "\t" pminub(mm3, mm2, mm4) "\n"
+                  "\tmovd    %%mm2, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm2", "%mm3", "%mm4");
+  }
+        
+  asm("emms");
+}
+
+/*
+ * D = |A - B| per byte (two saturating subtractions, summed), with
+ * D's alpha byte forced to min(alpha(A), alpha(B)).  Two RGBA8 pixels
+ * per MMX iteration, then at most one trailing pixel.
+ *
+ * Fixed: the alpha min() previously used a literal `pminub`
+ * instruction, which needs the SSE media extensions even though
+ * USE_SSE is #undef'd above; it now uses the pminub() emulation macro
+ * like the rest of this file (mm4 is dead at that point and is safe
+ * to clobber as the macro's temporary).  The clobber lists also gain
+ * %mm5, which both fragments write.
+ * NOTE(review): 32-bit x86 only (addl pointer arithmetic).
+ */
+void
+gimp_composite_difference_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl $8, %1\n"
+                  "\tmovq    %%mm2, %%mm4\n"
+                  "\tmovq    %%mm3, %%mm5\n"
+                  "\tpsubusb %%mm3, %%mm4\n"
+                  "\tpsubusb %%mm2, %%mm5\n"
+                  "\tpaddb   %%mm5, %%mm4\n"
+                  "\tmovq    %%mm0, %%mm1\n"
+                  "\tpandn   %%mm4, %%mm1\n"
+                  "\t" pminub(mm3, mm2, mm4) "\n"
+                  "\tpand    %%mm0, %%mm2\n"
+                  "\tpor     %%mm2, %%mm1\n"
+                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+  
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+                  "\tmovq    %%mm2, %%mm4\n"
+                  "\tmovq    %%mm3, %%mm5\n"
+                  "\tpsubusb %%mm3, %%mm4\n"
+                  "\tpsubusb %%mm2, %%mm5\n"
+                  "\tpaddb   %%mm5, %%mm4\n"
+                  "\tmovq    %%mm0, %%mm1\n"
+                  "\tpandn   %%mm4, %%mm1\n"
+                  "\t" pminub(mm3, mm2, mm4) "\n"
+                  "\tpand    %%mm0, %%mm2\n"
+                  "\tpor     %%mm2, %%mm1\n"
+                  "\tmovd    %%mm1, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  asm("emms");
+}
+
+
+/*
+ * UNFINISHED STUB (hence the xxx prefix): loads A and B but performs
+ * no dissolve computation; it stores mm1, whose contents are whatever
+ * the alpha-mask load left behind.  Not wired up for real use.
+ */
+void
+xxxgimp_composite_dissolve_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("\tmovq    (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl $8, %1\n"
+
+                  "\tmovq      %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("\tmovd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+
+                  "\tmovd      %%mm1, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  asm("emms");
+}
+
+/*
+ * "Divide" blend for RGBA8 operands: per channel D = (A*256)/(B+1),
+ * computed word-wise via pdivwuqX on the low and high halves, with
+ * D's alpha forced to min(alpha(A), alpha(B)).  Two pixels per MMX
+ * iteration, then one trailing pixel.
+ * NOTE(review): mm0 (alpha mask) and mm7 (word 1s) are loaded once
+ * and assumed live across asm statements, yet the loop clobber lists
+ * omit mm0/mm6/mm7 — confirm the compiler cannot disturb them.  The
+ * trailing fragment still increments its pointers (addl $8) even
+ * though only one pixel remains; harmless since the copies in `op`
+ * are discarded, but confusing.  32-bit x86 only.
+ */
+void
+gimp_composite_divide_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0, %%mm0\n"
+      "\tmovq    %1, %%mm7\n"
+      :
+      : "m" (rgba8_alpha_mask), "m" (rgba8_w1)
+      : "%mm0", "%mm7");
+  
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm0; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm1; addl $8, %1\n"
+
+                  "\tpxor      %%mm2,%%mm2\n"
+                  "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpcklbw %%mm5,%%mm3\n"
+                  "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
+
+                  "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
+
+                  "\tpxor      %%mm2,%%mm2\n"
+                  "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm6,%%mm6\n"
+                  "\tpunpckhbw %%mm6,%%mm3\n"
+                  "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
+
+                  "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
+                  
+                  "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
+
+                  "\t" pminub(mm0,mm1,mm3) "\n"
+                  "\tmovq      %3,%%mm3\n"
+                  "\tmovq      %%mm3,%%mm2\n"
+
+                  "\tpandn     %%mm5,%%mm3\n"
+
+                  "\tpand      %%mm2,%%mm1\n"
+                  "\tpor       %%mm1,%%mm3\n"
+
+                  "\tmovq      %%mm3,(%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : "m" (rgba8_alpha_mask)
+                  : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm0; addl $8, %0\n"
+                  "\tmovd    (%1), %%mm1; addl $8, %1\n"
+
+                  "\tpxor      %%mm2,%%mm2\n"
+                  "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm5,%%mm5\n"
+                  "\tpunpcklbw %%mm5,%%mm3\n"
+                  "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
+
+                  "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
+
+                  "\tpxor      %%mm2,%%mm2\n"
+                  "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
+
+                  "\tmovq      %%mm1,%%mm3\n"
+                  "\tpxor      %%mm6,%%mm6\n"
+                  "\tpunpckhbw %%mm6,%%mm3\n"
+                  "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
+
+                  "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
+                  
+                  "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
+
+                  "\t" pminub(mm0,mm1,mm3) "\n"
+                  "\tmovq      %3,%%mm3\n"
+                  "\tmovq      %%mm3,%%mm2\n"
+
+                  "\tpandn     %%mm5,%%mm3\n"
+
+                  "\tpand      %%mm2,%%mm1\n"
+                  "\tpor       %%mm1,%%mm3\n"
+
+                  "\tmovd      %%mm3,(%2); addl $8, %2\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D), "m" (rgba8_alpha_mask)
+                  : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  asm("emms");
+}
+
+/*
+ * "Dodge" blend for RGBA8 operands: per channel
+ * D = (A << 8) / (256 - B), word-wise via pdivwuqX, with D's alpha
+ * forced to min(alpha(A), alpha(B)).  Two pixels per MMX iteration,
+ * then one trailing pixel.
+ * NOTE(review): unlike the other functions here, the constants are
+ * referenced by symbol name inside the asm text (rgba8_w256,
+ * rgba8_alpha_mask) instead of "m" operands — this only assembles for
+ * non-PIC builds; verify.  The trailing fragment loads B with movq
+ * (8 bytes) though only one 4-byte pixel remains — possible
+ * over-read at the end of the buffer; confirm buffer padding.
+ */
+void
+gimp_composite_dodge_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq     (%0), %%mm0; addl $8, %0\n"
+                  "\tmovq     (%1), %%mm1; addl $8, %1\n"
+                  "\tmovq      %%mm1, %%mm3\n"
+                  "\tpxor      %%mm2, %%mm2\n"
+                  "\tpunpcklbw %%mm2, %%mm3\n"
+                  "\tpunpcklbw %%mm0, %%mm2\n"
+
+                  "\tmovq      rgba8_w256, %%mm4\n"
+                  "\tpsubw     %%mm3, %%mm4\n"
+
+                  "\t" pdivwuqX(mm2,mm4,mm5) "\n"
+
+                  "\tmovq      %%mm1, %%mm3\n"
+                  "\tpxor      %%mm2, %%mm2\n"
+                  "\tpunpckhbw %%mm2, %%mm3\n"
+                  "\tpunpckhbw %%mm0, %%mm2\n"
+
+                  "\tmovq      rgba8_w256, %%mm4\n"
+                  "\tpsubw     %%mm3, %%mm4\n"
+
+                  "\t" pdivwuqX(mm2,mm4,mm6) "\n"
+
+                  "\tpackuswb  %%mm6, %%mm5\n"
+
+                  "\tmovq      rgba8_alpha_mask, %%mm6\n"
+                  "\tmovq      %%mm1,%%mm7\n"
+                  "\t" pminub(mm0,mm7,mm2) "\n"
+                  "\tpand      %%mm6, %%mm7\n"
+                  "\tpandn     %%mm5, %%mm6\n"
+
+                  "\tpor       %%mm6, %%mm7\n"
+
+                  "\tmovq    %%mm7, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd     (%0), %%mm0;\n"
+                  "\tmovq     (%1), %%mm1;\n"
+                  "\tmovq      %%mm1, %%mm3\n"
+                  "\tpxor      %%mm2, %%mm2\n"
+                  "\tpunpcklbw %%mm2, %%mm3\n"
+                  "\tpunpcklbw %%mm0, %%mm2\n"
+
+                  "\tmovq      rgba8_w256, %%mm4\n"
+                  "\tpsubw     %%mm3, %%mm4\n"
+
+                  "\t" pdivwuqX(mm2,mm4,mm5) "\n"
+
+                  "\tmovq      %%mm1, %%mm3\n"
+                  "\tpxor      %%mm2, %%mm2\n"
+                  "\tpunpckhbw %%mm2, %%mm3\n"
+                  "\tpunpckhbw %%mm0, %%mm2\n"
+
+                  "\tmovq      rgba8_w256, %%mm4\n"
+                  "\tpsubw     %%mm3, %%mm4\n"
+
+                  "\t" pdivwuqX(mm2,mm4,mm6) "\n"
+
+                  "\tpackuswb  %%mm6, %%mm5\n"
+
+                  "\tmovq      rgba8_alpha_mask, %%mm6\n"
+                  "\tmovq      %%mm1,%%mm7\n"
+                  "\t" pminub(mm0,mm7,mm2) "\n"
+                  "\tpand      %%mm6, %%mm7\n"
+                  "\tpandn     %%mm5, %%mm6\n"
+
+                  "\tpor       %%mm6, %%mm7\n"
+
+                  "\tmovd    %%mm7, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  asm("emms");
+}
+
+/*
+ * "Grain extract" blend for RGBA8 operands: per channel
+ * D = clamp(A - B + 128), computed in 16-bit words (low then high
+ * half) and re-packed with unsigned saturation; D's alpha is forced
+ * to min(alpha(A), alpha(B)) via mm0.  Two pixels per MMX iteration,
+ * then one trailing pixel.
+ * NOTE(review): mm0/mm6/mm7 are preloaded once (alpha mask, zero,
+ * word 128s) and assumed live across asm statements, but the clobber
+ * lists omit mm0 and mm5-mm7 — confirm before trusting the compiler.
+ */
+void
+gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+  asm("pxor    %%mm6,%%mm6"  :  :                        : "%mm6");
+  asm("movq    %0,%%mm7"     :  : "m" (rgba8_w128)       : "%mm7");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl $8, %1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpcklbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpcklbw %%mm6, %%mm5\n"
+
+                  "\tpsubw     %%mm5, %%mm4\n"
+                  "\tpaddw     %%mm7, %%mm4\n"
+                  "\tmovq      %%mm4, %%mm1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpckhbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpckhbw %%mm6, %%mm5\n"
+
+                  "\tpsubw     %%mm5, %%mm4\n"
+                  "\tpaddw     %%mm7, %%mm4\n"
+
+                  "\tpackuswb  %%mm4, %%mm1\n"
+                  "\tmovq      %%mm1, %%mm4\n"
+
+                  "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
+
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+                  "\tpand      %%mm0, %%mm2\n"
+
+                  "\tpor       %%mm2, %%mm1\n"
+                  "\tmovq      %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpcklbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpcklbw %%mm6, %%mm5\n"
+
+                  "\tpsubw     %%mm5, %%mm4\n"
+                  "\tpaddw     %%mm7, %%mm4\n"
+                  "\tmovq      %%mm4, %%mm1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpckhbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpckhbw %%mm6, %%mm5\n"
+
+                  "\tpsubw     %%mm5, %%mm4\n"
+                  "\tpaddw     %%mm7, %%mm4\n"
+
+                  "\tpackuswb  %%mm4, %%mm1\n"
+                  "\tmovq      %%mm1, %%mm4\n"
+
+                  "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
+
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+                  "\tpand      %%mm0, %%mm2\n"
+
+                  "\tpor       %%mm2, %%mm1\n"
+                  "\tmovd      %%mm1, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  asm("emms");
+
+}
+
+/*
+ * "Grain merge" blend for RGBA8 operands: per channel
+ * D = clamp(A + B - 128) — the inverse of grain extract — computed in
+ * 16-bit words and re-packed with unsigned saturation; D's alpha is
+ * forced to min(alpha(A), alpha(B)) via mm0.  Two pixels per MMX
+ * iteration, then one trailing pixel.
+ * NOTE(review): same caveat as grain extract — mm0/mm6/mm7 are
+ * preloaded and assumed live across asm statements while the clobber
+ * lists omit them.
+ */
+void
+gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0, %%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+  asm("pxor    %%mm6, %%mm6"  :  :                        : "%mm6");
+  asm("movq    %0, %%mm7"     :  : "m" (rgba8_w128)       : "%mm7");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq    (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq    (%1), %%mm3; addl $8, %1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpcklbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpcklbw %%mm6, %%mm5\n"
+
+                  "\tpaddw     %%mm5, %%mm4\n"
+                  "\tpsubw     %%mm7, %%mm4\n"
+                  "\tmovq      %%mm4, %%mm1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpckhbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpckhbw %%mm6, %%mm5\n"
+
+                  "\tpaddw     %%mm5, %%mm4\n"
+                  "\tpsubw     %%mm7, %%mm4\n"
+
+                  "\tpackuswb  %%mm4, %%mm1\n"
+                  "\tmovq      %%mm1, %%mm4\n"
+
+                  "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
+
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+                  "\tpand      %%mm0, %%mm2\n"
+
+                  "\tpor       %%mm2, %%mm1\n"
+                  "\tmovq      %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd    (%0), %%mm2;\n"
+                  "\tmovd    (%1), %%mm3;\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpcklbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpcklbw %%mm6, %%mm5\n"
+
+                  "\tpaddw     %%mm5, %%mm4\n"
+                  "\tpsubw     %%mm7, %%mm4\n"
+                  "\tmovq      %%mm4, %%mm1\n"
+
+                  "\tmovq      %%mm2, %%mm4\n"
+                  "\tpunpckhbw %%mm6, %%mm4\n"
+                  "\tmovq      %%mm3, %%mm5\n"
+                  "\tpunpckhbw %%mm6, %%mm5\n"
+
+                  "\tpaddw     %%mm5, %%mm4\n"
+                  "\tpsubw     %%mm7, %%mm4\n"
+
+                  "\tpackuswb  %%mm4, %%mm1\n"
+                  "\tmovq      %%mm1, %%mm4\n"
+
+                  "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
+
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+                  "\tpand      %%mm0, %%mm2\n"
+
+                  "\tpor       %%mm2, %%mm1\n"
+                  "\tmovd      %%mm1, (%2);\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
+  }
+
+  asm("emms");
+
+}
+
/*
 * Hard-light composite, RGBA8 -- UNIMPLEMENTED placeholder (hence the
 * "xxx" prefix).  The loop bodies are empty: nothing is read from A/B
 * and nothing is written to D; only the alpha-mask setup and the final
 * emms are executed.
 */
void
xxxgimp_composite_hardlight_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    /* TODO: two-pixel hard-light kernel goes here */
  }

  if (op.n_pixels) {
    /* TODO: one-pixel tail goes here */
  }

  asm("emms");

}
+
/*
 * Hue-only composite, RGBA8 -- UNIMPLEMENTED placeholder (hence the
 * "xxx" prefix).  Loop bodies are empty; D is never written.
 */
void
xxxgimp_composite_hueonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    /* TODO: two-pixel hue-only kernel goes here */
  }

  if (op.n_pixels) {
    /* TODO: one-pixel tail goes here */
  }

  asm("emms");
}
+
/*
 * Lighten composite for RGBA8 pixels: two pixels per MMX iteration plus
 * a one-pixel tail.
 *
 * The RGB bytes of D are the per-byte maximum of A and B (pmaxub macro);
 * the alpha byte of D is the per-byte minimum (pminub macro), selected
 * through the mask loaded into mm0.
 * NOTE(review): assumes pmaxub(a,b,t)/pminub(a,b,t) leave their result
 * in the second argument and clobber the third -- confirm against the
 * macro definitions earlier in this file.
 */
void
gimp_composite_lighten_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");  /* mm0 = alpha byte mask */

  /* Main loop: 8 bytes (two RGBA8 pixels) per pass. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq     (%0), %%mm2; addl $8, %0\n"
                  "\tmovq     (%1), %%mm3; addl $8, %1\n"
                  "\tmovq    %%mm2, %%mm4\n"
                  "\t" pmaxub(mm3,mm4,mm5) "\n"
                  "\tmovq    %%mm0, %%mm1\n"
                  "\tpandn   %%mm4, %%mm1\n"
                  "\t" pminub(mm2,mm3,mm4) "\n"
                  "\tpand    %%mm0, %%mm3\n"
                  "\tpor     %%mm3, %%mm1\n"
                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  /* Tail: at most one remaining pixel (movd, 4 bytes). */
  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2\n"
                  "\tmovd    (%1), %%mm3\n"
                  "\tmovq    %%mm2, %%mm4\n"
                  "\t" pmaxub(mm3,mm4,mm5) "\n"

                  "\tmovq    %%mm0, %%mm1\n"
                  "\tpandn   %%mm4, %%mm1\n"

                  "\t" pminub(mm2,mm3,mm4) "\n"

                  "\tpand    %%mm0, %%mm3\n"
                  "\tpor     %%mm3, %%mm1\n"
                  "\tmovd    %%mm1, (%2)\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
/*
 * Multiply composite for RGBA8 pixels: two pixels per MMX iteration
 * plus a one-pixel tail.
 *
 * Channels are widened to 16-bit words (punpck with zero register mm6)
 * and multiplied via the pmulwX macro, which takes mm7 as a scratch /
 * rounding operand (loaded from rgba8_w128).  Results are packed back
 * with unsigned saturation.  The alpha byte of D is min of the source
 * alphas, merged in through the mask in mm0.
 * NOTE(review): assumes pmulwX(a,b,t) leaves the scaled product in its
 * second argument -- confirm against the macro definition.
 */
void
gimp_composite_multiply_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");  /* mm0 = alpha byte mask */
  asm("movq    %0,%%mm7"     :  : "m" (rgba8_w128) : "%mm7");        /* mm7 = pmulwX operand */
  asm("pxor    %%mm6,%%mm6"  :  :  : "%mm6");                        /* mm6 = 0, unpack source */

  /* Main loop: 8 bytes (two RGBA8 pixels) per pass. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq     (%0), %%mm2; addl $8, %0\n"
                  "\tmovq     (%1), %%mm3; addl $8, %1\n"

                  "\tmovq      %%mm2, %%mm1\n"
                  "\tpunpcklbw %%mm6, %%mm1\n"
                  "\tmovq      %%mm3, %%mm5\n"
                  "\tpunpcklbw %%mm6, %%mm5\n"
                  
                  "\t" pmulwX(mm5,mm1,mm7) "\n"

                  "\tmovq      %%mm2, %%mm4\n"
                  "\tpunpckhbw %%mm6, %%mm4\n"
                  "\tmovq      %%mm3, %%mm5\n"
                  "\tpunpckhbw %%mm6, %%mm5\n"

                  "\t" pmulwX(mm5,mm4,mm7) "\n"

                  "\tpackuswb  %%mm4, %%mm1\n"

                  "\tmovq      %%mm0, %%mm4\n"
                  "\tpandn     %%mm1, %%mm4\n"
                  "\tmovq      %%mm4, %%mm1\n"
                  "\t" pminub(mm3,mm2,mm4) "\n"
                  "\tpand      %%mm0, %%mm2\n"
                  "\tpor       %%mm2, %%mm1\n"
                  
                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  /* Tail: at most one remaining pixel (movd, 4 bytes). */
  if (op.n_pixels) {
    asm volatile ("  movd     (%0), %%mm2\n"
                  "\tmovd     (%1), %%mm3\n"

                  "\tmovq      %%mm2, %%mm1\n"
                  "\tpunpcklbw %%mm6, %%mm1\n"
                  "\tmovq      %%mm3, %%mm5\n"
                  "\tpunpcklbw %%mm6, %%mm5\n"

                  "\t" pmulwX(mm5,mm1,mm7) "\n"

                  "\tmovq      %%mm2, %%mm4\n"
                  "\tpunpckhbw %%mm6, %%mm4\n"
                  "\tmovq      %%mm3, %%mm5\n"
                  "\tpunpckhbw %%mm6, %%mm5\n"

                  "\t" pmulwX(mm5,mm4,mm7) "\n"

                  "\tpackuswb  %%mm4, %%mm1\n"

                  "\tmovq      %%mm0, %%mm4\n"
                  "\tpandn     %%mm1, %%mm4\n"
                  "\tmovq      %%mm4, %%mm1\n"
                  "\t" pminub(mm3,mm2,mm4) "\n"
                  "\tpand      %%mm0, %%mm2\n"
                  "\tpor       %%mm2, %%mm1\n"
                  
                  "\tmovd    %%mm1, (%2)\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
+unsigned long rgba8_lower_ff[2] = {  0x00FF00FF, 0x00FF00FF };
+
/*
 * Assembly-only "subroutine" for the overlay composite.  It is not
 * called from C: gimp_composite_overlay_rgba8_rgba8_rgba8_mmx() reaches
 * it with a literal "call op_overlay" from inside its inline asm.
 *
 * Register contract (as used by the caller below):
 *   in:  mm2 = A pixels, mm3 = B pixels, mm0 = alpha mask,
 *        mm6 = zero, mm7 = rounding constant (set up by the caller)
 *   out: mm1 = composited result
 *
 * The low and high halves of the pixels are processed in turn; the low
 * result is spilled to the stack (subl $8,%esp / movq ... (%esp)) while
 * the high half is computed, then both are packed with packuswb.
 *
 * NOTE(review): this relies on the compiler emitting these basic asm
 * statements in order with no register-allocated code in between, and
 * on the function prologue/epilogue not disturbing the MMX state or
 * %esp between the manual push/pop pair -- fragile; effectively only
 * safe without optimization.  TODO confirm build flags for this file.
 */
void
op_overlay()
{
  /* low half: mm1 = A*B scaled back to bytes (pmullw + psrlw/add round) */
  asm("movq      %mm2, %mm1");
  asm("punpcklbw %mm6, %mm1");
  asm("movq      %mm3, %mm5");
  asm("punpcklbw %mm6, %mm5");
  asm("pmullw    %mm5, %mm1");
  asm("paddw     %mm7, %mm1");
  asm("movq      %mm1, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm5, %mm1");
  asm("psrlw     $8, %mm1");

  /* low half: mm4 = (255-A)*(255-B), same scaling */
  asm("pcmpeqb   %mm4, %mm4");
  asm("psubb     %mm2, %mm4");
  asm("punpcklbw %mm6, %mm4");
  asm("pcmpeqb   %mm5, %mm5");
  asm("psubb     %mm3, %mm5");
  asm("punpcklbw %mm6, %mm5");
  asm("pmullw    %mm5, %mm4");
  asm("paddw     %mm7, %mm4");
  asm("movq      %mm4, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm5, %mm4");
  asm("psrlw     $8, %mm4");

  asm("movq      rgba8_lower_ff, %mm5");   /* mm5 = 0x00FF words */
  asm("psubw     %mm4, %mm5");

  asm("psubw     %mm1, %mm5");
  asm("movq      %mm2, %mm4");
  asm("punpcklbw %mm6, %mm4");
  asm("pmullw    %mm4, %mm5");
  asm("paddw     %mm7, %mm5");
  asm("movq      %mm5, %mm4");
  asm("psrlw     $8, %mm4");
  asm("paddw     %mm4, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm1, %mm5");

  /* spill the finished low-half words while the high half is computed */
  asm("subl      $8, %esp");
  asm("movq      %mm5, (%esp)");

  /* high half: identical sequence with punpckhbw */
  asm("movq      %mm2, %mm1");
  asm("punpckhbw %mm6, %mm1");
  asm("movq      %mm3, %mm5");
  asm("punpckhbw %mm6, %mm5");
  asm("pmullw    %mm5, %mm1");
  asm("paddw     %mm7, %mm1");
  asm("movq      %mm1, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm5, %mm1");
  asm("psrlw     $8, %mm1");

  asm("pcmpeqb   %mm4, %mm4");
  asm("psubb     %mm2, %mm4");
  asm("punpckhbw %mm6, %mm4");
  asm("pcmpeqb   %mm5, %mm5");
  asm("psubb     %mm3, %mm5");
  asm("punpckhbw %mm6, %mm5");
  asm("pmullw    %mm5, %mm4");
  asm("paddw     %mm7, %mm4");
  asm("movq      %mm4, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm5, %mm4");
  asm("psrlw     $8, %mm4");

  asm("movq      rgba8_lower_ff, %mm5");
  asm("psubw     %mm4, %mm5");

  asm("psubw     %mm1, %mm5");
  asm("movq      %mm2, %mm4");
  asm("punpckhbw %mm6, %mm4");
  asm("pmullw    %mm4, %mm5");
  asm("paddw     %mm7, %mm5");
  asm("movq      %mm5, %mm4");
  asm("psrlw     $8, %mm4");
  asm("paddw     %mm4, %mm5");
  asm("psrlw     $8, %mm5");
  asm("paddw     %mm1, %mm5");

  /* reload the spilled low half and restore the stack pointer */
  asm("movq      (%esp), %mm4");
  asm("addl      $8, %esp");

  /* pack both halves, then merge the min-alpha byte through mm0 */
  asm("packuswb  %mm5, %mm4");
  asm("movq      %mm0, %mm1");
  asm("pandn     %mm4, %mm1");

  asm("movq      %mm2, %mm4");
  asm("psubusb   %mm3, %mm4");
  asm("psubb     %mm4, %mm2");
  asm("pand      %mm0, %mm2");
  asm("por       %mm2, %mm1");
}
+
/*
 * Overlay composite for RGBA8 pixels.  The per-pixel arithmetic lives
 * in op_overlay(), which is invoked with a literal "call op_overlay"
 * from inside the inline asm; A/B pixels are passed in mm2/mm3 and the
 * result comes back in mm1.
 *
 * NOTE(review): op_overlay is a plain C function, so its compiler-
 * generated prologue/epilogue runs inside this call -- this only works
 * if it does not touch the registers carrying the pixel data.  Also
 * note the loop constrains B/D to %esi/%edi ("+S"/"+D"), unlike the
 * sibling routines which use plain "r" -- TODO confirm both are
 * intentional.
 */
void
gimp_composite_overlay_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");  /* mm0 = alpha byte mask */

  /* Main loop: two pixels per pass, delegated to op_overlay. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"

                  "\tcall op_overlay\n"

                  "\tmovq    %%mm1, (%2); addl  $8, %2\n"
                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  /* Tail: at most one remaining pixel. */
  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2;\n"
                  "\tmovd    (%1), %%mm3;\n"

                  "\tcall op_overlay\n"

                  "\tmovd    %%mm1, (%2);\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
/*
 * Saturation-only composite, RGBA8 -- UNIMPLEMENTED placeholder (hence
 * the "xxx" prefix).  WARNING: the skeleton loads A/B into mm2/mm3 and
 * then stores mm1 to D, but nothing ever computes mm1, so D receives
 * whatever happened to be in that register.  Do not wire this up until
 * the kernel is written.
 */
void
xxxgimp_composite_saturationonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"

                  /* TODO: saturation-only kernel goes here; mm1 is uninitialized */
                  "\tmovq    %%mm1, (%2); addl  $8, %2\n"
                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2;\n"
                  "\tmovd    (%1), %%mm3;\n"

                  /* TODO: one-pixel tail; mm1 is uninitialized */
                  "\tmovd    %%mm1, (%2);\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  asm("emms");
}
+
/*
 * Scale operation for RGBA8 pixels (two operands: source A, destination
 * D; there is no B input).
 *
 * Setup replicates the 16-bit op.scale.scale value into all four word
 * lanes of mm3 (via %eax/%ebx shuffling), zeroes mm0 for unpacking, and
 * loads mm7 from rgba8_w128 for the pmulwX macro.  Each channel is then
 * widened, multiplied by the scale via pmulwX, and packed back.
 * NOTE(review): %ebx is modified by the setup asm but only %eax is in
 * the clobber list -- TODO confirm this is safe with the surrounding
 * codegen.
 */
void
gimp_composite_scale_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  asm volatile ("pxor    %%mm0,%%mm0\n"
                "\tmovl  %0,%%eax\n"
                "\tmovl  %%eax,%%ebx\n"
                "\tshl   $16,%%ebx\n"
                "\torl   %%ebx,%%eax\n"
                "\tmovd  %%eax,%%mm5\n"
                "\tmovd  %%eax,%%mm3\n"
                "\tpsllq $32,%%mm5\n"
                "\tpor   %%mm5,%%mm3\n"
                "\tmovq  %1,%%mm7\n"
                : /* empty */
                : "m" (op.scale.scale), "m" (rgba8_w128)
                : "%eax", "%mm0", "%mm5", "%mm6", "%mm7");

  /* Main loop: 8 bytes (two RGBA8 pixels) per pass. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("movq      (%0),%%mm2; addl $8,%0\n"
                  "\tmovq      %%mm2,%%mm1\n"
                  "\tpunpcklbw %%mm0,%%mm1\n"
                  "\tmovq      %%mm3,%%mm5\n"

                  "\t" pmulwX(mm5,mm1,mm7) "\n"

                  "\tmovq      %%mm2,%%mm4\n"
                  "\tpunpckhbw %%mm0,%%mm4\n"
                  "\tmovq      %%mm3,%%mm5\n"

                  "\t" pmulwX(mm5,mm4,mm7) "\n"

                  "\tpackuswb  %%mm4,%%mm1\n"

                  "\tmovq    %%mm1,(%1);  addl $8,%1\n"
                  : "+r" (op.A), "+r" (op.D)
                  : /* empty */
                  : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");

  }

  /* Tail: at most one remaining pixel; only the low half is needed. */
  if (op.n_pixels) {
    asm volatile ("movd      (%0), %%mm2\n"
                  "\tmovq      %%mm2,%%mm1\n"
                  "\tpunpcklbw %%mm0,%%mm1\n"
                  "\tmovq      %%mm3,%%mm5\n"

                  "\t" pmulwX(mm5,mm1,mm7) "\n"

                  "\tpackuswb  %%mm0,%%mm1\n"
                  "\tmovd    %%mm1,(%1)\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.D)
                  : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
  }

  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
/*
 * Screen composite for RGBA8 pixels: two pixels per MMX iteration plus
 * a one-pixel tail.
 *
 * Per channel: complements are formed with pcmpeqb/psubb (all-ones
 * minus the byte), multiplied as 16-bit words, scaled back to bytes
 * with the paddw-mm7 / psrlw-$8 rounding sequence, packed, and the
 * packed product is complemented again -- i.e. 255 - (255-A)*(255-B)
 * scaled.  The alpha byte of D is the per-byte minimum of the source
 * alphas, merged through the mask in mm0.
 * NOTE(review): assumes rgba8_w128 provides the rounding bias expected
 * by this divide-by-255 approximation -- confirm against the constant.
 */
void
gimp_composite_screen_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");  /* mm0 = alpha byte mask */
  asm("movq    %0,%%mm7"     :  : "m" (rgba8_w128)  : "%mm7");       /* mm7 = rounding bias */
  asm("pxor    %mm6, %mm6");                                         /* mm6 = 0, unpack source */

  /* Main loop: 8 bytes (two RGBA8 pixels) per pass. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq     (%0), %%mm2; addl $8, %0\n"
                  "\tmovq     (%1), %%mm3; addl $8, %1\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm2, %%mm4\n"
                  "\tpcmpeqb   %%mm5, %%mm5\n"
                  "\tpsubb     %%mm3, %%mm5\n"

                  "\tpunpcklbw %%mm6, %%mm4\n"
                  "\tpunpcklbw %%mm6, %%mm5\n"
                  "\tpmullw    %%mm4, %%mm5\n"
                  "\tpaddw     %%mm7, %%mm5\n"
                  "\tmovq      %%mm5, %%mm1\n"
                  "\tpsrlw     $ 8, %%mm1\n"
                  "\tpaddw     %%mm5, %%mm1\n"
                  "\tpsrlw     $ 8, %%mm1\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm2, %%mm4\n"
                  "\tpcmpeqb   %%mm5, %%mm5\n"
                  "\tpsubb     %%mm3, %%mm5\n"

                  "\tpunpckhbw %%mm6, %%mm4\n"
                  "\tpunpckhbw %%mm6, %%mm5\n"
                  "\tpmullw    %%mm4, %%mm5\n"
                  "\tpaddw     %%mm7, %%mm5\n"
                  "\tmovq      %%mm5, %%mm4\n"
                  "\tpsrlw     $ 8, %%mm4\n"
                  "\tpaddw     %%mm5, %%mm4\n"
                  "\tpsrlw     $ 8, %%mm4\n"

                  "\tpackuswb  %%mm4, %%mm1\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm1, %%mm4\n"

                  "\tmovq      %%mm0, %%mm1\n"
                  "\tpandn     %%mm4, %%mm1\n"

                  "\t" pminub(mm2,mm3,mm4) "\n"
                  "\tpand      %%mm0, %%mm3\n"

                  "\tpor       %%mm3, %%mm1\n"

                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  /* Tail: at most one remaining pixel (movd, 4 bytes). */
  if (op.n_pixels) {
    asm volatile ("  movd     (%0), %%mm2;\n"
                  "\tmovd     (%1), %%mm3;\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm2, %%mm4\n"
                  "\tpcmpeqb   %%mm5, %%mm5\n"
                  "\tpsubb     %%mm3, %%mm5\n"

                  "\tpunpcklbw %%mm6, %%mm4\n"
                  "\tpunpcklbw %%mm6, %%mm5\n"
                  "\tpmullw    %%mm4, %%mm5\n"
                  "\tpaddw     %%mm7, %%mm5\n"
                  "\tmovq      %%mm5, %%mm1\n"
                  "\tpsrlw     $ 8, %%mm1\n"
                  "\tpaddw     %%mm5, %%mm1\n"
                  "\tpsrlw     $ 8, %%mm1\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm2, %%mm4\n"
                  "\tpcmpeqb   %%mm5, %%mm5\n"
                  "\tpsubb     %%mm3, %%mm5\n"

                  "\tpunpckhbw %%mm6, %%mm4\n"
                  "\tpunpckhbw %%mm6, %%mm5\n"
                  "\tpmullw    %%mm4, %%mm5\n"
                  "\tpaddw     %%mm7, %%mm5\n"
                  "\tmovq      %%mm5, %%mm4\n"
                  "\tpsrlw     $ 8, %%mm4\n"
                  "\tpaddw     %%mm5, %%mm4\n"
                  "\tpsrlw     $ 8, %%mm4\n"

                  "\tpackuswb  %%mm4, %%mm1\n"

                  "\tpcmpeqb   %%mm4, %%mm4\n"
                  "\tpsubb     %%mm1, %%mm4\n"

                  "\tmovq      %%mm0, %%mm1\n"
                  "\tpandn     %%mm4, %%mm1\n"

                  "\t" pminub(mm2,mm3,mm4) "\n"
                  "\tpand      %%mm0, %%mm3\n"

                  "\tpor       %%mm3, %%mm1\n"

                  "\tmovd    %%mm1, (%2);\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
/*
 * Soft-light composite, RGBA8 -- UNIMPLEMENTED placeholder (hence the
 * "xxx" prefix).  WARNING: the skeleton stores mm1 to D but nothing
 * ever computes mm1, so D receives garbage.  Do not wire this up until
 * the kernel is written.
 */
void
xxxgimp_composite_softlight_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"
                  
                  /* TODO: soft-light kernel goes here; mm1 is uninitialized */
                  "\tmovq    %%mm1, (%2); addl  $8, %2\n"
                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2;\n"
                  "\tmovd    (%1), %%mm3;\n"
                  
                  /* TODO: one-pixel tail; mm1 is uninitialized */
                  "\tmovd    %%mm1, (%2);\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }
  
  asm("emms");
}
+
+void
+gimp_composite_subtract_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");
+
+  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
+    asm volatile ("  movq     (%0), %%mm2; addl $8, %0\n"
+                  "\tmovq     (%1), %%mm3; addl $8, %1\n"
+
+                  "\tmovq    %%mm2, %%mm4\n"
+                  "\tpsubusb %%mm3, %%mm4\n"
+                  
+                  "\tmovq    %%mm0, %%mm1\n"
+                  "\tpandn   %%mm4, %%mm1\n"
+                  
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+
+                  "\tpand    %%mm0, %%mm2\n"
+                  "\tpor     %%mm2, %%mm1\n"
+                  "\tmovq    %%mm1, (%2); addl $8, %2\n"
+                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
+                  : /* empty */
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  if (op.n_pixels) {
+    asm volatile ("  movd     (%0), %%mm2;\n"
+                  "\tmovd     (%1), %%mm3;\n"
+
+                  "\tmovq    %%mm2, %%mm4\n"
+                  "\tpsubusb %%mm3, %%mm4\n"
+                  
+                  "\tmovq    %%mm0, %%mm1\n"
+                  "\tpandn   %%mm4, %%mm1\n"
+                  
+                  "\t" pminub(mm3,mm2,mm4) "\n"
+
+                  "\tpand    %%mm0, %%mm2\n"
+                  "\tpor     %%mm2, %%mm1\n"
+                  "\tmovd    %%mm1, (%2); addl $8, %2\n"
+                  : /* empty */
+                  : "r" (op.A), "r" (op.B), "r" (op.D)
+                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
+  }
+
+  asm("emms");
+}
+
/*
 * Swap operation for RGBA8 pixels: exchanges the contents of the A and
 * B buffers in place (D is not used).  Two pixels per MMX iteration
 * plus a one-pixel tail.
 */
void
gimp_composite_swap_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;   /* local copy; the asm advances the pointers */

  /* Main loop: read both, store each into the other, advance by 8. */
  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq    (%0), %%mm2\n"
                  "\tmovq    (%1), %%mm3\n"
                  "\tmovq    %%mm3, (%0)\n"
                  "\tmovq    %%mm2, (%1)\n"
                  "\taddl    $8, %0\n"
                  "\taddl    $8, %1\n"
                  : "+r" (op.A), "+r" (op.B)
                  : /* empty */
                  : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  /* Tail: at most one remaining pixel (movd, 4 bytes). */
  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2\n"
                  "\tmovd    (%1), %%mm3\n"
                  "\tmovd    %%mm3, (%0)\n"
                  "\tmovd    %%mm2, (%1)\n"                  
                  : /* empty */
                  : "r" (op.A), "r" (op.B)
                  : "0", "1", "%mm1", "%mm2", "%mm3", "%mm4");
  }
  
  asm("emms");  /* leave MMX state so the FPU is usable again */
}
+
/*
 * Value-only composite, RGBA8.  WARNING: despite the non-"xxx" name,
 * this is an unfinished skeleton -- the loops load A/B into mm2/mm3 and
 * then store mm1 to D, but nothing ever computes mm1, so D receives
 * garbage.  Must not be dispatched until the kernel is implemented.
 */
void
gimp_composite_valueonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq    %0,%%mm0"     :  : "m" (rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2) {
    asm volatile ("  movq    (%0), %%mm2; addl  $8, %0\n"
                  "\tmovq    (%1), %%mm3; addl  $8, %1\n"
                  
                  /* TODO: value-only kernel goes here; mm1 is uninitialized */
                  "\tmovq    %%mm1, (%2); addl  $8, %2\n"
                  : "+r" (op.A), "+S" (op.B), "+D" (op.D)
                  : /* empty */
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  if (op.n_pixels) {
    asm volatile ("  movd    (%0), %%mm2;\n"
                  "\tmovd    (%1), %%mm3;\n"
                  
                  /* TODO: one-pixel tail; mm1 is uninitialized */
                  "\tmovd    %%mm1, (%2);\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }
  
  asm("emms");
}
+
+
/* Constants for the VA8 (value + alpha, 2 bytes/pixel) variants below.
 * v8_alpha_mask is referenced BY NAME from the basic asm in the
 * #if 0'd routines ("movq v8_alpha_mask, %mm0"), so both must keep
 * external linkage and these exact symbol names. */
unsigned long v8_alpha_mask[2] = { 0xFF00FF00, 0xFF00FF00};
unsigned long v8_mul_shift[2] = { 0x00800080, 0x00800080 };
+
+#if 0
/*
 * Addition composite for VA8 pixels -- DISABLED: inside "#if 0".
 *
 * Written entirely in basic asm with a hand-rolled convention: it loads
 * %edi from 12(%esp) and then uses %eax/%edx/%ecx without initializing
 * them.  NOTE(review): it appears to expect A, B and the pixel count to
 * already be in %eax/%edx/%ecx from the (old) caller -- this does not
 * match the GimpCompositeContext* signature and cannot work as-is;
 * confirm before re-enabling.  The ".add_pixels_1a_1a_*" labels are
 * global to the object file and would collide if the code were ever
 * duplicated.
 *
 * Per 8-byte group: D = paddusb(A, B) for the value bytes, with the
 * alpha bytes (v8_alpha_mask) replaced by min(A, B) via the
 * psubusb/psubb trick.  Handles 4 pixels per loop, then a 2-pixel and
 * a 1-pixel tail.
 */
void
gimp_composite_addition_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("pushl %edi");
  asm("pushl %ebx");
  asm("movl 12(%esp), %edi");
  asm("movq v8_alpha_mask, %mm0");

  asm("subl $ 4, %ecx");
  asm("jl .add_pixels_1a_1a_last3");
  asm("movl $ 8, %ebx");
  asm(".add_pixels_1a_1a_loop:");

  asm("movq (%eax), %mm2");
  asm("movq (%edx), %mm3");

  /* saturated add; then merge min-alpha through the mask in mm0 */
  asm("movq %mm2, %mm4");
  asm("paddusb %mm3, %mm4");
  asm("movq %mm0, %mm1");
  asm("pandn %mm4, %mm1");
  asm("movq %mm2, %mm4");
  asm("psubusb %mm3, %mm4");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("movq %mm1, (%edi)");
  asm("addl %ebx, %eax");
  asm("addl %ebx, %edx");
  asm("addl %ebx, %edi");
  asm("subl $ 4, %ecx");
  asm("jge .add_pixels_1a_1a_loop");

  asm(".add_pixels_1a_1a_last3:");
  asm("test $ 2, %ecx");
  asm("jz .add_pixels_1a_1a_last1");
  asm("movd (%eax), %mm2");
  asm("movd (%edx), %mm3");

  /* NOTE(review): this 2-pixel tail computes mm1 but never stores it
   * to (%edi) before advancing the pointers -- looks like a missing
   * "movd %mm1, (%edi)". */
  asm("movq %mm2, %mm4");
  asm("paddusb %mm3, %mm4");
  asm("movq %mm0, %mm1");
  asm("pandn %mm4, %mm1");
  asm("movq %mm2, %mm4");
  asm("psubusb %mm3, %mm4");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("addl $ 4, %eax");
  asm("addl $ 4, %edx");
  asm("addl $ 4, %edi");

  asm(".add_pixels_1a_1a_last1:");
  asm("test $ 1, %ecx");
  asm("jz .add_pixels_1a_1a_end");

  /* final single pixel is moved through %bx (2 bytes) */
  asm("movw (%eax), %bx");
  asm("movd %ebx, %mm2");
  asm("movw (%edx), %bx");
  asm("movd %ebx, %mm3");

  asm("movq %mm2, %mm4");
  asm("paddusb %mm3, %mm4");
  asm("movq %mm0, %mm1");
  asm("pandn %mm4, %mm1");
  asm("movq %mm2, %mm4");
  asm("psubusb %mm3, %mm4");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("movd %mm1, %ebx");
  asm("movw %bx, (%edi)");

  asm(".add_pixels_1a_1a_end:");

  asm("emms");
  asm("popl %ebx");
  asm("popl %edi");
}
+
/*
 * Burn composite for VA8 pixels -- DISABLED: inside "#if 0".
 *
 * Uses extended asm: per group, forms (255 - A) widened to words,
 * divides by (B + 1) via the pdivwqX macro, subtracts the quotient
 * from the va8_w255 constant with unsigned saturation, and packs.
 * The alpha bytes become min(A, B) (pminub), selected through
 * va8_alpha_mask.
 * NOTE(review): the first asm references operand %6 (va8_alpha_mask)
 * but the clobber list stops at %mm4 while mm5/mm6/mm7 are also
 * written -- confirm before re-enabling.  The referenced va8_* tables
 * are not visible in this file.
 */
void
gimp_composite_burn_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("movq   %0,%%mm1"
      :
      : "m" (va8_alpha_mask)
      : "%mm1");

  /* Main loop: 8 bytes = 4 VA8 pixels per pass. */
  for (; op.n_pixels >= 4; op.n_pixels -= 4) {
    asm volatile ("  movq      (%0),%%mm0; addl $8,%0\n"
                  "\tmovq      (%1),%%mm1; addl $8,%1\n"

                  "\tmovq      %3,%%mm2\n"
                  "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                  "\tpxor      %%mm4,%%mm4\n"
                  "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                  "\tmovq      %%mm1,%%mm3\n"
                  "\tpxor      %%mm5,%%mm5\n"
                  "\tpunpcklbw %%mm5,%%mm3\n"
                  "\tmovq      %4,%%mm5\n"
                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                  "\t" pdivwqX(mm4,mm5,mm7) "\n"

                  "\tmovq      %3,%%mm2\n"
                  "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                  "\tpxor      %%mm4,%%mm4\n"
                  "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                  "\tmovq      %%mm1,%%mm3\n"
                  "\tpxor      %%mm5,%%mm5\n"
                  "\tpunpckhbw %%mm5,%%mm3\n"
                  "\tmovq      %4,%%mm5\n"
                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
                  "\t" pdivwqX(mm4,mm5,mm6) "\n"

                  "\tmovq      %5,%%mm4\n"
                  "\tmovq      %%mm4,%%mm5\n"
                  "\tpsubusw     %%mm6,%%mm4\n"
                  "\tpsubusw     %%mm7,%%mm5\n"
                  
                  "\tpackuswb  %%mm4,%%mm5\n"

                  "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                  "\tmovq      %6,%%mm7\n"
                  "\tpand      %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */

                  "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                  "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                  "\tmovq      %%mm7,(%2); addl $8,%2\n"
                  : "+r" (op.A), "+r" (op.B), "+r" (op.D)
                  : "m" (va8_b255), "m" (va8_w1), "m" (va8_w255), "m" (va8_alpha_mask)
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
  }

  /* Tail: up to 3 remaining pixels handled with movd (4 bytes).
   * NOTE(review): a movd covers only 2 VA8 pixels; with exactly 3
   * remaining, the last one appears to be dropped -- confirm. */
  if (op.n_pixels) {
    asm volatile ("  movd      (%0),%%mm0\n"
                  "\tmovd      (%1),%%mm1\n"

                  "\tmovq      %3,%%mm2\n"
                  "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                  "\tpxor      %%mm4,%%mm4\n"
                  "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                  "\tmovq      %%mm1,%%mm3\n"
                  "\tpxor      %%mm5,%%mm5\n"
                  "\tpunpcklbw %%mm5,%%mm3\n"
                  "\tmovq      %4,%%mm5\n"
                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                  "\t" pdivwqX(mm4,mm5,mm7) "\n"

                  "\tmovq      %3,%%mm2\n"
                  "\tpsubb   %%mm0,%%mm2\n" /* mm2 = 255 - A */
                  "\tpxor      %%mm4,%%mm4\n"
                  "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                  "\tmovq      %%mm1,%%mm3\n"
                  "\tpxor      %%mm5,%%mm5\n"
                  "\tpunpckhbw %%mm5,%%mm3\n"
                  "\tmovq      %4,%%mm5\n"
                  "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
                  "\t" pdivwqX(mm4,mm5,mm6) "\n"

                  "\tmovq      %5,%%mm4\n"
                  "\tmovq      %%mm4,%%mm5\n"
                  "\tpsubusw     %%mm6,%%mm4\n"
                  "\tpsubusw     %%mm7,%%mm5\n"
                  
                  "\tpackuswb  %%mm4,%%mm5\n"

                  "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                  "\tmovq      %6,%%mm7\n"
                  "\tpand      %%mm7,%%mm1\n" /* mm1 = mm7 & alpha_mask */

                  "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~mm7 & mm5 */
                  "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                  "\tmovd      %%mm7,(%2)\n"
                  : /* empty */
                  : "r" (op.A), "r" (op.B), "r" (op.D), "m" (va8_b255), "m" (va8_w1), "m" (va8_w255), "m" (va8_alpha_mask)
                  : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
  }

  asm("emms");
}
+
/* Color-only composite, VA8 -- UNIMPLEMENTED placeholder ("xxx" prefix,
 * inside "#if 0"): copies the context and does nothing. */
void
xxxgimp_composite_coloronly_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

}
+
+void
+gimp_composite_darken_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");
+  asm("movq v8_alpha_mask, %mm0");
+  asm("subl $ 4, %ecx");
+  asm("jl .darken_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  asm(".darken_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("movq %mm2, %mm1");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .darken_pixels_1a_1a_loop");
+
+  asm(".darken_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .darken_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("movq %mm2, %mm1");
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  asm(".darken_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .darken_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("movq %mm2, %mm1");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".darken_pixels_1a_1a_end:");
+
+  asm("emms");
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
/*
 * Difference composite for VA8 pixels -- DISABLED: inside "#if 0".
 *
 * Basic asm with the same hand-rolled convention as the addition and
 * darken routines above: %edi from 12(%esp), A/B/count assumed in
 * %eax/%edx/%ecx (NOTE(review): incompatible with the
 * GimpCompositeContext* signature -- confirm before re-enabling).
 *
 * Per group: |A - B| per byte via the two-way psubusb trick
 * (paddb of psubusb(A,B) and psubusb(B,A)); the alpha bytes
 * (v8_alpha_mask in mm0) are handled separately through pandn/pand.
 * 4 pixels per loop, then 2-pixel and 1-pixel tails (the 2-pixel tail
 * computes mm1 but never stores it -- same apparent omission as the
 * siblings).
 */
void
gimp_composite_difference_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  asm("pushl %edi");
  asm("pushl %ebx");
  asm("movl 12(%esp), %edi");
  asm("movq v8_alpha_mask, %mm0");
  asm("subl $ 4, %ecx");
  asm("jl .difference_pixels_1a_1a_last3");
  asm("movl $ 8, %ebx");
  asm(".difference_pixels_1a_1a_loop:");
  asm("movq (%eax), %mm2");
  asm("movq (%edx), %mm3");

  asm("movq %mm2, %mm4");
  asm("movq %mm3, %mm5");
  asm("psubusb %mm3, %mm4");
  asm("psubusb %mm2, %mm5");
  asm("movq %mm0, %mm1");
  asm("paddb %mm5, %mm4");
  asm("pandn %mm4, %mm1");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("movq %mm1, (%edi)");
  asm("addl %ebx, %eax");
  asm("addl %ebx, %edx");
  asm("addl %ebx, %edi");
  asm("subl $ 4, %ecx");
  asm("jge .difference_pixels_1a_1a_loop");

  asm(".difference_pixels_1a_1a_last3:");
  asm("test $ 2, %ecx");
  asm("jz .difference_pixels_1a_1a_last1");
  asm("movd (%eax), %mm2");
  asm("movd (%edx), %mm3");

  asm("movq %mm2, %mm4");
  asm("movq %mm3, %mm5");
  asm("psubusb %mm3, %mm4");
  asm("psubusb %mm2, %mm5");
  asm("movq %mm0, %mm1");
  asm("paddb %mm5, %mm4");
  asm("pandn %mm4, %mm1");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("addl $ 4, %eax");
  asm("addl $ 4, %edx");
  asm("addl $ 4, %edi");

  asm(".difference_pixels_1a_1a_last1:");
  asm("test $ 1, %ecx");
  asm("jz .difference_pixels_1a_1a_end");

  asm("movw (%eax), %bx");
  asm("movd %ebx, %mm2");
  asm("movw (%edx), %bx");
  asm("movd %ebx, %mm3");

  asm("movq %mm2, %mm4");
  asm("movq %mm3, %mm5");
  asm("psubusb %mm3, %mm4");
  asm("psubusb %mm2, %mm5");
  asm("movq %mm0, %mm1");
  asm("paddb %mm5, %mm4");
  asm("pandn %mm4, %mm1");
  asm("psubb %mm4, %mm2");
  asm("pand %mm0, %mm2");
  asm("por %mm2, %mm1");
  asm("movd %mm1, %ebx");
  asm("movw %bx, (%edi)");

  asm(".difference_pixels_1a_1a_end:");

  asm("emms");
  asm("popl %ebx");
  asm("popl %edi");
}
+
+void
+xxxgimp_composite_dissolve_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;
+
+}
+
/* Divide composite, VA8 -- UNIMPLEMENTED placeholder ("xxx" prefix,
 * inside "#if 0"): copies the context and does nothing. */
void
xxxgimp_composite_divide_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

}
+
/* Dodge composite, VA8 -- UNIMPLEMENTED placeholder ("xxx" prefix,
 * inside "#if 0"): copies the context and does nothing. */
void
xxxgimp_composite_dodge_va8_va8_va8_mmx(GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

}
+
+void
+xxxgimp_composite_grainextract_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX grain-extract for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+void
+xxxgimp_composite_grainmerge_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX grain-merge for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+void
+xxxgimp_composite_hardlight_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX hard-light for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+void
+xxxgimp_composite_hueonly_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX hue-only for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+/*
+ * Lighten composite for va8 (value,alpha — 2 bytes/pixel) using MMX.
+ * Per pixel: the value byte gets max(A,B), computed as B + (A -sat B);
+ * the alpha byte gets min(A,B), computed as A - (A -sat B).
+ *
+ * NOTE(review): the asm has no operand/clobber lists.  It assumes the
+ * source pointers are already in %eax/%edx and the pixel count in %ecx,
+ * and loads the destination from 12(%esp) — a layout that only holds if
+ * the compiler emits no frame for the local `op` copy.  Confirm before
+ * enabling (the "xxx" prefix suggests it is currently disabled).
+ */
+void
+xxxgimp_composite_lighten_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;  /* generator boilerplate; unused by the asm below */
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");      /* destination pointer — see NOTE above */
+  asm("movq v8_alpha_mask, %mm0"); /* mm0 selects the alpha bytes of va8 pairs */
+  asm("subl $ 4, %ecx");
+  asm("jl .lighten_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  /* main loop: 4 va8 pixels (8 bytes) per iteration */
+  asm(".lighten_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");       /* mm4 = A -sat B */
+  asm("paddb %mm4, %mm3");         /* mm3 = B + (A -sat B) = max(A,B) */
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");         /* keep max(A,B) in the value bytes */
+
+  asm("psubb %mm4, %mm2");         /* mm2 = A - (A -sat B) = min(A,B) */
+  asm("pand %mm0, %mm2");          /* keep min(A,B) in the alpha bytes */
+  asm("por %mm2, %mm1");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .lighten_pixels_1a_1a_loop");
+
+  /* tail: 2 remaining pixels (4 bytes) */
+  asm(".lighten_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .lighten_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("paddb %mm4, %mm3");
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");
+
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  /* NOTE(review): the result in %mm1 is never stored in this tail —
+     looks like a missing movd %mm1,(%edi); compare the main loop. */
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  /* tail: last single pixel (2 bytes) */
+  asm(".lighten_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .lighten_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("paddb %mm4, %mm3");
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");
+
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".lighten_pixels_1a_1a_end:");
+
+  asm("emms");                     /* reset MMX state so the FPU is usable */
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
+/*
+ * Multiply composite for va8 pixels using MMX.
+ * Per pixel: value = A*B/255 (rounded via the add-high-byte trick);
+ * alpha = min(A,B).
+ *
+ * NOTE(review): %mm6 is used as an all-zero unpack source and %mm7 as a
+ * rounding constant, but neither is loaded anywhere in this function —
+ * they must be preset by the caller or this computes garbage; confirm.
+ * Also no operand/clobber lists: sources are assumed in %eax/%edx,
+ * count in %ecx, destination loaded from 12(%esp).
+ */
+void
+xxxgimp_composite_multiply_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;  /* generator boilerplate; unused by the asm below */
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");
+  asm("movq v8_alpha_mask, %mm0");
+  asm("subl $ 4, %ecx");
+  asm("jl .multiply_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  asm(".multiply_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+
+  /* low 4 bytes: mm1 = (A*B + rounding) / 255 as words */
+  asm("movq %mm2, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm3, %mm5");
+  asm("punpcklbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm1");
+  asm("psrlw $ 8, %mm1");
+  /* high 4 bytes: same computation into mm4 */
+  asm("movq %mm2, %mm4");
+  asm("punpckhbw %mm6, %mm4");
+  asm("movq %mm3, %mm5");
+  asm("punpckhbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm4");
+  asm("paddw %mm7, %mm4");
+  asm("movq %mm4, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm4");
+  asm("psrlw $ 8, %mm4");
+
+  asm("packuswb %mm4, %mm1");      /* repack 8 product bytes */
+
+  asm("movq %mm0, %mm4");
+  asm("pandn %mm1, %mm4");         /* keep products in the value bytes */
+  asm("movq %mm4, %mm1");
+  /* alpha = min(A,B) via A - (A -sat B) */
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .multiply_pixels_1a_1a_loop");
+
+  /* tail: 2 remaining pixels */
+  asm(".multiply_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .multiply_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+
+
+  asm("movq %mm2, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm3, %mm5");
+  asm("punpcklbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm1");
+  asm("psrlw $ 8, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("punpckhbw %mm6, %mm4");
+  asm("movq %mm3, %mm5");
+  asm("punpckhbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm4");
+  asm("paddw %mm7, %mm4");
+  asm("movq %mm4, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm4");
+  asm("psrlw $ 8, %mm4");
+
+  asm("packuswb %mm4, %mm1");
+
+  asm("movq %mm0, %mm4");
+  asm("pandn %mm1, %mm4");
+  asm("movq %mm4, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  /* NOTE(review): result in %mm1 never stored here — missing
+     movd %mm1,(%edi)?  The main loop and 1-pixel tail both store. */
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  /* tail: last single pixel */
+  asm(".multiply_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .multiply_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+
+
+  asm("movq %mm2, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm3, %mm5");
+  asm("punpcklbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm1");
+  asm("psrlw $ 8, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("punpckhbw %mm6, %mm4");
+  asm("movq %mm3, %mm5");
+  asm("punpckhbw %mm6, %mm5");
+  asm("pmullw %mm5, %mm4");
+  asm("paddw %mm7, %mm4");
+  asm("movq %mm4, %mm5");
+  asm("psrlw $ 8, %mm5");
+  asm("paddw %mm5, %mm4");
+  asm("psrlw $ 8, %mm4");
+
+  asm("packuswb %mm4, %mm1");
+
+  asm("movq %mm0, %mm4");
+  asm("pandn %mm1, %mm4");
+  asm("movq %mm4, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".multiply_pixels_1a_1a_end:");
+
+  asm("emms");
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
+/*
+ * Overlay composite for va8 pixels using MMX.  The per-pixel math is
+ * delegated to the asm helper `op_overlay` (defined elsewhere), which
+ * takes its inputs in %mm2/%mm3 and returns the result in %mm1.
+ *
+ * NOTE(review): no operand/clobber lists — sources assumed in
+ * %eax/%edx, count in %ecx, destination loaded from 12(%esp); and the
+ * 2-pixel tail below never stores %mm1 (missing movd %mm1,(%edi)?).
+ */
+void
+gimp_composite_overlay_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;  /* generator boilerplate; unused by the asm below */
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");
+  asm("movq v8_alpha_mask, %mm0");
+  asm("subl $ 4, %ecx");
+  asm("jl .overlay_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  /* main loop: 4 va8 pixels per iteration */
+  asm(".overlay_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+  asm("call op_overlay");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .overlay_pixels_1a_1a_loop");
+
+  /* tail: 2 remaining pixels — see NOTE about the missing store */
+  asm(".overlay_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .overlay_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+  asm("call op_overlay");
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  /* tail: last single pixel */
+  asm(".overlay_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .overlay_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+  asm("call op_overlay");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".overlay_pixels_1a_1a_end:");
+
+  asm("emms");
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
+void
+xxxgimp_composite_replace_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX replace for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+void
+xxxgimp_composite_saturationonly_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX saturation-only for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+/*
+ * Screen composite for va8 pixels using MMX.
+ * Per pixel the value byte gets 255 - (255-A)*(255-B)/255, computed on
+ * the complements (pcmpeqb makes an all-0xff register; psubb forms
+ * 255-x) with the same rounded multiply used by the multiply op.
+ *
+ * NOTE(review): %mm6/%mm7 are used as zero/rounding constants but are
+ * never loaded in this function — they must be preset by the caller.
+ * NOTE(review): in the alpha path below, %mm2 is read as if it still
+ * held a complemented source, but it was overwritten by the high-half
+ * product above (movq %mm4,%mm2 / punpckhbw) — verify against the
+ * rgba8 version.  No operand/clobber lists; same register/stack
+ * assumptions as the other va8 functions ("xxx" = disabled).
+ */
+void
+xxxgimp_composite_screen_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;  /* generator boilerplate; unused by the asm below */
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");
+  asm("movq v8_alpha_mask, %mm0");
+  asm("subl $ 4, %ecx");
+  asm("jl .screen_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  asm(".screen_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+
+  /* complements: mm4 = 255-A, mm5 = 255-B */
+  asm("pcmpeqb %mm4, %mm4");
+  asm("psubb %mm2, %mm4");
+  asm("pcmpeqb %mm5, %mm5");
+  asm("psubb %mm3, %mm5");
+  /* low 4 bytes: mm1 = (255-A)*(255-B)/255 */
+  asm("movq %mm4, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm5, %mm3");
+  asm("punpcklbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm1");
+  asm("psrlw $ 8, %mm1");
+  /* high 4 bytes into mm2 (clobbers the original A!) */
+  asm("movq %mm4, %mm2");
+  asm("punpckhbw %mm6, %mm2");
+  asm("movq %mm5, %mm3");
+  asm("punpckhbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm2");
+  asm("paddw %mm7, %mm2");
+  asm("movq %mm2, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm2");
+  asm("psrlw $ 8, %mm2");
+
+  asm("packuswb %mm2, %mm1");
+  /* mm3 = 255 - product = screen result */
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm1, %mm3");
+
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");         /* keep screen result in value bytes */
+  /* alpha path — see clobbered-%mm2 NOTE in the header comment */
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm5, %mm2");
+  asm("paddb %mm2, %mm5");
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm5, %mm3");
+
+  asm("pand %mm0, %mm3");
+  asm("por %mm3, %mm1");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .screen_pixels_1a_1a_loop");
+
+  /* tail: 2 remaining pixels (result never stored — see lighten NOTE) */
+  asm(".screen_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .screen_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+
+
+  asm("pcmpeqb %mm4, %mm4");
+  asm("psubb %mm2, %mm4");
+  asm("pcmpeqb %mm5, %mm5");
+  asm("psubb %mm3, %mm5");
+
+  asm("movq %mm4, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm5, %mm3");
+  asm("punpcklbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm1");
+  asm("psrlw $ 8, %mm1");
+
+  asm("movq %mm4, %mm2");
+  asm("punpckhbw %mm6, %mm2");
+  asm("movq %mm5, %mm3");
+  asm("punpckhbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm2");
+  asm("paddw %mm7, %mm2");
+  asm("movq %mm2, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm2");
+  asm("psrlw $ 8, %mm2");
+
+  asm("packuswb %mm2, %mm1");
+
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm1, %mm3");
+
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm5, %mm2");
+  asm("paddb %mm2, %mm5");
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm5, %mm3");
+
+  asm("pand %mm0, %mm3");
+  asm("por %mm3, %mm1");
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  /* tail: last single pixel */
+  asm(".screen_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .screen_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+
+
+  asm("pcmpeqb %mm4, %mm4");
+  asm("psubb %mm2, %mm4");
+  asm("pcmpeqb %mm5, %mm5");
+  asm("psubb %mm3, %mm5");
+
+  asm("movq %mm4, %mm1");
+  asm("punpcklbw %mm6, %mm1");
+  asm("movq %mm5, %mm3");
+  asm("punpcklbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm1");
+  asm("paddw %mm7, %mm1");
+  asm("movq %mm1, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm1");
+  asm("psrlw $ 8, %mm1");
+
+  asm("movq %mm4, %mm2");
+  asm("punpckhbw %mm6, %mm2");
+  asm("movq %mm5, %mm3");
+  asm("punpckhbw %mm6, %mm3");
+  asm("pmullw %mm3, %mm2");
+  asm("paddw %mm7, %mm2");
+  asm("movq %mm2, %mm3");
+  asm("psrlw $ 8, %mm3");
+  asm("paddw %mm3, %mm2");
+  asm("psrlw $ 8, %mm2");
+
+  asm("packuswb %mm2, %mm1");
+
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm1, %mm3");
+
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm3, %mm1");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm5, %mm2");
+  asm("paddb %mm2, %mm5");
+  asm("pcmpeqb %mm3, %mm3");
+  asm("psubb %mm5, %mm3");
+
+  asm("pand %mm0, %mm3");
+  asm("por %mm3, %mm1");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".screen_pixels_1a_1a_end:");
+
+  asm("emms");
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
+void
+xxxgimp_composite_softlight_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX soft-light for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+/*
+ * Subtract composite for va8 pixels using MMX.
+ * Per pixel: value = A -sat B (saturated, clamped at 0);
+ * alpha = min(A,B), computed as A - (A -sat B).
+ *
+ * Fixes over the generated code:
+ *  - asm labels were misspelled ".substract_*"; renamed to match the
+ *    function (all definitions and uses are local to this function).
+ *  - the 2-pixel tail computed the result into %mm1 and then discarded
+ *    it; added the missing store, matching the main loop and the
+ *    1-pixel tail.
+ * NOTE(review): no operand/clobber lists — sources assumed in
+ * %eax/%edx, count in %ecx, destination from 12(%esp); the "xxx"
+ * prefix suggests the function is currently disabled.
+ */
+void
+xxxgimp_composite_subtract_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  GimpCompositeContext op = *_op;  /* generator boilerplate; unused by the asm below */
+
+  asm("pushl %edi");
+  asm("pushl %ebx");
+  asm("movl 12(%esp), %edi");
+  asm("movq v8_alpha_mask, %mm0");
+  asm("subl $ 4, %ecx");
+  asm("jl .subtract_pixels_1a_1a_last3");
+  asm("movl $ 8, %ebx");
+  /* main loop: 4 va8 pixels (8 bytes) per iteration */
+  asm(".subtract_pixels_1a_1a_loop:");
+  asm("movq (%eax), %mm2");
+  asm("movq (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");       /* mm4 = A -sat B */
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm4, %mm1");         /* value bytes = saturated difference */
+  asm("psubb %mm4, %mm2");         /* mm2 = A - (A -sat B) = min(A,B) */
+  asm("pand %mm0, %mm2");          /* alpha bytes = min(A,B) */
+  asm("por %mm2, %mm1");
+  asm("movq %mm1, (%edi)");
+  asm("addl %ebx, %eax");
+  asm("addl %ebx, %edx");
+  asm("addl %ebx, %edi");
+  asm("subl $ 4, %ecx");
+  asm("jge .subtract_pixels_1a_1a_loop");
+
+  /* tail: 2 remaining pixels (4 bytes) */
+  asm(".subtract_pixels_1a_1a_last3:");
+  asm("test $ 2, %ecx");
+  asm("jz .subtract_pixels_1a_1a_last1");
+  asm("movd (%eax), %mm2");
+  asm("movd (%edx), %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm4, %mm1");
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  asm("movd %mm1, (%edi)");        /* bug fix: result was previously discarded */
+  asm("addl $ 4, %eax");
+  asm("addl $ 4, %edx");
+  asm("addl $ 4, %edi");
+
+  /* tail: last single pixel (2 bytes) */
+  asm(".subtract_pixels_1a_1a_last1:");
+  asm("test $ 1, %ecx");
+  asm("jz .subtract_pixels_1a_1a_end");
+
+  asm("movw (%eax), %bx");
+  asm("movd %ebx, %mm2");
+  asm("movw (%edx), %bx");
+  asm("movd %ebx, %mm3");
+
+  asm("movq %mm2, %mm4");
+  asm("psubusb %mm3, %mm4");
+  asm("movq %mm0, %mm1");
+  asm("pandn %mm4, %mm1");
+  asm("psubb %mm4, %mm2");
+  asm("pand %mm0, %mm2");
+  asm("por %mm2, %mm1");
+  asm("movd %mm1, %ebx");
+  asm("movw %bx, (%edi)");
+
+  asm(".subtract_pixels_1a_1a_end:");
+  asm("emms");                     /* reset MMX state so the FPU is usable */
+  asm("popl %ebx");
+  asm("popl %edi");
+}
+
+void
+xxxgimp_composite_swap_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX swap for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+
+void
+xxxgimp_composite_valueonly_va8_va8_va8_mmx(GimpCompositeContext *_op)
+{
+  /* Unimplemented MMX value-only for va8 pixels; unused boilerplate copy. */
+  GimpCompositeContext op = *_op;
+
+}
+#endif
+
+/*
+ * One-time initialisation hook for the MMX compositing backend.
+ * Currently a no-op; kept so the composite subsystem has a uniform
+ * per-backend init entry point.
+ */
+void
+gimp_composite_mmx_init(void)  /* (void): proper prototype, was an old-style empty parameter list */
+{
+
+}
diff --git a/app/composite/gimp-composite-mmx.h b/app/composite/gimp-composite-mmx.h
new file mode 100644
index 0000000000..944d6a1856
--- /dev/null
+++ b/app/composite/gimp-composite-mmx.h
@@ -0,0 +1,51 @@
+#ifndef gimp_composite_mmx_h
+#define gimp_composite_mmx_h
+/*
+ * Prototypes for the MMX compositing functions.  Guard renamed from "gimp_composite_context_h", which did not match this file and risked colliding with another header.  NOTE(review): several of the va8 functions are currently defined with an "xxx" prefix in gimp-composite-mmx.c, so these declarations do not all resolve — confirm.
+ */
+extern void gimp_composite_addition_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_burn_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_coloronly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_darken_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_difference_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_dissolve_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_divide_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_dodge_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_grainextract_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_grainmerge_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_hardlight_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_hueonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_lighten_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_multiply_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_overlay_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_replace_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_saturationonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_screen_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_softlight_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_subtract_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_swap_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+extern void gimp_composite_valueonly_rgba8_rgba8_rgba8_mmx(GimpCompositeContext *);
+
+extern void gimp_composite_addition_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_burn_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_coloronly_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_darken_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_difference_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_dissolve_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_divide_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_dodge_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_grainextract_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_grainmerge_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_hardlight_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_hueonly_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_lighten_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_multiply_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_overlay_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_replace_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_saturationonly_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_screen_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_softlight_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_subtract_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_swap_va8_va8_va8_mmx(GimpCompositeContext *);
+extern void gimp_composite_valueonly_va8_va8_va8_mmx(GimpCompositeContext *);
+#endif
diff --git a/app/composite/gimp-composite-util.h b/app/composite/gimp-composite-util.h
new file mode 100644
index 0000000000..9be706610d
--- /dev/null
+++ b/app/composite/gimp-composite-util.h
@@ -0,0 +1,30 @@
+#ifndef gimp_composite_util
+#define gimp_composite_util
+/*
+ * Plain per-pixel struct layouts (one unsigned char per channel).
+ */
+
+typedef struct {
+  unsigned char r;
+  unsigned char g;
+  unsigned char b;
+  unsigned char a;
+} rgba8_t;  /* GIMP_PIXELFORMAT_RGBA8 */
+
+typedef struct {
+  unsigned char r;
+  unsigned char g;
+  unsigned char b;
+} rgb8_t;   /* GIMP_PIXELFORMAT_RGB8 */
+
+typedef struct {
+  unsigned char v;
+} v8_t;     /* GIMP_PIXELFORMAT_V8 */
+
+typedef struct {
+  unsigned char v;
+  unsigned char a;
+} va8_t;    /* GIMP_PIXELFORMAT_VA8 */
+
+extern int gimp_composite_bpp[];  /* NOTE(review): gimp-composite.c defines "unsigned char gimp_composite_pixel_bpp[]" — this name/type looks out of sync; confirm */
+#endif
diff --git a/app/composite/gimp-composite.c b/app/composite/gimp-composite.c
new file mode 100644
index 0000000000..acc4de20bc
--- /dev/null
+++ b/app/composite/gimp-composite.c
@@ -0,0 +1,172 @@
+/* The GIMP -- an image manipulation program
+ * Copyright (C) 1995 Spencer Kimball and Peter Mattis
+ *
+ * Gimp image compositing
+ * Copyright (C) 2003  Helvetix Victorinox, a pseudonym, <helvetix@gimp.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+/*
+ * $Id$
+ */
+#include <stdio.h>
+
+#include "gimp-composite.h"
+
+/*
+ * Details about pixel formats, bits-per-pixel alpha and non alpha
+ * versions of pixel formats.
+ */
+/*
+ * Report on the number of bytes a particular pixel format consumes per pixel.
+ */
+unsigned char gimp_composite_pixel_bpp[] = { /* indexed by GimpPixelFormat */
+  1, /* GIMP_PIXELFORMAT_V8      */
+  2, /* GIMP_PIXELFORMAT_VA8     */
+  3, /* GIMP_PIXELFORMAT_RGB8    */
+  4, /* GIMP_PIXELFORMAT_RGBA8   */
+#if GIMP_16BITCOLOR
+  2, /* GIMP_PIXELFORMAT_V16     */
+  4, /* GIMP_PIXELFORMAT_VA16    */
+  6, /* GIMP_PIXELFORMAT_RGB16   */
+  8, /* GIMP_PIXELFORMAT_RGBA16  */
+#endif
+  0, /* GIMP_PIXELFORMAT_ANY */
+};
+
+char *gimp_composite_pixel_name[] = { /* printable name per GimpPixelFormat; same order as the enum */
+  "GIMP_PIXELFORMAT_V8",
+  "GIMP_PIXELFORMAT_VA8",
+  "GIMP_PIXELFORMAT_RGB8",
+  "GIMP_PIXELFORMAT_RGBA8",
+#if GIMP_16BITCOLOR
+  "GIMP_PIXELFORMAT_V16",
+  "GIMP_PIXELFORMAT_VA16",
+  "GIMP_PIXELFORMAT_RGB16",
+  "GIMP_PIXELFORMAT_RGBA16",
+#endif
+  "GIMP_PIXELFORMAT_ANY",
+};
+/*
+ * Report true (non-zero) if a pixel format has alpha.
+ */
+unsigned char gimp_composite_pixel_alphap[] = { /* indexed by GimpPixelFormat */
+  0, /* GIMP_PIXELFORMAT_V8      */
+  1, /* GIMP_PIXELFORMAT_VA8     */
+  0, /* GIMP_PIXELFORMAT_RGB8    */
+  1, /* GIMP_PIXELFORMAT_RGBA8   */
+#if GIMP_16BITCOLOR
+  0, /* GIMP_PIXELFORMAT_V16     */
+  1, /* GIMP_PIXELFORMAT_VA16    */
+  0, /* GIMP_PIXELFORMAT_RGB16   */
+  1, /* GIMP_PIXELFORMAT_RGBA16  */
+#endif
+  0, /* GIMP_PIXELFORMAT_ANY (was mislabelled "UNKNOWN") */
+};
+
+/*
+ * Convert to/from pixel formats with/without alpha.
+ */
+GimpPixelFormat gimp_composite_pixel_alpha[] = { /* maps each format to its with/without-alpha counterpart */
+  GIMP_PIXELFORMAT_VA8,         /* GIMP_PIXELFORMAT_V8      */
+  GIMP_PIXELFORMAT_V8,          /* GIMP_PIXELFORMAT_VA8     */
+  GIMP_PIXELFORMAT_RGBA8,       /* GIMP_PIXELFORMAT_RGB8    */
+  GIMP_PIXELFORMAT_RGB8,        /* GIMP_PIXELFORMAT_RGBA8   */
+#if GIMP_16BITCOLOR
+  GIMP_PIXELFORMAT_VA16,
+  GIMP_PIXELFORMAT_V16,
+  GIMP_PIXELFORMAT_RGBA16,
+  GIMP_PIXELFORMAT_RGB16,       /* bug fix: missing trailing comma broke the 16-bit build */
+#endif
+  GIMP_PIXELFORMAT_ANY,         /* GIMP_PIXELFORMAT_ANY */
+};
+
+
+/*
+ * XXX I don't like to put this here.  I think this information,
+ * specific to the functions, ought to be with the function.
+ */
+struct GimpCompositeOperationEffects gimp_composite_operation_effects[] = { /* per row: affect, increase, decrease opacity */
+  { TRUE,  TRUE,  FALSE, },     /*  GIMP_NORMAL_MODE        */
+  { TRUE,  TRUE,  FALSE, },     /*  GIMP_DISSOLVE_MODE      */
+  { TRUE,  TRUE,  FALSE, },     /*  GIMP_BEHIND_MODE        */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_MULTIPLY_MODE      */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_SCREEN_MODE        */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_OVERLAY_MODE       */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_DIFFERENCE_MODE    */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_ADDITION_MODE      */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_SUBTRACT_MODE      */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_DARKEN_ONLY_MODE   */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_LIGHTEN_ONLY_MODE  */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_HUE_MODE           */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_SATURATION_MODE    */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_COLOR_MODE         */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_VALUE_MODE         */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_DIVIDE_MODE        */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_DODGE_MODE         */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_BURN_MODE          */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_HARDLIGHT_MODE     */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_SOFTLIGHT_MODE     */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_GRAIN_EXTRACT_MODE */
+  { FALSE, FALSE, FALSE, },     /*  GIMP_GRAIN_MERGE_MODE   */
+  { TRUE,  FALSE, TRUE,  },     /*  GIMP_COLOR_ERASE_MODE   */
+  { TRUE,  FALSE, TRUE,  },     /*  GIMP_ERASE_MODE         */
+  { TRUE,  TRUE,  TRUE,  },     /*  GIMP_REPLACE_MODE       */
+  { TRUE,  TRUE,  FALSE, },     /*  GIMP_ANTI_ERASE_MODE    */
+  /* NOTE(review): GimpCompositeOperation also defines BLEND and SHADE between ANTI_ERASE and SWAP; if this table is indexed by op, two rows appear to be missing here — confirm. */
+  { FALSE, FALSE, FALSE },      /*  GIMP_SWAP */
+  { FALSE, FALSE, FALSE },      /*  GIMP_SCALE */
+  { FALSE, FALSE, FALSE },      /*  GIMP_CONVERT */
+};
+
+void
+gimp_composite_unsupported(GimpCompositeContext *ctx)
+{
+  printf("compositing function %d unsupported\n", ctx->op);  /* diagnostic only; ctx is not modified */
+}
+
+struct {
+  char announce_function;  /* non-zero presumably makes the dispatcher announce each call — TODO confirm; not read anywhere in this file */
+} gimp_composite_debug;
+
+#include "gimp-composite-dispatch.c"
+
+void
+gimp_composite_dispatch(GimpCompositeContext *ctx)  /* look up and invoke the handler for (op, A, B, D formats); falls back to a diagnostic */
+{
+  void (*function)(GimpCompositeContext *);  /* typed pointer (was an unprototyped empty () declaration) */
+
+  function = gimp_composite_function[ctx->op][ctx->pixelformat_A][ctx->pixelformat_B][ctx->pixelformat_D];
+
+  if (function)
+    (*function)(ctx);
+  else {
+    /* include pixelformat_D: the lookup above uses it too */
+    printf("unsupported composite operation %d %d %d %d (see gimp-composite.h)\n", ctx->op, ctx->pixelformat_A, ctx->pixelformat_B, ctx->pixelformat_D);
+  }
+}
+
+void
+gimp_composite_context_print(GimpCompositeContext *ctx)  /* debugging aid: dump one context to stdout */
+{
+  printf("%p: %s op=%d A=%s(%d):%p B=%s(%d):%p D=%s(%d):%p M=%s(%d):%p n_pixels=%lu\n",
+         ctx,
+         gimp_composite_function_name[ctx->op][ctx->pixelformat_A][ctx->pixelformat_B][ctx->pixelformat_D],
+         ctx->op,
+         gimp_composite_pixel_name[ctx->pixelformat_A], ctx->pixelformat_A, ctx->A,
+         gimp_composite_pixel_name[ctx->pixelformat_B], ctx->pixelformat_B, ctx->B, /* bug fix: printed ctx->A */
+         gimp_composite_pixel_name[ctx->pixelformat_D], ctx->pixelformat_D, ctx->D, /* bug fix: printed ctx->A */
+         gimp_composite_pixel_name[ctx->pixelformat_M], ctx->pixelformat_M, ctx->M, /* bug fix: printed ctx->A */
+         ctx->n_pixels);
+}
diff --git a/app/composite/gimp-composite.h b/app/composite/gimp-composite.h
new file mode 100644
index 0000000000..66eeb3c276
--- /dev/null
+++ b/app/composite/gimp-composite.h
@@ -0,0 +1,182 @@
+/* The GIMP -- an image manipulation program
+ * Copyright (C) 1995 Spencer Kimball and Peter Mattis
+ *
+ * Gimp Image Compositing
+ * Copyright (C) 2003  Helvetix Victorinox, a pseudonym, <helvetix@gimp.org>
+ * $Id$
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#ifndef gimp_composite_h
+#define gimp_composite_h
+
+#include <sys/types.h>
+#include <glib-object.h>
+#include "base/base-enums.h"
+#include "paint-funcs/paint-funcs-types.h"
+
+#ifndef NULL
+#define NULL ((void *) 0)  /* bug fix: was ((void) 0) — a void expression, not a null pointer constant */
+#endif
+
+typedef enum {
+  GIMP_PIXELFORMAT_V8,     /* 1 byte:  value */
+  GIMP_PIXELFORMAT_VA8,    /* 2 bytes: value + alpha */
+  GIMP_PIXELFORMAT_RGB8,   /* 3 bytes */
+  GIMP_PIXELFORMAT_RGBA8,  /* 4 bytes */
+#if GIMP_16BITCOLOR
+  GIMP_PIXELFORMAT_V16,
+  GIMP_PIXELFORMAT_VA16,
+  GIMP_PIXELFORMAT_RGB16,
+  GIMP_PIXELFORMAT_RGBA16,
+#endif
+  GIMP_PIXELFORMAT_ANY,    /* wildcard entry used by the dispatch tables */
+  GIMP_PIXELFORMAT_N       /* number of formats; keep last */
+} GimpPixelFormat;
+
+typedef struct {
+  u_int8_t v;
+} gimp_v8_t;    /* GIMP_PIXELFORMAT_V8 */
+
+typedef struct {
+  u_int8_t v;
+  u_int8_t a;
+} gimp_va8_t;   /* GIMP_PIXELFORMAT_VA8 */
+
+typedef struct {
+  u_int8_t r;
+  u_int8_t g;
+  u_int8_t b;
+} gimp_rgb8_t;  /* GIMP_PIXELFORMAT_RGB8 */
+
+typedef struct {
+  u_int8_t r;
+  u_int8_t g;
+  u_int8_t b;
+  u_int8_t a;
+} gimp_rgba8_t; /* GIMP_PIXELFORMAT_RGBA8.  NOTE(review): u_int8_t comes from <sys/types.h> (BSD/SysV), not standard C — consider uint8_t or glib's guint8 for portability */
+
+#if GIMP_16BITCOLOR  /* bug fix: was "#ifdef GIMP_16BITCOLOUR" — wrong spelling and wrong test, so these types never tracked the "#if GIMP_16BITCOLOR" blocks elsewhere in this header */
+typedef struct {
+  u_int16_t v;
+} gimp_v16_t;
+
+typedef struct {
+  u_int16_t v;
+  u_int16_t a;
+} gimp_va16_t;
+
+typedef struct {
+  u_int16_t r;
+  u_int16_t g;
+  u_int16_t b;
+} gimp_rgb16_t;
+
+typedef struct {
+  u_int16_t r;
+  u_int16_t g;
+  u_int16_t b;
+  u_int16_t a;
+} gimp_rgba16_t;
+#endif
+
+extern unsigned char gimp_composite_pixel_bpp[]; /* bytes per-pixel for each of the pixel formats */
+extern unsigned char gimp_composite_pixel_alphap[]; /* does pixel format have alpha? */
+extern GimpPixelFormat gimp_composite_pixel_alpha[]; /* converter between alpha and non-alpha pixel formats */
+
+#define GIMP_COMPOSITE_ALPHA_OPAQUE (-1)
+#define GIMP_COMPOSITE_ALPHA_TRANSPARENT (0)
+/*
+ * This is the enumeration of all the supported compositing
+ * operations.  Many of them are taken from the GimpLayerModeEffect
+ * enumeration, but some additional operations are implemented here
+ * as well.  This is where they are all enumerated.
+ *
+ * Nota Bene: Unfortunately, the order here is important!
+ */
+typedef enum {
+  GIMP_COMPOSITE_NORMAL        = GIMP_NORMAL_MODE,
+  GIMP_COMPOSITE_DISSOLVE      = GIMP_DISSOLVE_MODE,
+  GIMP_COMPOSITE_BEHIND        = GIMP_BEHIND_MODE,
+  GIMP_COMPOSITE_MULTIPLY      = GIMP_MULTIPLY_MODE,
+  GIMP_COMPOSITE_SCREEN        = GIMP_SCREEN_MODE,
+  GIMP_COMPOSITE_OVERLAY       = GIMP_OVERLAY_MODE,
+  GIMP_COMPOSITE_DIFFERENCE    = GIMP_DIFFERENCE_MODE,
+  GIMP_COMPOSITE_ADDITION      = GIMP_ADDITION_MODE,
+  GIMP_COMPOSITE_SUBTRACT      = GIMP_SUBTRACT_MODE,
+  GIMP_COMPOSITE_DARKEN        = GIMP_DARKEN_ONLY_MODE,
+  GIMP_COMPOSITE_LIGHTEN       = GIMP_LIGHTEN_ONLY_MODE,
+  GIMP_COMPOSITE_HUE           = GIMP_HUE_MODE,
+  GIMP_COMPOSITE_SATURATION    = GIMP_SATURATION_MODE,
+  GIMP_COMPOSITE_COLOR_ONLY    = GIMP_COLOR_MODE,
+  GIMP_COMPOSITE_VALUE         = GIMP_VALUE_MODE,
+  GIMP_COMPOSITE_DIVIDE        = GIMP_DIVIDE_MODE,
+  GIMP_COMPOSITE_DODGE         = GIMP_DODGE_MODE,
+  GIMP_COMPOSITE_BURN          = GIMP_BURN_MODE,
+  GIMP_COMPOSITE_HARDLIGHT     = GIMP_HARDLIGHT_MODE,
+  GIMP_COMPOSITE_SOFTLIGHT     = GIMP_SOFTLIGHT_MODE,
+  GIMP_COMPOSITE_GRAIN_EXTRACT = GIMP_GRAIN_EXTRACT_MODE,
+  GIMP_COMPOSITE_GRAIN_MERGE   = GIMP_GRAIN_MERGE_MODE,
+  GIMP_COMPOSITE_COLOR_ERASE   = GIMP_COLOR_ERASE_MODE,
+  GIMP_COMPOSITE_ERASE         = GIMP_ERASE_MODE,
+  GIMP_COMPOSITE_REPLACE       = GIMP_REPLACE_MODE,
+  GIMP_COMPOSITE_ANTI_ERASE    = GIMP_ANTI_ERASE_MODE,
+  GIMP_COMPOSITE_BLEND,        /* operations from here on have no GimpLayerModeEffect counterpart */
+  GIMP_COMPOSITE_SHADE,
+  GIMP_COMPOSITE_SWAP,
+  GIMP_COMPOSITE_SCALE,
+  GIMP_COMPOSITE_CONVERT,
+  GIMP_COMPOSITE_N             /* number of operations; keep last */
+} GimpCompositeOperation;
+
+struct GimpCompositeOperationEffects {
+  unsigned char affect_opacity;   /* boolean: operation touches opacity at all */
+  unsigned char increase_opacity; /* boolean: operation may raise opacity */
+  unsigned char decrease_opacity; /* boolean: operation may lower opacity */
+};
+
+extern struct GimpCompositeOperationEffects gimp_composite_operation_effects[];
+
+/*
+ * This is structure for communicating all that is necessary to a
+ * compositing operation.
+ */
+typedef struct {
+  unsigned char *A;             /* Source A */
+  unsigned char *B;             /* Source B */
+  unsigned char *D;             /* Destination */
+  unsigned char *M;             /* Mask */
+  unsigned long n_pixels;       /* number of pixels to process */
+
+  GimpPixelFormat pixelformat_A; /* formats of the A/B/D/M buffers above */
+  GimpPixelFormat pixelformat_B;
+  GimpPixelFormat pixelformat_D;
+  GimpPixelFormat pixelformat_M;
+
+  struct { int opacity; char affect;  } replace;  /* parameters for GIMP_COMPOSITE_REPLACE */
+  struct { int scale;                 } scale;    /* parameters for GIMP_COMPOSITE_SCALE */
+  struct { int blend;                 } blend;    /* parameters for GIMP_COMPOSITE_BLEND */
+  struct { int x; int y; int opacity; } dissolve; /* parameters for GIMP_COMPOSITE_DISSOLVE */
+
+  CombinationMode combine;
+  GimpCompositeOperation op;    /* which compositing operation to dispatch */
+} GimpCompositeContext;
+
+
+extern void gimp_composite_dispatch(GimpCompositeContext *);
+extern void gimp_composite_init();
+extern void gimp_composite_context_print(GimpCompositeContext *);
+#endif
diff --git a/app/composite/gimp-composite.html b/app/composite/gimp-composite.html
new file mode 100644
index 0000000000..2a487a9a47
--- /dev/null
+++ b/app/composite/gimp-composite.html
@@ -0,0 +1,82 @@
+    <h1>A GIMP Image Compositing Subsystem</h1>
+<quote>
+  Update.  The latest version of this code will be available in the
+  cvs version of The GIMP "real soon now."  Instead of tracking this
+  software separately, you can simply wait for it to show up in CVS.
+</quote>
+    <p>
+      On February 26, 2003 I volunteered to help with the GIMP <abbr
+      title="Intel Multimedia Extensions">MMX</abbr> implementation
+      that had been languishing and had recently started to cause
+      problems when building the current GIMP code.
+    </p>
+    <p>
+      <a href="gimp-composite.tgz">This</a> is release 0.0 of an extensible and customisable image
+      compositing interface for the GIMP.  I'd like to hear feedback.
+    </p>
+    <p>
+      What you get is this:
+    </p>
+    <ul>
+      <li style="margin-bottom: 1em;">
+        A general mechanism for incorporating compositing functions based
+        upon the compositing function and the pixel formats of the inputs and
+        the outputs of the function.
+      </li>
+      <li style="margin-bottom: 1em;">
+        Generic implementations of the supported compositing functions as a
+        foundation for further/future improvements.  You can see this code in
+        gimp-composite-generic.c which is a direct "port" of the existing GIMP
+        code which does the same.
+      </li>
+      <li style="margin-bottom: 1em;">
+        The general mechanism allows any compositing function
+        implementation to be replaced by a different implementation that is,
+        for example, customised for the target CPU, or pixel formats, hardware
+        acceleration, and so forth.  You can see this sort of code in
+        gimp-composite-mmx.c which contains implementations of several
+        compositing functions optimised with MMX assembly code.
+      </li>
+    </ul>
+      
+    <h2>Caveat</h2>
+    <p>
+      While I've been using this code, and working out the various problems
+      as they occur, this code is still immature and you may experience
+      problems. If you do, please tell me.
+    </p>
+
+    <h2>Installation</h2>
+    <p>
+      To use this you:
+    </p>
+    <ul>
+      <li style="margin-bottom: 1em;">
+        Untar the gimp-composite.tgz tarball,
+        <pre style="border: 1px solid purple; padding: 1ex;">% tar xzf gimp-composite.tgz</pre>
+      </li>
+      <li style="margin-bottom: 1em;">
+        Edit <tt>gimp-composite/Makefile</tt> to set the values of three variables
+        to correspond to your local environment.  For example, my values are:
+        <pre style="border: 1px solid purple; padding: 1ex;">
+GLIBINCLUDE=/home/helvetix/garnome/include/glib-2.0
+GLIBLIB=/home/helvetix/garnome/lib/glib-2.0/
+GIMP=/home/helvetix/Gnome/gimp</pre>
+        Note that the <tt>GIMP</tt> variable points to a clean cvs checkout of the
+        gimp-1.3 source code, <b>not</b> the path name of the gimp executable.
+      </li>
+      <li style="margin-bottom: 1em;">
+        In the gimp-composite/ directory, execute "make install"
+        <pre style="border: 1px solid purple; padding: 1ex;">% cd gimp-composite ; make install</pre>
+      </li>
+      <li style="margin-bottom: 1em;">
+        In the gimp-1.3 source directory, execute "autogen.sh && make"
+        <pre style="border: 1px solid purple; padding: 1ex;">% cd gimp ; ./autogen.sh && make</pre>
+      </li>
+      <li style="margin-bottom: 1em;">
+        You can install the resultant gimp, or you can run it in place.
+      </li>
+    </ul>
+    <p>
+      Enjoy!  Comments, feedback, complaints to me: HELVETIX Mysterious.ORG
+    </p>
diff --git a/app/composite/make-gimp-composite-dispatch.py b/app/composite/make-gimp-composite-dispatch.py
new file mode 100755
index 0000000000..79c33850bb
--- /dev/null
+++ b/app/composite/make-gimp-composite-dispatch.py
@@ -0,0 +1,460 @@
+#!/usr/bin/env python
+# -*- mode: python py-indent-offset: 2; -*-
+#
+# Gimp image compositing
+# Copyright (C) 2003  Helvetix Victorinox, <helvetix@gimp.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+import sys
+import string
+import os
+import ns
+import pprint
+import getopt
+import copy
+
+#
+# This programme creates C code for gluing a collection of compositing
+# functions into an array indexed by compositing function, and the
+# pixel formats of its arguments.
+#
+# I make some assumptions about the names of the compositing functions.
+#
+# I look into the namespace of a set of object files and figure out
+# from them what compositing functions are implemented.  This lets me
+# build a table with the right cells populated with either the special
+# compositing functions, or to use a generically implemented
+# compositing function.
+
+
+# These are in the same order as they appear in the
+# ./app/base/base-enums.h GimpLayerModeEffects enumeration, because we
+# (probably unwisely) use the value of the enumeration as an index
+# into the Big Table.
+#
+# XXX I'd like some python functions that let me rummage around in C code....
+#
+composite_modes=[
+  "GIMP_COMPOSITE_NORMAL",
+  "GIMP_COMPOSITE_DISSOLVE",
+  "GIMP_COMPOSITE_BEHIND",
+  "GIMP_COMPOSITE_MULTIPLY",
+  "GIMP_COMPOSITE_SCREEN",
+  "GIMP_COMPOSITE_OVERLAY",
+  "GIMP_COMPOSITE_DIFFERENCE",
+  "GIMP_COMPOSITE_ADDITION",
+  "GIMP_COMPOSITE_SUBTRACT",
+  "GIMP_COMPOSITE_DARKEN",
+  "GIMP_COMPOSITE_LIGHTEN",
+  "GIMP_COMPOSITE_HUE",
+  "GIMP_COMPOSITE_SATURATION",
+  "GIMP_COMPOSITE_COLOR_ONLY",
+  "GIMP_COMPOSITE_VALUE",
+  "GIMP_COMPOSITE_DIVIDE",
+  "GIMP_COMPOSITE_DODGE",
+  "GIMP_COMPOSITE_BURN",
+  "GIMP_COMPOSITE_HARDLIGHT",
+  "GIMP_COMPOSITE_SOFTLIGHT",
+  "GIMP_COMPOSITE_GRAIN_EXTRACT",
+  "GIMP_COMPOSITE_GRAIN_MERGE",
+  "GIMP_COMPOSITE_COLOR_ERASE",
+  "GIMP_COMPOSITE_ERASE" ,
+  "GIMP_COMPOSITE_REPLACE" ,
+  "GIMP_COMPOSITE_ANTI_ERASE",
+  "GIMP_COMPOSITE_BLEND",
+  "GIMP_COMPOSITE_SHADE",
+  "GIMP_COMPOSITE_SWAP",
+  "GIMP_COMPOSITE_SCALE",
+  "GIMP_COMPOSITE_CONVERT",
+  ]
+
+pixel_format=[
+  "GIMP_PIXELFORMAT_V8",
+  "GIMP_PIXELFORMAT_VA8",
+  "GIMP_PIXELFORMAT_RGB8",
+  "GIMP_PIXELFORMAT_RGBA8",
+#  "GIMP_PIXELFORMAT_V16",
+#  "GIMP_PIXELFORMAT_VA16",
+#  "GIMP_PIXELFORMAT_RGB16",
+#  "GIMP_PIXELFORMAT_RGBA16"
+  "GIMP_PIXELFORMAT_ANY",
+  ]
+
+
+def pixel_depth_name(pixel_format):
+  s = string.replace(pixel_format.lower(), "gimp_pixelformat_", "")
+  return (s)
+
+pp = pprint.PrettyPrinter(indent=4)
+
+
+def functionnameify(filename):
+  f = os.path.basename(filename)
+  f = string.replace(f, ".o", "")
+  f = string.replace(f, ".c", "")
+  f = string.replace(f, ".h", "")
+  f = string.replace(f, "-", "_")
+  return (f)
+
+def print_function_table(filename, function_table):
+
+  function_table_declarations = dict()
+
+  function_table_keys = function_table.keys()
+  function_table_keys.sort()
+  
+  for key in function_table_keys:
+    if not function_table_declarations.has_key(function_table[key][0]):
+      print "void %s(GimpCompositeContext *);" % (function_table[key][0])
+      function_table_declarations[function_table[key][0]] = function_table[key][0]
+      pass
+    pass
+
+  print ""
+  print "void (*%s[%d][%d][%d][%d])() = {" % (functionnameify(filename),
+                                              len(composite_modes),
+                                              len(pixel_format)-1,
+                                              len(pixel_format)-1,
+                                              len(pixel_format)-1)
+  for mode in composite_modes:
+    print " { /* %s */" % (mode)
+    for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+      print "  { /* A = %s */" % (pixel_depth_name(A))
+      for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        print "   /* %-6s */ {" % (pixel_depth_name(B)),
+        for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+          if function_table.has_key(key):
+            print "%s, " % (function_table[key][0]),
+          else:
+            print "%s, " % ("NULL"),
+            pass
+          pass
+        print "},"
+        pass
+      print "  },"
+      pass
+    print " },"
+    pass
+
+  print "};\n"
+  
+  return
+  
+def print_function_table_name(filename, function_table):
+
+  print ""
+  print "char *%s_name[%d][%d][%d][%d] = {" % (functionnameify(filename),
+                                                len(composite_modes),
+                                                len(pixel_format)-1,
+                                                len(pixel_format)-1,
+                                                len(pixel_format)-1)
+  for mode in composite_modes:
+    print " { /* %s */" % (mode)
+    for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+      print "  { /* A = %s */" % (pixel_depth_name(A))
+      for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        print "   /* %-6s */ {" % (pixel_depth_name(B)),
+        for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+          if function_table.has_key(key):
+            print '"%s", ' % (function_table[key][0]),
+          else:
+            print '"%s", ' % (""),
+            pass
+          pass
+        print "},"
+        pass
+      print "  },"
+      pass
+    print " },"
+    pass
+
+  print "};\n"
+  
+  return
+  
+def load_function_table(filename):
+  nmx = ns.nmx(filename)
+
+  gimp_composite_function = dict()
+
+  for mode in composite_modes:
+    for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+      for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+            
+          for a in ["GIMP_PIXELFORMAT_ANY", A]:
+            for b in ["GIMP_PIXELFORMAT_ANY", B]:
+              for d in ["GIMP_PIXELFORMAT_ANY", D]:
+                key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(a), pixel_depth_name(b), pixel_depth_name(d))
+                  
+                f = nmx.exports_re(key + ".*")
+                if f != None: gimp_composite_function["%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))] =  [f]
+                pass
+              pass
+            pass
+          pass
+        pass
+      pass
+    pass
+
+  return (gimp_composite_function)
+
+
+def merge_function_tables(tables):
+  main_table = copy.deepcopy(tables[0][1])
+  
+  for t in tables[1:]:
+    print >>sys.stderr, t[0]
+    for mode in composite_modes:
+      for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+            key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+            if t[1].has_key(key):
+              print >>sys.stderr, "%s = %s::%s" % (key, t[0], t[1][key])
+              main_table[key] = t[1][key]
+              pass
+            pass
+          pass
+        pass
+      pass
+    pass
+            
+  return (main_table)
+
+
+def print_test_code(tables):
+  return
+
+
+def main(argv):
+
+  objects = map(ns.nmx, argv)
+
+  objs = objects
+  objs.reverse()
+  
+  gimp_composite_function = dict()
+  for o in objs:
+    for mode in composite_modes:
+      for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+            key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+            
+            for a in [A, "GIMP_PIXELFORMAT_ANY"]:
+              for b in [B, "GIMP_PIXELFORMAT_ANY"]:
+                for d in [D, "GIMP_PIXELFORMAT_ANY"]:
+                  composite_function = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(a), pixel_depth_name(b), pixel_depth_name(d))
+                  
+                  f = o.exports_re(composite_function + ".*")
+                  if f != None:
+                    gimp_composite_function.update({key : [f, mode, A, B, D]})
+                    break
+                  pass
+                if gimp_composite_function.has_key(key):
+                  break;
+                pass
+              if gimp_composite_function.has_key(key):
+                break;
+              pass
+
+            if not gimp_composite_function.has_key(key):
+              gimp_composite_function.update({key : ["gimp_composite_unsupported", mode, A, B, D]})
+              pass
+
+            pass
+          pass
+        pass
+      pass
+    pass
+
+
+  print "/* THIS FILE IS AUTOMATICALLY GENERATED.  DO NOT EDIT */"
+  print "$Id$"
+  print '#include "gimp-composite.h"'
+  print "extern void %s(GimpCompositeContext *);" % ("gimp_composite_unsupported")
+  done = dict()
+  for k in gimp_composite_function.keys():
+    f = gimp_composite_function[k]
+    if not done.has_key(f[0]):
+      print "extern void %s(GimpCompositeContext *);" % (f[0])
+      done.update({f[0] : None})
+      pass
+    pass
+
+  if 1:
+    print "char *gimp_composite_function_name[%d][%d][%d][%d] = {" % (len(composite_modes), len(pixel_format)-1, len(pixel_format)-1, len(pixel_format)-1)
+    for mode in composite_modes:
+      print " {"
+      for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        print "  {"
+        for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          print "    {",
+          for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+            key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+            if gimp_composite_function.has_key(key):
+              print '"%s", ' % (gimp_composite_function[key][0]),
+            else:
+              print '"%s", ' % ("gimp_composite_unsupported"),
+              pass
+            pass
+          print "},"
+          pass
+        print "  },"
+      
+        pass
+      print " },"
+      pass
+
+    print "};"
+    pass
+
+
+  print ""
+  print "void (*gimp_composite_function[%d][%d][%d][%d])() = {" % (len(composite_modes), len(pixel_format)-1, len(pixel_format)-1, len(pixel_format)-1)
+  for mode in composite_modes:
+    print " { /* %s */" % (mode)
+    for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+      print "  { /* A = %s */" % (pixel_depth_name(A))
+      for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        print "   /* %s */ {" % (pixel_depth_name(B)),
+        for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+          if gimp_composite_function.has_key(key):
+            print "%s, " % (gimp_composite_function[key][0]),
+          else:
+            print "%s, " % ("gimp_composite_unsupported"),
+            pass
+          pass
+        print "},"
+        pass
+      print "  },"
+      
+      pass
+    print " },"
+    pass
+
+  print "};"
+
+
+  print """
+static int gimp_composite_initialised = 0;
+
+void
+gimp_composite_init()
+{
+  if (!gimp_composite_initialised) {
+"""
+  for o in objects:
+    print "    %s_init();" % (functionnameify(o.filename))
+    pass
+  
+  print "    gimp_composite_initialised = 1;"
+  print "  }"
+  print "}"
+  pass
+
+def gimp_composite_regression(function_tables):
+
+  print """
+void
+gimp_composite_regression()
+{
+  GimpCompositeContext generic_ctx;
+  GimpCompositeContext special_ctx;
+"""
+
+  generic_table = function_tables[0][1]
+  
+  for mode in composite_modes:
+    for A in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+      for B in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+        for D in filter(lambda pf: pf != "GIMP_PIXELFORMAT_ANY", pixel_format):
+          for f in function_tables[1:]:
+            key = "%s_%s_%s_%s" % (string.lower(mode), pixel_depth_name(A), pixel_depth_name(B), pixel_depth_name(D))
+            if f[1].has_key(key):
+              print ""
+              print "  special_ctx.op = %s;" % (mode)
+              print "  generic_ctx.op = %s;" % (mode)
+              print "  %s(&special_ctx);" % (f[1][key][0])
+              print "  %s(&generic_ctx);" % (generic_table[key][0])
+              print '  if (gimp_composite_regression_compare(&generic_ctx, &special_ctx)) {'
+              print '    printf("%s disagrees with %s\\n");' % (f[1][key][0], generic_table[key][0])
+              print '  }'
+              pass
+            pass
+          pass
+        pass
+      pass
+    pass
+  
+  
+  print """
+}
+"""
+
+def gimp_composite_init(function_tables):
+  for o in function_tables:
+    print "extern void %s_init();" % (functionnameify(o[0]))
+    pass
+
+  print ""
+  
+  print """
+static int gimp_composite_initialised = 0;
+
+void
+gimp_composite_init()
+{
+  if (!gimp_composite_initialised) {
+"""
+  for o in function_tables:
+    print "    %s_init();" % (functionnameify(o[0]))
+    pass
+  
+  print "    gimp_composite_initialised = 1;"
+  print "  }"
+  print "}"
+  pass
+
+
+print "/* THIS FILE IS AUTOMATICALLY GENERATED.  DO NOT EDIT */"
+print "/* $Id$ */"
+print '#include "gimp-composite.h"'
+print "extern void %s(GimpCompositeContext *);" % ("gimp_composite_unsupported")
+print ""
+
+d = list()
+for f in sys.argv[1:]:
+  dd = load_function_table(f)
+  d.append((f, dd))
+  print_function_table(f, dd)
+  pass
+
+main_table = merge_function_tables(d)
+
+print_function_table("gimp_composite_function", main_table)
+print_function_table_name("gimp_composite_function", main_table)
+
+gimp_composite_init(d)
+
+#gimp_composite_regression(d)
+
+sys.exit(0)
diff --git a/app/composite/ns.py b/app/composite/ns.py
new file mode 100755
index 0000000000..502e75c566
--- /dev/null
+++ b/app/composite/ns.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+# Copyright (C) 2003  Helvetix Victorinox, a pseudonym, <helvetix@gimp.org>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+# -*- mode: python py-indent-offset: 2; -*-
+#
+# Look at object files and figure things about the namespaces they
+# require and provide.
+#
+# It is very useful when working on libraries where you really should
+# be hygienic about the namespace you occupy and not clutter it with
+# conflicting and extraneous names.
+#
+
+import os
+import re
+import sys
+import string
+import pprint
+
+pp = pprint.PrettyPrinter(indent=2)
+
+#
+# for each object file, we keep two lists: exported names and imported names.
+#
+# nm -A [files...]
+#
+class nmx:
+    def __init__(self, objfile=None):
+        self.objects = dict()
+        self.filename = None
+        
+        if objfile != None:
+            self.update(objfile)
+            pass
+
+        return (None)
+
+    def update(self, objfile):
+        self.filename = objfile
+        
+        fp = os.popen("nm -A " + objfile, "r")
+
+        for line in fp.readlines():
+            (object, type, symbol) = string.split(line)
+            object = object[:string.rfind(object, ':')]
+
+            if not self.objects.has_key(object):
+                self.objects.update({ object : dict({"exports" : dict(), "imports" : dict()})})
+                pass
+
+            if type == "U":
+                self.objects[object]["imports"].update({symbol : dict()})
+            elif type in ["C", "D", "T"]:
+                self.objects[object]["exports"].update({symbol : dict()})
+                pass
+            pass
+
+        fp.close()
+        return (None)
+
+    def exports(self, name):
+        for o in self.objects.keys():
+            if self.objects[o]["exports"].has_key(name):
+                return (1)
+            pass
+        return (0)
+
+    def exports_re(self, name):
+        regex = re.compile(name)
+
+        for o in self.objects.keys():
+            for p in self.objects[o]["exports"].keys():
+                if regex.match(p):
+                    return (p)
+                pass
+            pass
+        return (None)
+
+    pass
+
+
+def nm(nmfile):
+    objects = dict()
+
+    fp = open(nmfile, "r")
+    for line in fp.readlines():
+        (object, type, symbol) = string.split(line)
+        object = object[:string.rfind(object, ':')]
+
+        if not objects.has_key(object):
+            objects.update({ object : dict({"exports" : dict(), "imports" : dict()})})
+            pass
+
+        if type == "U":
+            objects[object]["imports"].update({symbol : dict()})
+        elif type in ["C", "D", "T"]:
+            objects[object]["exports"].update({symbol : dict()})
+        pass
+
+    fp.close()
+    return (objects)
+
+def resolve_(objects, obj):
+
+    for object in objects.keys():
+        if object != obj:
+            for imported in objects[obj]["imports"].keys():
+                if objects[object]["exports"].has_key(imported):
+                    objects[obj]["imports"][imported] = object
+                    pass
+                pass
+
+            for exported in objects[obj]["exports"].keys():
+                if objects[object]["imports"].has_key(exported):
+                    objects[obj]["exports"][exported] = object
+                    pass
+                pass
+            pass
+        pass
+
+    return
+
+def resolve(objects):
+
+    for object in objects.keys():
+        resolve_(objects, object)
+
+    return (objects)
+
+def report_unreferenced(objects):
+    for object in objects.keys():
+        for symbol in objects[object]["exports"].keys():
+            if len(objects[object]["exports"][symbol]) == 0:
+                print object + ":" + symbol, "unreferenced"
+                pass
+            pass
+        pass
+    return
+
+def report_referenced(objects):
+    for object in objects.keys():
+        for symbol in objects[object]["imports"].keys():
+            if len(objects[object]["imports"][symbol]) > 0:
+                print objects[object]["imports"][symbol] + ":" + symbol, object, "referenced"
+                pass
+            pass
+        pass
+    return
+
+def make_depend(objects):
+    for object in objects.keys():
+        for symbol in objects[object]["imports"].keys():
+            if len(objects[object]["imports"][symbol]) > 0:
+                print object + ":" + symbol, "referenced", objects[object]["imports"][symbol]
+                pass
+            pass
+        pass
+    return
+
+
+def main(argv):
+    ns = nm(argv[0])
+
+    resolve(ns)
+
+    report_referenced(ns)
+    report_unreferenced(ns)
+    pass
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/app/composite/tester.c b/app/composite/tester.c
new file mode 100644
index 0000000000..7a643b887d
--- /dev/null
+++ b/app/composite/tester.c
@@ -0,0 +1,466 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include "gimp-composite.h"
+#include "gimp-composite-util.h"
+
+#undef use_oldmmx
+
+extern void xxx_3a(rgba8_t *, rgba8_t *, rgba8_t *, u_long);
+
+main(int argc, char *argv[])
+{
+  double f;
+  GimpCompositeContext ctx;
+  GimpCompositeContext ctx_generic;
+  GimpCompositeContext ctx_va8;
+  GimpCompositeContext ctx_va8_generic;
+  int iterations;
+  rgba8_t *d1;
+  rgba8_t *d2;
+  rgba8_t *rgba8A;
+  rgba8_t *rgba8B;
+  va8_t *va8A;
+  va8_t *va8B;
+  va8_t *va8_d1;
+  va8_t *va8_d2;
+  struct timeval t0, t1, new_elapsed, old_elapsed;
+  unsigned long i;
+  unsigned long n_pixels;
+
+  iterations = atoi(argv[1]);
+  n_pixels = atol(argv[2]);
+
+  rgba8A = (rgba8_t *) calloc(sizeof(rgba8_t), n_pixels+1);
+  rgba8B = (rgba8_t *) calloc(sizeof(rgba8_t), n_pixels+1);
+  va8A = (va8_t *) calloc(sizeof(va8_t), n_pixels+1);
+  va8B = (va8_t *) calloc(sizeof(va8_t), n_pixels+1);
+  d1 = (rgba8_t *) calloc(sizeof(rgba8_t), n_pixels+1);
+  d2 = (rgba8_t *) calloc(sizeof(rgba8_t), n_pixels+1);
+  va8_d1 = (va8_t *) calloc(sizeof(va8_t), n_pixels+1);
+  va8_d2 = (va8_t *) calloc(sizeof(va8_t), n_pixels+1);
+
+  srand(314159);
+
+  for (i = 0; i < n_pixels; i++) {
+#if 0
+    rgba8A[i].r = rand() % 256;
+    rgba8A[i].g = rand() % 256;
+    rgba8A[i].b = rand() % 256;
+    rgba8A[i].a = rand() % 256;
+
+    rgba8B[i].r = rand() % 256;
+    rgba8B[i].g = rand() % 256;
+    rgba8B[i].b = rand() % 256;
+    rgba8B[i].a = rand() % 256;
+#else
+    rgba8A[i].r = 255-i;
+    rgba8A[i].g = 255-i;
+    rgba8A[i].b = 255-i;
+    rgba8A[i].a = 255-i;
+
+    rgba8B[i].r = i;
+    rgba8B[i].g = i;
+    rgba8B[i].b = i;
+    rgba8B[i].a = i;
+
+    va8A[i].v = i;
+    va8A[i].a = 255-i;
+    va8B[i].v = i;
+    va8B[i].a = i;
+#endif
+  }
+
+  gimp_composite_init();
+
+#define do_add
+#define do_darken
+#define do_difference
+#define do_lighten
+#define do_multiply
+#define do_subtract
+#define do_screen
+#define do_grainextract
+#define do_grainmerge
+#define do_divide
+#define do_dodge
+#define do_swap
+#define do_scale
+#define do_burn
+
+  ctx.A = (unsigned char *) rgba8A;
+  ctx.pixelformat_A = GIMP_PIXELFORMAT_RGBA8;
+  ctx.B = (unsigned char *) rgba8B;
+  ctx.pixelformat_B = GIMP_PIXELFORMAT_RGBA8;
+  ctx.D = (unsigned char *) d2;
+  ctx.pixelformat_D = GIMP_PIXELFORMAT_RGBA8;
+  ctx.M = NULL;
+  ctx.pixelformat_M = GIMP_PIXELFORMAT_ANY;
+  ctx.n_pixels = n_pixels;
+  ctx.scale.scale = 2;
+
+  ctx_generic.A = (unsigned char *) rgba8A;
+  ctx_generic.pixelformat_A = GIMP_PIXELFORMAT_RGBA8;
+  ctx_generic.B = (unsigned char *) rgba8B;
+  ctx_generic.pixelformat_B = GIMP_PIXELFORMAT_RGBA8;
+  ctx_generic.D = (unsigned char *) d1;
+  ctx_generic.pixelformat_D = GIMP_PIXELFORMAT_RGBA8;
+  ctx_generic.M = NULL;
+  ctx_generic.pixelformat_M = GIMP_PIXELFORMAT_ANY;
+  ctx_generic.n_pixels = n_pixels;
+  ctx_generic.scale.scale = 2;
+
+
+  ctx_va8.A = (unsigned char *) va8A;
+  ctx_va8.pixelformat_A = GIMP_PIXELFORMAT_VA8;
+  ctx_va8.B = (unsigned char *) va8B;
+  ctx_va8.pixelformat_B = GIMP_PIXELFORMAT_VA8;
+  ctx_va8.D = (unsigned char *) va8_d2;
+  ctx_va8.pixelformat_D = GIMP_PIXELFORMAT_VA8;
+  ctx_va8.M = NULL;
+  ctx_va8.pixelformat_M = GIMP_PIXELFORMAT_ANY;
+  ctx_va8.n_pixels = n_pixels;
+  ctx_va8.scale.scale = 2;
+
+  ctx_va8_generic.A = (unsigned char *) va8A;
+  ctx_va8_generic.pixelformat_A = GIMP_PIXELFORMAT_VA8;
+  ctx_va8_generic.B = (unsigned char *) va8B;
+  ctx_va8_generic.pixelformat_B = GIMP_PIXELFORMAT_VA8;
+  ctx_va8_generic.D = (unsigned char *) va8_d1;
+  ctx_va8_generic.pixelformat_D = GIMP_PIXELFORMAT_VA8;
+  ctx_va8_generic.M = NULL;
+  ctx_va8_generic.pixelformat_M = GIMP_PIXELFORMAT_ANY;
+  ctx_va8_generic.n_pixels = n_pixels;
+  ctx_va8_generic.scale.scale = 2;
+
+
+#define timer_fsecs(tv) ((double) ((tv).tv_sec) + (double) ((tv).tv_usec / 1000000.0))
+#define timer_report(name,t1,t2) printf("%15s %15.10f %15.10f %15.10f\n", name, timer_fsecs(t1), timer_fsecs(t2), timer_fsecs(t1)/timer_fsecs(t2));
+
+#ifdef do_burn
+  /* burn */
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_BURN;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_burn_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("burn rgba8", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("burn rgba8", old_elapsed, new_elapsed);
+
+  gettimeofday(&t0, NULL);
+  ctx_va8.op = GIMP_COMPOSITE_BURN;
+  ctx_va8_generic.op = GIMP_COMPOSITE_BURN;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx_va8); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_burn_any_any_any_generic(&ctx_va8_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_va8("burn rgba8", ctx_va8.A, ctx_va8.B, ctx_va8_generic.D, ctx_va8.D, ctx_va8.n_pixels);
+  timer_report("burn va8", old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_dodge
+  /* dodge */
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_DODGE;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_dodge_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("dodge", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("dodge", old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_divide
+  /* divide */
+  ctx.op = GIMP_COMPOSITE_DIVIDE;
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_divide_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("divide", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("divide",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_grainextract
+  /* grainextract */
+  ctx.op = GIMP_COMPOSITE_GRAIN_EXTRACT;
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_grain_extract_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("grain extract", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("grainextract",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_grainmerge
+  ctx.op = GIMP_COMPOSITE_GRAIN_MERGE;
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_grain_merge_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("grain merge", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("grainmerge",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_scale
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_SCALE;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_scale_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("scale", ctx.A, NULL, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("scale", old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_screen
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_SCREEN;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_screen_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("screen", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("screen",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_lighten
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_LIGHTEN;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_lighten_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("lighten", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("lighten",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_darken
+  /* darken */
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_DARKEN;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_darken_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("darken", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("darken",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_difference
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_DIFFERENCE;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_difference_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("difference", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("difference",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_multiply
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_MULTIPLY;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_multiply_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("multiply", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("multiply",  old_elapsed, new_elapsed);
+#endif
+  
+#ifdef do_subtract
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_SUBTRACT;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_subtract_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("subtract", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("subtract",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_add
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_ADDITION;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_addition_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("addition", ctx.A, ctx.B, ctx_generic.D, ctx.D, ctx.n_pixels);
+  timer_report("add",  old_elapsed, new_elapsed);
+#endif
+
+#ifdef do_swap
+  gettimeofday(&t0, NULL);
+  ctx.op = GIMP_COMPOSITE_SWAP;
+  for (i = 0; i < iterations; i++) { gimp_composite_dispatch(&ctx); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &new_elapsed);
+  gettimeofday(&t0, NULL);
+  for (i = 0; i < iterations; i++) { gimp_composite_swap_any_any_any_generic(&ctx_generic); }
+  gettimeofday(&t1, NULL);
+  timersub(&t1, &t0, &old_elapsed);
+  comp_rgba8("swap", ctx.A, ctx.B, ctx_generic.A, ctx.A, ctx.n_pixels);
+  comp_rgba8("swap", ctx.A, ctx.B, ctx_generic.B, ctx.B, ctx.n_pixels);
+  timer_report("swap",  old_elapsed, new_elapsed);
+#endif
+
+  return (0);
+}
+
+/* Print one RGBA8 pixel as "#rrggbb,AA" (alpha deliberately upper-case
+ * so it stands out) and flush so output interleaves correctly with any
+ * surrounding diagnostics. */
+void
+print_rgba8(rgba8_t *p)
+{
+  printf("#%02x%02x%02x,%02X", p->r, p->g, p->b, p->a);
+  fflush(stdout);
+}
+
+/* Print one value/alpha (VA8) pixel as "#vv,AA" (alpha upper-case to
+ * match print_rgba8) and flush stdout. */
+void
+print_va8(va8_t *va8)
+{
+  printf("#%02x,%02X", va8->v, va8->a);
+  fflush(stdout);
+}
+
+/*
+ * Compare two RGBA8 pixel buffers ("expected" vs "got") element by element.
+ *
+ * str     - label printed on each mismatch line (test name)
+ * rgba8A  - operand A, shown for context on mismatches
+ * rgba8B  - operand B, shown for context; may be NULL for unary ops
+ * expected- reference result buffer
+ * got     - result buffer under test
+ * length  - number of pixels to compare
+ *
+ * Prints a diagnostic line per mismatching pixel and stops reporting once
+ * the failure count exceeds 5 (so at most 6 lines are printed).
+ *
+ * Returns the number of mismatching pixels found (capped by the early exit).
+ */
+int
+comp_rgba8(char *str, rgba8_t *rgba8A, rgba8_t *rgba8B, rgba8_t *expected, rgba8_t *got, u_long length)
+{
+  u_long i;                     /* u_long: matches length, avoids signed/unsigned compare */
+  int failed;
+  int fail_count;
+
+  fail_count = 0;
+
+  for (i = 0; i < length; i++) {
+    failed = 0;
+
+    if (expected[i].r != got[i].r) { failed = 1; }
+    if (expected[i].g != got[i].g) { failed = 1; }
+    if (expected[i].b != got[i].b) { failed = 1; }
+    if (expected[i].a != got[i].a) { failed = 1; }
+    if (failed) {
+      fail_count++;
+      printf("%s %8lu A=", str, i); print_rgba8(&rgba8A[i]);
+      if (rgba8B != (rgba8_t *) 0) {
+        printf(" B="); print_rgba8(&rgba8B[i]);
+      }
+      printf("   ");
+      printf("exp=");
+      print_rgba8(&expected[i]);
+      printf(" got=");
+      print_rgba8(&got[i]);
+      printf("\n");
+    }
+    if (fail_count > 5)
+      break;
+  }
+
+  return (fail_count);
+}
+
+/*
+ * Compare two VA8 (value/alpha) pixel buffers element by element.
+ *
+ * str     - label printed on each mismatch line (test name)
+ * va8A    - operand A, shown for context on mismatches
+ * va8B    - operand B, shown for context; may be NULL for unary ops
+ * expected- reference result buffer
+ * got     - result buffer under test
+ * length  - number of pixels to compare
+ *
+ * Prints a diagnostic line per mismatching pixel and stops reporting once
+ * the failure count exceeds 5 (so at most 6 lines are printed).
+ *
+ * Returns the number of mismatching pixels found (capped by the early exit).
+ */
+int
+comp_va8(char *str, va8_t *va8A, va8_t *va8B, va8_t *expected, va8_t *got, u_long length)
+{
+  u_long i;                     /* u_long: matches length, avoids signed/unsigned compare */
+  int failed;
+  int fail_count;
+
+  fail_count = 0;
+
+  for (i = 0; i < length; i++) {
+    failed = 0;
+
+    if (expected[i].v != got[i].v) { failed = 1; }
+    if (expected[i].a != got[i].a) { failed = 1; }
+    if (failed) {
+      fail_count++;
+      printf("%s %8lu A=", str, i); print_va8(&va8A[i]);
+      if (va8B != (va8_t *) 0) { printf(" B="); print_va8(&va8B[i]); }
+      printf("   ");
+      printf("exp=");
+      print_va8(&expected[i]);
+      printf(" got=");
+      print_va8(&got[i]);
+      printf("\n");
+    }
+    if (fail_count > 5)
+      break;
+  }
+
+  return (fail_count);
+}
+
+
+/* Dump an entire RGBA8 buffer, one indexed pixel per line, preceded by a
+ * caller-supplied label.  Debugging aid only; returns nothing. */
+void
+dump_rgba8(char *str, rgba8_t *rgba, u_long length)
+{
+  u_long i;                     /* u_long: matches length, avoids signed/unsigned compare */
+
+  printf("%s\n", str);
+
+  for (i = 0; i < length; i++) {
+    printf("%5lu: ", i);
+    print_rgba8(&rgba[i]);
+    printf("\n");
+  }
+}
+
+/* Dump three parallel RGBA8 buffers (typically A, B and D of a composite
+ * operation) side by side, one indexed line per pixel.  Debugging aid. */
+void
+xxx_3a(rgba8_t *a, rgba8_t *b, rgba8_t *c, u_long length)
+{
+  u_long i;                     /* u_long: matches length, avoids signed/unsigned compare */
+
+  for (i = 0; i < length; i++) {
+    printf("%5lu: ", i);
+    print_rgba8(&a[i]);
+    printf(" ");
+    print_rgba8(&b[i]);
+    printf(" ");
+    print_rgba8(&c[i]);
+    printf("\n");
+  }
+}