mirror of https://github.com/silx-kit/pyFAI.git
Merge remote branch 'giannis/master'
Conflicts:
	doc/source/pyFAI.rst
	openCL/ocl_azim_CSR.cl
	pyFAI-src/ocl_azim_csr_dis.py
	src/morphology.c
commit 5c914a008e
@@ -6,6 +6,7 @@ from __future__ import print_function, division
import json, sys, time, timeit, os, platform, subprocess
import numpy
from numpy import log2
import fabio
import os.path as op
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
@@ -58,6 +59,8 @@ class Bench(object):
        self.repeat = repeat
        self.nbr = nbr
        self.results = {}
        self.flops = {}
        self.mem_band = {}
        self.meth = []
        self._cpu = None
        self.fig = None
@@ -172,8 +175,10 @@ data = fabio.open(r"%s").data
        print("Working on processor: %s" % self.get_cpu())
        label = "1D_" + self.LABELS[method]
        results = {}
        flops = {}
        mem_band = {}
        first = True
        param = "Frelon2k.poni"
        param = "Pilatus1M.poni"
        block_size_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
        for block_size in block_size_list:
            self.update_mp()
@@ -190,6 +195,14 @@ data = fabio.open(r"%s").data
            if check:
                if "csr" in method:
                    print("csr: size= %s \t nbytes %.3f MB " % (ai._csr_integrator.data.size, ai._csr_integrator.lut_nbytes / 2 ** 20))

            bins = ai._csr_integrator.bins
            nnz = ai._csr_integrator.nnz
            parallel_reduction = sum([2 ** i for i in range(1, int(log2(block_size)))])

            FLOPs = 9 * nnz + 11 * parallel_reduction + 1 * bins
            mem_access = (2 * block_size * bins + 5 * nnz + 7 * bins) * 4

            del ai, data
            self.update_mp()
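For reference, this cost model turns the best measured kernel time into throughput figures. A minimal sketch of the conversion, with hypothetical values standing in for ai._csr_integrator.nnz/.bins and the benchmark's block_size:

    from numpy import log2

    nnz, bins, block_size = 1000000, 1000, 32
    tmin = 5e-3  # best execution time in seconds

    # work done by the binary-tree reduction inside one workgroup
    parallel_reduction = sum(2 ** i for i in range(1, int(log2(block_size))))

    FLOPs = 9 * nnz + 11 * parallel_reduction + 1 * bins
    mem_access = (2 * block_size * bins + 5 * nnz + 7 * bins) * 4  # bytes

    print("%.1f MFLOPS, %.1f MB/s" % (FLOPs / tmin * 1e-6, mem_access / tmin * 1e-6))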
@@ -216,9 +229,13 @@ data = fabio.open(r"%s").data
            self.update_mp()
            if R < self.LIMIT:
                results[block_size] = tmin
                flops[block_size] = (FLOPs / tmin) * 1e-6
                mem_band[block_size] = (mem_access / tmin) * 1e-6
                self.update_mp()
            else:
                results[block_size] = tmin
                flops[block_size] = FLOPs / tmin
                mem_band[block_size] = mem_access / tmin
            if first:
                self.new_curve(results, label)
                first = False
@@ -227,6 +244,8 @@ data = fabio.open(r"%s").data
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.flops[label] = flops
        self.mem_band[label] = mem_band
        self.update_mp()
@@ -234,12 +253,12 @@ data = fabio.open(r"%s").data
        self.update_mp()
        json.dump(self.results, open(filename, "w"))

    def print_res(self):
    def print_res(self, summary, results):
        self.update_mp()
        print("Summary: execution time in milliseconds")
        print(summary)
        print("Size/Meth\t" + "\t".join(self.meth))
        for i in self.size:
            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))
            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (results[j].get(i, 0)) for j in self.meth))

    def init_curve(self):
        self.update_mp()
@@ -394,10 +413,16 @@ if __name__ == "__main__":
        bench.bench_1d_ocl_csr(True, {"devicetype": "ACC"})

    bench.save()
    bench.print_res()
    results = bench.results
    flops = bench.flops
    mem_band = bench.mem_band

    bench.print_res("Summary: Execution time in milliseconds", results)
    bench.print_res("Summary: MFLOPS", flops)
    bench.print_res("Summary: Memory Bandwidth in MB/s", mem_band)
    bench.update_mp()

    bench.ax.set_ylim(1, 200)
    # plt.show()
    plt.ion()
    raw_input("Enter to quit")
    # raw_input("Enter to quit")
@@ -0,0 +1,752 @@
#!/usr/bin/python
import json, sys, time, timeit, os, platform, subprocess
import logging  # used by the --debug option below
import numpy
import fabio
import os.path as op
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
import utilstest
pyFAI = utilstest.UtilsTest.pyFAI
ocl = pyFAI.opencl.ocl
from matplotlib import pyplot as plt
plt.ion()

ds_list = ["Pilatus1M.poni", "halfccd.poni", "Frelon2k.poni", "Pilatus6M.poni", "Mar3450.poni", "Fairchild.poni"]
datasets = {"Fairchild.poni": utilstest.UtilsTest.getimage("1880/Fairchild.edf"),
            "halfccd.poni": utilstest.UtilsTest.getimage("1882/halfccd.edf"),
            "Frelon2k.poni": utilstest.UtilsTest.getimage("1881/Frelon2k.edf"),
            "Pilatus6M.poni": utilstest.UtilsTest.getimage("1884/Pilatus6M.cbf"),
            "Pilatus1M.poni": utilstest.UtilsTest.getimage("1883/Pilatus1M.edf"),
            "Mar3450.poni": utilstest.UtilsTest.getimage("2201/LaB6_260210.mar3450"),
            }
b = None
class Bench(object):
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    def __init__(self, nbr=10, memprofile=False):
        self.reference_1d = {}
        self.LIMIT = 8
        self.repeat = 1
        self.nbr = nbr
        self.results = {}
        self.meth = []
        self._cpu = None
        self.fig = None
        self.ax = None
        self.starttime = time.time()
        self.plot = None
        self.plot_x = []
        self.plot_y = []
        self.do_memprofile = memprofile
        self.fig_mp = None
        self.ax_mp = None
        self.plot_mp = None
        self.memory_profile = ([], [])
    def get_cpu(self):
        if self._cpu is None:
            if os.name == "nt":
                self._cpu = platform.processor()
            elif os.path.exists("/proc/cpuinfo"):
                self._cpu = [i.split(": ", 1)[1] for i in open("/proc/cpuinfo") if i.startswith("model name")][0].strip()
            elif os.path.exists("/usr/sbin/sysctl"):
                proc = subprocess.Popen(["sysctl", "-n", "machdep.cpu.brand_string"], stdout=subprocess.PIPE)
                proc.wait()
                self._cpu = proc.stdout.read().strip()
            # collapse any run of spaces in the CPU name to a single space
            old = self._cpu
            self._cpu = old.replace("  ", " ")
            while old != self._cpu:
                old = self._cpu
                self._cpu = old.replace("  ", " ")
        return self._cpu
    def get_gpu(self, devicetype="gpu", useFp64=False, platformid=None, deviceid=None):
        if ocl is None:
            return "NoGPU"
        ctx = ocl.create_context(devicetype, useFp64, platformid, deviceid)
        return ctx.devices[0].name
    def get_mem(self):
        """
        Returns the occupied memory for memory-leak hunting in MByte
        """
        mem = 0
        pid = os.getpid()
        if os.path.exists("/proc/%i/status" % pid):
            for l in open("/proc/%i/status" % pid):
                if l.startswith("VmRSS"):
                    mem = int(l.split(":", 1)[1].split()[0]) / 1024.
        return mem
    def print_init(self, t):
        print(" * Initialization time: %.1f ms" % (1000.0 * t))
        self.update_mp()

    def print_exec(self, t):
        print(" * Execution time rep : %.1f ms" % (1000.0 * t))
        self.update_mp()

    def print_sep(self):
        print("*" * 80)
        self.update_mp()
    def get_ref(self, param):
        if param not in self.reference_1d:
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            N = min(data.shape)
            res = ai.xrpd(data, N)
            self.reference_1d[param] = res
            del ai, data
        return self.reference_1d[param]
    def bench_cpu1d(self):
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        results = {}
        label = "1D_CPU_serial_full_split"
        first = True
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            res = ai.integrate1d(data, 1000, method="splitpixelfull", unit="2th_deg", correctSolidAngle=False)
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd(data,N)""" % (param, fn)
            t = timeit.Timer("ai.integrate1d(data, 1000, method='splitpixelfull', unit='2th_deg', correctSolidAngle=False)", setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            self.print_exec(tmin)
            size /= 1e6
            tmin *= 1000.0
            results[size] = tmin
            if first:
                self.new_curve(results, label)
                first = False
            else:
                self.new_point(size, tmin)
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu1d_lut(self):
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        label = "1D_CPU_parallel_OpenMP"
        results = {}
        self.new_curve(results, label)
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            res = ai.xrpd_LUT(data, N)
            t1 = time.time()
            self.print_init(t1 - t0)
            print("lut.shape=", ai._lut_integrator.lut.shape, "lut.nbytes (MB)", ai._lut_integrator.size * 8 / 1e6)
            self.update_mp()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_LUT(data,N)""" % (param, fn)
            t = timeit.Timer("ai.xrpd_LUT(data,N,safe=False)", setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else "%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
            self.update_mp()
            if R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu1d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
        self.update_mp()
        if ocl is None:
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
        label = "1D_%s_parallel_OpenCL" % devicetype
        first = True
        results = {}
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                res = ai.xrpd_LUT_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid)
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_LUT_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s)""" % (param, fn, devicetype, platformid, deviceid)
            t = timeit.Timer("ai.xrpd_LUT_OCL(data,N,safe=False)", setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else "%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
            if R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu1d_csr_ocl(self, devicetype="GPU", platformid=None, deviceid=None, padded=False, block_size=32):
        self.update_mp()
        if ocl is None:
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s padding: %s block_size= %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid], padded, block_size))
        label = "1D_%s_parallel_OpenCL, padded=%s, block_size=%s" % (devicetype, padded, block_size)
        first = True
        results = {}
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                res = ai.xrpd_CSR_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid, padded=padded, block_size=block_size)
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_CSR_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s,padded=%s,block_size=%s)""" % (param, fn, devicetype, platformid, deviceid, padded, block_size)
            t = timeit.Timer("ai.xrpd_CSR_OCL(data,N,safe=False,padded=%s,block_size=%s)" % (padded, block_size), setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else "%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
            if R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu2d(self):
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        results = {}
        label = "2D_CPU_serial"
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            _ = ai.xrpd2(data, N[0], N[1])
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.xrpd2(data,%s,%s)""" % (param, fn, N[0], N[1])
            t = timeit.Timer("ai.xrpd2(data,%s,%s)" % N, setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            if 1:  # R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu2d_lut(self):
        print("Working on processor: %s" % self.get_cpu())
        label = "2D_CPU_parallel_OpenMP"
        first = True
        results = {}
        for param in ds_list:
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            _ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut")
            t1 = time.time()
            self.print_init(t1 - t0)
            print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut")""" % (param, fn, N[0], N[1])
            t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut')" % N, setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            if 1:  # R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_cpu2d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
        self.update_mp()
        if ocl is None:
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
        results = {}
        label = "2D_%s_parallel_OpenCL" % devicetype.upper()
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                _ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut_ocl_%i,%i" % (platformid, deviceid))
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut_ocl_%i,%i")""" % (param, fn, N[0], N[1], platformid, deviceid)
            t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut_ocl')" % N, setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            if 1:  # R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def bench_gpu1d(self, devicetype="gpu", useFp64=True, platformid=None, deviceid=None):
        self.update_mp()
        print("Working on %s, in " % devicetype + ("64 bits mode" if useFp64 else "32 bits mode") + "(%s.%s)" % (platformid, deviceid))
        if ocl is None or not ocl.select_device(devicetype):
            print("No pyopencl or no such device: skipping benchmark")
            return
        results = {}
        label = "Forward_OpenCL_%s_%s_bits" % (devicetype, ("64" if useFp64 else "32"))
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins (%s)" % (op.basename(fn), size / 1e6, N, ("64 bits mode" if useFp64 else "32 bits mode")))

            try:
                t0 = time.time()
                res = ai.xrpd_OpenCL(data, N, devicetype=devicetype, useFp64=useFp64, platformid=platformid, deviceid=deviceid)
                t1 = time.time()
            except Exception as error:
                print("Failed to find an OpenCL GPU (useFp64:%s) %s" % (useFp64, error))
                continue
            self.print_init(t1 - t0)
            self.update_mp()
            ref = ai.xrpd(data, N)
            R = utilstest.Rwp(res, ref)
            print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else "%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
            setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_OpenCL(data,N, devicetype=r"%s", useFp64=%s, platformid=%s, deviceid=%s)""" % (param, fn, devicetype, useFp64, platformid, deviceid)
            t = timeit.Timer("ai.xrpd_OpenCL(data,N,safe=False)", setup)
            tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            if R < self.LIMIT:
                size /= 1e6
                tmin *= 1000.0
                results[size] = tmin
                if first:
                    self.new_curve(results, label)
                    first = False
                else:
                    self.new_point(size, tmin)
                self.update_mp()
            self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()
    def save(self, filename="benchmark.json"):
        self.update_mp()
        json.dump(self.results, open(filename, "w"))

    def print_res(self):
        self.update_mp()
        print("Summary: execution time in milliseconds")
        print("Size/Meth\t" + "\t".join(self.meth))
        for i in self.size:
            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))
    def init_curve(self):
        self.update_mp()
        if self.fig:
            print("Already initialized")
            return
        if "DISPLAY" in os.environ:
            plt.ion()
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)
            self.ax.set_autoscale_on(False)
            self.ax.set_xlabel("Image size in Mega-Pixels")
            self.ax.set_ylabel("Frames processed per second")
            self.ax.set_yscale("log", basey=2)
            t = [1, 2, 5, 10, 20, 50, 100, 200, 400, 500]
            self.ax.set_yticks([float(i) for i in t])
            self.ax.set_yticklabels([str(i) for i in t])
            self.ax.set_xlim(0.5, 20)
            self.ax.set_ylim(0.5, 500)
            self.ax.set_title(self.get_cpu() + " / " + self.get_gpu())

            if self.fig.canvas:
                self.fig.canvas.draw()
#            plt.show()
    def new_curve(self, results, label):
        self.update_mp()
        if not self.fig:
            return
        self.plot_x = list(results.keys())
        self.plot_x.sort()
        self.plot_y = [1000.0 / results[i] for i in self.plot_x]
        self.plot = self.ax.plot(self.plot_x, self.plot_y, "o-", label=label)[0]
        self.ax.legend()
        if self.fig.canvas:
            self.fig.canvas.draw()
    def new_point(self, size, exec_time):
        """
        Add a new point to the current curve

        @param size: image size in Mpixel
        @param exec_time: execution time in ms
        """
        self.update_mp()
        if not self.plot:
            return

        self.plot_x.append(size)
        self.plot_y.append(1000.0 / exec_time)
        self.plot.set_data(self.plot_x, self.plot_y)
        if self.fig.canvas:
            self.fig.canvas.draw()
    def display_all(self):
        if not self.fig:
            return
        for k in self.meth:
            self.new_curve(self.results[k], k)
        self.ax.legend()
        self.fig.savefig("benchmark.png")
        self.fig.show()
#        plt.ion()
    def update_mp(self):
        if not self.do_memprofile:
            return
        self.memory_profile[0].append(time.time() - self.starttime)
        self.memory_profile[1].append(self.get_mem())
        if not self.fig_mp:
            self.fig_mp = plt.figure()
            self.ax_mp = self.fig_mp.add_subplot(1, 1, 1)
            self.ax_mp.set_autoscale_on(False)
            self.ax_mp.set_xlabel("Run time (s)")
            self.ax_mp.set_xlim(0, 100)
            self.ax_mp.set_ylim(0, 2 ** 10)
            self.ax_mp.set_ylabel("Memory occupancy (MB)")
            self.ax_mp.set_title("Memory leak hunter")
            self.plot_mp = self.ax_mp.plot(*self.memory_profile)[0]
        else:
            self.plot_mp.set_data(*self.memory_profile)
            tmax = self.memory_profile[0][-1]
            mmax = max(self.memory_profile[1])
            if tmax > self.ax_mp.get_xlim()[-1]:
                self.ax_mp.set_xlim(0, tmax)
            if mmax > self.ax_mp.get_ylim()[-1]:
                self.ax_mp.set_ylim(0, mmax)

        if self.fig_mp.canvas:
            self.fig_mp.canvas.draw()
    def get_size(self):
        if len(self.meth) == 0:
            return []
        size = list(self.results[self.meth[0]].keys())
        for i in self.meth[1:]:
            s = list(self.results[i].keys())
            if len(s) > len(size):
                size = s
        size.sort()
        return size
    size = property(get_size)
if __name__ == "__main__":
    try:
        from argparse import ArgumentParser
    except ImportError:
        from pyFAI.argparse import ArgumentParser
    description = """Benchmark for Azimuthal integration
    """
    epilog = """ """
    usage = """benchmark [options] """
    version = "pyFAI benchmark version " + pyFAI.version
    parser = ArgumentParser(usage=usage, description=description, epilog=epilog)
    parser.add_argument("-v", action='version', version=version)
    parser.add_argument("-d", "--debug",
                        action="store_true", dest="debug", default=False,
                        help="switch to verbose/debug mode")
    parser.add_argument("-c", "--cpu",
                        action="store_true", dest="opencl_cpu", default=False,
                        help="perform benchmark using OpenCL on the CPU")
    parser.add_argument("-g", "--gpu",
                        action="store_true", dest="opencl_gpu", default=False,
                        help="perform benchmark using OpenCL on the GPU")
    parser.add_argument("-a", "--acc",
                        action="store_true", dest="opencl_acc", default=False,
                        help="perform benchmark using OpenCL on the Accelerator (like XeonPhi/MIC)")
    parser.add_argument("-s", "--small",
                        action="store_true", dest="small", default=False,
                        help="Limit the size of the dataset to 6 Mpixel images (for computers with limited memory)")
    parser.add_argument("-n", "--number",
                        dest="number", default=10, type=int,
                        help="Number of repetitions of the test, by default 10")
    parser.add_argument("-2d", "--2dimentions",
                        action="store_true", dest="twodim", default=False,
                        help="Benchmark also algorithms for 2D-regrouping")
    parser.add_argument("-m", "--memprof",
                        action="store_true", dest="memprof", default=False,
                        help="Perform memory profiling (Linux only)")
    parser.add_argument("-f", "--fullsplit",
                        action="store_true", dest="split_cpu", default=False,
                        help="perform benchmark using full pixel splitting on CPU")

    options = parser.parse_args()
    if options.small:
        ds_list = ds_list[:4]
    if options.debug:
        pyFAI.logger.setLevel(logging.DEBUG)
    print("Averaging over %i repetitions (best of 3)." % options.number)
    b = Bench(options.number, options.memprof)
    b.init_curve()
    b.bench_cpu1d()
    b.bench_cpu1d_lut()
    if options.opencl_cpu:
        b.bench_cpu1d_lut_ocl("CPU")
    if options.opencl_gpu:
        b.bench_cpu1d_lut_ocl("GPU")
    if options.opencl_acc:
        b.bench_cpu1d_lut_ocl("ACC")
    if options.split_cpu:
        b.bench_cpu1d()

#    b.bench_cpu1d_ocl_lut("CPU")
#    b.bench_gpu1d("gpu", True)
#    b.bench_gpu1d("gpu", False)
#    b.bench_gpu1d("cpu", True)
#    b.bench_gpu1d("cpu", False)
    if options.twodim:
        b.bench_cpu2d()
        b.bench_cpu2d_lut()
        if options.opencl_cpu:
            b.bench_cpu2d_lut_ocl("CPU")
        if options.opencl_gpu:
            b.bench_cpu2d_lut_ocl("GPU")
        if options.opencl_acc:
            b.bench_cpu2d_lut_ocl("ACC")

#    b.bench_cpu2d_lut()
#    b.bench_cpu2d_lut_ocl()
    b.save()
    b.print_res()
#    b.display_all()
    b.update_mp()

    b.ax.set_ylim(1, 200)
    # plt.show()
    plt.ion()
    raw_input("Enter to quit")
@@ -140,12 +140,19 @@ PyFAI solves this problem by pixel
splitting: in addition to the pixel position, its
spatial extension is calculated and each pixel is then split and
distributed over the corresponding bins, the intensity being considered
as homogeneous within a pixel and spread accordingly.
The drawback of this is the correlation introduced between two adjacent bins.
To simplify calculations, this was initially done by abstracting the pixel shape
with a bounding box that circumscribes the pixel. In an effort to improve
the quality of the results, this method was dropped in favour of a full
pixel splitting scheme that uses the actual pixel geometry
for its calculations.
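A minimal numpy sketch of the bounding-box variant of this idea (a hypothetical
helper, not pyFAI's actual implementation)::

    import numpy

    def split_bbox_1d(pos, dpos, intensity, bins, pos_min, pos_max):
        """Spread each pixel over the bins overlapped by its extent."""
        out = numpy.zeros(bins)
        delta = (pos_max - pos_min) / bins
        lo = (pos - dpos / 2.0 - pos_min) / delta
        hi = (pos + dpos / 2.0 - pos_min) / delta
        for l, h, I in zip(lo, hi, intensity):
            for b in range(max(int(l), 0), min(int(h) + 1, bins)):
                # fraction of the pixel extent falling into bin b
                out[b] += I * (min(h, b + 1) - max(l, b)) / (h - l)
        return out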
@@ -194,6 +201,7 @@ but can still be too large to fit on an entry-level graphics card.
By making this change we switched from a “linear read / random write” forward algorithm
to a “random read / linear write” backward algorithm which is more suitable for parallelization.
As a further improvement of the algorithm, the compressed sparse row (CSR) format was
introduced to store the LUT data. This reduces its size even more, offering the
opportunity to work with larger images on the same hardware when memory is a concern,
and makes the code better suited to GPUs and accelerators, as transferring
data to the device is one of the main bottlenecks of such computations.
This algorithm was implemented both in [Cython]_-OpenMP and OpenCL.
The CSR approach has a double benefit:
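To make the storage scheme concrete, here is a small self-contained sketch
(plain numpy, not pyFAI's internal classes) of the three CSR arrays and of the
backward "gather" that is performed per bin::

    import numpy

    # Toy LUT: bin 0 gathers pixels 0 and 3; bin 1 gathers pixels 1, 2 and 3.
    coefs = numpy.array([1.0, 0.4, 1.0, 1.0, 0.6])  # pixel fractions
    row_ind = numpy.array([0, 3, 1, 2, 3])          # pixel indices
    col_ptr = numpy.array([0, 2, 5])                # bin b owns [col_ptr[b], col_ptr[b+1])

    image = numpy.array([10.0, 20.0, 30.0, 40.0])
    bins = len(col_ptr) - 1
    out_data = numpy.zeros(bins)
    out_count = numpy.zeros(bins)
    for b in range(bins):  # on the GPU, one workgroup per bin
        sl = slice(col_ptr[b], col_ptr[b + 1])
        out_data[b] = numpy.dot(coefs[sl], image[row_ind[sl]])
        out_count[b] = coefs[sl].sum()
    merged = out_data / out_count  # the diffractogram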
@@ -206,6 +214,14 @@ This makes it very well suited to run on GPUs and accelerators
where hundreds to thousands of simultaneous threads are available.

When using OpenCL for the GPU we used a compensated (or Kahan_summation_) algorithm to reduce
the error accumulation in the histogram summation (at the cost of more operations to be done).
This allows accurate results to be obtained on cheap hardware that performs calculations
in single precision floating-point arithmetic (32 bits) which are available on consumer
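For reference, a scalar Python transcription of the compensated update used in
the OpenCL kernels (the same four-line pattern appears twice per loop body in
csr_integrate_padded below, once per accumulator)::

    def kahan_sum(values):
        total = 0.0  # running sum
        c = 0.0      # running compensation (lost low-order bits)
        for v in values:
            y = v - c            # re-inject the error from the previous step
            t = total + y        # low-order bits of y may be lost here...
            c = (t - total) - y  # ...and are recovered into c
            total = t
        return total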
@@ -214,12 +230,17 @@ Double precision operations are currently limited to high price and performance
The additional cost of Kahan summation, 4x more arithmetic operations, is hidden by smaller data types,
the higher number of single precision units, and the fact that the GPU is usually limited by the memory bandwidth anyway.

.. _Kahan_summation: http://en.wikipedia.org/wiki/Kahan_summation_algorithm

The performances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s
on a recent multi-core computer with a mid-range graphics card.
On a multi-socket server featuring high-end GPUs like Tesla cards, the performances are similar, with
the additional capability to work on multiple detectors simultaneously.

.. figure:: img/benchmark.png
   :align: center
@@ -174,25 +174,21 @@ corrections( __global float *image,


/**
 * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT
 * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
 *
 * An image intensity value is spread across the bins according to the positions stored in the LUT.
 * The LUT is a 2D-array of indices (the positions of the pixels in the input array)
 * and coefficients (the fraction of each pixel going to the bin).
 * The LUT is represented by a set of 3 arrays (coefs, row_ind, col_ptr).
 * Values of 0 in the mask are processed and values of 1 ignored as per pyFAI.
 *
 * This implementation is especially efficient on CPU where each core reads adjacent memory.
 * The use of local pointers can help on the CPU.
 *
 * @param weights     Float pointer to global memory storing the input image.
 * @param lut         Pointer to a 2D-array of (unsigned int, float) containing the index of input pixels and the fraction of each pixel going to the bin
 * @param coefs       Float pointer to global memory holding the coefficient part of the LUT
 * @param row_ind     Integer pointer to global memory holding the corresponding index of each coefficient
 * @param col_ptr     Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param do_dummy    Bool/int: shall the dummy pixels be checked? Dummy pixels are pixels marked as bad and ignored
 * @param dummy       Float: value for bad pixels
 * @param delta_dummy Float: precision for the bad pixel value
 * @param do_dark     Bool/int: shall dark-current correction be applied?
 * @param dark        Float pointer to global memory storing the dark image.
 * @param do_flat     Bool/int: shall flat-field correction be applied? (could contain polarization corrections)
 * @param flat        Float pointer to global memory storing the flat image.
 * @param outData     Float pointer to the output 1D array with the weighted histogram
 * @param outCount    Float pointer to the output 1D array with the unweighted histogram
 * @param outMerged   Float pointer to the output 1D array with the diffractogram
@@ -292,6 +288,106 @@ csr_integrate( const __global float *weights,
    cd = 0;
    cc = 0;

    if (thread_id_loc == 0)
    {
        outData[bin_num] = super_sum_data[0];
        outCount[bin_num] = super_sum_count[0];
        if (outCount[bin_num] > epsilon)
            outMerge[bin_num] = outData[bin_num] / outCount[bin_num];
        else
            outMerge[bin_num] = dummy;
    }
};//end kernel
/**
 * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
 *
 * An image intensity value is spread across the bins according to the positions stored in the LUT.
 * The LUT is represented by a set of 3 arrays (coefs, row_ind, col_ptr).
 * Values of 0 in the mask are processed and values of 1 ignored as per pyFAI.
 *
 * This kernel is meant to be run with padded data (the span of each bin must be a multiple of the workgroup size).
 *
 * @param weights  Float pointer to global memory storing the input image.
 * @param coefs    Float pointer to global memory holding the coefficient part of the LUT
 * @param row_ind  Integer pointer to global memory holding the corresponding index of each coefficient
 * @param col_ptr  Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param do_dummy Bool/int: shall the dummy pixels be checked? Dummy pixels are pixels marked as bad and ignored
 * @param dummy    Float: value for bad pixels
 * @param outData  Float pointer to the output 1D array with the weighted histogram
 * @param outCount Float pointer to the output 1D array with the unweighted histogram
 * @param outMerge Float pointer to the output 1D array with the diffractogram
 *
 */
__kernel void
csr_integrate_padded( const __global float *weights,
                      const __global float *coefs,
                      const __global int *row_ind,
                      const __global int *col_ptr,
                      const int do_dummy,
                      const float dummy,
                      __global float *outData,
                      __global float *outCount,
                      __global float *outMerge
                    )
{
    int thread_id_loc = get_local_id(0);
    int bin_num = get_group_id(0); // each workgroup of size=warp is assigned to 1 bin
    int2 bin_bounds;
//    bin_bounds = (int2) *(col_ptr+bin_num);  // cool stuff!
    bin_bounds.x = col_ptr[bin_num];
    bin_bounds.y = col_ptr[bin_num + 1];
    float sum_data = 0.0f;
    float sum_count = 0.0f;
    float cd = 0.0f;
    float cc = 0.0f;
    float t, y;
    const float epsilon = 1e-10f;
    float coef, data;
    int idx, k, j;

    for (j = bin_bounds.x; j < bin_bounds.y; j += WORKGROUP_SIZE)
    {
        k = j + thread_id_loc;
        coef = coefs[k];
        idx = row_ind[k];
        data = weights[idx];
        if ( (!do_dummy) || (data != dummy) )
        {
            //sum_data += coef * data;
            //sum_count += coef;
            //Kahan summation allows single precision arithmetics with error compensation
            //http://en.wikipedia.org/wiki/Kahan_summation_algorithm
            y = coef * data - cd;
            t = sum_data + y;
            cd = (t - sum_data) - y;
            sum_data = t;
            y = coef - cc;
            t = sum_count + y;
            cc = (t - sum_count) - y;
            sum_count = t;
        };//end if dummy
    };//for j
/*
 * parallel reduction
 */

// REMEMBER TO PASS WORKGROUP_SIZE AS A CPP DEF
    __local float super_sum_data[WORKGROUP_SIZE];
    __local float super_sum_data_correction[WORKGROUP_SIZE];
    __local float super_sum_count[WORKGROUP_SIZE];
    __local float super_sum_count_correction[WORKGROUP_SIZE];
    super_sum_data[thread_id_loc] = sum_data;
    super_sum_count[thread_id_loc] = sum_count;
    super_sum_data_correction[thread_id_loc] = cd;
    super_sum_count_correction[thread_id_loc] = cc;
    barrier(CLK_LOCAL_MEM_FENCE);

    float super_sum_temp = 0.0f;
    int index, active_threads = WORKGROUP_SIZE;
    cd = 0;
    cc = 0;

    while (active_threads != 1)
    {
        active_threads /= 2;
@@ -326,6 +422,20 @@ csr_integrate( const __global float *weights,
    }
};//end kernel


/**
 * \brief Performs distortion corrections on an image using a LUT in CSR form
 *
 * Corrects an image based on the look-up table calculated ...
 *
 * @param weights  Float pointer to global memory storing the input image.
 * @param coefs    Float pointer to global memory holding the coefficient part of the LUT
 * @param row_ind  Integer pointer to global memory holding the corresponding index of each coefficient
 * @param col_ptr  Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param outData  Float pointer to the output 1D array with the corrected image
 *
 */
__kernel void
csr_integrate_dis( const __global float *weights,
                   const __global float *coefs,
@@ -0,0 +1,405 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
//#pragma OPENCL EXTENSION cl_intel_printf : enable


float area4(float a0, float a1, float b0, float b1, float c0, float c1, float d0, float d1)
{
    return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)));
}


float integrate_line(float A0, float B0, float2 AB)
{
    return (A0 == B0) ? 0.0 : AB.s0 * (B0 * B0 - A0 * A0) * 0.5 + AB.s1 * (B0 - A0);
}


float getBinNr(float x0, float delta, float pos0_min)
{
    return (x0 - pos0_min) / delta;
}


float min4f(float a, float b, float c, float d)
{
    return fmin(fmin(a, b), fmin(c, d));
}


float max4f(float a, float b, float c, float d)
{
    return fmax(fmax(a, b), fmax(c, d));
}
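These helpers compute the quadrilateral area from its diagonals and the signed
area under one edge between two abscissae. A quick pure-Python check of the same
formulas (hypothetical corner coordinates, chosen with no vertical edges):

    def area4(a, b, c, d):
        # 0.5 * |AC x BD|: area of a convex quadrilateral from its diagonals
        return 0.5 * abs((c[0] - a[0]) * (d[1] - b[1]) - (c[1] - a[1]) * (d[0] - b[0]))

    def integrate_line(A0, B0, slope, intercept):
        # signed area under y = slope*x + intercept between abscissae A0 and B0
        return 0.0 if A0 == B0 else slope * (B0 ** 2 - A0 ** 2) * 0.5 + intercept * (B0 - A0)

    quad = [(0.0, 0.0), (0.1, 1.0), (1.2, 1.1), (1.1, 0.1)]
    total = 0.0
    for (x0, y0), (x1, y1) in zip(quad, quad[1:] + quad[:1]):
        slope = (y1 - y0) / (x1 - x0)
        total += integrate_line(x0, x1, slope, y0 - slope * x0)
    print(abs(total), area4(*quad))  # both give ~1.09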


// Software atomic add on floats, emulated with a compare-and-swap loop
void AtomicAdd(volatile __global float *source, const float operand)
{
    union {
        unsigned int intVal;
        float floatVal;
    } newVal;
    union {
        unsigned int intVal;
        float floatVal;
    } prevVal;
    do {
        prevVal.floatVal = *source;
        newVal.floatVal = prevVal.floatVal + operand;
    } while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
/**
 * \brief Cast values of an array of uint16 into a float output array.
 *
 * @param array_u16:   Pointer to global memory with the input data as unsigned16 array
 * @param array_float: Pointer to global memory with the output data as float array
 */
__kernel void
u16_to_float(__global unsigned short *array_u16,
             __global float *array_float
            )
{
    int i = get_global_id(0);
    //Global memory guard for padding
    if (i < NIMAGE)
        array_float[i] = (float)array_u16[i];
}
/**
 * \brief Convert values of an array of int32 into a float output array.
 *
 * @param array_int:   Pointer to global memory with the data in int
 * @param array_float: Pointer to global memory with the data in float
 */
__kernel void
s32_to_float( __global int *array_int,
              __global float *array_float
            )
{
    int i = get_global_id(0);
    //Global memory guard for padding
    if (i < NIMAGE)
        array_float[i] = (float)(array_int[i]);
}
/**
 * \brief Sets the values of 3 float output arrays to zero.
 *
 * Gridsize = size of arrays + padding.
 *
 * @param array0: float pointer to global memory with the outMerge array
 * @param array1: float pointer to global memory with the outCount array
 * @param array2: float pointer to global memory with the outData array
 */
__kernel void
memset_out(__global float *array0,
           __global float *array1,
           __global float *array2
          )
{
    int i = get_global_id(0);
    //Global memory guard for padding
    if (i < BINS)
    {
        array0[i] = 0.0f;
        array1[i] = 0.0f;
        array2[i] = 0.0f;
    }
}
/**
 * \brief Performs the first part of a 2-step parallel reduction.
 *
 * Together with the second part, it takes a flattened 4D-array
 * and returns the min and max of both of the 2 components of the
 * last dimension.
 *
 * @param buffer:    float pointer to global memory with the flattened 4D-array (pos in pyFAI)
 * @param length:    integer value of the length of the buffer array
 * @param preresult: float pointer to global memory with the intermediate data of the 2-step parallel reduction. Should be the size of the workgroup
 */
__kernel
void reduce1(__global float2* buffer,
             __const int length,
             __global float4* preresult) {

    int global_index = get_global_id(0);
    int global_size = get_global_size(0);
    float4 accumulator;
    accumulator.x = INFINITY;
    accumulator.y = -INFINITY;
    accumulator.z = INFINITY;
    accumulator.w = -INFINITY;

    // Loop sequentially over chunks of input vector
    while (global_index < length/2) {
        float2 element = buffer[global_index];
        accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
        accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
        accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
        accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
        global_index += global_size;
    }

    __local float4 scratch[WORKGROUP_SIZE];

    // Perform parallel reduction
    int local_index = get_local_id(0);

    scratch[local_index] = accumulator;
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 1)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float4 other = scratch[local_index + active_threads];
            float4 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            mine.z = (mine.z < other.z) ? mine.z : other.z;
            mine.w = (mine.w > other.w) ? mine.w : other.w;
            /*
            float2 tmp;
            tmp.x = (mine.x < other.x) ? mine.x : other.x;
            tmp.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = tmp;
            */
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_index == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
}
/**
 * \brief Performs the second part of a 2-step parallel reduction.
 *
 * Together with the first part, it takes a flattened 4D-array
 * and returns the min and max of both of the 2 components of the
 * last dimension.
 *
 * @param preresult: float pointer to global memory with the intermediate data of the 2-step parallel reduction. Should be the size of the workgroup
 * @param result:    float pointer to global memory with the min/max values requested
 */
__kernel
void reduce2(__global float4* preresult,
             __global float4* result) {

    __local float4 scratch[WORKGROUP_SIZE];

    int local_index = get_local_id(0);

    scratch[local_index] = preresult[local_index];
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 1)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float4 other = scratch[local_index + active_threads];
            float4 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            mine.z = (mine.z < other.z) ? mine.z : other.z;
            mine.w = (mine.w > other.w) ? mine.w : other.w;
            /*
            float2 tmp;
            tmp.x = (mine.x < other.x) ? mine.x : other.x;
            tmp.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = tmp;
            */
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_index == 0) {
        result[0] = scratch[0];
    }
}
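A host-side numpy picture of this two-step scheme (each workgroup reduces its
chunk of (pos0, pos1) pairs to one partial (min0, max0, min1, max1), then a
single workgroup reduces the partials; sizes here are hypothetical):

    import numpy

    pos = numpy.random.random((10000, 2)).astype(numpy.float32)

    # step 1: one partial float4 per "workgroup"
    chunks = numpy.array_split(pos, 64)
    partials = numpy.array([[c[:, 0].min(), c[:, 0].max(),
                             c[:, 1].min(), c[:, 1].max()] for c in chunks])

    # step 2: a single reduction over the partials gives the final float4
    minmax = (partials[:, 0].min(), partials[:, 1].max(),
              partials[:, 2].min(), partials[:, 3].max())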


/**
 * \brief Performs normalization of the input image
 *
 * Intensities of images are corrected by:
 *  - dark (read-out) noise subtraction
 *  - solid angle correction (division)
 *  - polarization correction (division)
 *  - flat-field correction (division)
 * Corrections are made in place unless the pixel is dummy.
 * Dummy pixels are left untouched so that they remain dummy.
 *
 * @param image           Float pointer to global memory storing the input image.
 * @param do_dark         Bool/int: shall dark-current correction be applied?
 * @param dark            Float pointer to global memory storing the dark image.
 * @param do_flat         Bool/int: shall flat-field correction be applied?
 * @param flat            Float pointer to global memory storing the flat image.
 * @param do_solidangle   Bool/int: shall solid-angle correction be applied?
 * @param solidangle      Float pointer to global memory storing the solid angle of each pixel.
 * @param do_polarization Bool/int: shall polarization correction be applied?
 * @param polarization    Float pointer to global memory storing the polarization of each pixel.
 * @param do_dummy        Bool/int: shall the dummy pixels be checked? Dummy pixels are pixels marked as bad and ignored
 * @param dummy           Float: value for bad pixels
 * @param delta_dummy     Float: precision for the bad pixel value
 *
 **/
__kernel void
corrections( __global float *image,
             const int do_dark,
             const __global float *dark,
             const int do_flat,
             const __global float *flat,
             const int do_solidangle,
             const __global float *solidangle,
             const int do_polarization,
             const __global float *polarization,
             const int do_dummy,
             const float dummy,
             const float delta_dummy
           )
{
    float data;
    int i = get_global_id(0);
    if (i < NIMAGE)
    {
        data = image[i];
        int dummy_condition = ((!do_dummy) || ((delta_dummy != 0.0f) && (fabs(data - dummy) > delta_dummy)) || ((delta_dummy == 0.0f) && (data != dummy)));
        data -= do_dark ? dark[i] : 0;
        data *= do_flat ? 1 / flat[i] : 1;
        data *= do_solidangle ? 1 / solidangle[i] : 1;
        data *= do_polarization ? 1 / polarization[i] : 1;
        image[i] = dummy_condition ? data : dummy;
    };//end if NIMAGE
};//end kernel
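The same per-pixel arithmetic, vectorized on the host with numpy (all arrays
and values hypothetical):

    import numpy

    n = 1024
    image = numpy.random.random(n).astype(numpy.float32)
    dark = numpy.full(n, 0.01, dtype=numpy.float32)
    flat = numpy.full(n, 1.02, dtype=numpy.float32)
    solidangle = numpy.full(n, 0.98, dtype=numpy.float32)
    dummy = -1.0

    valid = image != dummy  # dummy pixels must stay dummy
    corrected = (image - dark) / (flat * solidangle)
    image = numpy.where(valid, corrected, dummy)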


/**
 * \brief Performs 1d azimuthal integration with full pixel splitting
 *
 * @param pos      Float pointer to global memory storing the flattened 4D-array with the pixel corner coordinates
 * @param image    Float pointer to global memory storing the input image.
 * @param minmax   Float pointer to global memory holding the min/max results of the reduction kernels
 * @param length   Integer value of the length of the buffer array
 * @param do_dummy Bool/int: shall the dummy pixels be checked? Dummy pixels are pixels marked as bad and ignored
 * @param dummy    Float: value for bad pixels
 * @param outData  Float pointer to the output 1D array with the weighted histogram
 * @param outCount Float pointer to the output 1D array with the unweighted histogram
 *
 */
__kernel
void integrate1(__global float8* pos,
                __global float* image,
//                __global int* mask,
//                __const int check_mask,
                __global float4* minmax,
                const int length,
//                float2 pos0Range,
//                float2 pos1Range,
                const int do_dummy,
                const float dummy,
                __global float* outData,
                __global float* outCount)
{
    int global_index = get_global_id(0);
    if (global_index < length)
    {
//        float pos0_min = fmax(fmin(pos0Range.x,pos0Range.y),minmax[0].s0);
//        float pos0_max = fmin(fmax(pos0Range.x,pos0Range.y),minmax[0].s1);
        float pos0_min = minmax[0].s0;
        float pos0_max = minmax[0].s1;
        pos0_max *= 1 + EPS;

        float delta = (pos0_max - pos0_min) / BINS;

        int local_index = get_local_id(0);

        float8 pixel = pos[global_index];
        float data = image[global_index];

        pixel.s0 = getBinNr(pixel.s0, delta, pos0_min);
        pixel.s2 = getBinNr(pixel.s2, delta, pos0_min);
        pixel.s4 = getBinNr(pixel.s4, delta, pos0_min);
        pixel.s6 = getBinNr(pixel.s6, delta, pos0_min);

        float min0 = min4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
        float max0 = max4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);

        int bin0_min = floor(min0);
        int bin0_max = floor(max0);

        float2 AB, BC, CD, DA;

        AB.x = (pixel.s3 - pixel.s1) / (pixel.s2 - pixel.s0);
        AB.y = pixel.s1 - AB.x * pixel.s0;
        BC.x = (pixel.s5 - pixel.s3) / (pixel.s4 - pixel.s2);
        BC.y = pixel.s3 - BC.x * pixel.s2;
        CD.x = (pixel.s7 - pixel.s5) / (pixel.s6 - pixel.s4);
        CD.y = pixel.s5 - CD.x * pixel.s4;
        DA.x = (pixel.s1 - pixel.s7) / (pixel.s0 - pixel.s6);
        DA.y = pixel.s7 - DA.x * pixel.s6;

        float areaPixel = area4(pixel.s0, pixel.s1, pixel.s2, pixel.s3, pixel.s4, pixel.s5, pixel.s6, pixel.s7);
        float oneOverPixelArea = 1.0 / areaPixel;
        for (int bin = bin0_min; bin < bin0_max + 1; bin++)
        {
            // clamp each corner abscissa into the [bin, bin+1] interval
            float A_lim = (pixel.s0<=bin)*(pixel.s0<=(bin+1))*bin + (pixel.s0>bin)*(pixel.s0<=(bin+1))*pixel.s0 + (pixel.s0>bin)*(pixel.s0>(bin+1))*(bin+1);
            float B_lim = (pixel.s2<=bin)*(pixel.s2<=(bin+1))*bin + (pixel.s2>bin)*(pixel.s2<=(bin+1))*pixel.s2 + (pixel.s2>bin)*(pixel.s2>(bin+1))*(bin+1);
            float C_lim = (pixel.s4<=bin)*(pixel.s4<=(bin+1))*bin + (pixel.s4>bin)*(pixel.s4<=(bin+1))*pixel.s4 + (pixel.s4>bin)*(pixel.s4>(bin+1))*(bin+1);
            float D_lim = (pixel.s6<=bin)*(pixel.s6<=(bin+1))*bin + (pixel.s6>bin)*(pixel.s6<=(bin+1))*pixel.s6 + (pixel.s6>bin)*(pixel.s6>(bin+1))*(bin+1);
            float partialArea = integrate_line(A_lim, B_lim, AB);
            partialArea += integrate_line(B_lim, C_lim, BC);
            partialArea += integrate_line(C_lim, D_lim, CD);
            partialArea += integrate_line(D_lim, A_lim, DA);
            float tmp = fabs(partialArea) * oneOverPixelArea;
            outCount[bin] = tmp;
            outData[bin] = data * tmp;
//            AtomicAdd(&outCount[bin], tmp);
//            AtomicAdd(&outData[bin], data*tmp);
        }
    }
}
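Each *_lim expression above is a branch-free clamp of a corner abscissa into
[bin, bin+1]; in Python terms (hypothetical values):

    def lim(x, b):
        # the kernel's branch-free form...
        branchless = ((x <= b) * (x <= b + 1) * b
                      + (x > b) * (x <= b + 1) * x
                      + (x > b) * (x > b + 1) * (b + 1))
        # ...equals an ordinary clamp
        assert branchless == min(max(x, b), b + 1)
        return branchless

    for x in (0.3, 1.5, 2.7):
        print(lim(x, 1))  # -> 1, 1.5, 2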
|
||||
|
||||
/**
|
||||
* \brief Finished the 1d azimuthal integration by calculating the ratio of the 2 histograms
|
||||
*
|
||||
* @param outData Float pointer to the output 1D array with the weighted histogram
|
||||
* @param outCount Float pointer to the output 1D array with the unweighted histogram
|
||||
* @param outMerged Float pointer to the output 1D array with the diffractogram
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel
|
||||
void integrate2(__global float* outData,
|
||||
__global float* outCount,
|
||||
__global float* outMerge)
|
||||
{
|
||||
int global_index = get_global_id(0);
|
||||
if (global_index < BINS)
|
||||
outMerge[global_index] = outData[global_index]/outCount[global_index];
|
||||
}
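
Review note: the A_lim/B_lim/C_lim/D_lim products above are a branchless clamp of each corner's bin coordinate to the [bin, bin+1] interval. A minimal NumPy sketch (the helper name is ours, not part of the kernel) showing the equivalence:

import numpy

def branchless_clamp(x, lo, hi):
    # same predicate arithmetic as the *_lim expressions in integrate1
    return (x <= lo) * (x <= hi) * lo + (x > lo) * (x <= hi) * x + (x > lo) * (x > hi) * hi

x = numpy.linspace(-1.0, 3.0, 41)
assert numpy.allclose(branchless_clamp(x, 0.0, 1.0), numpy.clip(x, 0.0, 1.0))
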
@@ -0,0 +1,92 @@
__kernel
void reduce1(__global float* buffer,
             __const int length,
             __global float2* preresult) {

    int global_index = get_global_id(0);
    int global_size = get_global_size(0);
    float2 accumulator;
    accumulator.x = INFINITY;
    accumulator.y = -INFINITY;
    // Loop sequentially over chunks of the input vector
    while (global_index < length) {
        float element = buffer[global_index];
        accumulator.x = (accumulator.x < element) ? accumulator.x : element;
        accumulator.y = (accumulator.y > element) ? accumulator.y : element;
        global_index += global_size;
    }

    __local float2 scratch[WORKGROUP_SIZE];

    // Perform parallel reduction
    int local_index = get_local_id(0);

    scratch[local_index] = accumulator;
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 2)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float2 other = scratch[local_index + active_threads];
            float2 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_index == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
}


__kernel
void reduce2(__global float2* preresult,
             __global float4* result) {

    __local float2 scratch[WORKGROUP_SIZE];

    int local_index = get_local_id(0);

    scratch[local_index] = preresult[local_index];
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 2)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float2 other = scratch[local_index + active_threads];
            float2 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_index == 0) {
        result[0] = vload4(0, (__local float*) scratch);
    }
}
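
Review note: reduce1/reduce2 are a classic two-stage reduction: each work-group folds a strided slice of the buffer into one partial (min, max) pair, then a single work-group folds the partial results. A NumPy reference (function name ours) that the kernels should agree with for a flat buffer of scalars:

import numpy

def two_stage_minmax(buffer, num_groups):
    # stage 1: each "work-group" reduces its slice to a (min, max) pair
    pre = numpy.array([(c.min(), c.max()) for c in numpy.array_split(buffer, num_groups)])
    # stage 2: one "work-group" reduces the per-group partial results
    return pre[:, 0].min(), pre[:, 1].max()

data = numpy.random.rand(640000).astype(numpy.float32)
assert two_stage_minmax(data, 128) == (data.min(), data.max())
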
@@ -0,0 +1,104 @@
__kernel
void reduce1(__global float2* buffer,
             __const int length,
             __global float4* preresult) {

    int global_index = get_global_id(0);
    int global_size = get_global_size(0);
    float4 accumulator;
    accumulator.x = INFINITY;
    accumulator.y = -INFINITY;
    accumulator.z = INFINITY;
    accumulator.w = -INFINITY;

    // Loop sequentially over chunks of the input vector
    while (global_index < length/2) {
        float2 element = buffer[global_index];
        accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
        accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
        accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
        accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
        global_index += global_size;
    }

    __local float4 scratch[WORKGROUP_SIZE];

    // Perform parallel reduction
    int local_index = get_local_id(0);

    scratch[local_index] = accumulator;
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 1)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float4 other = scratch[local_index + active_threads];
            float4 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            mine.z = (mine.z < other.z) ? mine.z : other.z;
            mine.w = (mine.w > other.w) ? mine.w : other.w;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_index == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
}


__kernel
void reduce2(__global float4* preresult,
             __global float4* result) {

    __local float4 scratch[WORKGROUP_SIZE];

    int local_index = get_local_id(0);

    scratch[local_index] = preresult[local_index];
    barrier(CLK_LOCAL_MEM_FENCE);

    int active_threads = get_local_size(0);

    while (active_threads != 1)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float4 other = scratch[local_index + active_threads];
            float4 mine = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            mine.z = (mine.z < other.z) ? mine.z : other.z;
            mine.w = (mine.w > other.w) ? mine.w : other.w;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (local_index == 0) {
        result[0] = scratch[0];
    }
}
@@ -0,0 +1,52 @@
import pyopencl as cl
from pyopencl import array
import numpy

length = 640000
workgroup_size = 128

a = numpy.random.rand(length).astype(numpy.float32)
a.shape = (length // 8, 4, 2)
input_a = a.reshape(length)

min0 = a[:, :, 0].min()
max0 = a[:, :, 0].max()
min1 = a[:, :, 1].min()
max1 = a[:, :, 1].max()
minmax = (min0, max0, min1, max1)


platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
ctx = cl.Context((device,))
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags


d_input = array.to_device(queue, input_a)
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4 * workgroup_size)
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4)

with open("reduction_test4.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()

compile_options = "-D WORKGROUP_SIZE=%i" % (workgroup_size)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(length), d_preresult)

program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)

result = numpy.ndarray(4, dtype=numpy.float32)

cl.enqueue_copy(queue, result, d_result)

print(minmax)

print(result)
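
Review note: the script only prints the two tuples side by side. To make it a self-checking test one could append (same variables as in the script above):

assert numpy.allclose(result, numpy.array(minmax, dtype=numpy.float32))
print("reduction kernels agree with numpy")
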
@@ -84,6 +84,13 @@ except ImportError as error:
                 " full pixel splitting: %s" % error)
    splitPixel = None

try:
    from . import splitPixelFull  # IGNORE:F0401
except ImportError as error:
    logger.error("Unable to import pyFAI.splitPixelFull"
                 " full pixel splitting: %s" % error)
    splitPixelFull = None

try:
    from . import splitBBox  # IGNORE:F0401
except ImportError as error:
@@ -2407,6 +2414,41 @@ class AzimuthalIntegrator(Geometry):

        if (I is None) and ("splitpix" in method):
            if "full" in method:
                if splitPixelFull is None:
                    logger.warning("SplitPixelFull is not available,"
                                   " falling back on splitbbox histogram !")
                    method = "splitbbox"
                else:
                    logger.debug("integrate1d uses SplitPixel implementation")
                    pos = self.array_from_unit(shape, "corner", unit)
                    qAxis, I, a, b = splitPixelFull.fullSplit1D(pos=pos,
                                                                weights=data,
                                                                bins=nbPt,
                                                                pos0Range=radial_range,
                                                                pos1Range=azimuth_range,
                                                                dummy=dummy,
                                                                delta_dummy=delta_dummy,
                                                                mask=mask,
                                                                dark=dark,
                                                                flat=flat,
                                                                solidangle=solidangle,
                                                                polarization=polarization
                                                                )
                    if error_model == "azimuthal":
                        variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
                    if variance is not None:
                        _, var1d, a, b = splitPixelFull.fullSplit1D(pos=pos,
                                                                    weights=variance,
                                                                    bins=nbPt,
                                                                    pos0Range=radial_range,
                                                                    pos1Range=azimuth_range,
                                                                    dummy=dummy,
                                                                    delta_dummy=delta_dummy,
                                                                    mask=mask,
                                                                    )
                        sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
            else:
                if splitPixel is None:
                    logger.warning("SplitPixel is not available,"
                                   " falling back on splitbbox histogram !")
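
Review note: this branch is reached from AzimuthalIntegrator.integrate1d when the method string contains "splitpix" (plus "full" for the new code path). A hedged usage sketch; the file names are hypothetical and the return convention may differ between pyFAI versions:

import fabio
import pyFAI

ai = pyFAI.load("detector.poni")        # hypothetical calibration file
img = fabio.open("image.edf").data      # hypothetical image
# "splitpixel full" matches both the "splitpix" and the "full" tests above
q, I = ai.integrate1d(img, 1000, method="splitpixel full", unit="q_nm^-1")
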
@@ -58,6 +58,7 @@ class OCL_CSR_Integrator(object):
               indptr: row pointer indicates the start of a given row. len nbin+1
        @param image_size: size of the image (for pre-processing)
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param block_size: the chosen size for WORKGROUP_SIZE
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
@@ -52,11 +52,13 @@ class OCL_CSR_Integrator(object):
                 platformid=None, deviceid=None,
                 checksum=None, profile=False):
        """
        @param data: coefficient of the matrix in a 1D vector of float32 - size of nnz
        @param indices: Column index position for the data (same size as data)
        @param indptr: row pointer indicates the start of a given row. len nbin+1
        @param lut: 3-tuple of arrays
               data: coefficient of the matrix in a 1D vector of float32 - size of nnz
               indices: Column index position for the data (same size as data)
               indptr: row pointer indicates the start of a given row. len nbin+1
        @param image_size: size of the image (for pre-processing)
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param block_size: the chosen size for WORKGROUP_SIZE
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
@@ -0,0 +1,380 @@
# -*- coding: utf-8 -*-
#
#    Project: Azimuthal integration
#             https://github.com/kif/pyFAI
#
#
#    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
#                            Giannis Ashiotis
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

__authors__ = ["Jérôme Kieffer", "Giannis Ashiotis"]
__license__ = "GPLv3"
__date__ = "04/04/2014"
__copyright__ = "2014, ESRF, Grenoble"
__contact__ = "jerome.kieffer@esrf.fr"

import os, gc, logging
import threading
import hashlib
import numpy
from .opencl import ocl, pyopencl
from .splitBBoxLUT import HistoBBox1d
from .utils import get_cl_file
from pyopencl import array
if pyopencl:
    mf = pyopencl.mem_flags
else:
    raise ImportError("pyopencl is not installed")
try:
    from .fastcrc import crc32
except ImportError:
    from zlib import crc32
logger = logging.getLogger("pyFAI.ocl_azim_csr")

class OCL_Hist_Pixelsplit(object):
    def __init__(self, pos, bins, image_size, pos0Range=None, pos1Range=None, devicetype="all",
                 padded=False, block_size=32,
                 platformid=None, deviceid=None,
                 checksum=None, profile=False):
        """
        @param lut: 3-tuple of arrays
               data: coefficient of the matrix in a 1D vector of float32 - size of nnz
               indices: Column index position for the data (same size as data)
               indptr: row pointer indicates the start of a given row. len nbin+1
        @param image_size: size of the image (for pre-processing)
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
        @type deviceid: int
        @param checksum: pre-calculated checksum to prevent re-calculating it :)
        @param profile: store profiling elements
        """
        self.BLOCK_SIZE = block_size  # query for warp size
        self.padded = padded
        self._sem = threading.Semaphore()
        self.pos = pos
        self.bins = bins
        self.pos_size = pos.size
        self.size = image_size
        if self.pos_size != 8 * self.size:
            raise RuntimeError("pos.size != 8 * image_size")
        self.pos0Range = numpy.zeros(1, pyopencl.array.vec.float2)
        self.pos1Range = numpy.zeros(1, pyopencl.array.vec.float2)
        if (pos0Range is not None) and (len(pos0Range) == 2):
            self.pos0Range[0][0] = min(pos0Range)
            self.pos0Range[0][1] = max(pos0Range)
        else:
            self.pos0Range[0][0] = -float("inf")
            self.pos0Range[0][1] = float("inf")

        if (pos1Range is not None) and (len(pos1Range) == 2):
            self.pos1Range[0][0] = min(pos1Range)
            self.pos1Range[0][1] = max(pos1Range)
        else:
            self.pos1Range[0][0] = -float("inf")
            self.pos1Range[0][1] = float("inf")

        self.profile = profile
        if not checksum:
            checksum = crc32(self.pos)
        self.on_device = {"pos": checksum, "dark": None, "flat": None, "polarization": None, "solidangle": None}
        self._cl_kernel_args = {}
        self._cl_mem = {}
        self.events = []
        if (platformid is None) and (deviceid is None):
            platformid, deviceid = ocl.select_device(devicetype)
        elif platformid is None:
            platformid = 0
        elif deviceid is None:
            deviceid = 0
        self.platform = ocl.platforms[platformid]
        self.device = self.platform.devices[deviceid]
        self.device_type = self.device.type
        if (self.device_type == "CPU") and (self.platform.vendor == "Apple"):
            logger.warning("This is a workaround for Apple's OpenCL on CPU: enforce BLOCK_SIZE=1")
            self.BLOCK_SIZE = 1
        self.workgroup_size = (self.BLOCK_SIZE,)
        self.wdim_bins = (self.bins * self.BLOCK_SIZE,)
        self.wdim_data = ((self.size + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),)
        try:
            # self._ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
            self._ctx = pyopencl.create_some_context()
            if self.profile:
                self._queue = pyopencl.CommandQueue(self._ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
            else:
                self._queue = pyopencl.CommandQueue(self._ctx)
            self._allocate_buffers()
            self._compile_kernels()
            self._set_kernel_arguments()
        except pyopencl.MemoryError as error:
            raise MemoryError(error)
        ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["pos"], self.pos)
        if self.profile:
            self.events.append(("copy pos data", ev))
        reduction_wg_size = 256
        reduce1 = self._program.reduce1(self._queue, (reduction_wg_size * reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce1"])
        self.events.append(("reduce1", reduce1))
        reduce2 = self._program.reduce2(self._queue, (reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce2"])
        self.events.append(("reduce2", reduce2))

        result = numpy.ndarray(4, dtype=numpy.float32)
        pyopencl.enqueue_copy(self._queue, result, self._cl_mem["minmax"])
        print(result)
        min0 = pos[:, :, 0].min()
        max0 = pos[:, :, 0].max()
        min1 = pos[:, :, 1].min()
        max1 = pos[:, :, 1].max()
        minmax = (min0, max0, min1, max1)

        print(minmax)

    def __del__(self):
        """
        Destructor: release all buffers
        """
        self._free_kernels()
        self._free_buffers()
        self._queue = None
        self._ctx = None
        gc.collect()

    def _allocate_buffers(self):
        """
        Allocate OpenCL buffers required for a specific configuration

        Note that an OpenCL context also requires some memory, as well as Event and other OpenCL functionalities which cannot and
        are not taken into account here.
        The memory required by a context varies depending on the device. Typical for a GTX580 is 65MB, but for a 9300m it is ~15MB.
        In addition, a GPU will always have at least 3-5MB of memory in use.
        Unfortunately, OpenCL does NOT have a built-in way to check the actual free memory on a device, only the total memory.
        """
        if self.size < self.BLOCK_SIZE:
            raise RuntimeError("Fatal error in _allocate_buffers. size (%d) must be >= BLOCK_SIZE (%d)" % (self.size, self.BLOCK_SIZE))
        size_of_float = numpy.dtype(numpy.float32).itemsize
        size_of_short = numpy.dtype(numpy.int16).itemsize
        size_of_int = numpy.dtype(numpy.int32).itemsize
        size_of_long = numpy.dtype(numpy.int64).itemsize

        ualloc = (self.pos_size * size_of_float)
        ualloc += (4 * self.BLOCK_SIZE * size_of_float)
        ualloc += (self.size * size_of_float) * 5
        ualloc += (self.bins * size_of_float) * 3
        memory = self.device.memory
        logger.info("%.3fMB are needed on device which has %.3fMB" % (ualloc / 1.0e6, memory / 1.0e6))
        if ualloc >= memory:
            raise MemoryError("Fatal error in _allocate_buffers. Not enough device memory for buffers (%lu requested, %lu available)" % (ualloc, memory))
        # now actually allocate:
        try:
            self._cl_mem["pos"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.pos_size)
            self._cl_mem["preresult"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4 * self.BLOCK_SIZE)
            self._cl_mem["minmax"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4)
            self._cl_mem["outData"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
            self._cl_mem["outCount"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
            self._cl_mem["outMerge"] = pyopencl.Buffer(self._ctx, mf.WRITE_ONLY, size=size_of_float * self.bins)
            self._cl_mem["image_u16"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_short * self.size)
            self._cl_mem["image"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.size)
            self._cl_mem["dark"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
            self._cl_mem["flat"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
            self._cl_mem["polarization"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
            self._cl_mem["solidangle"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
        except pyopencl.MemoryError as error:
            self._free_buffers()
            raise MemoryError(error)

    def _free_buffers(self):
        """
        free all memory allocated on the device
        """
        for buffer_name in self._cl_mem:
            if self._cl_mem[buffer_name] is not None:
                try:
                    self._cl_mem[buffer_name].release()
                    self._cl_mem[buffer_name] = None
                except pyopencl.LogicError:
                    logger.error("Error while freeing buffer %s" % buffer_name)

    def _compile_kernels(self, kernel_file=None):
        """
        Call the OpenCL compiler
        @param kernel_file: path to the OpenCL kernel source file
        """
        kernel_name = "ocl_hist_pixelsplit.cl"
        if kernel_file is None:
            if os.path.isfile(kernel_name):
                kernel_file = os.path.abspath(kernel_name)
            else:
                kernel_file = get_cl_file(kernel_name)
        else:
            kernel_file = str(kernel_file)
        with open(kernel_file, "r") as kernelFile:
            kernel_src = kernelFile.read()

        compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
                          (self.bins, self.size, self.BLOCK_SIZE, numpy.finfo(numpy.float32).eps)
        logger.info("Compiling file %s with options %s" % (kernel_file, compile_options))
        try:
            self._program = pyopencl.Program(self._ctx, kernel_src).build(options=compile_options)
        except pyopencl.MemoryError as error:
            raise MemoryError(error)

    def _free_kernels(self):
        """
        free all kernels
        """
        for kernel in self._cl_kernel_args:
            self._cl_kernel_args[kernel] = []
        self._program = None

    def _set_kernel_arguments(self):
        """Tie arguments of OpenCL kernel-functions to the actual kernels

        set_kernel_arguments() is a private method, called by configure().
        It uses the dictionary _cl_kernel_args.
        Note that by default, since TthRange is disabled, the integration kernels have tth_min_max tied to the tthRange argument slot.
        When setRange is called it replaces that argument with tthRange low and upper bounds. When unsetRange is called, the argument slot
        is reset to tth_min_max.
        """
        self._cl_kernel_args["reduce1"] = [self._cl_mem["pos"], numpy.int32(self.pos_size), self._cl_mem["preresult"]]
        self._cl_kernel_args["reduce2"] = [self._cl_mem["preresult"], self._cl_mem["minmax"]]
        self._cl_kernel_args["corrections"] = [self._cl_mem["image"], numpy.int32(0), self._cl_mem["dark"], numpy.int32(0), self._cl_mem["flat"],
                                               numpy.int32(0), self._cl_mem["solidangle"], numpy.int32(0), self._cl_mem["polarization"],
                                               numpy.int32(0), numpy.float32(0), numpy.float32(0)]
        self._cl_kernel_args["integrate1"] = [self._cl_mem["pos"], self._cl_mem["image"], self._cl_mem["minmax"], numpy.int32(0), self.pos0Range[0],
                                              self.pos1Range[0], numpy.int32(0), numpy.float32(0), self._cl_mem["outData"], self._cl_mem["outCount"]]
        self._cl_kernel_args["integrate2"] = [self._cl_mem["outData"], self._cl_mem["outCount"], self._cl_mem["outMerge"]]
        self._cl_kernel_args["memset_out"] = [self._cl_mem[i] for i in ["outData", "outCount", "outMerge"]]
        self._cl_kernel_args["u16_to_float"] = [self._cl_mem[i] for i in ["image_u16", "image"]]
        self._cl_kernel_args["s32_to_float"] = [self._cl_mem[i] for i in ["image", "image"]]

    def integrate(self, data, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None, dark_checksum=None, flat_checksum=None, solidAngle_checksum=None, polarization_checksum=None):
        events = []
        with self._sem:
            if data.dtype == numpy.uint16:
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image_u16"], numpy.ascontiguousarray(data))
                cast_u16_to_float = self._program.u16_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["u16_to_float"])
                events += [("copy image", copy_image), ("cast", cast_u16_to_float)]
            elif data.dtype == numpy.int32:
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data))
                cast_s32_to_float = self._program.s32_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["s32_to_float"])
                events += [("copy image", copy_image), ("cast", cast_s32_to_float)]
            else:
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data, dtype=numpy.float32))
                events += [("copy image", copy_image)]
            memset = self._program.memset_out(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["memset_out"])
            events += [("memset", memset)]
            if dummy is not None:
                do_dummy = numpy.int32(1)
                dummy = numpy.float32(dummy)
                if delta_dummy is None:
                    delta_dummy = numpy.float32(0)
                else:
                    delta_dummy = numpy.float32(abs(delta_dummy))
            else:
                do_dummy = numpy.int32(0)
                dummy = numpy.float32(0)
                delta_dummy = numpy.float32(0)
            self._cl_kernel_args["corrections"][9] = do_dummy
            self._cl_kernel_args["corrections"][10] = dummy
            self._cl_kernel_args["corrections"][11] = delta_dummy
            self._cl_kernel_args["integrate1"][6] = do_dummy
            self._cl_kernel_args["integrate1"][7] = dummy

            if dark is not None:
                do_dark = numpy.int32(1)
                if not dark_checksum:
                    dark_checksum = crc32(dark)
                if dark_checksum != self.on_device["dark"]:
                    ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["dark"], numpy.ascontiguousarray(dark, dtype=numpy.float32))
                    events.append(("copy dark", ev))
                    self.on_device["dark"] = dark_checksum
            else:
                do_dark = numpy.int32(0)
            self._cl_kernel_args["corrections"][1] = do_dark
            if flat is not None:
                do_flat = numpy.int32(1)
                if not flat_checksum:
                    flat_checksum = crc32(flat)
                if self.on_device["flat"] != flat_checksum:
                    ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["flat"], numpy.ascontiguousarray(flat, dtype=numpy.float32))
                    events.append(("copy flat", ev))
                    self.on_device["flat"] = flat_checksum
            else:
                do_flat = numpy.int32(0)
            self._cl_kernel_args["corrections"][3] = do_flat

            if solidAngle is not None:
                do_solidAngle = numpy.int32(1)
                if not solidAngle_checksum:
                    solidAngle_checksum = crc32(solidAngle)
                if solidAngle_checksum != self.on_device["solidangle"]:
                    ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["solidangle"], numpy.ascontiguousarray(solidAngle, dtype=numpy.float32))
                    events.append(("copy solidangle", ev))
                    self.on_device["solidangle"] = solidAngle_checksum
            else:
                do_solidAngle = numpy.int32(0)
            self._cl_kernel_args["corrections"][5] = do_solidAngle

            if polarization is not None:
                do_polarization = numpy.int32(1)
                if not polarization_checksum:
                    polarization_checksum = crc32(polarization)
                if polarization_checksum != self.on_device["polarization"]:
                    ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["polarization"], numpy.ascontiguousarray(polarization, dtype=numpy.float32))
                    events.append(("copy polarization", ev))
                    self.on_device["polarization"] = polarization_checksum
            else:
                do_polarization = numpy.int32(0)
            self._cl_kernel_args["corrections"][7] = do_polarization
            copy_image.wait()
            if do_dummy + do_polarization + do_solidAngle + do_flat + do_dark > 0:
                ev = self._program.corrections(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["corrections"])
                events.append(("corrections", ev))
            integrate1 = self._program.integrate1(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["integrate1"])
            events.append(("integrate1", integrate1))
            outMerge = numpy.empty(self.bins, dtype=numpy.float32)
            outData = numpy.empty(self.bins, dtype=numpy.float32)
            outCount = numpy.empty(self.bins, dtype=numpy.float32)
            ev = pyopencl.enqueue_copy(self._queue, outData, self._cl_mem["outData"])
            events.append(("copy D->H outData", ev))
            ev = pyopencl.enqueue_copy(self._queue, outCount, self._cl_mem["outCount"])
            events.append(("copy D->H outCount", ev))
            global_size_integrate2 = ((self.bins + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),)
            integrate2 = self._program.integrate2(self._queue, global_size_integrate2, self.workgroup_size, *self._cl_kernel_args["integrate2"])
            events.append(("integrate2", integrate2))
            ev = pyopencl.enqueue_copy(self._queue, outMerge, self._cl_mem["outMerge"])
            events.append(("copy D->H outMerge", ev))
            ev.wait()
        if self.profile:
            self.events += events
        return outMerge, outData, outCount

    def log_profile(self):
        """
        If we are in profiling mode, prints out all timing for every single OpenCL call
        """
        t = 0.0
        if self.profile:
            for e in self.events:
                if "__len__" in dir(e) and len(e) >= 2:
                    et = 1e-6 * (e[1].profile.end - e[1].profile.start)
                    print("%50s:\t%.3fms" % (e[0], et))
                    t += et

        print("_" * 80)
        print("%50s:\t%.3fms" % ("Total execution time", t))
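
Review note: a minimal sketch of how this class is meant to be driven, kept fully commented since it needs an OpenCL runtime; the names follow the constructor above, and the corner-array layout (one float8 per pixel) is an assumption taken from the kernels:

# from pyFAI.ocl_hist_pixelsplit import OCL_Hist_Pixelsplit
#
# pos: one float8 per pixel, i.e. 4 corners x (pos0, pos1), so pos.size == 8 * image_size
# integrator = OCL_Hist_Pixelsplit(pos, bins=1000, image_size=image.size, devicetype="gpu", profile=True)
# outMerge, outData, outCount = integrator.integrate(image)
# integrator.log_profile()
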
setup.py
@@ -49,14 +49,10 @@ from distutils.sysconfig import get_python_lib
from distutils.command.install_data import install_data

################################################################################
# Remove MANIFEST file ... it needs to be re-generated on the fly
# Check for Cython
################################################################################
if op.isfile("MANIFEST"):
    os.unlink("MANIFEST")

################################################################################
# Check for Cython
################################################################################
try:
    from Cython.Distutils import build_ext
    CYTHON = True

@@ -124,7 +120,6 @@ if ("sdist" in sys.argv):
# pyFAI extensions
# ###############################################################################
cython_modules = [os.path.splitext(os.path.basename(i))[0] for i in glob.glob("src/*.pyx")]

src = dict([(ext, join("src", ext + cython_c_ext)) for ext in cython_modules])

_geometry_dic = dict(name="_geometry",

@@ -151,6 +146,16 @@ splitPixel_dic = dict(name="splitPixel",
                      sources=[src['splitPixel']],
                      )

splitPixelFull_dic = dict(name="splitPixelFull",
                          include_dirs=get_numpy_include_dirs(),
                          sources=[src['splitPixelFull']],
                          )

splitPixelFullLUT_dic = dict(name="splitPixelFullLUT",
                             include_dirs=get_numpy_include_dirs(),
                             sources=[src['splitPixelFullLUT']],
                             )

splitBBox_dic = dict(name="splitBBox",
                     include_dirs=get_numpy_include_dirs(),
                     sources=[src['splitBBox']],

@@ -200,6 +205,7 @@ _bispev_dic = dict(name="_bispev",
                   sources=[src['_bispev']],
                   extra_compile_args=['openmp'],
                   extra_link_args=['openmp'],

                   )

_convolution_dic = dict(name="_convolution",

@@ -266,6 +272,7 @@ if sys.platform == "win32":
        if (filein + ".py") not in script_files:
            shutil.copyfile(filein, filein + ".py")
            script_files.append(filein + ".py")

else:
    script_files = glob.glob("scripts/*")


@@ -420,10 +427,11 @@ This python module can be found on:
http://pypi.python.org/pypi/pyopencl
""")


"""
################################################################################
# ###############################################################################
# check if OpenMP modules, freshly installed can import
################################################################################
# ###############################################################################
pyFAI = None
sys.path.insert(0, os.path.dirname(installDir))
# print installDir

@@ -436,7 +444,7 @@ for mod in sys.modules.copy():
try:
    import pyFAI
except ImportError as E:
    print("Unable to import pyFAI from system: %s" % E)
    print("Unable to import pyFAI: %s" % E)
else:
    print("PyFAI is installed in %s" % pyFAI.__file__)
    try:

@@ -445,5 +453,4 @@ else:
    print("PyFAI.histogram failed to import. It is likely there is an OpenMP error: %s" % E)
else:
    print("OpenMP libraries were found and pyFAI.histogram was successfully imported")

"""
@@ -903,3 +903,114 @@ def cal_LUT(float[:,:,:,:] pos not None, shape, int size, max_pixel_size):
                            outMax[ml, nl] = k + 1
                idx += 1
    return lut.reshape(shape0 * shape1, size)


def cal_CSR(float[:,:,:,:] pos not None, shape, bin_size, max_pixel_size):
    """
    @param pos: 4D position array
    @param shape: output shape
    @param bin_size: number of input elements per output element (as numpy array)
    @param max_pixel_size: (2-tuple of int) size of a buffer covering the largest pixel
    @return: look-up table in CSR format: 3-tuple of arrays
    """
    cdef int i, j, ms, ml, ns, nl, shape0, shape1, delta0, delta1, buffer_size, i0, i1, bins, lut_size, tmp_index
    cdef int offset0, offset1, box_size0, box_size1
    cdef numpy.int32_t k, idx = 0
    cdef float A0, A1, B0, B1, C0, C1, D0, D1, pAB, pBC, pCD, pDA, cAB, cBC, cCD, cDA, area, value
    cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros(shape, dtype=numpy.int32)
    cdef float[:,:] buffer
    cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr, indices
    cdef numpy.ndarray[numpy.float32_t, ndim = 1] data

    shape0, shape1 = shape
    delta0, delta1 = max_pixel_size
    bins = shape0 * shape1
    indptr = numpy.empty(bins + 1, dtype=numpy.int32)
    indptr[0] = 0
    indptr[1:] = bin_size.cumsum(dtype=numpy.int32)
    lut_size = indptr[bins]

    indices = numpy.zeros(shape=lut_size, dtype=numpy.int32)
    data = numpy.zeros(shape=lut_size, dtype=numpy.float32)

    indices_size = lut_size * sizeof(numpy.int32_t)
    data_size = lut_size * sizeof(numpy.float32_t)
    indptr_size = bins * sizeof(numpy.int32_t)

    logger.info("CSR matrix: %.3f MByte" % ((indices_size + data_size + indptr_size) / 1.0e6))
    buffer = cvarray(shape=(delta0, delta1), itemsize=sizeof(float), format="f")
    buffer_size = delta0 * delta1 * sizeof(float)
    logger.info("Max pixel size: %ix%i; Max source pixel in target: %i" % (buffer.shape[1], buffer.shape[0], lut_size))
    with nogil:
        # i, j, idx are indices of the raw, uncorrected image
        for i in range(shape0):
            for j in range(shape1):
                # reinit of buffer
                memset(&buffer[0, 0], 0, buffer_size)
                A0 = pos[i, j, 0, 0]
                A1 = pos[i, j, 0, 1]
                B0 = pos[i, j, 1, 0]
                B1 = pos[i, j, 1, 1]
                C0 = pos[i, j, 2, 0]
                C1 = pos[i, j, 2, 1]
                D0 = pos[i, j, 3, 0]
                D1 = pos[i, j, 3, 1]
                offset0 = (<int> floor(min4f(A0, B0, C0, D0)))
                offset1 = (<int> floor(min4f(A1, B1, C1, D1)))
                box_size0 = (<int> ceil(max4f(A0, B0, C0, D0))) - offset0
                box_size1 = (<int> ceil(max4f(A1, B1, C1, D1))) - offset1
                A0 -= <float> offset0
                A1 -= <float> offset1
                B0 -= <float> offset0
                B1 -= <float> offset1
                C0 -= <float> offset0
                C1 -= <float> offset1
                D0 -= <float> offset0
                D1 -= <float> offset1
                if B0 != A0:
                    pAB = (B1 - A1) / (B0 - A0)
                    cAB = A1 - pAB * A0
                else:
                    pAB = cAB = 0.0
                if C0 != B0:
                    pBC = (C1 - B1) / (C0 - B0)
                    cBC = B1 - pBC * B0
                else:
                    pBC = cBC = 0.0
                if D0 != C0:
                    pCD = (D1 - C1) / (D0 - C0)
                    cCD = C1 - pCD * C0
                else:
                    pCD = cCD = 0.0
                if A0 != D0:
                    pDA = (A1 - D1) / (A0 - D0)
                    cDA = D1 - pDA * D0
                else:
                    pDA = cDA = 0.0
                integrate(buffer, B0, A0, pAB, cAB)
                integrate(buffer, A0, D0, pDA, cDA)
                integrate(buffer, D0, C0, pCD, cCD)
                integrate(buffer, C0, B0, pBC, cBC)
                area = 0.5 * ((C0 - A0) * (D1 - B1) - (C1 - A1) * (D0 - B0))
                for ms in range(box_size0):
                    ml = ms + offset0
                    if ml < 0 or ml >= shape0:
                        continue
                    for ns in range(box_size1):
                        # ms, ns are indices of the corrected image in short form; ml & nl are the same in the full image
                        nl = ns + offset1
                        if nl < 0 or nl >= shape1:
                            continue
                        value = buffer[ms, ns] / area
                        if value <= 0:
                            continue
                        k = outMax[ml, nl]
                        tmp_index = indptr[ml * shape1 + nl]
                        indices[tmp_index + k] = idx
                        data[tmp_index + k] = value
                        outMax[ml, nl] = k + 1
                idx += 1
    return (data, indices, indptr)
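
Review note: the (data, indices, indptr) triple returned by cal_CSR is a standard CSR sparse matrix mapping raw pixels onto corrected ones. A hedged sketch of host-side use (function name is ours, assuming scipy is available):

import numpy
from scipy import sparse

def apply_csr(data, indices, indptr, raw_image, shape):
    # Build the CSR matrix from the triple returned by cal_CSR and apply it
    # to a raw image to obtain the distortion-corrected image.
    nbins = shape[0] * shape[1]
    csr = sparse.csr_matrix((data, indices, indptr), shape=(nbins, raw_image.size))
    return csr.dot(raw_image.ravel().astype(numpy.float32)).reshape(shape)
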
@@ -438,7 +438,7 @@ class Distortion(object):
    def correctHost(self, image):
        """
        Correct an image based on the look-up table calculated ...
        Caclulation takes place on the Host
        Calculation takes place on the Host

        @param image: 2D-array with the image
        @return: corrected 2D image
File diff suppressed because it is too large
@@ -0,0 +1,739 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#    Project: Azimuthal integration
#             https://forge.epn-campus.eu/projects/azimuthal
#
#    File: "$Id$"
#
#    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
#    Principal author: Giannis Ashiotis
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#


import cython
cimport numpy
import numpy
from libc.math cimport fabs, floor
from libc.stdio cimport printf
from cython.view cimport array as cvarray

#cdef double areaTriangle(double a0,
#                         double a1,
#                         double b0,
#                         double b1,
#                         double c0,
#                         double c1):
#    """
#    Calculate the area of the ABC triangle with corners:
#    A(a0,a1)
#    B(b0,b1)
#    C(c0,c1)
#    @return: area, i.e. 1/2 * (B-A)^(C-A)
#    """
#    return 0.5 * abs(((b0 - a0) * (c1 - a1)) - ((b1 - a1) * (c0 - a0)))
#
cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
    """
    Calculate the area of the ABCD quadrilateral with corners:
    A(a0,a1)
    B(b0,b1)
    C(c0,c1)
    D(d0,d1)
    @return: area, i.e. 1/2 * (AC ^ BD)
    """
    return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))

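# Editor's addition, a hedged sanity check: the diagonal cross-product form
# used by area4, area = 1/2 * |AC x BD|, agrees exactly with the shoelace
# formula for a simple quadrilateral. The pure-Python helper below verifies
# the identity numerically on a made-up quadrilateral.
def _check_area4_against_shoelace():
    import numpy
    quad = numpy.array([[0.0, 0.0], [2.0, 0.1], [2.2, 1.0], [0.1, 0.9]])  # corners A, B, C, D
    x, y = quad[:, 0], quad[:, 1]
    shoelace = 0.5 * abs(numpy.dot(x, numpy.roll(y, -1)) - numpy.dot(y, numpy.roll(x, -1)))
    A, B, C, D = quad
    cross = 0.5 * abs((C[0] - A[0]) * (D[1] - B[1]) - (C[1] - A[1]) * (D[0] - B[0]))
    assert numpy.isclose(cross, shoelace)
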
# cdef double area4(point2D *pixel):
#     """
#     Calculate the area of the ABCD quadrilateral with corners:
#     A(a0,a1)
#     B(b0,b1)
#     C(c0,c1)
#     D(d0,d1)
#     @return: area, i.e. 1/2 * (AC ^ BD)
#     """
#     return 0.5 * abs(((pixel[2].x - pixel[0].x) * (pixel[3].y - pixel[1].y)) - ((pixel[2].y - pixel[0].y) * (pixel[3].x - pixel[1].x)))

# cdef struct point2D:
#     numpy.float64_t x
#     numpy.float64_t y

# cdef struct min_max:
#     numpy.float64_t pos
#     numpy.int32_t point

cdef struct Function:
    double slope
    double intersect

#cdef class Function:

    #cdef double _slope
    #cdef double _intersect

    #def __cinit__(self, double A0=0.0, double A1=0.0, double B0=1.0, double B1=0.0):
        #self._slope = (B1-A1)/(B0-A0)
        #self._intersect = A1 - self._slope*A0

    #def __cinit__(self):
        #self._slope = 0.0
        #self._intersect = 0.0

    #cdef double f(self, double x):
        #return self._slope*x + self._intersect

    #cdef double integrate(self, double A0, double B0) nogil:
        #if A0==B0:
            #return 0.0
        #else:
            #return self._slope*(B0*B0 - A0*A0)*0.5 + self._intersect*(B0-A0)

    #cdef void reset(self, double A0, double A1, double B0, double B1) nogil:
        #self._slope = (B1-A1)/(B0-A0)
        #self._intersect = A1 - self._slope*A0

cdef double integrate(double A0, double B0, Function AB) nogil:
    """
    Integrates the line defined by AB, from A0 to B0
    @param A0: first limit
    @param B0: second limit
    @param AB: struct with the slope and point of intersection of the line
    """
    if A0 == B0:
        return 0.0
    else:
        return AB.slope * (B0 * B0 - A0 * A0) * 0.5 + AB.intersect * (B0 - A0)

@cython.cdivision(True)
cdef double getBinNr(double x0, double pos0_min, double dpos) nogil:
    """
    Calculate the bin number for any point
    @param x0: current position
    @param pos0_min: position minimum
    @param dpos: bin width
    """
    return (x0 - pos0_min) / dpos

# cdef min_max min4f(point2D *pixel, int dim) nogil:
#     cdef min_max tmp
#     if dim == 0:
#         if (pixel[0].x <= pixel[1].x) and (pixel[0].x <= pixel[2].x) and (pixel[0].x <= pixel[3].x):
#             tmp.pos = pixel[0].x
#             tmp.pixel = 0
#             return tmp
#         if (pixel[1].x <= pixel[0].x) and (pixel[1].x <= pixel[2].x) and (pixel[1].x <= pixel[3].x):
#             tmp.pos = pixel[1].x
#             tmp.pixel = 1
#             return tmp
#         if (pixel[2].x <= pixel[0].x) and (pixel[2].x <= pixel[1].x) and (pixel[2].x <= pixel[3].x):
#             tmp.pos = pixel[2].x
#             tmp.pixel = 2
#             return tmp
#         else:
#             tmp.pos = pixel[3].x
#             tmp.pixel = 3
#             return tmp
#     elif dim == 1:
#         if (pixel[0].y <= pixel[1].y) and (pixel[0].y <= pixel[2].y) and (pixel[0].y <= pixel[3].y):
#             tmp.pos = pixel[0].y
#             tmp.pixel = 0
#             return tmp
#         if (pixel[1].y <= pixel[0].y) and (pixel[1].y <= pixel[2].y) and (pixel[1].y <= pixel[3].y):
#             tmp.pos = pixel[1].y
#             tmp.pixel = 1
#             return tmp
#         if (pixel[2].y <= pixel[0].y) and (pixel[2].y <= pixel[1].y) and (pixel[2].y <= pixel[3].y):
#             tmp.pos = pixel[2].y
#             tmp.pixel = 2
#             return tmp
#         else:
#             tmp.pos = pixel[3].y
#             tmp.pixel = 3
#             return tmp

# cdef min_max max4f(point2D *pixel, int dim) nogil:
#     cdef min_max tmp
#     if dim == 0:
#         if (pixel[0].x >= pixel[1].x) and (pixel[0].x >= pixel[2].x) and (pixel[0].x >= pixel[3].x):
#             tmp.pos = pixel[0].x
#             tmp.pixel = 0
#             return tmp
#         if (pixel[1].x >= pixel[0].x) and (pixel[1].x >= pixel[2].x) and (pixel[1].x >= pixel[3].x):
#             tmp.pos = pixel[1].x
#             tmp.pixel = 1
#             return tmp
#         if (pixel[2].x >= pixel[0].x) and (pixel[2].x >= pixel[1].x) and (pixel[2].x >= pixel[3].x):
#             tmp.pos = pixel[2].x
#             tmp.pixel = 2
#             return tmp
#         else:
#             tmp.pos = pixel[3].x
#             tmp.pixel = 3
#             return tmp
#     elif dim == 1:
#         if (pixel[0].y >= pixel[1].y) and (pixel[0].y >= pixel[2].y) and (pixel[0].y >= pixel[3].y):
#             tmp.pos = pixel[0].y
#             tmp.pixel = 0
#             return tmp
#         if (pixel[1].y >= pixel[0].y) and (pixel[1].y >= pixel[2].y) and (pixel[1].y >= pixel[3].y):
#             tmp.pos = pixel[1].y
#             tmp.pixel = 1
#             return tmp
#         if (pixel[2].y >= pixel[0].y) and (pixel[2].y >= pixel[1].y) and (pixel[2].y >= pixel[3].y):
#             tmp.pos = pixel[2].y
#             tmp.pixel = 2
#             return tmp
#         else:
#             tmp.pos = pixel[3].y
#             tmp.pixel = 3
#             return tmp



cdef double min4f(double a, double b, double c, double d) nogil:
    """Calculates the min of 4 double numbers"""
    if (a <= b) and (a <= c) and (a <= d):
        return a
    if (b <= a) and (b <= c) and (b <= d):
        return b
    if (c <= a) and (c <= b) and (c <= d):
        return c
    else:
        return d

cdef double max4f(double a, double b, double c, double d) nogil:
    """Calculates the max of 4 double numbers"""
    if (a >= b) and (a >= c) and (a >= d):
        return a
    if (b >= a) and (b >= c) and (b >= d):
        return b
    if (c >= a) and (c >= b) and (c >= d):
        return c
    else:
        return d

@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def fullSplit1D(numpy.ndarray pos not None,
                numpy.ndarray weights not None,
                size_t bins=100,
                pos0Range=None,
                pos1Range=None,
                dummy=None,
                delta_dummy=None,
                mask=None,
                dark=None,
                flat=None,
                solidangle=None,
                polarization=None
                ):
    """
    Calculates the histogram of pos weighted by weights

    Splitting is done on the pixel's bounding box, like fit2D.
    No compromise for speed has been made here.


    @param pos: 3D or 4D array with the coordinates of each pixel point
    @param weights: array with intensities
    @param bins: number of output bins
    @param pos0Range: minimum and maximum of the 2th range
    @param pos1Range: minimum and maximum of the chi range
    @param dummy: value for bins without pixels
    @param delta_dummy: precision of dummy value
    @param mask: array (of int8) with masked pixels with 1 (0=not masked)
    @param dark: array (of float64) with dark noise to be subtracted (or None)
    @param flat: array (of float64) with flat image
    @param polarization: array (of float64) with polarization correction
    @param solidangle: array (of float64) with solid angle corrections
    @return: 2theta, I, weighted histogram, unweighted histogram
    """
    cdef size_t size = weights.size
    if pos.ndim > 3:  # create a view
        pos = pos.reshape((-1, 4, 2))
    assert pos.shape[0] == size
    assert pos.shape[1] == 4
    assert pos.shape[2] == 2
    assert pos.ndim == 3
    assert bins > 1

    cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(pos, dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outMerge = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.int8_t[:] cmask
    cdef double[:] cflat, cdark, cpolarization, csolidangle

    cdef double cdummy = 0, cddummy = 0, data = 0
    cdef double pos0_min = 0, pos0_max = 0, pos0_maxin = 0, pos1_min = 0, pos1_max = 0, pos1_maxin = 0
    cdef double areaPixel = 0, dpos = 0, fbin0_min = 0, fbin0_max = 0  # , fbin1_min, fbin1_max
    cdef double A0 = 0, B0 = 0, C0 = 0, D0 = 0, A1 = 0, B1 = 0, C1 = 0, D1 = 0
    cdef double A_lim = 0, B_lim = 0, C_lim = 0, D_lim = 0
    cdef double oneOverPixelArea = 0, partialArea = 0, partialArea2 = 0, tmp = 0
    cdef double max0, min0, max1, min1
    # cdef point2D[:] pixel
    cdef Function AB, BC, CD, DA
    cdef double epsilon = 1e-10

    cdef bint check_pos1 = False, check_mask = False, check_dummy = False, do_dark = False, do_flat = False, do_polarization = False, do_solidangle = False
    cdef int i = 0, idx = 0, bin = 0, bin0_max = 0, bin0_min = 0, pixel_bins = 0, cur_bin

    if pos0Range is not None and len(pos0Range) > 1:
        pos0_min = min(pos0Range)
        pos0_maxin = max(pos0Range)
    else:
        pos0_min = pos[:, :, 0].min()
        pos0_maxin = pos[:, :, 0].max()
    pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
    if pos1Range is not None and len(pos1Range) > 1:
        pos1_min = min(pos1Range)
        pos1_maxin = max(pos1Range)
        check_pos1 = True
    else:
        pos1_min = pos[:, :, 1].min()
        pos1_maxin = pos[:, :, 1].max()
    pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
    dpos = (pos0_max - pos0_min) / (<double> (bins))


    outPos = numpy.linspace(pos0_min + 0.5 * dpos, pos0_maxin - 0.5 * dpos, bins)

    if (dummy is not None) and (delta_dummy is not None):
        check_dummy = True
        cdummy = float(dummy)
        cddummy = float(delta_dummy)
    elif (dummy is not None):
        check_dummy = True
        cdummy = float(dummy)
        cddummy = 0.0
    else:
        check_dummy = False
        cdummy = 0.0
        cddummy = 0.0

    if mask is not None:
        check_mask = True
        assert mask.size == size
        cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
    if dark is not None:
        do_dark = True
        assert dark.size == size
        cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
    if flat is not None:
        do_flat = True
        assert flat.size == size
        cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
    if polarization is not None:
        do_polarization = True
        assert polarization.size == size
        cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
    if solidangle is not None:
        do_solidangle = True
        assert solidangle.size == size
        csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)

    # pixel = cvarray(shape=4, itemsize=sizeof(point2D))
    # AB = Function()
    # BC = Function()
    # CD = Function()
    # DA = Function()

    with nogil:
        for idx in range(size):

            if (check_mask) and (cmask[idx]):
                continue

            data = cdata[idx]
            if check_dummy and ((cddummy == 0.0 and data == cdummy) or (cddummy != 0.0 and fabs(data - cdummy) <= cddummy)):
                continue

            # pixel[0].x = getBinNr(< double > cpos[idx, 0, 0], pos0_min, dpos)
            # pixel[0].y = < double > cpos[idx, 0, 1]
            # pixel[1].x = getBinNr(< double > cpos[idx, 1, 0], pos0_min, dpos)
            # pixel[1].y = < double > cpos[idx, 1, 1]
            # pixel[2].x = getBinNr(< double > cpos[idx, 2, 0], pos0_min, dpos)
            # pixel[2].y = < double > cpos[idx, 2, 1]
            # pixel[3].x = getBinNr(< double > cpos[idx, 3, 0], pos0_min, dpos)
            # pixel[3].y = < double > cpos[idx, 3, 1]

            A0 = getBinNr(<double> cpos[idx, 0, 0], pos0_min, dpos)
            A1 = <double> cpos[idx, 0, 1]
            B0 = getBinNr(<double> cpos[idx, 1, 0], pos0_min, dpos)
            B1 = <double> cpos[idx, 1, 1]
            C0 = getBinNr(<double> cpos[idx, 2, 0], pos0_min, dpos)
            C1 = <double> cpos[idx, 2, 1]
            D0 = getBinNr(<double> cpos[idx, 3, 0], pos0_min, dpos)
            D1 = <double> cpos[idx, 3, 1]

            min0 = min4f(A0, B0, C0, D0)
            max0 = max4f(A0, B0, C0, D0)
            if (max0 < 0) or (min0 >= bins):
                continue
            if check_pos1:
                min1 = min4f(A1, B1, C1, D1)
                max1 = max4f(A1, B1, C1, D1)
                if (max1 < pos1_min) or (min1 > pos1_maxin):
                    continue

            if do_dark:
                data -= cdark[idx]
            if do_flat:
                data /= cflat[idx]
            if do_polarization:
                data /= cpolarization[idx]
            if do_solidangle:
                data /= csolidangle[idx]

            bin0_min = <int> floor(min0)
            bin0_max = <int> floor(max0)

            # printf("%d - [(%f %f) (%f %f) (%f %f) (%f %f)] (%f %f) (%d %d)\n", idx, A0, A1, B0, B1, C0, C1, D0, D1, min0, max0, bin0_min, bin0_max)
            if bin0_min == bin0_max:
                # the whole pixel is within a single bin
                outCount[bin0_min] += 1
                outData[bin0_min] += data

            # else we have pixel splitting.
            else:
                AB.slope = (B1 - A1) / (B0 - A0)
                AB.intersect = A1 - AB.slope * A0
                BC.slope = (C1 - B1) / (C0 - B0)
                BC.intersect = B1 - BC.slope * B0
                CD.slope = (D1 - C1) / (D0 - C0)
                CD.intersect = C1 - CD.slope * C0
                DA.slope = (A1 - D1) / (A0 - D0)
                DA.intersect = D1 - DA.slope * D0
                areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
                oneOverPixelArea = 1.0 / areaPixel
                partialArea2 = 0.0
                for bin in range(bin0_min, bin0_max + 1):
                    A_lim = (A0 <= bin) * (A0 <= (bin + 1)) * bin + (A0 > bin) * (A0 <= (bin + 1)) * A0 + (A0 > bin) * (A0 > (bin + 1)) * (bin + 1)
                    B_lim = (B0 <= bin) * (B0 <= (bin + 1)) * bin + (B0 > bin) * (B0 <= (bin + 1)) * B0 + (B0 > bin) * (B0 > (bin + 1)) * (bin + 1)
                    C_lim = (C0 <= bin) * (C0 <= (bin + 1)) * bin + (C0 > bin) * (C0 <= (bin + 1)) * C0 + (C0 > bin) * (C0 > (bin + 1)) * (bin + 1)
                    D_lim = (D0 <= bin) * (D0 <= (bin + 1)) * bin + (D0 > bin) * (D0 <= (bin + 1)) * D0 + (D0 > bin) * (D0 > (bin + 1)) * (bin + 1)
                    partialArea = integrate(A_lim, B_lim, AB)
                    partialArea += integrate(B_lim, C_lim, BC)
                    partialArea += integrate(C_lim, D_lim, CD)
                    partialArea += integrate(D_lim, A_lim, DA)
                    tmp = fabs(partialArea) * oneOverPixelArea
                    partialArea2 += partialArea
                    outCount[bin] += tmp
                    outData[bin] += data * tmp
                # if fabs(partialArea2 - areaPixel) > epsilon:
                #     printf("%d - %f \n", idx, (partialArea2 - areaPixel) / areaPixel)


    for i in range(bins):
        if outCount[i] > epsilon:
            outMerge[i] = outData[i] / outCount[i]
        else:
            outMerge[i] = cdummy

    return outPos, outMerge, outData, outCount
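
# Editor's addition, a hedged usage sketch: fullSplit1D expects `pos` with one
# (pos0, pos1) pair per corner, i.e. shape (n, 4, 2). The toy geometry below is
# made up purely for illustration and is kept commented out.
#
#     import numpy
#     from pyFAI import splitPixelFull
#     n = 1000
#     rng = numpy.random.RandomState(0)
#     centers = numpy.sort(rng.rand(n))      # radial position of each pixel center
#     chi = rng.rand(n)                      # azimuthal coordinate, arbitrary here
#     pos = numpy.empty((n, 4, 2))
#     for c, (dr, dc) in enumerate([(-5e-4, -5e-4), (5e-4, -5e-4), (5e-4, 5e-4), (-5e-4, 5e-4)]):
#         pos[:, c, 0] = centers + dr
#         pos[:, c, 1] = chi + dc
#     outPos, outMerge, outData, outCount = splitPixelFull.fullSplit1D(pos, numpy.ones(n), bins=100)
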
|
||||
|
||||
|
||||
|
||||
|
||||
#@cython.cdivision(True)
#@cython.boundscheck(False)
#@cython.wraparound(False)
#def fullSplit2D(numpy.ndarray pos not None,
#                numpy.ndarray weights not None,
#                bins not None,
#                pos0Range=None,
#                pos1Range=None,
#                dummy=None,
#                delta_dummy=None,
#                mask=None,
#                dark=None,
#                flat=None,
#                solidangle=None,
#                polarization=None):
#    """
#    Calculate 2D histogram of pos weighted by weights
#
#    Splitting is done on the pixel's bounding box like fit2D
#
#    @param pos: 3D array with pos0; Corner A,B,C,D; tth or chi
#    @param weights: array with intensities
#    @param bins: number of output bins, int or 2-tuple of int
#    @param pos0Range: minimum and maximum of the 2th range
#    @param pos1Range: minimum and maximum of the chi range
#    @param dummy: value for bins without pixels
#    @param delta_dummy: precision of dummy value
#    @param mask: array (of int8) with masked pixels with 1 (0=not masked)
#    @param dark: array (of float64) with dark noise to be subtracted (or None)
#    @param flat: array (of float64) with flat-field image
#    @param polarization: array (of float64) with polarization correction
#    @param solidangle: array (of float64) with solid angle corrections
#    @return: I, edges0, edges1, weighted histogram(2D), unweighted histogram (2D)
#    """
#
#    cdef size_t bins0=0, bins1=0, size = weights.size
#    if pos.ndim > 3:  # create a view
#        pos = pos.reshape((-1, 4, 2))
#
#    assert pos.shape[0] == size
#    assert pos.shape[1] == 4  # 4 corners
#    assert pos.shape[2] == 2  # tth and chi
#    assert pos.ndim == 3
#    try:
#        bins0, bins1 = tuple(bins)
#    except:
#        bins0 = bins1 = < size_t > bins
#    if bins0 <= 0:
#        bins0 = 1
#    if bins1 <= 0:
#        bins1 = 1
#
#    cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = pos.astype(numpy.float64)
#    cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = weights.astype(numpy.float64).ravel()
#    cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#    cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#    cdef numpy.ndarray[numpy.float64_t, ndim = 2] outMerge = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#    cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges0 = numpy.zeros(bins0, dtype=numpy.float64)
#    cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges1 = numpy.zeros(bins1, dtype=numpy.float64)
#    cdef numpy.int8_t[:] cmask
#    cdef double[:] cflat, cdark, cpolarization, csolidangle
#
#    cdef bint check_mask=False, do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
#
#    cdef double cdummy=0, cddummy=0, data=0
#    cdef double min0=0, max0=0, min1=0, max1=0, deltaR=0, deltaL=0, deltaU=0, deltaD=0, deltaA=0
#    cdef double pos0_min=0, pos0_max=0, pos1_min=0, pos1_max=0, pos0_maxin=0, pos1_maxin=0
#    cdef double areaPixel=0, fbin0_min=0, fbin0_max=0, fbin1_min=0, fbin1_max=0
#    cdef double a0=0, a1=0, b0=0, b1=0, c0=0, c1=0, d0=0, d1=0
#    cdef double epsilon = 1e-10
#
#    cdef size_t bin0_max=0, bin0_min=0, bin1_max=0, bin1_min=0, i=0, j=0, idx=0
#
#    if pos0Range is not None and len(pos0Range) == 2:
#        pos0_min = min(pos0Range)
#        pos0_maxin = max(pos0Range)
#    else:
#        pos0_min = pos[:, :, 0].min()
#        pos0_maxin = pos[:, :, 0].max()
#    pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
#
#    if pos1Range is not None and len(pos1Range) > 1:
#        pos1_min = min(pos1Range)
#        pos1_maxin = max(pos1Range)
#    else:
#        pos1_min = pos[:, :, 1].min()
#        pos1_maxin = pos[:, :, 1].max()
#    pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
#
#    cdef double dpos0 = (pos0_max - pos0_min) / (< double > (bins0))
#    cdef double dpos1 = (pos1_max - pos1_min) / (< double > (bins1))
#    edges0 = numpy.linspace(pos0_min + 0.5 * dpos0, pos0_maxin - 0.5 * dpos0, bins0)
#    edges1 = numpy.linspace(pos1_min + 0.5 * dpos1, pos1_maxin - 0.5 * dpos1, bins1)
#
#    if (dummy is not None) and (delta_dummy is not None):
#        check_dummy = True
#        cdummy = float(dummy)
#        cddummy = float(delta_dummy)
#    elif (dummy is not None):
#        check_dummy = True
#        cdummy = float(dummy)
#        cddummy = 0.0
#    else:
#        check_dummy = False
#        cdummy = 0.0
#        cddummy = 0.0
#
#    if mask is not None:
#        check_mask = True
#        assert mask.size == size
#        cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
#    if dark is not None:
#        do_dark = True
#        assert dark.size == size
#        cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
#    if flat is not None:
#        do_flat = True
#        assert flat.size == size
#        cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
#    if polarization is not None:
#        do_polarization = True
#        assert polarization.size == size
#        cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
#    if solidangle is not None:
#        do_solidangle = True
#        assert solidangle.size == size
#        csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
#
#    with nogil:
#        for idx in range(size):
#
#            if (check_mask) and (cmask[idx]):
#                continue
#
#            data = cdata[idx]
#            if check_dummy and ((cddummy == 0.0 and data == cdummy) or (cddummy != 0.0 and fabs(data - cdummy) <= cddummy)):
#                continue
#
#            a0 = cpos[idx, 0, 0]
#            a1 = cpos[idx, 0, 1]
#            b0 = cpos[idx, 1, 0]
#            b1 = cpos[idx, 1, 1]
#            c0 = cpos[idx, 2, 0]
#            c1 = cpos[idx, 2, 1]
#            d0 = cpos[idx, 3, 0]
#            d1 = cpos[idx, 3, 1]
#
#            min0 = min4f(a0, b0, c0, d0)
#            max0 = max4f(a0, b0, c0, d0)
#            min1 = min4f(a1, b1, c1, d1)
#            max1 = max4f(a1, b1, c1, d1)
#
#            if (max0 < pos0_min) or (min0 > pos0_maxin) or (max1 < pos1_min) or (min1 > pos1_maxin):
#                continue
#
#            if do_dark:
#                data -= cdark[idx]
#            if do_flat:
#                data /= cflat[idx]
#            if do_polarization:
#                data /= cpolarization[idx]
#            if do_solidangle:
#                data /= csolidangle[idx]
#
#            if min0 < pos0_min:
#                data = data * (pos0_min - min0) / (max0 - min0)
#                min0 = pos0_min
#            if min1 < pos1_min:
#                data = data * (pos1_min - min1) / (max1 - min1)
#                min1 = pos1_min
#            if max0 > pos0_maxin:
#                data = data * (max0 - pos0_maxin) / (max0 - min0)
#                max0 = pos0_maxin
#            if max1 > pos1_maxin:
#                data = data * (max1 - pos1_maxin) / (max1 - min1)
#                max1 = pos1_maxin
#
#            # treat data for pixels on the chi discontinuity
#            if ((max1 - min1) / dpos1) > (bins1 / 2.0):
#                if pos1_maxin - max1 > min1 - pos1_min:
#                    min1 = max1
#                    max1 = pos1_maxin
#                else:
#                    max1 = min1
#                    min1 = pos1_min
#
#            fbin0_min = getBinNr(min0, pos0_min, dpos0)
#            fbin0_max = getBinNr(max0, pos0_min, dpos0)
#            fbin1_min = getBinNr(min1, pos1_min, dpos1)
#            fbin1_max = getBinNr(max1, pos1_min, dpos1)
#
#            bin0_min = < size_t > fbin0_min
#            bin0_max = < size_t > fbin0_max
#            bin1_min = < size_t > fbin1_min
#            bin1_max = < size_t > fbin1_max
#
#            if bin0_min == bin0_max:
#                if bin1_min == bin1_max:
#                    # the whole pixel falls within a single bin
#                    outCount[bin0_min, bin1_min] += 1.0
#                    outData[bin0_min, bin1_min] += data
#                else:
#                    # spread along dim 1 over 2 bins or more
#                    areaPixel = fbin1_max - fbin1_min
#                    deltaD = (< double > (bin1_min + 1)) - fbin1_min
#                    deltaU = fbin1_max - (< double > bin1_max)
#                    deltaA = 1.0 / areaPixel
#
#                    outCount[bin0_min, bin1_min] += deltaA * deltaD
#                    outData[bin0_min, bin1_min] += data * deltaA * deltaD
#
#                    outCount[bin0_min, bin1_max] += deltaA * deltaU
#                    outData[bin0_min, bin1_max] += data * deltaA * deltaU
#                    # if bin1_min + 1 < bin1_max:
#                    for j in range(bin1_min + 1, bin1_max):
#                        outCount[bin0_min, j] += deltaA
#                        outData[bin0_min, j] += data * deltaA
#
#            else:  # spread along dim 0 over 2 bins or more
#                if bin1_min == bin1_max:
#                    # the whole pixel falls in one bin along dim 1
#                    areaPixel = fbin0_max - fbin0_min
#                    deltaL = (< double > (bin0_min + 1)) - fbin0_min
#                    deltaA = deltaL / areaPixel
#                    outCount[bin0_min, bin1_min] += deltaA
#                    outData[bin0_min, bin1_min] += data * deltaA
#                    deltaR = fbin0_max - (< double > bin0_max)
#                    deltaA = deltaR / areaPixel
#                    outCount[bin0_max, bin1_min] += deltaA
#                    outData[bin0_max, bin1_min] += data * deltaA
#                    deltaA = 1.0 / areaPixel
#                    for i in range(bin0_min + 1, bin0_max):
#                        outCount[i, bin1_min] += deltaA
#                        outData[i, bin1_min] += data * deltaA
#                else:
#                    # spread over n bins in dim 0 and m bins in dim 1
#                    areaPixel = (fbin0_max - fbin0_min) * (fbin1_max - fbin1_min)
#                    deltaL = (< double > (bin0_min + 1.0)) - fbin0_min
#                    deltaR = fbin0_max - (< double > bin0_max)
#                    deltaD = (< double > (bin1_min + 1.0)) - fbin1_min
#                    deltaU = fbin1_max - (< double > bin1_max)
#                    deltaA = 1.0 / areaPixel
#
#                    outCount[bin0_min, bin1_min] += deltaA * deltaL * deltaD
#                    outData[bin0_min, bin1_min] += data * deltaA * deltaL * deltaD
#
#                    outCount[bin0_min, bin1_max] += deltaA * deltaL * deltaU
#                    outData[bin0_min, bin1_max] += data * deltaA * deltaL * deltaU
#
#                    outCount[bin0_max, bin1_min] += deltaA * deltaR * deltaD
#                    outData[bin0_max, bin1_min] += data * deltaA * deltaR * deltaD
#
#                    outCount[bin0_max, bin1_max] += deltaA * deltaR * deltaU
#                    outData[bin0_max, bin1_max] += data * deltaA * deltaR * deltaU
#                    for i in range(bin0_min + 1, bin0_max):
#                        outCount[i, bin1_min] += deltaA * deltaD
#                        outData[i, bin1_min] += data * deltaA * deltaD
#                        for j in range(bin1_min + 1, bin1_max):
#                            outCount[i, j] += deltaA
#                            outData[i, j] += data * deltaA
#                        outCount[i, bin1_max] += deltaA * deltaU
#                        outData[i, bin1_max] += data * deltaA * deltaU
#                    for j in range(bin1_min + 1, bin1_max):
#                        outCount[bin0_min, j] += deltaA * deltaL
#                        outData[bin0_min, j] += data * deltaA * deltaL
#
#                        outCount[bin0_max, j] += deltaA * deltaR
#                        outData[bin0_max, j] += data * deltaA * deltaR
#
#    # with nogil:
#    for i in range(bins0):
#        for j in range(bins1):
#            if outCount[i, j] > epsilon:
#                outMerge[i, j] = outData[i, j] / outCount[i, j]
#            else:
#                outMerge[i, j] = cdummy
#    return outMerge.T, edges0, edges1, outData.T, outCount.T

File diff suppressed because it is too large

@@ -0,0 +1,974 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#    Project: Azimuthal integration
#             https://github.com/kif/pyFAI
#
#    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
#    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#

import cython
import os, sys
from cython.parallel import prange
from libc.string cimport memset
import numpy
cimport numpy
from libc.math cimport fabs, M_PI, floor
from libc.stdio cimport printf

cdef float pi = <float> M_PI
cdef float onef = <float> 1.0
try:
    from fastcrc import crc32
except ImportError:
    from zlib import crc32
EPS32 = (1.0 + numpy.finfo(numpy.float32).eps)


cdef struct Function:
    double slope
    double intersect

cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
    """
    Calculate the area of the ABCD quadrilateral with corners:
      A(a0, a1)
      B(b0, b1)
      C(c0, c1)
      D(d0, d1)
    @return: area, i.e. 1/2 * (AC ^ BD)
    """
    return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))
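# Example: for the unit square A(0,0), B(1,0), C(1,1), D(0,1), the diagonals
# are AC = (1,1) and BD = (-1,1), so area4 returns 0.5*|1*1 - 1*(-1)| = 1.0,
# the expected area.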

@cython.cdivision(True)
cdef inline double getBinNr(double x0, double pos0_min, double delta) nogil:
    """
    Calculate the (fractional) bin number for any point
    param x0: current position
    param pos0_min: position minimum
    param delta: bin width
    """
    return (x0 - pos0_min) / delta
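# Example: getBinNr(2.5, 1.0, 0.5) == 3.0, i.e. the point sits exactly on the
# left edge of bin 3.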

cdef double integrate(double A0, double B0, Function AB) nogil:
    """
    Integrates the line defined by AB, from A0 to B0
    param A0: first limit
    param B0: second limit
    param AB: struct with the slope and y-intercept (`intersect`) of the line
    """
    if A0 == B0:
        return 0.0
    else:
        return AB.slope * (B0 * B0 - A0 * A0) * 0.5 + AB.intersect * (B0 - A0)

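# Example: for a line y = 2*x + 1 (slope=2, intersect=1), integrating from 0
# to 3 gives 2*(9 - 0)*0.5 + 1*(3 - 0) = 12, matching the analytic integral
# of 2*x + 1 over [0, 3].
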
class HistoLUT1dFullSplit(object):
    """
    Now uses CSR (Compressed Sparse Row) with main attributes:
    * nnz: number of non-zero elements
    * data: coefficients of the matrix in a 1D vector of float32
    * indices: column index position for the data (same size as data)
    * indptr: row pointer indicating the start of a given row; length nrow+1

    Note: nnz = indptr[-1]
    """
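    # Example CSR layout: the 2x4 matrix [[0, .3, .7, 0], [0, 0, .5, .5]] is
    # stored as data=[.3, .7, .5, .5], indices=[1, 2, 2, 3], indptr=[0, 2, 4];
    # row i owns the slice data[indptr[i]:indptr[i+1]], and nnz == indptr[-1].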
    @cython.boundscheck(False)
    def __init__(self,
                 numpy.ndarray pos not None,
                 int bins=100,
                 pos0Range=None,
                 pos1Range=None,
                 mask=None,
                 mask_checksum=None,
                 allow_pos0_neg=False,
                 unit="undefined"):
        """
        @param pos: 3D or 4D array with the coordinates of each pixel point
        @param bins: number of output bins, 100 by default
        @param pos0Range: minimum and maximum of the 2th range
        @param pos1Range: minimum and maximum of the chi range
        @param mask: array (of int8) with masked pixels with 1 (0=not masked)
        @param allow_pos0_neg: allow negative pos0 values (q<0 is usually not possible)
        @param unit: can be 2th_deg or r_nm^-1 ...
        """

        # self.padding = int(padding)
        if pos.ndim > 3:  # create a view
            pos = pos.reshape((-1, 4, 2))
        assert pos.shape[1] == 4
        assert pos.shape[2] == 2
        assert pos.ndim == 3
        self.pos = pos
        self.size = pos.shape[0]
        self.bins = bins
        self.lut_size = 0
        self.allow_pos0_neg = allow_pos0_neg
        if mask is not None:
            assert mask.size == self.size
            self.check_mask = True
            self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
            if mask_checksum:
                self.mask_checksum = mask_checksum
            else:
                self.mask_checksum = crc32(mask)
        else:
            self.check_mask = False
            self.mask_checksum = None
        self.data = self.nnz = self.indices = self.indptr = None
        self.pos0Range = pos0Range
        self.pos1Range = pos1Range

        self.calc_lut()
        self.outPos = numpy.linspace(self.pos0_min + 0.5 * self.delta, self.pos0_maxin - 0.5 * self.delta, self.bins)
        self.lut_checksum = crc32(self.data)
        self.unit = unit
        self.lut = (self.data, self.indices, self.indptr)
        self.lut_nbytes = sum([i.nbytes for i in self.lut])
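    # Typical use (sketch mirroring the test scripts further below in this
    # commit; `ai` and `img` are assumed): build the sparse projector once,
    # then integrate many images.
    #   pos = ai.array_from_unit(img.shape, "corner", unit="2th_deg")
    #   csr = HistoLUT1dFullSplit(pos, bins=1000, unit="2th_deg")
    #   tth, I, wgt, cnt = csr.integrate(img)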

    @cython.cdivision(True)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def calc_lut(self):
        cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(self.pos, dtype=numpy.float64)
        cdef numpy.int8_t[:] cmask
        cdef numpy.ndarray[numpy.int32_t, ndim = 1] outMax = numpy.zeros(self.bins, dtype=numpy.int32)
        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros(self.bins + 1, dtype=numpy.int32)
        cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_max=0, pos1_maxin=0
        cdef double max0, min0
        cdef double areaPixel=0, delta=0
        cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
        cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
        cdef double oneOverPixelArea=0, partialArea=0, partialArea2=0, tmp=0
        cdef Function AB, BC, CD, DA
        cdef int bins, i=0, idx=0, bin=0, bin0_max=0, bin0_min=0, pixel_bins=0, k=0, size=0
        cdef bint check_pos1=False, check_mask=False

        bins = self.bins
        if self.pos0Range is not None and len(self.pos0Range) > 1:
            self.pos0_min = min(self.pos0Range)
            self.pos0_maxin = max(self.pos0Range)
        else:
            self.pos0_min = self.pos[:, :, 0].min()
            self.pos0_maxin = self.pos[:, :, 0].max()
        self.pos0_max = self.pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
        if self.pos1Range is not None and len(self.pos1Range) > 1:
            self.pos1_min = min(self.pos1Range)
            self.pos1_maxin = max(self.pos1Range)
            # also set the local flag used by the nogil loops below
            self.check_pos1 = check_pos1 = True
        else:
            self.pos1_min = self.pos[:, :, 1].min()
            self.pos1_maxin = self.pos[:, :, 1].max()
        self.pos1_max = self.pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)

        self.delta = (self.pos0_max - self.pos0_min) / (< double > (bins))

        pos0_min = self.pos0_min
        pos0_max = self.pos0_max
        pos1_min = self.pos1_min
        pos1_max = self.pos1_max
        delta = self.delta

        size = self.size
        check_mask = self.check_mask
        if check_mask:
            cmask = self.cmask

        with nogil:
            for idx in range(size):

                if (check_mask) and (cmask[idx]):
                    continue

                A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
                A1 = < double > cpos[idx, 0, 1]
                B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
                B1 = < double > cpos[idx, 1, 1]
                C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
                C1 = < double > cpos[idx, 2, 1]
                D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
                D1 = < double > cpos[idx, 3, 1]

                min0 = min(A0, B0, C0, D0)
                max0 = max(A0, B0, C0, D0)
                if (max0 < 0) or (min0 >= bins):
                    continue
                if check_pos1:
                    if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
                        continue

                bin0_min = < int > floor(min0)
                bin0_max = < int > floor(max0)

                for bin in range(bin0_min, bin0_max + 1):
                    outMax[bin] += 1

        indptr[1:] = outMax.cumsum()
        self.indptr = indptr
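
        # Two-pass CSR construction: the pass above only counted, per bin, how
        # many pixels contribute (outMax); its cumulative sum yields indptr.
        # The pass below recomputes the same geometry and writes each pixel
        # index and fractional area into indices/data at offset indptr[bin]+k,
        # with k the per-bin fill counter kept in the recycled outMax array.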

        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices = numpy.zeros(indptr[bins], dtype=numpy.int32)
        cdef numpy.ndarray[numpy.float32_t, ndim = 1] data = numpy.zeros(indptr[bins], dtype=numpy.float32)

        # just recycle the outMax array
        memset(&outMax[0], 0, bins * sizeof(numpy.int32_t))

        with nogil:
            for idx in range(size):

                if (check_mask) and (cmask[idx]):
                    continue

                A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
                A1 = < double > cpos[idx, 0, 1]
                B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
                B1 = < double > cpos[idx, 1, 1]
                C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
                C1 = < double > cpos[idx, 2, 1]
                D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
                D1 = < double > cpos[idx, 3, 1]

                min0 = min(A0, B0, C0, D0)
                max0 = max(A0, B0, C0, D0)
                if (max0 < 0) or (min0 >= bins):
                    continue
                if check_pos1:
                    if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
                        continue

                bin0_min = < int > floor(min0)
                bin0_max = < int > floor(max0)

                if bin0_min == bin0_max:
                    # the whole pixel falls within a single bin
                    k = outMax[bin0_min]
                    indices[indptr[bin0_min] + k] = idx
                    data[indptr[bin0_min] + k] = 1.0
                    outMax[bin0_min] += 1  # k+1
                else:  # the pixel is split over several bins
                    AB.slope = (B1 - A1) / (B0 - A0)
                    AB.intersect = A1 - AB.slope * A0
                    BC.slope = (C1 - B1) / (C0 - B0)
                    BC.intersect = B1 - BC.slope * B0
                    CD.slope = (D1 - C1) / (D0 - C0)
                    CD.intersect = C1 - CD.slope * C0
                    DA.slope = (A1 - D1) / (A0 - D0)
                    DA.intersect = D1 - DA.slope * D0
                    areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
                    oneOverPixelArea = 1.0 / areaPixel
                    partialArea2 = 0.0
                    for bin in range(bin0_min, bin0_max + 1):
                        A_lim = (A0<=bin)*(A0<=(bin+1))*bin + (A0>bin)*(A0<=(bin+1))*A0 + (A0>bin)*(A0>(bin+1))*(bin+1)
                        B_lim = (B0<=bin)*(B0<=(bin+1))*bin + (B0>bin)*(B0<=(bin+1))*B0 + (B0>bin)*(B0>(bin+1))*(bin+1)
                        C_lim = (C0<=bin)*(C0<=(bin+1))*bin + (C0>bin)*(C0<=(bin+1))*C0 + (C0>bin)*(C0>(bin+1))*(bin+1)
                        D_lim = (D0<=bin)*(D0<=(bin+1))*bin + (D0>bin)*(D0<=(bin+1))*D0 + (D0>bin)*(D0>(bin+1))*(bin+1)
                        partialArea = integrate(A_lim, B_lim, AB)
                        partialArea += integrate(B_lim, C_lim, BC)
                        partialArea += integrate(C_lim, D_lim, CD)
                        partialArea += integrate(D_lim, A_lim, DA)
                        tmp = fabs(partialArea) * oneOverPixelArea
                        k = outMax[bin]
                        indices[indptr[bin] + k] = idx
                        data[indptr[bin] + k] = tmp
                        outMax[bin] += 1  # k+1
        self.data = data
        self.indices = indices

    @cython.cdivision(True)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
        """
        Actually perform the integration which, in this case, looks more like a matrix-vector product

        @param weights: input image
        @type weights: ndarray
        @param dummy: value for dead pixels (optional)
        @type dummy: float
        @param delta_dummy: precision for dead-pixel value in dynamic masking
        @type delta_dummy: float
        @param dark: array with the dark-current value to be subtracted (if any)
        @type dark: ndarray
        @param flat: array with the flat-field value to be divided by (if any)
        @type flat: ndarray
        @param solidAngle: array with the solid angle of each pixel to be divided by (if any)
        @type solidAngle: ndarray
        @param polarization: array with the polarization correction values to be divided by (if any)
        @type polarization: ndarray
        @return: positions, pattern, weighted_histogram and unweighted_histogram
        @rtype: 4-tuple of ndarrays
        """
        cdef numpy.int32_t i=0, j=0, idx=0, bins=self.bins, size=self.size
        cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
        cdef float data=0, coef=0, cdummy=0, cddummy=0
        cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(self.bins, dtype=numpy.float64)
        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
        cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
        cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization

        cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
        assert size == weights.size

        if dummy is not None:
            do_dummy = True
            cdummy = <float> float(dummy)
            if delta_dummy is None:
                cddummy = <float> 0.0
            else:
                cddummy = <float> float(delta_dummy)

        if flat is not None:
            do_flat = True
            assert flat.size == size
            cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
        if dark is not None:
            do_dark = True
            assert dark.size == size
            cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
        if solidAngle is not None:
            do_solidAngle = True
            assert solidAngle.size == size
            csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
        if polarization is not None:
            do_polarization = True
            assert polarization.size == size
            cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)

        if (do_dark + do_flat + do_polarization + do_solidAngle):
            tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
            cdata = numpy.zeros(size, dtype=numpy.float32)
            if do_dummy:
                for i in prange(size, nogil=True, schedule="static"):
                    data = tdata[i]
                    if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
                        # Note: -= and /= operators are seen as reductions in cython.parallel
                        if do_dark:
                            data = data - cdark[i]
                        if do_flat:
                            data = data / cflat[i]
                        if do_polarization:
                            data = data / cpolarization[i]
                        if do_solidAngle:
                            data = data / csolidAngle[i]
                        cdata[i] += data
                    else:  # set all dummy-like values to cdummy: simplifies further processing
                        cdata[i] += cdummy
            else:
                for i in prange(size, nogil=True, schedule="static"):
                    data = tdata[i]
                    if do_dark:
                        data = data - cdark[i]
                    if do_flat:
                        data = data / cflat[i]
                    if do_polarization:
                        data = data / cpolarization[i]
                    if do_solidAngle:
                        data = data / csolidAngle[i]
                    cdata[i] += data
        else:
            if do_dummy:
                tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
                cdata = numpy.zeros(size, dtype=numpy.float32)
                for i in prange(size, nogil=True, schedule="static"):
                    data = tdata[i]
                    if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
                        cdata[i] += data
                    else:
                        cdata[i] += cdummy
            else:
                cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)

        for i in prange(bins, nogil=True, schedule="guided"):
            sum_data = 0.0
            sum_count = 0.0
            for j in range(indptr[i], indptr[i + 1]):
                idx = indices[j]
                coef = ccoef[j]
                if coef == 0.0:
                    continue
                data = cdata[idx]
                if do_dummy and data == cdummy:
                    continue
                sum_data = sum_data + coef * data
                sum_count = sum_count + coef
            outData[i] += sum_data
            outCount[i] += sum_count
            if sum_count > epsilon:
                outMerge[i] += sum_data / sum_count
            else:
                outMerge[i] += cdummy
        return self.outPos, outMerge, outData, outCount
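
# A cross-check sketch (an illustration, not part of the original file): the
# prange loop in integrate() is a CSR matrix-vector product, so, assuming an
# instance `integrator` and an image `img`, scipy should reproduce the raw sums:
#
#   import numpy
#   from scipy.sparse import csr_matrix
#   csr = csr_matrix(integrator.lut, shape=(integrator.bins, integrator.size))
#   sum_data = csr.dot(img.ravel().astype(numpy.float32))
#   sum_count = csr.dot(numpy.ones(integrator.size, dtype=numpy.float32))
#   merged = numpy.where(sum_count > 1e-10, sum_data / sum_count, 0.0)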


################################################################################
# Two-dimensional regrouping
################################################################################

#class HistoBBox2d(object):
#    @cython.boundscheck(False)
#    def __init__(self,
#                 pos0,
#                 delta_pos0,
#                 pos1,
#                 delta_pos1,
#                 bins=(100, 36),
#                 pos0Range=None,
#                 pos1Range=None,
#                 mask=None,
#                 mask_checksum=None,
#                 allow_pos0_neg=False,
#                 unit="undefined",
#                 chiDiscAtPi=True
#                 ):
#        """
#        @param pos0: 1D array with pos0: tth or q_vect
#        @param delta_pos0: 1D array with delta pos0: max center-corner distance
#        @param pos1: 1D array with pos1: chi
#        @param delta_pos1: 1D array with max pos1: max center-corner distance, unused!
#        @param bins: number of output bins (tth=100, chi=36 by default)
#        @param pos0Range: minimum and maximum of the 2th range
#        @param pos1Range: minimum and maximum of the chi range
#        @param mask: array (of int8) with masked pixels with 1 (0=not masked)
#        @param allow_pos0_neg: allow negative pos0 values (q<0 is usually not possible)
#        @param chiDiscAtPi: boolean; by default the chi range is ]-pi,pi[; set to 0 to have ]0,2pi[
#        """
#        cdef int i, size, bin0, bin1
#        self.size = pos0.size
#        assert delta_pos0.size == self.size
#        assert pos1.size == self.size
#        assert delta_pos1.size == self.size
#        self.chiDiscAtPi = 1 if chiDiscAtPi else 0
#        self.allow_pos0_neg = allow_pos0_neg
#
#        try:
#            bins0, bins1 = tuple(bins)
#        except:
#            bins0 = bins1 = bins
#        if bins0 <= 0:
#            bins0 = 1
#        if bins1 <= 0:
#            bins1 = 1
#        self.bins = (int(bins0), int(bins1))
#        self.lut_size = 0
#        if mask is not None:
#            assert mask.size == self.size
#            self.check_mask = True
#            self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
#            if mask_checksum:
#                self.mask_checksum = mask_checksum
#            else:
#                self.mask_checksum = crc32(mask)
#        else:
#            self.check_mask = False
#            self.mask_checksum = None
#
#        self.data = self.nnz = self.indices = self.indptr = None
#        self.cpos0 = numpy.ascontiguousarray(pos0.ravel(), dtype=numpy.float32)
#        self.dpos0 = numpy.ascontiguousarray(delta_pos0.ravel(), dtype=numpy.float32)
#        self.cpos0_sup = numpy.empty_like(self.cpos0)
#        self.cpos0_inf = numpy.empty_like(self.cpos0)
#        self.pos0Range = pos0Range
#        self.pos1Range = pos1Range
#
#        self.cpos1 = numpy.ascontiguousarray((pos1).ravel(), dtype=numpy.float32)
#        self.dpos1 = numpy.ascontiguousarray((delta_pos1).ravel(), dtype=numpy.float32)
#        self.cpos1_sup = numpy.empty_like(self.cpos1)
#        self.cpos1_inf = numpy.empty_like(self.cpos1)
#        self.calc_boundaries(pos0Range, pos1Range)
#        self.delta0 = (self.pos0_max - self.pos0_min) / float(bins0)
#        self.delta1 = (self.pos1_max - self.pos1_min) / float(bins1)
#        self.lut_max_idx = self.calc_lut()
#        self.outPos0 = numpy.linspace(self.pos0_min + 0.5 * self.delta0, self.pos0_maxin - 0.5 * self.delta0, bins0)
#        self.outPos1 = numpy.linspace(self.pos1_min + 0.5 * self.delta1, self.pos1_maxin - 0.5 * self.delta1, bins1)
#        self.unit = unit
#        self.lut = (self.data, self.indices, self.indptr)
#        self.lut_checksum = crc32(self.data)
#
#    @cython.boundscheck(False)
#    @cython.wraparound(False)
#    def calc_boundaries(self, pos0Range, pos1Range):
#        cdef int size = self.cpos0.size
#        cdef bint check_mask = self.check_mask
#        cdef numpy.int8_t[:] cmask
#        cdef float[:] cpos0, dpos0, cpos0_sup, cpos0_inf
#        cdef float[:] cpos1, dpos1, cpos1_sup, cpos1_inf
#        cdef float upper0, lower0, pos0_max, pos0_min, c0, d0
#        cdef float upper1, lower1, pos1_max, pos1_min, c1, d1
#        cdef bint allow_pos0_neg = self.allow_pos0_neg
#        cdef bint chiDiscAtPi = self.chiDiscAtPi
#
#        cpos0_sup = self.cpos0_sup
#        cpos0_inf = self.cpos0_inf
#        cpos0 = self.cpos0
#        dpos0 = self.dpos0
#        cpos1_sup = self.cpos1_sup
#        cpos1_inf = self.cpos1_inf
#        cpos1 = self.cpos1
#        dpos1 = self.dpos1
#        pos0_min = cpos0[0]
#        pos0_max = cpos0[0]
#        pos1_min = cpos1[0]
#        pos1_max = cpos1[0]
#
#        if check_mask:
#            cmask = self.cmask
#        with nogil:
#            for idx in range(size):
#                c0 = cpos0[idx]
#                d0 = dpos0[idx]
#                lower0 = c0 - d0
#                upper0 = c0 + d0
#                c1 = cpos1[idx]
#                d1 = dpos1[idx]
#                lower1 = c1 - d1
#                upper1 = c1 + d1
#                if not allow_pos0_neg and lower0 < 0:
#                    lower0 = 0
#                if upper1 > (2 - chiDiscAtPi) * pi:
#                    upper1 = (2 - chiDiscAtPi) * pi
#                if lower1 < (-chiDiscAtPi) * pi:
#                    lower1 = (-chiDiscAtPi) * pi
#                cpos0_sup[idx] = upper0
#                cpos0_inf[idx] = lower0
#                cpos1_sup[idx] = upper1
#                cpos1_inf[idx] = lower1
#                if not (check_mask and cmask[idx]):
#                    if upper0 > pos0_max:
#                        pos0_max = upper0
#                    if lower0 < pos0_min:
#                        pos0_min = lower0
#                    if upper1 > pos1_max:
#                        pos1_max = upper1
#                    if lower1 < pos1_min:
#                        pos1_min = lower1
#
#        if pos0Range is not None and len(pos0Range) > 1:
#            self.pos0_min = min(pos0Range)
#            self.pos0_maxin = max(pos0Range)
#        else:
#            self.pos0_min = pos0_min
#            self.pos0_maxin = pos0_max
#
#        if pos1Range is not None and len(pos1Range) > 1:
#            self.pos1_min = min(pos1Range)
#            self.pos1_maxin = max(pos1Range)
#        else:
#            self.pos1_min = pos1_min
#            self.pos1_maxin = pos1_max
#
#        if (not allow_pos0_neg) and self.pos0_min < 0:
#            self.pos0_min = 0
#        self.pos0_max = self.pos0_maxin * EPS32
#        self.cpos0_sup = cpos0_sup
#        self.cpos0_inf = cpos0_inf
#        self.pos1_max = self.pos1_maxin * EPS32
#        self.cpos1_sup = cpos1_sup
#        self.cpos1_inf = cpos1_inf
#
#    @cython.boundscheck(False)
#    @cython.wraparound(False)
#    @cython.cdivision(True)
#    def calc_lut(self):
#        'calculate the max number of elements in the LUT and populate it'
#        cdef float delta0=self.delta0, pos0_min=self.pos0_min, min0, max0, fbin0_min, fbin0_max
#        cdef float delta1=self.delta1, pos1_min=self.pos1_min, min1, max1, fbin1_min, fbin1_max
#        cdef int bin0_min, bin0_max, bins0 = self.bins[0]
#        cdef int bin1_min, bin1_max, bins1 = self.bins[1]
#        cdef numpy.int32_t k, idx, lut_size, i, j, size = self.size
#        cdef bint check_mask
#        cdef float[:] cpos0_sup = self.cpos0_sup
#        cdef float[:] cpos0_inf = self.cpos0_inf
#        cdef float[:] cpos1_inf = self.cpos1_inf
#        cdef float[:] cpos1_sup = self.cpos1_sup
#        cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros((bins0, bins1), dtype=numpy.int32)
#        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros((bins0 * bins1) + 1, dtype=numpy.int32)
#        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices
#        cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
#        cdef numpy.int8_t[:] cmask
#        if self.check_mask:
#            cmask = self.cmask
#            check_mask = True
#        else:
#            check_mask = False
#
#        # NOGIL
#        with nogil:
#            for idx in range(size):
#                if (check_mask) and (cmask[idx]):
#                    continue
#
#                min0 = cpos0_inf[idx]
#                max0 = cpos0_sup[idx]
#                min1 = cpos1_inf[idx]
#                max1 = cpos1_sup[idx]
#
#                bin0_min = < int > getBinNr(min0, pos0_min, delta0)
#                bin0_max = < int > getBinNr(max0, pos0_min, delta0)
#
#                bin1_min = < int > getBinNr(min1, pos1_min, delta1)
#                bin1_max = < int > getBinNr(max1, pos1_min, delta1)
#
#                if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
#                    continue
#
#                if bin0_max >= bins0:
#                    bin0_max = bins0 - 1
#                if bin0_min < 0:
#                    bin0_min = 0
#                if bin1_max >= bins1:
#                    bin1_max = bins1 - 1
#                if bin1_min < 0:
#                    bin1_min = 0
#
#                for i in range(bin0_min, bin0_max + 1):
#                    for j in range(bin1_min, bin1_max + 1):
#                        outMax[i, j] += 1
#
#        self.nnz = outMax.sum()
#        indptr[1:] = outMax.cumsum()
#        self.indptr = indptr
#        # self.lut_size = lut_size = outMax.max()
#        # just recycle the outMax array
#        # outMax = numpy.zeros((bins0, bins1), dtype=numpy.int32)
#        memset(&outMax[0, 0], 0, bins0 * bins1 * sizeof(numpy.int32_t))
#
#        lut_nbytes = self.nnz * (sizeof(numpy.float32_t) + sizeof(numpy.int32_t)) + bins0 * bins1 * sizeof(numpy.int32_t)
#        if (os.name == "posix") and ("SC_PAGE_SIZE" in os.sysconf_names) and ("SC_PHYS_PAGES" in os.sysconf_names):
#            memsize = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
#            if memsize < lut_nbytes:
#                raise MemoryError("CSR Matrix is %.3fGB whereas the memory of the system is only %s" % (lut_nbytes, memsize))
#        # else hope we have enough memory
#        data = numpy.zeros(self.nnz, dtype=numpy.float32)
#        indices = numpy.zeros(self.nnz, dtype=numpy.int32)
#        # lut = numpy.recarray(shape=(bins0, bins1, lut_size), dtype=[("idx", numpy.int32), ("coef", numpy.float32)])
#        # memset(&lut[0, 0, 0], 0, lut_nbytes)
#        with nogil:
#            for idx in range(size):
#                if (check_mask) and cmask[idx]:
#                    continue
#
#                min0 = cpos0_inf[idx]
#                max0 = cpos0_sup[idx]
#                min1 = cpos1_inf[idx]
#                max1 = cpos1_sup[idx]
#
#                fbin0_min = getBinNr(min0, pos0_min, delta0)
#                fbin0_max = getBinNr(max0, pos0_min, delta0)
#                fbin1_min = getBinNr(min1, pos1_min, delta1)
#                fbin1_max = getBinNr(max1, pos1_min, delta1)
#
#                bin0_min = <int> fbin0_min
#                bin0_max = <int> fbin0_max
#                bin1_min = <int> fbin1_min
#                bin1_max = <int> fbin1_max
#
#                if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
#                    continue
#
#                if bin0_max >= bins0:
#                    bin0_max = bins0 - 1
#                if bin0_min < 0:
#                    bin0_min = 0
#                if bin1_max >= bins1:
#                    bin1_max = bins1 - 1
#                if bin1_min < 0:
#                    bin1_min = 0
#
#                if bin0_min == bin0_max:
#                    if bin1_min == bin1_max:
#                        # the whole pixel falls within a single bin
#                        k = outMax[bin0_min, bin1_min]
#                        indices[indptr[bin0_min * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_min] + k] = onef
#                        # lut[bin0_min, bin1_min, k].idx = idx
#                        # lut[bin0_min, bin1_min, k].coef = onef
#                        outMax[bin0_min, bin1_min] = k + 1
#
#                    else:
#                        # spread along dim 1 over 2 bins or more
#                        deltaD = (< float > (bin1_min + 1)) - fbin1_min
#                        deltaU = fbin1_max - (< float > bin1_max)
#                        deltaA = 1.0 / (fbin1_max - fbin1_min)
#
#                        k = outMax[bin0_min, bin1_min]
#                        indices[indptr[bin0_min * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_min] + k] = deltaA * deltaD
#                        # lut[bin0_min, bin1_min, k].idx = idx
#                        # lut[bin0_min, bin1_min, k].coef = deltaA * deltaD
#                        outMax[bin0_min, bin1_min] = k + 1
#
#                        k = outMax[bin0_min, bin1_max]
#                        indices[indptr[bin0_min * bins1 + bin1_max] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_max] + k] = deltaA * deltaU
#                        # lut[bin0_min, bin1_max, k].idx = idx
#                        # lut[bin0_min, bin1_max, k].coef = deltaA * deltaU
#                        outMax[bin0_min, bin1_max] = k + 1
#
#                        for j in range(bin1_min + 1, bin1_max):
#                            k = outMax[bin0_min, j]
#                            indices[indptr[bin0_min * bins1 + j] + k] = idx
#                            data[indptr[bin0_min * bins1 + j] + k] = deltaA
#                            # lut[bin0_min, j, k].idx = idx
#                            # lut[bin0_min, j, k].coef = deltaA
#                            outMax[bin0_min, j] = k + 1
#
#                else:  # spread along dim 0 over 2 bins or more
#                    if bin1_min == bin1_max:
#                        # the whole pixel falls in one bin along dim 1
#                        deltaA = 1.0 / (fbin0_max - fbin0_min)
#                        deltaL = (< float > (bin0_min + 1)) - fbin0_min
#
#                        k = outMax[bin0_min, bin1_min]
#                        indices[indptr[bin0_min * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_min] + k] = deltaA * deltaL
#                        # lut[bin0_min, bin1_min, k].idx = idx
#                        # lut[bin0_min, bin1_min, k].coef = deltaA * deltaL
#                        outMax[bin0_min, bin1_min] = k + 1
#
#                        deltaR = fbin0_max - (< float > bin0_max)
#
#                        k = outMax[bin0_max, bin1_min]
#                        indices[indptr[bin0_max * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_max * bins1 + bin1_min] + k] = deltaA * deltaR
#                        # lut[bin0_max, bin1_min, k].idx = idx
#                        # lut[bin0_max, bin1_min, k].coef = deltaA * deltaR
#                        outMax[bin0_max, bin1_min] = k + 1
#
#                        for i in range(bin0_min + 1, bin0_max):
#                            k = outMax[i, bin1_min]
#                            indices[indptr[i * bins1 + bin1_min] + k] = idx
#                            data[indptr[i * bins1 + bin1_min] + k] = deltaA
#                            # lut[i, bin1_min, k].idx = idx
#                            # lut[i, bin1_min, k].coef = deltaA
#                            outMax[i, bin1_min] = k + 1
#
#                    else:
#                        # spread over n bins in dim 0 and m bins in dim 1
#                        deltaL = (< float > (bin0_min + 1)) - fbin0_min
#                        deltaR = fbin0_max - (< float > bin0_max)
#                        deltaD = (< float > (bin1_min + 1)) - fbin1_min
#                        deltaU = fbin1_max - (< float > bin1_max)
#                        deltaA = 1.0 / ((fbin0_max - fbin0_min) * (fbin1_max - fbin1_min))
#
#                        k = outMax[bin0_min, bin1_min]
#                        indices[indptr[bin0_min * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_min] + k] = deltaA * deltaL * deltaD
#                        # lut[bin0_min, bin1_min, k].idx = idx
#                        # lut[bin0_min, bin1_min, k].coef = deltaA * deltaL * deltaD
#                        outMax[bin0_min, bin1_min] = k + 1
#
#                        k = outMax[bin0_min, bin1_max]
#                        indices[indptr[bin0_min * bins1 + bin1_max] + k] = idx
#                        data[indptr[bin0_min * bins1 + bin1_max] + k] = deltaA * deltaL * deltaU
#                        # lut[bin0_min, bin1_max, k].idx = idx
#                        # lut[bin0_min, bin1_max, k].coef = deltaA * deltaL * deltaU
#                        outMax[bin0_min, bin1_max] = k + 1
#
#                        k = outMax[bin0_max, bin1_min]
#                        indices[indptr[bin0_max * bins1 + bin1_min] + k] = idx
#                        data[indptr[bin0_max * bins1 + bin1_min] + k] = deltaA * deltaR * deltaD
#                        # lut[bin0_max, bin1_min, k].idx = idx
#                        # lut[bin0_max, bin1_min, k].coef = deltaA * deltaR * deltaD
#                        outMax[bin0_max, bin1_min] = k + 1
#
#                        k = outMax[bin0_max, bin1_max]
#                        indices[indptr[bin0_max * bins1 + bin1_max] + k] = idx
#                        data[indptr[bin0_max * bins1 + bin1_max] + k] = deltaA * deltaR * deltaU
#                        # lut[bin0_max, bin1_max, k].idx = idx
#                        # lut[bin0_max, bin1_max, k].coef = deltaA * deltaR * deltaU
#                        outMax[bin0_max, bin1_max] = k + 1
#
#                        for i in range(bin0_min + 1, bin0_max):
#                            k = outMax[i, bin1_min]
#                            indices[indptr[i * bins1 + bin1_min] + k] = idx
#                            data[indptr[i * bins1 + bin1_min] + k] = deltaA * deltaD
#                            # lut[i, bin1_min, k].idx = idx
#                            # lut[i, bin1_min, k].coef = deltaA * deltaD
#                            outMax[i, bin1_min] = k + 1
#
#                            for j in range(bin1_min + 1, bin1_max):
#                                k = outMax[i, j]
#                                indices[indptr[i * bins1 + j] + k] = idx
#                                data[indptr[i * bins1 + j] + k] = deltaA
#                                # lut[i, j, k].idx = idx
#                                # lut[i, j, k].coef = deltaA
#                                outMax[i, j] = k + 1
#
#                            k = outMax[i, bin1_max]
#                            indices[indptr[i * bins1 + bin1_max] + k] = idx
#                            data[indptr[i * bins1 + bin1_max] + k] = deltaA * deltaU
#                            # lut[i, bin1_max, k].idx = idx
#                            # lut[i, bin1_max, k].coef = deltaA * deltaU
#                            outMax[i, bin1_max] = k + 1
#
#                        for j in range(bin1_min + 1, bin1_max):
#                            k = outMax[bin0_min, j]
#                            indices[indptr[bin0_min * bins1 + j] + k] = idx
#                            data[indptr[bin0_min * bins1 + j] + k] = deltaA * deltaL
#                            # lut[bin0_min, j, k].idx = idx
#                            # lut[bin0_min, j, k].coef = deltaA * deltaL
#                            outMax[bin0_min, j] = k + 1
#
#                            k = outMax[bin0_max, j]
#                            indices[indptr[bin0_max * bins1 + j] + k] = idx
#                            data[indptr[bin0_max * bins1 + j] + k] = deltaA * deltaR
#                            # lut[bin0_max, j, k].idx = idx
#                            # lut[bin0_max, j, k].coef = deltaA * deltaR
#                            outMax[bin0_max, j] = k + 1
#
#        self.data = data
#        self.indices = indices
#        return outMax
#
#    @cython.cdivision(True)
#    @cython.boundscheck(False)
#    @cython.wraparound(False)
#    def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
#        """
#        Actually perform the 2D integration which, in this case, looks more like a matrix-vector product
#
#        @param weights: input image
#        @type weights: ndarray
#        @param dummy: value for dead pixels (optional)
#        @type dummy: float
#        @param delta_dummy: precision for dead-pixel value in dynamic masking
#        @type delta_dummy: float
#        @param dark: array with the dark-current value to be subtracted (if any)
#        @type dark: ndarray
#        @param flat: array with the flat-field value to be divided by (if any)
#        @type flat: ndarray
#        @param solidAngle: array with the solid angle of each pixel to be divided by (if any)
#        @type solidAngle: ndarray
#        @param polarization: array with the polarization correction values to be divided by (if any)
#        @type polarization: ndarray
#        @return: I(2d), edges0(1d), edges1(1d), weighted histogram(2d), unweighted histogram (2d)
#        @rtype: 5-tuple of ndarrays
#        """
#        cdef int i=0, j=0, idx=0, bins0=self.bins[0], bins1=self.bins[1], bins=bins0*bins1, size=self.size
#        cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
#        cdef float data=0, coef=0, cdummy=0, cddummy=0
#        cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
#        cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros(self.bins, dtype=numpy.float64)
#        cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
#        cdef numpy.ndarray[numpy.float32_t, ndim = 2] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
#        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData_1d = outData.ravel()
#        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount_1d = outCount.ravel()
#        cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge_1d = outMerge.ravel()
#
#        cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
#        cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
#
#        assert size == weights.size
#
#        if dummy is not None:
#            do_dummy = True
#            cdummy = <float> float(dummy)
#            if delta_dummy is None:
#                cddummy = <float> 0.0
#            else:
#                cddummy = <float> float(delta_dummy)
#
#        if flat is not None:
#            do_flat = True
#            assert flat.size == size
#            cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
#        if dark is not None:
#            do_dark = True
#            assert dark.size == size
#            cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
#        if solidAngle is not None:
#            do_solidAngle = True
#            assert solidAngle.size == size
#            csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
#        if polarization is not None:
#            do_polarization = True
#            assert polarization.size == size
#            cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
#
#        if (do_dark + do_flat + do_polarization + do_solidAngle):
#            tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#            cdata = numpy.zeros(size, dtype=numpy.float32)
#            if do_dummy:
#                for i in prange(size, nogil=True, schedule="static"):
#                    data = tdata[i]
#                    if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
#                        # Note: -= and /= operators are seen as reductions in cython.parallel
#                        if do_dark:
#                            data = data - cdark[i]
#                        if do_flat:
#                            data = data / cflat[i]
#                        if do_polarization:
#                            data = data / cpolarization[i]
#                        if do_solidAngle:
#                            data = data / csolidAngle[i]
#                        cdata[i] += data
#                    else:  # set all dummy-like values to cdummy: simplifies further processing
#                        cdata[i] += cdummy
#            else:
#                for i in prange(size, nogil=True, schedule="static"):
#                    data = tdata[i]
#                    if do_dark:
#                        data = data - cdark[i]
#                    if do_flat:
#                        data = data / cflat[i]
#                    if do_polarization:
#                        data = data / cpolarization[i]
#                    if do_solidAngle:
#                        data = data / csolidAngle[i]
#                    cdata[i] += data
#        else:
#            if do_dummy:
#                tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#                cdata = numpy.zeros(size, dtype=numpy.float32)
#                for i in prange(size, nogil=True, schedule="static"):
#                    data = tdata[i]
#                    if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
#                        cdata[i] += data
#                    else:
#                        cdata[i] += cdummy
#            else:
#                cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#
#        for i in prange(bins, nogil=True, schedule="guided"):
#            sum_data = 0.0
#            sum_count = 0.0
#            for j in range(indptr[i], indptr[i + 1]):
#                idx = indices[j]
#                coef = ccoef[j]
#                data = cdata[idx]
#                if do_dummy and data == cdummy:
#                    continue
#
#                sum_data = sum_data + coef * data
#                sum_count = sum_count + coef
#            outData_1d[i] += sum_data
#            outCount_1d[i] += sum_count
#            if sum_count > epsilon:
#                outMerge_1d[i] += sum_data / sum_count
#            else:
#                outMerge_1d[i] += cdummy
#        return outMerge.T, self.outPos0, self.outPos1, outData.T, outCount.T

@@ -25,7 +25,7 @@ data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.integrate2d(data, 100, 360, method="lut", unit="2th_deg")[0]
obt = ai.integrate2d(data, 100, 360, method="ocl_csr", unit="2th_deg")[0]
##logger.debug("check LUT basics: %s" % abs(obt[1] - ref[1]).max())
#assert numpy.allclose(ref,obt)
assert numpy.allclose(ref, obt)


plot(ref.ravel(), label="ocl_lut")


@@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014

@author: ashiotis
"""

import sys, numpy, time
import utilstest
import fabio, pyopencl
from pylab import *
print("#" * 50)
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_azim_csr
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")


ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.xrpd_LUT(data, 1000)[1]
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]

#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]

pos = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")

boo = foo.integrate(data)[1]

foo2 = ocl_azim_csr.OCL_CSR_Integrator(foo.lut, data.size, "GPU", block_size=32)
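# foo2 reuses the CSR triplet (data, indices, indptr) built on the CPU by
# HistoLUT1dFullSplit above; block_size=32 sets the OpenCL workgroup size.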
boo2 = foo2.integrate(data)[0]

plot(ref, label="ocl_csr")
plot(boo, label="csr_fullsplit")
plot(boo2, label="ocl_csr_fullsplit")
legend()
show()
raw_input()

@@ -0,0 +1,44 @@

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014

@author: ashiotis
"""

import sys, numpy, time
import utilstest
import fabio, pyopencl
from pylab import *
print("#" * 50)
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_hist_pixelsplit
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")


ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.xrpd_LUT(data, 1000)[1]
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]

#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")

pos = pos_in.reshape(pos_in.size // 8, 4, 2)

foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")

boo = foo.integrate(data)[1]

foo2 = ocl_hist_pixelsplit.OCL_Hist_Pixelsplit(pos, 1000, data.size, devicetype="cpu", block_size=32)
boo2 = foo2.integrate(data)[2]

#plot(ref, label="ocl_csr")
#plot(boo, label="csr_fullsplit")
plot(boo2, label="ocl_csr_fullsplit")
legend()
show()
raw_input()

@@ -0,0 +1,80 @@

# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014

@author: ashiotis
"""

import sys, numpy, time
import utilstest
import fabio
import pyopencl as cl
import pyopencl.array  # needed so that cl.array.to_device below resolves
from pylab import *
print("#" * 50)
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_hist_pixelsplit
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")


ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data

workgroup_size = 256
bins = 1000

pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")

pos = pos_in.reshape(pos_in.size // 8, 4, 2)

pos_size = pos.size
size = pos_size // 8


ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

d_input = cl.array.to_device(queue, pos)
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4 * workgroup_size)
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4)

with open("../openCL/ocl_hist_pixelsplit.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()

compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
                  (bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(pos_size), d_preresult)

program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
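# Two-stage reduction: reduce1 lets each of the workgroup_size workgroups
# compute a partial (min0, max0, min1, max1) over its slice of the corner
# coordinates into d_preresult; reduce2 then folds those partial vectors into
# the single 4-float result that is read back and compared to the numpy
# min/max below.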

result = numpy.ndarray(4, dtype=numpy.float32)

cl.enqueue_copy(queue, result, d_result)


min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)

print(minmax)
print(result)




#plot(ref, label="ocl_csr")
#plot(boo, label="csr_fullsplit")
#plot(boo2, label="ocl_csr_fullsplit")
#legend()
#show()
#raw_input()