Merge remote branch 'giannis/master'

Conflicts: doc/source/pyFAI.rst openCL/ocl_azim_CSR.cl pyFAI-src/ocl_azim_csr_dis.py src/morphology.c
2014-06-16 14:13:16 +02:00 · 2014-06-16 14:13:16 +02:00 · 5c914a008e
parent d28a78766b e1531f2c46
commit 5c914a008e
23 changed files with 52761 additions and 66 deletions
--- a/benchmark/benchmark_ocl_csr3.py
+++ b/benchmark/benchmark_ocl_csr3.py
@ -6,6 +6,7 @@ from __future__ import print_function, division
 import json, sys, time, timeit, os, platform, subprocess
 import numpy
 from numpy import log2
 import fabio
 import os.path as op
 sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
@ -58,6 +59,8 @@ class Bench(object):
        self.repeat = repeat
        self.nbr = nbr
        self.results = {}
        self.flops = {}
        self.mem_band = {}
        self.meth = []
        self._cpu = None
        self.fig = None
@ -172,8 +175,10 @@ data = fabio.open(r"%s").data
            print("Working on processor: %s" % self.get_cpu())
            label = "1D_" + self.LABELS[method]
        results = {}
        flops = {}
        mem_band = {}
        first = True
-        param = "Frelon2k.poni"
+        param = "Pilatus1M.poni"
        block_size_list = [1,2,4,8,16,32,64,128,256]
        for block_size in block_size_list:
            self.update_mp()
@ -190,6 +195,14 @@ data = fabio.open(r"%s").data
            if check:
                if "csr" in method:
                    print("csr: size= %s \t nbytes %.3f MB " % (ai._csr_integrator.data.size, ai._csr_integrator.lut_nbytes / 2 ** 20))
            bins = ai._csr_integrator.bins
            nnz = ai._csr_integrator.nnz
            parallel_reduction = sum([2**i for i in range(1,int(log2(block_size)))])
            FLOPs = 9*nnz + 11*parallel_reduction + 1*bins
            mem_access = (2*block_size*bins + 5*nnz + 7*bins)*4
            del ai, data
            self.update_mp()
@ -216,9 +229,13 @@ data = fabio.open(r"%s").data
                self.update_mp()
                if R < self.LIMIT:
                    results[block_size ] = tmin
                    flops[block_size ] = (FLOPs/tmin)*1e-6
                    mem_band[block_size ] = (mem_access/tmin)*1e-6
                    self.update_mp()
            else:
                results[block_size ] = tmin
                flops[block_size ] = FLOPs/tmin
                mem_band[block_size ] = mem_access/tmin
            if first:
                self.new_curve(results, label)
                first = False
@ -227,6 +244,8 @@ data = fabio.open(r"%s").data
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.flops[label] = flops
        self.mem_band[label] = mem_band
        self.update_mp()
@ -234,12 +253,12 @@ data = fabio.open(r"%s").data
        self.update_mp()
        json.dump(self.results, open(filename, "w"))
-    def print_res(self):
+    def print_res(self,summary,results):
        self.update_mp()
-        print("Summary: execution time in milliseconds")
+        print(summary)
        print("Size/Meth\t" + "\t".join(self.meth))
        for i in self.size:
-            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))
+            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (results[j].get(i, 0)) for j in self.meth))
    def init_curve(self):
        self.update_mp()
@ -394,10 +413,16 @@ if __name__ == "__main__":
            bench.bench_1d_ocl_csr(True, {"devicetype":"ACC"})
    bench.save()
-    bench.print_res()
+    results = bench.results
    flops = bench.flops
    mem_band = bench.mem_band
    bench.print_res("Summary: Execution time in milliseconds", results)
    bench.print_res("Summary: MFLOPS",flops)
    bench.print_res("Summary: Memory Bandwidth in MB/s",mem_band)
    bench.update_mp()
    bench.ax.set_ylim(1, 200)
    # plt.show()
    plt.ion()
-    raw_input("Enter to quit")
+#    raw_input("Enter to quit")
--- a/benchmark/benchmark_split.py
+++ b/benchmark/benchmark_split.py
@ -0,0 +1,752 @@
 #!/usr/bin/python
 import json, sys, time, timeit, os, platform, subprocess
 import numpy
 import fabio
 import os.path as op
 sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
 import utilstest
 pyFAI = utilstest.UtilsTest.pyFAI
 ocl = pyFAI.opencl.ocl
 from matplotlib import pyplot as plt
 plt.ion()
 ds_list = ["Pilatus1M.poni", "halfccd.poni", "Frelon2k.poni", "Pilatus6M.poni", "Mar3450.poni", "Fairchild.poni"]
 datasets = {"Fairchild.poni":utilstest.UtilsTest.getimage("1880/Fairchild.edf"),
            "halfccd.poni":utilstest.UtilsTest.getimage("1882/halfccd.edf"),
            "Frelon2k.poni":utilstest.UtilsTest.getimage("1881/Frelon2k.edf"),
            "Pilatus6M.poni":utilstest.UtilsTest.getimage("1884/Pilatus6M.cbf"),
            "Pilatus1M.poni":utilstest.UtilsTest.getimage("1883/Pilatus1M.edf"),
            "Mar3450.poni":utilstest.UtilsTest.getimage("2201/LaB6_260210.mar3450")
      }
 b = None
 class Bench(object):
    """
    Benchmark driver for the pyFAI azimuthal integration methods.

    Every ``bench_*`` method loops over the module-level ``ds_list``, times one
    integration method on each dataset and stores the best execution time (in
    milliseconds) in ``self.results[label]``, keyed by the image size in
    mega-pixels. When a figure was created with init_curve(), the timings are
    also plotted as frames per second.
    """
    # ANSI escape sequences used to colour the console output
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'

    def __init__(self, nbr=10, memprofile=False):
        """
        @param nbr: number of executions per timing run (timeit ``number``)
        @param memprofile: if True, update_mp() records and plots the memory usage
        """
        self.reference_1d = {}  # cache filled by get_ref(): poni file name -> reference result
        self.LIMIT = 8  # results with R >= LIMIT are not recorded
        self.repeat = 1  # number of timing runs (timeit ``repeat``)
        self.nbr = nbr
        self.results = {}  # label -> {size in Mpixel: time in ms}
        self.meth = []  # labels, in the order the benchmarks were run
        self._cpu = None
        self.fig = None
        self.ax = None
        self.starttime = time.time()
        self.plot = None
        self.plot_x = []
        self.plot_y = []
        self.do_memprofile = memprofile
        self.fig_mp = None
        self.ax_mp = None
        self.plot_mp = None
        self.memory_profile = ([], [])  # (elapsed times in s, memory in MB)

    def get_cpu(self):
        """
        Return the name of the processor, detected once and then cached.

        Falls back on platform.processor(), then on "Unknown CPU", when the
        platform specific detection gives nothing.
        """
        if self._cpu is None:
            cpu = None
            if os.name == "nt":
                cpu = platform.processor()
            elif os.path.exists("/proc/cpuinfo"):
                with open("/proc/cpuinfo") as cpuinfo:
                    models = [line.split(": ", 1)[1] for line in cpuinfo if line.startswith("model name")]
                # some /proc/cpuinfo files have no "model name" entry at all
                if models:
                    cpu = models[0].strip()
            elif os.path.exists("/usr/sbin/sysctl"):
                proc = subprocess.Popen(["sysctl", "-n", "machdep.cpu.brand_string"], stdout=subprocess.PIPE)
                out = proc.communicate()[0]
                if not isinstance(out, str):
                    # Python 3 pipes deliver bytes
                    out = out.decode("utf-8", "replace")
                cpu = out.strip()
            if not cpu:
                cpu = platform.processor() or "Unknown CPU"
            # collapse runs of spaces into a single one
            while "  " in cpu:
                cpu = cpu.replace("  ", " ")
            self._cpu = cpu
        return self._cpu

    def get_gpu(self, devicetype="gpu", useFp64=False, platformid=None, deviceid=None):
        """
        Return the name of the first device of the OpenCL context created with
        the given parameters, or "NoGPU" when ``ocl`` is None.
        """
        if ocl is None:
            return "NoGPU"
        ctx = ocl.create_context(devicetype, useFp64, platformid, deviceid)
        return ctx.devices[0].name

    def get_mem(self):
        """
        Returns the occupied memory for memory-leak hunting in MByte

        The value is read from the VmRSS entry of /proc/<pid>/status; 0 is
        returned when the file or the entry is missing.
        """
        status = "/proc/%i/status" % os.getpid()
        mem = 0
        if os.path.exists(status):
            with open(status) as stream:
                for line in stream:
                    if line.startswith("VmRSS"):
                        mem = int(line.split(":", 1)[1].split()[0]) / 1024.
                        break
        return mem

    def print_init(self, t):
        """Print the initialization time ``t`` (in seconds) in milliseconds."""
        print(" * Initialization time: %.1f ms" % (1000.0 * t))
        self.update_mp()

    def print_exec(self, t):
        """Print the execution time ``t`` (in seconds) in milliseconds."""
        print(" * Execution time rep : %.1f ms" % (1000.0 * t))
        self.update_mp()

    def print_sep(self):
        """Print a separator line."""
        print("*"*80)
        self.update_mp()

    def _min_time(self, timer):
        """Return the best time per call, in seconds, measured with ``timer``."""
        return min(total / self.nbr for total in timer.repeat(repeat=self.repeat, number=self.nbr))

    def _print_quality(self, R):
        """Print the R value returned by utilstest.Rwp, coloured by comparison with self.LIMIT."""
        if R > self.LIMIT:
            print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC))
        else:
            print("%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))

    def _record(self, results, label, first, size, tmin):
        """
        Store one timing in ``results`` and plot it.

        @param size: number of pixels of the image
        @param tmin: execution time in seconds
        @param first: True to start a new curve, False to extend the current one
        @return: False, the next value for the caller's ``first`` flag
        """
        mpix = size / 1e6
        msec = tmin * 1000.0
        results[mpix] = msec
        if first:
            self.new_curve(results, label)
        else:
            self.new_point(mpix, msec)
        return False

    def get_ref(self, param):
        """
        Return the reference 1D result (``ai.xrpd``) for the dataset ``param``.

        The result is calculated on first request and cached in self.reference_1d.
        """
        if param not in self.reference_1d:
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            N = min(data.shape)
            res = ai.xrpd(data, N)
            self.reference_1d[param] = res
            del ai, data
        return self.reference_1d[param]

    def bench_cpu1d(self):
        """Benchmark ``integrate1d`` with method "splitpixelfull" on every dataset."""
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        results = {}
        label = "1D_CPU_serial_full_split"
        first = True
        for param in ds_list:
            self.update_mp()
            # the result is not compared here; the call only fills the reference cache
            self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            ai.integrate1d(data, 1000, method="splitpixelfull", unit="2th_deg", correctSolidAngle=False)
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 N=min(data.shape)
 out=ai.xrpd(data,N)""" % (param, fn)
            t = timeit.Timer("ai.integrate1d(data, 1000, method='splitpixelfull', unit='2th_deg', correctSolidAngle=False)", setup)
            tmin = self._min_time(t)
            self.update_mp()
            self.print_exec(tmin)
            first = self._record(results, label, first, size, tmin)
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu1d_lut(self):
        """Benchmark ``xrpd_LUT`` on every dataset; only results with R < LIMIT are recorded."""
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        label = "1D_CPU_parallel_OpenMP"
        results = {}
        # the curve is created empty and extended point by point
        self.new_curve(results, label)
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            res = ai.xrpd_LUT(data, N)
            t1 = time.time()
            self.print_init(t1 - t0)
            print("lut.shape= %s lut.nbytes (MB) %s" % (ai._lut_integrator.lut.shape, ai._lut_integrator.size * 8 / 1e6))
            self.update_mp()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 N=min(data.shape)
 out=ai.xrpd_LUT(data,N)""" % (param, fn)
            t = timeit.Timer("ai.xrpd_LUT(data,N,safe=False)", setup)
            tmin = self._min_time(t)
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            self._print_quality(R)
            self.update_mp()
            if R < self.LIMIT:
                self._record(results, label, False, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu1d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
        """
        Benchmark ``xrpd_LUT_OCL`` on every dataset.

        When platformid or deviceid is None, the device is chosen with
        ``ocl.select_device(devicetype)``. The loop stops at the first
        MemoryError; only results with R < LIMIT are recorded.
        """
        self.update_mp()
        if (ocl is None):
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
        label = "1D_%s_parallel_OpenCL" % devicetype
        first = True
        results = {}
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                res = ai.xrpd_LUT_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid)
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 N=min(data.shape)
 out=ai.xrpd_LUT_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s)""" % (param, fn, devicetype, platformid, deviceid)
            t = timeit.Timer("ai.xrpd_LUT_OCL(data,N,safe=False)", setup)
            tmin = self._min_time(t)
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            self._print_quality(R)
            if R < self.LIMIT:
                first = self._record(results, label, first, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu1d_csr_ocl(self, devicetype="GPU", platformid=None, deviceid=None, padded=False, block_size=32):
        """
        Benchmark ``xrpd_CSR_OCL`` on every dataset.

        ``padded`` and ``block_size`` are forwarded to ``xrpd_CSR_OCL``. When
        platformid or deviceid is None, the device is chosen with
        ``ocl.select_device(devicetype)``. The loop stops at the first
        MemoryError; only results with R < LIMIT are recorded.
        """
        self.update_mp()
        if (ocl is None):
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s padding: %s block_size= %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid], padded, block_size))
        label = "1D_%s_parallel_OpenCL, padded=%s, block_size=%s" % (devicetype, padded, block_size)
        first = True
        results = {}
        for param in ds_list:
            self.update_mp()
            ref = self.get_ref(param)
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                res = ai.xrpd_CSR_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid, padded=padded, block_size=block_size)
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 N=min(data.shape)
 out=ai.xrpd_CSR_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s,padded=%s,block_size=%s)""" % (param, fn, devicetype, platformid, deviceid, padded, block_size)
            t = timeit.Timer("ai.xrpd_CSR_OCL(data,N,safe=False,padded=%s,block_size=%s)" % (padded, block_size), setup)
            tmin = self._min_time(t)
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            R = utilstest.Rwp(res, ref)
            self._print_quality(R)
            if R < self.LIMIT:
                first = self._record(results, label, first, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu2d(self):
        """Benchmark ``xrpd2`` with 500x360 bins on every dataset; every timing is recorded."""
        self.update_mp()
        print("Working on processor: %s" % self.get_cpu())
        results = {}
        label = "2D_CPU_serial"
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            _ = ai.xrpd2(data, N[0], N[1])
            t1 = time.time()
            self.print_init(t1 - t0)
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 out=ai.xrpd2(data,%s,%s)""" % (param, fn, N[0], N[1])
            t = timeit.Timer("ai.xrpd2(data,%s,%s)" % N, setup)
            tmin = self._min_time(t)
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            # no reference is available in 2D: the timing is always recorded
            first = self._record(results, label, first, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu2d_lut(self):
        """Benchmark ``integrate2d`` with method "lut" on every dataset; every timing is recorded."""
        print("Working on processor: %s" % self.get_cpu())
        label = "2D_CPU_parallel_OpenMP"
        first = True
        results = {}
        for param in ds_list:
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            _ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut")
            t1 = time.time()
            self.print_init(t1 - t0)
            print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut")""" % (param, fn, N[0], N[1])
            t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut')" % N, setup)
            tmin = self._min_time(t)
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            # no reference is available in 2D: the timing is always recorded
            first = self._record(results, label, first, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_cpu2d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
        """
        Benchmark ``integrate2d`` with an OpenCL LUT method on every dataset.

        When platformid or deviceid is None, the device is chosen with
        ``ocl.select_device(devicetype)``. The loop stops at the first
        MemoryError; every timing is recorded.
        """
        self.update_mp()
        if (ocl is None):
            print("No pyopencl")
            return
        if (platformid is None) or (deviceid is None):
            platdev = ocl.select_device(devicetype)
            if not platdev:
                print("No such OpenCL device: skipping benchmark")
                return
            platformid, deviceid = platdev
        print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
        results = {}
        label = "2D_%s_parallel_OpenCL" % devicetype.upper()
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = (500, 360)
            print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
            t0 = time.time()
            try:
                _ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut_ocl_%i,%i" % (platformid, deviceid))
            except MemoryError as error:
                print(error)
                break
            t1 = time.time()
            self.print_init(t1 - t0)
            print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
            self.update_mp()
            ai.reset()
            del ai, data
            self.update_mp()
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut_ocl_%i,%i")""" % (param, fn, N[0], N[1], platformid, deviceid)
            # NOTE(review): the timed statement uses method 'lut_ocl' without the
            # platform/device ids given in the setup -- confirm that it runs on
            # the same device.
            t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut_ocl')" % N, setup)
            tmin = self._min_time(t)
            self.update_mp()
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            # no reference is available in 2D: the timing is always recorded
            first = self._record(results, label, first, size, tmin)
            self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def bench_gpu1d(self, devicetype="gpu", useFp64=True, platformid=None, deviceid=None):
        """
        Benchmark ``xrpd_OpenCL`` on every dataset, compared with ``xrpd``.

        A dataset is skipped when the first integration raises; only results
        with R < LIMIT are recorded.
        """
        self.update_mp()
        print("Working on %s, in " % devicetype + ("64 bits mode" if useFp64 else "32 bits mode") + "(%s.%s)" % (platformid, deviceid))
        if ocl is None or not ocl.select_device(devicetype):
            print("No pyopencl or no such device: skipping benchmark")
            return
        results = {}
        label = "Forward_OpenCL_%s_%s_bits" % (devicetype, ("64" if useFp64 else "32"))
        first = True
        for param in ds_list:
            self.update_mp()
            fn = datasets[param]
            ai = pyFAI.load(param)
            data = fabio.open(fn).data
            size = data.size
            N = min(data.shape)
            print("1D integration of %s %.1f Mpixel -> %i bins (%s)" % (op.basename(fn), size / 1e6, N, ("64 bits mode" if useFp64 else "32 bits mode")))
            try:
                t0 = time.time()
                res = ai.xrpd_OpenCL(data, N, devicetype=devicetype, useFp64=useFp64, platformid=platformid, deviceid=deviceid)
                t1 = time.time()
            except Exception as error:
                # deliberate best effort: report the failure and try the next dataset
                print("Failed to find an OpenCL GPU (useFp64:%s) %s" % (useFp64, error))
                continue
            self.print_init(t1 - t0)
            self.update_mp()
            ref = ai.xrpd(data, N)
            R = utilstest.Rwp(res, ref)
            self._print_quality(R)
            setup = """
 import pyFAI,fabio
 ai=pyFAI.load(r"%s")
 data = fabio.open(r"%s").data
 N=min(data.shape)
 out=ai.xrpd_OpenCL(data,N, devicetype=r"%s", useFp64=%s, platformid=%s, deviceid=%s)""" % (param, fn, devicetype, useFp64, platformid, deviceid)
            t = timeit.Timer("ai.xrpd_OpenCL(data,N,safe=False)", setup)
            tmin = self._min_time(t)
            del t
            self.update_mp()
            self.print_exec(tmin)
            print("")
            if R < self.LIMIT:
                first = self._record(results, label, first, size, tmin)
                self.update_mp()
        self.print_sep()
        self.meth.append(label)
        self.results[label] = results
        self.update_mp()

    def save(self, filename="benchmark.json"):
        """Dump self.results as JSON into ``filename``."""
        self.update_mp()
        with open(filename, "w") as stream:
            json.dump(self.results, stream)

    def print_res(self):
        """Print a table of the execution times: one row per image size, one column per method."""
        self.update_mp()
        print("Summary: execution time in milliseconds")
        print("Size/Meth\t" + "\t".join(self.meth))
        for i in self.size:
            # a size missing for a method is printed as 0.00
            print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))

    def init_curve(self):
        """Create the figure for the timing curves; nothing is created without a DISPLAY."""
        self.update_mp()
        if self.fig:
            print("Already initialized")
            return
        if "DISPLAY" in os.environ:
            plt.ion()
            self.fig = plt.figure()
            self.ax = self.fig.add_subplot(1, 1, 1)
            self.ax.set_autoscale_on(False)
            self.ax.set_xlabel("Image size in Mega-Pixels")
            self.ax.set_ylabel("Frames processed per second")
            # NOTE(review): recent matplotlib versions name this keyword "base" -- confirm the targeted version
            self.ax.set_yscale("log", basey=2)
            t = [1, 2, 5, 10, 20, 50, 100, 200, 400, 500]
            self.ax.set_yticks([float(i) for i in t])
            self.ax.set_yticklabels([str(i) for i in t])
            self.ax.set_xlim(0.5, 20)
            self.ax.set_ylim(0.5, 500)
            self.ax.set_title(self.get_cpu() + " / " + self.get_gpu())
            if self.fig.canvas:
                self.fig.canvas.draw()

    def new_curve(self, results, label):
        """
        Start a new curve on the figure; does nothing without a figure.

        @param results: dict with the size as key and the execution time in ms as value
        @param label: legend of the curve
        """
        self.update_mp()
        if not self.fig:
            return
        self.plot_x = list(results.keys())
        self.plot_x.sort()
        # execution time in ms -> frames per second
        self.plot_y = [1000.0 / results[i] for i in self.plot_x]
        self.plot = self.ax.plot(self.plot_x, self.plot_y, "o-", label=label)[0]
        self.ax.legend()
        if self.fig.canvas:
            self.fig.canvas.draw()

    def new_point(self, size, exec_time):
        """
        Add new point to current curve

        @param size: of the system
        @param exec_time: execution time in ms
        """
        self.update_mp()
        if not self.plot:
            return
        self.plot_x.append(size)
        self.plot_y.append(1000.0 / exec_time)
        self.plot.set_data(self.plot_x, self.plot_y)
        if self.fig.canvas:
            self.fig.canvas.draw()

    def display_all(self):
        """Plot one curve per recorded method, save the figure as benchmark.png and show it."""
        if not self.fig:
            return
        for k in self.meth:
            self.new_curve(self.results[k], k)
        self.ax.legend()
        self.fig.savefig("benchmark.png")
        self.fig.show()

    def update_mp(self):
        """
        Record the memory usage and refresh the memory-profile figure.

        Does nothing unless memory profiling was requested at construction.
        """
        if not self.do_memprofile:
            return
        self.memory_profile[0].append(time.time() - self.starttime)
        self.memory_profile[1].append(self.get_mem())
        if not self.fig_mp:
            self.fig_mp = plt.figure()
            self.ax_mp = self.fig_mp.add_subplot(1, 1, 1)
            self.ax_mp.set_autoscale_on(False)
            self.ax_mp.set_xlabel("Run time (s)")
            self.ax_mp.set_xlim(0, 100)
            self.ax_mp.set_ylim(0, 2 ** 10)
            self.ax_mp.set_ylabel("Memory occupancy (MB)")
            self.ax_mp.set_title("Memory leak hunter")
            self.plot_mp = self.ax_mp.plot(*self.memory_profile)[0]
        else:
            self.plot_mp.set_data(*self.memory_profile)
            tmax = self.memory_profile[0][-1]
            mmax = max(self.memory_profile[1])
            # grow the axes when the data leave the visible range
            if tmax > self.ax_mp.get_xlim()[-1]:
                self.ax_mp.set_xlim(0, tmax)
            if mmax > self.ax_mp.get_ylim()[-1]:
                self.ax_mp.set_ylim(0, mmax)
        if self.fig_mp.canvas:
            self.fig_mp.canvas.draw()

    def get_size(self):
        """
        Return the sorted sizes of the method with the most recorded results
        (the first such method on a tie), or [] when nothing has been run.
        """
        if not self.meth:
            return []
        longest = max((list(self.results[label].keys()) for label in self.meth), key=len)
        return sorted(longest)
    size = property(get_size)
 if __name__ == "__main__":
    # Command line entry point: parse the options, run the selected
    # benchmarks, then save and print the timings.
    import logging  # needed by --debug; not imported at the top of the file
    try:
        from argparse import ArgumentParser
    except ImportError:
        # fall back on the copy of argparse shipped with pyFAI
        from pyFAI.argparse import ArgumentParser
    description = """Benchmark for Azimuthal integration
    """
    epilog = """  """
    usage = """benchmark [options] """
    version = "pyFAI benchmark version " + pyFAI.version
    parser = ArgumentParser(usage=usage, description=description, epilog=epilog)
    parser.add_argument("-v", action='version', version=version)
    parser.add_argument("-d", "--debug",
                          action="store_true", dest="debug", default=False,
                          help="switch to verbose/debug mode")
    parser.add_argument("-c", "--cpu",
                      action="store_true", dest="opencl_cpu", default=False,
                      help="perform benchmark using OpenCL on the CPU")
    parser.add_argument("-g", "--gpu",
                      action="store_true", dest="opencl_gpu", default=False,
                      help="perform benchmark using OpenCL on the GPU")
    parser.add_argument("-a", "--acc",
                      action="store_true", dest="opencl_acc", default=False,
                      help="perform benchmark using OpenCL on the Accelerator (like XeonPhi/MIC)")
    parser.add_argument("-s", "--small",
                      action="store_true", dest="small", default=False,
                      help="Limit the size of the dataset to 6 Mpixel images (for computer with limited memory)")
    parser.add_argument("-n", "--number",
                      dest="number", default=10, type=int,
                      help="Number of repetition of the test, by default 10")
    parser.add_argument("-2d", "--2dimentions",
                      action="store_true", dest="twodim", default=False,
                      help="Benchmark also algorithm for 2D-regrouping")
    parser.add_argument("-m", "--memprof",
                      action="store_true", dest="memprof", default=False,
                      help="Perfrom memory profiling (Linux only)")
    parser.add_argument("-f", "--fullsplit",
                      action="store_true", dest="split_cpu", default=False,
                      help="perform benchmark using full pixel splitting on CPU")
    options = parser.parse_args()
    if options.small:
        ds_list = ds_list[:4]
    if options.debug:
        pyFAI.logger.setLevel(logging.DEBUG)
    b = Bench(options.number, options.memprof)
    # report the number of timing runs really used by the Bench instance
    print("Averaging over %i repetitions (best of %i)." % (options.number, b.repeat))
    b.init_curve()
    # NOTE(review): the full pixel splitting benchmark runs unconditionally, so
    # --fullsplit currently changes nothing (the former "b.bench_cpu1d" statement
    # under that option was never a call) -- confirm the intended default.
    b.bench_cpu1d()
    b.bench_cpu1d_lut()
    if options.opencl_cpu:
        b.bench_cpu1d_lut_ocl("CPU")
    if options.opencl_gpu:
        b.bench_cpu1d_lut_ocl("GPU")
    if options.opencl_acc:
        b.bench_cpu1d_lut_ocl("ACC")
    if options.twodim:
        b.bench_cpu2d()
        b.bench_cpu2d_lut()
        if options.opencl_cpu:
            b.bench_cpu2d_lut_ocl("CPU")
        if options.opencl_gpu:
            b.bench_cpu2d_lut_ocl("GPU")
        if options.opencl_acc:
            b.bench_cpu2d_lut_ocl("ACC")
    b.save()
    b.print_res()
    b.update_mp()
    # init_curve() creates no axes without a DISPLAY
    if b.ax is not None:
        b.ax.set_ylim(1, 200)
    plt.ion()
    try:
        wait_for_user = raw_input  # Python 2
    except NameError:
        wait_for_user = input  # Python 3
    wait_for_user("Enter to quit")
--- a/doc/source/pyFAI.rst
+++ b/doc/source/pyFAI.rst
@ -140,12 +140,19 @@ PyFAI solves this problem by pixel
 splitting : in addition to the pixel position, its
 spatial extension is calculated and each pixel is then split and
 distributed over the corresponding bins, the intensity being considered
 <<<<<<< HEAD
 as homogeneous within a pixel and spread accordingly.
 The drawback of this is the correlation introduced between two adjacent bins.
 To simplify 
 calculations, this was initially done by abstracting the pixel shape 
 with a bounding box that circumscribes the pixel. In an effort to better
 the quality of the results this method was dropped in favour of a full 
 =======
 as homogeneous within a pixel and spread accordingly. To simplify 
 calculations, this was initially done by abstracting the pixel shape 
 with a bounding box that circumscribes the pixel. In an effort to better
 the quality of the results this method was dropped in favour of a full 
 >>>>>>> giannis/master
 pixel splitting scheme that actually uses the actual pixel geometry 
 for its calculations.
@ -194,6 +201,7 @@ but can still be too large to fit on an entry-level graphics card.
 By making this change we switched from a “linear read / random write” forward algorithm
 to a “random read / linear write” backward algorithm which is more suitable for parallelization.
 As a further improvement on the algorithm, the use of compressed sparse row (CSR) format was 
 <<<<<<< HEAD
 introduced, to store the LUT data.
 This algorithm was implemented both in [Cython]_-OpenMP and OpenCL.
 The CSR approach has a double benefit: 
@ -206,6 +214,14 @@ This makes it very well suited to run on GPUs and accelerators
 where hundreds to thousands of simultaneous threads are available.
 When using OpenCL for the GPU we used a compensated summation (or Kahan_summation_), to reduce
 =======
 introduced, to store the LUT data. This reduced its size even more, giving this way the 
 opportunity of working with bigger images on the same hardware, when memory space is of concern, 
 as well as making the code better suited to be run on GPUs or accelerators, as transferring 
 data to the device is one of the most important bottlenecks of such computations.
 This algorithm was implemented in Cython-OpenMP and OpenCL.
 When using OpenCL for the GPU we used a compensated, or Kahan, summation to reduce
 >>>>>>> giannis/master
 the error accumulation in the histogram summation (at the cost of more operations to be done).
 This allows accurate results to be obtained on cheap hardware that performs calculations
 in single precision floating-point arithmetic (32 bits) which are available on consumer
@ -214,12 +230,17 @@ Double precision operations are currently limited to high price and performance
 The additional cost of Kahan summation, 4x more arithmetic operations, is hidden by smaller data types,
 the higher number of single precision units and that the GPU is usually limited by the memory bandwidth anyway.
 <<<<<<< HEAD
 .. _Kahan_summation: http://en.wikipedia.org/wiki/Kahan_summation_algorithm
 The performances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s 
 on a recent multi-core computer with a mid-range graphics card. 
 On a multi-socket server featuring high-end GPUs like Tesla cards, the performances are similar with 
 the additional capability to work on multiple detectors simultaneously.
 =======
 The performances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s 
 on a recent multi-socket, multi-core computer or on high-end GPUs like Tesla cards.
 >>>>>>> giannis/master
 .. figure:: img/benchmark.png
   :align: center
--- a/openCL/ocl_azim_CSR.cl
+++ b/openCL/ocl_azim_CSR.cl
@ -174,25 +174,21 @@ corrections( 		__global float 	*image,
 /**
- * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT
+ * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
 *
 * An image instensity value is spread across the bins according to the positions stored in the LUT.
- * The lut is an 2D-array of index (contains the positions of the pixel in the input array)
+ * The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
 * and coeficients (fraction of pixel going to the bin)
 * Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
 *
 * This implementation is especially efficient on CPU where each core reads adjacents memory.
 * the use of local pointer can help on the CPU.
 *
 * @param weights     Float pointer to global memory storing the input image.
- * @param lut         Pointer to an 2D-array of (unsigned integers,float) containing the index of input pixels and the fraction of pixel going to the bin
+ * @param coefs       Float pointer to global memory holding the coeficient part of the LUT
 * @param row_ind     Integer pointer to global memory holding the corresponding index of the coeficient
 * @param col_ptr     Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param do_dummy    Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
 * @param dummy       Float: value for bad pixels
 * @param delta_dummy Float: precision for bad pixel value
 * @param do_dark     Bool/int: shall dark-current correction be applied ?
 * @param dark        Float pointer to global memory storing the dark image.
 * @param do_flat     Bool/int: shall flat-field correction be applied ? (could contain polarization corrections)
 * @param flat        Float pointer to global memory storing the flat image.
 * @param outData     Float pointer to the output 1D array with the weighted histogram
 * @param outCount    Float pointer to the output 1D array with the unweighted histogram
 * @param outMerged   Float pointer to the output 1D array with the diffractogram
@ -292,6 +288,106 @@ csr_integrate(	const 	__global	float	*weights,
    cd = 0;
    cc = 0;
    if (thread_id_loc == 0)
    {
        outData[bin_num] = super_sum_data[0];
        outCount[bin_num] = super_sum_count[0];
        if (outCount[bin_num] > epsilon)
            outMerge[bin_num] =  outData[bin_num] / outCount[bin_num];
        else
            outMerge[bin_num] = dummy;
    }
 };//end kernel
 /**
 * \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
 *
 * An image intensity value is spread across the bins according to the positions stored in the LUT.
 * The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
 * Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
 *
 * This kernel is meant to be run with padded data (the span of each bin must be a multiple of the workgroup size)
 *
 * @param weights     Float pointer to global memory storing the input image.
 * @param coefs       Float pointer to global memory holding the coeficient part of the LUT
 * @param row_ind     Integer pointer to global memory holding the corresponding index of the coeficient
 * @param col_ptr     Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param do_dummy    Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
 * @param dummy       Float: value for bad pixels
 * @param outData     Float pointer to the output 1D array with the weighted histogram
 * @param outCount    Float pointer to the output 1D array with the unweighted histogram
 * @param outMerged   Float pointer to the output 1D array with the diffractogram
 *
 */
 __kernel void
 csr_integrate_padded(	const 	__global	float	*weights,
                        const   __global    float   *coefs,
                        const   __global    int     *row_ind,
                        const   __global    int     *col_ptr,
                        const				int   	do_dummy,
                        const			 	float 	dummy,
                                __global 	float	*outData,
                                __global 	float	*outCount,
                                __global 	float	*outMerge
                    )
 {
    int thread_id_loc = get_local_id(0);
    int bin_num = get_group_id(0); // each workgroup of size=warp is assinged to 1 bin
    int2 bin_bounds;
 //    bin_bounds = (int2) *(col_ptr+bin_num);  // cool stuff!
    bin_bounds.x = col_ptr[bin_num];
    bin_bounds.y = col_ptr[bin_num+1];
 	float sum_data = 0.0f;
 	float sum_count = 0.0f;
 	float cd = 0.0f;
 	float cc = 0.0f;
 	float t, y;
 	const float epsilon = 1e-10f;
 	float coef, data;
 	int idx, k, j;
 	for (j=bin_bounds.x;j<bin_bounds.y;j+=WORKGROUP_SIZE)
 	{
 		k = j+thread_id_loc;
   		coef = coefs[k];
        idx = row_ind[k];
   		data = weights[idx];
   		if( (!do_dummy) || (data!=dummy) )
   		{
   			//sum_data +=  coef * data;
   			//sum_count += coef;
   			//Kahan summation allows single precision arithmetics with error compensation
   			//http://en.wikipedia.org/wiki/Kahan_summation_algorithm
   			y = coef*data - cd;
   			t = sum_data + y;
   			cd = (t - sum_data) - y;
    		sum_data = t;
    		y = coef - cc;
    		t = sum_count + y;
    		cc = (t - sum_count) - y;
    		sum_count = t;
    	};//end if dummy
    };//for j
 /*
 * parallel reduction
 */
 // REMEMBER TO PASS WORKGROUP_SIZE AS A CPP DEF
    __local float super_sum_data[WORKGROUP_SIZE];
    __local float super_sum_data_correction[WORKGROUP_SIZE];
    __local float super_sum_count[WORKGROUP_SIZE];
    __local float super_sum_count_correction[WORKGROUP_SIZE];
    super_sum_data[thread_id_loc] = sum_data;
    super_sum_count[thread_id_loc] = sum_count;
    super_sum_data_correction[thread_id_loc] = cd;
    super_sum_count_correction[thread_id_loc] = cc;
    barrier(CLK_LOCAL_MEM_FENCE);
    float super_sum_temp = 0.0f;
    int index, active_threads = WORKGROUP_SIZE;
    cd = 0;
    cc = 0;
    while (active_threads != 1)
    {
        active_threads /= 2;
@ -326,6 +422,20 @@ csr_integrate(	const 	__global	float	*weights,
    }
 };//end kernel
 =======
 Correct an image based on the look-up table calculated ...
 /**
 * \brief Performs distortion corrections on an image using a LUT in CSR form
 *
 * @param weights     Float pointer to global memory storing the input image.
 * @param coefs       Float pointer to global memory holding the coeficient part of the LUT
 * @param row_ind     Integer pointer to global memory holding the corresponding index of the coeficient
 * @param col_ptr     Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
 * @param outData     Float pointer to the output 1D array with the corrected image
 *
 */
 __kernel void
 csr_integrate_dis(  const   __global    float   *weights,
                const   __global    float   *coefs,
--- a/openCL/ocl_hist_pixelsplit.cl
+++ b/openCL/ocl_hist_pixelsplit.cl
@ -0,0 +1,405 @@
 //#pragma OPENCL EXTENSION cl_amd_printf : enable
 //#pragma OPENCL EXTENSION cl_intel_printf : enable
 /**
  * \brief Area of the quadrilateral ABCD given the coordinates of its corners.
  *
  * Computed as half the absolute value of the cross product of the two
  * diagonals AC and BD. The constant is a single-precision literal so the
  * expression never requires double-precision support from the device.
  *
  * @param a0, a1  Coordinates of corner A (b*, c*, d* likewise for B, C, D)
  * @return        Unsigned area of the quadrilateral
  */
 float area4(float a0, float a1, float b0, float b1, float c0, float c1, float d0, float d1)
 {
    return 0.5f * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)));
 }
 /**
  * \brief Integral of the straight line y = AB.s0 * x + AB.s1 between A0 and B0.
  *
  * Returns exactly zero when both bounds coincide; this also keeps a
  * non-finite slope/intercept from producing a NaN for a zero-width segment.
  * Single-precision literals are used so no double-precision arithmetic is needed.
  *
  * @param A0  Lower integration bound
  * @param B0  Upper integration bound
  * @param AB  Line coefficients: s0 is the slope, s1 the intercept
  * @return    Signed area under the line between A0 and B0
  */
 float integrate_line( float A0, float B0, float2 AB)
 {
    return (A0 == B0) ? 0.0f : AB.s0 * (B0 * B0 - A0 * A0) * 0.5f + AB.s1 * (B0 - A0);
 }
 /**
  * \brief Convert a position into a fractional bin number.
  *
  * @param x0        Position to convert
  * @param delta     Width of one bin
  * @param pos0_min  Position of the lower edge of bin 0
  * @return          Distance of x0 from pos0_min, expressed in bin widths
  */
 float getBinNr(float x0, float delta, float pos0_min)
 {
    float offset = x0 - pos0_min;
    return offset / delta;
 }
 /**
  * \brief Smallest of four floats.
  */
 float min4f(float a, float b, float c, float d)
 {
    float low_ab = fmin(a, b);
    float low_cd = fmin(c, d);
    return fmin(low_ab, low_cd);
 }
 /**
  * \brief Largest of four floats.
  */
 float max4f(float a, float b, float c, float d)
 {
    float high_ab = fmax(a, b);
    float high_cd = fmax(c, d);
    return fmax(high_ab, high_cd);
 }
 /**
  * \brief Add a float to a value in global memory using a compare-and-swap loop.
  *
  * The current value is read, the sum is computed, and the 32-bit pattern of
  * the sum is stored with atomic_cmpxchg only if the location still holds the
  * pattern that was read; otherwise the whole sequence is retried.
  *
  * @param source   Address of the float to update
  * @param operand  Value to add to *source
  */
 void AtomicAdd(volatile __global float *source, const float operand)
 {
    // Same 32 bits seen either as a float (for the addition) or as an
    // unsigned int (for the integer compare-and-swap)
    union {
        unsigned int intVal;
        float floatVal;
    } expected;
    union {
        unsigned int intVal;
        float floatVal;
    } desired;
    for (;;) {
        expected.floatVal = *source;
        desired.floatVal = expected.floatVal + operand;
        // atomic_cmpxchg returns the value found at the address: the swap
        // succeeded only if that is the value the sum was computed from
        if (atomic_cmpxchg((volatile __global unsigned int *)source, expected.intVal, desired.intVal) == expected.intVal)
            break;
    }
 }
 /**
  * \brief Copy an array of unsigned 16-bit integers into a float array.
  *
  * One element is converted per work-item; work-items with an index of
  * NIMAGE or more do nothing.
  *
  * @param array_u16:   Pointer to global memory with the input data as unsigned16 array
  * @param array_float: Pointer to global memory receiving the data as float array
  */
 __kernel void
 u16_to_float(__global unsigned short  *array_u16,
              __global float *array_float
 )
 {
    const int idx = get_global_id(0);
    // Global memory guard for padding
    if (idx >= NIMAGE)
        return;
    array_float[idx] = (float)array_u16[idx];
 }
 /**
  * \brief Copy an array of signed 32-bit integers into a float array.
  *
  * One element is converted per work-item; work-items with an index of
  * NIMAGE or more do nothing.
  *
  * @param array_int:   Pointer to global memory with the data in int
  * @param array_float: Pointer to global memory receiving the data in float
  */
 __kernel void
 s32_to_float(   __global int  *array_int,
                 __global float  *array_float
         )
 {
    const int idx = get_global_id(0);
    // Global memory guard for padding
    if (idx >= NIMAGE)
        return;
    array_float[idx] = (float)(array_int[idx]);
 }
 /**
  * \brief Reset the first BINS elements of three float arrays to zero.
  *
  * One index is cleared per work-item in all three arrays; work-items with
  * an index of BINS or more do nothing, so the global size may be padded.
  *
  * @param array0: float Pointer to global memory with the outMerge array
  * @param array1: float Pointer to global memory with the outCount array
  * @param array2: float Pointer to global memory with the outData array
  */
 __kernel void
 memset_out(__global float *array0,
            __global float *array1,
            __global float *array2
 )
 {
    const int bin = get_global_id(0);
    // Global memory guard for padding
    if (bin >= BINS)
        return;
    array0[bin] = 0.0f;
    array1[bin] = 0.0f;
    array2[bin] = 0.0f;
 }
 /**
  * \brief First pass of a 2-step parallel min/max reduction.
  *
  * The input is read as float2 elements. Every work-item strides over the
  * buffer and keeps (min, max) of the first component in x/y and (min, max)
  * of the second component in z/w. The work-items of a work-group are then
  * combined in local memory by repeated halving, and one float4 per
  * work-group is written to preresult.
  *
  * The halving assumes the work-group size is a power of two and does not
  * exceed WORKGROUP_SIZE (compile-time definition).
  *
  * @param buffer:    float Pointer to global memory with the flattened array (pos in pyFAI)
  * @param length:    integer number of floats in buffer (length/2 float2 elements are read)
  * @param preresult: float Pointer to global memory receiving one partial result per work-group
  */
 __kernel
 void reduce1(__global float2* buffer,
              __const int length,
              __global float4* preresult) {
    __local float4 scratch[WORKGROUP_SIZE];

    const int lid    = get_local_id(0);
    const int stride = get_global_size(0);
    const int npairs = length / 2;

    // Neutral elements: +inf for the minima (x, z), -inf for the maxima (y, w)
    float4 acc = (float4)(INFINITY, -INFINITY, INFINITY, -INFINITY);

    // Sequential pass: each work-item visits elements gid, gid+stride, ...
    for (int gid = get_global_id(0); gid < npairs; gid += stride) {
        float2 item = buffer[gid];
        if (!(acc.x < item.s0)) acc.x = item.s0;
        if (!(acc.y > item.s0)) acc.y = item.s0;
        if (!(acc.z < item.s1)) acc.z = item.s1;
        if (!(acc.w > item.s1)) acc.w = item.s1;
    }

    scratch[lid] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory: the lower half absorbs the upper half
    for (int active = get_local_size(0); active != 1; )
    {
        active /= 2;
        if (lid < active)
        {
            float4 keep  = scratch[lid];
            float4 merge = scratch[lid + active];
            if (!(keep.x < merge.x)) keep.x = merge.x;
            if (!(keep.y > merge.y)) keep.y = merge.y;
            if (!(keep.z < merge.z)) keep.z = merge.z;
            if (!(keep.w > merge.w)) keep.w = merge.w;
            scratch[lid] = keep;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
 }
 /**
  * \brief Second pass of a 2-step parallel min/max reduction.
  *
  * Together with the first pass (reduce1), it yields the min and max of both
  * components of the input. Each work-item loads one partial result, the
  * partial results are combined in local memory by repeated halving, and the
  * final (min0, max0, min1, max1) is written to result[0].
  *
  * The kernel indexes preresult with the local id only, so it is presumably
  * meant to be launched as a single work-group with one work-item per partial
  * result -- TODO confirm against the host code.
  *
  * @param preresult: float Pointer to global memory with the partial results of the first pass
  * @param result:    float Pointer to global memory receiving the min/max values requested
  */
 __kernel
 void reduce2(__global float4* preresult,
              __global float4* result) {
    __local float4 scratch[WORKGROUP_SIZE];

    const int lid = get_local_id(0);
    scratch[lid] = preresult[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory: the lower half absorbs the upper half
    for (int active = get_local_size(0); active != 1; )
    {
        active /= 2;
        if (lid < active)
        {
            float4 keep  = scratch[lid];
            float4 merge = scratch[lid + active];
            if (!(keep.x < merge.x)) keep.x = merge.x;
            if (!(keep.y > merge.y)) keep.y = merge.y;
            if (!(keep.z < merge.z)) keep.z = merge.z;
            if (!(keep.w > merge.w)) keep.w = merge.w;
            scratch[lid] = keep;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        result[0] = scratch[0];
    }
 }
 /**
  * \brief Performs normalization of the input image, in place
  *
  * Each enabled correction is applied to every pixel in this order:
  *  - dark (read-out) noise subtraction
  *  - flat field correction (multiplication by 1/flat)
  *  - solid angle correction (multiplication by 1/solidangle)
  *  - polarization correction (multiplication by 1/polarization)
  * Whether a pixel is dummy is decided on its raw value, before any
  * correction; dummy pixels are written back as exactly `dummy`.
  *
  * @param image           Float pointer to global memory storing the input image (overwritten).
  * @param do_dark         Bool/int: shall dark-current correction be applied ?
  * @param dark            Float pointer to global memory storing the dark image.
  * @param do_flat         Bool/int: shall flat-field correction be applied ?
  * @param flat            Float pointer to global memory storing the flat image.
  * @param do_solidangle   Bool/int: shall solid-angle correction be applied ?
  * @param solidangle      Float pointer to global memory storing the solid angle of each pixel.
  * @param do_polarization Bool/int: shall polarization correction be applied ?
  * @param polarization    Float pointer to global memory storing the polarization of each pixel.
  * @param do_dummy        Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
  * @param dummy           Float: value for bad pixels
  * @param delta_dummy     Float: precision for bad pixel value (0 means exact comparison)
  *
  **/
 __kernel void
 corrections(        __global float  *image,
             const            int    do_dark,
             const   __global float  *dark,
             const            int    do_flat,
             const   __global float  *flat,
             const            int    do_solidangle,
             const   __global float  *solidangle,
             const            int    do_polarization,
             const   __global float  *polarization,
             const            int    do_dummy,
             const            float  dummy,
             const            float  delta_dummy
             )
 {
    const int i = get_global_id(0);
    // Global memory guard for padding
    if (i >= NIMAGE)
        return;

    float value = image[i];

    // A pixel is kept when dummy checking is off, or when it differs from
    // `dummy` (by more than delta_dummy when a tolerance is given)
    int keep;
    if (!do_dummy)
        keep = 1;
    else if (delta_dummy != 0.0f)
        keep = (fabs(value - dummy) > delta_dummy);
    else
        keep = (value != dummy);

    if (do_dark)
        value -= dark[i];
    if (do_flat)
        value *= 1/flat[i];
    if (do_solidangle)
        value *= 1/solidangle[i];
    if (do_polarization)
        value *= 1/polarization[i];

    image[i] = keep ? value : dummy;
 }
 /**
  * \brief Performs 1d azimuthal integration with full pixel splitting
  *
  * One work-item handles one pixel. The four corners of the pixel are read as
  * a float8 (s0/s1 = corner A, s2/s3 = B, s4/s5 = C, s6/s7 = D); the first
  * coordinate of every corner is converted to a fractional bin number, and the
  * fraction of the pixel area falling into each bin it overlaps is added to
  * the two histograms.
  *
  * Several pixels contribute to the same bin, so the histograms are updated
  * with AtomicAdd; outData and outCount must be zeroed before the launch.
  * Masking and user-supplied ranges are not implemented: the binning range is
  * taken from minmax[0].
  *
  * @param pos         Float pointer to global memory storing the flattened array with the pixel corner coords
  * @param image       Float pointer to global memory storing the input image.
  * @param minmax      Float pointer to global memory holding the min/max results of the reduction kernels
  * @param length      Integer: number of pixels to process
  * @param do_dummy    Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
  * @param dummy       Float: value for bad pixels
  * @param outData     Float pointer to the output 1D array with the weighted histogram
  * @param outCount    Float pointer to the output 1D array with the unweighted histogram
  *
  */
 __kernel
 void integrate1(__global float8* pos,
                 __global float*  image,
                 __global float4* minmax,
                 const    int     length,
                 const    int     do_dummy,
                 const    float   dummy,
                 __global float*  outData,
                 __global float*  outCount)
 {
    int global_index = get_global_id(0);
    if (global_index >= length)
        return;

    float data = image[global_index];
    // Bad pixels must not contribute to either histogram
    if (do_dummy && (data == dummy))
        return;

    float pos0_min = minmax[0].s0;
    float pos0_max = minmax[0].s1;
    // Widen the range slightly so the largest position falls inside the last bin
    pos0_max *= 1 + EPS;
    float delta = (pos0_max - pos0_min) / BINS;

    float8 pixel = pos[global_index];

    // From here on the first coordinate of each corner is a fractional bin number
    pixel.s0 = getBinNr(pixel.s0, delta, pos0_min);
    pixel.s2 = getBinNr(pixel.s2, delta, pos0_min);
    pixel.s4 = getBinNr(pixel.s4, delta, pos0_min);
    pixel.s6 = getBinNr(pixel.s6, delta, pos0_min);

    float min0 = min4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
    float max0 = max4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);

    // Clip to the histogram so no work-item can write outside the output arrays
    int bin0_min = max((int)floor(min0), 0);
    int bin0_max = min((int)floor(max0), BINS - 1);

    // Slope (x) and intercept (y) of the four edges of the pixel
    float2 AB, BC, CD, DA;
    AB.x = (pixel.s3 - pixel.s1) / (pixel.s2 - pixel.s0);
    AB.y = pixel.s1 - AB.x * pixel.s0;
    BC.x = (pixel.s5 - pixel.s3) / (pixel.s4 - pixel.s2);
    BC.y = pixel.s3 - BC.x * pixel.s2;
    CD.x = (pixel.s7 - pixel.s5) / (pixel.s6 - pixel.s4);
    CD.y = pixel.s5 - CD.x * pixel.s4;
    DA.x = (pixel.s1 - pixel.s7) / (pixel.s0 - pixel.s6);
    DA.y = pixel.s7 - DA.x * pixel.s6;

    float areaPixel = area4(pixel.s0, pixel.s1, pixel.s2, pixel.s3, pixel.s4, pixel.s5, pixel.s6, pixel.s7);
    // A degenerate (zero-area) pixel would add NaN to the histograms
    if (!(areaPixel > 0.0f))
        return;
    float oneOverPixelArea = 1.0f / areaPixel;

    for (int bin = bin0_min; bin <= bin0_max; bin++)
    {
        // Restrict every corner to the current bin [bin, bin+1]
        float lower = (float)bin;
        float upper = (float)(bin + 1);
        float A_lim = clamp(pixel.s0, lower, upper);
        float B_lim = clamp(pixel.s2, lower, upper);
        float C_lim = clamp(pixel.s4, lower, upper);
        float D_lim = clamp(pixel.s6, lower, upper);

        // Area of the pixel inside the bin: sum of the signed areas under its edges
        float partialArea  = integrate_line(A_lim, B_lim, AB);
        partialArea += integrate_line(B_lim, C_lim, BC);
        partialArea += integrate_line(C_lim, D_lim, CD);
        partialArea += integrate_line(D_lim, A_lim, DA);

        float weight = fabs(partialArea) * oneOverPixelArea;
        // Accumulate: a plain store would be overwritten by other pixels of the same bin
        AtomicAdd(&outCount[bin], weight);
        AtomicAdd(&outData[bin], data * weight);
    }
 }
 /**
  * \brief Finishes the 1d azimuthal integration by calculating the ratio of the 2 histograms
  *
  * One bin is handled per work-item; work-items with an index of BINS or more
  * do nothing. The count is not tested before dividing, so a bin whose count
  * is zero receives the result of a floating-point division by zero.
  *
  * @param outData     Float pointer to the output 1D array with the weighted histogram
  * @param outCount    Float pointer to the output 1D array with the unweighted histogram
  * @param outMerge    Float pointer to the output 1D array with the diffractogram
  *
  */
 __kernel
 void integrate2(__global float*  outData,
                 __global float*  outCount,
                 __global float*  outMerge)
 {
    const int bin = get_global_id(0);
    // Global memory guard for padding
    if (bin >= BINS)
        return;
    outMerge[bin] = outData[bin] / outCount[bin];
 }
--- a/openCL/reduction_test.cl
+++ b/openCL/reduction_test.cl
@ -0,0 +1,92 @@
 /**
  * \brief First pass of a 2-step parallel min/max reduction over a float array.
  *
  * Every work-item strides over the buffer keeping a running (min, max) pair,
  * the pairs of a work-group are combined in local memory by repeated halving,
  * and one float2 (x = min, y = max) per work-group is written to preresult.
  *
  * The halving runs until a single pair is left, because only scratch[0] is
  * written out. It assumes the work-group size is a power of two and does not
  * exceed WORKGROUP_SIZE (compile-time definition).
  *
  * @param buffer:    float Pointer to global memory with the input values
  * @param length:    integer number of elements in buffer
  * @param preresult: float2 Pointer to global memory receiving one (min, max) per work-group
  */
 __kernel
 void reduce1(__global float* buffer,
              __const int length,
              __global float2* preresult) {
    int global_index = get_global_id(0);
    int global_size  = get_global_size(0);
    float2 accumulator;
    accumulator.x = INFINITY;
    accumulator.y = -INFINITY;
    // Loop sequentially over chunks of input vector
    while (global_index < length) {
        float element = buffer[global_index];
        accumulator.x = (accumulator.x < element) ? accumulator.x : element;
        accumulator.y = (accumulator.y > element) ? accumulator.y : element;
        global_index += global_size;
    }
    __local float2 scratch[WORKGROUP_SIZE];
    // Perform parallel reduction
    int local_index = get_local_id(0);
    scratch[local_index] = accumulator;
    barrier(CLK_LOCAL_MEM_FENCE);
    int active_threads = get_local_size(0);
    while (active_threads != 1)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float2 other = scratch[local_index + active_threads];
            float2 mine  = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_index == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
 }
 /**
  * \brief Second pass of a 2-step parallel min/max reduction over a float array.
  *
  * Each work-item loads one (min, max) pair from preresult and the pairs are
  * combined in local memory by repeated halving. The halving stops when two
  * pairs are left; both are stored together in the single float4 output as
  * (min_a, max_a, min_b, max_b).
  *
  * NOTE(review): the caller therefore still has to combine the two pairs to
  * get the overall min/max -- confirm this is the intended output layout.
  *
  * @param preresult: float2 Pointer to global memory with the partial results of the first pass
  * @param result:    float4 Pointer to global memory receiving the two remaining (min, max) pairs
  */
 __kernel
 void reduce2(__global float2* preresult,
              __global float4* result) {
    __local float2 scratch[WORKGROUP_SIZE];
    int local_index = get_local_id(0);
    scratch[local_index] = preresult[local_index];
    barrier(CLK_LOCAL_MEM_FENCE);
    int active_threads = get_local_size(0);
    while (active_threads != 2)
    {
        active_threads /= 2;
        if (local_index < active_threads)
        {
            float2 other = scratch[local_index + active_threads];
            float2 mine  = scratch[local_index];
            mine.x = (mine.x < other.x) ? mine.x : other.x;
            mine.y = (mine.y > other.y) ? mine.y : other.y;
            scratch[local_index] = mine;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if (local_index == 0) {
        // vload4 expects a pointer to scalar floats: view the two leading
        // float2 pairs of scratch as four consecutive floats
        result[0] = vload4(0, (__local float *)scratch);
    }
 }
--- a/openCL/reduction_test4.cl
+++ b/openCL/reduction_test4.cl
@ -0,0 +1,104 @@
 /**
  * \brief First pass of a 2-step parallel min/max reduction over float2 elements.
  *
  * Every work-item strides over the buffer and keeps (min, max) of the first
  * component in x/y and (min, max) of the second component in z/w. The
  * work-items of a work-group are then combined in local memory by repeated
  * halving, and one float4 per work-group is written to preresult.
  *
  * The halving assumes the work-group size is a power of two and does not
  * exceed WORKGROUP_SIZE (compile-time definition).
  *
  * @param buffer:    float2 Pointer to global memory with the input data
  * @param length:    integer number of floats in buffer (length/2 float2 elements are read)
  * @param preresult: float4 Pointer to global memory receiving one partial result per work-group
  */
 __kernel
 void reduce1(__global float2* buffer,
              __const int length,
              __global float4* preresult) {
    __local float4 scratch[WORKGROUP_SIZE];

    const int lid    = get_local_id(0);
    const int stride = get_global_size(0);
    const int npairs = length / 2;

    // Neutral elements: +inf for the minima (x, z), -inf for the maxima (y, w)
    float4 acc = (float4)(INFINITY, -INFINITY, INFINITY, -INFINITY);

    // Sequential pass: each work-item visits elements gid, gid+stride, ...
    for (int gid = get_global_id(0); gid < npairs; gid += stride) {
        float2 item = buffer[gid];
        if (!(acc.x < item.s0)) acc.x = item.s0;
        if (!(acc.y > item.s0)) acc.y = item.s0;
        if (!(acc.z < item.s1)) acc.z = item.s1;
        if (!(acc.w > item.s1)) acc.w = item.s1;
    }

    scratch[lid] = acc;
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory: the lower half absorbs the upper half
    for (int active = get_local_size(0); active != 1; )
    {
        active /= 2;
        if (lid < active)
        {
            float4 keep  = scratch[lid];
            float4 merge = scratch[lid + active];
            if (!(keep.x < merge.x)) keep.x = merge.x;
            if (!(keep.y > merge.y)) keep.y = merge.y;
            if (!(keep.z < merge.z)) keep.z = merge.z;
            if (!(keep.w > merge.w)) keep.w = merge.w;
            scratch[lid] = keep;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        preresult[get_group_id(0)] = scratch[0];
    }
 }
 /**
  * \brief Second pass of a 2-step parallel min/max reduction over float2 elements.
  *
  * Each work-item loads one partial result, the partial results are combined
  * in local memory by repeated halving, and the final
  * (min0, max0, min1, max1) is written to result[0].
  *
  * The kernel indexes preresult with the local id only, so it is presumably
  * meant to be launched as a single work-group with one work-item per partial
  * result -- TODO confirm against the host code.
  *
  * @param preresult: float4 Pointer to global memory with the partial results of the first pass
  * @param result:    float4 Pointer to global memory receiving the final min/max values
  */
 __kernel
 void reduce2(__global float4* preresult,
              __global float4* result) {
    __local float4 scratch[WORKGROUP_SIZE];

    const int lid = get_local_id(0);
    scratch[lid] = preresult[lid];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Tree reduction in local memory: the lower half absorbs the upper half
    for (int active = get_local_size(0); active != 1; )
    {
        active /= 2;
        if (lid < active)
        {
            float4 keep  = scratch[lid];
            float4 merge = scratch[lid + active];
            if (!(keep.x < merge.x)) keep.x = merge.x;
            if (!(keep.y > merge.y)) keep.y = merge.y;
            if (!(keep.z < merge.z)) keep.z = merge.z;
            if (!(keep.w > merge.w)) keep.w = merge.w;
            scratch[lid] = keep;
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }

    if (lid == 0) {
        result[0] = scratch[0];
    }
 }
--- a/openCL/test4.py
+++ b/openCL/test4.py
@ -0,0 +1,52 @@
 # Manual check of the two-pass OpenCL min/max reduction (reduction_test4.cl):
 # the reference values computed with numpy and the values returned by the
 # device are printed one after the other for comparison.
 import pyopencl as cl
 from pyopencl import array
 import numpy

 length = 640000
 workgroup_size = 128

 # Random float32 data viewed as (n, 4, 2); the min/max are taken separately
 # for index 0 and index 1 of the last axis.
 a = numpy.random.rand(length).astype(numpy.float32)
 # Floor division: a shape must be made of integers (plain "/" yields a float on Python 3)
 a.shape = (length // 8, 4, 2)
 input_a = a.reshape(length)

 # Reference result computed on the host
 min0 = a[:, :, 0].min()
 max0 = a[:, :, 0].max()
 min1 = a[:, :, 1].min()
 max1 = a[:, :, 1].max()
 minmax = (min0, max0, min1, max1)

 # Use the first device of the first platform
 platform = cl.get_platforms()[0]
 device = platform.get_devices()[0]
 ctx = cl.Context((device,))
 queue = cl.CommandQueue(ctx)
 mf = cl.mem_flags

 d_input = array.to_device(queue, input_a)
 # One float4 (4 * 4 bytes) per work-group for the intermediate results,
 # and a single float4 for the final result
 d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4 * workgroup_size)
 d_result = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4)

 with open("reduction_test4.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()

 compile_options = "-D WORKGROUP_SIZE=%i" % (workgroup_size)
 program = cl.Program(ctx, kernel_src).build(options=compile_options)

 # First pass: workgroup_size groups of workgroup_size work-items each.
 # The kernel declares `length` as a signed int, hence numpy.int32.
 program.reduce1(queue, (workgroup_size * workgroup_size,), (workgroup_size,),
                 d_input.data, numpy.int32(length), d_preresult)
 # Second pass: a single work-group combines the per-group results
 program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)

 result = numpy.ndarray(4, dtype=numpy.float32)
 cl.enqueue_copy(queue, result, d_result)

 # Single-argument print(...) gives the same output on Python 2 and Python 3
 print(minmax)
 print(result)
--- a/pyFAI-src/azimuthalIntegrator.py
+++ b/pyFAI-src/azimuthalIntegrator.py
@ -84,6 +84,13 @@ except ImportError as error:
                  " full pixel splitting: %s" % error)
    splitPixel = None
 try:
    from . import splitPixelFull  # IGNORE:F0401
 except ImportError as error:
    logger.error("Unable to import pyFAI.splitPixelFull"
                  " full pixel splitting: %s" % error)
    splitPixelFull = None
 try:
    from . import splitBBox  # IGNORE:F0401
 except ImportError as error:
@ -2407,6 +2414,41 @@ class AzimuthalIntegrator(Geometry):
        if (I is None) and ("splitpix" in method):
            if "full" in method:
                # The "full" pixel splitting relies on the splitPixelFull
                # extension: this is the module whose availability matters here.
                if splitPixelFull is None:
                    logger.warning("SplitPixelFull is not available,"
                                " falling back on splitbbox histogram !")
                    method = "splitbbox"
                else:
                    logger.debug("integrate1d uses SplitPixelFull implementation")
                    pos = self.array_from_unit(shape, "corner", unit)
                    qAxis, I, a, b = splitPixelFull.fullSplit1D(pos=pos,
                                                            weights=data,
                                                            bins=nbPt,
                                                            pos0Range=radial_range,
                                                            pos1Range=azimuth_range,
                                                            dummy=dummy,
                                                            delta_dummy=delta_dummy,
                                                            mask=mask,
                                                            dark=dark,
                                                            flat=flat,
                                                            solidangle=solidangle,
                                                            polarization=polarization
                                                            )
                    if error_model == "azimuthal":
                        variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
                    if variance is not None:
                        _, var1d, a, b = splitPixelFull.fullSplit1D(pos=pos,
                                                                weights=variance,
                                                                bins=nbPt,
                                                                pos0Range=radial_range,
                                                                pos1Range=azimuth_range,
                                                                dummy=dummy,
                                                                delta_dummy=delta_dummy,
                                                                mask=mask,
                                                                )
                        sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
            else:
                if splitPixel is None:
                    logger.warning("SplitPixel is not available,"
                                " falling back on splitbbox histogram !")
--- a/pyFAI-src/ocl_azim_csr.py
+++ b/pyFAI-src/ocl_azim_csr.py
@ -58,6 +58,7 @@ class OCL_CSR_Integrator(object):
            indptr: row pointer indicates the start of a given row. len nbin+1
        @param image_size: size of the image (for pre-processing)
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param block_size: the chosen size for WORKGROUP_SIZE
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
--- a/pyFAI-src/ocl_azim_csr_dis.py
+++ b/pyFAI-src/ocl_azim_csr_dis.py
@ -52,11 +52,13 @@ class OCL_CSR_Integrator(object):
                 platformid=None, deviceid=None,
                 checksum=None, profile=False):
        """
-        @param data: coefficient of the matrix in a 1D vector of float32 - size of nnz
+        @param lut: 3-tuple of arrays 
-        @param indices: Column index position for the data (same size as data)
+            data: coefficient of the matrix in a 1D vector of float32 - size of nnz
-        @param indptr: row pointer indicates the start of a given row. len nbin+1
+            indices: Column index position for the data (same size as data) 
            indptr: row pointer indicates the start of a given row. len nbin+1
        @param image_size: 
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param block_size: the chosen size for WORKGROUP_SIZE
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
--- a/pyFAI-src/ocl_hist_pixelsplit.py
+++ b/pyFAI-src/ocl_hist_pixelsplit.py
@ -0,0 +1,380 @@
 # -*- coding: utf-8 -*-
 #
 #    Project: Azimuthal integration
 #             https://github.com/kif/pyFAI
 #
 #
 #    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #                            Giannis Ashiotis
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 __authors__ = ["Jérôme Kieffer", "Giannis Ashiotis"]
 __license__ = "GPLv3"
 __date__ = "04/04/2014"
 __copyright__ = "2014, ESRF, Grenoble"
 __contact__ = "jerome.kieffer@esrf.fr"
 import os, gc, logging
 import threading
 import hashlib
 import numpy
 from .opencl import ocl, pyopencl
 from .splitBBoxLUT import HistoBBox1d
 from .utils import get_cl_file
 from pyopencl import array
 # pyopencl is falsy when the OpenCL bindings could not be loaded
 if pyopencl:
    mf = pyopencl.mem_flags
 else:
    raise ImportError("pyopencl is not installed")
 # Prefer the crc32 from the fastcrc extension and fall back on zlib when the
 # extension cannot be imported. Only ImportError is expected here: any
 # other exception is a real problem and must not be hidden.
 try:
    from .fastcrc import crc32
 except ImportError:
    from zlib import crc32
 logger = logging.getLogger("pyFAI.ocl_hist_pixelsplit")
 class OCL_Hist_Pixelsplit(object):
    """
    Histogramming of an image with pixel splitting, running on an OpenCL device.

    The pixel positions are sent to the device when the object is created and
    their min/max are calculated there with a two-stage reduction.
    Images are then processed with the integrate() method.
    """
    def __init__(self, pos, bins, image_size, pos0Range=None, pos1Range=None, devicetype="all",
                 padded=False, block_size=32,
                 platformid=None, deviceid=None,
                 checksum=None, profile=False):
        """
        @param pos: array with the positions of the pixels. pos.size has to be
            8 * image_size (presumably 4 corners x 2 coordinates per pixel, to be confirmed)
        @param bins: number of bins of the output histogram
        @param image_size: size of the image (for pre-processing)
        @param pos0Range: 2-sequence with the bounds for the first coordinate, None for no limit
        @param pos1Range: 2-sequence with the bounds for the second coordinate, None for no limit
        @param devicetype: can be "cpu","gpu","acc" or "all"
        @param padded: stored as attribute, not used otherwise by this class
        @param block_size: the chosen size for WORKGROUP_SIZE
        @param platformid: number of the platform as given by clinfo
        @type platformid: int
        @param deviceid: number of the device as given by clinfo
        @type deviceid: int
        @param checksum: pre - calculated checksum to prevent re - calculating it :)
        @param profile: store profiling elements
        """
        # Everything the destructor relies on is defined first, so that a
        # failure further down still leaves an object which can be cleaned up
        self._cl_kernel_args = {}
        self._cl_mem = {}
        self._program = None
        self._queue = None
        self._ctx = None
        self.events = []
        self.BLOCK_SIZE = block_size  # query for warp size
        self.padded = padded
        self._sem = threading.Semaphore()
        self.pos = pos
        self.bins = bins
        self.pos_size = pos.size
        self.size = image_size
        if self.pos_size != 8 * self.size:
            raise RuntimeError("pos.size != 8 * image_size")
        self.pos0Range = self._range_as_float2(pos0Range)
        self.pos1Range = self._range_as_float2(pos1Range)
        self.profile = profile
        if not checksum:
            checksum = crc32(self.pos)
        self.on_device = {"pos":checksum, "dark":None, "flat":None, "polarization":None, "solidangle":None}
        if (platformid is None) and (deviceid is None):
            platformid, deviceid = ocl.select_device(devicetype)
        elif platformid is None:
            platformid = 0
        elif deviceid is None:
            deviceid = 0
        self.platform = ocl.platforms[platformid]
        self.device = self.platform.devices[deviceid]
        self.device_type = self.device.type
        if (self.device_type == "CPU") and (self.platform.vendor == "Apple"):
            logger.warning("This is a workaround for Apple's OpenCL on CPU: enforce BLOCK_SIZE=1")
            self.BLOCK_SIZE = 1
        # Sizes are 1-tuples as expected for the launch of the kernels
        self.workgroup_size = (self.BLOCK_SIZE,)
        self.wdim_bins = (self.bins * self.BLOCK_SIZE,)
        # image size rounded up to the next multiple of BLOCK_SIZE
        self.wdim_data = ((self.size + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),)
        try:
            # NOTE(review): the context is not created from platformid/deviceid,
            # so it may live on another device than self.device (which is the one
            # used for the memory check). The explicit variant is kept for reference:
            #self._ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
            self._ctx = pyopencl.create_some_context()
            if self.profile:
                self._queue = pyopencl.CommandQueue(self._ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
            else:
                self._queue = pyopencl.CommandQueue(self._ctx)
            self._allocate_buffers()
            self._compile_kernels()
            self._set_kernel_arguments()
        except pyopencl.MemoryError as error:
            raise MemoryError(error)
        ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["pos"], self.pos)
        if self.profile:
            self.events.append(("copy pos data", ev))
        # The reductions have to be launched with the work-group size the
        # kernels were compiled with (-D WORKGROUP_SIZE=BLOCK_SIZE). The
        # "preresult" buffer is sized for BLOCK_SIZE work-groups as well.
        reduction_wg_size = self.BLOCK_SIZE
        reduce1 = self._program.reduce1(self._queue, (reduction_wg_size * reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce1"])
        reduce2 = self._program.reduce2(self._queue, (reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce2"])
        if self.profile:
            self.events.append(("reduce1", reduce1))
            self.events.append(("reduce2", reduce2))
        result = numpy.ndarray(4, dtype=numpy.float32)
        pyopencl.enqueue_copy(self._queue, result, self._cl_mem["minmax"])
        if logger.isEnabledFor(logging.DEBUG):
            # Cross-check of the device reduction against numpy: it scans the
            # whole position array hence it is only done when debugging
            minmax = (pos[:, :, 0].min(), pos[:, :, 0].max(),
                      pos[:, :, 1].min(), pos[:, :, 1].max())
            logger.debug("min/max of pos from device: %s, from host: %s", result, minmax)

    @staticmethod
    def _range_as_float2(limits):
        """
        Convert a range into a 1-element array of OpenCL float2: (lower, upper)

        @param limits: 2-sequence with the bounds (any order) or None
        @return: array with the sorted bounds, (-inf, +inf) when no valid range is given
        """
        float2 = numpy.zeros(1, pyopencl.array.vec.float2)
        if (limits is not None) and (len(limits) == 2):
            float2[0][0] = min(limits)
            float2[0][1] = max(limits)
        else:
            float2[0][0] = -float("inf")
            float2[0][1] = float("inf")
        return float2

    def __del__(self):
        """
        Destructor: release all buffers
        """
        self._free_kernels()
        self._free_buffers()
        self._queue = None
        self._ctx = None
        gc.collect()

    def _allocate_buffers(self):
        """
        Allocate OpenCL buffers required for a specific configuration

        Note that an OpenCL context also requires some memory, as well as Event and other OpenCL functionalities which cannot and
        are not taken into account here.
        The memory required by a context varies depending on the device. Typical for GTX580 is 65Mb but for a 9300m is ~15Mb
        In addition, a GPU will always have at least 3-5Mb of memory in use.
        Unfortunately, OpenCL does NOT have a built-in way to check the actual free memory on a device, only the total memory.

        @raise RuntimeError: when the image is smaller than BLOCK_SIZE
        @raise MemoryError: when the device has not enough memory
        """
        if self.size < self.BLOCK_SIZE:
            raise RuntimeError("Fatal error in _allocate_buffers. size (%d) must be >= BLOCK_SIZE (%d)\n" % (self.size, self.BLOCK_SIZE))
        size_of_float = numpy.dtype(numpy.float32).itemsize
        size_of_short = numpy.dtype(numpy.int16).itemsize
        # Estimation of the memory needed on the device
        ualloc = (self.pos_size * size_of_float)
        ualloc += (4 * self.BLOCK_SIZE * size_of_float)
        ualloc += (self.size * size_of_float) * 5
        ualloc += (self.bins * size_of_float) * 3
        memory = self.device.memory
        logger.info("%.3fMB are needed on device which has %.3fMB" % (ualloc / 1.0e6, memory / 1.0e6))
        if ualloc >= memory:
            raise MemoryError("Fatal error in _allocate_buffers. Not enough device memory for buffers (%lu requested, %lu available)" % (ualloc, memory))
        # now actually allocate: (name, access flags, size in bytes)
        image_nbytes = size_of_float * self.size
        bins_nbytes = size_of_float * self.bins
        buffers = [("pos", mf.READ_ONLY, size_of_float * self.pos_size),
                   ("preresult", mf.READ_WRITE, size_of_float * 4 * self.BLOCK_SIZE),
                   ("minmax", mf.READ_WRITE, size_of_float * 4),
                   ("outData", mf.READ_WRITE, bins_nbytes),
                   ("outCount", mf.READ_WRITE, bins_nbytes),
                   ("outMerge", mf.WRITE_ONLY, bins_nbytes),
                   ("image_u16", mf.READ_ONLY, size_of_short * self.size),
                   ("image", mf.READ_WRITE, image_nbytes),
                   ("dark", mf.READ_ONLY, image_nbytes),
                   ("flat", mf.READ_ONLY, image_nbytes),
                   ("polarization", mf.READ_ONLY, image_nbytes),
                   ("solidangle", mf.READ_ONLY, image_nbytes),
                   ]
        try:
            for name, flags, nbytes in buffers:
                self._cl_mem[name] = pyopencl.Buffer(self._ctx, flags, size=nbytes)
        except pyopencl.MemoryError as error:
            self._free_buffers()
            raise MemoryError(error)

    def _free_buffers(self):
        """
        free all memory allocated on the device
        """
        for buffer_name in self._cl_mem:
            if self._cl_mem[buffer_name] is not None:
                try:
                    self._cl_mem[buffer_name].release()
                    self._cl_mem[buffer_name] = None
                except pyopencl.LogicError:
                    logger.error("Error while freeing buffer %s" % buffer_name)

    def _compile_kernels(self, kernel_file=None):
        """
        Call the OpenCL compiler

        @param kernel_file: path to the file with the kernels. By default
            "ocl_hist_pixelsplit.cl" is taken from the current directory if it
            exists there, else it is located with get_cl_file
        """
        kernel_name = "ocl_hist_pixelsplit.cl"
        if kernel_file is None:
            if os.path.isfile(kernel_name):
                kernel_file = os.path.abspath(kernel_name)
            else:
                kernel_file = get_cl_file(kernel_name)
        else:
            kernel_file = str(kernel_file)
        with open(kernel_file, "r") as kernelFile:
            kernel_src = kernelFile.read()
        # EPS needs the exponent notation: "%f" renders the float32 epsilon as 0.000000
        compile_options = "-D BINS=%i  -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%e" % \
                (self.bins, self.size, self.BLOCK_SIZE, numpy.finfo(numpy.float32).eps)
        logger.info("Compiling file %s with options %s" % (kernel_file, compile_options))
        try:
            self._program = pyopencl.Program(self._ctx, kernel_src).build(options=compile_options)
        except pyopencl.MemoryError as error:
            raise MemoryError(error)

    def _free_kernels(self):
        """
        free all kernels
        """
        for kernel in self._cl_kernel_args:
            self._cl_kernel_args[kernel] = []
        self._program = None

    def _set_kernel_arguments(self):
        """
        Tie arguments of OpenCL kernel-functions to the actual kernels

        The list of arguments of each kernel is stored in the dictionary
        _cl_kernel_args, indexed by the name of the kernel. The flags and the
        scalars set to 0 here are place-holders: integrate() overwrites
        the slots 1, 3, 5, 7, 9, 10 & 11 of "corrections" and the
        slots 6 & 7 of "integrate1" at every call.
        """
        self._cl_kernel_args["reduce1"] = [self._cl_mem["pos"], numpy.int32(self.pos_size), self._cl_mem["preresult"]]
        self._cl_kernel_args["reduce2"] = [self._cl_mem["preresult"], self._cl_mem["minmax"]]
        self._cl_kernel_args["corrections"] = [self._cl_mem["image"], numpy.int32(0), self._cl_mem["dark"], numpy.int32(0), self._cl_mem["flat"], \
                                              numpy.int32(0), self._cl_mem["solidangle"], numpy.int32(0), self._cl_mem["polarization"], \
                                              numpy.int32(0), numpy.float32(0), numpy.float32(0)]
        self._cl_kernel_args["integrate1"] = [self._cl_mem["pos"], self._cl_mem["image"], self._cl_mem["minmax"], numpy.int32(0), self.pos0Range[0], \
                                            self.pos1Range[0], numpy.int32(0), numpy.float32(0), self._cl_mem["outData"], self._cl_mem["outCount"]]
        self._cl_kernel_args["integrate2"] = [self._cl_mem["outData"], self._cl_mem["outCount"], self._cl_mem["outMerge"]]
        self._cl_kernel_args["memset_out"] = [self._cl_mem[i] for i in ["outData", "outCount", "outMerge"]]
        self._cl_kernel_args["u16_to_float"] = [self._cl_mem[i] for i in ["image_u16", "image"]]
        self._cl_kernel_args["s32_to_float"] = [self._cl_mem[i] for i in ["image", "image"]]

    def _send_correction(self, name, array, checksum, events):
        """
        Make sure a correction array is up to date on the device

        @param name: name of the correction: key of both _cl_mem and on_device
        @param array: the correction array or None when the correction is disabled
        @param checksum: checksum of the array, calculated when not provided
        @param events: list of (label, event) to which the copy is appended
        @return: numpy.int32(1) if the correction has to be applied else numpy.int32(0)
        """
        if array is None:
            return numpy.int32(0)
        if not checksum:
            checksum = crc32(array)
        # the array is sent only when it differs from the one already on the device
        if checksum != self.on_device[name]:
            ev = pyopencl.enqueue_copy(self._queue, self._cl_mem[name], numpy.ascontiguousarray(array, dtype=numpy.float32))
            events.append(("copy %s" % name, ev))
            self.on_device[name] = checksum
        return numpy.int32(1)

    def  integrate(self, data, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None, dark_checksum=None, flat_checksum=None, solidAngle_checksum=None, polarization_checksum=None):
        """
        Integrate an image on the device

        The image is sent to the device (and casted to float there for uint16
        and int32 data), the corrections are applied when at least one of them
        is requested, then the kernels integrate1 and integrate2 are run.

        @param data: image to process, as numpy array
        @param dummy: value of the pixels to be discarded, None to disable
        @param delta_dummy: precision of the dummy value (its absolute value is used)
        @param dark: array with the dark current or None
        @param flat: array with the flat field or None
        @param solidAngle: array with the solid angle correction or None
        @param polarization: array with the polarization correction or None
        @param dark_checksum: checksum of the dark array, calculated when missing
        @param flat_checksum: checksum of the flat array, calculated when missing
        @param solidAngle_checksum: checksum of the solidAngle array, calculated when missing
        @param polarization_checksum: checksum of the polarization array, calculated when missing
        @return: 3-tuple of arrays of size bins: outMerge, outData, outCount
        """
        events = []
        with self._sem:
            if data.dtype == numpy.uint16:
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image_u16"], numpy.ascontiguousarray(data))
                cast_u16_to_float = self._program.u16_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["u16_to_float"])
                events += [("copy image", copy_image), ("cast", cast_u16_to_float)]
            elif data.dtype == numpy.int32:
                # int32 data are sent to the "image" buffer and casted in place
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data))
                cast_s32_to_float = self._program.s32_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["s32_to_float"])
                events += [("copy image", copy_image), ("cast", cast_s32_to_float)]
            else:
                copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data, dtype=numpy.float32))
                events += [("copy image", copy_image)]
            memset = self._program.memset_out(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["memset_out"])
            events += [("memset", memset)]
            if dummy is not None:
                do_dummy = numpy.int32(1)
                dummy = numpy.float32(dummy)
                if delta_dummy is None:
                    delta_dummy = numpy.float32(0)
                else:
                    delta_dummy = numpy.float32(abs(delta_dummy))
            else:
                do_dummy = numpy.int32(0)
                dummy = numpy.float32(0)
                delta_dummy = numpy.float32(0)
            self._cl_kernel_args["corrections"][9] = do_dummy
            self._cl_kernel_args["corrections"][10] = dummy
            self._cl_kernel_args["corrections"][11] = delta_dummy
            self._cl_kernel_args["integrate1"][6] = do_dummy
            self._cl_kernel_args["integrate1"][7] = dummy
            do_dark = self._send_correction("dark", dark, dark_checksum, events)
            self._cl_kernel_args["corrections"][1] = do_dark
            do_flat = self._send_correction("flat", flat, flat_checksum, events)
            self._cl_kernel_args["corrections"][3] = do_flat
            do_solidAngle = self._send_correction("solidangle", solidAngle, solidAngle_checksum, events)
            self._cl_kernel_args["corrections"][5] = do_solidAngle
            do_polarization = self._send_correction("polarization", polarization, polarization_checksum, events)
            self._cl_kernel_args["corrections"][7] = do_polarization
            copy_image.wait()
            # the correction kernel is skipped when there is nothing to correct
            if do_dummy + do_polarization + do_solidAngle + do_flat + do_dark > 0:
                ev = self._program.corrections(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["corrections"])
                events.append(("corrections", ev))
            integrate1 = self._program.integrate1(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["integrate1"])
            events.append(("integrate1", integrate1))
            outMerge = numpy.empty(self.bins, dtype=numpy.float32)
            outData = numpy.empty(self.bins, dtype=numpy.float32)
            outCount = numpy.empty(self.bins, dtype=numpy.float32)
            ev = pyopencl.enqueue_copy(self._queue, outData, self._cl_mem["outData"])
            events.append(("copy D->H outData", ev))
            ev = pyopencl.enqueue_copy(self._queue, outCount, self._cl_mem["outCount"])
            events.append(("copy D->H outCount", ev))
            # number of bins rounded up to the next multiple of BLOCK_SIZE
            global_size_integrate2 = ((self.bins + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),)
            integrate2 = self._program.integrate2(self._queue, global_size_integrate2, self.workgroup_size, *self._cl_kernel_args["integrate2"])
            events.append(("integrate2", integrate2))
            ev = pyopencl.enqueue_copy(self._queue, outMerge, self._cl_mem["outMerge"])
            events.append(("copy D->H outMerge", ev))
            ev.wait()
        if self.profile:
            self.events += events
        return outMerge, outData, outCount

    def  log_profile(self):
        """
        If we are in profiling mode, prints out all timing for every single OpenCL call
        """
        t = 0.0
        if self.profile:
            for e in self.events:
                if "__len__" in dir(e) and len(e) >= 2:
                    # the timestamps of the events are converted into milliseconds
                    et = 1e-6 * (e[1].profile.end - e[1].profile.start)
                    print("%50s:\t%.3fms" % (e[0], et))
                    t += et
        print("_"*80)
        print("%50s:\t%.3fms" % ("Total execution time", t))
--- a/setup.py
+++ b/setup.py
@ -49,14 +49,10 @@ from distutils.sysconfig import get_python_lib
 from distutils.command.install_data import install_data
 ################################################################################
-# Remove MANIFEST file ... it needs to be re-generated on the fly
+# Check for Cython
 ################################################################################
 if op.isfile("MANIFEST"):
    os.unlink("MANIFEST")
 ################################################################################
 # Check for Cython
 ################################################################################
 try:
    from Cython.Distutils import build_ext
    CYTHON = True
@ -124,7 +120,6 @@ if ("sdist" in sys.argv):
 # pyFAI extensions
 # ###############################################################################
 cython_modules = [os.path.splitext(os.path.basename(i))[0] for i in glob.glob("src/*.pyx")]
 src = dict([(ext, join("src", ext + cython_c_ext)) for ext in cython_modules])
 _geometry_dic = dict(name="_geometry",
@ -151,6 +146,16 @@ splitPixel_dic = dict(name="splitPixel",
                 sources=[src['splitPixel']],
                 )
 splitPixelFull_dic = dict(name="splitPixelFull",
                 include_dirs=get_numpy_include_dirs(),
                 sources=[src['splitPixelFull']],
                 )
 splitPixelFullLUT_dic = dict(name="splitPixelFullLUT",
                 include_dirs=get_numpy_include_dirs(),
                 sources=[src['splitPixelFullLUT']],
                 )
 splitBBox_dic = dict(name="splitBBox",
                     include_dirs=get_numpy_include_dirs(),
                     sources=[src['splitBBox']],
@ -200,6 +205,7 @@ _bispev_dic = dict(name="_bispev",
                        sources=[src['_bispev'] ],
                        extra_compile_args=['openmp'],
                        extra_link_args=['openmp'],
                        )
 _convolution_dic = dict(name="_convolution",
@ -266,6 +272,7 @@ if sys.platform == "win32":
        if (filein + ".py") not in script_files:
            shutil.copyfile(filein, filein + ".py")
            script_files.append(filein + ".py")
 else:
    script_files = glob.glob("scripts/*")
@ -420,10 +427,11 @@ This python module can be found on:
 http://pypi.python.org/pypi/pyopencl
 """)
 """
-################################################################################
+# ###############################################################################
 # check if OpenMP modules, freshly installed can import
-################################################################################
+# ###############################################################################
 pyFAI = None
 sys.path.insert(0, os.path.dirname(installDir))
 # print installDir
@ -436,7 +444,7 @@ for mod in sys.modules.copy():
 try:
    import pyFAI
 except ImportError as E:
-    print("Unable to import pyFAI from system: %s" % E)
+    print("Unable to import pyFAI: %s" % E)
 else:
    print("PyFAI is installed in %s" % pyFAI.__file__)
    try:
@ -445,5 +453,4 @@ else:
        print("PyFAI.histogram failed to import. It is likely there is an OpenMP error: %s" % E)
    else:
        print("OpenMP libraries were found and pyFAI.histogram was successfully imported")
 """
--- a/src/_distortion.pyx
+++ b/src/_distortion.pyx
@ -903,3 +903,114 @@ def cal_LUT(float[:,:,:,:] pos not None, shape, int size, max_pixel_size):
                        outMax[ml, nl] = k + 1
                idx += 1
    return lut.reshape(shape0 * shape1, size)
 def cal_CSR(float[:,:,:,:] pos not None, shape, bin_size, max_pixel_size):
    """
    Calculate the look-up table in CSR format.

    @param pos: 4D position array 
    @param shape: output shape
    @param bin_size: number of input element per output element (as numpy array) 
    @param max_pixel_size: (2-tuple of int) size of a buffer covering the largest pixel
    @return: look-up table in CSR format: 3-tuple of array (data, indices, indptr)
    """
    # tmp_index is declared explicitly so that it is a plain C integer within
    # the nogil section and does not depend on the type inference
    cdef int i, j, ms, ml, ns, nl, shape0, shape1, delta0, delta1, buffer_size, bins, lut_size, tmp_index
    cdef int offset0, offset1, box_size0, box_size1
    cdef numpy.int32_t k, idx=0
    cdef float A0, A1, B0, B1, C0, C1, D0, D1, pAB, pBC, pCD, pDA, cAB, cBC, cCD, cDA, area, value
    # outMax[ml, nl]: number of entries already written for the output element (ml, nl)
    cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros(shape, dtype=numpy.int32)
    cdef float[:,:] buffer
    cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr, indices
    cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
    shape0, shape1 = shape
    delta0, delta1 = max_pixel_size
    bins = shape0*shape1
    # Row pointers: cumulated number of entries per output element
    indptr = numpy.empty(bins+1, dtype=numpy.int32)
    indptr[0] = 0
    indptr[1:] = bin_size.cumsum(dtype=numpy.int32)
    lut_size = indptr[bins]
    indices = numpy.zeros(shape=lut_size, dtype=numpy.int32)
    data = numpy.zeros(shape=lut_size, dtype=numpy.float32)
    # Size in bytes of the 3 arrays, only used for logging
    indices_size = lut_size*sizeof(numpy.int32_t)
    data_size = lut_size*sizeof(numpy.float32_t)
    indptr_size = (bins+1)*sizeof(numpy.int32_t)
    logger.info("CSR matrix: %.3f MByte"%((indices_size+data_size+indptr_size)/1.0e6))
    buffer = cvarray(shape=(delta0, delta1), itemsize=sizeof(float), format="f")
    buffer_size = delta0 * delta1 * sizeof(float)
    logger.info("Max pixel size: %ix%i; Max source pixel in target: %i"%(buffer.shape[1],buffer.shape[0], lut_size))
    with nogil:
        # i,j, idx are indices of the raw image uncorrected
        for i in range(shape0):
            for j in range(shape1):
                #reinit of buffer
                memset(&buffer[0,0], 0, buffer_size)
                A0 = pos[i, j, 0, 0]
                A1 = pos[i, j, 0, 1]
                B0 = pos[i, j, 1, 0]
                B1 = pos[i, j, 1, 1]
                C0 = pos[i, j, 2, 0]
                C1 = pos[i, j, 2, 1]
                D0 = pos[i, j, 3, 0]
                D1 = pos[i, j, 3, 1]
                # Bounding box of the pixel on the integer grid
                offset0 = (<int> floor(min4f(A0, B0, C0, D0)))
                offset1 = (<int> floor(min4f(A1, B1, C1, D1)))
                box_size0 = (<int> ceil(max4f(A0, B0, C0, D0))) - offset0
                box_size1 = (<int> ceil(max4f(A1, B1, C1, D1))) - offset1
                # Coordinates of the corners relative to the bounding box
                A0 -= <float> offset0
                A1 -= <float> offset1
                B0 -= <float> offset0
                B1 -= <float> offset1
                C0 -= <float> offset0
                C1 -= <float> offset1
                D0 -= <float> offset0
                D1 -= <float> offset1
                # Slope (p) and intercept (c) of the 4 edges, set to 0 when
                # both ends of the edge have the same first coordinate
                if B0 != A0:
                    pAB = (B1 - A1) / (B0 - A0)
                    cAB = A1 - pAB * A0
                else:
                    pAB = cAB = 0.0
                if C0 != B0:
                    pBC = (C1 - B1) / (C0 - B0)
                    cBC = B1 - pBC * B0
                else:
                    pBC = cBC = 0.0
                if D0 != C0:
                    pCD = (D1 - C1) / (D0 - C0)
                    cCD = C1 - pCD * C0
                else:
                    pCD = cCD = 0.0
                if A0 != D0:
                    pDA = (A1 - D1) / (A0 - D0)
                    cDA = D1 - pDA * D0
                else:
                    pDA = cDA = 0.0
                integrate(buffer, B0, A0, pAB, cAB)
                integrate(buffer, A0, D0, pDA, cDA)
                integrate(buffer, D0, C0, pCD, cCD)
                integrate(buffer, C0, B0, pBC, cBC)
                # Area of the ABCD quadrilateral, calculated from its diagonals
                area = 0.5*((C0 - A0)*(D1 - B1)-(C1 - A1)*(D0 - B0))
                for ms in range(box_size0):
                    ml = ms + offset0
                    if ml < 0 or ml >= shape0:
                        continue
                    for ns in range(box_size1):
                        # ms,ns are indexes of the corrected image in short form, ml & nl are the same
                        nl = ns + offset1
                        if nl < 0 or nl >= shape1:
                            continue
                        value = buffer[ms, ns] / area
                        if value <= 0:
                            continue
                        # Append the entry to the row of the output element (ml, nl)
                        k = outMax[ml,nl]
                        tmp_index = indptr[ml*shape1+nl]
                        indices[tmp_index+k] = idx
                        data[tmp_index+k] = value
                        outMax[ml,nl] = k + 1
                idx += 1
    return (data, indices, indptr)
--- a/src/_distortionCSR.pyx
+++ b/src/_distortionCSR.pyx
@ -438,7 +438,7 @@ class Distortion(object):
    def correctHost(self, image):
        """
        Correct an image based on the look-up table calculated ...
-        Caclulation takes place on the Host
+        Calculation takes place on the Host
        @param image: 2D-array with the image
        @return: corrected 2D image
--- a/src/splitPixelFull.c
+++ b/src/splitPixelFull.c
--- a/src/splitPixelFull.pyx
+++ b/src/splitPixelFull.pyx
@ -0,0 +1,739 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
 #    Project: Azimuthal integration
 #             https://forge.epn-campus.eu/projects/azimuthal
 #
 #    File: "$Id$"
 #
 #    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Giannis Ashiotis
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 import cython
 cimport numpy
 import numpy
 from libc.math cimport fabs, floor
 from libc.stdio cimport printf
 from cython.view cimport array as cvarray
 #cdef double areaTriangle(double a0,
 #                         double a1,
 #                         double b0,
 #                         double b1,
 #                         double c0,
 #                         double c1):
 #    """
 #    Calculate the area of the ABC triangle with corners:
 #    A(a0,a1)
 #    B(b0,b1)
 #    C(c0,c1)
 #    @return: area, i.e. 1/2 * (B-A)^(C-A)
 #    """
 #    return 0.5 * abs(((b0 - a0) * (c1 - a1)) - ((b1 - a1) * (c0 - a0)))
 #
 cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
    """
    Unsigned area of the quadrilateral ABCD, obtained from its two diagonals.

    @param a0, a1: coordinates of corner A
    @param b0, b1: coordinates of corner B
    @param c0, c1: coordinates of corner C
    @param d0, d1: coordinates of corner D
    @return: area, i.e. 1/2 * |AC ^ BD| (half the norm of the cross product of the diagonals)
    """
    # Components of the diagonal vectors AC and BD
    cdef double ac0 = c0 - a0
    cdef double ac1 = c1 - a1
    cdef double bd0 = d0 - b0
    cdef double bd1 = d1 - b1
    return 0.5 * fabs((ac0 * bd1) - (ac1 * bd0))
 # cdef double area4(point2D *pixel):
    # """
    # Calculate the area of the ABCD quadrilateral with corners:
    # A(a0,a1)
    # B(b0,b1)
    # C(c0,c1)
    # D(d0,d1)
    # @return: area, i.e. 1/2 * (AC ^ BD)
    # """
    # return 0.5 * abs(((pixel[2].x - pixel[0].x) * (pixel[3].y - pixel[1].y)) - ((pixel[2].y - pixel[0].y) * (pixel[3].x - pixel[1].x)))
 # cdef struct point2D:
    # numpy.float64_t x
    # numpy.float64_t y
 # cdef struct min_max:
    # numpy.float64_t pos
    # numpy.int32_t point
 # Straight line y = slope * x + intersect, as consumed by integrate().
 cdef struct Function:
    double slope
    double intersect
 #cdef class Function:
    #cdef double _slope
    #cdef double _intersect
    #def __cinit__(self, double A0=0.0, double A1=0.0, double B0=1.0, double B1=0.0):
        #self._slope = (B1-A1)/(B0-A0)
        #self._intersect = A1 - self._slope*A0
    #def __cinit__(self):
        #self._slope = 0.0
        #self._intersect = 0.0
    #cdef double f(self, double x):
        #return self._slope*x + self._intersect
    #cdef double integrate(self, double A0, double B0) nogil:
        #if A0==B0:
            #return 0.0
        #else:
            #return self._slope*(B0*B0 - A0*A0)*0.5 + self._intersect*(B0-A0)
    #cdef void reset(self, double A0, double A1, double B0, double B1) nogil:
        #self._slope = (B1-A1)/(B0-A0)
        #self._intersect = A1 - self._slope*A0
 cdef double integrate( double A0, double B0, Function AB) nogil:
    """
    Integral of the straight line described by AB between the abscissae A0 and B0.

    @param A0: lower integration limit
    @param B0: upper integration limit
    @param AB: struct holding the slope and the intersect of the line
    @return: signed integral; exactly 0.0 when both limits are equal,
             whatever the content of AB (even a non-finite slope)
    """
    cdef double slope_term, intersect_term
    if A0 != B0:
        # integral of slope*x + intersect over [A0, B0]
        slope_term = AB.slope*(B0*B0 - A0*A0)*0.5
        intersect_term = AB.intersect*(B0-A0)
        return slope_term + intersect_term
    return 0.0
@cython.cdivision(True)
 cdef double getBinNr(double x0, double pos0_min, double dpos) nogil:
    """
    Convert a position into a fractional bin number.

    @param x0: current position
    @param pos0_min: position of the lower edge of the first bin
    @param dpos: width of one bin
    @return: (x0 - pos0_min) / dpos, not truncated to an integer
    """
    cdef double distance = x0 - pos0_min
    return distance / dpos
 # cdef min_max min4f(point2D *pixel, int dim) nogil:
    # cdef min_max tmp
    # if dim == 0:
        # if (pixel[0].x <= pixel[1].x) and (pixel[0].x <= pixel[2].x) and (pixel[0].x <= pixel[3].x):
            # tmp.pos = pixel[0].x
            # tmp.pixel = 0
            # return tmp
        # if (pixel[1].x <= pixel[0].x) and (pixel[1].x <= pixel[2].x) and (pixel[1].x <= pixel[3].x):
            # tmp.pos = pixel[1].x
            # tmp.pixel = 1
            # return tmp
        # if (pixel[2].x <= pixel[0].x) and (pixel[2].x <= pixel[1].x) and (pixel[2].x <= pixel[3].x):
            # tmp.pos = pixel[2].x
            # tmp.pixel = 2
            # return tmp
        # else:
            # tmp.pos = pixel[3].x
            # tmp.pixel = 3
            # return tmp
    # elif dim == 1:
        # if (pixel[0].y <= pixel[1].y) and (pixel[0].y <= pixel[2].y) and (pixel[0].y <= pixel[3].y):
            # tmp.pos = pixel[0].y
            # tmp.pixel = 0
            # return tmp
        # if (pixel[1].y <= pixel[0].y) and (pixel[1].y <= pixel[2].y) and (pixel[1].y <= pixel[3].y):
            # tmp.pos = pixel[1].y
            # tmp.pixel = 1
            # return tmp
        # if (pixel[2].y <= pixel[0].y) and (pixel[2].y <= pixel[1].y) and (pixel[2].y <= pixel[3].y):
            # tmp.pos = pixel[2].y
            # tmp.pixel = 2
            # return tmp
        # else:
            # tmp.pos = pixel[3].y
            # tmp.pixel = 3
            # return tmp
 # cdef min_max max4f(point2D *pixel, int dim) nogil:
    # cdef min_max tmp
    # if dim == 0:
        # if (pixel[0].x >= pixel[1].x) and (pixel[0].x >= pixel[2].x) and (pixel[0].x >= pixel[3].x):
            # tmp.pos = pixel[0].x
            # tmp.pixel = 0
            # return tmp
        # if (pixel[1].x >= pixel[0].x) and (pixel[1].x >= pixel[2].x) and (pixel[1].x >= pixel[3].x):
            # tmp.pos = pixel[1].x
            # tmp.pixel = 1
            # return tmp
        # if (pixel[2].x >= pixel[0].x) and (pixel[2].x >= pixel[1].x) and (pixel[2].x >= pixel[3].x):
            # tmp.pos = pixel[2].x
            # tmp.pixel = 2
            # return tmp
        # else:
            # tmp.pos = pixel[3].x
            # tmp.pixel = 3
            # return tmp
    # elif dim == 1:
        # if (pixel[0].y >= pixel[1].y) and (pixel[0].y >= pixel[2].y) and (pixel[0].y >= pixel[3].y):
            # tmp.pos = pixel[0].y
            # tmp.pixel = 0
            # return tmp
        # if (pixel[1].y >= pixel[0].y) and (pixel[1].y >= pixel[2].y) and (pixel[1].y >= pixel[3].y):
            # tmp.pos = pixel[1].y
            # tmp.pixel = 1
            # return tmp
        # if (pixel[2].y >= pixel[0].y) and (pixel[2].y >= pixel[1].y) and (pixel[2].y >= pixel[3].y):
            # tmp.pos = pixel[2].y
            # tmp.pixel = 2
            # return tmp
        # else:
            # tmp.pos = pixel[3].y
            # tmp.pixel = 3
            # return tmp
 cdef double min4f(double a, double b, double c, double d) nogil:
    """
    Smallest of four double precision numbers.

    The candidates are tested in the order a, b, c; d is returned when
    none of the first three compares as lower-or-equal to all the others.
    """
    cdef double smallest = d
    if (a <= b) and (a <= c) and (a <= d):
        smallest = a
    elif (b <= a) and (b <= c) and (b <= d):
        smallest = b
    elif (c <= a) and (c <= b) and (c <= d):
        smallest = c
    return smallest
 cdef double max4f(double a, double b, double c, double d) nogil:
    """
    Largest of four double precision numbers.

    The candidates are tested in the order a, b, c; d is returned when
    none of the first three compares as greater-or-equal to all the others.
    """
    cdef double largest = d
    if (a >= b) and (a >= c) and (a >= d):
        largest = a
    elif (b >= a) and (b >= c) and (b >= d):
        largest = b
    elif (c >= a) and (c >= b) and (c >= d):
        largest = c
    return largest
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
 def fullSplit1D(numpy.ndarray pos not None,
                numpy.ndarray weights not None,
                size_t bins=100,
                pos0Range=None,
                pos1Range=None,
                dummy=None,
                delta_dummy=None,
                mask=None,
                dark=None,
                flat=None,
                solidangle=None,
                polarization=None
              ):
    """
    Calculates histogram of pos weighted by weights.

    Every pixel is described by its 4 corners. A pixel that falls in a single
    bin is counted with a weight of 1; otherwise its intensity is distributed
    over all the bins it overlaps, proportionally to the fraction of the
    pixel area (in bin-number/pos1 coordinates) lying within each bin.

    @param pos: 3D or 4D array with the coordinates of each pixel point,
                reshaped to (npixel, 4 corners, 2 coordinates)
    @param weights: array with intensities
    @param bins: number of output bins (must be > 1)
    @param pos0Range: minimum and maximum of the 2th range
    @param pos1Range: minimum and maximum of the chi range; pixels lying
                      completely outside of this range are discarded
    @param dummy: value for bins without pixels; pixels holding this value are ignored
    @param delta_dummy: precision of dummy value
    @param mask: array (of int8) with masked pixels with 1 (0=not masked)
    @param dark: array (of float64) with dark noise to be subtracted (or None)
    @param flat: array (of float64) with flat image
    @param polarization: array (of float64) with polarization correction
    @param solidangle: array (of float64) with solid angle correction
    @return: bin centers, I (merged), weighted histogram, unweighted histogram
    """
    cdef size_t  size = weights.size
    if pos.ndim>3: #create a view
        pos = pos.reshape((-1,4,2))
    # The rank has to be validated before any of the shape entries is read
    assert pos.ndim == 3
    assert pos.shape[0] == size
    assert pos.shape[1] == 4
    assert pos.shape[2] == 2
    assert  bins > 1

    cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(pos,dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.ndarray[numpy.float64_t, ndim = 1] outMerge = numpy.zeros(bins, dtype=numpy.float64)
    cdef numpy.int8_t[:] cmask
    cdef double[:] cflat, cdark, cpolarization, csolidangle
    cdef double cdummy=0, cddummy=0, data=0
    cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_maxin=0
    cdef double areaPixel=0, oneOverPixelArea=0, partialArea=0, tmp=0, dpos=0
    cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
    cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
    cdef double min0=0, max0=0, min1=0, max1=0
    cdef double bin_lo=0, bin_hi=0
    cdef Function AB, BC, CD, DA
    cdef double epsilon=1e-10
    # Every flag read inside the nogil section is declared as a C boolean
    cdef bint check_pos1=False, check_mask=False, check_dummy=False
    cdef bint do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
    cdef int i=0, idx=0, ibin=0, bin0_max=0, bin0_min=0
    cdef int last_bin = <int> bins - 1

    if pos0Range is not None and len(pos0Range) > 1:
        pos0_min = min(pos0Range)
        pos0_maxin = max(pos0Range)
    else:
        pos0_min = pos[:, :, 0].min()
        pos0_maxin = pos[:, :, 0].max()
    # Slightly enlarge the upper bound so that the largest position falls in the last bin
    pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)

    if pos1Range is not None and len(pos1Range) > 1:
        pos1_min = min(pos1Range)
        pos1_maxin = max(pos1Range)
        # This is the flag tested in the main loop: without it pos1Range is ignored
        check_pos1 = True

    dpos = (pos0_max - pos0_min) / (< double > (bins))
    outPos = numpy.linspace(pos0_min+0.5*dpos, pos0_maxin-0.5*dpos, bins)

    if dummy is not None:
        check_dummy = True
        cdummy = float(dummy)
        if delta_dummy is not None:
            cddummy = float(delta_dummy)

    if mask is not None:
        check_mask = True
        assert mask.size == size
        cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
    if dark is not None:
        do_dark = True
        assert dark.size == size
        cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
    if flat is not None:
        do_flat = True
        assert flat.size == size
        cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
    if polarization is not None:
        do_polarization = True
        assert polarization.size == size
        cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
    if solidangle is not None:
        do_solidangle = True
        assert solidangle.size == size
        csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)

    with nogil:
        for idx in range(size):
            if (check_mask) and (cmask[idx]):
                continue

            data = cdata[idx]
            if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
                continue

            # Corner coordinates: dim0 expressed in (fractional) bin number, dim1 left untouched
            A0 = getBinNr(cpos[idx, 0, 0], pos0_min, dpos)
            A1 = cpos[idx, 0, 1]
            B0 = getBinNr(cpos[idx, 1, 0], pos0_min, dpos)
            B1 = cpos[idx, 1, 1]
            C0 = getBinNr(cpos[idx, 2, 0], pos0_min, dpos)
            C1 = cpos[idx, 2, 1]
            D0 = getBinNr(cpos[idx, 3, 0], pos0_min, dpos)
            D1 = cpos[idx, 3, 1]

            min0 = min(A0, B0, C0, D0)
            max0 = max(A0, B0, C0, D0)
            if (max0<0) or (min0 >=bins):
                # pixel completely outside of the histogram range
                continue
            if check_pos1:
                min1 = min(A1, B1, C1, D1)
                max1 = max(A1, B1, C1, D1)
                if (max1<pos1_min) or (min1 > pos1_maxin):
                    continue

            if do_dark:
                data -= cdark[idx]
            if do_flat:
                data /= cflat[idx]
            if do_polarization:
                data /= cpolarization[idx]
            if do_solidangle:
                data /= csolidangle[idx]

            bin0_min = < int > floor(min0)
            bin0_max = < int > floor(max0)

            if bin0_min == bin0_max:
                # All pixel is within a single bin. As max0 >= 0 and min0 < bins
                # at this point, this bin is guaranteed to be a valid index.
                outCount[bin0_min] += 1
                outData[bin0_min] += data
            else:
                # Pixel splitting
                areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
                if not (areaPixel > 0.0):
                    # Degenerate (zero or NaN area) pixel: dividing by its area
                    # would inject NaN into the histograms.
                    continue
                oneOverPixelArea = 1.0 / areaPixel

                # Bounds checking is disabled: restrict the loop to existing
                # bins when the pixel sticks out of the requested range.
                if bin0_min < 0:
                    bin0_min = 0
                if bin0_max > last_bin:
                    bin0_max = last_bin

                # Equations of the 4 edges. An edge with identical abscissae gets
                # a non-finite slope, but integrate() returns 0 for it since
                # both integration limits are then clipped to the same value.
                AB.slope=(B1-A1)/(B0-A0)
                AB.intersect= A1 - AB.slope*A0
                BC.slope=(C1-B1)/(C0-B0)
                BC.intersect= B1 - BC.slope*B0
                CD.slope=(D1-C1)/(D0-C0)
                CD.intersect= C1 - CD.slope*C0
                DA.slope=(A1-D1)/(A0-D0)
                DA.intersect= D1 - DA.slope*D0

                for ibin in range(bin0_min, bin0_max+1):
                    # Clip the abscissa of every corner to the current bin [ibin, ibin+1]
                    bin_lo = <double> ibin
                    bin_hi = bin_lo + 1.0
                    A_lim = bin_lo if A0 <= bin_lo else (bin_hi if A0 > bin_hi else A0)
                    B_lim = bin_lo if B0 <= bin_lo else (bin_hi if B0 > bin_hi else B0)
                    C_lim = bin_lo if C0 <= bin_lo else (bin_hi if C0 > bin_hi else C0)
                    D_lim = bin_lo if D0 <= bin_lo else (bin_hi if D0 > bin_hi else D0)

                    # Signed area of the part of the pixel enclosed in this bin
                    partialArea  = integrate(A_lim, B_lim, AB)
                    partialArea += integrate(B_lim, C_lim, BC)
                    partialArea += integrate(C_lim, D_lim, CD)
                    partialArea += integrate(D_lim, A_lim, DA)

                    tmp = fabs(partialArea) * oneOverPixelArea
                    outCount[ibin] += tmp
                    outData[ibin] += data * tmp

        for i in range(bins):
            if outCount[i] > epsilon:
                outMerge[i] = outData[i] / outCount[i]
            else:
                outMerge[i] = cdummy

    return  outPos, outMerge, outData, outCount
 #@cython.cdivision(True)
 #@cython.boundscheck(False)
 #@cython.wraparound(False)
 #def fullSplit2D(numpy.ndarray pos not None,
                #numpy.ndarray weights not None,
                #bins not None,
                #pos0Range=None,
                #pos1Range=None,
                #dummy=None,
                #delta_dummy=None,
                #mask=None,
                #dark=None,
                #flat=None,
                #solidangle=None,
                #polarization=None):
    #"""
    #Calculate 2D histogram of pos weighted by weights
    #Splitting is done on the pixel's bounding box like fit2D
    #@param pos: 3D array with pos0; Corner A,B,C,D; tth or chi
    #@param weights: array with intensities
    #@param bins: number of output bins int or 2-tuple of int
    #@param pos0Range: minimum and maximum  of the 2th range
    #@param pos1Range: minimum and maximum  of the chi range
    #@param dummy: value for bins without pixels
    #@param delta_dummy: precision of dummy value
    #@param mask: array (of int8) with masked pixels with 1 (0=not masked)
    #@param dark: array (of float64) with dark noise to be subtracted (or None)
    #@param flat: array (of float64) with flat-field image
    #@param polarization: array (of float64) with polarization correction
    #@param solidangle: array (of float64)with solid angle corrections
    #@return  I, edges0, edges1, weighted histogram(2D), unweighted histogram (2D)
    #"""
    #cdef size_t  bins0=0, bins1=0, size = weights.size
    #if pos.ndim>3: #create a view
        #pos = pos.reshape((-1,4,2))
    #assert pos.shape[0] == size
    #assert pos.shape[1] == 4 # 4 corners
    #assert pos.shape[2] == 2 # tth and chi
    #assert pos.ndim == 3
    #try:
        #bins0, bins1 = tuple(bins)
    #except:
        #bins0 = bins1 = < size_t > bins
    #if bins0 <= 0:
        #bins0 = 1
    #if bins1 <= 0:
        #bins1 = 1
    #cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = pos.astype(numpy.float64)
    #cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = weights.astype(numpy.float64).ravel()
    #cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros((bins0, bins1), dtype=numpy.float64)
    #cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros((bins0, bins1), dtype=numpy.float64)
    #cdef numpy.ndarray[numpy.float64_t, ndim = 2] outMerge = numpy.zeros((bins0, bins1), dtype=numpy.float64)
    #cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges0 = numpy.zeros(bins0, dtype=numpy.float64)
    #cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges1 = numpy.zeros(bins1, dtype=numpy.float64)
    #cdef numpy.int8_t[:] cmask
    #cdef double[:] cflat, cdark, cpolarization, csolidangle
    #cdef bint check_mask=False, do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
    #cdef double cdummy=0, cddummy=0, data=0
    #cdef double min0=0, max0=0, min1=0, max1=0, deltaR=0, deltaL=0, deltaU=0, deltaD=0, deltaA=0
    #cdef double pos0_min=0, pos0_max=0, pos1_min=0, pos1_max=0, pos0_maxin=0, pos1_maxin=0
    #cdef double areaPixel=0, fbin0_min=0, fbin0_max=0, fbin1_min=0, fbin1_max=0
    #cdef double a0=0, a1=0, b0=0, b1=0, c0=0, c1=0, d0=0, d1=0
    #cdef double epsilon = 1e-10
    #cdef size_t bin0_max=0, bin0_min=0, bin1_max=0, bin1_min=0, i=0, j=0, idx=0
    #if pos0Range is not None and len(pos0Range) == 2:
        #pos0_min = min(pos0Range)
        #pos0_maxin = max(pos0Range)
    #else:
        #pos0_min = pos[:, :, 0].min()
        #pos0_maxin = pos[:, :, 0].max()
    #pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
    #if pos1Range is not None and len(pos1Range) > 1:
        #pos1_min = min(pos1Range)
        #pos1_maxin = max(pos1Range)
    #else:
        #pos1_min = pos[:, :, 1].min()
        #pos1_maxin = pos[:, :, 1].max()
    #pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
    #cdef double dpos0 = (pos0_max - pos0_min) / (< double > (bins0))
    #cdef double dpos1 = (pos1_max - pos1_min) / (< double > (bins1))
    #edges0 = numpy.linspace(pos0_min+0.5*dpos0, pos0_maxin-0.5*dpos0, bins0)
    #edges1 = numpy.linspace(pos1_min+0.5*dpos1, pos1_maxin-0.5*dpos1, bins1)
    #if (dummy is not None) and (delta_dummy is not None):
        #check_dummy = True
        #cdummy =  float(dummy)
        #cddummy =  float(delta_dummy)
    #elif (dummy is not None):
        #check_dummy = True
        #cdummy = float(dummy)
        #cddummy = 0.0
    #else:
        #check_dummy = False
        #cdummy = 0.0
        #cddummy = 0.0
    #if mask is not None:
        #check_mask = True
        #assert mask.size == size
        #cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
    #if dark is not None:
        #do_dark = True
        #assert dark.size == size
        #cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
    #if flat is not None:
        #do_flat = True
        #assert flat.size == size
        #cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
    #if polarization is not None:
        #do_polarization = True
        #assert polarization.size == size
        #cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
    #if solidangle is not None:
        #do_solidangle = True
        #assert solidangle.size == size
        #csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
    #with nogil:
        #for idx in range(size):
            #if (check_mask) and (cmask[idx]):
                #continue
            #data = cdata[idx]
            #if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
                #continue
            #a0 =  cpos[idx, 0, 0]
            #a1 =  cpos[idx, 0, 1]
            #b0 =  cpos[idx, 1, 0]
            #b1 =  cpos[idx, 1, 1]
            #c0 =  cpos[idx, 2, 0]
            #c1 =  cpos[idx, 2, 1]
            #d0 =  cpos[idx, 3, 0]
            #d1 =  cpos[idx, 3, 1]
            #min0 = min4f(a0, b0, c0, d0)
            #max0 = max4f(a0, b0, c0, d0)
            #min1 = min4f(a1, b1, c1, d1)
            #max1 = max4f(a1, b1, c1, d1)
            #if (max0<pos0_min) or (min0 > pos0_maxin) or (max1<pos1_min) or (min1 > pos1_maxin):
                    #continue
            #if do_dark:
                #data -= cdark[idx]
            #if do_flat:
                #data /= cflat[idx]
            #if do_polarization:
                #data /= cpolarization[idx]
            #if do_solidangle:
                #data /= csolidangle[idx]
            #if min0 < pos0_min:
                #data = data * (pos0_min - min0) / (max0 - min0)
                #min0 = pos0_min
            #if min1 < pos1_min:
                #data = data * (pos1_min - min1) / (max1 - min1)
                #min1 = pos1_min
            #if max0 > pos0_maxin:
                #data = data * (max0 - pos0_maxin) / (max0 - min0)
                #max0 = pos0_maxin
            #if max1 > pos1_maxin:
                #data = data * (max1 - pos1_maxin) / (max1 - min1)
                #max1 = pos1_maxin
 ###                treat data for pixel on chi discontinuity
            #if ((max1 - min1) / dpos1) > (bins1 / 2.0):
                #if pos1_maxin - max1 > min1 - pos1_min:
                    #min1 = max1
                    #max1 = pos1_maxin
                #else:
                    #max1 = min1
                    #min1 = pos1_min
            #fbin0_min = getBinNr(min0, pos0_min, dpos0)
            #fbin0_max = getBinNr(max0, pos0_min, dpos0)
            #fbin1_min = getBinNr(min1, pos1_min, dpos1)
            #fbin1_max = getBinNr(max1, pos1_min, dpos1)
            #bin0_min = < size_t > fbin0_min
            #bin0_max = < size_t > fbin0_max
            #bin1_min = < size_t > fbin1_min
            #bin1_max = < size_t > fbin1_max
            #if bin0_min == bin0_max:
                #if bin1_min == bin1_max:
                    ##All pixel is within a single bin
                    #outCount[bin0_min, bin1_min] += 1.0
                    #outData[bin0_min, bin1_min] += data
                #else:
                    ##spread on more than 2 bins
                    #areaPixel = fbin1_max - fbin1_min
                    #deltaD = (< double > (bin1_min + 1)) - fbin1_min
                    #deltaU = fbin1_max - (< double > bin1_max)
                    #deltaA = 1.0 / areaPixel
                    #outCount[bin0_min, bin1_min] += deltaA * deltaD
                    #outData[bin0_min, bin1_min] += data * deltaA * deltaD
                    #outCount[bin0_min, bin1_max] += deltaA * deltaU
                    #outData[bin0_min, bin1_max] += data * deltaA * deltaU
 ##                    if bin1_min +1< bin1_max:
                    #for j in range(bin1_min + 1, bin1_max):
                            #outCount[bin0_min, j] += deltaA
                            #outData[bin0_min, j] += data * deltaA
            #else: #spread on more than 2 bins in dim 0
                #if bin1_min == bin1_max:
                    ##All pixel fall on 1 bins in dim 1
                    #areaPixel = fbin0_max - fbin0_min
                    #deltaL = (< double > (bin0_min + 1)) - fbin0_min
                    #deltaA = deltaL / areaPixel
                    #outCount[bin0_min, bin1_min] += deltaA
                    #outData[bin0_min, bin1_min] += data * deltaA
                    #deltaR = fbin0_max - (< double > bin0_max)
                    #deltaA = deltaR / areaPixel
                    #outCount[bin0_max, bin1_min] += deltaA
                    #outData[bin0_max, bin1_min] += data * deltaA
                    #deltaA = 1.0 / areaPixel
                    #for i in range(bin0_min + 1, bin0_max):
                            #outCount[i, bin1_min] += deltaA
                            #outData[i, bin1_min] += data * deltaA
                #else:
                    ##spread on n pix in dim0 and m pixel in dim1:
                    #areaPixel = (fbin0_max - fbin0_min) * (fbin1_max - fbin1_min)
                    #deltaL = (< double > (bin0_min + 1.0)) - fbin0_min
                    #deltaR = fbin0_max - (< double > bin0_max)
                    #deltaD = (< double > (bin1_min + 1.0)) - fbin1_min
                    #deltaU = fbin1_max - (< double > bin1_max)
                    #deltaA = 1.0 / areaPixel
                    #outCount[bin0_min, bin1_min] += deltaA * deltaL * deltaD
                    #outData[bin0_min, bin1_min] += data * deltaA * deltaL * deltaD
                    #outCount[bin0_min, bin1_max] += deltaA * deltaL * deltaU
                    #outData[bin0_min, bin1_max] += data * deltaA * deltaL * deltaU
                    #outCount[bin0_max, bin1_min] += deltaA * deltaR * deltaD
                    #outData[bin0_max, bin1_min] += data * deltaA * deltaR * deltaD
                    #outCount[bin0_max, bin1_max] += deltaA * deltaR * deltaU
                    #outData[bin0_max, bin1_max] += data * deltaA * deltaR * deltaU
                    #for i in range(bin0_min + 1, bin0_max):
                            #outCount[i, bin1_min] += deltaA * deltaD
                            #outData[i, bin1_min] += data * deltaA * deltaD
                            #for j in range(bin1_min + 1, bin1_max):
                                #outCount[i, j] += deltaA
                                #outData[i, j] += data * deltaA
                            #outCount[i, bin1_max] += deltaA * deltaU
                            #outData[i, bin1_max] += data * deltaA * deltaU
                    #for j in range(bin1_min + 1, bin1_max):
                            #outCount[bin0_min, j] += deltaA * deltaL
                            #outData[bin0_min, j] += data * deltaA * deltaL
                            #outCount[bin0_max, j] += deltaA * deltaR
                            #outData[bin0_max, j] += data * deltaA * deltaR
    ##with nogil:
        #for i in range(bins0):
            #for j in range(bins1):
                #if outCount[i, j] > epsilon:
                    #outMerge[i, j] = outData[i, j] / outCount[i, j]
                #else:
                    #outMerge[i, j] = cdummy
    #return outMerge.T, edges0, edges1, outData.T, outCount.T
--- a/src/splitPixelFullLUT.c
+++ b/src/splitPixelFullLUT.c
--- a/src/splitPixelFullLUT.pyx
+++ b/src/splitPixelFullLUT.pyx
@ -0,0 +1,974 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
 #    Project: Azimuthal integration
 #             https://github.com/kif/pyFAI
 #
 #    Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
 #
 #    Principal author:       Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
 #
 #    This program is free software: you can redistribute it and/or modify
 #    it under the terms of the GNU General Public License as published by
 #    the Free Software Foundation, either version 3 of the License, or
 #    (at your option) any later version.
 #
 #    This program is distributed in the hope that it will be useful,
 #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 #    GNU General Public License for more details.
 #
 #    You should have received a copy of the GNU General Public License
 #    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 #
 import cython
 import os, sys
 from cython.parallel import prange
 from libc.string cimport memset
 import numpy
 cimport numpy
 from libc.math cimport fabs, M_PI, floor
 from libc.stdio cimport printf
 # Single precision constants
 cdef float pi = <float> M_PI
 cdef float onef = <float> 1.0
 # Prefer the optional fastcrc implementation, fall back on zlib when it
 # cannot be imported. Only import failures are caught so that any other
 # error (e.g. KeyboardInterrupt) is not silently swallowed.
 try:
    from fastcrc import crc32
 except ImportError:
    from zlib import crc32
 # 1 + machine epsilon of float32
 EPS32 = (1.0 + numpy.finfo(numpy.float32).eps)
 # Straight line y = slope * x + intersect, as consumed by integrate().
 cdef struct Function:
    double slope
    double intersect
 cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
    """
    Unsigned area of the quadrilateral ABCD, obtained from its two diagonals.

    @param a0, a1: coordinates of corner A
    @param b0, b1: coordinates of corner B
    @param c0, c1: coordinates of corner C
    @param d0, d1: coordinates of corner D
    @return: area, i.e. 1/2 * |AC ^ BD| (half the norm of the cross product of the diagonals)
    """
    # Cross product of the diagonal vectors AC and BD
    cdef double cross = ((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0))
    return 0.5 * fabs(cross)
@cython.cdivision(True)
 cdef inline double getBinNr( double x0, double pos0_min, double delta) nogil:
    """
    Convert a position into a fractional bin number (no rounding, no clipping).

    param x0: current position
    param pos0_min: position of the lower edge of the first bin
    param delta: bin width
    return: distance from pos0_min expressed in units of bin width
    """
    cdef double offset = x0 - pos0_min
    return offset / delta
 cdef double integrate( double A0, double B0, Function AB) nogil:
    """
    Analytical integral of the straight line AB between the abscissae A0 and B0.

    param A0: first limit
    param B0: second limit
    param AB: struct with the slope and the intersect of the line
    return: signed integral of slope*x + intersect from A0 to B0 (0.0 when both limits coincide)
    """
    # Coinciding limits: return 0 without evaluating the line, whose slope
    # may not be finite for a vertical edge.
    if A0 == B0:
        return 0.0
    cdef double quadratic = AB.slope*(B0*B0 - A0*A0)*0.5
    cdef double linear = AB.intersect*(B0-A0)
    return quadratic + linear
 class HistoLUT1dFullSplit(object):
    """
    1D regrouping with full pixel splitting, stored as a sparse matrix.

    The look-up table uses the CSR (Compressed Sparse Row) layout with main attributes:
    * nnz: number of stored elements
    * data: coefficients of the matrix in a 1D vector of float32
    * indices: pixel index associated with each coefficient (same size as data)
    * indptr: row pointer indicating the start of a given row (bin), of length bins+1
    Nota: nnz = indptr[-1]
    """
    @cython.boundscheck(False)
    def __init__(self,
                 numpy.ndarray pos not None,
                 int bins=100,
                 pos0Range=None,
                 pos1Range=None,
                 mask=None,
                 mask_checksum=None,
                 allow_pos0_neg=False,
                 unit="undefined"):
        """
        @param pos: 3D or 4D array with the coordinates of each pixel point
        @param bins: number of output bins, 100 by default
        @param pos0Range: minimum and maximum  of the 2th range
        @param pos1Range: minimum and maximum  of the chi range
        @param mask: array (of int8) with masked pixels with 1 (0=not masked)
        @param mask_checksum: precalculated checksum of the mask, computed with crc32 if not provided
        @param allow_pos0_neg: stored on the instance; not used by the look-up table calculation
        @param unit: can be 2th_deg or r_nm^-1 ...
        """
        if pos.ndim > 3:  # create a view with one row of 4 corners per pixel
            pos = pos.reshape((-1, 4, 2))
        # Check the rank first: the shape checks below read dimensions 1 and 2
        assert pos.ndim == 3
        assert pos.shape[1] == 4
        assert pos.shape[2] == 2
        self.pos = pos
        self.size = pos.shape[0]
        self.bins = bins
        self.lut_size = 0
        self.allow_pos0_neg = allow_pos0_neg
        if mask is not None:
            assert mask.size == self.size
            self.check_mask = True
            self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
            if mask_checksum:
                self.mask_checksum = mask_checksum
            else:
                self.mask_checksum = crc32(mask)
        else:
            self.check_mask = False
            self.mask_checksum = None
        self.data = self.nnz = self.indices = self.indptr = None
        self.pos0Range = pos0Range
        self.pos1Range = pos1Range
        self.calc_lut()
        # Centers of the output bins
        self.outPos = numpy.linspace(self.pos0_min + 0.5 * self.delta, self.pos0_maxin - 0.5 * self.delta, self.bins)
        self.lut_checksum = crc32(self.data)
        self.unit = unit
        self.lut = (self.data, self.indices, self.indptr)
        self.lut_nbytes = sum([i.nbytes for i in self.lut])

    @cython.cdivision(True)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def calc_lut(self):
        """
        Build the CSR look-up table in two passes over the pixels:
        1. count how many pixels contribute to each bin, which gives indptr
        2. store, for every (bin, pixel) pair, the fraction of the pixel area
           falling into the bin

        Sets the attributes pos0_min, pos0_maxin, pos0_max, pos1_min, pos1_maxin,
        pos1_max, check_pos1, delta, indptr, nnz, data and indices.
        """
        cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(self.pos, dtype=numpy.float64)
        cdef numpy.int8_t[:] cmask
        cdef numpy.ndarray[numpy.int32_t, ndim = 1] outMax = numpy.zeros(self.bins, dtype=numpy.int32)
        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros(self.bins + 1, dtype=numpy.int32)
        cdef double pos0_min=0, pos1_min=0, pos1_maxin=0
        cdef double max0, min0
        cdef double areaPixel=0, delta=0
        cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
        cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
        cdef double oneOverArea=0, partialArea=0, tmp=0
        cdef double lower=0, upper=0
        cdef Function AB, BC, CD, DA
        cdef int bins, idx=0, bin=0, bin0_max=0, bin0_min=0, k=0, size=0
        cdef bint check_pos1=False, check_mask=False

        bins = self.bins
        if self.pos0Range is not None and len(self.pos0Range) > 1:
            self.pos0_min = min(self.pos0Range)
            self.pos0_maxin = max(self.pos0Range)
        else:
            self.pos0_min = self.pos[:, :, 0].min()
            self.pos0_maxin = self.pos[:, :, 0].max()
        self.pos0_max = self.pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
        if self.pos1Range is not None and len(self.pos1Range) > 1:
            self.pos1_min = min(self.pos1Range)
            self.pos1_maxin = max(self.pos1Range)
            self.check_pos1 = True
        else:
            self.pos1_min = self.pos[:, :, 1].min()
            self.pos1_maxin = self.pos[:, :, 1].max()
            self.check_pos1 = False
        self.pos1_max = self.pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
        self.delta = (self.pos0_max - self.pos0_min) / (<double> (bins))

        # Local C copies of everything read inside the nogil sections
        pos0_min = self.pos0_min
        pos1_min = self.pos1_min
        pos1_maxin = self.pos1_maxin
        check_pos1 = self.check_pos1
        delta = self.delta
        size = self.size
        check_mask = self.check_mask
        if check_mask:
            cmask = self.cmask

        # First pass: count the number of pixels contributing to each bin
        with nogil:
            for idx in range(size):
                if (check_mask) and (cmask[idx]):
                    continue
                A0 = getBinNr(<double> cpos[idx, 0, 0], pos0_min, delta)
                A1 = <double> cpos[idx, 0, 1]
                B0 = getBinNr(<double> cpos[idx, 1, 0], pos0_min, delta)
                B1 = <double> cpos[idx, 1, 1]
                C0 = getBinNr(<double> cpos[idx, 2, 0], pos0_min, delta)
                C1 = <double> cpos[idx, 2, 1]
                D0 = getBinNr(<double> cpos[idx, 3, 0], pos0_min, delta)
                D1 = <double> cpos[idx, 3, 1]
                min0 = min(A0, B0, C0, D0)
                max0 = max(A0, B0, C0, D0)
                if (max0 < 0) or (min0 >= bins):
                    continue
                if check_pos1:
                    if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
                        continue
                bin0_min = <int> floor(min0)
                bin0_max = <int> floor(max0)
                # Bounds checking is disabled: a pixel straddling the limits of
                # the range must be clipped to the valid bins
                if bin0_min < 0:
                    bin0_min = 0
                if bin0_max >= bins:
                    bin0_max = bins - 1
                for bin in range(bin0_min, bin0_max + 1):
                    outMax[bin] += 1

        indptr[1:] = outMax.cumsum()
        self.indptr = indptr
        self.nnz = indptr[bins]

        cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices = numpy.zeros(indptr[bins], dtype=numpy.int32)
        cdef numpy.ndarray[numpy.float32_t, ndim = 1] data = numpy.zeros(indptr[bins], dtype=numpy.float32)

        # just recycle the outMax array: it now counts the entries already written in each bin
        memset(&outMax[0], 0, bins * sizeof(numpy.int32_t))

        # Second pass: same selection of pixels as in the first pass, now storing the coefficients
        with nogil:
            for idx in range(size):
                if (check_mask) and (cmask[idx]):
                    continue
                A0 = getBinNr(<double> cpos[idx, 0, 0], pos0_min, delta)
                A1 = <double> cpos[idx, 0, 1]
                B0 = getBinNr(<double> cpos[idx, 1, 0], pos0_min, delta)
                B1 = <double> cpos[idx, 1, 1]
                C0 = getBinNr(<double> cpos[idx, 2, 0], pos0_min, delta)
                C1 = <double> cpos[idx, 2, 1]
                D0 = getBinNr(<double> cpos[idx, 3, 0], pos0_min, delta)
                D1 = <double> cpos[idx, 3, 1]
                min0 = min(A0, B0, C0, D0)
                max0 = max(A0, B0, C0, D0)
                if (max0 < 0) or (min0 >= bins):
                    continue
                if check_pos1:
                    if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
                        continue
                bin0_min = <int> floor(min0)
                bin0_max = <int> floor(max0)
                if bin0_min == bin0_max:
                    # The whole pixel lies within a single bin. That bin is valid:
                    # max0 >= 0 and min0 < bins were checked above.
                    k = outMax[bin0_min]
                    indices[indptr[bin0_min] + k] = idx
                    data[indptr[bin0_min] + k] = 1.0
                    outMax[bin0_min] += 1
                else:
                    # Pixel splitting: the pixel is spread over several bins.
                    # Clip to the valid bins exactly as in the counting pass.
                    if bin0_min < 0:
                        bin0_min = 0
                    if bin0_max >= bins:
                        bin0_max = bins - 1
                    # Edges of the pixel as straight lines. A vertical edge has
                    # a non-finite slope but is never evaluated: both of its
                    # limits coincide, so integrate() returns 0 for it.
                    AB.slope = (B1 - A1) / (B0 - A0)
                    AB.intersect = A1 - AB.slope * A0
                    BC.slope = (C1 - B1) / (C0 - B0)
                    BC.intersect = B1 - BC.slope * B0
                    CD.slope = (D1 - C1) / (D0 - C0)
                    CD.intersect = C1 - CD.slope * C0
                    DA.slope = (A1 - D1) / (A0 - D0)
                    DA.intersect = D1 - DA.slope * D0
                    areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
                    if areaPixel > 0.0:
                        oneOverArea = 1.0 / areaPixel
                    else:
                        # Degenerate pixel: store null coefficients instead of NaN,
                        # they are skipped by integrate()
                        oneOverArea = 0.0
                    for bin in range(bin0_min, bin0_max + 1):
                        # Abscissa of each corner clipped to the current bin [bin, bin+1]
                        lower = <double> bin
                        upper = lower + 1.0
                        A_lim = max(lower, min(A0, upper))
                        B_lim = max(lower, min(B0, upper))
                        C_lim = max(lower, min(C0, upper))
                        D_lim = max(lower, min(D0, upper))
                        # Signed area of the part of the pixel inside the bin,
                        # obtained by integrating along the four edges
                        partialArea = integrate(A_lim, B_lim, AB)
                        partialArea += integrate(B_lim, C_lim, BC)
                        partialArea += integrate(C_lim, D_lim, CD)
                        partialArea += integrate(D_lim, A_lim, DA)
                        tmp = fabs(partialArea) * oneOverArea
                        k = outMax[bin]
                        indices[indptr[bin] + k] = idx
                        data[indptr[bin] + k] = tmp
                        outMax[bin] += 1
        self.data = data
        self.indices = indices

    @cython.cdivision(True)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
        """
        Actually perform the integration which in this case looks more like a matrix-vector product

        @param weights: input image
        @type weights: ndarray
        @param dummy: value for dead pixels (optional)
        @type dummy: float
        @param delta_dummy: precision for dead-pixel value in dynamic masking
        @type delta_dummy: float
        @param dark: array with the dark-current value to be subtracted (if any)
        @type dark: ndarray
        @param flat: array with the flat-field value to be divided by (if any)
        @type flat: ndarray
        @param solidAngle: array with the solid angle of each pixel to be divided by (if any)
        @type solidAngle: ndarray
        @param polarization: array with the polarization correction values to be divided by (if any)
        @type polarization: ndarray
        @return : positions, pattern, weighted_histogram and unweighted_histogram
        @rtype: 4-tuple of ndarrays
        """
        cdef numpy.int32_t i=0, j=0, idx=0, bins=self.bins, size=self.size
        cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
        cdef float data=0, coef=0, cdummy=0, cddummy=0
        cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(self.bins, dtype=numpy.float64)
        cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
        cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
        cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
        cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
        assert size == weights.size

        if dummy is not None:
            do_dummy = True
            cdummy = <float>float(dummy)
            if delta_dummy is None:
                cddummy = <float>0.0
            else:
                cddummy = <float>float(delta_dummy)
        if flat is not None:
            do_flat = True
            assert flat.size == size
            cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
        if dark is not None:
            do_dark = True
            assert dark.size == size
            cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
        if solidAngle is not None:
            do_solidAngle = True
            assert solidAngle.size == size
            csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
        if polarization is not None:
            do_polarization = True
            assert polarization.size == size
            cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)

        if do_dummy or do_dark or do_flat or do_polarization or do_solidAngle:
            # Pre-process the image: apply the corrections to valid pixels and
            # set all dummy-like values to cdummy, which simplifies further processing
            tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
            cdata = numpy.zeros(size, dtype=numpy.float32)
            for i in prange(size, nogil=True, schedule="static"):
                data = tdata[i]
                if (not do_dummy) or ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
                    # Nota: -= and /= operators are seen as reduction in cython parallel.
                    if do_dark:
                        data = data - cdark[i]
                    if do_flat:
                        data = data / cflat[i]
                    if do_polarization:
                        data = data / cpolarization[i]
                    if do_solidAngle:
                        data = data / csolidAngle[i]
                    cdata[i] += data
                else:
                    cdata[i] += cdummy
        else:
            cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)

        # Sparse matrix-vector product: one row of the CSR matrix per output bin
        for i in prange(bins, nogil=True, schedule="guided"):
            sum_data = 0.0
            sum_count = 0.0
            for j in range(indptr[i], indptr[i + 1]):
                idx = indices[j]
                coef = ccoef[j]
                if coef == 0.0:
                    continue
                data = cdata[idx]
                if do_dummy and data == cdummy:
                    continue
                sum_data = sum_data + coef * data
                sum_count = sum_count + coef
            outData[i] += sum_data
            outCount[i] += sum_count
            if sum_count > epsilon:
                outMerge[i] += sum_data / sum_count
            else:
                # Empty bin: report the dummy value (0 when no dummy was given)
                outMerge[i] += cdummy
        return self.outPos, outMerge, outData, outCount
 ################################################################################
 # Bidimensional regrouping
 ################################################################################
 #class HistoBBox2d(object):
    #@cython.boundscheck(False)
    #def __init__(self,
                    #pos0,
                    #delta_pos0,
                    #pos1,
                    #delta_pos1,
                    #bins=(100,36),
                    #pos0Range=None,
                    #pos1Range=None,
                    #mask=None,
                    #mask_checksum=None,
                    #allow_pos0_neg=False,
                    #unit="undefined",
                    #chiDiscAtPi=True
                    #):
        #"""
        #@param pos0: 1D array with pos0: tth or q_vect
        #@param delta_pos0: 1D array with delta pos0: max center-corner distance
        #@param pos1: 1D array with pos1: chi
        #@param delta_pos1: 1D array with max pos1: max center-corner distance, unused !
        #@param bins: number of output bins (tth=100, chi=36 by default)
        #@param pos0Range: minimum and maximum  of the 2th range
        #@param pos1Range: minimum and maximum  of the chi range
        #@param mask: array (of int8) with masked pixels with 1 (0=not masked)
        #@param allow_pos0_neg: enforce the q<0 is usually not possible  
        #@param chiDiscAtPi: boolean; by default the chi_range is in the range ]-pi,pi[ set to 0 to have the range ]0,2pi[
        #"""
        #cdef int i, size, bin0, bin1
        #self.size = pos0.size
        #assert delta_pos0.size == self.size
        #assert pos1.size == self.size
        #assert delta_pos1.size == self.size
        #self.chiDiscAtPi = 1 if chiDiscAtPi else 0
        #self.allow_pos0_neg =  allow_pos0_neg
        #try:
            #bins0, bins1 = tuple(bins)
        #except:
            #bins0 = bins1 = bins
        #if bins0 <= 0:
            #bins0 = 1
        #if bins1 <= 0:
            #bins1 = 1
        #self.bins = (int(bins0),int(bins1))
        #self.lut_size = 0
        #if  mask is not None:
            #assert mask.size == self.size
            #self.check_mask = True
            #self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
            #if mask_checksum:
                #self.mask_checksum = mask_checksum
            #else:
                #self.mask_checksum = crc32(mask)
        #else:
            #self.check_mask = False
            #self.mask_checksum = None
        #self.data = self.nnz = self.indices = self.indptr = None
        #self.cpos0 = numpy.ascontiguousarray(pos0.ravel(), dtype=numpy.float32)
        #self.dpos0 = numpy.ascontiguousarray(delta_pos0.ravel(), dtype=numpy.float32)
        #self.cpos0_sup = numpy.empty_like(self.cpos0)
        #self.cpos0_inf = numpy.empty_like(self.cpos0)
        #self.pos0Range = pos0Range
        #self.pos1Range = pos1Range
        #self.cpos1 = numpy.ascontiguousarray((pos1).ravel(), dtype=numpy.float32)
        #self.dpos1 = numpy.ascontiguousarray((delta_pos1).ravel(), dtype=numpy.float32)
        #self.cpos1_sup = numpy.empty_like(self.cpos1)
        #self.cpos1_inf = numpy.empty_like(self.cpos1)
        #self.calc_boundaries(pos0Range, pos1Range)
        #self.delta0 = (self.pos0_max - self.pos0_min) / float(bins0)
        #self.delta1 = (self.pos1_max - self.pos1_min) / float(bins1)
        #self.lut_max_idx = self.calc_lut()
        #self.outPos0 = numpy.linspace(self.pos0_min+0.5*self.delta0, self.pos0_maxin-0.5*self.delta0, bins0)
        #self.outPos1 = numpy.linspace(self.pos1_min+0.5*self.delta1, self.pos1_maxin-0.5*self.delta1, bins1)
        #self.unit=unit
        #self.lut=(self.data,self.indices,self.indptr)
        #self.lut_checksum = crc32(self.data)
    #@cython.boundscheck(False)
    #@cython.wraparound(False)
    #def calc_boundaries(self, pos0Range, pos1Range):
        #cdef int size = self.cpos0.size
        #cdef bint check_mask = self.check_mask
        #cdef numpy.int8_t[:] cmask
        #cdef float[:] cpos0, dpos0, cpos0_sup, cpos0_inf
        #cdef float[:] cpos1, dpos1, cpos1_sup, cpos1_inf,
        #cdef float upper0, lower0, pos0_max, pos0_min, c0, d0
        #cdef float upper1, lower1, pos1_max, pos1_min, c1, d1
        #cdef bint allow_pos0_neg=self.allow_pos0_neg
        #cdef bint chiDiscAtPi = self.chiDiscAtPi
        #cpos0_sup = self.cpos0_sup
        #cpos0_inf = self.cpos0_inf
        #cpos0 = self.cpos0
        #dpos0 = self.dpos0
        #cpos1_sup = self.cpos1_sup
        #cpos1_inf = self.cpos1_inf
        #cpos1 = self.cpos1
        #dpos1 = self.dpos1
        #pos0_min=cpos0[0]
        #pos0_max=cpos0[0]
        #pos1_min=cpos1[0]
        #pos1_max=cpos1[0]
        #if check_mask:
            #cmask = self.cmask
        #with nogil:
            #for idx in range(size):
                #c0 = cpos0[idx]
                #d0 = dpos0[idx]
                #lower0 = c0 - d0
                #upper0 = c0 + d0
                #c1 = cpos1[idx]
                #d1 = dpos1[idx]
                #lower1 = c1 - d1
                #upper1 = c1 + d1
                #if not allow_pos0_neg and lower0<0:
                    #lower0=0
                #if upper1 > (2-chiDiscAtPi)*pi:
                    #upper1 = (2-chiDiscAtPi)*pi
                #if lower1 < (-chiDiscAtPi)*pi:
                    #lower1 = (-chiDiscAtPi)*pi
                #cpos0_sup[idx] = upper0
                #cpos0_inf[idx] = lower0
                #cpos1_sup[idx] = upper1
                #cpos1_inf[idx] = lower1
                #if not (check_mask and cmask[idx]):
                    #if upper0>pos0_max:
                        #pos0_max = upper0
                    #if lower0<pos0_min:
                        #pos0_min = lower0
                    #if upper1>pos1_max:
                        #pos1_max = upper1
                    #if lower1<pos1_min:
                        #pos1_min = lower1
        #if pos0Range is not None and len(pos0Range) > 1:
            #self.pos0_min = min(pos0Range)
            #self.pos0_maxin = max(pos0Range)
        #else:
            #self.pos0_min = pos0_min
            #self.pos0_maxin = pos0_max
        #if pos1Range is not None and len(pos1Range) > 1:
            #self.pos1_min = min(pos1Range)
            #self.pos1_maxin = max(pos1Range)
        #else:
            #self.pos1_min = pos1_min
            #self.pos1_maxin = pos1_max
        #if (not allow_pos0_neg) and self.pos0_min < 0:
            #self.pos0_min = 0
        #self.pos0_max = self.pos0_maxin * EPS32
        #self.cpos0_sup = cpos0_sup
        #self.cpos0_inf = cpos0_inf
        #self.pos1_max = self.pos1_maxin * EPS32
        #self.cpos1_sup = cpos1_sup
        #self.cpos1_inf = cpos1_inf
    #@cython.boundscheck(False)
    #@cython.wraparound(False)
    #@cython.cdivision(True)
    #def calc_lut(self):
        #'calculate the max number of elements in the LUT and populate it'
        #cdef float delta0=self.delta0, pos0_min=self.pos0_min, min0, max0, fbin0_min, fbin0_max
        #cdef float delta1=self.delta1, pos1_min=self.pos1_min, min1, max1, fbin1_min, fbin1_max
        #cdef int bin0_min, bin0_max, bins0 = self.bins[0]
        #cdef int bin1_min, bin1_max, bins1 = self.bins[1]
        #cdef numpy.int32_t k, idx, lut_size, i, j, size=self.size
        #cdef bint check_mask
        #cdef float[:] cpos0_sup = self.cpos0_sup
        #cdef float[:] cpos0_inf = self.cpos0_inf
        #cdef float[:] cpos1_inf = self.cpos1_inf
        #cdef float[:] cpos1_sup = self.cpos1_sup
        #cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
        #cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros((bins0*bins1)+1, dtype=numpy.int32)
        #cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices 
        #cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
        #cdef numpy.int8_t[:] cmask
        #if self.check_mask:
            #cmask = self.cmask
            #check_mask = True
        #else:
            #check_mask = False
    ##NOGIL
        #with nogil:
            #for idx in range(size):
                #if (check_mask) and (cmask[idx]):
                    #continue
                #min0 = cpos0_inf[idx]
                #max0 = cpos0_sup[idx]
                #min1 = cpos1_inf[idx]
                #max1 = cpos1_sup[idx]
                #bin0_min = < int > getBinNr(min0, pos0_min, delta0)
                #bin0_max = < int > getBinNr(max0, pos0_min, delta0)
                #bin1_min = < int > getBinNr(min1, pos1_min, delta1)
                #bin1_max = < int > getBinNr(max1, pos1_min, delta1)
                #if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
                    #continue
                #if bin0_max >= bins0 :
                    #bin0_max = bins0 - 1
                #if  bin0_min < 0:
                    #bin0_min = 0
                #if bin1_max >= bins1 :
                    #bin1_max = bins1 - 1
                #if  bin1_min < 0:
                    #bin1_min = 0
                #for i in range(bin0_min, bin0_max+1):
                    #for j in range(bin1_min , bin1_max+1):
                        #outMax[i, j] +=  1
        #self.nnz = outMax.sum()
        #indptr[1:] = outMax.cumsum()
        #self.indptr = indptr
 ##        self.lut_size = lut_size = outMax.max()
        ##just recycle the outMax array
        ##outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
        #memset(&outMax[0,0], 0, bins0*bins1*sizeof(numpy.int32_t))
        #lut_nbytes = self.nnz * (sizeof(numpy.float32_t)+sizeof(numpy.int32_t)) + bins0*bins1*sizeof(numpy.int32_t)
        #if (os.name == "posix") and ("SC_PAGE_SIZE" in os.sysconf_names) and ("SC_PHYS_PAGES" in os.sysconf_names):
            #memsize =  os.sysconf("SC_PAGE_SIZE")*os.sysconf("SC_PHYS_PAGES")
            #if memsize <  lut_nbytes:
                #raise MemoryError("CSR Matrix is %.3fGB whereas the memory of the system is only %s"%(lut_nbytes, memsize))
        ##else hope we have enough memory
        #data = numpy.zeros(self.nnz,dtype=numpy.float32)
        #indices = numpy.zeros(self.nnz,dtype=numpy.int32)
 ##        lut = numpy.recarray(shape=(bins0, bins1, lut_size),dtype=[("idx",numpy.int32),("coef",numpy.float32)])
 ##        memset(&lut[0,0,0], 0, lut_nbytes)
        #with nogil:
            #for idx in range(size):
                #if (check_mask) and cmask[idx]:
                    #continue
                #min0 = cpos0_inf[idx]
                #max0 = cpos0_sup[idx]
                #min1 = cpos1_inf[idx]
                #max1 = cpos1_sup[idx]
                #fbin0_min = getBinNr(min0, pos0_min, delta0)
                #fbin0_max = getBinNr(max0, pos0_min, delta0)
                #fbin1_min = getBinNr(min1, pos1_min, delta1)
                #fbin1_max = getBinNr(max1, pos1_min, delta1)
                #bin0_min = <int> fbin0_min
                #bin0_max = <int> fbin0_max
                #bin1_min = <int> fbin1_min
                #bin1_max = <int> fbin1_max
                #if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
                    #continue
                #if bin0_max >= bins0 :
                    #bin0_max = bins0 - 1
                #if  bin0_min < 0:
                    #bin0_min = 0
                #if bin1_max >= bins1 :
                    #bin1_max = bins1 - 1
                #if  bin1_min < 0:
                    #bin1_min = 0
                #if bin0_min == bin0_max:
                    #if bin1_min == bin1_max:
                        ##All pixel is within a single bin
                        #k = outMax[bin0_min, bin1_min]
                        #indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_min]+k] = onef
 ##                        lut[bin0_min, bin1_min, k].idx = idx
 ##                        lut[bin0_min, bin1_min, k].coef = onef
                        #outMax[bin0_min, bin1_min]= k+1
                    #else:
                        ##spread on more than 2 bins
                        #deltaD = (< float > (bin1_min + 1)) - fbin1_min
                        #deltaU = fbin1_max - ( bin1_max)
                        #deltaA = 1.0 / (fbin1_max - fbin1_min)
                        #k = outMax[bin0_min, bin1_min]
                        #indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaD
 ##                        lut[bin0_min, bin1_min, k].idx = idx
 ##                        lut[bin0_min, bin1_min, k].coef =  deltaA * deltaD
                        #outMax[bin0_min, bin1_min] = k + 1
                        #k = outMax[bin0_min, bin1_max]
                        #indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaU
 ##                        lut[bin0_min, bin1_max, k].idx = idx
 ##                        lut[bin0_min, bin1_max, k].coef =  deltaA * deltaU
                        #outMax[bin0_min, bin1_max] = k + 1
                        #for j in range(bin1_min + 1, bin1_max):
                            #k = outMax[bin0_min, j]
                            #indices[indptr[bin0_min*bins1+j]+k] = idx
                            #data[indptr[bin0_min*bins1+j]+k] = deltaA
 ##                            lut[bin0_min, j, k].idx = idx
 ##                            lut[bin0_min, j, k].coef =  deltaA
                            #outMax[bin0_min, j] = k + 1
                #else: #spread on more than 2 bins in dim 0
                    #if bin1_min == bin1_max:
                        ##All pixel fall on 1 bins in dim 1
                        #deltaA = 1.0 / (fbin0_max - fbin0_min)
                        #deltaL = (< float > (bin0_min + 1)) - fbin0_min
                        #k = outMax[bin0_min, bin1_min]
                        #indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL
 ##                        lut[bin0_min, bin1_min, k].idx = idx
 ##                        lut[bin0_min, bin1_min, k].coef =  deltaA * deltaL
                        #outMax[bin0_min, bin1_min] = k+1
                        #deltaR = fbin0_max - (< float > bin0_max)
                        #k = outMax[bin0_max, bin1_min]
                        #indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR
 ##                        lut[bin0_max, bin1_min, k].idx = idx
 ##                        lut[bin0_max, bin1_min, k].coef =  deltaA * deltaR
                        #outMax[bin0_max, bin1_min] = k + 1
                        #for i in range(bin0_min + 1, bin0_max):
                            #k = outMax[i, bin1_min]
                            #indices[indptr[i*bins1+bin1_min]+k] = idx
                            #data[indptr[i*bins1+bin1_min]+k] = deltaA
 ##                            lut[i, bin1_min ,k].idx = idx
 ##                            lut[i, bin1_min, k].coef =  deltaA
                            #outMax[i, bin1_min] = k + 1
                    #else:
                        ##spread on n pix in dim0 and m pixel in dim1:
                        #deltaL = (< float > (bin0_min + 1)) - fbin0_min
                        #deltaR = fbin0_max - (< float > bin0_max)
                        #deltaD = (< float > (bin1_min + 1)) - fbin1_min
                        #deltaU = fbin1_max - (< float > bin1_max)
                        #deltaA = 1.0 / ((fbin0_max - fbin0_min) * (fbin1_max - fbin1_min))
                        #k = outMax[bin0_min, bin1_min]
                        #indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL * deltaD
 ##                        lut[bin0_min, bin1_min ,k].idx = idx
 ##                        lut[bin0_min, bin1_min, k].coef =  deltaA * deltaL * deltaD
                        #outMax[bin0_min, bin1_min] = k + 1
                        #k = outMax[bin0_min, bin1_max]
                        #indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
                        #data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaL * deltaU
 ##                        lut[bin0_min, bin1_max, k].idx = idx
 ##                        lut[bin0_min, bin1_max, k].coef =  deltaA * deltaL * deltaU
                        #outMax[bin0_min, bin1_max] = k + 1
                        #k = outMax[bin0_max, bin1_min]
                        #indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
                        #data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR * deltaD
 ##                        lut[bin0_max, bin1_min, k].idx = idx
 ##                        lut[bin0_max, bin1_min, k].coef =  deltaA * deltaR * deltaD
                        #outMax[bin0_max, bin1_min] = k + 1
                        #k = outMax[bin0_max, bin1_max]
                        #indices[indptr[bin0_max*bins1+bin1_max]+k] = idx
                        #data[indptr[bin0_max*bins1+bin1_max]+k] = deltaA * deltaR * deltaU
 ##                        lut[bin0_max, bin1_max, k].idx = idx
 ##                        lut[bin0_max, bin1_max, k].coef =  deltaA * deltaR * deltaU
                        #outMax[bin0_max, bin1_max] = k + 1
                        #for i in range(bin0_min + 1, bin0_max):
                            #k = outMax[i, bin1_min]
                            #indices[indptr[i*bins1+bin1_min]+k] = idx
                            #data[indptr[i*bins1+bin1_min]+k] = deltaA * deltaD
 ##                            lut[i, bin1_min, k].idx = idx
 ##                            lut[i, bin1_min, k].coef =  deltaA * deltaD
                            #outMax[i, bin1_min] = k + 1
                            #for j in range(bin1_min + 1, bin1_max):
                                #k = outMax[i, j]
                                #indices[indptr[i*bins1+j]+k] = idx
                                #data[indptr[i*bins1+j]+k] = deltaA
 ##                                lut[i, j, k].idx = idx
 ##                                lut[i, j, k].coef =  deltaA
                                #outMax[i, j] = k + 1
                            #k = outMax[i, bin1_max]
                            #indices[indptr[i*bins1+bin1_max]+k] = idx
                            #data[indptr[i*bins1+bin1_max]+k] = deltaA * deltaU
 ##                            lut[i, bin1_max, k].idx = idx
 ##                            lut[i, bin1_max, k].coef =  deltaA * deltaU
                            #outMax[i, bin1_max] = k + 1
                        #for j in range(bin1_min + 1, bin1_max):
                            #k = outMax[bin0_min, j]
                            #indices[indptr[bin0_min*bins1+j]+k] = idx
                            #data[indptr[bin0_min*bins1+j]+k] = deltaA * deltaL
 ##                            lut[bin0_min, j, k].idx = idx
 ##                            lut[bin0_min, j, k].coef =  deltaA * deltaL
                            #outMax[bin0_min, j] = k + 1
                            #k = outMax[bin0_max, j]
                            #indices[indptr[bin0_max*bins1+j]+k] = idx
                            #data[indptr[bin0_max*bins1+j]+k] = deltaA * deltaR
 ##                            lut[bin0_max, j, k].idx = idx
 ##                            lut[bin0_max, j, k].coef =  deltaA * deltaR
                            #outMax[bin0_max, j] = k + 1
        #self.data = data
        #self.indices = indices
        #return outMax
    #@cython.cdivision(True)
    #@cython.boundscheck(False)
    #@cython.wraparound(False)
    #def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
        #"""
        #Actually perform the 2D integration which in this case looks more like a matrix-vector product
        #@param weights: input image
        #@type weights: ndarray
        #@param dummy: value for dead pixels (optional)
        #@type dummy: float
        #@param delta_dummy: precision for dead-pixel value in dynamic masking
        #@type delta_dummy: float
        #@param dark: array with the dark-current value to be subtracted (if any)
        #@type dark: ndarray
        #@param flat: array with the flat-field value to be divided by (if any)
        #@type flat: ndarray
        #@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
        #@type solidAngle: ndarray
        #@param polarization: array with the polarization correction values to be divided by (if any)
        #@type polarization: ndarray
        #@return:  I(2d), edges0(1d), edges1(1d), weighted histogram(2d), unweighted histogram (2d)
        #@rtype: 5-tuple of ndarrays
        #"""
        #cdef int i=0, j=0, idx=0, bins0=self.bins[0], bins1=self.bins[1], bins=bins0*bins1, size=self.size
        #cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
        #cdef float data=0, coef=0, cdummy=0, cddummy=0
        #cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
        #cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros(self.bins, dtype=numpy.float64)
        #cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
        #cdef numpy.ndarray[numpy.float32_t, ndim = 2] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
        #cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData_1d = outData.ravel()
        #cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount_1d = outCount.ravel()
        #cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge_1d = outMerge.ravel()
        #cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
        #cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
        #assert size == weights.size
        #if dummy is not None:
            #do_dummy = True
            #cdummy =  <float>float(dummy)
            #if delta_dummy is None:
                #cddummy = <float>0.0
            #else:
                #cddummy = <float>float(delta_dummy)
        #if flat is not None:
            #do_flat = True
            #assert flat.size == size
            #cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
        #if dark is not None:
            #do_dark = True
            #assert dark.size == size
            #cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
        #if solidAngle is not None:
            #do_solidAngle = True
            #assert solidAngle.size == size
            #csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
        #if polarization is not None:
            #do_polarization = True
            #assert polarization.size == size
            #cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
        #if (do_dark + do_flat + do_polarization + do_solidAngle):
            #tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
            #cdata = numpy.zeros(size,dtype=numpy.float32)
            #if do_dummy:
                #for i in prange(size, nogil=True, schedule="static"):
                    #data = tdata[i]
                    #if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
                        ##Nota: -= and /= operators are seen as reductions in cython parallel.
                        #if do_dark:
                            #data = data - cdark[i]
                        #if do_flat:
                            #data = data / cflat[i]
                        #if do_polarization:
                            #data = data / cpolarization[i]
                        #if do_solidAngle:
                            #data = data / csolidAngle[i]
                        #cdata[i]+=data
                    #else: #set all dummy_like values to cdummy. simplifies further processing
                        #cdata[i]+=cdummy
            #else:
                #for i in prange(size, nogil=True, schedule="static"):
                    #data = tdata[i]
                    #if do_dark:
                        #data = data - cdark[i]
                    #if do_flat:
                        #data = data / cflat[i]
                    #if do_polarization:
                        #data = data / cpolarization[i]
                    #if do_solidAngle:
                        #data = data / csolidAngle[i]
                    #cdata[i]+=data
        #else:
            #if do_dummy:
                #tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
                #cdata = numpy.zeros(size,dtype=numpy.float32)
                #for i in prange(size, nogil=True, schedule="static"):
                    #data = tdata[i]
                    #if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
                        #cdata[i]+=data
                    #else:
                        #cdata[i]+=cdummy
            #else:
                #cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
        #for i in prange(bins, nogil=True, schedule="guided"):
            #sum_data = 0.0
            #sum_count = 0.0
            #for j in range(indptr[i],indptr[i+1]):
                #idx = indices[j]
                #coef = ccoef[j]
                #data = cdata[idx]
                #if do_dummy and data==cdummy:
                    #continue
                #sum_data = sum_data + coef * data
                #sum_count = sum_count + coef
            #outData_1d[i] += sum_data
            #outCount_1d[i] += sum_count
            #if sum_count > epsilon:
                #outMerge_1d[i] += sum_data / sum_count
            #else:
                #outMerge_1d[i] += cdummy
        #return  outMerge.T, self.outPos0, self.outPos1, outData.T, outCount.T
--- a/test/profile_csr_2d.py
+++ b/test/profile_csr_2d.py
@ -25,7 +25,7 @@ data = fabio.open("testimages/Pilatus1M.edf").data
 ref = ai.integrate2d(data, 100, 360, method="lut", unit="2th_deg")[0]
 obt = ai.integrate2d(data, 100, 360, method="ocl_csr", unit="2th_deg")[0]
 ##logger.debug("check LUT basics: %s"%abs(obt[1] - ref[1]).max())
-#assert numpy.allclose(ref,obt)
+assert numpy.allclose(ref,obt)
 plot(ref.ravel(), label="ocl_lut")
--- a/test/profile_csr_fullsplit.py
+++ b/test/profile_csr_fullsplit.py
@ -0,0 +1,42 @@
 # -*- coding: utf-8 -*-
 """
 Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
 """
 import sys, numpy, time
 import utilstest
 import fabio, pyopencl
 from pylab import *
 # Profiling script: plots three 1D integrations of the same Pilatus1M image
 # (ai.xrpd_LUT, HistoLUT1dFullSplit and OCL_CSR_Integrator) on one figure.
 # A parenthesised single-argument print behaves identically on Python 2 and 3.
 print("#" * 50)
 # The module object is fetched from sys.modules, so pyFAI must already have
 # been imported at this point (presumably by utilstest -- TODO confirm).
 pyFAI = sys.modules["pyFAI"]
 from pyFAI import splitPixelFullLUT
 from pyFAI import ocl_azim_csr
 #from pyFAI import splitBBoxLUT
 #from pyFAI import splitBBoxCSR
 #logger = utilstest.getLogger("profile")
 ai = pyFAI.load("testimages/Pilatus1M.poni")
 data = fabio.open("testimages/Pilatus1M.edf").data
 # Reference curve. NOTE(review): it is plotted below with the label "ocl_csr"
 # although it comes from xrpd_LUT -- confirm which one is intended.
 ref = ai.xrpd_LUT(data, 1000)[1]
 #obt = ai.xrpd_LUT_OCL(data, 1000)[1]
 #ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
 pos = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
 # Full pixel-splitting integrator; its look-up table (foo.lut) is reused by
 # the OpenCL integrator below.
 foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
 boo = foo.integrate(data)[1]
 foo2 = ocl_azim_csr.OCL_CSR_Integrator(foo.lut, data.size, "GPU", block_size=32)
 boo2 = foo2.integrate(data)[0]
 plot(ref, label="ocl_csr")
 plot(boo, label="csr_fullsplit")
 plot(boo2, label="ocl_csr_fullsplit")
 legend()
 show()
 # Wait for <Enter> before exiting; raw_input() was renamed input() in Python 3.
 try:
    wait_for_enter = raw_input
 except NameError:
    wait_for_enter = input
 wait_for_enter()
--- a/test/profile_ocl_hist_pixelsplit.py
+++ b/test/profile_ocl_hist_pixelsplit.py
@ -0,0 +1,44 @@
 # -*- coding: utf-8 -*-
 """
 Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
 """
 import sys, numpy, time
 import utilstest
 import fabio, pyopencl
 from pylab import *
 # Profiling script: runs the OpenCL histogram pixel-splitting integrator on a
 # Pilatus1M image and plots its result.
 # A parenthesised single-argument print behaves identically on Python 2 and 3.
 print("#" * 50)
 # The module object is fetched from sys.modules, so pyFAI must already have
 # been imported at this point (presumably by utilstest -- TODO confirm).
 pyFAI = sys.modules["pyFAI"]
 from pyFAI import splitPixelFullLUT
 from pyFAI import ocl_hist_pixelsplit
 #from pyFAI import splitBBoxLUT
 #from pyFAI import splitBBoxCSR
 #logger = utilstest.getLogger("profile")
 ai = pyFAI.load("testimages/Pilatus1M.poni")
 data = fabio.open("testimages/Pilatus1M.edf").data
 ref = ai.xrpd_LUT(data, 1000)[1]
 #obt = ai.xrpd_LUT_OCL(data, 1000)[1]
 #ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
 pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
 # Floor division keeps the leading dimension an integer under true division
 # as well (Python 3 / "from __future__ import division").
 # Presumably 4 corners x 2 coordinates per pixel -- TODO confirm.
 pos = pos_in.reshape(pos_in.size // 8, 4, 2)
 foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
 boo = foo.integrate(data)[1]
 foo2 = ocl_hist_pixelsplit.OCL_Hist_Pixelsplit(pos, 1000, data.size, devicetype="cpu", block_size=32)
 boo2 = foo2.integrate(data)[2]
 #plot(ref, label="ocl_csr")
 #plot(boo, label="csr_fullsplit")
 # NOTE(review): the label says "ocl_csr_fullsplit" although the curve comes
 # from OCL_Hist_Pixelsplit -- confirm the intended legend text.
 plot(boo2, label="ocl_csr_fullsplit")
 legend()
 show()
 # Wait for <Enter> before exiting; raw_input() was renamed input() in Python 3.
 try:
    wait_for_enter = raw_input
 except NameError:
    wait_for_enter = input
 wait_for_enter()
--- a/test/profile_ocl_hist_pixelsplit2.py
+++ b/test/profile_ocl_hist_pixelsplit2.py
@ -0,0 +1,80 @@
 # -*- coding: utf-8 -*-
 """
 Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
 """
 import sys, numpy, time
 import utilstest
 import fabio
 import pyopencl as cl
 from pylab import *
 # Profiling script: runs the reduce1/reduce2 kernels of ocl_hist_pixelsplit.cl
 # on the pixel-corner positions and prints the device result next to the
 # min/max values computed on the host with numpy.
 # A parenthesised single-argument print behaves identically on Python 2 and 3.
 print("#" * 50)
 # The module object is fetched from sys.modules, so pyFAI must already have
 # been imported at this point (presumably by utilstest -- TODO confirm).
 pyFAI = sys.modules["pyFAI"]
 from pyFAI import splitPixelFullLUT
 from pyFAI import ocl_hist_pixelsplit
 # "cl.array" used below is a submodule of pyopencl: import it explicitly
 # rather than relying on another module having loaded it as a side effect.
 import pyopencl.array
 #from pyFAI import splitBBoxLUT
 #from pyFAI import splitBBoxCSR
 #logger = utilstest.getLogger("profile")
 ai = pyFAI.load("testimages/Pilatus1M.poni")
 data = fabio.open("testimages/Pilatus1M.edf").data
 workgroup_size = 256
 bins = 1000
 pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
 # Floor division keeps the sizes integral under true division as well
 # (Python 3 / "from __future__ import division").
 # Presumably 4 corners x 2 coordinates per pixel -- TODO confirm.
 pos = pos_in.reshape(pos_in.size // 8, 4, 2)
 pos_size = pos.size
 size = pos_size // 8
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 mf = cl.mem_flags
 d_input = cl.array.to_device(queue, pos)
 # Intermediate buffer of 4*4 bytes per work-group (presumably four float32
 # partial results each -- TODO confirm against the kernel).
 d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4 * workgroup_size)
 # Final buffer: 16 bytes, read back below into four float32 values.
 d_result = cl.Buffer(ctx, mf.READ_WRITE, 4 * 4)
 with open("../openCL/ocl_hist_pixelsplit.cl", "r") as kernelFile:
    kernel_src = kernelFile.read()
 # EPS is written with %e: "%f" keeps only six decimals and would render the
 # float32 machine epsilon (~1.19e-07) as 0.000000.
 compile_options = "-D BINS=%i  -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%e" % (
                bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)
 program = cl.Program(ctx, kernel_src).build(options=compile_options)
 # First pass: workgroup_size work-groups of workgroup_size items each.
 program.reduce1(queue, (workgroup_size * workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(pos_size), d_preresult)
 # Second pass: a single work-group processes the first-pass output.
 program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
 result = numpy.empty(4, dtype=numpy.float32)
 cl.enqueue_copy(queue, result, d_result)
 # Host-side reference values for comparison with the device result.
 min0 = pos[:, :, 0].min()
 max0 = pos[:, :, 0].max()
 min1 = pos[:, :, 1].min()
 max1 = pos[:, :, 1].max()
 minmax = (min0, max0, min1, max1)
 print(minmax)
 print(result)
 #plot(ref, label="ocl_csr")
 #plot(boo, label="csr_fullsplit")
 #plot(boo2, label="ocl_csr_fullsplit")
 #legend()
 #show()
 #raw_input()