Merge remote branch 'giannis/master'

Conflicts:
	doc/source/pyFAI.rst
	openCL/ocl_azim_CSR.cl
	pyFAI-src/ocl_azim_csr_dis.py
	src/morphology.c
Jerome Kieffer 2014-06-16 14:13:16 +02:00
commit 5c914a008e
23 changed files with 52761 additions and 66 deletions

View File

@@ -6,6 +6,7 @@ from __future__ import print_function, division
import json, sys, time, timeit, os, platform, subprocess
import numpy
from numpy import log2
import fabio
import os.path as op
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
@@ -58,6 +59,8 @@ class Bench(object):
self.repeat = repeat
self.nbr = nbr
self.results = {}
self.flops = {}
self.mem_band = {}
self.meth = []
self._cpu = None
self.fig = None
@@ -172,8 +175,10 @@ data = fabio.open(r"%s").data
print("Working on processor: %s" % self.get_cpu())
label = "1D_" + self.LABELS[method]
results = {}
flops = {}
mem_band = {}
first = True
param = "Frelon2k.poni"
param = "Pilatus1M.poni"
block_size_list = [1,2,4,8,16,32,64,128,256]
for block_size in block_size_list:
self.update_mp()
@@ -190,6 +195,14 @@ data = fabio.open(r"%s").data
if check:
if "csr" in method:
print("csr: size= %s \t nbytes %.3f MB " % (ai._csr_integrator.data.size, ai._csr_integrator.lut_nbytes / 2 ** 20))
bins = ai._csr_integrator.bins
nnz = ai._csr_integrator.nnz
parallel_reduction = sum([2**i for i in range(1,int(log2(block_size)))])
FLOPs = 9*nnz + 11*parallel_reduction + 1*bins
mem_access = (2*block_size*bins + 5*nnz + 7*bins)*4
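# the estimates above follow a rough cost model: ~9 FLOPs per non-zero
# element (two Kahan-compensated accumulations), ~11 FLOPs per element
# combined in the parallel reduction, one division per bin; memory
# traffic is counted in bytes assuming 4-byte (float32/int32) accesses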
del ai, data
self.update_mp()
@@ -216,9 +229,13 @@ data = fabio.open(r"%s").data
self.update_mp()
if R < self.LIMIT:
results[block_size] = tmin
flops[block_size] = (FLOPs/tmin)*1e-6
mem_band[block_size] = (mem_access/tmin)*1e-6
self.update_mp()
else:
results[block_size] = tmin
flops[block_size] = (FLOPs/tmin)*1e-6
mem_band[block_size] = (mem_access/tmin)*1e-6
if first:
self.new_curve(results, label)
first = False
@@ -227,6 +244,8 @@ data = fabio.open(r"%s").data
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.flops[label] = flops
self.mem_band[label] = mem_band
self.update_mp()
@@ -234,12 +253,12 @@ data = fabio.open(r"%s").data
self.update_mp()
json.dump(self.results, open(filename, "w"))
def print_res(self):
def print_res(self, summary, results):
self.update_mp()
print("Summary: execution time in milliseconds")
print(summary)
print("Size/Meth\t" + "\t".join(self.meth))
for i in self.size:
print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))
print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (results[j].get(i, 0)) for j in self.meth))
def init_curve(self):
self.update_mp()
@@ -394,10 +413,16 @@ if __name__ == "__main__":
bench.bench_1d_ocl_csr(True, {"devicetype":"ACC"})
bench.save()
bench.print_res()
results = bench.results
flops = bench.flops
mem_band = bench.mem_band
bench.print_res("Summary: Execution time in milliseconds", results)
bench.print_res("Summary: MFLOPS",flops)
bench.print_res("Summary: Memory Bandwidth in MB/s",mem_band)
bench.update_mp()
bench.ax.set_ylim(1, 200)
# plt.show()
plt.ion()
raw_input("Enter to quit")
# raw_input("Enter to quit")

View File

@@ -0,0 +1,752 @@
#!/usr/bin/python
import json, sys, time, timeit, os, platform, subprocess, logging
import numpy
import fabio
import os.path as op
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
import utilstest
pyFAI = utilstest.UtilsTest.pyFAI
ocl = pyFAI.opencl.ocl
from matplotlib import pyplot as plt
plt.ion()
ds_list = ["Pilatus1M.poni", "halfccd.poni", "Frelon2k.poni", "Pilatus6M.poni", "Mar3450.poni", "Fairchild.poni"]
datasets = {"Fairchild.poni":utilstest.UtilsTest.getimage("1880/Fairchild.edf"),
"halfccd.poni":utilstest.UtilsTest.getimage("1882/halfccd.edf"),
"Frelon2k.poni":utilstest.UtilsTest.getimage("1881/Frelon2k.edf"),
"Pilatus6M.poni":utilstest.UtilsTest.getimage("1884/Pilatus6M.cbf"),
"Pilatus1M.poni":utilstest.UtilsTest.getimage("1883/Pilatus1M.edf"),
"Mar3450.poni":utilstest.UtilsTest.getimage("2201/LaB6_260210.mar3450")
}
b = None
class Bench(object):
HEADER = '\033[95m'
OKBLUE = '\033[94m'
OKGREEN = '\033[92m'
WARNING = '\033[93m'
FAIL = '\033[91m'
ENDC = '\033[0m'
def __init__(self, nbr=10, memprofile=False):
self.reference_1d = {}
self.LIMIT = 8
self.repeat = 1
self.nbr = nbr
self.results = {}
self.meth = []
self._cpu = None
self.fig = None
self.ax = None
self.starttime = time.time()
self.plot = None
self.plot_x = []
self.plot_y = []
self.do_memprofile = memprofile
self.fig_mp = None
self.ax_mp = None
self.plot_mp = None
self.memory_profile = ([], [])
def get_cpu(self):
if self._cpu is None:
if os.name == "nt":
self._cpu = platform.processor()
elif os.path.exists("/proc/cpuinfo"):
self._cpu = [i.split(": ", 1)[1] for i in open("/proc/cpuinfo") if i.startswith("model name")][0].strip()
elif os.path.exists("/usr/sbin/sysctl"):
proc = subprocess.Popen(["sysctl", "-n", "machdep.cpu.brand_string"], stdout=subprocess.PIPE)
proc.wait()
self._cpu = proc.stdout.read().strip()
old = self._cpu
self._cpu = old.replace("  ", " ")
while old != self._cpu:
old = self._cpu
self._cpu = old.replace("  ", " ")
return self._cpu
def get_gpu(self, devicetype="gpu", useFp64=False, platformid=None, deviceid=None):
if ocl is None:
return "NoGPU"
ctx = ocl.create_context(devicetype, useFp64, platformid, deviceid)
return ctx.devices[0].name
def get_mem(self):
"""
Returns the occupied memory for memory-leak hunting in MByte
"""
pid = os.getpid()
if os.path.exists("/proc/%i/status" % pid):
for l in open("/proc/%i/status" % pid):
if l.startswith("VmRSS"):
mem = int(l.split(":", 1)[1].split()[0]) / 1024.
else:
mem = 0
return mem
def print_init(self, t):
print(" * Initialization time: %.1f ms" % (1000.0 * t))
self.update_mp()
def print_exec(self, t):
print(" * Execution time rep : %.1f ms" % (1000.0 * t))
self.update_mp()
def print_sep(self):
print("*"*80)
self.update_mp()
def get_ref(self, param):
if param not in self.reference_1d:
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
N = min(data.shape)
res = ai.xrpd(data, N)
self.reference_1d[param] = res
del ai, data
return self.reference_1d[param]
def bench_cpu1d(self):
self.update_mp()
print("Working on processor: %s" % self.get_cpu())
results = {}
label = "1D_CPU_serial_full_split"
first = True
for param in ds_list:
self.update_mp()
ref = self.get_ref(param)
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = min(data.shape)
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
res = ai.integrate1d(data, 1000, method="splitpixelfull", unit="2th_deg", correctSolidAngle=False)
t1 = time.time()
self.print_init(t1 - t0)
self.update_mp()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd(data,N)""" % (param, fn)
t = timeit.Timer("ai.integrate1d(data, 1000, method='splitpixelfull', unit='2th_deg', correctSolidAngle=False)", setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
self.print_exec(tmin)
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu1d_lut(self):
self.update_mp()
print("Working on processor: %s" % self.get_cpu())
label = "1D_CPU_parallel_OpenMP"
results = {}
self.new_curve(results, label)
for param in ds_list:
self.update_mp()
ref = self.get_ref(param)
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = min(data.shape)
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
res = ai.xrpd_LUT(data, N)
t1 = time.time()
self.print_init(t1 - t0)
print "lut.shape=", ai._lut_integrator.lut.shape, "lut.nbytes (MB)", ai._lut_integrator.size * 8 / 1e6
self.update_mp()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_LUT(data,N)""" % (param, fn)
t = timeit.Timer("ai.xrpd_LUT(data,N,safe=False)", setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.print_exec(tmin)
R = utilstest.Rwp(res, ref)
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
self.update_mp()
if R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu1d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
self.update_mp()
if (ocl is None):
print("No pyopencl")
return
if (platformid is None) or (deviceid is None):
platdev = ocl.select_device(devicetype)
if not platdev:
print("No such OpenCL device: skipping benchmark")
return
platformid, deviceid = platdev
print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
label = "1D_%s_parallel_OpenCL" % devicetype
first = True
results = {}
for param in ds_list:
self.update_mp()
ref = self.get_ref(param)
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = min(data.shape)
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
try:
res = ai.xrpd_LUT_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid)
except MemoryError as error:
print(error)
break
t1 = time.time()
self.print_init(t1 - t0)
self.update_mp()
ai.reset()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_LUT_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s)""" % (param, fn, devicetype, platformid, deviceid)
t = timeit.Timer("ai.xrpd_LUT_OCL(data,N,safe=False)", setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
del t
self.update_mp()
self.print_exec(tmin)
R = utilstest.Rwp(res, ref)
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
if R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu1d_csr_ocl(self, devicetype="GPU", platformid=None, deviceid=None, padded=False, block_size=32):
self.update_mp()
if (ocl is None):
print("No pyopencl")
return
if (platformid is None) or (deviceid is None):
platdev = ocl.select_device(devicetype)
if not platdev:
print("No such OpenCL device: skipping benchmark")
return
platformid, deviceid = platdev
print("Working on device: %s platform: %s device: %s padding: %s block_size= %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid], padded, block_size))
label = "1D_%s_parallel_OpenCL, padded=%s, block_size=%s" % (devicetype, padded, block_size)
first = True
results = {}
for param in ds_list:
self.update_mp()
ref = self.get_ref(param)
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = min(data.shape)
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
try:
res = ai.xrpd_CSR_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid, padded=padded, block_size=block_size)
except MemoryError as error:
print(error)
break
t1 = time.time()
self.print_init(t1 - t0)
self.update_mp()
ai.reset()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_CSR_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s,padded=%s,block_size=%s)""" % (param, fn, devicetype, platformid, deviceid, padded, block_size)
t = timeit.Timer("ai.xrpd_CSR_OCL(data,N,safe=False,padded=%s,block_size=%s)" % (padded, block_size), setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
del t
self.update_mp()
self.print_exec(tmin)
R = utilstest.Rwp(res, ref)
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
if R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu2d(self):
self.update_mp()
print("Working on processor: %s" % self.get_cpu())
results = {}
label = "2D_CPU_serial"
first = True
for param in ds_list:
self.update_mp()
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = (500, 360)
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
_ = ai.xrpd2(data, N[0], N[1])
t1 = time.time()
self.print_init(t1 - t0)
self.update_mp()
ai.reset()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.xrpd2(data,%s,%s)""" % (param, fn, N[0], N[1])
t = timeit.Timer("ai.xrpd2(data,%s,%s)" % N, setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
del t
self.update_mp()
self.print_exec(tmin)
print("")
if 1: # R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu2d_lut(self):
print("Working on processor: %s" % self.get_cpu())
label = "2D_CPU_parallel_OpenMP"
first = True
results = {}
for param in ds_list:
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = (500, 360)
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
_ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut")
t1 = time.time()
self.print_init(t1 - t0)
print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
self.update_mp()
ai.reset()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut")""" % (param, fn, N[0], N[1])
t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut')" % N, setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
del t
self.update_mp()
self.print_exec(tmin)
print("")
if 1: # R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_cpu2d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
self.update_mp()
if (ocl is None):
print("No pyopencl")
return
if (platformid is None) or (deviceid is None):
platdev = ocl.select_device(devicetype)
if not platdev:
print("No such OpenCL device: skipping benchmark")
return
platformid, deviceid = platdev
print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
results = {}
label = "2D_%s_parallel_OpenCL" % devicetype.upper()
first = True
for param in ds_list:
self.update_mp()
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = (500, 360)
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
t0 = time.time()
try:
_ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut_ocl_%i,%i" % (platformid, deviceid))
except MemoryError as error:
print(error)
break
t1 = time.time()
self.print_init(t1 - t0)
print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
self.update_mp()
ai.reset()
del ai, data
self.update_mp()
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut_ocl_%i,%i")""" % (param, fn, N[0], N[1], platformid, deviceid)
t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut_ocl')" % N, setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
self.update_mp()
del t
self.update_mp()
self.print_exec(tmin)
print("")
if 1: # R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def bench_gpu1d(self, devicetype="gpu", useFp64=True, platformid=None, deviceid=None):
self.update_mp()
print("Working on %s, in " % devicetype + ("64 bits mode" if useFp64 else"32 bits mode") + "(%s.%s)" % (platformid, deviceid))
if ocl is None or not ocl.select_device(devicetype):
print("No pyopencl or no such device: skipping benchmark")
return
results = {}
label = "Forward_OpenCL_%s_%s_bits" % (devicetype , ("64" if useFp64 else"32"))
first = True
for param in ds_list:
self.update_mp()
fn = datasets[param]
ai = pyFAI.load(param)
data = fabio.open(fn).data
size = data.size
N = min(data.shape)
print("1D integration of %s %.1f Mpixel -> %i bins (%s)" % (op.basename(fn), size / 1e6, N, ("64 bits mode" if useFp64 else"32 bits mode")))
try:
t0 = time.time()
res = ai.xrpd_OpenCL(data, N, devicetype=devicetype, useFp64=useFp64, platformid=platformid, deviceid=deviceid)
t1 = time.time()
except Exception as error:
print("Failed to find an OpenCL GPU (useFp64:%s) %s" % (useFp64, error))
continue
self.print_init(t1 - t0)
self.update_mp()
ref = ai.xrpd(data, N)
R = utilstest.Rwp(res, ref)
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
setup = """
import pyFAI,fabio
ai=pyFAI.load(r"%s")
data = fabio.open(r"%s").data
N=min(data.shape)
out=ai.xrpd_OpenCL(data,N, devicetype=r"%s", useFp64=%s, platformid=%s, deviceid=%s)""" % (param, fn, devicetype, useFp64, platformid, deviceid)
t = timeit.Timer("ai.xrpd_OpenCL(data,N,safe=False)", setup)
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
del t
self.update_mp()
self.print_exec(tmin)
print("")
if R < self.LIMIT:
size /= 1e6
tmin *= 1000.0
results[size] = tmin
if first:
self.new_curve(results, label)
first = False
else:
self.new_point(size, tmin)
self.update_mp()
self.print_sep()
self.meth.append(label)
self.results[label] = results
self.update_mp()
def save(self, filename="benchmark.json"):
self.update_mp()
json.dump(self.results, open(filename, "w"))
def print_res(self):
self.update_mp()
print("Summary: execution time in milliseconds")
print "Size/Meth\t" + "\t".join(b.meth)
for i in self.size:
print "%7.2f\t\t" % i + "\t\t".join("%.2f" % (b.results[j].get(i, 0)) for j in b.meth)
def init_curve(self):
self.update_mp()
if self.fig:
print("Already initialized")
return
if "DISPLAY" in os.environ:
plt.ion()
self.fig = plt.figure()
self.ax = self.fig.add_subplot(1, 1, 1)
self.ax.set_autoscale_on(False)
self.ax.set_xlabel("Image size in Mega-Pixels")
self.ax.set_ylabel("Frames processed per second")
self.ax.set_yscale("log", basey=2)
t = [1, 2, 5, 10, 20, 50, 100, 200, 400, 500]
self.ax.set_yticks([float(i) for i in t])
self.ax.set_yticklabels([str(i) for i in t])
self.ax.set_xlim(0.5, 20)
self.ax.set_ylim(0.5, 500)
self.ax.set_title(self.get_cpu() + " / " + self.get_gpu())
if self.fig.canvas:
self.fig.canvas.draw()
# plt.show()
def new_curve(self, results, label):
self.update_mp()
if not self.fig:
return
self.plot_x = list(results.keys())
self.plot_x.sort()
self.plot_y = [1000.0 / results[i] for i in self.plot_x]
self.plot = self.ax.plot(self.plot_x, self.plot_y, "o-", label=label)[0]
self.ax.legend()
if self.fig.canvas:
self.fig.canvas.draw()
def new_point(self, size, exec_time):
"""
Add new point to current curve
@param size: size of the system (image size in Mpixel)
@param exec_time: execution time in ms
"""
self.update_mp()
if not self.plot:
return
self.plot_x.append(size)
self.plot_y.append(1000.0 / exec_time)
self.plot.set_data(self.plot_x, self.plot_y)
if self.fig.canvas:
self.fig.canvas.draw()
def display_all(self):
if not self.fig:
return
for k in self.meth:
self.new_curve(self.results[k], k)
self.ax.legend()
self.fig.savefig("benchmark.png")
self.fig.show()
# plt.ion()
def update_mp(self):
if not self.do_memprofile:
return
self.memory_profile[0].append(time.time() - self.starttime)
self.memory_profile[1].append(self.get_mem())
if not self.fig_mp:
self.fig_mp = plt.figure()
self.ax_mp = self.fig_mp.add_subplot(1, 1, 1)
self.ax_mp.set_autoscale_on(False)
self.ax_mp.set_xlabel("Run time (s)")
self.ax_mp.set_xlim(0, 100)
self.ax_mp.set_ylim(0, 2 ** 10)
self.ax_mp.set_ylabel("Memory occupancy (MB)")
self.ax_mp.set_title("Memory leak hunter")
self.plot_mp = self.ax_mp.plot(*self.memory_profile)[0]
else:
self.plot_mp.set_data(*self.memory_profile)
tmax = self.memory_profile[0][-1]
mmax = max(self.memory_profile[1])
if tmax > self.ax_mp.get_xlim()[-1]:
self.ax_mp.set_xlim(0, tmax)
if mmax > self.ax_mp.get_ylim()[-1]:
self.ax_mp.set_ylim(0, mmax)
if self.fig_mp.canvas:
self.fig_mp.canvas.draw()
def get_size(self):
if len(self.meth) == 0:
return []
size = list(self.results[self.meth[0]].keys())
for i in self.meth[1:]:
s = list(self.results[i].keys())
if len(s) > len(size):
size = s
size.sort()
return size
size = property(get_size)
if __name__ == "__main__":
try:
from argparse import ArgumentParser
except ImportError:
from pyFAI.argparse import ArgumentParser
description = """Benchmark for Azimuthal integration
"""
epilog = """ """
usage = """benchmark [options] """
version = "pyFAI benchmark version " + pyFAI.version
parser = ArgumentParser(usage=usage, description=description, epilog=epilog)
parser.add_argument("-v", action='version', version=version)
parser.add_argument("-d", "--debug",
action="store_true", dest="debug", default=False,
help="switch to verbose/debug mode")
parser.add_argument("-c", "--cpu",
action="store_true", dest="opencl_cpu", default=False,
help="perform benchmark using OpenCL on the CPU")
parser.add_argument("-g", "--gpu",
action="store_true", dest="opencl_gpu", default=False,
help="perform benchmark using OpenCL on the GPU")
parser.add_argument("-a", "--acc",
action="store_true", dest="opencl_acc", default=False,
help="perform benchmark using OpenCL on the Accelerator (like XeonPhi/MIC)")
parser.add_argument("-s", "--small",
action="store_true", dest="small", default=False,
help="Limit the size of the dataset to 6 Mpixel images (for computer with limited memory)")
parser.add_argument("-n", "--number",
dest="number", default=10, type=int,
help="Number of repetition of the test, by default 10")
parser.add_argument("-2d", "--2dimentions",
action="store_true", dest="twodim", default=False,
help="Benchmark also algorithm for 2D-regrouping")
parser.add_argument("-m", "--memprof",
action="store_true", dest="memprof", default=False,
help="Perfrom memory profiling (Linux only)")
parser.add_argument("-f", "--fullsplit",
action="store_true", dest="split_cpu", default=False,
help="perform benchmark using full pixel splitting on CPU")
options = parser.parse_args()
if options.small:
ds_list = ds_list[:4]
if options.debug:
pyFAI.logger.setLevel(logging.DEBUG)
print("Averaging over %i repetitions (best of 3)." % options.number)
b = Bench(options.number, options.memprof)
b.init_curve()
b.bench_cpu1d()
b.bench_cpu1d_lut()
if options.opencl_cpu:
b.bench_cpu1d_lut_ocl("CPU")
if options.opencl_gpu:
b.bench_cpu1d_lut_ocl("GPU")
if options.opencl_acc:
b.bench_cpu1d_lut_ocl("ACC")
if options.split_cpu:
b.bench_cpu1d()
# b.bench_cpu1d_ocl_lut("CPU")
# b.bench_gpu1d("gpu", True)
# b.bench_gpu1d("gpu", False)
# b.bench_gpu1d("cpu", True)
# b.bench_gpu1d("cpu", False)
if options.twodim:
b.bench_cpu2d()
b.bench_cpu2d_lut()
if options.opencl_cpu:
b.bench_cpu2d_lut_ocl("CPU")
if options.opencl_gpu:
b.bench_cpu2d_lut_ocl("GPU")
if options.opencl_acc:
b.bench_cpu2d_lut_ocl("ACC")
# b.bench_cpu2d_lut()
# b.bench_cpu2d_lut_ocl()
b.save()
b.print_res()
# b.display_all()
b.update_mp()
b.ax.set_ylim(1, 200)
# plt.show()
plt.ion()
raw_input("Enter to quit")

View File

@@ -140,12 +140,19 @@ PyFAI solves this problem by pixel
splitting: in addition to the pixel position, its
spatial extension is calculated and each pixel is then split and
distributed over the corresponding bins, the intensity being considered
<<<<<<< HEAD
as homogeneous within a pixel and spread accordingly.
The drawback of this is the correlation introduced between two adjacent bins.
To simplify
calculations, this was initially done by abstracting the pixel shape
with a bounding box that circumscribes the pixel. In an effort to better
the quality of the results this method was dropped in favo2r of a full
=======
as homogeneous within a pixel and spread accordingly. To simplify
calculations, this was initially done by abstracting the pixel shape
with a bounding box that circumscribes the pixel. In an effort to better
the quality of the results this method was dropped in favour of a full
>>>>>>> giannis/master
pixel splitting scheme that uses the actual pixel geometry
for its calculations.
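The idea is easy to sketch in one dimension; the following toy function
(an illustration of the bounding-box flavour only, not pyFAI's actual
implementation) spreads a pixel spanning [x0, x1] over integer bins in
proportion to the overlap with each bin::

    import numpy

    def split_1d(x0, x1, nbins):
        frac = numpy.zeros(nbins)
        for b in range(int(numpy.floor(x0)), int(numpy.ceil(x1))):
            overlap = min(x1, b + 1) - max(x0, b)   # length of the pixel inside bin b
            if overlap > 0:
                frac[b] = overlap / (x1 - x0)       # fraction of the pixel assigned to bin b
        return frac

    print(split_1d(1.3, 3.1, 5))   # ~ [0, 0.389, 0.556, 0.056, 0]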
@@ -194,6 +201,7 @@ but can still be too large to fit on an entry-level graphics card.
By making this change we switched from a “linear read / random write” forward algorithm
to a “random read / linear write” backward algorithm which is more suitable for parallelization.
As a further improvement on the algorithm, the use of compressed sparse row (CSR) format was
introduced to store the LUT data. This reduces its size even further, giving the
opportunity to work with bigger images on the same hardware when memory is a concern,
and makes the code better suited to run on GPUs or accelerators, since transferring
data to the device is one of the most important bottlenecks of such computations.
This algorithm was implemented both in [Cython]_-OpenMP and OpenCL.
@@ -206,6 +214,14 @@ This makes it very well suited to run on GPUs and accelerators
where hundreds to thousands of simultaneous threads are available.
When using OpenCL for the GPU we used a compensated (or Kahan_summation_) summation to reduce
the error accumulation in the histogram summation (at the cost of more operations to be done).
This allows accurate results to be obtained on cheap hardware that performs calculations
in single precision floating-point arithmetic (32 bits) which are available on consumer
@@ -214,12 +230,17 @@ Double precision operations are currently limited to high price and performance
The additional cost of Kahan summation (4x more arithmetic operations) is hidden by the smaller data types,
the higher number of single-precision units, and the fact that the GPU is usually limited by memory bandwidth anyway.
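The effect of the compensation is easy to reproduce on the host side; a
minimal numpy sketch of the principle (single precision throughout, not
the OpenCL kernel code)::

    import numpy

    x = numpy.float32(0.1)
    naive = numpy.float32(0.0)
    s = numpy.float32(0.0)            # compensated sum
    c = numpy.float32(0.0)            # running compensation term
    for _ in range(10 ** 6):
        naive += x                    # plain float32 accumulation drifts
        y = x - c                     # re-inject the low-order bits lost last time
        t = s + y
        c = (t - s) - y               # what was lost in computing (s + y)
        s = t
    print(naive, s)                   # naive is off by roughly 1%, s stays ~100000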
<<<<<<< HEAD
.. _Kahan_summation: http://en.wikipedia.org/wiki/Kahan_summation_algorithm
The performances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s
on recent multi-core computer with a mid-range graphics card.
On multi-socket server featuring high-end GPUs like Tesla cards, the performances are similar with
the additional capability to work on multiple detector simultaneously.
=======
The perfomances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s
on recent multi-socket, multi-core computer or on high-end GPUs like Tesla cards.
>>>>>>> giannis/master
.. figure:: img/benchmark.png
:align: center

View File

@@ -174,25 +174,21 @@ corrections( __global float *image,
/**
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
*
* An image intensity value is spread across the bins according to the positions stored in the LUT.
* The lut is a 2D-array of index (contains the positions of the pixel in the input array)
* and coefficients (fraction of pixel going to the bin)
* The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
* Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
*
* This implementation is especially efficient on CPU where each core reads adjacent memory.
* The use of local pointers can help on the CPU.
*
* @param weights Float pointer to global memory storing the input image.
* @param lut Pointer to an 2D-array of (unsigned integers,float) containing the index of input pixels and the fraction of pixel going to the bin
* @param coefs Float pointer to global memory holding the coefficient part of the LUT
* @param row_ind Integer pointer to global memory holding the corresponding index of the coefficient
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixels are pixels marked as bad and ignored
* @param dummy Float: value for bad pixels
* @param delta_dummy Float: precision for bad pixel value
* @param do_dark Bool/int: shall dark-current correction be applied ?
* @param dark Float pointer to global memory storing the dark image.
* @param do_flat Bool/int: shall flat-field correction be applied ? (could contain polarization corrections)
* @param flat Float pointer to global memory storing the flat image.
* @param outData Float pointer to the output 1D array with the weighted histogram
* @param outCount Float pointer to the output 1D array with the unweighted histogram
* @param outMerge Float pointer to the output 1D array with the diffractogram
@@ -208,7 +204,7 @@ csr_integrate( const __global float *weights,
__global float *outData,
__global float *outCount,
__global float *outMerge
)
)
{
int thread_id_loc = get_local_id(0);
int bin_num = get_group_id(0); // each workgroup of size=warp is assigned to 1 bin
@@ -260,10 +256,10 @@ csr_integrate( const __global float *weights,
__local float super_sum_data_correction[WORKGROUP_SIZE];
__local float super_sum_count[WORKGROUP_SIZE];
__local float super_sum_count_correction[WORKGROUP_SIZE];
float super_sum_temp = 0.0f;
int index, active_threads = WORKGROUP_SIZE;
if (bin_size < WORKGROUP_SIZE)
{
if (thread_id_loc < bin_size)
@@ -292,6 +288,106 @@ csr_integrate( const __global float *weights,
cd = 0;
cc = 0;
if (thread_id_loc == 0)
{
outData[bin_num] = super_sum_data[0];
outCount[bin_num] = super_sum_count[0];
if (outCount[bin_num] > epsilon)
outMerge[bin_num] = outData[bin_num] / outCount[bin_num];
else
outMerge[bin_num] = dummy;
}
};//end kernel
/**
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
*
* An image intensity value is spread across the bins according to the positions stored in the LUT.
* The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
* Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
*
* This kernel is meant to be run with padded data (the span of each bin must be a multiple of the workgroup size)
*
* @param weights Float pointer to global memory storing the input image.
* @param coefs Float pointer to global memory holding the coefficient part of the LUT
* @param row_ind Integer pointer to global memory holding the corresponding index of the coefficient
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixels are pixels marked as bad and ignored
* @param dummy Float: value for bad pixels
* @param outData Float pointer to the output 1D array with the weighted histogram
* @param outCount Float pointer to the output 1D array with the unweighted histogram
* @param outMerge Float pointer to the output 1D array with the diffractogram
*
*/
__kernel void
csr_integrate_padded( const __global float *weights,
const __global float *coefs,
const __global int *row_ind,
const __global int *col_ptr,
const int do_dummy,
const float dummy,
__global float *outData,
__global float *outCount,
__global float *outMerge
)
{
int thread_id_loc = get_local_id(0);
int bin_num = get_group_id(0); // each workgroup of size=warp is assigned to 1 bin
int2 bin_bounds;
// bin_bounds = (int2) *(col_ptr+bin_num); // cool stuff!
bin_bounds.x = col_ptr[bin_num];
bin_bounds.y = col_ptr[bin_num+1];
float sum_data = 0.0f;
float sum_count = 0.0f;
float cd = 0.0f;
float cc = 0.0f;
float t, y;
const float epsilon = 1e-10f;
float coef, data;
int idx, k, j;
for (j=bin_bounds.x;j<bin_bounds.y;j+=WORKGROUP_SIZE)
{
k = j+thread_id_loc;
coef = coefs[k];
idx = row_ind[k];
data = weights[idx];
if( (!do_dummy) || (data!=dummy) )
{
//sum_data += coef * data;
//sum_count += coef;
//Kahan summation allows single precision arithmetics with error compensation
//http://en.wikipedia.org/wiki/Kahan_summation_algorithm
y = coef*data - cd;
t = sum_data + y;
cd = (t - sum_data) - y;
sum_data = t;
y = coef - cc;
t = sum_count + y;
cc = (t - sum_count) - y;
sum_count = t;
};//end if dummy
};//for j
/*
* parallel reduction
*/
// REMEMBER TO PASS WORKGROUP_SIZE AS A CPP DEF
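// e.g. the host passes: compile_options = "-D WORKGROUP_SIZE=%i" % block_size
// (as done in the pyopencl test scripts added in this commit)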
__local float super_sum_data[WORKGROUP_SIZE];
__local float super_sum_data_correction[WORKGROUP_SIZE];
__local float super_sum_count[WORKGROUP_SIZE];
__local float super_sum_count_correction[WORKGROUP_SIZE];
super_sum_data[thread_id_loc] = sum_data;
super_sum_count[thread_id_loc] = sum_count;
super_sum_data_correction[thread_id_loc] = cd;
super_sum_count_correction[thread_id_loc] = cc;
barrier(CLK_LOCAL_MEM_FENCE);
float super_sum_temp = 0.0f;
int index, active_threads = WORKGROUP_SIZE;
cd = 0;
cc = 0;
while (active_threads != 1)
{
active_threads /= 2;
@@ -304,7 +400,7 @@ csr_integrate( const __global float *weights,
t = super_sum_temp + y;
super_sum_data_correction[thread_id_loc] = (t - super_sum_temp) - y;
super_sum_data[thread_id_loc] = t;
cc = super_sum_count_correction[thread_id_loc] + super_sum_count_correction[index];
super_sum_temp = super_sum_count[thread_id_loc];
y = super_sum_count[index] - cc;
@@ -326,6 +422,20 @@ csr_integrate( const __global float *weights,
}
};//end kernel
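/* Padding contract for csr_integrate_padded (see its doc above): the host
 * zero-pads coefs/row_ind so that each bin's span is a multiple of
 * WORKGROUP_SIZE; a zero coefficient adds nothing to either sum, so the
 * j-loop needs no bounds check. Padding row_ind entries should presumably
 * still point at a valid pixel so that the weights[] read stays in range. */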
/**
* \brief Performs distortion corrections on an image using a LUT in CSR form
*
* @param weights Float pointer to global memory storing the input image.
* @param coefs Float pointer to global memory holding the coefficient part of the LUT
* @param row_ind Integer pointer to global memory holding the corresponding index of the coefficient
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
* @param outData Float pointer to the output 1D array with the corrected image
*
*/
__kernel void
csr_integrate_dis( const __global float *weights,
const __global float *coefs,
@@ -381,9 +491,9 @@ csr_integrate_dis( const __global float *weights,
__local float super_sum_data_correction[WORKGROUP_SIZE];
float super_sum_temp = 0.0f;
int index, active_threads = WORKGROUP_SIZE;
if (bin_size < WORKGROUP_SIZE)
{
if (thread_id_loc < bin_size)
@@ -403,9 +513,9 @@ csr_integrate_dis( const __global float *weights,
super_sum_data[thread_id_loc] = sum_data;
}
barrier(CLK_LOCAL_MEM_FENCE);
cd = 0;
while (active_threads != 1)
{
active_threads /= 2;

View File

@@ -0,0 +1,405 @@
//#pragma OPENCL EXTENSION cl_amd_printf : enable
//#pragma OPENCL EXTENSION cl_intel_printf : enable
float area4(float a0, float a1, float b0, float b1, float c0, float c1, float d0, float d1)
{
return 0.5f * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)));
}
float integrate_line( float A0, float B0, float2 AB)
{
return (A0==B0) ? 0.0f : AB.s0*(B0*B0 - A0*A0)*0.5f + AB.s1*(B0-A0);
}
float getBinNr(float x0, float delta, float pos0_min)
{
return (x0 - pos0_min) / delta;
}
float min4f(float a, float b, float c, float d)
{
return fmin(fmin(a,b),fmin(c,d));
}
float max4f(float a, float b, float c, float d)
{
return fmax(fmax(a,b),fmax(c,d));
}
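/* Software atomic add on float: OpenCL 1.x has no native atomic addition
 * for floats, so the value is aliased to a 32-bit integer and updated in
 * a compare-and-swap loop that retries whenever another thread modified
 * *source between the read and the swap. */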
void AtomicAdd(volatile __global float *source, const float operand)
{
union {
unsigned int intVal;
float floatVal;
} newVal;
union {
unsigned int intVal;
float floatVal;
} prevVal;
do {
prevVal.floatVal = *source;
newVal.floatVal = prevVal.floatVal + operand;
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
}
/**
* \brief cast values of an array of uint16 into a float output array.
*
* @param array_u16: Pointer to global memory with the input data as unsigned16 array
* @param array_float: Pointer to global memory with the output data as float array
*/
__kernel void
u16_to_float(__global unsigned short *array_u16,
__global float *array_float
)
{
int i = get_global_id(0);
//Global memory guard for padding
if(i < NIMAGE)
array_float[i]=(float)array_u16[i];
}
/**
* \brief convert values of an array of int32 into a float output array.
*
* @param array_int: Pointer to global memory with the data in int
* @param array_float: Pointer to global memory with the data in float
*/
__kernel void
s32_to_float( __global int *array_int,
__global float *array_float
)
{
int i = get_global_id(0);
//Global memory guard for padding
if(i < NIMAGE)
array_float[i] = (float)(array_int[i]);
}
/**
* \brief Sets the values of 3 float output arrays to zero.
*
* Gridsize = size of arrays + padding.
*
* @param array0: float Pointer to global memory with the outMerge array
* @param array1: float Pointer to global memory with the outCount array
* @param array2: float Pointer to global memory with the outData array
*/
__kernel void
memset_out(__global float *array0,
__global float *array1,
__global float *array2
)
{
int i = get_global_id(0);
//Global memory guard for padding
if(i < BINS)
{
array0[i]=0.0f;
array1[i]=0.0f;
array2[i]=0.0f;
}
}
/**
* \brief Performs the first part of a 2-step parallel reduction.
*
* Together with the second part, it takes a flattened 4D-array
* and returns the min and max of each of the 2 components of the
* last dimension
*
* @param buffer: float Pointer to global memory with the flattened 4D-array (pos in pyFAI)
* @param length: integer value of the length of the buffer array
* @param preresult: float Pointer to global memory with the intermediate data of the 2-step parallel reduction. Should be of length WORKGROUP_SIZE
*/
__kernel
void reduce1(__global float2* buffer,
__const int length,
__global float4* preresult) {
int global_index = get_global_id(0);
int global_size = get_global_size(0);
float4 accumulator;
accumulator.x = INFINITY;
accumulator.y = -INFINITY;
accumulator.z = INFINITY;
accumulator.w = -INFINITY;
// Loop sequentially over chunks of input vector
while (global_index < length/2) {
float2 element = buffer[global_index];
accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
global_index += global_size;
}
__local float4 scratch[WORKGROUP_SIZE];
// Perform parallel reduction
int local_index = get_local_id(0);
scratch[local_index] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 1)
{
active_threads /= 2;
if (local_index < active_threads)
{
float4 other = scratch[local_index + active_threads];
float4 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
mine.z = (mine.z < other.z) ? mine.z : other.z;
mine.w = (mine.w > other.w) ? mine.w : other.w;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
preresult[get_group_id(0)] = scratch[0];
}
}
/**
* \brief Performs the second part of a 2-step parallel reduction.
*
* Together with the first part, it takes a flattened 4D-array
* and returns the min and max of each of the 2 components of the
* last dimension
*
* @param preresult: float Pointer to global memory with the intermediate data of the 2-step parallel reduction. Should be of length WORKGROUP_SIZE
* @param result: float Pointer to global memory with the min/max values requested
*/
__kernel
void reduce2(__global float4* preresult,
__global float4* result) {
__local float4 scratch[WORKGROUP_SIZE];
int local_index = get_local_id(0);
scratch[local_index] = preresult[local_index];
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 1)
{
active_threads /= 2;
if (local_index < active_threads)
{
float4 other = scratch[local_index + active_threads];
float4 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
mine.z = (mine.z < other.z) ? mine.z : other.z;
mine.w = (mine.w > other.w) ? mine.w : other.w;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
result[0] = scratch[0];
}
}
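/* Host-side picture of the two kernels above (sketch): reduce1 leaves one
 * partial float4 (min_x, max_x, min_y, max_y) per workgroup in preresult;
 * reduce2, launched as a single workgroup, folds those partials into
 * result[0]. This assumes reduce1 is launched with WORKGROUP_SIZE groups,
 * as in the test script of this commit. */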
/**
* \brief Performs Normalization of input image
*
* Intensities of images are corrected by:
* - dark (read-out) noise subtraction
* - Solid angle correction (division)
* - polarization correction (division)
* - flat field correction (division)
* Corrections are made in place unless the pixel is dummy.
* Dummy pixels are left untouched so that they remain dummy
*
* @param image Float pointer to global memory storing the input image.
* @param do_dark Bool/int: shall dark-current correction be applied ?
* @param dark Float pointer to global memory storing the dark image.
* @param do_flat Bool/int: shall flat-field correction be applied ?
* @param flat Float pointer to global memory storing the flat image.
* @param do_solidangle Bool/int: shall solid-angle correction be applied ?
* @param solidangle Float pointer to global memory storing the solid angle of each pixel.
* @param do_polarization Bool/int: shall polarization correction be applied ?
* @param polarization Float pointer to global memory storing the polarization of each pixel.
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixels are pixels marked as bad and ignored
* @param dummy Float: value for bad pixels
* @param delta_dummy Float: precision for bad pixel value
*
**/
__kernel void
corrections( __global float *image,
const int do_dark,
const __global float *dark,
const int do_flat,
const __global float *flat,
const int do_solidangle,
const __global float *solidangle,
const int do_polarization,
const __global float *polarization,
const int do_dummy,
const float dummy,
const float delta_dummy
)
{
float data;
int i= get_global_id(0);
if(i < NIMAGE)
{
data = image[i];
int dummy_condition = ((!do_dummy) || ((delta_dummy!=0.0f) && (fabs(data-dummy) > delta_dummy)) || ((delta_dummy==0.0f) && (data!=dummy)));
data -= do_dark ? dark[i] : 0;
data *= do_flat ? 1/flat[i] : 1;
data *= do_solidangle ? 1/solidangle[i] : 1;
data *= do_polarization ? 1/polarization[i] : 1;
image[i] = dummy_condition ? data : dummy;
};//end if NIMAGE
};//end kernel
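/* Host-side equivalent (numpy-style sketch): for non-dummy pixels,
 *   image = (image - dark) / (flat * solidangle * polarization)
 * with each factor applied only when its do_* flag is set; dummy pixels
 * are written back as "dummy" so that they remain dummy. */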
/**
* \brief Performs 1d azimuthal integration with full pixel splitting
*
* @param pos Float pointer to global memory storing the flattened 4D-array with the pixel corner coordinates
* @param image Float pointer to global memory storing the input image.
* @param minmax Float pointer to global memory holding the min/max results of the reduction kernels
* @param length Integer value of the length of the buffer array
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixels are pixels marked as bad and ignored
* @param dummy Float: value for bad pixels
* @param outData Float pointer to the output 1D array with the weighted histogram
* @param outCount Float pointer to the output 1D array with the unweighted histogram
*
*/
__kernel
void integrate1(__global float8* pos,
__global float* image,
// __global int* mask,
// __const int check_mask,
__global float4* minmax,
const int length,
// float2 pos0Range,
// float2 pos1Range,
const int do_dummy,
const float dummy,
__global float* outData,
__global float* outCount)
{
int global_index = get_global_id(0);
if (global_index < length)
{
// float pos0_min = fmax(fmin(pos0Range.x,pos0Range.y),minmax[0].s0);
// float pos0_max = fmin(fmax(pos0Range.x,pos0Range.y),minmax[0].s1);
float pos0_min = minmax[0].s0;
float pos0_max = minmax[0].s1;
pos0_max *= 1 + EPS;
float delta = (pos0_max - pos0_min) / BINS;
int local_index = get_local_id(0);
float8 pixel = pos[global_index];
float data = image[global_index];
pixel.s0 = getBinNr(pixel.s0, delta, pos0_min);
pixel.s2 = getBinNr(pixel.s2, delta, pos0_min);
pixel.s4 = getBinNr(pixel.s4, delta, pos0_min);
pixel.s6 = getBinNr(pixel.s6, delta, pos0_min);
float min0 = min4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
float max0 = max4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
int bin0_min = floor(min0);
int bin0_max = floor(max0);
float2 AB, BC, CD, DA;
AB.x=(pixel.s3-pixel.s1)/(pixel.s2-pixel.s0);
AB.y= pixel.s1 - AB.x*pixel.s0;
BC.x=(pixel.s5-pixel.s3)/(pixel.s4-pixel.s2);
BC.y= pixel.s3 - BC.x*pixel.s2;
CD.x=(pixel.s7-pixel.s5)/(pixel.s6-pixel.s4);
CD.y= pixel.s5 - CD.x*pixel.s4;
DA.x=(pixel.s1-pixel.s7)/(pixel.s0-pixel.s6);
DA.y= pixel.s7 - DA.x*pixel.s6;
float areaPixel = area4(pixel.s0, pixel.s1, pixel.s2, pixel.s3, pixel.s4, pixel.s5, pixel.s6, pixel.s7);
float oneOverPixelArea = 1.0f / areaPixel;
for (int bin=bin0_min; bin < bin0_max+1; bin++)
{
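// branchless clamp: A_lim..D_lim clamp each corner coordinate to the current bin interval [bin, bin+1]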
float A_lim = (pixel.s0<=bin)*(pixel.s0<=(bin+1))*bin + (pixel.s0>bin)*(pixel.s0<=(bin+1))*pixel.s0 + (pixel.s0>bin)*(pixel.s0>(bin+1))*(bin+1);
float B_lim = (pixel.s2<=bin)*(pixel.s2<=(bin+1))*bin + (pixel.s2>bin)*(pixel.s2<=(bin+1))*pixel.s2 + (pixel.s2>bin)*(pixel.s2>(bin+1))*(bin+1);
float C_lim = (pixel.s4<=bin)*(pixel.s4<=(bin+1))*bin + (pixel.s4>bin)*(pixel.s4<=(bin+1))*pixel.s4 + (pixel.s4>bin)*(pixel.s4>(bin+1))*(bin+1);
float D_lim = (pixel.s6<=bin)*(pixel.s6<=(bin+1))*bin + (pixel.s6>bin)*(pixel.s6<=(bin+1))*pixel.s6 + (pixel.s6>bin)*(pixel.s6>(bin+1))*(bin+1);
float partialArea = integrate_line(A_lim, B_lim, AB);
partialArea += integrate_line(B_lim, C_lim, BC);
partialArea += integrate_line(C_lim, D_lim, CD);
partialArea += integrate_line(D_lim, A_lim, DA);
float tmp = fabs(partialArea) * oneOverPixelArea;
// direct stores would race and overwrite each other whenever several
// pixels contribute to the same bin: accumulate with the CAS-based
// AtomicAdd defined at the top of this file instead
AtomicAdd(&outCount[bin], tmp);
AtomicAdd(&outData[bin], data*tmp);
}
}
}
/**
* \brief Finishes the 1d azimuthal integration by calculating the ratio of the 2 histograms
*
* @param outData Float pointer to the output 1D array with the weighted histogram
* @param outCount Float pointer to the output 1D array with the unweighted histogram
* @param outMerge Float pointer to the output 1D array with the diffractogram
*
*/
__kernel
void integrate2(__global float* outData,
__global float* outCount,
__global float* outMerge)
{
int global_index = get_global_id(0);
if (global_index < BINS)
outMerge[global_index] = outData[global_index]/outCount[global_index];
}

openCL/reduction_test.cl (new file, 92 lines)
View File

@@ -0,0 +1,92 @@
__kernel
void reduce1(__global float* buffer,
__const int length,
__global float2* preresult) {
int global_index = get_global_id(0);
int global_size = get_global_size(0);
float2 accumulator;
accumulator.x = INFINITY;
accumulator.y = -INFINITY;
// Loop sequentially over chunks of input vector
while (global_index < length) {
float element = buffer[global_index];
accumulator.x = (accumulator.x < element) ? accumulator.x : element;
accumulator.y = (accumulator.y > element) ? accumulator.y : element;
global_index += global_size;
}
__local float2 scratch[WORKGROUP_SIZE];
// Perform parallel reduction
int local_index = get_local_id(0);
scratch[local_index] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 2)
{
active_threads /= 2;
if (local_index < active_threads)
{
float2 other = scratch[local_index + active_threads];
float2 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
preresult[get_group_id(0)] = scratch[0];
}
}
__kernel
void reduce2(__global float2* preresult,
__global float4* result) {
__local float2 scratch[WORKGROUP_SIZE];
int local_index = get_local_id(0);
scratch[local_index] = preresult[local_index];
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 2)
{
active_threads /= 2;
if (local_index < active_threads)
{
float2 other = scratch[local_index + active_threads];
float2 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
result[0] = vload4(0, (__local float*)scratch);
}
}

openCL/reduction_test4.cl (new file, 104 lines)
View File

@@ -0,0 +1,104 @@
__kernel
void reduce1(__global float2* buffer,
__const int length,
__global float4* preresult) {
int global_index = get_global_id(0);
int global_size = get_global_size(0);
float4 accumulator;
accumulator.x = INFINITY;
accumulator.y = -INFINITY;
accumulator.z = INFINITY;
accumulator.w = -INFINITY;
// Loop sequentially over chunks of input vector
while (global_index < length/2) {
float2 element = buffer[global_index];
accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
global_index += global_size;
}
__local float4 scratch[WORKGROUP_SIZE];
// Perform parallel reduction
int local_index = get_local_id(0);
scratch[local_index] = accumulator;
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 1)
{
active_threads /= 2;
if (local_index < active_threads)
{
float4 other = scratch[local_index + active_threads];
float4 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
mine.z = (mine.z < other.z) ? mine.z : other.z;
mine.w = (mine.w > other.w) ? mine.w : other.w;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
preresult[get_group_id(0)] = scratch[0];
}
}
__kernel
void reduce2(__global float4* preresult,
__global float4* result) {
__local float4 scratch[WORKGROUP_SIZE];
int local_index = get_local_id(0);
scratch[local_index] = preresult[local_index];
barrier(CLK_LOCAL_MEM_FENCE);
int active_threads = get_local_size(0);
while (active_threads != 1)
{
active_threads /= 2;
if (local_index < active_threads)
{
float4 other = scratch[local_index + active_threads];
float4 mine = scratch[local_index];
mine.x = (mine.x < other.x) ? mine.x : other.x;
mine.y = (mine.y > other.y) ? mine.y : other.y;
mine.z = (mine.z < other.z) ? mine.z : other.z;
mine.w = (mine.w > other.w) ? mine.w : other.w;
/*
float2 tmp;
tmp.x = (mine.x < other.x) ? mine.x : other.x;
tmp.y = (mine.y > other.y) ? mine.y : other.y;
scratch[local_index] = tmp;
*/
scratch[local_index] = mine;
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (local_index == 0) {
result[0] = scratch[0];
}
}

openCL/test4.py (new file, 52 lines)
View File

@@ -0,0 +1,52 @@
import pyopencl as cl
from pyopencl import array
import numpy
length = 640000
workgroup_size = 128
a = numpy.random.rand(length).astype(numpy.float32)
a.shape = (length // 8, 4, 2)
input_a = a.reshape(length)
min0 = a[:, :, 0].min()
max0 = a[:, :, 0].max()
min1 = a[:, :, 1].min()
max1 = a[:, :, 1].max()
minmax=(min0,max0,min1,max1)
platform = cl.get_platforms()[0]
device = platform.get_devices()[0]
ctx = cl.Context((device,))
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
d_input = array.to_device(queue, input_a)
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4*4*workgroup_size)
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4*4)
with open("reduction_test4.cl", "r") as kernelFile:
kernel_src = kernelFile.read()
compile_options = "-D WORKGROUP_SIZE=%i" % (workgroup_size)
program = cl.Program(ctx, kernel_src).build(options=compile_options)
program.reduce1(queue, (workgroup_size*workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(length), d_preresult)
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
result = numpy.ndarray(4,dtype=numpy.float32)
cl.enqueue_copy(queue,result, d_result)
print(minmax)
print(result)

View File

@@ -84,6 +84,13 @@ except ImportError as error:
" full pixel splitting: %s" % error)
splitPixel = None
try:
from . import splitPixelFull # IGNORE:F0401
except ImportError as error:
logger.error("Unable to import pyFAI.splitPixelFull"
" full pixel splitting: %s" % error)
splitPixelFull = None
try:
from . import splitBBox # IGNORE:F0401
except ImportError as error:
@@ -2407,39 +2414,74 @@ class AzimuthalIntegrator(Geometry):
if (I is None) and ("splitpix" in method):
if splitPixel is None:
logger.warning("SplitPixel is not available,"
" falling back on splitbbox histogram !")
method = "splitbbox"
else:
logger.debug("integrate1d uses SplitPixel implementation")
pos = self.array_from_unit(shape, "corner", unit)
qAxis, I, a, b = splitPixel.fullSplit1D(pos=pos,
weights=data,
bins=nbPt,
pos0Range=radial_range,
pos1Range=azimuth_range,
dummy=dummy,
delta_dummy=delta_dummy,
mask=mask,
dark=dark,
flat=flat,
solidangle=solidangle,
polarization=polarization
)
if error_model == "azimuthal":
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
if variance is not None:
_, var1d, a, b = splitPixel.fullSplit1D(pos=pos,
weights=variance,
if "full" in method:
if splitPixelFull is None:
logger.warning("SplitPixelFull is not available,"
" falling back on splitbbox histogram !")
method = "splitbbox"
else:
logger.debug("integrate1d uses SplitPixel implementation")
pos = self.array_from_unit(shape, "corner", unit)
qAxis, I, a, b = splitPixelFull.fullSplit1D(pos=pos,
weights=data,
bins=nbPt,
pos0Range=radial_range,
pos1Range=azimuth_range,
dummy=dummy,
delta_dummy=delta_dummy,
mask=mask,
dark=dark,
flat=flat,
solidangle=solidangle,
polarization=polarization
)
if error_model == "azimuthal":
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
if variance is not None:
_, var1d, a, b = splitPixelFull.fullSplit1D(pos=pos,
weights=variance,
bins=nbPt,
pos0Range=radial_range,
pos1Range=azimuth_range,
dummy=dummy,
delta_dummy=delta_dummy,
mask=mask,
)
sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
else:
if splitPixel is None:
logger.warning("SplitPixel is not available,"
" falling back on splitbbox histogram !")
method = "splitbbox"
else:
logger.debug("integrate1d uses SplitPixel implementation")
pos = self.array_from_unit(shape, "corner", unit)
qAxis, I, a, b = splitPixel.fullSplit1D(pos=pos,
weights=data,
bins=nbPt,
pos0Range=radial_range,
pos1Range=azimuth_range,
dummy=dummy,
delta_dummy=delta_dummy,
mask=mask,
dark=dark,
flat=flat,
solidangle=solidangle,
polarization=polarization
)
if error_model == "azimuthal":
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
if variance is not None:
_, var1d, a, b = splitPixel.fullSplit1D(pos=pos,
weights=variance,
bins=nbPt,
pos0Range=radial_range,
pos1Range=azimuth_range,
dummy=dummy,
delta_dummy=delta_dummy,
mask=mask,
)
sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
if (I is None) and ("bbox" in method):
if splitBBox is None:
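The net effect of the new branch: method strings are matched by substring, so any method containing both "splitpix" and "full" now dispatches to splitPixelFull.fullSplit1D, while a plain "splitpixel" keeps the original splitPixel path. A hedged usage sketch (file names are placeholders):

import fabio
import pyFAI

ai = pyFAI.load("detector.poni")    # placeholder geometry file
img = fabio.open("frame.edf").data  # placeholder image

q, I = ai.integrate1d(img, 1000, method="splitpixel", unit="q_nm^-1")       # original splitting
q, I = ai.integrate1d(img, 1000, method="full_splitpixel", unit="q_nm^-1")  # new exact splitting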


@ -58,6 +58,7 @@ class OCL_CSR_Integrator(object):
indptr: row pointer indicates the start of a given row. len nbin+1
@param image_size: size of the image (for pre-processing)
@param devicetype: can be "cpu","gpu","acc" or "all"
@param block_size: the chosen size for WORKGROUP_SIZE
@param platformid: number of the platform as given by clinfo
@type platformid: int
@param deviceid: number of the device as given by clinfo


@ -52,11 +52,13 @@ class OCL_CSR_Integrator(object):
platformid=None, deviceid=None,
checksum=None, profile=False):
"""
@param data: coefficient of the matrix in a 1D vector of float32 - size of nnz
@param indices: Column index position for the data (same size as data)
@param indptr: row pointer indicates the start of a given row. len nbin+1
@param image_size:
@param lut: 3-tuple of arrays
data: coefficient of the matrix in a 1D vector of float32 - size of nnz
indices: Column index position for the data (same size as data)
indptr: row pointer indicates the start of a given row. len nbin+1
@param image_size: size of the image (for pre-processing)
@param devicetype: can be "cpu","gpu","acc" or "all"
@param block_size: the chosen size for WORKGROUP_SIZE
@param platformid: number of the platform as given by clinfo
@type platformid: int
@param deviceid: number of the device as given by clinfo


@ -0,0 +1,380 @@
# -*- coding: utf-8 -*-
#
# Project: Azimuthal integration
# https://github.com/kif/pyFAI
#
#
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
# Giannis Ashiotis
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
__authors__ = ["Jérôme Kieffer", "Giannis Ashiotis"]
__license__ = "GPLv3"
__date__ = "04/04/2014"
__copyright__ = "2014, ESRF, Grenoble"
__contact__ = "jerome.kieffer@esrf.fr"
import os, gc, logging
import threading
import hashlib
import numpy
from .opencl import ocl, pyopencl
from .splitBBoxLUT import HistoBBox1d
from .utils import get_cl_file
from pyopencl import array
if pyopencl:
mf = pyopencl.mem_flags
else:
raise ImportError("pyopencl is not installed")
try:
from .fastcrc import crc32
except ImportError:
from zlib import crc32
logger = logging.getLogger("pyFAI.ocl_azim_csr")
class OCL_Hist_Pixelsplit(object):
def __init__(self, pos, bins, image_size, pos0Range=None, pos1Range=None, devicetype="all",
padded=False, block_size=32,
platformid=None, deviceid=None,
checksum=None, profile=False):
"""
@param lut: 3-tuple of arrays
data: coefficient of the matrix in a 1D vector of float32 - size of nnz
indices: Column index position for the data (same size as data)
indptr: row pointer indicates the start of a given row. len nbin+1
@param image_size: size of the image (for pre-processing)
@param devicetype: can be "cpu","gpu","acc" or "all"
@param platformid: number of the platform as given by clinfo
@type platformid: int
@param deviceid: number of the device as given by clinfo
@type deviceid: int
@param checksum: pre - calculated checksum to prevent re - calculating it :)
@param profile: store profiling elements
"""
self.BLOCK_SIZE = block_size # query for warp size
self.padded = padded
self._sem = threading.Semaphore()
self.pos = pos
self.bins = bins
self.pos_size = pos.size
self.size = image_size
if self.pos_size != 8 * self.size:
raise RuntimeError("pos.size != 8 * image_size")
self.pos0Range = numpy.zeros(1, pyopencl.array.vec.float2)
self.pos1Range = numpy.zeros(1, pyopencl.array.vec.float2)
if (pos0Range is not None) and (len(pos0Range) == 2):
self.pos0Range[0][0] = min(pos0Range)
self.pos0Range[0][1] = max(pos0Range)
else:
self.pos0Range[0][0] = -float("inf")
self.pos0Range[0][1] = float("inf")
if (pos1Range is not None) and (len(pos1Range) == 2):
self.pos1Range[0][0] = min(pos1Range)
self.pos1Range[0][1] = max(pos1Range)
else:
self.pos1Range[0][0] = -float("inf")
self.pos1Range[0][1] = float("inf")
self.profile = profile
if not checksum:
checksum = crc32(self.pos)
self.on_device = {"pos":checksum, "dark":None, "flat":None, "polarization":None, "solidangle":None}
self._cl_kernel_args = {}
self._cl_mem = {}
self.events = []
if (platformid is None) and (deviceid is None):
platformid, deviceid = ocl.select_device(devicetype)
elif platformid is None:
platformid = 0
elif deviceid is None:
deviceid = 0
self.platform = ocl.platforms[platformid]
self.device = self.platform.devices[deviceid]
self.device_type = self.device.type
if (self.device_type == "CPU") and (self.platform.vendor == "Apple"):
logger.warning("This is a workaround for Apple's OpenCL on CPU: enforce BLOCK_SIZE=1")
self.BLOCK_SIZE = 1
self.workgroup_size = self.BLOCK_SIZE,
self.wdim_bins = (self.bins * self.BLOCK_SIZE),
self.wdim_data = (self.size + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),
try:
#self._ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
self._ctx = pyopencl.create_some_context()
if self.profile:
self._queue = pyopencl.CommandQueue(self._ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
else:
self._queue = pyopencl.CommandQueue(self._ctx)
self._allocate_buffers()
self._compile_kernels()
self._set_kernel_arguments()
except pyopencl.MemoryError as error:
raise MemoryError(error)
ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["pos"], self.pos)
if self.profile: self.events.append(("copy pos data",ev))
reduction_wg_size = 256
reduce1 = self._program.reduce1(self._queue, (reduction_wg_size*reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce1"])
self.events.append(("reduce1",reduce1))
reduce2 = self._program.reduce2(self._queue, (reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce2"])
self.events.append(("reduce2",reduce2))
result = numpy.ndarray(4, dtype=numpy.float32)
pyopencl.enqueue_copy(self._queue, result, self._cl_mem["minmax"])
print(result)
# host-side reference min/max, for checking the device reduction
min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)
print(minmax)
def __del__(self):
"""
Destructor: release all buffers
"""
self._free_kernels()
self._free_buffers()
self._queue = None
self._ctx = None
gc.collect()
def _allocate_buffers(self):
"""
Allocate OpenCL buffers required for a specific configuration
Note that an OpenCL context also requires some memory, as do Events and other OpenCL functionality, which cannot be
and are not taken into account here.
The memory required by a context varies with the device: typically 65 MB for a GTX 580, but ~15 MB for a 9300M.
In addition, a GPU will always have at least 3-5Mb of memory in use.
Unfortunately, OpenCL does NOT have a built-in way to check the actual free memory on a device, only the total memory.
"""
if self.size < self.BLOCK_SIZE:
raise RuntimeError("Fatal error in _allocate_buffers. size (%d) must be >= BLOCK_SIZE (%d)\n", self.size, self.BLOCK_SIZE)
size_of_float = numpy.dtype(numpy.float32).itemsize
size_of_short = numpy.dtype(numpy.int16).itemsize
size_of_int = numpy.dtype(numpy.int32).itemsize
size_of_long = numpy.dtype(numpy.int64).itemsize
ualloc = (self.pos_size * size_of_float)
ualloc += (4 * self.BLOCK_SIZE * size_of_float)
ualloc += (self.size * size_of_float) * 5
ualloc += (self.bins * size_of_float) * 3
memory = self.device.memory
logger.info("%.3fMB are needed on device which has %.3fMB" % (ualloc / 1.0e6, memory / 1.0e6))
if ualloc >= memory:
raise MemoryError("Fatal error in _allocate_buffers. Not enough device memory for buffers (%lu requested, %lu available)" % (ualloc, memory))
# now actually allocate:
try:
self._cl_mem["pos"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.pos_size)
self._cl_mem["preresult"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4 * self.BLOCK_SIZE)
self._cl_mem["minmax"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4)
self._cl_mem["outData"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
self._cl_mem["outCount"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
self._cl_mem["outMerge"] = pyopencl.Buffer(self._ctx, mf.WRITE_ONLY, size=size_of_float * self.bins)
self._cl_mem["image_u16"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_short * self.size)
self._cl_mem["image"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.size)
self._cl_mem["dark"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
self._cl_mem["flat"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
self._cl_mem["polarization"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
self._cl_mem["solidangle"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
except pyopencl.MemoryError as error:
self._free_buffers()
raise MemoryError(error)
def _free_buffers(self):
"""
free all memory allocated on the device
"""
for buffer_name in self._cl_mem:
if self._cl_mem[buffer_name] is not None:
try:
self._cl_mem[buffer_name].release()
self._cl_mem[buffer_name] = None
except pyopencl.LogicError:
logger.error("Error while freeing buffer %s" % buffer_name)
def _compile_kernels(self, kernel_file=None):
"""
Call the OpenCL compiler
@param kernel_file: path to the OpenCL kernel source file (defaults to "ocl_hist_pixelsplit.cl")
"""
kernel_name = "ocl_hist_pixelsplit.cl"
if kernel_file is None:
if os.path.isfile(kernel_name):
kernel_file = os.path.abspath(kernel_name)
else:
kernel_file = get_cl_file(kernel_name)
else:
kernel_file = str(kernel_file)
with open(kernel_file, "r") as kernelFile:
kernel_src = kernelFile.read()
compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
(self.bins, self.size, self.BLOCK_SIZE, numpy.finfo(numpy.float32).eps)
logger.info("Compiling file %s with options %s" % (kernel_file, compile_options))
try:
self._program = pyopencl.Program(self._ctx, kernel_src).build(options=compile_options)
except pyopencl.MemoryError as error:
raise MemoryError(error)
def _free_kernels(self):
"""
free all kernels
"""
for kernel in self._cl_kernel_args:
self._cl_kernel_args[kernel] = []
self._program = None
def _set_kernel_arguments(self):
"""Tie arguments of OpenCL kernel-functions to the actual kernels
set_kernel_arguments() is a private method, called by configure().
It uses the dictionary _cl_kernel_args.
Note that by default, since TthRange is disabled, the integration kernels have tth_min_max tied to the tthRange argument slot.
When setRange is called it replaces that argument with tthRange low and upper bounds. When unsetRange is called, the argument slot
is reset to tth_min_max.
"""
self._cl_kernel_args["reduce1"] = [self._cl_mem["pos"], numpy.int32(self.pos_size), self._cl_mem["preresult"]]
self._cl_kernel_args["reduce2"] = [self._cl_mem["preresult"], self._cl_mem["minmax"]]
self._cl_kernel_args["corrections"] = [self._cl_mem["image"], numpy.int32(0), self._cl_mem["dark"], numpy.int32(0), self._cl_mem["flat"], \
numpy.int32(0), self._cl_mem["solidangle"], numpy.int32(0), self._cl_mem["polarization"], \
numpy.int32(0), numpy.float32(0), numpy.float32(0)]
self._cl_kernel_args["integrate1"] = [self._cl_mem["pos"], self._cl_mem["image"], self._cl_mem["minmax"], numpy.int32(0), self.pos0Range[0], \
self.pos1Range[0], numpy.int32(0), numpy.float32(0), self._cl_mem["outData"], self._cl_mem["outCount"]]
self._cl_kernel_args["integrate2"] = [self._cl_mem["outData"], self._cl_mem["outCount"], self._cl_mem["outMerge"]]
self._cl_kernel_args["memset_out"] = [self._cl_mem[i] for i in ["outData", "outCount", "outMerge"]]
self._cl_kernel_args["u16_to_float"] = [self._cl_mem[i] for i in ["image_u16", "image"]]
self._cl_kernel_args["s32_to_float"] = [self._cl_mem[i] for i in ["image", "image"]]
def integrate(self, data, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None, dark_checksum=None, flat_checksum=None, solidAngle_checksum=None, polarization_checksum=None):
events = []
with self._sem:
if data.dtype == numpy.uint16:
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image_u16"], numpy.ascontiguousarray(data))
cast_u16_to_float = self._program.u16_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["u16_to_float"])
events+=[("copy image",copy_image),("cast", cast_u16_to_float)]
elif data.dtype == numpy.int32:
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data))
cast_s32_to_float = self._program.s32_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["s32_to_float"])
events+=[("copy image",copy_image),("cast", cast_s32_to_float)]
else:
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data, dtype=numpy.float32))
events+=[("copy image",copy_image)]
memset = self._program.memset_out(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["memset_out"])
events+=[("memset",memset)]
if dummy is not None:
do_dummy = numpy.int32(1)
dummy = numpy.float32(dummy)
if delta_dummy is None:
delta_dummy = numpy.float32(0)
else:
delta_dummy = numpy.float32(abs(delta_dummy))
else:
do_dummy = numpy.int32(0)
dummy = numpy.float32(0)
delta_dummy = numpy.float32(0)
self._cl_kernel_args["corrections"][9] = do_dummy
self._cl_kernel_args["corrections"][10] = dummy
self._cl_kernel_args["corrections"][11] = delta_dummy
self._cl_kernel_args["integrate1"][6] = do_dummy
self._cl_kernel_args["integrate1"][7] = dummy
if dark is not None:
do_dark = numpy.int32(1)
if not dark_checksum:
dark_checksum = crc32(dark)
if dark_checksum != self.on_device["dark"]:
ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["dark"], numpy.ascontiguousarray(dark, dtype=numpy.float32))
events.append("copy dark",ev)
self.on_device["dark"] = dark_checksum
else:
do_dark = numpy.int32(0)
self._cl_kernel_args["corrections"][1] = do_dark
if flat is not None:
do_flat = numpy.int32(1)
if not flat_checksum:
flat_checksum = crc32(flat)
if self.on_device["flat"] != flat_checksum:
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["flat"], numpy.ascontiguousarray(flat, dtype=numpy.float32))
events.append("copy flat",ev)
self.on_device["flat"] = flat_checksum
else:
do_flat = numpy.int32(0)
self._cl_kernel_args["corrections"][3] = do_flat
if solidAngle is not None:
do_solidAngle = numpy.int32(1)
if not solidAngle_checksum:
solidAngle_checksum = crc32(solidAngle)
if solidAngle_checksum != self.on_device["solidangle"]:
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["solidangle"], numpy.ascontiguousarray(solidAngle, dtype=numpy.float32))
events.append(("copy solidangle",ev))
self.on_device["solidangle"] = solidAngle_checksum
else:
do_solidAngle = numpy.int32(0)
self._cl_kernel_args["corrections"][5] = do_solidAngle
if polarization is not None:
do_polarization = numpy.int32(1)
if not polarization_checksum:
polarization_checksum = crc32(polarization)
if polarization_checksum != self.on_device["polarization"]:
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["polarization"], numpy.ascontiguousarray(polarization, dtype=numpy.float32))
events.append(("copy polarization",ev))
self.on_device["polarization"] = polarization_checksum
else:
do_polarization = numpy.int32(0)
self._cl_kernel_args["corrections"][7] = do_polarization
copy_image.wait()
if do_dummy + do_polarization + do_solidAngle + do_flat + do_dark > 0:
ev = self._program.corrections(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["corrections"])
events.append(("corrections",ev))
integrate1 = self._program.integrate1(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["integrate1"])
events.append(("integrate1",integrate1))
outMerge = numpy.empty(self.bins, dtype=numpy.float32)
outData = numpy.empty(self.bins, dtype=numpy.float32)
outCount = numpy.empty(self.bins, dtype=numpy.float32)
ev=pyopencl.enqueue_copy(self._queue, outData, self._cl_mem["outData"])
events.append(("copy D->H outData",ev))
ev=pyopencl.enqueue_copy(self._queue, outCount, self._cl_mem["outCount"])
events.append(("copy D->H outCount",ev))
global_size_integrate2 = (self.bins + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),
integrate2 = self._program.integrate2(self._queue, global_size_integrate2, self.workgroup_size, *self._cl_kernel_args["integrate2"])
events.append(("integrate2",integrate2))
ev=pyopencl.enqueue_copy(self._queue, outMerge, self._cl_mem["outMerge"])
events.append(("copy D->H outMerge",ev))
ev.wait()
if self.profile:
self.events+=events
return outMerge, outData, outCount
def log_profile(self):
"""
If we are in profiling mode, prints out all timing for every single OpenCL call
"""
t = 0.0
if self.profile:
for e in self.events:
if "__len__" in dir(e) and len(e) >= 2:
et = 1e-6 * (e[1].profile.end - e[1].profile.start)
print("%50s:\t%.3fms" % (e[0], et))
t += et
print("_"*80)
print("%50s:\t%.3fms" % ("Total execution time", t))


@ -49,14 +49,10 @@ from distutils.sysconfig import get_python_lib
from distutils.command.install_data import install_data
################################################################################
# Remove MANIFEST file ... it needs to be re-generated on the fly
# Check for Cython
################################################################################
if op.isfile("MANIFEST"):
os.unlink("MANIFEST")
################################################################################
# Check for Cython
################################################################################
try:
from Cython.Distutils import build_ext
CYTHON = True
@ -124,7 +120,6 @@ if ("sdist" in sys.argv):
# pyFAI extensions
# ###############################################################################
cython_modules = [os.path.splitext(os.path.basename(i))[0] for i in glob.glob("src/*.pyx")]
src = dict([(ext, join("src", ext + cython_c_ext)) for ext in cython_modules])
_geometry_dic = dict(name="_geometry",
@ -151,6 +146,16 @@ splitPixel_dic = dict(name="splitPixel",
sources=[src['splitPixel']],
)
splitPixelFull_dic = dict(name="splitPixelFull",
include_dirs=get_numpy_include_dirs(),
sources=[src['splitPixelFull']],
)
splitPixelFullLUT_dic = dict(name="splitPixelFullLUT",
include_dirs=get_numpy_include_dirs(),
sources=[src['splitPixelFullLUT']],
)
splitBBox_dic = dict(name="splitBBox",
include_dirs=get_numpy_include_dirs(),
sources=[src['splitBBox']],
@ -200,6 +205,7 @@ _bispev_dic = dict(name="_bispev",
sources=[src['_bispev'] ],
extra_compile_args=['openmp'],
extra_link_args=['openmp'],
)
_convolution_dic = dict(name="_convolution",
@ -266,6 +272,7 @@ if sys.platform == "win32":
if (filein + ".py") not in script_files:
shutil.copyfile(filein, filein + ".py")
script_files.append(filein + ".py")
else:
script_files = glob.glob("scripts/*")
@ -420,10 +427,11 @@ This python module can be found on:
http://pypi.python.org/pypi/pyopencl
""")
"""
################################################################################
# ###############################################################################
# check if OpenMP modules, freshly installed can import
################################################################################
# ###############################################################################
pyFAI = None
sys.path.insert(0, os.path.dirname(installDir))
# print installDir
@ -436,7 +444,7 @@ for mod in sys.modules.copy():
try:
import pyFAI
except ImportError as E:
print("Unable to import pyFAI from system: %s" % E)
print("Unable to import pyFAI: %s" % E)
else:
print("PyFAI is installed in %s" % pyFAI.__file__)
try:
@ -445,5 +453,4 @@ else:
print("PyFAI.histogram failed to import. It is likely there is an OpenMP error: %s" % E)
else:
print("OpenMP libraries were found and pyFAI.histogram was successfully imported")
"""


@ -903,3 +903,114 @@ def cal_LUT(float[:,:,:,:] pos not None, shape, int size, max_pixel_size):
outMax[ml, nl] = k + 1
idx += 1
return lut.reshape(shape0 * shape1, size)
def cal_CSR(float[:,:,:,:] pos not None, shape, bin_size, max_pixel_size):
"""
@param pos: 4D position array
@param shape: output shape
@param bin_size: number of input elements per output element (as numpy array)
@param max_pixel_size: (2-tuple of int) size of a buffer covering the largest pixel
@return: look-up table in CSR format: 3-tuple of array"""
cdef int i, j, ms, ml, ns, nl, shape0, shape1, delta0, delta1, buffer_size, i0, i1, bins, lut_size
cdef int offset0, offset1, box_size0, box_size1
cdef numpy.int32_t k, idx=0
cdef float A0, A1, B0, B1, C0, C1, D0, D1, pAB, pBC, pCD, pDA, cAB, cBC, cCD, cDA, area, value
cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros(shape, dtype=numpy.int32)
cdef float[:,:] buffer
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr, indices
cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
shape0, shape1 = shape
delta0, delta1 = max_pixel_size
bins = shape0*shape1
indptr = numpy.empty(bins+1, dtype=numpy.int32)
indptr[0] = 0
indptr[1:] = bin_size.cumsum(dtype=numpy.int32)
lut_size = indptr[bins]
indices = numpy.zeros(shape=lut_size, dtype=numpy.int32)
data = numpy.zeros(shape=lut_size, dtype=numpy.float32)
indices_size = lut_size*sizeof(numpy.int32)
data_size = lut_size*sizeof(numpy.float32)
indptr_size = bins*sizeof(numpy.int32)
logger.info("CSR matrix: %.3f MByte"%((indices_size+data_size+indptr_size)/1.0e6))
buffer = cvarray(shape=(delta0, delta1), itemsize=sizeof(float), format="f")
buffer_size = delta0 * delta1 * sizeof(float)
logger.info("Max pixel size: %ix%i; Max source pixel in target: %i"%(buffer.shape[1],buffer.shape[0], lut_size))
with nogil:
# i,j, idx are indices of the raw image uncorrected
for i in range(shape0):
for j in range(shape1):
#reinit of buffer
memset(&buffer[0,0], 0, buffer_size)
A0 = pos[i, j, 0, 0]
A1 = pos[i, j, 0, 1]
B0 = pos[i, j, 1, 0]
B1 = pos[i, j, 1, 1]
C0 = pos[i, j, 2, 0]
C1 = pos[i, j, 2, 1]
D0 = pos[i, j, 3, 0]
D1 = pos[i, j, 3, 1]
offset0 = (<int> floor(min4f(A0, B0, C0, D0)))
offset1 = (<int> floor(min4f(A1, B1, C1, D1)))
box_size0 = (<int> ceil(max4f(A0, B0, C0, D0))) - offset0
box_size1 = (<int> ceil(max4f(A1, B1, C1, D1))) - offset1
A0 -= <float> offset0
A1 -= <float> offset1
B0 -= <float> offset0
B1 -= <float> offset1
C0 -= <float> offset0
C1 -= <float> offset1
D0 -= <float> offset0
D1 -= <float> offset1
if B0 != A0:
pAB = (B1 - A1) / (B0 - A0)
cAB = A1 - pAB * A0
else:
pAB = cAB = 0.0
if C0 != B0:
pBC = (C1 - B1) / (C0 - B0)
cBC = B1 - pBC * B0
else:
pBC = cBC = 0.0
if D0 != C0:
pCD = (D1 - C1) / (D0 - C0)
cCD = C1 - pCD * C0
else:
pCD = cCD = 0.0
if A0 != D0:
pDA = (A1 - D1) / (A0 - D0)
cDA = D1 - pDA * D0
else:
pDA = cDA = 0.0
integrate(buffer, B0, A0, pAB, cAB)
integrate(buffer, A0, D0, pDA, cDA)
integrate(buffer, D0, C0, pCD, cCD)
integrate(buffer, C0, B0, pBC, cBC)
area = 0.5*((C0 - A0)*(D1 - B1)-(C1 - A1)*(D0 - B0))
for ms in range(box_size0):
ml = ms + offset0
if ml < 0 or ml >= shape0:
continue
for ns in range(box_size1):
# ms,ns are indexes of the corrected image in short form, ml & nl are the same
nl = ns + offset1
if nl < 0 or nl >= shape1:
continue
value = buffer[ms, ns] / area
if value <= 0:
continue
k = outMax[ml,nl]
tmp_index = indptr[ml*shape1+nl]
indices[tmp_index+k] = idx
data[tmp_index+k] = value
outMax[ml,nl] = k + 1
idx += 1
return (data, indices, indptr)
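Since cal_CSR returns the standard (data, indices, indptr) triple, applying the correction reduces to a sparse matrix-vector product; a sketch using scipy (scipy is an assumption here, the module itself does not depend on it):

import numpy
from scipy.sparse import csr_matrix

# data, indices, indptr as returned by cal_CSR; shape = (shape0, shape1)
csr = csr_matrix((data, indices, indptr), shape=(shape0 * shape1, raw_image.size))
corrected = csr.dot(raw_image.ravel().astype(numpy.float32)).reshape(shape0, shape1)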


@ -438,7 +438,7 @@ class Distortion(object):
def correctHost(self, image):
"""
Correct an image based on the look-up table calculated ...
Caclulation takes place on the Host
Calculation takes place on the Host
@param image: 2D-array with the image
@return: corrected 2D image

src/splitPixelFull.c (new file, 22542 lines): diff suppressed because it is too large.

src/splitPixelFull.pyx (new file, 739 lines):

@ -0,0 +1,739 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Project: Azimuthal integration
# https://forge.epn-campus.eu/projects/azimuthal
#
# File: "$Id$"
#
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
# Principal author: Giannis Ashiotis
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import cython
cimport numpy
import numpy
from libc.math cimport fabs, floor
from libc.stdio cimport printf
from cython.view cimport array as cvarray
#cdef double areaTriangle(double a0,
# double a1,
# double b0,
# double b1,
# double c0,
# double c1):
# """
# Calculate the area of the ABC triangle with corners:
# A(a0,a1)
# B(b0,b1)
# C(c0,c1)
# @return: area, i.e. 1/2 * (B-A)^(C-A)
# """
# return 0.5 * abs(((b0 - a0) * (c1 - a1)) - ((b1 - a1) * (c0 - a0)))
#
cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
"""
Calculate the area of the ABCD quadrilateral with corners:
A(a0,a1)
B(b0,b1)
C(c0,c1)
D(d0,d1)
@return: area, i.e. 1/2 * (AC ^ BD)
"""
return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))
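# Worked check (editor's note): for the unit square A(0,0), B(1,0), C(1,1), D(0,1),
# area4 gives 0.5 * |((1-0)*(1-0)) - ((1-0)*(0-1))| = 0.5 * |1 + 1| = 1.0, as expected.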
# cdef double area4(point2D *pixel):
# """
# Calculate the area of the ABCD quadrilataire with corners:
# A(a0,a1)
# B(b0,b1)
# C(c0,c1)
# D(d0,d1)
# @return: area, i.e. 1/2 * (AC ^ BD)
# """
# return 0.5 * abs(((pixel[2].x - pixel[0].x) * (pixel[3].y - pixel[1].y)) - ((pixel[2].y - pixel[0].y) * (pixel[3].x - pixel[1].x)))
# cdef struct point2D:
# numpy.float64_t x
# numpy.float64_t y
# cdef struct min_max:
# numpy.float64_t pos
# numpy.int32_t point
cdef struct Function:
double slope
double intersect
#cdef class Function:
#cdef double _slope
#cdef double _intersect
#def __cinit__(self, double A0=0.0, double A1=0.0, double B0=1.0, double B1=0.0):
#self._slope = (B1-A1)/(B0-A0)
#self._intersect = A1 - self._slope*A0
#def __cinit__(self):
#self._slope = 0.0
#self._intersect = 0.0
#cdef double f(self, double x):
#return self._slope*x + self._intersect
#cdef double integrate(self, double A0, double B0) nogil:
#if A0==B0:
#return 0.0
#else:
#return self._slope*(B0*B0 - A0*A0)*0.5 + self._intersect*(B0-A0)
#cdef void reset(self, double A0, double A1, double B0, double B1) nogil:
#self._slope = (B1-A1)/(B0-A0)
#self._intersect = A1 - self._slope*A0
cdef double integrate( double A0, double B0, Function AB) nogil:
"""
integrates the line defined by AB, from A0 to B0
param A0: first limit
param B0: second limit
param AB: struct with the slope and point of intersection of the line
"""
if A0==B0:
return 0.0
else:
return AB.slope*(B0*B0 - A0*A0)*0.5 + AB.intersect*(B0-A0)
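# Worked check (editor's note): for the line y = 2*x + 1 (AB.slope=2, AB.intersect=1),
# integrate(0.0, 1.0, AB) = 2*(1-0)*0.5 + 1*(1-0) = 2.0, matching the analytic
# integral of 2x+1 over [0, 1].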
@cython.cdivision(True)
cdef double getBinNr(double x0, double pos0_min, double dpos) nogil:
"""
calculate the bin number for any point
param x0: current position
param pos0_min: position minimum
param dpos: bin width
"""
return (x0 - pos0_min) / dpos
# cdef min_max min4f(point2D *pixel, int dim) nogil:
# cdef min_max tmp
# if dim == 0:
# if (pixel[0].x <= pixel[1].x) and (pixel[0].x <= pixel[2].x) and (pixel[0].x <= pixel[3].x):
# tmp.pos = pixel[0].x
# tmp.pixel = 0
# return tmp
# if (pixel[1].x <= pixel[0].x) and (pixel[1].x <= pixel[2].x) and (pixel[1].x <= pixel[3].x):
# tmp.pos = pixel[1].x
# tmp.pixel = 1
# return tmp
# if (pixel[2].x <= pixel[0].x) and (pixel[2].x <= pixel[1].x) and (pixel[2].x <= pixel[3].x):
# tmp.pos = pixel[2].x
# tmp.pixel = 2
# return tmp
# else:
# tmp.pos = pixel[3].x
# tmp.pixel = 3
# return tmp
# elif dim == 1:
# if (pixel[0].y <= pixel[1].y) and (pixel[0].y <= pixel[2].y) and (pixel[0].y <= pixel[3].y):
# tmp.pos = pixel[0].y
# tmp.pixel = 0
# return tmp
# if (pixel[1].y <= pixel[0].y) and (pixel[1].y <= pixel[2].y) and (pixel[1].y <= pixel[3].y):
# tmp.pos = pixel[1].y
# tmp.pixel = 1
# return tmp
# if (pixel[2].y <= pixel[0].y) and (pixel[2].y <= pixel[1].y) and (pixel[2].y <= pixel[3].y):
# tmp.pos = pixel[2].y
# tmp.pixel = 2
# return tmp
# else:
# tmp.pos = pixel[3].y
# tmp.pixel = 3
# return tmp
# cdef min_max max4f(point2D *pixel, int dim) nogil:
# cdef min_max tmp
# if dim == 0:
# if (pixel[0].x >= pixel[1].x) and (pixel[0].x >= pixel[2].x) and (pixel[0].x >= pixel[3].x):
# tmp.pos = pixel[0].x
# tmp.pixel = 0
# return tmp
# if (pixel[1].x >= pixel[0].x) and (pixel[1].x >= pixel[2].x) and (pixel[1].x >= pixel[3].x):
# tmp.pos = pixel[1].x
# tmp.pixel = 1
# return tmp
# if (pixel[2].x >= pixel[0].x) and (pixel[2].x >= pixel[1].x) and (pixel[2].x >= pixel[3].x):
# tmp.pos = pixel[2].x
# tmp.pixel = 2
# return tmp
# else:
# tmp.pos = pixel[3].x
# tmp.pixel = 3
# return tmp
# elif dim == 1:
# if (pixel[0].y >= pixel[1].y) and (pixel[0].y >= pixel[2].y) and (pixel[0].y >= pixel[3].y):
# tmp.pos = pixel[0].y
# tmp.pixel = 0
# return tmp
# if (pixel[1].y >= pixel[0].y) and (pixel[1].y >= pixel[2].y) and (pixel[1].y >= pixel[3].y):
# tmp.pos = pixel[1].y
# tmp.pixel = 1
# return tmp
# if (pixel[2].y >= pixel[0].y) and (pixel[2].y >= pixel[1].y) and (pixel[2].y >= pixel[3].y):
# tmp.pos = pixel[2].y
# tmp.pixel = 2
# return tmp
# else:
# tmp.pos = pixel[3].y
# tmp.pixel = 3
# return tmp
cdef double min4f(double a, double b, double c, double d) nogil:
"""Calculates the min of 4 double numbers"""
if (a <= b) and (a <= c) and (a <= d):
return a
if (b <= a) and (b <= c) and (b <= d):
return b
if (c <= a) and (c <= b) and (c <= d):
return c
else:
return d
cdef double max4f(double a, double b, double c, double d) nogil:
"""Calculates the max of 4 double numbers"""
if (a >= b) and (a >= c) and (a >= d):
return a
if (b >= a) and (b >= c) and (b >= d):
return b
if (c >= a) and (c >= b) and (c >= d):
return c
else:
return d
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def fullSplit1D(numpy.ndarray pos not None,
numpy.ndarray weights not None,
size_t bins=100,
pos0Range=None,
pos1Range=None,
dummy=None,
delta_dummy=None,
mask=None,
dark=None,
flat=None,
solidangle=None,
polarization=None
):
"""
Calculates histogram of pos weighted by weights
Splitting is done on each pixel's exact boundaries (full pixel splitting), not just its bounding box as in fit2D.
No accuracy compromise has been made for speed here.
@param pos: 3D or 4D array with the coordinates of each pixel point
@param weights: array with intensities
@param bins: number of output bins
@param pos0Range: minimum and maximum of the 2th range
@param pos1Range: minimum and maximum of the chi range
@param dummy: value for bins without pixels
@param delta_dummy: precision of dummy value
@param mask: array (of int8) with masked pixels with 1 (0=not masked)
@param dark: array (of float64) with dark noise to be subtracted (or None)
@param flat: array (of float64) with flat image
@param polarization: array (of float64) with polarization correction
@param solidangle: array (of float64) with solid angle corrections
@return 2theta, I, weighted histogram, unweighted histogram
"""
cdef size_t size = weights.size
if pos.ndim>3: #create a view
pos = pos.reshape((-1,4,2))
assert pos.shape[0] == size
assert pos.shape[1] == 4
assert pos.shape[2] == 2
assert pos.ndim == 3
assert bins > 1
cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(pos,dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outMerge = numpy.zeros(bins, dtype=numpy.float64)
cdef numpy.int8_t[:] cmask
cdef double[:] cflat, cdark, cpolarization, csolidangle
cdef double cdummy=0, cddummy=0, data=0
cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_max=0, pos1_maxin=0
cdef double areaPixel=0, dpos=0, fbin0_min=0, fbin0_max=0#, fbin1_min, fbin1_max
cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
cdef double oneOverPixelArea=0, partialArea=0, partialArea2=0, tmp=0
cdef double max0, min0
#cdef point2D[:] pixel
cdef Function AB, BC, CD, DA
cdef double epsilon=1e-10
cdef bint check_pos1=False, check_mask=False, check_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
cdef int i=0, idx=0, bin=0, bin0_max=0, bin0_min=0, pixel_bins=0, cur_bin
if pos0Range is not None and len(pos0Range) > 1:
pos0_min = min(pos0Range)
pos0_maxin = max(pos0Range)
else:
pos0_min = pos[:, :, 0].min()
pos0_maxin = pos[:, :, 0].max()
pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
if pos1Range is not None and len(pos1Range) > 1:
pos1_min = min(pos1Range)
pos1_maxin = max(pos1Range)
check_pos1 = True
else:
pos1_min = pos[:, :, 1].min()
pos1_maxin = pos[:, :, 1].max()
pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
dpos = (pos0_max - pos0_min) / (< double > (bins))
outPos = numpy.linspace(pos0_min+0.5*dpos, pos0_maxin-0.5*dpos, bins)
if (dummy is not None) and (delta_dummy is not None):
check_dummy = True
cdummy = float(dummy)
cddummy = float(delta_dummy)
elif (dummy is not None):
check_dummy = True
cdummy = float(dummy)
cddummy = 0.0
else:
check_dummy = False
cdummy = 0.0
cddummy = 0.0
if mask is not None:
check_mask = True
assert mask.size == size
cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
if dark is not None:
do_dark = True
assert dark.size == size
cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
if flat is not None:
do_flat = True
assert flat.size == size
cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
if polarization is not None:
do_polarization = True
assert polarization.size == size
cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
if solidangle is not None:
do_solidangle = True
assert solidangle.size == size
csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
#pixel = cvarray(shape=4, itemsize=sizeof(point2D))
#AB = Function()
#BC = Function()
#CD = Function()
#DA = Function()
with nogil:
for idx in range(size):
if (check_mask) and (cmask[idx]):
continue
data = cdata[idx]
if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
continue
# pixel[0].x = getBinNr(< double > cpos[idx, 0, 0], pos0_min, dpos)
# pixel[0].y = < double > cpos[idx, 0, 1]
# pixel[1].x = getBinNr(< double > cpos[idx, 1, 0], pos0_min, dpos)
# pixel[1].y = < double > cpos[idx, 1, 1]
# pixel[2].x = getBinNr(< double > cpos[idx, 2, 0], pos0_min, dpos)
# pixel[2].y = < double > cpos[idx, 2, 1]
# pixel[3].x = getBinNr(< double > cpos[idx, 3, 0], pos0_min, dpos)
# pixel[3].y = < double > cpos[idx, 3, 1]
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, dpos)
A1 = < double > cpos[idx, 0, 1]
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, dpos)
B1 = < double > cpos[idx, 1, 1]
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, dpos)
C1 = < double > cpos[idx, 2, 1]
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, dpos)
D1 = < double > cpos[idx, 3, 1]
min0 = min(A0, B0, C0, D0)
max0 = max(A0, B0, C0, D0)
if (max0<0) or (min0 >=bins):
continue
if check_pos1:
min1 = min(A1, B1, C1, D1)
max1 = max(A1, B1, C1, D1)
if (max1<pos1_min) or (min1 > pos1_maxin):
continue
if do_dark:
data -= cdark[idx]
if do_flat:
data /= cflat[idx]
if do_polarization:
data /= cpolarization[idx]
if do_solidangle:
data /= csolidangle[idx]
bin0_min = < int > floor(min0)
bin0_max = < int > floor(max0)
#printf("%d - [(%f %f) (%f %f) (%f %f) (%f %f)] (%f %f) (%d %d)\n",idx, A0, A1, B0, B1, C0, C1, D0, D1, min0, max0, bin0_min, bin0_max)
if bin0_min == bin0_max:
#All pixel is within a single bin
outCount[bin0_min] += 1
outData[bin0_min] += data
# else we have pixel spliting.
else:
AB.slope=(B1-A1)/(B0-A0)
AB.intersect= A1 - AB.slope*A0
BC.slope=(C1-B1)/(C0-B0)
BC.intersect= B1 - BC.slope*B0
CD.slope=(D1-C1)/(D0-C0)
CD.intersect= C1 - CD.slope*C0
DA.slope=(A1-D1)/(A0-D0)
DA.intersect= D1 - DA.slope*D0
areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
oneOverPixelArea = 1.0 / areaPixel
partialArea2 = 0.0
for bin in range(bin0_min, bin0_max+1):
A_lim = (A0<=bin)*(A0<=(bin+1))*bin + (A0>bin)*(A0<=(bin+1))*A0 + (A0>bin)*(A0>(bin+1))*(bin+1)
B_lim = (B0<=bin)*(B0<=(bin+1))*bin + (B0>bin)*(B0<=(bin+1))*B0 + (B0>bin)*(B0>(bin+1))*(bin+1)
C_lim = (C0<=bin)*(C0<=(bin+1))*bin + (C0>bin)*(C0<=(bin+1))*C0 + (C0>bin)*(C0>(bin+1))*(bin+1)
D_lim = (D0<=bin)*(D0<=(bin+1))*bin + (D0>bin)*(D0<=(bin+1))*D0 + (D0>bin)*(D0>(bin+1))*(bin+1)
partialArea = integrate(A_lim, B_lim, AB)
partialArea += integrate(B_lim, C_lim, BC)
partialArea += integrate(C_lim, D_lim, CD)
partialArea += integrate(D_lim, A_lim, DA)
tmp = fabs(partialArea) * oneOverPixelArea
partialArea2 += partialArea
outCount[bin] += tmp
outData[bin] += data * tmp
#if fabs(partialArea2-areaPixel) > epsilon:
#printf("%d - %f \n",idx,(partialArea2-areaPixel)/areaPixel)
for i in range(bins):
if outCount[i] > epsilon:
outMerge[i] = outData[i] / outCount[i]
else:
outMerge[i] = cdummy
return outPos, outMerge, outData, outCount
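A hedged sketch of calling fullSplit1D directly (again using array_from_unit for the corner array; file names are placeholders):

import fabio
import pyFAI
from pyFAI import splitPixelFull

ai = pyFAI.load("detector.poni")    # placeholder
img = fabio.open("frame.edf").data  # placeholder
pos = ai.array_from_unit(img.shape, "corner", "2th_deg")

tth, I, sum_, count = splitPixelFull.fullSplit1D(pos, img, bins=1000)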
#@cython.cdivision(True)
#@cython.boundscheck(False)
#@cython.wraparound(False)
#def fullSplit2D(numpy.ndarray pos not None,
#numpy.ndarray weights not None,
#bins not None,
#pos0Range=None,
#pos1Range=None,
#dummy=None,
#delta_dummy=None,
#mask=None,
#dark=None,
#flat=None,
#solidangle=None,
#polarization=None):
#"""
#Calculate 2D histogram of pos weighted by weights
#Splitting is done on the pixel's bounding box like fit2D
#@param pos: 3D array with pos0; Corner A,B,C,D; tth or chi
#@param weights: array with intensities
#@param bins: number of output bins int or 2-tuple of int
#@param pos0Range: minimum and maximum of the 2th range
#@param pos1Range: minimum and maximum of the chi range
#@param dummy: value for bins without pixels
#@param delta_dummy: precision of dummy value
#@param mask: array (of int8) with masked pixels with 1 (0=not masked)
#@param dark: array (of float64) with dark noise to be subtracted (or None)
#@param flat: array (of float64) with flat-field image
#@param polarization: array (of float64) with polarization correction
#@param solidangle: array (of float64)with solid angle corrections
#@return I, edges0, edges1, weighted histogram(2D), unweighted histogram (2D)
#"""
#cdef size_t bins0=0, bins1=0, size = weights.size
#if pos.ndim>3: #create a view
#pos = pos.reshape((-1,4,2))
#assert pos.shape[0] == size
#assert pos.shape[1] == 4 # 4 corners
#assert pos.shape[2] == 2 # tth and chi
#assert pos.ndim == 3
#try:
#bins0, bins1 = tuple(bins)
#except:
#bins0 = bins1 = < size_t > bins
#if bins0 <= 0:
#bins0 = 1
#if bins1 <= 0:
#bins1 = 1
#cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = pos.astype(numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = weights.astype(numpy.float64).ravel()
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outMerge = numpy.zeros((bins0, bins1), dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges0 = numpy.zeros(bins0, dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges1 = numpy.zeros(bins1, dtype=numpy.float64)
#cdef numpy.int8_t[:] cmask
#cdef double[:] cflat, cdark, cpolarization, csolidangle
#cdef bint check_mask=False, do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
#cdef double cdummy=0, cddummy=0, data=0
#cdef double min0=0, max0=0, min1=0, max1=0, deltaR=0, deltaL=0, deltaU=0, deltaD=0, deltaA=0
#cdef double pos0_min=0, pos0_max=0, pos1_min=0, pos1_max=0, pos0_maxin=0, pos1_maxin=0
#cdef double areaPixel=0, fbin0_min=0, fbin0_max=0, fbin1_min=0, fbin1_max=0
#cdef double a0=0, a1=0, b0=0, b1=0, c0=0, c1=0, d0=0, d1=0
#cdef double epsilon = 1e-10
#cdef size_t bin0_max=0, bin0_min=0, bin1_max=0, bin1_min=0, i=0, j=0, idx=0
#if pos0Range is not None and len(pos0Range) == 2:
#pos0_min = min(pos0Range)
#pos0_maxin = max(pos0Range)
#else:
#pos0_min = pos[:, :, 0].min()
#pos0_maxin = pos[:, :, 0].max()
#pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
#if pos1Range is not None and len(pos1Range) > 1:
#pos1_min = min(pos1Range)
#pos1_maxin = max(pos1Range)
#else:
#pos1_min = pos[:, :, 1].min()
#pos1_maxin = pos[:, :, 1].max()
#pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
#cdef double dpos0 = (pos0_max - pos0_min) / (< double > (bins0))
#cdef double dpos1 = (pos1_max - pos1_min) / (< double > (bins1))
#edges0 = numpy.linspace(pos0_min+0.5*dpos0, pos0_maxin-0.5*dpos0, bins0)
#edges1 = numpy.linspace(pos1_min+0.5*dpos1, pos1_maxin-0.5*dpos1, bins1)
#if (dummy is not None) and (delta_dummy is not None):
#check_dummy = True
#cdummy = float(dummy)
#cddummy = float(delta_dummy)
#elif (dummy is not None):
#check_dummy = True
#cdummy = float(dummy)
#cddummy = 0.0
#else:
#check_dummy = False
#cdummy = 0.0
#cddummy = 0.0
#if mask is not None:
#check_mask = True
#assert mask.size == size
#cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
#if dark is not None:
#do_dark = True
#assert dark.size == size
#cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
#if flat is not None:
#do_flat = True
#assert flat.size == size
#cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
#if polarization is not None:
#do_polarization = True
#assert polarization.size == size
#cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
#if solidangle is not None:
#do_solidangle = True
#assert solidangle.size == size
#csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
#with nogil:
#for idx in range(size):
#if (check_mask) and (cmask[idx]):
#continue
#data = cdata[idx]
#if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
#continue
#a0 = cpos[idx, 0, 0]
#a1 = cpos[idx, 0, 1]
#b0 = cpos[idx, 1, 0]
#b1 = cpos[idx, 1, 1]
#c0 = cpos[idx, 2, 0]
#c1 = cpos[idx, 2, 1]
#d0 = cpos[idx, 3, 0]
#d1 = cpos[idx, 3, 1]
#min0 = min4f(a0, b0, c0, d0)
#max0 = max4f(a0, b0, c0, d0)
#min1 = min4f(a1, b1, c1, d1)
#max1 = max4f(a1, b1, c1, d1)
#if (max0<pos0_min) or (min0 > pos0_maxin) or (max1<pos1_min) or (min1 > pos1_maxin):
#continue
#if do_dark:
#data -= cdark[idx]
#if do_flat:
#data /= cflat[idx]
#if do_polarization:
#data /= cpolarization[idx]
#if do_solidangle:
#data /= csolidangle[idx]
#if min0 < pos0_min:
#data = data * (pos0_min - min0) / (max0 - min0)
#min0 = pos0_min
#if min1 < pos1_min:
#data = data * (pos1_min - min1) / (max1 - min1)
#min1 = pos1_min
#if max0 > pos0_maxin:
#data = data * (max0 - pos0_maxin) / (max0 - min0)
#max0 = pos0_maxin
#if max1 > pos1_maxin:
#data = data * (max1 - pos1_maxin) / (max1 - min1)
#max1 = pos1_maxin
### treat data for pixel on chi discontinuity
#if ((max1 - min1) / dpos1) > (bins1 / 2.0):
#if pos1_maxin - max1 > min1 - pos1_min:
#min1 = max1
#max1 = pos1_maxin
#else:
#max1 = min1
#min1 = pos1_min
#fbin0_min = getBinNr(min0, pos0_min, dpos0)
#fbin0_max = getBinNr(max0, pos0_min, dpos0)
#fbin1_min = getBinNr(min1, pos1_min, dpos1)
#fbin1_max = getBinNr(max1, pos1_min, dpos1)
#bin0_min = < size_t > fbin0_min
#bin0_max = < size_t > fbin0_max
#bin1_min = < size_t > fbin1_min
#bin1_max = < size_t > fbin1_max
#if bin0_min == bin0_max:
#if bin1_min == bin1_max:
##All pixel is within a single bin
#outCount[bin0_min, bin1_min] += 1.0
#outData[bin0_min, bin1_min] += data
#else:
##spread on more than 2 bins
#areaPixel = fbin1_max - fbin1_min
#deltaD = (< double > (bin1_min + 1)) - fbin1_min
#deltaU = fbin1_max - (< double > bin1_max)
#deltaA = 1.0 / areaPixel
#outCount[bin0_min, bin1_min] += deltaA * deltaD
#outData[bin0_min, bin1_min] += data * deltaA * deltaD
#outCount[bin0_min, bin1_max] += deltaA * deltaU
#outData[bin0_min, bin1_max] += data * deltaA * deltaU
## if bin1_min +1< bin1_max:
#for j in range(bin1_min + 1, bin1_max):
#outCount[bin0_min, j] += deltaA
#outData[bin0_min, j] += data * deltaA
#else: #spread on more than 2 bins in dim 0
#if bin1_min == bin1_max:
##All pixel fall on 1 bins in dim 1
#areaPixel = fbin0_max - fbin0_min
#deltaL = (< double > (bin0_min + 1)) - fbin0_min
#deltaA = deltaL / areaPixel
#outCount[bin0_min, bin1_min] += deltaA
#outData[bin0_min, bin1_min] += data * deltaA
#deltaR = fbin0_max - (< double > bin0_max)
#deltaA = deltaR / areaPixel
#outCount[bin0_max, bin1_min] += deltaA
#outData[bin0_max, bin1_min] += data * deltaA
#deltaA = 1.0 / areaPixel
#for i in range(bin0_min + 1, bin0_max):
#outCount[i, bin1_min] += deltaA
#outData[i, bin1_min] += data * deltaA
#else:
##spread on n pix in dim0 and m pixel in dim1:
#areaPixel = (fbin0_max - fbin0_min) * (fbin1_max - fbin1_min)
#deltaL = (< double > (bin0_min + 1.0)) - fbin0_min
#deltaR = fbin0_max - (< double > bin0_max)
#deltaD = (< double > (bin1_min + 1.0)) - fbin1_min
#deltaU = fbin1_max - (< double > bin1_max)
#deltaA = 1.0 / areaPixel
#outCount[bin0_min, bin1_min] += deltaA * deltaL * deltaD
#outData[bin0_min, bin1_min] += data * deltaA * deltaL * deltaD
#outCount[bin0_min, bin1_max] += deltaA * deltaL * deltaU
#outData[bin0_min, bin1_max] += data * deltaA * deltaL * deltaU
#outCount[bin0_max, bin1_min] += deltaA * deltaR * deltaD
#outData[bin0_max, bin1_min] += data * deltaA * deltaR * deltaD
#outCount[bin0_max, bin1_max] += deltaA * deltaR * deltaU
#outData[bin0_max, bin1_max] += data * deltaA * deltaR * deltaU
#for i in range(bin0_min + 1, bin0_max):
#outCount[i, bin1_min] += deltaA * deltaD
#outData[i, bin1_min] += data * deltaA * deltaD
#for j in range(bin1_min + 1, bin1_max):
#outCount[i, j] += deltaA
#outData[i, j] += data * deltaA
#outCount[i, bin1_max] += deltaA * deltaU
#outData[i, bin1_max] += data * deltaA * deltaU
#for j in range(bin1_min + 1, bin1_max):
#outCount[bin0_min, j] += deltaA * deltaL
#outData[bin0_min, j] += data * deltaA * deltaL
#outCount[bin0_max, j] += deltaA * deltaR
#outData[bin0_max, j] += data * deltaA * deltaR
##with nogil:
#for i in range(bins0):
#for j in range(bins1):
#if outCount[i, j] > epsilon:
#outMerge[i, j] = outData[i, j] / outCount[i, j]
#else:
#outMerge[i, j] = cdummy
#return outMerge.T, edges0, edges1, outData.T, outCount.T

src/splitPixelFullLUT.c (new file, 26170 lines): diff suppressed because it is too large.

src/splitPixelFullLUT.pyx (new file, 974 lines):

@ -0,0 +1,974 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Project: Azimuthal integration
# https://github.com/kif/pyFAI
#
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
#
# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
import cython
import os, sys
from cython.parallel import prange
from libc.string cimport memset
import numpy
cimport numpy
from libc.math cimport fabs, M_PI, floor
from libc.stdio cimport printf
cdef float pi = <float> M_PI
cdef float onef = <float> 1.0
try:
from fastcrc import crc32
except ImportError:
from zlib import crc32
EPS32 = (1.0 + numpy.finfo(numpy.float32).eps)
cdef struct Function:
double slope
double intersect
cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
"""
Calculate the area of the ABCD quadrilateral with corners:
A(a0,a1)
B(b0,b1)
C(c0,c1)
D(d0,d1)
@return: area, i.e. 1/2 * (AC ^ BD)
"""
return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))
@cython.cdivision(True)
cdef inline double getBinNr( double x0, double pos0_min, double delta) nogil:
"""
calculate the bin number for any point
param x0: current position
param pos0_min: position minimum
param delta: bin width
"""
return (x0 - pos0_min) / delta
cdef double integrate( double A0, double B0, Function AB) nogil:
"""
integrates the line defined by AB, from A0 to B0
param A0: first limit
param B0: second limit
param AB: struct with the slope and point of intersection of the line
"""
if A0==B0:
return 0.0
else:
return AB.slope*(B0*B0 - A0*A0)*0.5 + AB.intersect*(B0-A0)
class HistoLUT1dFullSplit(object):
"""
Now uses CSR (Compressed Sparse Row) with main attributes:
* nnz: number of non zero elements
* data: coefficient of the matrix in a 1D vector of float32
* indices: Column index position for the data (same size as data)
* indptr: row pointer indicates the start of a given row. len nrow+1
Nota: nnz = indptr[-1]
"""
@cython.boundscheck(False)
def __init__(self,
numpy.ndarray pos not None,
int bins=100,
pos0Range=None,
pos1Range=None,
mask=None,
mask_checksum=None,
allow_pos0_neg=False,
unit="undefined"):
"""
@param pos: 3D or 4D array with the coordinates of each pixel point
@param bins: number of output bins, 100 by default
@param pos0Range: minimum and maximum of the 2th range
@param pos1Range: minimum and maximum of the chi range
@param mask: array (of int8) with masked pixels with 1 (0=not masked)
@param allow_pos0_neg: allow radial positions (q) to be negative; by default q < 0 is not allowed
@param unit: can be 2th_deg or r_nm^-1 ...
"""
# self.padding = int(padding)
if pos.ndim>3: #create a view
pos = pos.reshape((-1,4,2))
assert pos.shape[1] == 4
assert pos.shape[2] == 2
assert pos.ndim == 3
self.pos = pos
self.size = pos.shape[0]
self.bins = bins
self.lut_size = 0
self.allow_pos0_neg = allow_pos0_neg
if mask is not None:
assert mask.size == self.size
self.check_mask = True
self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
if mask_checksum:
self.mask_checksum = mask_checksum
else:
self.mask_checksum = crc32(mask)
else:
self.check_mask = False
self.mask_checksum = None
self.data = self.nnz = self.indices = self.indptr = None
self.pos0Range = pos0Range
self.pos1Range = pos1Range
self.calc_lut()
self.outPos = numpy.linspace(self.pos0_min+0.5*self.delta, self.pos0_maxin-0.5*self.delta, self.bins)
self.lut_checksum = crc32(self.data)
self.unit=unit
self.lut=(self.data,self.indices,self.indptr)
self.lut_nbytes = sum([i.nbytes for i in self.lut])
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def calc_lut(self):
cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(self.pos,dtype=numpy.float64)
cdef numpy.int8_t[:] cmask
cdef numpy.ndarray[numpy.int32_t, ndim = 1] outMax = numpy.zeros(self.bins, dtype=numpy.int32)
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros(self.bins+1, dtype=numpy.int32)
cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_max=0, pos1_maxin=0
cdef double max0, min0
cdef double areaPixel=0, delta=0
cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
cdef double oneOverPixelArea=0, partialArea=0, partialArea2=0, tmp=0
cdef Function AB, BC, CD, DA
cdef int bins, i=0, idx=0, bin=0, bin0_max=0, bin0_min=0, pixel_bins=0, k=0, size=0
cdef bint check_pos1=False, check_mask=False
bins = self.bins
if self.pos0Range is not None and len(self.pos0Range) > 1:
self.pos0_min = min(self.pos0Range)
self.pos0_maxin = max(self.pos0Range)
else:
self.pos0_min = self.pos[:, :, 0].min()
self.pos0_maxin = self.pos[:, :, 0].max()
self.pos0_max = self.pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
if self.pos1Range is not None and len(self.pos1Range) > 1:
self.pos1_min = min(self.pos1Range)
self.pos1_maxin = max(self.pos1Range)
check_pos1 = self.check_pos1 = True
else:
self.pos1_min = self.pos[:, :, 1].min()
self.pos1_maxin = self.pos[:, :, 1].max()
self.pos1_max = self.pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
self.delta = (self.pos0_max - self.pos0_min) / (< double > (bins))
pos0_min = self.pos0_min
pos0_max = self.pos0_max
pos1_min = self.pos1_min
pos1_max = self.pos1_max
delta = self.delta
size = self.size
check_mask = self.check_mask
if check_mask:
cmask = self.cmask
with nogil:
for idx in range(size):
if (check_mask) and (cmask[idx]):
continue
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
A1 = < double > cpos[idx, 0, 1]
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
B1 = < double > cpos[idx, 1, 1]
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
C1 = < double > cpos[idx, 2, 1]
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
D1 = < double > cpos[idx, 3, 1]
min0 = min(A0, B0, C0, D0)
max0 = max(A0, B0, C0, D0)
if (max0<0) or (min0 >=bins):
continue
if check_pos1:
if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
continue
bin0_min = < int > floor(min0)
bin0_max = < int > floor(max0)
for bin in range(bin0_min, bin0_max+1):
outMax[bin] += 1
indptr[1:] = outMax.cumsum()
self.indptr = indptr
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices = numpy.zeros(indptr[bins], dtype=numpy.int32)
cdef numpy.ndarray[numpy.float32_t, ndim = 1] data = numpy.zeros(indptr[bins], dtype=numpy.float32)
#just recycle the outMax array
memset(&outMax[0], 0, bins * sizeof(numpy.int32_t))
with nogil:
for idx in range(size):
if (check_mask) and (cmask[idx]):
continue
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
A1 = < double > cpos[idx, 0, 1]
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
B1 = < double > cpos[idx, 1, 1]
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
C1 = < double > cpos[idx, 2, 1]
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
D1 = < double > cpos[idx, 3, 1]
min0 = min(A0, B0, C0, D0)
max0 = max(A0, B0, C0, D0)
if (max0<0) or (min0 >=bins):
continue
if check_pos1:
if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
continue
bin0_min = < int > floor(min0)
bin0_max = < int > floor(max0)
if bin0_min == bin0_max:
#All pixel is within a single bin
k = outMax[bin0_min]
indices[indptr[bin0_min]+k] = idx
data[indptr[bin0_min]+k] = 1.0
outMax[bin0_min] += 1 #k+1
else: #else we have pixel spliting.
AB.slope=(B1-A1)/(B0-A0)
AB.intersect= A1 - AB.slope*A0
BC.slope=(C1-B1)/(C0-B0)
BC.intersect= B1 - BC.slope*B0
CD.slope=(D1-C1)/(D0-C0)
CD.intersect= C1 - CD.slope*C0
DA.slope=(A1-D1)/(A0-D0)
DA.intersect= D1 - DA.slope*D0
areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
oneOverPixelArea = 1.0 / areaPixel
partialArea2 = 0.0
for bin in range(bin0_min, bin0_max+1):
A_lim = (A0<=bin)*(A0<=(bin+1))*bin + (A0>bin)*(A0<=(bin+1))*A0 + (A0>bin)*(A0>(bin+1))*(bin+1)
B_lim = (B0<=bin)*(B0<=(bin+1))*bin + (B0>bin)*(B0<=(bin+1))*B0 + (B0>bin)*(B0>(bin+1))*(bin+1)
C_lim = (C0<=bin)*(C0<=(bin+1))*bin + (C0>bin)*(C0<=(bin+1))*C0 + (C0>bin)*(C0>(bin+1))*(bin+1)
D_lim = (D0<=bin)*(D0<=(bin+1))*bin + (D0>bin)*(D0<=(bin+1))*D0 + (D0>bin)*(D0>(bin+1))*(bin+1)
partialArea = integrate(A_lim, B_lim, AB)
partialArea += integrate(B_lim, C_lim, BC)
partialArea += integrate(C_lim, D_lim, CD)
partialArea += integrate(D_lim, A_lim, DA)
tmp = fabs(partialArea) * oneOverPixelArea
k = outMax[bin]
indices[indptr[bin]+k] = idx
data[indptr[bin]+k] = tmp
outMax[bin] += 1 #k+1
self.data = data
self.indices = indices
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
"""
Actually perform the integration which in this case looks more like a matrix-vector product
@param weights: input image
@type weights: ndarray
@param dummy: value for dead pixels (optional)
@type dummy: float
@param delta_dummy: precision for dead-pixel value in dynamic masking
@type delta_dummy: float
@param dark: array with the dark-current value to be subtracted (if any)
@type dark: ndarray
        @param flat: array with the flat-field values to be divided by (if any)
@type flat: ndarray
@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
@type solidAngle: ndarray
@param polarization: array with the polarization correction values to be divided by (if any)
@type polarization: ndarray
        @return: positions, pattern, weighted_histogram and unweighted_histogram
@rtype: 4-tuple of ndarrays
"""
cdef numpy.int32_t i=0, j=0, idx=0, bins=self.bins, size=self.size
cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
cdef float data=0, coef=0, cdummy=0, cddummy=0
cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(self.bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
assert size == weights.size
if dummy is not None:
do_dummy = True
cdummy = <float>float(dummy)
if delta_dummy is None:
cddummy = <float>0.0
else:
cddummy = <float>float(delta_dummy)
if flat is not None:
do_flat = True
assert flat.size == size
cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
if dark is not None:
do_dark = True
assert dark.size == size
cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
if solidAngle is not None:
do_solidAngle = True
assert solidAngle.size == size
csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
if polarization is not None:
do_polarization = True
assert polarization.size == size
cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
if (do_dark + do_flat + do_polarization + do_solidAngle):
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
cdata = numpy.zeros(size,dtype=numpy.float32)
if do_dummy:
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
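                # keep the raw value only when the pixel is not dummy-like
                # (farther than delta_dummy from the dummy value, or simply
                # different from it when delta_dummy is 0)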
if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
                    #Note: the -= and /= operators are seen as reductions by cython.parallel, hence the explicit forms
if do_dark:
data = data - cdark[i]
if do_flat:
data = data / cflat[i]
if do_polarization:
data = data / cpolarization[i]
if do_solidAngle:
data = data / csolidAngle[i]
cdata[i]+=data
                else: #set all dummy-like values to cdummy: simplifies further processing
cdata[i]+=cdummy
else:
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
if do_dark:
data = data - cdark[i]
if do_flat:
data = data / cflat[i]
if do_polarization:
data = data / cpolarization[i]
if do_solidAngle:
data = data / csolidAngle[i]
cdata[i]+=data
else:
if do_dummy:
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
cdata = numpy.zeros(size,dtype=numpy.float32)
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
cdata[i]+=data
else:
cdata[i]+=cdummy
else:
cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
for i in prange(bins, nogil=True, schedule="guided"):
sum_data = 0.0
sum_count = 0.0
for j in range(indptr[i],indptr[i+1]):
idx = indices[j]
coef = ccoef[j]
if coef == 0.0:
continue
data = cdata[idx]
if do_dummy and data==cdummy:
continue
sum_data = sum_data + coef * data
sum_count = sum_count + coef
outData[i] += sum_data
outCount[i] += sum_count
if sum_count > epsilon:
outMerge[i] += sum_data / sum_count
else:
outMerge[i] += cdummy
return self.outPos, outMerge, outData, outCount
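    # The loops above amount to a CSR sparse matrix-vector product. A minimal
    # sketch of the same result with scipy (assuming scipy is available;
    # "image" is an illustrative name, and dummy/correction handling is
    # ignored here):
    #
    #     from scipy.sparse import csr_matrix
    #     csr = csr_matrix((self.data, self.indices, self.indptr),
    #                      shape=(self.bins, self.size))
    #     sum_data = csr.dot(image.ravel())         # weighted histogram
    #     sum_count = csr.dot(numpy.ones(self.size, dtype=numpy.float32))
    #     merged = sum_data / numpy.maximum(sum_count, 1e-10)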
################################################################################
# Bidimensional regrouping
################################################################################
#class HistoBBox2d(object):
#@cython.boundscheck(False)
#def __init__(self,
#pos0,
#delta_pos0,
#pos1,
#delta_pos1,
#bins=(100,36),
#pos0Range=None,
#pos1Range=None,
#mask=None,
#mask_checksum=None,
#allow_pos0_neg=False,
#unit="undefined",
#chiDiscAtPi=True
#):
#"""
#@param pos0: 1D array with pos0: tth or q_vect
#@param delta_pos0: 1D array with delta pos0: max center-corner distance
#@param pos1: 1D array with pos1: chi
#@param delta_pos1: 1D array with delta pos1: max center-corner distance, unused!
#@param bins: number of output bins (tth=100, chi=36 by default)
#@param pos0Range: minimum and maximum of the 2th range
#@param pos1Range: minimum and maximum of the chi range
#@param mask: array (of int8) with masked pixels with 1 (0=not masked)
#@param allow_pos0_neg: allow pos0 (e.g. q) to be negative; by default negative values are clipped to 0
#@param chiDiscAtPi: boolean; by default the chi_range is in the range ]-pi,pi[; set to False to have the range ]0,2pi[
#"""
#cdef int i, size, bin0, bin1
#self.size = pos0.size
#assert delta_pos0.size == self.size
#assert pos1.size == self.size
#assert delta_pos1.size == self.size
#self.chiDiscAtPi = 1 if chiDiscAtPi else 0
#self.allow_pos0_neg = allow_pos0_neg
#try:
#bins0, bins1 = tuple(bins)
#except:
#bins0 = bins1 = bins
#if bins0 <= 0:
#bins0 = 1
#if bins1 <= 0:
#bins1 = 1
#self.bins = (int(bins0),int(bins1))
#self.lut_size = 0
#if mask is not None:
#assert mask.size == self.size
#self.check_mask = True
#self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
#if mask_checksum:
#self.mask_checksum = mask_checksum
#else:
#self.mask_checksum = crc32(mask)
#else:
#self.check_mask = False
#self.mask_checksum = None
#self.data = self.nnz = self.indices = self.indptr = None
#self.cpos0 = numpy.ascontiguousarray(pos0.ravel(), dtype=numpy.float32)
#self.dpos0 = numpy.ascontiguousarray(delta_pos0.ravel(), dtype=numpy.float32)
#self.cpos0_sup = numpy.empty_like(self.cpos0)
#self.cpos0_inf = numpy.empty_like(self.cpos0)
#self.pos0Range = pos0Range
#self.pos1Range = pos1Range
#self.cpos1 = numpy.ascontiguousarray((pos1).ravel(), dtype=numpy.float32)
#self.dpos1 = numpy.ascontiguousarray((delta_pos1).ravel(), dtype=numpy.float32)
#self.cpos1_sup = numpy.empty_like(self.cpos1)
#self.cpos1_inf = numpy.empty_like(self.cpos1)
#self.calc_boundaries(pos0Range, pos1Range)
#self.delta0 = (self.pos0_max - self.pos0_min) / float(bins0)
#self.delta1 = (self.pos1_max - self.pos1_min) / float(bins1)
#self.lut_max_idx = self.calc_lut()
#self.outPos0 = numpy.linspace(self.pos0_min+0.5*self.delta0, self.pos0_maxin-0.5*self.delta0, bins0)
#self.outPos1 = numpy.linspace(self.pos1_min+0.5*self.delta1, self.pos1_maxin-0.5*self.delta1, bins1)
#self.unit=unit
#self.lut=(self.data,self.indices,self.indptr)
#self.lut_checksum = crc32(self.data)
#@cython.boundscheck(False)
#@cython.wraparound(False)
#def calc_boundaries(self, pos0Range, pos1Range):
#cdef int size = self.cpos0.size
#cdef bint check_mask = self.check_mask
#cdef numpy.int8_t[:] cmask
#cdef float[:] cpos0, dpos0, cpos0_sup, cpos0_inf
#cdef float[:] cpos1, dpos1, cpos1_sup, cpos1_inf,
#cdef float upper0, lower0, pos0_max, pos0_min, c0, d0
#cdef float upper1, lower1, pos1_max, pos1_min, c1, d1
#cdef bint allow_pos0_neg=self.allow_pos0_neg
#cdef bint chiDiscAtPi = self.chiDiscAtPi
#cpos0_sup = self.cpos0_sup
#cpos0_inf = self.cpos0_inf
#cpos0 = self.cpos0
#dpos0 = self.dpos0
#cpos1_sup = self.cpos1_sup
#cpos1_inf = self.cpos1_inf
#cpos1 = self.cpos1
#dpos1 = self.dpos1
#pos0_min=cpos0[0]
#pos0_max=cpos0[0]
#pos1_min=cpos1[0]
#pos1_max=cpos1[0]
#if check_mask:
#cmask = self.cmask
#with nogil:
#for idx in range(size):
#c0 = cpos0[idx]
#d0 = dpos0[idx]
#lower0 = c0 - d0
#upper0 = c0 + d0
#c1 = cpos1[idx]
#d1 = dpos1[idx]
#lower1 = c1 - d1
#upper1 = c1 + d1
#if not allow_pos0_neg and lower0<0:
#lower0=0
#if upper1 > (2-chiDiscAtPi)*pi:
#upper1 = (2-chiDiscAtPi)*pi
#if lower1 < (-chiDiscAtPi)*pi:
#lower1 = (-chiDiscAtPi)*pi
#cpos0_sup[idx] = upper0
#cpos0_inf[idx] = lower0
#cpos1_sup[idx] = upper1
#cpos1_inf[idx] = lower1
#if not (check_mask and cmask[idx]):
#if upper0>pos0_max:
#pos0_max = upper0
#if lower0<pos0_min:
#pos0_min = lower0
#if upper1>pos1_max:
#pos1_max = upper1
#if lower1<pos1_min:
#pos1_min = lower1
#if pos0Range is not None and len(pos0Range) > 1:
#self.pos0_min = min(pos0Range)
#self.pos0_maxin = max(pos0Range)
#else:
#self.pos0_min = pos0_min
#self.pos0_maxin = pos0_max
#if pos1Range is not None and len(pos1Range) > 1:
#self.pos1_min = min(pos1Range)
#self.pos1_maxin = max(pos1Range)
#else:
#self.pos1_min = pos1_min
#self.pos1_maxin = pos1_max
#if (not allow_pos0_neg) and self.pos0_min < 0:
#self.pos0_min = 0
#self.pos0_max = self.pos0_maxin * EPS32
#self.cpos0_sup = cpos0_sup
#self.cpos0_inf = cpos0_inf
#self.pos1_max = self.pos1_maxin * EPS32
#self.cpos1_sup = cpos1_sup
#self.cpos1_inf = cpos1_inf
#@cython.boundscheck(False)
#@cython.wraparound(False)
#@cython.cdivision(True)
#def calc_lut(self):
#'calculate the max number of elements in the LUT and populate it'
#cdef float delta0=self.delta0, pos0_min=self.pos0_min, min0, max0, fbin0_min, fbin0_max
#cdef float delta1=self.delta1, pos1_min=self.pos1_min, min1, max1, fbin1_min, fbin1_max
#cdef int bin0_min, bin0_max, bins0 = self.bins[0]
#cdef int bin1_min, bin1_max, bins1 = self.bins[1]
#cdef numpy.int32_t k, idx, lut_size, i, j, size=self.size
#cdef bint check_mask
#cdef float[:] cpos0_sup = self.cpos0_sup
#cdef float[:] cpos0_inf = self.cpos0_inf
#cdef float[:] cpos1_inf = self.cpos1_inf
#cdef float[:] cpos1_sup = self.cpos1_sup
#cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
#cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros((bins0*bins1)+1, dtype=numpy.int32)
#cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices
#cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
#cdef numpy.int8_t[:] cmask
#if self.check_mask:
#cmask = self.cmask
#check_mask = True
#else:
#check_mask = False
##NOGIL
#with nogil:
#for idx in range(size):
#if (check_mask) and (cmask[idx]):
#continue
#min0 = cpos0_inf[idx]
#max0 = cpos0_sup[idx]
#min1 = cpos1_inf[idx]
#max1 = cpos1_sup[idx]
#bin0_min = < int > getBinNr(min0, pos0_min, delta0)
#bin0_max = < int > getBinNr(max0, pos0_min, delta0)
#bin1_min = < int > getBinNr(min1, pos1_min, delta1)
#bin1_max = < int > getBinNr(max1, pos1_min, delta1)
#if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
#continue
#if bin0_max >= bins0 :
#bin0_max = bins0 - 1
#if bin0_min < 0:
#bin0_min = 0
#if bin1_max >= bins1 :
#bin1_max = bins1 - 1
#if bin1_min < 0:
#bin1_min = 0
#for i in range(bin0_min, bin0_max+1):
#for j in range(bin1_min , bin1_max+1):
#outMax[i, j] += 1
#self.nnz = outMax.sum()
#indptr[1:] = outMax.cumsum()
#self.indptr = indptr
## self.lut_size = lut_size = outMax.max()
##just recycle the outMax array
##outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
#memset(&outMax[0,0], 0, bins0*bins1*sizeof(numpy.int32_t))
#lut_nbytes = self.nnz * (sizeof(numpy.float32_t)+sizeof(numpy.int32_t)) + bins0*bins1*sizeof(numpy.int32_t)
#if (os.name == "posix") and ("SC_PAGE_SIZE" in os.sysconf_names) and ("SC_PHYS_PAGES" in os.sysconf_names):
#memsize = os.sysconf("SC_PAGE_SIZE")*os.sysconf("SC_PHYS_PAGES")
#if memsize < lut_nbytes:
#raise MemoryError("CSR Matrix is %.3f GB whereas the memory of the system is only %.3f GB" % (lut_nbytes / 2.**30, memsize / 2.**30))
##else hope we have enough memory
#data = numpy.zeros(self.nnz,dtype=numpy.float32)
#indices = numpy.zeros(self.nnz,dtype=numpy.int32)
## lut = numpy.recarray(shape=(bins0, bins1, lut_size),dtype=[("idx",numpy.int32),("coef",numpy.float32)])
## memset(&lut[0,0,0], 0, lut_nbytes)
#with nogil:
#for idx in range(size):
#if (check_mask) and cmask[idx]:
#continue
#min0 = cpos0_inf[idx]
#max0 = cpos0_sup[idx]
#min1 = cpos1_inf[idx]
#max1 = cpos1_sup[idx]
#fbin0_min = getBinNr(min0, pos0_min, delta0)
#fbin0_max = getBinNr(max0, pos0_min, delta0)
#fbin1_min = getBinNr(min1, pos1_min, delta1)
#fbin1_max = getBinNr(max1, pos1_min, delta1)
#bin0_min = <int> fbin0_min
#bin0_max = <int> fbin0_max
#bin1_min = <int> fbin1_min
#bin1_max = <int> fbin1_max
#if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
#continue
#if bin0_max >= bins0 :
#bin0_max = bins0 - 1
#if bin0_min < 0:
#bin0_min = 0
#if bin1_max >= bins1 :
#bin1_max = bins1 - 1
#if bin1_min < 0:
#bin1_min = 0
#if bin0_min == bin0_max:
#if bin1_min == bin1_max:
##the whole pixel is within a single bin
#k = outMax[bin0_min, bin1_min]
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
#data[indptr[bin0_min*bins1+bin1_min]+k] = onef
## lut[bin0_min, bin1_min, k].idx = idx
## lut[bin0_min, bin1_min, k].coef = onef
#outMax[bin0_min, bin1_min]= k+1
#else:
##spread over two or more bins in dim 1
#deltaD = (< float > (bin1_min + 1)) - fbin1_min
#deltaU = fbin1_max - (< float > bin1_max)
#deltaA = 1.0 / (fbin1_max - fbin1_min)
#k = outMax[bin0_min, bin1_min]
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaD
## lut[bin0_min, bin1_min, k].idx = idx
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaD
#outMax[bin0_min, bin1_min] = k + 1
#k = outMax[bin0_min, bin1_max]
#indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
#data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaU
## lut[bin0_min, bin1_max, k].idx = idx
## lut[bin0_min, bin1_max, k].coef = deltaA * deltaU
#outMax[bin0_min, bin1_max] = k + 1
#for j in range(bin1_min + 1, bin1_max):
#k = outMax[bin0_min, j]
#indices[indptr[bin0_min*bins1+j]+k] = idx
#data[indptr[bin0_min*bins1+j]+k] = deltaA
## lut[bin0_min, j, k].idx = idx
## lut[bin0_min, j, k].coef = deltaA
#outMax[bin0_min, j] = k + 1
#else: #spread over two or more bins in dim 0
#if bin1_min == bin1_max:
##the whole pixel falls in a single bin in dim 1
#deltaA = 1.0 / (fbin0_max - fbin0_min)
#deltaL = (< float > (bin0_min + 1)) - fbin0_min
#k = outMax[bin0_min, bin1_min]
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL
## lut[bin0_min, bin1_min, k].idx = idx
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaL
#outMax[bin0_min, bin1_min] = k+1
#deltaR = fbin0_max - (< float > bin0_max)
#k = outMax[bin0_max, bin1_min]
#indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
#data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR
## lut[bin0_max, bin1_min, k].idx = idx
## lut[bin0_max, bin1_min, k].coef = deltaA * deltaR
#outMax[bin0_max, bin1_min] = k + 1
#for i in range(bin0_min + 1, bin0_max):
#k = outMax[i, bin1_min]
#indices[indptr[i*bins1+bin1_min]+k] = idx
#data[indptr[i*bins1+bin1_min]+k] = deltaA
## lut[i, bin1_min ,k].idx = idx
## lut[i, bin1_min, k].coef = deltaA
#outMax[i, bin1_min] = k + 1
#else:
##spread over n bins in dim 0 and m bins in dim 1:
#deltaL = (< float > (bin0_min + 1)) - fbin0_min
#deltaR = fbin0_max - (< float > bin0_max)
#deltaD = (< float > (bin1_min + 1)) - fbin1_min
#deltaU = fbin1_max - (< float > bin1_max)
#deltaA = 1.0 / ((fbin0_max - fbin0_min) * (fbin1_max - fbin1_min))
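## deltaA is the inverse of the pixel bbox area in bin units: interior bins
## get a coefficient of deltaA alone, edge bins deltaA times one fractional
## overlap (deltaL/deltaR/deltaD/deltaU), and corner bins deltaA times two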
#k = outMax[bin0_min, bin1_min]
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL * deltaD
## lut[bin0_min, bin1_min ,k].idx = idx
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaL * deltaD
#outMax[bin0_min, bin1_min] = k + 1
#k = outMax[bin0_min, bin1_max]
#indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
#data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaL * deltaU
## lut[bin0_min, bin1_max, k].idx = idx
## lut[bin0_min, bin1_max, k].coef = deltaA * deltaL * deltaU
#outMax[bin0_min, bin1_max] = k + 1
#k = outMax[bin0_max, bin1_min]
#indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
#data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR * deltaD
## lut[bin0_max, bin1_min, k].idx = idx
## lut[bin0_max, bin1_min, k].coef = deltaA * deltaR * deltaD
#outMax[bin0_max, bin1_min] = k + 1
#k = outMax[bin0_max, bin1_max]
#indices[indptr[bin0_max*bins1+bin1_max]+k] = idx
#data[indptr[bin0_max*bins1+bin1_max]+k] = deltaA * deltaR * deltaU
## lut[bin0_max, bin1_max, k].idx = idx
## lut[bin0_max, bin1_max, k].coef = deltaA * deltaR * deltaU
#outMax[bin0_max, bin1_max] = k + 1
#for i in range(bin0_min + 1, bin0_max):
#k = outMax[i, bin1_min]
#indices[indptr[i*bins1+bin1_min]+k] = idx
#data[indptr[i*bins1+bin1_min]+k] = deltaA * deltaD
## lut[i, bin1_min, k].idx = idx
## lut[i, bin1_min, k].coef = deltaA * deltaD
#outMax[i, bin1_min] = k + 1
#for j in range(bin1_min + 1, bin1_max):
#k = outMax[i, j]
#indices[indptr[i*bins1+j]+k] = idx
#data[indptr[i*bins1+j]+k] = deltaA
## lut[i, j, k].idx = idx
## lut[i, j, k].coef = deltaA
#outMax[i, j] = k + 1
#k = outMax[i, bin1_max]
#indices[indptr[i*bins1+bin1_max]+k] = idx
#data[indptr[i*bins1+bin1_max]+k] = deltaA * deltaU
## lut[i, bin1_max, k].idx = idx
## lut[i, bin1_max, k].coef = deltaA * deltaU
#outMax[i, bin1_max] = k + 1
#for j in range(bin1_min + 1, bin1_max):
#k = outMax[bin0_min, j]
#indices[indptr[bin0_min*bins1+j]+k] = idx
#data[indptr[bin0_min*bins1+j]+k] = deltaA * deltaL
## lut[bin0_min, j, k].idx = idx
## lut[bin0_min, j, k].coef = deltaA * deltaL
#outMax[bin0_min, j] = k + 1
#k = outMax[bin0_max, j]
#indices[indptr[bin0_max*bins1+j]+k] = idx
#data[indptr[bin0_max*bins1+j]+k] = deltaA * deltaR
## lut[bin0_max, j, k].idx = idx
## lut[bin0_max, j, k].coef = deltaA * deltaR
#outMax[bin0_max, j] = k + 1
#self.data = data
#self.indices = indices
#return outMax
#@cython.cdivision(True)
#@cython.boundscheck(False)
#@cython.wraparound(False)
#def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
#"""
#Actually perform the 2D integration which in this case looks more like a matrix-vector product
#@param weights: input image
#@type weights: ndarray
#@param dummy: value for dead pixels (optional)
#@type dummy: float
#@param delta_dummy: precision for dead-pixel value in dynamic masking
#@type delta_dummy: float
#@param dark: array with the dark-current value to be subtracted (if any)
#@type dark: ndarray
#@param flat: array with the flat-field values to be divided by (if any)
#@type flat: ndarray
#@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
#@type solidAngle: ndarray
#@param polarization: array with the polarization correction values to be divided by (if any)
#@type polarization: ndarray
#@return: I(2d), edges0(1d), edges1(1d), weighted histogram(2d), unweighted histogram (2d)
#@rtype: 5-tuple of ndarrays
#"""
#cdef int i=0, j=0, idx=0, bins0=self.bins[0], bins1=self.bins[1], bins=bins0*bins1, size=self.size
#cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
#cdef float data=0, coef=0, cdummy=0, cddummy=0
#cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros(self.bins, dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
#cdef numpy.ndarray[numpy.float32_t, ndim = 2] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData_1d = outData.ravel()
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount_1d = outCount.ravel()
#cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge_1d = outMerge.ravel()
#cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
#cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
#assert size == weights.size
#if dummy is not None:
#do_dummy = True
#cdummy = <float>float(dummy)
#if delta_dummy is None:
#cddummy = <float>0.0
#else:
#cddummy = <float>float(delta_dummy)
#if flat is not None:
#do_flat = True
#assert flat.size == size
#cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
#if dark is not None:
#do_dark = True
#assert dark.size == size
#cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
#if solidAngle is not None:
#do_solidAngle = True
#assert solidAngle.size == size
#csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
#if polarization is not None:
#do_polarization = True
#assert polarization.size == size
#cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
#if (do_dark + do_flat + do_polarization + do_solidAngle):
#tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#cdata = numpy.zeros(size,dtype=numpy.float32)
#if do_dummy:
#for i in prange(size, nogil=True, schedule="static"):
#data = tdata[i]
#if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
##Note: the -= and /= operators are seen as reductions by cython.parallel, hence the explicit forms
#if do_dark:
#data = data - cdark[i]
#if do_flat:
#data = data / cflat[i]
#if do_polarization:
#data = data / cpolarization[i]
#if do_solidAngle:
#data = data / csolidAngle[i]
#cdata[i]+=data
#else: #set all dummy-like values to cdummy: simplifies further processing
#cdata[i]+=cdummy
#else:
#for i in prange(size, nogil=True, schedule="static"):
#data = tdata[i]
#if do_dark:
#data = data - cdark[i]
#if do_flat:
#data = data / cflat[i]
#if do_polarization:
#data = data / cpolarization[i]
#if do_solidAngle:
#data = data / csolidAngle[i]
#cdata[i]+=data
#else:
#if do_dummy:
#tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#cdata = numpy.zeros(size,dtype=numpy.float32)
#for i in prange(size, nogil=True, schedule="static"):
#data = tdata[i]
#if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
#cdata[i]+=data
#else:
#cdata[i]+=cdummy
#else:
#cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
#for i in prange(bins, nogil=True, schedule="guided"):
#sum_data = 0.0
#sum_count = 0.0
#for j in range(indptr[i],indptr[i+1]):
#idx = indices[j]
#coef = ccoef[j]
#data = cdata[idx]
#if do_dummy and data==cdummy:
#continue
#sum_data = sum_data + coef * data
#sum_count = sum_count + coef
#outData_1d[i] += sum_data
#outCount_1d[i] += sum_count
#if sum_count > epsilon:
#outMerge_1d[i] += sum_data / sum_count
#else:
#outMerge_1d[i] += cdummy
#return outMerge.T, self.outPos0, self.outPos1, outData.T, outCount.T
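## In the 2D case the CSR matrix has one row per (bin0, bin1) pair, flattened
## row-major (row = bin0*bins1 + bin1), which is why the fill code above
## indexes indptr[bin0*bins1+bin1]. A sketch of mapping the flat product back
## to an image (illustrative names, ignoring dummy handling):
##     flat = csr.dot(image.ravel())          # shape: (bins0*bins1,)
##     I2d = flat.reshape(bins0, bins1).T     # transposed, like outMerge.T above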

View File

@ -25,7 +25,7 @@ data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.integrate2d(data, 100, 360, method="lut", unit="2th_deg")[0]
obt = ai.integrate2d(data, 100, 360, method="ocl_csr", unit="2th_deg")[0]
##logger.debug("check LUT basics: %s"%abs(obt[1] - ref[1]).max())
#assert numpy.allclose(ref,obt)
assert numpy.allclose(ref,obt)
plot(ref.ravel(), label="ocl_lut")

View File

@ -0,0 +1,42 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
"""
import sys, numpy, time
import utilstest
import fabio, pyopencl
from pylab import *
print "#"*50
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_azim_csr
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")
ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.xrpd_LUT(data, 1000)[1]
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]
#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
pos = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
boo = foo.integrate(data)[1]
foo2 = ocl_azim_csr.OCL_CSR_Integrator(foo.lut, data.size, "GPU", block_size=32)
boo2 = foo2.integrate(data)[0]
plot(ref, label="ocl_csr")
plot(boo, label="csr_fullsplit")
plot(boo2, label="ocl_csr_fullsplit")
legend()
show()
raw_input()
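#A quick numerical cross-check between the two full-split results could be
#added here (illustrative; the bbox-LUT reference is not expected to match
#the full-split curves exactly):
#print abs(boo - boo2).max()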

View File

@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
"""
import sys, numpy, time
import utilstest
import fabio, pyopencl
from pylab import *
print "#"*50
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_hist_pixelsplit
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")
ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data
ref = ai.xrpd_LUT(data, 1000)[1]
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]
#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
pos = pos_in.reshape(pos_in.size/8,4,2)
foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
boo = foo.integrate(data)[1]
foo2 = ocl_hist_pixelsplit.OCL_Hist_Pixelsplit(pos, 1000, data.size, devicetype="cpu", block_size=32)
boo2 = foo2.integrate(data)[2]
#plot(ref, label="ocl_csr")
#plot(boo, label="csr_fullsplit")
plot(boo2, label="ocl_csr_fullsplit")
legend()
show()
raw_input()

View File

@ -0,0 +1,80 @@
# -*- coding: utf-8 -*-
"""
Created on Fri Mar 07 09:52:51 2014
@author: ashiotis
"""
import sys, numpy, time
import utilstest
import fabio
import pyopencl as cl
import pyopencl.array  # cl.array.to_device below requires the array submodule to be imported explicitly
from pylab import *
print "#"*50
pyFAI = sys.modules["pyFAI"]
from pyFAI import splitPixelFullLUT
from pyFAI import ocl_hist_pixelsplit
#from pyFAI import splitBBoxLUT
#from pyFAI import splitBBoxCSR
#logger = utilstest.getLogger("profile")
ai = pyFAI.load("testimages/Pilatus1M.poni")
data = fabio.open("testimages/Pilatus1M.edf").data
workgroup_size = 256
bins = 1000
pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
pos = pos_in.reshape(pos_in.size/8,4,2)
pos_size = pos.size
size = pos_size/8
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
d_input = cl.array.to_device(queue, pos)
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4*4*workgroup_size)
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4*4)
with open("../openCL/ocl_hist_pixelsplit.cl", "r") as kernelFile:
kernel_src = kernelFile.read()
compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
(bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)
program = cl.Program(ctx, kernel_src).build(options=compile_options)
program.reduce1(queue, (workgroup_size*workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(pos_size), d_preresult)
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
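# Two-stage reduction (assumed kernel contract, inferred from the buffer
# sizes): reduce1 writes one float4 partial (min0, max0, min1, max1) per
# workgroup into d_preresult; reduce2 folds those partials into d_result.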
result = numpy.ndarray(4,dtype=numpy.float32)
cl.enqueue_copy(queue,result, d_result)
min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax=(min0,max0,min1,max1)
print minmax
print result
#plot(ref, label="ocl_csr")
#plot(boo, label="csr_fullsplit")
#plot(boo2, label="ocl_csr_fullsplit")
#legend()
#show()
#raw_input()
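#A sanity check against the numpy reference could be enabled once the
#component order of "result" is confirmed:
#assert numpy.allclose(result, numpy.array(minmax, dtype=numpy.float32))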