mirror of https://github.com/silx-kit/pyFAI.git
Merge remote branch 'giannis/master'
Conflicts: doc/source/pyFAI.rst openCL/ocl_azim_CSR.cl pyFAI-src/ocl_azim_csr_dis.py src/morphology.c
This commit is contained in:
commit
5c914a008e
|
@ -6,6 +6,7 @@ from __future__ import print_function, division
|
|||
|
||||
import json, sys, time, timeit, os, platform, subprocess
|
||||
import numpy
|
||||
from numpy import log2
|
||||
import fabio
|
||||
import os.path as op
|
||||
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
|
||||
|
@ -58,6 +59,8 @@ class Bench(object):
|
|||
self.repeat = repeat
|
||||
self.nbr = nbr
|
||||
self.results = {}
|
||||
self.flops = {}
|
||||
self.mem_band = {}
|
||||
self.meth = []
|
||||
self._cpu = None
|
||||
self.fig = None
|
||||
|
@ -172,8 +175,10 @@ data = fabio.open(r"%s").data
|
|||
print("Working on processor: %s" % self.get_cpu())
|
||||
label = "1D_" + self.LABELS[method]
|
||||
results = {}
|
||||
flops = {}
|
||||
mem_band = {}
|
||||
first = True
|
||||
param = "Frelon2k.poni"
|
||||
param = "Pilatus1M.poni"
|
||||
block_size_list = [1,2,4,8,16,32,64,128,256]
|
||||
for block_size in block_size_list:
|
||||
self.update_mp()
|
||||
|
@ -190,6 +195,14 @@ data = fabio.open(r"%s").data
|
|||
if check:
|
||||
if "csr" in method:
|
||||
print("csr: size= %s \t nbytes %.3f MB " % (ai._csr_integrator.data.size, ai._csr_integrator.lut_nbytes / 2 ** 20))
|
||||
|
||||
bins = ai._csr_integrator.bins
|
||||
nnz = ai._csr_integrator.nnz
|
||||
parallel_reduction = sum([2**i for i in range(1,int(log2(block_size)))])
|
||||
|
||||
FLOPs = 9*nnz + 11*parallel_reduction + 1*bins
|
||||
mem_access = (2*block_size*bins + 5*nnz + 7*bins)*4
|
||||
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
|
||||
|
@ -216,9 +229,13 @@ data = fabio.open(r"%s").data
|
|||
self.update_mp()
|
||||
if R < self.LIMIT:
|
||||
results[block_size ] = tmin
|
||||
flops[block_size ] = (FLOPs/tmin)*1e-6
|
||||
mem_band[block_size ] = (mem_access/tmin)*1e-6
|
||||
self.update_mp()
|
||||
else:
|
||||
results[block_size ] = tmin
|
||||
flops[block_size ] = FLOPs/tmin
|
||||
mem_band[block_size ] = mem_access/tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
|
@ -227,6 +244,8 @@ data = fabio.open(r"%s").data
|
|||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.flops[label] = flops
|
||||
self.mem_band[label] = mem_band
|
||||
self.update_mp()
|
||||
|
||||
|
||||
|
@ -234,12 +253,12 @@ data = fabio.open(r"%s").data
|
|||
self.update_mp()
|
||||
json.dump(self.results, open(filename, "w"))
|
||||
|
||||
def print_res(self):
|
||||
def print_res(self,summary,results):
|
||||
self.update_mp()
|
||||
print("Summary: execution time in milliseconds")
|
||||
print(summary)
|
||||
print("Size/Meth\t" + "\t".join(self.meth))
|
||||
for i in self.size:
|
||||
print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (self.results[j].get(i, 0)) for j in self.meth))
|
||||
print("%7.2f\t\t" % i + "\t\t".join("%.2f" % (results[j].get(i, 0)) for j in self.meth))
|
||||
|
||||
def init_curve(self):
|
||||
self.update_mp()
|
||||
|
@ -394,10 +413,16 @@ if __name__ == "__main__":
|
|||
bench.bench_1d_ocl_csr(True, {"devicetype":"ACC"})
|
||||
|
||||
bench.save()
|
||||
bench.print_res()
|
||||
results = bench.results
|
||||
flops = bench.flops
|
||||
mem_band = bench.mem_band
|
||||
|
||||
bench.print_res("Summary: Execution time in milliseconds", results)
|
||||
bench.print_res("Summary: MFLOPS",flops)
|
||||
bench.print_res("Summary: Memory Bandwidth in MB/s",mem_band)
|
||||
bench.update_mp()
|
||||
|
||||
bench.ax.set_ylim(1, 200)
|
||||
# plt.show()
|
||||
plt.ion()
|
||||
raw_input("Enter to quit")
|
||||
# raw_input("Enter to quit")
|
||||
|
|
|
@ -0,0 +1,752 @@
|
|||
#!/usr/bin/python
|
||||
import json, sys, time, timeit, os, platform, subprocess
|
||||
import numpy
|
||||
import fabio
|
||||
import os.path as op
|
||||
sys.path.append(op.join(op.dirname(op.dirname(op.abspath(__file__))), "test"))
|
||||
import utilstest
|
||||
pyFAI = utilstest.UtilsTest.pyFAI
|
||||
ocl = pyFAI.opencl.ocl
|
||||
from matplotlib import pyplot as plt
|
||||
plt.ion()
|
||||
|
||||
ds_list = ["Pilatus1M.poni", "halfccd.poni", "Frelon2k.poni", "Pilatus6M.poni", "Mar3450.poni", "Fairchild.poni"]
|
||||
datasets = {"Fairchild.poni":utilstest.UtilsTest.getimage("1880/Fairchild.edf"),
|
||||
"halfccd.poni":utilstest.UtilsTest.getimage("1882/halfccd.edf"),
|
||||
"Frelon2k.poni":utilstest.UtilsTest.getimage("1881/Frelon2k.edf"),
|
||||
"Pilatus6M.poni":utilstest.UtilsTest.getimage("1884/Pilatus6M.cbf"),
|
||||
"Pilatus1M.poni":utilstest.UtilsTest.getimage("1883/Pilatus1M.edf"),
|
||||
"Mar3450.poni":utilstest.UtilsTest.getimage("2201/LaB6_260210.mar3450")
|
||||
}
|
||||
b = None
|
||||
class Bench(object):
|
||||
HEADER = '\033[95m'
|
||||
OKBLUE = '\033[94m'
|
||||
OKGREEN = '\033[92m'
|
||||
WARNING = '\033[93m'
|
||||
FAIL = '\033[91m'
|
||||
ENDC = '\033[0m'
|
||||
def __init__(self, nbr=10, memprofile=False):
|
||||
self.reference_1d = {}
|
||||
self.LIMIT = 8
|
||||
self.repeat = 1
|
||||
self.nbr = nbr
|
||||
self.results = {}
|
||||
self.meth = []
|
||||
self._cpu = None
|
||||
self.fig = None
|
||||
self.ax = None
|
||||
self.starttime = time.time()
|
||||
self.plot = None
|
||||
self.plot_x = []
|
||||
self.plot_y = []
|
||||
self.do_memprofile = memprofile
|
||||
self.fig_mp = None
|
||||
self.ax_mp = None
|
||||
self.plot_mp = None
|
||||
self.memory_profile = ([], [])
|
||||
|
||||
|
||||
def get_cpu(self):
|
||||
if self._cpu is None:
|
||||
if os.name == "nt":
|
||||
self._cpu = platform.processor()
|
||||
elif os.path.exists("/proc/cpuinfo"):
|
||||
self._cpu = [i.split(": ", 1)[1] for i in open("/proc/cpuinfo") if i.startswith("model name")][0].strip()
|
||||
elif os.path.exists("/usr/sbin/sysctl"):
|
||||
proc = subprocess.Popen(["sysctl", "-n", "machdep.cpu.brand_string"], stdout=subprocess.PIPE)
|
||||
proc.wait()
|
||||
self._cpu = proc.stdout.read().strip()
|
||||
old = self._cpu
|
||||
self._cpu = old.replace(" ", " ")
|
||||
while old != self._cpu:
|
||||
old = self._cpu
|
||||
self._cpu = old.replace(" ", " ")
|
||||
return self._cpu
|
||||
|
||||
def get_gpu(self, devicetype="gpu", useFp64=False, platformid=None, deviceid=None):
|
||||
if ocl is None:
|
||||
return "NoGPU"
|
||||
ctx = ocl.create_context(devicetype, useFp64, platformid, deviceid)
|
||||
return ctx.devices[0].name
|
||||
|
||||
def get_mem(self):
|
||||
"""
|
||||
Returns the occupied memory for memory-leak hunting in MByte
|
||||
"""
|
||||
pid = os.getpid()
|
||||
if os.path.exists("/proc/%i/status" % pid):
|
||||
for l in open("/proc/%i/status" % pid):
|
||||
if l.startswith("VmRSS"):
|
||||
mem = int(l.split(":", 1)[1].split()[0]) / 1024.
|
||||
else:
|
||||
mem = 0
|
||||
return mem
|
||||
|
||||
|
||||
def print_init(self, t):
|
||||
print(" * Initialization time: %.1f ms" % (1000.0 * t))
|
||||
self.update_mp()
|
||||
|
||||
|
||||
def print_exec(self, t):
|
||||
print(" * Execution time rep : %.1f ms" % (1000.0 * t))
|
||||
self.update_mp()
|
||||
|
||||
|
||||
def print_sep(self):
|
||||
print("*"*80)
|
||||
self.update_mp()
|
||||
|
||||
def get_ref(self, param):
|
||||
if param not in self.reference_1d:
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
N = min(data.shape)
|
||||
res = ai.xrpd(data, N)
|
||||
self.reference_1d[param] = res
|
||||
del ai, data
|
||||
return self.reference_1d[param]
|
||||
|
||||
def bench_cpu1d(self):
|
||||
self.update_mp()
|
||||
print("Working on processor: %s" % self.get_cpu())
|
||||
results = {}
|
||||
label = "1D_CPU_serial_full_split"
|
||||
first = True
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
ref = self.get_ref(param)
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = min(data.shape)
|
||||
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
res = ai.integrate1d(data, 1000, method="splitpixelfull", unit="2th_deg", correctSolidAngle=False)
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
self.update_mp()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
N=min(data.shape)
|
||||
out=ai.xrpd(data,N)""" % (param, fn)
|
||||
t = timeit.Timer("ai.integrate1d(data, 1000, method='splitpixelfull', unit='2th_deg', correctSolidAngle=False)", setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size ] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
def bench_cpu1d_lut(self):
|
||||
self.update_mp()
|
||||
print("Working on processor: %s" % self.get_cpu())
|
||||
label = "1D_CPU_parallel_OpenMP"
|
||||
results = {}
|
||||
self.new_curve(results, label)
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
ref = self.get_ref(param)
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = min(data.shape)
|
||||
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
res = ai.xrpd_LUT(data, N)
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
print "lut.shape=", ai._lut_integrator.lut.shape, "lut.nbytes (MB)", ai._lut_integrator.size * 8 / 1e6
|
||||
self.update_mp()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
N=min(data.shape)
|
||||
out=ai.xrpd_LUT(data,N)""" % (param, fn)
|
||||
t = timeit.Timer("ai.xrpd_LUT(data,N,safe=False)", setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.print_exec(tmin)
|
||||
R = utilstest.Rwp(res, ref)
|
||||
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
|
||||
self.update_mp()
|
||||
if R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size ] = tmin
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
def bench_cpu1d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
|
||||
self.update_mp()
|
||||
if (ocl is None):
|
||||
print("No pyopencl")
|
||||
return
|
||||
if (platformid is None) or (deviceid is None):
|
||||
platdev = ocl.select_device(devicetype)
|
||||
if not platdev:
|
||||
print("No such OpenCL device: skipping benchmark")
|
||||
return
|
||||
platformid, deviceid = platdev
|
||||
print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
|
||||
label = "1D_%s_parallel_OpenCL" % devicetype
|
||||
first = True
|
||||
results = {}
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
ref = self.get_ref(param)
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = min(data.shape)
|
||||
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
try:
|
||||
res = ai.xrpd_LUT_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid)
|
||||
except MemoryError as error:
|
||||
print(error)
|
||||
break
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
self.update_mp()
|
||||
ai.reset()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
N=min(data.shape)
|
||||
out=ai.xrpd_LUT_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s)""" % (param, fn, devicetype, platformid, deviceid)
|
||||
t = timeit.Timer("ai.xrpd_LUT_OCL(data,N,safe=False)", setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
R = utilstest.Rwp(res, ref)
|
||||
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
|
||||
if R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
|
||||
def bench_cpu1d_csr_ocl(self, devicetype="GPU", platformid=None, deviceid=None, padded=False, block_size=32):
|
||||
self.update_mp()
|
||||
if (ocl is None):
|
||||
print("No pyopencl")
|
||||
return
|
||||
if (platformid is None) or (deviceid is None):
|
||||
platdev = ocl.select_device(devicetype)
|
||||
if not platdev:
|
||||
print("No such OpenCL device: skipping benchmark")
|
||||
return
|
||||
platformid, deviceid = platdev
|
||||
print("Working on device: %s platform: %s device: %s padding: %s block_size= %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid], padded, block_size))
|
||||
label = "1D_%s_parallel_OpenCL, padded=%s, block_size=%s" % (devicetype, padded, block_size)
|
||||
first = True
|
||||
results = {}
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
ref = self.get_ref(param)
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = min(data.shape)
|
||||
print("1D integration of %s %.1f Mpixel -> %i bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
try:
|
||||
res = ai.xrpd_CSR_OCL(data, N, devicetype=devicetype, platformid=platformid, deviceid=deviceid, padded=padded, block_size=block_size)
|
||||
except MemoryError as error:
|
||||
print(error)
|
||||
break
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
self.update_mp()
|
||||
ai.reset()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
N=min(data.shape)
|
||||
out=ai.xrpd_CSR_OCL(data,N,devicetype=r"%s",platformid=%s,deviceid=%s,padded=%s,block_size=%s)""" % (param, fn, devicetype, platformid, deviceid, padded, block_size)
|
||||
t = timeit.Timer("ai.xrpd_CSR_OCL(data,N,safe=False,padded=%s,block_size=%s)" % (padded, block_size), setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
R = utilstest.Rwp(res, ref)
|
||||
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
|
||||
if R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
|
||||
def bench_cpu2d(self):
|
||||
self.update_mp()
|
||||
print("Working on processor: %s" % self.get_cpu())
|
||||
results = {}
|
||||
label = "2D_CPU_serial"
|
||||
first = True
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = (500, 360)
|
||||
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
_ = ai.xrpd2(data, N[0], N[1])
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
self.update_mp()
|
||||
ai.reset()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
out=ai.xrpd2(data,%s,%s)""" % (param, fn, N[0], N[1])
|
||||
t = timeit.Timer("ai.xrpd2(data,%s,%s)" % N, setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
print("")
|
||||
if 1: # R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
def bench_cpu2d_lut(self):
|
||||
print("Working on processor: %s" % self.get_cpu())
|
||||
label = "2D_CPU_parallel_OpenMP"
|
||||
first = True
|
||||
results = {}
|
||||
for param in ds_list:
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = (500, 360)
|
||||
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
_ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut")
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
|
||||
self.update_mp()
|
||||
ai.reset()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut")""" % (param, fn, N[0], N[1])
|
||||
t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut')" % N, setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
print("")
|
||||
if 1: # R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
def bench_cpu2d_lut_ocl(self, devicetype="ALL", platformid=None, deviceid=None):
|
||||
self.update_mp()
|
||||
if (ocl is None):
|
||||
print("No pyopencl")
|
||||
return
|
||||
if (platformid is None) or (deviceid is None):
|
||||
platdev = ocl.select_device(devicetype)
|
||||
if not platdev:
|
||||
print("No such OpenCL device: skipping benchmark")
|
||||
return
|
||||
platformid, deviceid = platdev
|
||||
print("Working on device: %s platform: %s device: %s" % (devicetype, ocl.platforms[platformid], ocl.platforms[platformid].devices[deviceid]))
|
||||
results = {}
|
||||
label = "2D_%s_parallel_OpenCL" % devicetype.upper()
|
||||
first = True
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = (500, 360)
|
||||
print("2D integration of %s %.1f Mpixel -> %s bins" % (op.basename(fn), size / 1e6, N))
|
||||
t0 = time.time()
|
||||
try:
|
||||
_ = ai.integrate2d(data, N[0], N[1], unit="2th_deg", method="lut_ocl_%i,%i" % (platformid, deviceid))
|
||||
except MemoryError as error:
|
||||
print(error)
|
||||
break
|
||||
t1 = time.time()
|
||||
self.print_init(t1 - t0)
|
||||
print("Size of the LUT: %.3fMByte" % (ai._lut_integrator.lut.nbytes / 1e6))
|
||||
self.update_mp()
|
||||
ai.reset()
|
||||
del ai, data
|
||||
self.update_mp()
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
out=ai.integrate2d(data,%s,%s,unit="2th_deg", method="lut_ocl_%i,%i")""" % (param, fn, N[0], N[1], platformid, deviceid)
|
||||
t = timeit.Timer("out=ai.integrate2d(data,%s,%s,unit='2th_deg', method='lut_ocl')" % N, setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
self.update_mp()
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
print("")
|
||||
if 1: # R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
|
||||
def bench_gpu1d(self, devicetype="gpu", useFp64=True, platformid=None, deviceid=None):
|
||||
self.update_mp()
|
||||
print("Working on %s, in " % devicetype + ("64 bits mode" if useFp64 else"32 bits mode") + "(%s.%s)" % (platformid, deviceid))
|
||||
if ocl is None or not ocl.select_device(devicetype):
|
||||
print("No pyopencl or no such device: skipping benchmark")
|
||||
return
|
||||
results = {}
|
||||
label = "Forward_OpenCL_%s_%s_bits" % (devicetype , ("64" if useFp64 else"32"))
|
||||
first = True
|
||||
for param in ds_list:
|
||||
self.update_mp()
|
||||
fn = datasets[param]
|
||||
ai = pyFAI.load(param)
|
||||
data = fabio.open(fn).data
|
||||
size = data.size
|
||||
N = min(data.shape)
|
||||
print("1D integration of %s %.1f Mpixel -> %i bins (%s)" % (op.basename(fn), size / 1e6, N, ("64 bits mode" if useFp64 else"32 bits mode")))
|
||||
|
||||
try:
|
||||
t0 = time.time()
|
||||
res = ai.xrpd_OpenCL(data, N, devicetype=devicetype, useFp64=useFp64, platformid=platformid, deviceid=deviceid)
|
||||
t1 = time.time()
|
||||
except Exception as error:
|
||||
print("Failed to find an OpenCL GPU (useFp64:%s) %s" % (useFp64, error))
|
||||
continue
|
||||
self.print_init(t1 - t0)
|
||||
self.update_mp()
|
||||
ref = ai.xrpd(data, N)
|
||||
R = utilstest.Rwp(res, ref)
|
||||
print("%sResults are bad with R=%.3f%s" % (self.WARNING, R, self.ENDC) if R > self.LIMIT else"%sResults are good with R=%.3f%s" % (self.OKGREEN, R, self.ENDC))
|
||||
setup = """
|
||||
import pyFAI,fabio
|
||||
ai=pyFAI.load(r"%s")
|
||||
data = fabio.open(r"%s").data
|
||||
N=min(data.shape)
|
||||
out=ai.xrpd_OpenCL(data,N, devicetype=r"%s", useFp64=%s, platformid=%s, deviceid=%s)""" % (param, fn, devicetype, useFp64, platformid, deviceid)
|
||||
t = timeit.Timer("ai.xrpd_OpenCL(data,N,safe=False)", setup)
|
||||
tmin = min([i / self.nbr for i in t.repeat(repeat=self.repeat, number=self.nbr)])
|
||||
del t
|
||||
self.update_mp()
|
||||
self.print_exec(tmin)
|
||||
print("")
|
||||
if R < self.LIMIT:
|
||||
size /= 1e6
|
||||
tmin *= 1000.0
|
||||
results[size] = tmin
|
||||
if first:
|
||||
self.new_curve(results, label)
|
||||
first = False
|
||||
else:
|
||||
self.new_point(size, tmin)
|
||||
self.update_mp()
|
||||
self.print_sep()
|
||||
self.meth.append(label)
|
||||
self.results[label] = results
|
||||
self.update_mp()
|
||||
|
||||
def save(self, filename="benchmark.json"):
|
||||
self.update_mp()
|
||||
json.dump(self.results, open(filename, "w"))
|
||||
|
||||
def print_res(self):
|
||||
self.update_mp()
|
||||
print("Summary: execution time in milliseconds")
|
||||
print "Size/Meth\t" + "\t".join(b.meth)
|
||||
for i in self.size:
|
||||
print "%7.2f\t\t" % i + "\t\t".join("%.2f" % (b.results[j].get(i, 0)) for j in b.meth)
|
||||
|
||||
def init_curve(self):
|
||||
self.update_mp()
|
||||
if self.fig:
|
||||
print("Already initialized")
|
||||
return
|
||||
if "DISPLAY" in os.environ:
|
||||
plt.ion()
|
||||
self.fig = plt.figure()
|
||||
self.ax = self.fig.add_subplot(1, 1, 1)
|
||||
self.ax.set_autoscale_on(False)
|
||||
self.ax.set_xlabel("Image size in Mega-Pixels")
|
||||
self.ax.set_ylabel("Frames processed per second")
|
||||
self.ax.set_yscale("log", basey=2)
|
||||
t = [1, 2, 5, 10, 20, 50, 100, 200, 400, 500]
|
||||
self.ax.set_yticks([float(i) for i in t])
|
||||
self.ax.set_yticklabels([str(i)for i in t])
|
||||
self.ax.set_xlim(0.5, 20)
|
||||
self.ax.set_ylim(0.5, 500)
|
||||
self.ax.set_title(self.get_cpu() + " / " + self.get_gpu())
|
||||
|
||||
if self.fig.canvas:
|
||||
self.fig.canvas.draw()
|
||||
# plt.show()
|
||||
|
||||
def new_curve(self, results, label):
|
||||
self.update_mp()
|
||||
if not self.fig:
|
||||
return
|
||||
self.plot_x = list(results.keys())
|
||||
self.plot_x.sort()
|
||||
self.plot_y = [1000.0 / results[i] for i in self.plot_x]
|
||||
self.plot = self.ax.plot(self.plot_x, self.plot_y, "o-", label=label)[0]
|
||||
self.ax.legend()
|
||||
if self.fig.canvas:
|
||||
self.fig.canvas.draw()
|
||||
|
||||
def new_point(self, size, exec_time):
|
||||
"""
|
||||
Add new point to current curve
|
||||
@param size: of the system
|
||||
@parm exec_time: execution time in ms
|
||||
"""
|
||||
self.update_mp()
|
||||
if not self.plot:
|
||||
return
|
||||
|
||||
self.plot_x.append(size)
|
||||
self.plot_y.append(1000.0 / exec_time)
|
||||
self.plot.set_data(self.plot_x, self.plot_y)
|
||||
if self.fig.canvas:
|
||||
self.fig.canvas.draw()
|
||||
|
||||
def display_all(self):
|
||||
if not self.fig:
|
||||
return
|
||||
for k in self.meth:
|
||||
self.new_curve(self.results[k], k)
|
||||
self.ax.legend()
|
||||
self.fig.savefig("benchmark.png")
|
||||
self.fig.show()
|
||||
# plt.ion()
|
||||
|
||||
|
||||
def update_mp(self):
|
||||
if not self.do_memprofile:
|
||||
return
|
||||
self.memory_profile[0].append(time.time() - self.starttime)
|
||||
self.memory_profile[1].append(self.get_mem())
|
||||
if not self.fig_mp:
|
||||
self.fig_mp = plt.figure()
|
||||
self.ax_mp = self.fig_mp.add_subplot(1, 1, 1)
|
||||
self.ax_mp.set_autoscale_on(False)
|
||||
self.ax_mp.set_xlabel("Run time (s)")
|
||||
self.ax_mp.set_xlim(0, 100)
|
||||
self.ax_mp.set_ylim(0, 2 ** 10)
|
||||
self.ax_mp.set_ylabel("Memory occupancy (MB)")
|
||||
self.ax_mp.set_title("Memory leak hunter")
|
||||
self.plot_mp = self.ax_mp.plot(*self.memory_profile)[0]
|
||||
else:
|
||||
self.plot_mp.set_data(*self.memory_profile)
|
||||
tmax = self.memory_profile[0][-1]
|
||||
mmax = max(self.memory_profile[1])
|
||||
if tmax > self.ax_mp.get_xlim()[-1]:
|
||||
self.ax_mp.set_xlim(0, tmax)
|
||||
if mmax > self.ax_mp.get_ylim()[-1]:
|
||||
self.ax_mp.set_ylim(0, mmax)
|
||||
|
||||
if self.fig_mp.canvas:
|
||||
self.fig_mp.canvas.draw()
|
||||
|
||||
def get_size(self):
|
||||
if len(self.meth) == 0:
|
||||
return []
|
||||
size = list(self.results[self.meth[0]].keys())
|
||||
for i in self.meth[1:]:
|
||||
s = list(self.results[i].keys())
|
||||
if len(s) > len(size):
|
||||
size = s
|
||||
size.sort()
|
||||
return size
|
||||
size = property(get_size)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
try:
|
||||
from argparse import ArgumentParser
|
||||
except:
|
||||
from pyFAI.argparse import ArgumentParser
|
||||
description = """Benchmark for Azimuthal integration
|
||||
"""
|
||||
epilog = """ """
|
||||
usage = """benchmark [options] """
|
||||
version = "pyFAI benchmark version " + pyFAI.version
|
||||
parser = ArgumentParser(usage=usage, description=description, epilog=epilog)
|
||||
parser.add_argument("-v", action='version', version=version)
|
||||
parser.add_argument("-d", "--debug",
|
||||
action="store_true", dest="debug", default=False,
|
||||
help="switch to verbose/debug mode")
|
||||
parser.add_argument("-c", "--cpu",
|
||||
action="store_true", dest="opencl_cpu", default=False,
|
||||
help="perform benchmark using OpenCL on the CPU")
|
||||
parser.add_argument("-g", "--gpu",
|
||||
action="store_true", dest="opencl_gpu", default=False,
|
||||
help="perform benchmark using OpenCL on the GPU")
|
||||
parser.add_argument("-a", "--acc",
|
||||
action="store_true", dest="opencl_acc", default=False,
|
||||
help="perform benchmark using OpenCL on the Accelerator (like XeonPhi/MIC)")
|
||||
parser.add_argument("-s", "--small",
|
||||
action="store_true", dest="small", default=False,
|
||||
help="Limit the size of the dataset to 6 Mpixel images (for computer with limited memory)")
|
||||
parser.add_argument("-n", "--number",
|
||||
dest="number", default=10, type=int,
|
||||
help="Number of repetition of the test, by default 10")
|
||||
parser.add_argument("-2d", "--2dimentions",
|
||||
action="store_true", dest="twodim", default=False,
|
||||
help="Benchmark also algorithm for 2D-regrouping")
|
||||
parser.add_argument("-m", "--memprof",
|
||||
action="store_true", dest="memprof", default=False,
|
||||
help="Perfrom memory profiling (Linux only)")
|
||||
parser.add_argument("-f", "--fullsplit",
|
||||
action="store_true", dest="split_cpu", default=False,
|
||||
help="perform benchmark using full pixel splitting on CPU")
|
||||
|
||||
|
||||
options = parser.parse_args()
|
||||
if options.small:
|
||||
ds_list = ds_list[:4]
|
||||
if options.debug:
|
||||
pyFAI.logger.setLevel(logging.DEBUG)
|
||||
print("Averaging over %i repetitions (best of 3)." % options.number)
|
||||
b = Bench(options.number, options.memprof)
|
||||
b.init_curve()
|
||||
b.bench_cpu1d()
|
||||
b.bench_cpu1d_lut()
|
||||
if options.opencl_cpu:
|
||||
b.bench_cpu1d_lut_ocl("CPU")
|
||||
if options.opencl_gpu:
|
||||
b.bench_cpu1d_lut_ocl("GPU")
|
||||
if options.opencl_acc:
|
||||
b.bench_cpu1d_lut_ocl("ACC")
|
||||
if options.split_cpu:
|
||||
b.bench_cpu1d
|
||||
|
||||
# b.bench_cpu1d_ocl_lut("CPU")
|
||||
# b.bench_gpu1d("gpu", True)
|
||||
# b.bench_gpu1d("gpu", False)
|
||||
# b.bench_gpu1d("cpu", True)
|
||||
# b.bench_gpu1d("cpu", False)
|
||||
if options.twodim:
|
||||
b.bench_cpu2d()
|
||||
b.bench_cpu2d_lut()
|
||||
if options.opencl_cpu:
|
||||
b.bench_cpu2d_lut_ocl("CPU")
|
||||
if options.opencl_gpu:
|
||||
b.bench_cpu2d_lut_ocl("GPU")
|
||||
if options.opencl_acc:
|
||||
b.bench_cpu2d_lut_ocl("ACC")
|
||||
|
||||
# b.bench_cpu2d_lut()
|
||||
# b.bench_cpu2d_lut_ocl()
|
||||
b.save()
|
||||
b.print_res()
|
||||
# b.display_all()
|
||||
b.update_mp()
|
||||
|
||||
b.ax.set_ylim(1, 200)
|
||||
# plt.show()
|
||||
plt.ion()
|
||||
raw_input("Enter to quit")
|
|
@ -140,12 +140,19 @@ PyFAI solves this problem by pixel
|
|||
splitting : in addition to the pixel position, its
|
||||
spatial extension is calculated and each pixel is then split and
|
||||
distributed over the corresponding bins, the intensity being considered
|
||||
<<<<<<< HEAD
|
||||
as homogeneous within a pixel and spread accordingly.
|
||||
The drawback of this is the correlation introduced between two adjacent bins.
|
||||
To simplify
|
||||
calculations, this was initially done by abstracting the pixel shape
|
||||
with a bounding box that circumscribes the pixel. In an effort to better
|
||||
the quality of the results this method was dropped in favo2r of a full
|
||||
=======
|
||||
as homogeneous within a pixel and spread accordingly. To simplify
|
||||
calculations, this was initially done by abstracting the pixel shape
|
||||
with a bounding box that circumscribes the pixel. In an effort to better
|
||||
the quality of the results this method was dropped in favour of a full
|
||||
>>>>>>> giannis/master
|
||||
pixel splitting scheme that actually uses the actual pixel geometry
|
||||
for its calculations.
|
||||
|
||||
|
@ -194,6 +201,7 @@ but can still be too large to fit on an entry-level graphics card.
|
|||
By making this change we switched from a “linear read / random write” forward algorithm
|
||||
to a “random read / linear write” backward algorithm which is more suitable for parallelization.
|
||||
As a farther improvement on the algorithm, the use of compressed sparse row (CSR) format was
|
||||
<<<<<<< HEAD
|
||||
introduced, to store the LUT data.
|
||||
This algorithm was implemented both in [Cython]_-OpenMP and OpenCL.
|
||||
The CSR approach has a double benefit:
|
||||
|
@ -206,6 +214,14 @@ This makes it very well suited to run on GPUs and accelerators
|
|||
where hundreds to thousands of simultaneous threads are available.
|
||||
|
||||
When using OpenCL for the GPU we used a compensated (or Kahan_summation_), to reduce
|
||||
=======
|
||||
introduced, to store the LUT data. This reduced its size even more, giving this way the
|
||||
opportunity of working with bigger images on the same hardware, when memory space is of concern,
|
||||
as well as making the code better suited to be run on GPUs or accelerators, as transferring
|
||||
data to the device is one of the most important bottlenecks of such computations.
|
||||
This algorithm was implemented in Cython-OpenMP and OpenCL.
|
||||
When using OpenCL for the GPU we used a compensated, or Kahan summation to reduce
|
||||
>>>>>>> giannis/master
|
||||
the error accumulation in the histogram summation (at the cost of more operations to be done).
|
||||
This allows accurate results to be obtained on cheap hardware that performs calculations
|
||||
in single precision floating-point arithmetic (32 bits) which are available on consumer
|
||||
|
@ -214,12 +230,17 @@ Double precision operations are currently limited to high price and performance
|
|||
The additional cost of Kahan summation, 4x more arithmetic operations, is hidden by smaller data types,
|
||||
the higher number of single precision units and that the GPU is usually limited by the memory bandwidth anyway.
|
||||
|
||||
<<<<<<< HEAD
|
||||
.. _Kahan_summation: http://en.wikipedia.org/wiki/Kahan_summation_algorithm
|
||||
|
||||
The performances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s
|
||||
on recent multi-core computer with a mid-range graphics card.
|
||||
On multi-socket server featuring high-end GPUs like Tesla cards, the performances are similar with
|
||||
the additional capability to work on multiple detector simultaneously.
|
||||
=======
|
||||
The perfomances of the parallel implementation based on a LUT, stored in CSR format, can reach 750 MPix/s
|
||||
on recent multi-socket, multi-core computer or on high-end GPUs like Tesla cards.
|
||||
>>>>>>> giannis/master
|
||||
|
||||
.. figure:: img/benchmark.png
|
||||
:align: center
|
||||
|
|
|
@ -174,25 +174,21 @@ corrections( __global float *image,
|
|||
|
||||
|
||||
/**
|
||||
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT
|
||||
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
|
||||
*
|
||||
* An image instensity value is spread across the bins according to the positions stored in the LUT.
|
||||
* The lut is an 2D-array of index (contains the positions of the pixel in the input array)
|
||||
* and coeficients (fraction of pixel going to the bin)
|
||||
* The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
|
||||
* Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
|
||||
*
|
||||
* This implementation is especially efficient on CPU where each core reads adjacents memory.
|
||||
* the use of local pointer can help on the CPU.
|
||||
*
|
||||
* @param weights Float pointer to global memory storing the input image.
|
||||
* @param lut Pointer to an 2D-array of (unsigned integers,float) containing the index of input pixels and the fraction of pixel going to the bin
|
||||
* @param coefs Float pointer to global memory holding the coeficient part of the LUT
|
||||
* @param row_ind Integer pointer to global memory holding the corresponding index of the coeficient
|
||||
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
|
||||
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
|
||||
* @param dummy Float: value for bad pixels
|
||||
* @param delta_dummy Float: precision for bad pixel value
|
||||
* @param do_dark Bool/int: shall dark-current correction be applied ?
|
||||
* @param dark Float pointer to global memory storing the dark image.
|
||||
* @param do_flat Bool/int: shall flat-field correction be applied ? (could contain polarization corrections)
|
||||
* @param flat Float pointer to global memory storing the flat image.
|
||||
* @param outData Float pointer to the output 1D array with the weighted histogram
|
||||
* @param outCount Float pointer to the output 1D array with the unweighted histogram
|
||||
* @param outMerged Float pointer to the output 1D array with the diffractogram
|
||||
|
@ -208,7 +204,7 @@ csr_integrate( const __global float *weights,
|
|||
__global float *outData,
|
||||
__global float *outCount,
|
||||
__global float *outMerge
|
||||
)
|
||||
)
|
||||
{
|
||||
int thread_id_loc = get_local_id(0);
|
||||
int bin_num = get_group_id(0); // each workgroup of size=warp is assinged to 1 bin
|
||||
|
@ -260,10 +256,10 @@ csr_integrate( const __global float *weights,
|
|||
__local float super_sum_data_correction[WORKGROUP_SIZE];
|
||||
__local float super_sum_count[WORKGROUP_SIZE];
|
||||
__local float super_sum_count_correction[WORKGROUP_SIZE];
|
||||
|
||||
|
||||
float super_sum_temp = 0.0f;
|
||||
int index, active_threads = WORKGROUP_SIZE;
|
||||
|
||||
|
||||
if (bin_size < WORKGROUP_SIZE)
|
||||
{
|
||||
if (thread_id_loc < bin_size)
|
||||
|
@ -292,6 +288,106 @@ csr_integrate( const __global float *weights,
|
|||
cd = 0;
|
||||
cc = 0;
|
||||
|
||||
if (thread_id_loc == 0)
|
||||
{
|
||||
outData[bin_num] = super_sum_data[0];
|
||||
outCount[bin_num] = super_sum_count[0];
|
||||
if (outCount[bin_num] > epsilon)
|
||||
outMerge[bin_num] = outData[bin_num] / outCount[bin_num];
|
||||
else
|
||||
outMerge[bin_num] = dummy;
|
||||
}
|
||||
};//end kernel
|
||||
|
||||
/**
|
||||
* \brief Performs 1d azimuthal integration with full pixel splitting based on a LUT in CSR form
|
||||
*
|
||||
* An image instensity value is spread across the bins according to the positions stored in the LUT.
|
||||
* The lut is represented by a set of 3 arrays (coefs, row_ind, col_ptr)
|
||||
* Values of 0 in the mask are processed and values of 1 ignored as per PyFAI
|
||||
*
|
||||
* This kernel is ment to be ran with padded data (the span of each bin must be a multiple of the workgroup size)
|
||||
*
|
||||
* @param weights Float pointer to global memory storing the input image.
|
||||
* @param coefs Float pointer to global memory holding the coeficient part of the LUT
|
||||
* @param row_ind Integer pointer to global memory holding the corresponding index of the coeficient
|
||||
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
|
||||
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
|
||||
* @param dummy Float: value for bad pixels
|
||||
* @param outData Float pointer to the output 1D array with the weighted histogram
|
||||
* @param outCount Float pointer to the output 1D array with the unweighted histogram
|
||||
* @param outMerged Float pointer to the output 1D array with the diffractogram
|
||||
*
|
||||
*/
|
||||
__kernel void
|
||||
csr_integrate_padded( const __global float *weights,
|
||||
const __global float *coefs,
|
||||
const __global int *row_ind,
|
||||
const __global int *col_ptr,
|
||||
const int do_dummy,
|
||||
const float dummy,
|
||||
__global float *outData,
|
||||
__global float *outCount,
|
||||
__global float *outMerge
|
||||
)
|
||||
{
|
||||
int thread_id_loc = get_local_id(0);
|
||||
int bin_num = get_group_id(0); // each workgroup of size=warp is assinged to 1 bin
|
||||
int2 bin_bounds;
|
||||
// bin_bounds = (int2) *(col_ptr+bin_num); // cool stuff!
|
||||
bin_bounds.x = col_ptr[bin_num];
|
||||
bin_bounds.y = col_ptr[bin_num+1];
|
||||
float sum_data = 0.0f;
|
||||
float sum_count = 0.0f;
|
||||
float cd = 0.0f;
|
||||
float cc = 0.0f;
|
||||
float t, y;
|
||||
const float epsilon = 1e-10f;
|
||||
float coef, data;
|
||||
int idx, k, j;
|
||||
|
||||
for (j=bin_bounds.x;j<bin_bounds.y;j+=WORKGROUP_SIZE)
|
||||
{
|
||||
k = j+thread_id_loc;
|
||||
coef = coefs[k];
|
||||
idx = row_ind[k];
|
||||
data = weights[idx];
|
||||
if( (!do_dummy) || (data!=dummy) )
|
||||
{
|
||||
//sum_data += coef * data;
|
||||
//sum_count += coef;
|
||||
//Kahan summation allows single precision arithmetics with error compensation
|
||||
//http://en.wikipedia.org/wiki/Kahan_summation_algorithm
|
||||
y = coef*data - cd;
|
||||
t = sum_data + y;
|
||||
cd = (t - sum_data) - y;
|
||||
sum_data = t;
|
||||
y = coef - cc;
|
||||
t = sum_count + y;
|
||||
cc = (t - sum_count) - y;
|
||||
sum_count = t;
|
||||
};//end if dummy
|
||||
};//for j
|
||||
/*
|
||||
* parallel reduction
|
||||
*/
|
||||
|
||||
// REMEMBER TO PASS WORKGROUP_SIZE AS A CPP DEF
|
||||
__local float super_sum_data[WORKGROUP_SIZE];
|
||||
__local float super_sum_data_correction[WORKGROUP_SIZE];
|
||||
__local float super_sum_count[WORKGROUP_SIZE];
|
||||
__local float super_sum_count_correction[WORKGROUP_SIZE];
|
||||
super_sum_data[thread_id_loc] = sum_data;
|
||||
super_sum_count[thread_id_loc] = sum_count;
|
||||
super_sum_data_correction[thread_id_loc] = cd;
|
||||
super_sum_count_correction[thread_id_loc] = cc;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
float super_sum_temp = 0.0f;
|
||||
int index, active_threads = WORKGROUP_SIZE;
|
||||
cd = 0;
|
||||
cc = 0;
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
|
@ -304,7 +400,7 @@ csr_integrate( const __global float *weights,
|
|||
t = super_sum_temp + y;
|
||||
super_sum_data_correction[thread_id_loc] = (t - super_sum_temp) - y;
|
||||
super_sum_data[thread_id_loc] = t;
|
||||
|
||||
|
||||
cc = super_sum_count_correction[thread_id_loc] + super_sum_count_correction[index];
|
||||
super_sum_temp = super_sum_count[thread_id_loc];
|
||||
y = super_sum_count[index] - cc;
|
||||
|
@ -326,6 +422,20 @@ csr_integrate( const __global float *weights,
|
|||
}
|
||||
};//end kernel
|
||||
|
||||
=======
|
||||
|
||||
Correct an image based on the look-up table calculated ...
|
||||
/**
|
||||
* \brief Performs distortion corrections on an image using a LUT in CSR form
|
||||
*
|
||||
* @param weights Float pointer to global memory storing the input image.
|
||||
* @param coefs Float pointer to global memory holding the coeficient part of the LUT
|
||||
* @param row_ind Integer pointer to global memory holding the corresponding index of the coeficient
|
||||
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
|
||||
* @param outData Float pointer to the output 1D array with the corrected image
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel void
|
||||
csr_integrate_dis( const __global float *weights,
|
||||
const __global float *coefs,
|
||||
|
@ -381,9 +491,9 @@ csr_integrate_dis( const __global float *weights,
|
|||
__local float super_sum_data_correction[WORKGROUP_SIZE];
|
||||
float super_sum_temp = 0.0f;
|
||||
int index, active_threads = WORKGROUP_SIZE;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if (bin_size < WORKGROUP_SIZE)
|
||||
{
|
||||
if (thread_id_loc < bin_size)
|
||||
|
@ -403,9 +513,9 @@ csr_integrate_dis( const __global float *weights,
|
|||
super_sum_data[thread_id_loc] = sum_data;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
|
||||
cd = 0;
|
||||
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
|
|
|
@ -0,0 +1,405 @@
|
|||
//#pragma OPENCL EXTENSION cl_amd_printf : enable
|
||||
//#pragma OPENCL EXTENSION cl_intel_printf : enable
|
||||
|
||||
|
||||
float area4(float a0, float a1, float b0, float b1, float c0, float c1, float d0, float d1)
|
||||
{
|
||||
return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)));
|
||||
}
|
||||
|
||||
|
||||
float integrate_line( float A0, float B0, float2 AB)
|
||||
{
|
||||
return (A0==B0) ? 0.0 : AB.s0*(B0*B0 - A0*A0)*0.5 + AB.s1*(B0-A0);
|
||||
}
|
||||
|
||||
|
||||
float getBinNr(float x0, float delta, float pos0_min)
|
||||
{
|
||||
return (x0 - pos0_min) / delta;
|
||||
}
|
||||
|
||||
|
||||
float min4f(float a, float b, float c, float d)
|
||||
{
|
||||
return fmin(fmin(a,b),fmin(c,d));
|
||||
}
|
||||
|
||||
|
||||
float max4f(float a, float b, float c, float d)
|
||||
{
|
||||
return fmax(fmax(a,b),fmax(c,d));
|
||||
}
|
||||
|
||||
|
||||
void AtomicAdd(volatile __global float *source, const float operand)
|
||||
{
|
||||
union {
|
||||
unsigned int intVal;
|
||||
float floatVal;
|
||||
} newVal;
|
||||
union {
|
||||
unsigned int intVal;
|
||||
float floatVal;
|
||||
} prevVal;
|
||||
do {
|
||||
prevVal.floatVal = *source;
|
||||
newVal.floatVal = prevVal.floatVal + operand;
|
||||
} while (atomic_cmpxchg((volatile __global unsigned int *)source, prevVal.intVal, newVal.intVal) != prevVal.intVal);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief cast values of an array of uint16 into a float output array.
|
||||
*
|
||||
* @param array_u16: Pointer to global memory with the input data as unsigned16 array
|
||||
* @param array_float: Pointer to global memory with the output data as float array
|
||||
*/
|
||||
__kernel void
|
||||
u16_to_float(__global unsigned short *array_u16,
|
||||
__global float *array_float
|
||||
)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
//Global memory guard for padding
|
||||
if(i < NIMAGE)
|
||||
array_float[i]=(float)array_u16[i];
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief convert values of an array of int32 into a float output array.
|
||||
*
|
||||
* @param array_int: Pointer to global memory with the data in int
|
||||
* @param array_float: Pointer to global memory with the data in float
|
||||
*/
|
||||
__kernel void
|
||||
s32_to_float( __global int *array_int,
|
||||
__global float *array_float
|
||||
)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
//Global memory guard for padding
|
||||
if(i < NIMAGE)
|
||||
array_float[i] = (float)(array_int[i]);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* \brief Sets the values of 3 float output arrays to zero.
|
||||
*
|
||||
* Gridsize = size of arrays + padding.
|
||||
*
|
||||
* @param array0: float Pointer to global memory with the outMerge array
|
||||
* @param array1: float Pointer to global memory with the outCount array
|
||||
* @param array2: float Pointer to global memory with the outData array
|
||||
*/
|
||||
__kernel void
|
||||
memset_out(__global float *array0,
|
||||
__global float *array1,
|
||||
__global float *array2
|
||||
)
|
||||
{
|
||||
int i = get_global_id(0);
|
||||
//Global memory guard for padding
|
||||
if(i < BINS)
|
||||
{
|
||||
array0[i]=0.0f;
|
||||
array1[i]=0.0f;
|
||||
array2[i]=0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* \brief Performs the first part of a 2-step parallel reduction.
|
||||
*
|
||||
* Together with the second part, it take a flattened 4D-array
|
||||
* and returns the min and max of both of the 2 components of the
|
||||
* last dimension
|
||||
*
|
||||
* @param buffer: float Pointer to global memory with the flattened 4D-array (pos in pyFAI)
|
||||
* @param length: interger value of the length of the buffer array
|
||||
* @param preresult: float Pointer to global memory with the intermitiate data of the 2-step parallel reduction. Should be the size of the workgroup size
|
||||
*/
|
||||
|
||||
__kernel
|
||||
void reduce1(__global float2* buffer,
|
||||
__const int length,
|
||||
__global float4* preresult) {
|
||||
|
||||
|
||||
int global_index = get_global_id(0);
|
||||
int global_size = get_global_size(0);
|
||||
float4 accumulator;
|
||||
accumulator.x = INFINITY;
|
||||
accumulator.y = -INFINITY;
|
||||
accumulator.z = INFINITY;
|
||||
accumulator.w = -INFINITY;
|
||||
|
||||
// Loop sequentially over chunks of input vector
|
||||
while (global_index < length/2) {
|
||||
float2 element = buffer[global_index];
|
||||
accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
|
||||
accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
|
||||
accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
|
||||
accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
|
||||
global_index += global_size;
|
||||
}
|
||||
|
||||
__local float4 scratch[WORKGROUP_SIZE];
|
||||
|
||||
// Perform parallel reduction
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = accumulator;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (local_index < active_threads)
|
||||
{
|
||||
float4 other = scratch[local_index + active_threads];
|
||||
float4 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
mine.z = (mine.z < other.z) ? mine.z : other.z;
|
||||
mine.w = (mine.w > other.w) ? mine.w : other.w;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (local_index == 0) {
|
||||
preresult[get_group_id(0)] = scratch[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief Performs the second part of a 2-step parallel reduction.
|
||||
*
|
||||
* Together with the second part, it take a flattened 4D-array
|
||||
* and returns the min and max of both of the 2 components of the
|
||||
* last dimension
|
||||
*
|
||||
* @param preresult: float Pointer to global memory with the intermitiate data of the 2-step parallel reduction. Should be the size of the workgroup size
|
||||
* @param result: float Pointer to global memory with the min/max values requested
|
||||
*/
|
||||
|
||||
__kernel
|
||||
void reduce2(__global float4* preresult,
|
||||
__global float4* result) {
|
||||
|
||||
|
||||
__local float4 scratch[WORKGROUP_SIZE];
|
||||
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = preresult[local_index];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (local_index < active_threads)
|
||||
{
|
||||
float4 other = scratch[local_index + active_threads];
|
||||
float4 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
mine.z = (mine.z < other.z) ? mine.z : other.z;
|
||||
mine.w = (mine.w > other.w) ? mine.w : other.w;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
if (local_index == 0) {
|
||||
result[0] = scratch[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* \brief Performs Normalization of input image
|
||||
*
|
||||
* Intensities of images are corrected by:
|
||||
* - dark (read-out) noise subtraction
|
||||
* - Solid angle correction (division)
|
||||
* - polarization correction (division)
|
||||
* - flat fiels correction (division)
|
||||
* Corrections are made in place unless the pixel is dummy.
|
||||
* Dummy pixels are left untouched so that they remain dummy
|
||||
*
|
||||
* @param image Float pointer to global memory storing the input image.
|
||||
* @param do_dark Bool/int: shall dark-current correction be applied ?
|
||||
* @param dark Float pointer to global memory storing the dark image.
|
||||
* @param do_flat Bool/int: shall flat-field correction be applied ?
|
||||
* @param flat Float pointer to global memory storing the flat image.
|
||||
* @param do_solidangle Bool/int: shall flat-field correction be applied ?
|
||||
* @param solidangle Float pointer to global memory storing the solid angle of each pixel.
|
||||
* @param do_polarization Bool/int: shall flat-field correction be applied ?
|
||||
* @param polarization Float pointer to global memory storing the polarization of each pixel.
|
||||
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
|
||||
* @param dummy Float: value for bad pixels
|
||||
* @param delta_dummy Float: precision for bad pixel value
|
||||
*
|
||||
**/
|
||||
__kernel void
|
||||
corrections( __global float *image,
|
||||
const int do_dark,
|
||||
const __global float *dark,
|
||||
const int do_flat,
|
||||
const __global float *flat,
|
||||
const int do_solidangle,
|
||||
const __global float *solidangle,
|
||||
const int do_polarization,
|
||||
const __global float *polarization,
|
||||
const int do_dummy,
|
||||
const float dummy,
|
||||
const float delta_dummy
|
||||
)
|
||||
{
|
||||
float data;
|
||||
int i= get_global_id(0);
|
||||
if(i < NIMAGE)
|
||||
{
|
||||
data = image[i];
|
||||
int dummy_condition = ((!do_dummy) || ((delta_dummy!=0.0f) && (fabs(data-dummy) > delta_dummy)) || ((delta_dummy==0.0f) && (data!=dummy)));
|
||||
data -= do_dark ? dark[i] : 0;
|
||||
data *= do_flat ? 1/flat[i] : 1;
|
||||
data *= do_solidangle ? 1/solidangle[i] : 1;
|
||||
data *= do_polarization ? 1/polarization[i] : 1;
|
||||
image[i] = dummy_condition ? data : dummy;
|
||||
};//end if NIMAGE
|
||||
};//end kernel
|
||||
|
||||
|
||||
/**
|
||||
* \brief Performs 1d azimuthal integration with full pixel splitting
|
||||
*
|
||||
* @param pos Float pointer to global memory storting the flattened 4D-array with the pixel point coords
|
||||
* @param image Float pointer to global memory storing the input image.
|
||||
* @param minmax Float pointer to global memory holding the min/max results of the reduction kernels
|
||||
* @param length: Interger value of the length of the buffer array
|
||||
* @param row_ind Integer pointer to global memory holding the corresponding index of the coeficient
|
||||
* @param col_ptr Integer pointer to global memory holding the pointers to the coefs and row_ind for the CSR matrix
|
||||
* @param do_dummy Bool/int: shall the dummy pixel be checked. Dummy pixel are pixels marked as bad and ignored
|
||||
* @param dummy Float: value for bad pixels
|
||||
* @param outData Float pointer to the output 1D array with the weighted histogram
|
||||
* @param outCount Float pointer to the output 1D array with the unweighted histogram
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel
|
||||
void integrate1(__global float8* pos,
|
||||
__global float* image,
|
||||
// __global int* mask,
|
||||
// __const int check_mask,
|
||||
__global float4* minmax,
|
||||
const int length,
|
||||
// float2 pos0Range,
|
||||
// float2 pos1Range,
|
||||
const int do_dummy,
|
||||
const float dummy,
|
||||
__global float* outData,
|
||||
__global float* outCount)
|
||||
{
|
||||
int global_index = get_global_id(0);
|
||||
if (global_index < length)
|
||||
{
|
||||
// float pos0_min = fmax(fmin(pos0Range.x,pos0Range.y),minmax[0].s0);
|
||||
// float pos0_max = fmin(fmax(pos0Range.x,pos0Range.y),minmax[0].s1);
|
||||
float pos0_min = minmax[0].s0;
|
||||
float pos0_max = minmax[0].s1;
|
||||
pos0_max *= 1 + EPS;
|
||||
|
||||
float delta = (pos0_max - pos0_min) / BINS;
|
||||
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
float8 pixel = pos[global_index];
|
||||
float data = image[global_index];
|
||||
|
||||
pixel.s0 = getBinNr(pixel.s0, delta, pos0_min);
|
||||
pixel.s2 = getBinNr(pixel.s2, delta, pos0_min);
|
||||
pixel.s4 = getBinNr(pixel.s4, delta, pos0_min);
|
||||
pixel.s6 = getBinNr(pixel.s6, delta, pos0_min);
|
||||
|
||||
float min0 = min4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
|
||||
float max0 = max4f(pixel.s0, pixel.s2, pixel.s4, pixel.s6);
|
||||
|
||||
int bin0_min = floor(min0);
|
||||
int bin0_max = floor(max0);
|
||||
|
||||
float2 AB, BC, CD, DA;
|
||||
|
||||
AB.x=(pixel.s3-pixel.s1)/(pixel.s2-pixel.s0);
|
||||
AB.y= pixel.s1 - AB.x*pixel.s0;
|
||||
BC.x=(pixel.s5-pixel.s3)/(pixel.s4-pixel.s2);
|
||||
BC.y= pixel.s3 - BC.x*pixel.s2;
|
||||
CD.x=(pixel.s7-pixel.s5)/(pixel.s6-pixel.s4);
|
||||
CD.y= pixel.s5 - CD.x*pixel.s4;
|
||||
DA.x=(pixel.s1-pixel.s7)/(pixel.s0-pixel.s6);
|
||||
DA.y= pixel.s7 - DA.x*pixel.s6;
|
||||
|
||||
float areaPixel = area4(pixel.s0, pixel.s1, pixel.s2, pixel.s3, pixel.s4, pixel.s5, pixel.s6, pixel.s7);
|
||||
float oneOverPixelArea = 1.0 / areaPixel;
|
||||
for (int bin=bin0_min; bin < bin0_max+1; bin++)
|
||||
{
|
||||
float A_lim = (pixel.s0<=bin)*(pixel.s0<=(bin+1))*bin + (pixel.s0>bin)*(pixel.s0<=(bin+1))*pixel.s0 + (pixel.s0>bin)*(pixel.s0>(bin+1))*(bin+1);
|
||||
float B_lim = (pixel.s2<=bin)*(pixel.s2<=(bin+1))*bin + (pixel.s2>bin)*(pixel.s2<=(bin+1))*pixel.s2 + (pixel.s2>bin)*(pixel.s2>(bin+1))*(bin+1);
|
||||
float C_lim = (pixel.s4<=bin)*(pixel.s4<=(bin+1))*bin + (pixel.s4>bin)*(pixel.s4<=(bin+1))*pixel.s4 + (pixel.s4>bin)*(pixel.s4>(bin+1))*(bin+1);
|
||||
float D_lim = (pixel.s6<=bin)*(pixel.s6<=(bin+1))*bin + (pixel.s6>bin)*(pixel.s6<=(bin+1))*pixel.s6 + (pixel.s6>bin)*(pixel.s6>(bin+1))*(bin+1);
|
||||
float partialArea = integrate_line(A_lim, B_lim, AB);
|
||||
partialArea += integrate_line(B_lim, C_lim, BC);
|
||||
partialArea += integrate_line(C_lim, D_lim, CD);
|
||||
partialArea += integrate_line(D_lim, A_lim, DA);
|
||||
float tmp = fabs(partialArea) * oneOverPixelArea;
|
||||
outCount[bin] = tmp;
|
||||
outData[bin] = data*tmp;
|
||||
// AtomicAdd(&outCount[bin], tmp);
|
||||
// AtomicAdd(&outData[bin], data*tmp);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* \brief Finished the 1d azimuthal integration by calculating the ratio of the 2 histograms
|
||||
*
|
||||
* @param outData Float pointer to the output 1D array with the weighted histogram
|
||||
* @param outCount Float pointer to the output 1D array with the unweighted histogram
|
||||
* @param outMerged Float pointer to the output 1D array with the diffractogram
|
||||
*
|
||||
*/
|
||||
|
||||
__kernel
|
||||
void integrate2(__global float* outData,
|
||||
__global float* outCount,
|
||||
__global float* outMerge)
|
||||
{
|
||||
int global_index = get_global_id(0);
|
||||
if (global_index < BINS)
|
||||
outMerge[global_index] = outData[global_index]/outCount[global_index];
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
__kernel
|
||||
void reduce1(__global float* buffer,
|
||||
__const int length,
|
||||
__global float2* preresult) {
|
||||
|
||||
|
||||
int global_index = get_global_id(0);
|
||||
int global_size = get_global_size(0);
|
||||
float2 accumulator;
|
||||
accumulator.x = INFINITY;
|
||||
accumulator.y = -INFINITY;
|
||||
// Loop sequentially over chunks of input vector
|
||||
while (global_index < length) {
|
||||
float element = buffer[global_index];
|
||||
accumulator.x = (accumulator.x < element) ? accumulator.x : element;
|
||||
accumulator.y = (accumulator.y > element) ? accumulator.y : element;
|
||||
global_index += global_size;
|
||||
}
|
||||
|
||||
__local float2 scratch[WORKGROUP_SIZE];
|
||||
|
||||
// Perform parallel reduction
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = accumulator;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 2)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (thread_id_loc < active_threads)
|
||||
{
|
||||
float2 other = scratch[local_index + active_threads];
|
||||
float2 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (local_index == 0) {
|
||||
preresult[get_group_id(0)] = scratch[0];
|
||||
}
|
||||
}
|
||||
|
||||
__kernel
|
||||
void reduce2(__global float2* preresult,
|
||||
__global float4* result) {
|
||||
|
||||
|
||||
__local float2 scratch[WORKGROUP_SIZE];
|
||||
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = preresult[local_index];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 2)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (thread_id_loc < active_threads)
|
||||
{
|
||||
float2 other = scratch[local_index + active_threads];
|
||||
float2 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
if (local_index == 0) {
|
||||
result[0] = vload4(0,scratch);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,104 @@
|
|||
__kernel
|
||||
void reduce1(__global float2* buffer,
|
||||
__const int length,
|
||||
__global float4* preresult) {
|
||||
|
||||
|
||||
int global_index = get_global_id(0);
|
||||
int global_size = get_global_size(0);
|
||||
float4 accumulator;
|
||||
accumulator.x = INFINITY;
|
||||
accumulator.y = -INFINITY;
|
||||
accumulator.z = INFINITY;
|
||||
accumulator.w = -INFINITY;
|
||||
|
||||
// Loop sequentially over chunks of input vector
|
||||
while (global_index < length/2) {
|
||||
float2 element = buffer[global_index];
|
||||
accumulator.x = (accumulator.x < element.s0) ? accumulator.x : element.s0;
|
||||
accumulator.y = (accumulator.y > element.s0) ? accumulator.y : element.s0;
|
||||
accumulator.z = (accumulator.z < element.s1) ? accumulator.z : element.s1;
|
||||
accumulator.w = (accumulator.w > element.s1) ? accumulator.w : element.s1;
|
||||
global_index += global_size;
|
||||
}
|
||||
|
||||
__local float4 scratch[WORKGROUP_SIZE];
|
||||
|
||||
// Perform parallel reduction
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = accumulator;
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (local_index < active_threads)
|
||||
{
|
||||
float4 other = scratch[local_index + active_threads];
|
||||
float4 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
mine.z = (mine.z < other.z) ? mine.z : other.z;
|
||||
mine.w = (mine.w > other.w) ? mine.w : other.w;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
if (local_index == 0) {
|
||||
preresult[get_group_id(0)] = scratch[0];
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
__kernel
|
||||
void reduce2(__global float4* preresult,
|
||||
__global float4* result) {
|
||||
|
||||
|
||||
__local float4 scratch[WORKGROUP_SIZE];
|
||||
|
||||
int local_index = get_local_id(0);
|
||||
|
||||
scratch[local_index] = preresult[local_index];
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
int active_threads = get_local_size(0);
|
||||
|
||||
while (active_threads != 1)
|
||||
{
|
||||
active_threads /= 2;
|
||||
if (local_index < active_threads)
|
||||
{
|
||||
float4 other = scratch[local_index + active_threads];
|
||||
float4 mine = scratch[local_index];
|
||||
mine.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
mine.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
mine.z = (mine.z < other.z) ? mine.z : other.z;
|
||||
mine.w = (mine.w > other.w) ? mine.w : other.w;
|
||||
/*
|
||||
float2 tmp;
|
||||
tmp.x = (mine.x < other.x) ? mine.x : other.x;
|
||||
tmp.y = (mine.y > other.y) ? mine.y : other.y;
|
||||
scratch[local_index] = tmp;
|
||||
*/
|
||||
scratch[local_index] = mine;
|
||||
}
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
|
||||
if (local_index == 0) {
|
||||
result[0] = scratch[0];
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
import pyopencl as cl
|
||||
from pyopencl import array
|
||||
import numpy
|
||||
|
||||
length = 640000
|
||||
workgroup_size = 128
|
||||
|
||||
a = numpy.random.rand(length).astype(numpy.float32)
|
||||
a.shape = (length/8,4,2)
|
||||
input_a = a.reshape(length)
|
||||
|
||||
min0 = a[:, :, 0].min()
|
||||
max0 = a[:, :, 0].max()
|
||||
min1 = a[:, :, 1].min()
|
||||
max1 = a[:, :, 1].max()
|
||||
minmax=(min0,max0,min1,max1)
|
||||
|
||||
|
||||
platform = cl.get_platforms()[0]
|
||||
device = platform.get_devices()[0]
|
||||
ctx = cl.Context((device,))
|
||||
queue = cl.CommandQueue(ctx)
|
||||
mf = cl.mem_flags
|
||||
|
||||
|
||||
d_input = array.to_device(queue, input_a)
|
||||
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4*4*workgroup_size)
|
||||
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4*4)
|
||||
|
||||
with open("reduction_test4.cl", "r") as kernelFile:
|
||||
kernel_src = kernelFile.read()
|
||||
|
||||
compile_options = "-D WORKGROUP_SIZE=%i" % (workgroup_size)
|
||||
|
||||
program = cl.Program(ctx, kernel_src).build(options=compile_options)
|
||||
|
||||
program.reduce1(queue, (workgroup_size*workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(length), d_preresult)
|
||||
|
||||
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
|
||||
|
||||
result = numpy.ndarray(4,dtype=numpy.float32)
|
||||
|
||||
cl.enqueue_copy(queue,result, d_result)
|
||||
|
||||
|
||||
print minmax
|
||||
|
||||
print result
|
||||
|
||||
|
||||
|
||||
|
|
@ -84,6 +84,13 @@ except ImportError as error:
|
|||
" full pixel splitting: %s" % error)
|
||||
splitPixel = None
|
||||
|
||||
try:
|
||||
from . import splitPixelFull # IGNORE:F0401
|
||||
except ImportError as error:
|
||||
logger.error("Unable to import pyFAI.splitPixelFull"
|
||||
" full pixel splitting: %s" % error)
|
||||
splitPixelFull = None
|
||||
|
||||
try:
|
||||
from . import splitBBox # IGNORE:F0401
|
||||
except ImportError as error:
|
||||
|
@ -2407,39 +2414,74 @@ class AzimuthalIntegrator(Geometry):
|
|||
|
||||
|
||||
if (I is None) and ("splitpix" in method):
|
||||
if splitPixel is None:
|
||||
logger.warning("SplitPixel is not available,"
|
||||
" falling back on splitbbox histogram !")
|
||||
method = "splitbbox"
|
||||
else:
|
||||
logger.debug("integrate1d uses SplitPixel implementation")
|
||||
pos = self.array_from_unit(shape, "corner", unit)
|
||||
qAxis, I, a, b = splitPixel.fullSplit1D(pos=pos,
|
||||
weights=data,
|
||||
bins=nbPt,
|
||||
pos0Range=radial_range,
|
||||
pos1Range=azimuth_range,
|
||||
dummy=dummy,
|
||||
delta_dummy=delta_dummy,
|
||||
mask=mask,
|
||||
dark=dark,
|
||||
flat=flat,
|
||||
solidangle=solidangle,
|
||||
polarization=polarization
|
||||
)
|
||||
if error_model == "azimuthal":
|
||||
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
|
||||
if variance is not None:
|
||||
_, var1d, a, b = splitPixel.fullSplit1D(pos=pos,
|
||||
weights=variance,
|
||||
if "full" in method:
|
||||
if splitPixel is None:
|
||||
logger.warning("SplitPixelFull is not available,"
|
||||
" falling back on splitbbox histogram !")
|
||||
method = "splitbbox"
|
||||
else:
|
||||
logger.debug("integrate1d uses SplitPixel implementation")
|
||||
pos = self.array_from_unit(shape, "corner", unit)
|
||||
qAxis, I, a, b = splitPixelFull.fullSplit1D(pos=pos,
|
||||
weights=data,
|
||||
bins=nbPt,
|
||||
pos0Range=radial_range,
|
||||
pos1Range=azimuth_range,
|
||||
dummy=dummy,
|
||||
delta_dummy=delta_dummy,
|
||||
mask=mask,
|
||||
dark=dark,
|
||||
flat=flat,
|
||||
solidangle=solidangle,
|
||||
polarization=polarization
|
||||
)
|
||||
sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
|
||||
if error_model == "azimuthal":
|
||||
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
|
||||
if variance is not None:
|
||||
_, var1d, a, b = splitPixelFull.fullSplit1D(pos=pos,
|
||||
weights=variance,
|
||||
bins=nbPt,
|
||||
pos0Range=radial_range,
|
||||
pos1Range=azimuth_range,
|
||||
dummy=dummy,
|
||||
delta_dummy=delta_dummy,
|
||||
mask=mask,
|
||||
)
|
||||
sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
|
||||
else:
|
||||
if splitPixel is None:
|
||||
logger.warning("SplitPixel is not available,"
|
||||
" falling back on splitbbox histogram !")
|
||||
method = "splitbbox"
|
||||
else:
|
||||
logger.debug("integrate1d uses SplitPixel implementation")
|
||||
pos = self.array_from_unit(shape, "corner", unit)
|
||||
qAxis, I, a, b = splitPixel.fullSplit1D(pos=pos,
|
||||
weights=data,
|
||||
bins=nbPt,
|
||||
pos0Range=radial_range,
|
||||
pos1Range=azimuth_range,
|
||||
dummy=dummy,
|
||||
delta_dummy=delta_dummy,
|
||||
mask=mask,
|
||||
dark=dark,
|
||||
flat=flat,
|
||||
solidangle=solidangle,
|
||||
polarization=polarization
|
||||
)
|
||||
if error_model == "azimuthal":
|
||||
variance = (data - self.calcfrom1d(qAxis * pos0_scale, I, dim1_unit=unit)) ** 2
|
||||
if variance is not None:
|
||||
_, var1d, a, b = splitPixel.fullSplit1D(pos=pos,
|
||||
weights=variance,
|
||||
bins=nbPt,
|
||||
pos0Range=radial_range,
|
||||
pos1Range=azimuth_range,
|
||||
dummy=dummy,
|
||||
delta_dummy=delta_dummy,
|
||||
mask=mask,
|
||||
)
|
||||
sigma = numpy.sqrt(a) / numpy.maximum(b, 1)
|
||||
|
||||
if (I is None) and ("bbox" in method):
|
||||
if splitBBox is None:
|
||||
|
|
|
@ -58,6 +58,7 @@ class OCL_CSR_Integrator(object):
|
|||
indptr: row pointer indicates the start of a given row. len nbin+1
|
||||
@param image_size: size of the image (for pre-processing)
|
||||
@param devicetype: can be "cpu","gpu","acc" or "all"
|
||||
@param block_size: the chosen size for WORKGROUP_SIZE
|
||||
@param platformid: number of the platform as given by clinfo
|
||||
@type platformid: int
|
||||
@param deviceid: number of the device as given by clinfo
|
||||
|
|
|
@ -52,11 +52,13 @@ class OCL_CSR_Integrator(object):
|
|||
platformid=None, deviceid=None,
|
||||
checksum=None, profile=False):
|
||||
"""
|
||||
@param data: coefficient of the matrix in a 1D vector of float32 - size of nnz
|
||||
@param indices: Column index position for the data (same size as data)
|
||||
@param indptr: row pointer indicates the start of a given row. len nbin+1
|
||||
@param image_size:
|
||||
@param lut: 3-tuple of arrays
|
||||
data: coefficient of the matrix in a 1D vector of float32 - size of nnz
|
||||
indices: Column index position for the data (same size as data)
|
||||
indptr: row pointer indicates the start of a given row. len nbin+1
|
||||
@param image_size:
|
||||
@param devicetype: can be "cpu","gpu","acc" or "all"
|
||||
@param block_size: the chosen size for WORKGROUP_SIZE
|
||||
@param platformid: number of the platform as given by clinfo
|
||||
@type platformid: int
|
||||
@param deviceid: number of the device as given by clinfo
|
||||
|
|
|
@ -0,0 +1,380 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Project: Azimuthal integration
|
||||
# https://github.com/kif/pyFAI
|
||||
#
|
||||
#
|
||||
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
|
||||
#
|
||||
# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
|
||||
# Giannis Ashiotis
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
__authors__ = ["Jérôme Kieffer", "Giannis Ashiotis"]
|
||||
__license__ = "GPLv3"
|
||||
__date__ = "04/04/2014"
|
||||
__copyright__ = "2014, ESRF, Grenoble"
|
||||
__contact__ = "jerome.kieffer@esrf.fr"
|
||||
|
||||
import os, gc, logging
|
||||
import threading
|
||||
import hashlib
|
||||
import numpy
|
||||
from .opencl import ocl, pyopencl
|
||||
from .splitBBoxLUT import HistoBBox1d
|
||||
from .utils import get_cl_file
|
||||
from pyopencl import array
|
||||
if pyopencl:
|
||||
mf = pyopencl.mem_flags
|
||||
else:
|
||||
raise ImportError("pyopencl is not installed")
|
||||
try:
|
||||
from .fastcrc import crc32
|
||||
except:
|
||||
from zlib import crc32
|
||||
logger = logging.getLogger("pyFAI.ocl_azim_csr")
|
||||
|
||||
class OCL_Hist_Pixelsplit(object):
|
||||
def __init__(self, pos, bins, image_size, pos0Range=None, pos1Range=None, devicetype="all",
|
||||
padded=False, block_size=32,
|
||||
platformid=None, deviceid=None,
|
||||
checksum=None, profile=False):
|
||||
"""
|
||||
@param lut: 3-tuple of arrays
|
||||
data: coefficient of the matrix in a 1D vector of float32 - size of nnz
|
||||
indices: Column index position for the data (same size as data)
|
||||
indptr: row pointer indicates the start of a given row. len nbin+1
|
||||
@param image_size: size of the image (for pre-processing)
|
||||
@param devicetype: can be "cpu","gpu","acc" or "all"
|
||||
@param platformid: number of the platform as given by clinfo
|
||||
@type platformid: int
|
||||
@param deviceid: number of the device as given by clinfo
|
||||
@type deviceid: int
|
||||
@param checksum: pre - calculated checksum to prevent re - calculating it :)
|
||||
@param profile: store profiling elements
|
||||
"""
|
||||
self.BLOCK_SIZE = block_size # query for warp size
|
||||
self.padded = padded
|
||||
self._sem = threading.Semaphore()
|
||||
self.pos = pos
|
||||
self.bins = bins
|
||||
self.pos_size = pos.size
|
||||
self.size = image_size
|
||||
if self.pos_size != 8 * self.size:
|
||||
raise RuntimeError("pos.size != 8 * image_size")
|
||||
self.pos0Range = numpy.zeros(1, pyopencl.array.vec.float2)
|
||||
self.pos1Range = numpy.zeros(1, pyopencl.array.vec.float2)
|
||||
if (pos0Range is not None) and (len(pos0Range) is 2):
|
||||
self.pos0Range[0][0] = min(pos0Range)
|
||||
self.pos0Range[0][1] = max(pos0Range)
|
||||
else:
|
||||
self.pos0Range[0][0] = -float("inf")
|
||||
self.pos0Range[0][1] = float("inf")
|
||||
|
||||
if (pos1Range is not None) and (len(pos1Range) is 2):
|
||||
self.pos1Range[0][0] = min(pos1Range)
|
||||
self.pos1Range[0][1] = max(pos1Range)
|
||||
else:
|
||||
self.pos1Range[0][0] = -float("inf")
|
||||
self.pos1Range[0][1] = float("inf")
|
||||
|
||||
self.profile = profile
|
||||
if not checksum:
|
||||
checksum = crc32(self.pos)
|
||||
self.on_device = {"pos":checksum, "dark":None, "flat":None, "polarization":None, "solidangle":None}
|
||||
self._cl_kernel_args = {}
|
||||
self._cl_mem = {}
|
||||
self.events = []
|
||||
if (platformid is None) and (deviceid is None):
|
||||
platformid, deviceid = ocl.select_device(devicetype)
|
||||
elif platformid is None:
|
||||
platformid = 0
|
||||
elif deviceid is None:
|
||||
deviceid = 0
|
||||
self.platform = ocl.platforms[platformid]
|
||||
self.device = self.platform.devices[deviceid]
|
||||
self.device_type = self.device.type
|
||||
if (self.device_type == "CPU") and (self.platform.vendor == "Apple"):
|
||||
logger.warning("This is a workaround for Apple's OpenCL on CPU: enforce BLOCK_SIZE=1")
|
||||
self.BLOCK_SIZE = 1
|
||||
self.workgroup_size = self.BLOCK_SIZE,
|
||||
self.wdim_bins = (self.bins * self.BLOCK_SIZE),
|
||||
self.wdim_data = (self.size + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),
|
||||
try:
|
||||
#self._ctx = pyopencl.Context(devices=[pyopencl.get_platforms()[platformid].get_devices()[deviceid]])
|
||||
self._ctx = pyopencl.create_some_context()
|
||||
if self.profile:
|
||||
self._queue = pyopencl.CommandQueue(self._ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE)
|
||||
else:
|
||||
self._queue = pyopencl.CommandQueue(self._ctx)
|
||||
self._allocate_buffers()
|
||||
self._compile_kernels()
|
||||
self._set_kernel_arguments()
|
||||
except pyopencl.MemoryError as error:
|
||||
raise MemoryError(error)
|
||||
ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["pos"], self.pos)
|
||||
if self.profile: self.events.append(("copy pos data",ev))
|
||||
reduction_wg_size = 256
|
||||
reduce1 = self._program.reduce1(self._queue, (reduction_wg_size*reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce1"])
|
||||
self.events.append(("reduce1",reduce1))
|
||||
reduce2 = self._program.reduce2(self._queue, (reduction_wg_size,), (reduction_wg_size,), *self._cl_kernel_args["reduce2"])
|
||||
self.events.append(("reduce2",reduce2))
|
||||
|
||||
result = numpy.ndarray(4,dtype=numpy.float32)
|
||||
pyopencl.enqueue_copy(self._queue,result, self._cl_mem["minmax"])
|
||||
print result
|
||||
min0 = pos[:, :, 0].min()
|
||||
max0 = pos[:, :, 0].max()
|
||||
min1 = pos[:, :, 1].min()
|
||||
max1 = pos[:, :, 1].max()
|
||||
minmax=(min0,max0,min1,max1)
|
||||
|
||||
print minmax
|
||||
|
||||
def __del__(self):
|
||||
"""
|
||||
Destructor: release all buffers
|
||||
"""
|
||||
self._free_kernels()
|
||||
self._free_buffers()
|
||||
self._queue = None
|
||||
self._ctx = None
|
||||
gc.collect()
|
||||
|
||||
def _allocate_buffers(self):
|
||||
"""
|
||||
Allocate OpenCL buffers required for a specific configuration
|
||||
|
||||
Note that an OpenCL context also requires some memory, as well as Event and other OpenCL functionalities which cannot and
|
||||
are not taken into account here.
|
||||
The memory required by a context varies depending on the device. Typical for GTX580 is 65Mb but for a 9300m is ~15Mb
|
||||
In addition, a GPU will always have at least 3-5Mb of memory in use.
|
||||
Unfortunately, OpenCL does NOT have a built-in way to check the actual free memory on a device, only the total memory.
|
||||
"""
|
||||
if self.size < self.BLOCK_SIZE:
|
||||
raise RuntimeError("Fatal error in _allocate_buffers. size (%d) must be >= BLOCK_SIZE (%d)\n", self.size, self.BLOCK_SIZE)
|
||||
size_of_float = numpy.dtype(numpy.float32).itemsize
|
||||
size_of_short = numpy.dtype(numpy.int16).itemsize
|
||||
size_of_int = numpy.dtype(numpy.int32).itemsize
|
||||
size_of_long = numpy.dtype(numpy.int64).itemsize
|
||||
|
||||
ualloc = (self.pos_size * size_of_float)
|
||||
ualloc += (4 * self.BLOCK_SIZE * size_of_float)
|
||||
ualloc += (self.size * size_of_float) * 5
|
||||
ualloc += (self.bins * size_of_float) * 3
|
||||
memory = self.device.memory
|
||||
logger.info("%.3fMB are needed on device which has %.3fMB" % (ualloc / 1.0e6, memory / 1.0e6))
|
||||
if ualloc >= memory:
|
||||
raise MemoryError("Fatal error in _allocate_buffers. Not enough device memory for buffers (%lu requested, %lu available)" % (ualloc, memory))
|
||||
# now actually allocate:
|
||||
try:
|
||||
self._cl_mem["pos"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.pos_size)
|
||||
self._cl_mem["preresult"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4 * self.BLOCK_SIZE)
|
||||
self._cl_mem["minmax"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * 4)
|
||||
self._cl_mem["outData"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
|
||||
self._cl_mem["outCount"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.bins)
|
||||
self._cl_mem["outMerge"] = pyopencl.Buffer(self._ctx, mf.WRITE_ONLY, size=size_of_float * self.bins)
|
||||
self._cl_mem["image_u16"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_short * self.size)
|
||||
self._cl_mem["image"] = pyopencl.Buffer(self._ctx, mf.READ_WRITE, size=size_of_float * self.size)
|
||||
self._cl_mem["dark"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
|
||||
self._cl_mem["flat"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
|
||||
self._cl_mem["polarization"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
|
||||
self._cl_mem["solidangle"] = pyopencl.Buffer(self._ctx, mf.READ_ONLY, size=size_of_float * self.size)
|
||||
except pyopencl.MemoryError as error:
|
||||
self._free_buffers()
|
||||
raise MemoryError(error)
|
||||
|
||||
def _free_buffers(self):
|
||||
"""
|
||||
free all memory allocated on the device
|
||||
"""
|
||||
for buffer_name in self._cl_mem:
|
||||
if self._cl_mem[buffer_name] is not None:
|
||||
try:
|
||||
self._cl_mem[buffer_name].release()
|
||||
self._cl_mem[buffer_name] = None
|
||||
except pyopencl.LogicError:
|
||||
logger.error("Error while freeing buffer %s" % buffer_name)
|
||||
|
||||
def _compile_kernels(self, kernel_file=None):
|
||||
"""
|
||||
Call the OpenCL compiler
|
||||
@param kernel_file: path tothe
|
||||
"""
|
||||
kernel_name = "ocl_hist_pixelsplit.cl"
|
||||
if kernel_file is None:
|
||||
if os.path.isfile(kernel_name):
|
||||
kernel_file = os.path.abspath(kernel_name)
|
||||
else:
|
||||
kernel_file = get_cl_file(kernel_name)
|
||||
else:
|
||||
kernel_file = str(kernel_file)
|
||||
with open(kernel_file, "r") as kernelFile:
|
||||
kernel_src = kernelFile.read()
|
||||
|
||||
compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
|
||||
(self.bins, self.size, self.BLOCK_SIZE, numpy.finfo(numpy.float32).eps)
|
||||
logger.info("Compiling file %s with options %s" % (kernel_file, compile_options))
|
||||
try:
|
||||
self._program = pyopencl.Program(self._ctx, kernel_src).build(options=compile_options)
|
||||
except pyopencl.MemoryError as error:
|
||||
raise MemoryError(error)
|
||||
|
||||
def _free_kernels(self):
|
||||
"""
|
||||
free all kernels
|
||||
"""
|
||||
for kernel in self._cl_kernel_args:
|
||||
self._cl_kernel_args[kernel] = []
|
||||
self._program = None
|
||||
|
||||
def _set_kernel_arguments(self):
|
||||
"""Tie arguments of OpenCL kernel-functions to the actual kernels
|
||||
|
||||
set_kernel_arguments() is a private method, called by configure().
|
||||
It uses the dictionary _cl_kernel_args.
|
||||
Note that by default, since TthRange is disabled, the integration kernels have tth_min_max tied to the tthRange argument slot.
|
||||
When setRange is called it replaces that argument with tthRange low and upper bounds. When unsetRange is called, the argument slot
|
||||
is reset to tth_min_max.
|
||||
"""
|
||||
self._cl_kernel_args["reduce1"] = [self._cl_mem["pos"], numpy.int32(self.pos_size), self._cl_mem["preresult"]]
|
||||
self._cl_kernel_args["reduce2"] = [self._cl_mem["preresult"], self._cl_mem["minmax"]]
|
||||
self._cl_kernel_args["corrections"] = [self._cl_mem["image"], numpy.int32(0), self._cl_mem["dark"], numpy.int32(0), self._cl_mem["flat"], \
|
||||
numpy.int32(0), self._cl_mem["solidangle"], numpy.int32(0), self._cl_mem["polarization"], \
|
||||
numpy.int32(0), numpy.float32(0), numpy.float32(0)]
|
||||
self._cl_kernel_args["integrate1"] = [self._cl_mem["pos"], self._cl_mem["image"], self._cl_mem["minmax"], numpy.int32(0), self.pos0Range[0], \
|
||||
self.pos1Range[0], numpy.int32(0), numpy.float32(0), self._cl_mem["outData"], self._cl_mem["outCount"]]
|
||||
self._cl_kernel_args["integrate2"] = [self._cl_mem["outData"], self._cl_mem["outCount"], self._cl_mem["outMerge"]]
|
||||
self._cl_kernel_args["memset_out"] = [self._cl_mem[i] for i in ["outData", "outCount", "outMerge"]]
|
||||
self._cl_kernel_args["u16_to_float"] = [self._cl_mem[i] for i in ["image_u16", "image"]]
|
||||
self._cl_kernel_args["s32_to_float"] = [self._cl_mem[i] for i in ["image", "image"]]
|
||||
|
||||
def integrate(self, data, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None, dark_checksum=None, flat_checksum=None, solidAngle_checksum=None, polarization_checksum=None):
|
||||
events = []
|
||||
with self._sem:
|
||||
if data.dtype == numpy.uint16:
|
||||
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image_u16"], numpy.ascontiguousarray(data))
|
||||
cast_u16_to_float = self._program.u16_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["u16_to_float"])
|
||||
events+=[("copy image",copy_image),("cast", cast_u16_to_float)]
|
||||
elif data.dtype == numpy.int32:
|
||||
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data))
|
||||
cast_s32_to_float = self._program.s32_to_float(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["s32_to_float"])
|
||||
events+=[("copy image",copy_image),("cast", cast_s32_to_float)]
|
||||
else:
|
||||
copy_image = pyopencl.enqueue_copy(self._queue, self._cl_mem["image"], numpy.ascontiguousarray(data, dtype=numpy.float32))
|
||||
events+=[("copy image",copy_image)]
|
||||
memset = self._program.memset_out(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["memset_out"])
|
||||
events+=[("memset",memset)]
|
||||
if dummy is not None:
|
||||
do_dummy = numpy.int32(1)
|
||||
dummy = numpy.float32(dummy)
|
||||
if delta_dummy == None:
|
||||
delta_dummy = numpy.float32(0)
|
||||
else:
|
||||
delta_dummy = numpy.float32(abs(delta_dummy))
|
||||
else:
|
||||
do_dummy = numpy.int32(0)
|
||||
dummy = numpy.float32(0)
|
||||
delta_dummy = numpy.float32(0)
|
||||
self._cl_kernel_args["corrections"][9] = do_dummy
|
||||
self._cl_kernel_args["corrections"][10] = dummy
|
||||
self._cl_kernel_args["corrections"][11] = delta_dummy
|
||||
self._cl_kernel_args["integrate1"][6] = do_dummy
|
||||
self._cl_kernel_args["integrate1"][7] = dummy
|
||||
|
||||
if dark is not None:
|
||||
do_dark = numpy.int32(1)
|
||||
if not dark_checksum:
|
||||
dark_checksum = crc32(dark)
|
||||
if dark_checksum != self.on_device["dark"]:
|
||||
ev = pyopencl.enqueue_copy(self._queue, self._cl_mem["dark"], numpy.ascontiguousarray(dark, dtype=numpy.float32))
|
||||
events.append("copy dark",ev)
|
||||
self.on_device["dark"] = dark_checksum
|
||||
else:
|
||||
do_dark = numpy.int32(0)
|
||||
self._cl_kernel_args["corrections"][1] = do_dark
|
||||
if flat is not None:
|
||||
do_flat = numpy.int32(1)
|
||||
if not flat_checksum:
|
||||
flat_checksum = crc32(flat)
|
||||
if self.on_device["flat"] != flat_checksum:
|
||||
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["flat"], numpy.ascontiguousarray(flat, dtype=numpy.float32))
|
||||
events.append("copy flat",ev)
|
||||
self.on_device["flat"] = flat_checksum
|
||||
else:
|
||||
do_flat = numpy.int32(0)
|
||||
self._cl_kernel_args["corrections"][3] = do_flat
|
||||
|
||||
if solidAngle is not None:
|
||||
do_solidAngle = numpy.int32(1)
|
||||
if not solidAngle_checksum:
|
||||
solidAngle_checksum = crc32(solidAngle)
|
||||
if solidAngle_checksum != self.on_device["solidangle"]:
|
||||
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["solidangle"], numpy.ascontiguousarray(solidAngle, dtype=numpy.float32))
|
||||
events.append(("copy solidangle",ev))
|
||||
self.on_device["solidangle"] = solidAngle_checksum
|
||||
else:
|
||||
do_solidAngle = numpy.int32(0)
|
||||
self._cl_kernel_args["corrections"][5] = do_solidAngle
|
||||
|
||||
if polarization is not None:
|
||||
do_polarization = numpy.int32(1)
|
||||
if not polarization_checksum:
|
||||
polarization_checksum = crc32(polarization)
|
||||
if polarization_checksum != self.on_device["polarization"]:
|
||||
ev=pyopencl.enqueue_copy(self._queue, self._cl_mem["polarization"], numpy.ascontiguousarray(polarization, dtype=numpy.float32))
|
||||
events.append(("copy polarization",ev))
|
||||
self.on_device["polarization"] = polarization_checksum
|
||||
else:
|
||||
do_polarization = numpy.int32(0)
|
||||
self._cl_kernel_args["corrections"][7] = do_polarization
|
||||
copy_image.wait()
|
||||
if do_dummy + do_polarization + do_solidAngle + do_flat + do_dark > 0:
|
||||
ev = self._program.corrections(self._queue, self.wdim_data, self.workgroup_size, *self._cl_kernel_args["corrections"])
|
||||
events.append(("corrections",ev))
|
||||
integrate1 = self._program.integrate1(self._queue, self.wdim_bins, self.workgroup_size, *self._cl_kernel_args["integrate1"])
|
||||
events.append(("integrate1",integrate1))
|
||||
outMerge = numpy.empty(self.bins, dtype=numpy.float32)
|
||||
outData = numpy.empty(self.bins, dtype=numpy.float32)
|
||||
outCount = numpy.empty(self.bins, dtype=numpy.float32)
|
||||
ev=pyopencl.enqueue_copy(self._queue, outData, self._cl_mem["outData"])
|
||||
events.append(("copy D->H outData",ev))
|
||||
ev=pyopencl.enqueue_copy(self._queue, outCount, self._cl_mem["outCount"])
|
||||
events.append(("copy D->H outCount",ev))
|
||||
global_size_integrate2 = (self.bins + self.BLOCK_SIZE - 1) & ~(self.BLOCK_SIZE - 1),
|
||||
integrate2 = self._program.integrate2(self._queue, global_size_integrate2, self.workgroup_size, *self._cl_kernel_args["integrate2"])
|
||||
events.append(("integrate2",integrate2))
|
||||
ev=pyopencl.enqueue_copy(self._queue, outMerge, self._cl_mem["outMerge"])
|
||||
events.append(("copy D->H outMerge",ev))
|
||||
ev.wait()
|
||||
if self.profile:
|
||||
self.events+=events
|
||||
return outMerge, outData, outCount
|
||||
|
||||
def log_profile(self):
|
||||
"""
|
||||
If we are in profiling mode, prints out all timing for every single OpenCL call
|
||||
"""
|
||||
t = 0.0
|
||||
if self.profile:
|
||||
for e in self.events:
|
||||
if "__len__" in dir(e) and len(e) >= 2:
|
||||
et = 1e-6 * (e[1].profile.end - e[1].profile.start)
|
||||
print("%50s:\t%.3fms" % (e[0], et))
|
||||
t += et
|
||||
|
||||
print("_"*80)
|
||||
print("%50s:\t%.3fms" % ("Total execution time", t))
|
27
setup.py
27
setup.py
|
@ -49,14 +49,10 @@ from distutils.sysconfig import get_python_lib
|
|||
from distutils.command.install_data import install_data
|
||||
|
||||
################################################################################
|
||||
# Remove MANIFEST file ... it needs to be re-generated on the fly
|
||||
# Check for Cython
|
||||
################################################################################
|
||||
if op.isfile("MANIFEST"):
|
||||
os.unlink("MANIFEST")
|
||||
|
||||
################################################################################
|
||||
# Check for Cython
|
||||
################################################################################
|
||||
try:
|
||||
from Cython.Distutils import build_ext
|
||||
CYTHON = True
|
||||
|
@ -124,7 +120,6 @@ if ("sdist" in sys.argv):
|
|||
# pyFAI extensions
|
||||
# ###############################################################################
|
||||
cython_modules = [os.path.splitext(os.path.basename(i))[0] for i in glob.glob("src/*.pyx")]
|
||||
|
||||
src = dict([(ext, join("src", ext + cython_c_ext)) for ext in cython_modules])
|
||||
|
||||
_geometry_dic = dict(name="_geometry",
|
||||
|
@ -151,6 +146,16 @@ splitPixel_dic = dict(name="splitPixel",
|
|||
sources=[src['splitPixel']],
|
||||
)
|
||||
|
||||
splitPixelFull_dic = dict(name="splitPixelFull",
|
||||
include_dirs=get_numpy_include_dirs(),
|
||||
sources=[src['splitPixelFull']],
|
||||
)
|
||||
|
||||
splitPixelFullLUT_dic = dict(name="splitPixelFullLUT",
|
||||
include_dirs=get_numpy_include_dirs(),
|
||||
sources=[src['splitPixelFullLUT']],
|
||||
)
|
||||
|
||||
splitBBox_dic = dict(name="splitBBox",
|
||||
include_dirs=get_numpy_include_dirs(),
|
||||
sources=[src['splitBBox']],
|
||||
|
@ -200,6 +205,7 @@ _bispev_dic = dict(name="_bispev",
|
|||
sources=[src['_bispev'] ],
|
||||
extra_compile_args=['openmp'],
|
||||
extra_link_args=['openmp'],
|
||||
|
||||
)
|
||||
|
||||
_convolution_dic = dict(name="_convolution",
|
||||
|
@ -266,6 +272,7 @@ if sys.platform == "win32":
|
|||
if (filein + ".py") not in script_files:
|
||||
shutil.copyfile(filein, filein + ".py")
|
||||
script_files.append(filein + ".py")
|
||||
|
||||
else:
|
||||
script_files = glob.glob("scripts/*")
|
||||
|
||||
|
@ -420,10 +427,11 @@ This python module can be found on:
|
|||
http://pypi.python.org/pypi/pyopencl
|
||||
""")
|
||||
|
||||
|
||||
"""
|
||||
################################################################################
|
||||
# ###############################################################################
|
||||
# check if OpenMP modules, freshly installed can import
|
||||
################################################################################
|
||||
# ###############################################################################
|
||||
pyFAI = None
|
||||
sys.path.insert(0, os.path.dirname(installDir))
|
||||
# print installDir
|
||||
|
@ -436,7 +444,7 @@ for mod in sys.modules.copy():
|
|||
try:
|
||||
import pyFAI
|
||||
except ImportError as E:
|
||||
print("Unable to import pyFAI from system: %s" % E)
|
||||
print("Unable to import pyFAI: %s" % E)
|
||||
else:
|
||||
print("PyFAI is installed in %s" % pyFAI.__file__)
|
||||
try:
|
||||
|
@ -445,5 +453,4 @@ else:
|
|||
print("PyFAI.histogram failed to import. It is likely there is an OpenMP error: %s" % E)
|
||||
else:
|
||||
print("OpenMP libraries were found and pyFAI.histogram was successfully imported")
|
||||
|
||||
"""
|
||||
|
|
|
@ -903,3 +903,114 @@ def cal_LUT(float[:,:,:,:] pos not None, shape, int size, max_pixel_size):
|
|||
outMax[ml, nl] = k + 1
|
||||
idx += 1
|
||||
return lut.reshape(shape0 * shape1, size)
|
||||
|
||||
|
||||
def cal_CSR(float[:,:,:,:] pos not None, shape, bin_size, max_pixel_size):
|
||||
"""
|
||||
@param pos: 4D position array
|
||||
@param shape: output shape
|
||||
@param bin_size: number of input element per output element (as numpy array)
|
||||
@param max_pixel_size: (2-tuple of int) size of a buffer covering the largest pixel
|
||||
@return: look-up table in CSR format: 3-tuple of array"""
|
||||
cdef int i, j, ms, ml, ns, nl, shape0, shape1, delta0, delta1, buffer_size, i0, i1, bins, lut_size
|
||||
cdef int offset0, offset1, box_size0, box_size1
|
||||
cdef numpy.int32_t k, idx=0
|
||||
cdef float A0, A1, B0, B1, C0, C1, D0, D1, pAB, pBC, pCD, pDA, cAB, cBC, cCD, cDA, area, value
|
||||
cdef numpy.ndarray[lut_point, ndim = 3] lut
|
||||
cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros(shape, dtype=numpy.int32)
|
||||
cdef float[:,:] buffer
|
||||
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr, indices
|
||||
cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
|
||||
|
||||
shape0, shape1 = shape
|
||||
delta0, delta1 = max_pixel_size
|
||||
bins = shape0*shape1
|
||||
indptr = numpy.empty(bins+1, dtype=numpy.int32)
|
||||
indptr[0] = 0
|
||||
indptr[1:] = bin_size.cumsum(dtype=numpy.int32)
|
||||
lut_size = indptr[bins]
|
||||
|
||||
indices = numpy.zeros(shape=lut_size, dtype=numpy.int32)
|
||||
data = numpy.zeros(shape=lut_size, dtype=numpy.float32)
|
||||
|
||||
bins = shape0*shape1
|
||||
indptr[1:] = bin_size.cumsum(dtype=numpy.int32)
|
||||
|
||||
indices_size = lut_size*sizeof(numpy.int32)
|
||||
data_size = lut_size*sizeof(numpy.float32)
|
||||
indptr_size = bins*sizeof(numpy.int32)
|
||||
|
||||
logger.info("CSR matrix: %.3f MByte"%((indices_size+data_size+indptr_size)/1.0e6))
|
||||
buffer = cvarray(shape=(delta0, delta1), itemsize=sizeof(float), format="f")
|
||||
buffer_size = delta0 * delta1 * sizeof(float)
|
||||
logger.info("Max pixel size: %ix%i; Max source pixel in target: %i"%(buffer.shape[1],buffer.shape[0], lut_size))
|
||||
with nogil:
|
||||
# i,j, idx are indices of the raw image uncorrected
|
||||
for i in range(shape0):
|
||||
for j in range(shape1):
|
||||
#reinit of buffer
|
||||
memset(&buffer[0,0], 0, buffer_size)
|
||||
A0 = pos[i, j, 0, 0]
|
||||
A1 = pos[i, j, 0, 1]
|
||||
B0 = pos[i, j, 1, 0]
|
||||
B1 = pos[i, j, 1, 1]
|
||||
C0 = pos[i, j, 2, 0]
|
||||
C1 = pos[i, j, 2, 1]
|
||||
D0 = pos[i, j, 3, 0]
|
||||
D1 = pos[i, j, 3, 1]
|
||||
offset0 = (<int> floor(min4f(A0, B0, C0, D0)))
|
||||
offset1 = (<int> floor(min4f(A1, B1, C1, D1)))
|
||||
box_size0 = (<int> ceil(max4f(A0, B0, C0, D0))) - offset0
|
||||
box_size1 = (<int> ceil(max4f(A1, B1, C1, D1))) - offset1
|
||||
A0 -= <float> offset0
|
||||
A1 -= <float> offset1
|
||||
B0 -= <float> offset0
|
||||
B1 -= <float> offset1
|
||||
C0 -= <float> offset0
|
||||
C1 -= <float> offset1
|
||||
D0 -= <float> offset0
|
||||
D1 -= <float> offset1
|
||||
if B0 != A0:
|
||||
pAB = (B1 - A1) / (B0 - A0)
|
||||
cAB = A1 - pAB * A0
|
||||
else:
|
||||
pAB = cAB = 0.0
|
||||
if C0 != B0:
|
||||
pBC = (C1 - B1) / (C0 - B0)
|
||||
cBC = B1 - pBC * B0
|
||||
else:
|
||||
pBC = cBC = 0.0
|
||||
if D0 != C0:
|
||||
pCD = (D1 - C1) / (D0 - C0)
|
||||
cCD = C1 - pCD * C0
|
||||
else:
|
||||
pCD = cCD = 0.0
|
||||
if A0 != D0:
|
||||
pDA = (A1 - D1) / (A0 - D0)
|
||||
cDA = D1 - pDA * D0
|
||||
else:
|
||||
pDA = cDA = 0.0
|
||||
integrate(buffer, B0, A0, pAB, cAB)
|
||||
integrate(buffer, A0, D0, pDA, cDA)
|
||||
integrate(buffer, D0, C0, pCD, cCD)
|
||||
integrate(buffer, C0, B0, pBC, cBC)
|
||||
area = 0.5*((C0 - A0)*(D1 - B1)-(C1 - A1)*(D0 - B0))
|
||||
for ms in range(box_size0):
|
||||
ml = ms + offset0
|
||||
if ml < 0 or ml >= shape0:
|
||||
continue
|
||||
for ns in range(box_size1):
|
||||
# ms,ns are indexes of the corrected image in short form, ml & nl are the same
|
||||
nl = ns + offset1
|
||||
if nl < 0 or nl >= shape1:
|
||||
continue
|
||||
value = buffer[ms, ns] / area
|
||||
if value <= 0:
|
||||
continue
|
||||
k = outMax[ml,nl]
|
||||
tmp_index = indptr[ml*shape1+nl]
|
||||
indices[tmp_index+k] = idx
|
||||
data[tmp_index+k] = value
|
||||
outMax[ml,nl] = k + 1
|
||||
idx += 1
|
||||
return (data, indices, indptr)
|
|
@ -438,7 +438,7 @@ class Distortion(object):
|
|||
def correctHost(self, image):
|
||||
"""
|
||||
Correct an image based on the look-up table calculated ...
|
||||
Caclulation takes place on the Host
|
||||
Calculation takes place on the Host
|
||||
|
||||
@param image: 2D-array with the image
|
||||
@return: corrected 2D image
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,739 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Project: Azimuthal integration
|
||||
# https://forge.epn-campus.eu/projects/azimuthal
|
||||
#
|
||||
# File: "$Id$"
|
||||
#
|
||||
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
|
||||
#
|
||||
# Principal author: Giannis Ashiotis
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
|
||||
import cython
|
||||
cimport numpy
|
||||
import numpy
|
||||
from libc.math cimport fabs, floor
|
||||
from libc.stdio cimport printf
|
||||
from cython.view cimport array as cvarray
|
||||
|
||||
#cdef double areaTriangle(double a0,
|
||||
# double a1,
|
||||
# double b0,
|
||||
# double b1,
|
||||
# double c0,
|
||||
# double c1):
|
||||
# """
|
||||
# Calculate the area of the ABC triangle with corners:
|
||||
# A(a0,a1)
|
||||
# B(b0,b1)
|
||||
# C(c0,c1)
|
||||
# @return: area, i.e. 1/2 * (B-A)^(C-A)
|
||||
# """
|
||||
# return 0.5 * abs(((b0 - a0) * (c1 - a1)) - ((b1 - a1) * (c0 - a0)))
|
||||
#
|
||||
cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
|
||||
"""
|
||||
Calculate the area of the ABCD quadrilataire with corners:
|
||||
A(a0,a1)
|
||||
B(b0,b1)
|
||||
C(c0,c1)
|
||||
D(d0,d1)
|
||||
@return: area, i.e. 1/2 * (AC ^ BD)
|
||||
"""
|
||||
return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))
|
||||
|
||||
# cdef double area4(point2D *pixel):
|
||||
# """
|
||||
# Calculate the area of the ABCD quadrilataire with corners:
|
||||
# A(a0,a1)
|
||||
# B(b0,b1)
|
||||
# C(c0,c1)
|
||||
# D(d0,d1)
|
||||
# @return: area, i.e. 1/2 * (AC ^ BD)
|
||||
# """
|
||||
# return 0.5 * abs(((pixel[2].x - pixel[0].x) * (pixel[3].y - pixel[1].y)) - ((pixel[2].y - pixel[0].y) * (pixel[3].x - pixel[1].x)))
|
||||
|
||||
# cdef struct point2D:
|
||||
# numpy.float64_t x
|
||||
# numpy.float64_t y
|
||||
|
||||
# cdef struct min_max:
|
||||
# numpy.float64_t pos
|
||||
# numpy.int32_t point
|
||||
|
||||
cdef struct Function:
|
||||
double slope
|
||||
double intersect
|
||||
|
||||
#cdef class Function:
|
||||
|
||||
#cdef double _slope
|
||||
#cdef double _intersect
|
||||
|
||||
#def __cinit__(self, double A0=0.0, double A1=0.0, double B0=1.0, double B1=0.0):
|
||||
#self._slope = (B1-A1)/(B0-A0)
|
||||
#self._intersect = A1 - self._slope*A0
|
||||
|
||||
#def __cinit__(self):
|
||||
#self._slope = 0.0
|
||||
#self._intersect = 0.0
|
||||
|
||||
#cdef double f(self, double x):
|
||||
#return self._slope*x + self._intersect
|
||||
|
||||
#cdef double integrate(self, double A0, double B0) nogil:
|
||||
#if A0==B0:
|
||||
#return 0.0
|
||||
#else:
|
||||
#return self._slope*(B0*B0 - A0*A0)*0.5 + self._intersect*(B0-A0)
|
||||
|
||||
#cdef void reset(self, double A0, double A1, double B0, double B1) nogil:
|
||||
#self._slope = (B1-A1)/(B0-A0)
|
||||
#self._intersect = A1 - self._slope*A0
|
||||
|
||||
cdef double integrate( double A0, double B0, Function AB) nogil:
|
||||
"""
|
||||
integrates the line defined by AB, from A0 to B0
|
||||
param A0: first limit
|
||||
param B0: second limit
|
||||
param AB: struct with the slope and point of intersection of the line
|
||||
"""
|
||||
if A0==B0:
|
||||
return 0.0
|
||||
else:
|
||||
return AB.slope*(B0*B0 - A0*A0)*0.5 + AB.intersect*(B0-A0)
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef double getBinNr(double x0, double pos0_min, double dpos) nogil:
|
||||
"""
|
||||
calculate the bin number for any point
|
||||
param x0: current position
|
||||
param pos0_min: position minimum
|
||||
param dpos: bin width
|
||||
"""
|
||||
return (x0 - pos0_min) / dpos
|
||||
|
||||
# cdef min_max min4f(point2D *pixel, int dim) nogil:
|
||||
# cdef min_max tmp
|
||||
# if dim == 0:
|
||||
# if (pixel[0].x <= pixel[1].x) and (pixel[0].x <= pixel[2].x) and (pixel[0].x <= pixel[3].x):
|
||||
# tmp.pos = pixel[0].x
|
||||
# tmp.pixel = 0
|
||||
# return tmp
|
||||
# if (pixel[1].x <= pixel[0].x) and (pixel[1].x <= pixel[2].x) and (pixel[1].x <= pixel[3].x):
|
||||
# tmp.pos = pixel[1].x
|
||||
# tmp.pixel = 1
|
||||
# return tmp
|
||||
# if (pixel[2].x <= pixel[0].x) and (pixel[2].x <= pixel[1].x) and (pixel[2].x <= pixel[3].x):
|
||||
# tmp.pos = pixel[2].x
|
||||
# tmp.pixel = 2
|
||||
# return tmp
|
||||
# else:
|
||||
# tmp.pos = pixel[3].x
|
||||
# tmp.pixel = 3
|
||||
# return tmp
|
||||
# elif dim == 1:
|
||||
# if (pixel[0].y <= pixel[1].y) and (pixel[0].y <= pixel[2].y) and (pixel[0].y <= pixel[3].y):
|
||||
# tmp.pos = pixel[0].y
|
||||
# tmp.pixel = 0
|
||||
# return tmp
|
||||
# if (pixel[1].y <= pixel[0].y) and (pixel[1].y <= pixel[2].y) and (pixel[1].y <= pixel[3].y):
|
||||
# tmp.pos = pixel[1].y
|
||||
# tmp.pixel = 1
|
||||
# return tmp
|
||||
# if (pixel[2].y <= pixel[0].y) and (pixel[2].y <= pixel[1].y) and (pixel[2].y <= pixel[3].y):
|
||||
# tmp.pos = pixel[2].y
|
||||
# tmp.pixel = 2
|
||||
# return tmp
|
||||
# else:
|
||||
# tmp.pos = pixel[3].y
|
||||
|
||||
# tmp.pixel = 3
|
||||
# return tmp
|
||||
|
||||
# cdef min_max max4f(point2D *pixel, int dim) nogil:
|
||||
# cdef min_max tmp
|
||||
# if dim == 0:
|
||||
# if (pixel[0].x >= pixel[1].x) and (pixel[0].x >= pixel[2].x) and (pixel[0].x >= pixel[3].x):
|
||||
# tmp.pos = pixel[0].x
|
||||
# tmp.pixel = 0
|
||||
# return tmp
|
||||
# if (pixel[1].x >= pixel[0].x) and (pixel[1].x >= pixel[2].x) and (pixel[1].x >= pixel[3].x):
|
||||
# tmp.pos = pixel[1].x
|
||||
# tmp.pixel = 1
|
||||
# return tmp
|
||||
# if (pixel[2].x >= pixel[0].x) and (pixel[2].x >= pixel[1].x) and (pixel[2].x >= pixel[3].x):
|
||||
# tmp.pos = pixel[2].x
|
||||
# tmp.pixel = 2
|
||||
# return tmp
|
||||
# else:
|
||||
# tmp.pos = pixel[3].x
|
||||
# tmp.pixel = 3
|
||||
# return tmp
|
||||
# elif dim == 1:
|
||||
# if (pixel[0].y >= pixel[1].y) and (pixel[0].y >= pixel[2].y) and (pixel[0].y >= pixel[3].y):
|
||||
# tmp.pos = pixel[0].y
|
||||
# tmp.pixel = 0
|
||||
# return tmp
|
||||
# if (pixel[1].y >= pixel[0].y) and (pixel[1].y >= pixel[2].y) and (pixel[1].y >= pixel[3].y):
|
||||
# tmp.pos = pixel[1].y
|
||||
# tmp.pixel = 1
|
||||
# return tmp
|
||||
# if (pixel[2].y >= pixel[0].y) and (pixel[2].y >= pixel[1].y) and (pixel[2].y >= pixel[3].y):
|
||||
# tmp.pos = pixel[2].y
|
||||
# tmp.pixel = 2
|
||||
# return tmp
|
||||
# else:
|
||||
# tmp.pos = pixel[3].y
|
||||
|
||||
# tmp.pixel = 3
|
||||
# return tmp
|
||||
|
||||
|
||||
|
||||
cdef double min4f(double a, double b, double c, double d) nogil:
|
||||
"""Calculates the min of 4 double numbers"""
|
||||
if (a <= b) and (a <= c) and (a <= d):
|
||||
return a
|
||||
if (b <= a) and (b <= c) and (b <= d):
|
||||
return b
|
||||
if (c <= a) and (c <= b) and (c <= d):
|
||||
return c
|
||||
else:
|
||||
return d
|
||||
|
||||
cdef double max4f(double a, double b, double c, double d) nogil:
|
||||
"""Calculates the max of 4 double numbers"""
|
||||
if (a >= b) and (a >= c) and (a >= d):
|
||||
return a
|
||||
if (b >= a) and (b >= c) and (b >= d):
|
||||
return b
|
||||
if (c >= a) and (c >= b) and (c >= d):
|
||||
return c
|
||||
else:
|
||||
return d
|
||||
|
||||
@cython.cdivision(True)
|
||||
@cython.boundscheck(False)
|
||||
@cython.wraparound(False)
|
||||
def fullSplit1D(numpy.ndarray pos not None,
|
||||
numpy.ndarray weights not None,
|
||||
size_t bins=100,
|
||||
pos0Range=None,
|
||||
pos1Range=None,
|
||||
dummy=None,
|
||||
delta_dummy=None,
|
||||
mask=None,
|
||||
dark=None,
|
||||
flat=None,
|
||||
solidangle=None,
|
||||
polarization=None
|
||||
):
|
||||
"""
|
||||
Calculates histogram of pos weighted by weights
|
||||
|
||||
Splitting is done on the pixel's bounding box like fit2D.
|
||||
No compromise for speed has been made here.
|
||||
|
||||
|
||||
@param pos: 3D or 4D array with the coordinates of each pixel point
|
||||
@param weights: array with intensities
|
||||
@param bins: number of output bins
|
||||
@param pos0Range: minimum and maximum of the 2th range
|
||||
@param pos1Range: minimum and maximum of the chi range
|
||||
@param dummy: value for bins without pixels
|
||||
@param delta_dummy: precision of dummy value
|
||||
@param mask: array (of int8) with masked pixels with 1 (0=not masked)
|
||||
@param dark: array (of float64) with dark noise to be subtracted (or None)
|
||||
@param flat: array (of float64) with flat image
|
||||
@param polarization: array (of float64) with polarization correction
|
||||
@param solidangle: array (of float64) with flat image
|
||||
@return 2theta, I, weighted histogram, unweighted histogram
|
||||
"""
|
||||
cdef size_t size = weights.size
|
||||
if pos.ndim>3: #create a view
|
||||
pos = pos.reshape((-1,4,2))
|
||||
assert pos.shape[0] == size
|
||||
assert pos.shape[1] == 4
|
||||
assert pos.shape[2] == 2
|
||||
assert pos.ndim == 3
|
||||
assert bins > 1
|
||||
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(pos,dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(bins, dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(bins, dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outMerge = numpy.zeros(bins, dtype=numpy.float64)
|
||||
cdef numpy.int8_t[:] cmask
|
||||
cdef double[:] cflat, cdark, cpolarization, csolidangle
|
||||
|
||||
cdef double cdummy=0, cddummy=0, data=0
|
||||
cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_max=0, pos1_maxin=0
|
||||
cdef double areaPixel=0, dpos=0, fbin0_min=0, fbin0_max=0#, fbin1_min, fbin1_max
|
||||
cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
|
||||
cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
|
||||
cdef double oneOverArea=0, partialArea=0, tmp=0
|
||||
cdef double max0, min0
|
||||
#cdef point2D[:] pixel
|
||||
cdef Function AB, BC, CD, DA
|
||||
cdef double epsilon=1e-10
|
||||
|
||||
cdef bint check_pos1=False, check_mask=False, do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
|
||||
cdef int i=0, idx=0, bin=0, bin0_max=0, bin0_min=0, pixel_bins=0, cur_bin
|
||||
|
||||
if pos0Range is not None and len(pos0Range) > 1:
|
||||
pos0_min = min(pos0Range)
|
||||
pos0_maxin = max(pos0Range)
|
||||
else:
|
||||
pos0_min = pos[:, :, 0].min()
|
||||
pos0_maxin = pos[:, :, 0].max()
|
||||
pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
if pos1Range is not None and len(pos1Range) > 1:
|
||||
pos1_min = min(pos1Range)
|
||||
pos1_maxin = max(pos1Range)
|
||||
do_pos1 = True
|
||||
else:
|
||||
pos1_min = pos[:, :, 1].min()
|
||||
pos1_maxin = pos[:, :, 1].max()
|
||||
pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
dpos = (pos0_max - pos0_min) / (< double > (bins))
|
||||
|
||||
|
||||
outPos = numpy.linspace(pos0_min+0.5*dpos, pos0_maxin-0.5*dpos, bins)
|
||||
|
||||
if (dummy is not None) and (delta_dummy is not None):
|
||||
check_dummy = True
|
||||
cdummy = float(dummy)
|
||||
cddummy = float(delta_dummy)
|
||||
elif (dummy is not None):
|
||||
check_dummy = True
|
||||
cdummy = float(dummy)
|
||||
cddummy = 0.0
|
||||
else:
|
||||
check_dummy = False
|
||||
cdummy = 0.0
|
||||
cddummy = 0.0
|
||||
|
||||
if mask is not None:
|
||||
check_mask = True
|
||||
assert mask.size == size
|
||||
cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
|
||||
if dark is not None:
|
||||
do_dark = True
|
||||
assert dark.size == size
|
||||
cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
|
||||
if flat is not None:
|
||||
do_flat = True
|
||||
assert flat.size == size
|
||||
cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
|
||||
if polarization is not None:
|
||||
do_polarization = True
|
||||
assert polarization.size == size
|
||||
cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
|
||||
if solidangle is not None:
|
||||
do_solidangle = True
|
||||
assert solidangle.size == size
|
||||
csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
|
||||
|
||||
#pixel = cvarray(shape=4, itemsize=sizeof(point2D))
|
||||
#AB = Function()
|
||||
#BC = Function()
|
||||
#CD = Function()
|
||||
#DA = Function()
|
||||
|
||||
with nogil:
|
||||
for idx in range(size):
|
||||
|
||||
if (check_mask) and (cmask[idx]):
|
||||
continue
|
||||
|
||||
data = cdata[idx]
|
||||
if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
|
||||
continue
|
||||
|
||||
# pixel[0].x = getBinNr(< double > cpos[idx, 0, 0], pos0_min, dpos)
|
||||
# pixel[0].y = < double > cpos[idx, 0, 1]
|
||||
# pixel[1].x = getBinNr(< double > cpos[idx, 1, 0], pos0_min, dpos)
|
||||
# pixel[1].y = < double > cpos[idx, 1, 1]
|
||||
# pixel[2].x = getBinNr(< double > cpos[idx, 2, 0], pos0_min, dpos)
|
||||
# pixel[2].y = < double > cpos[idx, 2, 1]
|
||||
# pixel[3].x = getBinNr(< double > cpos[idx, 3, 0], pos0_min, dpos)
|
||||
# pixel[3].y = < double > cpos[idx, 3, 1]
|
||||
|
||||
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, dpos)
|
||||
A1 = < double > cpos[idx, 0, 1]
|
||||
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, dpos)
|
||||
B1 = < double > cpos[idx, 1, 1]
|
||||
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, dpos)
|
||||
C1 = < double > cpos[idx, 2, 1]
|
||||
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, dpos)
|
||||
D1 = < double > cpos[idx, 3, 1]
|
||||
|
||||
min0 = min(A0, B0, C0, D0)
|
||||
max0 = max(A0, B0, C0, D0)
|
||||
if (max0<0) or (min0 >=bins):
|
||||
continue
|
||||
if check_pos1:
|
||||
min1 = min(A1, B1, C1, D1)
|
||||
max1 = max(A1, B1, C1, D1)
|
||||
if (max1<pos1_min) or (min1 > pos1_maxin):
|
||||
continue
|
||||
|
||||
if do_dark:
|
||||
data -= cdark[idx]
|
||||
if do_flat:
|
||||
data /= cflat[idx]
|
||||
if do_polarization:
|
||||
data /= cpolarization[idx]
|
||||
if do_solidangle:
|
||||
data /= csolidangle[idx]
|
||||
|
||||
bin0_min = < int > floor(min0)
|
||||
bin0_max = < int > floor(max0)
|
||||
|
||||
#printf("%d - [(%f %f) (%f %f) (%f %f) (%f %f)] (%f %f) (%d %d)\n",idx, A0, A1, B0, B1, C0, C1, D0, D1, min0, max0, bin0_min, bin0_max)
|
||||
if bin0_min == bin0_max:
|
||||
#All pixel is within a single bin
|
||||
outCount[bin0_min] += 1
|
||||
outData[bin0_min] += data
|
||||
|
||||
# else we have pixel spliting.
|
||||
else:
|
||||
AB.slope=(B1-A1)/(B0-A0)
|
||||
AB.intersect= A1 - AB.slope*A0
|
||||
BC.slope=(C1-B1)/(C0-B0)
|
||||
BC.intersect= B1 - BC.slope*B0
|
||||
CD.slope=(D1-C1)/(D0-C0)
|
||||
CD.intersect= C1 - CD.slope*C0
|
||||
DA.slope=(A1-D1)/(A0-D0)
|
||||
DA.intersect= D1 - DA.slope*D0
|
||||
areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
|
||||
oneOverPixelArea = 1.0 / areaPixel
|
||||
partialArea2 = 0.0
|
||||
for bin in range(bin0_min, bin0_max+1):
|
||||
A_lim = (A0<=bin)*(A0<=(bin+1))*bin + (A0>bin)*(A0<=(bin+1))*A0 + (A0>bin)*(A0>(bin+1))*(bin+1)
|
||||
B_lim = (B0<=bin)*(B0<=(bin+1))*bin + (B0>bin)*(B0<=(bin+1))*B0 + (B0>bin)*(B0>(bin+1))*(bin+1)
|
||||
C_lim = (C0<=bin)*(C0<=(bin+1))*bin + (C0>bin)*(C0<=(bin+1))*C0 + (C0>bin)*(C0>(bin+1))*(bin+1)
|
||||
D_lim = (D0<=bin)*(D0<=(bin+1))*bin + (D0>bin)*(D0<=(bin+1))*D0 + (D0>bin)*(D0>(bin+1))*(bin+1)
|
||||
partialArea = integrate(A_lim, B_lim, AB)
|
||||
partialArea += integrate(B_lim, C_lim, BC)
|
||||
partialArea += integrate(C_lim, D_lim, CD)
|
||||
partialArea += integrate(D_lim, A_lim, DA)
|
||||
tmp = fabs(partialArea) * oneOverPixelArea
|
||||
partialArea2 += partialArea
|
||||
outCount[bin] += tmp
|
||||
outData[bin] += data * tmp
|
||||
#if fabs(partialArea2-areaPixel) > epsilon:
|
||||
#printf("%d - %f \n",idx,(partialArea2-areaPixel)/areaPixel)
|
||||
|
||||
|
||||
for i in range(bins):
|
||||
if outCount[i] > epsilon:
|
||||
outMerge[i] = outData[i] / outCount[i]
|
||||
else:
|
||||
outMerge[i] = cdummy
|
||||
|
||||
return outPos, outMerge, outData, outCount
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#@cython.cdivision(True)
|
||||
#@cython.boundscheck(False)
|
||||
#@cython.wraparound(False)
|
||||
#def fullSplit2D(numpy.ndarray pos not None,
|
||||
#numpy.ndarray weights not None,
|
||||
#bins not None,
|
||||
#pos0Range=None,
|
||||
#pos1Range=None,
|
||||
#dummy=None,
|
||||
#delta_dummy=None,
|
||||
#mask=None,
|
||||
#dark=None,
|
||||
#flat=None,
|
||||
#solidangle=None,
|
||||
#polarization=None):
|
||||
#"""
|
||||
#Calculate 2D histogram of pos weighted by weights
|
||||
|
||||
#Splitting is done on the pixel's bounding box like fit2D
|
||||
|
||||
|
||||
#@param pos: 3D array with pos0; Corner A,B,C,D; tth or chi
|
||||
#@param weights: array with intensities
|
||||
#@param bins: number of output bins int or 2-tuple of int
|
||||
#@param pos0Range: minimum and maximum of the 2th range
|
||||
#@param pos1Range: minimum and maximum of the chi range
|
||||
#@param dummy: value for bins without pixels
|
||||
#@param delta_dummy: precision of dummy value
|
||||
#@param mask: array (of int8) with masked pixels with 1 (0=not masked)
|
||||
#@param dark: array (of float64) with dark noise to be subtracted (or None)
|
||||
#@param flat: array (of float64) with flat-field image
|
||||
#@param polarization: array (of float64) with polarization correction
|
||||
#@param solidangle: array (of float64)with solid angle corrections
|
||||
#@return I, edges0, edges1, weighted histogram(2D), unweighted histogram (2D)
|
||||
#"""
|
||||
|
||||
#cdef size_t bins0=0, bins1=0, size = weights.size
|
||||
#if pos.ndim>3: #create a view
|
||||
#pos = pos.reshape((-1,4,2))
|
||||
|
||||
#assert pos.shape[0] == size
|
||||
#assert pos.shape[1] == 4 # 4 corners
|
||||
#assert pos.shape[2] == 2 # tth and chi
|
||||
#assert pos.ndim == 3
|
||||
#try:
|
||||
#bins0, bins1 = tuple(bins)
|
||||
#except:
|
||||
#bins0 = bins1 = < size_t > bins
|
||||
#if bins0 <= 0:
|
||||
#bins0 = 1
|
||||
#if bins1 <= 0:
|
||||
#bins1 = 1
|
||||
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = pos.astype(numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] cdata = weights.astype(numpy.float64).ravel()
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros((bins0, bins1), dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros((bins0, bins1), dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outMerge = numpy.zeros((bins0, bins1), dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges0 = numpy.zeros(bins0, dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] edges1 = numpy.zeros(bins1, dtype=numpy.float64)
|
||||
#cdef numpy.int8_t[:] cmask
|
||||
#cdef double[:] cflat, cdark, cpolarization, csolidangle
|
||||
|
||||
#cdef bint check_mask=False, do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidangle=False
|
||||
|
||||
#cdef double cdummy=0, cddummy=0, data=0
|
||||
#cdef double min0=0, max0=0, min1=0, max1=0, deltaR=0, deltaL=0, deltaU=0, deltaD=0, deltaA=0
|
||||
#cdef double pos0_min=0, pos0_max=0, pos1_min=0, pos1_max=0, pos0_maxin=0, pos1_maxin=0
|
||||
#cdef double areaPixel=0, fbin0_min=0, fbin0_max=0, fbin1_min=0, fbin1_max=0
|
||||
#cdef double a0=0, a1=0, b0=0, b1=0, c0=0, c1=0, d0=0, d1=0
|
||||
#cdef double epsilon = 1e-10
|
||||
|
||||
#cdef size_t bin0_max=0, bin0_min=0, bin1_max=0, bin1_min=0, i=0, j=0, idx=0
|
||||
|
||||
#if pos0Range is not None and len(pos0Range) == 2:
|
||||
#pos0_min = min(pos0Range)
|
||||
#pos0_maxin = max(pos0Range)
|
||||
#else:
|
||||
#pos0_min = pos[:, :, 0].min()
|
||||
#pos0_maxin = pos[:, :, 0].max()
|
||||
#pos0_max = pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
|
||||
#if pos1Range is not None and len(pos1Range) > 1:
|
||||
#pos1_min = min(pos1Range)
|
||||
#pos1_maxin = max(pos1Range)
|
||||
#else:
|
||||
#pos1_min = pos[:, :, 1].min()
|
||||
#pos1_maxin = pos[:, :, 1].max()
|
||||
#pos1_max = pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
|
||||
#cdef double dpos0 = (pos0_max - pos0_min) / (< double > (bins0))
|
||||
#cdef double dpos1 = (pos1_max - pos1_min) / (< double > (bins1))
|
||||
#edges0 = numpy.linspace(pos0_min+0.5*dpos0, pos0_maxin-0.5*dpos0, bins0)
|
||||
#edges1 = numpy.linspace(pos1_min+0.5*dpos1, pos1_maxin-0.5*dpos1, bins1)
|
||||
|
||||
#if (dummy is not None) and (delta_dummy is not None):
|
||||
#check_dummy = True
|
||||
#cdummy = float(dummy)
|
||||
#cddummy = float(delta_dummy)
|
||||
#elif (dummy is not None):
|
||||
#check_dummy = True
|
||||
#cdummy = float(dummy)
|
||||
#cddummy = 0.0
|
||||
#else:
|
||||
#check_dummy = False
|
||||
#cdummy = 0.0
|
||||
#cddummy = 0.0
|
||||
|
||||
#if mask is not None:
|
||||
#check_mask = True
|
||||
#assert mask.size == size
|
||||
#cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
|
||||
#if dark is not None:
|
||||
#do_dark = True
|
||||
#assert dark.size == size
|
||||
#cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float64)
|
||||
#if flat is not None:
|
||||
#do_flat = True
|
||||
#assert flat.size == size
|
||||
#cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float64)
|
||||
#if polarization is not None:
|
||||
#do_polarization = True
|
||||
#assert polarization.size == size
|
||||
#cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float64)
|
||||
#if solidangle is not None:
|
||||
#do_solidangle = True
|
||||
#assert solidangle.size == size
|
||||
#csolidangle = numpy.ascontiguousarray(solidangle.ravel(), dtype=numpy.float64)
|
||||
|
||||
#with nogil:
|
||||
#for idx in range(size):
|
||||
|
||||
#if (check_mask) and (cmask[idx]):
|
||||
#continue
|
||||
|
||||
#data = cdata[idx]
|
||||
#if check_dummy and ( (cddummy==0.0 and data==cdummy) or (cddummy!=0.0 and fabs(data-cdummy)<=cddummy)):
|
||||
#continue
|
||||
|
||||
#a0 = cpos[idx, 0, 0]
|
||||
#a1 = cpos[idx, 0, 1]
|
||||
#b0 = cpos[idx, 1, 0]
|
||||
#b1 = cpos[idx, 1, 1]
|
||||
#c0 = cpos[idx, 2, 0]
|
||||
#c1 = cpos[idx, 2, 1]
|
||||
#d0 = cpos[idx, 3, 0]
|
||||
#d1 = cpos[idx, 3, 1]
|
||||
|
||||
#min0 = min4f(a0, b0, c0, d0)
|
||||
#max0 = max4f(a0, b0, c0, d0)
|
||||
#min1 = min4f(a1, b1, c1, d1)
|
||||
#max1 = max4f(a1, b1, c1, d1)
|
||||
|
||||
#if (max0<pos0_min) or (min0 > pos0_maxin) or (max1<pos1_min) or (min1 > pos1_maxin):
|
||||
#continue
|
||||
|
||||
#if do_dark:
|
||||
#data -= cdark[idx]
|
||||
#if do_flat:
|
||||
#data /= cflat[idx]
|
||||
#if do_polarization:
|
||||
#data /= cpolarization[idx]
|
||||
#if do_solidangle:
|
||||
#data /= csolidangle[idx]
|
||||
|
||||
|
||||
#if min0 < pos0_min:
|
||||
#data = data * (pos0_min - min0) / (max0 - min0)
|
||||
#min0 = pos0_min
|
||||
#if min1 < pos1_min:
|
||||
#data = data * (pos1_min - min1) / (max1 - min1)
|
||||
#min1 = pos1_min
|
||||
#if max0 > pos0_maxin:
|
||||
#data = data * (max0 - pos0_maxin) / (max0 - min0)
|
||||
#max0 = pos0_maxin
|
||||
#if max1 > pos1_maxin:
|
||||
#data = data * (max1 - pos1_maxin) / (max1 - min1)
|
||||
#max1 = pos1_maxin
|
||||
|
||||
### treat data for pixel on chi discontinuity
|
||||
#if ((max1 - min1) / dpos1) > (bins1 / 2.0):
|
||||
#if pos1_maxin - max1 > min1 - pos1_min:
|
||||
#min1 = max1
|
||||
#max1 = pos1_maxin
|
||||
#else:
|
||||
#max1 = min1
|
||||
#min1 = pos1_min
|
||||
|
||||
#fbin0_min = getBinNr(min0, pos0_min, dpos0)
|
||||
#fbin0_max = getBinNr(max0, pos0_min, dpos0)
|
||||
#fbin1_min = getBinNr(min1, pos1_min, dpos1)
|
||||
#fbin1_max = getBinNr(max1, pos1_min, dpos1)
|
||||
|
||||
#bin0_min = < size_t > fbin0_min
|
||||
#bin0_max = < size_t > fbin0_max
|
||||
#bin1_min = < size_t > fbin1_min
|
||||
#bin1_max = < size_t > fbin1_max
|
||||
|
||||
|
||||
#if bin0_min == bin0_max:
|
||||
#if bin1_min == bin1_max:
|
||||
##All pixel is within a single bin
|
||||
#outCount[bin0_min, bin1_min] += 1.0
|
||||
#outData[bin0_min, bin1_min] += data
|
||||
#else:
|
||||
##spread on more than 2 bins
|
||||
#areaPixel = fbin1_max - fbin1_min
|
||||
#deltaD = (< double > (bin1_min + 1)) - fbin1_min
|
||||
#deltaU = fbin1_max - (< double > bin1_max)
|
||||
#deltaA = 1.0 / areaPixel
|
||||
|
||||
#outCount[bin0_min, bin1_min] += deltaA * deltaD
|
||||
#outData[bin0_min, bin1_min] += data * deltaA * deltaD
|
||||
|
||||
#outCount[bin0_min, bin1_max] += deltaA * deltaU
|
||||
#outData[bin0_min, bin1_max] += data * deltaA * deltaU
|
||||
## if bin1_min +1< bin1_max:
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#outCount[bin0_min, j] += deltaA
|
||||
#outData[bin0_min, j] += data * deltaA
|
||||
|
||||
#else: #spread on more than 2 bins in dim 0
|
||||
#if bin1_min == bin1_max:
|
||||
##All pixel fall on 1 bins in dim 1
|
||||
#areaPixel = fbin0_max - fbin0_min
|
||||
#deltaL = (< double > (bin0_min + 1)) - fbin0_min
|
||||
#deltaA = deltaL / areaPixel
|
||||
#outCount[bin0_min, bin1_min] += deltaA
|
||||
#outData[bin0_min, bin1_min] += data * deltaA
|
||||
#deltaR = fbin0_max - (< double > bin0_max)
|
||||
#deltaA = deltaR / areaPixel
|
||||
#outCount[bin0_max, bin1_min] += deltaA
|
||||
#outData[bin0_max, bin1_min] += data * deltaA
|
||||
#deltaA = 1.0 / areaPixel
|
||||
#for i in range(bin0_min + 1, bin0_max):
|
||||
#outCount[i, bin1_min] += deltaA
|
||||
#outData[i, bin1_min] += data * deltaA
|
||||
#else:
|
||||
##spread on n pix in dim0 and m pixel in dim1:
|
||||
#areaPixel = (fbin0_max - fbin0_min) * (fbin1_max - fbin1_min)
|
||||
#deltaL = (< double > (bin0_min + 1.0)) - fbin0_min
|
||||
#deltaR = fbin0_max - (< double > bin0_max)
|
||||
#deltaD = (< double > (bin1_min + 1.0)) - fbin1_min
|
||||
#deltaU = fbin1_max - (< double > bin1_max)
|
||||
#deltaA = 1.0 / areaPixel
|
||||
|
||||
#outCount[bin0_min, bin1_min] += deltaA * deltaL * deltaD
|
||||
#outData[bin0_min, bin1_min] += data * deltaA * deltaL * deltaD
|
||||
|
||||
#outCount[bin0_min, bin1_max] += deltaA * deltaL * deltaU
|
||||
#outData[bin0_min, bin1_max] += data * deltaA * deltaL * deltaU
|
||||
|
||||
#outCount[bin0_max, bin1_min] += deltaA * deltaR * deltaD
|
||||
#outData[bin0_max, bin1_min] += data * deltaA * deltaR * deltaD
|
||||
|
||||
#outCount[bin0_max, bin1_max] += deltaA * deltaR * deltaU
|
||||
#outData[bin0_max, bin1_max] += data * deltaA * deltaR * deltaU
|
||||
#for i in range(bin0_min + 1, bin0_max):
|
||||
#outCount[i, bin1_min] += deltaA * deltaD
|
||||
#outData[i, bin1_min] += data * deltaA * deltaD
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#outCount[i, j] += deltaA
|
||||
#outData[i, j] += data * deltaA
|
||||
#outCount[i, bin1_max] += deltaA * deltaU
|
||||
#outData[i, bin1_max] += data * deltaA * deltaU
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#outCount[bin0_min, j] += deltaA * deltaL
|
||||
#outData[bin0_min, j] += data * deltaA * deltaL
|
||||
|
||||
#outCount[bin0_max, j] += deltaA * deltaR
|
||||
#outData[bin0_max, j] += data * deltaA * deltaR
|
||||
|
||||
##with nogil:
|
||||
#for i in range(bins0):
|
||||
#for j in range(bins1):
|
||||
#if outCount[i, j] > epsilon:
|
||||
#outMerge[i, j] = outData[i, j] / outCount[i, j]
|
||||
#else:
|
||||
#outMerge[i, j] = cdummy
|
||||
#return outMerge.T, edges0, edges1, outData.T, outCount.T
|
||||
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,974 @@
|
|||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Project: Azimuthal integration
|
||||
# https://github.com/kif/pyFAI
|
||||
#
|
||||
# Copyright (C) European Synchrotron Radiation Facility, Grenoble, France
|
||||
#
|
||||
# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
|
||||
#
|
||||
# This program is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
#
|
||||
|
||||
#
|
||||
import cython
|
||||
import os, sys
|
||||
from cython.parallel import prange
|
||||
from libc.string cimport memset
|
||||
import numpy
|
||||
cimport numpy
|
||||
from libc.math cimport fabs, M_PI, floor
|
||||
from libc.stdio cimport printf
|
||||
|
||||
cdef float pi = <float> M_PI
|
||||
cdef float onef = <float> 1.0
|
||||
try:
|
||||
from fastcrc import crc32
|
||||
except:
|
||||
from zlib import crc32
|
||||
EPS32 = (1.0 + numpy.finfo(numpy.float32).eps)
|
||||
|
||||
|
||||
|
||||
cdef struct Function:
|
||||
double slope
|
||||
double intersect
|
||||
|
||||
cdef double area4(double a0, double a1, double b0, double b1, double c0, double c1, double d0, double d1) nogil:
|
||||
"""
|
||||
Calculate the area of the ABCD quadrilataire with corners:
|
||||
A(a0,a1)
|
||||
B(b0,b1)
|
||||
C(c0,c1)
|
||||
D(d0,d1)
|
||||
@return: area, i.e. 1/2 * (AC ^ BD)
|
||||
"""
|
||||
return 0.5 * fabs(((c0 - a0) * (d1 - b1)) - ((c1 - a1) * (d0 - b0)))
|
||||
|
||||
@cython.cdivision(True)
|
||||
cdef inline double getBinNr( double x0, double pos0_min, double delta) nogil:
|
||||
"""
|
||||
calculate the bin number for any point
|
||||
param x0: current position
|
||||
param pos0_min: position minimum
|
||||
param delta: bin width
|
||||
"""
|
||||
return (x0 - pos0_min) / delta
|
||||
|
||||
cdef double integrate( double A0, double B0, Function AB) nogil:
|
||||
"""
|
||||
integrates the line defined by AB, from A0 to B0
|
||||
param A0: first limit
|
||||
param B0: second limit
|
||||
param AB: struct with the slope and point of intersection of the line
|
||||
"""
|
||||
if A0==B0:
|
||||
return 0.0
|
||||
else:
|
||||
return AB.slope*(B0*B0 - A0*A0)*0.5 + AB.intersect*(B0-A0)
|
||||
|
||||
|
||||
class HistoLUT1dFullSplit(object):
|
||||
"""
|
||||
Now uses CSR (Compressed Sparse raw) with main attributes:
|
||||
* nnz: number of non zero elements
|
||||
* data: coefficient of the matrix in a 1D vector of float32
|
||||
* indices: Column index position for the data (same size as
|
||||
* indptr: row pointer indicates the start of a given row. len nrow+1
|
||||
|
||||
Nota: nnz = indptr[-1]
|
||||
"""
|
||||
@cython.boundscheck(False)
|
||||
def __init__(self,
|
||||
numpy.ndarray pos not None,
|
||||
int bins=100,
|
||||
pos0Range=None,
|
||||
pos1Range=None,
|
||||
mask=None,
|
||||
mask_checksum=None,
|
||||
allow_pos0_neg=False,
|
||||
unit="undefined"):
|
||||
"""
|
||||
@param pos: 3D or 4D array with the coordinates of each pixel point
|
||||
@param bins: number of output bins, 100 by default
|
||||
@param pos0Range: minimum and maximum of the 2th range
|
||||
@param pos1Range: minimum and maximum of the chi range
|
||||
@param mask: array (of int8) with masked pixels with 1 (0=not masked)
|
||||
@param allow_pos0_neg: enforce the q<0 is usually not possible
|
||||
@param unit: can be 2th_deg or r_nm^-1 ...
|
||||
"""
|
||||
|
||||
# self.padding = int(padding)
|
||||
if pos.ndim>3: #create a view
|
||||
pos = pos.reshape((-1,4,2))
|
||||
assert pos.shape[1] == 4
|
||||
assert pos.shape[2] == 2
|
||||
assert pos.ndim == 3
|
||||
self.pos = pos
|
||||
self.size = pos.shape[0]
|
||||
self.bins = bins
|
||||
self.lut_size = 0
|
||||
self.allow_pos0_neg = allow_pos0_neg
|
||||
if mask is not None:
|
||||
assert mask.size == self.size
|
||||
self.check_mask = True
|
||||
self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
|
||||
if mask_checksum:
|
||||
self.mask_checksum = mask_checksum
|
||||
else:
|
||||
self.mask_checksum = crc32(mask)
|
||||
else:
|
||||
self.check_mask = False
|
||||
self.mask_checksum = None
|
||||
self.data = self.nnz = self.indices = self.indptr = None
|
||||
self.pos0Range = pos0Range
|
||||
self.pos1Range = pos1Range
|
||||
|
||||
self.calc_lut()
|
||||
self.outPos = numpy.linspace(self.pos0_min+0.5*self.delta, self.pos0_maxin-0.5*self.delta, self.bins)
|
||||
self.lut_checksum = crc32(self.data)
|
||||
self.unit=unit
|
||||
self.lut=(self.data,self.indices,self.indptr)
|
||||
self.lut_nbytes = sum([i.nbytes for i in self.lut])
|
||||
|
||||
|
||||
@cython.cdivision(True)
|
||||
@cython.boundscheck(False)
|
||||
@cython.wraparound(False)
|
||||
def calc_lut(self):
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 3] cpos = numpy.ascontiguousarray(self.pos,dtype=numpy.float64)
|
||||
cdef numpy.int8_t[:] cmask
|
||||
cdef numpy.ndarray[numpy.int32_t, ndim = 1] outMax = numpy.zeros(self.bins, dtype=numpy.int32)
|
||||
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros(self.bins+1, dtype=numpy.int32)
|
||||
cdef double pos0_min=0, pos0_max=0, pos0_maxin=0, pos1_min=0, pos1_max=0, pos1_maxin=0
|
||||
cdef double max0, min0
|
||||
cdef double areaPixel=0, delta=0
|
||||
cdef double A0=0, B0=0, C0=0, D0=0, A1=0, B1=0, C1=0, D1=0
|
||||
cdef double A_lim=0, B_lim=0, C_lim=0, D_lim=0
|
||||
cdef double oneOverArea=0, partialArea=0, tmp=0
|
||||
cdef Function AB, BC, CD, DA
|
||||
cdef int bins, i=0, idx=0, bin=0, bin0_max=0, bin0_min=0, pixel_bins=0, k=0, size=0
|
||||
cdef bint check_pos1=False, check_mask=False
|
||||
|
||||
bins = self.bins
|
||||
if self.pos0Range is not None and len(self.pos0Range) > 1:
|
||||
self.pos0_min = min(self.pos0Range)
|
||||
self.pos0_maxin = max(self.pos0Range)
|
||||
else:
|
||||
self.pos0_min = self.pos[:, :, 0].min()
|
||||
self.pos0_maxin = self.pos[:, :, 0].max()
|
||||
self.pos0_max = self.pos0_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
if self.pos1Range is not None and len(self.pos1Range) > 1:
|
||||
self.pos1_min = min(self.pos1Range)
|
||||
self.pos1_maxin = max(self.pos1Range)
|
||||
self.check_pos1 = True
|
||||
else:
|
||||
self.pos1_min = self.pos[:, :, 1].min()
|
||||
self.pos1_maxin = self.pos[:, :, 1].max()
|
||||
self.pos1_max = self.pos1_maxin * (1 + numpy.finfo(numpy.float32).eps)
|
||||
|
||||
self.delta = (self.pos0_max - self.pos0_min) / (< double > (bins))
|
||||
|
||||
pos0_min = self.pos0_min
|
||||
pos0_max = self.pos0_max
|
||||
pos1_min = self.pos1_min
|
||||
pos1_max = self.pos1_max
|
||||
delta = self.delta
|
||||
|
||||
size = self.size
|
||||
check_mask = self.check_mask
|
||||
if check_mask:
|
||||
cmask = self.cmask
|
||||
|
||||
with nogil:
|
||||
for idx in range(size):
|
||||
|
||||
if (check_mask) and (cmask[idx]):
|
||||
continue
|
||||
|
||||
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
|
||||
A1 = < double > cpos[idx, 0, 1]
|
||||
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
|
||||
B1 = < double > cpos[idx, 1, 1]
|
||||
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
|
||||
C1 = < double > cpos[idx, 2, 1]
|
||||
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
|
||||
D1 = < double > cpos[idx, 3, 1]
|
||||
|
||||
min0 = min(A0, B0, C0, D0)
|
||||
max0 = max(A0, B0, C0, D0)
|
||||
if (max0<0) or (min0 >=bins):
|
||||
continue
|
||||
if check_pos1:
|
||||
if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
|
||||
continue
|
||||
|
||||
bin0_min = < int > floor(min0)
|
||||
bin0_max = < int > floor(max0)
|
||||
|
||||
for bin in range(bin0_min, bin0_max+1):
|
||||
outMax[bin] += 1
|
||||
|
||||
indptr[1:] = outMax.cumsum()
|
||||
self.indptr = indptr
|
||||
|
||||
cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices = numpy.zeros(indptr[bins], dtype=numpy.int32)
|
||||
cdef numpy.ndarray[numpy.float32_t, ndim = 1] data = numpy.zeros(indptr[bins], dtype=numpy.float32)
|
||||
|
||||
#just recycle the outMax array
|
||||
memset(&outMax[0], 0, bins * sizeof(numpy.int32_t))
|
||||
|
||||
with nogil:
|
||||
for idx in range(size):
|
||||
|
||||
if (check_mask) and (cmask[idx]):
|
||||
continue
|
||||
|
||||
A0 = getBinNr(< double > cpos[idx, 0, 0], pos0_min, delta)
|
||||
A1 = < double > cpos[idx, 0, 1]
|
||||
B0 = getBinNr(< double > cpos[idx, 1, 0], pos0_min, delta)
|
||||
B1 = < double > cpos[idx, 1, 1]
|
||||
C0 = getBinNr(< double > cpos[idx, 2, 0], pos0_min, delta)
|
||||
C1 = < double > cpos[idx, 2, 1]
|
||||
D0 = getBinNr(< double > cpos[idx, 3, 0], pos0_min, delta)
|
||||
D1 = < double > cpos[idx, 3, 1]
|
||||
|
||||
min0 = min(A0, B0, C0, D0)
|
||||
max0 = max(A0, B0, C0, D0)
|
||||
if (max0<0) or (min0 >=bins):
|
||||
continue
|
||||
if check_pos1:
|
||||
if (max(A1, B1, C1, D1) < pos1_min) or (min(A1, B1, C1, D1) > pos1_maxin):
|
||||
continue
|
||||
|
||||
bin0_min = < int > floor(min0)
|
||||
bin0_max = < int > floor(max0)
|
||||
|
||||
if bin0_min == bin0_max:
|
||||
#All pixel is within a single bin
|
||||
k = outMax[bin0_min]
|
||||
indices[indptr[bin0_min]+k] = idx
|
||||
data[indptr[bin0_min]+k] = 1.0
|
||||
outMax[bin0_min] += 1 #k+1
|
||||
else: #else we have pixel spliting.
|
||||
AB.slope=(B1-A1)/(B0-A0)
|
||||
AB.intersect= A1 - AB.slope*A0
|
||||
BC.slope=(C1-B1)/(C0-B0)
|
||||
BC.intersect= B1 - BC.slope*B0
|
||||
CD.slope=(D1-C1)/(D0-C0)
|
||||
CD.intersect= C1 - CD.slope*C0
|
||||
DA.slope=(A1-D1)/(A0-D0)
|
||||
DA.intersect= D1 - DA.slope*D0
|
||||
areaPixel = area4(A0, A1, B0, B1, C0, C1, D0, D1)
|
||||
oneOverPixelArea = 1.0 / areaPixel
|
||||
partialArea2 = 0.0
|
||||
for bin in range(bin0_min, bin0_max+1):
|
||||
A_lim = (A0<=bin)*(A0<=(bin+1))*bin + (A0>bin)*(A0<=(bin+1))*A0 + (A0>bin)*(A0>(bin+1))*(bin+1)
|
||||
B_lim = (B0<=bin)*(B0<=(bin+1))*bin + (B0>bin)*(B0<=(bin+1))*B0 + (B0>bin)*(B0>(bin+1))*(bin+1)
|
||||
C_lim = (C0<=bin)*(C0<=(bin+1))*bin + (C0>bin)*(C0<=(bin+1))*C0 + (C0>bin)*(C0>(bin+1))*(bin+1)
|
||||
D_lim = (D0<=bin)*(D0<=(bin+1))*bin + (D0>bin)*(D0<=(bin+1))*D0 + (D0>bin)*(D0>(bin+1))*(bin+1)
|
||||
partialArea = integrate(A_lim, B_lim, AB)
|
||||
partialArea += integrate(B_lim, C_lim, BC)
|
||||
partialArea += integrate(C_lim, D_lim, CD)
|
||||
partialArea += integrate(D_lim, A_lim, DA)
|
||||
tmp = fabs(partialArea) * oneOverPixelArea
|
||||
k = outMax[bin]
|
||||
indices[indptr[bin]+k] = idx
|
||||
data[indptr[bin]+k] = tmp
|
||||
outMax[bin] += 1 #k+1
|
||||
self.data = data
|
||||
self.indices = indices
|
||||
|
||||
@cython.cdivision(True)
|
||||
@cython.boundscheck(False)
|
||||
@cython.wraparound(False)
|
||||
def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
|
||||
"""
|
||||
Actually perform the integration which in this case looks more like a matrix-vector product
|
||||
|
||||
@param weights: input image
|
||||
@type weights: ndarray
|
||||
@param dummy: value for dead pixels (optional)
|
||||
@type dummy: float
|
||||
@param delta_dummy: precision for dead-pixel value in dynamic masking
|
||||
@type delta_dummy: float
|
||||
@param dark: array with the dark-current value to be subtracted (if any)
|
||||
@type dark: ndarray
|
||||
@param flat: array with the dark-current value to be divided by (if any)
|
||||
@type flat: ndarray
|
||||
@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
|
||||
@type solidAngle: ndarray
|
||||
@param polarization: array with the polarization correction values to be divided by (if any)
|
||||
@type polarization: ndarray
|
||||
@return : positions, pattern, weighted_histogram and unweighted_histogram
|
||||
@rtype: 4-tuple of ndarrays
|
||||
|
||||
"""
|
||||
cdef numpy.int32_t i=0, j=0, idx=0, bins=self.bins, size=self.size
|
||||
cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
|
||||
cdef float data=0, coef=0, cdummy=0, cddummy=0
|
||||
cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData = numpy.zeros(self.bins, dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
|
||||
cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
|
||||
cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
|
||||
|
||||
cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
|
||||
assert size == weights.size
|
||||
|
||||
if dummy is not None:
|
||||
do_dummy = True
|
||||
cdummy = <float>float(dummy)
|
||||
if delta_dummy is None:
|
||||
cddummy = <float>0.0
|
||||
else:
|
||||
cddummy = <float>float(delta_dummy)
|
||||
|
||||
if flat is not None:
|
||||
do_flat = True
|
||||
assert flat.size == size
|
||||
cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
|
||||
if dark is not None:
|
||||
do_dark = True
|
||||
assert dark.size == size
|
||||
cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
|
||||
if solidAngle is not None:
|
||||
do_solidAngle = True
|
||||
assert solidAngle.size == size
|
||||
csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
|
||||
if polarization is not None:
|
||||
do_polarization = True
|
||||
assert polarization.size == size
|
||||
cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
|
||||
|
||||
if (do_dark + do_flat + do_polarization + do_solidAngle):
|
||||
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
cdata = numpy.zeros(size,dtype=numpy.float32)
|
||||
if do_dummy:
|
||||
for i in prange(size, nogil=True, schedule="static"):
|
||||
data = tdata[i]
|
||||
if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
|
||||
#Nota: -= and /= operatore are seen as reduction in cython parallel.
|
||||
if do_dark:
|
||||
data = data - cdark[i]
|
||||
if do_flat:
|
||||
data = data / cflat[i]
|
||||
if do_polarization:
|
||||
data = data / cpolarization[i]
|
||||
if do_solidAngle:
|
||||
data = data / csolidAngle[i]
|
||||
cdata[i]+=data
|
||||
else: #set all dummy_like values to cdummy. simplifies further processing
|
||||
cdata[i]+=cdummy
|
||||
else:
|
||||
for i in prange(size, nogil=True, schedule="static"):
|
||||
data = tdata[i]
|
||||
if do_dark:
|
||||
data = data - cdark[i]
|
||||
if do_flat:
|
||||
data = data / cflat[i]
|
||||
if do_polarization:
|
||||
data = data / cpolarization[i]
|
||||
if do_solidAngle:
|
||||
data = data / csolidAngle[i]
|
||||
cdata[i]+=data
|
||||
else:
|
||||
if do_dummy:
|
||||
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
cdata = numpy.zeros(size,dtype=numpy.float32)
|
||||
for i in prange(size, nogil=True, schedule="static"):
|
||||
data = tdata[i]
|
||||
if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
|
||||
cdata[i]+=data
|
||||
else:
|
||||
cdata[i]+=cdummy
|
||||
else:
|
||||
cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
|
||||
for i in prange(bins, nogil=True, schedule="guided"):
|
||||
sum_data = 0.0
|
||||
sum_count = 0.0
|
||||
for j in range(indptr[i],indptr[i+1]):
|
||||
idx = indices[j]
|
||||
coef = ccoef[j]
|
||||
if coef == 0.0:
|
||||
continue
|
||||
data = cdata[idx]
|
||||
if do_dummy and data==cdummy:
|
||||
continue
|
||||
sum_data = sum_data + coef * data
|
||||
sum_count = sum_count + coef
|
||||
outData[i] += sum_data
|
||||
outCount[i] += sum_count
|
||||
if sum_count > epsilon:
|
||||
outMerge[i] += sum_data / sum_count
|
||||
else:
|
||||
outMerge[i] += cdummy
|
||||
return self.outPos, outMerge, outData, outCount
|
||||
|
||||
|
||||
|
||||
################################################################################
|
||||
# Bidimensionnal regrouping
|
||||
################################################################################
|
||||
|
||||
#class HistoBBox2d(object):
|
||||
#@cython.boundscheck(False)
|
||||
#def __init__(self,
|
||||
#pos0,
|
||||
#delta_pos0,
|
||||
#pos1,
|
||||
#delta_pos1,
|
||||
#bins=(100,36),
|
||||
#pos0Range=None,
|
||||
#pos1Range=None,
|
||||
#mask=None,
|
||||
#mask_checksum=None,
|
||||
#allow_pos0_neg=False,
|
||||
#unit="undefined",
|
||||
#chiDiscAtPi=True
|
||||
#):
|
||||
#"""
|
||||
#@param pos0: 1D array with pos0: tth or q_vect
|
||||
#@param delta_pos0: 1D array with delta pos0: max center-corner distance
|
||||
#@param pos1: 1D array with pos1: chi
|
||||
#@param delta_pos1: 1D array with max pos1: max center-corner distance, unused !
|
||||
#@param bins: number of output bins (tth=100, chi=36 by default)
|
||||
#@param pos0Range: minimum and maximum of the 2th range
|
||||
#@param pos1Range: minimum and maximum of the chi range
|
||||
#@param mask: array (of int8) with masked pixels with 1 (0=not masked)
|
||||
#@param allow_pos0_neg: enforce the q<0 is usually not possible
|
||||
#@param chiDiscAtPi: boolean; by default the chi_range is in the range ]-pi,pi[ set to 0 to have the range ]0,2pi[
|
||||
#"""
|
||||
#cdef int i, size, bin0, bin1
|
||||
#self.size = pos0.size
|
||||
#assert delta_pos0.size == self.size
|
||||
#assert pos1.size == self.size
|
||||
#assert delta_pos1.size == self.size
|
||||
#self.chiDiscAtPi = 1 if chiDiscAtPi else 0
|
||||
#self.allow_pos0_neg = allow_pos0_neg
|
||||
|
||||
#try:
|
||||
#bins0, bins1 = tuple(bins)
|
||||
#except:
|
||||
#bins0 = bins1 = bins
|
||||
#if bins0 <= 0:
|
||||
#bins0 = 1
|
||||
#if bins1 <= 0:
|
||||
#bins1 = 1
|
||||
#self.bins = (int(bins0),int(bins1))
|
||||
#self.lut_size = 0
|
||||
#if mask is not None:
|
||||
#assert mask.size == self.size
|
||||
#self.check_mask = True
|
||||
#self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
|
||||
#if mask_checksum:
|
||||
#self.mask_checksum = mask_checksum
|
||||
#else:
|
||||
#self.mask_checksum = crc32(mask)
|
||||
#else:
|
||||
#self.check_mask = False
|
||||
#self.mask_checksum = None
|
||||
|
||||
#self.data = self.nnz = self.indices = self.indptr = None
|
||||
#self.cpos0 = numpy.ascontiguousarray(pos0.ravel(), dtype=numpy.float32)
|
||||
#self.dpos0 = numpy.ascontiguousarray(delta_pos0.ravel(), dtype=numpy.float32)
|
||||
#self.cpos0_sup = numpy.empty_like(self.cpos0)
|
||||
#self.cpos0_inf = numpy.empty_like(self.cpos0)
|
||||
#self.pos0Range = pos0Range
|
||||
#self.pos1Range = pos1Range
|
||||
|
||||
#self.cpos1 = numpy.ascontiguousarray((pos1).ravel(), dtype=numpy.float32)
|
||||
#self.dpos1 = numpy.ascontiguousarray((delta_pos1).ravel(), dtype=numpy.float32)
|
||||
#self.cpos1_sup = numpy.empty_like(self.cpos1)
|
||||
#self.cpos1_inf = numpy.empty_like(self.cpos1)
|
||||
#self.calc_boundaries(pos0Range, pos1Range)
|
||||
#self.delta0 = (self.pos0_max - self.pos0_min) / float(bins0)
|
||||
#self.delta1 = (self.pos1_max - self.pos1_min) / float(bins1)
|
||||
#self.lut_max_idx = self.calc_lut()
|
||||
#self.outPos0 = numpy.linspace(self.pos0_min+0.5*self.delta0, self.pos0_maxin-0.5*self.delta0, bins0)
|
||||
#self.outPos1 = numpy.linspace(self.pos1_min+0.5*self.delta1, self.pos1_maxin-0.5*self.delta1, bins1)
|
||||
#self.unit=unit
|
||||
#self.lut=(self.data,self.indices,self.indptr)
|
||||
#self.lut_checksum = crc32(self.data)
|
||||
|
||||
#@cython.boundscheck(False)
|
||||
#@cython.wraparound(False)
|
||||
#def calc_boundaries(self, pos0Range, pos1Range):
|
||||
#cdef int size = self.cpos0.size
|
||||
#cdef bint check_mask = self.check_mask
|
||||
#cdef numpy.int8_t[:] cmask
|
||||
#cdef float[:] cpos0, dpos0, cpos0_sup, cpos0_inf
|
||||
#cdef float[:] cpos1, dpos1, cpos1_sup, cpos1_inf,
|
||||
#cdef float upper0, lower0, pos0_max, pos0_min, c0, d0
|
||||
#cdef float upper1, lower1, pos1_max, pos1_min, c1, d1
|
||||
#cdef bint allow_pos0_neg=self.allow_pos0_neg
|
||||
#cdef bint chiDiscAtPi = self.chiDiscAtPi
|
||||
|
||||
#cpos0_sup = self.cpos0_sup
|
||||
#cpos0_inf = self.cpos0_inf
|
||||
#cpos0 = self.cpos0
|
||||
#dpos0 = self.dpos0
|
||||
#cpos1_sup = self.cpos1_sup
|
||||
#cpos1_inf = self.cpos1_inf
|
||||
#cpos1 = self.cpos1
|
||||
#dpos1 = self.dpos1
|
||||
#pos0_min=cpos0[0]
|
||||
#pos0_max=cpos0[0]
|
||||
#pos1_min=cpos1[0]
|
||||
#pos1_max=cpos1[0]
|
||||
|
||||
#if check_mask:
|
||||
#cmask = self.cmask
|
||||
#with nogil:
|
||||
#for idx in range(size):
|
||||
#c0 = cpos0[idx]
|
||||
#d0 = dpos0[idx]
|
||||
#lower0 = c0 - d0
|
||||
#upper0 = c0 + d0
|
||||
#c1 = cpos1[idx]
|
||||
#d1 = dpos1[idx]
|
||||
#lower1 = c1 - d1
|
||||
#upper1 = c1 + d1
|
||||
#if not allow_pos0_neg and lower0<0:
|
||||
#lower0=0
|
||||
#if upper1 > (2-chiDiscAtPi)*pi:
|
||||
#upper1 = (2-chiDiscAtPi)*pi
|
||||
#if lower1 < (-chiDiscAtPi)*pi:
|
||||
#lower1 = (-chiDiscAtPi)*pi
|
||||
#cpos0_sup[idx] = upper0
|
||||
#cpos0_inf[idx] = lower0
|
||||
#cpos1_sup[idx] = upper1
|
||||
#cpos1_inf[idx] = lower1
|
||||
#if not (check_mask and cmask[idx]):
|
||||
#if upper0>pos0_max:
|
||||
#pos0_max = upper0
|
||||
#if lower0<pos0_min:
|
||||
#pos0_min = lower0
|
||||
#if upper1>pos1_max:
|
||||
#pos1_max = upper1
|
||||
#if lower1<pos1_min:
|
||||
#pos1_min = lower1
|
||||
|
||||
#if pos0Range is not None and len(pos0Range) > 1:
|
||||
#self.pos0_min = min(pos0Range)
|
||||
#self.pos0_maxin = max(pos0Range)
|
||||
#else:
|
||||
#self.pos0_min = pos0_min
|
||||
#self.pos0_maxin = pos0_max
|
||||
|
||||
|
||||
#if pos1Range is not None and len(pos1Range) > 1:
|
||||
#self.pos1_min = min(pos1Range)
|
||||
#self.pos1_maxin = max(pos1Range)
|
||||
#else:
|
||||
#self.pos1_min = pos1_min
|
||||
#self.pos1_maxin = pos1_max
|
||||
|
||||
#if (not allow_pos0_neg) and self.pos0_min < 0:
|
||||
#self.pos0_min = 0
|
||||
#self.pos0_max = self.pos0_maxin * EPS32
|
||||
#self.cpos0_sup = cpos0_sup
|
||||
#self.cpos0_inf = cpos0_inf
|
||||
#self.pos1_max = self.pos1_maxin * EPS32
|
||||
#self.cpos1_sup = cpos1_sup
|
||||
#self.cpos1_inf = cpos1_inf
|
||||
|
||||
|
||||
#@cython.boundscheck(False)
|
||||
#@cython.wraparound(False)
|
||||
#@cython.cdivision(True)
|
||||
#def calc_lut(self):
|
||||
#'calculate the max number of elements in the LUT and populate it'
|
||||
#cdef float delta0=self.delta0, pos0_min=self.pos0_min, min0, max0, fbin0_min, fbin0_max
|
||||
#cdef float delta1=self.delta1, pos1_min=self.pos1_min, min1, max1, fbin1_min, fbin1_max
|
||||
#cdef int bin0_min, bin0_max, bins0 = self.bins[0]
|
||||
#cdef int bin1_min, bin1_max, bins1 = self.bins[1]
|
||||
#cdef numpy.int32_t k, idx, lut_size, i, j, size=self.size
|
||||
#cdef bint check_mask
|
||||
#cdef float[:] cpos0_sup = self.cpos0_sup
|
||||
#cdef float[:] cpos0_inf = self.cpos0_inf
|
||||
#cdef float[:] cpos1_inf = self.cpos1_inf
|
||||
#cdef float[:] cpos1_sup = self.cpos1_sup
|
||||
#cdef numpy.ndarray[numpy.int32_t, ndim = 2] outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
|
||||
#cdef numpy.ndarray[numpy.int32_t, ndim = 1] indptr = numpy.zeros((bins0*bins1)+1, dtype=numpy.int32)
|
||||
#cdef numpy.ndarray[numpy.int32_t, ndim = 1] indices
|
||||
#cdef numpy.ndarray[numpy.float32_t, ndim = 1] data
|
||||
#cdef numpy.int8_t[:] cmask
|
||||
#if self.check_mask:
|
||||
#cmask = self.cmask
|
||||
#check_mask = True
|
||||
#else:
|
||||
#check_mask = False
|
||||
|
||||
##NOGIL
|
||||
#with nogil:
|
||||
#for idx in range(size):
|
||||
#if (check_mask) and (cmask[idx]):
|
||||
#continue
|
||||
|
||||
#min0 = cpos0_inf[idx]
|
||||
#max0 = cpos0_sup[idx]
|
||||
#min1 = cpos1_inf[idx]
|
||||
#max1 = cpos1_sup[idx]
|
||||
|
||||
#bin0_min = < int > getBinNr(min0, pos0_min, delta0)
|
||||
#bin0_max = < int > getBinNr(max0, pos0_min, delta0)
|
||||
|
||||
#bin1_min = < int > getBinNr(min1, pos1_min, delta1)
|
||||
#bin1_max = < int > getBinNr(max1, pos1_min, delta1)
|
||||
|
||||
#if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
|
||||
#continue
|
||||
|
||||
#if bin0_max >= bins0 :
|
||||
#bin0_max = bins0 - 1
|
||||
#if bin0_min < 0:
|
||||
#bin0_min = 0
|
||||
#if bin1_max >= bins1 :
|
||||
#bin1_max = bins1 - 1
|
||||
#if bin1_min < 0:
|
||||
#bin1_min = 0
|
||||
|
||||
#for i in range(bin0_min, bin0_max+1):
|
||||
#for j in range(bin1_min , bin1_max+1):
|
||||
#outMax[i, j] += 1
|
||||
|
||||
#self.nnz = outMax.sum()
|
||||
#indptr[1:] = outMax.cumsum()
|
||||
#self.indptr = indptr
|
||||
## self.lut_size = lut_size = outMax.max()
|
||||
##just recycle the outMax array
|
||||
##outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
|
||||
#memset(&outMax[0,0], 0, bins0*bins1*sizeof(numpy.int32_t))
|
||||
|
||||
#lut_nbytes = self.nnz * (sizeof(numpy.float32_t)+sizeof(numpy.int32_t)) + bins0*bins1*sizeof(numpy.int32_t)
|
||||
#if (os.name == "posix") and ("SC_PAGE_SIZE" in os.sysconf_names) and ("SC_PHYS_PAGES" in os.sysconf_names):
|
||||
#memsize = os.sysconf("SC_PAGE_SIZE")*os.sysconf("SC_PHYS_PAGES")
|
||||
#if memsize < lut_nbytes:
|
||||
#raise MemoryError("CSR Matrix is %.3fGB whereas the memory of the system is only %s"%(lut_nbytes, memsize))
|
||||
##else hope we have enough memory
|
||||
#data = numpy.zeros(self.nnz,dtype=numpy.float32)
|
||||
#indices = numpy.zeros(self.nnz,dtype=numpy.int32)
|
||||
## lut = numpy.recarray(shape=(bins0, bins1, lut_size),dtype=[("idx",numpy.int32),("coef",numpy.float32)])
|
||||
## memset(&lut[0,0,0], 0, lut_nbytes)
|
||||
#with nogil:
|
||||
#for idx in range(size):
|
||||
#if (check_mask) and cmask[idx]:
|
||||
#continue
|
||||
|
||||
#min0 = cpos0_inf[idx]
|
||||
#max0 = cpos0_sup[idx]
|
||||
#min1 = cpos1_inf[idx]
|
||||
#max1 = cpos1_sup[idx]
|
||||
|
||||
#fbin0_min = getBinNr(min0, pos0_min, delta0)
|
||||
#fbin0_max = getBinNr(max0, pos0_min, delta0)
|
||||
#fbin1_min = getBinNr(min1, pos1_min, delta1)
|
||||
#fbin1_max = getBinNr(max1, pos1_min, delta1)
|
||||
|
||||
#bin0_min = <int> fbin0_min
|
||||
#bin0_max = <int> fbin0_max
|
||||
#bin1_min = <int> fbin1_min
|
||||
#bin1_max = <int> fbin1_max
|
||||
|
||||
#if (bin0_max < 0) or (bin0_min >= bins0) or (bin1_max < 0) or (bin1_min >= bins1):
|
||||
#continue
|
||||
|
||||
#if bin0_max >= bins0 :
|
||||
#bin0_max = bins0 - 1
|
||||
#if bin0_min < 0:
|
||||
#bin0_min = 0
|
||||
#if bin1_max >= bins1 :
|
||||
#bin1_max = bins1 - 1
|
||||
#if bin1_min < 0:
|
||||
#bin1_min = 0
|
||||
|
||||
#if bin0_min == bin0_max:
|
||||
#if bin1_min == bin1_max:
|
||||
##All pixel is within a single bin
|
||||
#k = outMax[bin0_min, bin1_min]
|
||||
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_min]+k] = onef
|
||||
## lut[bin0_min, bin1_min, k].idx = idx
|
||||
## lut[bin0_min, bin1_min, k].coef = onef
|
||||
#outMax[bin0_min, bin1_min]= k+1
|
||||
|
||||
#else:
|
||||
##spread on more than 2 bins
|
||||
#deltaD = (< float > (bin1_min + 1)) - fbin1_min
|
||||
#deltaU = fbin1_max - ( bin1_max)
|
||||
#deltaA = 1.0 / (fbin1_max - fbin1_min)
|
||||
|
||||
#k = outMax[bin0_min, bin1_min]
|
||||
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaD
|
||||
## lut[bin0_min, bin1_min, k].idx = idx
|
||||
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaD
|
||||
#outMax[bin0_min, bin1_min] = k + 1
|
||||
|
||||
#k = outMax[bin0_min, bin1_max]
|
||||
#indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaU
|
||||
## lut[bin0_min, bin1_max, k].idx = idx
|
||||
## lut[bin0_min, bin1_max, k].coef = deltaA * deltaU
|
||||
#outMax[bin0_min, bin1_max] = k + 1
|
||||
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#k = outMax[bin0_min, j]
|
||||
#indices[indptr[bin0_min*bins1+j]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+j]+k] = deltaA
|
||||
## lut[bin0_min, j, k].idx = idx
|
||||
## lut[bin0_min, j, k].coef = deltaA
|
||||
#outMax[bin0_min, j] = k + 1
|
||||
|
||||
#else: #spread on more than 2 bins in dim 0
|
||||
#if bin1_min == bin1_max:
|
||||
##All pixel fall on 1 bins in dim 1
|
||||
#deltaA = 1.0 / (fbin0_max - fbin0_min)
|
||||
#deltaL = (< float > (bin0_min + 1)) - fbin0_min
|
||||
|
||||
#k = outMax[bin0_min, bin1_min]
|
||||
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL
|
||||
## lut[bin0_min, bin1_min, k].idx = idx
|
||||
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaL
|
||||
#outMax[bin0_min, bin1_min] = k+1
|
||||
|
||||
#deltaR = fbin0_max - (< float > bin0_max)
|
||||
|
||||
#k = outMax[bin0_max, bin1_min]
|
||||
#indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR
|
||||
## lut[bin0_max, bin1_min, k].idx = idx
|
||||
## lut[bin0_max, bin1_min, k].coef = deltaA * deltaR
|
||||
#outMax[bin0_max, bin1_min] = k + 1
|
||||
|
||||
#for i in range(bin0_min + 1, bin0_max):
|
||||
#k = outMax[i, bin1_min]
|
||||
#indices[indptr[i*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[i*bins1+bin1_min]+k] = deltaA
|
||||
## lut[i, bin1_min ,k].idx = idx
|
||||
## lut[i, bin1_min, k].coef = deltaA
|
||||
#outMax[i, bin1_min] = k + 1
|
||||
|
||||
#else:
|
||||
##spread on n pix in dim0 and m pixel in dim1:
|
||||
#deltaL = (< float > (bin0_min + 1)) - fbin0_min
|
||||
#deltaR = fbin0_max - (< float > bin0_max)
|
||||
#deltaD = (< float > (bin1_min + 1)) - fbin1_min
|
||||
#deltaU = fbin1_max - (< float > bin1_max)
|
||||
#deltaA = 1.0 / ((fbin0_max - fbin0_min) * (fbin1_max - fbin1_min))
|
||||
|
||||
#k = outMax[bin0_min, bin1_min]
|
||||
#indices[indptr[bin0_min*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_min]+k] = deltaA * deltaL * deltaD
|
||||
## lut[bin0_min, bin1_min ,k].idx = idx
|
||||
## lut[bin0_min, bin1_min, k].coef = deltaA * deltaL * deltaD
|
||||
#outMax[bin0_min, bin1_min] = k + 1
|
||||
|
||||
#k = outMax[bin0_min, bin1_max]
|
||||
#indices[indptr[bin0_min*bins1+bin1_max]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+bin1_max]+k] = deltaA * deltaL * deltaU
|
||||
## lut[bin0_min, bin1_max, k].idx = idx
|
||||
## lut[bin0_min, bin1_max, k].coef = deltaA * deltaL * deltaU
|
||||
#outMax[bin0_min, bin1_max] = k + 1
|
||||
|
||||
#k = outMax[bin0_max, bin1_min]
|
||||
#indices[indptr[bin0_max*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[bin0_max*bins1+bin1_min]+k] = deltaA * deltaR * deltaD
|
||||
## lut[bin0_max, bin1_min, k].idx = idx
|
||||
## lut[bin0_max, bin1_min, k].coef = deltaA * deltaR * deltaD
|
||||
#outMax[bin0_max, bin1_min] = k + 1
|
||||
|
||||
#k = outMax[bin0_max, bin1_max]
|
||||
#indices[indptr[bin0_max*bins1+bin1_max]+k] = idx
|
||||
#data[indptr[bin0_max*bins1+bin1_max]+k] = deltaA * deltaR * deltaU
|
||||
## lut[bin0_max, bin1_max, k].idx = idx
|
||||
## lut[bin0_max, bin1_max, k].coef = deltaA * deltaR * deltaU
|
||||
#outMax[bin0_max, bin1_max] = k + 1
|
||||
|
||||
#for i in range(bin0_min + 1, bin0_max):
|
||||
#k = outMax[i, bin1_min]
|
||||
#indices[indptr[i*bins1+bin1_min]+k] = idx
|
||||
#data[indptr[i*bins1+bin1_min]+k] = deltaA * deltaD
|
||||
## lut[i, bin1_min, k].idx = idx
|
||||
## lut[i, bin1_min, k].coef = deltaA * deltaD
|
||||
#outMax[i, bin1_min] = k + 1
|
||||
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#k = outMax[i, j]
|
||||
#indices[indptr[i*bins1+j]+k] = idx
|
||||
#data[indptr[i*bins1+j]+k] = deltaA
|
||||
## lut[i, j, k].idx = idx
|
||||
## lut[i, j, k].coef = deltaA
|
||||
#outMax[i, j] = k + 1
|
||||
|
||||
#k = outMax[i, bin1_max]
|
||||
#indices[indptr[i*bins1+bin1_max]+k] = idx
|
||||
#data[indptr[i*bins1+bin1_max]+k] = deltaA * deltaU
|
||||
## lut[i, bin1_max, k].idx = idx
|
||||
## lut[i, bin1_max, k].coef = deltaA * deltaU
|
||||
#outMax[i, bin1_max] = k + 1
|
||||
|
||||
#for j in range(bin1_min + 1, bin1_max):
|
||||
#k = outMax[bin0_min, j]
|
||||
#indices[indptr[bin0_min*bins1+j]+k] = idx
|
||||
#data[indptr[bin0_min*bins1+j]+k] = deltaA * deltaL
|
||||
## lut[bin0_min, j, k].idx = idx
|
||||
## lut[bin0_min, j, k].coef = deltaA * deltaL
|
||||
#outMax[bin0_min, j] = k + 1
|
||||
|
||||
#k = outMax[bin0_max, j]
|
||||
#indices[indptr[bin0_max*bins1+j]+k] = idx
|
||||
#data[indptr[bin0_max*bins1+j]+k] = deltaA * deltaR
|
||||
## lut[bin0_max, j, k].idx = idx
|
||||
## lut[bin0_max, j, k].coef = deltaA * deltaR
|
||||
#outMax[bin0_max, j] = k + 1
|
||||
|
||||
#self.data = data
|
||||
#self.indices = indices
|
||||
#return outMax
|
||||
|
||||
#@cython.cdivision(True)
|
||||
#@cython.boundscheck(False)
|
||||
#@cython.wraparound(False)
|
||||
#def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
|
||||
#"""
|
||||
#Actually perform the 2D integration which in this case looks more like a matrix-vector product
|
||||
|
||||
#@param weights: input image
|
||||
#@type weights: ndarray
|
||||
#@param dummy: value for dead pixels (optional)
|
||||
#@type dummy: float
|
||||
#@param delta_dummy: precision for dead-pixel value in dynamic masking
|
||||
#@type delta_dummy: float
|
||||
#@param dark: array with the dark-current value to be subtracted (if any)
|
||||
#@type dark: ndarray
|
||||
#@param flat: array with the dark-current value to be divided by (if any)
|
||||
#@type flat: ndarray
|
||||
#@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
|
||||
#@type solidAngle: ndarray
|
||||
#@param polarization: array with the polarization correction values to be divided by (if any)
|
||||
#@type polarization: ndarray
|
||||
#@return: I(2d), edges0(1d), edges1(1d), weighted histogram(2d), unweighted histogram (2d)
|
||||
#@rtype: 5-tuple of ndarrays
|
||||
|
||||
#"""
|
||||
#cdef int i=0, j=0, idx=0, bins0=self.bins[0], bins1=self.bins[1], bins=bins0*bins1, size=self.size
|
||||
#cdef double sum_data=0.0, sum_count=0.0, epsilon=1e-10
|
||||
#cdef float data=0, coef=0, cdummy=0, cddummy=0
|
||||
#cdef bint do_dummy=False, do_dark=False, do_flat=False, do_polarization=False, do_solidAngle=False
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outData = numpy.zeros(self.bins, dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 2] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
|
||||
#cdef numpy.ndarray[numpy.float32_t, ndim = 2] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] outData_1d = outData.ravel()
|
||||
#cdef numpy.ndarray[numpy.float64_t, ndim = 1] outCount_1d = outCount.ravel()
|
||||
#cdef numpy.ndarray[numpy.float32_t, ndim = 1] outMerge_1d = outMerge.ravel()
|
||||
|
||||
#cdef float[:] ccoef = self.data, cdata, tdata, cflat, cdark, csolidAngle, cpolarization
|
||||
#cdef numpy.int32_t[:] indices = self.indices, indptr = self.indptr
|
||||
|
||||
#assert size == weights.size
|
||||
|
||||
#if dummy is not None:
|
||||
#do_dummy = True
|
||||
#cdummy = <float>float(dummy)
|
||||
#if delta_dummy is None:
|
||||
#cddummy = <float>0.0
|
||||
#else:
|
||||
#cddummy = <float>float(delta_dummy)
|
||||
|
||||
#if flat is not None:
|
||||
#do_flat = True
|
||||
#assert flat.size == size
|
||||
#cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
|
||||
#if dark is not None:
|
||||
#do_dark = True
|
||||
#assert dark.size == size
|
||||
#cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
|
||||
#if solidAngle is not None:
|
||||
#do_solidAngle = True
|
||||
#assert solidAngle.size == size
|
||||
#csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
|
||||
#if polarization is not None:
|
||||
#do_polarization = True
|
||||
#assert polarization.size == size
|
||||
#cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
|
||||
|
||||
#if (do_dark + do_flat + do_polarization + do_solidAngle):
|
||||
#tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
#cdata = numpy.zeros(size,dtype=numpy.float32)
|
||||
#if do_dummy:
|
||||
#for i in prange(size, nogil=True, schedule="static"):
|
||||
#data = tdata[i]
|
||||
#if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
|
||||
##Nota: -= and /= operatore are seen as reduction in cython parallel.
|
||||
#if do_dark:
|
||||
#data = data - cdark[i]
|
||||
#if do_flat:
|
||||
#data = data / cflat[i]
|
||||
#if do_polarization:
|
||||
#data = data / cpolarization[i]
|
||||
#if do_solidAngle:
|
||||
#data = data / csolidAngle[i]
|
||||
#cdata[i]+=data
|
||||
#else: #set all dummy_like values to cdummy. simplifies further processing
|
||||
#cdata[i]+=cdummy
|
||||
#else:
|
||||
#for i in prange(size, nogil=True, schedule="static"):
|
||||
#data = tdata[i]
|
||||
#if do_dark:
|
||||
#data = data - cdark[i]
|
||||
#if do_flat:
|
||||
#data = data / cflat[i]
|
||||
#if do_polarization:
|
||||
#data = data / cpolarization[i]
|
||||
#if do_solidAngle:
|
||||
#data = data / csolidAngle[i]
|
||||
#cdata[i]+=data
|
||||
#else:
|
||||
#if do_dummy:
|
||||
#tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
#cdata = numpy.zeros(size,dtype=numpy.float32)
|
||||
#for i in prange(size, nogil=True, schedule="static"):
|
||||
#data = tdata[i]
|
||||
#if ((cddummy!=0) and (fabs(data-cdummy) > cddummy)) or ((cddummy==0) and (data!=cdummy)):
|
||||
#cdata[i]+=data
|
||||
#else:
|
||||
#cdata[i]+=cdummy
|
||||
#else:
|
||||
#cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
|
||||
|
||||
#for i in prange(bins, nogil=True, schedule="guided"):
|
||||
#sum_data = 0.0
|
||||
#sum_count = 0.0
|
||||
#for j in range(indptr[i],indptr[i+1]):
|
||||
#idx = indices[j]
|
||||
#coef = ccoef[j]
|
||||
#data = cdata[idx]
|
||||
#if do_dummy and data==cdummy:
|
||||
#continue
|
||||
|
||||
#sum_data = sum_data + coef * data
|
||||
#sum_count = sum_count + coef
|
||||
#outData_1d[i] += sum_data
|
||||
#outCount_1d[i] += sum_count
|
||||
#if sum_count > epsilon:
|
||||
#outMerge_1d[i] += sum_data / sum_count
|
||||
#else:
|
||||
#outMerge_1d[i] += cdummy
|
||||
#return outMerge.T, self.outPos0, self.outPos1, outData.T, outCount.T
|
||||
|
|
@ -25,7 +25,7 @@ data = fabio.open("testimages/Pilatus1M.edf").data
|
|||
ref = ai.integrate2d(data, 100, 360, method="lut", unit="2th_deg")[0]
|
||||
obt = ai.integrate2d(data, 100, 360, method="ocl_csr", unit="2th_deg")[0]
|
||||
##logger.debug("check LUT basics: %s"%abs(obt[1] - ref[1]).max())
|
||||
#assert numpy.allclose(ref,obt)
|
||||
assert numpy.allclose(ref,obt)
|
||||
|
||||
|
||||
plot(ref.ravel(), label="ocl_lut")
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 07 09:52:51 2014
|
||||
|
||||
@author: ashiotis
|
||||
"""
|
||||
|
||||
import sys, numpy, time
|
||||
import utilstest
|
||||
import fabio, pyopencl
|
||||
from pylab import *
|
||||
print "#"*50
|
||||
pyFAI = sys.modules["pyFAI"]
|
||||
from pyFAI import splitPixelFullLUT
|
||||
from pyFAI import ocl_azim_csr
|
||||
#from pyFAI import splitBBoxLUT
|
||||
#from pyFAI import splitBBoxCSR
|
||||
#logger = utilstest.getLogger("profile")
|
||||
|
||||
|
||||
ai = pyFAI.load("testimages/Pilatus1M.poni")
|
||||
data = fabio.open("testimages/Pilatus1M.edf").data
|
||||
ref = ai.xrpd_LUT(data, 1000)[1]
|
||||
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]
|
||||
|
||||
#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
|
||||
|
||||
pos = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
|
||||
foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
|
||||
|
||||
boo = foo.integrate(data)[1]
|
||||
|
||||
foo2 = ocl_azim_csr.OCL_CSR_Integrator(foo.lut, data.size, "GPU", block_size=32)
|
||||
boo2 = foo2.integrate(data)[0]
|
||||
|
||||
plot(ref, label="ocl_csr")
|
||||
plot(boo, label="csr_fullsplit")
|
||||
plot(boo2, label="ocl_csr_fullsplit")
|
||||
legend()
|
||||
show()
|
||||
raw_input()
|
||||
|
|
@ -0,0 +1,44 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 07 09:52:51 2014
|
||||
|
||||
@author: ashiotis
|
||||
"""
|
||||
|
||||
import sys, numpy, time
|
||||
import utilstest
|
||||
import fabio, pyopencl
|
||||
from pylab import *
|
||||
print "#"*50
|
||||
pyFAI = sys.modules["pyFAI"]
|
||||
from pyFAI import splitPixelFullLUT
|
||||
from pyFAI import ocl_hist_pixelsplit
|
||||
#from pyFAI import splitBBoxLUT
|
||||
#from pyFAI import splitBBoxCSR
|
||||
#logger = utilstest.getLogger("profile")
|
||||
|
||||
|
||||
ai = pyFAI.load("testimages/Pilatus1M.poni")
|
||||
data = fabio.open("testimages/Pilatus1M.edf").data
|
||||
ref = ai.xrpd_LUT(data, 1000)[1]
|
||||
#obt = ai.xrpd_LUT_OCL(data, 1000)[1]
|
||||
|
||||
#ref = ai.integrate1d(data, 1000, method="ocl_csr", unit="2th_deg")[0]
|
||||
pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
|
||||
|
||||
pos = pos_in.reshape(pos_in.size/8,4,2)
|
||||
|
||||
foo = splitPixelFullLUT.HistoLUT1dFullSplit(pos, 1000, unit="2th_deg")
|
||||
|
||||
boo = foo.integrate(data)[1]
|
||||
|
||||
foo2 = ocl_hist_pixelsplit.OCL_Hist_Pixelsplit(pos, 1000, data.size, devicetype="cpu", block_size=32)
|
||||
boo2 = foo2.integrate(data)[2]
|
||||
|
||||
#plot(ref, label="ocl_csr")
|
||||
#plot(boo, label="csr_fullsplit")
|
||||
plot(boo2, label="ocl_csr_fullsplit")
|
||||
legend()
|
||||
show()
|
||||
raw_input()
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Fri Mar 07 09:52:51 2014
|
||||
|
||||
@author: ashiotis
|
||||
"""
|
||||
|
||||
import sys, numpy, time
|
||||
import utilstest
|
||||
import fabio
|
||||
import pyopencl as cl
|
||||
from pylab import *
|
||||
print "#"*50
|
||||
pyFAI = sys.modules["pyFAI"]
|
||||
from pyFAI import splitPixelFullLUT
|
||||
from pyFAI import ocl_hist_pixelsplit
|
||||
#from pyFAI import splitBBoxLUT
|
||||
#from pyFAI import splitBBoxCSR
|
||||
#logger = utilstest.getLogger("profile")
|
||||
|
||||
|
||||
ai = pyFAI.load("testimages/Pilatus1M.poni")
|
||||
data = fabio.open("testimages/Pilatus1M.edf").data
|
||||
|
||||
workgroup_size = 256
|
||||
bins = 1000
|
||||
|
||||
pos_in = ai.array_from_unit(data.shape, "corner", unit="2th_deg")
|
||||
|
||||
pos = pos_in.reshape(pos_in.size/8,4,2)
|
||||
|
||||
pos_size = pos.size
|
||||
size = pos_size/8
|
||||
|
||||
|
||||
ctx = cl.create_some_context()
|
||||
queue = cl.CommandQueue(ctx)
|
||||
mf = cl.mem_flags
|
||||
|
||||
d_input = cl.array.to_device(queue, pos)
|
||||
d_preresult = cl.Buffer(ctx, mf.READ_WRITE, 4*4*workgroup_size)
|
||||
d_result = cl.Buffer(ctx, mf.READ_WRITE, 4*4)
|
||||
|
||||
with open("../openCL/ocl_hist_pixelsplit.cl", "r") as kernelFile:
|
||||
kernel_src = kernelFile.read()
|
||||
|
||||
compile_options = "-D BINS=%i -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
|
||||
(bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)
|
||||
|
||||
program = cl.Program(ctx, kernel_src).build(options=compile_options)
|
||||
|
||||
program.reduce1(queue, (workgroup_size*workgroup_size,), (workgroup_size,), d_input.data, numpy.uint32(pos_size), d_preresult)
|
||||
|
||||
program.reduce2(queue, (workgroup_size,), (workgroup_size,), d_preresult, d_result)
|
||||
|
||||
result = numpy.ndarray(4,dtype=numpy.float32)
|
||||
|
||||
cl.enqueue_copy(queue,result, d_result)
|
||||
|
||||
|
||||
min0 = pos[:, :, 0].min()
|
||||
max0 = pos[:, :, 0].max()
|
||||
min1 = pos[:, :, 1].min()
|
||||
max1 = pos[:, :, 1].max()
|
||||
minmax=(min0,max0,min1,max1)
|
||||
|
||||
print minmax
|
||||
print result
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#plot(ref, label="ocl_csr")
|
||||
#plot(boo, label="csr_fullsplit")
|
||||
#plot(boo2, label="ocl_csr_fullsplit")
|
||||
#legend()
|
||||
#show()
|
||||
#raw_input()
|
||||
|
Loading…
Reference in New Issue