pyFAI/sandbox/ocl_fullSplit_CSR.pyx

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Project: Azimuthal integration
# https://github.com/silx-kit/pyFAI
#
# Copyright (C) 2014-2018 European Synchrotron Radiation Facility, Grenoble, France
#
# Principal author: Jérôme Kieffer (Jerome.Kieffer@ESRF.eu)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
import cython
import os
import sys
from cpython.ref cimport PyObject, Py_XDECREF
from cython.parallel import prange
from libc.string cimport memset, memcpy
from cython cimport view
import numpy
cimport numpy
from libc.math cimport fabs, M_PI
cdef float pi = <float> M_PI
cdef struct lut_point:
numpy.int32_t idx
numpy.float32_t coef
dtype_lut = numpy.dtype([("idx", numpy.int32), ("coef", numpy.float32)])
from ..utils import crc32
cdef double EPS32 = (1.0 + numpy.finfo(numpy.float32).eps)
cdef bint NEED_DECREF = sys.version_info < (2, 7) and numpy.version.version < "1.5"
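# The bin-index helper used by calc_lut() below is not defined in this sandbox
# draft; a minimal sketch of it is reproduced here, with the same signature as
# the helper used in pyFAI's splitBBox modules.
@cython.cdivision(True)
cdef inline float getBinNr(float x0, float pos0_min, float delta) nogil:
    """
    Calculate the (fractional) bin number for any position.
    @param x0: current position
    @param pos0_min: position of the lower edge of bin 0
    @param delta: bin width
    """
    return (x0 - pos0_min) / delta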
class OCLFullSplit1d(object):
"""
1D histogramming with full pixel splitting
based on a Look-up table in CSR format
    The initialization of the class can take quite a while (the operations are not parallelized),
    but each call to integrate() is parallelized and quite efficient.
"""
@cython.boundscheck(False)
def __init__(self,
pos,
int bins=100,
pos0Range=None,
pos1Range=None,
mask=None,
mask_checksum=None,
allow_pos0_neg=False,
unit="undefined"):
"""
        @param pos: 3D or 4D array with the coordinates of all pixel corners: tth or q_vect or r...
@param bins: number of output bins, 100 by default
@param pos0Range: minimum and maximum of the 2th range
@param pos1Range: minimum and maximum of the chi range
        @param mask: array (of int8) with masked pixels set to 1 (0 = not masked)
        @param mask_checksum: int with the checksum of the mask, else None
        @param allow_pos0_neg: set to True to allow negative pos0 (q) values; otherwise negative lower bounds are clamped to 0
        @param unit: can be 2th_deg or r_nm^-1 ...
"""
self.bins = bins
self.lut_size = 0
self.allow_pos0_neg = allow_pos0_neg
if len(pos.shape) == 3:
assert pos.shape[1] == 4
assert pos.shape[2] == 2
elif len(pos.shape) == 4:
assert pos.shape[2] == 4
assert pos.shape[3] == 2
else:
raise ValueError("Pos array dimentions are wrong")
self.size = pos.size / 8
if mask is not None:
assert mask.size == self.size
self.check_mask = True
self.cmask = numpy.ascontiguousarray(mask.ravel(), dtype=numpy.int8)
if mask_checksum:
self.mask_checksum = mask_checksum
else:
self.mask_checksum = crc32(mask)
else:
self.check_mask = False
self.mask_checksum = None
self.pos = numpy.ascontiguousarray(pos.ravel(), dtype=numpy.float32)
self.pos0Range = pos0Range
self.pos1Range = pos1Range
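        # calc_boundaries() below reads self.cpos0/self.dpos0 and fills
        # self.cpos0_sup/self.cpos0_inf, but none of them were created in this
        # sandbox draft; a minimal reconstruction, assuming the corner layout
        # (npixel, 4, 2) with pos0 stored at last-axis index 0:
        corners0 = self.pos.reshape((-1, 4, 2))[:, :, 0]
        inf0 = corners0.min(axis=-1)
        sup0 = corners0.max(axis=-1)
        self.cpos0 = numpy.ascontiguousarray(0.5 * (sup0 + inf0), dtype=numpy.float32)
        self.dpos0 = numpy.ascontiguousarray(0.5 * (sup0 - inf0), dtype=numpy.float32)
        self.cpos0_sup = numpy.empty_like(self.cpos0)
        self.cpos0_inf = numpy.empty_like(self.cpos0)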
self.calc_boundaries(pos0Range)
if pos1Range is not None and len(pos1Range) > 1:
            # Per-pixel pos1 bounds, taken from the corner array (last-axis
            # index 1 is assumed to hold pos1/chi); the original draft
            # referenced the undefined names pos1 and delta_pos1 here.
            pos1_corners = self.pos.reshape((-1, 4, 2))[:, :, 1]
            self.check_pos1 = True
            self.cpos1_min = numpy.ascontiguousarray(pos1_corners.min(axis=-1), dtype=numpy.float32)
            self.cpos1_max = numpy.ascontiguousarray(pos1_corners.max(axis=-1), dtype=numpy.float32)
self.pos1_min = min(pos1Range)
pos1_maxin = max(pos1Range)
self.pos1_max = pos1_maxin * EPS32
else:
self.check_pos1 = False
self.cpos1_min = None
self.pos1_max = None
self.delta = (self.pos0_max - self.pos0_min) / bins
self._lut = None
self.lut_max_idx = None
self.calc_lut()
self.outPos = numpy.linspace(self.pos0_min + 0.5 * self.delta, self.pos0_maxin - 0.5 * self.delta, self.bins)
self.lut_checksum = None
self.unit = unit
self.lut_nbytes = self._lut.nbytes
@cython.boundscheck(False)
@cython.wraparound(False)
def calc_boundaries(self, pos0Range):
"""
Called by constructor to calculate the boundaries and the bin position
"""
cdef int size = self.cpos0.size
cdef bint check_mask = self.check_mask
cdef numpy.int8_t[:] cmask
        cdef int idx
        cdef float[:] cpos0, dpos0, cpos0_sup, cpos0_inf
cdef float upper, lower, pos0_max, pos0_min, c, d
cdef bint allow_pos0_neg = self.allow_pos0_neg
cpos0_sup = self.cpos0_sup
cpos0_inf = self.cpos0_inf
cpos0 = self.cpos0
dpos0 = self.dpos0
pos0_min = cpos0[0]
pos0_max = cpos0[0]
if check_mask:
cmask = self.cmask
with nogil:
for idx in range(size):
c = cpos0[idx]
d = dpos0[idx]
lower = c - d
upper = c + d
cpos0_sup[idx] = upper
cpos0_inf[idx] = lower
if not allow_pos0_neg and lower < 0:
lower = 0
if not (check_mask and cmask[idx]):
if upper > pos0_max:
pos0_max = upper
if lower < pos0_min:
pos0_min = lower
if pos0Range is not None and len(pos0Range) > 1:
self.pos0_min = min(pos0Range)
self.pos0_maxin = max(pos0Range)
else:
self.pos0_min = pos0_min
self.pos0_maxin = pos0_max
if (not allow_pos0_neg) and self.pos0_min < 0:
self.pos0_min = 0
self.pos0_max = self.pos0_maxin * EPS32
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def calc_lut(self):
"""
Calculate the max number of elements in the LUT and populate it
"""
cdef float delta = self.delta, pos0_min = self.pos0_min, pos1_min, pos1_max, min0, max0, fbin0_min, fbin0_max, deltaL, deltaR, deltaA
cdef numpy.int32_t k, idx, bin0_min, bin0_max, bins = self.bins, lut_size, i, size
cdef bint check_mask, check_pos1
cdef numpy.ndarray[numpy.int32_t, ndim=1] outMax = numpy.zeros(bins, dtype=numpy.int32)
cdef float[:] cpos0_sup = self.cpos0_sup
cdef float[:] cpos0_inf = self.cpos0_inf
cdef float[:] cpos1_min, cpos1_max
cdef lut_point[:, :] lut
cdef numpy.int8_t[:] cmask
size = self.size
if self.check_mask:
cmask = self.cmask
check_mask = True
else:
check_mask = False
if self.check_pos1:
check_pos1 = True
cpos1_min = self.cpos1_min
cpos1_max = self.cpos1_max
pos1_max = self.pos1_max
pos1_min = self.pos1_min
else:
check_pos1 = False
with nogil:
for idx in range(size):
if (check_mask) and (cmask[idx]):
continue
min0 = cpos0_inf[idx]
max0 = cpos0_sup[idx]
if check_pos1 and ((cpos1_max[idx] < pos1_min) or (cpos1_min[idx] > pos1_max)):
continue
fbin0_min = getBinNr(min0, pos0_min, delta)
fbin0_max = getBinNr(max0, pos0_min, delta)
bin0_min = < numpy.int32_t > fbin0_min
bin0_max = < numpy.int32_t > fbin0_max
if (bin0_max < 0) or (bin0_min >= bins):
continue
if bin0_max >= bins:
bin0_max = bins - 1
if bin0_min < 0:
bin0_min = 0
if bin0_min == bin0_max:
                    # The whole pixel falls within a single bin
outMax[bin0_min] += 1
                else:  # the pixel is split over several bins
for i in range(bin0_min, bin0_max + 1):
outMax[i] += 1
lut_size = outMax.max()
# just recycle the outMax array
# outMax = numpy.zeros((bins0,bins1), dtype=numpy.int32)
memset(&outMax[0], 0, bins * sizeof(numpy.int32_t))
self.lut_size = lut_size
lut_nbytes = bins * lut_size * sizeof(lut_point)
if (os.name == "posix") and ("SC_PAGE_SIZE" in os.sysconf_names) and ("SC_PHYS_PAGES" in os.sysconf_names):
memsize = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")
if memsize < lut_nbytes:
                raise MemoryError("Lookup-table (%i, %i) is %.3f GB whereas the memory of the system is only %.3f GB" % (bins, lut_size, lut_nbytes / 2.0 ** 30, memsize / 2.0 ** 30))
# else hope we have enough memory
lut = view.array(shape=(bins, lut_size), itemsize=sizeof(lut_point), format="if")
# lut = numpy.zeros(shape=(bins, lut_size),dtype=dtype_lut)
# lut = < lut_point *>malloc(lut_nbytes)
memset(&lut[0, 0], 0, lut_nbytes)
with nogil:
for idx in range(size):
if (check_mask) and (cmask[idx]):
continue
min0 = cpos0_inf[idx]
max0 = cpos0_sup[idx]
if check_pos1 and ((cpos1_max[idx] < pos1_min) or (cpos1_min[idx] > pos1_max)):
continue
fbin0_min = getBinNr(min0, pos0_min, delta)
fbin0_max = getBinNr(max0, pos0_min, delta)
bin0_min = < numpy.int32_t > fbin0_min
bin0_max = < numpy.int32_t > fbin0_max
if (bin0_max < 0) or (bin0_min >= bins):
continue
if bin0_max >= bins:
bin0_max = bins - 1
if bin0_min < 0:
bin0_min = 0
if bin0_min == bin0_max:
                    # The whole pixel falls within a single bin
k = outMax[bin0_min]
lut[bin0_min, k].idx = idx
lut[bin0_min, k].coef = 1.0
outMax[bin0_min] += 1
else: # we have pixel splitting.
deltaA = 1.0 / (fbin0_max - fbin0_min)
deltaL = (bin0_min + 1) - fbin0_min
deltaR = fbin0_max - (bin0_max)
k = outMax[bin0_min]
lut[bin0_min, k].idx = idx
lut[bin0_min, k].coef = (deltaA * deltaL)
outMax[bin0_min] += 1
k = outMax[bin0_max]
lut[bin0_max, k].idx = idx
lut[bin0_max, k].coef = (deltaA * deltaR)
outMax[bin0_max] += 1
if bin0_min + 1 < bin0_max:
for i in range(bin0_min + 1, bin0_max):
k = outMax[i]
lut[i, k].idx = idx
lut[i, k].coef = (deltaA)
outMax[i] += 1
self.lut_max_idx = outMax
self._lut = lut
def get_lut(self):
"""Getter for the LUT as actual numpy array"""
cdef int rc_before, rc_after
rc_before = sys.getrefcount(self._lut)
cdef lut_point[:, :] lut = self._lut
rc_after = sys.getrefcount(self._lut)
cdef bint need_decref = NEED_DECREF and ((rc_after - rc_before) >= 2)
cdef numpy.ndarray[numpy.float64_t, ndim=2] tmp_ary = numpy.empty(shape=self._lut.shape, dtype=numpy.float64)
memcpy(&tmp_ary[0, 0], &lut[0, 0], self._lut.nbytes)
self.lut_checksum = crc32(tmp_ary)
# Ugly against bug#89
if need_decref and (sys.getrefcount(self._lut) >= rc_before+2):
print("Decref needed")
Py_XDECREF(<PyObject *> self._lut)
# return tmp_ary.view(dtype=dtype_lut)
return numpy.core.records.array(tmp_ary.view(dtype=dtype_lut),
shape=self._lut.shape, dtype=dtype_lut,
copy=True)
lut = property(get_lut)
@cython.cdivision(True)
@cython.boundscheck(False)
@cython.wraparound(False)
def integrate(self, weights, dummy=None, delta_dummy=None, dark=None, flat=None, solidAngle=None, polarization=None):
"""
Actually perform the integration which in this case looks more like a matrix-vector product
@param weights: input image
@type weights: ndarray
@param dummy: value for dead pixels (optional)
@type dummy: float
@param delta_dummy: precision for dead-pixel value in dynamic masking
@type delta_dummy: float
@param dark: array with the dark-current value to be subtracted (if any)
@type dark: ndarray
        @param flat: array with the flat-field values to be divided by (if any)
@type flat: ndarray
@param solidAngle: array with the solid angle of each pixel to be divided by (if any)
@type solidAngle: ndarray
@param polarization: array with the polarization correction values to be divided by (if any)
@type polarization: ndarray
@return : positions, pattern, weighted_histogram and unweighted_histogram
@rtype: 4-tuple of ndarrays
"""
cdef numpy.int32_t i = 0, j = 0, idx = 0, bins = self.bins, lut_size = self.lut_size, size = self.size
cdef double sum_data = 0, sum_count = 0, epsilon = 1e-10
cdef float data = 0, coef = 0, cdummy = 0, cddummy = 0
cdef bint do_dummy = False, do_dark = False, do_flat = False, do_polarization = False, do_solidAngle = False
cdef numpy.ndarray[numpy.float64_t, ndim=1] outData = numpy.zeros(self.bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float64_t, ndim=1] outCount = numpy.zeros(self.bins, dtype=numpy.float64)
cdef numpy.ndarray[numpy.float32_t, ndim=1] outMerge = numpy.zeros(self.bins, dtype=numpy.float32)
cdef float[:] cdata, tdata, cflat, cdark, csolidAngle, cpolarization
# Ugly hack against bug #89: https://github.com/silx-kit/pyFAI/issues/89
cdef int rc_before, rc_after
rc_before = sys.getrefcount(self._lut)
cdef lut_point[:, :] lut = self._lut
rc_after = sys.getrefcount(self._lut)
        cdef bint need_decref = NEED_DECREF and ((rc_after - rc_before) >= 2)
assert size == weights.size
if dummy is not None:
do_dummy = True
cdummy = <float> float(dummy)
if delta_dummy is None:
cddummy = <float> 0.0
else:
cddummy = <float> float(delta_dummy)
if flat is not None:
do_flat = True
assert flat.size == size
cflat = numpy.ascontiguousarray(flat.ravel(), dtype=numpy.float32)
if dark is not None:
do_dark = True
assert dark.size == size
cdark = numpy.ascontiguousarray(dark.ravel(), dtype=numpy.float32)
if solidAngle is not None:
do_solidAngle = True
assert solidAngle.size == size
csolidAngle = numpy.ascontiguousarray(solidAngle.ravel(), dtype=numpy.float32)
if polarization is not None:
do_polarization = True
assert polarization.size == size
cpolarization = numpy.ascontiguousarray(polarization.ravel(), dtype=numpy.float32)
if (do_dark + do_flat + do_polarization + do_solidAngle):
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
cdata = numpy.zeros(size, dtype=numpy.float32)
if do_dummy:
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
                        # Note: the -= and /= operators are treated as reductions by
                        # cython.parallel, hence the explicit "data = data - ..." form below.
if do_dark:
data = data - cdark[i]
if do_flat:
data = data / cflat[i]
if do_polarization:
data = data / cpolarization[i]
if do_solidAngle:
data = data / csolidAngle[i]
cdata[i] += data
else: # set all dummy_like values to cdummy. simplifies further processing
cdata[i] += cdummy
else:
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
if do_dark:
data = data - cdark[i]
if do_flat:
data = data / cflat[i]
if do_polarization:
data = data / cpolarization[i]
if do_solidAngle:
data = data / csolidAngle[i]
cdata[i] += data
else:
if do_dummy:
tdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
cdata = numpy.zeros(size, dtype=numpy.float32)
for i in prange(size, nogil=True, schedule="static"):
data = tdata[i]
if ((cddummy != 0) and (fabs(data - cdummy) > cddummy)) or ((cddummy == 0) and (data != cdummy)):
cdata[i] += data
else:
cdata[i] += cdummy
else:
cdata = numpy.ascontiguousarray(weights.ravel(), dtype=numpy.float32)
# TODO: what is the best: static or guided ?
for i in prange(bins, nogil=True, schedule="guided"):
sum_data = 0.0
sum_count = 0.0
for j in range(lut_size):
idx = lut[i, j].idx
coef = lut[i, j].coef
if idx <= 0 and coef <= 0.0:
continue
data = cdata[idx]
if do_dummy and data == cdummy:
continue
sum_data = sum_data + coef * data
sum_count = sum_count + coef
outData[i] += sum_data
outCount[i] += sum_count
if sum_count > epsilon:
outMerge[i] += sum_data / sum_count
else:
outMerge[i] += cdummy
# Ugly against bug#89
if need_decref and (sys.getrefcount(self._lut) >= rc_before + 2):
print("Decref needed")
Py_XDECREF(<PyObject *> self._lut)
return self.outPos, outMerge, outData, outCount