Implement vload_half{,n} and vload(half)

v2: add vload(half) as well
    make helpers amdgpu specific (NVPTX uses different private AS numbering)
    use clang builtin on clang >= 6

Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu>
Reviewed-by: Tom Stellard <tstellar@redhat.com>
llvm-svn: 312839
This commit is contained in:
Jan Vesely 2017-09-08 23:59:00 +00:00
parent 661ac03a1b
commit 285d2fb85c
6 changed files with 132 additions and 20 deletions

View File

@ -1 +1,2 @@
shared/vload_half_helpers.ll
shared/vstore_half_helpers.ll

View File

@ -1 +1,2 @@
shared/vload_half_helpers.ll
shared/vstore_half_helpers.ll

View File

@ -0,0 +1,23 @@
; __private variant: loads one half through %ptr (addrspace(0)) and returns it
; extended to float.
define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
  %half_val = load half, half addrspace(0)* %ptr
  %as_float = fpext half %half_val to float
  ret float %as_float
}
; __global variant: loads one half through %ptr (addrspace(1)) and returns it
; extended to float.
define float @__clc_vload_half_float_helper__global(half addrspace(1)* nocapture %ptr) nounwind alwaysinline {
  %half_val = load half, half addrspace(1)* %ptr
  %as_float = fpext half %half_val to float
  ret float %as_float
}
; __local variant: loads one half through %ptr (addrspace(3)) and returns it
; extended to float.
define float @__clc_vload_half_float_helper__local(half addrspace(3)* nocapture %ptr) nounwind alwaysinline {
  %half_val = load half, half addrspace(3)* %ptr
  %as_float = fpext half %half_val to float
  ret float %as_float
}
; __constant variant: loads one half through %ptr (addrspace(2)) and returns it
; extended to float.
define float @__clc_vload_half_float_helper__constant(half addrspace(2)* nocapture %ptr) nounwind alwaysinline {
  %half_val = load half, half addrspace(2)* %ptr
  %as_float = fpext half %half_val to float
  ret float %as_float
}

View File

@ -1,18 +1,21 @@
/* NOTE(review): _CLC_VLOAD_DECL is defined twice in a row here. This looks
 * like the pre-change and post-change revisions of the macro shown together;
 * confirm which one the real header keeps. */
/* Four-parameter form: declares an overloaded vload<WIDTH>(offset, x) that
 * returns VEC_TYPE, with x a pointer to const PRIM_TYPE in ADDR_SPACE. */
#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
_CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
/* Five-parameter form: same declaration, but the function name is
 * vload<SUFFIX><WIDTH> and the pointee type (MEM_TYPE) is passed separately
 * from the returned VEC_TYPE, so the two may differ. */
#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
_CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);
/* NOTE(review): _CLC_VECTOR_VLOAD_DECL is defined twice in a row here. This
 * looks like the pre-change and post-change revisions shown together; confirm
 * which one the real header keeps. */
/* Two-parameter form: expands _CLC_VLOAD_DECL once per vector width
 * (2, 3, 4, 8, 16) for PRIM_TYPE in ADDR_SPACE. */
#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
/* Four-parameter form: same set of widths, additionally forwarding the
 * function-name SUFFIX and the in-memory element type MEM_TYPE; the vector
 * return type is built by pasting the width onto PRIM_TYPE. */
#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
/* _CLC_VECTOR_VLOAD_PRIM3 expands _CLC_VECTOR_VLOAD_DECL for each of the
 * __private, __local, __constant and __global address spaces.
 * _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) covers the same four address spaces for a
 * type whose memory and register representations are the same.
 * NOTE(review): this span appears to interleave two revisions of
 * _CLC_VECTOR_VLOAD_PRIM1 (direct two-argument _CLC_VECTOR_VLOAD_DECL calls and
 * a forward to _CLC_VECTOR_VLOAD_PRIM3 with an empty suffix). Also, as shown,
 * the trailing backslashes on the last line of each macro continue into the
 * following #define; presumably a blank line follows them in the real header.
 * Confirm both against the actual file. */
#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \
#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
_CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \
#define _CLC_VECTOR_VLOAD_PRIM() \
_CLC_VECTOR_VLOAD_PRIM1(char) \
@ -24,14 +27,26 @@
_CLC_VECTOR_VLOAD_PRIM1(long) \
_CLC_VECTOR_VLOAD_PRIM1(ulong) \
_CLC_VECTOR_VLOAD_PRIM1(float) \
_CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
/* Extension-guarded declarations followed by the declarations for the
 * always-available primitive types.
 * NOTE(review): this span appears to mix two revisions. One defines a
 * _CLC_VECTOR_VLOAD() wrapper macro under #ifdef cl_khr_fp64 / #else and then
 * invokes it; the other enables the extension with a #pragma, expands
 * _CLC_VECTOR_VLOAD_PRIM1(double) directly and invokes
 * _CLC_VECTOR_VLOAD_PRIM(). Confirm which lines the real header keeps. */
#ifdef cl_khr_fp64
#define _CLC_VECTOR_VLOAD() \
_CLC_VECTOR_VLOAD_PRIM1(double) \
_CLC_VECTOR_VLOAD_PRIM()
#else
#define _CLC_VECTOR_VLOAD() \
_CLC_VECTOR_VLOAD_PRIM()
#pragma OPENCL EXTENSION cl_khr_fp64: enable
_CLC_VECTOR_VLOAD_PRIM1(double)
#endif
/* vload overloads on half are declared only when cl_khr_fp16 is defined. */
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16: enable
_CLC_VECTOR_VLOAD_PRIM1(half)
#endif
_CLC_VECTOR_VLOAD()
_CLC_VECTOR_VLOAD_PRIM()
// The width-suffixed vload_halfN declarations do not include the scalar form,
// so vload_half (empty WIDTH argument) is declared here per address space.
_CLC_VLOAD_DECL(_half, half, float, , __private)
_CLC_VLOAD_DECL(_half, half, float, , __local)
_CLC_VLOAD_DECL(_half, half, float, , __global)
_CLC_VLOAD_DECL(_half, half, float, , __constant)

// Remove the declaration helper macros again.
#undef _CLC_VECTOR_VLOAD_PRIM
#undef _CLC_VECTOR_VLOAD_PRIM1
#undef _CLC_VECTOR_VLOAD_PRIM3
#undef _CLC_VECTOR_VLOAD_DECL
#undef _CLC_VLOAD_DECL

View File

@ -50,3 +50,62 @@ VLOAD_TYPES()
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
VLOAD_ADDR_SPACES(double)
#endif
/* vload on half data is instantiated only when cl_khr_fp16 is defined. */
#if defined(cl_khr_fp16)
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
VLOAD_ADDR_SPACES(half)
#endif /* cl_khr_fp16 */
/* vload_half is legal even without cl_khr_fp16; there is no vload_half
 * returning double.
 *
 * VEC_LOAD1(val, AS) assigns to val the float obtained from mem[offset] and
 * then post-increments offset, so `mem` and `offset` must be in scope where
 * the macro is expanded. With clang major version 6 or later the load goes
 * through __builtin_load_halff; otherwise it calls the helper function whose
 * name ends in the address-space token AS. */
#if __clang_major__ >= 6
#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
#else
float __clc_vload_half_float_helper__private(const __private half *);
float __clc_vload_half_float_helper__local(const __local half *);
float __clc_vload_half_float_helper__global(const __global half *);
float __clc_vload_half_float_helper__constant(const __constant half *);
#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
#endif
/* Multi-element loads, built from VEC_LOAD1. Each expands to a sequence of
 * single-element loads, every one of which advances `offset` by one. */

/* Two elements: the .lo component first, then .hi. */
#define VEC_LOAD2(val, AS)   \
  VEC_LOAD1(val.lo, AS)      \
  VEC_LOAD1(val.hi, AS)

/* Three elements: loaded component by component (.s0, .s1, .s2). */
#define VEC_LOAD3(val, AS)   \
  VEC_LOAD1(val.s0, AS)      \
  VEC_LOAD1(val.s1, AS)      \
  VEC_LOAD1(val.s2, AS)

/* Four, eight and sixteen elements: load the .lo half, then the .hi half. */
#define VEC_LOAD4(val, AS)   \
  VEC_LOAD2(val.lo, AS)      \
  VEC_LOAD2(val.hi, AS)
#define VEC_LOAD8(val, AS)   \
  VEC_LOAD4(val.lo, AS)      \
  VEC_LOAD4(val.hi, AS)
#define VEC_LOAD16(val, AS)  \
  VEC_LOAD8(val.lo, AS)      \
  VEC_LOAD8(val.hi, AS)
/* __FUNC defines one vload_half<SUFFIX> overload for address space AS.
 *   SUFFIX   - pasted onto the function name (may be empty)
 *   VEC_SIZE - number of elements; selects VEC_LOAD<VEC_SIZE> and scales offset
 *   TYPE     - return type of the generated function
 *   AS       - address-space qualifier of the source pointer
 * The incoming offset counts whole vectors, so it is multiplied by VEC_SIZE to
 * get the index of the first half element before the loads run. */
#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)                                            \
  _CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
    TYPE __result;                                                                    \
    offset *= VEC_SIZE;                                                               \
    VEC_LOAD##VEC_SIZE(__result, AS)                                                  \
    return __result;                                                                  \
  }

/* Forwarding layer: arguments that are themselves macros get expanded here,
 * before __FUNC applies ## to them. */
#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
/* Generate the vload_half definitions: gentype.inc is included with __CLC_BODY
 * set to vload_half.inc (presumably it pulls that body in once per generic
 * type - confirm against gentype.inc). */
#define __CLC_BODY "vload_half.inc"
#include <clc/math/gentype.inc>
#undef __CLC_BODY

/* Drop the vload_half helper macros. */
#undef __FUNC
#undef FUNC
#undef VEC_LOAD1
#undef VEC_LOAD2
#undef VEC_LOAD3
#undef VEC_LOAD4
#undef VEC_LOAD8
#undef VEC_LOAD16

/* Drop the vload helper macros. */
#undef VLOAD_VECTORIZE
#undef VLOAD_ADDR_SPACES
#undef VLOAD_TYPES

View File

@ -0,0 +1,13 @@
/* Body for the vload_half definitions. Expects FUNC, __CLC_FPSIZE and
 * __CLC_GENTYPE (plus __CLC_VECSIZE for vector types) to be defined by the
 * includer. Only the 32-bit floating-point types produce any code. */
#if __CLC_FPSIZE == 32

#if defined(__CLC_VECSIZE)
/* Vector type: the vector size is both the name suffix and the element count. */
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
#else
/* Scalar type: empty suffix, one element. */
FUNC(, 1, __CLC_GENTYPE, __constant);
FUNC(, 1, __CLC_GENTYPE, __global);
FUNC(, 1, __CLC_GENTYPE, __local);
FUNC(, 1, __CLC_GENTYPE, __private);
#endif /* __CLC_VECSIZE */

#endif /* __CLC_FPSIZE == 32 */