forked from OSchip/llvm-project
Implement vload_half{,n} and vload(half)
v2: add vload(half) as well; make helpers amdgpu specific (NVPTX uses different private AS numbering); use clang builtin on clang >= 6. Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu> Reviewed-by: Tom Stellard <tstellar@redhat.com> llvm-svn: 312839
This commit is contained in:
parent
661ac03a1b
commit
285d2fb85c
|
@ -1 +1,2 @@
|
|||
shared/vload_half_helpers.ll
|
||||
shared/vstore_half_helpers.ll
|
||||
|
|
|
@ -1 +1,2 @@
|
|||
shared/vload_half_helpers.ll
|
||||
shared/vstore_half_helpers.ll
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
; Loads a single half value through %ptr (address space 0) and returns it
; widened to float with fpext.
; NOTE(review): the "__private" suffix implies address space 0 is the private
; space on this target; the commit message says the numbering is
; target-specific, so confirm before reusing this on another target.
define float @__clc_vload_half_float_helper__private(half addrspace(0)* nocapture %ptr) nounwind alwaysinline {
|
||||
%data = load half, half addrspace(0)* %ptr
|
||||
%res = fpext half %data to float
|
||||
ret float %res
|
||||
}
|
||||
|
||||
; Loads a single half value through %ptr (address space 1) and returns it
; widened to float with fpext.
; NOTE(review): the "__global" suffix implies address space 1 is the global
; space on this target -- confirm against the target's numbering.
define float @__clc_vload_half_float_helper__global(half addrspace(1)* nocapture %ptr) nounwind alwaysinline {
|
||||
%data = load half, half addrspace(1)* %ptr
|
||||
%res = fpext half %data to float
|
||||
ret float %res
|
||||
}
|
||||
|
||||
; Loads a single half value through %ptr (address space 3) and returns it
; widened to float with fpext.
; NOTE(review): the "__local" suffix implies address space 3 is the local
; space on this target -- confirm against the target's numbering.
define float @__clc_vload_half_float_helper__local(half addrspace(3)* nocapture %ptr) nounwind alwaysinline {
|
||||
%data = load half, half addrspace(3)* %ptr
|
||||
%res = fpext half %data to float
|
||||
ret float %res
|
||||
}
|
||||
|
||||
; Loads a single half value through %ptr (address space 2) and returns it
; widened to float with fpext.
; NOTE(review): the "__constant" suffix implies address space 2 is the
; constant space on this target -- confirm against the target's numbering.
define float @__clc_vload_half_float_helper__constant(half addrspace(2)* nocapture %ptr) nounwind alwaysinline {
|
||||
%data = load half, half addrspace(2)* %ptr
|
||||
%res = fpext half %data to float
|
||||
ret float %res
|
||||
}
|
|
@ -1,18 +1,21 @@
|
|||
// Four-parameter form: declares an overloaded vload<WIDTH>(size_t offset,
// const ADDR_SPACE PRIM_TYPE *x) returning VEC_TYPE.
// NOTE(review): a five-parameter _CLC_VLOAD_DECL follows in this view;
// presumably this form is the pre-change side of the diff -- confirm.
#define _CLC_VLOAD_DECL(PRIM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
|
||||
_CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##WIDTH(size_t offset, const ADDR_SPACE PRIM_TYPE *x);
|
||||
// Declares an overloaded vload<SUFFIX><WIDTH>(size_t offset,
// const ADDR_SPACE MEM_TYPE *x) returning VEC_TYPE.
// SUFFIX and WIDTH are pasted onto "vload" and may be empty; MEM_TYPE is the
// pointee type and may differ from the element type of VEC_TYPE.
#define _CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, VEC_TYPE, WIDTH, ADDR_SPACE) \
|
||||
_CLC_OVERLOAD _CLC_DECL VEC_TYPE vload##SUFFIX##WIDTH(size_t offset, const ADDR_SPACE MEM_TYPE *x);
|
||||
|
||||
// Two-parameter form: emits the vload declarations for widths 2, 3, 4, 8 and
// 16 of PRIM_TYPE in ADDR_SPACE, using the four-parameter _CLC_VLOAD_DECL.
// NOTE(review): a four-parameter _CLC_VECTOR_VLOAD_DECL follows in this view;
// presumably this form is the pre-change side of the diff -- confirm.
#define _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(PRIM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
|
||||
// Emits vload<SUFFIX>{2,3,4,8,16} declarations for one address space.
// The return type is PRIM_TYPE with the width pasted on (e.g. PRIM_TYPE##4),
// while the pointer argument is typed with MEM_TYPE.
#define _CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##2, 2, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##3, 3, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##4, 4, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##8, 8, ADDR_SPACE) \
|
||||
_CLC_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE##16, 16, ADDR_SPACE)
|
||||
|
||||
// Emits the vector vload<SUFFIX> declarations for all four address spaces
// (__private, __local, __constant, __global).
// NOTE(review): the last body line ends with a continuation backslash, so the
// macro body extends onto the next line; that line must stay blank.
#define _CLC_VECTOR_VLOAD_PRIM3(SUFFIX, MEM_TYPE, PRIM_TYPE) \
|
||||
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __private) \
|
||||
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __local) \
|
||||
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __constant) \
|
||||
_CLC_VECTOR_VLOAD_DECL(SUFFIX, MEM_TYPE, PRIM_TYPE, __global) \
|
||||
|
||||
// Convenience wrapper for the case where the memory type and the element type
// are the same and there is no name suffix.
// NOTE(review): this view shows both the four two-argument
// _CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, <address space>) lines and the single
// _CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) line. Presumably the former
// are removed by this change and only the PRIM3 line remains -- confirm.
// NOTE(review): the last body line ends with a continuation backslash; the
// line after it must stay blank.
#define _CLC_VECTOR_VLOAD_PRIM1(PRIM_TYPE) \
|
||||
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __private) \
|
||||
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __local) \
|
||||
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __constant) \
|
||||
_CLC_VECTOR_VLOAD_DECL(PRIM_TYPE, __global) \
|
||||
_CLC_VECTOR_VLOAD_PRIM3(, PRIM_TYPE, PRIM_TYPE) \
|
||||
|
||||
#define _CLC_VECTOR_VLOAD_PRIM() \
|
||||
_CLC_VECTOR_VLOAD_PRIM1(char) \
|
||||
|
@ -24,14 +27,26 @@
|
|||
_CLC_VECTOR_VLOAD_PRIM1(long) \
|
||||
_CLC_VECTOR_VLOAD_PRIM1(ulong) \
|
||||
_CLC_VECTOR_VLOAD_PRIM1(float) \
|
||||
|
||||
_CLC_VECTOR_VLOAD_PRIM3(_half, half, float)
|
||||
|
||||
// double vload declarations, emitted only when cl_khr_fp64 is defined.
// NOTE(review): this region shows two shapes at once: a _CLC_VECTOR_VLOAD()
// wrapper macro defined in both #ifdef/#else branches, and a direct
// "#pragma ... enable" + _CLC_VECTOR_VLOAD_PRIM1(double) expansion.
// Presumably these are the removed and added sides of the diff; confirm which
// lines survive before relying on either.
#ifdef cl_khr_fp64
|
||||
#define _CLC_VECTOR_VLOAD() \
|
||||
_CLC_VECTOR_VLOAD_PRIM1(double) \
|
||||
_CLC_VECTOR_VLOAD_PRIM()
|
||||
#else
|
||||
#define _CLC_VECTOR_VLOAD() \
|
||||
_CLC_VECTOR_VLOAD_PRIM()
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
_CLC_VECTOR_VLOAD_PRIM1(double)
|
||||
#endif
|
||||
// half-typed vload declarations (half in memory, halfN returned) are emitted
// only when cl_khr_fp16 is defined; the pragma enables the extension before
// the half vector types are named.
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16: enable
|
||||
_CLC_VECTOR_VLOAD_PRIM1(half)
|
||||
#endif
|
||||
|
||||
// Expand the declaration lists built above.
// NOTE(review): both _CLC_VECTOR_VLOAD() and _CLC_VECTOR_VLOAD_PRIM() are
// shown; presumably one is the removed line and the other its replacement.
_CLC_VECTOR_VLOAD()
|
||||
_CLC_VECTOR_VLOAD_PRIM()
|
||||
// The vector lists only cover widths 2, 3, 4, 8 and 16, so the scalar
// vload_half (empty WIDTH, float result) is declared here per address space.
|
||||
_CLC_VLOAD_DECL(_half, half, float, , __constant)
|
||||
_CLC_VLOAD_DECL(_half, half, float, , __global)
|
||||
_CLC_VLOAD_DECL(_half, half, float, , __local)
|
||||
_CLC_VLOAD_DECL(_half, half, float, , __private)
|
||||
|
||||
// Remove the helper macros used to generate the declarations above.
#undef _CLC_VLOAD_DECL
|
||||
#undef _CLC_VECTOR_VLOAD_DECL
|
||||
#undef _CLC_VECTOR_VLOAD_PRIM3
|
||||
#undef _CLC_VECTOR_VLOAD_PRIM1
|
||||
#undef _CLC_VECTOR_VLOAD_PRIM
|
||||
|
|
|
@ -50,3 +50,62 @@ VLOAD_TYPES()
|
|||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
VLOAD_ADDR_SPACES(double)
|
||||
#endif
|
||||
/* half-typed vload instantiations are emitted only when cl_khr_fp16 is
 * defined. VLOAD_ADDR_SPACES is defined outside this view; presumably it
 * instantiates vload for every address space -- confirm. */
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
VLOAD_ADDR_SPACES(half)
|
||||
#endif
|
||||
|
||||
/* vload_half is legal even without cl_khr_fp16 */
|
||||
/* no vload_half for double */
|
||||
/* Select how a single half is loaded and widened to float:
 *  - clang major version < 6: call a per-address-space helper that is only
 *    declared here (its definition lives outside this file);
 *  - otherwise: use the __builtin_load_halff builtin.
 * VEC_LOAD1(val, AS) assigns the loaded float to val and post-increments
 * `offset`. Both `mem` and `offset` are taken from the expansion site, and
 * the expansion already ends with a semicolon. AS is only used in the helper
 * variant, where it is pasted onto the helper name. */
#if __clang_major__ < 6
|
||||
float __clc_vload_half_float_helper__constant(const __constant half *);
|
||||
float __clc_vload_half_float_helper__global(const __global half *);
|
||||
float __clc_vload_half_float_helper__local(const __local half *);
|
||||
float __clc_vload_half_float_helper__private(const __private half *);
|
||||
|
||||
#define VEC_LOAD1(val, AS) val = __clc_vload_half_float_helper##AS (&mem[offset++]);
|
||||
#else
|
||||
#define VEC_LOAD1(val, AS) val = __builtin_load_halff(&mem[offset++]);
|
||||
#endif
|
||||
|
||||
/* Wider loads are built from narrower ones. VEC_LOAD2/4/8/16 fill the .lo
 * half and then the .hi half of val; VEC_LOAD3 fills .s0, .s1, .s2.
 * Every leaf is a VEC_LOAD1, which post-increments `offset`, so the elements
 * are read from consecutive positions of `mem` in the order listed. */
#define VEC_LOAD2(val, AS) \
|
||||
VEC_LOAD1(val.lo, AS) \
|
||||
VEC_LOAD1(val.hi, AS)
|
||||
#define VEC_LOAD3(val, AS) \
|
||||
VEC_LOAD1(val.s0, AS) \
|
||||
VEC_LOAD1(val.s1, AS) \
|
||||
VEC_LOAD1(val.s2, AS)
|
||||
#define VEC_LOAD4(val, AS) \
|
||||
VEC_LOAD2(val.lo, AS) \
|
||||
VEC_LOAD2(val.hi, AS)
|
||||
#define VEC_LOAD8(val, AS) \
|
||||
VEC_LOAD4(val.lo, AS) \
|
||||
VEC_LOAD4(val.hi, AS)
|
||||
#define VEC_LOAD16(val, AS) \
|
||||
VEC_LOAD8(val.lo, AS) \
|
||||
VEC_LOAD8(val.hi, AS)
|
||||
|
||||
/* Defines vload_half<SUFFIX>(size_t offset, const AS half *mem) returning
 * TYPE. The body scales `offset` by VEC_SIZE, loads VEC_SIZE consecutive
 * halves starting at mem[offset] into a temporary through VEC_LOAD<VEC_SIZE>,
 * and returns that temporary. */
#define __FUNC(SUFFIX, VEC_SIZE, TYPE, AS) \
|
||||
_CLC_OVERLOAD _CLC_DEF TYPE vload_half##SUFFIX(size_t offset, const AS half *mem) { \
|
||||
offset *= VEC_SIZE; \
|
||||
TYPE __tmp; \
|
||||
VEC_LOAD##VEC_SIZE(__tmp, AS) \
|
||||
return __tmp; \
|
||||
}
|
||||
|
||||
/* Extra expansion level: arguments are macro-expanded here before __FUNC
 * pastes them with ##, so a macro passed as SUFFIX or VEC_SIZE is replaced by
 * its value rather than pasted by name. */
#define FUNC(SUFFIX, VEC_SIZE, TYPE, AS) __FUNC(SUFFIX, VEC_SIZE, TYPE, AS)
|
||||
|
||||
/* Instantiate vload_half.inc through gentype.inc, then remove every helper
 * macro used by this file.
 * NOTE(review): gentype.inc is not visible here; presumably it includes
 * __CLC_BODY once per generic type -- confirm.
 * VLOAD_TYPES, VLOAD_ADDR_SPACES and VLOAD_VECTORIZE are defined before the
 * start of this view. */
#define __CLC_BODY "vload_half.inc"
|
||||
#include <clc/math/gentype.inc>
|
||||
#undef __CLC_BODY
|
||||
#undef FUNC
|
||||
#undef __FUNC
|
||||
#undef VEC_LOAD16
|
||||
#undef VEC_LOAD8
|
||||
#undef VEC_LOAD4
|
||||
#undef VEC_LOAD3
|
||||
#undef VEC_LOAD2
|
||||
#undef VEC_LOAD1
|
||||
#undef VLOAD_TYPES
|
||||
#undef VLOAD_ADDR_SPACES
|
||||
#undef VLOAD_VECTORIZE
|
||||
|
|
|
@ -0,0 +1,13 @@
|
|||
/* Body file for vload_half: emits one definition per address space, and only
 * when __CLC_FPSIZE is 32.
 * NOTE(review): FUNC expands to a complete function definition, so the ';'
 * after each FUNC(...) leaves an empty declaration at file scope. */
#if __CLC_FPSIZE == 32
|
||||
/* Vector case: both the name suffix and the element count come from
 * __CLC_VECSIZE. */
#ifdef __CLC_VECSIZE
|
||||
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __private);
|
||||
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __local);
|
||||
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __global);
|
||||
FUNC(__CLC_VECSIZE, __CLC_VECSIZE, __CLC_GENTYPE, __constant);
|
||||
/* Scalar case: empty suffix and a single element. */
#else
|
||||
FUNC(, 1, __CLC_GENTYPE, __private);
|
||||
FUNC(, 1, __CLC_GENTYPE, __local);
|
||||
FUNC(, 1, __CLC_GENTYPE, __global);
|
||||
FUNC(, 1, __CLC_GENTYPE, __constant);
|
||||
#endif
|
||||
#endif
|
Loading…
Reference in New Issue