forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7291 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
ab6e356808
commit
9c9282d024
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,979 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bddd_00000000-9_lj96_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.4Q2aYE)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bddd_00000000-8_lj96_cut_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lj96_cut_gpu_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
// Module-scope texture reference. Both kernels in this file read it with
// tex.1d.v4.f32.s32, using an integer element index and taking four floats
// per fetch. The first three components are used as coordinates in the
// distance computation; the fourth is converted to an integer and used to
// build a coefficient-table index (presumably the atom type -- confirm
// against the host code that binds this texture).
.global .texref pos_tex;
|
||||
|
||||
//-----------------------------------------------------------
// kernel_pair
//
// Compiler-generated PTX (source: file 16, lj96_cut_gpu_kernel.cu, see the
// .loc directives). One group of t_per_atom consecutive threads handles one
// atom; each thread of the group walks a strided slice of that atom's
// neighbor list, accumulates a force vector (and optionally an energy term
// and six virial terms), the group reduces its partial sums through shared
// memory, and lane 0 of the group writes the results.
//
// Parameters:
//   x_          - never read by this kernel; coordinates are fetched through
//                 the module-scope texture pos_tex instead.
//   lj1         - global table of 16-byte records; .x/.y are used in the
//                 force expression, .z is compared against the squared
//                 distance (pair is skipped unless lj1[idx].z > rsq).
//   lj3         - global table of 16-byte records; .x/.y/.z are used in the
//                 energy expression (only read when eflag > 0).
//   lj_types    - row length of the lj1/lj3 tables:
//                 idx = itype * lj_types + jtype.
//   sp_lj_in    - four floats copied to shared memory; selected per neighbor
//                 by the top two bits of the neighbor entry.
//   dev_nbor    - neighbor bookkeeping array, addressed with nbor_pitch.
//   dev_packed  - packed neighbor array; when it equals dev_nbor the
//                 neighbors are read from dev_nbor with stride nbor_pitch.
//   ans         - output, one 16-byte record per atom: {fx, fy, fz, 0}.
//   engv        - output, energy/virial terms, one float per atom per term,
//                 consecutive terms are inum floats apart.
//   eflag/vflag - energy / virial terms are computed and stored when > 0.
//   inum        - number of atoms processed; threads with ii >= inum skip
//                 the neighbor loop and the final stores.
//   nbor_pitch  - stride (in ints) between rows of dev_nbor.
//   t_per_atom  - threads cooperating on one atom.
//
// NOTE(review): the shared reduction below contains no bar.sync between
// steps; presumably it relies on the t_per_atom lanes of one atom executing
// in lock-step within a warp -- confirm against the host launch code.
// NOTE(review): red_acc rows are 512 bytes (128 floats) apart, so the layout
// presumably assumes at most 128 threads per block -- confirm.
//-----------------------------------------------------------
.entry kernel_pair (
        .param .u64 __cudaparm_kernel_pair_x_,
        .param .u64 __cudaparm_kernel_pair_lj1,
        .param .u64 __cudaparm_kernel_pair_lj3,
        .param .s32 __cudaparm_kernel_pair_lj_types,
        .param .u64 __cudaparm_kernel_pair_sp_lj_in,
        .param .u64 __cudaparm_kernel_pair_dev_nbor,
        .param .u64 __cudaparm_kernel_pair_dev_packed,
        .param .u64 __cudaparm_kernel_pair_ans,
        .param .u64 __cudaparm_kernel_pair_engv,
        .param .s32 __cudaparm_kernel_pair_eflag,
        .param .s32 __cudaparm_kernel_pair_vflag,
        .param .s32 __cudaparm_kernel_pair_inum,
        .param .s32 __cudaparm_kernel_pair_nbor_pitch,
        .param .s32 __cudaparm_kernel_pair_t_per_atom)
{
    .reg .u32 %r<72>;
    .reg .u64 %rd<62>;
    .reg .f32 %f<103>;
    .reg .pred %p<19>;
    // sp_lj: 4 floats copied from sp_lj_in.
    .shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
    // red_acc: 6 rows x 512 bytes of reduction scratch.
    .shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];
    // __cuda_local_var_32504_10_non_const_f = 48
    // __cuda_local_var_32508_9_non_const_virial = 16
    .loc 16 88 0
$LDWbegin_kernel_pair:
    // ---- Copy sp_lj_in[0..3] into shared memory (every thread stores the
    //      same four values).
    .loc 16 95 0
    ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
    ldu.global.f32 %f1, [%rd1+0];
    .loc 16 96 0
    ld.global.f32 %f2, [%rd1+4];
    .loc 16 97 0
    ld.global.f32 %f3, [%rd1+8];
    .loc 16 98 0
    ld.global.f32 %f4, [%rd1+12];
    st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
    // ---- Zero the six virial accumulators (%f6,%f8,%f10,%f12,%f14,%f16).
    .loc 16 107 0
    mov.f32 %f5, 0f00000000; // 0
    mov.f32 %f6, %f5;
    mov.f32 %f7, 0f00000000; // 0
    mov.f32 %f8, %f7;
    mov.f32 %f9, 0f00000000; // 0
    mov.f32 %f10, %f9;
    mov.f32 %f11, 0f00000000; // 0
    mov.f32 %f12, %f11;
    mov.f32 %f13, 0f00000000; // 0
    mov.f32 %f14, %f13;
    mov.f32 %f15, 0f00000000; // 0
    mov.f32 %f16, %f15;
    // ---- Thread decomposition:
    //      %r2 = tid, %r6 = tid % t_per_atom (lane within the atom group),
    //      %r9 = ii = tid / t_per_atom + ctaid * (ntid / t_per_atom).
    ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
    cvt.s32.u32 %r2, %tid.x;
    div.s32 %r3, %r2, %r1;
    cvt.s32.u32 %r4, %ntid.x;
    div.s32 %r5, %r4, %r1;
    rem.s32 %r6, %r2, %r1;
    cvt.s32.u32 %r7, %ctaid.x;
    mul.lo.s32 %r8, %r7, %r5;
    add.s32 %r9, %r3, %r8;
    ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
    setp.lt.s32 %p1, %r9, %r10;          // %p1 = (ii < inum); reused at the final store
    @!%p1 bra $Lt_0_19202;
    // ---- Locate this atom's neighbor slice.
    //      %rd7 = &dev_nbor[ii]; %rd8 = %rd7 + nbor_pitch ints (count in %r12);
    //      %rd9 = %rd8 + nbor_pitch ints.
    .loc 16 113 0
    ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
    cvt.s64.s32 %rd2, %r11;
    mul.wide.s32 %rd3, %r11, 4;
    cvt.s64.s32 %rd4, %r9;
    mul.wide.s32 %rd5, %r9, 4;
    ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
    add.u64 %rd7, %rd5, %rd6;
    add.u64 %rd8, %rd3, %rd7;
    ld.global.s32 %r12, [%rd8+0];
    add.u64 %rd9, %rd3, %rd8;
    ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
    setp.ne.u64 %p2, %rd10, %rd6;
    @%p2 bra $Lt_0_19714;
    // dev_packed == dev_nbor: neighbors live in dev_nbor, nbor_pitch apart.
    //   end   (%rd13) = %rd9 + count * nbor_pitch ints
    //   start (%rd16) = %rd9 + lane  * nbor_pitch ints
    //   step  (%r16)  = nbor_pitch * t_per_atom ints
    .loc 16 119 0
    cvt.s32.s64 %r13, %rd2;
    mul.lo.s32 %r14, %r13, %r12;
    cvt.s64.s32 %rd11, %r14;
    mul.wide.s32 %rd12, %r14, 4;
    add.u64 %rd13, %rd9, %rd12;
    .loc 16 120 0
    mul.lo.s32 %r15, %r6, %r13;
    cvt.s64.s32 %rd14, %r15;
    mul.wide.s32 %rd15, %r15, 4;
    add.u64 %rd16, %rd9, %rd15;
    .loc 16 121 0
    mul.lo.s32 %r16, %r13, %r1;
    bra.uni $Lt_0_19458;
$Lt_0_19714:
    // dev_packed != dev_nbor: *%rd9 is an offset into dev_packed.
    //   base  (%rd19) = dev_packed + offset ints
    //   end   (%rd13) = base + count ints
    //   start (%rd16) = base + lane ints
    //   step  (%r16)  = t_per_atom ints
    .loc 16 123 0
    ld.global.s32 %r17, [%rd9+0];
    cvt.s64.s32 %rd17, %r17;
    mul.wide.s32 %rd18, %r17, 4;
    add.u64 %rd19, %rd10, %rd18;
    .loc 16 124 0
    cvt.s64.s32 %rd20, %r12;
    mul.wide.s32 %rd21, %r12, 4;
    add.u64 %rd13, %rd19, %rd21;
    .loc 16 125 0
    mov.s32 %r16, %r1;
    .loc 16 126 0
    cvt.s64.s32 %rd22, %r6;
    mul.wide.s32 %rd23, %r6, 4;
    add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
    // ---- Fetch atom i: index from dev_nbor[ii], four floats from pos_tex
    //      into %f21..%f24.
    .loc 16 129 0
    ld.global.s32 %r18, [%rd7+0];
    mov.u32 %r19, %r18;
    mov.s32 %r20, 0;
    mov.u32 %r21, %r20;
    mov.s32 %r22, 0;
    mov.u32 %r23, %r22;
    mov.s32 %r24, 0;
    mov.u32 %r25, %r24;
    tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
    mov.f32 %f21, %f17;
    mov.f32 %f22, %f18;
    mov.f32 %f23, %f19;
    mov.f32 %f24, %f20;
    setp.ge.u64 %p3, %rd16, %rd13;       // empty slice for this lane
    @%p3 bra $Lt_0_28162;
    // %r28 = (int)pos_i.w * lj_types : row offset into lj1/lj3.
    cvt.rzi.ftz.s32.f32 %r26, %f24;
    cvt.s64.s32 %rd24, %r16;
    ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
    mul.lo.s32 %r28, %r27, %r26;
    ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
    // Force (%f27,%f26,%f25 = x,y,z) and energy (%f28) accumulators.
    mov.f32 %f25, 0f00000000; // 0
    mov.f32 %f26, 0f00000000; // 0
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, 0f00000000; // 0
    mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
    //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
    .loc 16 135 0
    ld.global.s32 %r29, [%rd16+0];       // packed neighbor entry
    // Top two bits of the entry select the sp_lj scale factor (%f29).
    .loc 16 136 0
    shr.s32 %r30, %r29, 30;
    and.b32 %r31, %r30, 3;
    cvt.s64.s32 %rd27, %r31;
    mul.wide.s32 %rd28, %r31, 4;
    add.u64 %rd29, %rd26, %rd28;
    ld.shared.f32 %f29, [%rd29+0];
    // Low 30 bits (mask 0x3FFFFFFF) are the neighbor's pos_tex index.
    .loc 16 139 0
    and.b32 %r32, %r29, 1073741823;
    mov.u32 %r33, %r32;
    mov.s32 %r34, 0;
    mov.u32 %r35, %r34;
    mov.s32 %r36, 0;
    mov.u32 %r37, %r36;
    mov.s32 %r38, 0;
    mov.u32 %r39, %r38;
    tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
    mov.f32 %f34, %f30;
    mov.f32 %f35, %f31;
    mov.f32 %f36, %f32;
    mov.f32 %f37, %f33;
    cvt.rzi.ftz.s32.f32 %r40, %f37;      // (int)pos_j.w
    // Displacement: %f39 = dx, %f38 = dy, %f40 = dz; %f43 = rsq.
    sub.ftz.f32 %f38, %f22, %f35;
    sub.ftz.f32 %f39, %f21, %f34;
    sub.ftz.f32 %f40, %f23, %f36;
    mul.ftz.f32 %f41, %f38, %f38;
    fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
    fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
    // %rd32 = &lj1[itype*lj_types + jtype]; %rd31 (byte offset) is reused for lj3.
    add.s32 %r41, %r40, %r28;
    cvt.s64.s32 %rd30, %r41;
    mul.wide.s32 %rd31, %r41, 16;
    add.u64 %rd32, %rd31, %rd25;
    ld.global.f32 %f44, [%rd32+8];
    setp.gt.ftz.f32 %p4, %f44, %f43;     // process pair only if lj1.z > rsq
    @!%p4 bra $Lt_0_21762;
    // r2inv = 1/rsq (%f45); r6inv = r2inv^3 (%f47); r3inv = sqrt(r6inv) (%f48);
    // %f55 = sp_lj * r2inv * r6inv * (lj1.x * r3inv - lj1.y).
    .loc 16 154 0
    rcp.approx.ftz.f32 %f45, %f43;
    mul.ftz.f32 %f46, %f45, %f45;
    mul.ftz.f32 %f47, %f45, %f46;
    sqrt.approx.ftz.f32 %f48, %f47;
    mul.ftz.f32 %f49, %f45, %f47;
    ld.global.v2.f32 {%f50,%f51}, [%rd32+0];
    mul.ftz.f32 %f52, %f50, %f48;
    sub.ftz.f32 %f53, %f52, %f51;
    mul.ftz.f32 %f54, %f49, %f53;
    mul.ftz.f32 %f55, %f29, %f54;
    .loc 16 156 0
    fma.rn.ftz.f32 %f27, %f39, %f55, %f27;   // fx += dx * force
    .loc 16 157 0
    fma.rn.ftz.f32 %f26, %f38, %f55, %f26;   // fy += dy * force
    .loc 16 158 0
    fma.rn.ftz.f32 %f25, %f40, %f55, %f25;   // fz += dz * force
    ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
    mov.u32 %r43, 0;
    setp.le.s32 %p5, %r42, %r43;
    @%p5 bra $Lt_0_21250;
    // energy += sp_lj * (r6inv * (lj3.x * r3inv - lj3.y) - lj3.z)
    .loc 16 162 0
    ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
    add.u64 %rd34, %rd33, %rd31;
    ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0];
    mul.ftz.f32 %f59, %f56, %f48;
    sub.ftz.f32 %f60, %f59, %f57;
    mul.ftz.f32 %f61, %f47, %f60;
    sub.ftz.f32 %f62, %f61, %f58;
    fma.rn.ftz.f32 %f28, %f29, %f62, %f28;
$Lt_0_21250:
    ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r45, 0;
    setp.le.s32 %p6, %r44, %r45;
    @%p6 bra $Lt_0_21762;
    // Virial terms, each += force * (component product):
    //   %f6: dx*dx  %f8: dy*dy  %f10: dz*dz  %f12: dx*dy  %f14: dx*dz  %f16: dy*dz
    .loc 16 165 0
    mov.f32 %f63, %f6;
    mul.ftz.f32 %f64, %f39, %f39;
    fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
    mov.f32 %f6, %f65;
    .loc 16 166 0
    mov.f32 %f66, %f8;
    fma.rn.ftz.f32 %f67, %f55, %f41, %f66;
    mov.f32 %f8, %f67;
    .loc 16 167 0
    mov.f32 %f68, %f10;
    mul.ftz.f32 %f69, %f40, %f40;
    fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
    mov.f32 %f10, %f70;
    .loc 16 168 0
    mov.f32 %f71, %f12;
    mul.ftz.f32 %f72, %f38, %f39;
    fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
    mov.f32 %f12, %f73;
    .loc 16 169 0
    mov.f32 %f74, %f14;
    mul.ftz.f32 %f75, %f39, %f40;
    fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
    mov.f32 %f14, %f76;
    .loc 16 170 0
    mul.ftz.f32 %f77, %f38, %f40;
    fma.rn.ftz.f32 %f15, %f55, %f77, %f15;
    mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
    // Advance to this lane's next neighbor (step ints) and loop until end.
    .loc 16 133 0
    mul.lo.u64 %rd35, %rd24, 4;
    add.u64 %rd16, %rd16, %rd35;
    setp.lt.u64 %p7, %rd16, %rd13;
    @%p7 bra $Lt_0_20482;
    bra.uni $Lt_0_18946;
$Lt_0_28162:
    // Lane has no neighbors: force/energy accumulators are zero.
    mov.f32 %f25, 0f00000000; // 0
    mov.f32 %f26, 0f00000000; // 0
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, 0f00000000; // 0
    bra.uni $Lt_0_18946;
$Lt_0_19202:
    // ii >= inum: force/energy accumulators are zero.
    mov.f32 %f25, 0f00000000; // 0
    mov.f32 %f26, 0f00000000; // 0
    mov.f32 %f27, 0f00000000; // 0
    mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
    // ---- Reduction across the t_per_atom lanes (skipped when t_per_atom <= 1).
    mov.u32 %r46, 1;
    setp.le.s32 %p8, %r1, %r46;
    @%p8 bra $Lt_0_24578;
    // Publish fx, fy, fz, energy into rows 0..3 of red_acc at column tid.
    .loc 16 181 0
    mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108;
    cvt.s64.s32 %rd37, %r2;
    mul.wide.s32 %rd38, %r2, 4;
    add.u64 %rd39, %rd36, %rd38;
    mov.f32 %f78, %f27;
    st.shared.f32 [%rd39+0], %f78;
    .loc 16 182 0
    mov.f32 %f79, %f26;
    st.shared.f32 [%rd39+512], %f79;
    .loc 16 183 0
    mov.f32 %f80, %f25;
    st.shared.f32 [%rd39+1024], %f80;
    .loc 16 184 0
    mov.f32 %f81, %f28;
    st.shared.f32 [%rd39+1536], %f81;
    // %r51 = t_per_atom / 2 (signed, rounds toward zero): first reduction offset.
    .loc 16 186 0
    shr.s32 %r47, %r1, 31;
    mov.s32 %r48, 1;
    and.b32 %r49, %r47, %r48;
    add.s32 %r50, %r49, %r1;
    shr.s32 %r51, %r50, 1;
    mov.s32 %r52, %r51;
    mov.u32 %r53, 0;
    setp.ne.u32 %p9, %r51, %r53;         // %p9 is reused for the virial reduction
    @!%p9 bra $Lt_0_23042;
$Lt_0_23554:
    // Lanes with lane < s add the value held by column tid + s, then s >>= 1.
    setp.ge.u32 %p10, %r6, %r52;
    @%p10 bra $Lt_0_23810;
    .loc 16 189 0
    add.u32 %r54, %r2, %r52;
    cvt.u64.u32 %rd40, %r54;
    mul.wide.u32 %rd41, %r54, 4;
    add.u64 %rd42, %rd36, %rd41;
    ld.shared.f32 %f82, [%rd42+0];
    add.ftz.f32 %f78, %f82, %f78;
    st.shared.f32 [%rd39+0], %f78;
    ld.shared.f32 %f83, [%rd42+512];
    add.ftz.f32 %f79, %f83, %f79;
    st.shared.f32 [%rd39+512], %f79;
    ld.shared.f32 %f84, [%rd42+1024];
    add.ftz.f32 %f80, %f84, %f80;
    st.shared.f32 [%rd39+1024], %f80;
    ld.shared.f32 %f85, [%rd42+1536];
    add.ftz.f32 %f81, %f85, %f81;
    st.shared.f32 [%rd39+1536], %f81;
$Lt_0_23810:
    .loc 16 186 0
    shr.u32 %r52, %r52, 1;
    mov.u32 %r55, 0;
    setp.ne.u32 %p11, %r52, %r55;
    @%p11 bra $Lt_0_23554;
$Lt_0_23042:
    // Reduced force/energy back into the accumulators.
    .loc 16 193 0
    mov.f32 %f27, %f78;
    .loc 16 194 0
    mov.f32 %f26, %f79;
    .loc 16 195 0
    mov.f32 %f25, %f80;
    .loc 16 196 0
    mov.f32 %f28, %f81;
    ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r57, 0;
    setp.le.s32 %p12, %r56, %r57;
    @%p12 bra $Lt_0_24578;
    // Same reduction for the six virial terms, using rows 0..5 of red_acc.
    .loc 16 200 0
    mov.f32 %f78, %f6;
    st.shared.f32 [%rd39+0], %f78;
    mov.f32 %f79, %f8;
    st.shared.f32 [%rd39+512], %f79;
    mov.f32 %f80, %f10;
    st.shared.f32 [%rd39+1024], %f80;
    mov.f32 %f81, %f12;
    st.shared.f32 [%rd39+1536], %f81;
    mov.f32 %f86, %f14;
    st.shared.f32 [%rd39+2048], %f86;
    mov.f32 %f87, %f16;
    st.shared.f32 [%rd39+2560], %f87;
    .loc 16 202 0
    mov.s32 %r58, %r51;
    @!%p9 bra $Lt_0_25090;
$Lt_0_25602:
    setp.ge.u32 %p13, %r6, %r58;
    @%p13 bra $Lt_0_25858;
    .loc 16 205 0
    add.u32 %r59, %r2, %r58;
    cvt.u64.u32 %rd43, %r59;
    mul.wide.u32 %rd44, %r59, 4;
    add.u64 %rd45, %rd36, %rd44;
    ld.shared.f32 %f88, [%rd45+0];
    add.ftz.f32 %f78, %f88, %f78;
    st.shared.f32 [%rd39+0], %f78;
    ld.shared.f32 %f89, [%rd45+512];
    add.ftz.f32 %f79, %f89, %f79;
    st.shared.f32 [%rd39+512], %f79;
    ld.shared.f32 %f90, [%rd45+1024];
    add.ftz.f32 %f80, %f90, %f80;
    st.shared.f32 [%rd39+1024], %f80;
    ld.shared.f32 %f91, [%rd45+1536];
    add.ftz.f32 %f81, %f91, %f81;
    st.shared.f32 [%rd39+1536], %f81;
    ld.shared.f32 %f92, [%rd45+2048];
    add.ftz.f32 %f86, %f92, %f86;
    st.shared.f32 [%rd39+2048], %f86;
    ld.shared.f32 %f93, [%rd45+2560];
    add.ftz.f32 %f87, %f93, %f87;
    st.shared.f32 [%rd39+2560], %f87;
$Lt_0_25858:
    .loc 16 202 0
    shr.u32 %r58, %r58, 1;
    mov.u32 %r60, 0;
    setp.ne.u32 %p14, %r58, %r60;
    @%p14 bra $Lt_0_25602;
$Lt_0_25090:
    .loc 16 210 0
    mov.f32 %f6, %f78;
    mov.f32 %f8, %f79;
    mov.f32 %f10, %f80;
    mov.f32 %f12, %f81;
    mov.f32 %f14, %f86;
    mov.f32 %f16, %f87;
$Lt_0_24578:
$Lt_0_22530:
    // ---- Output: only when (ii < inum) && (lane == 0).
    //      set.eq yields all-ones when true; neg turns that into 1.
    selp.s32 %r61, 1, 0, %p1;
    mov.s32 %r62, 0;
    set.eq.u32.s32 %r63, %r6, %r62;
    neg.s32 %r64, %r63;
    and.b32 %r65, %r61, %r64;
    mov.u32 %r66, 0;
    setp.eq.s32 %p15, %r65, %r66;
    @%p15 bra $Lt_0_26626;
    // %rd49 = &engv[ii]; it advances by inum floats after each stored term.
    .loc 16 216 0
    cvt.s64.s32 %rd46, %r9;
    ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
    mul.wide.s32 %rd48, %r9, 4;
    add.u64 %rd49, %rd47, %rd48;
    ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
    mov.u32 %r68, 0;
    setp.le.s32 %p16, %r67, %r68;
    @%p16 bra $Lt_0_27138;
    .loc 16 218 0
    st.global.f32 [%rd49+0], %f28;       // energy
    .loc 16 219 0
    cvt.s64.s32 %rd50, %r10;
    mul.wide.s32 %rd51, %r10, 4;
    add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
    ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
    mov.u32 %r70, 0;
    setp.le.s32 %p17, %r69, %r70;
    @%p17 bra $Lt_0_27650;
    // Six virial terms, stored inum floats apart.
    .loc 16 223 0
    mov.f32 %f94, %f6;
    st.global.f32 [%rd49+0], %f94;
    .loc 16 224 0
    cvt.s64.s32 %rd52, %r10;
    mul.wide.s32 %rd53, %r10, 4;
    add.u64 %rd54, %rd53, %rd49;
    .loc 16 223 0
    mov.f32 %f95, %f8;
    st.global.f32 [%rd54+0], %f95;
    .loc 16 224 0
    add.u64 %rd55, %rd53, %rd54;
    .loc 16 223 0
    mov.f32 %f96, %f10;
    st.global.f32 [%rd55+0], %f96;
    .loc 16 224 0
    add.u64 %rd56, %rd53, %rd55;
    .loc 16 223 0
    mov.f32 %f97, %f12;
    st.global.f32 [%rd56+0], %f97;
    .loc 16 224 0
    add.u64 %rd49, %rd53, %rd56;
    .loc 16 223 0
    mov.f32 %f98, %f14;
    st.global.f32 [%rd49+0], %f98;
    mov.f32 %f99, %f16;
    add.u64 %rd57, %rd53, %rd49;
    st.global.f32 [%rd57+0], %f99;
$Lt_0_27650:
    // ans[ii] = {fx, fy, fz, 0}.
    // The generated code copied the fourth component from %f101, a register
    // that is never written in this entry, so an undefined value was stored.
    // Store an explicit zero instead so the output record is deterministic.
    .loc 16 227 0
    ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
    mul.lo.u64 %rd59, %rd46, 16;
    add.u64 %rd60, %rd58, %rd59;
    mov.f32 %f100, 0f00000000; // 0
    st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100};
$Lt_0_26626:
    .loc 16 229 0
    exit;
$LDWend_kernel_pair:
} // kernel_pair
|
||||
|
||||
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<74>;
|
||||
.reg .f32 %f<109>;
|
||||
.reg .pred %p<22>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32658_10_non_const_f = 48
|
||||
// __cuda_local_var_32662_9_non_const_virial = 16
|
||||
.loc 16 237 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_21250;
|
||||
.loc 16 247 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_21250:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
|
||||
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21762;
|
||||
.loc 16 249 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22274;
|
||||
.loc 16 251 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22274:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
|
||||
$Lt_1_21762:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
|
||||
.loc 16 261 0
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 263 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
rem.s32 %r10, %r1, %r6;
|
||||
cvt.s32.u32 %r11, %ctaid.x;
|
||||
mul.lo.s32 %r12, %r11, %r9;
|
||||
add.s32 %r13, %r7, %r12;
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.lt.s32 %p4, %r13, %r14;
|
||||
@!%p4 bra $Lt_1_23042;
|
||||
.loc 16 269 0
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r15;
|
||||
mul.wide.s32 %rd18, %r15, 4;
|
||||
cvt.s64.s32 %rd19, %r13;
|
||||
mul.wide.s32 %rd20, %r13, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r16, [%rd23+0];
|
||||
add.u64 %rd24, %rd18, %rd23;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd25, %rd21;
|
||||
@%p5 bra $Lt_1_23554;
|
||||
.loc 16 275 0
|
||||
cvt.s32.s64 %r17, %rd17;
|
||||
mul.lo.s32 %r18, %r17, %r16;
|
||||
cvt.s64.s32 %rd26, %r18;
|
||||
mul.wide.s32 %rd27, %r18, 4;
|
||||
add.u64 %rd28, %rd24, %rd27;
|
||||
.loc 16 276 0
|
||||
mul.lo.s32 %r19, %r10, %r17;
|
||||
cvt.s64.s32 %rd29, %r19;
|
||||
mul.wide.s32 %rd30, %r19, 4;
|
||||
add.u64 %rd31, %rd24, %rd30;
|
||||
.loc 16 277 0
|
||||
mul.lo.s32 %r20, %r17, %r6;
|
||||
bra.uni $Lt_1_23298;
|
||||
$Lt_1_23554:
|
||||
.loc 16 279 0
|
||||
ld.global.s32 %r21, [%rd24+0];
|
||||
cvt.s64.s32 %rd32, %r21;
|
||||
mul.wide.s32 %rd33, %r21, 4;
|
||||
add.u64 %rd34, %rd25, %rd33;
|
||||
.loc 16 280 0
|
||||
cvt.s64.s32 %rd35, %r16;
|
||||
mul.wide.s32 %rd36, %r16, 4;
|
||||
add.u64 %rd28, %rd34, %rd36;
|
||||
.loc 16 281 0
|
||||
mov.s32 %r20, %r6;
|
||||
.loc 16 282 0
|
||||
cvt.s64.s32 %rd37, %r10;
|
||||
mul.wide.s32 %rd38, %r10, 4;
|
||||
add.u64 %rd31, %rd34, %rd38;
|
||||
$Lt_1_23298:
|
||||
.loc 16 285 0
|
||||
ld.global.s32 %r22, [%rd22+0];
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
mov.s32 %r28, 0;
|
||||
mov.u32 %r29, %r28;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
setp.ge.u64 %p6, %rd31, %rd28;
|
||||
@%p6 bra $Lt_1_32002;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f29;
|
||||
cvt.s64.s32 %rd39, %r20;
|
||||
mul.lo.s32 %r31, %r30, 11;
|
||||
cvt.rn.f32.s32 %f30, %r31;
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
//<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 292 0
|
||||
ld.global.s32 %r32, [%rd31+0];
|
||||
.loc 16 296 0
|
||||
and.b32 %r33, %r32, 1073741823;
|
||||
mov.u32 %r34, %r33;
|
||||
mov.s32 %r35, 0;
|
||||
mov.u32 %r36, %r35;
|
||||
mov.s32 %r37, 0;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}];
|
||||
mov.f32 %f39, %f35;
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
sub.ftz.f32 %f43, %f27, %f40;
|
||||
sub.ftz.f32 %f44, %f26, %f39;
|
||||
sub.ftz.f32 %f45, %f28, %f41;
|
||||
mul.ftz.f32 %f46, %f43, %f43;
|
||||
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
add.ftz.f32 %f49, %f30, %f42;
|
||||
cvt.rzi.ftz.s32.f32 %r41, %f49;
|
||||
cvt.s64.s32 %rd40, %r41;
|
||||
mul.wide.s32 %rd41, %r41, 16;
|
||||
add.u64 %rd42, %rd41, %rd7;
|
||||
ld.shared.f32 %f50, [%rd42+8];
|
||||
setp.gt.ftz.f32 %p7, %f50, %f48;
|
||||
@!%p7 bra $Lt_1_25602;
|
||||
.loc 16 309 0
|
||||
rcp.approx.ftz.f32 %f51, %f48;
|
||||
mul.ftz.f32 %f52, %f51, %f51;
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
sqrt.approx.ftz.f32 %f54, %f53;
|
||||
mul.ftz.f32 %f55, %f51, %f53;
|
||||
ld.shared.v2.f32 {%f56,%f57}, [%rd42+0];
|
||||
mul.ftz.f32 %f58, %f56, %f54;
|
||||
sub.ftz.f32 %f59, %f58, %f57;
|
||||
mul.ftz.f32 %f60, %f55, %f59;
|
||||
.loc 16 311 0
|
||||
fma.rn.ftz.f32 %f33, %f44, %f60, %f33;
|
||||
.loc 16 312 0
|
||||
fma.rn.ftz.f32 %f32, %f43, %f60, %f32;
|
||||
.loc 16 313 0
|
||||
fma.rn.ftz.f32 %f31, %f45, %f60, %f31;
|
||||
ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r43, 0;
|
||||
setp.le.s32 %p8, %r42, %r43;
|
||||
@%p8 bra $Lt_1_25090;
|
||||
.loc 16 316 0
|
||||
add.u64 %rd43, %rd41, %rd13;
|
||||
ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd43+0];
|
||||
mul.ftz.f32 %f64, %f61, %f54;
|
||||
sub.ftz.f32 %f65, %f64, %f62;
|
||||
mul.ftz.f32 %f66, %f53, %f65;
|
||||
.loc 16 317 0
|
||||
shr.s32 %r44, %r32, 30;
|
||||
and.b32 %r45, %r44, 3;
|
||||
cvt.s64.s32 %rd44, %r45;
|
||||
mul.wide.s32 %rd45, %r45, 4;
|
||||
add.u64 %rd46, %rd1, %rd45;
|
||||
ld.shared.f32 %f67, [%rd46+0];
|
||||
sub.ftz.f32 %f68, %f66, %f63;
|
||||
fma.rn.ftz.f32 %f34, %f67, %f68, %f34;
|
||||
$Lt_1_25090:
|
||||
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r47, 0;
|
||||
setp.le.s32 %p9, %r46, %r47;
|
||||
@%p9 bra $Lt_1_25602;
|
||||
.loc 16 320 0
|
||||
mov.f32 %f69, %f11;
|
||||
mul.ftz.f32 %f70, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f71, %f60, %f70, %f69;
|
||||
mov.f32 %f11, %f71;
|
||||
.loc 16 321 0
|
||||
mov.f32 %f72, %f13;
|
||||
fma.rn.ftz.f32 %f73, %f60, %f46, %f72;
|
||||
mov.f32 %f13, %f73;
|
||||
.loc 16 322 0
|
||||
mov.f32 %f74, %f15;
|
||||
mul.ftz.f32 %f75, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f76, %f60, %f75, %f74;
|
||||
mov.f32 %f15, %f76;
|
||||
.loc 16 323 0
|
||||
mov.f32 %f77, %f17;
|
||||
mul.ftz.f32 %f78, %f43, %f44;
|
||||
fma.rn.ftz.f32 %f79, %f60, %f78, %f77;
|
||||
mov.f32 %f17, %f79;
|
||||
.loc 16 324 0
|
||||
mov.f32 %f80, %f19;
|
||||
mul.ftz.f32 %f81, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f82, %f60, %f81, %f80;
|
||||
mov.f32 %f19, %f82;
|
||||
.loc 16 325 0
|
||||
mul.ftz.f32 %f83, %f43, %f45;
|
||||
fma.rn.ftz.f32 %f20, %f60, %f83, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25602:
|
||||
$Lt_1_24578:
|
||||
.loc 16 290 0
|
||||
mul.lo.u64 %rd47, %rd39, 4;
|
||||
add.u64 %rd31, %rd31, %rd47;
|
||||
setp.lt.u64 %p10, %rd31, %rd28;
|
||||
@%p10 bra $Lt_1_24322;
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_32002:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_23042:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_22786:
|
||||
mov.u32 %r48, 1;
|
||||
setp.le.s32 %p11, %r6, %r48;
|
||||
@%p11 bra $Lt_1_28418;
|
||||
.loc 16 336 0
|
||||
mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd49, %r1;
|
||||
mul.wide.s32 %rd50, %r1, 4;
|
||||
add.u64 %rd51, %rd48, %rd50;
|
||||
mov.f32 %f84, %f33;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
.loc 16 337 0
|
||||
mov.f32 %f85, %f32;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
.loc 16 338 0
|
||||
mov.f32 %f86, %f31;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
.loc 16 339 0
|
||||
mov.f32 %f87, %f34;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
.loc 16 341 0
|
||||
shr.s32 %r49, %r6, 31;
|
||||
mov.s32 %r50, 1;
|
||||
and.b32 %r51, %r49, %r50;
|
||||
add.s32 %r52, %r51, %r6;
|
||||
shr.s32 %r53, %r52, 1;
|
||||
mov.s32 %r54, %r53;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p12, %r53, %r55;
|
||||
@!%p12 bra $Lt_1_26882;
|
||||
$Lt_1_27394:
|
||||
setp.ge.u32 %p13, %r10, %r54;
|
||||
@%p13 bra $Lt_1_27650;
|
||||
.loc 16 344 0
|
||||
add.u32 %r56, %r1, %r54;
|
||||
cvt.u64.u32 %rd52, %r56;
|
||||
mul.wide.u32 %rd53, %r56, 4;
|
||||
add.u64 %rd54, %rd48, %rd53;
|
||||
ld.shared.f32 %f88, [%rd54+0];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
ld.shared.f32 %f89, [%rd54+512];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
ld.shared.f32 %f90, [%rd54+1024];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
ld.shared.f32 %f91, [%rd54+1536];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
$Lt_1_27650:
|
||||
.loc 16 341 0
|
||||
shr.u32 %r54, %r54, 1;
|
||||
mov.u32 %r57, 0;
|
||||
setp.ne.u32 %p14, %r54, %r57;
|
||||
@%p14 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
.loc 16 348 0
|
||||
mov.f32 %f33, %f84;
|
||||
.loc 16 349 0
|
||||
mov.f32 %f32, %f85;
|
||||
.loc 16 350 0
|
||||
mov.f32 %f31, %f86;
|
||||
.loc 16 351 0
|
||||
mov.f32 %f34, %f87;
|
||||
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r59, 0;
|
||||
setp.le.s32 %p15, %r58, %r59;
|
||||
@%p15 bra $Lt_1_28418;
|
||||
.loc 16 355 0
|
||||
mov.f32 %f84, %f11;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
mov.f32 %f85, %f13;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
mov.f32 %f86, %f15;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
mov.f32 %f87, %f17;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
mov.f32 %f92, %f19;
|
||||
st.shared.f32 [%rd51+2048], %f92;
|
||||
mov.f32 %f93, %f21;
|
||||
st.shared.f32 [%rd51+2560], %f93;
|
||||
.loc 16 357 0
|
||||
mov.s32 %r60, %r53;
|
||||
@!%p12 bra $Lt_1_28930;
|
||||
$Lt_1_29442:
|
||||
setp.ge.u32 %p16, %r10, %r60;
|
||||
@%p16 bra $Lt_1_29698;
|
||||
.loc 16 360 0
|
||||
add.u32 %r61, %r1, %r60;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd48, %rd56;
|
||||
ld.shared.f32 %f94, [%rd57+0];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
ld.shared.f32 %f95, [%rd57+512];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
ld.shared.f32 %f96, [%rd57+1024];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
ld.shared.f32 %f97, [%rd57+1536];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
ld.shared.f32 %f98, [%rd57+2048];
|
||||
add.ftz.f32 %f92, %f98, %f92;
|
||||
st.shared.f32 [%rd51+2048], %f92;
|
||||
ld.shared.f32 %f99, [%rd57+2560];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd51+2560], %f93;
|
||||
$Lt_1_29698:
|
||||
.loc 16 357 0
|
||||
shr.u32 %r60, %r60, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p17, %r60, %r62;
|
||||
@%p17 bra $Lt_1_29442;
|
||||
$Lt_1_28930:
|
||||
.loc 16 365 0
|
||||
mov.f32 %f11, %f84;
|
||||
mov.f32 %f13, %f85;
|
||||
mov.f32 %f15, %f86;
|
||||
mov.f32 %f17, %f87;
|
||||
mov.f32 %f19, %f92;
|
||||
mov.f32 %f21, %f93;
|
||||
$Lt_1_28418:
|
||||
$Lt_1_26370:
|
||||
selp.s32 %r63, 1, 0, %p4;
|
||||
mov.s32 %r64, 0;
|
||||
set.eq.u32.s32 %r65, %r10, %r64;
|
||||
neg.s32 %r66, %r65;
|
||||
and.b32 %r67, %r63, %r66;
|
||||
mov.u32 %r68, 0;
|
||||
setp.eq.s32 %p18, %r67, %r68;
|
||||
@%p18 bra $Lt_1_30466;
|
||||
.loc 16 371 0
|
||||
cvt.s64.s32 %rd58, %r13;
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
|
||||
mul.wide.s32 %rd60, %r13, 4;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30978;
|
||||
.loc 16 373 0
|
||||
st.global.f32 [%rd61+0], %f34;
|
||||
.loc 16 374 0
|
||||
cvt.s64.s32 %rd62, %r14;
|
||||
mul.wide.s32 %rd63, %r14, 4;
|
||||
add.u64 %rd61, %rd61, %rd63;
|
||||
$Lt_1_30978:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31490;
|
||||
.loc 16 378 0
|
||||
mov.f32 %f100, %f11;
|
||||
st.global.f32 [%rd61+0], %f100;
|
||||
.loc 16 379 0
|
||||
cvt.s64.s32 %rd64, %r14;
|
||||
mul.wide.s32 %rd65, %r14, 4;
|
||||
add.u64 %rd66, %rd65, %rd61;
|
||||
.loc 16 378 0
|
||||
mov.f32 %f101, %f13;
|
||||
st.global.f32 [%rd66+0], %f101;
|
||||
.loc 16 379 0
|
||||
add.u64 %rd67, %rd65, %rd66;
|
||||
.loc 16 378 0
|
||||
mov.f32 %f102, %f15;
|
||||
st.global.f32 [%rd67+0], %f102;
|
||||
.loc 16 379 0
|
||||
add.u64 %rd68, %rd65, %rd67;
|
||||
.loc 16 378 0
|
||||
mov.f32 %f103, %f17;
|
||||
st.global.f32 [%rd68+0], %f103;
|
||||
.loc 16 379 0
|
||||
add.u64 %rd61, %rd65, %rd68;
|
||||
.loc 16 378 0
|
||||
mov.f32 %f104, %f19;
|
||||
st.global.f32 [%rd61+0], %f104;
|
||||
mov.f32 %f105, %f21;
|
||||
add.u64 %rd69, %rd65, %rd61;
|
||||
st.global.f32 [%rd69+0], %f105;
|
||||
$Lt_1_31490:
|
||||
.loc 16 382 0
|
||||
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd71, %rd58, 16;
|
||||
add.u64 %rd72, %rd70, %rd71;
|
||||
mov.f32 %f106, %f107;
|
||||
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
|
||||
$Lt_1_30466:
|
||||
.loc 16 384 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
|
@ -1,979 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bd91_00000000-9_lj_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.gvU1PY)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bd91_00000000-8_lj_cut_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lj_cut_gpu_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
// kernel_pair: PTX entry point with fourteen parameters (eight .u64, six .s32).
// Parameter use as visible in this kernel's body:
//   x_          - declared but never loaded here; coordinates come from the pos_tex texture.
//   lj1, lj3    - base addresses of tables read with a 16-byte stride per entry.
//   lj_types    - multiplier applied to the integer taken from atom i's 4th texture lane.
//   sp_lj_in    - address of four floats copied into shared memory on entry.
//   dev_nbor    - base address of the per-atom neighbor bookkeeping (row stride = nbor_pitch ints).
//   dev_packed  - compared against dev_nbor to select one of two neighbor-list layouts.
//   ans         - output, one 16-byte (v4.f32) record per atom index.
//   engv        - output, scalars written inum*4 bytes apart.
//   eflag/vflag - features are enabled when the value is > 0.
//   inum        - upper bound on the atom index handled by a thread group.
//   nbor_pitch  - row stride, in 32-bit elements, of dev_nbor.
//   t_per_atom  - number of consecutive threads that cooperate on one atom index.
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
// Virtual register files for the whole kernel.
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<62>;
|
||||
.reg .f32 %f<102>;
|
||||
.reg .pred %p<19>;
|
||||
// 16-byte shared buffer: holds the four floats copied from sp_lj_in.
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
|
||||
// 3072-byte shared scratch used by the reductions; it is addressed as six rows
// spaced 512 bytes (128 floats) apart.
.shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32504_10_non_const_f = 48
|
||||
// __cuda_local_var_32508_9_non_const_virial = 16
|
||||
.loc 16 88 0
|
||||
// Kernel prologue: stage sp_lj_in in shared memory, clear six accumulators,
// and map this thread to an atom index.
$LDWbegin_kernel_pair:
|
||||
.loc 16 95 0
|
||||
// Read four consecutive floats from sp_lj_in and store them as one v4 into the
// shared sp_lj buffer. Every thread executes this store with the same data.
// NOTE(review): no bar.sync follows in this kernel before the buffer is read.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 96 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 97 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 98 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 107 0
|
||||
// Zero the six accumulators %f6, %f8, %f10, %f12, %f14, %f16 (updated only when vflag > 0).
// %f15 is also kept live: the last accumulator is updated through %f15 and copied to %f16.
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
// Thread-to-atom mapping (all signed 32-bit arithmetic):
//   %r1 = t_per_atom, %r2 = tid.x
//   %r3 = tid.x / t_per_atom       (atom slot within the block)
//   %r5 = ntid.x / t_per_atom      (atom slots per block)
//   %r6 = tid.x % t_per_atom       (this thread's position within its group)
//   %r9 = %r3 + ctaid.x * %r5      (global atom index)
// NOTE(review): t_per_atom == 0 would make div/rem undefined; presumably the host guarantees >= 1.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
rem.s32 %r6, %r2, %r1;
|
||||
cvt.s32.u32 %r7, %ctaid.x;
|
||||
mul.lo.s32 %r8, %r7, %r5;
|
||||
add.s32 %r9, %r3, %r8;
|
||||
// %p1 = (atom index < inum). Threads past the end skip to $Lt_0_19202, which
// zero-fills %f25..%f28 and then still runs the reduction code.
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
|
||||
setp.lt.s32 %p1, %r9, %r10;
|
||||
@!%p1 bra $Lt_0_19202;
|
||||
// Neighbor-list pointer setup. Produces three values consumed by the pair loop:
//   %rd16 = address of this thread's first list entry
//   %rd13 = end address (exclusive) of the list
//   %r16  = stride between this thread's successive entries, in 32-bit elements
.loc 16 113 0
|
||||
// %rd3 = nbor_pitch * 4 (row stride in bytes); %rd7 = dev_nbor + atom_index * 4.
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r11;
|
||||
mul.wide.s32 %rd3, %r11, 4;
|
||||
cvt.s64.s32 %rd4, %r9;
|
||||
mul.wide.s32 %rd5, %r9, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
// %r12 = value one row below %rd7; it is used as the entry count when computing
// the end address in both branches below. %rd9 points one further row down.
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r12, [%rd8+0];
|
||||
add.u64 %rd9, %rd3, %rd8;
|
||||
// Choose the layout: fall through when dev_packed == dev_nbor, otherwise take $Lt_0_19714.
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
|
||||
setp.ne.u64 %p2, %rd10, %rd6;
|
||||
@%p2 bra $Lt_0_19714;
|
||||
.loc 16 119 0
|
||||
// Layout A (dev_packed == dev_nbor): entries live in dev_nbor itself, one row per entry.
//   end    = %rd9 + (nbor_pitch * count) * 4
//   start  = %rd9 + (group_pos * nbor_pitch) * 4
//   stride = nbor_pitch * t_per_atom
// The index products are formed in 32 bits before being widened.
cvt.s32.s64 %r13, %rd2;
|
||||
mul.lo.s32 %r14, %r13, %r12;
|
||||
cvt.s64.s32 %rd11, %r14;
|
||||
mul.wide.s32 %rd12, %r14, 4;
|
||||
add.u64 %rd13, %rd9, %rd12;
|
||||
.loc 16 120 0
|
||||
mul.lo.s32 %r15, %r6, %r13;
|
||||
cvt.s64.s32 %rd14, %r15;
|
||||
mul.wide.s32 %rd15, %r15, 4;
|
||||
add.u64 %rd16, %rd9, %rd15;
|
||||
.loc 16 121 0
|
||||
mul.lo.s32 %r16, %r13, %r1;
|
||||
bra.uni $Lt_0_19458;
|
||||
$Lt_0_19714:
|
||||
.loc 16 123 0
|
||||
// Layout B (separate dev_packed array): the word at %rd9 is an element offset into dev_packed.
//   base   = dev_packed + offset * 4
//   end    = base + count * 4
//   start  = base + group_pos * 4
//   stride = t_per_atom
ld.global.s32 %r17, [%rd9+0];
|
||||
cvt.s64.s32 %rd17, %r17;
|
||||
mul.wide.s32 %rd18, %r17, 4;
|
||||
add.u64 %rd19, %rd10, %rd18;
|
||||
.loc 16 124 0
|
||||
cvt.s64.s32 %rd20, %r12;
|
||||
mul.wide.s32 %rd21, %r12, 4;
|
||||
add.u64 %rd13, %rd19, %rd21;
|
||||
.loc 16 125 0
|
||||
mov.s32 %r16, %r1;
|
||||
.loc 16 126 0
|
||||
cvt.s64.s32 %rd22, %r6;
|
||||
mul.wide.s32 %rd23, %r6, 4;
|
||||
add.u64 %rd16, %rd19, %rd23;
|
||||
// Join point of both neighbor layouts: fetch atom i's record and set up the pair loop.
$Lt_0_19458:
|
||||
.loc 16 129 0
|
||||
// %r18 = first word of this atom's column in dev_nbor; it is the texel index of atom i.
// The other three coordinates of the 1D fetch are zero.
ld.global.s32 %r18, [%rd7+0];
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
// Four-lane fetch for atom i: %f21, %f22, %f23 are the three lanes later differenced
// against atom j; %f24 is converted to an integer below.
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
// Empty list for this thread (start >= end, unsigned): jump to the zero-fill path.
setp.ge.u64 %p3, %rd16, %rd13;
|
||||
@%p3 bra $Lt_0_28162;
|
||||
// Loop-invariant values:
//   %r28 = lj_types * trunc(%f24)   (row offset into the lj1/lj3 tables)
//   %rd24 = stride (sign-extended), %rd25 = lj1 base, %rd26 = address of shared sp_lj.
cvt.rzi.ftz.s32.f32 %r26, %f24;
|
||||
cvt.s64.s32 %rd24, %r16;
|
||||
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r28, %r27, %r26;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
|
||||
// Per-thread accumulators: %f27, %f26, %f25 end up as lanes 0..2 of the ans record;
// %f28 is the scalar written to engv when eflag > 0.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
|
||||
// Pair-loop head: decode one neighbor entry, fetch atom j, and test the squared
// distance against the per-type-pair cutoff.
$Lt_0_20482:
|
||||
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 135 0
|
||||
// %r29 = packed neighbor entry.
ld.global.s32 %r29, [%rd16+0];
|
||||
.loc 16 136 0
|
||||
// The top two bits of the entry select one of the four sp_lj factors: %f29 = sp_lj[(entry >> 30) & 3].
shr.s32 %r30, %r29, 30;
|
||||
and.b32 %r31, %r30, 3;
|
||||
cvt.s64.s32 %rd27, %r31;
|
||||
mul.wide.s32 %rd28, %r31, 4;
|
||||
add.u64 %rd29, %rd26, %rd28;
|
||||
ld.shared.f32 %f29, [%rd29+0];
|
||||
.loc 16 139 0
|
||||
// The low 30 bits (mask 1073741823 = 2^30 - 1) are the texel index of atom j.
and.b32 %r32, %r29, 1073741823;
|
||||
mov.u32 %r33, %r32;
|
||||
mov.s32 %r34, 0;
|
||||
mov.u32 %r35, %r34;
|
||||
mov.s32 %r36, 0;
|
||||
mov.u32 %r37, %r36;
|
||||
mov.s32 %r38, 0;
|
||||
mov.u32 %r39, %r38;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
// %r40 = trunc(4th lane of atom j), used as the column index into the tables.
cvt.rzi.ftz.s32.f32 %r40, %f37;
|
||||
// Lane differences i - j: %f39 = lane 0, %f38 = lane 1, %f40 = lane 2.
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
// %f41 = %f38^2 (reused later when vflag > 0); %f43 = %f39^2 + %f38^2 + %f40^2.
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
// Table entry: %rd31 = (%r40 + %r28) * 16 bytes; %rd32 = lj1 + %rd31.
add.s32 %r41, %r40, %r28;
|
||||
cvt.s64.s32 %rd30, %r41;
|
||||
mul.wide.s32 %rd31, %r41, 16;
|
||||
add.u64 %rd32, %rd31, %rd25;
|
||||
// The third float of the lj1 entry (+8) is compared with the squared distance;
// the pair is skipped unless entry[2] > %f43.
ld.global.f32 %f44, [%rd32+8];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21762;
|
||||
// Pair inside the cutoff: compute the scalar pair factor %f54 and accumulate.
.loc 16 153 0
|
||||
// With r2inv = approximate 1/%f43:  %f47 = r2inv^3,  %f48 = r2inv^4.
rcp.approx.ftz.f32 %f45, %f43;
|
||||
mul.ftz.f32 %f46, %f45, %f45;
|
||||
mul.ftz.f32 %f47, %f45, %f46;
|
||||
mul.ftz.f32 %f48, %f45, %f47;
|
||||
// %f49, %f50 = first two floats of the lj1 entry.
// %f54 = sp_lj factor * r2inv^4 * (%f49 * r2inv^3 - %f50)
ld.global.v2.f32 {%f49,%f50}, [%rd32+0];
|
||||
mul.ftz.f32 %f51, %f49, %f47;
|
||||
sub.ftz.f32 %f52, %f51, %f50;
|
||||
mul.ftz.f32 %f53, %f48, %f52;
|
||||
mul.ftz.f32 %f54, %f29, %f53;
|
||||
.loc 16 155 0
|
||||
// Accumulate difference * factor per lane into %f27, %f26, %f25.
fma.rn.ftz.f32 %f27, %f39, %f54, %f27;
|
||||
.loc 16 156 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f54, %f26;
|
||||
.loc 16 157 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f54, %f25;
|
||||
// Skip the lj3 term when eflag <= 0.
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r43, 0;
|
||||
setp.le.s32 %p5, %r42, %r43;
|
||||
@%p5 bra $Lt_0_21250;
|
||||
.loc 16 161 0
|
||||
// lj3 entry at the same byte offset as the lj1 entry; its 4th float is discarded.
// %f28 += sp_lj factor * (r2inv^3 * (%f55 * r2inv^3 - %f56) - %f57)
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd34, %rd33, %rd31;
|
||||
ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0];
|
||||
mul.ftz.f32 %f58, %f55, %f47;
|
||||
sub.ftz.f32 %f59, %f58, %f56;
|
||||
mul.ftz.f32 %f60, %f47, %f59;
|
||||
sub.ftz.f32 %f61, %f60, %f57;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f61, %f28;
|
||||
$Lt_0_21250:
|
||||
// Skip the six-product accumulation when vflag <= 0.
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p6, %r44, %r45;
|
||||
@%p6 bra $Lt_0_21762;
|
||||
// vflag > 0: accumulate factor * (product of two lane differences) into six registers.
// With d0 = %f39, d1 = %f38, d2 = %f40 and factor = %f54:
//   %f6  += factor * d0*d0      %f8  += factor * d1*d1      %f10 += factor * d2*d2
//   %f12 += factor * d0*d1      %f14 += factor * d0*d2      %f16 += factor * d1*d2
.loc 16 164 0
|
||||
mov.f32 %f62, %f6;
|
||||
mul.ftz.f32 %f63, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f64, %f54, %f63, %f62;
|
||||
mov.f32 %f6, %f64;
|
||||
.loc 16 165 0
|
||||
// d1*d1 was already computed as %f41 for the distance.
mov.f32 %f65, %f8;
|
||||
fma.rn.ftz.f32 %f66, %f54, %f41, %f65;
|
||||
mov.f32 %f8, %f66;
|
||||
.loc 16 166 0
|
||||
mov.f32 %f67, %f10;
|
||||
mul.ftz.f32 %f68, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f69, %f54, %f68, %f67;
|
||||
mov.f32 %f10, %f69;
|
||||
.loc 16 167 0
|
||||
mov.f32 %f70, %f12;
|
||||
mul.ftz.f32 %f71, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f72, %f54, %f71, %f70;
|
||||
mov.f32 %f12, %f72;
|
||||
.loc 16 168 0
|
||||
mov.f32 %f73, %f14;
|
||||
mul.ftz.f32 %f74, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f75, %f54, %f74, %f73;
|
||||
mov.f32 %f14, %f75;
|
||||
.loc 16 169 0
|
||||
// The sixth sum is carried in %f15 across iterations and mirrored into %f16.
mul.ftz.f32 %f76, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f54, %f76, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
// Loop tail: landing point for skipped pairs and for the vflag <= 0 case.
$Lt_0_21762:
|
||||
$Lt_0_20738:
|
||||
.loc 16 133 0
|
||||
// Advance this thread's entry pointer by stride * 4 bytes and repeat while it is
// below the end address (unsigned compare).
mul.lo.u64 %rd35, %rd24, 4;
|
||||
add.u64 %rd16, %rd16, %rd35;
|
||||
setp.lt.u64 %p7, %rd16, %rd13;
|
||||
@%p7 bra $Lt_0_20482;
|
||||
bra.uni $Lt_0_18946;
|
||||
// Reached when the thread's slice of the neighbor list is empty: accumulators are zero.
$Lt_0_28162:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
bra.uni $Lt_0_18946;
|
||||
// Reached when the atom index is >= inum: accumulators are zero, and the thread
// falls through into the reduction code that follows.
$Lt_0_19202:
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
// Reduction of %f27, %f26, %f25, %f28 across the t_per_atom threads of a group,
// through the shared red_acc scratch. Skipped entirely when t_per_atom <= 1.
$Lt_0_18946:
|
||||
mov.u32 %r46, 1;
|
||||
setp.le.s32 %p8, %r1, %r46;
|
||||
@%p8 bra $Lt_0_24578;
|
||||
.loc 16 180 0
|
||||
// %rd39 = &red_acc[tid.x] (4-byte elements). The four values go into rows 0..3,
// spaced 512 bytes apart.
// NOTE(review): 512-byte rows hold 128 floats, so this presumably assumes at most
// 128 threads per block - confirm against the launch configuration.
mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd37, %r2;
|
||||
mul.wide.s32 %rd38, %r2, 4;
|
||||
add.u64 %rd39, %rd36, %rd38;
|
||||
mov.f32 %f77, %f27;
|
||||
st.shared.f32 [%rd39+0], %f77;
|
||||
.loc 16 181 0
|
||||
mov.f32 %f78, %f26;
|
||||
st.shared.f32 [%rd39+512], %f78;
|
||||
.loc 16 182 0
|
||||
mov.f32 %f79, %f25;
|
||||
st.shared.f32 [%rd39+1024], %f79;
|
||||
.loc 16 183 0
|
||||
mov.f32 %f80, %f28;
|
||||
st.shared.f32 [%rd39+1536], %f80;
|
||||
.loc 16 185 0
|
||||
// %r51 = t_per_atom / 2 as a signed division rounding toward zero (the sign bit
// is added before the arithmetic shift). %p9 = (%r51 != 0); it is reused by the
// second reduction further down.
shr.s32 %r47, %r1, 31;
|
||||
mov.s32 %r48, 1;
|
||||
and.b32 %r49, %r47, %r48;
|
||||
add.s32 %r50, %r49, %r1;
|
||||
shr.s32 %r51, %r50, 1;
|
||||
mov.s32 %r52, %r51;
|
||||
mov.u32 %r53, 0;
|
||||
setp.ne.u32 %p9, %r51, %r53;
|
||||
@!%p9 bra $Lt_0_23042;
|
||||
// Halving loop with step s = %r52: threads whose group position is < s add the
// values stored by thread tid.x + s, then write the sums back to their own slots.
// NOTE(review): there is no bar.sync between steps; this presumably relies on the
// cooperating threads running in lockstep within one warp - confirm.
$Lt_0_23554:
|
||||
setp.ge.u32 %p10, %r6, %r52;
|
||||
@%p10 bra $Lt_0_23810;
|
||||
.loc 16 188 0
|
||||
add.u32 %r54, %r2, %r52;
|
||||
cvt.u64.u32 %rd40, %r54;
|
||||
mul.wide.u32 %rd41, %r54, 4;
|
||||
add.u64 %rd42, %rd36, %rd41;
|
||||
ld.shared.f32 %f81, [%rd42+0];
|
||||
add.ftz.f32 %f77, %f81, %f77;
|
||||
st.shared.f32 [%rd39+0], %f77;
|
||||
ld.shared.f32 %f82, [%rd42+512];
|
||||
add.ftz.f32 %f78, %f82, %f78;
|
||||
st.shared.f32 [%rd39+512], %f78;
|
||||
ld.shared.f32 %f83, [%rd42+1024];
|
||||
add.ftz.f32 %f79, %f83, %f79;
|
||||
st.shared.f32 [%rd39+1024], %f79;
|
||||
ld.shared.f32 %f84, [%rd42+1536];
|
||||
add.ftz.f32 %f80, %f84, %f80;
|
||||
st.shared.f32 [%rd39+1536], %f80;
|
||||
$Lt_0_23810:
|
||||
.loc 16 185 0
|
||||
// s >>= 1; continue while s != 0.
shr.u32 %r52, %r52, 1;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p11, %r52, %r55;
|
||||
@%p11 bra $Lt_0_23554;
|
||||
$Lt_0_23042:
|
||||
.loc 16 192 0
|
||||
// Copy the reduced values back into the accumulator registers.
mov.f32 %f27, %f77;
|
||||
.loc 16 193 0
|
||||
mov.f32 %f26, %f78;
|
||||
.loc 16 194 0
|
||||
mov.f32 %f25, %f79;
|
||||
.loc 16 195 0
|
||||
mov.f32 %f28, %f80;
|
||||
// The six-value reduction that follows is skipped when vflag <= 0.
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r57, 0;
|
||||
setp.le.s32 %p12, %r56, %r57;
|
||||
@%p12 bra $Lt_0_24578;
|
||||
// vflag > 0 and t_per_atom > 1: reduce the six accumulators %f6, %f8, %f10, %f12,
// %f14, %f16 across the group, reusing the red_acc scratch with six rows at
// 512-byte spacing (byte offsets 0 .. 2560 from this thread's slot %rd39).
.loc 16 199 0
|
||||
mov.f32 %f77, %f6;
|
||||
st.shared.f32 [%rd39+0], %f77;
|
||||
mov.f32 %f78, %f8;
|
||||
st.shared.f32 [%rd39+512], %f78;
|
||||
mov.f32 %f79, %f10;
|
||||
st.shared.f32 [%rd39+1024], %f79;
|
||||
mov.f32 %f80, %f12;
|
||||
st.shared.f32 [%rd39+1536], %f80;
|
||||
mov.f32 %f85, %f14;
|
||||
st.shared.f32 [%rd39+2048], %f85;
|
||||
mov.f32 %f86, %f16;
|
||||
st.shared.f32 [%rd39+2560], %f86;
|
||||
.loc 16 201 0
|
||||
// Restart from the same initial step %r51 (= t_per_atom / 2); %p9 still holds (%r51 != 0).
mov.s32 %r58, %r51;
|
||||
@!%p9 bra $Lt_0_25090;
|
||||
// Halving loop with step s = %r58: threads with group position < s add the six
// values stored by thread tid.x + s and write the sums back to their own slots.
// NOTE(review): no bar.sync between steps; presumably relies on the cooperating
// threads running in lockstep within one warp - confirm.
$Lt_0_25602:
|
||||
setp.ge.u32 %p13, %r6, %r58;
|
||||
@%p13 bra $Lt_0_25858;
|
||||
.loc 16 204 0
|
||||
add.u32 %r59, %r2, %r58;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd36, %rd44;
|
||||
ld.shared.f32 %f87, [%rd45+0];
|
||||
add.ftz.f32 %f77, %f87, %f77;
|
||||
st.shared.f32 [%rd39+0], %f77;
|
||||
ld.shared.f32 %f88, [%rd45+512];
|
||||
add.ftz.f32 %f78, %f88, %f78;
|
||||
st.shared.f32 [%rd39+512], %f78;
|
||||
ld.shared.f32 %f89, [%rd45+1024];
|
||||
add.ftz.f32 %f79, %f89, %f79;
|
||||
st.shared.f32 [%rd39+1024], %f79;
|
||||
ld.shared.f32 %f90, [%rd45+1536];
|
||||
add.ftz.f32 %f80, %f90, %f80;
|
||||
st.shared.f32 [%rd39+1536], %f80;
|
||||
ld.shared.f32 %f91, [%rd45+2048];
|
||||
add.ftz.f32 %f85, %f91, %f85;
|
||||
st.shared.f32 [%rd39+2048], %f85;
|
||||
ld.shared.f32 %f92, [%rd45+2560];
|
||||
add.ftz.f32 %f86, %f92, %f86;
|
||||
st.shared.f32 [%rd39+2560], %f86;
|
||||
$Lt_0_25858:
|
||||
.loc 16 201 0
|
||||
// s >>= 1; continue while s != 0.
shr.u32 %r58, %r58, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p14, %r58, %r60;
|
||||
@%p14 bra $Lt_0_25602;
|
||||
$Lt_0_25090:
|
||||
.loc 16 209 0
|
||||
// Copy the reduced sums back into the six accumulator registers.
mov.f32 %f6, %f77;
|
||||
mov.f32 %f8, %f78;
|
||||
mov.f32 %f10, %f79;
|
||||
mov.f32 %f12, %f80;
|
||||
mov.f32 %f14, %f85;
|
||||
mov.f32 %f16, %f86;
|
||||
// Result write-out, part 1: decide whether this thread stores, then write engv.
$Lt_0_24578:
|
||||
$Lt_0_22530:
|
||||
// %r65 = (atom index < inum) AND (group position == 0), as 0/1:
//   selp yields 1/0 from %p1; set.eq yields all-ones/0, which neg turns into 1/0.
// Every other thread jumps straight to the exit label.
selp.s32 %r61, 1, 0, %p1;
|
||||
mov.s32 %r62, 0;
|
||||
set.eq.u32.s32 %r63, %r6, %r62;
|
||||
neg.s32 %r64, %r63;
|
||||
and.b32 %r65, %r61, %r64;
|
||||
mov.u32 %r66, 0;
|
||||
setp.eq.s32 %p15, %r65, %r66;
|
||||
@%p15 bra $Lt_0_26626;
|
||||
.loc 16 215 0
|
||||
// %rd46 = atom index (64-bit; reused for the ans store); %rd49 = engv + atom_index * 4.
cvt.s64.s32 %rd46, %r9;
|
||||
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
|
||||
mul.wide.s32 %rd48, %r9, 4;
|
||||
add.u64 %rd49, %rd47, %rd48;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_27138;
|
||||
.loc 16 217 0
|
||||
// eflag > 0: store %f28, then advance the output pointer by inum * 4 bytes.
st.global.f32 [%rd49+0], %f28;
|
||||
.loc 16 218 0
|
||||
cvt.s64.s32 %rd50, %r10;
|
||||
mul.wide.s32 %rd51, %r10, 4;
|
||||
add.u64 %rd49, %rd49, %rd51;
|
||||
$Lt_0_27138:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27650;
|
||||
.loc 16 222 0
|
||||
// vflag > 0: store the six reduced sums (%f6, %f8, %f10, %f12, %f14, %f16) at
// successive addresses, each %rd53 = inum * 4 bytes after the previous one.
mov.f32 %f93, %f6;
|
||||
st.global.f32 [%rd49+0], %f93;
|
||||
.loc 16 223 0
|
||||
cvt.s64.s32 %rd52, %r10;
|
||||
mul.wide.s32 %rd53, %r10, 4;
|
||||
add.u64 %rd54, %rd53, %rd49;
|
||||
.loc 16 222 0
|
||||
mov.f32 %f94, %f8;
|
||||
st.global.f32 [%rd54+0], %f94;
|
||||
.loc 16 223 0
|
||||
add.u64 %rd55, %rd53, %rd54;
|
||||
.loc 16 222 0
|
||||
mov.f32 %f95, %f10;
|
||||
st.global.f32 [%rd55+0], %f95;
|
||||
.loc 16 223 0
|
||||
add.u64 %rd56, %rd53, %rd55;
|
||||
.loc 16 222 0
|
||||
mov.f32 %f96, %f12;
|
||||
st.global.f32 [%rd56+0], %f96;
|
||||
.loc 16 223 0
|
||||
// %rd49 is overwritten here with the address of the fifth slot.
add.u64 %rd49, %rd53, %rd56;
|
||||
.loc 16 222 0
|
||||
mov.f32 %f97, %f14;
|
||||
st.global.f32 [%rd49+0], %f97;
|
||||
mov.f32 %f98, %f16;
|
||||
add.u64 %rd57, %rd53, %rd49;
|
||||
st.global.f32 [%rd57+0], %f98;
|
||||
// Result write-out, part 2: store the 16-byte ans record for this atom and exit.
$Lt_0_27650:
|
||||
.loc 16 226 0
|
||||
// %rd60 = ans + atom_index * 16 (%rd46 holds the 64-bit atom index).
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd59, %rd46, 16;
|
||||
add.u64 %rd60, %rd58, %rd59;
|
||||
// The 4th lane of the record has no computed value in this kernel. The generated
// code copied it from %f100, a register that is never written, so an indeterminate
// value reached global memory. Store an explicit 0.0 instead so the output is
// deterministic. NOTE(review): this file is compiler output - the lasting fix is
// presumably to initialise the 4th component in lj_cut_gpu_kernel.cu; confirm.
mov.f32 %f99, 0f00000000; // 0
|
||||
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99};
|
||||
// Common exit for storing and non-storing threads.
$Lt_0_26626:
|
||||
.loc 16 228 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<74>;
|
||||
.reg .f32 %f<109>;
|
||||
.reg .pred %p<22>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32657_10_non_const_f = 48
|
||||
// __cuda_local_var_32661_9_non_const_virial = 16
|
||||
.loc 16 236 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_21250;
|
||||
.loc 16 246 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_21250:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
|
||||
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21762;
|
||||
.loc 16 248 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22274;
|
||||
.loc 16 250 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22274:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
|
||||
$Lt_1_21762:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
|
||||
.loc 16 260 0
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 262 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
rem.s32 %r10, %r1, %r6;
|
||||
cvt.s32.u32 %r11, %ctaid.x;
|
||||
mul.lo.s32 %r12, %r11, %r9;
|
||||
add.s32 %r13, %r7, %r12;
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.lt.s32 %p4, %r13, %r14;
|
||||
@!%p4 bra $Lt_1_23042;
|
||||
.loc 16 268 0
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r15;
|
||||
mul.wide.s32 %rd18, %r15, 4;
|
||||
cvt.s64.s32 %rd19, %r13;
|
||||
mul.wide.s32 %rd20, %r13, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r16, [%rd23+0];
|
||||
add.u64 %rd24, %rd18, %rd23;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd25, %rd21;
|
||||
@%p5 bra $Lt_1_23554;
|
||||
.loc 16 274 0
|
||||
cvt.s32.s64 %r17, %rd17;
|
||||
mul.lo.s32 %r18, %r17, %r16;
|
||||
cvt.s64.s32 %rd26, %r18;
|
||||
mul.wide.s32 %rd27, %r18, 4;
|
||||
add.u64 %rd28, %rd24, %rd27;
|
||||
.loc 16 275 0
|
||||
mul.lo.s32 %r19, %r10, %r17;
|
||||
cvt.s64.s32 %rd29, %r19;
|
||||
mul.wide.s32 %rd30, %r19, 4;
|
||||
add.u64 %rd31, %rd24, %rd30;
|
||||
.loc 16 276 0
|
||||
mul.lo.s32 %r20, %r17, %r6;
|
||||
bra.uni $Lt_1_23298;
|
||||
$Lt_1_23554:
|
||||
.loc 16 278 0
|
||||
ld.global.s32 %r21, [%rd24+0];
|
||||
cvt.s64.s32 %rd32, %r21;
|
||||
mul.wide.s32 %rd33, %r21, 4;
|
||||
add.u64 %rd34, %rd25, %rd33;
|
||||
.loc 16 279 0
|
||||
cvt.s64.s32 %rd35, %r16;
|
||||
mul.wide.s32 %rd36, %r16, 4;
|
||||
add.u64 %rd28, %rd34, %rd36;
|
||||
.loc 16 280 0
|
||||
mov.s32 %r20, %r6;
|
||||
.loc 16 281 0
|
||||
cvt.s64.s32 %rd37, %r10;
|
||||
mul.wide.s32 %rd38, %r10, 4;
|
||||
add.u64 %rd31, %rd34, %rd38;
|
||||
$Lt_1_23298:
|
||||
.loc 16 284 0
|
||||
ld.global.s32 %r22, [%rd22+0];
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
mov.s32 %r28, 0;
|
||||
mov.u32 %r29, %r28;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
setp.ge.u64 %p6, %rd31, %rd28;
|
||||
@%p6 bra $Lt_1_32002;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f29;
|
||||
cvt.s64.s32 %rd39, %r20;
|
||||
mul.lo.s32 %r31, %r30, 11;
|
||||
cvt.rn.f32.s32 %f30, %r31;
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
//<loop> Loop body line 284, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 291 0
|
||||
ld.global.s32 %r32, [%rd31+0];
|
||||
.loc 16 292 0
|
||||
shr.s32 %r33, %r32, 30;
|
||||
and.b32 %r34, %r33, 3;
|
||||
cvt.s64.s32 %rd40, %r34;
|
||||
mul.wide.s32 %rd41, %r34, 4;
|
||||
add.u64 %rd42, %rd1, %rd41;
|
||||
ld.shared.f32 %f35, [%rd42+0];
|
||||
.loc 16 295 0
|
||||
and.b32 %r35, %r32, 1073741823;
|
||||
mov.u32 %r36, %r35;
|
||||
mov.s32 %r37, 0;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
mov.f32 %f43, %f39;
|
||||
sub.ftz.f32 %f44, %f27, %f41;
|
||||
sub.ftz.f32 %f45, %f26, %f40;
|
||||
sub.ftz.f32 %f46, %f28, %f42;
|
||||
mul.ftz.f32 %f47, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
|
||||
add.ftz.f32 %f50, %f30, %f43;
|
||||
cvt.rzi.ftz.s32.f32 %r43, %f50;
|
||||
cvt.s64.s32 %rd43, %r43;
|
||||
mul.wide.s32 %rd44, %r43, 16;
|
||||
add.u64 %rd45, %rd44, %rd7;
|
||||
ld.shared.f32 %f51, [%rd45+8];
|
||||
setp.gt.ftz.f32 %p7, %f51, %f49;
|
||||
@!%p7 bra $Lt_1_25602;
|
||||
.loc 16 307 0
|
||||
rcp.approx.ftz.f32 %f52, %f49;
|
||||
mul.ftz.f32 %f53, %f52, %f52;
|
||||
mul.ftz.f32 %f54, %f52, %f53;
|
||||
mul.ftz.f32 %f55, %f52, %f35;
|
||||
mul.ftz.f32 %f56, %f54, %f55;
|
||||
ld.shared.v2.f32 {%f57,%f58}, [%rd45+0];
|
||||
mul.ftz.f32 %f59, %f57, %f54;
|
||||
sub.ftz.f32 %f60, %f59, %f58;
|
||||
mul.ftz.f32 %f61, %f56, %f60;
|
||||
.loc 16 309 0
|
||||
fma.rn.ftz.f32 %f33, %f45, %f61, %f33;
|
||||
.loc 16 310 0
|
||||
fma.rn.ftz.f32 %f32, %f44, %f61, %f32;
|
||||
.loc 16 311 0
|
||||
fma.rn.ftz.f32 %f31, %f46, %f61, %f31;
|
||||
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p8, %r44, %r45;
|
||||
@%p8 bra $Lt_1_25090;
|
||||
.loc 16 314 0
|
||||
add.u64 %rd46, %rd44, %rd13;
|
||||
ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0];
|
||||
mul.ftz.f32 %f65, %f62, %f54;
|
||||
sub.ftz.f32 %f66, %f65, %f63;
|
||||
mul.ftz.f32 %f67, %f54, %f66;
|
||||
.loc 16 315 0
|
||||
sub.ftz.f32 %f68, %f67, %f64;
|
||||
fma.rn.ftz.f32 %f34, %f35, %f68, %f34;
|
||||
$Lt_1_25090:
|
||||
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r47, 0;
|
||||
setp.le.s32 %p9, %r46, %r47;
|
||||
@%p9 bra $Lt_1_25602;
|
||||
.loc 16 318 0
|
||||
mov.f32 %f69, %f11;
|
||||
mul.ftz.f32 %f70, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f71, %f61, %f70, %f69;
|
||||
mov.f32 %f11, %f71;
|
||||
.loc 16 319 0
|
||||
mov.f32 %f72, %f13;
|
||||
fma.rn.ftz.f32 %f73, %f61, %f47, %f72;
|
||||
mov.f32 %f13, %f73;
|
||||
.loc 16 320 0
|
||||
mov.f32 %f74, %f15;
|
||||
mul.ftz.f32 %f75, %f46, %f46;
|
||||
fma.rn.ftz.f32 %f76, %f61, %f75, %f74;
|
||||
mov.f32 %f15, %f76;
|
||||
.loc 16 321 0
|
||||
mov.f32 %f77, %f17;
|
||||
mul.ftz.f32 %f78, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f79, %f61, %f78, %f77;
|
||||
mov.f32 %f17, %f79;
|
||||
.loc 16 322 0
|
||||
mov.f32 %f80, %f19;
|
||||
mul.ftz.f32 %f81, %f45, %f46;
|
||||
fma.rn.ftz.f32 %f82, %f61, %f81, %f80;
|
||||
mov.f32 %f19, %f82;
|
||||
.loc 16 323 0
|
||||
mul.ftz.f32 %f83, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f20, %f61, %f83, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25602:
|
||||
$Lt_1_24578:
|
||||
.loc 16 289 0
|
||||
mul.lo.u64 %rd47, %rd39, 4;
|
||||
add.u64 %rd31, %rd31, %rd47;
|
||||
setp.lt.u64 %p10, %rd31, %rd28;
|
||||
@%p10 bra $Lt_1_24322;
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_32002:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_23042:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_22786:
|
||||
mov.u32 %r48, 1;
|
||||
setp.le.s32 %p11, %r6, %r48;
|
||||
@%p11 bra $Lt_1_28418;
|
||||
.loc 16 334 0
|
||||
mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd49, %r1;
|
||||
mul.wide.s32 %rd50, %r1, 4;
|
||||
add.u64 %rd51, %rd48, %rd50;
|
||||
mov.f32 %f84, %f33;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
.loc 16 335 0
|
||||
mov.f32 %f85, %f32;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
.loc 16 336 0
|
||||
mov.f32 %f86, %f31;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
.loc 16 337 0
|
||||
mov.f32 %f87, %f34;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
.loc 16 339 0
|
||||
shr.s32 %r49, %r6, 31;
|
||||
mov.s32 %r50, 1;
|
||||
and.b32 %r51, %r49, %r50;
|
||||
add.s32 %r52, %r51, %r6;
|
||||
shr.s32 %r53, %r52, 1;
|
||||
mov.s32 %r54, %r53;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p12, %r53, %r55;
|
||||
@!%p12 bra $Lt_1_26882;
|
||||
$Lt_1_27394:
|
||||
setp.ge.u32 %p13, %r10, %r54;
|
||||
@%p13 bra $Lt_1_27650;
|
||||
.loc 16 342 0
|
||||
add.u32 %r56, %r1, %r54;
|
||||
cvt.u64.u32 %rd52, %r56;
|
||||
mul.wide.u32 %rd53, %r56, 4;
|
||||
add.u64 %rd54, %rd48, %rd53;
|
||||
ld.shared.f32 %f88, [%rd54+0];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
ld.shared.f32 %f89, [%rd54+512];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
ld.shared.f32 %f90, [%rd54+1024];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
ld.shared.f32 %f91, [%rd54+1536];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
$Lt_1_27650:
|
||||
.loc 16 339 0
|
||||
shr.u32 %r54, %r54, 1;
|
||||
mov.u32 %r57, 0;
|
||||
setp.ne.u32 %p14, %r54, %r57;
|
||||
@%p14 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
.loc 16 346 0
|
||||
mov.f32 %f33, %f84;
|
||||
.loc 16 347 0
|
||||
mov.f32 %f32, %f85;
|
||||
.loc 16 348 0
|
||||
mov.f32 %f31, %f86;
|
||||
.loc 16 349 0
|
||||
mov.f32 %f34, %f87;
|
||||
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r59, 0;
|
||||
setp.le.s32 %p15, %r58, %r59;
|
||||
@%p15 bra $Lt_1_28418;
|
||||
.loc 16 353 0
|
||||
mov.f32 %f84, %f11;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
mov.f32 %f85, %f13;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
mov.f32 %f86, %f15;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
mov.f32 %f87, %f17;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
mov.f32 %f92, %f19;
|
||||
st.shared.f32 [%rd51+2048], %f92;
|
||||
mov.f32 %f93, %f21;
|
||||
st.shared.f32 [%rd51+2560], %f93;
|
||||
.loc 16 355 0
|
||||
mov.s32 %r60, %r53;
|
||||
@!%p12 bra $Lt_1_28930;
|
||||
$Lt_1_29442:
|
||||
setp.ge.u32 %p16, %r10, %r60;
|
||||
@%p16 bra $Lt_1_29698;
|
||||
.loc 16 358 0
|
||||
add.u32 %r61, %r1, %r60;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd48, %rd56;
|
||||
ld.shared.f32 %f94, [%rd57+0];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd51+0], %f84;
|
||||
ld.shared.f32 %f95, [%rd57+512];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd51+512], %f85;
|
||||
ld.shared.f32 %f96, [%rd57+1024];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd51+1024], %f86;
|
||||
ld.shared.f32 %f97, [%rd57+1536];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd51+1536], %f87;
|
||||
ld.shared.f32 %f98, [%rd57+2048];
|
||||
add.ftz.f32 %f92, %f98, %f92;
|
||||
st.shared.f32 [%rd51+2048], %f92;
|
||||
ld.shared.f32 %f99, [%rd57+2560];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd51+2560], %f93;
|
||||
$Lt_1_29698:
|
||||
.loc 16 355 0
|
||||
shr.u32 %r60, %r60, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p17, %r60, %r62;
|
||||
@%p17 bra $Lt_1_29442;
|
||||
$Lt_1_28930:
|
||||
.loc 16 363 0
|
||||
mov.f32 %f11, %f84;
|
||||
mov.f32 %f13, %f85;
|
||||
mov.f32 %f15, %f86;
|
||||
mov.f32 %f17, %f87;
|
||||
mov.f32 %f19, %f92;
|
||||
mov.f32 %f21, %f93;
|
||||
$Lt_1_28418:
|
||||
$Lt_1_26370:
|
||||
selp.s32 %r63, 1, 0, %p4;
|
||||
mov.s32 %r64, 0;
|
||||
set.eq.u32.s32 %r65, %r10, %r64;
|
||||
neg.s32 %r66, %r65;
|
||||
and.b32 %r67, %r63, %r66;
|
||||
mov.u32 %r68, 0;
|
||||
setp.eq.s32 %p18, %r67, %r68;
|
||||
@%p18 bra $Lt_1_30466;
|
||||
.loc 16 369 0
|
||||
cvt.s64.s32 %rd58, %r13;
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
|
||||
mul.wide.s32 %rd60, %r13, 4;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30978;
|
||||
.loc 16 371 0
|
||||
st.global.f32 [%rd61+0], %f34;
|
||||
.loc 16 372 0
|
||||
cvt.s64.s32 %rd62, %r14;
|
||||
mul.wide.s32 %rd63, %r14, 4;
|
||||
add.u64 %rd61, %rd61, %rd63;
|
||||
$Lt_1_30978:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31490;
|
||||
.loc 16 376 0
|
||||
mov.f32 %f100, %f11;
|
||||
st.global.f32 [%rd61+0], %f100;
|
||||
.loc 16 377 0
|
||||
cvt.s64.s32 %rd64, %r14;
|
||||
mul.wide.s32 %rd65, %r14, 4;
|
||||
add.u64 %rd66, %rd65, %rd61;
|
||||
.loc 16 376 0
|
||||
mov.f32 %f101, %f13;
|
||||
st.global.f32 [%rd66+0], %f101;
|
||||
.loc 16 377 0
|
||||
add.u64 %rd67, %rd65, %rd66;
|
||||
.loc 16 376 0
|
||||
mov.f32 %f102, %f15;
|
||||
st.global.f32 [%rd67+0], %f102;
|
||||
.loc 16 377 0
|
||||
add.u64 %rd68, %rd65, %rd67;
|
||||
.loc 16 376 0
|
||||
mov.f32 %f103, %f17;
|
||||
st.global.f32 [%rd68+0], %f103;
|
||||
.loc 16 377 0
|
||||
add.u64 %rd61, %rd65, %rd68;
|
||||
.loc 16 376 0
|
||||
mov.f32 %f104, %f19;
|
||||
st.global.f32 [%rd61+0], %f104;
|
||||
mov.f32 %f105, %f21;
|
||||
add.u64 %rd69, %rd65, %rd61;
|
||||
st.global.f32 [%rd69+0], %f105;
|
||||
$Lt_1_31490:
|
||||
.loc 16 380 0
|
||||
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd71, %rd58, 16;
|
||||
add.u64 %rd72, %rd70, %rd71;
|
||||
mov.f32 %f106, %f107;
|
||||
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
|
||||
$Lt_1_30466:
|
||||
.loc 16 382 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
|
@ -1,993 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000be22_00000000-9_lj_expand_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.LdVC9u)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000be22_00000000-8_lj_expand_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "lj_expand_gpu_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
// Module-scope texture reference. kernel_pair samples it with tex.1d.v4.f32.s32, using an integer index to fetch one four-float texel per lookup.
.global .texref pos_tex;
|
||||
|
||||
// kernel_pair -- pair-interaction kernel emitted by nvopencc. The ".loc 16 N 0"
// markers map instructions back to line N of debug file 16.
// Thread mapping (computed below): lane = tid.x % t_per_atom and
// ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom); only ii below inum does work.
// Parameters, as used by the body:
//   x_          - not referenced in this body; coordinates are fetched through pos_tex
//   lj1         - global table of 16-byte entries, index = lj_types * int(w_i) + int(w_j),
//                 where w is the fourth texel component of each atom
//   lj3         - global table with the same indexing, read only when eflag > 0
//   lj_types    - multiplier used to build the lj1 / lj3 index
//   sp_lj_in    - four floats copied to shared memory; one is selected per neighbor
//   dev_nbor    - neighbor storage addressed with row pitch nbor_pitch
//   dev_packed  - alternative neighbor array, used when its address differs from dev_nbor
//   ans         - output, one 16-byte vector per ii
//   engv        - output scalars written with a stride of inum floats
//   eflag/vflag - enable the extra accumulations / stores when > 0
//                 (presumably energy and virial; name-based, confirm against the .cu source)
//   t_per_atom  - number of consecutive threads cooperating on one ii
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_lj1,
|
||||
.param .u64 __cudaparm_kernel_pair_lj3,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<62>;
|
||||
.reg .f32 %f<107>;
|
||||
.reg .pred %p<19>;
|
||||
// 16-byte shared buffer that receives the four sp_lj_in values.
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
|
||||
// Shared reduction scratch: 3072 bytes = 6 rows of 128 floats (rows are addressed 512 bytes apart below).
.shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32504_10_non_const_f = 48
|
||||
// __cuda_local_var_32508_9_non_const_virial = 16
|
||||
.loc 16 88 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 95 0
|
||||
// Every thread copies sp_lj_in[0..3] from global memory into the shared sp_lj buffer (no thread-index guard here).
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 96 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 97 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 98 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 107 0
|
||||
// Zero the six vflag-gated accumulators: %f6, %f8, %f10, %f12, %f14, %f16.
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
// Thread mapping: %r3 = tid.x / t_per_atom, %r6 = tid.x % t_per_atom (lane), %r9 = ii.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
rem.s32 %r6, %r2, %r1;
|
||||
cvt.s32.u32 %r7, %ctaid.x;
|
||||
mul.lo.s32 %r8, %r7, %r5;
|
||||
add.s32 %r9, %r3, %r8;
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
|
||||
// %p1 = (ii < inum); threads outside the range jump to the block that zeroes %f25..%f28.
setp.lt.s32 %p1, %r9, %r10;
|
||||
@!%p1 bra $Lt_0_19202;
|
||||
.loc 16 113 0
|
||||
// %rd7 = dev_nbor + 4*ii; %r12 is read one pitch (nbor_pitch ints) further; %rd9 points two pitches further.
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r11;
|
||||
mul.wide.s32 %rd3, %r11, 4;
|
||||
cvt.s64.s32 %rd4, %r9;
|
||||
mul.wide.s32 %rd5, %r9, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r12, [%rd8+0];
|
||||
add.u64 %rd9, %rd3, %rd8;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
|
||||
// Select the list layout by comparing the dev_packed and dev_nbor addresses.
setp.ne.u64 %p2, %rd10, %rd6;
|
||||
@%p2 bra $Lt_0_19714;
|
||||
.loc 16 119 0
|
||||
// Same address: end pointer %rd13 = %rd9 + 4 * nbor_pitch * %r12.
cvt.s32.s64 %r13, %rd2;
|
||||
mul.lo.s32 %r14, %r13, %r12;
|
||||
cvt.s64.s32 %rd11, %r14;
|
||||
mul.wide.s32 %rd12, %r14, 4;
|
||||
add.u64 %rd13, %rd9, %rd12;
|
||||
.loc 16 120 0
|
||||
// Start pointer %rd16 = %rd9 + 4 * lane * nbor_pitch.
mul.lo.s32 %r15, %r6, %r13;
|
||||
cvt.s64.s32 %rd14, %r15;
|
||||
mul.wide.s32 %rd15, %r15, 4;
|
||||
add.u64 %rd16, %rd9, %rd15;
|
||||
.loc 16 121 0
|
||||
// Stride %r16 = nbor_pitch * t_per_atom (in ints).
mul.lo.s32 %r16, %r13, %r1;
|
||||
bra.uni $Lt_0_19458;
|
||||
$Lt_0_19714:
|
||||
.loc 16 123 0
|
||||
// Different addresses: the int at %rd9 is an offset into dev_packed; %rd19 = dev_packed + 4 * offset.
ld.global.s32 %r17, [%rd9+0];
|
||||
cvt.s64.s32 %rd17, %r17;
|
||||
mul.wide.s32 %rd18, %r17, 4;
|
||||
add.u64 %rd19, %rd10, %rd18;
|
||||
.loc 16 124 0
|
||||
// End pointer %rd13 = %rd19 + 4 * %r12.
cvt.s64.s32 %rd20, %r12;
|
||||
mul.wide.s32 %rd21, %r12, 4;
|
||||
add.u64 %rd13, %rd19, %rd21;
|
||||
.loc 16 125 0
|
||||
// Stride %r16 = t_per_atom (in ints).
mov.s32 %r16, %r1;
|
||||
.loc 16 126 0
|
||||
// Start pointer %rd16 = %rd19 + 4 * lane.
cvt.s64.s32 %rd22, %r6;
|
||||
mul.wide.s32 %rd23, %r6, 4;
|
||||
add.u64 %rd16, %rd19, %rd23;
|
||||
$Lt_0_19458:
|
||||
.loc 16 129 0
|
||||
// Read the index stored at dev_nbor[ii] and fetch its texel from pos_tex into %f21..%f24.
ld.global.s32 %r18, [%rd7+0];
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
// If this lane's start pointer is already at or past the end pointer, skip the loop.
setp.ge.u64 %p3, %rd16, %rd13;
|
||||
@%p3 bra $Lt_0_28162;
|
||||
// Loop-invariant setup: %r28 = lj_types * int(%f24); %f25..%f28 are this thread's accumulators.
cvt.rzi.ftz.s32.f32 %r26, %f24;
|
||||
cvt.s64.s32 %rd24, %r16;
|
||||
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r28, %r27, %r26;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
|
||||
$Lt_0_20482:
|
||||
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 135 0
|
||||
// Load the packed neighbor word for this iteration.
ld.global.s32 %r29, [%rd16+0];
|
||||
.loc 16 136 0
|
||||
// Its top two bits (>> 30, & 3) select one of the four shared sp_lj factors -> %f29.
shr.s32 %r30, %r29, 30;
|
||||
and.b32 %r31, %r30, 3;
|
||||
cvt.s64.s32 %rd27, %r31;
|
||||
mul.wide.s32 %rd28, %r31, 4;
|
||||
add.u64 %rd29, %rd26, %rd28;
|
||||
ld.shared.f32 %f29, [%rd29+0];
|
||||
.loc 16 139 0
|
||||
// Its low 30 bits (mask 1073741823 = 0x3FFFFFFF) are the texel index of the neighbor.
and.b32 %r32, %r29, 1073741823;
|
||||
mov.u32 %r33, %r32;
|
||||
mov.s32 %r34, 0;
|
||||
mov.u32 %r35, %r34;
|
||||
mov.s32 %r36, 0;
|
||||
mov.u32 %r37, %r36;
|
||||
mov.s32 %r38, 0;
|
||||
mov.u32 %r39, %r38;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
cvt.rzi.ftz.s32.f32 %r40, %f37;
|
||||
// Component differences (%f39, %f38, %f40) and their squared length %f43.
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
// Table index = int(neighbor w) + %r28; entries are 16 bytes, so byte offset %rd31 = index * 16.
add.s32 %r41, %r40, %r28;
|
||||
cvt.s64.s32 %rd30, %r41;
|
||||
mul.wide.s32 %rd31, %r41, 16;
|
||||
add.u64 %rd32, %rd31, %rd25;
|
||||
// Continue only when the third float of the lj1 entry exceeds the squared distance.
ld.global.f32 %f44, [%rd32+8];
|
||||
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21762;
|
||||
.loc 16 151 0
|
||||
// %f45 = sqrt(%f43); %f49 = %f45 minus the fourth float of the lj1 entry.
sqrt.approx.ftz.f32 %f45, %f43;
|
||||
ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0];
|
||||
sub.ftz.f32 %f49, %f45, %f48;
|
||||
.loc 16 156 0
|
||||
// %f53 = (1 / %f49^2)^3.
mul.ftz.f32 %f50, %f49, %f49;
|
||||
rcp.approx.ftz.f32 %f51, %f50;
|
||||
mul.ftz.f32 %f52, %f51, %f51;
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
// %f59 = (%f29 / %f49 / %f45) * %f53 * (lj1.x * %f53 - lj1.y): scalar applied to the difference vector.
div.approx.ftz.f32 %f54, %f29, %f49;
|
||||
div.approx.ftz.f32 %f55, %f54, %f45;
|
||||
mul.ftz.f32 %f56, %f46, %f53;
|
||||
sub.ftz.f32 %f57, %f56, %f47;
|
||||
mul.ftz.f32 %f58, %f53, %f57;
|
||||
mul.ftz.f32 %f59, %f55, %f58;
|
||||
.loc 16 158 0
|
||||
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
|
||||
.loc 16 159 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
|
||||
.loc 16 160 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
|
||||
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r43, 0;
|
||||
setp.le.s32 %p5, %r42, %r43;
|
||||
@%p5 bra $Lt_0_21250;
|
||||
.loc 16 164 0
|
||||
// eflag > 0: %f28 += %f29 * (%f53 * (lj3.x * %f53 - lj3.y) - lj3.z), using the same table offset as lj1.
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
|
||||
add.u64 %rd34, %rd33, %rd31;
|
||||
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0];
|
||||
mul.ftz.f32 %f63, %f60, %f53;
|
||||
sub.ftz.f32 %f64, %f63, %f61;
|
||||
mul.ftz.f32 %f65, %f53, %f64;
|
||||
sub.ftz.f32 %f66, %f65, %f62;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f66, %f28;
|
||||
$Lt_0_21250:
|
||||
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p6, %r44, %r45;
|
||||
@%p6 bra $Lt_0_21762;
|
||||
.loc 16 167 0
|
||||
// vflag > 0: add %f59 times the six pairwise products of the difference components
// (%f39*%f39, %f38*%f38, %f40*%f40, %f38*%f39, %f39*%f40, %f38*%f40) to %f6, %f8, %f10, %f12, %f14, %f16.
mov.f32 %f67, %f6;
|
||||
mul.ftz.f32 %f68, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f69, %f59, %f68, %f67;
|
||||
mov.f32 %f6, %f69;
|
||||
.loc 16 168 0
|
||||
mov.f32 %f70, %f8;
|
||||
fma.rn.ftz.f32 %f71, %f59, %f41, %f70;
|
||||
mov.f32 %f8, %f71;
|
||||
.loc 16 169 0
|
||||
mov.f32 %f72, %f10;
|
||||
mul.ftz.f32 %f73, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
|
||||
mov.f32 %f10, %f74;
|
||||
.loc 16 170 0
|
||||
mov.f32 %f75, %f12;
|
||||
mul.ftz.f32 %f76, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
|
||||
mov.f32 %f12, %f77;
|
||||
.loc 16 171 0
|
||||
mov.f32 %f78, %f14;
|
||||
mul.ftz.f32 %f79, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f80, %f59, %f79, %f78;
|
||||
mov.f32 %f14, %f80;
|
||||
.loc 16 172 0
|
||||
mul.ftz.f32 %f81, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f59, %f81, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21762:
|
||||
$Lt_0_20738:
|
||||
.loc 16 133 0
|
||||
// Advance this lane's pointer by the stride (%r16 ints = %rd24 * 4 bytes); loop while below the end pointer.
mul.lo.u64 %rd35, %rd24, 4;
|
||||
add.u64 %rd16, %rd16, %rd35;
|
||||
setp.lt.u64 %p7, %rd16, %rd13;
|
||||
@%p7 bra $Lt_0_20482;
|
||||
bra.uni $Lt_0_18946;
|
||||
$Lt_0_28162:
|
||||
// Loop skipped (empty range for this lane): accumulators are zero.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
bra.uni $Lt_0_18946;
|
||||
$Lt_0_19202:
|
||||
// ii out of range: accumulators are zero.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_18946:
|
||||
// The cross-thread reduction below runs only when t_per_atom > 1.
mov.u32 %r46, 1;
|
||||
setp.le.s32 %p8, %r1, %r46;
|
||||
@%p8 bra $Lt_0_24578;
|
||||
.loc 16 183 0
|
||||
// %rd39 = red_acc + 4 * tid.x; the four accumulators go to rows 0..3 (byte offsets 0, 512, 1024, 1536).
mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd37, %r2;
|
||||
mul.wide.s32 %rd38, %r2, 4;
|
||||
add.u64 %rd39, %rd36, %rd38;
|
||||
mov.f32 %f82, %f27;
|
||||
st.shared.f32 [%rd39+0], %f82;
|
||||
.loc 16 184 0
|
||||
mov.f32 %f83, %f26;
|
||||
st.shared.f32 [%rd39+512], %f83;
|
||||
.loc 16 185 0
|
||||
mov.f32 %f84, %f25;
|
||||
st.shared.f32 [%rd39+1024], %f84;
|
||||
.loc 16 186 0
|
||||
mov.f32 %f85, %f28;
|
||||
st.shared.f32 [%rd39+1536], %f85;
|
||||
.loc 16 188 0
|
||||
// %r51 = t_per_atom / 2 as a signed divide rounding toward zero (sign bit added before the shift).
shr.s32 %r47, %r1, 31;
|
||||
mov.s32 %r48, 1;
|
||||
and.b32 %r49, %r47, %r48;
|
||||
add.s32 %r50, %r49, %r1;
|
||||
shr.s32 %r51, %r50, 1;
|
||||
mov.s32 %r52, %r51;
|
||||
mov.u32 %r53, 0;
|
||||
setp.ne.u32 %p9, %r51, %r53;
|
||||
@!%p9 bra $Lt_0_23042;
|
||||
$Lt_0_23554:
|
||||
// Lanes below the current offset %r52 add the values stored at column tid.x + offset and write the sum back.
// NOTE(review): no bar.sync appears in this loop -- presumably it relies on the cooperating threads
// executing in lockstep within one warp; confirm against the launch configuration.
setp.ge.u32 %p10, %r6, %r52;
|
||||
@%p10 bra $Lt_0_23810;
|
||||
.loc 16 191 0
|
||||
add.u32 %r54, %r2, %r52;
|
||||
cvt.u64.u32 %rd40, %r54;
|
||||
mul.wide.u32 %rd41, %r54, 4;
|
||||
add.u64 %rd42, %rd36, %rd41;
|
||||
ld.shared.f32 %f86, [%rd42+0];
|
||||
add.ftz.f32 %f82, %f86, %f82;
|
||||
st.shared.f32 [%rd39+0], %f82;
|
||||
ld.shared.f32 %f87, [%rd42+512];
|
||||
add.ftz.f32 %f83, %f87, %f83;
|
||||
st.shared.f32 [%rd39+512], %f83;
|
||||
ld.shared.f32 %f88, [%rd42+1024];
|
||||
add.ftz.f32 %f84, %f88, %f84;
|
||||
st.shared.f32 [%rd39+1024], %f84;
|
||||
ld.shared.f32 %f89, [%rd42+1536];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd39+1536], %f85;
|
||||
$Lt_0_23810:
|
||||
.loc 16 188 0
|
||||
// Halve the offset; repeat until it reaches zero.
shr.u32 %r52, %r52, 1;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p11, %r52, %r55;
|
||||
@%p11 bra $Lt_0_23554;
|
||||
$Lt_0_23042:
|
||||
.loc 16 195 0
|
||||
// Copy the reduced values back into the accumulator registers %f27, %f26, %f25, %f28.
mov.f32 %f27, %f82;
|
||||
.loc 16 196 0
|
||||
mov.f32 %f26, %f83;
|
||||
.loc 16 197 0
|
||||
mov.f32 %f25, %f84;
|
||||
.loc 16 198 0
|
||||
mov.f32 %f28, %f85;
|
||||
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r57, 0;
|
||||
setp.le.s32 %p12, %r56, %r57;
|
||||
@%p12 bra $Lt_0_24578;
|
||||
.loc 16 202 0
|
||||
// vflag > 0: run the same reduction over the six vflag-gated accumulators, using all six rows (offsets 0..2560).
mov.f32 %f82, %f6;
|
||||
st.shared.f32 [%rd39+0], %f82;
|
||||
mov.f32 %f83, %f8;
|
||||
st.shared.f32 [%rd39+512], %f83;
|
||||
mov.f32 %f84, %f10;
|
||||
st.shared.f32 [%rd39+1024], %f84;
|
||||
mov.f32 %f85, %f12;
|
||||
st.shared.f32 [%rd39+1536], %f85;
|
||||
mov.f32 %f90, %f14;
|
||||
st.shared.f32 [%rd39+2048], %f90;
|
||||
mov.f32 %f91, %f16;
|
||||
st.shared.f32 [%rd39+2560], %f91;
|
||||
.loc 16 204 0
|
||||
// Restart from the same initial offset (%r51); %p9 still holds "initial offset != 0".
mov.s32 %r58, %r51;
|
||||
@!%p9 bra $Lt_0_25090;
|
||||
$Lt_0_25602:
|
||||
// As in the first reduction loop, there is no barrier instruction between steps here.
setp.ge.u32 %p13, %r6, %r58;
|
||||
@%p13 bra $Lt_0_25858;
|
||||
.loc 16 207 0
|
||||
add.u32 %r59, %r2, %r58;
|
||||
cvt.u64.u32 %rd43, %r59;
|
||||
mul.wide.u32 %rd44, %r59, 4;
|
||||
add.u64 %rd45, %rd36, %rd44;
|
||||
ld.shared.f32 %f92, [%rd45+0];
|
||||
add.ftz.f32 %f82, %f92, %f82;
|
||||
st.shared.f32 [%rd39+0], %f82;
|
||||
ld.shared.f32 %f93, [%rd45+512];
|
||||
add.ftz.f32 %f83, %f93, %f83;
|
||||
st.shared.f32 [%rd39+512], %f83;
|
||||
ld.shared.f32 %f94, [%rd45+1024];
|
||||
add.ftz.f32 %f84, %f94, %f84;
|
||||
st.shared.f32 [%rd39+1024], %f84;
|
||||
ld.shared.f32 %f95, [%rd45+1536];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd39+1536], %f85;
|
||||
ld.shared.f32 %f96, [%rd45+2048];
|
||||
add.ftz.f32 %f90, %f96, %f90;
|
||||
st.shared.f32 [%rd39+2048], %f90;
|
||||
ld.shared.f32 %f97, [%rd45+2560];
|
||||
add.ftz.f32 %f91, %f97, %f91;
|
||||
st.shared.f32 [%rd39+2560], %f91;
|
||||
$Lt_0_25858:
|
||||
.loc 16 204 0
|
||||
shr.u32 %r58, %r58, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p14, %r58, %r60;
|
||||
@%p14 bra $Lt_0_25602;
|
||||
$Lt_0_25090:
|
||||
.loc 16 212 0
|
||||
// Copy the six reduced values back into %f6, %f8, %f10, %f12, %f14, %f16.
mov.f32 %f6, %f82;
|
||||
mov.f32 %f8, %f83;
|
||||
mov.f32 %f10, %f84;
|
||||
mov.f32 %f12, %f85;
|
||||
mov.f32 %f14, %f90;
|
||||
mov.f32 %f16, %f91;
|
||||
$Lt_0_24578:
|
||||
$Lt_0_22530:
|
||||
// Only threads with (ii < inum) and lane == 0 write results; all others branch to exit.
selp.s32 %r61, 1, 0, %p1;
|
||||
mov.s32 %r62, 0;
|
||||
set.eq.u32.s32 %r63, %r6, %r62;
|
||||
neg.s32 %r64, %r63;
|
||||
and.b32 %r65, %r61, %r64;
|
||||
mov.u32 %r66, 0;
|
||||
setp.eq.s32 %p15, %r65, %r66;
|
||||
@%p15 bra $Lt_0_26626;
|
||||
.loc 16 218 0
|
||||
// %rd49 = engv + 4 * ii: running output pointer into engv.
cvt.s64.s32 %rd46, %r9;
|
||||
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
|
||||
mul.wide.s32 %rd48, %r9, 4;
|
||||
add.u64 %rd49, %rd47, %rd48;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_27138;
|
||||
.loc 16 220 0
|
||||
// eflag > 0: store %f28, then advance the output pointer by inum floats.
st.global.f32 [%rd49+0], %f28;
|
||||
.loc 16 221 0
|
||||
cvt.s64.s32 %rd50, %r10;
|
||||
mul.wide.s32 %rd51, %r10, 4;
|
||||
add.u64 %rd49, %rd49, %rd51;
|
||||
$Lt_0_27138:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27650;
|
||||
.loc 16 225 0
|
||||
// vflag > 0: store %f6, %f8, %f10, %f12, %f14, %f16 to six slots spaced inum floats apart (%rd53 = 4 * inum).
mov.f32 %f98, %f6;
|
||||
st.global.f32 [%rd49+0], %f98;
|
||||
.loc 16 226 0
|
||||
cvt.s64.s32 %rd52, %r10;
|
||||
mul.wide.s32 %rd53, %r10, 4;
|
||||
add.u64 %rd54, %rd53, %rd49;
|
||||
.loc 16 225 0
|
||||
mov.f32 %f99, %f8;
|
||||
st.global.f32 [%rd54+0], %f99;
|
||||
.loc 16 226 0
|
||||
add.u64 %rd55, %rd53, %rd54;
|
||||
.loc 16 225 0
|
||||
mov.f32 %f100, %f10;
|
||||
st.global.f32 [%rd55+0], %f100;
|
||||
.loc 16 226 0
|
||||
add.u64 %rd56, %rd53, %rd55;
|
||||
.loc 16 225 0
|
||||
mov.f32 %f101, %f12;
|
||||
st.global.f32 [%rd56+0], %f101;
|
||||
.loc 16 226 0
|
||||
add.u64 %rd49, %rd53, %rd56;
|
||||
.loc 16 225 0
|
||||
mov.f32 %f102, %f14;
|
||||
st.global.f32 [%rd49+0], %f102;
|
||||
mov.f32 %f103, %f16;
|
||||
add.u64 %rd57, %rd53, %rd49;
|
||||
st.global.f32 [%rd57+0], %f103;
|
||||
$Lt_0_27650:
|
||||
.loc 16 229 0
|
||||
// Store {%f27, %f26, %f25, %f104} as one 16-byte vector at ans + 16 * ii.
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd59, %rd46, 16;
|
||||
add.u64 %rd60, %rd58, %rd59;
|
||||
// NOTE(review): %f105 is not written anywhere in this kernel body, so the fourth stored component
// appears to be an undefined value; verify that consumers of ans ignore it.
mov.f32 %f104, %f105;
|
||||
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104};
|
||||
$Lt_0_26626:
|
||||
.loc 16 231 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<74>;
|
||||
.reg .f32 %f<114>;
|
||||
.reg .f64 %fd<4>;
|
||||
.reg .pred %p<22>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072];
|
||||
// __cuda_local_var_32660_10_non_const_f = 48
|
||||
// __cuda_local_var_32664_9_non_const_virial = 16
|
||||
.loc 16 239 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_21250;
|
||||
.loc 16 249 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_21250:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
|
||||
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21762;
|
||||
.loc 16 251 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22274;
|
||||
.loc 16 253 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
|
||||
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
|
||||
add.u64 %rd15, %rd14, %rd9;
|
||||
add.u64 %rd16, %rd9, %rd13;
|
||||
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
|
||||
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
|
||||
$Lt_1_22274:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
|
||||
$Lt_1_21762:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
|
||||
.loc 16 263 0
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
mov.f32 %f21, %f20;
|
||||
.loc 16 265 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
rem.s32 %r10, %r1, %r6;
|
||||
cvt.s32.u32 %r11, %ctaid.x;
|
||||
mul.lo.s32 %r12, %r11, %r9;
|
||||
add.s32 %r13, %r7, %r12;
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.lt.s32 %p4, %r13, %r14;
|
||||
@!%p4 bra $Lt_1_23042;
|
||||
.loc 16 271 0
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd17, %r15;
|
||||
mul.wide.s32 %rd18, %r15, 4;
|
||||
cvt.s64.s32 %rd19, %r13;
|
||||
mul.wide.s32 %rd20, %r13, 4;
|
||||
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd22, %rd20, %rd21;
|
||||
add.u64 %rd23, %rd18, %rd22;
|
||||
ld.global.s32 %r16, [%rd23+0];
|
||||
add.u64 %rd24, %rd18, %rd23;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd25, %rd21;
|
||||
@%p5 bra $Lt_1_23554;
|
||||
.loc 16 277 0
|
||||
cvt.s32.s64 %r17, %rd17;
|
||||
mul.lo.s32 %r18, %r17, %r16;
|
||||
cvt.s64.s32 %rd26, %r18;
|
||||
mul.wide.s32 %rd27, %r18, 4;
|
||||
add.u64 %rd28, %rd24, %rd27;
|
||||
.loc 16 278 0
|
||||
mul.lo.s32 %r19, %r10, %r17;
|
||||
cvt.s64.s32 %rd29, %r19;
|
||||
mul.wide.s32 %rd30, %r19, 4;
|
||||
add.u64 %rd31, %rd24, %rd30;
|
||||
.loc 16 279 0
|
||||
mul.lo.s32 %r20, %r17, %r6;
|
||||
bra.uni $Lt_1_23298;
|
||||
$Lt_1_23554:
|
||||
.loc 16 281 0
|
||||
ld.global.s32 %r21, [%rd24+0];
|
||||
cvt.s64.s32 %rd32, %r21;
|
||||
mul.wide.s32 %rd33, %r21, 4;
|
||||
add.u64 %rd34, %rd25, %rd33;
|
||||
.loc 16 282 0
|
||||
cvt.s64.s32 %rd35, %r16;
|
||||
mul.wide.s32 %rd36, %r16, 4;
|
||||
add.u64 %rd28, %rd34, %rd36;
|
||||
.loc 16 283 0
|
||||
mov.s32 %r20, %r6;
|
||||
.loc 16 284 0
|
||||
cvt.s64.s32 %rd37, %r10;
|
||||
mul.wide.s32 %rd38, %r10, 4;
|
||||
add.u64 %rd31, %rd34, %rd38;
|
||||
$Lt_1_23298:
|
||||
.loc 16 287 0
|
||||
ld.global.s32 %r22, [%rd22+0];
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
mov.s32 %r28, 0;
|
||||
mov.u32 %r29, %r28;
|
||||
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
mov.f32 %f28, %f24;
|
||||
mov.f32 %f29, %f25;
|
||||
setp.ge.u64 %p6, %rd31, %rd28;
|
||||
@%p6 bra $Lt_1_32002;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f29;
|
||||
cvt.s64.s32 %rd39, %r20;
|
||||
mul.lo.s32 %r31, %r30, 11;
|
||||
cvt.rn.f32.s32 %f30, %r31;
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
//<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 294 0
|
||||
ld.global.s32 %r32, [%rd31+0];
|
||||
.loc 16 295 0
|
||||
shr.s32 %r33, %r32, 30;
|
||||
and.b32 %r34, %r33, 3;
|
||||
cvt.s64.s32 %rd40, %r34;
|
||||
mul.wide.s32 %rd41, %r34, 4;
|
||||
add.u64 %rd42, %rd1, %rd41;
|
||||
ld.shared.f32 %f35, [%rd42+0];
|
||||
.loc 16 298 0
|
||||
and.b32 %r35, %r32, 1073741823;
|
||||
mov.u32 %r36, %r35;
|
||||
mov.s32 %r37, 0;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
mov.f32 %f42, %f38;
|
||||
mov.f32 %f43, %f39;
|
||||
sub.ftz.f32 %f44, %f27, %f41;
|
||||
sub.ftz.f32 %f45, %f26, %f40;
|
||||
sub.ftz.f32 %f46, %f28, %f42;
|
||||
mul.ftz.f32 %f47, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
|
||||
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
|
||||
add.ftz.f32 %f50, %f30, %f43;
|
||||
cvt.rzi.ftz.s32.f32 %r43, %f50;
|
||||
cvt.s64.s32 %rd43, %r43;
|
||||
mul.wide.s32 %rd44, %r43, 16;
|
||||
add.u64 %rd45, %rd44, %rd7;
|
||||
ld.shared.f32 %f51, [%rd45+8];
|
||||
setp.gt.ftz.f32 %p7, %f51, %f49;
|
||||
@!%p7 bra $Lt_1_25602;
|
||||
.loc 16 309 0
|
||||
sqrt.approx.ftz.f32 %f52, %f49;
|
||||
ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0];
|
||||
sub.ftz.f32 %f56, %f52, %f55;
|
||||
.loc 16 313 0
|
||||
mul.ftz.f32 %f57, %f56, %f56;
|
||||
cvt.ftz.f64.f32 %fd1, %f57;
|
||||
rcp.rn.f64 %fd2, %fd1;
|
||||
cvt.rn.ftz.f32.f64 %f58, %fd2;
|
||||
mul.ftz.f32 %f59, %f58, %f58;
|
||||
mul.ftz.f32 %f60, %f58, %f59;
|
||||
mul.ftz.f32 %f61, %f53, %f60;
|
||||
sub.ftz.f32 %f62, %f61, %f54;
|
||||
mul.ftz.f32 %f63, %f60, %f62;
|
||||
.loc 16 314 0
|
||||
div.approx.ftz.f32 %f64, %f35, %f56;
|
||||
div.approx.ftz.f32 %f65, %f64, %f52;
|
||||
mul.ftz.f32 %f66, %f63, %f65;
|
||||
.loc 16 316 0
|
||||
fma.rn.ftz.f32 %f33, %f45, %f66, %f33;
|
||||
.loc 16 317 0
|
||||
fma.rn.ftz.f32 %f32, %f44, %f66, %f32;
|
||||
.loc 16 318 0
|
||||
fma.rn.ftz.f32 %f31, %f46, %f66, %f31;
|
||||
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p8, %r44, %r45;
|
||||
@%p8 bra $Lt_1_25090;
|
||||
.loc 16 321 0
|
||||
add.u64 %rd46, %rd44, %rd13;
|
||||
ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0];
|
||||
mul.ftz.f32 %f70, %f67, %f60;
|
||||
sub.ftz.f32 %f71, %f70, %f68;
|
||||
mul.ftz.f32 %f72, %f60, %f71;
|
||||
.loc 16 322 0
|
||||
sub.ftz.f32 %f73, %f72, %f69;
|
||||
fma.rn.ftz.f32 %f34, %f35, %f73, %f34;
|
||||
$Lt_1_25090:
|
||||
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r47, 0;
|
||||
setp.le.s32 %p9, %r46, %r47;
|
||||
@%p9 bra $Lt_1_25602;
|
||||
.loc 16 325 0
|
||||
mov.f32 %f74, %f11;
|
||||
mul.ftz.f32 %f75, %f45, %f45;
|
||||
fma.rn.ftz.f32 %f76, %f66, %f75, %f74;
|
||||
mov.f32 %f11, %f76;
|
||||
.loc 16 326 0
|
||||
mov.f32 %f77, %f13;
|
||||
fma.rn.ftz.f32 %f78, %f66, %f47, %f77;
|
||||
mov.f32 %f13, %f78;
|
||||
.loc 16 327 0
|
||||
mov.f32 %f79, %f15;
|
||||
mul.ftz.f32 %f80, %f46, %f46;
|
||||
fma.rn.ftz.f32 %f81, %f66, %f80, %f79;
|
||||
mov.f32 %f15, %f81;
|
||||
.loc 16 328 0
|
||||
mov.f32 %f82, %f17;
|
||||
mul.ftz.f32 %f83, %f44, %f45;
|
||||
fma.rn.ftz.f32 %f84, %f66, %f83, %f82;
|
||||
mov.f32 %f17, %f84;
|
||||
.loc 16 329 0
|
||||
mov.f32 %f85, %f19;
|
||||
mul.ftz.f32 %f86, %f45, %f46;
|
||||
fma.rn.ftz.f32 %f87, %f66, %f86, %f85;
|
||||
mov.f32 %f19, %f87;
|
||||
.loc 16 330 0
|
||||
mul.ftz.f32 %f88, %f44, %f46;
|
||||
fma.rn.ftz.f32 %f20, %f66, %f88, %f20;
|
||||
mov.f32 %f21, %f20;
|
||||
$Lt_1_25602:
|
||||
$Lt_1_24578:
|
||||
.loc 16 292 0
|
||||
mul.lo.u64 %rd47, %rd39, 4;
|
||||
add.u64 %rd31, %rd31, %rd47;
|
||||
setp.lt.u64 %p10, %rd31, %rd28;
|
||||
@%p10 bra $Lt_1_24322;
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_32002:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_23042:
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
$Lt_1_22786:
|
||||
mov.u32 %r48, 1;
|
||||
setp.le.s32 %p11, %r6, %r48;
|
||||
@%p11 bra $Lt_1_28418;
|
||||
.loc 16 341 0
|
||||
mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168;
|
||||
cvt.s64.s32 %rd49, %r1;
|
||||
mul.wide.s32 %rd50, %r1, 4;
|
||||
add.u64 %rd51, %rd48, %rd50;
|
||||
mov.f32 %f89, %f33;
|
||||
st.shared.f32 [%rd51+0], %f89;
|
||||
.loc 16 342 0
|
||||
mov.f32 %f90, %f32;
|
||||
st.shared.f32 [%rd51+512], %f90;
|
||||
.loc 16 343 0
|
||||
mov.f32 %f91, %f31;
|
||||
st.shared.f32 [%rd51+1024], %f91;
|
||||
.loc 16 344 0
|
||||
mov.f32 %f92, %f34;
|
||||
st.shared.f32 [%rd51+1536], %f92;
|
||||
.loc 16 346 0
|
||||
shr.s32 %r49, %r6, 31;
|
||||
mov.s32 %r50, 1;
|
||||
and.b32 %r51, %r49, %r50;
|
||||
add.s32 %r52, %r51, %r6;
|
||||
shr.s32 %r53, %r52, 1;
|
||||
mov.s32 %r54, %r53;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p12, %r53, %r55;
|
||||
@!%p12 bra $Lt_1_26882;
|
||||
$Lt_1_27394:
|
||||
setp.ge.u32 %p13, %r10, %r54;
|
||||
@%p13 bra $Lt_1_27650;
|
||||
.loc 16 349 0
|
||||
add.u32 %r56, %r1, %r54;
|
||||
cvt.u64.u32 %rd52, %r56;
|
||||
mul.wide.u32 %rd53, %r56, 4;
|
||||
add.u64 %rd54, %rd48, %rd53;
|
||||
ld.shared.f32 %f93, [%rd54+0];
|
||||
add.ftz.f32 %f89, %f93, %f89;
|
||||
st.shared.f32 [%rd51+0], %f89;
|
||||
ld.shared.f32 %f94, [%rd54+512];
|
||||
add.ftz.f32 %f90, %f94, %f90;
|
||||
st.shared.f32 [%rd51+512], %f90;
|
||||
ld.shared.f32 %f95, [%rd54+1024];
|
||||
add.ftz.f32 %f91, %f95, %f91;
|
||||
st.shared.f32 [%rd51+1024], %f91;
|
||||
ld.shared.f32 %f96, [%rd54+1536];
|
||||
add.ftz.f32 %f92, %f96, %f92;
|
||||
st.shared.f32 [%rd51+1536], %f92;
|
||||
$Lt_1_27650:
|
||||
.loc 16 346 0
|
||||
shr.u32 %r54, %r54, 1;
|
||||
mov.u32 %r57, 0;
|
||||
setp.ne.u32 %p14, %r54, %r57;
|
||||
@%p14 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
.loc 16 353 0
|
||||
mov.f32 %f33, %f89;
|
||||
.loc 16 354 0
|
||||
mov.f32 %f32, %f90;
|
||||
.loc 16 355 0
|
||||
mov.f32 %f31, %f91;
|
||||
.loc 16 356 0
|
||||
mov.f32 %f34, %f92;
|
||||
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r59, 0;
|
||||
setp.le.s32 %p15, %r58, %r59;
|
||||
@%p15 bra $Lt_1_28418;
|
||||
.loc 16 360 0
|
||||
mov.f32 %f89, %f11;
|
||||
st.shared.f32 [%rd51+0], %f89;
|
||||
mov.f32 %f90, %f13;
|
||||
st.shared.f32 [%rd51+512], %f90;
|
||||
mov.f32 %f91, %f15;
|
||||
st.shared.f32 [%rd51+1024], %f91;
|
||||
mov.f32 %f92, %f17;
|
||||
st.shared.f32 [%rd51+1536], %f92;
|
||||
mov.f32 %f97, %f19;
|
||||
st.shared.f32 [%rd51+2048], %f97;
|
||||
mov.f32 %f98, %f21;
|
||||
st.shared.f32 [%rd51+2560], %f98;
|
||||
.loc 16 362 0
|
||||
mov.s32 %r60, %r53;
|
||||
@!%p12 bra $Lt_1_28930;
|
||||
$Lt_1_29442:
|
||||
setp.ge.u32 %p16, %r10, %r60;
|
||||
@%p16 bra $Lt_1_29698;
|
||||
.loc 16 365 0
|
||||
add.u32 %r61, %r1, %r60;
|
||||
cvt.u64.u32 %rd55, %r61;
|
||||
mul.wide.u32 %rd56, %r61, 4;
|
||||
add.u64 %rd57, %rd48, %rd56;
|
||||
ld.shared.f32 %f99, [%rd57+0];
|
||||
add.ftz.f32 %f89, %f99, %f89;
|
||||
st.shared.f32 [%rd51+0], %f89;
|
||||
ld.shared.f32 %f100, [%rd57+512];
|
||||
add.ftz.f32 %f90, %f100, %f90;
|
||||
st.shared.f32 [%rd51+512], %f90;
|
||||
ld.shared.f32 %f101, [%rd57+1024];
|
||||
add.ftz.f32 %f91, %f101, %f91;
|
||||
st.shared.f32 [%rd51+1024], %f91;
|
||||
ld.shared.f32 %f102, [%rd57+1536];
|
||||
add.ftz.f32 %f92, %f102, %f92;
|
||||
st.shared.f32 [%rd51+1536], %f92;
|
||||
ld.shared.f32 %f103, [%rd57+2048];
|
||||
add.ftz.f32 %f97, %f103, %f97;
|
||||
st.shared.f32 [%rd51+2048], %f97;
|
||||
ld.shared.f32 %f104, [%rd57+2560];
|
||||
add.ftz.f32 %f98, %f104, %f98;
|
||||
st.shared.f32 [%rd51+2560], %f98;
|
||||
$Lt_1_29698:
|
||||
.loc 16 362 0
|
||||
shr.u32 %r60, %r60, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p17, %r60, %r62;
|
||||
@%p17 bra $Lt_1_29442;
|
||||
$Lt_1_28930:
|
||||
.loc 16 370 0
|
||||
mov.f32 %f11, %f89;
|
||||
mov.f32 %f13, %f90;
|
||||
mov.f32 %f15, %f91;
|
||||
mov.f32 %f17, %f92;
|
||||
mov.f32 %f19, %f97;
|
||||
mov.f32 %f21, %f98;
|
||||
$Lt_1_28418:
|
||||
$Lt_1_26370:
|
||||
selp.s32 %r63, 1, 0, %p4;
|
||||
mov.s32 %r64, 0;
|
||||
set.eq.u32.s32 %r65, %r10, %r64;
|
||||
neg.s32 %r66, %r65;
|
||||
and.b32 %r67, %r63, %r66;
|
||||
mov.u32 %r68, 0;
|
||||
setp.eq.s32 %p18, %r67, %r68;
|
||||
@%p18 bra $Lt_1_30466;
|
||||
.loc 16 376 0
|
||||
cvt.s64.s32 %rd58, %r13;
|
||||
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
|
||||
mul.wide.s32 %rd60, %r13, 4;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30978;
|
||||
.loc 16 378 0
|
||||
st.global.f32 [%rd61+0], %f34;
|
||||
.loc 16 379 0
|
||||
cvt.s64.s32 %rd62, %r14;
|
||||
mul.wide.s32 %rd63, %r14, 4;
|
||||
add.u64 %rd61, %rd61, %rd63;
|
||||
$Lt_1_30978:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31490;
|
||||
.loc 16 383 0
|
||||
mov.f32 %f105, %f11;
|
||||
st.global.f32 [%rd61+0], %f105;
|
||||
.loc 16 384 0
|
||||
cvt.s64.s32 %rd64, %r14;
|
||||
mul.wide.s32 %rd65, %r14, 4;
|
||||
add.u64 %rd66, %rd65, %rd61;
|
||||
.loc 16 383 0
|
||||
mov.f32 %f106, %f13;
|
||||
st.global.f32 [%rd66+0], %f106;
|
||||
.loc 16 384 0
|
||||
add.u64 %rd67, %rd65, %rd66;
|
||||
.loc 16 383 0
|
||||
mov.f32 %f107, %f15;
|
||||
st.global.f32 [%rd67+0], %f107;
|
||||
.loc 16 384 0
|
||||
add.u64 %rd68, %rd65, %rd67;
|
||||
.loc 16 383 0
|
||||
mov.f32 %f108, %f17;
|
||||
st.global.f32 [%rd68+0], %f108;
|
||||
.loc 16 384 0
|
||||
add.u64 %rd61, %rd65, %rd68;
|
||||
.loc 16 383 0
|
||||
mov.f32 %f109, %f19;
|
||||
st.global.f32 [%rd61+0], %f109;
|
||||
mov.f32 %f110, %f21;
|
||||
add.u64 %rd69, %rd65, %rd61;
|
||||
st.global.f32 [%rd69+0], %f110;
|
||||
$Lt_1_31490:
|
||||
.loc 16 387 0
|
||||
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd71, %rd58, 16;
|
||||
add.u64 %rd72, %rd70, %rd71;
|
||||
mov.f32 %f111, %f112;
|
||||
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111};
|
||||
$Lt_1_30466:
|
||||
.loc 16 389 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -1,999 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bf97_00000000-9_morse_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.pRrhev)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bf97_00000000-8_morse_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "morse_gpu_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
|
||||
//-----------------------------------------------------------
// kernel_pair  (compiler-generated from morse_gpu_kernel.cu, lines 88-229)
//
// Accumulates a pairwise force, an optional energy and an optional
// 6-component virial for one atom, using t_per_atom cooperating threads
// per atom, then writes the per-atom results to global memory.
//
// Parameters as used by the code below:
//   x_          not read by this kernel body; positions are fetched
//               through the pos_tex texture reference instead.
//   mor1        table of 16-byte records {c0,c1,c2,c3}, indexed by
//               lj_types*type_i + type_j.  c0 is compared against r^2,
//               and the pair factor is c1*(e*e - e)/r with
//               e = exp(-c3*(r - c2)).
//               NOTE(review): presumably c0 = cutoff^2, c2 = r0,
//               c3 = alpha -- confirm against the host-side packing.
//   mor2        table of 8-byte records {d0,d1}, same index; the energy
//               term is d0*(e*e - 2e) - d1 (only read when eflag > 0).
//   lj_types    row stride of the mor1/mor2 tables.
//   sp_lj_in    4 floats, copied to shared memory; selected per
//               neighbor by the top two bits of the neighbor entry.
//   dev_nbor    neighbor storage: [ii] = atom index, [ii + nbor_pitch]
//               = neighbor count, [ii + 2*nbor_pitch...] = list or
//               offset into dev_packed.
//   dev_packed  packed neighbor storage; when equal to dev_nbor the
//               strided (unpacked) layout is walked instead.
//   ans         per-atom float4 output {fx, fy, fz, 0}.
//   engv        per-atom energy/virial output, component stride = inum.
//   eflag/vflag energy / virial are computed and stored only when > 0.
//   inum        number of atoms handled; also the engv component stride.
//   nbor_pitch  element stride between rows of dev_nbor.
//   t_per_atom  number of threads cooperating on one atom.
//
// The type index of an atom is taken from the w component of its
// position texel, truncated to an integer.
//-----------------------------------------------------------
.entry kernel_pair (
|
||||
.param .u64 __cudaparm_kernel_pair_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_mor1,
|
||||
.param .u64 __cudaparm_kernel_pair_mor2,
|
||||
.param .s32 __cudaparm_kernel_pair_lj_types,
|
||||
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<72>;
|
||||
.reg .u64 %rd<63>;
|
||||
.reg .f32 %f<104>;
|
||||
.reg .f64 %fd<10>;
|
||||
.reg .pred %p<19>;
|
||||
// Shared copy of the four sp_lj_in factors.
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
|
||||
// Reduction scratch: 6 rows spaced 512 bytes (128 floats) apart.
// NOTE(review): the row spacing implies at most 128 threads per block -- confirm against the launcher.
.shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];
|
||||
// __cuda_local_var_32504_10_non_const_f = 48
|
||||
// __cuda_local_var_32508_9_non_const_virial = 16
|
||||
.loc 16 88 0
|
||||
$LDWbegin_kernel_pair:
|
||||
.loc 16 95 0
|
||||
// Every thread copies the four sp_lj_in floats into shared memory (no thread-index guard).
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
|
||||
ldu.global.f32 %f1, [%rd1+0];
|
||||
.loc 16 96 0
|
||||
ld.global.f32 %f2, [%rd1+4];
|
||||
.loc 16 97 0
|
||||
ld.global.f32 %f3, [%rd1+8];
|
||||
.loc 16 98 0
|
||||
ld.global.f32 %f4, [%rd1+12];
|
||||
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
|
||||
.loc 16 107 0
|
||||
// Zero the six virial accumulators: %f6, %f8, %f10, %f12, %f14, %f16 (%f15 shadows %f16).
mov.f32 %f5, 0f00000000; // 0
|
||||
mov.f32 %f6, %f5;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, %f7;
|
||||
mov.f32 %f9, 0f00000000; // 0
|
||||
mov.f32 %f10, %f9;
|
||||
mov.f32 %f11, 0f00000000; // 0
|
||||
mov.f32 %f12, %f11;
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, %f13;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.f32 %f16, %f15;
|
||||
// Thread-to-atom mapping:
//   %r3 = tid / t_per_atom        (atom slot inside the block)
//   %r5 = ntid / t_per_atom       (atoms per block)
//   %r6 = tid % t_per_atom        (this thread's offset within its atom group)
//   %r9 = ctaid * %r5 + %r3       (global atom slot ii)
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
|
||||
cvt.s32.u32 %r2, %tid.x;
|
||||
div.s32 %r3, %r2, %r1;
|
||||
cvt.s32.u32 %r4, %ntid.x;
|
||||
div.s32 %r5, %r4, %r1;
|
||||
rem.s32 %r6, %r2, %r1;
|
||||
cvt.s32.u32 %r7, %ctaid.x;
|
||||
mul.lo.s32 %r8, %r7, %r5;
|
||||
add.s32 %r9, %r3, %r8;
|
||||
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
|
||||
// %p1 = (ii < inum); threads past the end skip the pair loop but still reach the reduction code.
setp.lt.s32 %p1, %r9, %r10;
|
||||
@!%p1 bra $Lt_0_19202;
|
||||
.loc 16 113 0
|
||||
// %rd7 = &dev_nbor[ii]; %rd8 = %rd7 + nbor_pitch elements (neighbor count -> %r12);
// %rd9 = %rd8 + nbor_pitch elements (first list entry, or offset into dev_packed).
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
|
||||
cvt.s64.s32 %rd2, %r11;
|
||||
mul.wide.s32 %rd3, %r11, 4;
|
||||
cvt.s64.s32 %rd4, %r9;
|
||||
mul.wide.s32 %rd5, %r9, 4;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
|
||||
add.u64 %rd7, %rd5, %rd6;
|
||||
add.u64 %rd8, %rd3, %rd7;
|
||||
ld.global.s32 %r12, [%rd8+0];
|
||||
add.u64 %rd9, %rd3, %rd8;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
|
||||
// dev_packed != dev_nbor selects the packed layout at $Lt_0_19714.
setp.ne.u64 %p2, %rd10, %rd6;
|
||||
@%p2 bra $Lt_0_19714;
|
||||
.loc 16 119 0
|
||||
// Unpacked layout: entries are nbor_pitch elements apart.
//   %rd13 = list end   = %rd9 + nbor_pitch*numj elements
//   %rd16 = list start = %rd9 + offset*nbor_pitch elements
//   %r16  = step       = nbor_pitch * t_per_atom elements
cvt.s32.s64 %r13, %rd2;
|
||||
mul.lo.s32 %r14, %r13, %r12;
|
||||
cvt.s64.s32 %rd11, %r14;
|
||||
mul.wide.s32 %rd12, %r14, 4;
|
||||
add.u64 %rd13, %rd9, %rd12;
|
||||
.loc 16 120 0
|
||||
mul.lo.s32 %r15, %r6, %r13;
|
||||
cvt.s64.s32 %rd14, %r15;
|
||||
mul.wide.s32 %rd15, %r15, 4;
|
||||
add.u64 %rd16, %rd9, %rd15;
|
||||
.loc 16 121 0
|
||||
mul.lo.s32 %r16, %r13, %r1;
|
||||
bra.uni $Lt_0_19458;
|
||||
$Lt_0_19714:
|
||||
.loc 16 123 0
|
||||
// Packed layout: [%rd9] holds an element offset into dev_packed.
//   %rd19 = dev_packed + offset elements
//   %rd13 = list end   = %rd19 + numj elements
//   %rd16 = list start = %rd19 + (tid % t_per_atom) elements
//   %r16  = step       = t_per_atom elements
ld.global.s32 %r17, [%rd9+0];
|
||||
cvt.s64.s32 %rd17, %r17;
|
||||
mul.wide.s32 %rd18, %r17, 4;
|
||||
add.u64 %rd19, %rd10, %rd18;
|
||||
.loc 16 124 0
|
||||
cvt.s64.s32 %rd20, %r12;
|
||||
mul.wide.s32 %rd21, %r12, 4;
|
||||
add.u64 %rd13, %rd19, %rd21;
|
||||
.loc 16 125 0
|
||||
mov.s32 %r16, %r1;
|
||||
.loc 16 126 0
|
||||
cvt.s64.s32 %rd22, %r6;
|
||||
mul.wide.s32 %rd23, %r6, 4;
|
||||
add.u64 %rd16, %rd19, %rd23;
|
||||
$Lt_0_19458:
|
||||
.loc 16 129 0
|
||||
// Fetch atom i: index from [%rd7], texel {x,y,z,w} -> %f21..%f24.
ld.global.s32 %r18, [%rd7+0];
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
|
||||
mov.f32 %f21, %f17;
|
||||
mov.f32 %f22, %f18;
|
||||
mov.f32 %f23, %f19;
|
||||
mov.f32 %f24, %f20;
|
||||
// Empty slice of the neighbor list for this thread: skip the loop.
setp.ge.u64 %p3, %rd16, %rd13;
|
||||
@%p3 bra $Lt_0_28162;
|
||||
// Loop invariants: %r28 = lj_types * type_i, %rd24 = step, %rd25 = mor1 base,
// %rd26 = shared sp_lj base; force (%f27,%f26,%f25) and energy (%f28) start at 0.
cvt.rzi.ftz.s32.f32 %r26, %f24;
|
||||
cvt.s64.s32 %rd24, %r16;
|
||||
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
|
||||
mul.lo.s32 %r28, %r27, %r26;
|
||||
ld.param.u64 %rd25, [__cudaparm_kernel_pair_mor1];
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
|
||||
$Lt_0_20482:
|
||||
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 135 0
|
||||
// Neighbor entry %r29: top two bits select the sp_lj factor (%f29),
// low 30 bits (mask 1073741823 = 0x3fffffff) are the neighbor's atom index.
ld.global.s32 %r29, [%rd16+0];
|
||||
.loc 16 136 0
|
||||
shr.s32 %r30, %r29, 30;
|
||||
and.b32 %r31, %r30, 3;
|
||||
cvt.s64.s32 %rd27, %r31;
|
||||
mul.wide.s32 %rd28, %r31, 4;
|
||||
add.u64 %rd29, %rd26, %rd28;
|
||||
ld.shared.f32 %f29, [%rd29+0];
|
||||
.loc 16 139 0
|
||||
and.b32 %r32, %r29, 1073741823;
|
||||
mov.u32 %r33, %r32;
|
||||
mov.s32 %r34, 0;
|
||||
mov.u32 %r35, %r34;
|
||||
mov.s32 %r36, 0;
|
||||
mov.u32 %r37, %r36;
|
||||
mov.s32 %r38, 0;
|
||||
mov.u32 %r39, %r38;
|
||||
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
|
||||
mov.f32 %f34, %f30;
|
||||
mov.f32 %f35, %f31;
|
||||
mov.f32 %f36, %f32;
|
||||
mov.f32 %f37, %f33;
|
||||
// %r40 = type_j; (%f39,%f38,%f40) = (dx,dy,dz) = pos_i - pos_j; %f43 = r^2; %f41 keeps dy*dy.
cvt.rzi.ftz.s32.f32 %r40, %f37;
|
||||
sub.ftz.f32 %f38, %f22, %f35;
|
||||
sub.ftz.f32 %f39, %f21, %f34;
|
||||
sub.ftz.f32 %f40, %f23, %f36;
|
||||
mul.ftz.f32 %f41, %f38, %f38;
|
||||
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
|
||||
// %r41 = lj_types*type_i + type_j indexes both coefficient tables.
add.s32 %r41, %r40, %r28;
|
||||
cvt.s64.s32 %rd30, %r41;
|
||||
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
|
||||
mul.wide.s32 %rd31, %r41, 16;
|
||||
add.u64 %rd32, %rd25, %rd31;
|
||||
ld.global.f32 %f44, [%rd32+0];
|
||||
// Pair contributes only when r^2 < mor1[idx].c0.
setp.gt.ftz.f32 %p4, %f44, %f43;
|
||||
@!%p4 bra $Lt_0_21762;
|
||||
.loc 16 152 0
|
||||
// %f45 = r; %f51 = -c3*(r - c2).
sqrt.approx.ftz.f32 %f45, %f43;
|
||||
ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd32+0];
|
||||
sub.ftz.f32 %f49, %f45, %f47;
|
||||
mul.ftz.f32 %f50, %f48, %f49;
|
||||
neg.ftz.f32 %f51, %f50;
|
||||
.loc 16 154 0
|
||||
// exp(x) evaluated as ex2(x * log2(e)); %f54 = e, %f57 = c1*(e*e - e).
mov.f32 %f52, 0f3fb8aa3b; // 1.4427
|
||||
mul.ftz.f32 %f53, %f51, %f52;
|
||||
ex2.approx.ftz.f32 %f54, %f53;
|
||||
mul.ftz.f32 %f55, %f54, %f54;
|
||||
sub.ftz.f32 %f56, %f55, %f54;
|
||||
mul.ftz.f32 %f57, %f46, %f56;
|
||||
.loc 16 156 0
|
||||
// %f59 = (c1*(e*e - e)/r) * sp_lj factor; force += delta * %f59.
div.approx.ftz.f32 %f58, %f57, %f45;
|
||||
mul.ftz.f32 %f59, %f58, %f29;
|
||||
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
|
||||
.loc 16 157 0
|
||||
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
|
||||
.loc 16 158 0
|
||||
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
|
||||
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r43, 0;
|
||||
setp.le.s32 %p5, %r42, %r43;
|
||||
@%p5 bra $Lt_0_21250;
|
||||
.loc 16 162 0
|
||||
// Energy (eflag > 0): evaluated in f64 as d0*(e*e - 2e) - d1, rounded to f32,
// then accumulated as energy += sp_lj factor * value.
cvt.ftz.f64.f32 %fd1, %f54;
|
||||
ld.param.u64 %rd33, [__cudaparm_kernel_pair_mor2];
|
||||
mul.lo.u64 %rd34, %rd30, 8;
|
||||
add.u64 %rd35, %rd33, %rd34;
|
||||
ld.global.v2.f32 {%f60,%f61}, [%rd35+0];
|
||||
cvt.ftz.f64.f32 %fd2, %f61;
|
||||
cvt.ftz.f64.f32 %fd3, %f60;
|
||||
mul.ftz.f32 %f62, %f54, %f54;
|
||||
cvt.ftz.f64.f32 %fd4, %f62;
|
||||
add.f64 %fd5, %fd1, %fd1;
|
||||
sub.f64 %fd6, %fd4, %fd5;
|
||||
mul.f64 %fd7, %fd3, %fd6;
|
||||
sub.f64 %fd8, %fd7, %fd2;
|
||||
cvt.rn.ftz.f32.f64 %f63, %fd8;
|
||||
fma.rn.ftz.f32 %f28, %f29, %f63, %f28;
|
||||
$Lt_0_21250:
|
||||
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p6, %r44, %r45;
|
||||
@%p6 bra $Lt_0_21762;
|
||||
.loc 16 165 0
|
||||
// Virial (vflag > 0), each term scaled by %f59:
//   %f6 += dx*dx, %f8 += dy*dy, %f10 += dz*dz,
//   %f12 += dx*dy, %f14 += dx*dz, %f16 += dy*dz.
mov.f32 %f64, %f6;
|
||||
mul.ftz.f32 %f65, %f39, %f39;
|
||||
fma.rn.ftz.f32 %f66, %f59, %f65, %f64;
|
||||
mov.f32 %f6, %f66;
|
||||
.loc 16 166 0
|
||||
mov.f32 %f67, %f8;
|
||||
fma.rn.ftz.f32 %f68, %f59, %f41, %f67;
|
||||
mov.f32 %f8, %f68;
|
||||
.loc 16 167 0
|
||||
mov.f32 %f69, %f10;
|
||||
mul.ftz.f32 %f70, %f40, %f40;
|
||||
fma.rn.ftz.f32 %f71, %f59, %f70, %f69;
|
||||
mov.f32 %f10, %f71;
|
||||
.loc 16 168 0
|
||||
mov.f32 %f72, %f12;
|
||||
mul.ftz.f32 %f73, %f38, %f39;
|
||||
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
|
||||
mov.f32 %f12, %f74;
|
||||
.loc 16 169 0
|
||||
mov.f32 %f75, %f14;
|
||||
mul.ftz.f32 %f76, %f39, %f40;
|
||||
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
|
||||
mov.f32 %f14, %f77;
|
||||
.loc 16 170 0
|
||||
mul.ftz.f32 %f78, %f38, %f40;
|
||||
fma.rn.ftz.f32 %f15, %f59, %f78, %f15;
|
||||
mov.f32 %f16, %f15;
|
||||
$Lt_0_21762:
|
||||
$Lt_0_20738:
|
||||
.loc 16 133 0
|
||||
// Advance this thread's list pointer by step elements; loop while below the list end.
mul.lo.u64 %rd36, %rd24, 4;
|
||||
add.u64 %rd16, %rd16, %rd36;
|
||||
setp.lt.u64 %p7, %rd16, %rd13;
|
||||
@%p7 bra $Lt_0_20482;
|
||||
bra.uni $Lt_0_18946;
|
||||
$Lt_0_28162:
|
||||
// Entry for threads whose neighbor slice was empty: zero force and energy.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
bra.uni $Lt_0_18946;
|
||||
$Lt_0_19202:
|
||||
// Entry for threads with ii >= inum: zero force and energy.
mov.f32 %f25, 0f00000000; // 0
|
||||
mov.f32 %f26, 0f00000000; // 0
|
||||
mov.f32 %f27, 0f00000000; // 0
|
||||
mov.f32 %f28, 0f00000000; // 0
|
||||
$Lt_0_18946:
|
||||
// With t_per_atom <= 1 there is nothing to combine; skip both reductions.
mov.u32 %r46, 1;
|
||||
setp.le.s32 %p8, %r1, %r46;
|
||||
@%p8 bra $Lt_0_24578;
|
||||
.loc 16 181 0
|
||||
// Reduction 1: fx, fy, fz, energy go to rows 0..3 of red_acc at column tid (%rd40).
// No bar.sync is issued anywhere in this kernel.
// NOTE(review): presumably relies on the t_per_atom threads of one atom running in
// lockstep inside a single warp -- confirm before changing t_per_atom limits.
mov.u64 %rd37, __cuda___cuda_local_var_32582_35_non_const_red_acc108;
|
||||
cvt.s64.s32 %rd38, %r2;
|
||||
mul.wide.s32 %rd39, %r2, 4;
|
||||
add.u64 %rd40, %rd37, %rd39;
|
||||
mov.f32 %f79, %f27;
|
||||
st.shared.f32 [%rd40+0], %f79;
|
||||
.loc 16 182 0
|
||||
mov.f32 %f80, %f26;
|
||||
st.shared.f32 [%rd40+512], %f80;
|
||||
.loc 16 183 0
|
||||
mov.f32 %f81, %f25;
|
||||
st.shared.f32 [%rd40+1024], %f81;
|
||||
.loc 16 184 0
|
||||
mov.f32 %f82, %f28;
|
||||
st.shared.f32 [%rd40+1536], %f82;
|
||||
.loc 16 186 0
|
||||
// %r51 = t_per_atom / 2 as a signed division (sign-bit correction, then arithmetic shift).
shr.s32 %r47, %r1, 31;
|
||||
mov.s32 %r48, 1;
|
||||
and.b32 %r49, %r47, %r48;
|
||||
add.s32 %r50, %r49, %r1;
|
||||
shr.s32 %r51, %r50, 1;
|
||||
mov.s32 %r52, %r51;
|
||||
mov.u32 %r53, 0;
|
||||
// %p9 is reused by the virial reduction below.
setp.ne.u32 %p9, %r51, %r53;
|
||||
@!%p9 bra $Lt_0_23042;
|
||||
$Lt_0_23554:
|
||||
// Halving loop: threads with offset < s add the values stored at column tid + s.
setp.ge.u32 %p10, %r6, %r52;
|
||||
@%p10 bra $Lt_0_23810;
|
||||
.loc 16 189 0
|
||||
add.u32 %r54, %r2, %r52;
|
||||
cvt.u64.u32 %rd41, %r54;
|
||||
mul.wide.u32 %rd42, %r54, 4;
|
||||
add.u64 %rd43, %rd37, %rd42;
|
||||
ld.shared.f32 %f83, [%rd43+0];
|
||||
add.ftz.f32 %f79, %f83, %f79;
|
||||
st.shared.f32 [%rd40+0], %f79;
|
||||
ld.shared.f32 %f84, [%rd43+512];
|
||||
add.ftz.f32 %f80, %f84, %f80;
|
||||
st.shared.f32 [%rd40+512], %f80;
|
||||
ld.shared.f32 %f85, [%rd43+1024];
|
||||
add.ftz.f32 %f81, %f85, %f81;
|
||||
st.shared.f32 [%rd40+1024], %f81;
|
||||
ld.shared.f32 %f86, [%rd43+1536];
|
||||
add.ftz.f32 %f82, %f86, %f82;
|
||||
st.shared.f32 [%rd40+1536], %f82;
|
||||
$Lt_0_23810:
|
||||
.loc 16 186 0
|
||||
shr.u32 %r52, %r52, 1;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p11, %r52, %r55;
|
||||
@%p11 bra $Lt_0_23554;
|
||||
$Lt_0_23042:
|
||||
.loc 16 193 0
|
||||
// Copy the reduced force/energy back into their accumulator registers.
mov.f32 %f27, %f79;
|
||||
.loc 16 194 0
|
||||
mov.f32 %f26, %f80;
|
||||
.loc 16 195 0
|
||||
mov.f32 %f25, %f81;
|
||||
.loc 16 196 0
|
||||
mov.f32 %f28, %f82;
|
||||
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r57, 0;
|
||||
setp.le.s32 %p12, %r56, %r57;
|
||||
@%p12 bra $Lt_0_24578;
|
||||
.loc 16 200 0
|
||||
// Reduction 2 (vflag > 0): the six virial terms use rows 0..5 of the same scratch buffer.
mov.f32 %f79, %f6;
|
||||
st.shared.f32 [%rd40+0], %f79;
|
||||
mov.f32 %f80, %f8;
|
||||
st.shared.f32 [%rd40+512], %f80;
|
||||
mov.f32 %f81, %f10;
|
||||
st.shared.f32 [%rd40+1024], %f81;
|
||||
mov.f32 %f82, %f12;
|
||||
st.shared.f32 [%rd40+1536], %f82;
|
||||
mov.f32 %f87, %f14;
|
||||
st.shared.f32 [%rd40+2048], %f87;
|
||||
mov.f32 %f88, %f16;
|
||||
st.shared.f32 [%rd40+2560], %f88;
|
||||
.loc 16 202 0
|
||||
mov.s32 %r58, %r51;
|
||||
@!%p9 bra $Lt_0_25090;
|
||||
$Lt_0_25602:
|
||||
setp.ge.u32 %p13, %r6, %r58;
|
||||
@%p13 bra $Lt_0_25858;
|
||||
.loc 16 205 0
|
||||
add.u32 %r59, %r2, %r58;
|
||||
cvt.u64.u32 %rd44, %r59;
|
||||
mul.wide.u32 %rd45, %r59, 4;
|
||||
add.u64 %rd46, %rd37, %rd45;
|
||||
ld.shared.f32 %f89, [%rd46+0];
|
||||
add.ftz.f32 %f79, %f89, %f79;
|
||||
st.shared.f32 [%rd40+0], %f79;
|
||||
ld.shared.f32 %f90, [%rd46+512];
|
||||
add.ftz.f32 %f80, %f90, %f80;
|
||||
st.shared.f32 [%rd40+512], %f80;
|
||||
ld.shared.f32 %f91, [%rd46+1024];
|
||||
add.ftz.f32 %f81, %f91, %f81;
|
||||
st.shared.f32 [%rd40+1024], %f81;
|
||||
ld.shared.f32 %f92, [%rd46+1536];
|
||||
add.ftz.f32 %f82, %f92, %f82;
|
||||
st.shared.f32 [%rd40+1536], %f82;
|
||||
ld.shared.f32 %f93, [%rd46+2048];
|
||||
add.ftz.f32 %f87, %f93, %f87;
|
||||
st.shared.f32 [%rd40+2048], %f87;
|
||||
ld.shared.f32 %f94, [%rd46+2560];
|
||||
add.ftz.f32 %f88, %f94, %f88;
|
||||
st.shared.f32 [%rd40+2560], %f88;
|
||||
$Lt_0_25858:
|
||||
.loc 16 202 0
|
||||
shr.u32 %r58, %r58, 1;
|
||||
mov.u32 %r60, 0;
|
||||
setp.ne.u32 %p14, %r58, %r60;
|
||||
@%p14 bra $Lt_0_25602;
|
||||
$Lt_0_25090:
|
||||
.loc 16 210 0
|
||||
// Copy the reduced virial terms back into their accumulator registers.
mov.f32 %f6, %f79;
|
||||
mov.f32 %f8, %f80;
|
||||
mov.f32 %f10, %f81;
|
||||
mov.f32 %f12, %f82;
|
||||
mov.f32 %f14, %f87;
|
||||
mov.f32 %f16, %f88;
|
||||
$Lt_0_24578:
|
||||
$Lt_0_22530:
|
||||
// Only the thread with (ii < inum) and (tid % t_per_atom == 0) writes results.
selp.s32 %r61, 1, 0, %p1;
|
||||
mov.s32 %r62, 0;
|
||||
set.eq.u32.s32 %r63, %r6, %r62;
|
||||
neg.s32 %r64, %r63;
|
||||
and.b32 %r65, %r61, %r64;
|
||||
mov.u32 %r66, 0;
|
||||
setp.eq.s32 %p15, %r65, %r66;
|
||||
@%p15 bra $Lt_0_26626;
|
||||
.loc 16 216 0
|
||||
// %rd50 = &engv[ii]; successive components are inum elements apart.
cvt.s64.s32 %rd47, %r9;
|
||||
ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv];
|
||||
mul.wide.s32 %rd49, %r9, 4;
|
||||
add.u64 %rd50, %rd48, %rd49;
|
||||
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
|
||||
mov.u32 %r68, 0;
|
||||
setp.le.s32 %p16, %r67, %r68;
|
||||
@%p16 bra $Lt_0_27138;
|
||||
.loc 16 218 0
|
||||
// eflag > 0: store the energy first, then step to the next component slot.
st.global.f32 [%rd50+0], %f28;
|
||||
.loc 16 219 0
|
||||
cvt.s64.s32 %rd51, %r10;
|
||||
mul.wide.s32 %rd52, %r10, 4;
|
||||
add.u64 %rd50, %rd50, %rd52;
|
||||
$Lt_0_27138:
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p17, %r69, %r70;
|
||||
@%p17 bra $Lt_0_27650;
|
||||
.loc 16 223 0
|
||||
// vflag > 0: store the six virial terms (unrolled loop), each inum elements further on.
mov.f32 %f95, %f6;
|
||||
st.global.f32 [%rd50+0], %f95;
|
||||
.loc 16 224 0
|
||||
cvt.s64.s32 %rd53, %r10;
|
||||
mul.wide.s32 %rd54, %r10, 4;
|
||||
add.u64 %rd55, %rd54, %rd50;
|
||||
.loc 16 223 0
|
||||
mov.f32 %f96, %f8;
|
||||
st.global.f32 [%rd55+0], %f96;
|
||||
.loc 16 224 0
|
||||
add.u64 %rd56, %rd54, %rd55;
|
||||
.loc 16 223 0
|
||||
mov.f32 %f97, %f10;
|
||||
st.global.f32 [%rd56+0], %f97;
|
||||
.loc 16 224 0
|
||||
add.u64 %rd57, %rd54, %rd56;
|
||||
.loc 16 223 0
|
||||
mov.f32 %f98, %f12;
|
||||
st.global.f32 [%rd57+0], %f98;
|
||||
.loc 16 224 0
|
||||
add.u64 %rd50, %rd54, %rd57;
|
||||
.loc 16 223 0
|
||||
mov.f32 %f99, %f14;
|
||||
st.global.f32 [%rd50+0], %f99;
|
||||
mov.f32 %f100, %f16;
|
||||
add.u64 %rd58, %rd54, %rd50;
|
||||
st.global.f32 [%rd58+0], %f100;
|
||||
$Lt_0_27650:
|
||||
.loc 16 227 0
|
||||
// ans[ii] = {fx, fy, fz, 0}.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
|
||||
mul.lo.u64 %rd60, %rd47, 16;
|
||||
add.u64 %rd61, %rd59, %rd60;
|
||||
// The fourth component is written as an explicit zero: no value is ever computed
// for it, and storing a never-written register would put undefined data in ans.
mov.f32 %f101, 0f00000000; // 0
|
||||
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f101};
|
||||
$Lt_0_26626:
|
||||
.loc 16 229 0
|
||||
exit;
|
||||
$LDWend_kernel_pair:
|
||||
} // kernel_pair
|
||||
|
||||
.entry kernel_pair_fast (
|
||||
.param .u64 __cudaparm_kernel_pair_fast_x_,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_mor1_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_mor2_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_ans,
|
||||
.param .u64 __cudaparm_kernel_pair_fast_engv,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_eflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_vflag,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_inum,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
|
||||
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
|
||||
{
|
||||
.reg .u32 %r<74>;
|
||||
.reg .u64 %rd<76>;
|
||||
.reg .f32 %f<110>;
|
||||
.reg .pred %p<22>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_mor13296[1936];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32647_34_non_const_mor25232[968];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32738_35_non_const_red_acc6200[3072];
|
||||
// __cuda_local_var_32658_10_non_const_f = 48
|
||||
// __cuda_local_var_32662_9_non_const_virial = 16
|
||||
.loc 16 237 0
|
||||
$LDWbegin_kernel_pair_fast:
|
||||
cvt.s32.u32 %r1, %tid.x;
|
||||
mov.u32 %r2, 3;
|
||||
setp.gt.s32 %p1, %r1, %r2;
|
||||
@%p1 bra $Lt_1_21250;
|
||||
.loc 16 247 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
|
||||
cvt.s64.s32 %rd2, %r1;
|
||||
mul.wide.s32 %rd3, %r1, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_21250:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
|
||||
mov.u32 %r3, 120;
|
||||
setp.gt.s32 %p2, %r1, %r3;
|
||||
@%p2 bra $Lt_1_21762;
|
||||
.loc 16 249 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;
|
||||
cvt.s64.s32 %rd8, %r1;
|
||||
mul.wide.s32 %rd9, %r1, 16;
|
||||
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];
|
||||
add.u64 %rd11, %rd10, %rd9;
|
||||
add.u64 %rd12, %rd9, %rd7;
|
||||
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
|
||||
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
|
||||
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r5, 0;
|
||||
setp.le.s32 %p3, %r4, %r5;
|
||||
@%p3 bra $Lt_1_22274;
|
||||
.loc 16 251 0
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
|
||||
mul.lo.u64 %rd14, %rd8, 8;
|
||||
ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];
|
||||
add.u64 %rd16, %rd15, %rd14;
|
||||
add.u64 %rd17, %rd14, %rd13;
|
||||
ld.global.v2.f32 {%f6,%f7}, [%rd16+0];
|
||||
st.shared.v2.f32 [%rd17+0], {%f6,%f7};
|
||||
$Lt_1_22274:
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
|
||||
$Lt_1_21762:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;
|
||||
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
|
||||
.loc 16 261 0
|
||||
mov.f32 %f8, 0f00000000; // 0
|
||||
mov.f32 %f9, %f8;
|
||||
mov.f32 %f10, 0f00000000; // 0
|
||||
mov.f32 %f11, %f10;
|
||||
mov.f32 %f12, 0f00000000; // 0
|
||||
mov.f32 %f13, %f12;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, %f14;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
mov.f32 %f17, %f16;
|
||||
mov.f32 %f18, 0f00000000; // 0
|
||||
mov.f32 %f19, %f18;
|
||||
.loc 16 263 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
|
||||
div.s32 %r7, %r1, %r6;
|
||||
cvt.s32.u32 %r8, %ntid.x;
|
||||
div.s32 %r9, %r8, %r6;
|
||||
rem.s32 %r10, %r1, %r6;
|
||||
cvt.s32.u32 %r11, %ctaid.x;
|
||||
mul.lo.s32 %r12, %r11, %r9;
|
||||
add.s32 %r13, %r7, %r12;
|
||||
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
|
||||
setp.lt.s32 %p4, %r13, %r14;
|
||||
@!%p4 bra $Lt_1_23042;
|
||||
.loc 16 269 0
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
|
||||
cvt.s64.s32 %rd18, %r15;
|
||||
mul.wide.s32 %rd19, %r15, 4;
|
||||
cvt.s64.s32 %rd20, %r13;
|
||||
mul.wide.s32 %rd21, %r13, 4;
|
||||
ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];
|
||||
add.u64 %rd23, %rd21, %rd22;
|
||||
add.u64 %rd24, %rd19, %rd23;
|
||||
ld.global.s32 %r16, [%rd24+0];
|
||||
add.u64 %rd25, %rd19, %rd24;
|
||||
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
|
||||
setp.ne.u64 %p5, %rd26, %rd22;
|
||||
@%p5 bra $Lt_1_23554;
|
||||
.loc 16 275 0
|
||||
cvt.s32.s64 %r17, %rd18;
|
||||
mul.lo.s32 %r18, %r17, %r16;
|
||||
cvt.s64.s32 %rd27, %r18;
|
||||
mul.wide.s32 %rd28, %r18, 4;
|
||||
add.u64 %rd29, %rd25, %rd28;
|
||||
.loc 16 276 0
|
||||
mul.lo.s32 %r19, %r10, %r17;
|
||||
cvt.s64.s32 %rd30, %r19;
|
||||
mul.wide.s32 %rd31, %r19, 4;
|
||||
add.u64 %rd32, %rd25, %rd31;
|
||||
.loc 16 277 0
|
||||
mul.lo.s32 %r20, %r17, %r6;
|
||||
bra.uni $Lt_1_23298;
|
||||
$Lt_1_23554:
|
||||
.loc 16 279 0
|
||||
ld.global.s32 %r21, [%rd25+0];
|
||||
cvt.s64.s32 %rd33, %r21;
|
||||
mul.wide.s32 %rd34, %r21, 4;
|
||||
add.u64 %rd35, %rd26, %rd34;
|
||||
.loc 16 280 0
|
||||
cvt.s64.s32 %rd36, %r16;
|
||||
mul.wide.s32 %rd37, %r16, 4;
|
||||
add.u64 %rd29, %rd35, %rd37;
|
||||
.loc 16 281 0
|
||||
mov.s32 %r20, %r6;
|
||||
.loc 16 282 0
|
||||
cvt.s64.s32 %rd38, %r10;
|
||||
mul.wide.s32 %rd39, %r10, 4;
|
||||
add.u64 %rd32, %rd35, %rd39;
|
||||
$Lt_1_23298:
|
||||
.loc 16 285 0
|
||||
ld.global.s32 %r22, [%rd23+0];
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
mov.s32 %r28, 0;
|
||||
mov.u32 %r29, %r28;
|
||||
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r23,%r25,%r27,%r29}];
|
||||
mov.f32 %f24, %f20;
|
||||
mov.f32 %f25, %f21;
|
||||
mov.f32 %f26, %f22;
|
||||
mov.f32 %f27, %f23;
|
||||
setp.ge.u64 %p6, %rd32, %rd29;
|
||||
@%p6 bra $Lt_1_32002;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f27;
|
||||
cvt.s64.s32 %rd40, %r20;
|
||||
mul.lo.s32 %r31, %r30, 11;
|
||||
cvt.rn.f32.s32 %f28, %r31;
|
||||
mov.f32 %f29, 0f00000000; // 0
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
//<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 292 0
|
||||
ld.global.s32 %r32, [%rd32+0];
|
||||
.loc 16 293 0
|
||||
shr.s32 %r33, %r32, 30;
|
||||
and.b32 %r34, %r33, 3;
|
||||
cvt.s64.s32 %rd41, %r34;
|
||||
mul.wide.s32 %rd42, %r34, 4;
|
||||
add.u64 %rd43, %rd1, %rd42;
|
||||
ld.shared.f32 %f33, [%rd43+0];
|
||||
.loc 16 296 0
|
||||
and.b32 %r35, %r32, 1073741823;
|
||||
mov.u32 %r36, %r35;
|
||||
mov.s32 %r37, 0;
|
||||
mov.u32 %r38, %r37;
|
||||
mov.s32 %r39, 0;
|
||||
mov.u32 %r40, %r39;
|
||||
mov.s32 %r41, 0;
|
||||
mov.u32 %r42, %r41;
|
||||
tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r36,%r38,%r40,%r42}];
|
||||
mov.f32 %f38, %f34;
|
||||
mov.f32 %f39, %f35;
|
||||
mov.f32 %f40, %f36;
|
||||
mov.f32 %f41, %f37;
|
||||
sub.ftz.f32 %f42, %f25, %f39;
|
||||
sub.ftz.f32 %f43, %f24, %f38;
|
||||
sub.ftz.f32 %f44, %f26, %f40;
|
||||
mul.ftz.f32 %f45, %f42, %f42;
|
||||
fma.rn.ftz.f32 %f46, %f43, %f43, %f45;
|
||||
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
|
||||
add.ftz.f32 %f48, %f28, %f41;
|
||||
cvt.rzi.ftz.s32.f32 %r43, %f48;
|
||||
cvt.s64.s32 %rd44, %r43;
|
||||
mul.wide.s32 %rd45, %r43, 16;
|
||||
add.u64 %rd46, %rd7, %rd45;
|
||||
ld.shared.f32 %f49, [%rd46+0];
|
||||
setp.gt.ftz.f32 %p7, %f49, %f47;
|
||||
@!%p7 bra $Lt_1_25602;
|
||||
.loc 16 307 0
|
||||
sqrt.approx.ftz.f32 %f50, %f47;
|
||||
ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd46+0];
|
||||
sub.ftz.f32 %f54, %f50, %f52;
|
||||
.loc 16 308 0
|
||||
mul.ftz.f32 %f55, %f53, %f54;
|
||||
neg.ftz.f32 %f56, %f55;
|
||||
.loc 16 310 0
|
||||
mov.f32 %f57, 0f3fb8aa3b; // 1.4427
|
||||
mul.ftz.f32 %f58, %f56, %f57;
|
||||
ex2.approx.ftz.f32 %f59, %f58;
|
||||
mul.ftz.f32 %f60, %f59, %f59;
|
||||
sub.ftz.f32 %f61, %f60, %f59;
|
||||
mul.ftz.f32 %f62, %f51, %f61;
|
||||
.loc 16 312 0
|
||||
div.approx.ftz.f32 %f63, %f62, %f50;
|
||||
mul.ftz.f32 %f64, %f63, %f33;
|
||||
fma.rn.ftz.f32 %f31, %f43, %f64, %f31;
|
||||
.loc 16 313 0
|
||||
fma.rn.ftz.f32 %f30, %f42, %f64, %f30;
|
||||
.loc 16 314 0
|
||||
fma.rn.ftz.f32 %f29, %f44, %f64, %f29;
|
||||
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r45, 0;
|
||||
setp.le.s32 %p8, %r44, %r45;
|
||||
@%p8 bra $Lt_1_25090;
|
||||
.loc 16 317 0
|
||||
mul.lo.u64 %rd47, %rd44, 8;
|
||||
add.u64 %rd48, %rd13, %rd47;
|
||||
ld.shared.v2.f32 {%f65,%f66}, [%rd48+0];
|
||||
sub.ftz.f32 %f67, %f61, %f59;
|
||||
mul.ftz.f32 %f68, %f65, %f67;
|
||||
sub.ftz.f32 %f69, %f68, %f66;
|
||||
.loc 16 318 0
|
||||
fma.rn.ftz.f32 %f32, %f33, %f69, %f32;
|
||||
$Lt_1_25090:
|
||||
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r47, 0;
|
||||
setp.le.s32 %p9, %r46, %r47;
|
||||
@%p9 bra $Lt_1_25602;
|
||||
.loc 16 321 0
|
||||
mov.f32 %f70, %f9;
|
||||
mul.ftz.f32 %f71, %f43, %f43;
|
||||
fma.rn.ftz.f32 %f72, %f64, %f71, %f70;
|
||||
mov.f32 %f9, %f72;
|
||||
.loc 16 322 0
|
||||
mov.f32 %f73, %f11;
|
||||
fma.rn.ftz.f32 %f74, %f64, %f45, %f73;
|
||||
mov.f32 %f11, %f74;
|
||||
.loc 16 323 0
|
||||
mov.f32 %f75, %f13;
|
||||
mul.ftz.f32 %f76, %f44, %f44;
|
||||
fma.rn.ftz.f32 %f77, %f64, %f76, %f75;
|
||||
mov.f32 %f13, %f77;
|
||||
.loc 16 324 0
|
||||
mov.f32 %f78, %f15;
|
||||
mul.ftz.f32 %f79, %f42, %f43;
|
||||
fma.rn.ftz.f32 %f80, %f64, %f79, %f78;
|
||||
mov.f32 %f15, %f80;
|
||||
.loc 16 325 0
|
||||
mov.f32 %f81, %f17;
|
||||
mul.ftz.f32 %f82, %f43, %f44;
|
||||
fma.rn.ftz.f32 %f83, %f64, %f82, %f81;
|
||||
mov.f32 %f17, %f83;
|
||||
.loc 16 326 0
|
||||
mul.ftz.f32 %f84, %f42, %f44;
|
||||
fma.rn.ftz.f32 %f18, %f64, %f84, %f18;
|
||||
mov.f32 %f19, %f18;
|
||||
$Lt_1_25602:
|
||||
$Lt_1_24578:
|
||||
.loc 16 290 0
|
||||
mul.lo.u64 %rd49, %rd40, 4;
|
||||
add.u64 %rd32, %rd32, %rd49;
|
||||
setp.lt.u64 %p10, %rd32, %rd29;
|
||||
@%p10 bra $Lt_1_24322;
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_32002:
|
||||
mov.f32 %f29, 0f00000000; // 0
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
bra.uni $Lt_1_22786;
|
||||
$Lt_1_23042:
|
||||
mov.f32 %f29, 0f00000000; // 0
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
mov.f32 %f31, 0f00000000; // 0
|
||||
mov.f32 %f32, 0f00000000; // 0
|
||||
$Lt_1_22786:
|
||||
mov.u32 %r48, 1;
|
||||
setp.le.s32 %p11, %r6, %r48;
|
||||
@%p11 bra $Lt_1_28418;
|
||||
.loc 16 337 0
|
||||
mov.u64 %rd50, __cuda___cuda_local_var_32738_35_non_const_red_acc6200;
|
||||
cvt.s64.s32 %rd51, %r1;
|
||||
mul.wide.s32 %rd52, %r1, 4;
|
||||
add.u64 %rd53, %rd50, %rd52;
|
||||
mov.f32 %f85, %f31;
|
||||
st.shared.f32 [%rd53+0], %f85;
|
||||
.loc 16 338 0
|
||||
mov.f32 %f86, %f30;
|
||||
st.shared.f32 [%rd53+512], %f86;
|
||||
.loc 16 339 0
|
||||
mov.f32 %f87, %f29;
|
||||
st.shared.f32 [%rd53+1024], %f87;
|
||||
.loc 16 340 0
|
||||
mov.f32 %f88, %f32;
|
||||
st.shared.f32 [%rd53+1536], %f88;
|
||||
.loc 16 342 0
|
||||
shr.s32 %r49, %r6, 31;
|
||||
mov.s32 %r50, 1;
|
||||
and.b32 %r51, %r49, %r50;
|
||||
add.s32 %r52, %r51, %r6;
|
||||
shr.s32 %r53, %r52, 1;
|
||||
mov.s32 %r54, %r53;
|
||||
mov.u32 %r55, 0;
|
||||
setp.ne.u32 %p12, %r53, %r55;
|
||||
@!%p12 bra $Lt_1_26882;
|
||||
$Lt_1_27394:
|
||||
setp.ge.u32 %p13, %r10, %r54;
|
||||
@%p13 bra $Lt_1_27650;
|
||||
.loc 16 345 0
|
||||
add.u32 %r56, %r1, %r54;
|
||||
cvt.u64.u32 %rd54, %r56;
|
||||
mul.wide.u32 %rd55, %r56, 4;
|
||||
add.u64 %rd56, %rd50, %rd55;
|
||||
ld.shared.f32 %f89, [%rd56+0];
|
||||
add.ftz.f32 %f85, %f89, %f85;
|
||||
st.shared.f32 [%rd53+0], %f85;
|
||||
ld.shared.f32 %f90, [%rd56+512];
|
||||
add.ftz.f32 %f86, %f90, %f86;
|
||||
st.shared.f32 [%rd53+512], %f86;
|
||||
ld.shared.f32 %f91, [%rd56+1024];
|
||||
add.ftz.f32 %f87, %f91, %f87;
|
||||
st.shared.f32 [%rd53+1024], %f87;
|
||||
ld.shared.f32 %f92, [%rd56+1536];
|
||||
add.ftz.f32 %f88, %f92, %f88;
|
||||
st.shared.f32 [%rd53+1536], %f88;
|
||||
$Lt_1_27650:
|
||||
.loc 16 342 0
|
||||
shr.u32 %r54, %r54, 1;
|
||||
mov.u32 %r57, 0;
|
||||
setp.ne.u32 %p14, %r54, %r57;
|
||||
@%p14 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
.loc 16 349 0
|
||||
mov.f32 %f31, %f85;
|
||||
.loc 16 350 0
|
||||
mov.f32 %f30, %f86;
|
||||
.loc 16 351 0
|
||||
mov.f32 %f29, %f87;
|
||||
.loc 16 352 0
|
||||
mov.f32 %f32, %f88;
|
||||
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r59, 0;
|
||||
setp.le.s32 %p15, %r58, %r59;
|
||||
@%p15 bra $Lt_1_28418;
|
||||
.loc 16 356 0
|
||||
mov.f32 %f85, %f9;
|
||||
st.shared.f32 [%rd53+0], %f85;
|
||||
mov.f32 %f86, %f11;
|
||||
st.shared.f32 [%rd53+512], %f86;
|
||||
mov.f32 %f87, %f13;
|
||||
st.shared.f32 [%rd53+1024], %f87;
|
||||
mov.f32 %f88, %f15;
|
||||
st.shared.f32 [%rd53+1536], %f88;
|
||||
mov.f32 %f93, %f17;
|
||||
st.shared.f32 [%rd53+2048], %f93;
|
||||
mov.f32 %f94, %f19;
|
||||
st.shared.f32 [%rd53+2560], %f94;
|
||||
.loc 16 358 0
|
||||
mov.s32 %r60, %r53;
|
||||
@!%p12 bra $Lt_1_28930;
|
||||
$Lt_1_29442:
|
||||
setp.ge.u32 %p16, %r10, %r60;
|
||||
@%p16 bra $Lt_1_29698;
|
||||
.loc 16 361 0
|
||||
add.u32 %r61, %r1, %r60;
|
||||
cvt.u64.u32 %rd57, %r61;
|
||||
mul.wide.u32 %rd58, %r61, 4;
|
||||
add.u64 %rd59, %rd50, %rd58;
|
||||
ld.shared.f32 %f95, [%rd59+0];
|
||||
add.ftz.f32 %f85, %f95, %f85;
|
||||
st.shared.f32 [%rd53+0], %f85;
|
||||
ld.shared.f32 %f96, [%rd59+512];
|
||||
add.ftz.f32 %f86, %f96, %f86;
|
||||
st.shared.f32 [%rd53+512], %f86;
|
||||
ld.shared.f32 %f97, [%rd59+1024];
|
||||
add.ftz.f32 %f87, %f97, %f87;
|
||||
st.shared.f32 [%rd53+1024], %f87;
|
||||
ld.shared.f32 %f98, [%rd59+1536];
|
||||
add.ftz.f32 %f88, %f98, %f88;
|
||||
st.shared.f32 [%rd53+1536], %f88;
|
||||
ld.shared.f32 %f99, [%rd59+2048];
|
||||
add.ftz.f32 %f93, %f99, %f93;
|
||||
st.shared.f32 [%rd53+2048], %f93;
|
||||
ld.shared.f32 %f100, [%rd59+2560];
|
||||
add.ftz.f32 %f94, %f100, %f94;
|
||||
st.shared.f32 [%rd53+2560], %f94;
|
||||
$Lt_1_29698:
|
||||
.loc 16 358 0
|
||||
shr.u32 %r60, %r60, 1;
|
||||
mov.u32 %r62, 0;
|
||||
setp.ne.u32 %p17, %r60, %r62;
|
||||
@%p17 bra $Lt_1_29442;
|
||||
$Lt_1_28930:
|
||||
.loc 16 366 0
|
||||
mov.f32 %f9, %f85;
|
||||
mov.f32 %f11, %f86;
|
||||
mov.f32 %f13, %f87;
|
||||
mov.f32 %f15, %f88;
|
||||
mov.f32 %f17, %f93;
|
||||
mov.f32 %f19, %f94;
|
||||
$Lt_1_28418:
|
||||
$Lt_1_26370:
|
||||
selp.s32 %r63, 1, 0, %p4;
|
||||
mov.s32 %r64, 0;
|
||||
set.eq.u32.s32 %r65, %r10, %r64;
|
||||
neg.s32 %r66, %r65;
|
||||
and.b32 %r67, %r63, %r66;
|
||||
mov.u32 %r68, 0;
|
||||
setp.eq.s32 %p18, %r67, %r68;
|
||||
@%p18 bra $Lt_1_30466;
|
||||
.loc 16 372 0
|
||||
cvt.s64.s32 %rd60, %r13;
|
||||
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast_engv];
|
||||
mul.wide.s32 %rd62, %r13, 4;
|
||||
add.u64 %rd63, %rd61, %rd62;
|
||||
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
|
||||
mov.u32 %r70, 0;
|
||||
setp.le.s32 %p19, %r69, %r70;
|
||||
@%p19 bra $Lt_1_30978;
|
||||
.loc 16 374 0
|
||||
st.global.f32 [%rd63+0], %f32;
|
||||
.loc 16 375 0
|
||||
cvt.s64.s32 %rd64, %r14;
|
||||
mul.wide.s32 %rd65, %r14, 4;
|
||||
add.u64 %rd63, %rd63, %rd65;
|
||||
$Lt_1_30978:
|
||||
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
|
||||
mov.u32 %r72, 0;
|
||||
setp.le.s32 %p20, %r71, %r72;
|
||||
@%p20 bra $Lt_1_31490;
|
||||
.loc 16 379 0
|
||||
mov.f32 %f101, %f9;
|
||||
st.global.f32 [%rd63+0], %f101;
|
||||
.loc 16 380 0
|
||||
cvt.s64.s32 %rd66, %r14;
|
||||
mul.wide.s32 %rd67, %r14, 4;
|
||||
add.u64 %rd68, %rd67, %rd63;
|
||||
.loc 16 379 0
|
||||
mov.f32 %f102, %f11;
|
||||
st.global.f32 [%rd68+0], %f102;
|
||||
.loc 16 380 0
|
||||
add.u64 %rd69, %rd67, %rd68;
|
||||
.loc 16 379 0
|
||||
mov.f32 %f103, %f13;
|
||||
st.global.f32 [%rd69+0], %f103;
|
||||
.loc 16 380 0
|
||||
add.u64 %rd70, %rd67, %rd69;
|
||||
.loc 16 379 0
|
||||
mov.f32 %f104, %f15;
|
||||
st.global.f32 [%rd70+0], %f104;
|
||||
.loc 16 380 0
|
||||
add.u64 %rd63, %rd67, %rd70;
|
||||
.loc 16 379 0
|
||||
mov.f32 %f105, %f17;
|
||||
st.global.f32 [%rd63+0], %f105;
|
||||
mov.f32 %f106, %f19;
|
||||
add.u64 %rd71, %rd67, %rd63;
|
||||
st.global.f32 [%rd71+0], %f106;
|
||||
$Lt_1_31490:
|
||||
.loc 16 383 0
|
||||
ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];
|
||||
mul.lo.u64 %rd73, %rd60, 16;
|
||||
add.u64 %rd74, %rd72, %rd73;
|
||||
mov.f32 %f107, %f108;
|
||||
st.global.v4.f32 [%rd74+0], {%f31,%f30,%f29,%f107};
|
||||
$Lt_1_30466:
|
||||
.loc 16 385 0
|
||||
exit;
|
||||
$LDWend_kernel_pair_fast:
|
||||
} // kernel_pair_fast
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bafa_00000000-9_pair_gpu_atom_kernel.cpp3.i (/home/sjplimp/ccBI#.kAZxYr)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bafa_00000000-8_pair_gpu_atom_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "pair_gpu_atom_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------
// kernel_cast_x
//   One thread per element idx = ctaid.x * ntid.x + tid.x; threads with
//   idx >= nall (signed compare) do nothing.
//   Params:
//     x_type (u64) : output, one 16-byte {f32,f32,f32,f32} record per element
//     x      (u64) : input, three f64 per element (24-byte stride)
//     type   (u64) : input, one s32 per element
//     nall   (s32) : number of elements to convert
//   Each record is { (f32)x[3*idx], (f32)x[3*idx+1], (f32)x[3*idx+2],
//   (f32)type[idx] }; the f64->f32 narrowing rounds to nearest and
//   flushes denormals (cvt.rn.ftz).
//-----------------------------------------------------------
.entry kernel_cast_x (
	.param .u64 __cudaparm_kernel_cast_x_x_type,
	.param .u64 __cudaparm_kernel_cast_x_x,
	.param .u64 __cudaparm_kernel_cast_x_type,
	.param .s32 __cudaparm_kernel_cast_x_nall)
{
	.reg .u32 %blk, %bdim, %tix, %idx, %nall, %itype, %idx3;
	.reg .u64 %type_base, %type_off, %type_ptr;
	.reg .u64 %x_base, %x_off, %x_ptr;
	.reg .u64 %out_base, %out_off, %out_ptr;
	.reg .f32 %fx, %fy, %fz, %ftype;
	.reg .f64 %dx, %dy, %dz;
	.reg .pred %past_end;
	.loc	16	34	0
$LDWbegin_kernel_cast_x:
	// idx = ctaid.x * ntid.x + tid.x
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%tix, %tid.x;
	mad.lo.u32 	%idx, %blk, %bdim, %tix;
	// Guard: nothing to do when nall <= idx.
	ld.param.s32 	%nall, [__cudaparm_kernel_cast_x_nall];
	setp.le.s32 	%past_end, %nall, %idx;
	@%past_end bra 	$Lcast_x_done;
	.loc	16	39	0
	// ftype = (f32) type[idx]
	ld.param.u64 	%type_base, [__cudaparm_kernel_cast_x_type];
	mul.wide.s32 	%type_off, %idx, 4;
	add.u64 	%type_ptr, %type_base, %type_off;
	ld.global.s32 	%itype, [%type_ptr+0];
	cvt.rn.f32.s32 	%ftype, %itype;
	.loc	16	42	0
	// x_ptr = &x[3*idx]; the 3*idx product is formed in 32 bits before widening.
	ld.param.u64 	%x_base, [__cudaparm_kernel_cast_x_x];
	mul.lo.s32 	%idx3, %idx, 3;
	mul.wide.s32 	%x_off, %idx3, 8;
	add.u64 	%x_ptr, %x_base, %x_off;
	ld.global.f64 	%dy, [%x_ptr+8];
	cvt.rn.ftz.f32.f64 	%fy, %dy;
	.loc	16	43	0
	ld.global.f64 	%dz, [%x_ptr+16];
	cvt.rn.ftz.f32.f64 	%fz, %dz;
	.loc	16	44	0
	// out_ptr = x_type + 16*idx; the whole record is written with one v4 store.
	ld.param.u64 	%out_base, [__cudaparm_kernel_cast_x_x_type];
	mul.wide.s32 	%out_off, %idx, 16;
	add.u64 	%out_ptr, %out_base, %out_off;
	ld.global.f64 	%dx, [%x_ptr+0];
	cvt.rn.ftz.f32.f64 	%fx, %dx;
	st.global.v4.f32 	[%out_ptr+0], {%fx,%fy,%fz,%ftype};
$Lcast_x_done:
	.loc	16	46	0
	exit;
$LDWend_kernel_cast_x:
	} // kernel_cast_x
|
||||
|
|
@ -1,833 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bb79_00000000-9_pair_gpu_build_kernel.cpp3.i (/home/sjplimp/ccBI#.mdgTku)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bb79_00000000-8_pair_gpu_build_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "pair_gpu_build_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------
// transpose
//   Tiled transpose of a matrix of 32-bit integers.
//   Params:
//     out        (u64) : destination; element (i,j) is written at out[i*rows_in + j]
//     in         (u64) : source; element (i,j) is read from in[j*columns_in + i]
//     columns_in (s32) : row pitch of `in`, also the bound on i
//     rows_in    (s32) : row pitch of `out`, also the bound on j
//   Each block covers the tile starting at (ctaid.x*8, ctaid.y*8) and stages
//   it through a shared tile with a 9-word row pitch (8*9*4 = 288 bytes).
//   NOTE(review): the tile size implies an 8x8 thread block - confirm
//   against the launch configuration.
//
//   The tile is staged as raw s32 words. The previous code converted each
//   value to f32 on the way into shared memory (cvt.rn.f32.s32) and back
//   with cvt.rzi.ftz.s32.f32 on the way out; binary32 has a 24-bit
//   significand, so any value with magnitude above 2^24 could come back
//   altered. Bounds checks are unsigned compares, as before.
//-----------------------------------------------------------
.entry transpose (
	.param .u64 __cudaparm_transpose_out,
	.param .u64 __cudaparm_transpose_in,
	.param .s32 __cudaparm_transpose_columns_in,
	.param .s32 __cudaparm_transpose_rows_in)
{
	.reg .u32 %r<32>;
	.reg .u64 %rd<23>;
	.reg .pred %p<4>;
	.shared .align 4 .b8 __cuda___cuda_local_var_32483_32_non_const_block24[288];
	.loc	16	64	0
$LDWbegin_transpose:
	// %rd1 = base of the shared tile; needed on both sides of the barrier.
	mov.u64 	%rd1, __cuda___cuda_local_var_32483_32_non_const_block24;
	mov.u32 	%r1, %ctaid.x;
	mul.lo.u32 	%r2, %r1, 8;            // tile origin along i
	mov.u32 	%r3, %ctaid.y;
	mul.lo.u32 	%r4, %r3, 8;            // tile origin along j
	mov.u32 	%r5, %tid.x;                // ti
	add.u32 	%r6, %r2, %r5;          // i = ctaid.x*8 + ti
	mov.u32 	%r7, %tid.y;                // tj
	add.u32 	%r8, %r4, %r7;          // j = ctaid.y*8 + tj
	ld.param.s32 	%r9, [__cudaparm_transpose_rows_in];
	ld.param.s32 	%r10, [__cudaparm_transpose_columns_in];
	// Load phase guard: skip unless (rows_in > j) && (columns_in > i), unsigned.
	set.gt.u32.u32 	%r11, %r9, %r8;
	neg.s32 	%r12, %r11;
	set.gt.u32.u32 	%r13, %r10, %r6;
	neg.s32 	%r14, %r13;
	and.b32 	%r15, %r12, %r14;
	mov.u32 	%r16, 0;
	setp.eq.s32 	%p1, %r15, %r16;
	@%p1 bra 	$Lt_0_2306;
	.loc	16	76	0
	// tile[tj][ti] = in[j*columns_in + i]   (stored as s32, no float round trip)
	ld.param.u64 	%rd2, [__cudaparm_transpose_in];
	mul.lo.u32 	%r17, %r10, %r8;
	add.u32 	%r18, %r6, %r17;
	mul.wide.u32 	%rd4, %r18, 4;
	add.u64 	%rd5, %rd2, %rd4;
	ld.global.s32 	%r19, [%rd5+0];
	cvt.u64.u32 	%rd6, %r5;
	mul.wide.u32 	%rd8, %r7, 9;           // row pitch of 9 words
	add.u64 	%rd9, %rd6, %rd8;
	mul.lo.u64 	%rd10, %rd9, 4;
	add.u64 	%rd11, %rd1, %rd10;
	st.shared.s32 	[%rd11+0], %r19;
$Lt_0_2306:
	.loc	16	78	0
	// All threads reach the barrier, including those that skipped the load.
	bar.sync 	0;
	add.u32 	%r20, %r2, %r7;         // i' = ctaid.x*8 + tj
	add.u32 	%r21, %r4, %r5;         // j' = ctaid.y*8 + ti
	// Store phase guard: skip unless (rows_in > j') && (columns_in > i'), unsigned.
	set.gt.u32.u32 	%r22, %r9, %r21;
	neg.s32 	%r23, %r22;
	set.gt.u32.u32 	%r24, %r10, %r20;
	neg.s32 	%r25, %r24;
	and.b32 	%r26, %r23, %r25;
	mov.u32 	%r27, 0;
	setp.eq.s32 	%p2, %r26, %r27;
	@%p2 bra 	$Lt_0_2818;
	.loc	16	83	0
	// out[i'*rows_in + j'] = tile[ti][tj]
	cvt.u64.u32 	%rd12, %r7;
	mul.wide.u32 	%rd14, %r5, 9;
	add.u64 	%rd15, %rd12, %rd14;
	mul.lo.u64 	%rd16, %rd15, 4;
	add.u64 	%rd17, %rd1, %rd16;
	ld.shared.s32 	%r28, [%rd17+0];
	ld.param.u64 	%rd18, [__cudaparm_transpose_out];
	mul.lo.u32 	%r29, %r9, %r20;
	add.u32 	%r30, %r21, %r29;
	mul.wide.u32 	%rd20, %r30, 4;
	add.u64 	%rd21, %rd18, %rd20;
	st.global.s32 	[%rd21+0], %r28;
$Lt_0_2818:
	.loc	16	84	0
	exit;
$LDWend_transpose:
	} // transpose
|
||||
// Module-scope texture reference; the kernels in this module fetch
// 4-component f32 texels from it with tex.1d.v4.f32.s32.
.global .texref neigh_tex;
|
||||
|
||||
//-----------------------------------------------------------
// calc_cell_id
//   One thread per element idx = ctaid.x * ntid.x + tid.x; threads with
//   idx >= nall (signed compare) do nothing.
//   Params:
//     pos                   (u64) : not read by this code; the position is
//                                   fetched from neigh_tex at index idx instead
//     cell_id               (u64) : output, u32 per element
//     particle_id           (u64) : output, s32 per element (receives idx)
//     boxlo0..2, boxhi0..2  (f32) : lower / upper box bounds per axis
//     cell_size             (f32) : cell edge length
//     ncellx, ncelly        (s32) : cell-grid extents used to linearise
//     nall                  (s32) : number of elements
//   Per axis a:
//     rel = min(boxhi_a - boxlo_a + cell_size,
//               max(-cell_size, p_a - boxlo_a))
//     i_a = (u32) trunc( (f64)(rel / cell_size) + 1.0 )
//   cell_id[idx]     = iz*ncellx*ncelly + iy*ncellx + ix
//   particle_id[idx] = idx
//   The +1.0 is done in f64, exactly as in the generated code.
//-----------------------------------------------------------
.entry calc_cell_id (
	.param .u64 __cudaparm_calc_cell_id_pos,
	.param .u64 __cudaparm_calc_cell_id_cell_id,
	.param .u64 __cudaparm_calc_cell_id_particle_id,
	.param .f32 __cudaparm_calc_cell_id_boxlo0,
	.param .f32 __cudaparm_calc_cell_id_boxlo1,
	.param .f32 __cudaparm_calc_cell_id_boxlo2,
	.param .f32 __cudaparm_calc_cell_id_boxhi0,
	.param .f32 __cudaparm_calc_cell_id_boxhi1,
	.param .f32 __cudaparm_calc_cell_id_boxhi2,
	.param .f32 __cudaparm_calc_cell_id_cell_size,
	.param .s32 __cudaparm_calc_cell_id_ncellx,
	.param .s32 __cudaparm_calc_cell_id_ncelly,
	.param .s32 __cudaparm_calc_cell_id_nall)
{
	.reg .u32 %blk, %bdim, %tix, %idx, %nall;
	.reg .u32 %tc0, %tc1, %tc2, %tc3;
	.reg .u32 %ncx, %ncy, %ix, %iy, %iz;
	.reg .u32 %zrows, %zpart, %ypart, %yz, %cell;
	.reg .u64 %elt_off, %cid_base, %cid_ptr, %pid_base, %pid_ptr;
	.reg .f32 %px, %py, %pz, %pw;
	.reg .f32 %csz, %ncsz, %blo, %bhi, %upper, %rel, %frac;
	.reg .f64 %wide;
	.reg .pred %past_end;
	.loc	16	90	0
$LDWbegin_calc_cell_id:
	// idx = ctaid.x * ntid.x + tid.x
	mov.u32 	%tix, %tid.x;
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%bdim, %ntid.x;
	mad.lo.u32 	%idx, %blk, %bdim, %tix;
	ld.param.s32 	%nall, [__cudaparm_calc_cell_id_nall];
	setp.le.s32 	%past_end, %nall, %idx;
	@%past_end bra 	$Lcell_id_done;
	.loc	16	94	0
	// Fetch texel idx; the remaining coordinate slots are zero.
	mov.u32 	%tc0, %idx;
	mov.u32 	%tc1, 0;
	mov.u32 	%tc2, 0;
	mov.u32 	%tc3, 0;
	tex.1d.v4.f32.s32 {%px,%py,%pz,%pw},[neigh_tex,{%tc0,%tc1,%tc2,%tc3}];
	.loc	16	107	0
	ld.param.f32 	%csz, [__cudaparm_calc_cell_id_cell_size];
	neg.ftz.f32 	%ncsz, %csz;
	ld.param.s32 	%ncx, [__cudaparm_calc_cell_id_ncellx];
	ld.param.s32 	%ncy, [__cudaparm_calc_cell_id_ncelly];
	// ---- x axis -> ix
	ld.param.f32 	%blo, [__cudaparm_calc_cell_id_boxlo0];
	ld.param.f32 	%bhi, [__cudaparm_calc_cell_id_boxhi0];
	sub.ftz.f32 	%upper, %bhi, %blo;
	add.ftz.f32 	%upper, %csz, %upper;
	sub.ftz.f32 	%rel, %px, %blo;
	max.ftz.f32 	%rel, %ncsz, %rel;
	min.ftz.f32 	%rel, %upper, %rel;
	div.approx.ftz.f32 	%frac, %rel, %csz;
	cvt.ftz.f64.f32 	%wide, %frac;
	add.f64 	%wide, %wide, 0d3ff0000000000000;	// + 1.0
	cvt.rzi.u32.f64 	%ix, %wide;
	// ---- y axis -> iy
	ld.param.f32 	%blo, [__cudaparm_calc_cell_id_boxlo1];
	ld.param.f32 	%bhi, [__cudaparm_calc_cell_id_boxhi1];
	sub.ftz.f32 	%upper, %bhi, %blo;
	add.ftz.f32 	%upper, %csz, %upper;
	sub.ftz.f32 	%rel, %py, %blo;
	max.ftz.f32 	%rel, %ncsz, %rel;
	min.ftz.f32 	%rel, %upper, %rel;
	div.approx.ftz.f32 	%frac, %rel, %csz;
	cvt.ftz.f64.f32 	%wide, %frac;
	add.f64 	%wide, %wide, 0d3ff0000000000000;	// + 1.0
	cvt.rzi.u32.f64 	%iy, %wide;
	// ---- z axis -> iz
	ld.param.f32 	%blo, [__cudaparm_calc_cell_id_boxlo2];
	ld.param.f32 	%bhi, [__cudaparm_calc_cell_id_boxhi2];
	sub.ftz.f32 	%upper, %bhi, %blo;
	add.ftz.f32 	%upper, %csz, %upper;
	sub.ftz.f32 	%rel, %pz, %blo;
	max.ftz.f32 	%rel, %ncsz, %rel;
	min.ftz.f32 	%rel, %upper, %rel;
	div.approx.ftz.f32 	%frac, %rel, %csz;
	cvt.ftz.f64.f32 	%wide, %frac;
	add.f64 	%wide, %wide, 0d3ff0000000000000;	// + 1.0
	cvt.rzi.u32.f64 	%iz, %wide;
	// cell = (ncellx*iz)*ncelly + ncellx*iy + ix   (32-bit wrapping arithmetic)
	mul.lo.u32 	%zrows, %ncx, %iz;
	mul.lo.u32 	%zpart, %ncy, %zrows;
	mul.lo.u32 	%ypart, %ncx, %iy;
	add.u32 	%yz, %zpart, %ypart;
	add.u32 	%cell, %yz, %ix;
	.loc	16	111	0
	// cell_id[idx] = cell
	mul.wide.s32 	%elt_off, %idx, 4;
	ld.param.u64 	%cid_base, [__cudaparm_calc_cell_id_cell_id];
	add.u64 	%cid_ptr, %cid_base, %elt_off;
	st.global.u32 	[%cid_ptr+0], %cell;
	.loc	16	112	0
	// particle_id[idx] = idx
	ld.param.u64 	%pid_base, [__cudaparm_calc_cell_id_particle_id];
	add.u64 	%pid_ptr, %pid_base, %elt_off;
	st.global.s32 	[%pid_ptr+0], %idx;
$Lcell_id_done:
	.loc	16	114	0
	exit;
$LDWend_calc_cell_id:
	} // calc_cell_id
|
||||
|
||||
//-----------------------------------------------------------
// kernel_calc_cell_counts
//   One thread per element idx = ctaid.x * ntid.x + tid.x; threads with
//   idx >= nall (signed compare) do nothing.
//   Params:
//     cell_id     (u64) : input, one 32-bit cell id per element
//     cell_counts (u64) : output, s32 entries indexed by cell id
//     nall        (s32) : number of elements
//     ncell       (s32) : last cell_counts index written by the tail fill
//   With id = cell_id[idx]:
//     idx == 0        : cell_counts[0 .. id]          = 0
//     idx == nall - 1 : cell_counts[id+1 .. ncell]    = nall
//     idx >  0 and id != cell_id[idx-1] (= id_prev):
//                       cell_counts[id_prev+1 .. id]  = idx
//   Each range is skipped when empty (signed compares).
//   NOTE(review): the fill pattern only yields consistent offsets if
//   cell_id is non-decreasing in idx - confirm with the caller.
//-----------------------------------------------------------
.entry kernel_calc_cell_counts (
	.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,
	.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,
	.param .s32 __cudaparm_kernel_calc_cell_counts_nall,
	.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)
{
	.reg .u32 %blk, %bdim, %tix, %idx, %nall, %ncell;
	.reg .u32 %id, %id_prev, %next, %first, %last, %limit, %cur, %zero;
	.reg .u64 %cid_base, %cid_off, %cid_ptr;
	.reg .u64 %cnt_base, %cnt_off, %cnt_ptr;
	.reg .pred %in_range, %skip, %more;
	.loc	16	117	0
$LDWbegin_kernel_calc_cell_counts:
	// idx = ctaid.x * ntid.x + tid.x
	mov.u32 	%blk, %ctaid.x;
	mov.u32 	%bdim, %ntid.x;
	mov.u32 	%tix, %tid.x;
	mad.lo.u32 	%idx, %blk, %bdim, %tix;
	ld.param.s32 	%nall, [__cudaparm_kernel_calc_cell_counts_nall];
	setp.gt.s32 	%in_range, %nall, %idx;
	@!%in_range bra 	$Lcounts_done;
	.loc	16	120	0
	// id = cell_id[idx]; next = id + 1 is shared by all three fills below.
	ld.param.u64 	%cid_base, [__cudaparm_kernel_calc_cell_counts_cell_id];
	mul.wide.s32 	%cid_off, %idx, 4;
	add.u64 	%cid_ptr, %cid_base, %cid_off;
	ld.global.u32 	%id, [%cid_ptr+0];
	ld.param.u64 	%cnt_base, [__cudaparm_kernel_calc_cell_counts_cell_counts];
	add.s32 	%next, %id, 1;

	// ---- first element: zero cell_counts[0 .. id]
	setp.ne.s32 	%skip, %idx, 0;
	@%skip bra 	$Lcounts_tail;
	setp.le.s32 	%skip, %next, 0;
	@%skip bra 	$Lcounts_tail;
	mov.u64 	%cnt_ptr, %cnt_base;
	mov.s32 	%cur, 0;
	mov.s32 	%zero, 0;
$Lcounts_zero:
	.loc	16	125	0
	st.global.s32 	[%cnt_ptr+0], %zero;
	add.s32 	%cur, %cur, 1;
	add.u64 	%cnt_ptr, %cnt_ptr, 4;
	setp.ne.s32 	%more, %next, %cur;
	@%more bra 	$Lcounts_zero;

$Lcounts_tail:
	// ---- last element: cell_counts[id+1 .. ncell] = nall
	sub.s32 	%last, %nall, 1;
	setp.ne.s32 	%skip, %idx, %last;
	@%skip bra 	$Lcounts_edge;
	.loc	16	128	0
	ld.param.s32 	%ncell, [__cudaparm_kernel_calc_cell_counts_ncell];
	setp.gt.s32 	%skip, %next, %ncell;
	@%skip bra 	$Lcounts_edge;
	add.s32 	%limit, %ncell, 1;
	mov.s32 	%cur, %next;
	mul.wide.s32 	%cnt_off, %next, 4;
	add.u64 	%cnt_ptr, %cnt_base, %cnt_off;
$Lcounts_fill:
	.loc	16	129	0
	st.global.s32 	[%cnt_ptr+0], %nall;
	add.s32 	%cur, %cur, 1;
	add.u64 	%cnt_ptr, %cnt_ptr, 4;
	setp.ne.s32 	%more, %limit, %cur;
	@%more bra 	$Lcounts_fill;

$Lcounts_edge:
	// ---- idx > 0: where the cell id changes, cell_counts[id_prev+1 .. id] = idx
	// (idx < nall already holds on every path that reaches this point.)
	setp.le.s32 	%skip, %idx, 0;
	@%skip bra 	$Lcounts_done;
	.loc	16	133	0
	ld.global.u32 	%id_prev, [%cid_ptr+-4];
	setp.eq.s32 	%skip, %id, %id_prev;
	@%skip bra 	$Lcounts_done;
	.loc	16	135	0
	add.s32 	%first, %id_prev, 1;
	setp.gt.s32 	%skip, %first, %id;
	@%skip bra 	$Lcounts_done;
	mov.s32 	%cur, %first;
	mul.wide.s32 	%cnt_off, %first, 4;
	add.u64 	%cnt_ptr, %cnt_base, %cnt_off;
$Lcounts_mark:
	.loc	16	136	0
	st.global.s32 	[%cnt_ptr+0], %idx;
	add.s32 	%cur, %cur, 1;
	add.u64 	%cnt_ptr, %cnt_ptr, 4;
	setp.ne.s32 	%more, %next, %cur;
	@%more bra 	$Lcounts_mark;

$Lcounts_done:
	.loc	16	140	0
	exit;
$LDWend_kernel_calc_cell_counts:
	} // kernel_calc_cell_counts
|
||||
|
||||
.entry calc_neigh_list_cell (
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_pos,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,
|
||||
.param .u64 __cudaparm_calc_neigh_list_cell_host_numj,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,
|
||||
.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_inum,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_nt,
|
||||
.param .s32 __cudaparm_calc_neigh_list_cell_nall)
|
||||
{
|
||||
.reg .u32 %r<106>;
|
||||
.reg .u64 %rd<46>;
|
||||
.reg .f32 %f<43>;
|
||||
.reg .f64 %fd<4>;
|
||||
.reg .pred %p<22>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480[512];
|
||||
.shared .align 16 .b8 __cuda___cuda_local_var_32578_34_non_const_pos_sh992[2048];
|
||||
// __cuda_local_var_32592_12_non_const_atom_i = 16
|
||||
.loc 16 151 0
|
||||
$LDWbegin_calc_neigh_list_cell:
|
||||
.loc 16 163 0
|
||||
ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];
|
||||
mov.u32 %r2, %ctaid.y;
|
||||
rem.u32 %r3, %r2, %r1;
|
||||
div.u32 %r4, %r2, %r1;
|
||||
ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];
|
||||
mul.lo.s32 %r6, %r5, %r3;
|
||||
mul.lo.s32 %r7, %r5, %r4;
|
||||
mul.lo.s32 %r8, %r7, %r1;
|
||||
cvt.s32.u32 %r9, %ctaid.x;
|
||||
ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];
|
||||
add.s32 %r10, %r6, %r8;
|
||||
add.s32 %r11, %r9, %r10;
|
||||
cvt.s64.s32 %rd2, %r11;
|
||||
mul.wide.s32 %rd3, %r11, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
ldu.global.s32 %r12, [%rd4+0];
|
||||
.loc 16 164 0
|
||||
ldu.global.s32 %r13, [%rd4+4];
|
||||
.loc 16 172 0
|
||||
sub.s32 %r14, %r13, %r12;
|
||||
mov.u32 %r15, %ntid.x;
|
||||
cvt.rn.f32.u32 %f1, %r15;
|
||||
cvt.rn.f32.s32 %f2, %r14;
|
||||
div.approx.ftz.f32 %f3, %f2, %f1;
|
||||
cvt.rpi.ftz.f32.f32 %f4, %f3;
|
||||
mov.f32 %f5, 0f00000000; // 0
|
||||
setp.gt.ftz.f32 %p1, %f4, %f5;
|
||||
@!%p1 bra $Lt_3_13314;
|
||||
sub.s32 %r16, %r3, 1;
|
||||
mov.s32 %r17, 0;
|
||||
max.s32 %r18, %r16, %r17;
|
||||
sub.s32 %r19, %r1, 1;
|
||||
add.s32 %r20, %r3, 1;
|
||||
min.s32 %r21, %r19, %r20;
|
||||
ld.param.s32 %r22, [__cudaparm_calc_neigh_list_cell_ncellz];
|
||||
sub.s32 %r23, %r22, 1;
|
||||
add.s32 %r24, %r4, 1;
|
||||
min.s32 %r25, %r23, %r24;
|
||||
sub.s32 %r26, %r9, 1;
|
||||
mov.s32 %r27, 0;
|
||||
max.s32 %r28, %r26, %r27;
|
||||
add.s32 %r29, %r9, 1;
|
||||
sub.s32 %r30, %r5, 1;
|
||||
min.s32 %r31, %r29, %r30;
|
||||
cvt.s32.u32 %r32, %tid.x;
|
||||
add.s32 %r33, %r12, %r32;
|
||||
mov.u32 %r34, 0;
|
||||
ld.param.s32 %r35, [__cudaparm_calc_neigh_list_cell_inum];
|
||||
cvt.s64.s32 %rd5, %r35;
|
||||
sub.s32 %r36, %r4, 1;
|
||||
mov.s32 %r37, %r33;
|
||||
mul.wide.s32 %rd6, %r35, 4;
|
||||
mov.s32 %r38, 0;
|
||||
max.s32 %r39, %r36, %r38;
|
||||
setp.ge.s32 %p2, %r25, %r39;
|
||||
ld.param.s32 %r40, [__cudaparm_calc_neigh_list_cell_nt];
|
||||
ld.param.s32 %r41, [__cudaparm_calc_neigh_list_cell_nall];
|
||||
mov.s32 %r42, 0;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32578_34_non_const_pos_sh992;
|
||||
$Lt_3_13826:
|
||||
//<loop> Loop body line 172, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 174 0
|
||||
mov.s32 %r43, %r41;
|
||||
setp.ge.s32 %p3, %r37, %r13;
|
||||
@%p3 bra $Lt_3_14082;
|
||||
.loc 16 180 0
|
||||
ld.param.u64 %rd9, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
|
||||
add.u32 %r44, %r33, %r34;
|
||||
cvt.s64.s32 %rd10, %r44;
|
||||
mul.wide.s32 %rd11, %r44, 4;
|
||||
add.u64 %rd12, %rd9, %rd11;
|
||||
ld.global.s32 %r43, [%rd12+0];
|
||||
$Lt_3_14082:
|
||||
setp.lt.s32 %p4, %r43, %r40;
|
||||
@!%p4 bra $Lt_3_14594;
|
||||
.loc 16 183 0
|
||||
mov.u32 %r45, %r43;
|
||||
mov.s32 %r46, 0;
|
||||
mov.u32 %r47, %r46;
|
||||
mov.s32 %r48, 0;
|
||||
mov.u32 %r49, %r48;
|
||||
mov.s32 %r50, 0;
|
||||
mov.u32 %r51, %r50;
|
||||
tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r47,%r49,%r51}];
|
||||
mov.f32 %f10, %f6;
|
||||
mov.f32 %f11, %f7;
|
||||
mov.f32 %f12, %f8;
|
||||
mov.f32 %f13, %f10;
|
||||
mov.f32 %f14, %f11;
|
||||
mov.f32 %f15, %f12;
|
||||
$Lt_3_14594:
|
||||
cvt.s64.s32 %rd13, %r43;
|
||||
mul.wide.s32 %rd14, %r43, 4;
|
||||
setp.ge.s32 %p5, %r43, %r35;
|
||||
@%p5 bra $Lt_3_15362;
|
||||
.loc 16 186 0
|
||||
mov.s32 %r52, %r35;
|
||||
.loc 16 187 0
|
||||
ld.param.u64 %rd15, [__cudaparm_calc_neigh_list_cell_nbor_list];
|
||||
add.u64 %rd16, %rd13, %rd5;
|
||||
mul.lo.u64 %rd17, %rd16, 4;
|
||||
add.u64 %rd18, %rd15, %rd17;
|
||||
mov.s64 %rd19, %rd18;
|
||||
.loc 16 188 0
|
||||
add.u64 %rd20, %rd6, %rd18;
|
||||
.loc 16 189 0
|
||||
add.u64 %rd21, %rd14, %rd15;
|
||||
st.global.s32 [%rd21+0], %r43;
|
||||
bra.uni $Lt_3_15106;
|
||||
$Lt_3_15362:
|
||||
.loc 16 192 0
|
||||
ld.param.u64 %rd22, [__cudaparm_calc_neigh_list_cell_host_numj];
|
||||
add.u64 %rd23, %rd22, %rd14;
|
||||
sub.u64 %rd19, %rd23, %rd6;
|
||||
.loc 16 193 0
|
||||
ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_nbor_list];
|
||||
ld.param.s32 %r53, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
|
||||
sub.s32 %r54, %r43, %r35;
|
||||
mul.lo.s32 %r55, %r53, %r54;
|
||||
cvt.s64.s32 %rd25, %r55;
|
||||
mul.wide.s32 %rd26, %r55, 4;
|
||||
add.u64 %rd20, %rd24, %rd26;
|
||||
mov.s32 %r52, 1;
|
||||
$Lt_3_15106:
|
||||
.loc 16 198 0
|
||||
mov.s32 %r56, %r39;
|
||||
@!%p2 bra $Lt_3_23298;
|
||||
sub.s32 %r57, %r25, %r39;
|
||||
add.s32 %r58, %r57, 1;
|
||||
setp.le.s32 %p6, %r18, %r21;
|
||||
add.s32 %r59, %r25, 1;
|
||||
mov.s32 %r60, 0;
|
||||
mov.s32 %r61, %r58;
|
||||
$Lt_3_16130:
|
||||
//<loop> Loop body line 198, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 199 0
|
||||
mov.s32 %r62, %r18;
|
||||
@!%p6 bra $Lt_3_16386;
|
||||
sub.s32 %r63, %r21, %r18;
|
||||
add.s32 %r64, %r63, 1;
|
||||
setp.ge.s32 %p7, %r31, %r28;
|
||||
add.s32 %r65, %r21, 1;
|
||||
mov.s32 %r66, %r64;
|
||||
$Lt_3_16898:
|
||||
//<loop> Loop body line 199, nesting depth: 2, estimated iterations: unknown
|
||||
@!%p7 bra $Lt_3_17154;
|
||||
sub.s32 %r67, %r31, %r28;
|
||||
add.s32 %r68, %r67, 1;
|
||||
mul.lo.s32 %r69, %r62, %r5;
|
||||
mul.lo.s32 %r70, %r56, %r5;
|
||||
mul.lo.s32 %r71, %r70, %r1;
|
||||
add.s32 %r72, %r31, 1;
|
||||
add.s32 %r73, %r69, %r71;
|
||||
add.s32 %r74, %r73, %r28;
|
||||
add.s32 %r75, %r72, %r73;
|
||||
cvt.s64.s32 %rd27, %r74;
|
||||
mul.wide.s32 %rd28, %r74, 4;
|
||||
add.u64 %rd29, %rd1, %rd28;
|
||||
mov.s32 %r76, %r68;
|
||||
$Lt_3_17666:
|
||||
//<loop> Loop body line 199, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 16 204 0
|
||||
ld.global.s32 %r77, [%rd29+0];
|
||||
.loc 16 205 0
|
||||
ld.global.s32 %r78, [%rd29+4];
|
||||
.loc 16 209 0
|
||||
sub.s32 %r79, %r78, %r77;
|
||||
cvt.rn.f32.s32 %f16, %r79;
|
||||
mov.f32 %f17, 0f43000000; // 128
|
||||
div.approx.ftz.f32 %f18, %f16, %f17;
|
||||
cvt.rpi.ftz.f32.f32 %f19, %f18;
|
||||
cvt.rzi.ftz.s32.f32 %r80, %f19;
|
||||
mov.u32 %r81, 0;
|
||||
setp.le.s32 %p8, %r80, %r81;
|
||||
@%p8 bra $Lt_3_17922;
|
||||
mov.s32 %r82, %r80;
|
||||
mov.s32 %r83, 0;
|
||||
setp.lt.s32 %p9, %r43, %r40;
|
||||
mul.lo.s32 %r84, %r80, 128;
|
||||
mov.s32 %r85, %r82;
|
||||
$Lt_3_18434:
|
||||
//<loop> Loop body line 209, nesting depth: 4, estimated iterations: unknown
|
||||
sub.s32 %r86, %r79, %r83;
|
||||
mov.s32 %r87, 128;
|
||||
min.s32 %r88, %r86, %r87;
|
||||
setp.le.s32 %p10, %r88, %r32;
|
||||
@%p10 bra $Lt_3_18690;
|
||||
.loc 16 215 0
|
||||
ld.param.u64 %rd30, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
|
||||
add.s32 %r89, %r83, %r32;
|
||||
add.s32 %r90, %r77, %r89;
|
||||
cvt.s64.s32 %rd31, %r90;
|
||||
mul.wide.s32 %rd32, %r90, 4;
|
||||
add.u64 %rd33, %rd30, %rd32;
|
||||
ld.global.s32 %r91, [%rd33+0];
|
||||
.loc 16 216 0
|
||||
cvt.s64.s32 %rd34, %r32;
|
||||
mul.wide.s32 %rd35, %r32, 4;
|
||||
add.u64 %rd36, %rd7, %rd35;
|
||||
st.shared.s32 [%rd36+0], %r91;
|
||||
.loc 16 217 0
|
||||
mov.u32 %r92, %r91;
|
||||
mov.s32 %r93, 0;
|
||||
mov.u32 %r94, %r93;
|
||||
mov.s32 %r95, 0;
|
||||
mov.u32 %r96, %r95;
|
||||
mov.s32 %r97, 0;
|
||||
mov.u32 %r98, %r97;
|
||||
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r92,%r94,%r96,%r98}];
|
||||
mov.f32 %f24, %f20;
|
||||
mov.f32 %f25, %f21;
|
||||
mov.f32 %f26, %f22;
|
||||
.loc 16 218 0
|
||||
mul.lo.u64 %rd37, %rd34, 16;
|
||||
add.u64 %rd38, %rd8, %rd37;
|
||||
st.shared.v2.f32 [%rd38+0], {%f24,%f25};
|
||||
.loc 16 220 0
|
||||
st.shared.f32 [%rd38+8], %f26;
|
||||
$Lt_3_18690:
|
||||
.loc 16 222 0
|
||||
bar.sync 0;
|
||||
@!%p9 bra $Lt_3_19714;
|
||||
mov.u32 %r99, 0;
|
||||
setp.le.s32 %p11, %r88, %r99;
|
||||
@%p11 bra $Lt_3_19714;
|
||||
mov.s32 %r100, %r88;
|
||||
mov.s64 %rd39, 0;
|
||||
ld.param.f32 %f27, [__cudaparm_calc_neigh_list_cell_cell_size];
|
||||
mul.ftz.f32 %f28, %f27, %f27;
|
||||
mov.s64 %rd40, %rd8;
|
||||
mov.f32 %f29, %f15;
|
||||
mov.f32 %f30, %f14;
|
||||
mov.f32 %f31, %f13;
|
||||
mov.s32 %r101, 0;
|
||||
mov.s32 %r102, %r100;
|
||||
$Lt_3_20226:
|
||||
//<loop> Loop body line 222, nesting depth: 5, estimated iterations: unknown
|
||||
ld.shared.v4.f32 {%f32,%f33,%f34,_}, [%rd40+0];
|
||||
.loc 16 228 0
|
||||
sub.ftz.f32 %f35, %f31, %f32;
|
||||
.loc 16 229 0
|
||||
sub.ftz.f32 %f36, %f30, %f33;
|
||||
.loc 16 230 0
|
||||
sub.ftz.f32 %f37, %f29, %f34;
|
||||
.loc 16 227 0
|
||||
mul.ftz.f32 %f38, %f36, %f36;
|
||||
fma.rn.ftz.f32 %f39, %f35, %f35, %f38;
|
||||
fma.rn.ftz.f32 %f40, %f37, %f37, %f39;
|
||||
setp.gt.ftz.f32 %p12, %f28, %f40;
|
||||
@!%p12 bra $Lt_3_24578;
|
||||
cvt.ftz.f64.f32 %fd1, %f40;
|
||||
mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05
|
||||
setp.gt.f64 %p13, %fd1, %fd2;
|
||||
@!%p13 bra $Lt_3_24578;
|
||||
ld.param.s32 %r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
|
||||
setp.le.s32 %p14, %r103, %r60;
|
||||
@%p14 bra $Lt_3_20482;
|
||||
.loc 16 235 0
|
||||
mul.lo.u64 %rd41, %rd39, 4;
|
||||
add.u64 %rd42, %rd7, %rd41;
|
||||
ld.shared.s32 %r104, [%rd42+0];
|
||||
st.global.s32 [%rd20+0], %r104;
|
||||
.loc 16 236 0
|
||||
cvt.s64.s32 %rd43, %r52;
|
||||
mul.wide.s32 %rd44, %r52, 4;
|
||||
add.u64 %rd20, %rd20, %rd44;
|
||||
$Lt_3_20482:
|
||||
.loc 16 238 0
|
||||
add.s32 %r60, %r60, 1;
|
||||
$Lt_3_24578:
|
||||
$L_3_12802:
|
||||
add.s32 %r101, %r101, 1;
|
||||
add.s64 %rd39, %rd39, 1;
|
||||
add.u64 %rd40, %rd40, 16;
|
||||
setp.ne.s32 %p15, %r88, %r101;
|
||||
@%p15 bra $Lt_3_20226;
|
||||
$Lt_3_19714:
|
||||
$Lt_3_19202:
|
||||
.loc 16 242 0
|
||||
bar.sync 0;
|
||||
add.s32 %r83, %r83, 128;
|
||||
setp.ne.s32 %p16, %r83, %r84;
|
||||
@%p16 bra $Lt_3_18434;
|
||||
$Lt_3_17922:
|
||||
add.s32 %r74, %r74, 1;
|
||||
add.u64 %rd29, %rd29, 4;
|
||||
setp.ne.s32 %p17, %r74, %r75;
|
||||
@%p17 bra $Lt_3_17666;
|
||||
$Lt_3_17154:
|
||||
add.s32 %r62, %r62, 1;
|
||||
setp.ne.s32 %p18, %r65, %r62;
|
||||
@%p18 bra $Lt_3_16898;
|
||||
$Lt_3_16386:
|
||||
add.s32 %r56, %r56, 1;
|
||||
setp.ne.s32 %p19, %r59, %r56;
|
||||
@%p19 bra $Lt_3_16130;
|
||||
bra.uni $Lt_3_15618;
|
||||
$Lt_3_23298:
|
||||
mov.s32 %r60, 0;
|
||||
$Lt_3_15618:
|
||||
@!%p4 bra $Lt_3_22274;
|
||||
.loc 16 248 0
|
||||
st.global.s32 [%rd19+0], %r60;
|
||||
$Lt_3_22274:
|
||||
.loc 16 172 0
|
||||
add.s32 %r42, %r42, 1;
|
||||
add.u32 %r34, %r34, %r15;
|
||||
add.s32 %r37, %r37, %r15;
|
||||
cvt.rn.f32.s32 %f41, %r42;
|
||||
setp.lt.ftz.f32 %p20, %f41, %f4;
|
||||
@%p20 bra $Lt_3_13826;
|
||||
$Lt_3_13314:
|
||||
.loc 16 250 0
|
||||
exit;
|
||||
$LDWend_calc_neigh_list_cell:
|
||||
} // calc_neigh_list_cell
|
||||
|
||||
//-----------------------------------------------------------
// kernel_special
// One thread per index ii = ctaid.x*ntid.x + tid.x; threads with ii >= nt exit.
// Each thread walks one neighbor list and, for every stored neighbor whose
// tag[] value equals one of the thread's special[] entries, XORs a code
// (1, 2 or 3) shifted left by 30 bits into the stored neighbor value and
// writes it back in place.
// List location:
//   ii <  inum : count at dev_nbor[ii+inum], entries start at dev_nbor[ii+2*inum],
//                consecutive entries are inum ints apart.
//   ii >= inum : count at host_numj[ii-inum], entries start at
//                host_nbor_list[(ii-inum)*max_nbors], consecutive entries are 1 int apart.
//-----------------------------------------------------------
.entry kernel_special (
|
||||
.param .u64 __cudaparm_kernel_special_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_special_host_nbor_list,
|
||||
.param .u64 __cudaparm_kernel_special_host_numj,
|
||||
.param .u64 __cudaparm_kernel_special_tag,
|
||||
.param .u64 __cudaparm_kernel_special_nspecial,
|
||||
.param .u64 __cudaparm_kernel_special_special,
|
||||
.param .s32 __cudaparm_kernel_special_inum,
|
||||
.param .s32 __cudaparm_kernel_special_nt,
|
||||
.param .s32 __cudaparm_kernel_special_max_nbors)
|
||||
{
|
||||
.reg .u32 %r<34>;
|
||||
.reg .u64 %rd<36>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 16 256 0
|
||||
$LDWbegin_kernel_special:
|
||||
// %r5 = ii = ctaid.x*ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %ntid.x;
|
||||
mul.lo.u32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r4, %r3;
|
||||
// %r6 = nt; threads with nt <= ii go straight to exit
ld.param.s32 %r6, [__cudaparm_kernel_special_nt];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_4_6146;
|
||||
.loc 16 264 0
|
||||
// Load three consecutive ints from nspecial[3*ii .. 3*ii+2] into %r8, %r9, %r10
ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];
|
||||
mul.lo.s32 %r7, %r5, 3;
|
||||
cvt.s64.s32 %rd2, %r7;
|
||||
mul.wide.s32 %rd3, %r7, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
ld.global.s32 %r8, [%rd4+0];
|
||||
.loc 16 265 0
|
||||
ld.global.s32 %r9, [%rd4+4];
|
||||
.loc 16 266 0
|
||||
ld.global.s32 %r10, [%rd4+8];
|
||||
// %r11 = inum; ii >= inum takes the host-list path at $Lt_4_6914
ld.param.s32 %r11, [__cudaparm_kernel_special_inum];
|
||||
setp.le.s32 %p2, %r11, %r5;
|
||||
@%p2 bra $Lt_4_6914;
|
||||
.loc 16 270 0
|
||||
// Device-list path: entry stride %r12 = inum
mov.s32 %r12, %r11;
|
||||
.loc 16 272 0
|
||||
// %r13 = neighbor count read from dev_nbor[ii + inum]
cvt.s64.s32 %rd5, %r11;
|
||||
ld.param.u64 %rd6, [__cudaparm_kernel_special_dev_nbor];
|
||||
cvt.s64.s32 %rd7, %r5;
|
||||
add.u64 %rd8, %rd7, %rd5;
|
||||
mul.lo.u64 %rd9, %rd8, 4;
|
||||
add.u64 %rd10, %rd6, %rd9;
|
||||
ld.global.s32 %r13, [%rd10+0];
|
||||
.loc 16 273 0
|
||||
// %rd12 = list cursor = &dev_nbor[ii + 2*inum]
mul.wide.s32 %rd11, %r11, 4;
|
||||
add.u64 %rd12, %rd10, %rd11;
|
||||
bra.uni $Lt_4_6658;
|
||||
$Lt_4_6914:
|
||||
.loc 16 276 0
|
||||
// Host-list path: %r14 = ii - inum; %rd12 = &host_nbor_list[max_nbors * (ii - inum)]
sub.s32 %r14, %r5, %r11;
|
||||
ld.param.u64 %rd13, [__cudaparm_kernel_special_host_nbor_list];
|
||||
ld.param.s32 %r15, [__cudaparm_kernel_special_max_nbors];
|
||||
mul.lo.s32 %r16, %r15, %r14;
|
||||
cvt.s64.s32 %rd14, %r16;
|
||||
mul.wide.s32 %rd15, %r16, 4;
|
||||
add.u64 %rd12, %rd13, %rd15;
|
||||
.loc 16 277 0
|
||||
// %r13 = neighbor count read from host_numj[ii - inum]; entry stride %r12 = 1
ld.param.u64 %rd16, [__cudaparm_kernel_special_host_numj];
|
||||
cvt.s64.s32 %rd17, %r14;
|
||||
mul.wide.s32 %rd18, %r14, 4;
|
||||
add.u64 %rd19, %rd16, %rd18;
|
||||
ld.global.s32 %r13, [%rd19+0];
|
||||
mov.s32 %r12, 1;
|
||||
$Lt_4_6658:
|
||||
.loc 16 279 0
|
||||
// %rd22 = end pointer = cursor + count*stride ints; skip everything if end <= cursor
mul.lo.s32 %r17, %r13, %r12;
|
||||
cvt.s64.s32 %rd20, %r17;
|
||||
mul.wide.s32 %rd21, %r17, 4;
|
||||
add.u64 %rd22, %rd12, %rd21;
|
||||
setp.le.u64 %p3, %rd22, %rd12;
|
||||
@%p3 bra $Lt_4_7170;
|
||||
// %p4 = (third nspecial value %r10 > 0): whether the inner search loop runs at all
mov.s32 %r18, 0;
|
||||
setp.gt.s32 %p4, %r10, %r18;
|
||||
cvt.s64.s32 %rd23, %r12;
|
||||
ld.param.u64 %rd24, [__cudaparm_kernel_special_tag];
|
||||
$Lt_4_7682:
|
||||
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 282 0
|
||||
// %r19 = neighbor value at the cursor
ld.global.s32 %r19, [%rd12+0];
|
||||
.loc 16 283 0
|
||||
// %r20 = tag[%r19]
cvt.s64.s32 %rd25, %r19;
|
||||
mul.wide.s32 %rd26, %r19, 4;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.s32 %r20, [%rd27+0];
|
||||
@!%p4 bra $Lt_4_7938;
|
||||
// Inner loop setup: %rd33 = &special[ii], advanced by nt ints (%rd30 bytes) per
// iteration; %r22 = iteration counter i, running from 0 to %r10 - 1
mov.s32 %r21, %r10;
|
||||
cvt.s64.s32 %rd28, %r5;
|
||||
cvt.s64.s32 %rd29, %r6;
|
||||
mul.wide.s32 %rd30, %r6, 4;
|
||||
ld.param.u64 %rd31, [__cudaparm_kernel_special_special];
|
||||
mul.wide.s32 %rd32, %r5, 4;
|
||||
add.u64 %rd33, %rd31, %rd32;
|
||||
mov.s32 %r22, 0;
|
||||
mov.s32 %r23, %r21;
|
||||
$Lt_4_8450:
|
||||
//<loop> Loop body line 283, nesting depth: 1, estimated iterations: unknown
|
||||
// Compare special[ii + i*nt] against the neighbor's tag; skip the update on mismatch
ld.global.s32 %r24, [%rd33+0];
|
||||
setp.ne.s32 %p5, %r24, %r20;
|
||||
@%p5 bra $Lt_4_8706;
|
||||
.loc 16 293 0
|
||||
// Select the code in %r31:
//   i >= %r9 : (i >= %r8 ? 3 : 2)
//   i <  %r9 : (i >= %r8 ? 2 : 1)
setp.le.s32 %p6, %r8, %r22;
|
||||
mov.s32 %r25, 3;
|
||||
mov.s32 %r26, 2;
|
||||
selp.s32 %r27, %r25, %r26, %p6;
|
||||
mov.s32 %r28, 2;
|
||||
mov.s32 %r29, 1;
|
||||
selp.s32 %r30, %r28, %r29, %p6;
|
||||
setp.le.s32 %p7, %r9, %r22;
|
||||
selp.s32 %r31, %r27, %r30, %p7;
|
||||
// XOR (code << 30) into the neighbor value, i.e. into its top two bits
shl.b32 %r32, %r31, 30;
|
||||
xor.b32 %r19, %r19, %r32;
|
||||
.loc 16 294 0
|
||||
// Write the modified neighbor value back to the list in place
st.global.s32 [%rd12+0], %r19;
|
||||
$Lt_4_8706:
|
||||
// i++, advance the special[] pointer by nt ints, loop until i == %r10
add.s32 %r22, %r22, 1;
|
||||
add.u64 %rd33, %rd30, %rd33;
|
||||
setp.ne.s32 %p8, %r10, %r22;
|
||||
@%p8 bra $Lt_4_8450;
|
||||
$Lt_4_7938:
|
||||
.loc 16 281 0
|
||||
// Advance the list cursor by one entry (stride*4 bytes); repeat while end > cursor
mul.lo.u64 %rd34, %rd23, 4;
|
||||
add.u64 %rd12, %rd12, %rd34;
|
||||
setp.gt.u64 %p9, %rd22, %rd12;
|
||||
@%p9 bra $Lt_4_7682;
|
||||
$Lt_4_7170:
|
||||
$Lt_4_6146:
|
||||
.loc 16 300 0
|
||||
exit;
|
||||
$LDWend_kernel_special:
|
||||
} // kernel_special
|
||||
|
|
@ -1,134 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bba8_00000000-9_pair_gpu_dev_kernel.cpp3.i (/home/sjplimp/ccBI#.SuFQHy)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bba8_00000000-8_pair_gpu_dev_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "pair_gpu_dev_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------
// kernel_zero
// One thread per element: thread ii = ctaid.x*ntid.x + tid.x stores a 32-bit
// zero to mem[ii] (4-byte elements) when ii < numel; other threads do nothing.
//-----------------------------------------------------------
.entry kernel_zero (
|
||||
.param .u64 __cudaparm_kernel_zero_mem,
|
||||
.param .s32 __cudaparm_kernel_zero_numel)
|
||||
{
|
||||
.reg .u32 %r<9>;
|
||||
.reg .u64 %rd<6>;
|
||||
.reg .pred %p<3>;
|
||||
.loc 16 95 0
|
||||
$LDWbegin_kernel_zero:
|
||||
// %r5 = ii = ctaid.x*ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %ntid.x;
|
||||
mul.lo.u32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r4, %r3;
|
||||
// Bounds guard (signed compare): skip the store when numel <= ii
ld.param.s32 %r6, [__cudaparm_kernel_zero_numel];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_0_1026;
|
||||
.loc 16 99 0
|
||||
// mem[ii] = 0; the address is mem + ii*4 with ii sign-extended to 64 bits
mov.s32 %r7, 0;
|
||||
ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];
|
||||
cvt.s64.s32 %rd2, %r5;
|
||||
mul.wide.s32 %rd3, %r5, 4;
|
||||
add.u64 %rd4, %rd1, %rd3;
|
||||
st.global.s32 [%rd4+0], %r7;
|
||||
$Lt_0_1026:
|
||||
.loc 16 100 0
|
||||
exit;
|
||||
$LDWend_kernel_zero:
|
||||
} // kernel_zero
|
||||
|
||||
//-----------------------------------------------------------
// kernel_info
// Writes 14 compile-time integer constants to info[0..13] (4-byte slots at
// byte offsets 0..52). There is no thread-index guard: every launched thread
// performs the same stores.
// NOTE(review): the meaning of each slot is not visible in this PTX; presumably
// these are build-time configuration values (architecture, block sizes, ...)
// defined in pair_gpu_dev_kernel.cu lines 103-116 -- confirm against that source.
//-----------------------------------------------------------
.entry kernel_info (
|
||||
.param .u64 __cudaparm_kernel_info_info)
|
||||
{
|
||||
.reg .u32 %r<16>;
|
||||
.reg .u64 %rd<3>;
|
||||
.loc 16 102 0
|
||||
$LDWbegin_kernel_info:
|
||||
.loc 16 103 0
|
||||
// info[0] = 200
ld.param.u64 %rd1, [__cudaparm_kernel_info_info];
|
||||
mov.s32 %r1, 200;
|
||||
st.global.s32 [%rd1+0], %r1;
|
||||
.loc 16 104 0
|
||||
// info[1] = 32
mov.s32 %r2, 32;
|
||||
st.global.s32 [%rd1+4], %r2;
|
||||
.loc 16 105 0
|
||||
// info[2] = 32
mov.s32 %r3, 32;
|
||||
st.global.s32 [%rd1+8], %r3;
|
||||
.loc 16 106 0
|
||||
// info[3] = 1
mov.s32 %r4, 1;
|
||||
st.global.s32 [%rd1+12], %r4;
|
||||
.loc 16 107 0
|
||||
// info[4] = 8
mov.s32 %r5, 8;
|
||||
st.global.s32 [%rd1+16], %r5;
|
||||
.loc 16 108 0
|
||||
// info[5] = 64
mov.s32 %r6, 64;
|
||||
st.global.s32 [%rd1+20], %r6;
|
||||
.loc 16 109 0
|
||||
// info[6] = 128
mov.s32 %r7, 128;
|
||||
st.global.s32 [%rd1+24], %r7;
|
||||
.loc 16 110 0
|
||||
// info[7] = 11
mov.s32 %r8, 11;
|
||||
st.global.s32 [%rd1+28], %r8;
|
||||
.loc 16 111 0
|
||||
// info[8] = 8
mov.s32 %r9, 8;
|
||||
st.global.s32 [%rd1+32], %r9;
|
||||
.loc 16 112 0
|
||||
// info[9] = 128
mov.s32 %r10, 128;
|
||||
st.global.s32 [%rd1+36], %r10;
|
||||
.loc 16 113 0
|
||||
// info[10] = 128
mov.s32 %r11, 128;
|
||||
st.global.s32 [%rd1+40], %r11;
|
||||
.loc 16 114 0
|
||||
// info[11] = 128
mov.s32 %r12, 128;
|
||||
st.global.s32 [%rd1+44], %r12;
|
||||
.loc 16 115 0
|
||||
// info[12] = 128
mov.s32 %r13, 128;
|
||||
st.global.s32 [%rd1+48], %r13;
|
||||
.loc 16 116 0
|
||||
// info[13] = 8
mov.s32 %r14, 8;
|
||||
st.global.s32 [%rd1+52], %r14;
|
||||
.loc 16 117 0
|
||||
exit;
|
||||
$LDWend_kernel_info:
|
||||
} // kernel_info
|
||||
|
|
@ -1,118 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bb58_00000000-9_pair_gpu_nbor_kernel.cpp3.i (/home/sjplimp/ccBI#.bBFvWV)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bb58_00000000-8_pair_gpu_nbor_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "pair_gpu_nbor_kernel.cu"
|
||||
.file 17 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 18 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 20 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
|
||||
//-----------------------------------------------------------
// kernel_unpack
// One thread per index ii = ctaid.x*ntid.x + tid.x; threads with ii >= inum exit.
// For its ii the thread reads
//   count  = dev_nbor[ii + inum]
//   offset = dev_nbor[ii + 2*inum]
// and copies `count` consecutive ints starting at dev_ij[offset] into dev_nbor,
// starting at dev_nbor[ii + 2*inum] and stepping inum ints between writes.
// The first write overwrites the slot the offset was read from.
//-----------------------------------------------------------
.entry kernel_unpack (
|
||||
.param .u64 __cudaparm_kernel_unpack_dev_nbor,
|
||||
.param .u64 __cudaparm_kernel_unpack_dev_ij,
|
||||
.param .s32 __cudaparm_kernel_unpack_inum)
|
||||
{
|
||||
.reg .u32 %r<11>;
|
||||
.reg .u64 %rd<27>;
|
||||
.reg .pred %p<5>;
|
||||
.loc 16 29 0
|
||||
$LDWbegin_kernel_unpack:
|
||||
// %r5 = ii = ctaid.x*ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
|
||||
mov.u32 %r2, %ntid.x;
|
||||
mul.lo.u32 %r3, %r1, %r2;
|
||||
mov.u32 %r4, %tid.x;
|
||||
add.u32 %r5, %r4, %r3;
|
||||
// %r6 = inum; threads with inum <= ii go straight to exit
ld.param.s32 %r6, [__cudaparm_kernel_unpack_inum];
|
||||
setp.le.s32 %p1, %r6, %r5;
|
||||
@%p1 bra $Lt_0_2050;
|
||||
.loc 16 35 0
|
||||
// %r7 = count, loaded from dev_nbor[ii + inum]
cvt.s64.s32 %rd1, %r6;
|
||||
ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];
|
||||
cvt.s64.s32 %rd3, %r5;
|
||||
add.u64 %rd4, %rd3, %rd1;
|
||||
mul.lo.u64 %rd5, %rd4, 4;
|
||||
add.u64 %rd6, %rd2, %rd5;
|
||||
ld.global.s32 %r7, [%rd6+0];
|
||||
.loc 16 36 0
|
||||
// %rd9 = write cursor = &dev_nbor[ii + 2*inum]; %rd7 = inum*4 is the write stride in bytes
mul.wide.s32 %rd7, %r6, 4;
|
||||
add.u64 %rd8, %rd6, %rd7;
|
||||
mov.s64 %rd9, %rd8;
|
||||
.loc 16 37 0
|
||||
// %rd13 = read cursor = &dev_ij[offset], where offset is the int currently at the write cursor
ld.param.u64 %rd10, [__cudaparm_kernel_unpack_dev_ij];
|
||||
ld.global.s32 %r8, [%rd8+0];
|
||||
cvt.s64.s32 %rd11, %r8;
|
||||
mul.wide.s32 %rd12, %r8, 4;
|
||||
add.u64 %rd13, %rd10, %rd12;
|
||||
.loc 16 38 0
|
||||
// %rd16 = read end = read cursor + count*4 bytes; nothing to copy if end <= cursor
cvt.s64.s32 %rd14, %r7;
|
||||
mul.wide.s32 %rd15, %r7, 4;
|
||||
add.u64 %rd16, %rd15, %rd13;
|
||||
setp.le.u64 %p2, %rd16, %rd13;
|
||||
@%p2 bra $Lt_0_2562;
|
||||
// Compiler-generated trip-count computation: %rd25 = max((count*4 + 3) / 4, 1)
// using a signed round-toward-zero divide. %rd25 is not read again before the
// kernel exits; loop termination below is decided by the pointer comparison.
add.u64 %rd17, %rd15, 3;
|
||||
shr.s64 %rd18, %rd17, 63;
|
||||
mov.s64 %rd19, 3;
|
||||
and.b64 %rd20, %rd18, %rd19;
|
||||
add.s64 %rd21, %rd20, %rd17;
|
||||
shr.s64 %rd22, %rd21, 2;
|
||||
mov.s64 %rd23, 1;
|
||||
max.s64 %rd24, %rd22, %rd23;
|
||||
mov.s64 %rd25, %rd24;
|
||||
$Lt_0_3074:
|
||||
//<loop> Loop body line 38, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 16 41 0
|
||||
// Copy one int from the packed source to the strided destination
ld.global.s32 %r9, [%rd13+0];
|
||||
st.global.s32 [%rd9+0], %r9;
|
||||
.loc 16 42 0
|
||||
// Destination advances by inum ints
add.u64 %rd9, %rd7, %rd9;
|
||||
.loc 16 40 0
|
||||
// Source advances by one int; continue while end > source cursor
add.u64 %rd13, %rd13, 4;
|
||||
setp.gt.u64 %p3, %rd16, %rd13;
|
||||
@%p3 bra $Lt_0_3074;
|
||||
$Lt_0_2562:
|
||||
$Lt_0_2050:
|
||||
.loc 16 45 0
|
||||
exit;
|
||||
$LDWend_kernel_unpack:
|
||||
} // kernel_unpack
|
||||
|
|
@ -1,900 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bc69_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.fFsh3D)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bc69_00000000-8_pppm_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 17 "pppm_gpu_kernel.cu"
|
||||
.file 18 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 20 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 21 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
.global .texref q_tex;
|
||||
|
||||
//-----------------------------------------------------------
// particle_map
// Each thread derives one index from its global thread id, fetches a 4-float
// position from pos_tex and a value from q_tex at that index, scales the
// q_tex value by delvolinv, and (if the result is non-zero) bins the position
// on a grid of nlocal_x * nlocal_y * nlocal_z cells:
//   - out-of-range position            -> *error = 1
//   - cell already holds max_atoms     -> *error = 2, counter increment undone
//   - otherwise                        -> atomically claims a slot in counts[cell]
//                                         and stores four doubles to ans[]
// The x_ and q_ pointer parameters are not read by this PTX; the data comes
// from the pos_tex / q_tex texture references.
//-----------------------------------------------------------
.entry particle_map (
|
||||
.param .u64 __cudaparm_particle_map_x_,
|
||||
.param .u64 __cudaparm_particle_map_q_,
|
||||
.param .f64 __cudaparm_particle_map_delvolinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal,
|
||||
.param .u64 __cudaparm_particle_map_counts,
|
||||
.param .u64 __cudaparm_particle_map_ans,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_x,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_y,
|
||||
.param .f64 __cudaparm_particle_map_b_lo_z,
|
||||
.param .f64 __cudaparm_particle_map_delxinv,
|
||||
.param .f64 __cudaparm_particle_map_delyinv,
|
||||
.param .f64 __cudaparm_particle_map_delzinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_x,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_y,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_z,
|
||||
.param .s32 __cudaparm_particle_map_atom_stride,
|
||||
.param .s32 __cudaparm_particle_map_max_atoms,
|
||||
.param .u64 __cudaparm_particle_map_error)
|
||||
{
|
||||
.reg .u32 %r<50>;
|
||||
.reg .u64 %rd<12>;
|
||||
.reg .f32 %f<14>;
|
||||
.reg .f64 %fd<36>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 17 113 0
|
||||
$LDWbegin_particle_map:
|
||||
// %r7 = global thread id; %r5 = total thread count (nctaid.x * ntid.x)
mov.u32 %r1, %ntid.x;
|
||||
mov.u32 %r2, %ctaid.x;
|
||||
mul.lo.u32 %r3, %r2, %r1;
|
||||
mov.u32 %r4, %nctaid.x;
|
||||
mul.lo.u32 %r5, %r4, %r1;
|
||||
mov.u32 %r6, %tid.x;
|
||||
add.u32 %r7, %r6, %r3;
|
||||
// Index remap: t = id*64; %r12 = t - (t / nthreads) * (nthreads - 1)
sub.s32 %r8, %r5, 1;
|
||||
mul.lo.s32 %r9, %r7, 64;
|
||||
div.s32 %r10, %r9, %r5;
|
||||
mul.lo.s32 %r11, %r8, %r10;
|
||||
sub.s32 %r12, %r9, %r11;
|
||||
// Threads whose remapped index is >= nlocal exit
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
|
||||
setp.le.s32 %p1, %r13, %r12;
|
||||
@%p1 bra $Lt_0_7426;
|
||||
.loc 17 125 0
|
||||
// Fetch pos_tex[%r12]; the first three components are kept in %f5, %f6, %f7
mov.u32 %r14, %r12;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
mov.s32 %r17, 0;
|
||||
mov.u32 %r18, %r17;
|
||||
mov.s32 %r19, 0;
|
||||
mov.u32 %r20, %r19;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
.loc 17 127 0
|
||||
// Fetch q_tex[%r12]; only the first component (%f12) is used
mov.u32 %r21, %r12;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
|
||||
mov.f32 %f12, %f8;
|
||||
// %fd3 = (double)q * delvolinv; exit when it compares equal to 0.0
// (setp.neu is also true for NaN, so a NaN value continues)
cvt.ftz.f64.f32 %fd1, %f12;
|
||||
ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];
|
||||
mul.f64 %fd3, %fd1, %fd2;
|
||||
mov.f64 %fd4, 0d0000000000000000; // 0
|
||||
setp.neu.f64 %p2, %fd3, %fd4;
|
||||
@!%p2 bra $Lt_0_7426;
|
||||
.loc 17 130 0
|
||||
// %fd9 = delxinv * (x - b_lo_x); negative -> error path
ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];
|
||||
cvt.ftz.f64.f32 %fd6, %f5;
|
||||
ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];
|
||||
sub.f64 %fd8, %fd6, %fd7;
|
||||
mul.f64 %fd9, %fd5, %fd8;
|
||||
mov.f64 %fd10, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p3, %fd9, %fd10;
|
||||
@%p3 bra $Lt_0_8706;
|
||||
// %fd15 = delyinv * (y - b_lo_y); negative -> error path
ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];
|
||||
cvt.ftz.f64.f32 %fd12, %f6;
|
||||
ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];
|
||||
sub.f64 %fd14, %fd12, %fd13;
|
||||
mul.f64 %fd15, %fd11, %fd14;
|
||||
mov.f64 %fd16, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p4, %fd15, %fd16;
|
||||
@%p4 bra $Lt_0_8706;
|
||||
// %fd21 = delzinv * (z - b_lo_z); negative -> error path
ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];
|
||||
cvt.ftz.f64.f32 %fd18, %f7;
|
||||
ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];
|
||||
sub.f64 %fd20, %fd18, %fd19;
|
||||
mul.f64 %fd21, %fd17, %fd20;
|
||||
mov.f64 %fd22, 0d0000000000000000; // 0
|
||||
setp.lt.f64 %p5, %fd21, %fd22;
|
||||
@%p5 bra $Lt_0_8706;
|
||||
// Truncate each scaled coordinate to an integer cell index (%r28, %r30, %r32)
// and reject indices >= nlocal_x / nlocal_y / nlocal_z
cvt.rzi.s32.f64 %r28, %fd9;
|
||||
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
|
||||
setp.ge.s32 %p6, %r28, %r29;
|
||||
@%p6 bra $Lt_0_8706;
|
||||
cvt.rzi.s32.f64 %r30, %fd15;
|
||||
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
|
||||
setp.ge.s32 %p7, %r30, %r31;
|
||||
@%p7 bra $Lt_0_8706;
|
||||
cvt.rzi.s32.f64 %r32, %fd21;
|
||||
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
|
||||
setp.gt.s32 %p8, %r33, %r32;
|
||||
@%p8 bra $L_0_4866;
|
||||
$Lt_0_8706:
|
||||
$L_0_5122:
|
||||
.loc 17 139 0
|
||||
// Out-of-range position: *error = 1, then exit
mov.s32 %r34, 1;
|
||||
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd1+0], %r34;
|
||||
bra.uni $Lt_0_7426;
|
||||
$L_0_4866:
|
||||
.loc 17 146 0
|
||||
// %r38 = flat cell index = ix + nlocal_x * (iy + nlocal_y * iz)
mul.lo.s32 %r35, %r32, %r31;
|
||||
add.s32 %r36, %r30, %r35;
|
||||
mul.lo.s32 %r37, %r36, %r29;
|
||||
add.s32 %r38, %r28, %r37;
|
||||
// Atomically increment counts[cell]; %r41 = value before the increment
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
|
||||
cvt.s64.s32 %rd3, %r38;
|
||||
mul.wide.s32 %rd4, %r38, 4;
|
||||
add.u64 %rd5, %rd2, %rd4;
|
||||
mov.s32 %r39, 1;
|
||||
atom.global.add.s32 %r40, [%rd5], %r39;
|
||||
mov.s32 %r41, %r40;
|
||||
// Slot is usable only when the previous count is < max_atoms
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
|
||||
setp.gt.s32 %p9, %r42, %r41;
|
||||
@%p9 bra $Lt_0_7682;
|
||||
.loc 17 148 0
|
||||
// Cell full: *error = 2
mov.s32 %r43, 2;
|
||||
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd6+0], %r43;
|
||||
.loc 16 118 0
|
||||
// Undo the increment with an atomic add of -1 (the .loc above points into
// sm_11_atomic_functions.h, file 16 of this module), then exit
mov.s32 %r44, -1;
|
||||
atom.global.add.s32 %r45, [%rd5], %r44;
|
||||
bra.uni $Lt_0_7426;
|
||||
$Lt_0_7682:
|
||||
.loc 17 151 0
|
||||
// %rd10 = ans + (cell + atom_stride * slot) * 32 bytes (four doubles per record)
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
|
||||
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
|
||||
mul.lo.s32 %r47, %r46, %r41;
|
||||
add.s32 %r48, %r38, %r47;
|
||||
cvt.s64.s32 %rd8, %r48;
|
||||
mul.wide.s32 %rd9, %r48, 32;
|
||||
add.u64 %rd10, %rd7, %rd9;
|
||||
// Record components 0 and 1: (ix + 0.5) - scaled x, (iy + 0.5) - scaled y
cvt.rn.f64.s32 %fd23, %r28;
|
||||
mov.f64 %fd24, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd25, %fd23, %fd24;
|
||||
sub.f64 %fd26, %fd25, %fd9;
|
||||
cvt.rn.f64.s32 %fd27, %r30;
|
||||
mov.f64 %fd28, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd29, %fd27, %fd28;
|
||||
sub.f64 %fd30, %fd29, %fd15;
|
||||
st.global.v2.f64 [%rd10+0], {%fd26,%fd30};
|
||||
// Record components 2 and 3: (iz + 0.5) - scaled z, and q * delvolinv (%fd3)
cvt.rn.f64.s32 %fd31, %r32;
|
||||
mov.f64 %fd32, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd33, %fd31, %fd32;
|
||||
sub.f64 %fd34, %fd33, %fd21;
|
||||
st.global.v2.f64 [%rd10+16], {%fd34,%fd3};
|
||||
$Lt_0_7426:
|
||||
$L_0_4610:
|
||||
$Lt_0_6914:
|
||||
$Lt_0_6402:
|
||||
.loc 17 155 0
|
||||
exit;
|
||||
$LDWend_particle_map:
|
||||
} // particle_map
|
||||
|
||||
.entry make_rho (
|
||||
.param .u64 __cudaparm_make_rho_counts,
|
||||
.param .u64 __cudaparm_make_rho_atoms,
|
||||
.param .u64 __cudaparm_make_rho_brick,
|
||||
.param .u64 __cudaparm_make_rho__rho_coeff,
|
||||
.param .s32 __cudaparm_make_rho_atom_stride,
|
||||
.param .s32 __cudaparm_make_rho_npts_x,
|
||||
.param .s32 __cudaparm_make_rho_npts_y,
|
||||
.param .s32 __cudaparm_make_rho_npts_z,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_x,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_y,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_z,
|
||||
.param .s32 __cudaparm_make_rho_order_m_1,
|
||||
.param .s32 __cudaparm_make_rho_order,
|
||||
.param .s32 __cudaparm_make_rho_order2)
|
||||
{
|
||||
.reg .u32 %r<119>;
|
||||
.reg .u64 %rd<57>;
|
||||
.reg .f64 %fd<26>;
|
||||
.reg .pred %p<27>;
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32531_34_non_const_rho_coeff200[512];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32532_34_non_const_front712[640];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32533_34_non_const_ans1352[4096];
|
||||
.loc 17 164 0
|
||||
$LDWbegin_make_rho:
|
||||
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_make_rho_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_1_16898;
|
||||
.loc 17 171 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f64 %fd1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f64 [%rd6+0], %fd1;
|
||||
$Lt_1_16898:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;
|
||||
shr.s32 %r5, %r4, 31;
|
||||
mov.s32 %r6, 31;
|
||||
and.b32 %r7, %r5, %r6;
|
||||
add.s32 %r8, %r7, %r4;
|
||||
shr.s32 %r9, %r8, 5;
|
||||
mul.lo.s32 %r10, %r9, 32;
|
||||
sub.s32 %r11, %r4, %r10;
|
||||
setp.lt.s32 %p2, %r11, %r2;
|
||||
@!%p2 bra $Lt_1_17410;
|
||||
.loc 17 177 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;
|
||||
mov.f64 %fd2, 0d0000000000000000; // 0
|
||||
cvt.s64.s32 %rd8, %r11;
|
||||
shr.s32 %r12, %r4, 31;
|
||||
mov.s32 %r13, 31;
|
||||
and.b32 %r14, %r12, %r13;
|
||||
add.s32 %r15, %r14, %r4;
|
||||
shr.s32 %r16, %r15, 5;
|
||||
cvt.s64.s32 %rd9, %r16;
|
||||
mul.wide.s32 %rd10, %r16, 40;
|
||||
add.u64 %rd11, %rd8, %rd10;
|
||||
mul.lo.u64 %rd12, %rd11, 8;
|
||||
add.u64 %rd13, %rd7, %rd12;
|
||||
st.shared.f64 [%rd13+256], %fd2;
|
||||
$Lt_1_17410:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;
|
||||
.loc 17 179 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
|
||||
shr.s32 %r18, %r17, 31;
|
||||
mov.s32 %r19, 31;
|
||||
and.b32 %r20, %r18, %r19;
|
||||
add.s32 %r21, %r20, %r17;
|
||||
shr.s32 %r22, %r21, 5;
|
||||
add.s32 %r23, %r22, 1;
|
||||
mov.u32 %r24, 0;
|
||||
setp.le.s32 %p3, %r23, %r24;
|
||||
@%p3 bra $Lt_1_17922;
|
||||
shr.s32 %r25, %r4, 31;
|
||||
mov.s32 %r26, 31;
|
||||
and.b32 %r27, %r25, %r26;
|
||||
add.s32 %r28, %r27, %r4;
|
||||
shr.s32 %r29, %r28, 5;
|
||||
add.s32 %r30, %r11, 32;
|
||||
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
|
||||
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
|
||||
mul.lo.s32 %r33, %r31, %r32;
|
||||
mov.u32 %r34, %ctaid.x;
|
||||
mul.lo.u32 %r35, %r34, 2;
|
||||
add.u32 %r36, %r29, %r35;
|
||||
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
|
||||
div.s32 %r38, %r36, %r37;
|
||||
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
|
||||
setp.lt.s32 %p4, %r38, %r39;
|
||||
sub.s32 %r40, %r39, %r38;
|
||||
mov.s32 %r41, 0;
|
||||
selp.s32 %r42, %r40, %r41, %p4;
|
||||
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
|
||||
setp.ge.s32 %p5, %r38, %r43;
|
||||
sub.s32 %r44, %r43, %r38;
|
||||
add.s32 %r45, %r44, %r2;
|
||||
sub.s32 %r46, %r45, 1;
|
||||
selp.s32 %r47, %r46, %r2, %p5;
|
||||
rem.s32 %r48, %r36, %r37;
|
||||
setp.lt.s32 %p6, %r48, %r39;
|
||||
sub.s32 %r49, %r39, %r48;
|
||||
mov.s32 %r50, 0;
|
||||
selp.s32 %r51, %r49, %r50, %p6;
|
||||
setp.ge.s32 %p7, %r48, %r31;
|
||||
sub.s32 %r52, %r31, %r48;
|
||||
add.s32 %r53, %r52, %r2;
|
||||
sub.s32 %r54, %r53, 1;
|
||||
selp.s32 %r55, %r54, %r2, %p7;
|
||||
mov.s32 %r56, %r23;
|
||||
mov.s32 %r57, 0;
|
||||
setp.gt.s32 %p8, %r2, %r57;
|
||||
mov.s32 %r58, 0;
|
||||
cvt.s64.s32 %rd14, %r11;
|
||||
cvt.s64.s32 %rd15, %r29;
|
||||
mul.lo.s32 %r59, %r23, 32;
|
||||
mul.wide.s32 %rd16, %r29, 40;
|
||||
add.u64 %rd17, %rd14, %rd16;
|
||||
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
|
||||
setp.gt.s32 %p9, %r60, %r38;
|
||||
mul.lo.u64 %rd18, %rd17, 8;
|
||||
selp.s32 %r61, 1, 0, %p9;
|
||||
add.u64 %rd19, %rd18, %rd7;
|
||||
mov.u64 %rd20, __cuda___cuda_local_var_32533_34_non_const_ans1352;
|
||||
mov.s32 %r62, %r56;
|
||||
$Lt_1_18434:
|
||||
//<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown
|
||||
@!%p8 bra $Lt_1_18690;
|
||||
mov.s32 %r63, %r2;
|
||||
cvt.s64.s32 %rd21, %r4;
|
||||
mul.wide.s32 %rd22, %r4, 8;
|
||||
add.u64 %rd23, %rd20, %rd22;
|
||||
mov.s32 %r64, 0;
|
||||
mov.s32 %r65, %r63;
|
||||
$Lt_1_19202:
|
||||
//<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 203 0
|
||||
mov.f64 %fd3, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd23+0], %fd3;
|
||||
add.s32 %r64, %r64, 1;
|
||||
add.u64 %rd23, %rd23, 512;
|
||||
setp.ne.s32 %p10, %r64, %r2;
|
||||
@%p10 bra $Lt_1_19202;
|
||||
$Lt_1_18690:
|
||||
add.s32 %r66, %r11, %r58;
|
||||
set.lt.u32.s32 %r67, %r66, %r32;
|
||||
neg.s32 %r68, %r67;
|
||||
and.b32 %r69, %r61, %r68;
|
||||
mov.u32 %r70, 0;
|
||||
setp.eq.s32 %p11, %r69, %r70;
|
||||
@%p11 bra $Lt_1_20226;
|
||||
.loc 17 206 0
|
||||
mov.s32 %r71, %r42;
|
||||
setp.ge.s32 %p12, %r42, %r47;
|
||||
@%p12 bra $Lt_1_20226;
|
||||
sub.s32 %r72, %r47, %r42;
|
||||
setp.lt.s32 %p13, %r51, %r55;
|
||||
mov.s32 %r73, %r72;
|
||||
$Lt_1_20738:
|
||||
//<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 208 0
|
||||
mov.s32 %r74, %r51;
|
||||
@!%p13 bra $Lt_1_20994;
|
||||
sub.s32 %r75, %r55, %r51;
|
||||
sub.s32 %r76, %r71, %r42;
|
||||
add.s32 %r77, %r38, %r42;
|
||||
add.s32 %r78, %r48, %r51;
|
||||
sub.s32 %r79, %r77, %r39;
|
||||
sub.s32 %r80, %r78, %r39;
|
||||
add.s32 %r81, %r76, %r79;
|
||||
mul.lo.s32 %r82, %r33, %r81;
|
||||
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
|
||||
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
|
||||
mov.s32 %r84, %r75;
|
||||
$Lt_1_21506:
|
||||
//<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 210 0
|
||||
sub.s32 %r85, %r74, %r51;
|
||||
add.s32 %r86, %r85, %r80;
|
||||
mul.lo.s32 %r87, %r86, %r32;
|
||||
add.s32 %r88, %r82, %r87;
|
||||
add.s32 %r89, %r66, %r88;
|
||||
cvt.s64.s32 %rd25, %r89;
|
||||
mul.wide.s32 %rd26, %r89, 4;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.s32 %r90, [%rd27+0];
|
||||
mul.lo.s32 %r91, %r90, %r83;
|
||||
.loc 17 211 0
|
||||
mov.s32 %r92, %r89;
|
||||
setp.ge.s32 %p14, %r89, %r91;
|
||||
@%p14 bra $Lt_1_21762;
|
||||
sub.s32 %r93, %r3, 1;
|
||||
cvt.s64.s32 %rd28, %r83;
|
||||
mul.wide.s32 %rd29, %r83, 32;
|
||||
mov.s32 %r94, -1;
|
||||
setp.gt.s32 %p15, %r93, %r94;
|
||||
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
|
||||
mul.lo.u64 %rd31, %rd25, 32;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
$Lt_1_22274:
|
||||
//<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown
|
||||
.loc 17 212 0
|
||||
ld.global.f64 %fd4, [%rd32+0];
|
||||
@!%p15 bra $Lt_1_29954;
|
||||
sub.s32 %r95, %r93, %r74;
|
||||
mov.s32 %r96, -1;
|
||||
sub.s32 %r97, %r96, %r74;
|
||||
cvt.s64.s32 %rd33, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 8;
|
||||
ld.global.f64 %fd5, [%rd32+8];
|
||||
ld.global.f64 %fd6, [%rd32+16];
|
||||
cvt.s64.s32 %rd35, %r95;
|
||||
mul.wide.s32 %rd36, %r95, 8;
|
||||
add.u64 %rd37, %rd1, %rd36;
|
||||
sub.s32 %r98, %r93, %r71;
|
||||
cvt.s64.s32 %rd38, %r98;
|
||||
mul.wide.s32 %rd39, %r98, 8;
|
||||
add.u64 %rd40, %rd1, %rd39;
|
||||
mov.f64 %fd7, 0d0000000000000000; // 0
|
||||
mov.f64 %fd8, 0d0000000000000000; // 0
|
||||
$Lt_1_23042:
|
||||
//<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 217 0
|
||||
ld.shared.f64 %fd9, [%rd37+0];
|
||||
mad.rn.f64 %fd8, %fd8, %fd5, %fd9;
|
||||
.loc 17 218 0
|
||||
ld.shared.f64 %fd10, [%rd40+0];
|
||||
mad.rn.f64 %fd7, %fd7, %fd6, %fd10;
|
||||
sub.u64 %rd40, %rd40, %rd34;
|
||||
sub.s32 %r95, %r95, %r2;
|
||||
sub.u64 %rd37, %rd37, %rd34;
|
||||
setp.gt.s32 %p16, %r95, %r97;
|
||||
@%p16 bra $Lt_1_23042;
|
||||
bra.uni $Lt_1_22530;
|
||||
$Lt_1_29954:
|
||||
mov.f64 %fd7, 0d0000000000000000; // 0
|
||||
mov.f64 %fd8, 0d0000000000000000; // 0
|
||||
$Lt_1_22530:
|
||||
.loc 17 220 0
|
||||
ld.global.f64 %fd11, [%rd32+24];
|
||||
mul.f64 %fd12, %fd7, %fd8;
|
||||
mul.f64 %fd13, %fd11, %fd12;
|
||||
@!%p8 bra $Lt_1_23554;
|
||||
mov.s32 %r99, %r2;
|
||||
cvt.s64.s32 %rd41, %r4;
|
||||
mul.wide.s32 %rd42, %r4, 8;
|
||||
add.u64 %rd43, %rd20, %rd42;
|
||||
mov.s32 %r100, 0;
|
||||
mov.s32 %r101, %r99;
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 224 0
|
||||
add.s32 %r102, %r100, %r1;
|
||||
mov.s32 %r103, %r102;
|
||||
setp.lt.s32 %p17, %r102, %r100;
|
||||
@%p17 bra $Lt_1_30466;
|
||||
cvt.s64.s32 %rd44, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 8;
|
||||
cvt.s64.s32 %rd45, %r102;
|
||||
mul.wide.s32 %rd46, %r102, 8;
|
||||
add.u64 %rd47, %rd1, %rd46;
|
||||
mov.f64 %fd14, 0d0000000000000000; // 0
|
||||
$Lt_1_24834:
|
||||
//<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown
|
||||
.loc 17 225 0
|
||||
ld.shared.f64 %fd15, [%rd47+0];
|
||||
mad.rn.f64 %fd14, %fd4, %fd14, %fd15;
|
||||
sub.s32 %r103, %r103, %r2;
|
||||
sub.u64 %rd47, %rd47, %rd34;
|
||||
setp.ge.s32 %p18, %r103, %r100;
|
||||
@%p18 bra $Lt_1_24834;
|
||||
bra.uni $Lt_1_24322;
|
||||
$Lt_1_30466:
|
||||
mov.f64 %fd14, 0d0000000000000000; // 0
|
||||
$Lt_1_24322:
|
||||
.loc 17 226 0
|
||||
ld.shared.f64 %fd16, [%rd43+0];
|
||||
mad.rn.f64 %fd17, %fd14, %fd13, %fd16;
|
||||
st.shared.f64 [%rd43+0], %fd17;
|
||||
add.s32 %r100, %r100, 1;
|
||||
add.u64 %rd43, %rd43, 512;
|
||||
setp.ne.s32 %p19, %r100, %r2;
|
||||
@%p19 bra $Lt_1_24066;
|
||||
$Lt_1_23554:
|
||||
add.s32 %r92, %r92, %r83;
|
||||
add.u64 %rd32, %rd29, %rd32;
|
||||
setp.gt.s32 %p20, %r91, %r92;
|
||||
@%p20 bra $Lt_1_22274;
|
||||
$Lt_1_21762:
|
||||
add.s32 %r74, %r74, 1;
|
||||
setp.ne.s32 %p21, %r55, %r74;
|
||||
@%p21 bra $Lt_1_21506;
|
||||
$Lt_1_20994:
|
||||
add.s32 %r71, %r71, 1;
|
||||
setp.ne.s32 %p22, %r47, %r71;
|
||||
@%p22 bra $Lt_1_20738;
|
||||
$Lt_1_20226:
|
||||
$Lt_1_19714:
|
||||
.loc 17 235 0
|
||||
bar.sync 0;
|
||||
@!%p2 bra $Lt_1_26626;
|
||||
.loc 17 237 0
|
||||
ld.shared.f64 %fd18, [%rd19+256];
|
||||
st.shared.f64 [%rd19+0], %fd18;
|
||||
.loc 17 238 0
|
||||
mov.f64 %fd19, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd19+256], %fd19;
|
||||
bra.uni $Lt_1_26370;
|
||||
$Lt_1_26626:
|
||||
.loc 17 240 0
|
||||
mov.f64 %fd20, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd19+0], %fd20;
|
||||
$Lt_1_26370:
|
||||
@!%p8 bra $Lt_1_26882;
|
||||
mov.s32 %r104, %r2;
|
||||
cvt.s64.s32 %rd48, %r4;
|
||||
mov.s32 %r105, %r11;
|
||||
add.s32 %r106, %r11, %r2;
|
||||
mul.wide.s32 %rd49, %r4, 8;
|
||||
add.u64 %rd50, %rd20, %rd49;
|
||||
mov.s64 %rd51, %rd19;
|
||||
mov.s32 %r107, %r104;
|
||||
$Lt_1_27394:
|
||||
//<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 243 0
|
||||
ld.shared.f64 %fd21, [%rd50+0];
|
||||
ld.shared.f64 %fd22, [%rd51+0];
|
||||
add.f64 %fd23, %fd21, %fd22;
|
||||
st.shared.f64 [%rd51+0], %fd23;
|
||||
.loc 17 244 0
|
||||
bar.sync 0;
|
||||
add.s32 %r105, %r105, 1;
|
||||
add.u64 %rd51, %rd51, 8;
|
||||
add.u64 %rd50, %rd50, 512;
|
||||
setp.ne.s32 %p23, %r105, %r106;
|
||||
@%p23 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
set.lt.u32.s32 %r108, %r66, %r17;
|
||||
neg.s32 %r109, %r108;
|
||||
and.b32 %r110, %r61, %r109;
|
||||
mov.u32 %r111, 0;
|
||||
setp.eq.s32 %p24, %r110, %r111;
|
||||
@%p24 bra $Lt_1_27906;
|
||||
.loc 17 248 0
|
||||
ld.shared.f64 %fd24, [%rd19+0];
|
||||
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
|
||||
add.s32 %r112, %r11, %r58;
|
||||
mul.lo.s32 %r113, %r37, %r17;
|
||||
mul.lo.s32 %r114, %r38, %r113;
|
||||
mul.lo.s32 %r115, %r48, %r17;
|
||||
add.s32 %r116, %r114, %r115;
|
||||
add.s32 %r117, %r112, %r116;
|
||||
cvt.s64.s32 %rd53, %r117;
|
||||
mul.wide.s32 %rd54, %r117, 8;
|
||||
add.u64 %rd55, %rd52, %rd54;
|
||||
st.global.f64 [%rd55+0], %fd24;
|
||||
$Lt_1_27906:
|
||||
add.s32 %r58, %r58, 32;
|
||||
setp.ne.s32 %p25, %r58, %r59;
|
||||
@%p25 bra $Lt_1_18434;
|
||||
$Lt_1_17922:
|
||||
.loc 17 252 0
|
||||
exit;
|
||||
$LDWend_make_rho:
|
||||
} // make_rho
|
||||
|
||||
// -----------------------------------------------------------------------
// interp -- compiler-generated PTX (.loc source lines 262-329).
// Each thread whose global index gid = ctaid.x*ntid.x + tid.x is below
// nlocal:
//   * fetches one float4 from pos_tex and one from q_tex at index gid,
//   * scales the first q_tex component by qqrd2e_scale (in f64),
//   * evaluates per-axis polynomial weights from the shared rho_coeff
//     table with Horner's rule, and
//   * subtracts weight * brick component over an order^3 stencil into
//     three f32 accumulators, stored as a v4.f32 at ans + gid*16.
// Parameters x_ and q_ are declared but never loaded in this body; the
// position/charge values come from the pos_tex / q_tex texture references.
// NOTE(review): the 4th stored component is copied from %f17, which is
// never written in this kernel, so its value is undefined -- confirm that
// consumers of ans ignore it.
// -----------------------------------------------------------------------
.entry interp (
|
||||
.param .u64 __cudaparm_interp_x_,
|
||||
.param .u64 __cudaparm_interp_q_,
|
||||
.param .s32 __cudaparm_interp_nlocal,
|
||||
.param .u64 __cudaparm_interp_brick,
|
||||
.param .u64 __cudaparm_interp__rho_coeff,
|
||||
.param .s32 __cudaparm_interp_npts_x,
|
||||
.param .s32 __cudaparm_interp_npts_yx,
|
||||
.param .f64 __cudaparm_interp_b_lo_x,
|
||||
.param .f64 __cudaparm_interp_b_lo_y,
|
||||
.param .f64 __cudaparm_interp_b_lo_z,
|
||||
.param .f64 __cudaparm_interp_delxinv,
|
||||
.param .f64 __cudaparm_interp_delyinv,
|
||||
.param .f64 __cudaparm_interp_delzinv,
|
||||
.param .s32 __cudaparm_interp_order,
|
||||
.param .s32 __cudaparm_interp_order2,
|
||||
.param .f64 __cudaparm_interp_qqrd2e_scale,
|
||||
.param .u64 __cudaparm_interp_ans)
|
||||
{
|
||||
.reg .u32 %r<56>;
|
||||
.reg .u64 %rd<37>;
|
||||
.reg .f32 %f<19>;
|
||||
.reg .f64 %fd<63>;
|
||||
.reg .pred %p<14>;
|
||||
// Shared staging: a 512-byte coefficient table and two 4096-byte weight
// arrays that are addressed below as [row*512 + tid.x*8] bytes.
.shared .align 8 .b8 __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568[512];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32630_34_non_const_rho1d_06080[4096];
|
||||
.shared .align 8 .b8 __cuda___cuda_local_var_32631_34_non_const_rho1d_110176[4096];
|
||||
// __cuda_local_var_32647_12_non_const_ek = 16
|
||||
.loc 17 262 0
|
||||
$LDWbegin_interp:
|
||||
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.x.
// Threads with tid.x < %r3 each copy one f64 of _rho_coeff into shared memory.
ld.param.s32 %r1, [__cudaparm_interp_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_interp_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_2_8706;
|
||||
.loc 17 269 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f64 %fd1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f64 [%rd6+0], %fd1;
|
||||
$Lt_2_8706:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;
|
||||
.loc 17 270 0
|
||||
// This barrier precedes the nlocal early-out below, so every thread of the
// block reaches it.
bar.sync 0;
|
||||
// %r8 = gid = tid.x + ctaid.x*ntid.x; jump to exit when nlocal <= gid.
mov.u32 %r5, %ctaid.x;
|
||||
mov.u32 %r6, %ntid.x;
|
||||
mul.lo.u32 %r7, %r5, %r6;
|
||||
add.u32 %r8, %r4, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
|
||||
setp.le.s32 %p2, %r9, %r8;
|
||||
@%p2 bra $Lt_2_9218;
|
||||
.loc 17 278 0
|
||||
// Position fetch: (%f5,%f6,%f7) = first three components of pos_tex[gid].
mov.u32 %r10, %r8;
|
||||
mov.s32 %r11, 0;
|
||||
mov.u32 %r12, %r11;
|
||||
mov.s32 %r13, 0;
|
||||
mov.u32 %r14, %r13;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
.loc 17 279 0
|
||||
// Charge fetch: %f12 = first component of q_tex[gid];
// %fd4 = (double)%f12 * qqrd2e_scale.
mov.u32 %r17, %r8;
|
||||
mov.s32 %r18, 0;
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];
|
||||
mov.f32 %f12, %f8;
|
||||
cvt.ftz.f64.f32 %fd2, %f12;
|
||||
ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];
|
||||
mul.f64 %fd4, %fd2, %fd3;
|
||||
// When %fd4 compares equal to 0 the stencil is skipped and the three
// accumulators are set to 0 at $Lt_2_9986.
mov.f64 %fd5, 0d0000000000000000; // 0
|
||||
setp.neu.f64 %p3, %fd4, %fd5;
|
||||
@!%p3 bra $Lt_2_9986;
|
||||
// %p4 = (order > 0); %fd10 = delxinv * (x - b_lo_x).
mov.s32 %r24, 0;
|
||||
setp.gt.s32 %p4, %r2, %r24;
|
||||
ld.param.f64 %fd6, [__cudaparm_interp_delxinv];
|
||||
cvt.ftz.f64.f32 %fd7, %f5;
|
||||
ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];
|
||||
sub.f64 %fd9, %fd7, %fd8;
|
||||
mul.f64 %fd10, %fd6, %fd9;
|
||||
@!%p4 bra $Lt_2_16386;
|
||||
// %r25 = trunc(%fd10); %fd14 = (%r25 + 0.5) - %fd10.
// The same is done for y: %fd19 = delyinv * (y - b_lo_y), %r26 = trunc(%fd19),
// %fd23 = (%r26 + 0.5) - %fd19.
mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;
|
||||
cvt.rzi.s32.f64 %r25, %fd10;
|
||||
cvt.rn.f64.s32 %fd11, %r25;
|
||||
mov.f64 %fd12, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd13, %fd11, %fd12;
|
||||
sub.f64 %fd14, %fd13, %fd10;
|
||||
ld.param.f64 %fd15, [__cudaparm_interp_delyinv];
|
||||
cvt.ftz.f64.f32 %fd16, %f6;
|
||||
ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];
|
||||
sub.f64 %fd18, %fd16, %fd17;
|
||||
mul.f64 %fd19, %fd15, %fd18;
|
||||
cvt.rzi.s32.f64 %r26, %fd19;
|
||||
cvt.rn.f64.s32 %fd20, %r26;
|
||||
mov.f64 %fd21, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd22, %fd20, %fd21;
|
||||
sub.f64 %fd23, %fd22, %fd19;
|
||||
// %rd10 / %rd11 point at this thread's slot (tid.x*8 bytes) in the first row
// of rho1d_0 / rho1d_1; successive rows are 512 bytes apart.
mov.s32 %r27, %r2;
|
||||
cvt.s64.s32 %rd9, %r4;
|
||||
mov.s32 %r28, %r1;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
add.u64 %rd10, %rd3, %rd7;
|
||||
add.u64 %rd11, %rd3, %rd8;
|
||||
mov.s32 %r29, 0;
|
||||
mov.s32 %r30, %r27;
|
||||
// Weight loop: k = %r29 counts up from 0 while %r28 runs from order2 to
// order2+order-1 (the loop exits when %r28 == %r3).
$Lt_2_10754:
|
||||
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 298 0
|
||||
mov.f64 %fd24, 0d0000000000000000; // 0
|
||||
mov.f64 %fd25, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd10+0], %fd25;
|
||||
.loc 17 299 0
|
||||
mov.f64 %fd26, 0d0000000000000000; // 0
|
||||
mov.f64 %fd27, 0d0000000000000000; // 0
|
||||
st.shared.f64 [%rd11+0], %fd27;
|
||||
.loc 17 300 0
|
||||
mov.s32 %r31, %r28;
|
||||
setp.lt.s32 %p5, %r28, %r29;
|
||||
@%p5 bra $Lt_2_11010;
|
||||
cvt.s64.s32 %rd12, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 8;
|
||||
cvt.s64.s32 %rd14, %r28;
|
||||
mul.wide.s32 %rd15, %r28, 8;
|
||||
add.u64 %rd16, %rd1, %rd15;
|
||||
// Horner steps: acc = acc*d + rho_coeff[%r31], with %r31 stepping down by
// `order` while it stays >= k. The x accumulator (%fd24, d = %fd14) and the
// y accumulator (%fd26, d = %fd23) share the same coefficient load, and the
// partial result is written back to shared memory on every step.
$Lt_2_11522:
|
||||
//<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 301 0
|
||||
ld.shared.f64 %fd28, [%rd16+0];
|
||||
mad.rn.f64 %fd24, %fd24, %fd14, %fd28;
|
||||
st.shared.f64 [%rd10+0], %fd24;
|
||||
.loc 17 302 0
|
||||
mad.rn.f64 %fd26, %fd26, %fd23, %fd28;
|
||||
st.shared.f64 [%rd11+0], %fd26;
|
||||
sub.s32 %r31, %r31, %r2;
|
||||
sub.u64 %rd16, %rd16, %rd13;
|
||||
setp.ge.s32 %p6, %r31, %r29;
|
||||
@%p6 bra $Lt_2_11522;
|
||||
$Lt_2_11010:
|
||||
add.s32 %r29, %r29, 1;
|
||||
add.s32 %r28, %r28, 1;
|
||||
add.u64 %rd11, %rd11, 512;
|
||||
add.u64 %rd10, %rd10, 512;
|
||||
setp.ne.s32 %p7, %r28, %r3;
|
||||
@%p7 bra $Lt_2_10754;
|
||||
bra.uni $Lt_2_10242;
|
||||
// order <= 0 path: only %r25 and the shared-array base addresses are
// materialised so the code after $Lt_2_10242 sees them defined.
$Lt_2_16386:
|
||||
cvt.rzi.s32.f64 %r25, %fd10;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;
|
||||
$Lt_2_10242:
|
||||
.loc 17 306 0
|
||||
// %fd33 = delzinv * (z - b_lo_z); %r32 = trunc(%fd33);
// %r35 = %r25 + %r32 * npts_yx.
ld.param.f64 %fd29, [__cudaparm_interp_delzinv];
|
||||
cvt.ftz.f64.f32 %fd30, %f7;
|
||||
ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];
|
||||
sub.f64 %fd32, %fd30, %fd31;
|
||||
mul.f64 %fd33, %fd29, %fd32;
|
||||
cvt.rzi.s32.f64 %r32, %fd33;
|
||||
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
|
||||
mul.lo.s32 %r34, %r32, %r33;
|
||||
add.s32 %r35, %r25, %r34;
|
||||
@!%p4 bra $Lt_2_16898;
|
||||
// %fd37 = (%r32 + 0.5) - %fd33. The y cell index is recomputed into %r37.
// %r40 = %r37*npts_x + %r35 is the first brick element index of the stencil;
// %rd21 = npts_x*32 is the byte stride applied per middle-loop step
// (each brick element is addressed at a 32-byte pitch).
cvt.rn.f64.s32 %fd34, %r32;
|
||||
mov.f64 %fd35, 0d3fe0000000000000; // 0.5
|
||||
add.f64 %fd36, %fd34, %fd35;
|
||||
sub.f64 %fd37, %fd36, %fd33;
|
||||
mov.s32 %r36, %r2;
|
||||
cvt.ftz.f64.f32 %fd38, %f6;
|
||||
cvt.s64.s32 %rd17, %r4;
|
||||
ld.param.f64 %fd39, [__cudaparm_interp_delyinv];
|
||||
ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];
|
||||
sub.f64 %fd41, %fd38, %fd40;
|
||||
mul.f64 %fd42, %fd39, %fd41;
|
||||
cvt.rzi.s32.f64 %r37, %fd42;
|
||||
mul.wide.s32 %rd3, %r4, 8;
|
||||
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
|
||||
mul.lo.s32 %r39, %r37, %r38;
|
||||
add.u64 %rd18, %rd3, %rd7;
|
||||
add.u64 %rd19, %rd3, %rd8;
|
||||
cvt.s64.s32 %rd20, %r38;
|
||||
mul.wide.s32 %rd21, %r38, 32;
|
||||
add.s32 %r40, %r39, %r35;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.u64 %rd22, [__cudaparm_interp_brick];
|
||||
mov.s32 %r42, 0;
|
||||
// %f15 / %f14 / %f13 accumulate against the brick components at byte
// offsets 0 / 8 / 16 respectively.
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
mov.s32 %r43, %r36;
|
||||
// Outer stencil loop: n = %r42 = 0..order-1. %fd43 is the Horner polynomial
// in %fd37 over rho_coeff, starting at index n + order2 and stepping down by
// `order` while the index stays >= n.
$Lt_2_12802:
|
||||
//<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 309 0
|
||||
add.s32 %r44, %r42, %r1;
|
||||
mov.s32 %r45, %r44;
|
||||
setp.lt.s32 %p8, %r44, %r42;
|
||||
@%p8 bra $Lt_2_17154;
|
||||
cvt.s64.s32 %rd23, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 8;
|
||||
cvt.s64.s32 %rd24, %r44;
|
||||
mul.wide.s32 %rd25, %r44, 8;
|
||||
add.u64 %rd26, %rd1, %rd25;
|
||||
mov.f64 %fd43, 0d0000000000000000; // 0
|
||||
$Lt_2_13570:
|
||||
//<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 310 0
|
||||
ld.shared.f64 %fd44, [%rd26+0];
|
||||
mad.rn.f64 %fd43, %fd37, %fd43, %fd44;
|
||||
sub.s32 %r45, %r45, %r2;
|
||||
sub.u64 %rd26, %rd26, %rd13;
|
||||
setp.ge.s32 %p9, %r45, %r42;
|
||||
@%p9 bra $Lt_2_13570;
|
||||
bra.uni $Lt_2_13058;
|
||||
$Lt_2_17154:
|
||||
mov.f64 %fd43, 0d0000000000000000; // 0
|
||||
$Lt_2_13058:
|
||||
.loc 17 312 0
|
||||
// %fd45 = %fd4 * %fd43. %r46 keeps the brick index at the start of this
// outer step; %rd29 is its byte offset (index * 32).
mov.s32 %r46, %r41;
|
||||
mov.s32 %r47, %r2;
|
||||
mov.s32 %r48, %r46;
|
||||
mul.f64 %fd45, %fd4, %fd43;
|
||||
mov.s64 %rd27, %rd19;
|
||||
cvt.s64.s32 %rd28, %r46;
|
||||
mul.wide.s32 %rd29, %r46, 32;
|
||||
mov.s32 %r49, 0;
|
||||
mov.s32 %r50, %r47;
|
||||
// Middle loop: m = %r49 = 0..order-1. %fd47 = %fd45 * rho1d_1[m][tid];
// %rd31 = brick + current byte offset; %rd30 restarts at rho1d_0 row 0.
$Lt_2_14594:
|
||||
//<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown
|
||||
mov.s32 %r51, %r2;
|
||||
mov.s32 %r52, %r48;
|
||||
add.s32 %r53, %r48, %r2;
|
||||
mov.s64 %rd30, %rd18;
|
||||
ld.shared.f64 %fd46, [%rd27+0];
|
||||
add.u64 %rd31, %rd29, %rd22;
|
||||
mul.f64 %fd47, %fd45, %fd46;
|
||||
mov.s32 %r54, %r51;
|
||||
// Inner loop (`order` steps, %r52 runs %r48..%r48+order-1):
// %fd49 = rho1d_0[l][tid] * %fd47. Each f32 accumulator is widened to f64,
// decremented by %fd49 * brick component, and rounded back to f32.
$Lt_2_15362:
|
||||
//<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 316 0
|
||||
ld.shared.f64 %fd48, [%rd30+0];
|
||||
mul.f64 %fd49, %fd48, %fd47;
|
||||
.loc 17 318 0
|
||||
cvt.ftz.f64.f32 %fd50, %f15;
|
||||
ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];
|
||||
mul.f64 %fd53, %fd49, %fd51;
|
||||
sub.f64 %fd54, %fd50, %fd53;
|
||||
cvt.rn.ftz.f32.f64 %f15, %fd54;
|
||||
.loc 17 319 0
|
||||
cvt.ftz.f64.f32 %fd55, %f14;
|
||||
mul.f64 %fd56, %fd49, %fd52;
|
||||
sub.f64 %fd57, %fd55, %fd56;
|
||||
cvt.rn.ftz.f32.f64 %f14, %fd57;
|
||||
.loc 17 320 0
|
||||
cvt.ftz.f64.f32 %fd58, %f13;
|
||||
ld.global.f64 %fd59, [%rd31+16];
|
||||
mul.f64 %fd60, %fd49, %fd59;
|
||||
sub.f64 %fd61, %fd58, %fd60;
|
||||
cvt.rn.ftz.f32.f64 %f13, %fd61;
|
||||
// Advance to the next brick element (+32 bytes) and the next rho1d_0 row
// (+512 bytes).
add.s32 %r52, %r52, 1;
|
||||
add.u64 %rd31, %rd31, 32;
|
||||
add.u64 %rd30, %rd30, 512;
|
||||
setp.ne.s32 %p10, %r52, %r53;
|
||||
@%p10 bra $Lt_2_15362;
|
||||
// Next middle step: brick index += npts_x (byte offset += %rd21) and move
// to the next rho1d_1 row.
add.s32 %r49, %r49, 1;
|
||||
add.s32 %r48, %r48, %r38;
|
||||
add.u64 %rd29, %rd29, %rd21;
|
||||
add.u64 %rd27, %rd27, 512;
|
||||
setp.ne.s32 %p11, %r49, %r2;
|
||||
@%p11 bra $Lt_2_14594;
|
||||
// Next outer step: the starting brick index advances by npts_yx.
add.s32 %r42, %r42, 1;
|
||||
add.s32 %r41, %r46, %r33;
|
||||
setp.ne.s32 %p12, %r42, %r2;
|
||||
@%p12 bra $Lt_2_12802;
|
||||
bra.uni $Lt_2_9730;
|
||||
// order <= 0: nothing accumulated, result components are 0.
$Lt_2_16898:
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
bra.uni $Lt_2_9730;
|
||||
// Scaled charge compared equal to 0: result components are 0.
$Lt_2_9986:
|
||||
mov.f32 %f13, 0f00000000; // 0
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
$Lt_2_9730:
|
||||
.loc 17 327 0
|
||||
// Store {%f15,%f14,%f13,%f16} at ans + gid*16.
// NOTE(review): %f17 has no prior write in this kernel, so %f16 (the 4th
// component) is undefined -- confirm it is unused downstream.
ld.param.u64 %rd32, [__cudaparm_interp_ans];
|
||||
cvt.s64.s32 %rd33, %r8;
|
||||
mul.wide.s32 %rd34, %r8, 16;
|
||||
add.u64 %rd35, %rd32, %rd34;
|
||||
mov.f32 %f16, %f17;
|
||||
st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};
|
||||
$Lt_2_9218:
|
||||
.loc 17 329 0
|
||||
exit;
|
||||
$LDWend_interp:
|
||||
} // interp
|
||||
|
|
@ -1,881 +0,0 @@
|
|||
.version 2.3
|
||||
.target sm_20
|
||||
.address_size 64
|
||||
// compiled with /usr/local/cuda/open64/lib//be
|
||||
// nvopencc 4.0 built on 2011-05-12
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Compiling /tmp/tmpxft_0000bc4a_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.A49KLP)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
//-----------------------------------------------------------
|
||||
// Options:
|
||||
//-----------------------------------------------------------
|
||||
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
|
||||
// -O3 (Optimization level)
|
||||
// -g0 (Debug level)
|
||||
// -m2 (Report advisories)
|
||||
//-----------------------------------------------------------
|
||||
|
||||
.file 1 "<command-line>"
|
||||
.file 2 "/tmp/tmpxft_0000bc4a_00000000-8_pppm_gpu_kernel.cudafe2.gpu"
|
||||
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
|
||||
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
|
||||
.file 5 "/usr/local/cuda/include/host_defines.h"
|
||||
.file 6 "/usr/local/cuda/include/builtin_types.h"
|
||||
.file 7 "/usr/local/cuda/include/device_types.h"
|
||||
.file 8 "/usr/local/cuda/include/driver_types.h"
|
||||
.file 9 "/usr/local/cuda/include/surface_types.h"
|
||||
.file 10 "/usr/local/cuda/include/texture_types.h"
|
||||
.file 11 "/usr/local/cuda/include/vector_types.h"
|
||||
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
|
||||
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
|
||||
.file 14 "/usr/include/bits/types.h"
|
||||
.file 15 "/usr/include/time.h"
|
||||
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
|
||||
.file 17 "pppm_gpu_kernel.cu"
|
||||
.file 18 "/usr/local/cuda/include/common_functions.h"
|
||||
.file 19 "/usr/local/cuda/include/math_functions.h"
|
||||
.file 20 "/usr/local/cuda/include/math_constants.h"
|
||||
.file 21 "/usr/local/cuda/include/device_functions.h"
|
||||
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
|
||||
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
|
||||
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
|
||||
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
|
||||
.file 26 "/usr/local/cuda/include/surface_functions.h"
|
||||
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
|
||||
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
|
||||
|
||||
.global .texref pos_tex;
|
||||
.global .texref q_tex;
|
||||
|
||||
// -----------------------------------------------------------------------
// particle_map -- compiler-generated PTX (.loc source lines 113-155).
// Each thread derives an entry index %r12 from its global thread id,
// fetches that entry from pos_tex and q_tex, converts the position into an
// integer cell (nx, ny, nz), atomically increments counts[cell] and, when
// the previous count is below max_atoms, stores a v4.f32 record into ans.
// Error reporting through *error:
//   1 -- a scaled coordinate is negative or a cell index is >= the
//        corresponding nlocal_x / nlocal_y / nlocal_z;
//   2 -- the cell already held max_atoms entries (the counter increment is
//        undone before exiting).
// Parameters x_ and q_ are declared but never loaded in this body; the data
// comes from the pos_tex / q_tex texture references.
// -----------------------------------------------------------------------
.entry particle_map (
|
||||
.param .u64 __cudaparm_particle_map_x_,
|
||||
.param .u64 __cudaparm_particle_map_q_,
|
||||
.param .f32 __cudaparm_particle_map_delvolinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal,
|
||||
.param .u64 __cudaparm_particle_map_counts,
|
||||
.param .u64 __cudaparm_particle_map_ans,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_x,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_y,
|
||||
.param .f32 __cudaparm_particle_map_b_lo_z,
|
||||
.param .f32 __cudaparm_particle_map_delxinv,
|
||||
.param .f32 __cudaparm_particle_map_delyinv,
|
||||
.param .f32 __cudaparm_particle_map_delzinv,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_x,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_y,
|
||||
.param .s32 __cudaparm_particle_map_nlocal_z,
|
||||
.param .s32 __cudaparm_particle_map_atom_stride,
|
||||
.param .s32 __cudaparm_particle_map_max_atoms,
|
||||
.param .u64 __cudaparm_particle_map_error)
|
||||
{
|
||||
.reg .u32 %r<50>;
|
||||
.reg .u64 %rd<12>;
|
||||
.reg .f32 %f<44>;
|
||||
.reg .pred %p<11>;
|
||||
.loc 17 113 0
|
||||
$LDWbegin_particle_map:
|
||||
// %r7 = gid = tid.x + ctaid.x*ntid.x; %r5 = nctaid.x*ntid.x (total threads).
// %r12 = gid*64 - (gid*64 / total) * (total - 1) is the entry this thread
// handles; exit when nlocal <= %r12.
mov.u32 %r1, %ntid.x;
|
||||
mov.u32 %r2, %ctaid.x;
|
||||
mul.lo.u32 %r3, %r2, %r1;
|
||||
mov.u32 %r4, %nctaid.x;
|
||||
mul.lo.u32 %r5, %r4, %r1;
|
||||
mov.u32 %r6, %tid.x;
|
||||
add.u32 %r7, %r6, %r3;
|
||||
sub.s32 %r8, %r5, 1;
|
||||
mul.lo.s32 %r9, %r7, 64;
|
||||
div.s32 %r10, %r9, %r5;
|
||||
mul.lo.s32 %r11, %r8, %r10;
|
||||
sub.s32 %r12, %r9, %r11;
|
||||
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
|
||||
setp.le.s32 %p1, %r13, %r12;
|
||||
@%p1 bra $Lt_0_7426;
|
||||
.loc 17 125 0
|
||||
// Position fetch: (%f5,%f6,%f7) = first three components of pos_tex[%r12].
mov.u32 %r14, %r12;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
mov.s32 %r17, 0;
|
||||
mov.u32 %r18, %r17;
|
||||
mov.s32 %r19, 0;
|
||||
mov.u32 %r20, %r19;
|
||||
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
|
||||
mov.f32 %f5, %f1;
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
.loc 17 127 0
|
||||
// Charge fetch: %f12 = first component of q_tex[%r12];
// %f14 = delvolinv * %f12. Exit when %f14 compares equal to 0.
mov.u32 %r21, %r12;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
mov.s32 %r24, 0;
|
||||
mov.u32 %r25, %r24;
|
||||
mov.s32 %r26, 0;
|
||||
mov.u32 %r27, %r26;
|
||||
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
|
||||
mov.f32 %f12, %f8;
|
||||
ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];
|
||||
mul.ftz.f32 %f14, %f13, %f12;
|
||||
mov.f32 %f15, 0f00000000; // 0
|
||||
setp.neu.ftz.f32 %p2, %f14, %f15;
|
||||
@!%p2 bra $Lt_0_7426;
|
||||
.loc 17 130 0
|
||||
// Scaled coordinates: %f19 = delxinv*(x - b_lo_x), %f24 = delyinv*(y - b_lo_y),
// %f29 = delzinv*(z - b_lo_z). Any negative value takes the error path.
ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];
|
||||
sub.ftz.f32 %f17, %f5, %f16;
|
||||
ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];
|
||||
mul.ftz.f32 %f19, %f18, %f17;
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p3, %f19, %f20;
|
||||
@%p3 bra $Lt_0_8706;
|
||||
ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];
|
||||
sub.ftz.f32 %f22, %f6, %f21;
|
||||
ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];
|
||||
mul.ftz.f32 %f24, %f23, %f22;
|
||||
mov.f32 %f25, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p4, %f24, %f25;
|
||||
@%p4 bra $Lt_0_8706;
|
||||
ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];
|
||||
sub.ftz.f32 %f27, %f7, %f26;
|
||||
ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];
|
||||
mul.ftz.f32 %f29, %f28, %f27;
|
||||
mov.f32 %f30, 0f00000000; // 0
|
||||
setp.lt.ftz.f32 %p5, %f29, %f30;
|
||||
@%p5 bra $Lt_0_8706;
|
||||
// Cell indices by truncation: nx = %r28, ny = %r30, nz = %r32. An index
// >= nlocal_x / nlocal_y takes the error path; for z the in-range case
// (nlocal_z > nz) branches to $L_0_4866 and everything else falls through
// to the error path.
cvt.rzi.ftz.s32.f32 %r28, %f19;
|
||||
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
|
||||
setp.ge.s32 %p6, %r28, %r29;
|
||||
@%p6 bra $Lt_0_8706;
|
||||
cvt.rzi.ftz.s32.f32 %r30, %f24;
|
||||
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
|
||||
setp.ge.s32 %p7, %r30, %r31;
|
||||
@%p7 bra $Lt_0_8706;
|
||||
cvt.rzi.ftz.s32.f32 %r32, %f29;
|
||||
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
|
||||
setp.gt.s32 %p8, %r33, %r32;
|
||||
@%p8 bra $L_0_4866;
|
||||
// Out-of-range path: store 1 to *error, then exit.
$Lt_0_8706:
|
||||
$L_0_5122:
|
||||
.loc 17 139 0
|
||||
mov.s32 %r34, 1;
|
||||
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd1+0], %r34;
|
||||
bra.uni $Lt_0_7426;
|
||||
$L_0_4866:
|
||||
.loc 17 146 0
|
||||
// %r38 = nx + (ny + nz*nlocal_y)*nlocal_x is the linear cell index.
// Atomically add 1 to counts[%r38] (4-byte elements); %r41 = previous value.
mul.lo.s32 %r35, %r32, %r31;
|
||||
add.s32 %r36, %r30, %r35;
|
||||
mul.lo.s32 %r37, %r36, %r29;
|
||||
add.s32 %r38, %r28, %r37;
|
||||
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
|
||||
cvt.s64.s32 %rd3, %r38;
|
||||
mul.wide.s32 %rd4, %r38, 4;
|
||||
add.u64 %rd5, %rd2, %rd4;
|
||||
mov.s32 %r39, 1;
|
||||
atom.global.add.s32 %r40, [%rd5], %r39;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
|
||||
setp.gt.s32 %p9, %r42, %r41;
|
||||
@%p9 bra $Lt_0_7682;
|
||||
.loc 17 148 0
|
||||
// Cell full (max_atoms <= previous count): store 2 to *error, undo the
// increment with an atomic add of -1, then exit.
mov.s32 %r43, 2;
|
||||
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
|
||||
st.global.s32 [%rd6+0], %r43;
|
||||
.loc 16 118 0
|
||||
mov.s32 %r44, -1;
|
||||
atom.global.add.s32 %r45, [%rd5], %r44;
|
||||
bra.uni $Lt_0_7426;
|
||||
$Lt_0_7682:
|
||||
.loc 17 151 0
|
||||
// Record address = ans + (%r38 + atom_stride * %r41) * 16.
// Stored record: {(nx+0.5)-%f19, (ny+0.5)-%f24, (nz+0.5)-%f29, %f14}.
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
|
||||
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
|
||||
mul.lo.s32 %r47, %r46, %r41;
|
||||
add.s32 %r48, %r38, %r47;
|
||||
cvt.s64.s32 %rd8, %r48;
|
||||
mul.wide.s32 %rd9, %r48, 16;
|
||||
add.u64 %rd10, %rd7, %rd9;
|
||||
cvt.rn.f32.s32 %f31, %r28;
|
||||
mov.f32 %f32, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f33, %f31, %f32;
|
||||
sub.ftz.f32 %f34, %f33, %f19;
|
||||
cvt.rn.f32.s32 %f35, %r30;
|
||||
mov.f32 %f36, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f37, %f35, %f36;
|
||||
sub.ftz.f32 %f38, %f37, %f24;
|
||||
cvt.rn.f32.s32 %f39, %r32;
|
||||
mov.f32 %f40, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f41, %f39, %f40;
|
||||
sub.ftz.f32 %f42, %f41, %f29;
|
||||
st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};
|
||||
// Common exit point for every path above.
$Lt_0_7426:
|
||||
$L_0_4610:
|
||||
$Lt_0_6914:
|
||||
$Lt_0_6402:
|
||||
.loc 17 155 0
|
||||
exit;
|
||||
$LDWend_particle_map:
|
||||
} // particle_map
|
||||
|
||||
.entry make_rho (
|
||||
.param .u64 __cudaparm_make_rho_counts,
|
||||
.param .u64 __cudaparm_make_rho_atoms,
|
||||
.param .u64 __cudaparm_make_rho_brick,
|
||||
.param .u64 __cudaparm_make_rho__rho_coeff,
|
||||
.param .s32 __cudaparm_make_rho_atom_stride,
|
||||
.param .s32 __cudaparm_make_rho_npts_x,
|
||||
.param .s32 __cudaparm_make_rho_npts_y,
|
||||
.param .s32 __cudaparm_make_rho_npts_z,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_x,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_y,
|
||||
.param .s32 __cudaparm_make_rho_nlocal_z,
|
||||
.param .s32 __cudaparm_make_rho_order_m_1,
|
||||
.param .s32 __cudaparm_make_rho_order,
|
||||
.param .s32 __cudaparm_make_rho_order2)
|
||||
{
|
||||
.reg .u32 %r<119>;
|
||||
.reg .u64 %rd<57>;
|
||||
.reg .f32 %f<26>;
|
||||
.reg .pred %p<27>;
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32531_33_non_const_rho_coeff168[256];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32532_33_non_const_front424[320];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32533_33_non_const_ans744[2048];
|
||||
.loc 17 164 0
|
||||
$LDWbegin_make_rho:
|
||||
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_make_rho_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
@%p1 bra $Lt_1_16898;
|
||||
.loc 17 171 0
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_1_16898:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;
|
||||
shr.s32 %r5, %r4, 31;
|
||||
mov.s32 %r6, 31;
|
||||
and.b32 %r7, %r5, %r6;
|
||||
add.s32 %r8, %r7, %r4;
|
||||
shr.s32 %r9, %r8, 5;
|
||||
mul.lo.s32 %r10, %r9, 32;
|
||||
sub.s32 %r11, %r4, %r10;
|
||||
setp.lt.s32 %p2, %r11, %r2;
|
||||
@!%p2 bra $Lt_1_17410;
|
||||
.loc 17 177 0
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;
|
||||
mov.f32 %f2, 0f00000000; // 0
|
||||
cvt.s64.s32 %rd8, %r11;
|
||||
shr.s32 %r12, %r4, 31;
|
||||
mov.s32 %r13, 31;
|
||||
and.b32 %r14, %r12, %r13;
|
||||
add.s32 %r15, %r14, %r4;
|
||||
shr.s32 %r16, %r15, 5;
|
||||
cvt.s64.s32 %rd9, %r16;
|
||||
mul.wide.s32 %rd10, %r16, 40;
|
||||
add.u64 %rd11, %rd8, %rd10;
|
||||
mul.lo.u64 %rd12, %rd11, 4;
|
||||
add.u64 %rd13, %rd7, %rd12;
|
||||
st.shared.f32 [%rd13+128], %f2;
|
||||
$Lt_1_17410:
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;
|
||||
.loc 17 179 0
|
||||
bar.sync 0;
|
||||
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
|
||||
shr.s32 %r18, %r17, 31;
|
||||
mov.s32 %r19, 31;
|
||||
and.b32 %r20, %r18, %r19;
|
||||
add.s32 %r21, %r20, %r17;
|
||||
shr.s32 %r22, %r21, 5;
|
||||
add.s32 %r23, %r22, 1;
|
||||
mov.u32 %r24, 0;
|
||||
setp.le.s32 %p3, %r23, %r24;
|
||||
@%p3 bra $Lt_1_17922;
|
||||
shr.s32 %r25, %r4, 31;
|
||||
mov.s32 %r26, 31;
|
||||
and.b32 %r27, %r25, %r26;
|
||||
add.s32 %r28, %r27, %r4;
|
||||
shr.s32 %r29, %r28, 5;
|
||||
add.s32 %r30, %r11, 32;
|
||||
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
|
||||
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
|
||||
mul.lo.s32 %r33, %r31, %r32;
|
||||
mov.u32 %r34, %ctaid.x;
|
||||
mul.lo.u32 %r35, %r34, 2;
|
||||
add.u32 %r36, %r29, %r35;
|
||||
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
|
||||
div.s32 %r38, %r36, %r37;
|
||||
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
|
||||
setp.lt.s32 %p4, %r38, %r39;
|
||||
sub.s32 %r40, %r39, %r38;
|
||||
mov.s32 %r41, 0;
|
||||
selp.s32 %r42, %r40, %r41, %p4;
|
||||
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
|
||||
setp.ge.s32 %p5, %r38, %r43;
|
||||
sub.s32 %r44, %r43, %r38;
|
||||
add.s32 %r45, %r44, %r2;
|
||||
sub.s32 %r46, %r45, 1;
|
||||
selp.s32 %r47, %r46, %r2, %p5;
|
||||
rem.s32 %r48, %r36, %r37;
|
||||
setp.lt.s32 %p6, %r48, %r39;
|
||||
sub.s32 %r49, %r39, %r48;
|
||||
mov.s32 %r50, 0;
|
||||
selp.s32 %r51, %r49, %r50, %p6;
|
||||
setp.ge.s32 %p7, %r48, %r31;
|
||||
sub.s32 %r52, %r31, %r48;
|
||||
add.s32 %r53, %r52, %r2;
|
||||
sub.s32 %r54, %r53, 1;
|
||||
selp.s32 %r55, %r54, %r2, %p7;
|
||||
mov.s32 %r56, %r23;
|
||||
mov.s32 %r57, 0;
|
||||
setp.gt.s32 %p8, %r2, %r57;
|
||||
mov.s32 %r58, 0;
|
||||
cvt.s64.s32 %rd14, %r11;
|
||||
cvt.s64.s32 %rd15, %r29;
|
||||
mul.lo.s32 %r59, %r23, 32;
|
||||
mul.wide.s32 %rd16, %r29, 40;
|
||||
add.u64 %rd17, %rd14, %rd16;
|
||||
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
|
||||
setp.gt.s32 %p9, %r60, %r38;
|
||||
mul.lo.u64 %rd18, %rd17, 4;
|
||||
selp.s32 %r61, 1, 0, %p9;
|
||||
add.u64 %rd19, %rd18, %rd7;
|
||||
mov.u64 %rd20, __cuda___cuda_local_var_32533_33_non_const_ans744;
|
||||
mov.s32 %r62, %r56;
|
||||
$Lt_1_18434:
|
||||
//<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown
|
||||
@!%p8 bra $Lt_1_18690;
|
||||
mov.s32 %r63, %r2;
|
||||
cvt.s64.s32 %rd21, %r4;
|
||||
mul.wide.s32 %rd22, %r4, 4;
|
||||
add.u64 %rd23, %rd20, %rd22;
|
||||
mov.s32 %r64, 0;
|
||||
mov.s32 %r65, %r63;
|
||||
$Lt_1_19202:
|
||||
//<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 203 0
|
||||
mov.f32 %f3, 0f00000000; // 0
|
||||
st.shared.f32 [%rd23+0], %f3;
|
||||
add.s32 %r64, %r64, 1;
|
||||
add.u64 %rd23, %rd23, 256;
|
||||
setp.ne.s32 %p10, %r64, %r2;
|
||||
@%p10 bra $Lt_1_19202;
|
||||
$Lt_1_18690:
|
||||
add.s32 %r66, %r11, %r58;
|
||||
set.lt.u32.s32 %r67, %r66, %r32;
|
||||
neg.s32 %r68, %r67;
|
||||
and.b32 %r69, %r61, %r68;
|
||||
mov.u32 %r70, 0;
|
||||
setp.eq.s32 %p11, %r69, %r70;
|
||||
@%p11 bra $Lt_1_20226;
|
||||
.loc 17 206 0
|
||||
mov.s32 %r71, %r42;
|
||||
setp.ge.s32 %p12, %r42, %r47;
|
||||
@%p12 bra $Lt_1_20226;
|
||||
sub.s32 %r72, %r47, %r42;
|
||||
setp.lt.s32 %p13, %r51, %r55;
|
||||
mov.s32 %r73, %r72;
|
||||
$Lt_1_20738:
|
||||
//<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 208 0
|
||||
mov.s32 %r74, %r51;
|
||||
@!%p13 bra $Lt_1_20994;
|
||||
sub.s32 %r75, %r55, %r51;
|
||||
sub.s32 %r76, %r71, %r42;
|
||||
add.s32 %r77, %r38, %r42;
|
||||
add.s32 %r78, %r48, %r51;
|
||||
sub.s32 %r79, %r77, %r39;
|
||||
sub.s32 %r80, %r78, %r39;
|
||||
add.s32 %r81, %r76, %r79;
|
||||
mul.lo.s32 %r82, %r33, %r81;
|
||||
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
|
||||
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
|
||||
mov.s32 %r84, %r75;
|
||||
$Lt_1_21506:
|
||||
//<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 210 0
|
||||
sub.s32 %r85, %r74, %r51;
|
||||
add.s32 %r86, %r85, %r80;
|
||||
mul.lo.s32 %r87, %r86, %r32;
|
||||
add.s32 %r88, %r82, %r87;
|
||||
add.s32 %r89, %r66, %r88;
|
||||
cvt.s64.s32 %rd25, %r89;
|
||||
mul.wide.s32 %rd26, %r89, 4;
|
||||
add.u64 %rd27, %rd24, %rd26;
|
||||
ld.global.s32 %r90, [%rd27+0];
|
||||
mul.lo.s32 %r91, %r90, %r83;
|
||||
.loc 17 211 0
|
||||
mov.s32 %r92, %r89;
|
||||
setp.ge.s32 %p14, %r89, %r91;
|
||||
@%p14 bra $Lt_1_21762;
|
||||
sub.s32 %r93, %r3, 1;
|
||||
cvt.s64.s32 %rd28, %r83;
|
||||
mul.wide.s32 %rd29, %r83, 16;
|
||||
mov.s32 %r94, -1;
|
||||
setp.gt.s32 %p15, %r93, %r94;
|
||||
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
|
||||
mul.lo.u64 %rd31, %rd25, 16;
|
||||
add.u64 %rd32, %rd30, %rd31;
|
||||
$Lt_1_22274:
|
||||
//<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown
|
||||
.loc 17 212 0
|
||||
ld.global.f32 %f4, [%rd32+0];
|
||||
@!%p15 bra $Lt_1_29954;
|
||||
sub.s32 %r95, %r93, %r74;
|
||||
mov.s32 %r96, -1;
|
||||
sub.s32 %r97, %r96, %r74;
|
||||
cvt.s64.s32 %rd33, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 4;
|
||||
ld.global.f32 %f5, [%rd32+4];
|
||||
ld.global.f32 %f6, [%rd32+8];
|
||||
cvt.s64.s32 %rd35, %r95;
|
||||
mul.wide.s32 %rd36, %r95, 4;
|
||||
add.u64 %rd37, %rd1, %rd36;
|
||||
sub.s32 %r98, %r93, %r71;
|
||||
cvt.s64.s32 %rd38, %r98;
|
||||
mul.wide.s32 %rd39, %r98, 4;
|
||||
add.u64 %rd40, %rd1, %rd39;
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, 0f00000000; // 0
|
||||
$Lt_1_23042:
|
||||
//<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 217 0
|
||||
ld.shared.f32 %f9, [%rd37+0];
|
||||
fma.rn.ftz.f32 %f8, %f8, %f5, %f9;
|
||||
.loc 17 218 0
|
||||
ld.shared.f32 %f10, [%rd40+0];
|
||||
fma.rn.ftz.f32 %f7, %f7, %f6, %f10;
|
||||
sub.u64 %rd40, %rd40, %rd34;
|
||||
sub.s32 %r95, %r95, %r2;
|
||||
sub.u64 %rd37, %rd37, %rd34;
|
||||
setp.gt.s32 %p16, %r95, %r97;
|
||||
@%p16 bra $Lt_1_23042;
|
||||
bra.uni $Lt_1_22530;
|
||||
$Lt_1_29954:
|
||||
mov.f32 %f7, 0f00000000; // 0
|
||||
mov.f32 %f8, 0f00000000; // 0
|
||||
$Lt_1_22530:
|
||||
.loc 17 220 0
|
||||
ld.global.f32 %f11, [%rd32+12];
|
||||
mul.ftz.f32 %f12, %f7, %f8;
|
||||
mul.ftz.f32 %f13, %f11, %f12;
|
||||
@!%p8 bra $Lt_1_23554;
|
||||
mov.s32 %r99, %r2;
|
||||
cvt.s64.s32 %rd41, %r4;
|
||||
mul.wide.s32 %rd42, %r4, 4;
|
||||
add.u64 %rd43, %rd20, %rd42;
|
||||
mov.s32 %r100, 0;
|
||||
mov.s32 %r101, %r99;
|
||||
$Lt_1_24066:
|
||||
//<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown
|
||||
.loc 17 224 0
|
||||
add.s32 %r102, %r100, %r1;
|
||||
mov.s32 %r103, %r102;
|
||||
setp.lt.s32 %p17, %r102, %r100;
|
||||
@%p17 bra $Lt_1_30466;
|
||||
cvt.s64.s32 %rd44, %r2;
|
||||
mul.wide.s32 %rd34, %r2, 4;
|
||||
cvt.s64.s32 %rd45, %r102;
|
||||
mul.wide.s32 %rd46, %r102, 4;
|
||||
add.u64 %rd47, %rd1, %rd46;
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
$Lt_1_24834:
|
||||
//<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown
|
||||
.loc 17 225 0
|
||||
ld.shared.f32 %f15, [%rd47+0];
|
||||
fma.rn.ftz.f32 %f14, %f4, %f14, %f15;
|
||||
sub.s32 %r103, %r103, %r2;
|
||||
sub.u64 %rd47, %rd47, %rd34;
|
||||
setp.ge.s32 %p18, %r103, %r100;
|
||||
@%p18 bra $Lt_1_24834;
|
||||
bra.uni $Lt_1_24322;
|
||||
$Lt_1_30466:
|
||||
mov.f32 %f14, 0f00000000; // 0
|
||||
$Lt_1_24322:
|
||||
.loc 17 226 0
|
||||
ld.shared.f32 %f16, [%rd43+0];
|
||||
fma.rn.ftz.f32 %f17, %f14, %f13, %f16;
|
||||
st.shared.f32 [%rd43+0], %f17;
|
||||
add.s32 %r100, %r100, 1;
|
||||
add.u64 %rd43, %rd43, 256;
|
||||
setp.ne.s32 %p19, %r100, %r2;
|
||||
@%p19 bra $Lt_1_24066;
|
||||
$Lt_1_23554:
|
||||
add.s32 %r92, %r92, %r83;
|
||||
add.u64 %rd32, %rd29, %rd32;
|
||||
setp.gt.s32 %p20, %r91, %r92;
|
||||
@%p20 bra $Lt_1_22274;
|
||||
$Lt_1_21762:
|
||||
add.s32 %r74, %r74, 1;
|
||||
setp.ne.s32 %p21, %r55, %r74;
|
||||
@%p21 bra $Lt_1_21506;
|
||||
$Lt_1_20994:
|
||||
add.s32 %r71, %r71, 1;
|
||||
setp.ne.s32 %p22, %r47, %r71;
|
||||
@%p22 bra $Lt_1_20738;
|
||||
$Lt_1_20226:
|
||||
$Lt_1_19714:
|
||||
.loc 17 235 0
|
||||
bar.sync 0;
|
||||
@!%p2 bra $Lt_1_26626;
|
||||
.loc 17 237 0
|
||||
ld.shared.f32 %f18, [%rd19+128];
|
||||
st.shared.f32 [%rd19+0], %f18;
|
||||
.loc 17 238 0
|
||||
mov.f32 %f19, 0f00000000; // 0
|
||||
st.shared.f32 [%rd19+128], %f19;
|
||||
bra.uni $Lt_1_26370;
|
||||
$Lt_1_26626:
|
||||
.loc 17 240 0
|
||||
mov.f32 %f20, 0f00000000; // 0
|
||||
st.shared.f32 [%rd19+0], %f20;
|
||||
$Lt_1_26370:
|
||||
@!%p8 bra $Lt_1_26882;
|
||||
mov.s32 %r104, %r2;
|
||||
cvt.s64.s32 %rd48, %r4;
|
||||
mov.s32 %r105, %r11;
|
||||
add.s32 %r106, %r11, %r2;
|
||||
mul.wide.s32 %rd49, %r4, 4;
|
||||
add.u64 %rd50, %rd20, %rd49;
|
||||
mov.s64 %rd51, %rd19;
|
||||
mov.s32 %r107, %r104;
|
||||
$Lt_1_27394:
|
||||
//<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 243 0
|
||||
ld.shared.f32 %f21, [%rd50+0];
|
||||
ld.shared.f32 %f22, [%rd51+0];
|
||||
add.ftz.f32 %f23, %f21, %f22;
|
||||
st.shared.f32 [%rd51+0], %f23;
|
||||
.loc 17 244 0
|
||||
bar.sync 0;
|
||||
add.s32 %r105, %r105, 1;
|
||||
add.u64 %rd51, %rd51, 4;
|
||||
add.u64 %rd50, %rd50, 256;
|
||||
setp.ne.s32 %p23, %r105, %r106;
|
||||
@%p23 bra $Lt_1_27394;
|
||||
$Lt_1_26882:
|
||||
set.lt.u32.s32 %r108, %r66, %r17;
|
||||
neg.s32 %r109, %r108;
|
||||
and.b32 %r110, %r61, %r109;
|
||||
mov.u32 %r111, 0;
|
||||
setp.eq.s32 %p24, %r110, %r111;
|
||||
@%p24 bra $Lt_1_27906;
|
||||
.loc 17 248 0
|
||||
ld.shared.f32 %f24, [%rd19+0];
|
||||
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
|
||||
add.s32 %r112, %r11, %r58;
|
||||
mul.lo.s32 %r113, %r37, %r17;
|
||||
mul.lo.s32 %r114, %r38, %r113;
|
||||
mul.lo.s32 %r115, %r48, %r17;
|
||||
add.s32 %r116, %r114, %r115;
|
||||
add.s32 %r117, %r112, %r116;
|
||||
cvt.s64.s32 %rd53, %r117;
|
||||
mul.wide.s32 %rd54, %r117, 4;
|
||||
add.u64 %rd55, %rd52, %rd54;
|
||||
st.global.f32 [%rd55+0], %f24;
|
||||
$Lt_1_27906:
|
||||
add.s32 %r58, %r58, 32;
|
||||
setp.ne.s32 %p25, %r58, %r59;
|
||||
@%p25 bra $Lt_1_18434;
|
||||
$Lt_1_17922:
|
||||
.loc 17 252 0
|
||||
exit;
|
||||
$LDWend_make_rho:
|
||||
} // make_rho
|
||||
|
||||
// ---------------------------------------------------------------------------
// interp: kernel entry point.
//
// Summary of what the instructions in this body do:
//  1. Threads with tid.x < order + order2 each copy one f32 from the global
//     array at _rho_coeff into the 256-byte shared buffer, then every thread
//     reaches bar.sync 0.
//  2. idx = tid.x + ctaid.x * ntid.x; threads with nlocal <= idx go straight
//     to exit without storing anything.
//  3. One v4.f32 is fetched from texture pos_tex and one from q_tex at idx.
//     The x_ and q_ parameters are not loaded anywhere in this body.
//  4. If qqrd2e_scale * (first q_tex component) is non-zero and order > 0,
//     per-thread polynomial values are written to the two 2048-byte shared
//     arrays and three sums are accumulated over an
//     order x order x order set of 16-byte entries read from brick.
//     Otherwise the three sums are zero.
//  5. {%f52, %f51, %f50, %f66} is stored as a v4.f32 at ans + idx * 16.
// ---------------------------------------------------------------------------
.entry interp (
|
||||
.param .u64 __cudaparm_interp_x_,
|
||||
.param .u64 __cudaparm_interp_q_,
|
||||
.param .s32 __cudaparm_interp_nlocal,
|
||||
.param .u64 __cudaparm_interp_brick,
|
||||
.param .u64 __cudaparm_interp__rho_coeff,
|
||||
.param .s32 __cudaparm_interp_npts_x,
|
||||
.param .s32 __cudaparm_interp_npts_yx,
|
||||
.param .f32 __cudaparm_interp_b_lo_x,
|
||||
.param .f32 __cudaparm_interp_b_lo_y,
|
||||
.param .f32 __cudaparm_interp_b_lo_z,
|
||||
.param .f32 __cudaparm_interp_delxinv,
|
||||
.param .f32 __cudaparm_interp_delyinv,
|
||||
.param .f32 __cudaparm_interp_delzinv,
|
||||
.param .s32 __cudaparm_interp_order,
|
||||
.param .s32 __cudaparm_interp_order2,
|
||||
.param .f32 __cudaparm_interp_qqrd2e_scale,
|
||||
.param .u64 __cudaparm_interp_ans)
|
||||
{
|
||||
.reg .u32 %r<56>;
|
||||
.reg .u64 %rd<37>;
|
||||
.reg .f32 %f<69>;
|
||||
.reg .pred %p<14>;
|
||||
// Shared buffers: 256 bytes that receive the staged coefficients, and two
// 2048-byte arrays that are addressed below as (tid.x * 4) plus a 256-byte
// step per row.
.shared .align 4 .b8 __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888[256];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32630_33_non_const_rho1d_03144[2048];
|
||||
.shared .align 4 .b8 __cuda___cuda_local_var_32631_33_non_const_rho1d_15192[2048];
|
||||
// __cuda_local_var_32647_12_non_const_ek = 16
|
||||
.loc 17 262 0
|
||||
$LDWbegin_interp:
|
||||
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.x.
ld.param.s32 %r1, [__cudaparm_interp_order2];
|
||||
ld.param.s32 %r2, [__cudaparm_interp_order];
|
||||
add.s32 %r3, %r1, %r2;
|
||||
cvt.s32.u32 %r4, %tid.x;
|
||||
setp.le.s32 %p1, %r3, %r4;
|
||||
// Skip the staging copy when order2 + order <= tid.x.
@%p1 bra $Lt_2_8706;
|
||||
.loc 17 269 0
|
||||
// shared_coeff[tid.x] = _rho_coeff[tid.x] (4-byte elements).
mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;
|
||||
cvt.s64.s32 %rd2, %r4;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
|
||||
add.u64 %rd5, %rd4, %rd3;
|
||||
ld.global.f32 %f1, [%rd5+0];
|
||||
add.u64 %rd6, %rd3, %rd1;
|
||||
st.shared.f32 [%rd6+0], %f1;
|
||||
$Lt_2_8706:
|
||||
mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;
|
||||
.loc 17 270 0
|
||||
// Barrier 0: executed by every thread before the staged coefficients are read.
bar.sync 0;
|
||||
// %r8 = tid.x + ctaid.x * ntid.x; leave the kernel when nlocal <= %r8.
mov.u32 %r5, %ctaid.x;
|
||||
mov.u32 %r6, %ntid.x;
|
||||
mul.lo.u32 %r7, %r5, %r6;
|
||||
add.u32 %r8, %r4, %r7;
|
||||
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
|
||||
setp.le.s32 %p2, %r9, %r8;
|
||||
@%p2 bra $Lt_2_9218;
|
||||
.loc 17 278 0
|
||||
// Texture fetch from pos_tex at coordinate %r8 (other coordinates 0); the
// first three components are kept in %f6, %f7, %f8.
mov.u32 %r10, %r8;
|
||||
mov.s32 %r11, 0;
|
||||
mov.u32 %r12, %r11;
|
||||
mov.s32 %r13, 0;
|
||||
mov.u32 %r14, %r13;
|
||||
mov.s32 %r15, 0;
|
||||
mov.u32 %r16, %r15;
|
||||
tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];
|
||||
mov.f32 %f6, %f2;
|
||||
mov.f32 %f7, %f3;
|
||||
mov.f32 %f8, %f4;
|
||||
.loc 17 279 0
|
||||
// Texture fetch from q_tex at coordinate %r8; only the first component is
// used: %f15 = qqrd2e_scale * %f9.
mov.u32 %r17, %r8;
|
||||
mov.s32 %r18, 0;
|
||||
mov.u32 %r19, %r18;
|
||||
mov.s32 %r20, 0;
|
||||
mov.u32 %r21, %r20;
|
||||
mov.s32 %r22, 0;
|
||||
mov.u32 %r23, %r22;
|
||||
tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];
|
||||
mov.f32 %f13, %f9;
|
||||
ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];
|
||||
mul.ftz.f32 %f15, %f14, %f13;
|
||||
mov.f32 %f16, 0f00000000; // 0
|
||||
setp.neu.ftz.f32 %p3, %f15, %f16;
|
||||
// When %f15 compares equal to 0, jump to the path that zeroes the three sums.
@!%p3 bra $Lt_2_9986;
|
||||
// %p4 = (order > 0); %f20 = delxinv * (%f6 - b_lo_x).
mov.s32 %r24, 0;
|
||||
setp.gt.s32 %p4, %r2, %r24;
|
||||
ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];
|
||||
sub.ftz.f32 %f18, %f6, %f17;
|
||||
ld.param.f32 %f19, [__cudaparm_interp_delxinv];
|
||||
mul.ftz.f32 %f20, %f19, %f18;
|
||||
@!%p4 bra $Lt_2_16386;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;
|
||||
// %r25 = %f20 converted to s32 with round-toward-zero;
// %f24 = (float(%r25) + 0.5) - %f20.
cvt.rzi.ftz.s32.f32 %r25, %f20;
|
||||
cvt.rn.f32.s32 %f21, %r25;
|
||||
mov.f32 %f22, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f23, %f21, %f22;
|
||||
sub.ftz.f32 %f24, %f23, %f20;
|
||||
// Same computation for the second component:
// %f28 = delyinv * (%f7 - b_lo_y), %r26 = trunc(%f28),
// %f32 = (float(%r26) + 0.5) - %f28.
ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];
|
||||
sub.ftz.f32 %f26, %f7, %f25;
|
||||
ld.param.f32 %f27, [__cudaparm_interp_delyinv];
|
||||
mul.ftz.f32 %f28, %f27, %f26;
|
||||
cvt.rzi.ftz.s32.f32 %r26, %f28;
|
||||
cvt.rn.f32.s32 %f29, %r26;
|
||||
mov.f32 %f30, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f31, %f29, %f30;
|
||||
sub.ftz.f32 %f32, %f31, %f28;
|
||||
// Loop setup: %rd10 / %rd11 = base of the first / second 2048-byte shared
// array + tid.x * 4. %r29 counts rows from 0; %r28 starts at order2.
mov.s32 %r27, %r2;
|
||||
cvt.s64.s32 %rd9, %r4;
|
||||
mov.s32 %r28, %r1;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
add.u64 %rd10, %rd3, %rd7;
|
||||
add.u64 %rd11, %rd3, %rd8;
|
||||
mov.s32 %r29, 0;
|
||||
mov.s32 %r30, %r27;
|
||||
$Lt_2_10754:
|
||||
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 298 0
|
||||
// Per row: both shared slots and both running values start at 0.
mov.f32 %f33, 0f00000000; // 0
|
||||
mov.f32 %f34, 0f00000000; // 0
|
||||
st.shared.f32 [%rd10+0], %f34;
|
||||
.loc 17 299 0
|
||||
mov.f32 %f35, 0f00000000; // 0
|
||||
mov.f32 %f36, 0f00000000; // 0
|
||||
st.shared.f32 [%rd11+0], %f36;
|
||||
.loc 17 300 0
|
||||
// Coefficient index %r31 starts at %r28 and steps down by order while it
// stays >= %r29; the inner loop is skipped entirely if %r28 < %r29.
mov.s32 %r31, %r28;
|
||||
setp.lt.s32 %p5, %r28, %r29;
|
||||
@%p5 bra $Lt_2_11010;
|
||||
cvt.s64.s32 %rd12, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 4;
|
||||
cvt.s64.s32 %rd14, %r28;
|
||||
mul.wide.s32 %rd15, %r28, 4;
|
||||
add.u64 %rd16, %rd1, %rd15;
|
||||
$Lt_2_11522:
|
||||
//<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 301 0
|
||||
// acc = acc * x + coeff for both arrays (x = %f24 and %f32 respectively),
// sharing one coefficient load; each step is written back to shared memory.
ld.shared.f32 %f37, [%rd16+0];
|
||||
fma.rn.ftz.f32 %f33, %f33, %f24, %f37;
|
||||
st.shared.f32 [%rd10+0], %f33;
|
||||
.loc 17 302 0
|
||||
fma.rn.ftz.f32 %f35, %f35, %f32, %f37;
|
||||
st.shared.f32 [%rd11+0], %f35;
|
||||
sub.s32 %r31, %r31, %r2;
|
||||
sub.u64 %rd16, %rd16, %rd13;
|
||||
setp.ge.s32 %p6, %r31, %r29;
|
||||
@%p6 bra $Lt_2_11522;
|
||||
$Lt_2_11010:
|
||||
// Next row: both shared pointers advance 256 bytes; the loop ends when %r28
// reaches order2 + order.
add.s32 %r29, %r29, 1;
|
||||
add.s32 %r28, %r28, 1;
|
||||
add.u64 %rd11, %rd11, 256;
|
||||
add.u64 %rd10, %rd10, 256;
|
||||
setp.ne.s32 %p7, %r28, %r3;
|
||||
@%p7 bra $Lt_2_10754;
|
||||
bra.uni $Lt_2_10242;
|
||||
$Lt_2_16386:
|
||||
// order <= 0 path: %r25, %rd8 and %rd7 are still set before joining below.
cvt.rzi.ftz.s32.f32 %r25, %f20;
|
||||
mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;
|
||||
mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;
|
||||
$Lt_2_10242:
|
||||
.loc 17 306 0
|
||||
// %f41 = delzinv * (%f8 - b_lo_z); %r32 = trunc(%f41);
// %r35 = %r25 + %r32 * npts_yx.
ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];
|
||||
sub.ftz.f32 %f39, %f8, %f38;
|
||||
ld.param.f32 %f40, [__cudaparm_interp_delzinv];
|
||||
mul.ftz.f32 %f41, %f40, %f39;
|
||||
cvt.rzi.ftz.s32.f32 %r32, %f41;
|
||||
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
|
||||
mul.lo.s32 %r34, %r32, %r33;
|
||||
add.s32 %r35, %r25, %r34;
|
||||
// With order <= 0 there is nothing to accumulate: go zero the sums.
@!%p4 bra $Lt_2_16898;
|
||||
// %f45 = (float(%r32) + 0.5) - %f41.
cvt.rn.f32.s32 %f42, %r32;
|
||||
mov.f32 %f43, 0f3f000000; // 0.5
|
||||
add.ftz.f32 %f44, %f42, %f43;
|
||||
sub.ftz.f32 %f45, %f44, %f41;
|
||||
mov.s32 %r36, %r2;
|
||||
// The second-component cell index is recomputed here as %r37;
// %r40 = %r37 * npts_x + %r35 is the first brick element index.
ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];
|
||||
sub.ftz.f32 %f47, %f7, %f46;
|
||||
cvt.s64.s32 %rd17, %r4;
|
||||
ld.param.f32 %f48, [__cudaparm_interp_delyinv];
|
||||
mul.ftz.f32 %f49, %f48, %f47;
|
||||
cvt.rzi.ftz.s32.f32 %r37, %f49;
|
||||
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
|
||||
mul.lo.s32 %r39, %r37, %r38;
|
||||
mul.wide.s32 %rd3, %r4, 4;
|
||||
add.s32 %r40, %r39, %r35;
|
||||
add.u64 %rd18, %rd3, %rd7;
|
||||
add.u64 %rd19, %rd3, %rd8;
|
||||
cvt.s64.s32 %rd20, %r38;
|
||||
// %rd21 = npts_x * 16: byte step between consecutive middle-loop rows of brick.
mul.wide.s32 %rd21, %r38, 16;
|
||||
mov.s32 %r41, %r40;
|
||||
ld.param.u64 %rd22, [__cudaparm_interp_brick];
|
||||
// %r42 = outer loop counter; %f50, %f51, %f52 = the three running sums.
mov.s32 %r42, 0;
|
||||
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
mov.s32 %r43, %r36;
|
||||
$Lt_2_12802:
|
||||
//<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown
|
||||
.loc 17 309 0
|
||||
// %f53 = polynomial in %f45 (acc = x * acc + coeff) over shared coefficients
// at indices %r42 + order2, stepping down by order while the index >= %r42.
add.s32 %r44, %r42, %r1;
|
||||
mov.s32 %r45, %r44;
|
||||
setp.lt.s32 %p8, %r44, %r42;
|
||||
@%p8 bra $Lt_2_17154;
|
||||
cvt.s64.s32 %rd23, %r2;
|
||||
mul.wide.s32 %rd13, %r2, 4;
|
||||
cvt.s64.s32 %rd24, %r44;
|
||||
mul.wide.s32 %rd25, %r44, 4;
|
||||
add.u64 %rd26, %rd1, %rd25;
|
||||
mov.f32 %f53, 0f00000000; // 0
|
||||
$Lt_2_13570:
|
||||
//<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown
|
||||
.loc 17 310 0
|
||||
ld.shared.f32 %f54, [%rd26+0];
|
||||
fma.rn.ftz.f32 %f53, %f45, %f53, %f54;
|
||||
sub.s32 %r45, %r45, %r2;
|
||||
sub.u64 %rd26, %rd26, %rd13;
|
||||
setp.ge.s32 %p9, %r45, %r42;
|
||||
@%p9 bra $Lt_2_13570;
|
||||
bra.uni $Lt_2_13058;
|
||||
$Lt_2_17154:
|
||||
mov.f32 %f53, 0f00000000; // 0
|
||||
$Lt_2_13058:
|
||||
.loc 17 312 0
|
||||
// %f55 = %f15 * %f53. Middle loop: counter %r49, brick element index %r48,
// brick byte offset %rd29 = %r46 * 16, %rd27 walks the second shared array.
mov.s32 %r46, %r41;
|
||||
mov.s32 %r47, %r2;
|
||||
mul.ftz.f32 %f55, %f15, %f53;
|
||||
mov.s32 %r48, %r46;
|
||||
mov.s64 %rd27, %rd19;
|
||||
cvt.s64.s32 %rd28, %r46;
|
||||
mul.wide.s32 %rd29, %r46, 16;
|
||||
mov.s32 %r49, 0;
|
||||
mov.s32 %r50, %r47;
|
||||
$Lt_2_14594:
|
||||
//<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown
|
||||
// %f57 = %f55 * (current row of the second shared array); the inner loop
// restarts at the first row of the first shared array (%rd30 = %rd18) and
// at brick + %rd29.
mov.s32 %r51, %r2;
|
||||
mov.s32 %r52, %r48;
|
||||
add.s32 %r53, %r48, %r2;
|
||||
mov.s64 %rd30, %rd18;
|
||||
ld.shared.f32 %f56, [%rd27+0];
|
||||
add.u64 %rd31, %rd29, %rd22;
|
||||
mul.ftz.f32 %f57, %f55, %f56;
|
||||
mov.s32 %r54, %r51;
|
||||
$Lt_2_15362:
|
||||
//<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown
|
||||
.loc 17 316 0
|
||||
// Weight %f59 = (row of the first shared array) * %f57. The first three
// floats of the 16-byte brick entry are each multiplied by the weight and
// subtracted from %f52, %f51, %f50; the fourth float is discarded.
ld.shared.f32 %f58, [%rd30+0];
|
||||
mul.ftz.f32 %f59, %f58, %f57;
|
||||
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];
|
||||
.loc 17 318 0
|
||||
mul.ftz.f32 %f63, %f59, %f60;
|
||||
sub.ftz.f32 %f52, %f52, %f63;
|
||||
.loc 17 319 0
|
||||
mul.ftz.f32 %f64, %f59, %f61;
|
||||
sub.ftz.f32 %f51, %f51, %f64;
|
||||
.loc 17 320 0
|
||||
mul.ftz.f32 %f65, %f59, %f62;
|
||||
sub.ftz.f32 %f50, %f50, %f65;
|
||||
// Inner step: next brick entry (+16 bytes), next shared row (+256 bytes);
// runs order times (%r52 from %r48 to %r48 + order).
add.s32 %r52, %r52, 1;
|
||||
add.u64 %rd31, %rd31, 16;
|
||||
add.u64 %rd30, %rd30, 256;
|
||||
setp.ne.s32 %p10, %r52, %r53;
|
||||
@%p10 bra $Lt_2_15362;
|
||||
// Middle step: brick index += npts_x, shared row += 256 bytes; runs order times.
add.s32 %r49, %r49, 1;
|
||||
add.s32 %r48, %r48, %r38;
|
||||
add.u64 %rd29, %rd29, %rd21;
|
||||
add.u64 %rd27, %rd27, 256;
|
||||
setp.ne.s32 %p11, %r49, %r2;
|
||||
@%p11 bra $Lt_2_14594;
|
||||
// Outer step: starting brick index += npts_yx; runs order times.
add.s32 %r42, %r42, 1;
|
||||
add.s32 %r41, %r46, %r33;
|
||||
setp.ne.s32 %p12, %r42, %r2;
|
||||
@%p12 bra $Lt_2_12802;
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_16898:
|
||||
// Reached when order <= 0: the three sums are zero.
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
bra.uni $Lt_2_9730;
|
||||
$Lt_2_9986:
|
||||
// Reached when %f15 == 0: the three sums are zero.
mov.f32 %f50, 0f00000000; // 0
|
||||
mov.f32 %f51, 0f00000000; // 0
|
||||
mov.f32 %f52, 0f00000000; // 0
|
||||
$Lt_2_9730:
|
||||
.loc 17 327 0
|
||||
// Result address: ans + %r8 * 16 (one 16-byte v4.f32 per index).
ld.param.u64 %rd32, [__cudaparm_interp_ans];
|
||||
cvt.s64.s32 %rd33, %r8;
|
||||
mul.wide.s32 %rd34, %r8, 16;
|
||||
add.u64 %rd35, %rd32, %rd34;
|
||||
// NOTE(review): %f67 is not written by any instruction in this entry, so the
// fourth component stored below has no value assigned here. Presumably the
// consumer ignores it -- confirm against the originating CUDA source.
mov.f32 %f66, %f67;
|
||||
st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};
|
||||
$Lt_2_9218:
|
||||
.loc 17 329 0
|
||||
exit;
|
||||
$LDWend_interp:
|
||||
} // interp
|
||||
|
Loading…
Reference in New Issue