git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7291 f3b2605a-c512-4ea7-a41b-209d697bcdaa

This commit is contained in:
sjplimp 2011-12-02 18:27:16 +00:00
parent ab6e356808
commit 9c9282d024
17 changed files with 0 additions and 15323 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,979 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bddd_00000000-9_lj96_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.4Q2aYE)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bddd_00000000-8_lj96_cut_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj96_cut_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture reference. Both kernels in this file read it with
// tex.1d.v4.f32.s32, using an integer atom index as the coordinate and
// receiving four floats per fetch (three are used as coordinates, the fourth
// is converted to an integer type index).
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair
//
// Compiler-generated pair kernel (the .loc directives refer to file 16).
// Each atom is handled by t_per_atom cooperating threads:
//   lane = tid.x % t_per_atom
//   ii   = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// Threads with ii >= inum skip the neighbor loop but still take part in the
// shared-memory reduction below.
//
// Parameters as used by the instructions in this kernel:
//   x_          not referenced by any instruction here; per-atom data is
//               fetched through pos_tex instead
//   lj1         global table of 16-byte entries, indexed by
//               itype*lj_types + jtype; floats at +0/+4 are the force
//               coefficients, the float at +8 is the bound the squared
//               distance must stay below (presumably the squared cutoff)
//   lj3         global table of 16-byte entries with the same index; the
//               first three floats feed the energy term
//   lj_types    row stride of the lj1/lj3 tables
//   sp_lj_in    four floats copied into shared memory; selected per neighbor
//               by the top two bits of the packed neighbor word
//   dev_nbor    neighbor storage, rows nbor_pitch ints apart
//   dev_packed  alternative neighbor storage; compared against dev_nbor to
//               choose the list layout
//   ans         per-atom output, 16 bytes per atom (fx, fy, fz, unspecified)
//   engv        energy/virial output, one float per quantity, quantities
//               inum floats apart
//   eflag       > 0 enables the energy accumulation and store
//   vflag       > 0 enables the virial accumulation and store
//   inum        number of atoms handled; also the engv stride
//   nbor_pitch  row stride (in ints) of dev_nbor
//   t_per_atom  threads per atom; used as a divisor, so it is assumed to be
//               non-zero -- TODO confirm on the host side
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<103>;
.reg .pred %p<19>;
// Shared copy of the four sp_lj_in factors (4 floats = 16 bytes).
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
// Reduction scratch: six 512-byte slots, each indexed by tid.x*4.
// NOTE(review): a 512-byte slot holds 128 floats, so this layout appears to
// assume ntid.x <= 128 -- confirm against the launch configuration.
.shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
// Every thread loads the four sp_lj_in factors and writes them to shared
// memory with a single vector store.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
// Zero the six virial accumulators (%f6, %f8, %f10, %f12, %f14, %f16).
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// Thread mapping: %r6 = lane within the atom, %r9 = atom index ii.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
// %p1 = (ii < inum); threads outside the range jump to the zero-init path.
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
// %rd7 = &dev_nbor[ii]                 (atom index entry)
// %rd8 = %rd7 + nbor_pitch ints        (neighbor count, loaded into %r12)
// %rd9 = %rd8 + nbor_pitch ints        (first list entry / packed offset)
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
// The list layout depends on whether dev_packed aliases dev_nbor.
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
// dev_packed == dev_nbor: entries are nbor_pitch ints apart.
//   list end  (%rd13) = %rd9 + count*nbor_pitch ints
//   start     (%rd16) = %rd9 + lane*nbor_pitch ints
//   stride    (%r16)  = nbor_pitch * t_per_atom ints
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
// dev_packed != dev_nbor: [%rd9] holds an int offset into dev_packed and the
// entries are contiguous.
//   base      (%rd19) = dev_packed + offset ints
//   list end  (%rd13) = base + count ints
//   start     (%rd16) = base + lane ints
//   stride    (%r16)  = t_per_atom ints
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
// Fetch atom i through pos_tex using the index stored at dev_nbor[ii]:
// %f21/%f22/%f23 = first three components, %f24 = fourth component.
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Skip the loop entirely when this lane's start is already at/after the end.
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
// Loop invariants: %r28 = itype*lj_types (itype = fourth component truncated
// to int), %rd24 = stride, %rd25 = lj1 base, %rd26 = shared sp_lj base.
// Accumulators: %f27 = fx, %f26 = fy, %f25 = fz, %f28 = energy.
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
// Packed neighbor word: bits 31..30 select the sp_lj factor (%f29),
// bits 29..0 (mask 1073741823 = 0x3FFFFFFF) are the neighbor index.
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// %f39 = dx, %f38 = dy, %f40 = dz (atom i minus neighbor j);
// %f41 = dy*dy, %f43 = dx*dx + dy*dy + dz*dz.
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
// Table index = itype*lj_types + jtype; %rd31 = byte offset (16 per entry),
// reused for the lj3 lookup below.
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
// Only interact when the squared distance is below lj1[idx] component +8.
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 154 0
// %f45 = 1/rsq (approx), %f47 = %f45^3, %f48 = sqrt(%f47), %f49 = %f45^4.
// %f55 = sp_lj factor * %f49 * (lj1.x * %f48 - lj1.y)
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
sqrt.approx.ftz.f32 %f48, %f47;
mul.ftz.f32 %f49, %f45, %f47;
ld.global.v2.f32 {%f50,%f51}, [%rd32+0];
mul.ftz.f32 %f52, %f50, %f48;
sub.ftz.f32 %f53, %f52, %f51;
mul.ftz.f32 %f54, %f49, %f53;
mul.ftz.f32 %f55, %f29, %f54;
.loc 16 156 0
fma.rn.ftz.f32 %f27, %f39, %f55, %f27;
.loc 16 157 0
fma.rn.ftz.f32 %f26, %f38, %f55, %f26;
.loc 16 158 0
fma.rn.ftz.f32 %f25, %f40, %f55, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 162 0
// Energy (eflag > 0):
//   energy += factor * (%f47 * (lj3.x * %f48 - lj3.y) - lj3.z)
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd34+0];
mul.ftz.f32 %f59, %f56, %f48;
sub.ftz.f32 %f60, %f59, %f57;
mul.ftz.f32 %f61, %f47, %f60;
sub.ftz.f32 %f62, %f61, %f58;
fma.rn.ftz.f32 %f28, %f29, %f62, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 165 0
// Virial (vflag > 0), each term scaled by %f55:
//   %f6 += dx*dx   %f8 += dy*dy   %f10 += dz*dz
//   %f12 += dx*dy  %f14 += dx*dz  %f16 += dy*dz
mov.f32 %f63, %f6;
mul.ftz.f32 %f64, %f39, %f39;
fma.rn.ftz.f32 %f65, %f55, %f64, %f63;
mov.f32 %f6, %f65;
.loc 16 166 0
mov.f32 %f66, %f8;
fma.rn.ftz.f32 %f67, %f55, %f41, %f66;
mov.f32 %f8, %f67;
.loc 16 167 0
mov.f32 %f68, %f10;
mul.ftz.f32 %f69, %f40, %f40;
fma.rn.ftz.f32 %f70, %f55, %f69, %f68;
mov.f32 %f10, %f70;
.loc 16 168 0
mov.f32 %f71, %f12;
mul.ftz.f32 %f72, %f38, %f39;
fma.rn.ftz.f32 %f73, %f55, %f72, %f71;
mov.f32 %f12, %f73;
.loc 16 169 0
mov.f32 %f74, %f14;
mul.ftz.f32 %f75, %f39, %f40;
fma.rn.ftz.f32 %f76, %f55, %f75, %f74;
mov.f32 %f14, %f76;
.loc 16 170 0
// The sixth component is accumulated in %f15 and mirrored into %f16.
mul.ftz.f32 %f77, %f38, %f40;
fma.rn.ftz.f32 %f15, %f55, %f77, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
// Advance this lane's cursor by the stride (in ints) and loop while it is
// still before the list end.
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
// Empty-list path: force and energy accumulators start (and stay) at zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
// ii >= inum path: same zero initialisation.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
// The reduction is only needed when more than one thread shares an atom.
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 181 0
// Publish fx, fy, fz and energy into this thread's red_acc slots
// (%rd39 = red_acc + tid.x*4; quantities are 512 bytes apart).
mov.u64 %rd36, __cuda___cuda_local_var_32582_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f78, %f27;
st.shared.f32 [%rd39+0], %f78;
.loc 16 182 0
mov.f32 %f79, %f26;
st.shared.f32 [%rd39+512], %f79;
.loc 16 183 0
mov.f32 %f80, %f25;
st.shared.f32 [%rd39+1024], %f80;
.loc 16 184 0
mov.f32 %f81, %f28;
st.shared.f32 [%rd39+1536], %f81;
.loc 16 186 0
// %r51 = t_per_atom / 2 (signed divide rounding toward zero); %r52 is the
// running offset s, halved each pass until it reaches zero.
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
// Lanes with lane < s add the slots of thread tid.x + s into their own.
// NOTE(review): no bar.sync is issued anywhere in this loop; it presumably
// relies on the cooperating threads executing in lockstep within a warp --
// confirm that t_per_atom never spans warps.
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 189 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f82, [%rd42+0];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd39+0], %f78;
ld.shared.f32 %f83, [%rd42+512];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd39+512], %f79;
ld.shared.f32 %f84, [%rd42+1024];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd39+1024], %f80;
ld.shared.f32 %f85, [%rd42+1536];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd39+1536], %f81;
$Lt_0_23810:
.loc 16 186 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 193 0
// Copy the reduced values back into the force/energy registers.
mov.f32 %f27, %f78;
.loc 16 194 0
mov.f32 %f26, %f79;
.loc 16 195 0
mov.f32 %f25, %f80;
.loc 16 196 0
mov.f32 %f28, %f81;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 200 0
// Second reduction (vflag > 0): the six virial components reuse the same
// scratch area, now using all six 512-byte slots.
mov.f32 %f78, %f6;
st.shared.f32 [%rd39+0], %f78;
mov.f32 %f79, %f8;
st.shared.f32 [%rd39+512], %f79;
mov.f32 %f80, %f10;
st.shared.f32 [%rd39+1024], %f80;
mov.f32 %f81, %f12;
st.shared.f32 [%rd39+1536], %f81;
mov.f32 %f86, %f14;
st.shared.f32 [%rd39+2048], %f86;
mov.f32 %f87, %f16;
st.shared.f32 [%rd39+2560], %f87;
.loc 16 202 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
// Same halving scheme as above, again without a barrier.
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 205 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f88, [%rd45+0];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd39+0], %f78;
ld.shared.f32 %f89, [%rd45+512];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd39+512], %f79;
ld.shared.f32 %f90, [%rd45+1024];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd39+1024], %f80;
ld.shared.f32 %f91, [%rd45+1536];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd39+1536], %f81;
ld.shared.f32 %f92, [%rd45+2048];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd39+2048], %f86;
ld.shared.f32 %f93, [%rd45+2560];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd39+2560], %f87;
$Lt_0_25858:
.loc 16 202 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 210 0
mov.f32 %f6, %f78;
mov.f32 %f8, %f79;
mov.f32 %f10, %f80;
mov.f32 %f12, %f81;
mov.f32 %f14, %f86;
mov.f32 %f16, %f87;
$Lt_0_24578:
$Lt_0_22530:
// Only the thread with (ii < inum) AND (lane == 0) writes results.
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 216 0
// %rd49 = &engv[ii]; it advances by inum floats after each stored quantity.
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 218 0
st.global.f32 [%rd49+0], %f28;
.loc 16 219 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 223 0
// Store the six virial components, consecutive ones inum floats apart.
mov.f32 %f94, %f6;
st.global.f32 [%rd49+0], %f94;
.loc 16 224 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 223 0
mov.f32 %f95, %f8;
st.global.f32 [%rd54+0], %f95;
.loc 16 224 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 223 0
mov.f32 %f96, %f10;
st.global.f32 [%rd55+0], %f96;
.loc 16 224 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 223 0
mov.f32 %f97, %f12;
st.global.f32 [%rd56+0], %f97;
.loc 16 224 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 223 0
mov.f32 %f98, %f14;
st.global.f32 [%rd49+0], %f98;
mov.f32 %f99, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f99;
$Lt_0_27650:
.loc 16 227 0
// ans[ii] = {fx, fy, fz, %f100}.
// NOTE(review): %f101 is never written in this kernel, so the fourth stored
// component is unspecified; consumers presumably ignore it -- confirm.
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f100, %f101;
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f100};
$Lt_0_26626:
.loc 16 229 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//
// Variant of kernel_pair that first stages the coefficient tables in shared
// memory and uses a fixed table row stride of 11 instead of an lj_types
// parameter. Thread mapping matches kernel_pair:
//   lane = tid.x % t_per_atom
//   ii   = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
//
// Parameters as used by the instructions in this kernel:
//   x_          not referenced by any instruction here; per-atom data is
//               fetched through pos_tex instead
//   lj1_in      global table; entries 0..120 (16 bytes each) are copied to
//               shared memory by threads 0..120
//   lj3_in      global table; copied the same way, but only when eflag > 0
//   sp_lj_in    four floats copied to shared memory by threads 0..3
//   dev_nbor    neighbor storage, rows nbor_pitch ints apart
//   dev_packed  alternative neighbor storage; compared against dev_nbor to
//               choose the list layout
//   ans         per-atom output, 16 bytes per atom (fx, fy, fz, unspecified)
//   engv        energy/virial output, quantities inum floats apart
//   eflag       > 0 enables the energy accumulation and store
//   vflag       > 0 enables the virial accumulation and store
//   inum        number of atoms handled; also the engv stride
//   nbor_pitch  row stride (in ints) of dev_nbor
//   t_per_atom  threads per atom; used as a divisor, so it is assumed to be
//               non-zero -- TODO confirm on the host side
//
// NOTE(review): the staging code needs at least 121 threads per block to
// fill the shared tables completely -- confirm the launch configuration.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
// Shared sp_lj factors (4 floats).
.shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];
// Shared lj1 / lj3 tables: 1936 bytes = 121 entries of 16 bytes each.
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32647_34_non_const_lj35232[1936];
// Reduction scratch: six 512-byte slots, each indexed by tid.x*4.
// NOTE(review): a 512-byte slot holds 128 floats, so this layout appears to
// assume ntid.x <= 128 -- confirm against the launch configuration.
.shared .align 4 .b8 __cuda___cuda_local_var_32737_35_non_const_red_acc7168[3072];
// __cuda_local_var_32658_10_non_const_f = 48
// __cuda_local_var_32662_9_non_const_virial = 16
.loc 16 237 0
$LDWbegin_kernel_pair_fast:
// Threads with tid.x <= 3 each copy one sp_lj_in float to shared memory.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 247 0
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
// %rd1 is re-materialised here so it is defined on both paths.
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
// Threads with tid.x <= 120 each copy one 16-byte lj1_in entry (and, when
// eflag > 0, one lj3_in entry) to the shared tables.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 249 0
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 251 0
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
$Lt_1_21762:
// Shared table bases, re-materialised for every path:
// %rd7 = lj1 table, %rd13 = lj3 table.
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_lj35232;
.loc 16 261 0
// Zero the six virial accumulators (%f11, %f13, %f15, %f17, %f19, %f21).
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 263 0
// Block-wide barrier: the shared tables must be complete before any lookup.
bar.sync 0;
// Thread mapping: %r10 = lane within the atom, %r13 = atom index ii.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
// %p4 = (ii < inum); threads outside the range jump to the zero-init path.
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 269 0
// %rd22 = &dev_nbor[ii]                (atom index entry)
// %rd23 = %rd22 + nbor_pitch ints      (neighbor count, loaded into %r16)
// %rd24 = %rd23 + nbor_pitch ints      (first list entry / packed offset)
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
// The list layout depends on whether dev_packed aliases dev_nbor.
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
.loc 16 275 0
// dev_packed == dev_nbor: entries are nbor_pitch ints apart.
//   list end  (%rd28) = %rd24 + count*nbor_pitch ints
//   start     (%rd31) = %rd24 + lane*nbor_pitch ints
//   stride    (%r20)  = nbor_pitch * t_per_atom ints
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 276 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 277 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 279 0
// dev_packed != dev_nbor: [%rd24] holds an int offset into dev_packed and
// the entries are contiguous.
//   base      (%rd34) = dev_packed + offset ints
//   list end  (%rd28) = base + count ints
//   start     (%rd31) = base + lane ints
//   stride    (%r20)  = t_per_atom ints
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 280 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 281 0
mov.s32 %r20, %r6;
.loc 16 282 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
.loc 16 285 0
// Fetch atom i through pos_tex using the index stored at dev_nbor[ii]:
// %f26/%f27/%f28 = first three components, %f29 = fourth component.
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Skip the loop entirely when this lane's start is already at/after the end.
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
// Loop invariants: %f30 = float(itype * 11) where itype is the fourth
// component truncated to int; %rd39 = stride.
// Accumulators: %f33 = fx, %f32 = fy, %f31 = fz, %f34 = energy.
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown
.loc 16 292 0
// Packed neighbor word: bits 29..0 (mask 1073741823 = 0x3FFFFFFF) are the
// neighbor index; bits 31..30 are decoded later, inside the energy branch.
ld.global.s32 %r32, [%rd31+0];
.loc 16 296 0
and.b32 %r33, %r32, 1073741823;
mov.u32 %r34, %r33;
mov.s32 %r35, 0;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
tex.1d.v4.f32.s32 {%f35,%f36,%f37,%f38},[pos_tex,{%r34,%r36,%r38,%r40}];
mov.f32 %f39, %f35;
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
// %f44 = dx, %f43 = dy, %f45 = dz (atom i minus neighbor j);
// %f46 = dy*dy, %f48 = dx*dx + dy*dy + dz*dz.
sub.ftz.f32 %f43, %f27, %f40;
sub.ftz.f32 %f44, %f26, %f39;
sub.ftz.f32 %f45, %f28, %f41;
mul.ftz.f32 %f46, %f43, %f43;
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
// Table index = int(float(itype*11) + j's fourth component); %rd41 is the
// byte offset (16 per entry), reused for the lj3 lookup below.
add.ftz.f32 %f49, %f30, %f42;
cvt.rzi.ftz.s32.f32 %r41, %f49;
cvt.s64.s32 %rd40, %r41;
mul.wide.s32 %rd41, %r41, 16;
add.u64 %rd42, %rd41, %rd7;
ld.shared.f32 %f50, [%rd42+8];
// Only interact when the squared distance is below lj1[idx] component +8.
setp.gt.ftz.f32 %p7, %f50, %f48;
@!%p7 bra $Lt_1_25602;
.loc 16 309 0
// %f51 = 1/rsq (approx), %f53 = %f51^3, %f54 = sqrt(%f53), %f55 = %f51^4.
// %f60 = %f55 * (lj1.x * %f54 - lj1.y)
// NOTE(review): unlike kernel_pair, %f60 is not multiplied by the sp_lj
// factor selected by the neighbor's top two bits; that factor is applied to
// the energy term only. Verify against the .cu source whether the force was
// meant to be scaled as well.
rcp.approx.ftz.f32 %f51, %f48;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
sqrt.approx.ftz.f32 %f54, %f53;
mul.ftz.f32 %f55, %f51, %f53;
ld.shared.v2.f32 {%f56,%f57}, [%rd42+0];
mul.ftz.f32 %f58, %f56, %f54;
sub.ftz.f32 %f59, %f58, %f57;
mul.ftz.f32 %f60, %f55, %f59;
.loc 16 311 0
fma.rn.ftz.f32 %f33, %f44, %f60, %f33;
.loc 16 312 0
fma.rn.ftz.f32 %f32, %f43, %f60, %f32;
.loc 16 313 0
fma.rn.ftz.f32 %f31, %f45, %f60, %f31;
ld.param.s32 %r42, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p8, %r42, %r43;
@%p8 bra $Lt_1_25090;
.loc 16 316 0
// Energy (eflag > 0):
//   energy += sp_lj[top two bits] * (%f53 * (lj3.x * %f54 - lj3.y) - lj3.z)
add.u64 %rd43, %rd41, %rd13;
ld.shared.v4.f32 {%f61,%f62,%f63,_}, [%rd43+0];
mul.ftz.f32 %f64, %f61, %f54;
sub.ftz.f32 %f65, %f64, %f62;
mul.ftz.f32 %f66, %f53, %f65;
.loc 16 317 0
shr.s32 %r44, %r32, 30;
and.b32 %r45, %r44, 3;
cvt.s64.s32 %rd44, %r45;
mul.wide.s32 %rd45, %r45, 4;
add.u64 %rd46, %rd1, %rd45;
ld.shared.f32 %f67, [%rd46+0];
sub.ftz.f32 %f68, %f66, %f63;
fma.rn.ftz.f32 %f34, %f67, %f68, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 320 0
// Virial (vflag > 0), each term scaled by %f60:
//   %f11 += dx*dx   %f13 += dy*dy   %f15 += dz*dz
//   %f17 += dx*dy   %f19 += dx*dz   %f21 += dy*dz
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f44, %f44;
fma.rn.ftz.f32 %f71, %f60, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 321 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f60, %f46, %f72;
mov.f32 %f13, %f73;
.loc 16 322 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f60, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 323 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f43, %f44;
fma.rn.ftz.f32 %f79, %f60, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 324 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f44, %f45;
fma.rn.ftz.f32 %f82, %f60, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 325 0
// The sixth component is accumulated in %f20 and mirrored into %f21.
mul.ftz.f32 %f83, %f43, %f45;
fma.rn.ftz.f32 %f20, %f60, %f83, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 290 0
// Advance this lane's cursor by the stride (in ints) and loop while it is
// still before the list end.
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
// Empty-list path: force and energy accumulators start (and stay) at zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
// ii >= inum path: same zero initialisation.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
// The reduction is only needed when more than one thread shares an atom.
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 336 0
// Publish fx, fy, fz and energy into this thread's red_acc slots
// (%rd51 = red_acc + tid.x*4; quantities are 512 bytes apart).
mov.u64 %rd48, __cuda___cuda_local_var_32737_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f84, %f33;
st.shared.f32 [%rd51+0], %f84;
.loc 16 337 0
mov.f32 %f85, %f32;
st.shared.f32 [%rd51+512], %f85;
.loc 16 338 0
mov.f32 %f86, %f31;
st.shared.f32 [%rd51+1024], %f86;
.loc 16 339 0
mov.f32 %f87, %f34;
st.shared.f32 [%rd51+1536], %f87;
.loc 16 341 0
// %r53 = t_per_atom / 2 (signed divide rounding toward zero); %r54 is the
// running offset s, halved each pass until it reaches zero.
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
// Lanes with lane < s add the slots of thread tid.x + s into their own.
// NOTE(review): no bar.sync is issued anywhere in this loop; it presumably
// relies on the cooperating threads executing in lockstep within a warp --
// confirm that t_per_atom never spans warps.
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 344 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f88, [%rd54+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f89, [%rd54+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f90, [%rd54+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f91, [%rd54+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd51+1536], %f87;
$Lt_1_27650:
.loc 16 341 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 348 0
// Copy the reduced values back into the force/energy registers.
mov.f32 %f33, %f84;
.loc 16 349 0
mov.f32 %f32, %f85;
.loc 16 350 0
mov.f32 %f31, %f86;
.loc 16 351 0
mov.f32 %f34, %f87;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 355 0
// Second reduction (vflag > 0): the six virial components reuse the same
// scratch area, now using all six 512-byte slots.
mov.f32 %f84, %f11;
st.shared.f32 [%rd51+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd51+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd51+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd51+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd51+2048], %f92;
mov.f32 %f93, %f21;
st.shared.f32 [%rd51+2560], %f93;
.loc 16 357 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
// Same halving scheme as above, again without a barrier.
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 360 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f94, [%rd57+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f95, [%rd57+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f96, [%rd57+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f97, [%rd57+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd51+1536], %f87;
ld.shared.f32 %f98, [%rd57+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd51+2048], %f92;
ld.shared.f32 %f99, [%rd57+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd51+2560], %f93;
$Lt_1_29698:
.loc 16 357 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 365 0
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28418:
$Lt_1_26370:
// Only the thread with (ii < inum) AND (lane == 0) writes results.
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 371 0
// %rd61 = &engv[ii]; it advances by inum floats after each stored quantity.
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 373 0
st.global.f32 [%rd61+0], %f34;
.loc 16 374 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 378 0
// Store the six virial components, consecutive ones inum floats apart.
mov.f32 %f100, %f11;
st.global.f32 [%rd61+0], %f100;
.loc 16 379 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 378 0
mov.f32 %f101, %f13;
st.global.f32 [%rd66+0], %f101;
.loc 16 379 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 378 0
mov.f32 %f102, %f15;
st.global.f32 [%rd67+0], %f102;
.loc 16 379 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 378 0
mov.f32 %f103, %f17;
st.global.f32 [%rd68+0], %f103;
.loc 16 379 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 378 0
mov.f32 %f104, %f19;
st.global.f32 [%rd61+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f105;
$Lt_1_31490:
.loc 16 382 0
// ans[ii] = {fx, fy, fz, %f106}.
// NOTE(review): %f107 is never written in this kernel, so the fourth stored
// component is unspecified; consumers presumably ignore it -- confirm.
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f106, %f107;
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
$Lt_1_30466:
.loc 16 384 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,979 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bd91_00000000-9_lj_cut_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.gvU1PY)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bd91_00000000-8_lj_cut_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj_cut_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair
//
// Each group of t_per_atom consecutive threads handles one atom
//   ii = tid.x / t_per_atom + ctaid.x * (ntid.x / t_per_atom)
// and walks that atom's neighbor entries with a stride of t_per_atom.
// Per-thread partial force, energy and virial sums are then combined
// through the shared red_acc buffer, and the lane with
// tid.x % t_per_atom == 0 writes the results to global memory.
//
// Parameters (as used by the code below):
//   x_          not read in this body; positions come from the pos_tex texture
//   lj1         16-byte records indexed by lj_types*type_i + type_j:
//               +0,+4 force coefficients, +8 value compared against r^2
//   lj3         16-byte records, same index: +0,+4,+8 energy coefficients
//   lj_types    row stride of the lj1/lj3 tables
//   sp_lj_in    four f32 scale factors, selected by bits 31:30 of a neighbor entry
//   dev_nbor    neighbor storage; rows are nbor_pitch elements apart
//   dev_packed  when equal to dev_nbor the neighbors are read strided by
//               nbor_pitch, otherwise they are read contiguously from dev_packed
//   ans         one 16-byte record per atom: {fx, fy, fz, 0}
//   engv        energy (if eflag > 0) then six virial terms (if vflag > 0),
//               consecutive quantities are inum elements apart
//   eflag/vflag values > 0 enable energy / virial accumulation
//   inum        number of atoms handled; threads with ii >= inum do no pair work
//   nbor_pitch  element stride of dev_nbor
//   t_per_atom  number of threads cooperating on one atom
//
// NOTE(review): red_acc rows are 512 bytes (128 floats) apart and are indexed
// by tid.x, so this presumably expects ntid.x <= 128 -- confirm at the launch site.
// NOTE(review): the shared-memory reduction has no bar.sync between steps; it
// presumably relies on the t_per_atom lanes running in lock-step inside one
// warp -- confirm t_per_atom never exceeds the warp size.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<102>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32581_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
// Every thread copies the four sp_lj_in factors into the shared sp_lj array.
.loc 16 95 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
// Zero the six virial accumulators (%f6, %f8, %f10, %f12, %f14, %f16).
.loc 16 107 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// %r6 = lane within the atom group (tid % t_per_atom), %r9 = atom index ii,
// %p1 = (ii < inum).
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
// Neighbor header for atom ii: %rd7 = &dev_nbor[ii] (atom id),
// one pitch further holds the neighbor count (%r12), %rd9 is one pitch beyond that.
.loc 16 113 0
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
// dev_packed == dev_nbor: neighbors are strided by nbor_pitch.
// %rd13 = end pointer, %rd16 = this lane's first entry, %r16 = element stride.
.loc 16 119 0
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
// dev_packed != dev_nbor: the word at %rd9 is an offset into dev_packed,
// neighbors are contiguous there and the stride is t_per_atom elements.
.loc 16 123 0
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
// Fetch atom i from pos_tex: %f21,%f22,%f23 = x,y,z; %f24 = fourth component,
// truncated to an integer below and used as the table row index.
.loc 16 129 0
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Skip the loop entirely when this lane has no neighbor entries.
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
// Force accumulators: %f27 = fx, %f26 = fy, %f25 = fz; %f28 = energy.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
ld.global.s32 %r29, [%rd16+0];
// Bits 31:30 of the neighbor entry select the sp_lj scale factor (%f29).
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
// Low 30 bits are the neighbor's atom index j; fetch its position record.
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r40, %f37;
// %f39,%f38,%f40 = dx,dy,dz (i minus j); %f43 = dx^2 + dy^2 + dz^2.
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
// Table index = lj_types*type_i + type_j; the pair only contributes when
// lj1[index] field at +8 is greater than r^2.
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
// With q = 1/r^2 (approximate reciprocal):
//   %f54 = factor * q^4 * (lj1.x * q^3 - lj1.y)
.loc 16 153 0
rcp.approx.ftz.f32 %f45, %f43;
mul.ftz.f32 %f46, %f45, %f45;
mul.ftz.f32 %f47, %f45, %f46;
mul.ftz.f32 %f48, %f45, %f47;
ld.global.v2.f32 {%f49,%f50}, [%rd32+0];
mul.ftz.f32 %f51, %f49, %f47;
sub.ftz.f32 %f52, %f51, %f50;
mul.ftz.f32 %f53, %f48, %f52;
mul.ftz.f32 %f54, %f29, %f53;
.loc 16 155 0
fma.rn.ftz.f32 %f27, %f39, %f54, %f27;
.loc 16 156 0
fma.rn.ftz.f32 %f26, %f38, %f54, %f26;
.loc 16 157 0
fma.rn.ftz.f32 %f25, %f40, %f54, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
// Energy: %f28 += factor * (q^3 * (lj3.x * q^3 - lj3.y) - lj3.z)
.loc 16 161 0
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd34+0];
mul.ftz.f32 %f58, %f55, %f47;
sub.ftz.f32 %f59, %f58, %f56;
mul.ftz.f32 %f60, %f47, %f59;
sub.ftz.f32 %f61, %f60, %f57;
fma.rn.ftz.f32 %f28, %f29, %f61, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
// Virial terms scaled by %f54, in order: xx, yy, zz, xy, xz, yz.
.loc 16 164 0
mov.f32 %f62, %f6;
mul.ftz.f32 %f63, %f39, %f39;
fma.rn.ftz.f32 %f64, %f54, %f63, %f62;
mov.f32 %f6, %f64;
.loc 16 165 0
mov.f32 %f65, %f8;
fma.rn.ftz.f32 %f66, %f54, %f41, %f65;
mov.f32 %f8, %f66;
.loc 16 166 0
mov.f32 %f67, %f10;
mul.ftz.f32 %f68, %f40, %f40;
fma.rn.ftz.f32 %f69, %f54, %f68, %f67;
mov.f32 %f10, %f69;
.loc 16 167 0
mov.f32 %f70, %f12;
mul.ftz.f32 %f71, %f38, %f39;
fma.rn.ftz.f32 %f72, %f54, %f71, %f70;
mov.f32 %f12, %f72;
.loc 16 168 0
mov.f32 %f73, %f14;
mul.ftz.f32 %f74, %f39, %f40;
fma.rn.ftz.f32 %f75, %f54, %f74, %f73;
mov.f32 %f14, %f75;
.loc 16 169 0
mul.ftz.f32 %f76, %f38, %f40;
fma.rn.ftz.f32 %f15, %f54, %f76, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
// Advance to this lane's next neighbor entry; loop while below the end pointer.
.loc 16 133 0
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
// Lane had no neighbor entries: force and energy sums are zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
// ii >= inum: force and energy sums are zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
// The shared-memory reduction is only needed when t_per_atom > 1.
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
// Publish fx, fy, fz, energy to red_acc rows 0..3 (rows are 512 bytes apart),
// column tid.x.
.loc 16 180 0
mov.u64 %rd36, __cuda___cuda_local_var_32581_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f77, %f27;
st.shared.f32 [%rd39+0], %f77;
.loc 16 181 0
mov.f32 %f78, %f26;
st.shared.f32 [%rd39+512], %f78;
.loc 16 182 0
mov.f32 %f79, %f25;
st.shared.f32 [%rd39+1024], %f79;
.loc 16 183 0
mov.f32 %f80, %f28;
st.shared.f32 [%rd39+1536], %f80;
// %r51 = t_per_atom / 2 (signed, rounded toward zero); halved after each step.
.loc 16 185 0
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
// Lanes with (tid % t_per_atom) < s add column tid+s into their own column.
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 188 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f81, [%rd42+0];
add.ftz.f32 %f77, %f81, %f77;
st.shared.f32 [%rd39+0], %f77;
ld.shared.f32 %f82, [%rd42+512];
add.ftz.f32 %f78, %f82, %f78;
st.shared.f32 [%rd39+512], %f78;
ld.shared.f32 %f83, [%rd42+1024];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd39+1024], %f79;
ld.shared.f32 %f84, [%rd42+1536];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd39+1536], %f80;
$Lt_0_23810:
.loc 16 185 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
// Copy the reduced force and energy back into the accumulator registers.
.loc 16 192 0
mov.f32 %f27, %f77;
.loc 16 193 0
mov.f32 %f26, %f78;
.loc 16 194 0
mov.f32 %f25, %f79;
.loc 16 195 0
mov.f32 %f28, %f80;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
// Same reduction for the six virial terms, using red_acc rows 0..5.
.loc 16 199 0
mov.f32 %f77, %f6;
st.shared.f32 [%rd39+0], %f77;
mov.f32 %f78, %f8;
st.shared.f32 [%rd39+512], %f78;
mov.f32 %f79, %f10;
st.shared.f32 [%rd39+1024], %f79;
mov.f32 %f80, %f12;
st.shared.f32 [%rd39+1536], %f80;
mov.f32 %f85, %f14;
st.shared.f32 [%rd39+2048], %f85;
mov.f32 %f86, %f16;
st.shared.f32 [%rd39+2560], %f86;
.loc 16 201 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 204 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f87, [%rd45+0];
add.ftz.f32 %f77, %f87, %f77;
st.shared.f32 [%rd39+0], %f77;
ld.shared.f32 %f88, [%rd45+512];
add.ftz.f32 %f78, %f88, %f78;
st.shared.f32 [%rd39+512], %f78;
ld.shared.f32 %f89, [%rd45+1024];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd39+1024], %f79;
ld.shared.f32 %f90, [%rd45+1536];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd39+1536], %f80;
ld.shared.f32 %f91, [%rd45+2048];
add.ftz.f32 %f85, %f91, %f85;
st.shared.f32 [%rd39+2048], %f85;
ld.shared.f32 %f92, [%rd45+2560];
add.ftz.f32 %f86, %f92, %f86;
st.shared.f32 [%rd39+2560], %f86;
$Lt_0_25858:
.loc 16 201 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 209 0
mov.f32 %f6, %f77;
mov.f32 %f8, %f78;
mov.f32 %f10, %f79;
mov.f32 %f12, %f80;
mov.f32 %f14, %f85;
mov.f32 %f16, %f86;
$Lt_0_24578:
$Lt_0_22530:
// Only the lane with (ii < inum) && (tid % t_per_atom == 0) writes results.
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
// %rd49 walks engv starting at element ii, advancing inum elements per value.
.loc 16 215 0
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 217 0
st.global.f32 [%rd49+0], %f28;
.loc 16 218 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
// Store the six virial terms, each inum elements after the previous one.
.loc 16 222 0
mov.f32 %f93, %f6;
st.global.f32 [%rd49+0], %f93;
.loc 16 223 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 222 0
mov.f32 %f94, %f8;
st.global.f32 [%rd54+0], %f94;
.loc 16 223 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 222 0
mov.f32 %f95, %f10;
st.global.f32 [%rd55+0], %f95;
.loc 16 223 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 222 0
mov.f32 %f96, %f12;
st.global.f32 [%rd56+0], %f96;
.loc 16 223 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 222 0
mov.f32 %f97, %f14;
st.global.f32 [%rd49+0], %f97;
mov.f32 %f98, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f98;
$Lt_0_27650:
// ans[ii] = {fx, fy, fz, 0}. The fourth lane was previously copied from
// %f100, a register that is never written in this kernel, so the stored
// value was undefined; it is now an explicit zero.
.loc 16 226 0
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f99, 0f00000000; // 0
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f99};
$Lt_0_26626:
.loc 16 228 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//
// Same per-atom pair accumulation as the sibling kernel_pair entry, but the
// coefficient tables are first staged in shared memory:
//   threads with tid.x <= 3   copy sp_lj_in[tid] into shared sp_lj
//   threads with tid.x <= 120 copy lj1_in[tid] (16 bytes) into shared lj1 and,
//                             when eflag > 0, lj3_in[tid] into shared lj3
// followed by a bar.sync. The shared tables hold 121 records (1936 bytes) and
// are indexed by 11*type_i + type_j.
//
// Parameters (as used by the code below):
//   x_          not read in this body; positions come from the pos_tex texture
//   lj1_in      16-byte records: +0,+4 force coefficients, +8 value compared
//               against r^2
//   lj3_in      16-byte records: +0,+4,+8 energy coefficients
//   sp_lj_in    four f32 scale factors, selected by bits 31:30 of a neighbor entry
//   dev_nbor    neighbor storage; rows are nbor_pitch elements apart
//   dev_packed  when equal to dev_nbor the neighbors are read strided by
//               nbor_pitch, otherwise they are read contiguously from dev_packed
//   ans         one 16-byte record per atom: {fx, fy, fz, 0}
//   engv        energy (if eflag > 0) then six virial terms (if vflag > 0),
//               consecutive quantities are inum elements apart
//   eflag/vflag values > 0 enable energy / virial accumulation
//   inum        number of atoms handled; threads with ii >= inum do no pair work
//   nbor_pitch  element stride of dev_nbor
//   t_per_atom  number of threads cooperating on one atom
//
// NOTE(review): a full table load needs threads 0..120 to exist, and red_acc
// rows are 512 bytes (128 floats) apart, so this presumably expects
// 121 <= ntid.x <= 128 -- confirm at the launch site.
// NOTE(review): the shared-memory reduction has no bar.sync between steps; it
// presumably relies on the t_per_atom lanes running in lock-step inside one
// warp -- confirm t_per_atom never exceeds the warp size.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<109>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32647_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32645_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32735_35_non_const_red_acc7168[3072];
// __cuda_local_var_32657_10_non_const_f = 48
// __cuda_local_var_32661_9_non_const_virial = 16
.loc 16 236 0
$LDWbegin_kernel_pair_fast:
// Threads 0..3 stage one sp_lj_in factor each into shared sp_lj.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 246 0
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
// Threads 0..120 stage one lj1 record each (and one lj3 record when eflag > 0).
mov.u64 %rd1, __cuda___cuda_local_var_32647_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 248 0
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 250 0
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
$Lt_1_21762:
mov.u64 %rd7, __cuda___cuda_local_var_32645_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32646_34_non_const_lj35232;
// Zero the six virial accumulators (%f11, %f13, %f15, %f17, %f19, %f21).
.loc 16 260 0
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
// Barrier: the staged shared tables must be complete before any thread reads them.
.loc 16 262 0
bar.sync 0;
// %r10 = lane within the atom group (tid % t_per_atom), %r13 = atom index ii,
// %p4 = (ii < inum).
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
// Neighbor header for atom ii: %rd22 = &dev_nbor[ii] (atom id),
// one pitch further holds the neighbor count (%r16), %rd24 is one pitch beyond that.
.loc 16 268 0
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
// dev_packed == dev_nbor: neighbors are strided by nbor_pitch.
// %rd28 = end pointer, %rd31 = this lane's first entry, %r20 = element stride.
.loc 16 274 0
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 275 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 276 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
// dev_packed != dev_nbor: the word at %rd24 is an offset into dev_packed,
// neighbors are contiguous there and the stride is t_per_atom elements.
.loc 16 278 0
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 279 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 280 0
mov.s32 %r20, %r6;
.loc 16 281 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
// Fetch atom i from pos_tex: %f26,%f27,%f28 = x,y,z; %f29 = fourth component,
// truncated to an integer below and used as the table row index.
.loc 16 284 0
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Skip the loop entirely when this lane has no neighbor entries.
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
// %f30 = float(11 * type_i): the row offset is kept in floating point and
// added to the neighbor's fourth component inside the loop.
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
// Force accumulators: %f33 = fx, %f32 = fy, %f31 = fz; %f34 = energy.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 284, nesting depth: 1, estimated iterations: unknown
.loc 16 291 0
ld.global.s32 %r32, [%rd31+0];
// Bits 31:30 of the neighbor entry select the sp_lj scale factor (%f35).
.loc 16 292 0
shr.s32 %r33, %r32, 30;
and.b32 %r34, %r33, 3;
cvt.s64.s32 %rd40, %r34;
mul.wide.s32 %rd41, %r34, 4;
add.u64 %rd42, %rd1, %rd41;
ld.shared.f32 %f35, [%rd42+0];
// Low 30 bits are the neighbor's atom index j; fetch its position record.
.loc 16 295 0
and.b32 %r35, %r32, 1073741823;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
// %f45,%f44,%f46 = dx,dy,dz (i minus j); %f49 = dx^2 + dy^2 + dz^2.
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
// Table index = trunc(11*type_i + w_j); the pair only contributes when the
// shared lj1[index] field at +8 is greater than r^2.
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r43, %f50;
cvt.s64.s32 %rd43, %r43;
mul.wide.s32 %rd44, %r43, 16;
add.u64 %rd45, %rd44, %rd7;
ld.shared.f32 %f51, [%rd45+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25602;
// With q = 1/r^2 (approximate reciprocal):
//   %f61 = factor * q^4 * (lj1.x * q^3 - lj1.y)
.loc 16 307 0
rcp.approx.ftz.f32 %f52, %f49;
mul.ftz.f32 %f53, %f52, %f52;
mul.ftz.f32 %f54, %f52, %f53;
mul.ftz.f32 %f55, %f52, %f35;
mul.ftz.f32 %f56, %f54, %f55;
ld.shared.v2.f32 {%f57,%f58}, [%rd45+0];
mul.ftz.f32 %f59, %f57, %f54;
sub.ftz.f32 %f60, %f59, %f58;
mul.ftz.f32 %f61, %f56, %f60;
.loc 16 309 0
fma.rn.ftz.f32 %f33, %f45, %f61, %f33;
.loc 16 310 0
fma.rn.ftz.f32 %f32, %f44, %f61, %f32;
.loc 16 311 0
fma.rn.ftz.f32 %f31, %f46, %f61, %f31;
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r45, 0;
setp.le.s32 %p8, %r44, %r45;
@%p8 bra $Lt_1_25090;
// Energy: %f34 += factor * (q^3 * (lj3.x * q^3 - lj3.y) - lj3.z)
.loc 16 314 0
add.u64 %rd46, %rd44, %rd13;
ld.shared.v4.f32 {%f62,%f63,%f64,_}, [%rd46+0];
mul.ftz.f32 %f65, %f62, %f54;
sub.ftz.f32 %f66, %f65, %f63;
mul.ftz.f32 %f67, %f54, %f66;
.loc 16 315 0
sub.ftz.f32 %f68, %f67, %f64;
fma.rn.ftz.f32 %f34, %f35, %f68, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
// Virial terms scaled by %f61, in order: xx, yy, zz, xy, xz, yz.
.loc 16 318 0
mov.f32 %f69, %f11;
mul.ftz.f32 %f70, %f45, %f45;
fma.rn.ftz.f32 %f71, %f61, %f70, %f69;
mov.f32 %f11, %f71;
.loc 16 319 0
mov.f32 %f72, %f13;
fma.rn.ftz.f32 %f73, %f61, %f47, %f72;
mov.f32 %f13, %f73;
.loc 16 320 0
mov.f32 %f74, %f15;
mul.ftz.f32 %f75, %f46, %f46;
fma.rn.ftz.f32 %f76, %f61, %f75, %f74;
mov.f32 %f15, %f76;
.loc 16 321 0
mov.f32 %f77, %f17;
mul.ftz.f32 %f78, %f44, %f45;
fma.rn.ftz.f32 %f79, %f61, %f78, %f77;
mov.f32 %f17, %f79;
.loc 16 322 0
mov.f32 %f80, %f19;
mul.ftz.f32 %f81, %f45, %f46;
fma.rn.ftz.f32 %f82, %f61, %f81, %f80;
mov.f32 %f19, %f82;
.loc 16 323 0
mul.ftz.f32 %f83, %f44, %f46;
fma.rn.ftz.f32 %f20, %f61, %f83, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
// Advance to this lane's next neighbor entry; loop while below the end pointer.
.loc 16 289 0
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
// Lane had no neighbor entries: force and energy sums are zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
// ii >= inum: force and energy sums are zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
// The shared-memory reduction is only needed when t_per_atom > 1.
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
// Publish fx, fy, fz, energy to red_acc rows 0..3 (rows are 512 bytes apart),
// column tid.x.
.loc 16 334 0
mov.u64 %rd48, __cuda___cuda_local_var_32735_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f84, %f33;
st.shared.f32 [%rd51+0], %f84;
.loc 16 335 0
mov.f32 %f85, %f32;
st.shared.f32 [%rd51+512], %f85;
.loc 16 336 0
mov.f32 %f86, %f31;
st.shared.f32 [%rd51+1024], %f86;
.loc 16 337 0
mov.f32 %f87, %f34;
st.shared.f32 [%rd51+1536], %f87;
// %r53 = t_per_atom / 2 (signed, rounded toward zero); halved after each step.
.loc 16 339 0
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
// Lanes with (tid % t_per_atom) < s add column tid+s into their own column.
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 342 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f88, [%rd54+0];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f89, [%rd54+512];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f90, [%rd54+1024];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f91, [%rd54+1536];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd51+1536], %f87;
$Lt_1_27650:
.loc 16 339 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
// Copy the reduced force and energy back into the accumulator registers.
.loc 16 346 0
mov.f32 %f33, %f84;
.loc 16 347 0
mov.f32 %f32, %f85;
.loc 16 348 0
mov.f32 %f31, %f86;
.loc 16 349 0
mov.f32 %f34, %f87;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
// Same reduction for the six virial terms, using red_acc rows 0..5.
.loc 16 353 0
mov.f32 %f84, %f11;
st.shared.f32 [%rd51+0], %f84;
mov.f32 %f85, %f13;
st.shared.f32 [%rd51+512], %f85;
mov.f32 %f86, %f15;
st.shared.f32 [%rd51+1024], %f86;
mov.f32 %f87, %f17;
st.shared.f32 [%rd51+1536], %f87;
mov.f32 %f92, %f19;
st.shared.f32 [%rd51+2048], %f92;
mov.f32 %f93, %f21;
st.shared.f32 [%rd51+2560], %f93;
.loc 16 355 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 358 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f94, [%rd57+0];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd51+0], %f84;
ld.shared.f32 %f95, [%rd57+512];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd51+512], %f85;
ld.shared.f32 %f96, [%rd57+1024];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd51+1024], %f86;
ld.shared.f32 %f97, [%rd57+1536];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd51+1536], %f87;
ld.shared.f32 %f98, [%rd57+2048];
add.ftz.f32 %f92, %f98, %f92;
st.shared.f32 [%rd51+2048], %f92;
ld.shared.f32 %f99, [%rd57+2560];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd51+2560], %f93;
$Lt_1_29698:
.loc 16 355 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 363 0
mov.f32 %f11, %f84;
mov.f32 %f13, %f85;
mov.f32 %f15, %f86;
mov.f32 %f17, %f87;
mov.f32 %f19, %f92;
mov.f32 %f21, %f93;
$Lt_1_28418:
$Lt_1_26370:
// Only the lane with (ii < inum) && (tid % t_per_atom == 0) writes results.
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
// %rd61 walks engv starting at element ii, advancing inum elements per value.
.loc 16 369 0
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 371 0
st.global.f32 [%rd61+0], %f34;
.loc 16 372 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
// Store the six virial terms, each inum elements after the previous one.
.loc 16 376 0
mov.f32 %f100, %f11;
st.global.f32 [%rd61+0], %f100;
.loc 16 377 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 376 0
mov.f32 %f101, %f13;
st.global.f32 [%rd66+0], %f101;
.loc 16 377 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 376 0
mov.f32 %f102, %f15;
st.global.f32 [%rd67+0], %f102;
.loc 16 377 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 376 0
mov.f32 %f103, %f17;
st.global.f32 [%rd68+0], %f103;
.loc 16 377 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 376 0
mov.f32 %f104, %f19;
st.global.f32 [%rd61+0], %f104;
mov.f32 %f105, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f105;
$Lt_1_31490:
// ans[ii] = {fx, fy, fz, 0}. The fourth lane was previously copied from
// %f107, a register that is never written in this kernel, so the stored
// value was undefined; it is now an explicit zero.
.loc 16 380 0
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f106, 0f00000000; // 0
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f106};
$Lt_1_30466:
.loc 16 382 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,993 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000be22_00000000-9_lj_expand_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.LdVC9u)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000be22_00000000-8_lj_expand_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "lj_expand_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_lj1,
.param .u64 __cudaparm_kernel_pair_lj3,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<62>;
.reg .f32 %f<107>;
.reg .pred %p<19>;
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32584_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj1];
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
//<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd31, %rd25;
ld.global.f32 %f44, [%rd32+8];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 151 0
sqrt.approx.ftz.f32 %f45, %f43;
ld.global.v4.f32 {%f46,%f47,_,%f48}, [%rd32+0];
sub.ftz.f32 %f49, %f45, %f48;
.loc 16 156 0
mul.ftz.f32 %f50, %f49, %f49;
rcp.approx.ftz.f32 %f51, %f50;
mul.ftz.f32 %f52, %f51, %f51;
mul.ftz.f32 %f53, %f51, %f52;
div.approx.ftz.f32 %f54, %f29, %f49;
div.approx.ftz.f32 %f55, %f54, %f45;
mul.ftz.f32 %f56, %f46, %f53;
sub.ftz.f32 %f57, %f56, %f47;
mul.ftz.f32 %f58, %f53, %f57;
mul.ftz.f32 %f59, %f55, %f58;
.loc 16 158 0
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
.loc 16 159 0
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
.loc 16 160 0
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 164 0
ld.param.u64 %rd33, [__cudaparm_kernel_pair_lj3];
add.u64 %rd34, %rd33, %rd31;
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd34+0];
mul.ftz.f32 %f63, %f60, %f53;
sub.ftz.f32 %f64, %f63, %f61;
mul.ftz.f32 %f65, %f53, %f64;
sub.ftz.f32 %f66, %f65, %f62;
fma.rn.ftz.f32 %f28, %f29, %f66, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 167 0
mov.f32 %f67, %f6;
mul.ftz.f32 %f68, %f39, %f39;
fma.rn.ftz.f32 %f69, %f59, %f68, %f67;
mov.f32 %f6, %f69;
.loc 16 168 0
mov.f32 %f70, %f8;
fma.rn.ftz.f32 %f71, %f59, %f41, %f70;
mov.f32 %f8, %f71;
.loc 16 169 0
mov.f32 %f72, %f10;
mul.ftz.f32 %f73, %f40, %f40;
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
mov.f32 %f10, %f74;
.loc 16 170 0
mov.f32 %f75, %f12;
mul.ftz.f32 %f76, %f38, %f39;
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
mov.f32 %f12, %f77;
.loc 16 171 0
mov.f32 %f78, %f14;
mul.ftz.f32 %f79, %f39, %f40;
fma.rn.ftz.f32 %f80, %f59, %f79, %f78;
mov.f32 %f14, %f80;
.loc 16 172 0
mul.ftz.f32 %f81, %f38, %f40;
fma.rn.ftz.f32 %f15, %f59, %f81, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
mul.lo.u64 %rd35, %rd24, 4;
add.u64 %rd16, %rd16, %rd35;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 183 0
mov.u64 %rd36, __cuda___cuda_local_var_32584_35_non_const_red_acc108;
cvt.s64.s32 %rd37, %r2;
mul.wide.s32 %rd38, %r2, 4;
add.u64 %rd39, %rd36, %rd38;
mov.f32 %f82, %f27;
st.shared.f32 [%rd39+0], %f82;
.loc 16 184 0
mov.f32 %f83, %f26;
st.shared.f32 [%rd39+512], %f83;
.loc 16 185 0
mov.f32 %f84, %f25;
st.shared.f32 [%rd39+1024], %f84;
.loc 16 186 0
mov.f32 %f85, %f28;
st.shared.f32 [%rd39+1536], %f85;
.loc 16 188 0
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 191 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd40, %r54;
mul.wide.u32 %rd41, %r54, 4;
add.u64 %rd42, %rd36, %rd41;
ld.shared.f32 %f86, [%rd42+0];
add.ftz.f32 %f82, %f86, %f82;
st.shared.f32 [%rd39+0], %f82;
ld.shared.f32 %f87, [%rd42+512];
add.ftz.f32 %f83, %f87, %f83;
st.shared.f32 [%rd39+512], %f83;
ld.shared.f32 %f88, [%rd42+1024];
add.ftz.f32 %f84, %f88, %f84;
st.shared.f32 [%rd39+1024], %f84;
ld.shared.f32 %f89, [%rd42+1536];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd39+1536], %f85;
$Lt_0_23810:
.loc 16 188 0
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 195 0
mov.f32 %f27, %f82;
.loc 16 196 0
mov.f32 %f26, %f83;
.loc 16 197 0
mov.f32 %f25, %f84;
.loc 16 198 0
mov.f32 %f28, %f85;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 202 0
mov.f32 %f82, %f6;
st.shared.f32 [%rd39+0], %f82;
mov.f32 %f83, %f8;
st.shared.f32 [%rd39+512], %f83;
mov.f32 %f84, %f10;
st.shared.f32 [%rd39+1024], %f84;
mov.f32 %f85, %f12;
st.shared.f32 [%rd39+1536], %f85;
mov.f32 %f90, %f14;
st.shared.f32 [%rd39+2048], %f90;
mov.f32 %f91, %f16;
st.shared.f32 [%rd39+2560], %f91;
.loc 16 204 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 207 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd43, %r59;
mul.wide.u32 %rd44, %r59, 4;
add.u64 %rd45, %rd36, %rd44;
ld.shared.f32 %f92, [%rd45+0];
add.ftz.f32 %f82, %f92, %f82;
st.shared.f32 [%rd39+0], %f82;
ld.shared.f32 %f93, [%rd45+512];
add.ftz.f32 %f83, %f93, %f83;
st.shared.f32 [%rd39+512], %f83;
ld.shared.f32 %f94, [%rd45+1024];
add.ftz.f32 %f84, %f94, %f84;
st.shared.f32 [%rd39+1024], %f84;
ld.shared.f32 %f95, [%rd45+1536];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd39+1536], %f85;
ld.shared.f32 %f96, [%rd45+2048];
add.ftz.f32 %f90, %f96, %f90;
st.shared.f32 [%rd39+2048], %f90;
ld.shared.f32 %f97, [%rd45+2560];
add.ftz.f32 %f91, %f97, %f91;
st.shared.f32 [%rd39+2560], %f91;
$Lt_0_25858:
.loc 16 204 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 212 0
mov.f32 %f6, %f82;
mov.f32 %f8, %f83;
mov.f32 %f10, %f84;
mov.f32 %f12, %f85;
mov.f32 %f14, %f90;
mov.f32 %f16, %f91;
$Lt_0_24578:
$Lt_0_22530:
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 218 0
cvt.s64.s32 %rd46, %r9;
ld.param.u64 %rd47, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd48, %r9, 4;
add.u64 %rd49, %rd47, %rd48;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 220 0
st.global.f32 [%rd49+0], %f28;
.loc 16 221 0
cvt.s64.s32 %rd50, %r10;
mul.wide.s32 %rd51, %r10, 4;
add.u64 %rd49, %rd49, %rd51;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 225 0
mov.f32 %f98, %f6;
st.global.f32 [%rd49+0], %f98;
.loc 16 226 0
cvt.s64.s32 %rd52, %r10;
mul.wide.s32 %rd53, %r10, 4;
add.u64 %rd54, %rd53, %rd49;
.loc 16 225 0
mov.f32 %f99, %f8;
st.global.f32 [%rd54+0], %f99;
.loc 16 226 0
add.u64 %rd55, %rd53, %rd54;
.loc 16 225 0
mov.f32 %f100, %f10;
st.global.f32 [%rd55+0], %f100;
.loc 16 226 0
add.u64 %rd56, %rd53, %rd55;
.loc 16 225 0
mov.f32 %f101, %f12;
st.global.f32 [%rd56+0], %f101;
.loc 16 226 0
add.u64 %rd49, %rd53, %rd56;
.loc 16 225 0
mov.f32 %f102, %f14;
st.global.f32 [%rd49+0], %f102;
mov.f32 %f103, %f16;
add.u64 %rd57, %rd53, %rd49;
st.global.f32 [%rd57+0], %f103;
$Lt_0_27650:
.loc 16 229 0
ld.param.u64 %rd58, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd59, %rd46, 16;
add.u64 %rd60, %rd58, %rd59;
mov.f32 %f104, %f105;
st.global.v4.f32 [%rd60+0], {%f27,%f26,%f25,%f104};
$Lt_0_26626:
.loc 16 231 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
//-----------------------------------------------------------
// kernel_pair_fast
//   Pair kernel variant that first copies the sp_lj, lj1 and lj3 tables
//   into shared memory. Groups of t_per_atom consecutive threads share one
//   atom slot: each thread walks a strided part of that slot's index list,
//   accumulates a 3-component force (plus an energy term when eflag > 0 and
//   six virial-style terms when vflag > 0), the partial sums are combined
//   through the shared red_acc buffer, and the thread whose lane
//   (tid % t_per_atom) is 0 stores the results.
// Parameters as used by the body:
//   x_          not referenced; positions are fetched through pos_tex
//   lj1_in      global table of 16-byte entries; threads 0..120 stage one each
//   lj3_in      same layout; staged only when eflag > 0
//   sp_lj_in    four f32 scale factors; threads 0..3 stage one each
//   dev_nbor    s32 array addressed in rows of nbor_pitch elements:
//               row 0 = texture index of the slot's atom, row 1 = list count,
//               row 2 = first list entry (or an offset into dev_packed)
//   dev_packed  list storage used when it differs from dev_nbor
//   ans         one 16-byte result per atom slot
//   engv        f32 outputs; successive quantities are inum elements apart
//   eflag/vflag > 0 enables the energy / six-term accumulation and output
//   inum        number of atom slots
//   nbor_pitch  row pitch of dev_nbor, in elements
//   t_per_atom  number of threads cooperating on one atom slot
// NOTE(review): names such as "neighbor list", "force", "virial" follow the
// parameter/compiler names; confirm against the originating .cu source.
//-----------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_lj1_in,
.param .u64 __cudaparm_kernel_pair_fast_lj3_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<74>;
.reg .f32 %f<114>;
.reg .f64 %fd<4>;
.reg .pred %p<22>;
// Shared buffers: sp_lj = 4 floats; lj1 and lj3 = 121 entries x 16 bytes
// (1936 bytes each); red_acc = 6 rows x 512 bytes used by the reductions.
.shared .align 4 .b8 __cuda___cuda_local_var_32650_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32648_34_non_const_lj13296[1936];
.shared .align 16 .b8 __cuda___cuda_local_var_32649_34_non_const_lj35232[1936];
.shared .align 4 .b8 __cuda___cuda_local_var_32742_35_non_const_red_acc7168[3072];
// __cuda_local_var_32660_10_non_const_f = 48
// __cuda_local_var_32664_9_non_const_virial = 16
.loc 16 239 0
$LDWbegin_kernel_pair_fast:
// ---- Stage sp_lj: threads with tid.x <= 3 copy sp_lj_in[tid] to shared.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 249 0
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
// ---- Stage lj1 (and lj3 when eflag > 0): threads with tid.x <= 120 copy
// one 16-byte entry each from global to shared memory.
mov.u64 %rd1, __cuda___cuda_local_var_32650_33_non_const_sp_lj3268;
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 251 0
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 253 0
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];
add.u64 %rd15, %rd14, %rd9;
add.u64 %rd16, %rd9, %rd13;
ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];
st.shared.v4.f32 [%rd16+0], {%f6,%f7,%f8,%f9};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
$Lt_1_21762:
// Table base addresses are re-materialized here so they are defined on
// every path that reaches this join point.
mov.u64 %rd7, __cuda___cuda_local_var_32648_34_non_const_lj13296;
mov.u64 %rd13, __cuda___cuda_local_var_32649_34_non_const_lj35232;
.loc 16 263 0
// Zero the six vflag-gated accumulators (%f11,%f13,%f15,%f17,%f19,%f21).
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
mov.f32 %f20, 0f00000000; // 0
mov.f32 %f21, %f20;
.loc 16 265 0
// Barrier: the shared tables staged above must be complete before any
// thread reads them in the loop below.
bar.sync 0;
// ---- Thread-to-atom mapping:
//   %r6  = t_per_atom
//   %r10 = tid % t_per_atom                         (lane within the group)
//   %r13 = tid / t_per_atom + ctaid.x * (ntid.x / t_per_atom)  (atom slot)
//   %p4  = (%r13 < inum); slots >= inum skip the accumulation entirely.
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 271 0
// ---- Locate this slot's list:
//   %rd22 = &dev_nbor[slot]            (row 0)
//   %rd23 = %rd22 + nbor_pitch*4 bytes (row 1) -> %r16 = entry count
//   %rd24 = %rd23 + nbor_pitch*4 bytes (row 2)
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd17, %r15;
mul.wide.s32 %rd18, %r15, 4;
cvt.s64.s32 %rd19, %r13;
mul.wide.s32 %rd20, %r13, 4;
ld.param.u64 %rd21, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd22, %rd20, %rd21;
add.u64 %rd23, %rd18, %rd22;
ld.global.s32 %r16, [%rd23+0];
add.u64 %rd24, %rd18, %rd23;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_fast_dev_packed];
setp.ne.u64 %p5, %rd25, %rd21;
@%p5 bra $Lt_1_23554;
.loc 16 277 0
// Case dev_packed == dev_nbor: the list is stored in dev_nbor itself, one
// entry per row. end = row2 + count*pitch, start = row2 + lane*pitch,
// stride (%r20) = pitch * t_per_atom elements.
cvt.s32.s64 %r17, %rd17;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd26, %r18;
mul.wide.s32 %rd27, %r18, 4;
add.u64 %rd28, %rd24, %rd27;
.loc 16 278 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd29, %r19;
mul.wide.s32 %rd30, %r19, 4;
add.u64 %rd31, %rd24, %rd30;
.loc 16 279 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 281 0
// Case dev_packed != dev_nbor: row 2 holds an element offset into
// dev_packed where the entries are contiguous. end = base + count,
// start = base + lane, stride (%r20) = t_per_atom elements.
ld.global.s32 %r21, [%rd24+0];
cvt.s64.s32 %rd32, %r21;
mul.wide.s32 %rd33, %r21, 4;
add.u64 %rd34, %rd25, %rd33;
.loc 16 282 0
cvt.s64.s32 %rd35, %r16;
mul.wide.s32 %rd36, %r16, 4;
add.u64 %rd28, %rd34, %rd36;
.loc 16 283 0
mov.s32 %r20, %r6;
.loc 16 284 0
cvt.s64.s32 %rd37, %r10;
mul.wide.s32 %rd38, %r10, 4;
add.u64 %rd31, %rd34, %rd38;
$Lt_1_23298:
.loc 16 287 0
// Fetch the slot atom's texel from pos_tex using the index stored in row 0.
// %f26,%f27,%f28 = first three components (x,y,z); %f29 = fourth (w).
ld.global.s32 %r22, [%rd22+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
mov.f32 %f28, %f24;
mov.f32 %f29, %f25;
// Nothing to do when this lane's start pointer is already at/after the end.
setp.ge.u64 %p6, %rd31, %rd28;
@%p6 bra $Lt_1_32002;
// %f30 = (float)(11 * (int)w_i): row offset into the 11x11 (=121 entry)
// shared tables. Presumably w carries the atom type - confirm.
cvt.rzi.ftz.s32.f32 %r30, %f29;
cvt.s64.s32 %rd39, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f30, %r31;
// Accumulators: %f33/%f32/%f31 = force x/y/z, %f34 = energy term.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_24322:
 //<loop> Loop body line 287, nesting depth: 1, estimated iterations: unknown
.loc 16 294 0
// Load one list entry. Its top two bits select one of the four sp_lj
// factors (%f35); the low 30 bits are the texture index of the other atom.
ld.global.s32 %r32, [%rd31+0];
.loc 16 295 0
shr.s32 %r33, %r32, 30;
and.b32 %r34, %r33, 3;
cvt.s64.s32 %rd40, %r34;
mul.wide.s32 %rd41, %r34, 4;
add.u64 %rd42, %rd1, %rd41;
ld.shared.f32 %f35, [%rd42+0];
.loc 16 298 0
and.b32 %r35, %r32, 1073741823;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r36,%r38,%r40,%r42}];
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
mov.f32 %f42, %f38;
mov.f32 %f43, %f39;
// Displacement: %f45 = dx, %f44 = dy, %f46 = dz; %f47 = dy*dy;
// %f49 = dx*dx + dy*dy + dz*dz (squared distance).
sub.ftz.f32 %f44, %f27, %f41;
sub.ftz.f32 %f45, %f26, %f40;
sub.ftz.f32 %f46, %f28, %f42;
mul.ftz.f32 %f47, %f44, %f44;
fma.rn.ftz.f32 %f48, %f45, %f45, %f47;
fma.rn.ftz.f32 %f49, %f46, %f46, %f48;
// Table index = (int)(%f30 + w_j); %rd44 = byte offset (16 bytes/entry).
add.ftz.f32 %f50, %f30, %f43;
cvt.rzi.ftz.s32.f32 %r43, %f50;
cvt.s64.s32 %rd43, %r43;
mul.wide.s32 %rd44, %r43, 16;
add.u64 %rd45, %rd44, %rd7;
// Skip this pair unless lj1[idx].z > squared distance.
ld.shared.f32 %f51, [%rd45+8];
setp.gt.ftz.f32 %p7, %f51, %f49;
@!%p7 bra $Lt_1_25602;
.loc 16 309 0
// %f52 = sqrt(rsq); %f56 = %f52 - lj1.w (distance minus a per-pair offset).
sqrt.approx.ftz.f32 %f52, %f49;
ld.shared.v4.f32 {%f53,%f54,_,%f55}, [%rd45+0];
sub.ftz.f32 %f56, %f52, %f55;
.loc 16 313 0
// %f58 = 1/(%f56*%f56), evaluated through an f64 reciprocal and rounded
// back to f32; %f60 = %f58^3; %f63 = %f60 * (lj1.x*%f60 - lj1.y).
mul.ftz.f32 %f57, %f56, %f56;
cvt.ftz.f64.f32 %fd1, %f57;
rcp.rn.f64 %fd2, %fd1;
cvt.rn.ftz.f32.f64 %f58, %fd2;
mul.ftz.f32 %f59, %f58, %f58;
mul.ftz.f32 %f60, %f58, %f59;
mul.ftz.f32 %f61, %f53, %f60;
sub.ftz.f32 %f62, %f61, %f54;
mul.ftz.f32 %f63, %f60, %f62;
.loc 16 314 0
// %f66 = %f63 * sp_lj_factor / %f56 / sqrt(rsq): scalar applied to (dx,dy,dz).
div.approx.ftz.f32 %f64, %f35, %f56;
div.approx.ftz.f32 %f65, %f64, %f52;
mul.ftz.f32 %f66, %f63, %f65;
.loc 16 316 0
fma.rn.ftz.f32 %f33, %f45, %f66, %f33;
.loc 16 317 0
fma.rn.ftz.f32 %f32, %f44, %f66, %f32;
.loc 16 318 0
fma.rn.ftz.f32 %f31, %f46, %f66, %f31;
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r45, 0;
setp.le.s32 %p8, %r44, %r45;
@%p8 bra $Lt_1_25090;
.loc 16 321 0
// eflag > 0: %f34 += factor * (%f60*(lj3.x*%f60 - lj3.y) - lj3.z).
add.u64 %rd46, %rd44, %rd13;
ld.shared.v4.f32 {%f67,%f68,%f69,_}, [%rd46+0];
mul.ftz.f32 %f70, %f67, %f60;
sub.ftz.f32 %f71, %f70, %f68;
mul.ftz.f32 %f72, %f60, %f71;
.loc 16 322 0
sub.ftz.f32 %f73, %f72, %f69;
fma.rn.ftz.f32 %f34, %f35, %f73, %f34;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 325 0
// vflag > 0: accumulate %f66 times dx*dx, dy*dy, dz*dz, dx*dy, dx*dz, dy*dz
// into %f11, %f13, %f15, %f17, %f19, %f21 respectively.
mov.f32 %f74, %f11;
mul.ftz.f32 %f75, %f45, %f45;
fma.rn.ftz.f32 %f76, %f66, %f75, %f74;
mov.f32 %f11, %f76;
.loc 16 326 0
mov.f32 %f77, %f13;
fma.rn.ftz.f32 %f78, %f66, %f47, %f77;
mov.f32 %f13, %f78;
.loc 16 327 0
mov.f32 %f79, %f15;
mul.ftz.f32 %f80, %f46, %f46;
fma.rn.ftz.f32 %f81, %f66, %f80, %f79;
mov.f32 %f15, %f81;
.loc 16 328 0
mov.f32 %f82, %f17;
mul.ftz.f32 %f83, %f44, %f45;
fma.rn.ftz.f32 %f84, %f66, %f83, %f82;
mov.f32 %f17, %f84;
.loc 16 329 0
mov.f32 %f85, %f19;
mul.ftz.f32 %f86, %f45, %f46;
fma.rn.ftz.f32 %f87, %f66, %f86, %f85;
mov.f32 %f19, %f87;
.loc 16 330 0
mul.ftz.f32 %f88, %f44, %f46;
fma.rn.ftz.f32 %f20, %f66, %f88, %f20;
mov.f32 %f21, %f20;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 292 0
// Advance by the stride (%rd39 elements * 4 bytes); loop while below end.
mul.lo.u64 %rd47, %rd39, 4;
add.u64 %rd31, %rd31, %rd47;
setp.lt.u64 %p10, %rd31, %rd28;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
// Empty-range path: accumulators start (and stay) at zero.
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
// Slot >= inum path: zero the accumulators; the thread still takes part in
// the shared-memory reduction below but never stores results (%p4 false).
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
$Lt_1_22786:
// ---- Reduction across the t_per_atom lanes (skipped when t_per_atom <= 1).
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 341 0
// Each thread publishes its four partial sums in red_acc at index tid,
// one quantity per 512-byte row (x, y, z, energy).
// NOTE(review): 512-byte rows hold 128 floats, so this appears to assume
// at most 128 threads per block - confirm against the launch configuration.
mov.u64 %rd48, __cuda___cuda_local_var_32742_35_non_const_red_acc7168;
cvt.s64.s32 %rd49, %r1;
mul.wide.s32 %rd50, %r1, 4;
add.u64 %rd51, %rd48, %rd50;
mov.f32 %f89, %f33;
st.shared.f32 [%rd51+0], %f89;
.loc 16 342 0
mov.f32 %f90, %f32;
st.shared.f32 [%rd51+512], %f90;
.loc 16 343 0
mov.f32 %f91, %f31;
st.shared.f32 [%rd51+1024], %f91;
.loc 16 344 0
mov.f32 %f92, %f34;
st.shared.f32 [%rd51+1536], %f92;
.loc 16 346 0
// %r53 = t_per_atom / 2 (signed divide rounding toward zero); %r54 = step s.
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
// Lanes below s add the values published by thread tid+s, then re-publish.
// NOTE(review): there is no bar.sync between steps; correctness appears to
// rely on the cooperating lanes executing in lockstep (same warp) - confirm
// that t_per_atom never exceeds the warp size.
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 349 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd52, %r56;
mul.wide.u32 %rd53, %r56, 4;
add.u64 %rd54, %rd48, %rd53;
ld.shared.f32 %f93, [%rd54+0];
add.ftz.f32 %f89, %f93, %f89;
st.shared.f32 [%rd51+0], %f89;
ld.shared.f32 %f94, [%rd54+512];
add.ftz.f32 %f90, %f94, %f90;
st.shared.f32 [%rd51+512], %f90;
ld.shared.f32 %f95, [%rd54+1024];
add.ftz.f32 %f91, %f95, %f91;
st.shared.f32 [%rd51+1024], %f91;
ld.shared.f32 %f96, [%rd54+1536];
add.ftz.f32 %f92, %f96, %f92;
st.shared.f32 [%rd51+1536], %f92;
$Lt_1_27650:
.loc 16 346 0
// s >>= 1; repeat until s == 0.
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 353 0
// Copy the reduced sums back into the force/energy accumulators.
mov.f32 %f33, %f89;
.loc 16 354 0
mov.f32 %f32, %f90;
.loc 16 355 0
mov.f32 %f31, %f91;
.loc 16 356 0
mov.f32 %f34, %f92;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 360 0
// vflag > 0: same reduction for the six vflag accumulators, using all six
// 512-byte rows of red_acc (offsets 0..2560).
mov.f32 %f89, %f11;
st.shared.f32 [%rd51+0], %f89;
mov.f32 %f90, %f13;
st.shared.f32 [%rd51+512], %f90;
mov.f32 %f91, %f15;
st.shared.f32 [%rd51+1024], %f91;
mov.f32 %f92, %f17;
st.shared.f32 [%rd51+1536], %f92;
mov.f32 %f97, %f19;
st.shared.f32 [%rd51+2048], %f97;
mov.f32 %f98, %f21;
st.shared.f32 [%rd51+2560], %f98;
.loc 16 362 0
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 365 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd55, %r61;
mul.wide.u32 %rd56, %r61, 4;
add.u64 %rd57, %rd48, %rd56;
ld.shared.f32 %f99, [%rd57+0];
add.ftz.f32 %f89, %f99, %f89;
st.shared.f32 [%rd51+0], %f89;
ld.shared.f32 %f100, [%rd57+512];
add.ftz.f32 %f90, %f100, %f90;
st.shared.f32 [%rd51+512], %f90;
ld.shared.f32 %f101, [%rd57+1024];
add.ftz.f32 %f91, %f101, %f91;
st.shared.f32 [%rd51+1024], %f91;
ld.shared.f32 %f102, [%rd57+1536];
add.ftz.f32 %f92, %f102, %f92;
st.shared.f32 [%rd51+1536], %f92;
ld.shared.f32 %f103, [%rd57+2048];
add.ftz.f32 %f97, %f103, %f97;
st.shared.f32 [%rd51+2048], %f97;
ld.shared.f32 %f104, [%rd57+2560];
add.ftz.f32 %f98, %f104, %f98;
st.shared.f32 [%rd51+2560], %f98;
$Lt_1_29698:
.loc 16 362 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 370 0
mov.f32 %f11, %f89;
mov.f32 %f13, %f90;
mov.f32 %f15, %f91;
mov.f32 %f17, %f92;
mov.f32 %f19, %f97;
mov.f32 %f21, %f98;
$Lt_1_28418:
$Lt_1_26370:
// ---- Output: only threads with (slot < inum) AND (lane == 0) store.
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 376 0
// %rd61 walks engv starting at engv[slot], advancing inum elements
// (inum*4 bytes) after each stored quantity.
cvt.s64.s32 %rd58, %r13;
ld.param.u64 %rd59, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd60, %r13, 4;
add.u64 %rd61, %rd59, %rd60;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 378 0
// eflag > 0: store the energy sum first.
st.global.f32 [%rd61+0], %f34;
.loc 16 379 0
cvt.s64.s32 %rd62, %r14;
mul.wide.s32 %rd63, %r14, 4;
add.u64 %rd61, %rd61, %rd63;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 383 0
// vflag > 0: store the six reduced terms, each inum elements apart.
mov.f32 %f105, %f11;
st.global.f32 [%rd61+0], %f105;
.loc 16 384 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd66, %rd65, %rd61;
.loc 16 383 0
mov.f32 %f106, %f13;
st.global.f32 [%rd66+0], %f106;
.loc 16 384 0
add.u64 %rd67, %rd65, %rd66;
.loc 16 383 0
mov.f32 %f107, %f15;
st.global.f32 [%rd67+0], %f107;
.loc 16 384 0
add.u64 %rd68, %rd65, %rd67;
.loc 16 383 0
mov.f32 %f108, %f17;
st.global.f32 [%rd68+0], %f108;
.loc 16 384 0
add.u64 %rd61, %rd65, %rd68;
.loc 16 383 0
mov.f32 %f109, %f19;
st.global.f32 [%rd61+0], %f109;
mov.f32 %f110, %f21;
add.u64 %rd69, %rd65, %rd61;
st.global.f32 [%rd69+0], %f110;
$Lt_1_31490:
.loc 16 387 0
// Store the force as one 16-byte vector at ans[slot].
// NOTE(review): %f112 is never written anywhere in this kernel, so the 4th
// component stored here is unspecified; consumers should not read it.
ld.param.u64 %rd70, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd71, %rd58, 16;
add.u64 %rd72, %rd70, %rd71;
mov.f32 %f111, %f112;
st.global.v4.f32 [%rd72+0], {%f33,%f32,%f31,%f111};
$Lt_1_30466:
.loc 16 389 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,999 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bf97_00000000-9_morse_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.pRrhev)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bf97_00000000-8_morse_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "morse_gpu_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
.global .texref pos_tex;
//-----------------------------------------------------------
// kernel_pair (generated from morse_gpu_kernel.cu, .file 16)
//   Pair kernel that reads its per-pair coefficients (mor1, mor2) directly
//   from global memory. Groups of t_per_atom consecutive threads share one
//   atom slot: each thread walks a strided part of that slot's index list,
//   accumulates a 3-component force (plus an energy term when eflag > 0 and
//   six virial-style terms when vflag > 0), the partial sums are combined
//   through the shared red_acc buffer, and the thread whose lane
//   (tid % t_per_atom) is 0 stores the results.
// Parameters as used by the body:
//   x_          not referenced; positions are fetched through pos_tex
//   mor1        global table of 16-byte entries, indexed by
//               lj_types*(int)w_i + (int)w_j
//   mor2        global table of 8-byte entries, same index; read when eflag > 0
//   lj_types    row length used to form the table index
//   sp_lj_in    four f32 scale factors, copied to shared memory
//   dev_nbor    s32 array addressed in rows of nbor_pitch elements:
//               row 0 = texture index of the slot's atom, row 1 = list count,
//               row 2 = first list entry (or an offset into dev_packed)
//   dev_packed  list storage used when it differs from dev_nbor
//   ans         one 16-byte result per atom slot
//   engv        f32 outputs; successive quantities are inum elements apart
//   eflag/vflag > 0 enables the energy / six-term accumulation and output
//   inum        number of atom slots
//   nbor_pitch  row pitch of dev_nbor, in elements
//   t_per_atom  number of threads cooperating on one atom slot
// NOTE(review): names such as "neighbor list", "force", "virial" follow the
// parameter/compiler names; confirm against morse_gpu_kernel.cu.
//-----------------------------------------------------------
.entry kernel_pair (
.param .u64 __cudaparm_kernel_pair_x_,
.param .u64 __cudaparm_kernel_pair_mor1,
.param .u64 __cudaparm_kernel_pair_mor2,
.param .s32 __cudaparm_kernel_pair_lj_types,
.param .u64 __cudaparm_kernel_pair_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_dev_nbor,
.param .u64 __cudaparm_kernel_pair_dev_packed,
.param .u64 __cudaparm_kernel_pair_ans,
.param .u64 __cudaparm_kernel_pair_engv,
.param .s32 __cudaparm_kernel_pair_eflag,
.param .s32 __cudaparm_kernel_pair_vflag,
.param .s32 __cudaparm_kernel_pair_inum,
.param .s32 __cudaparm_kernel_pair_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_t_per_atom)
{
.reg .u32 %r<72>;
.reg .u64 %rd<63>;
.reg .f32 %f<104>;
.reg .f64 %fd<10>;
.reg .pred %p<19>;
// Shared buffers: sp_lj = 4 floats; red_acc = 6 rows x 512 bytes used by
// the reductions.
.shared .align 16 .b8 __cuda___cuda_local_var_32497_33_non_const_sp_lj92[16];
.shared .align 4 .b8 __cuda___cuda_local_var_32582_35_non_const_red_acc108[3072];
// __cuda_local_var_32504_10_non_const_f = 48
// __cuda_local_var_32508_9_non_const_virial = 16
.loc 16 88 0
$LDWbegin_kernel_pair:
.loc 16 95 0
// Every thread loads the four sp_lj_in factors and stores the same 16 bytes
// to the shared sp_lj array; no barrier follows in this kernel.
ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];
ldu.global.f32 %f1, [%rd1+0];
.loc 16 96 0
ld.global.f32 %f2, [%rd1+4];
.loc 16 97 0
ld.global.f32 %f3, [%rd1+8];
.loc 16 98 0
ld.global.f32 %f4, [%rd1+12];
st.shared.v4.f32 [__cuda___cuda_local_var_32497_33_non_const_sp_lj92+0], {%f1,%f2,%f3,%f4};
.loc 16 107 0
// Zero the six vflag-gated accumulators (%f6,%f8,%f10,%f12,%f14,%f16).
mov.f32 %f5, 0f00000000; // 0
mov.f32 %f6, %f5;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, %f7;
mov.f32 %f9, 0f00000000; // 0
mov.f32 %f10, %f9;
mov.f32 %f11, 0f00000000; // 0
mov.f32 %f12, %f11;
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, %f13;
mov.f32 %f15, 0f00000000; // 0
mov.f32 %f16, %f15;
// ---- Thread-to-atom mapping:
//   %r1 = t_per_atom, %r2 = tid.x
//   %r6 = tid % t_per_atom                          (lane within the group)
//   %r9 = tid / t_per_atom + ctaid.x * (ntid.x / t_per_atom)   (atom slot)
//   %r10 = inum; %p1 = (%r9 < inum); slots >= inum skip the accumulation.
ld.param.s32 %r1, [__cudaparm_kernel_pair_t_per_atom];
cvt.s32.u32 %r2, %tid.x;
div.s32 %r3, %r2, %r1;
cvt.s32.u32 %r4, %ntid.x;
div.s32 %r5, %r4, %r1;
rem.s32 %r6, %r2, %r1;
cvt.s32.u32 %r7, %ctaid.x;
mul.lo.s32 %r8, %r7, %r5;
add.s32 %r9, %r3, %r8;
ld.param.s32 %r10, [__cudaparm_kernel_pair_inum];
setp.lt.s32 %p1, %r9, %r10;
@!%p1 bra $Lt_0_19202;
.loc 16 113 0
// ---- Locate this slot's list:
//   %rd7 = &dev_nbor[slot]            (row 0)
//   %rd8 = %rd7 + nbor_pitch*4 bytes  (row 1) -> %r12 = entry count
//   %rd9 = %rd8 + nbor_pitch*4 bytes  (row 2)
ld.param.s32 %r11, [__cudaparm_kernel_pair_nbor_pitch];
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
cvt.s64.s32 %rd4, %r9;
mul.wide.s32 %rd5, %r9, 4;
ld.param.u64 %rd6, [__cudaparm_kernel_pair_dev_nbor];
add.u64 %rd7, %rd5, %rd6;
add.u64 %rd8, %rd3, %rd7;
ld.global.s32 %r12, [%rd8+0];
add.u64 %rd9, %rd3, %rd8;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_dev_packed];
setp.ne.u64 %p2, %rd10, %rd6;
@%p2 bra $Lt_0_19714;
.loc 16 119 0
// Case dev_packed == dev_nbor: the list is stored in dev_nbor itself, one
// entry per row. end = row2 + count*pitch, start = row2 + lane*pitch,
// stride (%r16) = pitch * t_per_atom elements.
cvt.s32.s64 %r13, %rd2;
mul.lo.s32 %r14, %r13, %r12;
cvt.s64.s32 %rd11, %r14;
mul.wide.s32 %rd12, %r14, 4;
add.u64 %rd13, %rd9, %rd12;
.loc 16 120 0
mul.lo.s32 %r15, %r6, %r13;
cvt.s64.s32 %rd14, %r15;
mul.wide.s32 %rd15, %r15, 4;
add.u64 %rd16, %rd9, %rd15;
.loc 16 121 0
mul.lo.s32 %r16, %r13, %r1;
bra.uni $Lt_0_19458;
$Lt_0_19714:
.loc 16 123 0
// Case dev_packed != dev_nbor: row 2 holds an element offset into
// dev_packed where the entries are contiguous. end = base + count,
// start = base + lane, stride (%r16) = t_per_atom elements.
ld.global.s32 %r17, [%rd9+0];
cvt.s64.s32 %rd17, %r17;
mul.wide.s32 %rd18, %r17, 4;
add.u64 %rd19, %rd10, %rd18;
.loc 16 124 0
cvt.s64.s32 %rd20, %r12;
mul.wide.s32 %rd21, %r12, 4;
add.u64 %rd13, %rd19, %rd21;
.loc 16 125 0
mov.s32 %r16, %r1;
.loc 16 126 0
cvt.s64.s32 %rd22, %r6;
mul.wide.s32 %rd23, %r6, 4;
add.u64 %rd16, %rd19, %rd23;
$Lt_0_19458:
.loc 16 129 0
// Fetch the slot atom's texel from pos_tex using the index stored in row 0.
// %f21,%f22,%f23 = first three components (x,y,z); %f24 = fourth (w).
ld.global.s32 %r18, [%rd7+0];
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r19,%r21,%r23,%r25}];
mov.f32 %f21, %f17;
mov.f32 %f22, %f18;
mov.f32 %f23, %f19;
mov.f32 %f24, %f20;
// Nothing to do when this lane's start pointer is already at/after the end.
setp.ge.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_28162;
// %r28 = lj_types * (int)w_i: row offset into the mor1/mor2 tables.
// Presumably w carries the atom type - confirm.
cvt.rzi.ftz.s32.f32 %r26, %f24;
cvt.s64.s32 %rd24, %r16;
ld.param.s32 %r27, [__cudaparm_kernel_pair_lj_types];
mul.lo.s32 %r28, %r27, %r26;
ld.param.u64 %rd25, [__cudaparm_kernel_pair_mor1];
// Accumulators: %f27/%f26/%f25 = force x/y/z, %f28 = energy term.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
mov.u64 %rd26, __cuda___cuda_local_var_32497_33_non_const_sp_lj92;
$Lt_0_20482:
 //<loop> Loop body line 129, nesting depth: 1, estimated iterations: unknown
.loc 16 135 0
// Load one list entry. Its top two bits select one of the four sp_lj
// factors (%f29); the low 30 bits are the texture index of the other atom.
ld.global.s32 %r29, [%rd16+0];
.loc 16 136 0
shr.s32 %r30, %r29, 30;
and.b32 %r31, %r30, 3;
cvt.s64.s32 %rd27, %r31;
mul.wide.s32 %rd28, %r31, 4;
add.u64 %rd29, %rd26, %rd28;
ld.shared.f32 %f29, [%rd29+0];
.loc 16 139 0
and.b32 %r32, %r29, 1073741823;
mov.u32 %r33, %r32;
mov.s32 %r34, 0;
mov.u32 %r35, %r34;
mov.s32 %r36, 0;
mov.u32 %r37, %r36;
mov.s32 %r38, 0;
mov.u32 %r39, %r38;
tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r33,%r35,%r37,%r39}];
mov.f32 %f34, %f30;
mov.f32 %f35, %f31;
mov.f32 %f36, %f32;
mov.f32 %f37, %f33;
// %r40 = (int)w_j. Displacement: %f39 = dx, %f38 = dy, %f40 = dz;
// %f41 = dy*dy; %f43 = dx*dx + dy*dy + dz*dz (squared distance).
// Table index %r41 = (int)w_j + lj_types*(int)w_i; mor1 entries are 16 bytes.
cvt.rzi.ftz.s32.f32 %r40, %f37;
sub.ftz.f32 %f38, %f22, %f35;
sub.ftz.f32 %f39, %f21, %f34;
sub.ftz.f32 %f40, %f23, %f36;
mul.ftz.f32 %f41, %f38, %f38;
fma.rn.ftz.f32 %f42, %f39, %f39, %f41;
add.s32 %r41, %r40, %r28;
cvt.s64.s32 %rd30, %r41;
fma.rn.ftz.f32 %f43, %f40, %f40, %f42;
mul.wide.s32 %rd31, %r41, 16;
add.u64 %rd32, %rd25, %rd31;
// Skip this pair unless mor1[idx].x > squared distance.
ld.global.f32 %f44, [%rd32+0];
setp.gt.ftz.f32 %p4, %f44, %f43;
@!%p4 bra $Lt_0_21762;
.loc 16 152 0
// %f45 = r = sqrt(rsq); %f49 = r - mor1.z; %f51 = -mor1.w * (r - mor1.z).
sqrt.approx.ftz.f32 %f45, %f43;
ld.global.v4.f32 {_,%f46,%f47,%f48}, [%rd32+0];
sub.ftz.f32 %f49, %f45, %f47;
mul.ftz.f32 %f50, %f48, %f49;
neg.ftz.f32 %f51, %f50;
.loc 16 154 0
// exp(%f51) evaluated as 2^(%f51 * log2(e)); 0f3fb8aa3b is log2(e).
// %f54 = e = exp(-mor1.w*(r - mor1.z)); %f57 = mor1.y * (e*e - e).
mov.f32 %f52, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f53, %f51, %f52;
ex2.approx.ftz.f32 %f54, %f53;
mul.ftz.f32 %f55, %f54, %f54;
sub.ftz.f32 %f56, %f55, %f54;
mul.ftz.f32 %f57, %f46, %f56;
.loc 16 156 0
// %f59 = %f57 / r * sp_lj_factor: scalar applied to (dx,dy,dz).
div.approx.ftz.f32 %f58, %f57, %f45;
mul.ftz.f32 %f59, %f58, %f29;
fma.rn.ftz.f32 %f27, %f39, %f59, %f27;
.loc 16 157 0
fma.rn.ftz.f32 %f26, %f38, %f59, %f26;
.loc 16 158 0
fma.rn.ftz.f32 %f25, %f40, %f59, %f25;
ld.param.s32 %r42, [__cudaparm_kernel_pair_eflag];
mov.u32 %r43, 0;
setp.le.s32 %p5, %r42, %r43;
@%p5 bra $Lt_0_21250;
.loc 16 162 0
// eflag > 0: with e = %f54 and the 8-byte mor2[idx] = {x, y}, compute in
// double precision mor2.x * (e*e - 2*e) - mor2.y (e*e itself is formed in
// f32 before widening), round to f32, and add factor * that to %f28.
cvt.ftz.f64.f32 %fd1, %f54;
ld.param.u64 %rd33, [__cudaparm_kernel_pair_mor2];
mul.lo.u64 %rd34, %rd30, 8;
add.u64 %rd35, %rd33, %rd34;
ld.global.v2.f32 {%f60,%f61}, [%rd35+0];
cvt.ftz.f64.f32 %fd2, %f61;
cvt.ftz.f64.f32 %fd3, %f60;
mul.ftz.f32 %f62, %f54, %f54;
cvt.ftz.f64.f32 %fd4, %f62;
add.f64 %fd5, %fd1, %fd1;
sub.f64 %fd6, %fd4, %fd5;
mul.f64 %fd7, %fd3, %fd6;
sub.f64 %fd8, %fd7, %fd2;
cvt.rn.ftz.f32.f64 %f63, %fd8;
fma.rn.ftz.f32 %f28, %f29, %f63, %f28;
$Lt_0_21250:
ld.param.s32 %r44, [__cudaparm_kernel_pair_vflag];
mov.u32 %r45, 0;
setp.le.s32 %p6, %r44, %r45;
@%p6 bra $Lt_0_21762;
.loc 16 165 0
// vflag > 0: accumulate %f59 times dx*dx, dy*dy, dz*dz, dx*dy, dx*dz, dy*dz
// into %f6, %f8, %f10, %f12, %f14, %f16 respectively.
mov.f32 %f64, %f6;
mul.ftz.f32 %f65, %f39, %f39;
fma.rn.ftz.f32 %f66, %f59, %f65, %f64;
mov.f32 %f6, %f66;
.loc 16 166 0
mov.f32 %f67, %f8;
fma.rn.ftz.f32 %f68, %f59, %f41, %f67;
mov.f32 %f8, %f68;
.loc 16 167 0
mov.f32 %f69, %f10;
mul.ftz.f32 %f70, %f40, %f40;
fma.rn.ftz.f32 %f71, %f59, %f70, %f69;
mov.f32 %f10, %f71;
.loc 16 168 0
mov.f32 %f72, %f12;
mul.ftz.f32 %f73, %f38, %f39;
fma.rn.ftz.f32 %f74, %f59, %f73, %f72;
mov.f32 %f12, %f74;
.loc 16 169 0
mov.f32 %f75, %f14;
mul.ftz.f32 %f76, %f39, %f40;
fma.rn.ftz.f32 %f77, %f59, %f76, %f75;
mov.f32 %f14, %f77;
.loc 16 170 0
mul.ftz.f32 %f78, %f38, %f40;
fma.rn.ftz.f32 %f15, %f59, %f78, %f15;
mov.f32 %f16, %f15;
$Lt_0_21762:
$Lt_0_20738:
.loc 16 133 0
// Advance by the stride (%rd24 elements * 4 bytes); loop while below end.
mul.lo.u64 %rd36, %rd24, 4;
add.u64 %rd16, %rd16, %rd36;
setp.lt.u64 %p7, %rd16, %rd13;
@%p7 bra $Lt_0_20482;
bra.uni $Lt_0_18946;
$Lt_0_28162:
// Empty-range path: accumulators start (and stay) at zero.
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
bra.uni $Lt_0_18946;
$Lt_0_19202:
// Slot >= inum path: zero the accumulators; the thread still takes part in
// the shared-memory reduction below but never stores results (%p1 false).
mov.f32 %f25, 0f00000000; // 0
mov.f32 %f26, 0f00000000; // 0
mov.f32 %f27, 0f00000000; // 0
mov.f32 %f28, 0f00000000; // 0
$Lt_0_18946:
// ---- Reduction across the t_per_atom lanes (skipped when t_per_atom <= 1).
mov.u32 %r46, 1;
setp.le.s32 %p8, %r1, %r46;
@%p8 bra $Lt_0_24578;
.loc 16 181 0
// Each thread publishes its four partial sums in red_acc at index tid,
// one quantity per 512-byte row (x, y, z, energy).
// NOTE(review): 512-byte rows hold 128 floats, so this appears to assume
// at most 128 threads per block - confirm against the launch configuration.
mov.u64 %rd37, __cuda___cuda_local_var_32582_35_non_const_red_acc108;
cvt.s64.s32 %rd38, %r2;
mul.wide.s32 %rd39, %r2, 4;
add.u64 %rd40, %rd37, %rd39;
mov.f32 %f79, %f27;
st.shared.f32 [%rd40+0], %f79;
.loc 16 182 0
mov.f32 %f80, %f26;
st.shared.f32 [%rd40+512], %f80;
.loc 16 183 0
mov.f32 %f81, %f25;
st.shared.f32 [%rd40+1024], %f81;
.loc 16 184 0
mov.f32 %f82, %f28;
st.shared.f32 [%rd40+1536], %f82;
.loc 16 186 0
// %r51 = t_per_atom / 2 (signed divide rounding toward zero); %r52 = step s.
shr.s32 %r47, %r1, 31;
mov.s32 %r48, 1;
and.b32 %r49, %r47, %r48;
add.s32 %r50, %r49, %r1;
shr.s32 %r51, %r50, 1;
mov.s32 %r52, %r51;
mov.u32 %r53, 0;
setp.ne.u32 %p9, %r51, %r53;
@!%p9 bra $Lt_0_23042;
$Lt_0_23554:
// Lanes below s add the values published by thread tid+s, then re-publish.
// NOTE(review): there is no bar.sync between steps; correctness appears to
// rely on the cooperating lanes executing in lockstep (same warp) - confirm
// that t_per_atom never exceeds the warp size.
setp.ge.u32 %p10, %r6, %r52;
@%p10 bra $Lt_0_23810;
.loc 16 189 0
add.u32 %r54, %r2, %r52;
cvt.u64.u32 %rd41, %r54;
mul.wide.u32 %rd42, %r54, 4;
add.u64 %rd43, %rd37, %rd42;
ld.shared.f32 %f83, [%rd43+0];
add.ftz.f32 %f79, %f83, %f79;
st.shared.f32 [%rd40+0], %f79;
ld.shared.f32 %f84, [%rd43+512];
add.ftz.f32 %f80, %f84, %f80;
st.shared.f32 [%rd40+512], %f80;
ld.shared.f32 %f85, [%rd43+1024];
add.ftz.f32 %f81, %f85, %f81;
st.shared.f32 [%rd40+1024], %f81;
ld.shared.f32 %f86, [%rd43+1536];
add.ftz.f32 %f82, %f86, %f82;
st.shared.f32 [%rd40+1536], %f82;
$Lt_0_23810:
.loc 16 186 0
// s >>= 1; repeat until s == 0.
shr.u32 %r52, %r52, 1;
mov.u32 %r55, 0;
setp.ne.u32 %p11, %r52, %r55;
@%p11 bra $Lt_0_23554;
$Lt_0_23042:
.loc 16 193 0
// Copy the reduced sums back into the force/energy accumulators.
mov.f32 %f27, %f79;
.loc 16 194 0
mov.f32 %f26, %f80;
.loc 16 195 0
mov.f32 %f25, %f81;
.loc 16 196 0
mov.f32 %f28, %f82;
ld.param.s32 %r56, [__cudaparm_kernel_pair_vflag];
mov.u32 %r57, 0;
setp.le.s32 %p12, %r56, %r57;
@%p12 bra $Lt_0_24578;
.loc 16 200 0
// vflag > 0: same reduction for the six vflag accumulators, using all six
// 512-byte rows of red_acc (offsets 0..2560).
mov.f32 %f79, %f6;
st.shared.f32 [%rd40+0], %f79;
mov.f32 %f80, %f8;
st.shared.f32 [%rd40+512], %f80;
mov.f32 %f81, %f10;
st.shared.f32 [%rd40+1024], %f81;
mov.f32 %f82, %f12;
st.shared.f32 [%rd40+1536], %f82;
mov.f32 %f87, %f14;
st.shared.f32 [%rd40+2048], %f87;
mov.f32 %f88, %f16;
st.shared.f32 [%rd40+2560], %f88;
.loc 16 202 0
mov.s32 %r58, %r51;
@!%p9 bra $Lt_0_25090;
$Lt_0_25602:
setp.ge.u32 %p13, %r6, %r58;
@%p13 bra $Lt_0_25858;
.loc 16 205 0
add.u32 %r59, %r2, %r58;
cvt.u64.u32 %rd44, %r59;
mul.wide.u32 %rd45, %r59, 4;
add.u64 %rd46, %rd37, %rd45;
ld.shared.f32 %f89, [%rd46+0];
add.ftz.f32 %f79, %f89, %f79;
st.shared.f32 [%rd40+0], %f79;
ld.shared.f32 %f90, [%rd46+512];
add.ftz.f32 %f80, %f90, %f80;
st.shared.f32 [%rd40+512], %f80;
ld.shared.f32 %f91, [%rd46+1024];
add.ftz.f32 %f81, %f91, %f81;
st.shared.f32 [%rd40+1024], %f81;
ld.shared.f32 %f92, [%rd46+1536];
add.ftz.f32 %f82, %f92, %f82;
st.shared.f32 [%rd40+1536], %f82;
ld.shared.f32 %f93, [%rd46+2048];
add.ftz.f32 %f87, %f93, %f87;
st.shared.f32 [%rd40+2048], %f87;
ld.shared.f32 %f94, [%rd46+2560];
add.ftz.f32 %f88, %f94, %f88;
st.shared.f32 [%rd40+2560], %f88;
$Lt_0_25858:
.loc 16 202 0
shr.u32 %r58, %r58, 1;
mov.u32 %r60, 0;
setp.ne.u32 %p14, %r58, %r60;
@%p14 bra $Lt_0_25602;
$Lt_0_25090:
.loc 16 210 0
mov.f32 %f6, %f79;
mov.f32 %f8, %f80;
mov.f32 %f10, %f81;
mov.f32 %f12, %f82;
mov.f32 %f14, %f87;
mov.f32 %f16, %f88;
$Lt_0_24578:
$Lt_0_22530:
// ---- Output: only threads with (slot < inum) AND (lane == 0) store.
selp.s32 %r61, 1, 0, %p1;
mov.s32 %r62, 0;
set.eq.u32.s32 %r63, %r6, %r62;
neg.s32 %r64, %r63;
and.b32 %r65, %r61, %r64;
mov.u32 %r66, 0;
setp.eq.s32 %p15, %r65, %r66;
@%p15 bra $Lt_0_26626;
.loc 16 216 0
// %rd50 walks engv starting at engv[slot], advancing inum elements
// (inum*4 bytes) after each stored quantity.
cvt.s64.s32 %rd47, %r9;
ld.param.u64 %rd48, [__cudaparm_kernel_pair_engv];
mul.wide.s32 %rd49, %r9, 4;
add.u64 %rd50, %rd48, %rd49;
ld.param.s32 %r67, [__cudaparm_kernel_pair_eflag];
mov.u32 %r68, 0;
setp.le.s32 %p16, %r67, %r68;
@%p16 bra $Lt_0_27138;
.loc 16 218 0
// eflag > 0: store the energy sum first.
st.global.f32 [%rd50+0], %f28;
.loc 16 219 0
cvt.s64.s32 %rd51, %r10;
mul.wide.s32 %rd52, %r10, 4;
add.u64 %rd50, %rd50, %rd52;
$Lt_0_27138:
ld.param.s32 %r69, [__cudaparm_kernel_pair_vflag];
mov.u32 %r70, 0;
setp.le.s32 %p17, %r69, %r70;
@%p17 bra $Lt_0_27650;
.loc 16 223 0
// vflag > 0: store the six reduced terms, each inum elements apart.
mov.f32 %f95, %f6;
st.global.f32 [%rd50+0], %f95;
.loc 16 224 0
cvt.s64.s32 %rd53, %r10;
mul.wide.s32 %rd54, %r10, 4;
add.u64 %rd55, %rd54, %rd50;
.loc 16 223 0
mov.f32 %f96, %f8;
st.global.f32 [%rd55+0], %f96;
.loc 16 224 0
add.u64 %rd56, %rd54, %rd55;
.loc 16 223 0
mov.f32 %f97, %f10;
st.global.f32 [%rd56+0], %f97;
.loc 16 224 0
add.u64 %rd57, %rd54, %rd56;
.loc 16 223 0
mov.f32 %f98, %f12;
st.global.f32 [%rd57+0], %f98;
.loc 16 224 0
add.u64 %rd50, %rd54, %rd57;
.loc 16 223 0
mov.f32 %f99, %f14;
st.global.f32 [%rd50+0], %f99;
mov.f32 %f100, %f16;
add.u64 %rd58, %rd54, %rd50;
st.global.f32 [%rd58+0], %f100;
$Lt_0_27650:
.loc 16 227 0
// Store the force as one 16-byte vector at ans[slot].
// NOTE(review): %f102 is never written anywhere in this kernel, so the 4th
// component stored here is unspecified; consumers should not read it.
ld.param.u64 %rd59, [__cudaparm_kernel_pair_ans];
mul.lo.u64 %rd60, %rd47, 16;
add.u64 %rd61, %rd59, %rd60;
mov.f32 %f101, %f102;
st.global.v4.f32 [%rd61+0], {%f27,%f26,%f25,%f101};
$Lt_0_26626:
.loc 16 229 0
exit;
$LDWend_kernel_pair:
} // kernel_pair
// -----------------------------------------------------------------------
// kernel_pair_fast: per-atom pair force / energy / virial accumulation with
// the coefficient tables staged in shared memory.
//
// Parameters (as used below):
//   x_            not read directly in this kernel; positions are fetched
//                 through the pos_tex texture reference instead
//   mor1_in       global table of 16-byte entries, copied to shared (121 used)
//   mor2_in       global table of  8-byte entries, copied only when eflag > 0
//   sp_lj_in      4 floats, copied to shared; indexed by bits 30-31 of a
//                 neighbor entry
//   dev_nbor      neighbor data; rows are nbor_pitch ints apart
//   dev_packed    alternative neighbor storage, used when != dev_nbor
//   ans           output, one 16-byte record per atom
//   engv          output, energy and virial terms strided by inum
//   eflag/vflag   > 0 enables energy / virial accumulation
//   inum          number of atoms processed
//   nbor_pitch    row pitch (in ints) of dev_nbor
//   t_per_atom    number of threads cooperating on one atom
//
// NOTE(review): the mor1/mor2 names and the exp(-a*(r-r0)) form of the math
// suggest a Morse potential - confirm against the .cu source.
// -----------------------------------------------------------------------
.entry kernel_pair_fast (
.param .u64 __cudaparm_kernel_pair_fast_x_,
.param .u64 __cudaparm_kernel_pair_fast_mor1_in,
.param .u64 __cudaparm_kernel_pair_fast_mor2_in,
.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,
.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,
.param .u64 __cudaparm_kernel_pair_fast_dev_packed,
.param .u64 __cudaparm_kernel_pair_fast_ans,
.param .u64 __cudaparm_kernel_pair_fast_engv,
.param .s32 __cudaparm_kernel_pair_fast_eflag,
.param .s32 __cudaparm_kernel_pair_fast_vflag,
.param .s32 __cudaparm_kernel_pair_fast_inum,
.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,
.param .s32 __cudaparm_kernel_pair_fast_t_per_atom)
{
.reg .u32 %r<74>;
.reg .u64 %rd<76>;
.reg .f32 %f<110>;
.reg .pred %p<22>;
// Shared staging areas: sp_lj = 4 floats, mor1 = 121 x 16 bytes,
// mor2 = 121 x 8 bytes, red_acc = 6 rows of 512 bytes (reduction scratch).
.shared .align 4 .b8 __cuda___cuda_local_var_32648_33_non_const_sp_lj3268[16];
.shared .align 16 .b8 __cuda___cuda_local_var_32646_34_non_const_mor13296[1936];
.shared .align 8 .b8 __cuda___cuda_local_var_32647_34_non_const_mor25232[968];
.shared .align 4 .b8 __cuda___cuda_local_var_32738_35_non_const_red_acc6200[3072];
// __cuda_local_var_32658_10_non_const_f = 48
// __cuda_local_var_32662_9_non_const_virial = 16
.loc 16 237 0
$LDWbegin_kernel_pair_fast:
// %r1 = tid.x. Threads 0..3 copy sp_lj_in[tid] into shared sp_lj.
cvt.s32.u32 %r1, %tid.x;
mov.u32 %r2, 3;
setp.gt.s32 %p1, %r1, %r2;
@%p1 bra $Lt_1_21250;
.loc 16 247 0
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
cvt.s64.s32 %rd2, %r1;
mul.wide.s32 %rd3, %r1, 4;
ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_21250:
// %rd1 = shared sp_lj base (re-materialised so it is valid on both paths).
mov.u64 %rd1, __cuda___cuda_local_var_32648_33_non_const_sp_lj3268;
// Threads 0..120 copy one 16-byte mor1 entry each into shared memory.
mov.u32 %r3, 120;
setp.gt.s32 %p2, %r1, %r3;
@%p2 bra $Lt_1_21762;
.loc 16 249 0
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;
cvt.s64.s32 %rd8, %r1;
mul.wide.s32 %rd9, %r1, 16;
ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_mor1_in];
add.u64 %rd11, %rd10, %rd9;
add.u64 %rd12, %rd9, %rd7;
ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];
st.shared.v4.f32 [%rd12+0], {%f2,%f3,%f4,%f5};
// The 8-byte mor2 entry is only staged when energies are requested.
ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r5, 0;
setp.le.s32 %p3, %r4, %r5;
@%p3 bra $Lt_1_22274;
.loc 16 251 0
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
mul.lo.u64 %rd14, %rd8, 8;
ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast_mor2_in];
add.u64 %rd16, %rd15, %rd14;
add.u64 %rd17, %rd14, %rd13;
ld.global.v2.f32 {%f6,%f7}, [%rd16+0];
st.shared.v2.f32 [%rd17+0], {%f6,%f7};
$Lt_1_22274:
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
$Lt_1_21762:
// %rd7 = shared mor1 base, %rd13 = shared mor2 base (valid on every path).
mov.u64 %rd7, __cuda___cuda_local_var_32646_34_non_const_mor13296;
mov.u64 %rd13, __cuda___cuda_local_var_32647_34_non_const_mor25232;
.loc 16 261 0
// Zero the six virial accumulators: %f9, %f11, %f13, %f15, %f17, %f19.
mov.f32 %f8, 0f00000000; // 0
mov.f32 %f9, %f8;
mov.f32 %f10, 0f00000000; // 0
mov.f32 %f11, %f10;
mov.f32 %f12, 0f00000000; // 0
mov.f32 %f13, %f12;
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, %f14;
mov.f32 %f16, 0f00000000; // 0
mov.f32 %f17, %f16;
mov.f32 %f18, 0f00000000; // 0
mov.f32 %f19, %f18;
.loc 16 263 0
// Barrier: the shared tables written above are read by other threads below.
bar.sync 0;
// %r6  = t_per_atom
// %r7  = tid / t_per_atom      (atom slot within the block)
// %r9  = ntid.x / t_per_atom   (atoms handled per block)
// %r10 = tid % t_per_atom      (this thread's position within its atom group)
// %r13 = %r7 + ctaid.x * %r9   (atom index), %r14 = inum
ld.param.s32 %r6, [__cudaparm_kernel_pair_fast_t_per_atom];
div.s32 %r7, %r1, %r6;
cvt.s32.u32 %r8, %ntid.x;
div.s32 %r9, %r8, %r6;
rem.s32 %r10, %r1, %r6;
cvt.s32.u32 %r11, %ctaid.x;
mul.lo.s32 %r12, %r11, %r9;
add.s32 %r13, %r7, %r12;
ld.param.s32 %r14, [__cudaparm_kernel_pair_fast_inum];
// %p4 = (atom index < inum); also reused for the final store guard.
setp.lt.s32 %p4, %r13, %r14;
@!%p4 bra $Lt_1_23042;
.loc 16 269 0
// %rd23 = &dev_nbor[atom index]; the next two rows (nbor_pitch ints apart)
// are read as well: %r16 is loaded from row 1, %rd25 points at row 2.
ld.param.s32 %r15, [__cudaparm_kernel_pair_fast_nbor_pitch];
cvt.s64.s32 %rd18, %r15;
mul.wide.s32 %rd19, %r15, 4;
cvt.s64.s32 %rd20, %r13;
mul.wide.s32 %rd21, %r13, 4;
ld.param.u64 %rd22, [__cudaparm_kernel_pair_fast_dev_nbor];
add.u64 %rd23, %rd21, %rd22;
add.u64 %rd24, %rd19, %rd23;
ld.global.s32 %r16, [%rd24+0];
add.u64 %rd25, %rd19, %rd24;
ld.param.u64 %rd26, [__cudaparm_kernel_pair_fast_dev_packed];
// Two layouts: fall through when dev_packed == dev_nbor, branch otherwise.
setp.ne.u64 %p5, %rd26, %rd22;
@%p5 bra $Lt_1_23554;
.loc 16 275 0
// Same-buffer layout: entries are nbor_pitch ints apart.
//   %rd29 (end)   = %rd25 + 4 * nbor_pitch * %r16
//   %rd32 (start) = %rd25 + 4 * nbor_pitch * %r10
//   %r20 (stride, in ints) = nbor_pitch * t_per_atom
cvt.s32.s64 %r17, %rd18;
mul.lo.s32 %r18, %r17, %r16;
cvt.s64.s32 %rd27, %r18;
mul.wide.s32 %rd28, %r18, 4;
add.u64 %rd29, %rd25, %rd28;
.loc 16 276 0
mul.lo.s32 %r19, %r10, %r17;
cvt.s64.s32 %rd30, %r19;
mul.wide.s32 %rd31, %r19, 4;
add.u64 %rd32, %rd25, %rd31;
.loc 16 277 0
mul.lo.s32 %r20, %r17, %r6;
bra.uni $Lt_1_23298;
$Lt_1_23554:
.loc 16 279 0
// Packed layout: row 2 of dev_nbor holds an offset (%r21) into dev_packed.
//   %rd35 = dev_packed + 4 * %r21
//   %rd29 (end)   = %rd35 + 4 * %r16
//   %rd32 (start) = %rd35 + 4 * %r10
//   %r20 (stride, in ints) = t_per_atom
ld.global.s32 %r21, [%rd25+0];
cvt.s64.s32 %rd33, %r21;
mul.wide.s32 %rd34, %r21, 4;
add.u64 %rd35, %rd26, %rd34;
.loc 16 280 0
cvt.s64.s32 %rd36, %r16;
mul.wide.s32 %rd37, %r16, 4;
add.u64 %rd29, %rd35, %rd37;
.loc 16 281 0
mov.s32 %r20, %r6;
.loc 16 282 0
cvt.s64.s32 %rd38, %r10;
mul.wide.s32 %rd39, %r10, 4;
add.u64 %rd32, %rd35, %rd39;
$Lt_1_23298:
.loc 16 285 0
// Row 0 of dev_nbor gives the index used to fetch this atom from pos_tex.
// Result: %f24,%f25,%f26 = x,y,z components and %f27 = fourth component.
ld.global.s32 %r22, [%rd23+0];
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
mov.s32 %r28, 0;
mov.u32 %r29, %r28;
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[pos_tex,{%r23,%r25,%r27,%r29}];
mov.f32 %f24, %f20;
mov.f32 %f25, %f21;
mov.f32 %f26, %f22;
mov.f32 %f27, %f23;
// Skip the loop entirely when start >= end (no entries for this thread).
setp.ge.u64 %p6, %rd32, %rd29;
@%p6 bra $Lt_1_32002;
// %f28 = float(int(%f27) * 11): row offset into the 11-wide coefficient table.
cvt.rzi.ftz.s32.f32 %r30, %f27;
cvt.s64.s32 %rd40, %r20;
mul.lo.s32 %r31, %r30, 11;
cvt.rn.f32.s32 %f28, %r31;
// Accumulators: %f31,%f30,%f29 = force x,y,z ; %f32 = energy.
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
$Lt_1_24322:
//<loop> Loop body line 285, nesting depth: 1, estimated iterations: unknown
.loc 16 292 0
// %r32 = neighbor entry: bits 30-31 select one of the 4 sp_lj factors (%f33),
// the low 30 bits are the index fetched from pos_tex.
ld.global.s32 %r32, [%rd32+0];
.loc 16 293 0
shr.s32 %r33, %r32, 30;
and.b32 %r34, %r33, 3;
cvt.s64.s32 %rd41, %r34;
mul.wide.s32 %rd42, %r34, 4;
add.u64 %rd43, %rd1, %rd42;
ld.shared.f32 %f33, [%rd43+0];
.loc 16 296 0
and.b32 %r35, %r32, 1073741823;
mov.u32 %r36, %r35;
mov.s32 %r37, 0;
mov.u32 %r38, %r37;
mov.s32 %r39, 0;
mov.u32 %r40, %r39;
mov.s32 %r41, 0;
mov.u32 %r42, %r41;
tex.1d.v4.f32.s32 {%f34,%f35,%f36,%f37},[pos_tex,{%r36,%r38,%r40,%r42}];
mov.f32 %f38, %f34;
mov.f32 %f39, %f35;
mov.f32 %f40, %f36;
mov.f32 %f41, %f37;
// Separation vector: %f43 = dx, %f42 = dy, %f44 = dz; %f47 = dx^2+dy^2+dz^2.
sub.ftz.f32 %f42, %f25, %f39;
sub.ftz.f32 %f43, %f24, %f38;
sub.ftz.f32 %f44, %f26, %f40;
mul.ftz.f32 %f45, %f42, %f42;
fma.rn.ftz.f32 %f46, %f43, %f43, %f45;
fma.rn.ftz.f32 %f47, %f44, %f44, %f46;
// %r43 = int(%f28 + neighbor's fourth component): coefficient table index.
// %rd46 = &mor1[%r43]; its first float is compared against the squared
// distance and the pair is skipped unless mor1[%r43].x > %f47.
add.ftz.f32 %f48, %f28, %f41;
cvt.rzi.ftz.s32.f32 %r43, %f48;
cvt.s64.s32 %rd44, %r43;
mul.wide.s32 %rd45, %r43, 16;
add.u64 %rd46, %rd7, %rd45;
ld.shared.f32 %f49, [%rd46+0];
setp.gt.ftz.f32 %p7, %f49, %f47;
@!%p7 bra $Lt_1_25602;
.loc 16 307 0
// %f50 = r = sqrt(r^2); %f51,%f52,%f53 = mor1[%r43].y/.z/.w
// %f54 = r - .z
sqrt.approx.ftz.f32 %f50, %f47;
ld.shared.v4.f32 {_,%f51,%f52,%f53}, [%rd46+0];
sub.ftz.f32 %f54, %f50, %f52;
.loc 16 308 0
mul.ftz.f32 %f55, %f53, %f54;
neg.ftz.f32 %f56, %f55;
.loc 16 310 0
// %f59 = exp(-.w * (r - .z)), computed as 2^(x * log2(e)).
// %f61 = %f59^2 - %f59 ; %f62 = .y * %f61
mov.f32 %f57, 0f3fb8aa3b; // 1.4427
mul.ftz.f32 %f58, %f56, %f57;
ex2.approx.ftz.f32 %f59, %f58;
mul.ftz.f32 %f60, %f59, %f59;
sub.ftz.f32 %f61, %f60, %f59;
mul.ftz.f32 %f62, %f51, %f61;
.loc 16 312 0
// %f64 = (%f62 / r) * sp_lj factor; accumulate %f64 * (dx,dy,dz).
div.approx.ftz.f32 %f63, %f62, %f50;
mul.ftz.f32 %f64, %f63, %f33;
fma.rn.ftz.f32 %f31, %f43, %f64, %f31;
.loc 16 313 0
fma.rn.ftz.f32 %f30, %f42, %f64, %f30;
.loc 16 314 0
fma.rn.ftz.f32 %f29, %f44, %f64, %f29;
ld.param.s32 %r44, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r45, 0;
setp.le.s32 %p8, %r44, %r45;
@%p8 bra $Lt_1_25090;
.loc 16 317 0
// Energy (eflag > 0): e = mor2.x * (%f59^2 - 2*%f59) - mor2.y,
// accumulated into %f32 scaled by the sp_lj factor.
mul.lo.u64 %rd47, %rd44, 8;
add.u64 %rd48, %rd13, %rd47;
ld.shared.v2.f32 {%f65,%f66}, [%rd48+0];
sub.ftz.f32 %f67, %f61, %f59;
mul.ftz.f32 %f68, %f65, %f67;
sub.ftz.f32 %f69, %f68, %f66;
.loc 16 318 0
fma.rn.ftz.f32 %f32, %f33, %f69, %f32;
$Lt_1_25090:
ld.param.s32 %r46, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r47, 0;
setp.le.s32 %p9, %r46, %r47;
@%p9 bra $Lt_1_25602;
.loc 16 321 0
// Virial (vflag > 0), each term scaled by %f64:
//   %f9 += dx*dx, %f11 += dy*dy, %f13 += dz*dz,
//   %f15 += dx*dy, %f17 += dx*dz, %f19 += dy*dz
mov.f32 %f70, %f9;
mul.ftz.f32 %f71, %f43, %f43;
fma.rn.ftz.f32 %f72, %f64, %f71, %f70;
mov.f32 %f9, %f72;
.loc 16 322 0
mov.f32 %f73, %f11;
fma.rn.ftz.f32 %f74, %f64, %f45, %f73;
mov.f32 %f11, %f74;
.loc 16 323 0
mov.f32 %f75, %f13;
mul.ftz.f32 %f76, %f44, %f44;
fma.rn.ftz.f32 %f77, %f64, %f76, %f75;
mov.f32 %f13, %f77;
.loc 16 324 0
mov.f32 %f78, %f15;
mul.ftz.f32 %f79, %f42, %f43;
fma.rn.ftz.f32 %f80, %f64, %f79, %f78;
mov.f32 %f15, %f80;
.loc 16 325 0
mov.f32 %f81, %f17;
mul.ftz.f32 %f82, %f43, %f44;
fma.rn.ftz.f32 %f83, %f64, %f82, %f81;
mov.f32 %f17, %f83;
.loc 16 326 0
mul.ftz.f32 %f84, %f42, %f44;
fma.rn.ftz.f32 %f18, %f64, %f84, %f18;
mov.f32 %f19, %f18;
$Lt_1_25602:
$Lt_1_24578:
.loc 16 290 0
// Advance the list pointer by the stride (%rd40 ints) and loop while < end.
mul.lo.u64 %rd49, %rd40, 4;
add.u64 %rd32, %rd32, %rd49;
setp.lt.u64 %p10, %rd32, %rd29;
@%p10 bra $Lt_1_24322;
bra.uni $Lt_1_22786;
$Lt_1_32002:
// Empty neighbor range for this thread: force/energy accumulators are zero.
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
bra.uni $Lt_1_22786;
$Lt_1_23042:
// Atom index >= inum: accumulators are zero (the thread still takes part in
// the shared-memory reduction code below).
mov.f32 %f29, 0f00000000; // 0
mov.f32 %f30, 0f00000000; // 0
mov.f32 %f31, 0f00000000; // 0
mov.f32 %f32, 0f00000000; // 0
$Lt_1_22786:
// The reduction is only needed when more than one thread works on an atom.
mov.u32 %r48, 1;
setp.le.s32 %p11, %r6, %r48;
@%p11 bra $Lt_1_28418;
.loc 16 337 0
// %rd53 = &red_acc[tid]; successive quantities live 512 bytes apart.
// Rows 0..3 receive force x, y, z and energy.
mov.u64 %rd50, __cuda___cuda_local_var_32738_35_non_const_red_acc6200;
cvt.s64.s32 %rd51, %r1;
mul.wide.s32 %rd52, %r1, 4;
add.u64 %rd53, %rd50, %rd52;
mov.f32 %f85, %f31;
st.shared.f32 [%rd53+0], %f85;
.loc 16 338 0
mov.f32 %f86, %f30;
st.shared.f32 [%rd53+512], %f86;
.loc 16 339 0
mov.f32 %f87, %f29;
st.shared.f32 [%rd53+1024], %f87;
.loc 16 340 0
mov.f32 %f88, %f32;
st.shared.f32 [%rd53+1536], %f88;
.loc 16 342 0
// %r53 = t_per_atom / 2 (signed division rounding toward zero);
// %r54 is the running step s, halved on every iteration.
shr.s32 %r49, %r6, 31;
mov.s32 %r50, 1;
and.b32 %r51, %r49, %r50;
add.s32 %r52, %r51, %r6;
shr.s32 %r53, %r52, 1;
mov.s32 %r54, %r53;
mov.u32 %r55, 0;
setp.ne.u32 %p12, %r53, %r55;
@!%p12 bra $Lt_1_26882;
$Lt_1_27394:
// Threads whose in-group position (%r10) is below s add the values held by
// thread tid+s and write the sum back. There is no bar.sync in this loop.
setp.ge.u32 %p13, %r10, %r54;
@%p13 bra $Lt_1_27650;
.loc 16 345 0
add.u32 %r56, %r1, %r54;
cvt.u64.u32 %rd54, %r56;
mul.wide.u32 %rd55, %r56, 4;
add.u64 %rd56, %rd50, %rd55;
ld.shared.f32 %f89, [%rd56+0];
add.ftz.f32 %f85, %f89, %f85;
st.shared.f32 [%rd53+0], %f85;
ld.shared.f32 %f90, [%rd56+512];
add.ftz.f32 %f86, %f90, %f86;
st.shared.f32 [%rd53+512], %f86;
ld.shared.f32 %f91, [%rd56+1024];
add.ftz.f32 %f87, %f91, %f87;
st.shared.f32 [%rd53+1024], %f87;
ld.shared.f32 %f92, [%rd56+1536];
add.ftz.f32 %f88, %f92, %f88;
st.shared.f32 [%rd53+1536], %f88;
$Lt_1_27650:
.loc 16 342 0
shr.u32 %r54, %r54, 1;
mov.u32 %r57, 0;
setp.ne.u32 %p14, %r54, %r57;
@%p14 bra $Lt_1_27394;
$Lt_1_26882:
.loc 16 349 0
// Copy the reduced force / energy values back into the accumulators.
mov.f32 %f31, %f85;
.loc 16 350 0
mov.f32 %f30, %f86;
.loc 16 351 0
mov.f32 %f29, %f87;
.loc 16 352 0
mov.f32 %f32, %f88;
ld.param.s32 %r58, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r59, 0;
setp.le.s32 %p15, %r58, %r59;
@%p15 bra $Lt_1_28418;
.loc 16 356 0
// Same reduction for the six virial terms (rows 0..5 of red_acc).
mov.f32 %f85, %f9;
st.shared.f32 [%rd53+0], %f85;
mov.f32 %f86, %f11;
st.shared.f32 [%rd53+512], %f86;
mov.f32 %f87, %f13;
st.shared.f32 [%rd53+1024], %f87;
mov.f32 %f88, %f15;
st.shared.f32 [%rd53+1536], %f88;
mov.f32 %f93, %f17;
st.shared.f32 [%rd53+2048], %f93;
mov.f32 %f94, %f19;
st.shared.f32 [%rd53+2560], %f94;
.loc 16 358 0
// Restart from s = t_per_atom / 2; %p12 still holds (s != 0).
mov.s32 %r60, %r53;
@!%p12 bra $Lt_1_28930;
$Lt_1_29442:
setp.ge.u32 %p16, %r10, %r60;
@%p16 bra $Lt_1_29698;
.loc 16 361 0
add.u32 %r61, %r1, %r60;
cvt.u64.u32 %rd57, %r61;
mul.wide.u32 %rd58, %r61, 4;
add.u64 %rd59, %rd50, %rd58;
ld.shared.f32 %f95, [%rd59+0];
add.ftz.f32 %f85, %f95, %f85;
st.shared.f32 [%rd53+0], %f85;
ld.shared.f32 %f96, [%rd59+512];
add.ftz.f32 %f86, %f96, %f86;
st.shared.f32 [%rd53+512], %f86;
ld.shared.f32 %f97, [%rd59+1024];
add.ftz.f32 %f87, %f97, %f87;
st.shared.f32 [%rd53+1024], %f87;
ld.shared.f32 %f98, [%rd59+1536];
add.ftz.f32 %f88, %f98, %f88;
st.shared.f32 [%rd53+1536], %f88;
ld.shared.f32 %f99, [%rd59+2048];
add.ftz.f32 %f93, %f99, %f93;
st.shared.f32 [%rd53+2048], %f93;
ld.shared.f32 %f100, [%rd59+2560];
add.ftz.f32 %f94, %f100, %f94;
st.shared.f32 [%rd53+2560], %f94;
$Lt_1_29698:
.loc 16 358 0
shr.u32 %r60, %r60, 1;
mov.u32 %r62, 0;
setp.ne.u32 %p17, %r60, %r62;
@%p17 bra $Lt_1_29442;
$Lt_1_28930:
.loc 16 366 0
// Copy the reduced virial terms back into their accumulators.
mov.f32 %f9, %f85;
mov.f32 %f11, %f86;
mov.f32 %f13, %f87;
mov.f32 %f15, %f88;
mov.f32 %f17, %f93;
mov.f32 %f19, %f94;
$Lt_1_28418:
$Lt_1_26370:
// Only the first thread of each atom group (%r10 == 0) with a valid atom
// index (%p4) writes results.
selp.s32 %r63, 1, 0, %p4;
mov.s32 %r64, 0;
set.eq.u32.s32 %r65, %r10, %r64;
neg.s32 %r66, %r65;
and.b32 %r67, %r63, %r66;
mov.u32 %r68, 0;
setp.eq.s32 %p18, %r67, %r68;
@%p18 bra $Lt_1_30466;
.loc 16 372 0
// %rd63 = &engv[atom index]; each stored quantity advances it by inum floats.
cvt.s64.s32 %rd60, %r13;
ld.param.u64 %rd61, [__cudaparm_kernel_pair_fast_engv];
mul.wide.s32 %rd62, %r13, 4;
add.u64 %rd63, %rd61, %rd62;
ld.param.s32 %r69, [__cudaparm_kernel_pair_fast_eflag];
mov.u32 %r70, 0;
setp.le.s32 %p19, %r69, %r70;
@%p19 bra $Lt_1_30978;
.loc 16 374 0
// eflag > 0: store the energy first.
st.global.f32 [%rd63+0], %f32;
.loc 16 375 0
cvt.s64.s32 %rd64, %r14;
mul.wide.s32 %rd65, %r14, 4;
add.u64 %rd63, %rd63, %rd65;
$Lt_1_30978:
ld.param.s32 %r71, [__cudaparm_kernel_pair_fast_vflag];
mov.u32 %r72, 0;
setp.le.s32 %p20, %r71, %r72;
@%p20 bra $Lt_1_31490;
.loc 16 379 0
// vflag > 0: store the six virial terms, inum floats apart.
mov.f32 %f101, %f9;
st.global.f32 [%rd63+0], %f101;
.loc 16 380 0
cvt.s64.s32 %rd66, %r14;
mul.wide.s32 %rd67, %r14, 4;
add.u64 %rd68, %rd67, %rd63;
.loc 16 379 0
mov.f32 %f102, %f11;
st.global.f32 [%rd68+0], %f102;
.loc 16 380 0
add.u64 %rd69, %rd67, %rd68;
.loc 16 379 0
mov.f32 %f103, %f13;
st.global.f32 [%rd69+0], %f103;
.loc 16 380 0
add.u64 %rd70, %rd67, %rd69;
.loc 16 379 0
mov.f32 %f104, %f15;
st.global.f32 [%rd70+0], %f104;
.loc 16 380 0
add.u64 %rd63, %rd67, %rd70;
.loc 16 379 0
mov.f32 %f105, %f17;
st.global.f32 [%rd63+0], %f105;
mov.f32 %f106, %f19;
add.u64 %rd71, %rd67, %rd63;
st.global.f32 [%rd71+0], %f106;
$Lt_1_31490:
.loc 16 383 0
// ans[atom index] = {force x, force y, force z, %f107}.
// NOTE(review): %f108 is never written in this kernel, so the fourth
// component stored here is unspecified - confirm that consumers ignore it.
ld.param.u64 %rd72, [__cudaparm_kernel_pair_fast_ans];
mul.lo.u64 %rd73, %rd60, 16;
add.u64 %rd74, %rd72, %rd73;
mov.f32 %f107, %f108;
st.global.v4.f32 [%rd74+0], {%f31,%f30,%f29,%f107};
$Lt_1_30466:
.loc 16 385 0
exit;
$LDWend_kernel_pair_fast:
} // kernel_pair_fast

View File

@ -1,101 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bafa_00000000-9_pair_gpu_atom_kernel.cpp3.i (/home/sjplimp/ccBI#.kAZxYr)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bafa_00000000-8_pair_gpu_atom_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "pair_gpu_atom_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// -----------------------------------------------------------------------
// kernel_cast_x: one thread per element i < nall.
//   Reads three consecutive f64 values x[3*i .. 3*i+2] and the s32 type[i],
//   converts all four to f32 and writes them as one 16-byte record
//   x_type[i] = { x, y, z, (float)type }.
// -----------------------------------------------------------------------
.entry kernel_cast_x (
.param .u64 __cudaparm_kernel_cast_x_x_type,
.param .u64 __cudaparm_kernel_cast_x_x,
.param .u64 __cudaparm_kernel_cast_x_type,
.param .s32 __cudaparm_kernel_cast_x_nall)
{
.reg .u32 %blk, %bsz, %thr, %gidx, %count, %itype, %gidx3;
.reg .u64 %type_base, %type_off, %type_ptr;
.reg .u64 %x_base, %x_off, %x_ptr;
.reg .u64 %out_base, %out_off, %out_ptr;
.reg .f32 %xs, %ys, %zs, %ws;
.reg .f64 %xd, %yd, %zd;
.reg .pred %past_end;
.loc 16 34 0
$LDWbegin_kernel_cast_x:
// gidx = ctaid.x * ntid.x + tid.x
mov.u32 %thr, %tid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %bsz, %ntid.x;
mad.lo.u32 %gidx, %blk, %bsz, %thr;
// Threads at or beyond nall have nothing to do.
ld.param.s32 %count, [__cudaparm_kernel_cast_x_nall];
setp.ge.s32 %past_end, %gidx, %count;
@%past_end bra $cast_x_done;
.loc 16 39 0
// ws = (float) type[gidx]
ld.param.u64 %type_base, [__cudaparm_kernel_cast_x_type];
mul.wide.s32 %type_off, %gidx, 4;
add.u64 %type_ptr, %type_base, %type_off;
ld.global.s32 %itype, [%type_ptr+0];
cvt.rn.f32.s32 %ws, %itype;
.loc 16 42 0
// x_ptr = &x[3 * gidx] (8-byte elements); ys = (float) x_ptr[1]
ld.param.u64 %x_base, [__cudaparm_kernel_cast_x_x];
mul.lo.s32 %gidx3, %gidx, 3;
mul.wide.s32 %x_off, %gidx3, 8;
add.u64 %x_ptr, %x_base, %x_off;
ld.global.f64 %yd, [%x_ptr+8];
cvt.rn.ftz.f32.f64 %ys, %yd;
.loc 16 43 0
// zs = (float) x_ptr[2]
ld.global.f64 %zd, [%x_ptr+16];
cvt.rn.ftz.f32.f64 %zs, %zd;
.loc 16 44 0
// xs = (float) x_ptr[0]; then store the packed 16-byte record.
ld.global.f64 %xd, [%x_ptr+0];
cvt.rn.ftz.f32.f64 %xs, %xd;
ld.param.u64 %out_base, [__cudaparm_kernel_cast_x_x_type];
mul.wide.s32 %out_off, %gidx, 16;
add.u64 %out_ptr, %out_base, %out_off;
st.global.v4.f32 [%out_ptr+0], {%xs,%ys,%zs,%ws};
$cast_x_done:
.loc 16 46 0
exit;
$LDWend_kernel_cast_x:
} // kernel_cast_x

View File

@ -1,833 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bb79_00000000-9_pair_gpu_build_kernel.cpp3.i (/home/sjplimp/ccBI#.mdgTku)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bb79_00000000-8_pair_gpu_build_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "pair_gpu_build_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// -----------------------------------------------------------------------
// transpose: tiled transpose of a 32-bit integer matrix.
//   in   rows_in x columns_in matrix, row-major (element = in[row*columns_in+col])
//   out  columns_in x rows_in matrix, row-major (element = out[col*rows_in+row])
// Each block handles one tile whose origin is (ctaid.x*8, ctaid.y*8); tile
// elements are staged in a shared array with a row pitch of 9 words
// (288 bytes = 8 * 9 * 4).
// NOTE(review): presumably launched with 8x8 thread blocks - confirm at the
// launch site.
//
// Fix relative to the compiler output: the tile used to be staged as f32
// (cvt.rn.f32.s32 on the way in, cvt.rzi.ftz.s32.f32 on the way out). An f32
// has a 24-bit significand, so any element with |value| > 2^24 could come
// back changed. The shared tile is declared as untyped .b8 storage, so the
// raw 32-bit integer is now stored and reloaded directly, making the
// transpose exact for every s32 value. The now-unused %f registers are gone.
// -----------------------------------------------------------------------
.entry transpose (
.param .u64 __cudaparm_transpose_out,
.param .u64 __cudaparm_transpose_in,
.param .s32 __cudaparm_transpose_columns_in,
.param .s32 __cudaparm_transpose_rows_in)
{
.reg .u32 %r<32>;
.reg .u64 %rd<23>;
.reg .pred %p<4>;
.shared .align 4 .b8 __cuda___cuda_local_var_32483_32_non_const_block24[288];
.loc 16 64 0
$LDWbegin_transpose:
// %r2 = ctaid.x*8, %r4 = ctaid.y*8 (tile origin)
// %r6 = input column = %r2 + tid.x, %r8 = input row = %r4 + tid.y
mov.u32 %r1, %ctaid.x;
mul.lo.u32 %r2, %r1, 8;
mov.u32 %r3, %ctaid.y;
mul.lo.u32 %r4, %r3, 8;
mov.u32 %r5, %tid.x;
add.u32 %r6, %r2, %r5;
mov.u32 %r7, %tid.y;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_transpose_rows_in];
ld.param.s32 %r10, [__cudaparm_transpose_columns_in];
// Load guard: (rows_in > row) && (columns_in > col), unsigned compares.
set.gt.u32.u32 %r11, %r9, %r8;
neg.s32 %r12, %r11;
set.gt.u32.u32 %r13, %r10, %r6;
neg.s32 %r14, %r13;
and.b32 %r15, %r12, %r14;
mov.u32 %r16, 0;
setp.eq.s32 %p1, %r15, %r16;
@%p1 bra $Lt_0_2306;
.loc 16 76 0
// tile[tid.y][tid.x] = in[row * columns_in + col]
mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24;
ld.param.u64 %rd2, [__cudaparm_transpose_in];
mul.lo.u32 %r17, %r10, %r8;
add.u32 %r18, %r6, %r17;
cvt.u64.u32 %rd3, %r18;
mul.wide.u32 %rd4, %r18, 4;
add.u64 %rd5, %rd2, %rd4;
ld.global.s32 %r19, [%rd5+0];
cvt.u64.u32 %rd6, %r5;
cvt.u64.u32 %rd7, %r7;
mul.wide.u32 %rd8, %r7, 9;
add.u64 %rd9, %rd6, %rd8;
mul.lo.u64 %rd10, %rd9, 4;
add.u64 %rd11, %rd1, %rd10;
// Store the integer bit pattern unchanged (no float conversion).
st.shared.s32 [%rd11+0], %r19;
$Lt_0_2306:
mov.u64 %rd1, __cuda___cuda_local_var_32483_32_non_const_block24;
.loc 16 78 0
// All tile writes must be visible before any thread reads the transposed slot.
bar.sync 0;
// %r20 = output row = %r2 + tid.y, %r21 = output column = %r4 + tid.x
add.u32 %r20, %r2, %r7;
add.u32 %r21, %r4, %r5;
// Store guard: (rows_in > %r21) && (columns_in > %r20), unsigned compares.
set.gt.u32.u32 %r22, %r9, %r21;
neg.s32 %r23, %r22;
set.gt.u32.u32 %r24, %r10, %r20;
neg.s32 %r25, %r24;
and.b32 %r26, %r23, %r25;
mov.u32 %r27, 0;
setp.eq.s32 %p2, %r26, %r27;
@%p2 bra $Lt_0_2818;
.loc 16 83 0
// out[%r20 * rows_in + %r21] = tile[tid.x][tid.y]
cvt.u64.u32 %rd12, %r7;
cvt.u64.u32 %rd13, %r5;
mul.wide.u32 %rd14, %r5, 9;
add.u64 %rd15, %rd12, %rd14;
mul.lo.u64 %rd16, %rd15, 4;
add.u64 %rd17, %rd1, %rd16;
// Reload the integer bit pattern unchanged (no float conversion).
ld.shared.s32 %r28, [%rd17+0];
ld.param.u64 %rd18, [__cudaparm_transpose_out];
mul.lo.u32 %r29, %r9, %r20;
add.u32 %r30, %r21, %r29;
cvt.u64.u32 %rd19, %r30;
mul.wide.u32 %rd20, %r30, 4;
add.u64 %rd21, %rd18, %rd20;
st.global.s32 [%rd21+0], %r28;
$Lt_0_2818:
.loc 16 84 0
exit;
$LDWend_transpose:
} // transpose
// Texture reference used to fetch 4-component f32 records.
.global .texref neigh_tex;
// -----------------------------------------------------------------------
// calc_cell_id: one thread per element i < nall.
//   Fetches the record for i from neigh_tex and, for each of its first three
//   components p (paired with boxlo0/1/2 and boxhi0/1/2), computes
//       c = (u32) trunc( clamp(p - lo, -cell_size, (hi - lo) + cell_size)
//                        / cell_size + 1.0 )
//   then stores
//       cell_id[i]     = (c2 * ncellx) * ncelly + c1 * ncellx + c0
//       particle_id[i] = i
//   The `pos` parameter is not read by this kernel.
// -----------------------------------------------------------------------
.entry calc_cell_id (
.param .u64 __cudaparm_calc_cell_id_pos,
.param .u64 __cudaparm_calc_cell_id_cell_id,
.param .u64 __cudaparm_calc_cell_id_particle_id,
.param .f32 __cudaparm_calc_cell_id_boxlo0,
.param .f32 __cudaparm_calc_cell_id_boxlo1,
.param .f32 __cudaparm_calc_cell_id_boxlo2,
.param .f32 __cudaparm_calc_cell_id_boxhi0,
.param .f32 __cudaparm_calc_cell_id_boxhi1,
.param .f32 __cudaparm_calc_cell_id_boxhi2,
.param .f32 __cudaparm_calc_cell_id_cell_size,
.param .s32 __cudaparm_calc_cell_id_ncellx,
.param .s32 __cudaparm_calc_cell_id_ncelly,
.param .s32 __cudaparm_calc_cell_id_nall)
{
.reg .u32 %thr, %blk, %bsz, %gidx, %count, %zero;
.reg .u32 %ncx, %ncy, %cx, %cy, %cz, %zpart, %yzpart, %cell;
.reg .u64 %byte_off, %cid_base, %cid_ptr, %pid_base, %pid_ptr;
.reg .f32 %px, %py, %pz, %pw;
.reg .f32 %csize, %neg_csize, %lo, %hi, %span, %upper, %rel, %bounded, %scaled;
.reg .f64 %one, %wide;
.reg .pred %past_end;
.loc 16 90 0
$LDWbegin_calc_cell_id:
// gidx = tid.x + ctaid.x * ntid.x
mov.u32 %thr, %tid.x;
mov.u32 %blk, %ctaid.x;
mov.u32 %bsz, %ntid.x;
mad.lo.u32 %gidx, %blk, %bsz, %thr;
ld.param.s32 %count, [__cudaparm_calc_cell_id_nall];
setp.ge.s32 %past_end, %gidx, %count;
@%past_end bra $cell_id_done;
.loc 16 94 0
// Fetch the record for gidx; only the first three components are used.
mov.u32 %zero, 0;
tex.1d.v4.f32.s32 {%px,%py,%pz,%pw},[neigh_tex,{%gidx,%zero,%zero,%zero}];
.loc 16 107 0
ld.param.f32 %csize, [__cudaparm_calc_cell_id_cell_size];
neg.ftz.f32 %neg_csize, %csize;
ld.param.s32 %ncx, [__cudaparm_calc_cell_id_ncellx];
ld.param.s32 %ncy, [__cudaparm_calc_cell_id_ncelly];
mov.f64 %one, 0d3ff0000000000000; // 1
// Component 0 (boxlo0 / boxhi0) -> cx
ld.param.f32 %lo, [__cudaparm_calc_cell_id_boxlo0];
ld.param.f32 %hi, [__cudaparm_calc_cell_id_boxhi0];
sub.ftz.f32 %span, %hi, %lo;
add.ftz.f32 %upper, %csize, %span;
sub.ftz.f32 %rel, %px, %lo;
max.ftz.f32 %bounded, %neg_csize, %rel;
min.ftz.f32 %bounded, %upper, %bounded;
div.approx.ftz.f32 %scaled, %bounded, %csize;
cvt.ftz.f64.f32 %wide, %scaled;
add.f64 %wide, %wide, %one;
cvt.rzi.u32.f64 %cx, %wide;
// Component 1 (boxlo1 / boxhi1) -> cy
ld.param.f32 %lo, [__cudaparm_calc_cell_id_boxlo1];
ld.param.f32 %hi, [__cudaparm_calc_cell_id_boxhi1];
sub.ftz.f32 %span, %hi, %lo;
add.ftz.f32 %upper, %csize, %span;
sub.ftz.f32 %rel, %py, %lo;
max.ftz.f32 %bounded, %neg_csize, %rel;
min.ftz.f32 %bounded, %upper, %bounded;
div.approx.ftz.f32 %scaled, %bounded, %csize;
cvt.ftz.f64.f32 %wide, %scaled;
add.f64 %wide, %wide, %one;
cvt.rzi.u32.f64 %cy, %wide;
// Component 2 (boxlo2 / boxhi2) -> cz
ld.param.f32 %lo, [__cudaparm_calc_cell_id_boxlo2];
ld.param.f32 %hi, [__cudaparm_calc_cell_id_boxhi2];
sub.ftz.f32 %span, %hi, %lo;
add.ftz.f32 %upper, %csize, %span;
sub.ftz.f32 %rel, %pz, %lo;
max.ftz.f32 %bounded, %neg_csize, %rel;
min.ftz.f32 %bounded, %upper, %bounded;
div.approx.ftz.f32 %scaled, %bounded, %csize;
cvt.ftz.f64.f32 %wide, %scaled;
add.f64 %wide, %wide, %one;
cvt.rzi.u32.f64 %cz, %wide;
// cell = (ncellx * cz) * ncelly + ncellx * cy + cx   (32-bit wrap-around)
mul.lo.u32 %zpart, %ncx, %cz;
mul.lo.u32 %zpart, %ncy, %zpart;
mad.lo.u32 %yzpart, %ncx, %cy, %zpart;
add.u32 %cell, %yzpart, %cx;
.loc 16 111 0
// cell_id[gidx] = cell
mul.wide.s32 %byte_off, %gidx, 4;
ld.param.u64 %cid_base, [__cudaparm_calc_cell_id_cell_id];
add.u64 %cid_ptr, %cid_base, %byte_off;
st.global.u32 [%cid_ptr+0], %cell;
.loc 16 112 0
// particle_id[gidx] = gidx
ld.param.u64 %pid_base, [__cudaparm_calc_cell_id_particle_id];
add.u64 %pid_ptr, %pid_base, %byte_off;
st.global.s32 [%pid_ptr+0], %gidx;
$cell_id_done:
.loc 16 114 0
exit;
$LDWend_calc_cell_id:
} // calc_cell_id
// -----------------------------------------------------------------------
// kernel_calc_cell_counts: one thread per element idx < nall.
//   Let id = cell_id[idx]. The thread writes into cell_counts as follows:
//     idx == 0        : cell_counts[0 .. id]            = 0
//     idx == nall-1   : cell_counts[id+1 .. ncell]      = nall
//     idx > 0 and cell_id[idx-1] != id :
//                       cell_counts[cell_id[idx-1]+1 .. id] = idx
//   NOTE(review): this is meaningful only if cell_id is sorted ascending -
//   presumably guaranteed by the caller; confirm.
// -----------------------------------------------------------------------
.entry kernel_calc_cell_counts (
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,
.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,
.param .s32 __cudaparm_kernel_calc_cell_counts_nall,
.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)
{
.reg .u32 %r<33>;
.reg .u64 %rd<15>;
.reg .pred %p<13>;
.loc 16 117 0
$LDWbegin_kernel_calc_cell_counts:
// %r5 = idx = tid.x + ctaid.x * ntid.x ; %r6 = nall
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r4, %r3;
ld.param.s32 %r6, [__cudaparm_kernel_calc_cell_counts_nall];
// %p1 = (nall > idx); threads outside the range exit immediately.
setp.gt.s32 %p1, %r6, %r5;
@!%p1 bra $Lt_2_7426;
.loc 16 120 0
// %rd4 = &cell_id[idx] ; %r7 = id = cell_id[idx]
ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];
cvt.s64.s32 %rd2, %r5;
mul.wide.s32 %rd3, %r5, 4;
add.u64 %rd4, %rd1, %rd3;
ld.global.u32 %r7, [%rd4+0];
// Case idx == 0: zero cell_counts[0 .. id] (skipped when id + 1 <= 0).
mov.u32 %r8, 0;
setp.ne.s32 %p2, %r5, %r8;
@%p2 bra $Lt_2_7938;
add.s32 %r9, %r7, 1;
mov.u32 %r10, 0;
setp.le.s32 %p3, %r9, %r10;
@%p3 bra $Lt_2_8450;
mov.s32 %r11, %r9;
ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];
// %r12 counts from 0 up to id + 1; %rd5 walks cell_counts one int at a time.
mov.s32 %r12, 0;
mov.s32 %r13, %r11;
$Lt_2_8962:
//<loop> Loop body line 120, nesting depth: 1, estimated iterations: unknown
.loc 16 125 0
mov.s32 %r14, 0;
st.global.s32 [%rd5+0], %r14;
add.s32 %r12, %r12, 1;
add.u64 %rd5, %rd5, 4;
setp.ne.s32 %p4, %r9, %r12;
@%p4 bra $Lt_2_8962;
$Lt_2_8450:
$Lt_2_7938:
// Case idx == nall - 1: fill cell_counts[id+1 .. ncell] with nall
// (skipped when id + 1 > ncell).
sub.s32 %r15, %r6, 1;
setp.ne.s32 %p5, %r5, %r15;
@%p5 bra $Lt_2_9474;
.loc 16 128 0
add.s32 %r9, %r7, 1;
mov.s32 %r16, %r9;
ld.param.s32 %r17, [__cudaparm_kernel_calc_cell_counts_ncell];
setp.gt.s32 %p6, %r9, %r17;
@%p6 bra $Lt_2_9986;
sub.s32 %r18, %r17, %r7;
// %r19 = ncell + 1 is the exclusive end of the loop index %r16.
add.s32 %r19, %r17, 1;
ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];
cvt.s64.s32 %rd7, %r9;
mul.wide.s32 %rd8, %r9, 4;
add.u64 %rd9, %rd6, %rd8;
mov.s32 %r20, %r18;
$Lt_2_10498:
//<loop> Loop body line 128, nesting depth: 1, estimated iterations: unknown
.loc 16 129 0
st.global.s32 [%rd9+0], %r6;
add.s32 %r16, %r16, 1;
add.u64 %rd9, %rd9, 4;
setp.ne.s32 %p7, %r19, %r16;
@%p7 bra $Lt_2_10498;
$Lt_2_9986:
$Lt_2_9474:
// Case (idx < nall) && (idx > 0): compare with the previous element.
selp.s32 %r21, 1, 0, %p1;
mov.s32 %r22, 0;
set.gt.u32.s32 %r23, %r5, %r22;
neg.s32 %r24, %r23;
and.b32 %r25, %r21, %r24;
mov.u32 %r26, 0;
setp.eq.s32 %p8, %r25, %r26;
@%p8 bra $Lt_2_11010;
.loc 16 133 0
// %r27 = cell_id[idx - 1]; nothing to do when it equals id.
ld.global.u32 %r27, [%rd4+-4];
setp.eq.s32 %p9, %r7, %r27;
@%p9 bra $Lt_2_11522;
.loc 16 135 0
// Fill cell_counts[prev+1 .. id] with idx (skipped when prev + 1 > id).
add.s32 %r28, %r27, 1;
mov.s32 %r29, %r28;
setp.gt.s32 %p10, %r28, %r7;
@%p10 bra $Lt_2_12034;
sub.s32 %r30, %r7, %r27;
// %r9 = id + 1 is the exclusive end of the loop index %r29.
add.s32 %r9, %r7, 1;
ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];
cvt.s64.s32 %rd11, %r28;
mul.wide.s32 %rd12, %r28, 4;
add.u64 %rd13, %rd10, %rd12;
mov.s32 %r31, %r30;
$Lt_2_12546:
//<loop> Loop body line 135, nesting depth: 1, estimated iterations: unknown
.loc 16 136 0
st.global.s32 [%rd13+0], %r5;
add.s32 %r29, %r29, 1;
add.u64 %rd13, %rd13, 4;
setp.ne.s32 %p11, %r9, %r29;
@%p11 bra $Lt_2_12546;
$Lt_2_12034:
$Lt_2_11522:
$Lt_2_11010:
$Lt_2_7426:
.loc 16 140 0
exit;
$LDWend_kernel_calc_cell_counts:
} // kernel_calc_cell_counts
.entry calc_neigh_list_cell (
.param .u64 __cudaparm_calc_neigh_list_cell_pos,
.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,
.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,
.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,
.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,
.param .u64 __cudaparm_calc_neigh_list_cell_host_numj,
.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,
.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,
.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,
.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,
.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,
.param .s32 __cudaparm_calc_neigh_list_cell_inum,
.param .s32 __cudaparm_calc_neigh_list_cell_nt,
.param .s32 __cudaparm_calc_neigh_list_cell_nall)
{
.reg .u32 %r<106>;
.reg .u64 %rd<46>;
.reg .f32 %f<43>;
.reg .f64 %fd<4>;
.reg .pred %p<22>;
.shared .align 4 .b8 __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480[512];
.shared .align 16 .b8 __cuda___cuda_local_var_32578_34_non_const_pos_sh992[2048];
// __cuda_local_var_32592_12_non_const_atom_i = 16
.loc 16 151 0
$LDWbegin_calc_neigh_list_cell:
.loc 16 163 0
ld.param.s32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];
mov.u32 %r2, %ctaid.y;
rem.u32 %r3, %r2, %r1;
div.u32 %r4, %r2, %r1;
ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];
mul.lo.s32 %r6, %r5, %r3;
mul.lo.s32 %r7, %r5, %r4;
mul.lo.s32 %r8, %r7, %r1;
cvt.s32.u32 %r9, %ctaid.x;
ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];
add.s32 %r10, %r6, %r8;
add.s32 %r11, %r9, %r10;
cvt.s64.s32 %rd2, %r11;
mul.wide.s32 %rd3, %r11, 4;
add.u64 %rd4, %rd1, %rd3;
ldu.global.s32 %r12, [%rd4+0];
.loc 16 164 0
ldu.global.s32 %r13, [%rd4+4];
.loc 16 172 0
sub.s32 %r14, %r13, %r12;
mov.u32 %r15, %ntid.x;
cvt.rn.f32.u32 %f1, %r15;
cvt.rn.f32.s32 %f2, %r14;
div.approx.ftz.f32 %f3, %f2, %f1;
cvt.rpi.ftz.f32.f32 %f4, %f3;
mov.f32 %f5, 0f00000000; // 0
setp.gt.ftz.f32 %p1, %f4, %f5;
@!%p1 bra $Lt_3_13314;
sub.s32 %r16, %r3, 1;
mov.s32 %r17, 0;
max.s32 %r18, %r16, %r17;
sub.s32 %r19, %r1, 1;
add.s32 %r20, %r3, 1;
min.s32 %r21, %r19, %r20;
ld.param.s32 %r22, [__cudaparm_calc_neigh_list_cell_ncellz];
sub.s32 %r23, %r22, 1;
add.s32 %r24, %r4, 1;
min.s32 %r25, %r23, %r24;
sub.s32 %r26, %r9, 1;
mov.s32 %r27, 0;
max.s32 %r28, %r26, %r27;
add.s32 %r29, %r9, 1;
sub.s32 %r30, %r5, 1;
min.s32 %r31, %r29, %r30;
cvt.s32.u32 %r32, %tid.x;
add.s32 %r33, %r12, %r32;
mov.u32 %r34, 0;
ld.param.s32 %r35, [__cudaparm_calc_neigh_list_cell_inum];
cvt.s64.s32 %rd5, %r35;
sub.s32 %r36, %r4, 1;
mov.s32 %r37, %r33;
mul.wide.s32 %rd6, %r35, 4;
mov.s32 %r38, 0;
max.s32 %r39, %r36, %r38;
setp.ge.s32 %p2, %r25, %r39;
ld.param.s32 %r40, [__cudaparm_calc_neigh_list_cell_nt];
ld.param.s32 %r41, [__cudaparm_calc_neigh_list_cell_nall];
mov.s32 %r42, 0;
mov.u64 %rd7, __cuda___cuda_local_var_32577_31_non_const_cell_list_sh480;
mov.u64 %rd8, __cuda___cuda_local_var_32578_34_non_const_pos_sh992;
$Lt_3_13826:
//<loop> Loop body line 172, nesting depth: 1, estimated iterations: unknown
.loc 16 174 0
mov.s32 %r43, %r41;
setp.ge.s32 %p3, %r37, %r13;
@%p3 bra $Lt_3_14082;
.loc 16 180 0
ld.param.u64 %rd9, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
add.u32 %r44, %r33, %r34;
cvt.s64.s32 %rd10, %r44;
mul.wide.s32 %rd11, %r44, 4;
add.u64 %rd12, %rd9, %rd11;
ld.global.s32 %r43, [%rd12+0];
$Lt_3_14082:
setp.lt.s32 %p4, %r43, %r40;
@!%p4 bra $Lt_3_14594;
.loc 16 183 0
mov.u32 %r45, %r43;
mov.s32 %r46, 0;
mov.u32 %r47, %r46;
mov.s32 %r48, 0;
mov.u32 %r49, %r48;
mov.s32 %r50, 0;
mov.u32 %r51, %r50;
tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r47,%r49,%r51}];
mov.f32 %f10, %f6;
mov.f32 %f11, %f7;
mov.f32 %f12, %f8;
mov.f32 %f13, %f10;
mov.f32 %f14, %f11;
mov.f32 %f15, %f12;
$Lt_3_14594:
cvt.s64.s32 %rd13, %r43;
mul.wide.s32 %rd14, %r43, 4;
setp.ge.s32 %p5, %r43, %r35;
@%p5 bra $Lt_3_15362;
.loc 16 186 0
mov.s32 %r52, %r35;
.loc 16 187 0
ld.param.u64 %rd15, [__cudaparm_calc_neigh_list_cell_nbor_list];
add.u64 %rd16, %rd13, %rd5;
mul.lo.u64 %rd17, %rd16, 4;
add.u64 %rd18, %rd15, %rd17;
mov.s64 %rd19, %rd18;
.loc 16 188 0
add.u64 %rd20, %rd6, %rd18;
.loc 16 189 0
add.u64 %rd21, %rd14, %rd15;
st.global.s32 [%rd21+0], %r43;
bra.uni $Lt_3_15106;
$Lt_3_15362:
.loc 16 192 0
ld.param.u64 %rd22, [__cudaparm_calc_neigh_list_cell_host_numj];
add.u64 %rd23, %rd22, %rd14;
sub.u64 %rd19, %rd23, %rd6;
.loc 16 193 0
ld.param.u64 %rd24, [__cudaparm_calc_neigh_list_cell_host_nbor_list];
ld.param.s32 %r53, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
sub.s32 %r54, %r43, %r35;
mul.lo.s32 %r55, %r53, %r54;
cvt.s64.s32 %rd25, %r55;
mul.wide.s32 %rd26, %r55, 4;
add.u64 %rd20, %rd24, %rd26;
mov.s32 %r52, 1;
$Lt_3_15106:
.loc 16 198 0
mov.s32 %r56, %r39;
@!%p2 bra $Lt_3_23298;
sub.s32 %r57, %r25, %r39;
add.s32 %r58, %r57, 1;
setp.le.s32 %p6, %r18, %r21;
add.s32 %r59, %r25, 1;
mov.s32 %r60, 0;
mov.s32 %r61, %r58;
$Lt_3_16130:
//<loop> Loop body line 198, nesting depth: 1, estimated iterations: unknown
.loc 16 199 0
mov.s32 %r62, %r18;
@!%p6 bra $Lt_3_16386;
sub.s32 %r63, %r21, %r18;
add.s32 %r64, %r63, 1;
setp.ge.s32 %p7, %r31, %r28;
add.s32 %r65, %r21, 1;
mov.s32 %r66, %r64;
$Lt_3_16898:
//<loop> Loop body line 199, nesting depth: 2, estimated iterations: unknown
@!%p7 bra $Lt_3_17154;
sub.s32 %r67, %r31, %r28;
add.s32 %r68, %r67, 1;
mul.lo.s32 %r69, %r62, %r5;
mul.lo.s32 %r70, %r56, %r5;
mul.lo.s32 %r71, %r70, %r1;
add.s32 %r72, %r31, 1;
add.s32 %r73, %r69, %r71;
add.s32 %r74, %r73, %r28;
add.s32 %r75, %r72, %r73;
cvt.s64.s32 %rd27, %r74;
mul.wide.s32 %rd28, %r74, 4;
add.u64 %rd29, %rd1, %rd28;
mov.s32 %r76, %r68;
$Lt_3_17666:
//<loop> Loop body line 199, nesting depth: 3, estimated iterations: unknown
.loc 16 204 0
ld.global.s32 %r77, [%rd29+0];
.loc 16 205 0
ld.global.s32 %r78, [%rd29+4];
.loc 16 209 0
sub.s32 %r79, %r78, %r77;
cvt.rn.f32.s32 %f16, %r79;
mov.f32 %f17, 0f43000000; // 128
div.approx.ftz.f32 %f18, %f16, %f17;
cvt.rpi.ftz.f32.f32 %f19, %f18;
cvt.rzi.ftz.s32.f32 %r80, %f19;
mov.u32 %r81, 0;
setp.le.s32 %p8, %r80, %r81;
@%p8 bra $Lt_3_17922;
mov.s32 %r82, %r80;
mov.s32 %r83, 0;
setp.lt.s32 %p9, %r43, %r40;
mul.lo.s32 %r84, %r80, 128;
mov.s32 %r85, %r82;
$Lt_3_18434:
//<loop> Loop body line 209, nesting depth: 4, estimated iterations: unknown
sub.s32 %r86, %r79, %r83;
mov.s32 %r87, 128;
min.s32 %r88, %r86, %r87;
setp.le.s32 %p10, %r88, %r32;
@%p10 bra $Lt_3_18690;
.loc 16 215 0
ld.param.u64 %rd30, [__cudaparm_calc_neigh_list_cell_cell_particle_id];
add.s32 %r89, %r83, %r32;
add.s32 %r90, %r77, %r89;
cvt.s64.s32 %rd31, %r90;
mul.wide.s32 %rd32, %r90, 4;
add.u64 %rd33, %rd30, %rd32;
ld.global.s32 %r91, [%rd33+0];
.loc 16 216 0
cvt.s64.s32 %rd34, %r32;
mul.wide.s32 %rd35, %r32, 4;
add.u64 %rd36, %rd7, %rd35;
st.shared.s32 [%rd36+0], %r91;
.loc 16 217 0
mov.u32 %r92, %r91;
mov.s32 %r93, 0;
mov.u32 %r94, %r93;
mov.s32 %r95, 0;
mov.u32 %r96, %r95;
mov.s32 %r97, 0;
mov.u32 %r98, %r97;
tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r92,%r94,%r96,%r98}];
mov.f32 %f24, %f20;
mov.f32 %f25, %f21;
mov.f32 %f26, %f22;
.loc 16 218 0
mul.lo.u64 %rd37, %rd34, 16;
add.u64 %rd38, %rd8, %rd37;
st.shared.v2.f32 [%rd38+0], {%f24,%f25};
.loc 16 220 0
st.shared.f32 [%rd38+8], %f26;
$Lt_3_18690:
.loc 16 222 0
bar.sync 0;
@!%p9 bra $Lt_3_19714;
mov.u32 %r99, 0;
setp.le.s32 %p11, %r88, %r99;
@%p11 bra $Lt_3_19714;
mov.s32 %r100, %r88;
mov.s64 %rd39, 0;
ld.param.f32 %f27, [__cudaparm_calc_neigh_list_cell_cell_size];
mul.ftz.f32 %f28, %f27, %f27;
mov.s64 %rd40, %rd8;
mov.f32 %f29, %f15;
mov.f32 %f30, %f14;
mov.f32 %f31, %f13;
mov.s32 %r101, 0;
mov.s32 %r102, %r100;
$Lt_3_20226:
//<loop> Loop body line 222, nesting depth: 5, estimated iterations: unknown
ld.shared.v4.f32 {%f32,%f33,%f34,_}, [%rd40+0];
.loc 16 228 0
sub.ftz.f32 %f35, %f31, %f32;
.loc 16 229 0
sub.ftz.f32 %f36, %f30, %f33;
.loc 16 230 0
sub.ftz.f32 %f37, %f29, %f34;
.loc 16 227 0
mul.ftz.f32 %f38, %f36, %f36;
fma.rn.ftz.f32 %f39, %f35, %f35, %f38;
fma.rn.ftz.f32 %f40, %f37, %f37, %f39;
setp.gt.ftz.f32 %p12, %f28, %f40;
@!%p12 bra $Lt_3_24578;
cvt.ftz.f64.f32 %fd1, %f40;
mov.f64 %fd2, 0d3ee4f8b588e368f1; // 1e-05
setp.gt.f64 %p13, %fd1, %fd2;
@!%p13 bra $Lt_3_24578;
ld.param.s32 %r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];
setp.le.s32 %p14, %r103, %r60;
@%p14 bra $Lt_3_20482;
.loc 16 235 0
mul.lo.u64 %rd41, %rd39, 4;
add.u64 %rd42, %rd7, %rd41;
ld.shared.s32 %r104, [%rd42+0];
st.global.s32 [%rd20+0], %r104;
.loc 16 236 0
cvt.s64.s32 %rd43, %r52;
mul.wide.s32 %rd44, %r52, 4;
add.u64 %rd20, %rd20, %rd44;
$Lt_3_20482:
.loc 16 238 0
add.s32 %r60, %r60, 1;
$Lt_3_24578:
$L_3_12802:
add.s32 %r101, %r101, 1;
add.s64 %rd39, %rd39, 1;
add.u64 %rd40, %rd40, 16;
setp.ne.s32 %p15, %r88, %r101;
@%p15 bra $Lt_3_20226;
$Lt_3_19714:
$Lt_3_19202:
.loc 16 242 0
bar.sync 0;
add.s32 %r83, %r83, 128;
setp.ne.s32 %p16, %r83, %r84;
@%p16 bra $Lt_3_18434;
$Lt_3_17922:
add.s32 %r74, %r74, 1;
add.u64 %rd29, %rd29, 4;
setp.ne.s32 %p17, %r74, %r75;
@%p17 bra $Lt_3_17666;
$Lt_3_17154:
add.s32 %r62, %r62, 1;
setp.ne.s32 %p18, %r65, %r62;
@%p18 bra $Lt_3_16898;
$Lt_3_16386:
add.s32 %r56, %r56, 1;
setp.ne.s32 %p19, %r59, %r56;
@%p19 bra $Lt_3_16130;
bra.uni $Lt_3_15618;
$Lt_3_23298:
mov.s32 %r60, 0;
$Lt_3_15618:
@!%p4 bra $Lt_3_22274;
.loc 16 248 0
st.global.s32 [%rd19+0], %r60;
$Lt_3_22274:
.loc 16 172 0
add.s32 %r42, %r42, 1;
add.u32 %r34, %r34, %r15;
add.s32 %r37, %r37, %r15;
cvt.rn.f32.s32 %f41, %r42;
setp.lt.ftz.f32 %p20, %f41, %f4;
@%p20 bra $Lt_3_13826;
$Lt_3_13314:
.loc 16 250 0
exit;
$LDWend_calc_neigh_list_cell:
} // calc_neigh_list_cell
// kernel_special
// One thread per index ii = ctaid.x*ntid.x + tid.x; threads with ii >= nt exit.
// Each thread walks one neighbor list (selected from dev_nbor when ii < inum,
// otherwise from host_nbor_list/host_numj) and, for every entry j whose
// tag[j] equals an element of this thread's strided column of `special`,
// XORs a code in {1,2,3} shifted left by 30 bits into the entry and writes
// it back in place.
// Parameters (all pointers are read with ld.global / written with st.global):
//   dev_nbor        s32 array; [ii+inum] holds the list length, list starts at [ii+2*inum], stride inum
//   host_nbor_list  s32 array; list for ii >= inum starts at [(ii-inum)*max_nbors], stride 1
//   host_numj       s32 array; [ii-inum] holds the list length for ii >= inum
//   tag             s32 array indexed by the neighbor entry value
//   nspecial        s32 array; three consecutive values are read at [3*ii .. 3*ii+2]
//   special         s32 array; read at [ii + k*nt] for k = 0 .. nspecial[3*ii+2]-1
//   inum, nt, max_nbors  s32 scalars
.entry kernel_special (
.param .u64 __cudaparm_kernel_special_dev_nbor,
.param .u64 __cudaparm_kernel_special_host_nbor_list,
.param .u64 __cudaparm_kernel_special_host_numj,
.param .u64 __cudaparm_kernel_special_tag,
.param .u64 __cudaparm_kernel_special_nspecial,
.param .u64 __cudaparm_kernel_special_special,
.param .s32 __cudaparm_kernel_special_inum,
.param .s32 __cudaparm_kernel_special_nt,
.param .s32 __cudaparm_kernel_special_max_nbors)
{
.reg .u32 %r<34>;
.reg .u64 %rd<36>;
.reg .pred %p<11>;
.loc 16 256 0
$LDWbegin_kernel_special:
// %r5 = ii = ctaid.x * ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r4, %r3;
// Exit when nt <= ii.
ld.param.s32 %r6, [__cudaparm_kernel_special_nt];
setp.le.s32 %p1, %r6, %r5;
@%p1 bra $Lt_4_6146;
.loc 16 264 0
// Load the three thresholds nspecial[3*ii + 0..2] into %r8, %r9, %r10.
ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];
mul.lo.s32 %r7, %r5, 3;
cvt.s64.s32 %rd2, %r7;
mul.wide.s32 %rd3, %r7, 4;
add.u64 %rd4, %rd1, %rd3;
ld.global.s32 %r8, [%rd4+0];
.loc 16 265 0
ld.global.s32 %r9, [%rd4+4];
.loc 16 266 0
ld.global.s32 %r10, [%rd4+8];
// Select the list source: ii < inum uses dev_nbor, otherwise the host_* arrays.
ld.param.s32 %r11, [__cudaparm_kernel_special_inum];
setp.le.s32 %p2, %r11, %r5;
@%p2 bra $Lt_4_6914;
.loc 16 270 0
// dev_nbor path: element stride %r12 = inum.
mov.s32 %r12, %r11;
.loc 16 272 0
// %r13 = dev_nbor[ii + inum] (list length).
cvt.s64.s32 %rd5, %r11;
ld.param.u64 %rd6, [__cudaparm_kernel_special_dev_nbor];
cvt.s64.s32 %rd7, %r5;
add.u64 %rd8, %rd7, %rd5;
mul.lo.u64 %rd9, %rd8, 4;
add.u64 %rd10, %rd6, %rd9;
ld.global.s32 %r13, [%rd10+0];
.loc 16 273 0
// %rd12 = &dev_nbor[ii + 2*inum] (first list entry).
mul.wide.s32 %rd11, %r11, 4;
add.u64 %rd12, %rd10, %rd11;
bra.uni $Lt_4_6658;
$Lt_4_6914:
.loc 16 276 0
// host path: %r14 = ii - inum; %rd12 = &host_nbor_list[max_nbors * %r14].
sub.s32 %r14, %r5, %r11;
ld.param.u64 %rd13, [__cudaparm_kernel_special_host_nbor_list];
ld.param.s32 %r15, [__cudaparm_kernel_special_max_nbors];
mul.lo.s32 %r16, %r15, %r14;
cvt.s64.s32 %rd14, %r16;
mul.wide.s32 %rd15, %r16, 4;
add.u64 %rd12, %rd13, %rd15;
.loc 16 277 0
// %r13 = host_numj[%r14] (list length); element stride %r12 = 1.
ld.param.u64 %rd16, [__cudaparm_kernel_special_host_numj];
cvt.s64.s32 %rd17, %r14;
mul.wide.s32 %rd18, %r14, 4;
add.u64 %rd19, %rd16, %rd18;
ld.global.s32 %r13, [%rd19+0];
mov.s32 %r12, 1;
$Lt_4_6658:
.loc 16 279 0
// %rd22 = end pointer = %rd12 + 4 * (length * stride); skip the loop when end <= start.
mul.lo.s32 %r17, %r13, %r12;
cvt.s64.s32 %rd20, %r17;
mul.wide.s32 %rd21, %r17, 4;
add.u64 %rd22, %rd12, %rd21;
setp.le.u64 %p3, %rd22, %rd12;
@%p3 bra $Lt_4_7170;
// Loop invariants: %p4 = (%r10 > 0), %rd23 = stride, %rd24 = tag base.
mov.s32 %r18, 0;
setp.gt.s32 %p4, %r10, %r18;
cvt.s64.s32 %rd23, %r12;
ld.param.u64 %rd24, [__cudaparm_kernel_special_tag];
$Lt_4_7682:
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
.loc 16 282 0
// %r19 = current list entry.
ld.global.s32 %r19, [%rd12+0];
.loc 16 283 0
// %r20 = tag[%r19].
cvt.s64.s32 %rd25, %r19;
mul.wide.s32 %rd26, %r19, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r20, [%rd27+0];
// No inner scan when %r10 <= 0.
@!%p4 bra $Lt_4_7938;
mov.s32 %r21, %r10;
cvt.s64.s32 %rd28, %r5;
cvt.s64.s32 %rd29, %r6;
// %rd30 = 4*nt is the byte step between successive `special` reads;
// %rd33 starts at &special[ii]; %r22 is the inner counter k.
mul.wide.s32 %rd30, %r6, 4;
ld.param.u64 %rd31, [__cudaparm_kernel_special_special];
mul.wide.s32 %rd32, %r5, 4;
add.u64 %rd33, %rd31, %rd32;
mov.s32 %r22, 0;
mov.s32 %r23, %r21;
$Lt_4_8450:
//<loop> Loop body line 283, nesting depth: 1, estimated iterations: unknown
// Compare special[ii + k*nt] with tag[entry]; skip the update when they differ.
ld.global.s32 %r24, [%rd33+0];
setp.ne.s32 %p5, %r24, %r20;
@%p5 bra $Lt_4_8706;
.loc 16 293 0
// %r31 = 1 + (%r8 <= k) + (%r9 <= k), built from two selects on %p6 and one on %p7.
setp.le.s32 %p6, %r8, %r22;
mov.s32 %r25, 3;
mov.s32 %r26, 2;
selp.s32 %r27, %r25, %r26, %p6;
mov.s32 %r28, 2;
mov.s32 %r29, 1;
selp.s32 %r30, %r28, %r29, %p6;
setp.le.s32 %p7, %r9, %r22;
selp.s32 %r31, %r27, %r30, %p7;
// XOR the code into bits 30 and up of the entry. %r19 is updated in the
// register, so a later match in this inner loop XORs the already-modified value.
shl.b32 %r32, %r31, 30;
xor.b32 %r19, %r19, %r32;
.loc 16 294 0
st.global.s32 [%rd12+0], %r19;
$Lt_4_8706:
// k++, advance the `special` pointer by nt elements, repeat while k != %r10.
add.s32 %r22, %r22, 1;
add.u64 %rd33, %rd30, %rd33;
setp.ne.s32 %p8, %r10, %r22;
@%p8 bra $Lt_4_8450;
$Lt_4_7938:
.loc 16 281 0
// Advance the list pointer by `stride` elements; continue while end > pointer.
mul.lo.u64 %rd34, %rd23, 4;
add.u64 %rd12, %rd12, %rd34;
setp.gt.u64 %p9, %rd22, %rd12;
@%p9 bra $Lt_4_7682;
$Lt_4_7170:
$Lt_4_6146:
.loc 16 300 0
exit;
$LDWend_kernel_special:
} // kernel_special

View File

@ -1,134 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bba8_00000000-9_pair_gpu_dev_kernel.cpp3.i (/home/sjplimp/ccBI#.SuFQHy)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bba8_00000000-8_pair_gpu_dev_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "pair_gpu_dev_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// kernel_zero
// Thread gid = ctaid.x*ntid.x + tid.x stores the 32-bit value 0 to mem[gid]
// when gid < numel; all other threads exit without writing.
//   mem    base address of an s32 array (written with st.global)
//   numel  number of elements to clear
.entry kernel_zero (
.param .u64 __cudaparm_kernel_zero_mem,
.param .s32 __cudaparm_kernel_zero_numel)
{
.reg .u32 %r<9>;
.reg .u64 %rd<6>;
.reg .pred %p<3>;
.loc 16 95 0
$LDWbegin_kernel_zero:
// %r4 = gid, computed with one fused multiply-add (low 32 bits).
mov.u32 %r1, %tid.x;
mov.u32 %r2, %ctaid.x;
mov.u32 %r3, %ntid.x;
mad.lo.u32 %r4, %r2, %r3, %r1;
// Out-of-range threads go straight to the exit label.
ld.param.s32 %r5, [__cudaparm_kernel_zero_numel];
setp.ge.s32 %p1, %r4, %r5;
@%p1 bra $Lt_0_1026;
.loc 16 99 0
// %rd3 = mem + 4*gid (gid sign-extended by mul.wide.s32).
ld.param.u64 %rd1, [__cudaparm_kernel_zero_mem];
mul.wide.s32 %rd2, %r4, 4;
add.u64 %rd3, %rd1, %rd2;
mov.s32 %r6, 0;
st.global.s32 [%rd3+0], %r6;
$Lt_0_1026:
.loc 16 100 0
exit;
$LDWend_kernel_zero:
} // kernel_zero
// kernel_info
// Writes fourteen 32-bit constants to info[0..13] (byte offsets 0..52):
//   200, 32, 32, 1, 8, 64, 128, 11, 8, 128, 128, 128, 128, 8
// Every executing thread performs the same stores; there is no index guard.
//   info  base address of an s32 array with at least 14 elements
.entry kernel_info (
.param .u64 __cudaparm_kernel_info_info)
{
.reg .u32 %r<16>;
.reg .u64 %rd<3>;
.loc 16 102 0
$LDWbegin_kernel_info:
ld.param.u64 %rd1, [__cudaparm_kernel_info_info];
// Materialize each distinct constant once; the stores below reuse them.
mov.s32 %r1, 200;
mov.s32 %r2, 32;
mov.s32 %r3, 1;
mov.s32 %r4, 8;
mov.s32 %r5, 64;
mov.s32 %r6, 128;
mov.s32 %r7, 11;
.loc 16 103 0
st.global.s32 [%rd1+0], %r1;
.loc 16 104 0
st.global.s32 [%rd1+4], %r2;
.loc 16 105 0
st.global.s32 [%rd1+8], %r2;
.loc 16 106 0
st.global.s32 [%rd1+12], %r3;
.loc 16 107 0
st.global.s32 [%rd1+16], %r4;
.loc 16 108 0
st.global.s32 [%rd1+20], %r5;
.loc 16 109 0
st.global.s32 [%rd1+24], %r6;
.loc 16 110 0
st.global.s32 [%rd1+28], %r7;
.loc 16 111 0
st.global.s32 [%rd1+32], %r4;
.loc 16 112 0
st.global.s32 [%rd1+36], %r6;
.loc 16 113 0
st.global.s32 [%rd1+40], %r6;
.loc 16 114 0
st.global.s32 [%rd1+44], %r6;
.loc 16 115 0
st.global.s32 [%rd1+48], %r6;
.loc 16 116 0
st.global.s32 [%rd1+52], %r4;
.loc 16 117 0
exit;
$LDWend_kernel_info:
} // kernel_info

View File

@ -1,118 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bb58_00000000-9_pair_gpu_nbor_kernel.cpp3.i (/home/sjplimp/ccBI#.bBFvWV)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bb58_00000000-8_pair_gpu_nbor_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "pair_gpu_nbor_kernel.cu"
.file 17 "/usr/local/cuda/include/common_functions.h"
.file 18 "/usr/local/cuda/include/math_functions.h"
.file 19 "/usr/local/cuda/include/math_constants.h"
.file 20 "/usr/local/cuda/include/device_functions.h"
.file 21 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// kernel_unpack
// One thread per index ii = ctaid.x*ntid.x + tid.x; threads with ii >= inum exit.
// Each thread copies a contiguous run of s32 values out of dev_ij into a
// strided column of dev_nbor:
//   count  = dev_nbor[ii + inum]
//   offset = dev_nbor[ii + 2*inum]   (read before it is overwritten)
//   dev_nbor[ii + (2+k)*inum] = dev_ij[offset + k]  for k = 0 .. count-1
// The first copy overwrites the slot the offset was read from.
//   dev_nbor  s32 array (read and written with ld/st.global)
//   dev_ij    s32 array (read only here)
//   inum      number of indices handled; also the column stride
.entry kernel_unpack (
.param .u64 __cudaparm_kernel_unpack_dev_nbor,
.param .u64 __cudaparm_kernel_unpack_dev_ij,
.param .s32 __cudaparm_kernel_unpack_inum)
{
.reg .u32 %r<11>;
.reg .u64 %rd<27>;
.reg .pred %p<5>;
.loc 16 29 0
$LDWbegin_kernel_unpack:
// %r5 = ii = ctaid.x * ntid.x + tid.x
mov.u32 %r1, %ctaid.x;
mov.u32 %r2, %ntid.x;
mul.lo.u32 %r3, %r1, %r2;
mov.u32 %r4, %tid.x;
add.u32 %r5, %r4, %r3;
// Exit when inum <= ii.
ld.param.s32 %r6, [__cudaparm_kernel_unpack_inum];
setp.le.s32 %p1, %r6, %r5;
@%p1 bra $Lt_0_2050;
.loc 16 35 0
// %rd6 = &dev_nbor[ii + inum]; %r7 = count stored there.
cvt.s64.s32 %rd1, %r6;
ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];
cvt.s64.s32 %rd3, %r5;
add.u64 %rd4, %rd3, %rd1;
mul.lo.u64 %rd5, %rd4, 4;
add.u64 %rd6, %rd2, %rd5;
ld.global.s32 %r7, [%rd6+0];
.loc 16 36 0
// %rd7 = 4*inum (byte stride); %rd9 = write pointer = &dev_nbor[ii + 2*inum].
mul.wide.s32 %rd7, %r6, 4;
add.u64 %rd8, %rd6, %rd7;
mov.s64 %rd9, %rd8;
.loc 16 37 0
// %r8 = offset read from the first write slot; %rd13 = read pointer = &dev_ij[%r8].
ld.param.u64 %rd10, [__cudaparm_kernel_unpack_dev_ij];
ld.global.s32 %r8, [%rd8+0];
cvt.s64.s32 %rd11, %r8;
mul.wide.s32 %rd12, %r8, 4;
add.u64 %rd13, %rd10, %rd12;
.loc 16 38 0
// %rd16 = end of the source run = %rd13 + 4*count; skip the loop when end <= start.
cvt.s64.s32 %rd14, %r7;
mul.wide.s32 %rd15, %r7, 4;
add.u64 %rd16, %rd15, %rd13;
setp.le.u64 %p2, %rd16, %rd13;
@%p2 bra $Lt_0_2562;
// %rd17..%rd25 form max(1, (4*count + 3) / 4) with a signed rounding fix-up.
// The result (%rd25) is not read by any later instruction in this kernel;
// the loop below is controlled by the pointer comparison instead.
add.u64 %rd17, %rd15, 3;
shr.s64 %rd18, %rd17, 63;
mov.s64 %rd19, 3;
and.b64 %rd20, %rd18, %rd19;
add.s64 %rd21, %rd20, %rd17;
shr.s64 %rd22, %rd21, 2;
mov.s64 %rd23, 1;
max.s64 %rd24, %rd22, %rd23;
mov.s64 %rd25, %rd24;
$Lt_0_3074:
//<loop> Loop body line 38, nesting depth: 1, estimated iterations: unknown
.loc 16 41 0
// Copy one element from the source run to the current strided destination slot.
ld.global.s32 %r9, [%rd13+0];
st.global.s32 [%rd9+0], %r9;
.loc 16 42 0
// Destination advances by inum elements.
add.u64 %rd9, %rd7, %rd9;
.loc 16 40 0
// Source advances by one element; continue while end > source pointer.
add.u64 %rd13, %rd13, 4;
setp.gt.u64 %p3, %rd16, %rd13;
@%p3 bra $Lt_0_3074;
$Lt_0_2562:
$Lt_0_2050:
.loc 16 45 0
exit;
$LDWend_kernel_unpack:
} // kernel_unpack

View File

@ -1,900 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bc69_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.fFsh3D)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bc69_00000000-8_pppm_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 17 "pppm_gpu_kernel.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Texture references fetched with tex.1d.v4.f32.s32 in particle_map below.
// pos_tex: the first three fetched components are used there as x, y, z
// against b_lo_x / b_lo_y / b_lo_z.
.global .texref pos_tex;
// q_tex: only the first fetched component is used there; it is widened to
// f64 and multiplied by delvolinv.
.global .texref q_tex;
// particle_map
// Each thread derives an index ii from its global thread id, fetches a
// 4-component float from pos_tex and q_tex at ii, maps the position to an
// integer cell (nx, ny, nz), atomically reserves a slot in counts[cell], and
// writes four f64 values to `ans` for that slot.
// Early exits (no write to ans):
//   - ii >= nlocal
//   - q * delvolinv == 0
//   - any scaled coordinate < 0, or nx >= nlocal_x, ny >= nlocal_y,
//     nz >= nlocal_z            -> *error = 1
//   - reserved slot >= max_atoms -> *error = 2, and the count is decremented again
// Parameters:
//   x_, q_        u64 values; not read anywhere in this kernel body (data comes via pos_tex / q_tex)
//   delvolinv     f64 multiplier applied to the q_tex value
//   nlocal        upper bound on ii
//   counts        s32 array updated with atom.global.add
//   ans           output; 32 bytes (four f64) per record
//   b_lo_x/y/z    f64 origin subtracted from each coordinate
//   delxinv/delyinv/delzinv  f64 scale applied after the subtraction
//   nlocal_x/y/z  cell-count bounds per axis
//   atom_stride   record stride between successive slots of the same cell
//   max_atoms     maximum slots per cell
//   error         s32 flag written with st.global
.entry particle_map (
.param .u64 __cudaparm_particle_map_x_,
.param .u64 __cudaparm_particle_map_q_,
.param .f64 __cudaparm_particle_map_delvolinv,
.param .s32 __cudaparm_particle_map_nlocal,
.param .u64 __cudaparm_particle_map_counts,
.param .u64 __cudaparm_particle_map_ans,
.param .f64 __cudaparm_particle_map_b_lo_x,
.param .f64 __cudaparm_particle_map_b_lo_y,
.param .f64 __cudaparm_particle_map_b_lo_z,
.param .f64 __cudaparm_particle_map_delxinv,
.param .f64 __cudaparm_particle_map_delyinv,
.param .f64 __cudaparm_particle_map_delzinv,
.param .s32 __cudaparm_particle_map_nlocal_x,
.param .s32 __cudaparm_particle_map_nlocal_y,
.param .s32 __cudaparm_particle_map_nlocal_z,
.param .s32 __cudaparm_particle_map_atom_stride,
.param .s32 __cudaparm_particle_map_max_atoms,
.param .u64 __cudaparm_particle_map_error)
{
.reg .u32 %r<50>;
.reg .u64 %rd<12>;
.reg .f32 %f<14>;
.reg .f64 %fd<36>;
.reg .pred %p<11>;
.loc 17 113 0
$LDWbegin_particle_map:
// %r5 = total thread count (nctaid.x * ntid.x); %r7 = global thread id.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mul.lo.u32 %r3, %r2, %r1;
mov.u32 %r4, %nctaid.x;
mul.lo.u32 %r5, %r4, %r1;
mov.u32 %r6, %tid.x;
add.u32 %r7, %r6, %r3;
// Index remap: t = 64*gid; ii (%r12) = t - (t / nthreads) * (nthreads - 1).
sub.s32 %r8, %r5, 1;
mul.lo.s32 %r9, %r7, 64;
div.s32 %r10, %r9, %r5;
mul.lo.s32 %r11, %r8, %r10;
sub.s32 %r12, %r9, %r11;
// Exit when nlocal <= ii.
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
setp.le.s32 %p1, %r13, %r12;
@%p1 bra $Lt_0_7426;
.loc 17 125 0
// Fetch pos_tex[ii]; keep the first three components in %f5, %f6, %f7.
mov.u32 %r14, %r12;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
mov.s32 %r17, 0;
mov.u32 %r18, %r17;
mov.s32 %r19, 0;
mov.u32 %r20, %r19;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 127 0
// Fetch q_tex[ii]; %fd3 = (f64) first component * delvolinv.
mov.u32 %r21, %r12;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
mov.f32 %f12, %f8;
cvt.ftz.f64.f32 %fd1, %f12;
ld.param.f64 %fd2, [__cudaparm_particle_map_delvolinv];
mul.f64 %fd3, %fd1, %fd2;
// Exit when the product compares equal to 0 (setp.neu is false only for an ordered equal).
mov.f64 %fd4, 0d0000000000000000; // 0
setp.neu.f64 %p2, %fd3, %fd4;
@!%p2 bra $Lt_0_7426;
.loc 17 130 0
// %fd9 = delxinv * (x - b_lo_x); a negative value takes the error path.
ld.param.f64 %fd5, [__cudaparm_particle_map_delxinv];
cvt.ftz.f64.f32 %fd6, %f5;
ld.param.f64 %fd7, [__cudaparm_particle_map_b_lo_x];
sub.f64 %fd8, %fd6, %fd7;
mul.f64 %fd9, %fd5, %fd8;
mov.f64 %fd10, 0d0000000000000000; // 0
setp.lt.f64 %p3, %fd9, %fd10;
@%p3 bra $Lt_0_8706;
// %fd15 = delyinv * (y - b_lo_y); a negative value takes the error path.
ld.param.f64 %fd11, [__cudaparm_particle_map_delyinv];
cvt.ftz.f64.f32 %fd12, %f6;
ld.param.f64 %fd13, [__cudaparm_particle_map_b_lo_y];
sub.f64 %fd14, %fd12, %fd13;
mul.f64 %fd15, %fd11, %fd14;
mov.f64 %fd16, 0d0000000000000000; // 0
setp.lt.f64 %p4, %fd15, %fd16;
@%p4 bra $Lt_0_8706;
// %fd21 = delzinv * (z - b_lo_z); a negative value takes the error path.
ld.param.f64 %fd17, [__cudaparm_particle_map_delzinv];
cvt.ftz.f64.f32 %fd18, %f7;
ld.param.f64 %fd19, [__cudaparm_particle_map_b_lo_z];
sub.f64 %fd20, %fd18, %fd19;
mul.f64 %fd21, %fd17, %fd20;
mov.f64 %fd22, 0d0000000000000000; // 0
setp.lt.f64 %p5, %fd21, %fd22;
@%p5 bra $Lt_0_8706;
// Truncate each scaled coordinate toward zero and check it against its upper bound.
// %r28 = nx, %r30 = ny, %r32 = nz.
cvt.rzi.s32.f64 %r28, %fd9;
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
setp.ge.s32 %p6, %r28, %r29;
@%p6 bra $Lt_0_8706;
cvt.rzi.s32.f64 %r30, %fd15;
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
setp.ge.s32 %p7, %r30, %r31;
@%p7 bra $Lt_0_8706;
cvt.rzi.s32.f64 %r32, %fd21;
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
setp.gt.s32 %p8, %r33, %r32;
@%p8 bra $L_0_4866;
$Lt_0_8706:
$L_0_5122:
.loc 17 139 0
// Out-of-range position: *error = 1, then exit.
mov.s32 %r34, 1;
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
st.global.s32 [%rd1+0], %r34;
bra.uni $Lt_0_7426;
$L_0_4866:
.loc 17 146 0
// Linear cell index %r38 = nx + nlocal_x * (ny + nlocal_y * nz).
mul.lo.s32 %r35, %r32, %r31;
add.s32 %r36, %r30, %r35;
mul.lo.s32 %r37, %r36, %r29;
add.s32 %r38, %r28, %r37;
// Atomically increment counts[cell]; %r41 = value before the increment (this thread's slot).
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
cvt.s64.s32 %rd3, %r38;
mul.wide.s32 %rd4, %r38, 4;
add.u64 %rd5, %rd2, %rd4;
mov.s32 %r39, 1;
atom.global.add.s32 %r40, [%rd5], %r39;
mov.s32 %r41, %r40;
// Proceed to the store only when max_atoms > slot.
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
setp.gt.s32 %p9, %r42, %r41;
@%p9 bra $Lt_0_7682;
.loc 17 148 0
// Cell is full: *error = 2, undo the increment, exit.
mov.s32 %r43, 2;
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
st.global.s32 [%rd6+0], %r43;
.loc 16 118 0
mov.s32 %r44, -1;
atom.global.add.s32 %r45, [%rd5], %r44;
bra.uni $Lt_0_7426;
$Lt_0_7682:
.loc 17 151 0
// Record index %r48 = cell + atom_stride * slot; %rd10 = ans + 32 * %r48.
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
mul.lo.s32 %r47, %r46, %r41;
add.s32 %r48, %r38, %r47;
cvt.s64.s32 %rd8, %r48;
mul.wide.s32 %rd9, %r48, 32;
add.u64 %rd10, %rd7, %rd9;
// Components 0 and 1: (nx + 0.5) - scaled x, (ny + 0.5) - scaled y.
cvt.rn.f64.s32 %fd23, %r28;
mov.f64 %fd24, 0d3fe0000000000000; // 0.5
add.f64 %fd25, %fd23, %fd24;
sub.f64 %fd26, %fd25, %fd9;
cvt.rn.f64.s32 %fd27, %r30;
mov.f64 %fd28, 0d3fe0000000000000; // 0.5
add.f64 %fd29, %fd27, %fd28;
sub.f64 %fd30, %fd29, %fd15;
st.global.v2.f64 [%rd10+0], {%fd26,%fd30};
// Components 2 and 3: (nz + 0.5) - scaled z, and q * delvolinv.
cvt.rn.f64.s32 %fd31, %r32;
mov.f64 %fd32, 0d3fe0000000000000; // 0.5
add.f64 %fd33, %fd31, %fd32;
sub.f64 %fd34, %fd33, %fd21;
st.global.v2.f64 [%rd10+16], {%fd34,%fd3};
$Lt_0_7426:
$L_0_4610:
$Lt_0_6914:
$Lt_0_6402:
.loc 17 155 0
exit;
$LDWend_particle_map:
} // particle_map
.entry make_rho (
.param .u64 __cudaparm_make_rho_counts,
.param .u64 __cudaparm_make_rho_atoms,
.param .u64 __cudaparm_make_rho_brick,
.param .u64 __cudaparm_make_rho__rho_coeff,
.param .s32 __cudaparm_make_rho_atom_stride,
.param .s32 __cudaparm_make_rho_npts_x,
.param .s32 __cudaparm_make_rho_npts_y,
.param .s32 __cudaparm_make_rho_npts_z,
.param .s32 __cudaparm_make_rho_nlocal_x,
.param .s32 __cudaparm_make_rho_nlocal_y,
.param .s32 __cudaparm_make_rho_nlocal_z,
.param .s32 __cudaparm_make_rho_order_m_1,
.param .s32 __cudaparm_make_rho_order,
.param .s32 __cudaparm_make_rho_order2)
{
.reg .u32 %r<119>;
.reg .u64 %rd<57>;
.reg .f64 %fd<26>;
.reg .pred %p<27>;
.shared .align 8 .b8 __cuda___cuda_local_var_32531_34_non_const_rho_coeff200[512];
.shared .align 8 .b8 __cuda___cuda_local_var_32532_34_non_const_front712[640];
.shared .align 8 .b8 __cuda___cuda_local_var_32533_34_non_const_ans1352[4096];
.loc 17 164 0
$LDWbegin_make_rho:
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
ld.param.s32 %r2, [__cudaparm_make_rho_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_16898;
.loc 17 171 0
mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 8;
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f64 %fd1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f64 [%rd6+0], %fd1;
$Lt_1_16898:
mov.u64 %rd1, __cuda___cuda_local_var_32531_34_non_const_rho_coeff200;
shr.s32 %r5, %r4, 31;
mov.s32 %r6, 31;
and.b32 %r7, %r5, %r6;
add.s32 %r8, %r7, %r4;
shr.s32 %r9, %r8, 5;
mul.lo.s32 %r10, %r9, 32;
sub.s32 %r11, %r4, %r10;
setp.lt.s32 %p2, %r11, %r2;
@!%p2 bra $Lt_1_17410;
.loc 17 177 0
mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;
mov.f64 %fd2, 0d0000000000000000; // 0
cvt.s64.s32 %rd8, %r11;
shr.s32 %r12, %r4, 31;
mov.s32 %r13, 31;
and.b32 %r14, %r12, %r13;
add.s32 %r15, %r14, %r4;
shr.s32 %r16, %r15, 5;
cvt.s64.s32 %rd9, %r16;
mul.wide.s32 %rd10, %r16, 40;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 8;
add.u64 %rd13, %rd7, %rd12;
st.shared.f64 [%rd13+256], %fd2;
$Lt_1_17410:
mov.u64 %rd7, __cuda___cuda_local_var_32532_34_non_const_front712;
.loc 17 179 0
bar.sync 0;
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
shr.s32 %r18, %r17, 31;
mov.s32 %r19, 31;
and.b32 %r20, %r18, %r19;
add.s32 %r21, %r20, %r17;
shr.s32 %r22, %r21, 5;
add.s32 %r23, %r22, 1;
mov.u32 %r24, 0;
setp.le.s32 %p3, %r23, %r24;
@%p3 bra $Lt_1_17922;
shr.s32 %r25, %r4, 31;
mov.s32 %r26, 31;
and.b32 %r27, %r25, %r26;
add.s32 %r28, %r27, %r4;
shr.s32 %r29, %r28, 5;
add.s32 %r30, %r11, 32;
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
mul.lo.s32 %r33, %r31, %r32;
mov.u32 %r34, %ctaid.x;
mul.lo.u32 %r35, %r34, 2;
add.u32 %r36, %r29, %r35;
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
div.s32 %r38, %r36, %r37;
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
setp.lt.s32 %p4, %r38, %r39;
sub.s32 %r40, %r39, %r38;
mov.s32 %r41, 0;
selp.s32 %r42, %r40, %r41, %p4;
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
setp.ge.s32 %p5, %r38, %r43;
sub.s32 %r44, %r43, %r38;
add.s32 %r45, %r44, %r2;
sub.s32 %r46, %r45, 1;
selp.s32 %r47, %r46, %r2, %p5;
rem.s32 %r48, %r36, %r37;
setp.lt.s32 %p6, %r48, %r39;
sub.s32 %r49, %r39, %r48;
mov.s32 %r50, 0;
selp.s32 %r51, %r49, %r50, %p6;
setp.ge.s32 %p7, %r48, %r31;
sub.s32 %r52, %r31, %r48;
add.s32 %r53, %r52, %r2;
sub.s32 %r54, %r53, 1;
selp.s32 %r55, %r54, %r2, %p7;
mov.s32 %r56, %r23;
mov.s32 %r57, 0;
setp.gt.s32 %p8, %r2, %r57;
mov.s32 %r58, 0;
cvt.s64.s32 %rd14, %r11;
cvt.s64.s32 %rd15, %r29;
mul.lo.s32 %r59, %r23, 32;
mul.wide.s32 %rd16, %r29, 40;
add.u64 %rd17, %rd14, %rd16;
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
setp.gt.s32 %p9, %r60, %r38;
mul.lo.u64 %rd18, %rd17, 8;
selp.s32 %r61, 1, 0, %p9;
add.u64 %rd19, %rd18, %rd7;
mov.u64 %rd20, __cuda___cuda_local_var_32533_34_non_const_ans1352;
mov.s32 %r62, %r56;
$Lt_1_18434:
//<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown
@!%p8 bra $Lt_1_18690;
mov.s32 %r63, %r2;
cvt.s64.s32 %rd21, %r4;
mul.wide.s32 %rd22, %r4, 8;
add.u64 %rd23, %rd20, %rd22;
mov.s32 %r64, 0;
mov.s32 %r65, %r63;
$Lt_1_19202:
//<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown
.loc 17 203 0
mov.f64 %fd3, 0d0000000000000000; // 0
st.shared.f64 [%rd23+0], %fd3;
add.s32 %r64, %r64, 1;
add.u64 %rd23, %rd23, 512;
setp.ne.s32 %p10, %r64, %r2;
@%p10 bra $Lt_1_19202;
$Lt_1_18690:
add.s32 %r66, %r11, %r58;
set.lt.u32.s32 %r67, %r66, %r32;
neg.s32 %r68, %r67;
and.b32 %r69, %r61, %r68;
mov.u32 %r70, 0;
setp.eq.s32 %p11, %r69, %r70;
@%p11 bra $Lt_1_20226;
.loc 17 206 0
mov.s32 %r71, %r42;
setp.ge.s32 %p12, %r42, %r47;
@%p12 bra $Lt_1_20226;
sub.s32 %r72, %r47, %r42;
setp.lt.s32 %p13, %r51, %r55;
mov.s32 %r73, %r72;
$Lt_1_20738:
//<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown
.loc 17 208 0
mov.s32 %r74, %r51;
@!%p13 bra $Lt_1_20994;
sub.s32 %r75, %r55, %r51;
sub.s32 %r76, %r71, %r42;
add.s32 %r77, %r38, %r42;
add.s32 %r78, %r48, %r51;
sub.s32 %r79, %r77, %r39;
sub.s32 %r80, %r78, %r39;
add.s32 %r81, %r76, %r79;
mul.lo.s32 %r82, %r33, %r81;
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
mov.s32 %r84, %r75;
$Lt_1_21506:
//<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown
.loc 17 210 0
sub.s32 %r85, %r74, %r51;
add.s32 %r86, %r85, %r80;
mul.lo.s32 %r87, %r86, %r32;
add.s32 %r88, %r82, %r87;
add.s32 %r89, %r66, %r88;
cvt.s64.s32 %rd25, %r89;
mul.wide.s32 %rd26, %r89, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r90, [%rd27+0];
mul.lo.s32 %r91, %r90, %r83;
.loc 17 211 0
mov.s32 %r92, %r89;
setp.ge.s32 %p14, %r89, %r91;
@%p14 bra $Lt_1_21762;
sub.s32 %r93, %r3, 1;
cvt.s64.s32 %rd28, %r83;
mul.wide.s32 %rd29, %r83, 32;
mov.s32 %r94, -1;
setp.gt.s32 %p15, %r93, %r94;
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
mul.lo.u64 %rd31, %rd25, 32;
add.u64 %rd32, %rd30, %rd31;
$Lt_1_22274:
//<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown
.loc 17 212 0
ld.global.f64 %fd4, [%rd32+0];
@!%p15 bra $Lt_1_29954;
sub.s32 %r95, %r93, %r74;
mov.s32 %r96, -1;
sub.s32 %r97, %r96, %r74;
cvt.s64.s32 %rd33, %r2;
mul.wide.s32 %rd34, %r2, 8;
ld.global.f64 %fd5, [%rd32+8];
ld.global.f64 %fd6, [%rd32+16];
cvt.s64.s32 %rd35, %r95;
mul.wide.s32 %rd36, %r95, 8;
add.u64 %rd37, %rd1, %rd36;
sub.s32 %r98, %r93, %r71;
cvt.s64.s32 %rd38, %r98;
mul.wide.s32 %rd39, %r98, 8;
add.u64 %rd40, %rd1, %rd39;
mov.f64 %fd7, 0d0000000000000000; // 0
mov.f64 %fd8, 0d0000000000000000; // 0
$Lt_1_23042:
//<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown
.loc 17 217 0
ld.shared.f64 %fd9, [%rd37+0];
mad.rn.f64 %fd8, %fd8, %fd5, %fd9;
.loc 17 218 0
ld.shared.f64 %fd10, [%rd40+0];
mad.rn.f64 %fd7, %fd7, %fd6, %fd10;
sub.u64 %rd40, %rd40, %rd34;
sub.s32 %r95, %r95, %r2;
sub.u64 %rd37, %rd37, %rd34;
setp.gt.s32 %p16, %r95, %r97;
@%p16 bra $Lt_1_23042;
bra.uni $Lt_1_22530;
$Lt_1_29954:
mov.f64 %fd7, 0d0000000000000000; // 0
mov.f64 %fd8, 0d0000000000000000; // 0
$Lt_1_22530:
.loc 17 220 0
ld.global.f64 %fd11, [%rd32+24];
mul.f64 %fd12, %fd7, %fd8;
mul.f64 %fd13, %fd11, %fd12;
@!%p8 bra $Lt_1_23554;
mov.s32 %r99, %r2;
cvt.s64.s32 %rd41, %r4;
mul.wide.s32 %rd42, %r4, 8;
add.u64 %rd43, %rd20, %rd42;
mov.s32 %r100, 0;
mov.s32 %r101, %r99;
$Lt_1_24066:
//<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown
.loc 17 224 0
add.s32 %r102, %r100, %r1;
mov.s32 %r103, %r102;
setp.lt.s32 %p17, %r102, %r100;
@%p17 bra $Lt_1_30466;
cvt.s64.s32 %rd44, %r2;
mul.wide.s32 %rd34, %r2, 8;
cvt.s64.s32 %rd45, %r102;
mul.wide.s32 %rd46, %r102, 8;
add.u64 %rd47, %rd1, %rd46;
mov.f64 %fd14, 0d0000000000000000; // 0
$Lt_1_24834:
//<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown
.loc 17 225 0
ld.shared.f64 %fd15, [%rd47+0];
mad.rn.f64 %fd14, %fd4, %fd14, %fd15;
sub.s32 %r103, %r103, %r2;
sub.u64 %rd47, %rd47, %rd34;
setp.ge.s32 %p18, %r103, %r100;
@%p18 bra $Lt_1_24834;
bra.uni $Lt_1_24322;
$Lt_1_30466:
mov.f64 %fd14, 0d0000000000000000; // 0
$Lt_1_24322:
.loc 17 226 0
ld.shared.f64 %fd16, [%rd43+0];
mad.rn.f64 %fd17, %fd14, %fd13, %fd16;
st.shared.f64 [%rd43+0], %fd17;
add.s32 %r100, %r100, 1;
add.u64 %rd43, %rd43, 512;
setp.ne.s32 %p19, %r100, %r2;
@%p19 bra $Lt_1_24066;
$Lt_1_23554:
add.s32 %r92, %r92, %r83;
add.u64 %rd32, %rd29, %rd32;
setp.gt.s32 %p20, %r91, %r92;
@%p20 bra $Lt_1_22274;
$Lt_1_21762:
add.s32 %r74, %r74, 1;
setp.ne.s32 %p21, %r55, %r74;
@%p21 bra $Lt_1_21506;
$Lt_1_20994:
add.s32 %r71, %r71, 1;
setp.ne.s32 %p22, %r47, %r71;
@%p22 bra $Lt_1_20738;
$Lt_1_20226:
$Lt_1_19714:
.loc 17 235 0
bar.sync 0;
@!%p2 bra $Lt_1_26626;
.loc 17 237 0
ld.shared.f64 %fd18, [%rd19+256];
st.shared.f64 [%rd19+0], %fd18;
.loc 17 238 0
mov.f64 %fd19, 0d0000000000000000; // 0
st.shared.f64 [%rd19+256], %fd19;
bra.uni $Lt_1_26370;
$Lt_1_26626:
.loc 17 240 0
mov.f64 %fd20, 0d0000000000000000; // 0
st.shared.f64 [%rd19+0], %fd20;
$Lt_1_26370:
@!%p8 bra $Lt_1_26882;
mov.s32 %r104, %r2;
cvt.s64.s32 %rd48, %r4;
mov.s32 %r105, %r11;
add.s32 %r106, %r11, %r2;
mul.wide.s32 %rd49, %r4, 8;
add.u64 %rd50, %rd20, %rd49;
mov.s64 %rd51, %rd19;
mov.s32 %r107, %r104;
$Lt_1_27394:
//<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown
.loc 17 243 0
ld.shared.f64 %fd21, [%rd50+0];
ld.shared.f64 %fd22, [%rd51+0];
add.f64 %fd23, %fd21, %fd22;
st.shared.f64 [%rd51+0], %fd23;
.loc 17 244 0
bar.sync 0;
add.s32 %r105, %r105, 1;
add.u64 %rd51, %rd51, 8;
add.u64 %rd50, %rd50, 512;
setp.ne.s32 %p23, %r105, %r106;
@%p23 bra $Lt_1_27394;
$Lt_1_26882:
set.lt.u32.s32 %r108, %r66, %r17;
neg.s32 %r109, %r108;
and.b32 %r110, %r61, %r109;
mov.u32 %r111, 0;
setp.eq.s32 %p24, %r110, %r111;
@%p24 bra $Lt_1_27906;
.loc 17 248 0
ld.shared.f64 %fd24, [%rd19+0];
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
add.s32 %r112, %r11, %r58;
mul.lo.s32 %r113, %r37, %r17;
mul.lo.s32 %r114, %r38, %r113;
mul.lo.s32 %r115, %r48, %r17;
add.s32 %r116, %r114, %r115;
add.s32 %r117, %r112, %r116;
cvt.s64.s32 %rd53, %r117;
mul.wide.s32 %rd54, %r117, 8;
add.u64 %rd55, %rd52, %rd54;
st.global.f64 [%rd55+0], %fd24;
$Lt_1_27906:
add.s32 %r58, %r58, 32;
setp.ne.s32 %p25, %r58, %r59;
@%p25 bra $Lt_1_18434;
$Lt_1_17922:
.loc 17 252 0
exit;
$LDWend_make_rho:
} // make_rho
//-----------------------------------------------------------
// interp -- per-index gather from the brick array into ans.
//
// Each thread handles index i = ctaid.x * ntid.x + tid.x and exits without
// storing when i >= nlocal.  Position and charge are fetched from the
// pos_tex and q_tex texture references; the x_ and q_ parameters are not
// read by this body.
//
// Steps:
//   1. Threads with tid.x < order + order2 copy one f64 of _rho_coeff into
//      shared memory, followed by a barrier.
//   2. qs = (f64)q * qqrd2e_scale; when qs == 0 the result is all zeros.
//   3. Per-axis weights are evaluated by Horner's rule over the shared
//      coefficients, using the offset (trunc(s) + 0.5 - s) where
//      s = (coordinate - b_lo) * delinv.
//   4. A triple loop of `order` iterations per axis subtracts
//      weight * brick[...] from three f32 accumulators (the arithmetic is
//      done in f64 and rounded back to f32 after every update).
//   5. {%f15, %f14, %f13, 0} is stored to ans + i * 16.
//
// Shared arrays are indexed [row][tid.x] with a 512-byte row stride, i.e.
// 64 f64 slots per row -- presumably the launch uses <= 64 threads per
// block and order <= 8; TODO confirm against the host code.
//-----------------------------------------------------------
.entry interp (
.param .u64 __cudaparm_interp_x_,
.param .u64 __cudaparm_interp_q_,
.param .s32 __cudaparm_interp_nlocal,
.param .u64 __cudaparm_interp_brick,
.param .u64 __cudaparm_interp__rho_coeff,
.param .s32 __cudaparm_interp_npts_x,
.param .s32 __cudaparm_interp_npts_yx,
.param .f64 __cudaparm_interp_b_lo_x,
.param .f64 __cudaparm_interp_b_lo_y,
.param .f64 __cudaparm_interp_b_lo_z,
.param .f64 __cudaparm_interp_delxinv,
.param .f64 __cudaparm_interp_delyinv,
.param .f64 __cudaparm_interp_delzinv,
.param .s32 __cudaparm_interp_order,
.param .s32 __cudaparm_interp_order2,
.param .f64 __cudaparm_interp_qqrd2e_scale,
.param .u64 __cudaparm_interp_ans)
{
.reg .u32 %r<56>;
.reg .u64 %rd<37>;
.reg .f32 %f<19>;
.reg .f64 %fd<63>;
.reg .pred %p<14>;
.shared .align 8 .b8 __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568[512];
.shared .align 8 .b8 __cuda___cuda_local_var_32630_34_non_const_rho1d_06080[4096];
.shared .align 8 .b8 __cuda___cuda_local_var_32631_34_non_const_rho1d_110176[4096];
// __cuda_local_var_32647_12_non_const_ek = 16
.loc 17 262 0
$LDWbegin_interp:
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.x
ld.param.s32 %r1, [__cudaparm_interp_order2];
ld.param.s32 %r2, [__cudaparm_interp_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_2_8706;
.loc 17 269 0
// Stage the coefficient table: shared rho_coeff[tid.x] = _rho_coeff[tid.x]
mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 8;
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f64 %fd1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f64 [%rd6+0], %fd1;
$Lt_2_8706:
mov.u64 %rd1, __cuda___cuda_local_var_32629_34_non_const_rho_coeff5568;
.loc 17 270 0
// All threads reach this barrier before any of them may exit below.
bar.sync 0;
// %r8 = ctaid.x * ntid.x + tid.x; nothing is stored when nlocal <= %r8
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mul.lo.u32 %r7, %r5, %r6;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
setp.le.s32 %p2, %r9, %r8;
@%p2 bra $Lt_2_9218;
.loc 17 278 0
// Fetch element %r8 of pos_tex: %f5, %f6, %f7 = first three components
mov.u32 %r10, %r8;
mov.s32 %r11, 0;
mov.u32 %r12, %r11;
mov.s32 %r13, 0;
mov.u32 %r14, %r13;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r10,%r12,%r14,%r16}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 279 0
// Fetch element %r8 of q_tex; %fd4 = (f64)first component * qqrd2e_scale
mov.u32 %r17, %r8;
mov.s32 %r18, 0;
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r17,%r19,%r21,%r23}];
mov.f32 %f12, %f8;
cvt.ftz.f64.f32 %fd2, %f12;
ld.param.f64 %fd3, [__cudaparm_interp_qqrd2e_scale];
mul.f64 %fd4, %fd2, %fd3;
mov.f64 %fd5, 0d0000000000000000; // 0
setp.neu.f64 %p3, %fd4, %fd5;
// Scaled charge compares equal to zero: skip straight to the zero result
@!%p3 bra $Lt_2_9986;
// %p4 = (order > 0); %fd10 = delxinv * (x - b_lo_x)
mov.s32 %r24, 0;
setp.gt.s32 %p4, %r2, %r24;
ld.param.f64 %fd6, [__cudaparm_interp_delxinv];
cvt.ftz.f64.f32 %fd7, %f5;
ld.param.f64 %fd8, [__cudaparm_interp_b_lo_x];
sub.f64 %fd9, %fd7, %fd8;
mul.f64 %fd10, %fd6, %fd9;
@!%p4 bra $Lt_2_16386;
mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;
mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;
// %r25 = trunc(%fd10); %fd14 = %r25 + 0.5 - %fd10 (x offset)
cvt.rzi.s32.f64 %r25, %fd10;
cvt.rn.f64.s32 %fd11, %r25;
mov.f64 %fd12, 0d3fe0000000000000; // 0.5
add.f64 %fd13, %fd11, %fd12;
sub.f64 %fd14, %fd13, %fd10;
// %fd19 = delyinv * (y - b_lo_y); %r26 = trunc; %fd23 = %r26 + 0.5 - %fd19
ld.param.f64 %fd15, [__cudaparm_interp_delyinv];
cvt.ftz.f64.f32 %fd16, %f6;
ld.param.f64 %fd17, [__cudaparm_interp_b_lo_y];
sub.f64 %fd18, %fd16, %fd17;
mul.f64 %fd19, %fd15, %fd18;
cvt.rzi.s32.f64 %r26, %fd19;
cvt.rn.f64.s32 %fd20, %r26;
mov.f64 %fd21, 0d3fe0000000000000; // 0.5
add.f64 %fd22, %fd20, %fd21;
sub.f64 %fd23, %fd22, %fd19;
// %rd10 / %rd11 walk this thread's column of the two rho1d shared arrays
// (512 bytes per row); %r29 = row k, %r28 = order2 + k
mov.s32 %r27, %r2;
cvt.s64.s32 %rd9, %r4;
mov.s32 %r28, %r1;
mul.wide.s32 %rd3, %r4, 8;
add.u64 %rd10, %rd3, %rd7;
add.u64 %rd11, %rd3, %rd8;
mov.s32 %r29, 0;
mov.s32 %r30, %r27;
$Lt_2_10754:
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
.loc 17 298 0
mov.f64 %fd24, 0d0000000000000000; // 0
mov.f64 %fd25, 0d0000000000000000; // 0
st.shared.f64 [%rd10+0], %fd25;
.loc 17 299 0
mov.f64 %fd26, 0d0000000000000000; // 0
mov.f64 %fd27, 0d0000000000000000; // 0
st.shared.f64 [%rd11+0], %fd27;
.loc 17 300 0
// Horner evaluation over rho_coeff[l], l = order2 + k stepping down by
// `order` while l >= k; the running values are written back each step.
mov.s32 %r31, %r28;
setp.lt.s32 %p5, %r28, %r29;
@%p5 bra $Lt_2_11010;
cvt.s64.s32 %rd12, %r2;
mul.wide.s32 %rd13, %r2, 8;
cvt.s64.s32 %rd14, %r28;
mul.wide.s32 %rd15, %r28, 8;
add.u64 %rd16, %rd1, %rd15;
$Lt_2_11522:
//<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown
.loc 17 301 0
ld.shared.f64 %fd28, [%rd16+0];
mad.rn.f64 %fd24, %fd24, %fd14, %fd28;
st.shared.f64 [%rd10+0], %fd24;
.loc 17 302 0
mad.rn.f64 %fd26, %fd26, %fd23, %fd28;
st.shared.f64 [%rd11+0], %fd26;
sub.s32 %r31, %r31, %r2;
sub.u64 %rd16, %rd16, %rd13;
setp.ge.s32 %p6, %r31, %r29;
@%p6 bra $Lt_2_11522;
$Lt_2_11010:
add.s32 %r29, %r29, 1;
add.s32 %r28, %r28, 1;
add.u64 %rd11, %rd11, 512;
add.u64 %rd10, %rd10, 512;
setp.ne.s32 %p7, %r28, %r3;
@%p7 bra $Lt_2_10754;
bra.uni $Lt_2_10242;
$Lt_2_16386:
// order <= 0: still define %r25 and the shared base addresses used below
cvt.rzi.s32.f64 %r25, %fd10;
mov.u64 %rd8, __cuda___cuda_local_var_32631_34_non_const_rho1d_110176;
mov.u64 %rd7, __cuda___cuda_local_var_32630_34_non_const_rho1d_06080;
$Lt_2_10242:
.loc 17 306 0
// %fd33 = delzinv * (z - b_lo_z); %r32 = trunc(%fd33)
// %r35 = %r25 + %r32 * npts_yx
ld.param.f64 %fd29, [__cudaparm_interp_delzinv];
cvt.ftz.f64.f32 %fd30, %f7;
ld.param.f64 %fd31, [__cudaparm_interp_b_lo_z];
sub.f64 %fd32, %fd30, %fd31;
mul.f64 %fd33, %fd29, %fd32;
cvt.rzi.s32.f64 %r32, %fd33;
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
mul.lo.s32 %r34, %r32, %r33;
add.s32 %r35, %r25, %r34;
@!%p4 bra $Lt_2_16898;
// %fd37 = %r32 + 0.5 - %fd33 (z offset)
cvt.rn.f64.s32 %fd34, %r32;
mov.f64 %fd35, 0d3fe0000000000000; // 0.5
add.f64 %fd36, %fd34, %fd35;
sub.f64 %fd37, %fd36, %fd33;
mov.s32 %r36, %r2;
// The scaled y coordinate is recomputed here to obtain %r37 = trunc(y);
// %r40 = %r37 * npts_x + %r35 is the starting brick index.
cvt.ftz.f64.f32 %fd38, %f6;
cvt.s64.s32 %rd17, %r4;
ld.param.f64 %fd39, [__cudaparm_interp_delyinv];
ld.param.f64 %fd40, [__cudaparm_interp_b_lo_y];
sub.f64 %fd41, %fd38, %fd40;
mul.f64 %fd42, %fd39, %fd41;
cvt.rzi.s32.f64 %r37, %fd42;
mul.wide.s32 %rd3, %r4, 8;
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
mul.lo.s32 %r39, %r37, %r38;
add.u64 %rd18, %rd3, %rd7;
add.u64 %rd19, %rd3, %rd8;
cvt.s64.s32 %rd20, %r38;
// %rd21 = npts_x * 32: byte step between brick rows (32 bytes per element)
mul.wide.s32 %rd21, %r38, 32;
add.s32 %r40, %r39, %r35;
mov.s32 %r41, %r40;
ld.param.u64 %rd22, [__cudaparm_interp_brick];
mov.s32 %r42, 0;
// %f15 / %f14 / %f13 accumulate the three output components
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
mov.s32 %r43, %r36;
$Lt_2_12802:
//<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown
.loc 17 309 0
// Outer loop, counter %r42 = 0 .. order-1.  %fd43 is the Horner value over
// rho_coeff[l], l = %r42 + order2 stepping down by `order` while l >= %r42,
// evaluated at %fd37.
add.s32 %r44, %r42, %r1;
mov.s32 %r45, %r44;
setp.lt.s32 %p8, %r44, %r42;
@%p8 bra $Lt_2_17154;
cvt.s64.s32 %rd23, %r2;
mul.wide.s32 %rd13, %r2, 8;
cvt.s64.s32 %rd24, %r44;
mul.wide.s32 %rd25, %r44, 8;
add.u64 %rd26, %rd1, %rd25;
mov.f64 %fd43, 0d0000000000000000; // 0
$Lt_2_13570:
//<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown
.loc 17 310 0
ld.shared.f64 %fd44, [%rd26+0];
mad.rn.f64 %fd43, %fd37, %fd43, %fd44;
sub.s32 %r45, %r45, %r2;
sub.u64 %rd26, %rd26, %rd13;
setp.ge.s32 %p9, %r45, %r42;
@%p9 bra $Lt_2_13570;
bra.uni $Lt_2_13058;
$Lt_2_17154:
mov.f64 %fd43, 0d0000000000000000; // 0
$Lt_2_13058:
.loc 17 312 0
// %fd45 = %fd4 * %fd43; %rd29 = byte offset of brick element %r46
mov.s32 %r46, %r41;
mov.s32 %r47, %r2;
mov.s32 %r48, %r46;
mul.f64 %fd45, %fd4, %fd43;
mov.s64 %rd27, %rd19;
cvt.s64.s32 %rd28, %r46;
mul.wide.s32 %rd29, %r46, 32;
mov.s32 %r49, 0;
mov.s32 %r50, %r47;
$Lt_2_14594:
//<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown
// Middle loop, counter %r49 = 0 .. order-1: %fd47 = %fd45 * rho1d_1 row value
mov.s32 %r51, %r2;
mov.s32 %r52, %r48;
add.s32 %r53, %r48, %r2;
mov.s64 %rd30, %rd18;
ld.shared.f64 %fd46, [%rd27+0];
add.u64 %rd31, %rd29, %rd22;
mul.f64 %fd47, %fd45, %fd46;
mov.s32 %r54, %r51;
$Lt_2_15362:
//<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown
.loc 17 316 0
// Inner loop over `order` consecutive brick elements:
// %fd49 = rho1d_0 row value * %fd47
ld.shared.f64 %fd48, [%rd30+0];
mul.f64 %fd49, %fd48, %fd47;
.loc 17 318 0
// %f15 -= %fd49 * brick[+0]
cvt.ftz.f64.f32 %fd50, %f15;
ld.global.v2.f64 {%fd51,%fd52}, [%rd31+0];
mul.f64 %fd53, %fd49, %fd51;
sub.f64 %fd54, %fd50, %fd53;
cvt.rn.ftz.f32.f64 %f15, %fd54;
.loc 17 319 0
// %f14 -= %fd49 * brick[+8]
cvt.ftz.f64.f32 %fd55, %f14;
mul.f64 %fd56, %fd49, %fd52;
sub.f64 %fd57, %fd55, %fd56;
cvt.rn.ftz.f32.f64 %f14, %fd57;
.loc 17 320 0
// %f13 -= %fd49 * brick[+16]
cvt.ftz.f64.f32 %fd58, %f13;
ld.global.f64 %fd59, [%rd31+16];
mul.f64 %fd60, %fd49, %fd59;
sub.f64 %fd61, %fd58, %fd60;
cvt.rn.ftz.f32.f64 %f13, %fd61;
add.s32 %r52, %r52, 1;
add.u64 %rd31, %rd31, 32;
add.u64 %rd30, %rd30, 512;
setp.ne.s32 %p10, %r52, %r53;
@%p10 bra $Lt_2_15362;
// Next brick row: advance by npts_x elements
add.s32 %r49, %r49, 1;
add.s32 %r48, %r48, %r38;
add.u64 %rd29, %rd29, %rd21;
add.u64 %rd27, %rd27, 512;
setp.ne.s32 %p11, %r49, %r2;
@%p11 bra $Lt_2_14594;
// Next outer step: advance the starting index by npts_yx elements
add.s32 %r42, %r42, 1;
add.s32 %r41, %r46, %r33;
setp.ne.s32 %p12, %r42, %r2;
@%p12 bra $Lt_2_12802;
bra.uni $Lt_2_9730;
$Lt_2_16898:
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
bra.uni $Lt_2_9730;
$Lt_2_9986:
mov.f32 %f13, 0f00000000; // 0
mov.f32 %f14, 0f00000000; // 0
mov.f32 %f15, 0f00000000; // 0
$Lt_2_9730:
.loc 17 327 0
// Store the result at ans + %r8 * 16
ld.param.u64 %rd32, [__cudaparm_interp_ans];
cvt.s64.s32 %rd33, %r8;
mul.wide.s32 %rd34, %r8, 16;
add.u64 %rd35, %rd32, %rd34;
// The 4th component is never computed by this kernel.  Store an explicit
// zero instead of the never-written %f17 so the output is deterministic.
mov.f32 %f16, 0f00000000; // 0
st.global.v4.f32 [%rd35+0], {%f15,%f14,%f13,%f16};
$Lt_2_9218:
.loc 17 329 0
exit;
$LDWend_interp:
} // interp

View File

@ -1,881 +0,0 @@
.version 2.3
.target sm_20
.address_size 64
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 4.0 built on 2011-05-12
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_0000bc4a_00000000-9_pppm_gpu_kernel.cpp3.i (/home/sjplimp/ccBI#.A49KLP)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_20, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.file 1 "<command-line>"
.file 2 "/tmp/tmpxft_0000bc4a_00000000-8_pppm_gpu_kernel.cudafe2.gpu"
.file 3 "/usr/lib/gcc/x86_64-redhat-linux/4.4.5/include/stddef.h"
.file 4 "/usr/local/cuda/include/crt/device_runtime.h"
.file 5 "/usr/local/cuda/include/host_defines.h"
.file 6 "/usr/local/cuda/include/builtin_types.h"
.file 7 "/usr/local/cuda/include/device_types.h"
.file 8 "/usr/local/cuda/include/driver_types.h"
.file 9 "/usr/local/cuda/include/surface_types.h"
.file 10 "/usr/local/cuda/include/texture_types.h"
.file 11 "/usr/local/cuda/include/vector_types.h"
.file 12 "/usr/local/cuda/include/device_launch_parameters.h"
.file 13 "/usr/local/cuda/include/crt/storage_class.h"
.file 14 "/usr/include/bits/types.h"
.file 15 "/usr/include/time.h"
.file 16 "/usr/local/cuda/include/sm_11_atomic_functions.h"
.file 17 "pppm_gpu_kernel.cu"
.file 18 "/usr/local/cuda/include/common_functions.h"
.file 19 "/usr/local/cuda/include/math_functions.h"
.file 20 "/usr/local/cuda/include/math_constants.h"
.file 21 "/usr/local/cuda/include/device_functions.h"
.file 22 "/usr/local/cuda/include/sm_12_atomic_functions.h"
.file 23 "/usr/local/cuda/include/sm_13_double_functions.h"
.file 24 "/usr/local/cuda/include/sm_20_atomic_functions.h"
.file 25 "/usr/local/cuda/include/sm_20_intrinsics.h"
.file 26 "/usr/local/cuda/include/surface_functions.h"
.file 27 "/usr/local/cuda/include/texture_fetch_functions.h"
.file 28 "/usr/local/cuda/include/math_functions_dbl_ptx3.h"
// Module-scope texture references.  The kernels in this file read them with
// tex.1d.v4.f32.s32 using a per-thread integer index.  Only the first three
// components of a pos_tex fetch and the first component of a q_tex fetch
// are consumed by particle_map.
.global .texref pos_tex;
.global .texref q_tex;
//-----------------------------------------------------------
// particle_map -- assigns one texture element per thread to a grid cell.
//
// Per thread:
//   1. Derive an index %r12 from the global thread id (see below) and exit
//      when nlocal <= %r12.
//   2. Fetch pos_tex / q_tex at %r12; exit when delvolinv * q == 0.
//   3. Scale each coordinate: s = delinv * (coord - b_lo).  If any s < 0, or
//      trunc(s) is not below the matching nlocal_x/y/z, store 1 to *error
//      and exit.
//   4. cell = ix + (iy + iz * nlocal_y) * nlocal_x; atomically increment
//      counts[cell].  If the previous count is >= max_atoms, store 2 to
//      *error, undo the increment and exit.
//   5. Store {ix+0.5-sx, iy+0.5-sy, iz+0.5-sz, delvolinv*q} as four f32 at
//      ans + (cell + atom_stride * previous_count) * 16.
//
// The x_ and q_ parameters are not read by this body.
//-----------------------------------------------------------
.entry particle_map (
.param .u64 __cudaparm_particle_map_x_,
.param .u64 __cudaparm_particle_map_q_,
.param .f32 __cudaparm_particle_map_delvolinv,
.param .s32 __cudaparm_particle_map_nlocal,
.param .u64 __cudaparm_particle_map_counts,
.param .u64 __cudaparm_particle_map_ans,
.param .f32 __cudaparm_particle_map_b_lo_x,
.param .f32 __cudaparm_particle_map_b_lo_y,
.param .f32 __cudaparm_particle_map_b_lo_z,
.param .f32 __cudaparm_particle_map_delxinv,
.param .f32 __cudaparm_particle_map_delyinv,
.param .f32 __cudaparm_particle_map_delzinv,
.param .s32 __cudaparm_particle_map_nlocal_x,
.param .s32 __cudaparm_particle_map_nlocal_y,
.param .s32 __cudaparm_particle_map_nlocal_z,
.param .s32 __cudaparm_particle_map_atom_stride,
.param .s32 __cudaparm_particle_map_max_atoms,
.param .u64 __cudaparm_particle_map_error)
{
.reg .u32 %r<50>;
.reg .u64 %rd<12>;
.reg .f32 %f<44>;
.reg .pred %p<11>;
.loc 17 113 0
$LDWbegin_particle_map:
// %r7 = global thread id, %r5 = total threads in the grid (nctaid.x * ntid.x)
// %r12 = %r7*64 - ((%r7*64) / %r5) * (%r5 - 1)
// i.e. the thread id is multiplied by 64 and wrapped back into range --
// presumably so neighbouring threads handle indices 64 apart; TODO confirm
// the intent against the .cu source.
mov.u32 %r1, %ntid.x;
mov.u32 %r2, %ctaid.x;
mul.lo.u32 %r3, %r2, %r1;
mov.u32 %r4, %nctaid.x;
mul.lo.u32 %r5, %r4, %r1;
mov.u32 %r6, %tid.x;
add.u32 %r7, %r6, %r3;
sub.s32 %r8, %r5, 1;
mul.lo.s32 %r9, %r7, 64;
div.s32 %r10, %r9, %r5;
mul.lo.s32 %r11, %r8, %r10;
sub.s32 %r12, %r9, %r11;
ld.param.s32 %r13, [__cudaparm_particle_map_nlocal];
setp.le.s32 %p1, %r13, %r12;
@%p1 bra $Lt_0_7426;
.loc 17 125 0
// Fetch element %r12 of pos_tex: %f5, %f6, %f7 = first three components
mov.u32 %r14, %r12;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
mov.s32 %r17, 0;
mov.u32 %r18, %r17;
mov.s32 %r19, 0;
mov.u32 %r20, %r19;
tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[pos_tex,{%r14,%r16,%r18,%r20}];
mov.f32 %f5, %f1;
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
.loc 17 127 0
// Fetch element %r12 of q_tex; %f14 = delvolinv * first component
mov.u32 %r21, %r12;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
mov.s32 %r24, 0;
mov.u32 %r25, %r24;
mov.s32 %r26, 0;
mov.u32 %r27, %r26;
tex.1d.v4.f32.s32 {%f8,%f9,%f10,%f11},[q_tex,{%r21,%r23,%r25,%r27}];
mov.f32 %f12, %f8;
ld.param.f32 %f13, [__cudaparm_particle_map_delvolinv];
mul.ftz.f32 %f14, %f13, %f12;
mov.f32 %f15, 0f00000000; // 0
setp.neu.ftz.f32 %p2, %f14, %f15;
// Scaled charge compares equal to zero: nothing to record for this element
@!%p2 bra $Lt_0_7426;
.loc 17 130 0
// Range checks.  %f19 / %f24 / %f29 are the scaled x / y / z coordinates;
// any negative value, or any truncated index not below its nlocal bound,
// goes to the error path at $Lt_0_8706.
ld.param.f32 %f16, [__cudaparm_particle_map_b_lo_x];
sub.ftz.f32 %f17, %f5, %f16;
ld.param.f32 %f18, [__cudaparm_particle_map_delxinv];
mul.ftz.f32 %f19, %f18, %f17;
mov.f32 %f20, 0f00000000; // 0
setp.lt.ftz.f32 %p3, %f19, %f20;
@%p3 bra $Lt_0_8706;
ld.param.f32 %f21, [__cudaparm_particle_map_b_lo_y];
sub.ftz.f32 %f22, %f6, %f21;
ld.param.f32 %f23, [__cudaparm_particle_map_delyinv];
mul.ftz.f32 %f24, %f23, %f22;
mov.f32 %f25, 0f00000000; // 0
setp.lt.ftz.f32 %p4, %f24, %f25;
@%p4 bra $Lt_0_8706;
ld.param.f32 %f26, [__cudaparm_particle_map_b_lo_z];
sub.ftz.f32 %f27, %f7, %f26;
ld.param.f32 %f28, [__cudaparm_particle_map_delzinv];
mul.ftz.f32 %f29, %f28, %f27;
mov.f32 %f30, 0f00000000; // 0
setp.lt.ftz.f32 %p5, %f29, %f30;
@%p5 bra $Lt_0_8706;
// %r28 / %r30 / %r32 = truncated x / y / z cell indices
cvt.rzi.ftz.s32.f32 %r28, %f19;
ld.param.s32 %r29, [__cudaparm_particle_map_nlocal_x];
setp.ge.s32 %p6, %r28, %r29;
@%p6 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r30, %f24;
ld.param.s32 %r31, [__cudaparm_particle_map_nlocal_y];
setp.ge.s32 %p7, %r30, %r31;
@%p7 bra $Lt_0_8706;
cvt.rzi.ftz.s32.f32 %r32, %f29;
ld.param.s32 %r33, [__cudaparm_particle_map_nlocal_z];
setp.gt.s32 %p8, %r33, %r32;
@%p8 bra $L_0_4866;
$Lt_0_8706:
$L_0_5122:
.loc 17 139 0
// Out-of-range coordinate: *error = 1 (plain store, no atomic)
mov.s32 %r34, 1;
ld.param.u64 %rd1, [__cudaparm_particle_map_error];
st.global.s32 [%rd1+0], %r34;
bra.uni $Lt_0_7426;
$L_0_4866:
.loc 17 146 0
// %r38 = ix + (iy + iz * nlocal_y) * nlocal_x
mul.lo.s32 %r35, %r32, %r31;
add.s32 %r36, %r30, %r35;
mul.lo.s32 %r37, %r36, %r29;
add.s32 %r38, %r28, %r37;
ld.param.u64 %rd2, [__cudaparm_particle_map_counts];
cvt.s64.s32 %rd3, %r38;
mul.wide.s32 %rd4, %r38, 4;
add.u64 %rd5, %rd2, %rd4;
// %r41 = value of counts[%r38] before the atomic increment
mov.s32 %r39, 1;
atom.global.add.s32 %r40, [%rd5], %r39;
mov.s32 %r41, %r40;
ld.param.s32 %r42, [__cudaparm_particle_map_max_atoms];
setp.gt.s32 %p9, %r42, %r41;
@%p9 bra $Lt_0_7682;
.loc 17 148 0
// Cell already held max_atoms entries: *error = 2, then undo the increment
mov.s32 %r43, 2;
ld.param.u64 %rd6, [__cudaparm_particle_map_error];
st.global.s32 [%rd6+0], %r43;
.loc 16 118 0
mov.s32 %r44, -1;
atom.global.add.s32 %r45, [%rd5], %r44;
bra.uni $Lt_0_7426;
$Lt_0_7682:
.loc 17 151 0
// Output slot index %r48 = %r38 + atom_stride * %r41, 16 bytes per slot
ld.param.u64 %rd7, [__cudaparm_particle_map_ans];
ld.param.s32 %r46, [__cudaparm_particle_map_atom_stride];
mul.lo.s32 %r47, %r46, %r41;
add.s32 %r48, %r38, %r47;
cvt.s64.s32 %rd8, %r48;
mul.wide.s32 %rd9, %r48, 16;
add.u64 %rd10, %rd7, %rd9;
// Offsets (index + 0.5 - scaled coordinate) for x, y, z
cvt.rn.f32.s32 %f31, %r28;
mov.f32 %f32, 0f3f000000; // 0.5
add.ftz.f32 %f33, %f31, %f32;
sub.ftz.f32 %f34, %f33, %f19;
cvt.rn.f32.s32 %f35, %r30;
mov.f32 %f36, 0f3f000000; // 0.5
add.ftz.f32 %f37, %f35, %f36;
sub.ftz.f32 %f38, %f37, %f24;
cvt.rn.f32.s32 %f39, %r32;
mov.f32 %f40, 0f3f000000; // 0.5
add.ftz.f32 %f41, %f39, %f40;
sub.ftz.f32 %f42, %f41, %f29;
st.global.v4.f32 [%rd10+0], {%f34,%f38,%f42,%f14};
$Lt_0_7426:
$L_0_4610:
$Lt_0_6914:
$Lt_0_6402:
.loc 17 155 0
exit;
$LDWend_particle_map:
} // particle_map
//-----------------------------------------------------------
// make_rho -- accumulates weighted per-cell values into the brick array.
//
// Thread layout as used by the code: lane = tid.x % 32, w = tid.x / 32.
// (w + 2 * ctaid.x) is split by npts_y into a quotient %r38 and a remainder
// %r48, which select the z and y position this group of 32 lanes works on.
// The x range is covered in steps of 32 by the outer loop ($Lt_1_18434),
// which runs npts_x / 32 + 1 times.
//
// Shared memory:
//   rho_coeff (256 B): f32 copy of _rho_coeff, order + order2 entries loaded
//   front     (320 B): 40 f32 per w -- 32 lanes plus a carry area at +32
//   ans      (2048 B): [k][tid.x] partial sums, 256-byte row stride
// The sizes correspond to 64 f32 slots per row and two groups of 40 in
// `front` -- presumably the launch uses 64 threads per block and
// order <= 8; TODO confirm against the host code.
//
// Per outer iteration: zero this thread's ans column, loop over the z / y
// neighbour ranges and the entries of each neighbouring cell, accumulate
// into ans, then fold ans into front (with barriers) and store front[lane]
// to brick.
//-----------------------------------------------------------
.entry make_rho (
.param .u64 __cudaparm_make_rho_counts,
.param .u64 __cudaparm_make_rho_atoms,
.param .u64 __cudaparm_make_rho_brick,
.param .u64 __cudaparm_make_rho__rho_coeff,
.param .s32 __cudaparm_make_rho_atom_stride,
.param .s32 __cudaparm_make_rho_npts_x,
.param .s32 __cudaparm_make_rho_npts_y,
.param .s32 __cudaparm_make_rho_npts_z,
.param .s32 __cudaparm_make_rho_nlocal_x,
.param .s32 __cudaparm_make_rho_nlocal_y,
.param .s32 __cudaparm_make_rho_nlocal_z,
.param .s32 __cudaparm_make_rho_order_m_1,
.param .s32 __cudaparm_make_rho_order,
.param .s32 __cudaparm_make_rho_order2)
{
.reg .u32 %r<119>;
.reg .u64 %rd<57>;
.reg .f32 %f<26>;
.reg .pred %p<27>;
.shared .align 4 .b8 __cuda___cuda_local_var_32531_33_non_const_rho_coeff168[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32532_33_non_const_front424[320];
.shared .align 4 .b8 __cuda___cuda_local_var_32533_33_non_const_ans744[2048];
.loc 17 164 0
$LDWbegin_make_rho:
// %r1 = order2, %r2 = order, %r3 = order2 + order, %r4 = tid.x
ld.param.s32 %r1, [__cudaparm_make_rho_order2];
ld.param.s32 %r2, [__cudaparm_make_rho_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_1_16898;
.loc 17 171 0
// Stage the coefficient table: shared rho_coeff[tid.x] = _rho_coeff[tid.x]
mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_make_rho__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_1_16898:
mov.u64 %rd1, __cuda___cuda_local_var_32531_33_non_const_rho_coeff168;
// %r11 = tid.x % 32 (signed divide-by-32 sequence); %p2 = (%r11 < order)
shr.s32 %r5, %r4, 31;
mov.s32 %r6, 31;
and.b32 %r7, %r5, %r6;
add.s32 %r8, %r7, %r4;
shr.s32 %r9, %r8, 5;
mul.lo.s32 %r10, %r9, 32;
sub.s32 %r11, %r4, %r10;
setp.lt.s32 %p2, %r11, %r2;
@!%p2 bra $Lt_1_17410;
.loc 17 177 0
// Lanes below `order` clear front[(tid.x/32)*40 + lane + 32]
// (the +128 byte offset is 32 f32 slots)
mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;
mov.f32 %f2, 0f00000000; // 0
cvt.s64.s32 %rd8, %r11;
shr.s32 %r12, %r4, 31;
mov.s32 %r13, 31;
and.b32 %r14, %r12, %r13;
add.s32 %r15, %r14, %r4;
shr.s32 %r16, %r15, 5;
cvt.s64.s32 %rd9, %r16;
mul.wide.s32 %rd10, %r16, 40;
add.u64 %rd11, %rd8, %rd10;
mul.lo.u64 %rd12, %rd11, 4;
add.u64 %rd13, %rd7, %rd12;
st.shared.f32 [%rd13+128], %f2;
$Lt_1_17410:
mov.u64 %rd7, __cuda___cuda_local_var_32532_33_non_const_front424;
.loc 17 179 0
bar.sync 0;
// %r23 = npts_x / 32 + 1 = number of outer iterations; exit if <= 0
ld.param.s32 %r17, [__cudaparm_make_rho_npts_x];
shr.s32 %r18, %r17, 31;
mov.s32 %r19, 31;
and.b32 %r20, %r18, %r19;
add.s32 %r21, %r20, %r17;
shr.s32 %r22, %r21, 5;
add.s32 %r23, %r22, 1;
mov.u32 %r24, 0;
setp.le.s32 %p3, %r23, %r24;
@%p3 bra $Lt_1_17922;
// %r29 = tid.x / 32
shr.s32 %r25, %r4, 31;
mov.s32 %r26, 31;
and.b32 %r27, %r25, %r26;
add.s32 %r28, %r27, %r4;
shr.s32 %r29, %r28, 5;
add.s32 %r30, %r11, 32;
// %r33 = nlocal_y * nlocal_x (cells per z layer of the counts array)
ld.param.s32 %r31, [__cudaparm_make_rho_nlocal_y];
ld.param.s32 %r32, [__cudaparm_make_rho_nlocal_x];
mul.lo.s32 %r33, %r31, %r32;
// %r36 = tid.x / 32 + 2 * ctaid.x; %r38 = %r36 / npts_y (z position)
mov.u32 %r34, %ctaid.x;
mul.lo.u32 %r35, %r34, 2;
add.u32 %r36, %r29, %r35;
ld.param.s32 %r37, [__cudaparm_make_rho_npts_y];
div.s32 %r38, %r36, %r37;
// z loop bounds:
//   %r42 = (z < order_m_1) ? order_m_1 - z : 0
//   %r47 = (z >= nlocal_z) ? nlocal_z - z + order - 1 : order
ld.param.s32 %r39, [__cudaparm_make_rho_order_m_1];
setp.lt.s32 %p4, %r38, %r39;
sub.s32 %r40, %r39, %r38;
mov.s32 %r41, 0;
selp.s32 %r42, %r40, %r41, %p4;
ld.param.s32 %r43, [__cudaparm_make_rho_nlocal_z];
setp.ge.s32 %p5, %r38, %r43;
sub.s32 %r44, %r43, %r38;
add.s32 %r45, %r44, %r2;
sub.s32 %r46, %r45, 1;
selp.s32 %r47, %r46, %r2, %p5;
// %r48 = %r36 % npts_y (y position); y loop bounds %r51 / %r55 are formed
// the same way using nlocal_y
rem.s32 %r48, %r36, %r37;
setp.lt.s32 %p6, %r48, %r39;
sub.s32 %r49, %r39, %r48;
mov.s32 %r50, 0;
selp.s32 %r51, %r49, %r50, %p6;
setp.ge.s32 %p7, %r48, %r31;
sub.s32 %r52, %r31, %r48;
add.s32 %r53, %r52, %r2;
sub.s32 %r54, %r53, 1;
selp.s32 %r55, %r54, %r2, %p7;
mov.s32 %r56, %r23;
// %p8 = (order > 0); %r58 = x base of the current outer iteration,
// stepping by 32 up to %r59 = 32 * %r23
mov.s32 %r57, 0;
setp.gt.s32 %p8, %r2, %r57;
mov.s32 %r58, 0;
cvt.s64.s32 %rd14, %r11;
cvt.s64.s32 %rd15, %r29;
mul.lo.s32 %r59, %r23, 32;
mul.wide.s32 %rd16, %r29, 40;
add.u64 %rd17, %rd14, %rd16;
// %r61 = (npts_z > z) ? 1 : 0; %rd19 = &front[(tid.x/32)*40 + lane]
ld.param.s32 %r60, [__cudaparm_make_rho_npts_z];
setp.gt.s32 %p9, %r60, %r38;
mul.lo.u64 %rd18, %rd17, 4;
selp.s32 %r61, 1, 0, %p9;
add.u64 %rd19, %rd18, %rd7;
mov.u64 %rd20, __cuda___cuda_local_var_32533_33_non_const_ans744;
mov.s32 %r62, %r56;
$Lt_1_18434:
//<loop> Loop body line 179, nesting depth: 1, estimated iterations: unknown
@!%p8 bra $Lt_1_18690;
// Zero this thread's column of ans: rows 0 .. order-1, 256 bytes apart
mov.s32 %r63, %r2;
cvt.s64.s32 %rd21, %r4;
mul.wide.s32 %rd22, %r4, 4;
add.u64 %rd23, %rd20, %rd22;
mov.s32 %r64, 0;
mov.s32 %r65, %r63;
$Lt_1_19202:
//<loop> Loop body line 179, nesting depth: 2, estimated iterations: unknown
.loc 17 203 0
mov.f32 %f3, 0f00000000; // 0
st.shared.f32 [%rd23+0], %f3;
add.s32 %r64, %r64, 1;
add.u64 %rd23, %rd23, 256;
setp.ne.s32 %p10, %r64, %r2;
@%p10 bra $Lt_1_19202;
$Lt_1_18690:
// %r66 = lane + %r58 (x position).  The accumulation below is skipped
// unless %r61 is set and the signed compare %r66 < nlocal_x holds.
add.s32 %r66, %r11, %r58;
set.lt.u32.s32 %r67, %r66, %r32;
neg.s32 %r68, %r67;
and.b32 %r69, %r61, %r68;
mov.u32 %r70, 0;
setp.eq.s32 %p11, %r69, %r70;
@%p11 bra $Lt_1_20226;
.loc 17 206 0
// z neighbour loop: %r71 runs from %r42 up to (not including) %r47
mov.s32 %r71, %r42;
setp.ge.s32 %p12, %r42, %r47;
@%p12 bra $Lt_1_20226;
sub.s32 %r72, %r47, %r42;
setp.lt.s32 %p13, %r51, %r55;
mov.s32 %r73, %r72;
$Lt_1_20738:
//<loop> Loop body line 206, nesting depth: 2, estimated iterations: unknown
.loc 17 208 0
// y neighbour loop: %r74 runs from %r51 up to (not including) %r55
mov.s32 %r74, %r51;
@!%p13 bra $Lt_1_20994;
sub.s32 %r75, %r55, %r51;
// %r81 = %r71 + z - order_m_1; %r82 = %r81 * nlocal_y * nlocal_x
// %r80 = y + %r51 - order_m_1 (the y term is completed inside the loop)
sub.s32 %r76, %r71, %r42;
add.s32 %r77, %r38, %r42;
add.s32 %r78, %r48, %r51;
sub.s32 %r79, %r77, %r39;
sub.s32 %r80, %r78, %r39;
add.s32 %r81, %r76, %r79;
mul.lo.s32 %r82, %r33, %r81;
ld.param.s32 %r83, [__cudaparm_make_rho_atom_stride];
ld.param.u64 %rd24, [__cudaparm_make_rho_counts];
mov.s32 %r84, %r75;
$Lt_1_21506:
//<loop> Loop body line 208, nesting depth: 3, estimated iterations: unknown
.loc 17 210 0
// %r89 = x + (%r74 + y - order_m_1) * nlocal_x + %r82 = neighbour cell index
// %r91 = counts[%r89] * atom_stride = exclusive upper bound for %r92 below
sub.s32 %r85, %r74, %r51;
add.s32 %r86, %r85, %r80;
mul.lo.s32 %r87, %r86, %r32;
add.s32 %r88, %r82, %r87;
add.s32 %r89, %r66, %r88;
cvt.s64.s32 %rd25, %r89;
mul.wide.s32 %rd26, %r89, 4;
add.u64 %rd27, %rd24, %rd26;
ld.global.s32 %r90, [%rd27+0];
mul.lo.s32 %r91, %r90, %r83;
.loc 17 211 0
// Entry loop: %r92 starts at %r89 and advances by atom_stride while
// %r92 < %r91; %rd32 tracks atoms + %r92 * 16 (16 bytes per entry)
mov.s32 %r92, %r89;
setp.ge.s32 %p14, %r89, %r91;
@%p14 bra $Lt_1_21762;
sub.s32 %r93, %r3, 1;
cvt.s64.s32 %rd28, %r83;
mul.wide.s32 %rd29, %r83, 16;
mov.s32 %r94, -1;
setp.gt.s32 %p15, %r93, %r94;
ld.param.u64 %rd30, [__cudaparm_make_rho_atoms];
mul.lo.u64 %rd31, %rd25, 16;
add.u64 %rd32, %rd30, %rd31;
$Lt_1_22274:
//<loop> Loop body line 211, nesting depth: 4, estimated iterations: unknown
.loc 17 212 0
// Entry layout as read here: f32 at +0, +4, +8, +12 -> %f4, %f5, %f6, %f11
ld.global.f32 %f4, [%rd32+0];
@!%p15 bra $Lt_1_29954;
// Horner evaluation of two weights in lock-step:
//   %f8 over rho_coeff[l], l = (order+order2-1) - %r74 stepping down by order
//   %f7 over rho_coeff[l], l = (order+order2-1) - %r71 stepping down by order
// The loop continues while the %f8 index stays above (-1 - %r74).
sub.s32 %r95, %r93, %r74;
mov.s32 %r96, -1;
sub.s32 %r97, %r96, %r74;
cvt.s64.s32 %rd33, %r2;
mul.wide.s32 %rd34, %r2, 4;
ld.global.f32 %f5, [%rd32+4];
ld.global.f32 %f6, [%rd32+8];
cvt.s64.s32 %rd35, %r95;
mul.wide.s32 %rd36, %r95, 4;
add.u64 %rd37, %rd1, %rd36;
sub.s32 %r98, %r93, %r71;
cvt.s64.s32 %rd38, %r98;
mul.wide.s32 %rd39, %r98, 4;
add.u64 %rd40, %rd1, %rd39;
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_23042:
//<loop> Loop body line 212, nesting depth: 5, estimated iterations: unknown
.loc 17 217 0
ld.shared.f32 %f9, [%rd37+0];
fma.rn.ftz.f32 %f8, %f8, %f5, %f9;
.loc 17 218 0
ld.shared.f32 %f10, [%rd40+0];
fma.rn.ftz.f32 %f7, %f7, %f6, %f10;
sub.u64 %rd40, %rd40, %rd34;
sub.s32 %r95, %r95, %r2;
sub.u64 %rd37, %rd37, %rd34;
setp.gt.s32 %p16, %r95, %r97;
@%p16 bra $Lt_1_23042;
bra.uni $Lt_1_22530;
$Lt_1_29954:
mov.f32 %f7, 0f00000000; // 0
mov.f32 %f8, 0f00000000; // 0
$Lt_1_22530:
.loc 17 220 0
// %f13 = entry[+12] * %f7 * %f8
ld.global.f32 %f11, [%rd32+12];
mul.ftz.f32 %f12, %f7, %f8;
mul.ftz.f32 %f13, %f11, %f12;
@!%p8 bra $Lt_1_23554;
mov.s32 %r99, %r2;
cvt.s64.s32 %rd41, %r4;
mul.wide.s32 %rd42, %r4, 4;
add.u64 %rd43, %rd20, %rd42;
mov.s32 %r100, 0;
mov.s32 %r101, %r99;
$Lt_1_24066:
//<loop> Loop body line 220, nesting depth: 5, estimated iterations: unknown
.loc 17 224 0
// For k = %r100 in 0 .. order-1: %f14 = Horner value over rho_coeff[l],
// l = k + order2 stepping down by order while l >= k, evaluated at %f4
add.s32 %r102, %r100, %r1;
mov.s32 %r103, %r102;
setp.lt.s32 %p17, %r102, %r100;
@%p17 bra $Lt_1_30466;
cvt.s64.s32 %rd44, %r2;
mul.wide.s32 %rd34, %r2, 4;
cvt.s64.s32 %rd45, %r102;
mul.wide.s32 %rd46, %r102, 4;
add.u64 %rd47, %rd1, %rd46;
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24834:
//<loop> Loop body line 224, nesting depth: 6, estimated iterations: unknown
.loc 17 225 0
ld.shared.f32 %f15, [%rd47+0];
fma.rn.ftz.f32 %f14, %f4, %f14, %f15;
sub.s32 %r103, %r103, %r2;
sub.u64 %rd47, %rd47, %rd34;
setp.ge.s32 %p18, %r103, %r100;
@%p18 bra $Lt_1_24834;
bra.uni $Lt_1_24322;
$Lt_1_30466:
mov.f32 %f14, 0f00000000; // 0
$Lt_1_24322:
.loc 17 226 0
// ans[k][tid.x] += %f14 * %f13
ld.shared.f32 %f16, [%rd43+0];
fma.rn.ftz.f32 %f17, %f14, %f13, %f16;
st.shared.f32 [%rd43+0], %f17;
add.s32 %r100, %r100, 1;
add.u64 %rd43, %rd43, 256;
setp.ne.s32 %p19, %r100, %r2;
@%p19 bra $Lt_1_24066;
$Lt_1_23554:
add.s32 %r92, %r92, %r83;
add.u64 %rd32, %rd29, %rd32;
setp.gt.s32 %p20, %r91, %r92;
@%p20 bra $Lt_1_22274;
$Lt_1_21762:
add.s32 %r74, %r74, 1;
setp.ne.s32 %p21, %r55, %r74;
@%p21 bra $Lt_1_21506;
$Lt_1_20994:
add.s32 %r71, %r71, 1;
setp.ne.s32 %p22, %r47, %r71;
@%p22 bra $Lt_1_20738;
$Lt_1_20226:
$Lt_1_19714:
.loc 17 235 0
// Every thread reaches this barrier, including those that skipped the
// accumulation above.
bar.sync 0;
@!%p2 bra $Lt_1_26626;
.loc 17 237 0
// Lanes below `order`: front[lane] = front[lane + 32]; front[lane + 32] = 0
ld.shared.f32 %f18, [%rd19+128];
st.shared.f32 [%rd19+0], %f18;
.loc 17 238 0
mov.f32 %f19, 0f00000000; // 0
st.shared.f32 [%rd19+128], %f19;
bra.uni $Lt_1_26370;
$Lt_1_26626:
.loc 17 240 0
// Remaining lanes: front[lane] = 0
mov.f32 %f20, 0f00000000; // 0
st.shared.f32 [%rd19+0], %f20;
$Lt_1_26370:
@!%p8 bra $Lt_1_26882;
// Fold ans into front: on step k (0 .. order-1) each thread adds
// ans[k][tid.x] into front[lane + k], with a barrier after every step.
mov.s32 %r104, %r2;
cvt.s64.s32 %rd48, %r4;
mov.s32 %r105, %r11;
add.s32 %r106, %r11, %r2;
mul.wide.s32 %rd49, %r4, 4;
add.u64 %rd50, %rd20, %rd49;
mov.s64 %rd51, %rd19;
mov.s32 %r107, %r104;
$Lt_1_27394:
//<loop> Loop body line 240, nesting depth: 2, estimated iterations: unknown
.loc 17 243 0
ld.shared.f32 %f21, [%rd50+0];
ld.shared.f32 %f22, [%rd51+0];
add.ftz.f32 %f23, %f21, %f22;
st.shared.f32 [%rd51+0], %f23;
.loc 17 244 0
bar.sync 0;
add.s32 %r105, %r105, 1;
add.u64 %rd51, %rd51, 4;
add.u64 %rd50, %rd50, 256;
setp.ne.s32 %p23, %r105, %r106;
@%p23 bra $Lt_1_27394;
$Lt_1_26882:
// The store is skipped unless %r61 is set and the signed compare
// %r66 < npts_x holds
set.lt.u32.s32 %r108, %r66, %r17;
neg.s32 %r109, %r108;
and.b32 %r110, %r61, %r109;
mov.u32 %r111, 0;
setp.eq.s32 %p24, %r110, %r111;
@%p24 bra $Lt_1_27906;
.loc 17 248 0
// brick[x + z * npts_y * npts_x + y * npts_x] = front[lane]
ld.shared.f32 %f24, [%rd19+0];
ld.param.u64 %rd52, [__cudaparm_make_rho_brick];
add.s32 %r112, %r11, %r58;
mul.lo.s32 %r113, %r37, %r17;
mul.lo.s32 %r114, %r38, %r113;
mul.lo.s32 %r115, %r48, %r17;
add.s32 %r116, %r114, %r115;
add.s32 %r117, %r112, %r116;
cvt.s64.s32 %rd53, %r117;
mul.wide.s32 %rd54, %r117, 4;
add.u64 %rd55, %rd52, %rd54;
st.global.f32 [%rd55+0], %f24;
$Lt_1_27906:
// Advance x by 32 and repeat until %r58 == 32 * (npts_x / 32 + 1)
add.s32 %r58, %r58, 32;
setp.ne.s32 %p25, %r58, %r59;
@%p25 bra $Lt_1_18434;
$Lt_1_17922:
.loc 17 252 0
exit;
$LDWend_make_rho:
} // make_rho
.entry interp (
.param .u64 __cudaparm_interp_x_,
.param .u64 __cudaparm_interp_q_,
.param .s32 __cudaparm_interp_nlocal,
.param .u64 __cudaparm_interp_brick,
.param .u64 __cudaparm_interp__rho_coeff,
.param .s32 __cudaparm_interp_npts_x,
.param .s32 __cudaparm_interp_npts_yx,
.param .f32 __cudaparm_interp_b_lo_x,
.param .f32 __cudaparm_interp_b_lo_y,
.param .f32 __cudaparm_interp_b_lo_z,
.param .f32 __cudaparm_interp_delxinv,
.param .f32 __cudaparm_interp_delyinv,
.param .f32 __cudaparm_interp_delzinv,
.param .s32 __cudaparm_interp_order,
.param .s32 __cudaparm_interp_order2,
.param .f32 __cudaparm_interp_qqrd2e_scale,
.param .u64 __cudaparm_interp_ans)
{
// Body of kernel "interp" (its .entry header and leading parameters precede this block).
// Outline of what the instructions below do, per thread:
//   1. Threads with tid.x < order + order2 each copy one f32 from the global
//      _rho_coeff array into the shared rho_coeff buffer; then the whole CTA barriers.
//   2. i = ctaid.x * ntid.x + tid.x; threads with i >= nlocal exit.
//   3. Fetch pos_tex[i] (x, y, z) and q_tex[i]; scale the latter by qqrd2e_scale.
//      If the scaled value is exactly zero, skip straight to storing zeros.
//   4. Evaluate "order" polynomials (Horner form, coefficients from shared rho_coeff)
//      in the x and y fractional offsets and keep them in this thread's column of
//      the shared rho1d_0 / rho1d_1 buffers.
//   5. Triple loop (order x order x order): evaluate the matching polynomial in z,
//      read a 4 x f32 entry of brick, and accumulate three weighted sums (subtracted).
//   6. Store the three sums plus a fourth register to ans[i] (16-byte entries).
.reg .u32 %r<56>;
.reg .u64 %rd<37>;
.reg .f32 %f<69>;
.reg .pred %p<14>;
// Shared buffers: rho_coeff is 256 bytes (64 f32). rho1d_0 and rho1d_1 are 2048 bytes
// each and are walked below with a 256-byte row stride, i.e. 8 rows of 64 f32.
// NOTE(review): no bounds checks are visible here; presumably order <= 8,
// order + order2 <= 64 and ntid.x <= 64 are guaranteed by the launcher - confirm.
.shared .align 4 .b8 __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888[256];
.shared .align 4 .b8 __cuda___cuda_local_var_32630_33_non_const_rho1d_03144[2048];
.shared .align 4 .b8 __cuda___cuda_local_var_32631_33_non_const_rho1d_15192[2048];
// __cuda_local_var_32647_12_non_const_ek = 16
.loc 17 262 0
$LDWbegin_interp:
// %r1 = order2, %r2 = order, %r3 = order + order2, %r4 = tid.x.
ld.param.s32 %r1, [__cudaparm_interp_order2];
ld.param.s32 %r2, [__cudaparm_interp_order];
add.s32 %r3, %r1, %r2;
cvt.s32.u32 %r4, %tid.x;
// Only threads with tid.x < order + order2 take part in the coefficient copy.
setp.le.s32 %p1, %r3, %r4;
@%p1 bra $Lt_2_8706;
.loc 17 269 0
// shared rho_coeff[tid.x] = _rho_coeff[tid.x] (4-byte elements).
mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;
cvt.s64.s32 %rd2, %r4;
mul.wide.s32 %rd3, %r4, 4;
ld.param.u64 %rd4, [__cudaparm_interp__rho_coeff];
add.u64 %rd5, %rd4, %rd3;
ld.global.f32 %f1, [%rd5+0];
add.u64 %rd6, %rd3, %rd1;
st.shared.f32 [%rd6+0], %f1;
$Lt_2_8706:
// %rd1 is (re)loaded here so threads that skipped the copy also hold the buffer address.
mov.u64 %rd1, __cuda___cuda_local_var_32629_33_non_const_rho_coeff2888;
.loc 17 270 0
// Barrier is executed by every thread before any of them can take the early exit below.
bar.sync 0;
// %r8 = ctaid.x * ntid.x + tid.x (global thread index i).
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ntid.x;
mul.lo.u32 %r7, %r5, %r6;
add.u32 %r8, %r4, %r7;
ld.param.s32 %r9, [__cudaparm_interp_nlocal];
// Threads with i >= nlocal go straight to exit without writing ans.
setp.le.s32 %p2, %r9, %r8;
@%p2 bra $Lt_2_9218;
.loc 17 278 0
// Fetch pos_tex at index i (remaining coordinates are zero); keep the first
// three components as %f6, %f7, %f8 (paired below with the x, y, z parameters).
mov.u32 %r10, %r8;
mov.s32 %r11, 0;
mov.u32 %r12, %r11;
mov.s32 %r13, 0;
mov.u32 %r14, %r13;
mov.s32 %r15, 0;
mov.u32 %r16, %r15;
tex.1d.v4.f32.s32 {%f2,%f3,%f4,%f5},[pos_tex,{%r10,%r12,%r14,%r16}];
mov.f32 %f6, %f2;
mov.f32 %f7, %f3;
mov.f32 %f8, %f4;
.loc 17 279 0
// Fetch q_tex at index i; only the first component is used.
mov.u32 %r17, %r8;
mov.s32 %r18, 0;
mov.u32 %r19, %r18;
mov.s32 %r20, 0;
mov.u32 %r21, %r20;
mov.s32 %r22, 0;
mov.u32 %r23, %r22;
tex.1d.v4.f32.s32 {%f9,%f10,%f11,%f12},[q_tex,{%r17,%r19,%r21,%r23}];
mov.f32 %f13, %f9;
// %f15 = qqrd2e_scale * q_tex[i].x; this factor multiplies every contribution below.
ld.param.f32 %f14, [__cudaparm_interp_qqrd2e_scale];
mul.ftz.f32 %f15, %f14, %f13;
mov.f32 %f16, 0f00000000; // 0
// %p3 is true when %f15 != 0 (or is NaN); when it is exactly zero, jump to the zero-result path.
setp.neu.ftz.f32 %p3, %f15, %f16;
@!%p3 bra $Lt_2_9986;
mov.s32 %r24, 0;
// %p4 = (order > 0); reused further down to guard the accumulation loops.
setp.gt.s32 %p4, %r2, %r24;
// %f20 = (x - b_lo_x) * delxinv.
ld.param.f32 %f17, [__cudaparm_interp_b_lo_x];
sub.ftz.f32 %f18, %f6, %f17;
ld.param.f32 %f19, [__cudaparm_interp_delxinv];
mul.ftz.f32 %f20, %f19, %f18;
@!%p4 bra $Lt_2_16386;
mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;
mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;
// %r25 = trunc(%f20) (round toward zero); %f24 = (float(%r25) + 0.5) - %f20.
cvt.rzi.ftz.s32.f32 %r25, %f20;
cvt.rn.f32.s32 %f21, %r25;
mov.f32 %f22, 0f3f000000; // 0.5
add.ftz.f32 %f23, %f21, %f22;
sub.ftz.f32 %f24, %f23, %f20;
// Same for y: %f28 = (y - b_lo_y) * delyinv, %r26 = trunc(%f28),
// %f32 = (float(%r26) + 0.5) - %f28.
ld.param.f32 %f25, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f26, %f7, %f25;
ld.param.f32 %f27, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f28, %f27, %f26;
cvt.rzi.ftz.s32.f32 %r26, %f28;
cvt.rn.f32.s32 %f29, %r26;
mov.f32 %f30, 0f3f000000; // 0.5
add.ftz.f32 %f31, %f29, %f30;
sub.ftz.f32 %f32, %f31, %f28;
mov.s32 %r27, %r2;
cvt.s64.s32 %rd9, %r4;
// Loop setup: %r28 = starting coefficient index (order2), %r29 = row counter k = 0,
// %rd10 / %rd11 = this thread's slot (tid.x * 4) in row 0 of rho1d_0 / rho1d_1.
mov.s32 %r28, %r1;
mul.wide.s32 %rd3, %r4, 4;
add.u64 %rd10, %rd3, %rd7;
add.u64 %rd11, %rd3, %rd8;
mov.s32 %r29, 0;
mov.s32 %r30, %r27;
$Lt_2_10754:
//<loop> Loop body line 279, nesting depth: 1, estimated iterations: unknown
// Row loop over k: runs until %r28 reaches order + order2, i.e. "order" iterations.
.loc 17 298 0
// Zero the x accumulator (%f33) and this thread's rho1d_0 slot for row k.
mov.f32 %f33, 0f00000000; // 0
mov.f32 %f34, 0f00000000; // 0
st.shared.f32 [%rd10+0], %f34;
.loc 17 299 0
// Zero the y accumulator (%f35) and this thread's rho1d_1 slot for row k.
mov.f32 %f35, 0f00000000; // 0
mov.f32 %f36, 0f00000000; // 0
st.shared.f32 [%rd11+0], %f36;
.loc 17 300 0
// %r31 = coefficient index l, starting at order2 + k; skip the inner loop if l < k already.
mov.s32 %r31, %r28;
setp.lt.s32 %p5, %r28, %r29;
@%p5 bra $Lt_2_11010;
cvt.s64.s32 %rd12, %r2;
// %rd13 = order * 4: byte step between successive coefficients of one polynomial.
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd14, %r28;
mul.wide.s32 %rd15, %r28, 4;
// %rd16 = address of shared rho_coeff[l].
add.u64 %rd16, %rd1, %rd15;
$Lt_2_11522:
//<loop> Loop body line 300, nesting depth: 2, estimated iterations: unknown
// Horner step for both axes with the same coefficient; l decreases by "order"
// each pass and the loop continues while l >= k.
.loc 17 301 0
ld.shared.f32 %f37, [%rd16+0];
// %f33 = %f33 * %f24 + coeff; result is written back to shared on every step.
fma.rn.ftz.f32 %f33, %f33, %f24, %f37;
st.shared.f32 [%rd10+0], %f33;
.loc 17 302 0
// %f35 = %f35 * %f32 + coeff.
fma.rn.ftz.f32 %f35, %f35, %f32, %f37;
st.shared.f32 [%rd11+0], %f35;
sub.s32 %r31, %r31, %r2;
sub.u64 %rd16, %rd16, %rd13;
setp.ge.s32 %p6, %r31, %r29;
@%p6 bra $Lt_2_11522;
$Lt_2_11010:
// Next row: k++, starting coefficient index++, both shared pointers advance 256 bytes.
add.s32 %r29, %r29, 1;
add.s32 %r28, %r28, 1;
add.u64 %rd11, %rd11, 256;
add.u64 %rd10, %rd10, 256;
setp.ne.s32 %p7, %r28, %r3;
@%p7 bra $Lt_2_10754;
bra.uni $Lt_2_10242;
$Lt_2_16386:
// order <= 0 path: no rows are computed, but %r25, %rd7 and %rd8 are still
// defined because the code after the join point refers to them.
cvt.rzi.ftz.s32.f32 %r25, %f20;
mov.u64 %rd8, __cuda___cuda_local_var_32631_33_non_const_rho1d_15192;
mov.u64 %rd7, __cuda___cuda_local_var_32630_33_non_const_rho1d_03144;
$Lt_2_10242:
.loc 17 306 0
// %f41 = (z - b_lo_z) * delzinv, %r32 = trunc(%f41).
ld.param.f32 %f38, [__cudaparm_interp_b_lo_z];
sub.ftz.f32 %f39, %f8, %f38;
ld.param.f32 %f40, [__cudaparm_interp_delzinv];
mul.ftz.f32 %f41, %f40, %f39;
cvt.rzi.ftz.s32.f32 %r32, %f41;
// %r35 = trunc_x + trunc_z * npts_yx (partial flat index into brick).
ld.param.s32 %r33, [__cudaparm_interp_npts_yx];
mul.lo.s32 %r34, %r32, %r33;
add.s32 %r35, %r25, %r34;
// With order <= 0 there is nothing to accumulate: take the zero-result path.
@!%p4 bra $Lt_2_16898;
// %f45 = (float(%r32) + 0.5) - %f41: z offset fed to the polynomial below.
cvt.rn.f32.s32 %f42, %r32;
mov.f32 %f43, 0f3f000000; // 0.5
add.ftz.f32 %f44, %f42, %f43;
sub.ftz.f32 %f45, %f44, %f41;
mov.s32 %r36, %r2;
// The truncated y grid coordinate is recomputed here (%r37).
ld.param.f32 %f46, [__cudaparm_interp_b_lo_y];
sub.ftz.f32 %f47, %f7, %f46;
cvt.s64.s32 %rd17, %r4;
ld.param.f32 %f48, [__cudaparm_interp_delyinv];
mul.ftz.f32 %f49, %f48, %f47;
cvt.rzi.ftz.s32.f32 %r37, %f49;
ld.param.s32 %r38, [__cudaparm_interp_npts_x];
mul.lo.s32 %r39, %r37, %r38;
mul.wide.s32 %rd3, %r4, 4;
// %r40 = trunc_x + trunc_y * npts_x + trunc_z * npts_yx: flat index of the first brick entry.
add.s32 %r40, %r39, %r35;
// %rd18 / %rd19 = this thread's slot in row 0 of rho1d_0 / rho1d_1 (same slots written above).
add.u64 %rd18, %rd3, %rd7;
add.u64 %rd19, %rd3, %rd8;
cvt.s64.s32 %rd20, %r38;
// %rd21 = npts_x * 16: byte stride between consecutive y rows of brick (16-byte entries).
mul.wide.s32 %rd21, %r38, 16;
mov.s32 %r41, %r40;
ld.param.u64 %rd22, [__cudaparm_interp_brick];
// %r42 = outer counter n; %f52 / %f51 / %f50 accumulate the three output components.
mov.s32 %r42, 0;
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
mov.s32 %r43, %r36;
$Lt_2_12802:
//<loop> Loop body line 306, nesting depth: 1, estimated iterations: unknown
// Outer loop over n = 0 .. order-1.
.loc 17 309 0
// %r45 = coefficient index l, starting at n + order2; skip the evaluation if l < n.
add.s32 %r44, %r42, %r1;
mov.s32 %r45, %r44;
setp.lt.s32 %p8, %r44, %r42;
@%p8 bra $Lt_2_17154;
cvt.s64.s32 %rd23, %r2;
mul.wide.s32 %rd13, %r2, 4;
cvt.s64.s32 %rd24, %r44;
mul.wide.s32 %rd25, %r44, 4;
add.u64 %rd26, %rd1, %rd25;
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13570:
//<loop> Loop body line 309, nesting depth: 2, estimated iterations: unknown
// Horner evaluation in the z offset: %f53 = %f45 * %f53 + rho_coeff[l];
// l decreases by "order" each pass while l >= n. The result stays in a register.
.loc 17 310 0
ld.shared.f32 %f54, [%rd26+0];
fma.rn.ftz.f32 %f53, %f45, %f53, %f54;
sub.s32 %r45, %r45, %r2;
sub.u64 %rd26, %rd26, %rd13;
setp.ge.s32 %p9, %r45, %r42;
@%p9 bra $Lt_2_13570;
bra.uni $Lt_2_13058;
$Lt_2_17154:
mov.f32 %f53, 0f00000000; // 0
$Lt_2_13058:
.loc 17 312 0
// %r46 = flat brick index at the start of this n slab; %f55 = %f15 * z polynomial value.
mov.s32 %r46, %r41;
mov.s32 %r47, %r2;
mul.ftz.f32 %f55, %f15, %f53;
mov.s32 %r48, %r46;
// %rd27 walks this thread's rho1d_1 slots; %rd29 = byte offset of the current brick row.
mov.s64 %rd27, %rd19;
cvt.s64.s32 %rd28, %r46;
mul.wide.s32 %rd29, %r46, 16;
mov.s32 %r49, 0;
mov.s32 %r50, %r47;
$Lt_2_14594:
//<loop> Loop body line 312, nesting depth: 2, estimated iterations: unknown
// Middle loop over m = 0 .. order-1 (counter %r49).
mov.s32 %r51, %r2;
// %r52 runs from the row's first index to %r53 = first index + order.
mov.s32 %r52, %r48;
add.s32 %r53, %r48, %r2;
// Restart the rho1d_0 walk at row 0 for every m.
mov.s64 %rd30, %rd18;
ld.shared.f32 %f56, [%rd27+0];
// %rd31 = address of the first brick entry of this row; %f57 = %f55 * rho1d_1[m].
add.u64 %rd31, %rd29, %rd22;
mul.ftz.f32 %f57, %f55, %f56;
mov.s32 %r54, %r51;
$Lt_2_15362:
//<loop> Loop body line 312, nesting depth: 3, estimated iterations: unknown
// Inner loop over "order" consecutive brick entries.
.loc 17 316 0
// %f59 = rho1d_0[l] * %f57: combined weight for this brick entry.
ld.shared.f32 %f58, [%rd30+0];
mul.ftz.f32 %f59, %f58, %f57;
// Load one 16-byte brick entry; the fourth component is discarded.
ld.global.v4.f32 {%f60,%f61,%f62,_}, [%rd31+0];
.loc 17 318 0
// Each accumulator is decremented by weight * component.
mul.ftz.f32 %f63, %f59, %f60;
sub.ftz.f32 %f52, %f52, %f63;
.loc 17 319 0
mul.ftz.f32 %f64, %f59, %f61;
sub.ftz.f32 %f51, %f51, %f64;
.loc 17 320 0
mul.ftz.f32 %f65, %f59, %f62;
sub.ftz.f32 %f50, %f50, %f65;
// Advance to the next brick entry (+16 bytes) and the next rho1d_0 row (+256 bytes).
add.s32 %r52, %r52, 1;
add.u64 %rd31, %rd31, 16;
add.u64 %rd30, %rd30, 256;
setp.ne.s32 %p10, %r52, %r53;
@%p10 bra $Lt_2_15362;
// Next m: step one brick row (npts_x entries) and one rho1d_1 row (+256 bytes).
add.s32 %r49, %r49, 1;
add.s32 %r48, %r48, %r38;
add.u64 %rd29, %rd29, %rd21;
add.u64 %rd27, %rd27, 256;
setp.ne.s32 %p11, %r49, %r2;
@%p11 bra $Lt_2_14594;
// Next n: step the slab base index by npts_yx.
add.s32 %r42, %r42, 1;
add.s32 %r41, %r46, %r33;
setp.ne.s32 %p12, %r42, %r2;
@%p12 bra $Lt_2_12802;
bra.uni $Lt_2_9730;
$Lt_2_16898:
// order <= 0: result is all zeros.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
bra.uni $Lt_2_9730;
$Lt_2_9986:
// Scaled q value was exactly zero: result is all zeros.
mov.f32 %f50, 0f00000000; // 0
mov.f32 %f51, 0f00000000; // 0
mov.f32 %f52, 0f00000000; // 0
$Lt_2_9730:
.loc 17 327 0
// Store the result to ans[i]; entries are 16 bytes apart.
ld.param.u64 %rd32, [__cudaparm_interp_ans];
cvt.s64.s32 %rd33, %r8;
mul.wide.s32 %rd34, %r8, 16;
add.u64 %rd35, %rd32, %rd34;
// NOTE(review): %f67 is never written anywhere in this body, so the fourth stored
// component is unspecified. Presumably the original source leaves that field
// unset - confirm that consumers of ans ignore it.
mov.f32 %f66, %f67;
st.global.v4.f32 [%rd35+0], {%f52,%f51,%f50,%f66};
$Lt_2_9218:
.loc 17 329 0
exit;
$LDWend_interp:
} // interp