forked from lijiext/lammps
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6069 f3b2605a-c512-4ea7-a41b-209d697bcdaa
This commit is contained in:
parent
b8f1ff821f
commit
bf6bb59386
|
@ -0,0 +1,627 @@
|
|||
const char * cmm_cut_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<33>;\n"
|
||||
" .reg .u64 %rd<36>;\n"
|
||||
" .reg .f32 %f<95>;\n"
|
||||
" .reg .pred %p<10>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj84[16];\n"
|
||||
" .loc 14 87 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 91 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+0], %f1;\n"
|
||||
" .loc 14 92 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+4], %f2;\n"
|
||||
" .loc 14 93 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+8], %f3;\n"
|
||||
" .loc 14 94 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+12], %f4;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_9474;\n"
|
||||
" .loc 14 105 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" .loc 14 108 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 110 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 111 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 114 0\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mul24.lo.s32 %r14, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r14;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r16, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_14594;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r17;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r18;\n"
|
||||
" cvt.rzi.s32.f32 %r19, %f24;\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r21, %r20, %r19;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj84;\n"
|
||||
"$Lt_0_10498:\n"
|
||||
" .loc 14 120 0\n"
|
||||
" ld.global.s32 %r22, [%rd10+0];\n"
|
||||
" .loc 14 121 0\n"
|
||||
" shr.s32 %r23, %r22, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r23;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" and.b32 %r24, %r22, 1073741823;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
|
||||
" .loc 14 124 0\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.s32.f32 %r28, %f37;\n"
|
||||
" sub.f32 %f38, %f22, %f35;\n"
|
||||
" sub.f32 %f39, %f21, %f34;\n"
|
||||
" sub.f32 %f40, %f23, %f36;\n"
|
||||
" mul.f32 %f41, %f38, %f38;\n"
|
||||
" mad.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" mad.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r29, %r28, %r21;\n"
|
||||
" cvt.u64.s32 %rd20, %r29;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd21, %rd14;\n"
|
||||
" ld.global.f32 %f44, [%rd22+0];\n"
|
||||
" setp.gt.f32 %p5, %f44, %f43;\n"
|
||||
" @!%p5 bra $Lt_0_12802;\n"
|
||||
" rcp.approx.f32 %f45, %f43;\n"
|
||||
" ld.global.f32 %f46, [%rd22+4];\n"
|
||||
" mov.f32 %f47, 0f40000000; \n"
|
||||
" setp.eq.f32 %p6, %f46, %f47;\n"
|
||||
" @!%p6 bra $Lt_0_11522;\n"
|
||||
" .loc 14 139 0\n"
|
||||
" mul.f32 %f48, %f45, %f45;\n"
|
||||
" mov.f32 %f49, %f48;\n"
|
||||
" .loc 14 140 0\n"
|
||||
" mul.f32 %f50, %f48, %f48;\n"
|
||||
" bra.uni $Lt_0_11778;\n"
|
||||
"$Lt_0_11522:\n"
|
||||
" mov.f32 %f51, 0f3f800000; \n"
|
||||
" setp.eq.f32 %p7, %f46, %f51;\n"
|
||||
" @!%p7 bra $Lt_0_12034;\n"
|
||||
" .loc 14 142 0\n"
|
||||
" sqrt.approx.f32 %f52, %f45;\n"
|
||||
" mul.f32 %f53, %f45, %f52;\n"
|
||||
" mov.f32 %f50, %f53;\n"
|
||||
" .loc 14 143 0\n"
|
||||
" mul.f32 %f49, %f53, %f53;\n"
|
||||
" bra.uni $Lt_0_11778;\n"
|
||||
"$Lt_0_12034:\n"
|
||||
" .loc 14 145 0\n"
|
||||
" mul.f32 %f54, %f45, %f45;\n"
|
||||
" mul.f32 %f55, %f45, %f54;\n"
|
||||
" mov.f32 %f49, %f55;\n"
|
||||
" .loc 14 146 0\n"
|
||||
" mov.f32 %f50, %f55;\n"
|
||||
"$Lt_0_11778:\n"
|
||||
"$Lt_0_11266:\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 148 0\n"
|
||||
" mul.f32 %f56, %f45, %f29;\n"
|
||||
" mul.f32 %f57, %f49, %f56;\n"
|
||||
" ld.global.v2.f32 {%f58,%f59}, [%rd22+8];\n"
|
||||
" mul.f32 %f60, %f58, %f50;\n"
|
||||
" sub.f32 %f61, %f60, %f59;\n"
|
||||
" mul.f32 %f62, %f57, %f61;\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mad.f32 %f27, %f39, %f62, %f27;\n"
|
||||
" .loc 14 151 0\n"
|
||||
" mad.f32 %f26, %f38, %f62, %f26;\n"
|
||||
" .loc 14 152 0\n"
|
||||
" mad.f32 %f25, %f40, %f62, %f25;\n"
|
||||
" @!%p3 bra $Lt_0_12290;\n"
|
||||
" .loc 14 154 0\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd24, %rd23, %rd21;\n"
|
||||
" ld.global.v4.f32 {%f63,%f64,%f65,_}, [%rd24+0];\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 154 0\n"
|
||||
" mul.f32 %f66, %f29, %f49;\n"
|
||||
" mul.f32 %f67, %f63, %f50;\n"
|
||||
" sub.f32 %f68, %f67, %f64;\n"
|
||||
" mul.f32 %f69, %f66, %f68;\n"
|
||||
" sub.f32 %f70, %f69, %f65;\n"
|
||||
" add.f32 %f28, %f28, %f70;\n"
|
||||
"$Lt_0_12290:\n"
|
||||
" @!%p4 bra $Lt_0_12802;\n"
|
||||
" .loc 14 157 0\n"
|
||||
" mov.f32 %f71, %f6;\n"
|
||||
" mul.f32 %f72, %f39, %f39;\n"
|
||||
" mad.f32 %f73, %f62, %f72, %f71;\n"
|
||||
" mov.f32 %f6, %f73;\n"
|
||||
" .loc 14 158 0\n"
|
||||
" mov.f32 %f74, %f8;\n"
|
||||
" mad.f32 %f75, %f62, %f41, %f74;\n"
|
||||
" mov.f32 %f8, %f75;\n"
|
||||
" .loc 14 159 0\n"
|
||||
" mov.f32 %f76, %f10;\n"
|
||||
" mul.f32 %f77, %f40, %f40;\n"
|
||||
" mad.f32 %f78, %f62, %f77, %f76;\n"
|
||||
" mov.f32 %f10, %f78;\n"
|
||||
" .loc 14 160 0\n"
|
||||
" mov.f32 %f79, %f12;\n"
|
||||
" mul.f32 %f80, %f38, %f39;\n"
|
||||
" mad.f32 %f81, %f62, %f80, %f79;\n"
|
||||
" mov.f32 %f12, %f81;\n"
|
||||
" .loc 14 161 0\n"
|
||||
" mov.f32 %f82, %f14;\n"
|
||||
" mul.f32 %f83, %f39, %f40;\n"
|
||||
" mad.f32 %f84, %f62, %f83, %f82;\n"
|
||||
" mov.f32 %f14, %f84;\n"
|
||||
" .loc 14 162 0\n"
|
||||
" mul.f32 %f85, %f38, %f40;\n"
|
||||
" mad.f32 %f15, %f62, %f85, %f15;\n"
|
||||
" mov.f32 %f86, %f15;\n"
|
||||
"$Lt_0_12802:\n"
|
||||
"$Lt_0_10754:\n"
|
||||
" .loc 14 118 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p8, %rd13, %rd10;\n"
|
||||
" @%p8 bra $Lt_0_10498;\n"
|
||||
" bra.uni $Lt_0_9986;\n"
|
||||
"$Lt_0_14594:\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r30;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r31;\n"
|
||||
"$Lt_0_9986:\n"
|
||||
" .loc 14 169 0\n"
|
||||
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd26, %rd25, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_13570;\n"
|
||||
" .loc 14 171 0\n"
|
||||
" st.global.f32 [%rd26+0], %f28;\n"
|
||||
" .loc 14 172 0\n"
|
||||
" cvt.u64.s32 %rd27, %r6;\n"
|
||||
" mul.lo.u64 %rd28, %rd27, 4;\n"
|
||||
" add.u64 %rd26, %rd26, %rd28;\n"
|
||||
"$Lt_0_13570:\n"
|
||||
" @!%p4 bra $Lt_0_14082;\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f87, %f6;\n"
|
||||
" st.global.f32 [%rd26+0], %f87;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" cvt.u64.s32 %rd29, %r6;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f88, %f8;\n"
|
||||
" st.global.f32 [%rd26+0], %f88;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f89, %f10;\n"
|
||||
" st.global.f32 [%rd26+0], %f89;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f90, %f12;\n"
|
||||
" st.global.f32 [%rd26+0], %f90;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f91, %f14;\n"
|
||||
" st.global.f32 [%rd26+0], %f91;\n"
|
||||
" add.u64 %rd31, %rd30, %rd26;\n"
|
||||
" st.global.f32 [%rd31+0], %f15;\n"
|
||||
"$Lt_0_14082:\n"
|
||||
" .loc 14 180 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd33, %rd2, 16;\n"
|
||||
" add.u64 %rd34, %rd32, %rd33;\n"
|
||||
" mov.f32 %f92, %f93;\n"
|
||||
" st.global.v4.f32 [%rd34+0], {%f27,%f26,%f25,%f92};\n"
|
||||
"$Lt_0_9474:\n"
|
||||
" .loc 14 182 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<36>;\n"
|
||||
" .reg .u64 %rd<48>;\n"
|
||||
" .reg .f32 %f<102>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj180[16];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj1208[1024];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj31232[1024];\n"
|
||||
" .loc 14 189 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_11778;\n"
|
||||
" .loc 14 196 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_11778:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_12290;\n"
|
||||
" .loc 14 198 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" cvt.u64.s32 %rd8, %r1;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.f32 [%rd12+0], %f2;\n"
|
||||
" st.shared.f32 [%rd12+4], %f3;\n"
|
||||
" st.shared.f32 [%rd12+8], %f4;\n"
|
||||
" st.shared.f32 [%rd12+12], %f5;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_12802;\n"
|
||||
" .loc 14 200 0\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.f32 [%rd16+0], %f6;\n"
|
||||
" st.shared.f32 [%rd16+4], %f7;\n"
|
||||
" st.shared.f32 [%rd16+8], %f8;\n"
|
||||
" st.shared.f32 [%rd16+12], %f9;\n"
|
||||
"$Lt_1_12802:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
"$Lt_1_12290:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r6, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r7, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r8, %r6, %r7;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r9, %r10;\n"
|
||||
" @%p4 bra $Lt_1_13314;\n"
|
||||
" .loc 14 214 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 14 217 0\n"
|
||||
" cvt.u64.s32 %rd17, %r9;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd19, %rd18;\n"
|
||||
" ld.global.s32 %r11, [%rd20+0];\n"
|
||||
" .loc 14 219 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd21, %r12;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r13, [%rd23+0];\n"
|
||||
" .loc 14 220 0\n"
|
||||
" add.u64 %rd24, %rd23, %rd22;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
" mov.s32 %r14, %r11;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 223 0\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mul24.lo.s32 %r18, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd26, %r18;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd24, %rd27;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p5, %rd24, %rd28;\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" @%p5 bra $Lt_1_18434;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f29;\n"
|
||||
" mov.s32 %r24, 8;\n"
|
||||
" mul24.lo.s32 %r25, %r24, %r23;\n"
|
||||
" cvt.rn.f32.s32 %f34, %r25;\n"
|
||||
"$Lt_1_14338:\n"
|
||||
" .loc 14 230 0\n"
|
||||
" ld.global.s32 %r26, [%rd25+0];\n"
|
||||
" .loc 14 231 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd29, %r27;\n"
|
||||
" and.b64 %rd30, %rd29, 3;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
|
||||
" .loc 14 234 0\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.f32 %f44, %f27, %f41;\n"
|
||||
" sub.f32 %f45, %f26, %f40;\n"
|
||||
" sub.f32 %f46, %f28, %f42;\n"
|
||||
" mul.f32 %f47, %f44, %f44;\n"
|
||||
" mad.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" mad.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" add.f32 %f50, %f34, %f43;\n"
|
||||
" cvt.rzi.s32.f32 %r32, %f50;\n"
|
||||
" cvt.u64.s32 %rd33, %r32;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f51, [%rd35+0];\n"
|
||||
" setp.gt.f32 %p8, %f51, %f49;\n"
|
||||
" @!%p8 bra $Lt_1_16642;\n"
|
||||
" rcp.approx.f32 %f52, %f49;\n"
|
||||
" ld.shared.f32 %f53, [%rd35+4];\n"
|
||||
" mov.f32 %f54, 0f40000000; \n"
|
||||
" setp.eq.f32 %p9, %f53, %f54;\n"
|
||||
" @!%p9 bra $Lt_1_15362;\n"
|
||||
" .loc 14 248 0\n"
|
||||
" mul.f32 %f55, %f52, %f52;\n"
|
||||
" mov.f32 %f56, %f55;\n"
|
||||
" .loc 14 249 0\n"
|
||||
" mul.f32 %f57, %f55, %f55;\n"
|
||||
" bra.uni $Lt_1_15618;\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" mov.f32 %f58, 0f3f800000; \n"
|
||||
" .loc 14 234 0\n"
|
||||
" ld.shared.f32 %f53, [%rd35+4];\n"
|
||||
" .loc 14 249 0\n"
|
||||
" setp.eq.f32 %p10, %f53, %f58;\n"
|
||||
" @!%p10 bra $Lt_1_15874;\n"
|
||||
" .loc 14 251 0\n"
|
||||
" sqrt.approx.f32 %f59, %f52;\n"
|
||||
" mul.f32 %f60, %f52, %f59;\n"
|
||||
" mov.f32 %f57, %f60;\n"
|
||||
" .loc 14 252 0\n"
|
||||
" mul.f32 %f56, %f60, %f60;\n"
|
||||
" bra.uni $Lt_1_15618;\n"
|
||||
"$Lt_1_15874:\n"
|
||||
" .loc 14 254 0\n"
|
||||
" mul.f32 %f61, %f52, %f52;\n"
|
||||
" mul.f32 %f62, %f52, %f61;\n"
|
||||
" mov.f32 %f56, %f62;\n"
|
||||
" .loc 14 255 0\n"
|
||||
" mov.f32 %f57, %f62;\n"
|
||||
"$Lt_1_15618:\n"
|
||||
"$Lt_1_15106:\n"
|
||||
" .loc 14 231 0\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" .loc 14 257 0\n"
|
||||
" mul.f32 %f63, %f52, %f35;\n"
|
||||
" mul.f32 %f64, %f56, %f63;\n"
|
||||
" ld.shared.f32 %f65, [%rd35+12];\n"
|
||||
" ld.shared.f32 %f66, [%rd35+8];\n"
|
||||
" mul.f32 %f67, %f66, %f57;\n"
|
||||
" sub.f32 %f68, %f67, %f65;\n"
|
||||
" mul.f32 %f69, %f64, %f68;\n"
|
||||
" .loc 14 259 0\n"
|
||||
" mad.f32 %f32, %f45, %f69, %f32;\n"
|
||||
" .loc 14 260 0\n"
|
||||
" mad.f32 %f31, %f44, %f69, %f31;\n"
|
||||
" .loc 14 261 0\n"
|
||||
" mad.f32 %f30, %f46, %f69, %f30;\n"
|
||||
" @!%p6 bra $Lt_1_16130;\n"
|
||||
" .loc 14 263 0\n"
|
||||
" add.u64 %rd36, %rd34, %rd13;\n"
|
||||
" ld.shared.f32 %f70, [%rd36+8];\n"
|
||||
" .loc 14 231 0\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" .loc 14 263 0\n"
|
||||
" mul.f32 %f71, %f35, %f56;\n"
|
||||
" ld.shared.f32 %f72, [%rd36+4];\n"
|
||||
" ld.shared.f32 %f73, [%rd36+0];\n"
|
||||
" mul.f32 %f74, %f73, %f57;\n"
|
||||
" sub.f32 %f75, %f74, %f72;\n"
|
||||
" mul.f32 %f76, %f71, %f75;\n"
|
||||
" sub.f32 %f77, %f76, %f70;\n"
|
||||
" add.f32 %f33, %f33, %f77;\n"
|
||||
"$Lt_1_16130:\n"
|
||||
" @!%p7 bra $Lt_1_16642;\n"
|
||||
" .loc 14 266 0\n"
|
||||
" mov.f32 %f78, %f11;\n"
|
||||
" mul.f32 %f79, %f45, %f45;\n"
|
||||
" mad.f32 %f80, %f69, %f79, %f78;\n"
|
||||
" mov.f32 %f11, %f80;\n"
|
||||
" .loc 14 267 0\n"
|
||||
" mov.f32 %f81, %f13;\n"
|
||||
" mad.f32 %f82, %f69, %f47, %f81;\n"
|
||||
" mov.f32 %f13, %f82;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f83, %f15;\n"
|
||||
" mul.f32 %f84, %f46, %f46;\n"
|
||||
" mad.f32 %f85, %f69, %f84, %f83;\n"
|
||||
" mov.f32 %f15, %f85;\n"
|
||||
" .loc 14 269 0\n"
|
||||
" mov.f32 %f86, %f17;\n"
|
||||
" mul.f32 %f87, %f44, %f45;\n"
|
||||
" mad.f32 %f88, %f69, %f87, %f86;\n"
|
||||
" mov.f32 %f17, %f88;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f89, %f19;\n"
|
||||
" mul.f32 %f90, %f45, %f46;\n"
|
||||
" mad.f32 %f91, %f69, %f90, %f89;\n"
|
||||
" mov.f32 %f19, %f91;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" mul.f32 %f92, %f44, %f46;\n"
|
||||
" mad.f32 %f20, %f69, %f92, %f20;\n"
|
||||
" mov.f32 %f93, %f20;\n"
|
||||
"$Lt_1_16642:\n"
|
||||
"$Lt_1_14594:\n"
|
||||
" .loc 14 228 0\n"
|
||||
" add.u64 %rd25, %rd22, %rd25;\n"
|
||||
" setp.gt.u64 %p11, %rd28, %rd25;\n"
|
||||
" @%p11 bra $Lt_1_14338;\n"
|
||||
" bra.uni $Lt_1_13826;\n"
|
||||
"$Lt_1_18434:\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r33;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r34;\n"
|
||||
"$Lt_1_13826:\n"
|
||||
" .loc 14 278 0\n"
|
||||
" ld.param.u64 %rd37, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd38, %rd37, %rd18;\n"
|
||||
" @!%p6 bra $Lt_1_17410;\n"
|
||||
" .loc 14 280 0\n"
|
||||
" st.global.f32 [%rd38+0], %f33;\n"
|
||||
" .loc 14 281 0\n"
|
||||
" cvt.u64.s32 %rd39, %r10;\n"
|
||||
" mul.lo.u64 %rd40, %rd39, 4;\n"
|
||||
" add.u64 %rd38, %rd38, %rd40;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
" @!%p7 bra $Lt_1_17922;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" mov.f32 %f94, %f11;\n"
|
||||
" st.global.f32 [%rd38+0], %f94;\n"
|
||||
" .loc 14 286 0\n"
|
||||
" cvt.u64.s32 %rd41, %r10;\n"
|
||||
" mul.lo.u64 %rd42, %rd41, 4;\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" mov.f32 %f95, %f13;\n"
|
||||
" st.global.f32 [%rd38+0], %f95;\n"
|
||||
" .loc 14 286 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" mov.f32 %f96, %f15;\n"
|
||||
" st.global.f32 [%rd38+0], %f96;\n"
|
||||
" .loc 14 286 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" mov.f32 %f97, %f17;\n"
|
||||
" st.global.f32 [%rd38+0], %f97;\n"
|
||||
" .loc 14 286 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" mov.f32 %f98, %f19;\n"
|
||||
" st.global.f32 [%rd38+0], %f98;\n"
|
||||
" add.u64 %rd43, %rd42, %rd38;\n"
|
||||
" st.global.f32 [%rd43+0], %f20;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
" .loc 14 289 0\n"
|
||||
" ld.param.u64 %rd44, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd45, %rd17, 16;\n"
|
||||
" add.u64 %rd46, %rd44, %rd45;\n"
|
||||
" mov.f32 %f99, %f100;\n"
|
||||
" st.global.v4.f32 [%rd46+0], {%f32,%f31,%f30,%f99};\n"
|
||||
"$Lt_1_13314:\n"
|
||||
" .loc 14 291 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,829 @@
|
|||
const char * cmmc_long_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .tex .u64 q_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_g_ewald)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<42>;\n"
|
||||
" .reg .u64 %rd<38>;\n"
|
||||
" .reg .f32 %f<156>;\n"
|
||||
" .reg .pred %p<12>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj108[32];\n"
|
||||
" .loc 14 107 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 111 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+0], %f1;\n"
|
||||
" .loc 14 112 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+4], %f2;\n"
|
||||
" .loc 14 113 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+8], %f3;\n"
|
||||
" .loc 14 114 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+12], %f4;\n"
|
||||
" .loc 14 115 0\n"
|
||||
" ld.global.f32 %f5, [%rd1+16];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+16], %f5;\n"
|
||||
" .loc 14 116 0\n"
|
||||
" ld.global.f32 %f6, [%rd1+20];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+20], %f6;\n"
|
||||
" .loc 14 117 0\n"
|
||||
" ld.global.f32 %f7, [%rd1+24];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+24], %f7;\n"
|
||||
" .loc 14 118 0\n"
|
||||
" ld.global.f32 %f8, [%rd1+28];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+28], %f8;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_11778;\n"
|
||||
" .loc 14 129 0\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" mov.f32 %f17, 0f00000000; \n"
|
||||
" mov.f32 %f18, %f17;\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" mov.f32 %f20, %f19;\n"
|
||||
" .loc 14 132 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 134 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 135 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 138 0\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.s32 %r14, %r7;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 139 0\n"
|
||||
" mov.f32 %f33, %f29;\n"
|
||||
" mul24.lo.s32 %r18, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r18;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_18434;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f28;\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r25, %r24, %r23;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj108;\n"
|
||||
"$Lt_0_12802:\n"
|
||||
" .loc 14 143 0\n"
|
||||
" ld.global.s32 %r26, [%rd10+0];\n"
|
||||
" .loc 14 146 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r27;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 147 0\n"
|
||||
" mov.f32 %f40, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f41, [%rd19+16];\n"
|
||||
" sub.f32 %f42, %f40, %f41;\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, %r28;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.s32 %r32, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mov.f32 %f47, %f43;\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" cvt.rzi.s32.f32 %r33, %f50;\n"
|
||||
" sub.f32 %f51, %f26, %f48;\n"
|
||||
" sub.f32 %f52, %f25, %f47;\n"
|
||||
" sub.f32 %f53, %f27, %f49;\n"
|
||||
" mul.f32 %f54, %f51, %f51;\n"
|
||||
" mad.f32 %f55, %f52, %f52, %f54;\n"
|
||||
" mad.f32 %f56, %f53, %f53, %f55;\n"
|
||||
" add.s32 %r34, %r33, %r25;\n"
|
||||
" cvt.u64.s32 %rd20, %r34;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd21, %rd14;\n"
|
||||
" ld.global.f32 %f57, [%rd22+0];\n"
|
||||
" setp.gt.f32 %p5, %f57, %f56;\n"
|
||||
" @!%p5 bra $Lt_0_16642;\n"
|
||||
" rcp.approx.f32 %f58, %f56;\n"
|
||||
" ld.global.f32 %f59, [%rd22+4];\n"
|
||||
" setp.lt.f32 %p6, %f56, %f59;\n"
|
||||
" @!%p6 bra $Lt_0_13826;\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd24, %rd23, %rd21;\n"
|
||||
" ld.global.f32 %f60, [%rd24+0];\n"
|
||||
" mov.f32 %f61, 0f40000000; \n"
|
||||
" setp.eq.f32 %p7, %f60, %f61;\n"
|
||||
" @!%p7 bra $Lt_0_14338;\n"
|
||||
" .loc 14 166 0\n"
|
||||
" mul.f32 %f62, %f58, %f58;\n"
|
||||
" mov.f32 %f63, %f62;\n"
|
||||
" mov.f32 %f64, %f63;\n"
|
||||
" .loc 14 167 0\n"
|
||||
" mul.f32 %f65, %f62, %f62;\n"
|
||||
" mov.f32 %f66, %f65;\n"
|
||||
" bra.uni $Lt_0_14594;\n"
|
||||
"$Lt_0_14338:\n"
|
||||
" mov.f32 %f67, 0f3f800000; \n"
|
||||
" setp.eq.f32 %p8, %f60, %f67;\n"
|
||||
" @!%p8 bra $Lt_0_14850;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" sqrt.approx.f32 %f68, %f58;\n"
|
||||
" mul.f32 %f69, %f58, %f68;\n"
|
||||
" mov.f32 %f65, %f69;\n"
|
||||
" mov.f32 %f66, %f65;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" mul.f32 %f63, %f69, %f69;\n"
|
||||
" mov.f32 %f64, %f63;\n"
|
||||
" bra.uni $Lt_0_14594;\n"
|
||||
"$Lt_0_14850:\n"
|
||||
" .loc 14 172 0\n"
|
||||
" mul.f32 %f70, %f58, %f58;\n"
|
||||
" mul.f32 %f71, %f58, %f70;\n"
|
||||
" mov.f32 %f63, %f71;\n"
|
||||
" mov.f32 %f64, %f63;\n"
|
||||
" .loc 14 173 0\n"
|
||||
" mov.f32 %f65, %f71;\n"
|
||||
" mov.f32 %f66, %f65;\n"
|
||||
"$Lt_0_14594:\n"
|
||||
"$Lt_0_14082:\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 175 0\n"
|
||||
" mul.f32 %f72, %f39, %f63;\n"
|
||||
" ld.global.v2.f32 {%f73,%f74}, [%rd22+8];\n"
|
||||
" mul.f32 %f75, %f73, %f65;\n"
|
||||
" sub.f32 %f76, %f75, %f74;\n"
|
||||
" mul.f32 %f77, %f72, %f76;\n"
|
||||
" bra.uni $Lt_0_13570;\n"
|
||||
"$Lt_0_13826:\n"
|
||||
" .loc 14 177 0\n"
|
||||
" mov.f32 %f77, 0f00000000; \n"
|
||||
"$Lt_0_13570:\n"
|
||||
" ld.param.f32 %f78, [__cudaparm_kernel_pair_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p9, %f78, %f56;\n"
|
||||
" @!%p9 bra $Lt_0_15362;\n"
|
||||
" .loc 14 184 0\n"
|
||||
" sqrt.approx.f32 %f79, %f56;\n"
|
||||
" ld.param.f32 %f80, [__cudaparm_kernel_pair_g_ewald];\n"
|
||||
" mul.f32 %f81, %f80, %f79;\n"
|
||||
" mul.f32 %f82, %f81, %f81;\n"
|
||||
" mov.f32 %f83, 0f3f800000; \n"
|
||||
" mov.f32 %f84, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f85, %f84, %f81, %f83;\n"
|
||||
" neg.f32 %f86, %f82;\n"
|
||||
" rcp.approx.f32 %f87, %f85;\n"
|
||||
" mov.f32 %f88, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f89, %f86, %f88;\n"
|
||||
" ex2.approx.f32 %f90, %f89;\n"
|
||||
" mov.f32 %f91, 0f3e827906; \n"
|
||||
" mov.f32 %f92, 0fbe91a98e; \n"
|
||||
" mov.f32 %f93, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f94, 0fbfba00e3; \n"
|
||||
" mov.f32 %f95, 0f3f87dc22; \n"
|
||||
" mad.f32 %f96, %f95, %f87, %f94;\n"
|
||||
" mad.f32 %f97, %f87, %f96, %f93;\n"
|
||||
" mad.f32 %f98, %f87, %f97, %f92;\n"
|
||||
" mad.f32 %f99, %f87, %f98, %f91;\n"
|
||||
" mul.f32 %f100, %f87, %f99;\n"
|
||||
" mul.f32 %f101, %f90, %f100;\n"
|
||||
" mov.f32 %f102, %f101;\n"
|
||||
" mov.s32 %r35, %r28;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r35,%r36,%r37,%r38}];\n"
|
||||
" .loc 14 185 0\n"
|
||||
" mov.f32 %f107, %f103;\n"
|
||||
" ld.param.f32 %f108, [__cudaparm_kernel_pair_qqrd2e];\n"
|
||||
" mul.f32 %f109, %f108, %f33;\n"
|
||||
" mul.f32 %f110, %f109, %f107;\n"
|
||||
" div.approx.f32 %f111, %f110, %f79;\n"
|
||||
" mov.f32 %f112, %f111;\n"
|
||||
" .loc 14 186 0\n"
|
||||
" mov.f32 %f113, 0f3f906ebb; \n"
|
||||
" mul.f32 %f114, %f81, %f113;\n"
|
||||
" mad.f32 %f115, %f90, %f114, %f101;\n"
|
||||
" sub.f32 %f116, %f115, %f42;\n"
|
||||
" mul.f32 %f117, %f111, %f116;\n"
|
||||
" bra.uni $Lt_0_15106;\n"
|
||||
"$Lt_0_15362:\n"
|
||||
" .loc 14 189 0\n"
|
||||
" mov.f32 %f112, 0f00000000; \n"
|
||||
" mov.f32 %f117, 0f00000000; \n"
|
||||
"$Lt_0_15106:\n"
|
||||
" .loc 14 194 0\n"
|
||||
" add.f32 %f118, %f117, %f77;\n"
|
||||
" mul.f32 %f119, %f118, %f58;\n"
|
||||
" mad.f32 %f36, %f52, %f119, %f36;\n"
|
||||
" .loc 14 195 0\n"
|
||||
" mad.f32 %f35, %f51, %f119, %f35;\n"
|
||||
" .loc 14 196 0\n"
|
||||
" mad.f32 %f34, %f53, %f119, %f34;\n"
|
||||
" @!%p3 bra $Lt_0_16130;\n"
|
||||
" .loc 14 199 0\n"
|
||||
" mov.f32 %f120, %f102;\n"
|
||||
" sub.f32 %f121, %f120, %f42;\n"
|
||||
" mad.f32 %f37, %f112, %f121, %f37;\n"
|
||||
" @!%p6 bra $Lt_0_16130;\n"
|
||||
" .loc 14 201 0\n"
|
||||
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd26, %rd25, %rd21;\n"
|
||||
" ld.global.v4.f32 {_,%f122,%f123,%f124}, [%rd26+0];\n"
|
||||
" mov.f32 %f125, %f64;\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 201 0\n"
|
||||
" mul.f32 %f126, %f125, %f39;\n"
|
||||
" mov.f32 %f127, %f66;\n"
|
||||
" mul.f32 %f128, %f122, %f127;\n"
|
||||
" sub.f32 %f129, %f128, %f123;\n"
|
||||
" mul.f32 %f130, %f126, %f129;\n"
|
||||
" sub.f32 %f131, %f130, %f124;\n"
|
||||
" add.f32 %f38, %f38, %f131;\n"
|
||||
"$Lt_0_16130:\n"
|
||||
"$Lt_0_15618:\n"
|
||||
" @!%p4 bra $Lt_0_16642;\n"
|
||||
" .loc 14 206 0\n"
|
||||
" mov.f32 %f132, %f10;\n"
|
||||
" mul.f32 %f133, %f52, %f52;\n"
|
||||
" mad.f32 %f134, %f119, %f133, %f132;\n"
|
||||
" mov.f32 %f10, %f134;\n"
|
||||
" .loc 14 207 0\n"
|
||||
" mov.f32 %f135, %f12;\n"
|
||||
" mad.f32 %f136, %f119, %f54, %f135;\n"
|
||||
" mov.f32 %f12, %f136;\n"
|
||||
" .loc 14 208 0\n"
|
||||
" mov.f32 %f137, %f14;\n"
|
||||
" mul.f32 %f138, %f53, %f53;\n"
|
||||
" mad.f32 %f139, %f119, %f138, %f137;\n"
|
||||
" mov.f32 %f14, %f139;\n"
|
||||
" .loc 14 209 0\n"
|
||||
" mov.f32 %f140, %f16;\n"
|
||||
" mul.f32 %f141, %f51, %f52;\n"
|
||||
" mad.f32 %f142, %f119, %f141, %f140;\n"
|
||||
" mov.f32 %f16, %f142;\n"
|
||||
" .loc 14 210 0\n"
|
||||
" mov.f32 %f143, %f18;\n"
|
||||
" mul.f32 %f144, %f52, %f53;\n"
|
||||
" mad.f32 %f145, %f119, %f144, %f143;\n"
|
||||
" mov.f32 %f18, %f145;\n"
|
||||
" .loc 14 211 0\n"
|
||||
" mul.f32 %f146, %f51, %f53;\n"
|
||||
" mad.f32 %f19, %f119, %f146, %f19;\n"
|
||||
" mov.f32 %f147, %f19;\n"
|
||||
"$Lt_0_16642:\n"
|
||||
"$Lt_0_13058:\n"
|
||||
" .loc 14 142 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p10, %rd13, %rd10;\n"
|
||||
" @%p10 bra $Lt_0_12802;\n"
|
||||
" bra.uni $Lt_0_12290;\n"
|
||||
"$Lt_0_18434:\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r39;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r40;\n"
|
||||
"$Lt_0_12290:\n"
|
||||
" .loc 14 218 0\n"
|
||||
" ld.param.u64 %rd27, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd28, %rd27, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_17410;\n"
|
||||
" .loc 14 220 0\n"
|
||||
" st.global.f32 [%rd28+0], %f38;\n"
|
||||
" .loc 14 221 0\n"
|
||||
" cvt.u64.s32 %rd29, %r6;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd28, %rd30, %rd28;\n"
|
||||
" .loc 14 222 0\n"
|
||||
" st.global.f32 [%rd28+0], %f37;\n"
|
||||
" .loc 14 223 0\n"
|
||||
" add.u64 %rd28, %rd30, %rd28;\n"
|
||||
"$Lt_0_17410:\n"
|
||||
" @!%p4 bra $Lt_0_17922;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f148, %f10;\n"
|
||||
" st.global.f32 [%rd28+0], %f148;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" cvt.u64.s32 %rd31, %r6;\n"
|
||||
" mul.lo.u64 %rd32, %rd31, 4;\n"
|
||||
" add.u64 %rd28, %rd32, %rd28;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f149, %f12;\n"
|
||||
" st.global.f32 [%rd28+0], %f149;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" add.u64 %rd28, %rd32, %rd28;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f150, %f14;\n"
|
||||
" st.global.f32 [%rd28+0], %f150;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" add.u64 %rd28, %rd32, %rd28;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f151, %f16;\n"
|
||||
" st.global.f32 [%rd28+0], %f151;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" add.u64 %rd28, %rd32, %rd28;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f152, %f18;\n"
|
||||
" st.global.f32 [%rd28+0], %f152;\n"
|
||||
" add.u64 %rd33, %rd32, %rd28;\n"
|
||||
" st.global.f32 [%rd33+0], %f19;\n"
|
||||
"$Lt_0_17922:\n"
|
||||
" .loc 14 231 0\n"
|
||||
" ld.param.u64 %rd34, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd35, %rd2, 16;\n"
|
||||
" add.u64 %rd36, %rd34, %rd35;\n"
|
||||
" mov.f32 %f153, %f154;\n"
|
||||
" st.global.v4.f32 [%rd36+0], {%f36,%f35,%f34,%f153};\n"
|
||||
"$Lt_0_11778:\n"
|
||||
" .loc 14 233 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<43>;\n"
|
||||
" .reg .u64 %rd<49>;\n"
|
||||
" .reg .f32 %f<159>;\n"
|
||||
" .reg .pred %p<14>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj244[32];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj3288[1024];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj11312[1024];\n"
|
||||
" .loc 14 242 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 7;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_13314;\n"
|
||||
" .loc 14 249 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_13314:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_13826;\n"
|
||||
" .loc 14 251 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj3288;\n"
|
||||
" mov.u64 %rd8, __cuda_lj11312;\n"
|
||||
" cvt.u64.s32 %rd9, %r1;\n"
|
||||
" mul.lo.u64 %rd10, %rd9, 16;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd12, %rd11, %rd10;\n"
|
||||
" add.u64 %rd13, %rd10, %rd8;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n"
|
||||
" st.shared.f32 [%rd13+0], %f2;\n"
|
||||
" st.shared.f32 [%rd13+4], %f3;\n"
|
||||
" st.shared.f32 [%rd13+8], %f4;\n"
|
||||
" st.shared.f32 [%rd13+12], %f5;\n"
|
||||
" .loc 14 252 0\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd10;\n"
|
||||
" add.u64 %rd16, %rd10, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.f32 [%rd16+0], %f6;\n"
|
||||
" st.shared.f32 [%rd16+4], %f7;\n"
|
||||
" st.shared.f32 [%rd16+8], %f8;\n"
|
||||
" st.shared.f32 [%rd16+12], %f9;\n"
|
||||
"$Lt_1_13826:\n"
|
||||
" mov.u64 %rd7, __cuda_lj3288;\n"
|
||||
" mov.u64 %rd8, __cuda_lj11312;\n"
|
||||
" .loc 14 255 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r4, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r5, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r6, %r4, %r5;\n"
|
||||
" add.s32 %r7, %r6, %r1;\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p3, %r7, %r8;\n"
|
||||
" @%p3 bra $Lt_1_14338;\n"
|
||||
" .loc 14 267 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" cvt.u64.s32 %rd17, %r7;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd19, %rd18;\n"
|
||||
" ld.global.s32 %r9, [%rd20+0];\n"
|
||||
" .loc 14 272 0\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd21, %r10;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r11, [%rd23+0];\n"
|
||||
" .loc 14 273 0\n"
|
||||
" add.u64 %rd24, %rd23, %rd22;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
" mov.s32 %r12, %r9;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" mov.s32 %r14, 0;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r12,%r13,%r14,%r15}];\n"
|
||||
" .loc 14 276 0\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mov.s32 %r16, %r9;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r16,%r17,%r18,%r19}];\n"
|
||||
" .loc 14 277 0\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mul24.lo.s32 %r20, %r11, %r10;\n"
|
||||
" cvt.s64.s32 %rd26, %r20;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd24, %rd27;\n"
|
||||
" ld.param.s32 %r21, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r22, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p4, %rd24, %rd28;\n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" mov.f32 %f39, 0f00000000; \n"
|
||||
" @%p4 bra $Lt_1_20994;\n"
|
||||
" mov.s32 %r23, 0;\n"
|
||||
" setp.gt.s32 %p5, %r22, %r23;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" setp.gt.s32 %p6, %r21, %r24;\n"
|
||||
" cvt.rzi.s32.f32 %r25, %f29;\n"
|
||||
" mov.s32 %r26, 8;\n"
|
||||
" mul24.lo.s32 %r27, %r26, %r25;\n"
|
||||
" cvt.rn.f32.s32 %f40, %r27;\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" .loc 14 282 0\n"
|
||||
" ld.global.s32 %r28, [%rd25+0];\n"
|
||||
" .loc 14 285 0\n"
|
||||
" shr.s32 %r29, %r28, 30;\n"
|
||||
" cvt.s64.s32 %rd29, %r29;\n"
|
||||
" and.b64 %rd30, %rd29, 3;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 286 0\n"
|
||||
" mov.f32 %f42, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f43, [%rd32+16];\n"
|
||||
" sub.f32 %f44, %f42, %f43;\n"
|
||||
" and.b32 %r30, %r28, 1073741823;\n"
|
||||
" mov.s32 %r31, %r30;\n"
|
||||
" mov.s32 %r32, 0;\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r31,%r32,%r33,%r34}];\n"
|
||||
" .loc 14 289 0\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" mov.f32 %f51, %f47;\n"
|
||||
" mov.f32 %f52, %f48;\n"
|
||||
" sub.f32 %f53, %f27, %f50;\n"
|
||||
" sub.f32 %f54, %f26, %f49;\n"
|
||||
" sub.f32 %f55, %f28, %f51;\n"
|
||||
" mul.f32 %f56, %f53, %f53;\n"
|
||||
" mad.f32 %f57, %f54, %f54, %f56;\n"
|
||||
" mad.f32 %f58, %f55, %f55, %f57;\n"
|
||||
" add.f32 %f59, %f40, %f52;\n"
|
||||
" cvt.rzi.s32.f32 %r35, %f59;\n"
|
||||
" cvt.u64.s32 %rd33, %r35;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd8;\n"
|
||||
" ld.shared.f32 %f60, [%rd35+0];\n"
|
||||
" setp.gt.f32 %p7, %f60, %f58;\n"
|
||||
" @!%p7 bra $Lt_1_19202;\n"
|
||||
" rcp.approx.f32 %f61, %f58;\n"
|
||||
" ld.shared.f32 %f62, [%rd35+4];\n"
|
||||
" setp.lt.f32 %p8, %f58, %f62;\n"
|
||||
" @!%p8 bra $Lt_1_16386;\n"
|
||||
" add.u64 %rd36, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f63, [%rd36+0];\n"
|
||||
" mov.f32 %f64, 0f40000000; \n"
|
||||
" setp.eq.f32 %p9, %f63, %f64;\n"
|
||||
" @!%p9 bra $Lt_1_16898;\n"
|
||||
" .loc 14 304 0\n"
|
||||
" mul.f32 %f65, %f61, %f61;\n"
|
||||
" mov.f32 %f66, %f65;\n"
|
||||
" mov.f32 %f67, %f66;\n"
|
||||
" .loc 14 305 0\n"
|
||||
" mul.f32 %f68, %f65, %f65;\n"
|
||||
" mov.f32 %f69, %f68;\n"
|
||||
" bra.uni $Lt_1_17154;\n"
|
||||
"$Lt_1_16898:\n"
|
||||
" mov.f32 %f70, 0f3f800000; \n"
|
||||
" .loc 14 289 0\n"
|
||||
" ld.shared.f32 %f63, [%rd36+0];\n"
|
||||
" .loc 14 305 0\n"
|
||||
" setp.eq.f32 %p10, %f63, %f70;\n"
|
||||
" @!%p10 bra $Lt_1_17410;\n"
|
||||
" .loc 14 307 0\n"
|
||||
" sqrt.approx.f32 %f71, %f61;\n"
|
||||
" mul.f32 %f72, %f61, %f71;\n"
|
||||
" mov.f32 %f68, %f72;\n"
|
||||
" mov.f32 %f69, %f68;\n"
|
||||
" .loc 14 308 0\n"
|
||||
" mul.f32 %f66, %f72, %f72;\n"
|
||||
" mov.f32 %f67, %f66;\n"
|
||||
" bra.uni $Lt_1_17154;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
" .loc 14 310 0\n"
|
||||
" mul.f32 %f73, %f61, %f61;\n"
|
||||
" mul.f32 %f74, %f61, %f73;\n"
|
||||
" mov.f32 %f66, %f74;\n"
|
||||
" mov.f32 %f67, %f66;\n"
|
||||
" .loc 14 311 0\n"
|
||||
" mov.f32 %f68, %f74;\n"
|
||||
" mov.f32 %f69, %f68;\n"
|
||||
"$Lt_1_17154:\n"
|
||||
"$Lt_1_16642:\n"
|
||||
" .loc 14 285 0\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 313 0\n"
|
||||
" mul.f32 %f75, %f41, %f66;\n"
|
||||
" ld.shared.f32 %f76, [%rd35+12];\n"
|
||||
" ld.shared.f32 %f77, [%rd35+8];\n"
|
||||
" mul.f32 %f78, %f77, %f68;\n"
|
||||
" sub.f32 %f79, %f78, %f76;\n"
|
||||
" mul.f32 %f80, %f75, %f79;\n"
|
||||
" bra.uni $Lt_1_16130;\n"
|
||||
"$Lt_1_16386:\n"
|
||||
" .loc 14 315 0\n"
|
||||
" mov.f32 %f80, 0f00000000; \n"
|
||||
"$Lt_1_16130:\n"
|
||||
" ld.param.f32 %f81, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p11, %f81, %f58;\n"
|
||||
" @!%p11 bra $Lt_1_17922;\n"
|
||||
" .loc 14 322 0\n"
|
||||
" sqrt.approx.f32 %f82, %f58;\n"
|
||||
" ld.param.f32 %f83, [__cudaparm_kernel_pair_fast_g_ewald];\n"
|
||||
" mul.f32 %f84, %f83, %f82;\n"
|
||||
" mul.f32 %f85, %f84, %f84;\n"
|
||||
" mov.f32 %f86, 0f3f800000; \n"
|
||||
" mov.f32 %f87, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f88, %f87, %f84, %f86;\n"
|
||||
" neg.f32 %f89, %f85;\n"
|
||||
" rcp.approx.f32 %f90, %f88;\n"
|
||||
" mov.f32 %f91, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f92, %f89, %f91;\n"
|
||||
" ex2.approx.f32 %f93, %f92;\n"
|
||||
" mov.f32 %f94, 0f3e827906; \n"
|
||||
" mov.f32 %f95, 0fbe91a98e; \n"
|
||||
" mov.f32 %f96, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f97, 0fbfba00e3; \n"
|
||||
" mov.f32 %f98, 0f3f87dc22; \n"
|
||||
" mad.f32 %f99, %f98, %f90, %f97;\n"
|
||||
" mad.f32 %f100, %f90, %f99, %f96;\n"
|
||||
" mad.f32 %f101, %f90, %f100, %f95;\n"
|
||||
" mad.f32 %f102, %f90, %f101, %f94;\n"
|
||||
" mul.f32 %f103, %f90, %f102;\n"
|
||||
" mul.f32 %f104, %f93, %f103;\n"
|
||||
" mov.f32 %f105, %f104;\n"
|
||||
" mov.s32 %r36, %r30;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r36,%r37,%r38,%r39}];\n"
|
||||
" .loc 14 323 0\n"
|
||||
" mov.f32 %f110, %f106;\n"
|
||||
" ld.param.f32 %f111, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
||||
" mul.f32 %f112, %f111, %f34;\n"
|
||||
" mul.f32 %f113, %f112, %f110;\n"
|
||||
" div.approx.f32 %f114, %f113, %f82;\n"
|
||||
" mov.f32 %f115, %f114;\n"
|
||||
" .loc 14 324 0\n"
|
||||
" mov.f32 %f116, 0f3f906ebb; \n"
|
||||
" mul.f32 %f117, %f84, %f116;\n"
|
||||
" mad.f32 %f118, %f93, %f117, %f104;\n"
|
||||
" sub.f32 %f119, %f118, %f44;\n"
|
||||
" mul.f32 %f120, %f114, %f119;\n"
|
||||
" bra.uni $Lt_1_17666;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
" .loc 14 327 0\n"
|
||||
" mov.f32 %f115, 0f00000000; \n"
|
||||
" mov.f32 %f120, 0f00000000; \n"
|
||||
"$Lt_1_17666:\n"
|
||||
" .loc 14 332 0\n"
|
||||
" add.f32 %f121, %f120, %f80;\n"
|
||||
" mul.f32 %f122, %f121, %f61;\n"
|
||||
" mad.f32 %f37, %f54, %f122, %f37;\n"
|
||||
" .loc 14 333 0\n"
|
||||
" mad.f32 %f36, %f53, %f122, %f36;\n"
|
||||
" .loc 14 334 0\n"
|
||||
" mad.f32 %f35, %f55, %f122, %f35;\n"
|
||||
" @!%p5 bra $Lt_1_18690;\n"
|
||||
" .loc 14 337 0\n"
|
||||
" mov.f32 %f123, %f105;\n"
|
||||
" sub.f32 %f124, %f123, %f44;\n"
|
||||
" mad.f32 %f38, %f115, %f124, %f38;\n"
|
||||
" @!%p8 bra $Lt_1_18690;\n"
|
||||
" .loc 14 339 0\n"
|
||||
" add.u64 %rd37, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f125, [%rd37+12];\n"
|
||||
" mov.f32 %f126, %f67;\n"
|
||||
" .loc 14 285 0\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 339 0\n"
|
||||
" mul.f32 %f127, %f126, %f41;\n"
|
||||
" ld.shared.f32 %f128, [%rd37+8];\n"
|
||||
" ld.shared.f32 %f129, [%rd37+4];\n"
|
||||
" mov.f32 %f130, %f69;\n"
|
||||
" mul.f32 %f131, %f129, %f130;\n"
|
||||
" sub.f32 %f132, %f131, %f128;\n"
|
||||
" mul.f32 %f133, %f127, %f132;\n"
|
||||
" sub.f32 %f134, %f133, %f125;\n"
|
||||
" add.f32 %f39, %f39, %f134;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
"$Lt_1_18178:\n"
|
||||
" @!%p6 bra $Lt_1_19202;\n"
|
||||
" .loc 14 344 0\n"
|
||||
" mov.f32 %f135, %f11;\n"
|
||||
" mul.f32 %f136, %f54, %f54;\n"
|
||||
" mad.f32 %f137, %f122, %f136, %f135;\n"
|
||||
" mov.f32 %f11, %f137;\n"
|
||||
" .loc 14 345 0\n"
|
||||
" mov.f32 %f138, %f13;\n"
|
||||
" mad.f32 %f139, %f122, %f56, %f138;\n"
|
||||
" mov.f32 %f13, %f139;\n"
|
||||
" .loc 14 346 0\n"
|
||||
" mov.f32 %f140, %f15;\n"
|
||||
" mul.f32 %f141, %f55, %f55;\n"
|
||||
" mad.f32 %f142, %f122, %f141, %f140;\n"
|
||||
" mov.f32 %f15, %f142;\n"
|
||||
" .loc 14 347 0\n"
|
||||
" mov.f32 %f143, %f17;\n"
|
||||
" mul.f32 %f144, %f53, %f54;\n"
|
||||
" mad.f32 %f145, %f122, %f144, %f143;\n"
|
||||
" mov.f32 %f17, %f145;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f146, %f19;\n"
|
||||
" mul.f32 %f147, %f54, %f55;\n"
|
||||
" mad.f32 %f148, %f122, %f147, %f146;\n"
|
||||
" mov.f32 %f19, %f148;\n"
|
||||
" .loc 14 349 0\n"
|
||||
" mul.f32 %f149, %f53, %f55;\n"
|
||||
" mad.f32 %f20, %f122, %f149, %f20;\n"
|
||||
" mov.f32 %f150, %f20;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
"$Lt_1_15618:\n"
|
||||
" .loc 14 281 0\n"
|
||||
" add.u64 %rd25, %rd22, %rd25;\n"
|
||||
" setp.gt.u64 %p12, %rd28, %rd25;\n"
|
||||
" @%p12 bra $Lt_1_15362;\n"
|
||||
" bra.uni $Lt_1_14850;\n"
|
||||
"$Lt_1_20994:\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" setp.gt.s32 %p5, %r22, %r40;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" setp.gt.s32 %p6, %r21, %r41;\n"
|
||||
"$Lt_1_14850:\n"
|
||||
" .loc 14 356 0\n"
|
||||
" ld.param.u64 %rd38, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd39, %rd38, %rd18;\n"
|
||||
" @!%p5 bra $Lt_1_19970;\n"
|
||||
" .loc 14 358 0\n"
|
||||
" st.global.f32 [%rd39+0], %f39;\n"
|
||||
" .loc 14 359 0\n"
|
||||
" cvt.u64.s32 %rd40, %r8;\n"
|
||||
" mul.lo.u64 %rd41, %rd40, 4;\n"
|
||||
" add.u64 %rd39, %rd41, %rd39;\n"
|
||||
" .loc 14 360 0\n"
|
||||
" st.global.f32 [%rd39+0], %f38;\n"
|
||||
" .loc 14 361 0\n"
|
||||
" add.u64 %rd39, %rd41, %rd39;\n"
|
||||
"$Lt_1_19970:\n"
|
||||
" @!%p6 bra $Lt_1_20482;\n"
|
||||
" .loc 14 365 0\n"
|
||||
" mov.f32 %f151, %f11;\n"
|
||||
" st.global.f32 [%rd39+0], %f151;\n"
|
||||
" .loc 14 366 0\n"
|
||||
" cvt.u64.s32 %rd42, %r8;\n"
|
||||
" mul.lo.u64 %rd43, %rd42, 4;\n"
|
||||
" add.u64 %rd39, %rd43, %rd39;\n"
|
||||
" .loc 14 365 0\n"
|
||||
" mov.f32 %f152, %f13;\n"
|
||||
" st.global.f32 [%rd39+0], %f152;\n"
|
||||
" .loc 14 366 0\n"
|
||||
" add.u64 %rd39, %rd43, %rd39;\n"
|
||||
" .loc 14 365 0\n"
|
||||
" mov.f32 %f153, %f15;\n"
|
||||
" st.global.f32 [%rd39+0], %f153;\n"
|
||||
" .loc 14 366 0\n"
|
||||
" add.u64 %rd39, %rd43, %rd39;\n"
|
||||
" .loc 14 365 0\n"
|
||||
" mov.f32 %f154, %f17;\n"
|
||||
" st.global.f32 [%rd39+0], %f154;\n"
|
||||
" .loc 14 366 0\n"
|
||||
" add.u64 %rd39, %rd43, %rd39;\n"
|
||||
" .loc 14 365 0\n"
|
||||
" mov.f32 %f155, %f19;\n"
|
||||
" st.global.f32 [%rd39+0], %f155;\n"
|
||||
" add.u64 %rd44, %rd43, %rd39;\n"
|
||||
" st.global.f32 [%rd44+0], %f20;\n"
|
||||
"$Lt_1_20482:\n"
|
||||
" .loc 14 369 0\n"
|
||||
" ld.param.u64 %rd45, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd46, %rd17, 16;\n"
|
||||
" add.u64 %rd47, %rd45, %rd46;\n"
|
||||
" mov.f32 %f156, %f157;\n"
|
||||
" st.global.v4.f32 [%rd47+0], {%f37,%f36,%f35,%f156};\n"
|
||||
"$Lt_1_14338:\n"
|
||||
" .loc 14 371 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,828 @@
|
|||
const char * crml_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .tex .u64 q_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_g_ewald,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_denom_lj,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_bothsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_ljsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_lj_innersq)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<47>;\n"
|
||||
" .reg .u64 %rd<37>;\n"
|
||||
" .reg .f32 %f<170>;\n"
|
||||
" .reg .pred %p<12>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj116[32];\n"
|
||||
" .loc 14 109 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 114 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+0], %f1;\n"
|
||||
" .loc 14 115 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+4], %f2;\n"
|
||||
" .loc 14 116 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+8], %f3;\n"
|
||||
" .loc 14 117 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+12], %f4;\n"
|
||||
" .loc 14 118 0\n"
|
||||
" ld.global.f32 %f5, [%rd1+16];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+16], %f5;\n"
|
||||
" .loc 14 119 0\n"
|
||||
" ld.global.f32 %f6, [%rd1+20];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+20], %f6;\n"
|
||||
" .loc 14 120 0\n"
|
||||
" ld.global.f32 %f7, [%rd1+24];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+24], %f7;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.global.f32 %f8, [%rd1+28];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj116+28], %f8;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_11778;\n"
|
||||
" .loc 14 132 0\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" mov.f32 %f17, 0f00000000; \n"
|
||||
" mov.f32 %f18, %f17;\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" mov.f32 %f20, %f19;\n"
|
||||
" .loc 14 135 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 137 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 138 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 141 0\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.s32 %r14, %r7;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 142 0\n"
|
||||
" mov.f32 %f33, %f29;\n"
|
||||
" mul24.lo.s32 %r18, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r18;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_17922;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r22;\n"
|
||||
" ld.param.f32 %f39, [__cudaparm_kernel_pair_cut_bothsq];\n"
|
||||
" mov.u64 %rd14, __cuda_sp_lj116;\n"
|
||||
"$Lt_0_12802:\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.global.s32 %r23, [%rd10+0];\n"
|
||||
" .loc 14 149 0\n"
|
||||
" shr.s32 %r24, %r23, 30;\n"
|
||||
" cvt.s64.s32 %rd15, %r24;\n"
|
||||
" and.b64 %rd16, %rd15, 3;\n"
|
||||
" mul.lo.u64 %rd17, %rd16, 4;\n"
|
||||
" add.u64 %rd18, %rd14, %rd17;\n"
|
||||
" ld.shared.f32 %f40, [%rd18+0];\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mov.f32 %f41, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f42, [%rd18+16];\n"
|
||||
" sub.f32 %f43, %f41, %f42;\n"
|
||||
" and.b32 %r25, %r23, 1073741823;\n"
|
||||
" mov.s32 %r26, %r25;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" mov.s32 %r28, 0;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r26,%r27,%r28,%r29}];\n"
|
||||
" .loc 14 153 0\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" mov.f32 %f51, %f47;\n"
|
||||
" sub.f32 %f52, %f26, %f49;\n"
|
||||
" sub.f32 %f53, %f25, %f48;\n"
|
||||
" sub.f32 %f54, %f27, %f50;\n"
|
||||
" mul.f32 %f55, %f52, %f52;\n"
|
||||
" mad.f32 %f56, %f53, %f53, %f55;\n"
|
||||
" mad.f32 %f57, %f54, %f54, %f56;\n"
|
||||
" setp.lt.f32 %p5, %f57, %f39;\n"
|
||||
" @!%p5 bra $Lt_0_16130;\n"
|
||||
" ld.param.f32 %f58, [__cudaparm_kernel_pair_cut_ljsq];\n"
|
||||
" setp.lt.f32 %p6, %f57, %f58;\n"
|
||||
" rcp.approx.f32 %f59, %f57;\n"
|
||||
" @!%p6 bra $Lt_0_13826;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mul.f32 %f60, %f59, %f59;\n"
|
||||
" mul.f32 %f61, %f59, %f60;\n"
|
||||
" mov.f32 %f62, %f61;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" cvt.rzi.s32.f32 %r30, %f51;\n"
|
||||
" cvt.rzi.s32.f32 %r31, %f28;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" ld.param.s32 %r32, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r33, %r32, %r31;\n"
|
||||
" add.s32 %r34, %r30, %r33;\n"
|
||||
" cvt.u64.s32 %rd20, %r34;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd19, %rd21;\n"
|
||||
" .loc 14 149 0\n"
|
||||
" ld.shared.f32 %f40, [%rd18+0];\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mul.f32 %f63, %f61, %f40;\n"
|
||||
" ld.global.v2.f32 {%f64,%f65}, [%rd22+0];\n"
|
||||
" mul.f32 %f66, %f64, %f61;\n"
|
||||
" sub.f32 %f67, %f66, %f65;\n"
|
||||
" mul.f32 %f68, %f63, %f67;\n"
|
||||
" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_lj_innersq];\n"
|
||||
" setp.gt.f32 %p7, %f57, %f69;\n"
|
||||
" @!%p7 bra $Lt_0_13570;\n"
|
||||
" .loc 14 175 0\n"
|
||||
" add.f32 %f70, %f57, %f57;\n"
|
||||
" sub.f32 %f71, %f58, %f57;\n"
|
||||
" add.f32 %f72, %f70, %f58;\n"
|
||||
" mul.f32 %f73, %f71, %f71;\n"
|
||||
" mov.f32 %f74, 0f40400000; \n"
|
||||
" mul.f32 %f75, %f74, %f69;\n"
|
||||
" sub.f32 %f76, %f72, %f75;\n"
|
||||
" ld.param.f32 %f77, [__cudaparm_kernel_pair_denom_lj];\n"
|
||||
" div.approx.f32 %f78, %f76, %f77;\n"
|
||||
" mul.f32 %f79, %f73, %f78;\n"
|
||||
" mov.f32 %f80, %f79;\n"
|
||||
" .loc 14 178 0\n"
|
||||
" mov.f32 %f81, 0f41400000; \n"
|
||||
" mul.f32 %f82, %f57, %f81;\n"
|
||||
" mul.f32 %f83, %f71, %f82;\n"
|
||||
" sub.f32 %f84, %f57, %f69;\n"
|
||||
" mul.f32 %f85, %f83, %f84;\n"
|
||||
" div.approx.f32 %f86, %f85, %f77;\n"
|
||||
" ld.global.v2.f32 {%f87,%f88}, [%rd22+8];\n"
|
||||
" mul.f32 %f89, %f87, %f61;\n"
|
||||
" sub.f32 %f90, %f89, %f88;\n"
|
||||
" mul.f32 %f91, %f61, %f90;\n"
|
||||
" mul.f32 %f92, %f86, %f91;\n"
|
||||
" mad.f32 %f68, %f68, %f79, %f92;\n"
|
||||
" bra.uni $Lt_0_13570;\n"
|
||||
"$Lt_0_13826:\n"
|
||||
" .loc 14 181 0\n"
|
||||
" mov.f32 %f68, 0f00000000; \n"
|
||||
"$Lt_0_13570:\n"
|
||||
" ld.param.f32 %f93, [__cudaparm_kernel_pair_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p8, %f93, %f57;\n"
|
||||
" @!%p8 bra $Lt_0_14850;\n"
|
||||
" .loc 14 188 0\n"
|
||||
" sqrt.approx.f32 %f94, %f57;\n"
|
||||
" ld.param.f32 %f95, [__cudaparm_kernel_pair_g_ewald];\n"
|
||||
" mul.f32 %f96, %f95, %f94;\n"
|
||||
" mul.f32 %f97, %f96, %f96;\n"
|
||||
" mov.f32 %f98, 0f3f800000; \n"
|
||||
" mov.f32 %f99, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f100, %f99, %f96, %f98;\n"
|
||||
" neg.f32 %f101, %f97;\n"
|
||||
" rcp.approx.f32 %f102, %f100;\n"
|
||||
" mov.f32 %f103, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f104, %f101, %f103;\n"
|
||||
" ex2.approx.f32 %f105, %f104;\n"
|
||||
" mov.f32 %f106, 0f3e827906; \n"
|
||||
" mov.f32 %f107, 0fbe91a98e; \n"
|
||||
" mov.f32 %f108, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f109, 0fbfba00e3; \n"
|
||||
" mov.f32 %f110, 0f3f87dc22; \n"
|
||||
" mad.f32 %f111, %f110, %f102, %f109;\n"
|
||||
" mad.f32 %f112, %f102, %f111, %f108;\n"
|
||||
" mad.f32 %f113, %f102, %f112, %f107;\n"
|
||||
" mad.f32 %f114, %f102, %f113, %f106;\n"
|
||||
" mul.f32 %f115, %f102, %f114;\n"
|
||||
" mul.f32 %f116, %f105, %f115;\n"
|
||||
" mov.f32 %f117, %f116;\n"
|
||||
" mov.s32 %r35, %r25;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r35,%r36,%r37,%r38}];\n"
|
||||
" .loc 14 189 0\n"
|
||||
" mov.f32 %f122, %f118;\n"
|
||||
" ld.param.f32 %f123, [__cudaparm_kernel_pair_qqrd2e];\n"
|
||||
" mul.f32 %f124, %f123, %f33;\n"
|
||||
" mul.f32 %f125, %f124, %f122;\n"
|
||||
" div.approx.f32 %f126, %f125, %f94;\n"
|
||||
" mov.f32 %f127, %f126;\n"
|
||||
" .loc 14 190 0\n"
|
||||
" mov.f32 %f128, 0f3f906ebb; \n"
|
||||
" mul.f32 %f129, %f96, %f128;\n"
|
||||
" mad.f32 %f130, %f105, %f129, %f116;\n"
|
||||
" sub.f32 %f131, %f130, %f43;\n"
|
||||
" mul.f32 %f132, %f126, %f131;\n"
|
||||
" bra.uni $Lt_0_14594;\n"
|
||||
"$Lt_0_14850:\n"
|
||||
" .loc 14 193 0\n"
|
||||
" mov.f32 %f127, 0f00000000; \n"
|
||||
" mov.f32 %f132, 0f00000000; \n"
|
||||
"$Lt_0_14594:\n"
|
||||
" .loc 14 198 0\n"
|
||||
" add.f32 %f133, %f132, %f68;\n"
|
||||
" mul.f32 %f134, %f133, %f59;\n"
|
||||
" mad.f32 %f36, %f53, %f134, %f36;\n"
|
||||
" .loc 14 199 0\n"
|
||||
" mad.f32 %f35, %f52, %f134, %f35;\n"
|
||||
" .loc 14 200 0\n"
|
||||
" mad.f32 %f34, %f54, %f134, %f34;\n"
|
||||
" @!%p3 bra $Lt_0_15618;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" mov.f32 %f135, %f117;\n"
|
||||
" sub.f32 %f136, %f135, %f43;\n"
|
||||
" mad.f32 %f37, %f127, %f136, %f37;\n"
|
||||
" @!%p6 bra $Lt_0_15618;\n"
|
||||
" .loc 14 205 0\n"
|
||||
" cvt.rzi.s32.f32 %r39, %f51;\n"
|
||||
" cvt.rzi.s32.f32 %r40, %f28;\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" ld.param.s32 %r41, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r42, %r41, %r40;\n"
|
||||
" add.s32 %r43, %r39, %r42;\n"
|
||||
" cvt.u64.s32 %rd24, %r43;\n"
|
||||
" mul.lo.u64 %rd25, %rd24, 16;\n"
|
||||
" add.u64 %rd22, %rd23, %rd25;\n"
|
||||
" mov.f32 %f137, %f62;\n"
|
||||
" ld.global.v2.f32 {%f138,%f139}, [%rd22+8];\n"
|
||||
" mul.f32 %f140, %f138, %f137;\n"
|
||||
" sub.f32 %f141, %f140, %f139;\n"
|
||||
" mul.f32 %f142, %f137, %f141;\n"
|
||||
" mov.f32 %f143, %f80;\n"
|
||||
" mul.f32 %f144, %f143, %f142;\n"
|
||||
" ld.param.f32 %f145, [__cudaparm_kernel_pair_cut_lj_innersq];\n"
|
||||
" setp.lt.f32 %p9, %f145, %f57;\n"
|
||||
" selp.f32 %f142, %f144, %f142, %p9;\n"
|
||||
" .loc 14 149 0\n"
|
||||
" ld.shared.f32 %f40, [%rd18+0];\n"
|
||||
" .loc 14 208 0\n"
|
||||
" mad.f32 %f38, %f40, %f142, %f38;\n"
|
||||
"$Lt_0_15618:\n"
|
||||
"$Lt_0_15106:\n"
|
||||
" @!%p4 bra $Lt_0_16130;\n"
|
||||
" .loc 14 212 0\n"
|
||||
" mov.f32 %f146, %f10;\n"
|
||||
" mul.f32 %f147, %f53, %f53;\n"
|
||||
" mad.f32 %f148, %f134, %f147, %f146;\n"
|
||||
" mov.f32 %f10, %f148;\n"
|
||||
" .loc 14 213 0\n"
|
||||
" mov.f32 %f149, %f12;\n"
|
||||
" mad.f32 %f150, %f134, %f55, %f149;\n"
|
||||
" mov.f32 %f12, %f150;\n"
|
||||
" .loc 14 214 0\n"
|
||||
" mov.f32 %f151, %f14;\n"
|
||||
" mul.f32 %f152, %f54, %f54;\n"
|
||||
" mad.f32 %f153, %f134, %f152, %f151;\n"
|
||||
" mov.f32 %f14, %f153;\n"
|
||||
" .loc 14 215 0\n"
|
||||
" mov.f32 %f154, %f16;\n"
|
||||
" mul.f32 %f155, %f52, %f53;\n"
|
||||
" mad.f32 %f156, %f134, %f155, %f154;\n"
|
||||
" mov.f32 %f16, %f156;\n"
|
||||
" .loc 14 216 0\n"
|
||||
" mov.f32 %f157, %f18;\n"
|
||||
" mul.f32 %f158, %f53, %f54;\n"
|
||||
" mad.f32 %f159, %f134, %f158, %f157;\n"
|
||||
" mov.f32 %f18, %f159;\n"
|
||||
" .loc 14 217 0\n"
|
||||
" mul.f32 %f160, %f52, %f54;\n"
|
||||
" mad.f32 %f19, %f134, %f160, %f19;\n"
|
||||
" mov.f32 %f161, %f19;\n"
|
||||
"$Lt_0_16130:\n"
|
||||
"$Lt_0_13058:\n"
|
||||
" .loc 14 145 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p10, %rd13, %rd10;\n"
|
||||
" @%p10 bra $Lt_0_12802;\n"
|
||||
" bra.uni $Lt_0_12290;\n"
|
||||
"$Lt_0_17922:\n"
|
||||
" mov.s32 %r44, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r44;\n"
|
||||
" mov.s32 %r45, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r45;\n"
|
||||
"$Lt_0_12290:\n"
|
||||
" .loc 14 224 0\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd27, %rd26, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_16898;\n"
|
||||
" .loc 14 226 0\n"
|
||||
" st.global.f32 [%rd27+0], %f38;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" cvt.u64.s32 %rd28, %r6;\n"
|
||||
" mul.lo.u64 %rd29, %rd28, 4;\n"
|
||||
" add.u64 %rd27, %rd29, %rd27;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" st.global.f32 [%rd27+0], %f37;\n"
|
||||
" .loc 14 229 0\n"
|
||||
" add.u64 %rd27, %rd29, %rd27;\n"
|
||||
"$Lt_0_16898:\n"
|
||||
" @!%p4 bra $Lt_0_17410;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" mov.f32 %f162, %f10;\n"
|
||||
" st.global.f32 [%rd27+0], %f162;\n"
|
||||
" .loc 14 234 0\n"
|
||||
" cvt.u64.s32 %rd30, %r6;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd27, %rd31, %rd27;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" mov.f32 %f163, %f12;\n"
|
||||
" st.global.f32 [%rd27+0], %f163;\n"
|
||||
" .loc 14 234 0\n"
|
||||
" add.u64 %rd27, %rd31, %rd27;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" mov.f32 %f164, %f14;\n"
|
||||
" st.global.f32 [%rd27+0], %f164;\n"
|
||||
" .loc 14 234 0\n"
|
||||
" add.u64 %rd27, %rd31, %rd27;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" mov.f32 %f165, %f16;\n"
|
||||
" st.global.f32 [%rd27+0], %f165;\n"
|
||||
" .loc 14 234 0\n"
|
||||
" add.u64 %rd27, %rd31, %rd27;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" mov.f32 %f166, %f18;\n"
|
||||
" st.global.f32 [%rd27+0], %f166;\n"
|
||||
" add.u64 %rd32, %rd31, %rd27;\n"
|
||||
" st.global.f32 [%rd32+0], %f19;\n"
|
||||
"$Lt_0_17410:\n"
|
||||
" .loc 14 237 0\n"
|
||||
" ld.param.u64 %rd33, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd34, %rd2, 16;\n"
|
||||
" add.u64 %rd35, %rd33, %rd34;\n"
|
||||
" mov.f32 %f167, %f168;\n"
|
||||
" st.global.v4.f32 [%rd35+0], {%f36,%f35,%f34,%f167};\n"
|
||||
"$Lt_0_11778:\n"
|
||||
" .loc 14 239 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ljd_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_denom_lj,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_bothsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_ljsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<40>;\n"
|
||||
" .reg .u64 %rd<45>;\n"
|
||||
" .reg .f32 %f<177>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj260[32];\n"
|
||||
" .shared .align 8 .b8 __cuda_ljd296[1024];\n"
|
||||
" .loc 14 250 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" cvt.u64.s32 %rd1, %r1;\n"
|
||||
" mov.u32 %r2, 7;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_12546;\n"
|
||||
" .loc 14 256 0\n"
|
||||
" mov.u64 %rd2, __cuda_sp_lj260;\n"
|
||||
" mul.lo.u64 %rd3, %rd1, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd2;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_12546:\n"
|
||||
" mov.u64 %rd7, __cuda_ljd296;\n"
|
||||
" mov.u64 %rd2, __cuda_sp_lj260;\n"
|
||||
" .loc 14 257 0\n"
|
||||
" mul.lo.u64 %rd8, %rd1, 8;\n"
|
||||
" ld.param.u64 %rd9, [__cudaparm_kernel_pair_fast_ljd_in];\n"
|
||||
" add.u64 %rd10, %rd9, %rd8;\n"
|
||||
" add.u64 %rd11, %rd8, %rd7;\n"
|
||||
" ld.global.v2.f32 {%f2,%f3}, [%rd10+0];\n"
|
||||
" st.shared.f32 [%rd11+0], %f2;\n"
|
||||
" st.shared.f32 [%rd11+4], %f3;\n"
|
||||
" ld.global.v2.f32 {%f4,%f5}, [%rd10+512];\n"
|
||||
" .loc 14 258 0\n"
|
||||
" st.shared.f32 [%rd11+512], %f4;\n"
|
||||
" st.shared.f32 [%rd11+516], %f5;\n"
|
||||
" .loc 14 261 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r3, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r4, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r5, %r3, %r4;\n"
|
||||
" add.s32 %r6, %r5, %r1;\n"
|
||||
" ld.param.s32 %r7, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p2, %r6, %r7;\n"
|
||||
" @%p2 bra $Lt_1_13058;\n"
|
||||
" .loc 14 273 0\n"
|
||||
" mov.f32 %f6, 0f00000000; \n"
|
||||
" mov.f32 %f7, %f6;\n"
|
||||
" mov.f32 %f8, 0f00000000; \n"
|
||||
" mov.f32 %f9, %f8;\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" .loc 14 276 0\n"
|
||||
" cvt.u64.s32 %rd12, %r6;\n"
|
||||
" mul.lo.u64 %rd13, %rd12, 4;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd15, %rd14, %rd13;\n"
|
||||
" ld.global.s32 %r8, [%rd15+0];\n"
|
||||
" .loc 14 278 0\n"
|
||||
" ld.param.s32 %r9, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd16, %r9;\n"
|
||||
" mul.lo.u64 %rd17, %rd16, 4;\n"
|
||||
" add.u64 %rd18, %rd15, %rd17;\n"
|
||||
" ld.global.s32 %r10, [%rd18+0];\n"
|
||||
" .loc 14 279 0\n"
|
||||
" add.u64 %rd19, %rd18, %rd17;\n"
|
||||
" mov.s64 %rd20, %rd19;\n"
|
||||
" mov.s32 %r11, %r8;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" mov.s32 %r14, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r11,%r12,%r13,%r14}];\n"
|
||||
" .loc 14 282 0\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.s32 %r15, %r8;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r15,%r16,%r17,%r18}];\n"
|
||||
" .loc 14 283 0\n"
|
||||
" mov.f32 %f30, %f26;\n"
|
||||
" mul24.lo.s32 %r19, %r10, %r9;\n"
|
||||
" cvt.s64.s32 %rd21, %r19;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd19, %rd22;\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r21, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p3, %rd19, %rd23;\n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" @%p3 bra $Lt_1_19202;\n"
|
||||
" cvt.rzi.s32.f32 %r22, %f25;\n"
|
||||
" mov.s32 %r23, 0;\n"
|
||||
" setp.gt.s32 %p4, %r21, %r23;\n"
|
||||
" mov.s32 %r24, 0;\n"
|
||||
" setp.gt.s32 %p5, %r20, %r24;\n"
|
||||
" ld.param.f32 %f36, [__cudaparm_kernel_pair_fast_cut_bothsq];\n"
|
||||
"$Lt_1_14082:\n"
|
||||
" .loc 14 287 0\n"
|
||||
" ld.global.s32 %r25, [%rd20+0];\n"
|
||||
" .loc 14 290 0\n"
|
||||
" shr.s32 %r26, %r25, 30;\n"
|
||||
" cvt.s64.s32 %rd24, %r26;\n"
|
||||
" and.b64 %rd25, %rd24, 3;\n"
|
||||
" mul.lo.u64 %rd26, %rd25, 4;\n"
|
||||
" add.u64 %rd27, %rd2, %rd26;\n"
|
||||
" ld.shared.f32 %f37, [%rd27+0];\n"
|
||||
" .loc 14 291 0\n"
|
||||
" mov.f32 %f38, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f39, [%rd27+16];\n"
|
||||
" sub.f32 %f40, %f38, %f39;\n"
|
||||
" and.b32 %r27, %r25, 1073741823;\n"
|
||||
" mov.s32 %r28, %r27;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
|
||||
" .loc 14 294 0\n"
|
||||
" mov.f32 %f45, %f41;\n"
|
||||
" mov.f32 %f46, %f42;\n"
|
||||
" mov.f32 %f47, %f43;\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" sub.f32 %f49, %f23, %f46;\n"
|
||||
" sub.f32 %f50, %f22, %f45;\n"
|
||||
" sub.f32 %f51, %f24, %f47;\n"
|
||||
" mul.f32 %f52, %f49, %f49;\n"
|
||||
" mad.f32 %f53, %f50, %f50, %f52;\n"
|
||||
" mad.f32 %f54, %f51, %f51, %f53;\n"
|
||||
" setp.lt.f32 %p6, %f54, %f36;\n"
|
||||
" @!%p6 bra $Lt_1_17410;\n"
|
||||
" ld.param.f32 %f55, [__cudaparm_kernel_pair_fast_cut_ljsq];\n"
|
||||
" setp.lt.f32 %p7, %f54, %f55;\n"
|
||||
" rcp.approx.f32 %f56, %f54;\n"
|
||||
" @!%p7 bra $Lt_1_15106;\n"
|
||||
" .loc 14 309 0\n"
|
||||
" cvt.rzi.s32.f32 %r32, %f48;\n"
|
||||
" cvt.u64.s32 %rd28, %r22;\n"
|
||||
" mul.lo.u64 %rd29, %rd28, 8;\n"
|
||||
" add.u64 %rd30, %rd7, %rd29;\n"
|
||||
" cvt.u64.s32 %rd31, %r32;\n"
|
||||
" mul.lo.u64 %rd32, %rd31, 8;\n"
|
||||
" add.u64 %rd33, %rd7, %rd32;\n"
|
||||
" ld.shared.f32 %f57, [%rd30+0];\n"
|
||||
" ld.shared.f32 %f58, [%rd33+0];\n"
|
||||
" mul.f32 %f59, %f57, %f58;\n"
|
||||
" .loc 14 310 0\n"
|
||||
" ld.shared.f32 %f60, [%rd30+4];\n"
|
||||
" ld.shared.f32 %f61, [%rd33+4];\n"
|
||||
" add.f32 %f62, %f60, %f61;\n"
|
||||
" mov.f32 %f63, 0f3f000000; \n"
|
||||
" mul.f32 %f64, %f62, %f63;\n"
|
||||
" .loc 14 314 0\n"
|
||||
" mul.f32 %f65, %f64, %f64;\n"
|
||||
" sqrt.approx.f32 %f66, %f59;\n"
|
||||
" mov.f32 %f67, 0f40800000; \n"
|
||||
" mul.f32 %f68, %f66, %f67;\n"
|
||||
" mul.f32 %f69, %f65, %f56;\n"
|
||||
" mul.f32 %f70, %f69, %f69;\n"
|
||||
" mul.f32 %f71, %f69, %f70;\n"
|
||||
" mul.f32 %f72, %f68, %f71;\n"
|
||||
" mov.f32 %f73, %f72;\n"
|
||||
" .loc 14 315 0\n"
|
||||
" mul.f32 %f74, %f71, %f72;\n"
|
||||
" mov.f32 %f75, %f74;\n"
|
||||
" .loc 14 316 0\n"
|
||||
" mov.f32 %f76, 0f40c00000; \n"
|
||||
" mul.f32 %f77, %f72, %f76;\n"
|
||||
" mov.f32 %f78, 0f41400000; \n"
|
||||
" mul.f32 %f79, %f78, %f74;\n"
|
||||
" sub.f32 %f80, %f79, %f77;\n"
|
||||
" .loc 14 290 0\n"
|
||||
" ld.shared.f32 %f37, [%rd27+0];\n"
|
||||
" .loc 14 316 0\n"
|
||||
" mul.f32 %f81, %f37, %f80;\n"
|
||||
" ld.param.f32 %f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n"
|
||||
" setp.gt.f32 %p8, %f54, %f82;\n"
|
||||
" @!%p8 bra $Lt_1_14850;\n"
|
||||
" .loc 14 322 0\n"
|
||||
" add.f32 %f83, %f54, %f54;\n"
|
||||
" sub.f32 %f84, %f55, %f54;\n"
|
||||
" add.f32 %f85, %f83, %f55;\n"
|
||||
" mul.f32 %f86, %f84, %f84;\n"
|
||||
" mov.f32 %f87, 0f40400000; \n"
|
||||
" mul.f32 %f88, %f87, %f82;\n"
|
||||
" sub.f32 %f89, %f85, %f88;\n"
|
||||
" ld.param.f32 %f90, [__cudaparm_kernel_pair_fast_denom_lj];\n"
|
||||
" div.approx.f32 %f91, %f89, %f90;\n"
|
||||
" mul.f32 %f92, %f86, %f91;\n"
|
||||
" mov.f32 %f93, %f92;\n"
|
||||
" .loc 14 325 0\n"
|
||||
" mov.f32 %f94, 0f41400000; \n"
|
||||
" mul.f32 %f95, %f54, %f94;\n"
|
||||
" mul.f32 %f96, %f84, %f95;\n"
|
||||
" sub.f32 %f97, %f54, %f82;\n"
|
||||
" mul.f32 %f98, %f96, %f97;\n"
|
||||
" div.approx.f32 %f99, %f98, %f90;\n"
|
||||
" sub.f32 %f100, %f74, %f72;\n"
|
||||
" mul.f32 %f101, %f99, %f100;\n"
|
||||
" mad.f32 %f81, %f81, %f92, %f101;\n"
|
||||
" bra.uni $Lt_1_14850;\n"
|
||||
"$Lt_1_15106:\n"
|
||||
" .loc 14 328 0\n"
|
||||
" mov.f32 %f81, 0f00000000; \n"
|
||||
"$Lt_1_14850:\n"
|
||||
" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p9, %f102, %f54;\n"
|
||||
" @!%p9 bra $Lt_1_16130;\n"
|
||||
" .loc 14 335 0\n"
|
||||
" sqrt.approx.f32 %f103, %f54;\n"
|
||||
" ld.param.f32 %f104, [__cudaparm_kernel_pair_fast_g_ewald];\n"
|
||||
" mul.f32 %f105, %f104, %f103;\n"
|
||||
" mul.f32 %f106, %f105, %f105;\n"
|
||||
" mov.f32 %f107, 0f3f800000; \n"
|
||||
" mov.f32 %f108, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f109, %f108, %f105, %f107;\n"
|
||||
" neg.f32 %f110, %f106;\n"
|
||||
" rcp.approx.f32 %f111, %f109;\n"
|
||||
" mov.f32 %f112, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f113, %f110, %f112;\n"
|
||||
" ex2.approx.f32 %f114, %f113;\n"
|
||||
" mov.f32 %f115, 0f3e827906; \n"
|
||||
" mov.f32 %f116, 0fbe91a98e; \n"
|
||||
" mov.f32 %f117, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f118, 0fbfba00e3; \n"
|
||||
" mov.f32 %f119, 0f3f87dc22; \n"
|
||||
" mad.f32 %f120, %f119, %f111, %f118;\n"
|
||||
" mad.f32 %f121, %f111, %f120, %f117;\n"
|
||||
" mad.f32 %f122, %f111, %f121, %f116;\n"
|
||||
" mad.f32 %f123, %f111, %f122, %f115;\n"
|
||||
" mul.f32 %f124, %f111, %f123;\n"
|
||||
" mul.f32 %f125, %f114, %f124;\n"
|
||||
" mov.f32 %f126, %f125;\n"
|
||||
" mov.s32 %r33, %r27;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" mov.s32 %r35, 0;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r33,%r34,%r35,%r36}];\n"
|
||||
" .loc 14 336 0\n"
|
||||
" mov.f32 %f131, %f127;\n"
|
||||
" ld.param.f32 %f132, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
||||
" mul.f32 %f133, %f132, %f30;\n"
|
||||
" mul.f32 %f134, %f133, %f131;\n"
|
||||
" div.approx.f32 %f135, %f134, %f103;\n"
|
||||
" mov.f32 %f136, %f135;\n"
|
||||
" .loc 14 337 0\n"
|
||||
" mov.f32 %f137, 0f3f906ebb; \n"
|
||||
" mul.f32 %f138, %f105, %f137;\n"
|
||||
" mad.f32 %f139, %f114, %f138, %f125;\n"
|
||||
" sub.f32 %f140, %f139, %f40;\n"
|
||||
" mul.f32 %f141, %f135, %f140;\n"
|
||||
" bra.uni $Lt_1_15874;\n"
|
||||
"$Lt_1_16130:\n"
|
||||
" .loc 14 340 0\n"
|
||||
" mov.f32 %f136, 0f00000000; \n"
|
||||
" mov.f32 %f141, 0f00000000; \n"
|
||||
"$Lt_1_15874:\n"
|
||||
" .loc 14 345 0\n"
|
||||
" add.f32 %f142, %f141, %f81;\n"
|
||||
" mul.f32 %f143, %f142, %f56;\n"
|
||||
" mad.f32 %f33, %f50, %f143, %f33;\n"
|
||||
" .loc 14 346 0\n"
|
||||
" mad.f32 %f32, %f49, %f143, %f32;\n"
|
||||
" .loc 14 347 0\n"
|
||||
" mad.f32 %f31, %f51, %f143, %f31;\n"
|
||||
" @!%p4 bra $Lt_1_16898;\n"
|
||||
" .loc 14 350 0\n"
|
||||
" mov.f32 %f144, %f126;\n"
|
||||
" sub.f32 %f145, %f144, %f40;\n"
|
||||
" mad.f32 %f34, %f136, %f145, %f34;\n"
|
||||
" @!%p7 bra $Lt_1_16898;\n"
|
||||
" .loc 14 355 0\n"
|
||||
" mov.f32 %f146, %f75;\n"
|
||||
" mov.f32 %f147, %f73;\n"
|
||||
" sub.f32 %f148, %f146, %f147;\n"
|
||||
" mov.f32 %f149, %f93;\n"
|
||||
" mul.f32 %f150, %f149, %f148;\n"
|
||||
" ld.param.f32 %f151, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n"
|
||||
" setp.lt.f32 %p10, %f151, %f54;\n"
|
||||
" selp.f32 %f152, %f150, %f148, %p10;\n"
|
||||
" .loc 14 290 0\n"
|
||||
" ld.shared.f32 %f37, [%rd27+0];\n"
|
||||
" .loc 14 355 0\n"
|
||||
" mad.f32 %f35, %f37, %f152, %f35;\n"
|
||||
"$Lt_1_16898:\n"
|
||||
"$Lt_1_16386:\n"
|
||||
" @!%p5 bra $Lt_1_17410;\n"
|
||||
" .loc 14 359 0\n"
|
||||
" mov.f32 %f153, %f7;\n"
|
||||
" mul.f32 %f154, %f50, %f50;\n"
|
||||
" mad.f32 %f155, %f143, %f154, %f153;\n"
|
||||
" mov.f32 %f7, %f155;\n"
|
||||
" .loc 14 360 0\n"
|
||||
" mov.f32 %f156, %f9;\n"
|
||||
" mad.f32 %f157, %f143, %f52, %f156;\n"
|
||||
" mov.f32 %f9, %f157;\n"
|
||||
" .loc 14 361 0\n"
|
||||
" mov.f32 %f158, %f11;\n"
|
||||
" mul.f32 %f159, %f51, %f51;\n"
|
||||
" mad.f32 %f160, %f143, %f159, %f158;\n"
|
||||
" mov.f32 %f11, %f160;\n"
|
||||
" .loc 14 362 0\n"
|
||||
" mov.f32 %f161, %f13;\n"
|
||||
" mul.f32 %f162, %f49, %f50;\n"
|
||||
" mad.f32 %f163, %f143, %f162, %f161;\n"
|
||||
" mov.f32 %f13, %f163;\n"
|
||||
" .loc 14 363 0\n"
|
||||
" mov.f32 %f164, %f15;\n"
|
||||
" mul.f32 %f165, %f50, %f51;\n"
|
||||
" mad.f32 %f166, %f143, %f165, %f164;\n"
|
||||
" mov.f32 %f15, %f166;\n"
|
||||
" .loc 14 364 0\n"
|
||||
" mul.f32 %f167, %f49, %f51;\n"
|
||||
" mad.f32 %f16, %f143, %f167, %f16;\n"
|
||||
" mov.f32 %f168, %f16;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
"$Lt_1_14338:\n"
|
||||
" .loc 14 286 0\n"
|
||||
" add.u64 %rd20, %rd17, %rd20;\n"
|
||||
" setp.gt.u64 %p11, %rd23, %rd20;\n"
|
||||
" @%p11 bra $Lt_1_14082;\n"
|
||||
" bra.uni $Lt_1_13570;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" setp.gt.s32 %p4, %r21, %r37;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" setp.gt.s32 %p5, %r20, %r38;\n"
|
||||
"$Lt_1_13570:\n"
|
||||
" .loc 14 371 0\n"
|
||||
" ld.param.u64 %rd34, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd35, %rd34, %rd13;\n"
|
||||
" @!%p4 bra $Lt_1_18178;\n"
|
||||
" .loc 14 373 0\n"
|
||||
" st.global.f32 [%rd35+0], %f35;\n"
|
||||
" .loc 14 374 0\n"
|
||||
" cvt.u64.s32 %rd36, %r7;\n"
|
||||
" mul.lo.u64 %rd37, %rd36, 4;\n"
|
||||
" add.u64 %rd35, %rd37, %rd35;\n"
|
||||
" .loc 14 375 0\n"
|
||||
" st.global.f32 [%rd35+0], %f34;\n"
|
||||
" .loc 14 376 0\n"
|
||||
" add.u64 %rd35, %rd37, %rd35;\n"
|
||||
"$Lt_1_18178:\n"
|
||||
" @!%p5 bra $Lt_1_18690;\n"
|
||||
" .loc 14 380 0\n"
|
||||
" mov.f32 %f169, %f7;\n"
|
||||
" st.global.f32 [%rd35+0], %f169;\n"
|
||||
" .loc 14 381 0\n"
|
||||
" cvt.u64.s32 %rd38, %r7;\n"
|
||||
" mul.lo.u64 %rd39, %rd38, 4;\n"
|
||||
" add.u64 %rd35, %rd39, %rd35;\n"
|
||||
" .loc 14 380 0\n"
|
||||
" mov.f32 %f170, %f9;\n"
|
||||
" st.global.f32 [%rd35+0], %f170;\n"
|
||||
" .loc 14 381 0\n"
|
||||
" add.u64 %rd35, %rd39, %rd35;\n"
|
||||
" .loc 14 380 0\n"
|
||||
" mov.f32 %f171, %f11;\n"
|
||||
" st.global.f32 [%rd35+0], %f171;\n"
|
||||
" .loc 14 381 0\n"
|
||||
" add.u64 %rd35, %rd39, %rd35;\n"
|
||||
" .loc 14 380 0\n"
|
||||
" mov.f32 %f172, %f13;\n"
|
||||
" st.global.f32 [%rd35+0], %f172;\n"
|
||||
" .loc 14 381 0\n"
|
||||
" add.u64 %rd35, %rd39, %rd35;\n"
|
||||
" .loc 14 380 0\n"
|
||||
" mov.f32 %f173, %f15;\n"
|
||||
" st.global.f32 [%rd35+0], %f173;\n"
|
||||
" add.u64 %rd40, %rd39, %rd35;\n"
|
||||
" st.global.f32 [%rd40+0], %f16;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
" .loc 14 384 0\n"
|
||||
" ld.param.u64 %rd41, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd42, %rd12, 16;\n"
|
||||
" add.u64 %rd43, %rd41, %rd42;\n"
|
||||
" mov.f32 %f174, %f175;\n"
|
||||
" st.global.v4.f32 [%rd43+0], {%f33,%f32,%f31,%f174};\n"
|
||||
"$Lt_1_13058:\n"
|
||||
" .loc 14 386 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,567 @@
|
|||
const char * lj96_cut_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<33>;\n"
|
||||
" .reg .u64 %rd<36>;\n"
|
||||
" .reg .f32 %f<87>;\n"
|
||||
" .reg .pred %p<8>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj84[16];\n"
|
||||
" .loc 14 87 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 91 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+0], %f1;\n"
|
||||
" .loc 14 92 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+4], %f2;\n"
|
||||
" .loc 14 93 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+8], %f3;\n"
|
||||
" .loc 14 94 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+12], %f4;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_7938;\n"
|
||||
" .loc 14 105 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" .loc 14 108 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 110 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 111 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 114 0\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mul24.lo.s32 %r14, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r14;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r16, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_12034;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r17;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r18;\n"
|
||||
" cvt.rzi.s32.f32 %r19, %f24;\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r21, %r20, %r19;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj84;\n"
|
||||
"$Lt_0_8962:\n"
|
||||
" .loc 14 120 0\n"
|
||||
" ld.global.s32 %r22, [%rd10+0];\n"
|
||||
" .loc 14 121 0\n"
|
||||
" shr.s32 %r23, %r22, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r23;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" and.b32 %r24, %r22, 1073741823;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
|
||||
" .loc 14 124 0\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.s32.f32 %r28, %f37;\n"
|
||||
" sub.f32 %f38, %f22, %f35;\n"
|
||||
" sub.f32 %f39, %f21, %f34;\n"
|
||||
" sub.f32 %f40, %f23, %f36;\n"
|
||||
" mul.f32 %f41, %f38, %f38;\n"
|
||||
" mad.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" mad.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r29, %r28, %r21;\n"
|
||||
" cvt.u64.s32 %rd20, %r29;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd21, %rd14;\n"
|
||||
" ld.global.f32 %f44, [%rd22+8];\n"
|
||||
" setp.gt.f32 %p5, %f44, %f43;\n"
|
||||
" @!%p5 bra $Lt_0_10242;\n"
|
||||
" .loc 14 139 0\n"
|
||||
" rcp.approx.f32 %f45, %f43;\n"
|
||||
" mul.f32 %f46, %f45, %f45;\n"
|
||||
" mul.f32 %f47, %f45, %f46;\n"
|
||||
" sqrt.approx.f32 %f48, %f47;\n"
|
||||
" mul.f32 %f49, %f45, %f47;\n"
|
||||
" ld.global.v2.f32 {%f50,%f51}, [%rd22+0];\n"
|
||||
" mul.f32 %f52, %f50, %f48;\n"
|
||||
" sub.f32 %f53, %f52, %f51;\n"
|
||||
" mul.f32 %f54, %f49, %f53;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 139 0\n"
|
||||
" mul.f32 %f55, %f29, %f54;\n"
|
||||
" .loc 14 141 0\n"
|
||||
" mad.f32 %f27, %f39, %f55, %f27;\n"
|
||||
" .loc 14 142 0\n"
|
||||
" mad.f32 %f26, %f38, %f55, %f26;\n"
|
||||
" .loc 14 143 0\n"
|
||||
" mad.f32 %f25, %f40, %f55, %f25;\n"
|
||||
" @!%p3 bra $Lt_0_9730;\n"
|
||||
" .loc 14 147 0\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd24, %rd23, %rd21;\n"
|
||||
" ld.global.v4.f32 {%f56,%f57,%f58,_}, [%rd24+0];\n"
|
||||
" mul.f32 %f59, %f56, %f48;\n"
|
||||
" sub.f32 %f60, %f59, %f57;\n"
|
||||
" mul.f32 %f61, %f47, %f60;\n"
|
||||
" sub.f32 %f62, %f61, %f58;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 147 0\n"
|
||||
" mad.f32 %f28, %f29, %f62, %f28;\n"
|
||||
"$Lt_0_9730:\n"
|
||||
" @!%p4 bra $Lt_0_10242;\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mov.f32 %f63, %f6;\n"
|
||||
" mul.f32 %f64, %f39, %f39;\n"
|
||||
" mad.f32 %f65, %f55, %f64, %f63;\n"
|
||||
" mov.f32 %f6, %f65;\n"
|
||||
" .loc 14 151 0\n"
|
||||
" mov.f32 %f66, %f8;\n"
|
||||
" mad.f32 %f67, %f55, %f41, %f66;\n"
|
||||
" mov.f32 %f8, %f67;\n"
|
||||
" .loc 14 152 0\n"
|
||||
" mov.f32 %f68, %f10;\n"
|
||||
" mul.f32 %f69, %f40, %f40;\n"
|
||||
" mad.f32 %f70, %f55, %f69, %f68;\n"
|
||||
" mov.f32 %f10, %f70;\n"
|
||||
" .loc 14 153 0\n"
|
||||
" mov.f32 %f71, %f12;\n"
|
||||
" mul.f32 %f72, %f38, %f39;\n"
|
||||
" mad.f32 %f73, %f55, %f72, %f71;\n"
|
||||
" mov.f32 %f12, %f73;\n"
|
||||
" .loc 14 154 0\n"
|
||||
" mov.f32 %f74, %f14;\n"
|
||||
" mul.f32 %f75, %f39, %f40;\n"
|
||||
" mad.f32 %f76, %f55, %f75, %f74;\n"
|
||||
" mov.f32 %f14, %f76;\n"
|
||||
" .loc 14 155 0\n"
|
||||
" mul.f32 %f77, %f38, %f40;\n"
|
||||
" mad.f32 %f15, %f55, %f77, %f15;\n"
|
||||
" mov.f32 %f78, %f15;\n"
|
||||
"$Lt_0_10242:\n"
|
||||
"$Lt_0_9218:\n"
|
||||
" .loc 14 118 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p6, %rd13, %rd10;\n"
|
||||
" @%p6 bra $Lt_0_8962;\n"
|
||||
" bra.uni $Lt_0_8450;\n"
|
||||
"$Lt_0_12034:\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r30;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r31;\n"
|
||||
"$Lt_0_8450:\n"
|
||||
" .loc 14 162 0\n"
|
||||
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd26, %rd25, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_11010;\n"
|
||||
" .loc 14 164 0\n"
|
||||
" st.global.f32 [%rd26+0], %f28;\n"
|
||||
" .loc 14 165 0\n"
|
||||
" cvt.u64.s32 %rd27, %r6;\n"
|
||||
" mul.lo.u64 %rd28, %rd27, 4;\n"
|
||||
" add.u64 %rd26, %rd26, %rd28;\n"
|
||||
"$Lt_0_11010:\n"
|
||||
" @!%p4 bra $Lt_0_11522;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mov.f32 %f79, %f6;\n"
|
||||
" st.global.f32 [%rd26+0], %f79;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" cvt.u64.s32 %rd29, %r6;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mov.f32 %f80, %f8;\n"
|
||||
" st.global.f32 [%rd26+0], %f80;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mov.f32 %f81, %f10;\n"
|
||||
" st.global.f32 [%rd26+0], %f81;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mov.f32 %f82, %f12;\n"
|
||||
" st.global.f32 [%rd26+0], %f82;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" mov.f32 %f83, %f14;\n"
|
||||
" st.global.f32 [%rd26+0], %f83;\n"
|
||||
" add.u64 %rd31, %rd30, %rd26;\n"
|
||||
" st.global.f32 [%rd31+0], %f15;\n"
|
||||
"$Lt_0_11522:\n"
|
||||
" .loc 14 173 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd33, %rd2, 16;\n"
|
||||
" add.u64 %rd34, %rd32, %rd33;\n"
|
||||
" mov.f32 %f84, %f85;\n"
|
||||
" st.global.v4.f32 [%rd34+0], {%f27,%f26,%f25,%f84};\n"
|
||||
"$Lt_0_7938:\n"
|
||||
" .loc 14 175 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<36>;\n"
|
||||
" .reg .u64 %rd<48>;\n"
|
||||
" .reg .f32 %f<93>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj180[16];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj1208[1024];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj31232[1024];\n"
|
||||
" .loc 14 182 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_10242;\n"
|
||||
" .loc 14 189 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_10242:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_10754;\n"
|
||||
" .loc 14 191 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" cvt.u64.s32 %rd8, %r1;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.f32 [%rd12+0], %f2;\n"
|
||||
" st.shared.f32 [%rd12+4], %f3;\n"
|
||||
" st.shared.f32 [%rd12+8], %f4;\n"
|
||||
" st.shared.f32 [%rd12+12], %f5;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_11266;\n"
|
||||
" .loc 14 193 0\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.f32 [%rd16+0], %f6;\n"
|
||||
" st.shared.f32 [%rd16+4], %f7;\n"
|
||||
" st.shared.f32 [%rd16+8], %f8;\n"
|
||||
" st.shared.f32 [%rd16+12], %f9;\n"
|
||||
"$Lt_1_11266:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
"$Lt_1_10754:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" .loc 14 196 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r6, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r7, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r8, %r6, %r7;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r9, %r10;\n"
|
||||
" @%p4 bra $Lt_1_11778;\n"
|
||||
" .loc 14 207 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 14 210 0\n"
|
||||
" cvt.u64.s32 %rd17, %r9;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd19, %rd18;\n"
|
||||
" ld.global.s32 %r11, [%rd20+0];\n"
|
||||
" .loc 14 212 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd21, %r12;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r13, [%rd23+0];\n"
|
||||
" .loc 14 213 0\n"
|
||||
" add.u64 %rd24, %rd23, %rd22;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
" mov.s32 %r14, %r11;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 216 0\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mul24.lo.s32 %r18, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd26, %r18;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd24, %rd27;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p5, %rd24, %rd28;\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" @%p5 bra $Lt_1_15874;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f29;\n"
|
||||
" mov.s32 %r24, 8;\n"
|
||||
" mul24.lo.s32 %r25, %r24, %r23;\n"
|
||||
" cvt.rn.f32.s32 %f34, %r25;\n"
|
||||
"$Lt_1_12802:\n"
|
||||
" .loc 14 223 0\n"
|
||||
" ld.global.s32 %r26, [%rd25+0];\n"
|
||||
" .loc 14 224 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd29, %r27;\n"
|
||||
" and.b64 %rd30, %rd29, 3;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.f32 %f44, %f27, %f41;\n"
|
||||
" sub.f32 %f45, %f26, %f40;\n"
|
||||
" sub.f32 %f46, %f28, %f42;\n"
|
||||
" mul.f32 %f47, %f44, %f44;\n"
|
||||
" mad.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" mad.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" add.f32 %f50, %f34, %f43;\n"
|
||||
" cvt.rzi.s32.f32 %r32, %f50;\n"
|
||||
" cvt.u64.s32 %rd33, %r32;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f51, [%rd35+8];\n"
|
||||
" setp.gt.f32 %p8, %f51, %f49;\n"
|
||||
" @!%p8 bra $Lt_1_14082;\n"
|
||||
" .loc 14 240 0\n"
|
||||
" rcp.approx.f32 %f52, %f49;\n"
|
||||
" mul.f32 %f53, %f52, %f52;\n"
|
||||
" mul.f32 %f54, %f52, %f53;\n"
|
||||
" sqrt.approx.f32 %f55, %f54;\n"
|
||||
" mul.f32 %f56, %f52, %f54;\n"
|
||||
" ld.shared.f32 %f57, [%rd35+4];\n"
|
||||
" ld.shared.f32 %f58, [%rd35+0];\n"
|
||||
" mul.f32 %f59, %f58, %f55;\n"
|
||||
" sub.f32 %f60, %f59, %f57;\n"
|
||||
" mul.f32 %f61, %f56, %f60;\n"
|
||||
" .loc 14 242 0\n"
|
||||
" mad.f32 %f32, %f45, %f61, %f32;\n"
|
||||
" .loc 14 243 0\n"
|
||||
" mad.f32 %f31, %f44, %f61, %f31;\n"
|
||||
" .loc 14 244 0\n"
|
||||
" mad.f32 %f30, %f46, %f61, %f30;\n"
|
||||
" @!%p6 bra $Lt_1_13570;\n"
|
||||
" .loc 14 247 0\n"
|
||||
" add.u64 %rd36, %rd34, %rd13;\n"
|
||||
" ld.shared.f32 %f62, [%rd36+4];\n"
|
||||
" ld.shared.f32 %f63, [%rd36+0];\n"
|
||||
" mul.f32 %f64, %f63, %f55;\n"
|
||||
" sub.f32 %f65, %f64, %f62;\n"
|
||||
" mul.f32 %f66, %f54, %f65;\n"
|
||||
" .loc 14 248 0\n"
|
||||
" ld.shared.f32 %f67, [%rd36+8];\n"
|
||||
" sub.f32 %f68, %f66, %f67;\n"
|
||||
" .loc 14 224 0\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" .loc 14 248 0\n"
|
||||
" mad.f32 %f33, %f35, %f68, %f33;\n"
|
||||
"$Lt_1_13570:\n"
|
||||
" @!%p7 bra $Lt_1_14082;\n"
|
||||
" .loc 14 251 0\n"
|
||||
" mov.f32 %f69, %f11;\n"
|
||||
" mul.f32 %f70, %f45, %f45;\n"
|
||||
" mad.f32 %f71, %f61, %f70, %f69;\n"
|
||||
" mov.f32 %f11, %f71;\n"
|
||||
" .loc 14 252 0\n"
|
||||
" mov.f32 %f72, %f13;\n"
|
||||
" mad.f32 %f73, %f61, %f47, %f72;\n"
|
||||
" mov.f32 %f13, %f73;\n"
|
||||
" .loc 14 253 0\n"
|
||||
" mov.f32 %f74, %f15;\n"
|
||||
" mul.f32 %f75, %f46, %f46;\n"
|
||||
" mad.f32 %f76, %f61, %f75, %f74;\n"
|
||||
" mov.f32 %f15, %f76;\n"
|
||||
" .loc 14 254 0\n"
|
||||
" mov.f32 %f77, %f17;\n"
|
||||
" mul.f32 %f78, %f44, %f45;\n"
|
||||
" mad.f32 %f79, %f61, %f78, %f77;\n"
|
||||
" mov.f32 %f17, %f79;\n"
|
||||
" .loc 14 255 0\n"
|
||||
" mov.f32 %f80, %f19;\n"
|
||||
" mul.f32 %f81, %f45, %f46;\n"
|
||||
" mad.f32 %f82, %f61, %f81, %f80;\n"
|
||||
" mov.f32 %f19, %f82;\n"
|
||||
" .loc 14 256 0\n"
|
||||
" mul.f32 %f83, %f44, %f46;\n"
|
||||
" mad.f32 %f20, %f61, %f83, %f20;\n"
|
||||
" mov.f32 %f84, %f20;\n"
|
||||
"$Lt_1_14082:\n"
|
||||
"$Lt_1_13058:\n"
|
||||
" .loc 14 221 0\n"
|
||||
" add.u64 %rd25, %rd22, %rd25;\n"
|
||||
" setp.gt.u64 %p9, %rd28, %rd25;\n"
|
||||
" @%p9 bra $Lt_1_12802;\n"
|
||||
" bra.uni $Lt_1_12290;\n"
|
||||
"$Lt_1_15874:\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r33;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r34;\n"
|
||||
"$Lt_1_12290:\n"
|
||||
" .loc 14 263 0\n"
|
||||
" ld.param.u64 %rd37, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd38, %rd37, %rd18;\n"
|
||||
" @!%p6 bra $Lt_1_14850;\n"
|
||||
" .loc 14 265 0\n"
|
||||
" st.global.f32 [%rd38+0], %f33;\n"
|
||||
" .loc 14 266 0\n"
|
||||
" cvt.u64.s32 %rd39, %r10;\n"
|
||||
" mul.lo.u64 %rd40, %rd39, 4;\n"
|
||||
" add.u64 %rd38, %rd38, %rd40;\n"
|
||||
"$Lt_1_14850:\n"
|
||||
" @!%p7 bra $Lt_1_15362;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f85, %f11;\n"
|
||||
" st.global.f32 [%rd38+0], %f85;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" cvt.u64.s32 %rd41, %r10;\n"
|
||||
" mul.lo.u64 %rd42, %rd41, 4;\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f86, %f13;\n"
|
||||
" st.global.f32 [%rd38+0], %f86;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f87, %f15;\n"
|
||||
" st.global.f32 [%rd38+0], %f87;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f88, %f17;\n"
|
||||
" st.global.f32 [%rd38+0], %f88;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 270 0\n"
|
||||
" mov.f32 %f89, %f19;\n"
|
||||
" st.global.f32 [%rd38+0], %f89;\n"
|
||||
" add.u64 %rd43, %rd42, %rd38;\n"
|
||||
" st.global.f32 [%rd43+0], %f20;\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" .loc 14 274 0\n"
|
||||
" ld.param.u64 %rd44, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd45, %rd17, 16;\n"
|
||||
" add.u64 %rd46, %rd44, %rd45;\n"
|
||||
" mov.f32 %f90, %f91;\n"
|
||||
" st.global.v4.f32 [%rd46+0], {%f32,%f31,%f30,%f90};\n"
|
||||
"$Lt_1_11778:\n"
|
||||
" .loc 14 276 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,569 @@
|
|||
// PTX program text exported as lj_cut_gpu_kernel. The initializer is a run
// of adjacent string literals, one PTX line each, which the compiler joins
// into a single NUL-terminated string.
//
// The text carries ".loc" line directives, so it looks like compiler output;
// presumably it should be regenerated from the kernel source rather than
// edited by hand -- TODO confirm against the build procedure.
//
// What the text below declares:
//   * ".version 1.4", ".target sm_13" and one texture reference, "pos_tex".
//   * ".entry kernel_pair": copies four floats from sp_lj_in into shared
//     memory, then reads the lj1 / lj3 tables from global memory. The table
//     index is lj_types * (w of the first pos_tex fetch) + (w of the
//     neighbor's pos_tex fetch), each w truncated to an integer.
//   * ".entry kernel_pair_fast": threads with tid <= 3 copy one float each of
//     sp_lj_in into shared memory; threads with tid <= 63 copy one 16-byte
//     entry each of lj1_in (and of lj3_in when eflag > 0) into 1024-byte
//     shared arrays; every thread then executes "bar.sync 0". The table
//     index here uses the fixed factor 8 in place of lj_types.
//   * Both entries exit without storing anything when
//     ctaid.x * ntid.x + tid.x >= inum.
//
// NOTE(review): the fourth element of the v4 store to "ans" is copied from a
// register (%f84 in kernel_pair, %f91 in kernel_pair_fast) that no
// instruction in this text assigns; presumably the host ignores that lane --
// confirm before relying on it.
const char * lj_cut_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<33>;\n"
|
||||
" .reg .u64 %rd<36>;\n"
|
||||
" .reg .f32 %f<86>;\n"
|
||||
" .reg .pred %p<8>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj84[16];\n"
|
||||
" .loc 14 87 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 91 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+0], %f1;\n"
|
||||
" .loc 14 92 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+4], %f2;\n"
|
||||
" .loc 14 93 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+8], %f3;\n"
|
||||
" .loc 14 94 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj84+12], %f4;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_7938;\n"
|
||||
" .loc 14 105 0\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" mov.f32 %f6, %f5;\n"
|
||||
" mov.f32 %f7, 0f00000000; \n"
|
||||
" mov.f32 %f8, %f7;\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" .loc 14 108 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 110 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 111 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 114 0\n"
|
||||
" mov.f32 %f21, %f17;\n"
|
||||
" mov.f32 %f22, %f18;\n"
|
||||
" mov.f32 %f23, %f19;\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mul24.lo.s32 %r14, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r14;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r16, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f25, 0f00000000; \n"
|
||||
" mov.f32 %f26, 0f00000000; \n"
|
||||
" mov.f32 %f27, 0f00000000; \n"
|
||||
" mov.f32 %f28, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_12034;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r17;\n"
|
||||
" mov.s32 %r18, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r18;\n"
|
||||
" cvt.rzi.s32.f32 %r19, %f24;\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r21, %r20, %r19;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj84;\n"
|
||||
"$Lt_0_8962:\n"
|
||||
" .loc 14 120 0\n"
|
||||
" ld.global.s32 %r22, [%rd10+0];\n"
|
||||
" .loc 14 121 0\n"
|
||||
" shr.s32 %r23, %r22, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r23;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" and.b32 %r24, %r22, 1073741823;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
|
||||
" .loc 14 124 0\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mov.f32 %f36, %f32;\n"
|
||||
" mov.f32 %f37, %f33;\n"
|
||||
" cvt.rzi.s32.f32 %r28, %f37;\n"
|
||||
" sub.f32 %f38, %f22, %f35;\n"
|
||||
" sub.f32 %f39, %f21, %f34;\n"
|
||||
" sub.f32 %f40, %f23, %f36;\n"
|
||||
" mul.f32 %f41, %f38, %f38;\n"
|
||||
" mad.f32 %f42, %f39, %f39, %f41;\n"
|
||||
" mad.f32 %f43, %f40, %f40, %f42;\n"
|
||||
" add.s32 %r29, %r28, %r21;\n"
|
||||
" cvt.u64.s32 %rd20, %r29;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd21, %rd14;\n"
|
||||
" ld.global.f32 %f44, [%rd22+8];\n"
|
||||
" setp.gt.f32 %p5, %f44, %f43;\n"
|
||||
" @!%p5 bra $Lt_0_10242;\n"
|
||||
" .loc 14 138 0\n"
|
||||
" rcp.approx.f32 %f45, %f43;\n"
|
||||
" mul.f32 %f46, %f45, %f45;\n"
|
||||
" mul.f32 %f47, %f45, %f46;\n"
|
||||
" mul.f32 %f48, %f45, %f47;\n"
|
||||
" ld.global.v2.f32 {%f49,%f50}, [%rd22+0];\n"
|
||||
" mul.f32 %f51, %f49, %f47;\n"
|
||||
" sub.f32 %f52, %f51, %f50;\n"
|
||||
" mul.f32 %f53, %f48, %f52;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 138 0\n"
|
||||
" mul.f32 %f54, %f29, %f53;\n"
|
||||
" .loc 14 140 0\n"
|
||||
" mad.f32 %f27, %f39, %f54, %f27;\n"
|
||||
" .loc 14 141 0\n"
|
||||
" mad.f32 %f26, %f38, %f54, %f26;\n"
|
||||
" .loc 14 142 0\n"
|
||||
" mad.f32 %f25, %f40, %f54, %f25;\n"
|
||||
" @!%p3 bra $Lt_0_9730;\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd24, %rd23, %rd21;\n"
|
||||
" ld.global.v4.f32 {%f55,%f56,%f57,_}, [%rd24+0];\n"
|
||||
" mul.f32 %f58, %f55, %f47;\n"
|
||||
" sub.f32 %f59, %f58, %f56;\n"
|
||||
" mul.f32 %f60, %f47, %f59;\n"
|
||||
" sub.f32 %f61, %f60, %f57;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" ld.shared.f32 %f29, [%rd19+0];\n"
|
||||
" .loc 14 146 0\n"
|
||||
" mad.f32 %f28, %f29, %f61, %f28;\n"
|
||||
"$Lt_0_9730:\n"
|
||||
" @!%p4 bra $Lt_0_10242;\n"
|
||||
" .loc 14 149 0\n"
|
||||
" mov.f32 %f62, %f6;\n"
|
||||
" mul.f32 %f63, %f39, %f39;\n"
|
||||
" mad.f32 %f64, %f54, %f63, %f62;\n"
|
||||
" mov.f32 %f6, %f64;\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mov.f32 %f65, %f8;\n"
|
||||
" mad.f32 %f66, %f54, %f41, %f65;\n"
|
||||
" mov.f32 %f8, %f66;\n"
|
||||
" .loc 14 151 0\n"
|
||||
" mov.f32 %f67, %f10;\n"
|
||||
" mul.f32 %f68, %f40, %f40;\n"
|
||||
" mad.f32 %f69, %f54, %f68, %f67;\n"
|
||||
" mov.f32 %f10, %f69;\n"
|
||||
" .loc 14 152 0\n"
|
||||
" mov.f32 %f70, %f12;\n"
|
||||
" mul.f32 %f71, %f38, %f39;\n"
|
||||
" mad.f32 %f72, %f54, %f71, %f70;\n"
|
||||
" mov.f32 %f12, %f72;\n"
|
||||
" .loc 14 153 0\n"
|
||||
" mov.f32 %f73, %f14;\n"
|
||||
" mul.f32 %f74, %f39, %f40;\n"
|
||||
" mad.f32 %f75, %f54, %f74, %f73;\n"
|
||||
" mov.f32 %f14, %f75;\n"
|
||||
" .loc 14 154 0\n"
|
||||
" mul.f32 %f76, %f38, %f40;\n"
|
||||
" mad.f32 %f15, %f54, %f76, %f15;\n"
|
||||
" mov.f32 %f77, %f15;\n"
|
||||
"$Lt_0_10242:\n"
|
||||
"$Lt_0_9218:\n"
|
||||
" .loc 14 118 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p6, %rd13, %rd10;\n"
|
||||
" @%p6 bra $Lt_0_8962;\n"
|
||||
" bra.uni $Lt_0_8450;\n"
|
||||
"$Lt_0_12034:\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" setp.gt.s32 %p3, %r16, %r30;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" setp.gt.s32 %p4, %r15, %r31;\n"
|
||||
"$Lt_0_8450:\n"
|
||||
" .loc 14 161 0\n"
|
||||
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd26, %rd25, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_11010;\n"
|
||||
" .loc 14 163 0\n"
|
||||
" st.global.f32 [%rd26+0], %f28;\n"
|
||||
" .loc 14 164 0\n"
|
||||
" cvt.u64.s32 %rd27, %r6;\n"
|
||||
" mul.lo.u64 %rd28, %rd27, 4;\n"
|
||||
" add.u64 %rd26, %rd26, %rd28;\n"
|
||||
"$Lt_0_11010:\n"
|
||||
" @!%p4 bra $Lt_0_11522;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f78, %f6;\n"
|
||||
" st.global.f32 [%rd26+0], %f78;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" cvt.u64.s32 %rd29, %r6;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f79, %f8;\n"
|
||||
" st.global.f32 [%rd26+0], %f79;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f80, %f10;\n"
|
||||
" st.global.f32 [%rd26+0], %f80;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f81, %f12;\n"
|
||||
" st.global.f32 [%rd26+0], %f81;\n"
|
||||
" .loc 14 169 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f82, %f14;\n"
|
||||
" st.global.f32 [%rd26+0], %f82;\n"
|
||||
" add.u64 %rd31, %rd30, %rd26;\n"
|
||||
" st.global.f32 [%rd31+0], %f15;\n"
|
||||
"$Lt_0_11522:\n"
|
||||
" .loc 14 172 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd33, %rd2, 16;\n"
|
||||
" add.u64 %rd34, %rd32, %rd33;\n"
|
||||
" mov.f32 %f83, %f84;\n"
|
||||
" st.global.v4.f32 [%rd34+0], {%f27,%f26,%f25,%f83};\n"
|
||||
"$Lt_0_7938:\n"
|
||||
" .loc 14 174 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<36>;\n"
|
||||
" .reg .u64 %rd<48>;\n"
|
||||
" .reg .f32 %f<93>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj180[16];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj1208[1024];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj31232[1024];\n"
|
||||
" .loc 14 181 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 3;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_10242;\n"
|
||||
" .loc 14 188 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_10242:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj180;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_10754;\n"
|
||||
" .loc 14 190 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" cvt.u64.s32 %rd8, %r1;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.f32 [%rd12+0], %f2;\n"
|
||||
" st.shared.f32 [%rd12+4], %f3;\n"
|
||||
" st.shared.f32 [%rd12+8], %f4;\n"
|
||||
" st.shared.f32 [%rd12+12], %f5;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_11266;\n"
|
||||
" .loc 14 192 0\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.f32 [%rd16+0], %f6;\n"
|
||||
" st.shared.f32 [%rd16+4], %f7;\n"
|
||||
" st.shared.f32 [%rd16+8], %f8;\n"
|
||||
" st.shared.f32 [%rd16+12], %f9;\n"
|
||||
"$Lt_1_11266:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
"$Lt_1_10754:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31232;\n"
|
||||
" mov.u64 %rd7, __cuda_lj1208;\n"
|
||||
" .loc 14 195 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r6, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r7, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r8, %r6, %r7;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r9, %r10;\n"
|
||||
" @%p4 bra $Lt_1_11778;\n"
|
||||
" .loc 14 206 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 14 209 0\n"
|
||||
" cvt.u64.s32 %rd17, %r9;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd19, %rd18;\n"
|
||||
" ld.global.s32 %r11, [%rd20+0];\n"
|
||||
" .loc 14 211 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd21, %r12;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r13, [%rd23+0];\n"
|
||||
" .loc 14 212 0\n"
|
||||
" add.u64 %rd24, %rd23, %rd22;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
" mov.s32 %r14, %r11;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 215 0\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mul24.lo.s32 %r18, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd26, %r18;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd24, %rd27;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p5, %rd24, %rd28;\n"
|
||||
" mov.f32 %f30, 0f00000000; \n"
|
||||
" mov.f32 %f31, 0f00000000; \n"
|
||||
" mov.f32 %f32, 0f00000000; \n"
|
||||
" mov.f32 %f33, 0f00000000; \n"
|
||||
" @%p5 bra $Lt_1_15874;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f29;\n"
|
||||
" mov.s32 %r24, 8;\n"
|
||||
" mul24.lo.s32 %r25, %r24, %r23;\n"
|
||||
" cvt.rn.f32.s32 %f34, %r25;\n"
|
||||
"$Lt_1_12802:\n"
|
||||
" .loc 14 222 0\n"
|
||||
" ld.global.s32 %r26, [%rd25+0];\n"
|
||||
" .loc 14 223 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd29, %r27;\n"
|
||||
" and.b64 %rd30, %rd29, 3;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, 0;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
|
||||
" .loc 14 226 0\n"
|
||||
" mov.f32 %f40, %f36;\n"
|
||||
" mov.f32 %f41, %f37;\n"
|
||||
" mov.f32 %f42, %f38;\n"
|
||||
" mov.f32 %f43, %f39;\n"
|
||||
" sub.f32 %f44, %f27, %f41;\n"
|
||||
" sub.f32 %f45, %f26, %f40;\n"
|
||||
" sub.f32 %f46, %f28, %f42;\n"
|
||||
" mul.f32 %f47, %f44, %f44;\n"
|
||||
" mad.f32 %f48, %f45, %f45, %f47;\n"
|
||||
" mad.f32 %f49, %f46, %f46, %f48;\n"
|
||||
" add.f32 %f50, %f34, %f43;\n"
|
||||
" cvt.rzi.s32.f32 %r32, %f50;\n"
|
||||
" cvt.u64.s32 %rd33, %r32;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f51, [%rd35+8];\n"
|
||||
" setp.gt.f32 %p8, %f51, %f49;\n"
|
||||
" @!%p8 bra $Lt_1_14082;\n"
|
||||
" .loc 14 238 0\n"
|
||||
" rcp.approx.f32 %f52, %f49;\n"
|
||||
" mul.f32 %f53, %f52, %f52;\n"
|
||||
" mul.f32 %f54, %f52, %f53;\n"
|
||||
" .loc 14 223 0\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" .loc 14 238 0\n"
|
||||
" mul.f32 %f55, %f52, %f35;\n"
|
||||
" mul.f32 %f56, %f54, %f55;\n"
|
||||
" ld.shared.f32 %f57, [%rd35+4];\n"
|
||||
" ld.shared.f32 %f58, [%rd35+0];\n"
|
||||
" mul.f32 %f59, %f58, %f54;\n"
|
||||
" sub.f32 %f60, %f59, %f57;\n"
|
||||
" mul.f32 %f61, %f56, %f60;\n"
|
||||
" .loc 14 240 0\n"
|
||||
" mad.f32 %f32, %f45, %f61, %f32;\n"
|
||||
" .loc 14 241 0\n"
|
||||
" mad.f32 %f31, %f44, %f61, %f31;\n"
|
||||
" .loc 14 242 0\n"
|
||||
" mad.f32 %f30, %f46, %f61, %f30;\n"
|
||||
" @!%p6 bra $Lt_1_13570;\n"
|
||||
" .loc 14 245 0\n"
|
||||
" add.u64 %rd36, %rd34, %rd13;\n"
|
||||
" ld.shared.f32 %f62, [%rd36+4];\n"
|
||||
" ld.shared.f32 %f63, [%rd36+0];\n"
|
||||
" mul.f32 %f64, %f63, %f54;\n"
|
||||
" sub.f32 %f65, %f64, %f62;\n"
|
||||
" mul.f32 %f66, %f54, %f65;\n"
|
||||
" .loc 14 246 0\n"
|
||||
" ld.shared.f32 %f67, [%rd36+8];\n"
|
||||
" sub.f32 %f68, %f66, %f67;\n"
|
||||
" .loc 14 223 0\n"
|
||||
" ld.shared.f32 %f35, [%rd32+0];\n"
|
||||
" .loc 14 246 0\n"
|
||||
" mad.f32 %f33, %f35, %f68, %f33;\n"
|
||||
"$Lt_1_13570:\n"
|
||||
" @!%p7 bra $Lt_1_14082;\n"
|
||||
" .loc 14 249 0\n"
|
||||
" mov.f32 %f69, %f11;\n"
|
||||
" mul.f32 %f70, %f45, %f45;\n"
|
||||
" mad.f32 %f71, %f61, %f70, %f69;\n"
|
||||
" mov.f32 %f11, %f71;\n"
|
||||
" .loc 14 250 0\n"
|
||||
" mov.f32 %f72, %f13;\n"
|
||||
" mad.f32 %f73, %f61, %f47, %f72;\n"
|
||||
" mov.f32 %f13, %f73;\n"
|
||||
" .loc 14 251 0\n"
|
||||
" mov.f32 %f74, %f15;\n"
|
||||
" mul.f32 %f75, %f46, %f46;\n"
|
||||
" mad.f32 %f76, %f61, %f75, %f74;\n"
|
||||
" mov.f32 %f15, %f76;\n"
|
||||
" .loc 14 252 0\n"
|
||||
" mov.f32 %f77, %f17;\n"
|
||||
" mul.f32 %f78, %f44, %f45;\n"
|
||||
" mad.f32 %f79, %f61, %f78, %f77;\n"
|
||||
" mov.f32 %f17, %f79;\n"
|
||||
" .loc 14 253 0\n"
|
||||
" mov.f32 %f80, %f19;\n"
|
||||
" mul.f32 %f81, %f45, %f46;\n"
|
||||
" mad.f32 %f82, %f61, %f81, %f80;\n"
|
||||
" mov.f32 %f19, %f82;\n"
|
||||
" .loc 14 254 0\n"
|
||||
" mul.f32 %f83, %f44, %f46;\n"
|
||||
" mad.f32 %f20, %f61, %f83, %f20;\n"
|
||||
" mov.f32 %f84, %f20;\n"
|
||||
"$Lt_1_14082:\n"
|
||||
"$Lt_1_13058:\n"
|
||||
" .loc 14 220 0\n"
|
||||
" add.u64 %rd25, %rd22, %rd25;\n"
|
||||
" setp.gt.u64 %p9, %rd28, %rd25;\n"
|
||||
" @%p9 bra $Lt_1_12802;\n"
|
||||
" bra.uni $Lt_1_12290;\n"
|
||||
"$Lt_1_15874:\n"
|
||||
" mov.s32 %r33, 0;\n"
|
||||
" setp.gt.s32 %p6, %r20, %r33;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" setp.gt.s32 %p7, %r19, %r34;\n"
|
||||
"$Lt_1_12290:\n"
|
||||
" .loc 14 261 0\n"
|
||||
" ld.param.u64 %rd37, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd38, %rd37, %rd18;\n"
|
||||
" @!%p6 bra $Lt_1_14850;\n"
|
||||
" .loc 14 263 0\n"
|
||||
" st.global.f32 [%rd38+0], %f33;\n"
|
||||
" .loc 14 264 0\n"
|
||||
" cvt.u64.s32 %rd39, %r10;\n"
|
||||
" mul.lo.u64 %rd40, %rd39, 4;\n"
|
||||
" add.u64 %rd38, %rd38, %rd40;\n"
|
||||
"$Lt_1_14850:\n"
|
||||
" @!%p7 bra $Lt_1_15362;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f85, %f11;\n"
|
||||
" st.global.f32 [%rd38+0], %f85;\n"
|
||||
" .loc 14 269 0\n"
|
||||
" cvt.u64.s32 %rd41, %r10;\n"
|
||||
" mul.lo.u64 %rd42, %rd41, 4;\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f86, %f13;\n"
|
||||
" st.global.f32 [%rd38+0], %f86;\n"
|
||||
" .loc 14 269 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f87, %f15;\n"
|
||||
" st.global.f32 [%rd38+0], %f87;\n"
|
||||
" .loc 14 269 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f88, %f17;\n"
|
||||
" st.global.f32 [%rd38+0], %f88;\n"
|
||||
" .loc 14 269 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f89, %f19;\n"
|
||||
" st.global.f32 [%rd38+0], %f89;\n"
|
||||
" add.u64 %rd43, %rd42, %rd38;\n"
|
||||
" st.global.f32 [%rd43+0], %f20;\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" .loc 14 272 0\n"
|
||||
" ld.param.u64 %rd44, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd45, %rd17, 16;\n"
|
||||
" add.u64 %rd46, %rd44, %rd45;\n"
|
||||
" mov.f32 %f90, %f91;\n"
|
||||
" st.global.v4.f32 [%rd46+0], {%f32,%f31,%f30,%f90};\n"
|
||||
"$Lt_1_11778:\n"
|
||||
" .loc 14 274 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,122 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <math.h>
|
||||
|
||||
#include "lj_expand_gpu_memory.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
static LJE_GPU_Memory<PRECISION,ACC_PRECISION> LJEMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
/// Initialize the lj/expand GPU pair style on every process sharing a device.
/** World process 0 runs LJE_GPU_Memory::init first; afterwards the remaining
  * processes take turns in gpu_rank order, with a world barrier after each
  * turn.  Progress text is printed to \a screen when it is non-NULL and
  * replica_me()==0.
  *
  * \param gpu_mode  overwritten with the device's gpu_mode()
  * \note  \a max_nbors is not forwarded; a fixed value of 300 is handed to
  *        LJE_GPU_Memory::init in its place.
  * \return the value produced by LJE_GPU_Memory::init on this process
  *         (0 when init was not invoked on this process) **/
int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
                 double **host_lj2, double **host_lj3, double **host_lj4,
                 double **offset, double **shift, double *special_lj,
                 const int inum, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size, int &gpu_mode,
                 FILE *screen) {
  LJEMF.clear();

  // Query the shared device object once up front.
  gpu_mode=LJEMF.device->gpu_mode();
  const double gpu_split=LJEMF.device->particle_split();
  const int first_gpu=LJEMF.device->first_device();
  const int last_gpu=LJEMF.device->last_device();
  const int world_me=LJEMF.device->world_me();
  const int gpu_rank=LJEMF.device->gpu_rank();
  const int procs_per_gpu=LJEMF.device->procs_per_gpu();

  LJEMF.device->init_message(screen,"lj/expand",first_gpu,last_gpu);

  // Only the first replica prints, and only when a screen stream exists.
  const bool message=(LJEMF.device->replica_me()==0 && screen);
  const bool single_gpu=(last_gpu-first_gpu==0);

  // Value passed to init() as its max_nbors argument.
  const int init_max_nbors=300;

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  int init_ok=0;
  if (world_me==0) {
    init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                       host_lj4, offset, shift, special_lj, inum, nall,
                       init_max_nbors, maxspecial, cell_size, gpu_split,
                       screen);
  }

  LJEMF.device->world_barrier();
  if (message) {
    fprintf(screen,"Done.\n");
  }

  // One turn per process attached to the device; process 0 is already done.
  for (int turn=0; turn<procs_per_gpu; turn++) {
    if (message) {
      if (single_gpu) {
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,turn);
      } else {
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,turn);
      }
      fflush(screen);
    }

    if (world_me!=0 && gpu_rank==turn) {
      init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                         host_lj4, offset, shift, special_lj, inum, nall,
                         init_max_nbors, maxspecial, cell_size, gpu_split,
                         screen);
    }

    LJEMF.device->world_barrier();
    if (message) {
      fprintf(screen,"Done.\n");
    }
  }
  if (message) {
    fprintf(screen,"\n");
  }

  // Overhead estimation only runs when initialization reported success.
  if (init_ok==0) {
    LJEMF.estimate_gpu_overhead();
  }
  return init_ok;
}
|
||||
|
||||
/// Release everything held by the file-scope lj/expand GPU object.
void lje_gpu_clear()
{
  LJEMF.clear();
}
|
||||
|
||||
int** lje_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||
int **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success) {
|
||||
return LJEMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
/// Forward to the LJE_GPU_Memory::compute overload that takes a
/// host-supplied neighbor list (ilist / numj / firstneigh); every argument
/// is passed through unchanged and nothing is returned.
void lje_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success)
{
  LJEMF.compute(ago, inum_full, nall, host_x, host_type,
                ilist, numj, firstneigh,
                eflag, vflag, eatom, vatom,
                host_start, cpu_time, success);
}
|
||||
|
||||
double lje_gpu_bytes() {
|
||||
return LJEMF.host_memory_usage();
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,393 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJE_GPU_KERNEL
|
||||
#define LJE_GPU_KERNEL
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp2 double2
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifdef _SINGLE_DOUBLE
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
texture<float4> pos_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
// Double precision build: element i is read straight from the array
// passed in; pos_tex is not used.
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  const double4 *entry=pos+i;
  return *entry;
}
#else
// All other precision settings: element i is fetched through the pos_tex
// texture reference; the pos argument is accepted but not read.
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  return tex1Dfetch(pos_tex, i);
}
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define BLOCK_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#endif
|
||||
|
||||
// A neighbor entry keeps the atom index in its low SBBITS bits
// (NEIGHMASK) and a 2-bit selector in the two bits directly above them.
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF

// Extract the 2-bit selector (0..3) stored above the index bits of j.
__inline int sbmask(int j)
{
  const int shifted = j >> SBBITS;
  return shifted & 3;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Shifted ("expand") Lennard-Jones pair kernel reading its per-type
// coefficients from global memory.
//
// Work layout: each atom slot ii is served by t_per_atom consecutive
// threads of a block; `offset` is the thread's position within that group
// and selects which neighbors it visits (stride t_per_atom).
//
//   x_         positions; .w holds the atom type
//   lj1        per type pair: .x/.y force coefficients, .z squared cutoff
//              compared against r^2, .w shift subtracted from r
//   lj3        per type pair: .x/.y energy coefficients, .z energy offset
//   lj_types   row length used to index lj1/lj3 (itype*lj_types+jtype)
//   sp_lj_in   4 scaling factors selected by sbmask() of the neighbor entry
//   dev_nbor   row 0: atom index, row 1: neighbor count, row 2: either the
//              first neighbor (when dev_nbor==dev_packed, rows nbor_pitch
//              apart) or an offset into dev_packed
//   ans        per-atom force output (x,y,z; w is written as 0)
//   engv       optional energy followed by 6 virial terms, stride inum
//
// The reduction buffer is indexed by thread id, so the block size must not
// exceed BLOCK_PAIR.
// ---------------------------------------------------------------------------
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
                          __global numtyp4* lj3, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  // Every thread stores the same four values, so each thread only ever
  // reads back what it has itself written.
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  f.w=(acctyp)0;   // never accumulated; zeroed so ans[] gets a defined value
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      // Neighbors live in the same matrix, one row (nbor_pitch) apart.
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      // Row 2 holds an offset into the contiguous packed list.
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12 (r2inv holds r^2 until it is inverted below)
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (r2inv<lj1[mtype].z) {
        numtyp r = sqrt(r2inv);
        numtyp rshift = r - lj1[mtype].w;
        numtyp rshiftsq = rshift*rshift;
        r2inv = (numtyp) 1.0/rshiftsq;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj/rshift/r;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
  } // if ii

  // Reduce answers across the t_per_atom threads of each atom.
  // t_per_atom and vflag are kernel arguments, so every thread of the block
  // takes the same path through the branches and loops below and therefore
  // reaches every barrier.
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      // Make the partner's previous store visible before it is read.
      __syncthreads();
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      // The buffer is reused: wait until the last reduction step has
      // finished reading it before overwriting.
      __syncthreads();
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        __syncthreads();
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers (only the first thread of each atom's group writes)
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Shifted ("expand") Lennard-Jones pair kernel that first copies the
// per-type coefficient tables into local/shared memory.
//
// Same work layout and argument meaning as kernel_pair, except that the
// coefficient tables are indexed with a fixed row length of
// MAX_SHARED_TYPES (mtype = MAX_SHARED_TYPES*itype + jtype), so lj1_in and
// lj3_in must hold MAX_SHARED_TYPES*MAX_SHARED_TYPES entries.
//
// Table entry `tid` is loaded by thread `tid`, so only the first
// min(block size, MAX_SHARED_TYPES*MAX_SHARED_TYPES) entries are filled.
// lj3 is only loaded when eflag>0, the only case in which it is read.
// The reduction buffer is indexed by thread id, so the block size must not
// exceed BLOCK_PAIR.
// ---------------------------------------------------------------------------
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
                               __global numtyp4* lj3_in,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
    lj1[tid]=lj1_in[tid];
    if (eflag>0)
      lj3[tid]=lj3_in[tid];
  }

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  f.w=(acctyp)0;   // never accumulated; zeroed so ans[] gets a defined value
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // Tables must be complete before any thread reads them.
  __syncthreads();

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      // Neighbors live in the same matrix, one row (nbor_pitch) apart.
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      // Row 2 holds an offset into the contiguous packed list.
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12 (r2inv holds r^2 until it is inverted below)
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r2inv = delx*delx+dely*dely+delz*delz;

      if (r2inv<lj1[mtype].z) {
        numtyp r = sqrt(r2inv);
        numtyp rshift = r - lj1[mtype].w;
        numtyp rshiftsq = rshift*rshift;
        // Cast the literal so single precision builds do not promote the
        // division to double (matches kernel_pair).
        r2inv = (numtyp) 1.0/rshiftsq;
        numtyp r6inv = r2inv*r2inv*r2inv;
        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
        force*=factor_lj/rshift/r;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
          energy+=factor_lj*(e-lj3[mtype].z);
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
  } // if ii

  // Reduce answers across the t_per_atom threads of each atom.
  // t_per_atom and vflag are kernel arguments, so every thread of the block
  // takes the same path through the branches and loops below and therefore
  // reaches every barrier.
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      // Make the partner's previous store visible before it is read.
      __syncthreads();
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      // The buffer is reused: wait until the last reduction step has
      // finished reading it before overwriting.
      __syncthreads();
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        __syncthreads();
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers (only the first thread of each atom's group writes)
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "lj_expand_gpu_cl.h"
|
||||
#else
|
||||
#include "lj_expand_gpu_ptx.h"
|
||||
#endif
|
||||
|
||||
#include "lj_expand_gpu_memory.h"
|
||||
#include <cassert>
|
||||
#define LJE_GPU_MemoryT LJE_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
LJE_GPU_MemoryT::LJE_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
LJE_GPU_MemoryT::~LJE_GPU_Memory() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJE_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
|
||||
double **host_lj1, double **host_lj2,
|
||||
double **host_lj3, double **host_lj4,
|
||||
double **host_offset, double **host_shift,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,lj_expand_gpu_kernel);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int lj_types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
lj_types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_lj_types=lj_types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
|
||||
UCL_WRITE_OPTIMIZED);
|
||||
|
||||
for (int i=0; i<lj_types*lj_types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
|
||||
host_cutsq, host_shift);
|
||||
|
||||
lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
|
||||
host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||
ucl_copy(sp_lj,dview,false);
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void LJE_GPU_MemoryT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
lj1.clear();
|
||||
lj3.clear();
|
||||
sp_lj.clear();
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double LJE_GPU_MemoryT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(LJE_GPU_Memory<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int eflag, vflag;
|
||||
if (_eflag)
|
||||
eflag=1;
|
||||
else
|
||||
eflag=0;
|
||||
|
||||
if (_vflag)
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
|
||||
&lj3.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
|
||||
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
template class LJE_GPU_Memory<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,78 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef LJE_GPU_MEMORY_H
|
||||
#define LJE_GPU_MEMORY_H
|
||||
|
||||
#include "atomic_gpu_memory.h"
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class LJE_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
|
||||
public:
|
||||
LJE_GPU_Memory();
|
||||
~LJE_GPU_Memory();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq, double **host_lj1,
|
||||
double **host_lj2, double **host_lj3, double **host_lj4,
|
||||
double **host_offset, double **host_shift, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift
|
||||
UCL_D_Vec<numtyp4> lj1;
|
||||
/// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
|
||||
UCL_D_Vec<numtyp4> lj3;
|
||||
/// Special LJ values
|
||||
UCL_D_Vec<numtyp> sp_lj;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _lj_types;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
void loop(const bool _eflag, const bool _vflag);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,711 @@
|
|||
const char * ljc_cut_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .tex .u64 q_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_cutsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_qqrd2e)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<42>;\n"
|
||||
" .reg .u64 %rd<39>;\n"
|
||||
" .reg .f32 %f<113>;\n"
|
||||
" .reg .pred %p<10>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj108[32];\n"
|
||||
" .loc 14 99 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 103 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+0], %f1;\n"
|
||||
" .loc 14 104 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+4], %f2;\n"
|
||||
" .loc 14 105 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+8], %f3;\n"
|
||||
" .loc 14 106 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+12], %f4;\n"
|
||||
" .loc 14 107 0\n"
|
||||
" ld.global.f32 %f5, [%rd1+16];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+16], %f5;\n"
|
||||
" .loc 14 108 0\n"
|
||||
" ld.global.f32 %f6, [%rd1+20];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+20], %f6;\n"
|
||||
" .loc 14 109 0\n"
|
||||
" ld.global.f32 %f7, [%rd1+24];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+24], %f7;\n"
|
||||
" .loc 14 110 0\n"
|
||||
" ld.global.f32 %f8, [%rd1+28];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+28], %f8;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_10242;\n"
|
||||
" .loc 14 121 0\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" mov.f32 %f17, 0f00000000; \n"
|
||||
" mov.f32 %f18, %f17;\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" mov.f32 %f20, %f19;\n"
|
||||
" .loc 14 124 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 126 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 127 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 130 0\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.s32 %r14, %r7;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 131 0\n"
|
||||
" mov.f32 %f33, %f29;\n"
|
||||
" mul24.lo.s32 %r18, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r18;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_15874;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f28;\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r25, %r24, %r23;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_cutsq];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj108;\n"
|
||||
"$Lt_0_11266:\n"
|
||||
" .loc 14 135 0\n"
|
||||
" ld.global.s32 %r26, [%rd10+0];\n"
|
||||
" .loc 14 138 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r27;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 139 0\n"
|
||||
" ld.shared.f32 %f40, [%rd19+16];\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, %r28;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.s32 %r32, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
|
||||
" .loc 14 142 0\n"
|
||||
" mov.f32 %f45, %f41;\n"
|
||||
" mov.f32 %f46, %f42;\n"
|
||||
" mov.f32 %f47, %f43;\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" cvt.rzi.s32.f32 %r33, %f48;\n"
|
||||
" sub.f32 %f49, %f26, %f46;\n"
|
||||
" sub.f32 %f50, %f25, %f45;\n"
|
||||
" sub.f32 %f51, %f27, %f47;\n"
|
||||
" mul.f32 %f52, %f49, %f49;\n"
|
||||
" mad.f32 %f53, %f50, %f50, %f52;\n"
|
||||
" add.s32 %r34, %r33, %r25;\n"
|
||||
" cvt.u64.s32 %rd20, %r34;\n"
|
||||
" mad.f32 %f54, %f51, %f51, %f53;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 4;\n"
|
||||
" add.u64 %rd22, %rd14, %rd21;\n"
|
||||
" ld.global.f32 %f55, [%rd22+0];\n"
|
||||
" setp.gt.f32 %p5, %f55, %f54;\n"
|
||||
" @!%p5 bra $Lt_0_14082;\n"
|
||||
" mul.lo.u64 %rd23, %rd20, 16;\n"
|
||||
" rcp.approx.f32 %f56, %f54;\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" add.u64 %rd25, %rd24, %rd23;\n"
|
||||
" ld.global.f32 %f57, [%rd25+8];\n"
|
||||
" setp.lt.f32 %p6, %f54, %f57;\n"
|
||||
" @!%p6 bra $Lt_0_12290;\n"
|
||||
" .loc 14 157 0\n"
|
||||
" mul.f32 %f58, %f56, %f56;\n"
|
||||
" mul.f32 %f59, %f56, %f58;\n"
|
||||
" mov.f32 %f60, %f59;\n"
|
||||
" .loc 14 138 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 158 0\n"
|
||||
" mul.f32 %f61, %f59, %f39;\n"
|
||||
" ld.global.v2.f32 {%f62,%f63}, [%rd25+0];\n"
|
||||
" mul.f32 %f64, %f62, %f59;\n"
|
||||
" sub.f32 %f65, %f64, %f63;\n"
|
||||
" mul.f32 %f66, %f61, %f65;\n"
|
||||
" bra.uni $Lt_0_12034;\n"
|
||||
"$Lt_0_12290:\n"
|
||||
" .loc 14 160 0\n"
|
||||
" mov.f32 %f66, 0f00000000; \n"
|
||||
"$Lt_0_12034:\n"
|
||||
" ld.global.f32 %f67, [%rd25+12];\n"
|
||||
" setp.gt.f32 %p7, %f67, %f54;\n"
|
||||
" @!%p7 bra $Lt_0_12802;\n"
|
||||
" mov.s32 %r35, %r28;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f68,%f69,%f70,%f71},[q_tex,{%r35,%r36,%r37,%r38}];\n"
|
||||
" .loc 14 163 0\n"
|
||||
" mov.f32 %f72, %f68;\n"
|
||||
" ld.param.f32 %f73, [__cudaparm_kernel_pair_qqrd2e];\n"
|
||||
" mul.f32 %f74, %f73, %f33;\n"
|
||||
" mul.f32 %f75, %f72, %f74;\n"
|
||||
" sqrt.approx.f32 %f76, %f56;\n"
|
||||
" mul.f32 %f77, %f75, %f76;\n"
|
||||
" .loc 14 139 0\n"
|
||||
" ld.shared.f32 %f40, [%rd19+16];\n"
|
||||
" .loc 14 163 0\n"
|
||||
" mul.f32 %f78, %f40, %f77;\n"
|
||||
" bra.uni $Lt_0_12546;\n"
|
||||
"$Lt_0_12802:\n"
|
||||
" .loc 14 165 0\n"
|
||||
" mov.f32 %f78, 0f00000000; \n"
|
||||
"$Lt_0_12546:\n"
|
||||
" .loc 14 169 0\n"
|
||||
" add.f32 %f79, %f78, %f66;\n"
|
||||
" mul.f32 %f80, %f79, %f56;\n"
|
||||
" mad.f32 %f36, %f50, %f80, %f36;\n"
|
||||
" .loc 14 170 0\n"
|
||||
" mad.f32 %f35, %f49, %f80, %f35;\n"
|
||||
" .loc 14 171 0\n"
|
||||
" mad.f32 %f34, %f51, %f80, %f34;\n"
|
||||
" @!%p3 bra $Lt_0_13570;\n"
|
||||
" .loc 14 174 0\n"
|
||||
" add.f32 %f37, %f78, %f37;\n"
|
||||
" @!%p6 bra $Lt_0_13570;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" ld.param.u64 %rd26, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd27, %rd26, %rd23;\n"
|
||||
" mov.f32 %f81, %f60;\n"
|
||||
" ld.global.v4.f32 {%f82,%f83,%f84,_}, [%rd27+0];\n"
|
||||
" mul.f32 %f85, %f82, %f81;\n"
|
||||
" sub.f32 %f86, %f85, %f83;\n"
|
||||
" mul.f32 %f87, %f81, %f86;\n"
|
||||
" sub.f32 %f88, %f87, %f84;\n"
|
||||
" .loc 14 138 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 177 0\n"
|
||||
" mad.f32 %f38, %f39, %f88, %f38;\n"
|
||||
"$Lt_0_13570:\n"
|
||||
"$Lt_0_13058:\n"
|
||||
" @!%p4 bra $Lt_0_14082;\n"
|
||||
" .loc 14 181 0\n"
|
||||
" mov.f32 %f89, %f10;\n"
|
||||
" mul.f32 %f90, %f50, %f50;\n"
|
||||
" mad.f32 %f91, %f80, %f90, %f89;\n"
|
||||
" mov.f32 %f10, %f91;\n"
|
||||
" .loc 14 182 0\n"
|
||||
" mov.f32 %f92, %f12;\n"
|
||||
" mad.f32 %f93, %f80, %f52, %f92;\n"
|
||||
" mov.f32 %f12, %f93;\n"
|
||||
" .loc 14 183 0\n"
|
||||
" mov.f32 %f94, %f14;\n"
|
||||
" mul.f32 %f95, %f51, %f51;\n"
|
||||
" mad.f32 %f96, %f80, %f95, %f94;\n"
|
||||
" mov.f32 %f14, %f96;\n"
|
||||
" .loc 14 184 0\n"
|
||||
" mov.f32 %f97, %f16;\n"
|
||||
" mul.f32 %f98, %f49, %f50;\n"
|
||||
" mad.f32 %f99, %f80, %f98, %f97;\n"
|
||||
" mov.f32 %f16, %f99;\n"
|
||||
" .loc 14 185 0\n"
|
||||
" mov.f32 %f100, %f18;\n"
|
||||
" mul.f32 %f101, %f50, %f51;\n"
|
||||
" mad.f32 %f102, %f80, %f101, %f100;\n"
|
||||
" mov.f32 %f18, %f102;\n"
|
||||
" .loc 14 186 0\n"
|
||||
" mul.f32 %f103, %f49, %f51;\n"
|
||||
" mad.f32 %f19, %f80, %f103, %f19;\n"
|
||||
" mov.f32 %f104, %f19;\n"
|
||||
"$Lt_0_14082:\n"
|
||||
"$Lt_0_11522:\n"
|
||||
" .loc 14 134 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p8, %rd13, %rd10;\n"
|
||||
" @%p8 bra $Lt_0_11266;\n"
|
||||
" bra.uni $Lt_0_10754;\n"
|
||||
"$Lt_0_15874:\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r39;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r40;\n"
|
||||
"$Lt_0_10754:\n"
|
||||
" .loc 14 193 0\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd29, %rd28, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_14850;\n"
|
||||
" .loc 14 195 0\n"
|
||||
" st.global.f32 [%rd29+0], %f38;\n"
|
||||
" .loc 14 196 0\n"
|
||||
" cvt.u64.s32 %rd30, %r6;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd29, %rd31, %rd29;\n"
|
||||
" .loc 14 197 0\n"
|
||||
" st.global.f32 [%rd29+0], %f37;\n"
|
||||
" .loc 14 198 0\n"
|
||||
" add.u64 %rd29, %rd31, %rd29;\n"
|
||||
"$Lt_0_14850:\n"
|
||||
" @!%p4 bra $Lt_0_15362;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mov.f32 %f105, %f10;\n"
|
||||
" st.global.f32 [%rd29+0], %f105;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" cvt.u64.s32 %rd32, %r6;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd29, %rd33, %rd29;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mov.f32 %f106, %f12;\n"
|
||||
" st.global.f32 [%rd29+0], %f106;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" add.u64 %rd29, %rd33, %rd29;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mov.f32 %f107, %f14;\n"
|
||||
" st.global.f32 [%rd29+0], %f107;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" add.u64 %rd29, %rd33, %rd29;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mov.f32 %f108, %f16;\n"
|
||||
" st.global.f32 [%rd29+0], %f108;\n"
|
||||
" .loc 14 203 0\n"
|
||||
" add.u64 %rd29, %rd33, %rd29;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mov.f32 %f109, %f18;\n"
|
||||
" st.global.f32 [%rd29+0], %f109;\n"
|
||||
" add.u64 %rd34, %rd33, %rd29;\n"
|
||||
" st.global.f32 [%rd34+0], %f19;\n"
|
||||
"$Lt_0_15362:\n"
|
||||
" .loc 14 206 0\n"
|
||||
" ld.param.u64 %rd35, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd36, %rd2, 16;\n"
|
||||
" add.u64 %rd37, %rd35, %rd36;\n"
|
||||
" mov.f32 %f110, %f111;\n"
|
||||
" st.global.v4.f32 [%rd37+0], {%f36,%f35,%f34,%f110};\n"
|
||||
"$Lt_0_10242:\n"
|
||||
" .loc 14 208 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast__cutsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<45>;\n"
|
||||
" .reg .u64 %rd<55>;\n"
|
||||
" .reg .f32 %f<117>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj244[32];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj1288[1024];\n"
|
||||
" .shared .align 4 .b8 __cuda_cutsq1312[256];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj31568[1024];\n"
|
||||
" .loc 14 217 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 7;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_12546;\n"
|
||||
" .loc 14 225 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_12546:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_13058;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj1288;\n"
|
||||
" mov.u64 %rd8, __cuda_cutsq1312;\n"
|
||||
" cvt.u64.s32 %rd9, %r1;\n"
|
||||
" mul.lo.u64 %rd10, %rd9, 16;\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd12, %rd11, %rd10;\n"
|
||||
" add.u64 %rd13, %rd10, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd12+0];\n"
|
||||
" st.shared.f32 [%rd13+0], %f2;\n"
|
||||
" st.shared.f32 [%rd13+4], %f3;\n"
|
||||
" st.shared.f32 [%rd13+8], %f4;\n"
|
||||
" st.shared.f32 [%rd13+12], %f5;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" mul.lo.u64 %rd14, %rd9, 4;\n"
|
||||
" ld.param.u64 %rd15, [__cudaparm_kernel_pair_fast__cutsq];\n"
|
||||
" add.u64 %rd16, %rd15, %rd14;\n"
|
||||
" ld.global.f32 %f6, [%rd16+0];\n"
|
||||
" add.u64 %rd17, %rd14, %rd8;\n"
|
||||
" st.shared.f32 [%rd17+0], %f6;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_13570;\n"
|
||||
" .loc 14 230 0\n"
|
||||
" mov.u64 %rd18, __cuda_lj31568;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd20, %rd19, %rd10;\n"
|
||||
" add.u64 %rd21, %rd10, %rd18;\n"
|
||||
" ld.global.v4.f32 {%f7,%f8,%f9,%f10}, [%rd20+0];\n"
|
||||
" st.shared.f32 [%rd21+0], %f7;\n"
|
||||
" st.shared.f32 [%rd21+4], %f8;\n"
|
||||
" st.shared.f32 [%rd21+8], %f9;\n"
|
||||
" st.shared.f32 [%rd21+12], %f10;\n"
|
||||
"$Lt_1_13570:\n"
|
||||
" mov.u64 %rd18, __cuda_lj31568;\n"
|
||||
"$Lt_1_13058:\n"
|
||||
" mov.u64 %rd7, __cuda_lj1288;\n"
|
||||
" mov.u64 %rd8, __cuda_cutsq1312;\n"
|
||||
" mov.u64 %rd18, __cuda_lj31568;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r6, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r7, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r8, %r6, %r7;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r9, %r10;\n"
|
||||
" @%p4 bra $Lt_1_14082;\n"
|
||||
" .loc 14 245 0\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" mov.f32 %f17, 0f00000000; \n"
|
||||
" mov.f32 %f18, %f17;\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" mov.f32 %f20, %f19;\n"
|
||||
" mov.f32 %f21, 0f00000000; \n"
|
||||
" mov.f32 %f22, %f21;\n"
|
||||
" .loc 14 248 0\n"
|
||||
" cvt.u64.s32 %rd22, %r9;\n"
|
||||
" mul.lo.u64 %rd23, %rd22, 4;\n"
|
||||
" ld.param.u64 %rd24, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd25, %rd24, %rd23;\n"
|
||||
" ld.global.s32 %r11, [%rd25+0];\n"
|
||||
" .loc 14 250 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd26, %r12;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd25, %rd27;\n"
|
||||
" ld.global.s32 %r13, [%rd28+0];\n"
|
||||
" .loc 14 251 0\n"
|
||||
" add.u64 %rd29, %rd28, %rd27;\n"
|
||||
" mov.s64 %rd30, %rd29;\n"
|
||||
" mov.s32 %r14, %r11;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 254 0\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mov.f32 %f30, %f26;\n"
|
||||
" mov.s32 %r18, %r11;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" mov.s32 %r20, 0;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r18,%r19,%r20,%r21}];\n"
|
||||
" .loc 14 255 0\n"
|
||||
" mov.f32 %f35, %f31;\n"
|
||||
" mul24.lo.s32 %r22, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd31, %r22;\n"
|
||||
" mul.lo.u64 %rd32, %rd31, 4;\n"
|
||||
" add.u64 %rd33, %rd29, %rd32;\n"
|
||||
" ld.param.s32 %r23, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p5, %rd29, %rd33;\n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" mov.f32 %f39, 0f00000000; \n"
|
||||
" mov.f32 %f40, 0f00000000; \n"
|
||||
" @%p5 bra $Lt_1_19714;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" setp.gt.s32 %p6, %r24, %r25;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" setp.gt.s32 %p7, %r23, %r26;\n"
|
||||
" cvt.rzi.s32.f32 %r27, %f30;\n"
|
||||
" mov.s32 %r28, 8;\n"
|
||||
" mul24.lo.s32 %r29, %r28, %r27;\n"
|
||||
" cvt.rn.f32.s32 %f41, %r29;\n"
|
||||
"$Lt_1_15106:\n"
|
||||
" .loc 14 260 0\n"
|
||||
" ld.global.s32 %r30, [%rd30+0];\n"
|
||||
" .loc 14 263 0\n"
|
||||
" shr.s32 %r31, %r30, 30;\n"
|
||||
" cvt.s64.s32 %rd34, %r31;\n"
|
||||
" and.b64 %rd35, %rd34, 3;\n"
|
||||
" mul.lo.u64 %rd36, %rd35, 4;\n"
|
||||
" add.u64 %rd37, %rd1, %rd36;\n"
|
||||
" ld.shared.f32 %f42, [%rd37+0];\n"
|
||||
" .loc 14 264 0\n"
|
||||
" ld.shared.f32 %f43, [%rd37+16];\n"
|
||||
" and.b32 %r32, %r30, 1073741823;\n"
|
||||
" mov.s32 %r33, %r32;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" mov.s32 %r35, 0;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r33,%r34,%r35,%r36}];\n"
|
||||
" .loc 14 267 0\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" mov.f32 %f51, %f47;\n"
|
||||
" sub.f32 %f52, %f28, %f49;\n"
|
||||
" sub.f32 %f53, %f27, %f48;\n"
|
||||
" sub.f32 %f54, %f29, %f50;\n"
|
||||
" mul.f32 %f55, %f52, %f52;\n"
|
||||
" mad.f32 %f56, %f53, %f53, %f55;\n"
|
||||
" mad.f32 %f57, %f54, %f54, %f56;\n"
|
||||
" add.f32 %f58, %f41, %f51;\n"
|
||||
" cvt.rzi.s32.f32 %r37, %f58;\n"
|
||||
" cvt.u64.s32 %rd38, %r37;\n"
|
||||
" mul.lo.u64 %rd39, %rd38, 4;\n"
|
||||
" add.u64 %rd40, %rd8, %rd39;\n"
|
||||
" ld.shared.f32 %f59, [%rd40+0];\n"
|
||||
" setp.gt.f32 %p8, %f59, %f57;\n"
|
||||
" @!%p8 bra $Lt_1_17922;\n"
|
||||
" rcp.approx.f32 %f60, %f57;\n"
|
||||
" mul.lo.u64 %rd41, %rd38, 16;\n"
|
||||
" add.u64 %rd42, %rd41, %rd7;\n"
|
||||
" ld.shared.f32 %f61, [%rd42+8];\n"
|
||||
" setp.lt.f32 %p9, %f57, %f61;\n"
|
||||
" @!%p9 bra $Lt_1_16130;\n"
|
||||
" .loc 14 281 0\n"
|
||||
" mul.f32 %f62, %f60, %f60;\n"
|
||||
" mul.f32 %f63, %f60, %f62;\n"
|
||||
" mov.f32 %f64, %f63;\n"
|
||||
" .loc 14 263 0\n"
|
||||
" ld.shared.f32 %f42, [%rd37+0];\n"
|
||||
" .loc 14 282 0\n"
|
||||
" mul.f32 %f65, %f63, %f42;\n"
|
||||
" ld.shared.f32 %f66, [%rd42+4];\n"
|
||||
" ld.shared.f32 %f67, [%rd42+0];\n"
|
||||
" mul.f32 %f68, %f67, %f63;\n"
|
||||
" sub.f32 %f69, %f68, %f66;\n"
|
||||
" mul.f32 %f70, %f65, %f69;\n"
|
||||
" bra.uni $Lt_1_15874;\n"
|
||||
"$Lt_1_16130:\n"
|
||||
" .loc 14 284 0\n"
|
||||
" mov.f32 %f70, 0f00000000; \n"
|
||||
"$Lt_1_15874:\n"
|
||||
" ld.shared.f32 %f71, [%rd42+12];\n"
|
||||
" setp.gt.f32 %p10, %f71, %f57;\n"
|
||||
" @!%p10 bra $Lt_1_16642;\n"
|
||||
" mov.s32 %r38, %r32;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f72,%f73,%f74,%f75},[q_tex,{%r38,%r39,%r40,%r41}];\n"
|
||||
" .loc 14 287 0\n"
|
||||
" mov.f32 %f76, %f72;\n"
|
||||
" ld.param.f32 %f77, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
||||
" mul.f32 %f78, %f77, %f35;\n"
|
||||
" mul.f32 %f79, %f76, %f78;\n"
|
||||
" sqrt.approx.f32 %f80, %f60;\n"
|
||||
" mul.f32 %f81, %f79, %f80;\n"
|
||||
" .loc 14 264 0\n"
|
||||
" ld.shared.f32 %f43, [%rd37+16];\n"
|
||||
" .loc 14 287 0\n"
|
||||
" mul.f32 %f82, %f43, %f81;\n"
|
||||
" bra.uni $Lt_1_16386;\n"
|
||||
"$Lt_1_16642:\n"
|
||||
" .loc 14 289 0\n"
|
||||
" mov.f32 %f82, 0f00000000; \n"
|
||||
"$Lt_1_16386:\n"
|
||||
" .loc 14 293 0\n"
|
||||
" add.f32 %f83, %f82, %f70;\n"
|
||||
" mul.f32 %f84, %f83, %f60;\n"
|
||||
" mad.f32 %f38, %f53, %f84, %f38;\n"
|
||||
" .loc 14 294 0\n"
|
||||
" mad.f32 %f37, %f52, %f84, %f37;\n"
|
||||
" .loc 14 295 0\n"
|
||||
" mad.f32 %f36, %f54, %f84, %f36;\n"
|
||||
" @!%p6 bra $Lt_1_17410;\n"
|
||||
" .loc 14 298 0\n"
|
||||
" add.f32 %f39, %f82, %f39;\n"
|
||||
" @!%p9 bra $Lt_1_17410;\n"
|
||||
" .loc 14 300 0\n"
|
||||
" add.u64 %rd43, %rd41, %rd18;\n"
|
||||
" mov.f32 %f85, %f64;\n"
|
||||
" ld.shared.f32 %f86, [%rd43+4];\n"
|
||||
" ld.shared.f32 %f87, [%rd43+0];\n"
|
||||
" mul.f32 %f88, %f87, %f85;\n"
|
||||
" sub.f32 %f89, %f88, %f86;\n"
|
||||
" mul.f32 %f90, %f85, %f89;\n"
|
||||
" .loc 14 301 0\n"
|
||||
" ld.shared.f32 %f91, [%rd43+8];\n"
|
||||
" sub.f32 %f92, %f90, %f91;\n"
|
||||
" .loc 14 263 0\n"
|
||||
" ld.shared.f32 %f42, [%rd37+0];\n"
|
||||
" .loc 14 301 0\n"
|
||||
" mad.f32 %f40, %f42, %f92, %f40;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
"$Lt_1_16898:\n"
|
||||
" @!%p7 bra $Lt_1_17922;\n"
|
||||
" .loc 14 305 0\n"
|
||||
" mov.f32 %f93, %f12;\n"
|
||||
" mul.f32 %f94, %f53, %f53;\n"
|
||||
" mad.f32 %f95, %f84, %f94, %f93;\n"
|
||||
" mov.f32 %f12, %f95;\n"
|
||||
" .loc 14 306 0\n"
|
||||
" mov.f32 %f96, %f14;\n"
|
||||
" mad.f32 %f97, %f84, %f55, %f96;\n"
|
||||
" mov.f32 %f14, %f97;\n"
|
||||
" .loc 14 307 0\n"
|
||||
" mov.f32 %f98, %f16;\n"
|
||||
" mul.f32 %f99, %f54, %f54;\n"
|
||||
" mad.f32 %f100, %f84, %f99, %f98;\n"
|
||||
" mov.f32 %f16, %f100;\n"
|
||||
" .loc 14 308 0\n"
|
||||
" mov.f32 %f101, %f18;\n"
|
||||
" mul.f32 %f102, %f52, %f53;\n"
|
||||
" mad.f32 %f103, %f84, %f102, %f101;\n"
|
||||
" mov.f32 %f18, %f103;\n"
|
||||
" .loc 14 309 0\n"
|
||||
" mov.f32 %f104, %f20;\n"
|
||||
" mul.f32 %f105, %f53, %f54;\n"
|
||||
" mad.f32 %f106, %f84, %f105, %f104;\n"
|
||||
" mov.f32 %f20, %f106;\n"
|
||||
" .loc 14 310 0\n"
|
||||
" mul.f32 %f107, %f52, %f54;\n"
|
||||
" mad.f32 %f21, %f84, %f107, %f21;\n"
|
||||
" mov.f32 %f108, %f21;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" .loc 14 259 0\n"
|
||||
" add.u64 %rd30, %rd27, %rd30;\n"
|
||||
" setp.gt.u64 %p11, %rd33, %rd30;\n"
|
||||
" @%p11 bra $Lt_1_15106;\n"
|
||||
" bra.uni $Lt_1_14594;\n"
|
||||
"$Lt_1_19714:\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" setp.gt.s32 %p6, %r24, %r42;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" setp.gt.s32 %p7, %r23, %r43;\n"
|
||||
"$Lt_1_14594:\n"
|
||||
" .loc 14 317 0\n"
|
||||
" ld.param.u64 %rd44, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd45, %rd44, %rd23;\n"
|
||||
" @!%p6 bra $Lt_1_18690;\n"
|
||||
" .loc 14 319 0\n"
|
||||
" st.global.f32 [%rd45+0], %f40;\n"
|
||||
" .loc 14 320 0\n"
|
||||
" cvt.u64.s32 %rd46, %r10;\n"
|
||||
" mul.lo.u64 %rd47, %rd46, 4;\n"
|
||||
" add.u64 %rd45, %rd47, %rd45;\n"
|
||||
" .loc 14 321 0\n"
|
||||
" st.global.f32 [%rd45+0], %f39;\n"
|
||||
" .loc 14 322 0\n"
|
||||
" add.u64 %rd45, %rd47, %rd45;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
" @!%p7 bra $Lt_1_19202;\n"
|
||||
" .loc 14 326 0\n"
|
||||
" mov.f32 %f109, %f12;\n"
|
||||
" st.global.f32 [%rd45+0], %f109;\n"
|
||||
" .loc 14 327 0\n"
|
||||
" cvt.u64.s32 %rd48, %r10;\n"
|
||||
" mul.lo.u64 %rd49, %rd48, 4;\n"
|
||||
" add.u64 %rd45, %rd49, %rd45;\n"
|
||||
" .loc 14 326 0\n"
|
||||
" mov.f32 %f110, %f14;\n"
|
||||
" st.global.f32 [%rd45+0], %f110;\n"
|
||||
" .loc 14 327 0\n"
|
||||
" add.u64 %rd45, %rd49, %rd45;\n"
|
||||
" .loc 14 326 0\n"
|
||||
" mov.f32 %f111, %f16;\n"
|
||||
" st.global.f32 [%rd45+0], %f111;\n"
|
||||
" .loc 14 327 0\n"
|
||||
" add.u64 %rd45, %rd49, %rd45;\n"
|
||||
" .loc 14 326 0\n"
|
||||
" mov.f32 %f112, %f18;\n"
|
||||
" st.global.f32 [%rd45+0], %f112;\n"
|
||||
" .loc 14 327 0\n"
|
||||
" add.u64 %rd45, %rd49, %rd45;\n"
|
||||
" .loc 14 326 0\n"
|
||||
" mov.f32 %f113, %f20;\n"
|
||||
" st.global.f32 [%rd45+0], %f113;\n"
|
||||
" add.u64 %rd50, %rd49, %rd45;\n"
|
||||
" st.global.f32 [%rd50+0], %f21;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
" .loc 14 330 0\n"
|
||||
" ld.param.u64 %rd51, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd52, %rd22, 16;\n"
|
||||
" add.u64 %rd53, %rd51, %rd52;\n"
|
||||
" mov.f32 %f114, %f115;\n"
|
||||
" st.global.v4.f32 [%rd53+0], {%f38,%f37,%f36,%f114};\n"
|
||||
"$Lt_1_14082:\n"
|
||||
" .loc 14 332 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,762 @@
|
|||
// Machine-generated PTX assembly text, embedded as a single C string literal.
// The literal below declares `.version 1.4` / `.target sm_13`, two texture
// references (`pos_tex`, `q_tex`) and two kernel entry points:
//   - kernel_pair       (reads lj1/lj3 coefficients from global memory)
//   - kernel_pair_fast  (first stages sp_lj, lj1 and, when eflag > 0, lj3
//                        into shared memory, then issues `bar.sync 0`)
// NOTE(review): this looks like compiler output; presumably it should be
// regenerated from the kernel source rather than edited by hand -- confirm.
const char * ljcl_cut_gpu_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 pos_tex;\n"
|
||||
" .tex .u64 q_tex;\n"
|
||||
" .entry kernel_pair (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj1,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_lj3,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_lj_types,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_g_ewald)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<42>;\n"
|
||||
" .reg .u64 %rd<36>;\n"
|
||||
" .reg .f32 %f<145>;\n"
|
||||
" .reg .pred %p<10>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj108[32];\n"
|
||||
" .loc 14 107 0\n"
|
||||
"$LBB1_kernel_pair:\n"
|
||||
" .loc 14 111 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
|
||||
" ld.global.f32 %f1, [%rd1+0];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+0], %f1;\n"
|
||||
" .loc 14 112 0\n"
|
||||
" ld.global.f32 %f2, [%rd1+4];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+4], %f2;\n"
|
||||
" .loc 14 113 0\n"
|
||||
" ld.global.f32 %f3, [%rd1+8];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+8], %f3;\n"
|
||||
" .loc 14 114 0\n"
|
||||
" ld.global.f32 %f4, [%rd1+12];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+12], %f4;\n"
|
||||
" .loc 14 115 0\n"
|
||||
" ld.global.f32 %f5, [%rd1+16];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+16], %f5;\n"
|
||||
" .loc 14 116 0\n"
|
||||
" ld.global.f32 %f6, [%rd1+20];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+20], %f6;\n"
|
||||
" .loc 14 117 0\n"
|
||||
" ld.global.f32 %f7, [%rd1+24];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+24], %f7;\n"
|
||||
" .loc 14 118 0\n"
|
||||
" ld.global.f32 %f8, [%rd1+28];\n"
|
||||
" st.shared.f32 [__cuda_sp_lj108+28], %f8;\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_pair_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_10242;\n"
|
||||
" .loc 14 129 0\n"
|
||||
" mov.f32 %f9, 0f00000000; \n"
|
||||
" mov.f32 %f10, %f9;\n"
|
||||
" mov.f32 %f11, 0f00000000; \n"
|
||||
" mov.f32 %f12, %f11;\n"
|
||||
" mov.f32 %f13, 0f00000000; \n"
|
||||
" mov.f32 %f14, %f13;\n"
|
||||
" mov.f32 %f15, 0f00000000; \n"
|
||||
" mov.f32 %f16, %f15;\n"
|
||||
" mov.f32 %f17, 0f00000000; \n"
|
||||
" mov.f32 %f18, %f17;\n"
|
||||
" mov.f32 %f19, 0f00000000; \n"
|
||||
" mov.f32 %f20, %f19;\n"
|
||||
" .loc 14 132 0\n"
|
||||
" cvt.u64.s32 %rd2, %r5;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd5+0];\n"
|
||||
" .loc 14 134 0\n"
|
||||
" ld.param.s32 %r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 4;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.s32 %r9, [%rd8+0];\n"
|
||||
" .loc 14 135 0\n"
|
||||
" add.u64 %rd9, %rd8, %rd7;\n"
|
||||
" mov.s64 %rd10, %rd9;\n"
|
||||
" mov.s32 %r10, %r7;\n"
|
||||
" mov.s32 %r11, 0;\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" mov.s32 %r13, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
|
||||
" .loc 14 138 0\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.s32 %r14, %r7;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 139 0\n"
|
||||
" mov.f32 %f33, %f29;\n"
|
||||
" mul24.lo.s32 %r18, %r9, %r8;\n"
|
||||
" cvt.s64.s32 %rd11, %r18;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd9, %rd12;\n"
|
||||
" ld.param.s32 %r19, [__cudaparm_kernel_pair_vflag];\n"
|
||||
" ld.param.s32 %r20, [__cudaparm_kernel_pair_eflag];\n"
|
||||
" setp.ge.u64 %p2, %rd9, %rd13;\n"
|
||||
" mov.f32 %f34, 0f00000000; \n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" @%p2 bra $Lt_0_15874;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r21;\n"
|
||||
" mov.s32 %r22, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r22;\n"
|
||||
" cvt.rzi.s32.f32 %r23, %f28;\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_kernel_pair_lj_types];\n"
|
||||
" mul.lo.s32 %r25, %r24, %r23;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_lj1];\n"
|
||||
" mov.u64 %rd15, __cuda_sp_lj108;\n"
|
||||
"$Lt_0_11266:\n"
|
||||
" .loc 14 143 0\n"
|
||||
" ld.global.s32 %r26, [%rd10+0];\n"
|
||||
" .loc 14 146 0\n"
|
||||
" shr.s32 %r27, %r26, 30;\n"
|
||||
" cvt.s64.s32 %rd16, %r27;\n"
|
||||
" and.b64 %rd17, %rd16, 3;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd15, %rd18;\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 147 0\n"
|
||||
" mov.f32 %f40, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f41, [%rd19+16];\n"
|
||||
" sub.f32 %f42, %f40, %f41;\n"
|
||||
" and.b32 %r28, %r26, 1073741823;\n"
|
||||
" mov.s32 %r29, %r28;\n"
|
||||
" mov.s32 %r30, 0;\n"
|
||||
" mov.s32 %r31, 0;\n"
|
||||
" mov.s32 %r32, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
|
||||
" .loc 14 150 0\n"
|
||||
" mov.f32 %f47, %f43;\n"
|
||||
" mov.f32 %f48, %f44;\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" cvt.rzi.s32.f32 %r33, %f50;\n"
|
||||
" sub.f32 %f51, %f26, %f48;\n"
|
||||
" sub.f32 %f52, %f25, %f47;\n"
|
||||
" sub.f32 %f53, %f27, %f49;\n"
|
||||
" mul.f32 %f54, %f51, %f51;\n"
|
||||
" mad.f32 %f55, %f52, %f52, %f54;\n"
|
||||
" mad.f32 %f56, %f53, %f53, %f55;\n"
|
||||
" add.s32 %r34, %r33, %r25;\n"
|
||||
" cvt.u64.s32 %rd20, %r34;\n"
|
||||
" mul.lo.u64 %rd21, %rd20, 16;\n"
|
||||
" add.u64 %rd22, %rd21, %rd14;\n"
|
||||
" ld.global.f32 %f57, [%rd22+8];\n"
|
||||
" setp.gt.f32 %p5, %f57, %f56;\n"
|
||||
" @!%p5 bra $Lt_0_14082;\n"
|
||||
" rcp.approx.f32 %f58, %f56;\n"
|
||||
" ld.global.f32 %f59, [%rd22+12];\n"
|
||||
" setp.lt.f32 %p6, %f56, %f59;\n"
|
||||
" @!%p6 bra $Lt_0_12290;\n"
|
||||
" .loc 14 165 0\n"
|
||||
" mul.f32 %f60, %f58, %f58;\n"
|
||||
" mul.f32 %f61, %f58, %f60;\n"
|
||||
" mov.f32 %f62, %f61;\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 166 0\n"
|
||||
" mul.f32 %f63, %f61, %f39;\n"
|
||||
" ld.global.v2.f32 {%f64,%f65}, [%rd22+0];\n"
|
||||
" mul.f32 %f66, %f64, %f61;\n"
|
||||
" sub.f32 %f67, %f66, %f65;\n"
|
||||
" mul.f32 %f68, %f63, %f67;\n"
|
||||
" bra.uni $Lt_0_12034;\n"
|
||||
"$Lt_0_12290:\n"
|
||||
" .loc 14 168 0\n"
|
||||
" mov.f32 %f68, 0f00000000; \n"
|
||||
"$Lt_0_12034:\n"
|
||||
" ld.param.f32 %f69, [__cudaparm_kernel_pair_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p7, %f69, %f56;\n"
|
||||
" @!%p7 bra $Lt_0_12802;\n"
|
||||
" .loc 14 175 0\n"
|
||||
" sqrt.approx.f32 %f70, %f56;\n"
|
||||
" ld.param.f32 %f71, [__cudaparm_kernel_pair_g_ewald];\n"
|
||||
" mul.f32 %f72, %f71, %f70;\n"
|
||||
" mul.f32 %f73, %f72, %f72;\n"
|
||||
" mov.f32 %f74, 0f3f800000; \n"
|
||||
" mov.f32 %f75, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f76, %f75, %f72, %f74;\n"
|
||||
" neg.f32 %f77, %f73;\n"
|
||||
" rcp.approx.f32 %f78, %f76;\n"
|
||||
" mov.f32 %f79, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f80, %f77, %f79;\n"
|
||||
" ex2.approx.f32 %f81, %f80;\n"
|
||||
" mov.f32 %f82, 0f3e827906; \n"
|
||||
" mov.f32 %f83, 0fbe91a98e; \n"
|
||||
" mov.f32 %f84, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f85, 0fbfba00e3; \n"
|
||||
" mov.f32 %f86, 0f3f87dc22; \n"
|
||||
" mad.f32 %f87, %f86, %f78, %f85;\n"
|
||||
" mad.f32 %f88, %f78, %f87, %f84;\n"
|
||||
" mad.f32 %f89, %f78, %f88, %f83;\n"
|
||||
" mad.f32 %f90, %f78, %f89, %f82;\n"
|
||||
" mul.f32 %f91, %f78, %f90;\n"
|
||||
" mul.f32 %f92, %f81, %f91;\n"
|
||||
" mov.f32 %f93, %f92;\n"
|
||||
" mov.s32 %r35, %r28;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" mov.s32 %r37, 0;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r35,%r36,%r37,%r38}];\n"
|
||||
" .loc 14 176 0\n"
|
||||
" mov.f32 %f98, %f94;\n"
|
||||
" ld.param.f32 %f99, [__cudaparm_kernel_pair_qqrd2e];\n"
|
||||
" mul.f32 %f100, %f99, %f33;\n"
|
||||
" mul.f32 %f101, %f100, %f98;\n"
|
||||
" div.approx.f32 %f102, %f101, %f70;\n"
|
||||
" mov.f32 %f103, %f102;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" mov.f32 %f104, 0f3f906ebb; \n"
|
||||
" mul.f32 %f105, %f72, %f104;\n"
|
||||
" mad.f32 %f106, %f81, %f105, %f92;\n"
|
||||
" sub.f32 %f107, %f106, %f42;\n"
|
||||
" mul.f32 %f108, %f102, %f107;\n"
|
||||
" bra.uni $Lt_0_12546;\n"
|
||||
"$Lt_0_12802:\n"
|
||||
" .loc 14 180 0\n"
|
||||
" mov.f32 %f103, 0f00000000; \n"
|
||||
" mov.f32 %f108, 0f00000000; \n"
|
||||
"$Lt_0_12546:\n"
|
||||
" .loc 14 185 0\n"
|
||||
" add.f32 %f109, %f108, %f68;\n"
|
||||
" mul.f32 %f110, %f109, %f58;\n"
|
||||
" mad.f32 %f36, %f52, %f110, %f36;\n"
|
||||
" .loc 14 186 0\n"
|
||||
" mad.f32 %f35, %f51, %f110, %f35;\n"
|
||||
" .loc 14 187 0\n"
|
||||
" mad.f32 %f34, %f53, %f110, %f34;\n"
|
||||
" @!%p3 bra $Lt_0_13570;\n"
|
||||
" .loc 14 190 0\n"
|
||||
" mov.f32 %f111, %f93;\n"
|
||||
" sub.f32 %f112, %f111, %f42;\n"
|
||||
" mad.f32 %f37, %f103, %f112, %f37;\n"
|
||||
" @!%p6 bra $Lt_0_13570;\n"
|
||||
" .loc 14 193 0\n"
|
||||
" ld.param.u64 %rd23, [__cudaparm_kernel_pair_lj3];\n"
|
||||
" add.u64 %rd24, %rd23, %rd21;\n"
|
||||
" mov.f32 %f113, %f62;\n"
|
||||
" ld.global.v4.f32 {%f114,%f115,%f116,_}, [%rd24+0];\n"
|
||||
" mul.f32 %f117, %f114, %f113;\n"
|
||||
" sub.f32 %f118, %f117, %f115;\n"
|
||||
" mul.f32 %f119, %f113, %f118;\n"
|
||||
" sub.f32 %f120, %f119, %f116;\n"
|
||||
" .loc 14 146 0\n"
|
||||
" ld.shared.f32 %f39, [%rd19+0];\n"
|
||||
" .loc 14 193 0\n"
|
||||
" mad.f32 %f38, %f39, %f120, %f38;\n"
|
||||
"$Lt_0_13570:\n"
|
||||
"$Lt_0_13058:\n"
|
||||
" @!%p4 bra $Lt_0_14082;\n"
|
||||
" .loc 14 197 0\n"
|
||||
" mov.f32 %f121, %f10;\n"
|
||||
" mul.f32 %f122, %f52, %f52;\n"
|
||||
" mad.f32 %f123, %f110, %f122, %f121;\n"
|
||||
" mov.f32 %f10, %f123;\n"
|
||||
" .loc 14 198 0\n"
|
||||
" mov.f32 %f124, %f12;\n"
|
||||
" mad.f32 %f125, %f110, %f54, %f124;\n"
|
||||
" mov.f32 %f12, %f125;\n"
|
||||
" .loc 14 199 0\n"
|
||||
" mov.f32 %f126, %f14;\n"
|
||||
" mul.f32 %f127, %f53, %f53;\n"
|
||||
" mad.f32 %f128, %f110, %f127, %f126;\n"
|
||||
" mov.f32 %f14, %f128;\n"
|
||||
" .loc 14 200 0\n"
|
||||
" mov.f32 %f129, %f16;\n"
|
||||
" mul.f32 %f130, %f51, %f52;\n"
|
||||
" mad.f32 %f131, %f110, %f130, %f129;\n"
|
||||
" mov.f32 %f16, %f131;\n"
|
||||
" .loc 14 201 0\n"
|
||||
" mov.f32 %f132, %f18;\n"
|
||||
" mul.f32 %f133, %f52, %f53;\n"
|
||||
" mad.f32 %f134, %f110, %f133, %f132;\n"
|
||||
" mov.f32 %f18, %f134;\n"
|
||||
" .loc 14 202 0\n"
|
||||
" mul.f32 %f135, %f51, %f53;\n"
|
||||
" mad.f32 %f19, %f110, %f135, %f19;\n"
|
||||
" mov.f32 %f136, %f19;\n"
|
||||
"$Lt_0_14082:\n"
|
||||
"$Lt_0_11522:\n"
|
||||
" .loc 14 142 0\n"
|
||||
" add.u64 %rd10, %rd7, %rd10;\n"
|
||||
" setp.gt.u64 %p8, %rd13, %rd10;\n"
|
||||
" @%p8 bra $Lt_0_11266;\n"
|
||||
" bra.uni $Lt_0_10754;\n"
|
||||
"$Lt_0_15874:\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" setp.gt.s32 %p3, %r20, %r39;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" setp.gt.s32 %p4, %r19, %r40;\n"
|
||||
"$Lt_0_10754:\n"
|
||||
" .loc 14 209 0\n"
|
||||
" ld.param.u64 %rd25, [__cudaparm_kernel_pair_engv];\n"
|
||||
" add.u64 %rd26, %rd25, %rd3;\n"
|
||||
" @!%p3 bra $Lt_0_14850;\n"
|
||||
" .loc 14 211 0\n"
|
||||
" st.global.f32 [%rd26+0], %f38;\n"
|
||||
" .loc 14 212 0\n"
|
||||
" cvt.u64.s32 %rd27, %r6;\n"
|
||||
" mul.lo.u64 %rd28, %rd27, 4;\n"
|
||||
" add.u64 %rd26, %rd28, %rd26;\n"
|
||||
" .loc 14 213 0\n"
|
||||
" st.global.f32 [%rd26+0], %f37;\n"
|
||||
" .loc 14 214 0\n"
|
||||
" add.u64 %rd26, %rd28, %rd26;\n"
|
||||
"$Lt_0_14850:\n"
|
||||
" @!%p4 bra $Lt_0_15362;\n"
|
||||
" .loc 14 218 0\n"
|
||||
" mov.f32 %f137, %f10;\n"
|
||||
" st.global.f32 [%rd26+0], %f137;\n"
|
||||
" .loc 14 219 0\n"
|
||||
" cvt.u64.s32 %rd29, %r6;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 218 0\n"
|
||||
" mov.f32 %f138, %f12;\n"
|
||||
" st.global.f32 [%rd26+0], %f138;\n"
|
||||
" .loc 14 219 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 218 0\n"
|
||||
" mov.f32 %f139, %f14;\n"
|
||||
" st.global.f32 [%rd26+0], %f139;\n"
|
||||
" .loc 14 219 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 218 0\n"
|
||||
" mov.f32 %f140, %f16;\n"
|
||||
" st.global.f32 [%rd26+0], %f140;\n"
|
||||
" .loc 14 219 0\n"
|
||||
" add.u64 %rd26, %rd30, %rd26;\n"
|
||||
" .loc 14 218 0\n"
|
||||
" mov.f32 %f141, %f18;\n"
|
||||
" st.global.f32 [%rd26+0], %f141;\n"
|
||||
" add.u64 %rd31, %rd30, %rd26;\n"
|
||||
" st.global.f32 [%rd31+0], %f19;\n"
|
||||
"$Lt_0_15362:\n"
|
||||
" .loc 14 222 0\n"
|
||||
" ld.param.u64 %rd32, [__cudaparm_kernel_pair_ans];\n"
|
||||
" mul.lo.u64 %rd33, %rd2, 16;\n"
|
||||
" add.u64 %rd34, %rd32, %rd33;\n"
|
||||
" mov.f32 %f142, %f143;\n"
|
||||
" st.global.v4.f32 [%rd34+0], {%f36,%f35,%f34,%f142};\n"
|
||||
"$Lt_0_10242:\n"
|
||||
" .loc 14 224 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair:\n"
|
||||
" }\n"
|
||||
" .entry kernel_pair_fast (\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_x_,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_ans,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_engv,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
|
||||
" .param .u64 __cudaparm_kernel_pair_fast_q_,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
|
||||
" .param .f32 __cudaparm_kernel_pair_fast_g_ewald)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<45>;\n"
|
||||
" .reg .u64 %rd<48>;\n"
|
||||
" .reg .f32 %f<148>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .shared .align 4 .b8 __cuda_sp_lj244[32];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj1288[1024];\n"
|
||||
" .shared .align 16 .b8 __cuda_lj31312[1024];\n"
|
||||
" .loc 14 233 0\n"
|
||||
"$LBB1_kernel_pair_fast:\n"
|
||||
" cvt.s32.u16 %r1, %tid.x;\n"
|
||||
" mov.u32 %r2, 7;\n"
|
||||
" setp.gt.s32 %p1, %r1, %r2;\n"
|
||||
" @%p1 bra $Lt_1_12546;\n"
|
||||
" .loc 14 240 0\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" cvt.u64.s32 %rd2, %r1;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" ld.param.u64 %rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
|
||||
" add.u64 %rd5, %rd4, %rd3;\n"
|
||||
" ld.global.f32 %f1, [%rd5+0];\n"
|
||||
" add.u64 %rd6, %rd3, %rd1;\n"
|
||||
" st.shared.f32 [%rd6+0], %f1;\n"
|
||||
"$Lt_1_12546:\n"
|
||||
" mov.u64 %rd1, __cuda_sp_lj244;\n"
|
||||
" mov.u32 %r3, 63;\n"
|
||||
" setp.gt.s32 %p2, %r1, %r3;\n"
|
||||
" @%p2 bra $Lt_1_13058;\n"
|
||||
" .loc 14 242 0\n"
|
||||
" mov.u64 %rd7, __cuda_lj1288;\n"
|
||||
" cvt.u64.s32 %rd8, %r1;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 16;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
|
||||
" add.u64 %rd11, %rd10, %rd9;\n"
|
||||
" add.u64 %rd12, %rd9, %rd7;\n"
|
||||
" ld.global.v4.f32 {%f2,%f3,%f4,%f5}, [%rd11+0];\n"
|
||||
" st.shared.f32 [%rd12+0], %f2;\n"
|
||||
" st.shared.f32 [%rd12+4], %f3;\n"
|
||||
" st.shared.f32 [%rd12+8], %f4;\n"
|
||||
" st.shared.f32 [%rd12+12], %f5;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" mov.u32 %r5, 0;\n"
|
||||
" setp.le.s32 %p3, %r4, %r5;\n"
|
||||
" @%p3 bra $Lt_1_13570;\n"
|
||||
" .loc 14 244 0\n"
|
||||
" mov.u64 %rd13, __cuda_lj31312;\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
|
||||
" add.u64 %rd15, %rd14, %rd9;\n"
|
||||
" add.u64 %rd16, %rd9, %rd13;\n"
|
||||
" ld.global.v4.f32 {%f6,%f7,%f8,%f9}, [%rd15+0];\n"
|
||||
" st.shared.f32 [%rd16+0], %f6;\n"
|
||||
" st.shared.f32 [%rd16+4], %f7;\n"
|
||||
" st.shared.f32 [%rd16+8], %f8;\n"
|
||||
" st.shared.f32 [%rd16+12], %f9;\n"
|
||||
"$Lt_1_13570:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31312;\n"
|
||||
"$Lt_1_13058:\n"
|
||||
" mov.u64 %rd13, __cuda_lj31312;\n"
|
||||
" mov.u64 %rd7, __cuda_lj1288;\n"
|
||||
" .loc 14 247 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" cvt.s32.u16 %r6, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r7, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r8, %r6, %r7;\n"
|
||||
" add.s32 %r9, %r8, %r1;\n"
|
||||
" ld.param.s32 %r10, [__cudaparm_kernel_pair_fast_inum];\n"
|
||||
" setp.ge.s32 %p4, %r9, %r10;\n"
|
||||
" @%p4 bra $Lt_1_14082;\n"
|
||||
" .loc 14 259 0\n"
|
||||
" mov.f32 %f10, 0f00000000; \n"
|
||||
" mov.f32 %f11, %f10;\n"
|
||||
" mov.f32 %f12, 0f00000000; \n"
|
||||
" mov.f32 %f13, %f12;\n"
|
||||
" mov.f32 %f14, 0f00000000; \n"
|
||||
" mov.f32 %f15, %f14;\n"
|
||||
" mov.f32 %f16, 0f00000000; \n"
|
||||
" mov.f32 %f17, %f16;\n"
|
||||
" mov.f32 %f18, 0f00000000; \n"
|
||||
" mov.f32 %f19, %f18;\n"
|
||||
" mov.f32 %f20, 0f00000000; \n"
|
||||
" mov.f32 %f21, %f20;\n"
|
||||
" .loc 14 262 0\n"
|
||||
" cvt.u64.s32 %rd17, %r9;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
|
||||
" add.u64 %rd20, %rd19, %rd18;\n"
|
||||
" ld.global.s32 %r11, [%rd20+0];\n"
|
||||
" .loc 14 264 0\n"
|
||||
" ld.param.s32 %r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
|
||||
" cvt.u64.s32 %rd21, %r12;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r13, [%rd23+0];\n"
|
||||
" .loc 14 265 0\n"
|
||||
" add.u64 %rd24, %rd23, %rd22;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
" mov.s32 %r14, %r11;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" mov.s32 %r16, 0;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
|
||||
" .loc 14 268 0\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" mov.f32 %f27, %f23;\n"
|
||||
" mov.f32 %f28, %f24;\n"
|
||||
" mov.f32 %f29, %f25;\n"
|
||||
" mov.s32 %r18, %r11;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" mov.s32 %r20, 0;\n"
|
||||
" mov.s32 %r21, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r18,%r19,%r20,%r21}];\n"
|
||||
" .loc 14 269 0\n"
|
||||
" mov.f32 %f34, %f30;\n"
|
||||
" mul24.lo.s32 %r22, %r13, %r12;\n"
|
||||
" cvt.s64.s32 %rd26, %r22;\n"
|
||||
" mul.lo.u64 %rd27, %rd26, 4;\n"
|
||||
" add.u64 %rd28, %rd24, %rd27;\n"
|
||||
" ld.param.s32 %r23, [__cudaparm_kernel_pair_fast_vflag];\n"
|
||||
" ld.param.s32 %r24, [__cudaparm_kernel_pair_fast_eflag];\n"
|
||||
" setp.ge.u64 %p5, %rd24, %rd28;\n"
|
||||
" mov.f32 %f35, 0f00000000; \n"
|
||||
" mov.f32 %f36, 0f00000000; \n"
|
||||
" mov.f32 %f37, 0f00000000; \n"
|
||||
" mov.f32 %f38, 0f00000000; \n"
|
||||
" mov.f32 %f39, 0f00000000; \n"
|
||||
" @%p5 bra $Lt_1_19714;\n"
|
||||
" mov.s32 %r25, 0;\n"
|
||||
" setp.gt.s32 %p6, %r24, %r25;\n"
|
||||
" mov.s32 %r26, 0;\n"
|
||||
" setp.gt.s32 %p7, %r23, %r26;\n"
|
||||
" cvt.rzi.s32.f32 %r27, %f29;\n"
|
||||
" mov.s32 %r28, 8;\n"
|
||||
" mul24.lo.s32 %r29, %r28, %r27;\n"
|
||||
" cvt.rn.f32.s32 %f40, %r29;\n"
|
||||
"$Lt_1_15106:\n"
|
||||
" .loc 14 274 0\n"
|
||||
" ld.global.s32 %r30, [%rd25+0];\n"
|
||||
" .loc 14 277 0\n"
|
||||
" shr.s32 %r31, %r30, 30;\n"
|
||||
" cvt.s64.s32 %rd29, %r31;\n"
|
||||
" and.b64 %rd30, %rd29, 3;\n"
|
||||
" mul.lo.u64 %rd31, %rd30, 4;\n"
|
||||
" add.u64 %rd32, %rd1, %rd31;\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 278 0\n"
|
||||
" mov.f32 %f42, 0f3f800000; \n"
|
||||
" ld.shared.f32 %f43, [%rd32+16];\n"
|
||||
" sub.f32 %f44, %f42, %f43;\n"
|
||||
" and.b32 %r32, %r30, 1073741823;\n"
|
||||
" mov.s32 %r33, %r32;\n"
|
||||
" mov.s32 %r34, 0;\n"
|
||||
" mov.s32 %r35, 0;\n"
|
||||
" mov.s32 %r36, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r33,%r34,%r35,%r36}];\n"
|
||||
" .loc 14 281 0\n"
|
||||
" mov.f32 %f49, %f45;\n"
|
||||
" mov.f32 %f50, %f46;\n"
|
||||
" mov.f32 %f51, %f47;\n"
|
||||
" mov.f32 %f52, %f48;\n"
|
||||
" sub.f32 %f53, %f27, %f50;\n"
|
||||
" sub.f32 %f54, %f26, %f49;\n"
|
||||
" sub.f32 %f55, %f28, %f51;\n"
|
||||
" mul.f32 %f56, %f53, %f53;\n"
|
||||
" mad.f32 %f57, %f54, %f54, %f56;\n"
|
||||
" mad.f32 %f58, %f55, %f55, %f57;\n"
|
||||
" add.f32 %f59, %f40, %f52;\n"
|
||||
" cvt.rzi.s32.f32 %r37, %f59;\n"
|
||||
" cvt.u64.s32 %rd33, %r37;\n"
|
||||
" mul.lo.u64 %rd34, %rd33, 16;\n"
|
||||
" add.u64 %rd35, %rd34, %rd7;\n"
|
||||
" ld.shared.f32 %f60, [%rd35+8];\n"
|
||||
" setp.gt.f32 %p8, %f60, %f58;\n"
|
||||
" @!%p8 bra $Lt_1_17922;\n"
|
||||
" rcp.approx.f32 %f61, %f58;\n"
|
||||
" ld.shared.f32 %f62, [%rd35+12];\n"
|
||||
" setp.lt.f32 %p9, %f58, %f62;\n"
|
||||
" @!%p9 bra $Lt_1_16130;\n"
|
||||
" .loc 14 295 0\n"
|
||||
" mul.f32 %f63, %f61, %f61;\n"
|
||||
" mul.f32 %f64, %f61, %f63;\n"
|
||||
" mov.f32 %f65, %f64;\n"
|
||||
" .loc 14 277 0\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 296 0\n"
|
||||
" mul.f32 %f66, %f64, %f41;\n"
|
||||
" ld.shared.f32 %f67, [%rd35+4];\n"
|
||||
" ld.shared.f32 %f68, [%rd35+0];\n"
|
||||
" mul.f32 %f69, %f68, %f64;\n"
|
||||
" sub.f32 %f70, %f69, %f67;\n"
|
||||
" mul.f32 %f71, %f66, %f70;\n"
|
||||
" bra.uni $Lt_1_15874;\n"
|
||||
"$Lt_1_16130:\n"
|
||||
" .loc 14 298 0\n"
|
||||
" mov.f32 %f71, 0f00000000; \n"
|
||||
"$Lt_1_15874:\n"
|
||||
" ld.param.f32 %f72, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
|
||||
" setp.gt.f32 %p10, %f72, %f58;\n"
|
||||
" @!%p10 bra $Lt_1_16642;\n"
|
||||
" .loc 14 305 0\n"
|
||||
" sqrt.approx.f32 %f73, %f58;\n"
|
||||
" ld.param.f32 %f74, [__cudaparm_kernel_pair_fast_g_ewald];\n"
|
||||
" mul.f32 %f75, %f74, %f73;\n"
|
||||
" mul.f32 %f76, %f75, %f75;\n"
|
||||
" mov.f32 %f77, 0f3f800000; \n"
|
||||
" mov.f32 %f78, 0f3ea7ba05; \n"
|
||||
" mad.f32 %f79, %f78, %f75, %f77;\n"
|
||||
" neg.f32 %f80, %f76;\n"
|
||||
" rcp.approx.f32 %f81, %f79;\n"
|
||||
" mov.f32 %f82, 0f3fb8aa3b; \n"
|
||||
" mul.f32 %f83, %f80, %f82;\n"
|
||||
" ex2.approx.f32 %f84, %f83;\n"
|
||||
" mov.f32 %f85, 0f3e827906; \n"
|
||||
" mov.f32 %f86, 0fbe91a98e; \n"
|
||||
" mov.f32 %f87, 0f3fb5f0e3; \n"
|
||||
" mov.f32 %f88, 0fbfba00e3; \n"
|
||||
" mov.f32 %f89, 0f3f87dc22; \n"
|
||||
" mad.f32 %f90, %f89, %f81, %f88;\n"
|
||||
" mad.f32 %f91, %f81, %f90, %f87;\n"
|
||||
" mad.f32 %f92, %f81, %f91, %f86;\n"
|
||||
" mad.f32 %f93, %f81, %f92, %f85;\n"
|
||||
" mul.f32 %f94, %f81, %f93;\n"
|
||||
" mul.f32 %f95, %f84, %f94;\n"
|
||||
" mov.f32 %f96, %f95;\n"
|
||||
" mov.s32 %r38, %r32;\n"
|
||||
" mov.s32 %r39, 0;\n"
|
||||
" mov.s32 %r40, 0;\n"
|
||||
" mov.s32 %r41, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r38,%r39,%r40,%r41}];\n"
|
||||
" .loc 14 306 0\n"
|
||||
" mov.f32 %f101, %f97;\n"
|
||||
" ld.param.f32 %f102, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
|
||||
" mul.f32 %f103, %f102, %f34;\n"
|
||||
" mul.f32 %f104, %f103, %f101;\n"
|
||||
" div.approx.f32 %f105, %f104, %f73;\n"
|
||||
" mov.f32 %f106, %f105;\n"
|
||||
" .loc 14 307 0\n"
|
||||
" mov.f32 %f107, 0f3f906ebb; \n"
|
||||
" mul.f32 %f108, %f75, %f107;\n"
|
||||
" mad.f32 %f109, %f84, %f108, %f95;\n"
|
||||
" sub.f32 %f110, %f109, %f44;\n"
|
||||
" mul.f32 %f111, %f105, %f110;\n"
|
||||
" bra.uni $Lt_1_16386;\n"
|
||||
"$Lt_1_16642:\n"
|
||||
" .loc 14 310 0\n"
|
||||
" mov.f32 %f106, 0f00000000; \n"
|
||||
" mov.f32 %f111, 0f00000000; \n"
|
||||
"$Lt_1_16386:\n"
|
||||
" .loc 14 315 0\n"
|
||||
" add.f32 %f112, %f111, %f71;\n"
|
||||
" mul.f32 %f113, %f112, %f61;\n"
|
||||
" mad.f32 %f37, %f54, %f113, %f37;\n"
|
||||
" .loc 14 316 0\n"
|
||||
" mad.f32 %f36, %f53, %f113, %f36;\n"
|
||||
" .loc 14 317 0\n"
|
||||
" mad.f32 %f35, %f55, %f113, %f35;\n"
|
||||
" @!%p6 bra $Lt_1_17410;\n"
|
||||
" .loc 14 320 0\n"
|
||||
" mov.f32 %f114, %f96;\n"
|
||||
" sub.f32 %f115, %f114, %f44;\n"
|
||||
" mad.f32 %f38, %f106, %f115, %f38;\n"
|
||||
" @!%p9 bra $Lt_1_17410;\n"
|
||||
" .loc 14 322 0\n"
|
||||
" add.u64 %rd36, %rd34, %rd13;\n"
|
||||
" mov.f32 %f116, %f65;\n"
|
||||
" ld.shared.f32 %f117, [%rd36+4];\n"
|
||||
" ld.shared.f32 %f118, [%rd36+0];\n"
|
||||
" mul.f32 %f119, %f118, %f116;\n"
|
||||
" sub.f32 %f120, %f119, %f117;\n"
|
||||
" mul.f32 %f121, %f116, %f120;\n"
|
||||
" .loc 14 323 0\n"
|
||||
" ld.shared.f32 %f122, [%rd36+8];\n"
|
||||
" sub.f32 %f123, %f121, %f122;\n"
|
||||
" .loc 14 277 0\n"
|
||||
" ld.shared.f32 %f41, [%rd32+0];\n"
|
||||
" .loc 14 323 0\n"
|
||||
" mad.f32 %f39, %f41, %f123, %f39;\n"
|
||||
"$Lt_1_17410:\n"
|
||||
"$Lt_1_16898:\n"
|
||||
" @!%p7 bra $Lt_1_17922;\n"
|
||||
" .loc 14 327 0\n"
|
||||
" mov.f32 %f124, %f11;\n"
|
||||
" mul.f32 %f125, %f54, %f54;\n"
|
||||
" mad.f32 %f126, %f113, %f125, %f124;\n"
|
||||
" mov.f32 %f11, %f126;\n"
|
||||
" .loc 14 328 0\n"
|
||||
" mov.f32 %f127, %f13;\n"
|
||||
" mad.f32 %f128, %f113, %f56, %f127;\n"
|
||||
" mov.f32 %f13, %f128;\n"
|
||||
" .loc 14 329 0\n"
|
||||
" mov.f32 %f129, %f15;\n"
|
||||
" mul.f32 %f130, %f55, %f55;\n"
|
||||
" mad.f32 %f131, %f113, %f130, %f129;\n"
|
||||
" mov.f32 %f15, %f131;\n"
|
||||
" .loc 14 330 0\n"
|
||||
" mov.f32 %f132, %f17;\n"
|
||||
" mul.f32 %f133, %f53, %f54;\n"
|
||||
" mad.f32 %f134, %f113, %f133, %f132;\n"
|
||||
" mov.f32 %f17, %f134;\n"
|
||||
" .loc 14 331 0\n"
|
||||
" mov.f32 %f135, %f19;\n"
|
||||
" mul.f32 %f136, %f54, %f55;\n"
|
||||
" mad.f32 %f137, %f113, %f136, %f135;\n"
|
||||
" mov.f32 %f19, %f137;\n"
|
||||
" .loc 14 332 0\n"
|
||||
" mul.f32 %f138, %f53, %f55;\n"
|
||||
" mad.f32 %f20, %f113, %f138, %f20;\n"
|
||||
" mov.f32 %f139, %f20;\n"
|
||||
"$Lt_1_17922:\n"
|
||||
"$Lt_1_15362:\n"
|
||||
" .loc 14 273 0\n"
|
||||
" add.u64 %rd25, %rd22, %rd25;\n"
|
||||
" setp.gt.u64 %p11, %rd28, %rd25;\n"
|
||||
" @%p11 bra $Lt_1_15106;\n"
|
||||
" bra.uni $Lt_1_14594;\n"
|
||||
"$Lt_1_19714:\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" setp.gt.s32 %p6, %r24, %r42;\n"
|
||||
" mov.s32 %r43, 0;\n"
|
||||
" setp.gt.s32 %p7, %r23, %r43;\n"
|
||||
"$Lt_1_14594:\n"
|
||||
" .loc 14 339 0\n"
|
||||
" ld.param.u64 %rd37, [__cudaparm_kernel_pair_fast_engv];\n"
|
||||
" add.u64 %rd38, %rd37, %rd18;\n"
|
||||
" @!%p6 bra $Lt_1_18690;\n"
|
||||
" .loc 14 341 0\n"
|
||||
" st.global.f32 [%rd38+0], %f39;\n"
|
||||
" .loc 14 342 0\n"
|
||||
" cvt.u64.s32 %rd39, %r10;\n"
|
||||
" mul.lo.u64 %rd40, %rd39, 4;\n"
|
||||
" add.u64 %rd38, %rd40, %rd38;\n"
|
||||
" .loc 14 343 0\n"
|
||||
" st.global.f32 [%rd38+0], %f38;\n"
|
||||
" .loc 14 344 0\n"
|
||||
" add.u64 %rd38, %rd40, %rd38;\n"
|
||||
"$Lt_1_18690:\n"
|
||||
" @!%p7 bra $Lt_1_19202;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f140, %f11;\n"
|
||||
" st.global.f32 [%rd38+0], %f140;\n"
|
||||
" .loc 14 349 0\n"
|
||||
" cvt.u64.s32 %rd41, %r10;\n"
|
||||
" mul.lo.u64 %rd42, %rd41, 4;\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f141, %f13;\n"
|
||||
" st.global.f32 [%rd38+0], %f141;\n"
|
||||
" .loc 14 349 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f142, %f15;\n"
|
||||
" st.global.f32 [%rd38+0], %f142;\n"
|
||||
" .loc 14 349 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f143, %f17;\n"
|
||||
" st.global.f32 [%rd38+0], %f143;\n"
|
||||
" .loc 14 349 0\n"
|
||||
" add.u64 %rd38, %rd42, %rd38;\n"
|
||||
" .loc 14 348 0\n"
|
||||
" mov.f32 %f144, %f19;\n"
|
||||
" st.global.f32 [%rd38+0], %f144;\n"
|
||||
" add.u64 %rd43, %rd42, %rd38;\n"
|
||||
" st.global.f32 [%rd43+0], %f20;\n"
|
||||
"$Lt_1_19202:\n"
|
||||
" .loc 14 352 0\n"
|
||||
" ld.param.u64 %rd44, [__cudaparm_kernel_pair_fast_ans];\n"
|
||||
" mul.lo.u64 %rd45, %rd17, 16;\n"
|
||||
" add.u64 %rd46, %rd44, %rd45;\n"
|
||||
" mov.f32 %f145, %f146;\n"
|
||||
" st.global.v4.f32 [%rd46+0], {%f37,%f36,%f35,%f145};\n"
|
||||
"$Lt_1_14082:\n"
|
||||
" .loc 14 354 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_pair_fast:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,122 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <math.h>
|
||||
|
||||
#include "morse_gpu_memory.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// File-scope instance used by every mor_gpu_* entry point in this file
// (mor_gpu_init, mor_gpu_clear, mor_gpu_compute_n, ...). `static` gives it
// internal linkage, so it is only reachable through those wrappers.
static MOR_GPU_Memory<PRECISION,ACC_PRECISION> MORMF;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
int mor_gpu_init(const int ntypes, double **cutsq,
                 double **host_lj1, double **host_lj2, double **host_lj3,
                 double **host_lj4, double **offset, double *special_lj,
                 const int inum, const int nall, const int max_nbors,
                 const int maxspecial, const double cell_size, int &gpu_mode,
                 FILE *screen) {
  // Start from a clean state, then query the device layout.
  MORMF.clear();
  gpu_mode=MORMF.device->gpu_mode();
  const double gpu_split=MORMF.device->particle_split();
  const int first_gpu=MORMF.device->first_device();
  const int last_gpu=MORMF.device->last_device();
  const int world_me=MORMF.device->world_me();
  const int gpu_rank=MORMF.device->gpu_rank();
  const int procs_per_gpu=MORMF.device->procs_per_gpu();

  MORMF.device->init_message(screen,"morse",first_gpu,last_gpu);

  // Progress text is only written by replica 0 and only if a screen exists.
  const bool message=(MORMF.device->replica_me()==0) && (screen!=NULL);

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  // NOTE(review): the max_nbors argument is not forwarded; a fixed value of
  // 300 is passed to init() instead. Kept as-is - confirm this is intended.
  int init_ok=0;

  // Phase 1: world rank 0 initializes on its own.
  if (world_me==0) {
    init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                       host_lj4, offset, special_lj, inum, nall, 300,
                       maxspecial, cell_size, gpu_split, screen);
  }

  MORMF.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  // Phase 2: the remaining ranks initialize one gpu_rank slot at a time,
  // with a barrier after every slot.
  for (int slot=0; slot<procs_per_gpu; ++slot) {
    if (message) {
      if (first_gpu==last_gpu)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,slot);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,slot);
      fflush(screen);
    }

    const bool my_turn=(gpu_rank==slot) && (world_me!=0);
    if (my_turn) {
      init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
                         offset, special_lj, inum, nall, 300, maxspecial,
                         cell_size, gpu_split, screen);
    }

    MORMF.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }

  if (message)
    fprintf(screen,"\n");

  // The overhead estimate is only taken after a successful initialization.
  if (init_ok==0)
    MORMF.estimate_gpu_overhead();
  return init_ok;
}
|
||||
|
||||
// Release everything held by the file-local Morse GPU object.
void mor_gpu_clear()
{
  MORMF.clear();
}
|
||||
|
||||
int** mor_gpu_compute_n(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, int *tag, int **nspecial,
|
||||
int **special, const bool eflag, const bool vflag,
|
||||
const bool eatom, const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum, const double cpu_time,
|
||||
bool &success) {
|
||||
return MORMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
|
||||
subhi, tag, nspecial, special, eflag, vflag, eatom,
|
||||
vatom, host_start, ilist, jnum, cpu_time, success);
|
||||
}
|
||||
|
||||
// Forwards all arguments unchanged to the MORMF.compute() overload that takes
// a caller-supplied neighbor list (ilist/numj/firstneigh).
void mor_gpu_compute(const int ago, const int inum_full, const int nall,
                     double **host_x, int *host_type, int *ilist, int *numj,
                     int **firstneigh, const bool eflag, const bool vflag,
                     const bool eatom, const bool vatom, int &host_start,
                     const double cpu_time, bool &success)
{
  MORMF.compute(ago, inum_full, nall, host_x, host_type, ilist, numj,
                firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time,
                success);
}
|
||||
|
||||
double mor_gpu_bytes() {
|
||||
return MORMF.host_memory_usage();
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,389 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef MORSE_GPU_KERNEL
|
||||
#define MORSE_GPU_KERNEL
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp2 double2
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifdef _SINGLE_DOUBLE
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#define numtyp2 float2
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
texture<float4> pos_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
// Double-precision variant: element i is read directly from the array
// passed in as pos (pos_tex is declared as texture<float4> and is not used).
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  const double4 entry=pos[i];
  return entry;
}
|
||||
#else
|
||||
// Single-precision variant: element i is fetched through pos_tex; the pos
// argument is accepted for signature parity but not dereferenced.
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  const float4 entry=tex1Dfetch(pos_tex, i);
  return entry;
}
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define BLOCK_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#endif
|
||||
|
||||
#define SBBITS 30
#define NEIGHMASK 0x3FFFFFFF
// Extract the two bits stored above bit SBBITS of a neighbor entry; the
// kernels use the result (0..3) as an index into sp_lj.
__inline int sbmask(int j) {
  const int upper=j >> SBBITS;
  return upper & 3;
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Morse pair kernel reading per-type coefficients from global memory.
//
// Work layout: each atom is handled by t_per_atom consecutive threads, so a
// block covers BLOCK_SIZE_X/t_per_atom atoms. Thread "offset" within an atom
// visits neighbors offset, offset+t_per_atom, ... and the partial sums are
// combined through shared memory.
//
// Preconditions visible from the code:
//   - the block size must not exceed BLOCK_PAIR (red_acc is indexed by tid)
//   - t_per_atom must be a power of two, otherwise the halving reduction
//     drops partial sums
//
//  x_         positions; the .w component is converted to the integer type
//  mor1       .x = cutsq, .y = morse1, .z = r0, .w = alpha, index
//             itype*lj_types+jtype
//  mor2       .x = d0, .y = offset (same indexing as mor1)
//  sp_lj_in   four scaling factors selected by sbmask() of the neighbor entry
//  dev_nbor   row 0: atom index, row 1: neighbor count, then either the
//             neighbors themselves (dev_nbor==dev_packed, row stride
//             nbor_pitch) or an offset into dev_packed
//  ans        per-atom force output (x,y,z)
//  engv       per-atom energy (if eflag>0) followed by six virial terms
//             (if vflag>0), each field separated by a stride of inum
//  nall       not used by this kernel
// ---------------------------------------------------------------------------
__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
                          __global numtyp2* mor2, const int lj_types,
                          __global numtyp *sp_lj_in, __global int *dev_nbor,
                          __global int *dev_packed, __global acctyp4 *ans,
                          __global acctyp *engv, const int eflag,
                          const int vflag, const int inum, const int nall,
                          const int nbor_pitch, const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  // Every thread stores the same four values before reading them back, so
  // no barrier is needed for sp_lj.
  __local numtyp sp_lj[4];
  sp_lj[0]=sp_lj_in[0];
  sp_lj[1]=sp_lj_in[1];
  sp_lj[2]=sp_lj_in[2];
  sp_lj[3]=sp_lj_in[3];

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      // Neighbors are stored in the same matrix, one row per neighbor
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      // Neighbors are stored contiguously in dev_packed
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int itype=ix.w;

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int jtype=jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r = delx*delx+dely*dely+delz*delz;

      int mtype=itype*lj_types+jtype;
      if (r<mor1[mtype].x) {
        r=sqrt(r);
        numtyp dexp=r-mor1[mtype].z;
        dexp=exp(-mor1[mtype].w*dexp);
        numtyp dm=dexp*dexp-dexp;
        numtyp force = mor1[mtype].y*dm/r*factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          // dm-dexp == dexp*dexp-2*dexp; written without a double literal so
          // single-precision builds are not promoted to double, and so the
          // expression matches kernel_pair_fast.
          numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
          energy+=e*factor_lj;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
  } // if ii

  // Reduce answers
  // All threads of the block reach this point (there is no early return) and
  // t_per_atom / vflag are uniform kernel arguments, so the barriers below
  // are executed by every thread. They order each reduction step after the
  // writes of the previous one instead of relying on implicit warp lockstep.
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    __syncthreads();

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
      __syncthreads();
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      // The barrier closing the loop above guarantees nobody is still
      // reading the force/energy slots that are overwritten here.
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];
      __syncthreads();

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
        __syncthreads();
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers (only the first thread of each atom holds the full sums)
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Morse pair kernel that first copies the per-type coefficient tables into
// shared memory. The tables hold MAX_SHARED_TYPES*MAX_SHARED_TYPES entries
// and are indexed by MAX_SHARED_TYPES*itype+jtype.
//
// Work layout: each atom is handled by t_per_atom consecutive threads, so a
// block covers BLOCK_SIZE_X/t_per_atom atoms; partial sums are combined
// through shared memory.
//
// Preconditions visible from the code:
//   - the block size must be at least 4 (sp_lj load) and must not exceed
//     BLOCK_PAIR (red_acc is indexed by tid)
//   - t_per_atom must be a power of two, otherwise the halving reduction
//     drops partial sums
//   - mor1_in (and mor2_in when eflag>0) must hold at least
//     MAX_SHARED_TYPES*MAX_SHARED_TYPES entries
//
//  mor1_in    .x = cutsq, .y = morse1, .z = r0, .w = alpha
//  mor2_in    .x = d0, .y = offset (only loaded when eflag>0)
//  sp_lj_in   four scaling factors selected by sbmask() of the neighbor entry
//  dev_nbor   row 0: atom index, row 1: neighbor count, then either the
//             neighbors themselves (dev_nbor==dev_packed, row stride
//             nbor_pitch) or an offset into dev_packed
//  ans        per-atom force output (x,y,z)
//  engv       per-atom energy (if eflag>0) followed by six virial terms
//             (if vflag>0), each field separated by a stride of inum
//  nall       not used by this kernel
// ---------------------------------------------------------------------------
__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
                               __global numtyp2* mor2_in,
                               __global numtyp* sp_lj_in,
                               __global int *dev_nbor, __global int *dev_packed,
                               __global acctyp4 *ans, __global acctyp *engv,
                               const int eflag, const int vflag, const int inum,
                               const int nall, const int nbor_pitch,
                               const int t_per_atom) {
  int tid=THREAD_ID_X;
  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
  ii+=tid/t_per_atom;
  int offset=tid%t_per_atom;

  __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
  __local numtyp sp_lj[4];
  if (tid<4)
    sp_lj[tid]=sp_lj_in[tid];
  // Block-strided copy: the whole table is loaded even when the block has
  // fewer threads than table entries (a plain "tid < size" guard would leave
  // the tail of the table unset in that case).
  for (int k=tid; k<MAX_SHARED_TYPES*MAX_SHARED_TYPES;
       k+=(int)(BLOCK_SIZE_X)) {
    mor1[k]=mor1_in[k];
    if (eflag>0)
      mor2[k]=mor2_in[k];
  }

  acctyp energy=(acctyp)0;
  acctyp4 f;
  f.x=(acctyp)0;
  f.y=(acctyp)0;
  f.z=(acctyp)0;
  acctyp virial[6];
  for (int i=0; i<6; i++)
    virial[i]=(acctyp)0;

  // Make the shared tables visible to every thread before they are read
  __syncthreads();

  if (ii<inum) {
    __global int *nbor=dev_nbor+ii;
    int i=*nbor;
    nbor+=nbor_pitch;
    int numj=*nbor;
    nbor+=nbor_pitch;

    int n_stride;
    __global int *list_end;
    if (dev_nbor==dev_packed) {
      // Neighbors are stored in the same matrix, one row per neighbor
      list_end=nbor+mul24(numj,nbor_pitch);
      nbor+=mul24(offset,nbor_pitch);
      n_stride=mul24(t_per_atom,nbor_pitch);
    } else {
      // Neighbors are stored contiguously in dev_packed
      nbor=dev_packed+*nbor;
      list_end=nbor+numj;
      n_stride=t_per_atom;
      nbor+=offset;
    }

    numtyp4 ix=fetch_pos(i,x_); //x_[i];
    int iw=ix.w;
    int itype=mul24((int)MAX_SHARED_TYPES,iw);

    numtyp factor_lj;
    for ( ; nbor<list_end; nbor+=n_stride) {

      int j=*nbor;
      factor_lj = sp_lj[sbmask(j)];
      j &= NEIGHMASK;

      numtyp4 jx=fetch_pos(j,x_); //x_[j];
      int mtype=itype+jx.w;

      // Compute r12
      numtyp delx = ix.x-jx.x;
      numtyp dely = ix.y-jx.y;
      numtyp delz = ix.z-jx.z;
      numtyp r = delx*delx+dely*dely+delz*delz;

      if (r<mor1[mtype].x) {
        r=sqrt(r);
        numtyp dexp=r-mor1[mtype].z;
        dexp=exp(-mor1[mtype].w*dexp);
        numtyp dm=dexp*dexp-dexp;
        numtyp force = mor1[mtype].y*dm/r*factor_lj;

        f.x+=delx*force;
        f.y+=dely*force;
        f.z+=delz*force;

        if (eflag>0) {
          numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
          energy+=e*factor_lj;
        }
        if (vflag>0) {
          virial[0] += delx*delx*force;
          virial[1] += dely*dely*force;
          virial[2] += delz*delz*force;
          virial[3] += delx*dely*force;
          virial[4] += delx*delz*force;
          virial[5] += dely*delz*force;
        }
      }

    } // for nbor
  } // if ii

  // Reduce answers
  // All threads of the block reach this point (there is no early return) and
  // t_per_atom / vflag are uniform kernel arguments, so the barriers below
  // are executed by every thread. They order each reduction step after the
  // writes of the previous one instead of relying on implicit warp lockstep.
  if (t_per_atom>1) {
    __local acctyp red_acc[6][BLOCK_PAIR];

    red_acc[0][tid]=f.x;
    red_acc[1][tid]=f.y;
    red_acc[2][tid]=f.z;
    red_acc[3][tid]=energy;
    __syncthreads();

    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
      if (offset < s) {
        for (int r=0; r<4; r++)
          red_acc[r][tid] += red_acc[r][tid+s];
      }
      __syncthreads();
    }

    f.x=red_acc[0][tid];
    f.y=red_acc[1][tid];
    f.z=red_acc[2][tid];
    energy=red_acc[3][tid];

    if (vflag>0) {
      // The barrier closing the loop above guarantees nobody is still
      // reading the force/energy slots that are overwritten here.
      for (int r=0; r<6; r++)
        red_acc[r][tid]=virial[r];
      __syncthreads();

      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
        if (offset < s) {
          for (int r=0; r<6; r++)
            red_acc[r][tid] += red_acc[r][tid+s];
        }
        __syncthreads();
      }

      for (int r=0; r<6; r++)
        virial[r]=red_acc[r][tid];
    }
  }

  // Store answers (only the first thread of each atom holds the full sums)
  if (ii<inum && offset==0) {
    __global acctyp *ap1=engv+ii;
    if (eflag>0) {
      *ap1=energy;
      ap1+=inum;
    }
    if (vflag>0) {
      for (int i=0; i<6; i++) {
        *ap1=virial[i];
        ap1+=inum;
      }
    }
    ans[ii]=f;
  } // if ii
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "morse_gpu_cl.h"
|
||||
#else
|
||||
#include "morse_gpu_ptx.h"
|
||||
#endif
|
||||
|
||||
#include "morse_gpu_memory.h"
|
||||
#include <cassert>
|
||||
#define MOR_GPU_MemoryT MOR_GPU_Memory<numtyp, acctyp>
|
||||
|
||||
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
MOR_GPU_MemoryT::MOR_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
MOR_GPU_MemoryT::~MOR_GPU_Memory() {
|
||||
clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int MOR_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
|
||||
return this->bytes_per_atom_atomic(max_nbors);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int MOR_GPU_MemoryT::init(const int ntypes,
|
||||
double **host_cutsq, double **host_morse1,
|
||||
double **host_r0, double **host_alpha,
|
||||
double **host_d0, double **host_offset,
|
||||
double *host_special_lj, const int nlocal,
|
||||
const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *_screen) {
|
||||
int success;
|
||||
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
|
||||
_screen,morse_gpu_kernel);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// If atom type constants fit in shared memory use fast kernel
|
||||
int types=ntypes;
|
||||
shared_types=false;
|
||||
int max_shared_types=this->device->max_shared_types();
|
||||
if (types<=max_shared_types && this->_block_size>=max_shared_types) {
|
||||
types=max_shared_types;
|
||||
shared_types=true;
|
||||
}
|
||||
_types=types;
|
||||
|
||||
// Allocate a host write buffer for data initialization
|
||||
UCL_H_Vec<numtyp> host_write(types*types*32,*(this->ucl_device),
|
||||
UCL_WRITE_OPTIMIZED);
|
||||
|
||||
for (int i=0; i<types*types; i++)
|
||||
host_write[i]=0.0;
|
||||
|
||||
mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1,
|
||||
host_r0,host_alpha);
|
||||
|
||||
mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
|
||||
this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset);
|
||||
|
||||
UCL_H_Vec<double> dview;
|
||||
sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
|
||||
dview.view(host_special_lj,4,*(this->ucl_device));
|
||||
ucl_copy(sp_lj,dview,false);
|
||||
|
||||
_allocated=true;
|
||||
this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void MOR_GPU_MemoryT::clear() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
mor1.clear();
|
||||
mor2.clear();
|
||||
sp_lj.clear();
|
||||
this->clear_atomic();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double MOR_GPU_MemoryT::host_memory_usage() const {
|
||||
return this->host_memory_usage_atomic()+sizeof(MOR_GPU_Memory<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Calculate energies, forces, and virials
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int eflag, vflag;
|
||||
if (_eflag)
|
||||
eflag=1;
|
||||
else
|
||||
eflag=0;
|
||||
|
||||
if (_vflag)
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
int anall=this->atom->nall();
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
|
||||
&mor2.begin(), &sp_lj.begin(),
|
||||
&this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(),
|
||||
&this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag,
|
||||
&ainum, &anall, &nbor_pitch,
|
||||
&this->_threads_per_atom);
|
||||
} else {
|
||||
this->k_pair.set_size(GX,BX);
|
||||
this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
|
||||
&_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
|
||||
&this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
|
||||
&this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
|
||||
&anall, &nbor_pitch, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
}
|
||||
|
||||
template class MOR_GPU_Memory<PRECISION,ACC_PRECISION>;
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef MOR_GPU_MEMORY_H
|
||||
#define MOR_GPU_MEMORY_H
|
||||
|
||||
#include "atomic_gpu_memory.h"
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class MOR_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
|
||||
public:
|
||||
MOR_GPU_Memory();
|
||||
~MOR_GPU_Memory();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param max_nbors initial number of rows in the neighbor matrix
|
||||
* \param cell_size cutoff + skin
|
||||
* \param gpu_split fraction of particles handled by device
|
||||
*
|
||||
* Returns:
|
||||
* - 0 if successful
|
||||
* - -1 if fix gpu not found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
int init(const int ntypes, double **host_cutsq,
|
||||
double **host_morse1, double **host_r0, double **host_alpha,
|
||||
double **host_d0, double **host_offset, double *host_special_lj,
|
||||
const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen);
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear();
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom(const int max_nbors) const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
// --------------------------- TYPE DATA --------------------------
|
||||
|
||||
/// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha
|
||||
UCL_D_Vec<numtyp4> mor1;
|
||||
/// mor2.x = d0, mor2.y = offset
|
||||
UCL_D_Vec<numtyp2> mor2;
|
||||
/// Special LJ values
|
||||
UCL_D_Vec<numtyp> sp_lj;
|
||||
|
||||
/// If atom type constants fit in shared memory, use fast kernels
|
||||
bool shared_types;
|
||||
|
||||
/// Number of atom types
|
||||
int _types;
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
void loop(const bool _eflag, const bool _vflag);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/*************************************************************************
|
||||
See pair_gpu_dev_kernel.cu for definitions
|
||||
of preprocessor constants
|
||||
*************************************************************************/
|
||||
|
||||
#ifndef NV_KERNEL_DEF
|
||||
#define NV_KERNEL_DEF
|
||||
|
||||
#include "geryon/ucl_nv_kernel.h"
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define ARCH __CUDA_ARCH__
|
||||
#else
|
||||
#define ARCH 100
|
||||
#endif
|
||||
|
||||
#if (ARCH < 200)
|
||||
|
||||
#define THREADS_PER_ATOM 1
|
||||
#define THREADS_PER_CHARGE 8
|
||||
#define BLOCK_NBOR_BUILD 64
|
||||
#define BLOCK_PAIR 64
|
||||
#define BLOCK_BIO_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
|
||||
#else
|
||||
|
||||
#define THREADS_PER_ATOM 1
|
||||
#define THREADS_PER_CHARGE 8
|
||||
#define BLOCK_NBOR_BUILD 128
|
||||
#define BLOCK_PAIR 128
|
||||
#define BLOCK_BIO_PAIR 128
|
||||
#define MAX_SHARED_TYPES 11
|
||||
|
||||
#endif
|
||||
|
||||
#define WARP_SIZE 32
|
||||
|
||||
#endif
|
|
@ -0,0 +1,407 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_ans.h"
|
||||
|
||||
#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
|
||||
_inum(0),_ilist(NULL),_newton(false) {
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
int PairGPUAnsT::bytes_per_atom() const {
|
||||
int bytes=11*sizeof(acctyp);
|
||||
if (_rot)
|
||||
bytes+=4*sizeof(acctyp);
|
||||
if (_charge)
|
||||
bytes+=sizeof(acctyp);
|
||||
return bytes;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAnsT::alloc(const int inum) {
|
||||
_max_local=static_cast<int>(static_cast<double>(inum)*1.10);
|
||||
|
||||
bool success=true;
|
||||
|
||||
int ans_elements=4;
|
||||
if (_rot)
|
||||
ans_elements+=4;
|
||||
|
||||
// Ignore host/device transfers?
|
||||
bool cpuview=false;
|
||||
if (dev->device_type()==UCL_CPU)
|
||||
cpuview=true;
|
||||
|
||||
// -------------------------- Host allocations
|
||||
success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
|
||||
success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
|
||||
|
||||
// --------------------------- Device allocations
|
||||
if (cpuview) {
|
||||
dev_engv.view(host_engv);
|
||||
dev_ans.view(host_ans);
|
||||
} else {
|
||||
success=success && (dev_engv.alloc(_ev_fields*_max_local,*dev,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
success=success && (dev_ans.alloc(ans_elements*_max_local,
|
||||
*dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
}
|
||||
_gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
|
||||
|
||||
_allocated=true;
|
||||
return success;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
|
||||
UCL_Device &devi) {
|
||||
clear();
|
||||
|
||||
bool success=true;
|
||||
_charge=charge;
|
||||
_rot=rot;
|
||||
_other=_charge || _rot;
|
||||
dev=&devi;
|
||||
|
||||
_e_fields=1;
|
||||
if (_charge)
|
||||
_e_fields++;
|
||||
_ev_fields=6+_e_fields;
|
||||
|
||||
// Initialize atom and nbor data
|
||||
int ef_inum=inum;
|
||||
if (ef_inum==0)
|
||||
ef_inum=1000;
|
||||
|
||||
// Initialize timers for the selected device
|
||||
time_answer.init(*dev);
|
||||
time_answer.zero();
|
||||
_time_cast=0.0;
|
||||
_time_cpu_idle=0.0;
|
||||
|
||||
return success && alloc(ef_inum);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
|
||||
bool realloc=false;
|
||||
if (charge && _charge==false) {
|
||||
_charge=true;
|
||||
_e_fields++;
|
||||
_ev_fields++;
|
||||
realloc=true;
|
||||
}
|
||||
if (rot && _rot==false) {
|
||||
_rot=true;
|
||||
realloc=true;
|
||||
}
|
||||
if (realloc) {
|
||||
_other=_charge || _rot;
|
||||
int inum=_max_local;
|
||||
clear_resize();
|
||||
return alloc(inum);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::clear_resize() {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
|
||||
dev_ans.clear();
|
||||
dev_engv.clear();
|
||||
host_ans.clear();
|
||||
host_engv.clear();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::clear() {
|
||||
_gpu_bytes=0;
|
||||
if (!_allocated)
|
||||
return;
|
||||
|
||||
time_answer.clear();
|
||||
clear_resize();
|
||||
_inum=0;
|
||||
_ilist=NULL;
|
||||
_eflag=false;
|
||||
_vflag=false;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAnsT::host_memory_usage() const {
|
||||
int atom_bytes=4;
|
||||
if (_charge)
|
||||
atom_bytes+=1;
|
||||
if (_rot)
|
||||
atom_bytes+=4;
|
||||
int ans_bytes=atom_bytes+_ev_fields;
|
||||
return ans_bytes*(_max_local)*sizeof(acctyp)+
|
||||
sizeof(PairGPUAns<numtyp,acctyp>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom) {
|
||||
time_answer.start();
|
||||
_eflag=eflag;
|
||||
_vflag=vflag;
|
||||
_ef_atom=ef_atom;
|
||||
_vf_atom=vf_atom;
|
||||
|
||||
int csize=_ev_fields;
|
||||
if (!eflag)
|
||||
csize-=_e_fields;
|
||||
if (!vflag)
|
||||
csize-=6;
|
||||
|
||||
if (csize>0)
|
||||
ucl_copy(host_engv,dev_engv,_inum*csize,true);
|
||||
if (_rot)
|
||||
ucl_copy(host_ans,dev_ans,_inum*4*2,true);
|
||||
else
|
||||
ucl_copy(host_ans,dev_ans,_inum*4,true);
|
||||
time_answer.stop();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
int *ilist) {
|
||||
_ilist=ilist;
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
|
||||
double *virial) {
|
||||
if (_eflag==false && _vflag==false)
|
||||
return 0.0;
|
||||
|
||||
double evdwl=0.0;
|
||||
double virial_acc[6];
|
||||
for (int i=0; i<6; i++) virial_acc[i]=0.0;
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
}
|
||||
|
||||
evdwl*=0.5;
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
|
||||
double *virial, double &ecoul) {
|
||||
if (_eflag==false && _vflag==false)
|
||||
return 0.0;
|
||||
|
||||
if (_charge==false)
|
||||
return energy_virial(eatom,vatom,virial);
|
||||
|
||||
double evdwl=0.0;
|
||||
double _ecoul=0.0;
|
||||
double virial_acc[6];
|
||||
for (int i=0; i<6; i++) virial_acc[i]=0.0;
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[i]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[i][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
acctyp *ap=host_engv.begin()+i;
|
||||
int ii=_ilist[i];
|
||||
if (_eflag) {
|
||||
if (_ef_atom) {
|
||||
evdwl+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
eatom[ii]+=*ap*0.5;
|
||||
ap+=_inum;
|
||||
} else {
|
||||
evdwl+=*ap;
|
||||
ap+=_inum;
|
||||
_ecoul+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
if (_vflag) {
|
||||
if (_vf_atom) {
|
||||
for (int j=0; j<6; j++) {
|
||||
vatom[ii][j]+=*ap*0.5;
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
} else {
|
||||
for (int j=0; j<6; j++) {
|
||||
virial_acc[j]+=*ap;
|
||||
ap+=_inum;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (int j=0; j<6; j++)
|
||||
virial[j]+=virial_acc[j]*0.5;
|
||||
}
|
||||
|
||||
evdwl*=0.5;
|
||||
ecoul+=_ecoul*0.5;
|
||||
return evdwl;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void PairGPUAnsT::get_answers(double **f, double **tor) {
|
||||
acctyp *ap=host_ans.begin();
|
||||
if (_ilist==NULL) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=*ap;
|
||||
ap++;
|
||||
f[i][1]+=*ap;
|
||||
ap++;
|
||||
f[i][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
tor[i][0]+=*ap;
|
||||
ap++;
|
||||
tor[i][1]+=*ap;
|
||||
ap++;
|
||||
tor[i][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
f[ii][0]+=*ap;
|
||||
ap++;
|
||||
f[ii][1]+=*ap;
|
||||
ap++;
|
||||
f[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=*ap;
|
||||
ap++;
|
||||
tor[ii][1]+=*ap;
|
||||
ap++;
|
||||
tor[ii][2]+=*ap;
|
||||
ap+=2;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Explicit instantiation of PairGPUAns for the PRECISION / ACC_PRECISION
// types, so the member definitions above are compiled in this file.
template class PairGPUAns<PRECISION,ACC_PRECISION>;
|
|
@ -0,0 +1,170 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_ANS_H
|
||||
#define PAIR_GPU_ANS_H
|
||||
|
||||
#include <math.h>
|
||||
#include "mpi.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#include "geryon/ocl_timer.h"
|
||||
#include "geryon/ocl_mat.h"
|
||||
using namespace ucl_opencl;
|
||||
|
||||
#else
|
||||
|
||||
#include "geryon/nvd_timer.h"
|
||||
#include "geryon/nvd_mat.h"
|
||||
using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
#include "pair_gpu_precision.h"
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
class PairGPUAns {
|
||||
public:
|
||||
PairGPUAns();
|
||||
~PairGPUAns() { clear(); }
|
||||
|
||||
/// Current number of local atoms stored
|
||||
inline int inum() const { return _inum; }
|
||||
/// Set number of local atoms for future copy operations
|
||||
inline void inum(const int n) { _inum=n; }
|
||||
|
||||
/// Memory usage per atom in this class
|
||||
int bytes_per_atom() const;
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor True if neighboring will be performed on device **/
|
||||
bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
|
||||
|
||||
/// Check if we have enough device storage and realloc if not
|
||||
inline void resize(const int inum, bool &success) {
|
||||
_inum=inum;
|
||||
if (inum>_max_local) {
|
||||
clear_resize();
|
||||
success = success && alloc(inum);
|
||||
}
|
||||
}
|
||||
|
||||
/// If already initialized by another LAMMPS style, add fields as necessary
|
||||
/** \param rot True if atom storage needs quaternions
|
||||
* \param gpu_nbor True if neighboring will be performed on device **/
|
||||
bool add_fields(const bool charge, const bool rot);
|
||||
|
||||
/// Free all memory on host and device needed to realloc for more atoms
|
||||
void clear_resize();
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
||||
/// Return the total amount of host memory used by class in bytes
|
||||
double host_memory_usage() const;
|
||||
|
||||
/// Add copy times to timers
|
||||
inline void acc_timers() {
|
||||
time_answer.add_to_total();
|
||||
}
|
||||
|
||||
/// Add copy times to timers
|
||||
inline void zero_timers() {
|
||||
time_answer.zero();
|
||||
}
|
||||
|
||||
/// Return the total time for host/device data transfer
|
||||
inline double transfer_time() {
|
||||
return time_answer.total_seconds();
|
||||
}
|
||||
|
||||
/// Return the total time for data cast/pack
|
||||
inline double cast_time() { return _time_cast; }
|
||||
|
||||
/// Return number of bytes used on device
|
||||
inline double gpu_bytes() { return _gpu_bytes; }
|
||||
|
||||
// -------------------------COPY FROM GPU -------------------------------
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom);
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom, int *ilist);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial,
|
||||
double &ecoul);
|
||||
|
||||
/// Add forces and torques from the GPU into a LAMMPS pointer
|
||||
void get_answers(double **f, double **tor);
|
||||
|
||||
inline double get_answers(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
double ta=MPI_Wtime();
|
||||
time_answer.sync_stop();
|
||||
_time_cpu_idle+=MPI_Wtime()-ta;
|
||||
double ts=MPI_Wtime();
|
||||
double evdw=energy_virial(eatom,vatom,virial,ecoul);
|
||||
get_answers(f,tor);
|
||||
_time_cast+=MPI_Wtime()-ts;
|
||||
return evdw;
|
||||
}
|
||||
|
||||
/// Return the time the CPU was idle waiting for GPU
|
||||
inline double cpu_idle_time() { return _time_cpu_idle; }
|
||||
|
||||
// ------------------------------ DATA ----------------------------------
|
||||
|
||||
/// Force and possibly torque
|
||||
UCL_D_Vec<acctyp> dev_ans;
|
||||
/// Energy and virial per-atom storage
|
||||
UCL_D_Vec<acctyp> dev_engv;
|
||||
|
||||
/// Force and possibly torque data on host
|
||||
UCL_H_Vec<acctyp> host_ans;
|
||||
/// Energy/virial data on host
|
||||
UCL_H_Vec<acctyp> host_engv;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_answer;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *dev;
|
||||
|
||||
private:
|
||||
bool alloc(const int inum);
|
||||
|
||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||
int _max_local, _inum, _e_fields, _ev_fields;
|
||||
int *_ilist;
|
||||
double _time_cast, _time_cpu_idle;
|
||||
|
||||
double _gpu_bytes;
|
||||
|
||||
bool _newton;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,55 @@
|
|||
/* PTX source (version 1.4, target sm_13) held as one concatenated C string.
 * It defines a single entry point, kernel_cast_x(x_type, x, type, nall):
 *   - idx = ctaid.x * ntid.x + tid.x; threads with nall <= idx exit at once.
 *   - Loads a 32-bit integer from type + idx*4 and converts it to f32.
 *   - Loads three f64 values from x + (idx*3)*8 at byte offsets 0, 8 and 16
 *     and converts each to f32.
 *   - Stores the four floats {x+0, x+8, x+16, type} as one v4.f32 at
 *     x_type + idx*16.
 * NOTE(review): the .loc directives and __cudaparm_ names suggest this text
 * is compiler output; regenerate it from the kernel source, do not hand-edit. */
const char * pair_gpu_atom_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .entry kernel_cast_x (\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_x_type,\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_x,\n"
|
||||
" .param .u64 __cudaparm_kernel_cast_x_type,\n"
|
||||
" .param .s32 __cudaparm_kernel_cast_x_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<10>;\n"
|
||||
" .reg .u64 %rd<13>;\n"
|
||||
" .reg .f32 %f<6>;\n"
|
||||
" .reg .f64 %fd<5>;\n"
|
||||
" .reg .pred %p<3>;\n"
|
||||
" .loc 14 34 0\n"
|
||||
"$LBB1_kernel_cast_x:\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_cast_x_nall];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_1026;\n"
|
||||
" .loc 14 39 0\n"
|
||||
" cvt.u64.s32 %rd1, %r5;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_kernel_cast_x_type];\n"
|
||||
" mul.lo.u64 %rd3, %rd1, 4;\n"
|
||||
" add.u64 %rd4, %rd2, %rd3;\n"
|
||||
" ld.global.s32 %r7, [%rd4+0];\n"
|
||||
" cvt.rn.f32.s32 %f1, %r7;\n"
|
||||
" .loc 14 42 0\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_cast_x_x];\n"
|
||||
" mul.lo.s32 %r8, %r5, 3;\n"
|
||||
" cvt.u64.s32 %rd6, %r8;\n"
|
||||
" mul.lo.u64 %rd7, %rd6, 8;\n"
|
||||
" add.u64 %rd8, %rd5, %rd7;\n"
|
||||
" ld.global.f64 %fd1, [%rd8+8];\n"
|
||||
" cvt.rn.f32.f64 %f2, %fd1;\n"
|
||||
" .loc 14 43 0\n"
|
||||
" ld.global.f64 %fd2, [%rd8+16];\n"
|
||||
" cvt.rn.f32.f64 %f3, %fd2;\n"
|
||||
" .loc 14 44 0\n"
|
||||
" ld.param.u64 %rd9, [__cudaparm_kernel_cast_x_x_type];\n"
|
||||
" mul.lo.u64 %rd10, %rd1, 16;\n"
|
||||
" add.u64 %rd11, %rd9, %rd10;\n"
|
||||
" ld.global.f64 %fd3, [%rd8+0];\n"
|
||||
" cvt.rn.f32.f64 %f4, %fd3;\n"
|
||||
" st.global.v4.f32 [%rd11+0], {%f4,%f2,%f3,%f1};\n"
|
||||
"$Lt_0_1026:\n"
|
||||
" .loc 14 46 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_cast_x:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,765 @@
|
|||
const char * pair_gpu_build_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .tex .u64 neigh_tex;\n"
|
||||
" .entry transpose (\n"
|
||||
" .param .u64 __cudaparm_transpose_out,\n"
|
||||
" .param .u64 __cudaparm_transpose_in,\n"
|
||||
" .param .s32 __cudaparm_transpose_columns_in,\n"
|
||||
" .param .s32 __cudaparm_transpose_rows_in)\n"
|
||||
" {\n"
|
||||
" .reg .u16 %rh<4>;\n"
|
||||
" .reg .u32 %r<30>;\n"
|
||||
" .reg .u64 %rd<23>;\n"
|
||||
" .reg .f32 %f<4>;\n"
|
||||
" .reg .pred %p<4>;\n"
|
||||
" .shared .align 4 .b8 __cuda_block24[288];\n"
|
||||
" .loc 14 62 0\n"
|
||||
"$LBB1_transpose:\n"
|
||||
" mov.u16 %rh1, %ctaid.x;\n"
|
||||
" mul.wide.u16 %r1, %rh1, 8;\n"
|
||||
" mov.u16 %rh2, %ctaid.y;\n"
|
||||
" mul.wide.u16 %r2, %rh2, 8;\n"
|
||||
" cvt.u32.u16 %r3, %tid.x;\n"
|
||||
" add.u32 %r4, %r1, %r3;\n"
|
||||
" cvt.u32.u16 %r5, %tid.y;\n"
|
||||
" add.u32 %r6, %r2, %r5;\n"
|
||||
" ld.param.u32 %r7, [__cudaparm_transpose_rows_in];\n"
|
||||
" ld.param.u32 %r8, [__cudaparm_transpose_columns_in];\n"
|
||||
" set.lt.u32.u32 %r9, %r4, %r8;\n"
|
||||
" neg.s32 %r10, %r9;\n"
|
||||
" set.lt.u32.u32 %r11, %r6, %r7;\n"
|
||||
" neg.s32 %r12, %r11;\n"
|
||||
" and.b32 %r13, %r10, %r12;\n"
|
||||
" mov.u32 %r14, 0;\n"
|
||||
" setp.eq.s32 %p1, %r13, %r14;\n"
|
||||
" @%p1 bra $Lt_0_2306;\n"
|
||||
" .loc 14 74 0\n"
|
||||
" mov.u64 %rd1, __cuda_block24;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_transpose_in];\n"
|
||||
" mul.lo.u32 %r15, %r6, %r8;\n"
|
||||
" add.u32 %r16, %r4, %r15;\n"
|
||||
" cvt.u64.u32 %rd3, %r16;\n"
|
||||
" mul.lo.u64 %rd4, %rd3, 4;\n"
|
||||
" add.u64 %rd5, %rd2, %rd4;\n"
|
||||
" ld.global.s32 %r17, [%rd5+0];\n"
|
||||
" cvt.rn.f32.s32 %f1, %r17;\n"
|
||||
" cvt.u64.u32 %rd6, %r3;\n"
|
||||
" cvt.u64.u32 %rd7, %r5;\n"
|
||||
" mul.lo.u64 %rd8, %rd7, 9;\n"
|
||||
" add.u64 %rd9, %rd6, %rd8;\n"
|
||||
" mul.lo.u64 %rd10, %rd9, 4;\n"
|
||||
" add.u64 %rd11, %rd1, %rd10;\n"
|
||||
" st.shared.f32 [%rd11+0], %f1;\n"
|
||||
"$Lt_0_2306:\n"
|
||||
" mov.u64 %rd1, __cuda_block24;\n"
|
||||
" .loc 14 76 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.u32 %r18, %r1, %r5;\n"
|
||||
" add.u32 %r19, %r2, %r3;\n"
|
||||
" set.lt.u32.u32 %r20, %r18, %r8;\n"
|
||||
" neg.s32 %r21, %r20;\n"
|
||||
" set.lt.u32.u32 %r22, %r19, %r7;\n"
|
||||
" neg.s32 %r23, %r22;\n"
|
||||
" and.b32 %r24, %r21, %r23;\n"
|
||||
" mov.u32 %r25, 0;\n"
|
||||
" setp.eq.s32 %p2, %r24, %r25;\n"
|
||||
" @%p2 bra $Lt_0_2818;\n"
|
||||
" .loc 14 81 0\n"
|
||||
" cvt.u64.u32 %rd12, %r5;\n"
|
||||
" cvt.u64.u32 %rd13, %r3;\n"
|
||||
" mul.lo.u64 %rd14, %rd13, 9;\n"
|
||||
" add.u64 %rd15, %rd12, %rd14;\n"
|
||||
" mul.lo.u64 %rd16, %rd15, 4;\n"
|
||||
" add.u64 %rd17, %rd1, %rd16;\n"
|
||||
" ld.shared.f32 %f2, [%rd17+0];\n"
|
||||
" cvt.rzi.s32.f32 %r26, %f2;\n"
|
||||
" ld.param.u64 %rd18, [__cudaparm_transpose_out];\n"
|
||||
" mul.lo.u32 %r27, %r18, %r7;\n"
|
||||
" add.u32 %r28, %r19, %r27;\n"
|
||||
" cvt.u64.u32 %rd19, %r28;\n"
|
||||
" mul.lo.u64 %rd20, %rd19, 4;\n"
|
||||
" add.u64 %rd21, %rd18, %rd20;\n"
|
||||
" st.global.s32 [%rd21+0], %r26;\n"
|
||||
"$Lt_0_2818:\n"
|
||||
" .loc 14 82 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_transpose:\n"
|
||||
" }\n"
|
||||
" .entry calc_cell_id (\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_pos,\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_cell_id,\n"
|
||||
" .param .u64 __cudaparm_calc_cell_id_particle_id,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo0,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo1,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxlo2,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi0,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi1,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_boxhi2,\n"
|
||||
" .param .f32 __cudaparm_calc_cell_id_cell_size,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_ncellx,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_ncelly,\n"
|
||||
" .param .s32 __cudaparm_calc_cell_id_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u16 %rh<4>;\n"
|
||||
" .reg .u32 %r<20>;\n"
|
||||
" .reg .u64 %rd<8>;\n"
|
||||
" .reg .f32 %f<35>;\n"
|
||||
" .reg .f64 %fd<11>;\n"
|
||||
" .reg .pred %p<3>;\n"
|
||||
" .loc 14 88 0\n"
|
||||
"$LBB1_calc_cell_id:\n"
|
||||
" cvt.u32.u16 %r1, %tid.x;\n"
|
||||
" mov.u16 %rh1, %ctaid.x;\n"
|
||||
" mov.u16 %rh2, %ntid.x;\n"
|
||||
" mul.wide.u16 %r2, %rh1, %rh2;\n"
|
||||
" add.u32 %r3, %r1, %r2;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_calc_cell_id_nall];\n"
|
||||
" setp.le.s32 %p1, %r4, %r3;\n"
|
||||
" @%p1 bra $Lt_1_1026;\n"
|
||||
" mov.s32 %r5, %r3;\n"
|
||||
" mov.s32 %r6, 0;\n"
|
||||
" mov.s32 %r7, 0;\n"
|
||||
" mov.s32 %r8, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r5,%r6,%r7,%r8}];\n"
|
||||
" .loc 14 92 0\n"
|
||||
" mov.f32 %f5, %f1;\n"
|
||||
" mov.f32 %f6, %f2;\n"
|
||||
" mov.f32 %f7, %f3;\n"
|
||||
" .loc 14 105 0\n"
|
||||
" ld.param.f32 %f8, [__cudaparm_calc_cell_id_cell_size];\n"
|
||||
" neg.f32 %f9, %f8;\n"
|
||||
" ld.param.f32 %f10, [__cudaparm_calc_cell_id_boxlo0];\n"
|
||||
" ld.param.f32 %f11, [__cudaparm_calc_cell_id_boxlo2];\n"
|
||||
" ld.param.f32 %f12, [__cudaparm_calc_cell_id_boxlo1];\n"
|
||||
" ld.param.u32 %r9, [__cudaparm_calc_cell_id_ncellx];\n"
|
||||
" ld.param.u32 %r10, [__cudaparm_calc_cell_id_ncelly];\n"
|
||||
" ld.param.f32 %f13, [__cudaparm_calc_cell_id_boxhi2];\n"
|
||||
" sub.f32 %f14, %f13, %f11;\n"
|
||||
" add.f32 %f15, %f8, %f14;\n"
|
||||
" sub.f32 %f16, %f7, %f11;\n"
|
||||
" max.f32 %f17, %f9, %f16;\n"
|
||||
" min.f32 %f18, %f15, %f17;\n"
|
||||
" div.approx.f32 %f19, %f18, %f8;\n"
|
||||
" cvt.f64.f32 %fd1, %f19;\n"
|
||||
" mov.f64 %fd2, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd3, %fd1, %fd2;\n"
|
||||
" cvt.rzi.u32.f64 %r11, %fd3;\n"
|
||||
" mul.lo.u32 %r12, %r9, %r11;\n"
|
||||
" mul.lo.u32 %r13, %r10, %r12;\n"
|
||||
" ld.param.f32 %f20, [__cudaparm_calc_cell_id_boxhi1];\n"
|
||||
" sub.f32 %f21, %f20, %f12;\n"
|
||||
" add.f32 %f22, %f8, %f21;\n"
|
||||
" sub.f32 %f23, %f6, %f12;\n"
|
||||
" max.f32 %f24, %f9, %f23;\n"
|
||||
" min.f32 %f25, %f22, %f24;\n"
|
||||
" div.approx.f32 %f26, %f25, %f8;\n"
|
||||
" cvt.f64.f32 %fd4, %f26;\n"
|
||||
" mov.f64 %fd5, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd6, %fd4, %fd5;\n"
|
||||
" cvt.rzi.u32.f64 %r14, %fd6;\n"
|
||||
" mul.lo.u32 %r15, %r9, %r14;\n"
|
||||
" add.u32 %r16, %r13, %r15;\n"
|
||||
" ld.param.f32 %f27, [__cudaparm_calc_cell_id_boxhi0];\n"
|
||||
" sub.f32 %f28, %f27, %f10;\n"
|
||||
" add.f32 %f29, %f8, %f28;\n"
|
||||
" sub.f32 %f30, %f5, %f10;\n"
|
||||
" max.f32 %f31, %f9, %f30;\n"
|
||||
" min.f32 %f32, %f29, %f31;\n"
|
||||
" div.approx.f32 %f33, %f32, %f8;\n"
|
||||
" cvt.f64.f32 %fd7, %f33;\n"
|
||||
" mov.f64 %fd8, 0d3ff0000000000000; \n"
|
||||
" add.f64 %fd9, %fd7, %fd8;\n"
|
||||
" cvt.rzi.u32.f64 %r17, %fd9;\n"
|
||||
" add.u32 %r18, %r16, %r17;\n"
|
||||
" .loc 14 109 0\n"
|
||||
" cvt.u64.s32 %rd1, %r3;\n"
|
||||
" mul.lo.u64 %rd2, %rd1, 4;\n"
|
||||
" ld.param.u64 %rd3, [__cudaparm_calc_cell_id_cell_id];\n"
|
||||
" add.u64 %rd4, %rd3, %rd2;\n"
|
||||
" st.global.u32 [%rd4+0], %r18;\n"
|
||||
" .loc 14 110 0\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_calc_cell_id_particle_id];\n"
|
||||
" add.u64 %rd6, %rd5, %rd2;\n"
|
||||
" st.global.s32 [%rd6+0], %r3;\n"
|
||||
"$Lt_1_1026:\n"
|
||||
" .loc 14 112 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_calc_cell_id:\n"
|
||||
" }\n"
|
||||
" .entry kernel_calc_cell_counts (\n"
|
||||
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n"
|
||||
" .param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n"
|
||||
" .param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n"
|
||||
" .param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n"
|
||||
" {\n"
|
||||
" .reg .u16 %rh<4>;\n"
|
||||
" .reg .u32 %r<31>;\n"
|
||||
" .reg .u64 %rd<15>;\n"
|
||||
" .reg .pred %p<13>;\n"
|
||||
" .loc 14 115 0\n"
|
||||
"$LBB1_kernel_calc_cell_counts:\n"
|
||||
" mov.u16 %rh1, %ctaid.x;\n"
|
||||
" mov.u16 %rh2, %ntid.x;\n"
|
||||
" mul.wide.u16 %r1, %rh1, %rh2;\n"
|
||||
" cvt.u32.u16 %r2, %tid.x;\n"
|
||||
" add.u32 %r3, %r2, %r1;\n"
|
||||
" ld.param.s32 %r4, [__cudaparm_kernel_calc_cell_counts_nall];\n"
|
||||
" setp.gt.s32 %p1, %r4, %r3;\n"
|
||||
" @!%p1 bra $Lt_2_7426;\n"
|
||||
" .loc 14 118 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n"
|
||||
" cvt.u64.s32 %rd2, %r3;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ld.global.s32 %r5, [%rd4+0];\n"
|
||||
" mov.u32 %r6, 0;\n"
|
||||
" setp.ne.s32 %p2, %r3, %r6;\n"
|
||||
" @%p2 bra $Lt_2_7938;\n"
|
||||
" add.s32 %r7, %r5, 1;\n"
|
||||
" mov.u32 %r8, 0;\n"
|
||||
" setp.le.s32 %p3, %r7, %r8;\n"
|
||||
" @%p3 bra $Lt_2_8450;\n"
|
||||
" mov.s32 %r9, %r7;\n"
|
||||
" ld.param.u64 %rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" mov.s32 %r10, 0;\n"
|
||||
" mov.s32 %r11, %r9;\n"
|
||||
"$Lt_2_8962:\n"
|
||||
" .loc 14 123 0\n"
|
||||
" mov.s32 %r12, 0;\n"
|
||||
" st.global.s32 [%rd5+0], %r12;\n"
|
||||
" add.s32 %r10, %r10, 1;\n"
|
||||
" add.u64 %rd5, %rd5, 4;\n"
|
||||
" setp.ne.s32 %p4, %r7, %r10;\n"
|
||||
" @%p4 bra $Lt_2_8962;\n"
|
||||
"$Lt_2_8450:\n"
|
||||
"$Lt_2_7938:\n"
|
||||
" sub.s32 %r13, %r4, 1;\n"
|
||||
" setp.ne.s32 %p5, %r3, %r13;\n"
|
||||
" @%p5 bra $Lt_2_9474;\n"
|
||||
" .loc 14 126 0\n"
|
||||
" add.s32 %r7, %r5, 1;\n"
|
||||
" mov.s32 %r14, %r7;\n"
|
||||
" ld.param.s32 %r15, [__cudaparm_kernel_calc_cell_counts_ncell];\n"
|
||||
" setp.gt.s32 %p6, %r7, %r15;\n"
|
||||
" @%p6 bra $Lt_2_9986;\n"
|
||||
" sub.s32 %r16, %r15, %r5;\n"
|
||||
" add.s32 %r17, %r15, 1;\n"
|
||||
" ld.param.u64 %rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" cvt.u64.s32 %rd7, %r7;\n"
|
||||
" mul.lo.u64 %rd8, %rd7, 4;\n"
|
||||
" add.u64 %rd9, %rd6, %rd8;\n"
|
||||
" mov.s32 %r18, %r16;\n"
|
||||
"$Lt_2_10498:\n"
|
||||
" .loc 14 127 0\n"
|
||||
" st.global.s32 [%rd9+0], %r4;\n"
|
||||
" add.s32 %r14, %r14, 1;\n"
|
||||
" add.u64 %rd9, %rd9, 4;\n"
|
||||
" setp.ne.s32 %p7, %r17, %r14;\n"
|
||||
" @%p7 bra $Lt_2_10498;\n"
|
||||
"$Lt_2_9986:\n"
|
||||
"$Lt_2_9474:\n"
|
||||
" selp.s32 %r19, 1, 0, %p1;\n"
|
||||
" mov.s32 %r20, 0;\n"
|
||||
" set.gt.u32.s32 %r21, %r3, %r20;\n"
|
||||
" neg.s32 %r22, %r21;\n"
|
||||
" and.b32 %r23, %r19, %r22;\n"
|
||||
" mov.u32 %r24, 0;\n"
|
||||
" setp.eq.s32 %p8, %r23, %r24;\n"
|
||||
" @%p8 bra $Lt_2_11010;\n"
|
||||
" .loc 14 131 0\n"
|
||||
" ld.global.s32 %r25, [%rd4+-4];\n"
|
||||
" setp.eq.s32 %p9, %r5, %r25;\n"
|
||||
" @%p9 bra $Lt_2_11522;\n"
|
||||
" .loc 14 133 0\n"
|
||||
" add.s32 %r26, %r25, 1;\n"
|
||||
" mov.s32 %r27, %r26;\n"
|
||||
" setp.gt.s32 %p10, %r26, %r5;\n"
|
||||
" @%p10 bra $Lt_2_12034;\n"
|
||||
" sub.s32 %r28, %r5, %r25;\n"
|
||||
" add.s32 %r7, %r5, 1;\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
|
||||
" cvt.u64.s32 %rd11, %r26;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd10, %rd12;\n"
|
||||
" mov.s32 %r29, %r28;\n"
|
||||
"$Lt_2_12546:\n"
|
||||
" .loc 14 134 0\n"
|
||||
" st.global.s32 [%rd13+0], %r3;\n"
|
||||
" add.s32 %r27, %r27, 1;\n"
|
||||
" add.u64 %rd13, %rd13, 4;\n"
|
||||
" setp.ne.s32 %p11, %r7, %r27;\n"
|
||||
" @%p11 bra $Lt_2_12546;\n"
|
||||
"$Lt_2_12034:\n"
|
||||
"$Lt_2_11522:\n"
|
||||
"$Lt_2_11010:\n"
|
||||
"$Lt_2_7426:\n"
|
||||
" .loc 14 138 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_calc_cell_counts:\n"
|
||||
" }\n"
|
||||
" .entry calc_neigh_list_cell (\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_pos,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n"
|
||||
" .param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n"
|
||||
" .param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_inum,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_nt,\n"
|
||||
" .param .s32 __cudaparm_calc_neigh_list_cell_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<105>;\n"
|
||||
" .reg .u64 %rd<43>;\n"
|
||||
" .reg .f32 %f<43>;\n"
|
||||
" .reg .f64 %fd<4>;\n"
|
||||
" .reg .pred %p<24>;\n"
|
||||
" .shared .align 16 .b8 __cuda_pos_sh480[1024];\n"
|
||||
" .shared .align 4 .b8 __cuda_cell_list_sh1504[256];\n"
|
||||
" .loc 14 148 0\n"
|
||||
"$LBB1_calc_neigh_list_cell:\n"
|
||||
" .loc 14 160 0\n"
|
||||
" ld.param.u32 %r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n"
|
||||
" cvt.u32.u16 %r2, %ctaid.y;\n"
|
||||
" rem.u32 %r3, %r2, %r1;\n"
|
||||
" div.u32 %r4, %r2, %r1;\n"
|
||||
" ld.param.s32 %r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n"
|
||||
" mul.lo.s32 %r6, %r5, %r3;\n"
|
||||
" mul.lo.s32 %r7, %r5, %r4;\n"
|
||||
" mul.lo.s32 %r8, %r7, %r1;\n"
|
||||
" cvt.s32.u16 %r9, %ctaid.x;\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n"
|
||||
" add.s32 %r10, %r6, %r8;\n"
|
||||
" add.s32 %r11, %r9, %r10;\n"
|
||||
" cvt.u64.s32 %rd2, %r11;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ld.global.s32 %r12, [%rd4+0];\n"
|
||||
" .loc 14 161 0\n"
|
||||
" ld.global.s32 %r13, [%rd4+4];\n"
|
||||
" .loc 14 169 0\n"
|
||||
" sub.s32 %r14, %r13, %r12;\n"
|
||||
" cvt.u32.u16 %r15, %ntid.x;\n"
|
||||
" cvt.rn.f32.u32 %f1, %r15;\n"
|
||||
" cvt.rn.f32.s32 %f2, %r14;\n"
|
||||
" div.approx.f32 %f3, %f2, %f1;\n"
|
||||
" cvt.rpi.f32.f32 %f4, %f3;\n"
|
||||
" mov.f32 %f5, 0f00000000; \n"
|
||||
" setp.gt.f32 %p1, %f4, %f5;\n"
|
||||
" @!%p1 bra $Lt_3_14594;\n"
|
||||
" sub.s32 %r16, %r3, 1;\n"
|
||||
" mov.s32 %r17, 0;\n"
|
||||
" max.s32 %r18, %r16, %r17;\n"
|
||||
" sub.s32 %r19, %r1, 1;\n"
|
||||
" add.s32 %r20, %r3, 1;\n"
|
||||
" min.s32 %r21, %r19, %r20;\n"
|
||||
" ld.param.s32 %r22, [__cudaparm_calc_neigh_list_cell_ncellz];\n"
|
||||
" sub.s32 %r23, %r22, 1;\n"
|
||||
" add.s32 %r24, %r4, 1;\n"
|
||||
" min.s32 %r25, %r23, %r24;\n"
|
||||
" sub.s32 %r26, %r9, 1;\n"
|
||||
" mov.s32 %r27, 0;\n"
|
||||
" max.s32 %r28, %r26, %r27;\n"
|
||||
" add.s32 %r29, %r9, 1;\n"
|
||||
" sub.s32 %r30, %r5, 1;\n"
|
||||
" min.s32 %r31, %r29, %r30;\n"
|
||||
" cvt.s32.u16 %r32, %tid.x;\n"
|
||||
" add.s32 %r33, %r12, %r32;\n"
|
||||
" mov.u32 %r34, 0;\n"
|
||||
" ld.param.s32 %r35, [__cudaparm_calc_neigh_list_cell_inum];\n"
|
||||
" cvt.u64.s32 %rd5, %r35;\n"
|
||||
" sub.s32 %r36, %r4, 1;\n"
|
||||
" mov.s32 %r37, %r33;\n"
|
||||
" mov.s32 %r38, 0;\n"
|
||||
" max.s32 %r39, %r36, %r38;\n"
|
||||
" setp.ge.s32 %p2, %r25, %r39;\n"
|
||||
" ld.param.s32 %r40, [__cudaparm_calc_neigh_list_cell_nt];\n"
|
||||
" ld.param.s32 %r41, [__cudaparm_calc_neigh_list_cell_nall];\n"
|
||||
" mov.s32 %r42, 0;\n"
|
||||
" mov.u64 %rd6, __cuda_pos_sh480;\n"
|
||||
" mov.u64 %rd7, __cuda_cell_list_sh1504;\n"
|
||||
"$Lt_3_15106:\n"
|
||||
" .loc 14 171 0\n"
|
||||
" mov.s32 %r43, %r41;\n"
|
||||
" setp.ge.s32 %p3, %r37, %r13;\n"
|
||||
" @%p3 bra $Lt_3_15362;\n"
|
||||
" .loc 14 177 0\n"
|
||||
" ld.param.u64 %rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
|
||||
" add.u32 %r44, %r33, %r34;\n"
|
||||
" cvt.u64.s32 %rd9, %r44;\n"
|
||||
" mul.lo.u64 %rd10, %rd9, 4;\n"
|
||||
" add.u64 %rd11, %rd8, %rd10;\n"
|
||||
" ld.global.s32 %r43, [%rd11+0];\n"
|
||||
"$Lt_3_15362:\n"
|
||||
" setp.lt.s32 %p4, %r43, %r40;\n"
|
||||
" @!%p4 bra $Lt_3_15874;\n"
|
||||
" mov.s32 %r45, %r43;\n"
|
||||
" mov.s32 %r46, 0;\n"
|
||||
" mov.s32 %r47, 0;\n"
|
||||
" mov.s32 %r48, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r46,%r47,%r48}];\n"
|
||||
" .loc 14 180 0\n"
|
||||
" mov.f32 %f10, %f6;\n"
|
||||
" mov.f32 %f11, %f7;\n"
|
||||
" mov.f32 %f12, %f8;\n"
|
||||
" mov.f32 %f13, %f10;\n"
|
||||
" mov.f32 %f14, %f11;\n"
|
||||
" mov.f32 %f15, %f12;\n"
|
||||
"$Lt_3_15874:\n"
|
||||
" cvt.u64.s32 %rd12, %r43;\n"
|
||||
" mul.lo.u64 %rd13, %rd12, 4;\n"
|
||||
" setp.ge.s32 %p5, %r43, %r35;\n"
|
||||
" @%p5 bra $Lt_3_16642;\n"
|
||||
" .loc 14 183 0\n"
|
||||
" mov.s32 %r49, %r35;\n"
|
||||
" .loc 14 184 0\n"
|
||||
" ld.param.u64 %rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n"
|
||||
" add.u64 %rd15, %rd12, %rd5;\n"
|
||||
" mul.lo.u64 %rd16, %rd15, 4;\n"
|
||||
" add.u64 %rd17, %rd14, %rd16;\n"
|
||||
" .loc 14 186 0\n"
|
||||
" add.u64 %rd18, %rd13, %rd14;\n"
|
||||
" st.global.s32 [%rd18+0], %r43;\n"
|
||||
" bra.uni $Lt_3_16386;\n"
|
||||
"$Lt_3_16642:\n"
|
||||
" .loc 14 188 0\n"
|
||||
" sub.s32 %r49, %r40, %r35;\n"
|
||||
" .loc 14 189 0\n"
|
||||
" ld.param.u64 %rd19, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n"
|
||||
" add.u64 %rd20, %rd19, %rd13;\n"
|
||||
" mul.lo.u64 %rd21, %rd5, 4;\n"
|
||||
" sub.u64 %rd17, %rd20, %rd21;\n"
|
||||
"$Lt_3_16386:\n"
|
||||
" cvt.u64.s32 %rd22, %r49;\n"
|
||||
" mul.lo.u64 %rd23, %rd22, 4;\n"
|
||||
" add.u64 %rd24, %rd17, %rd23;\n"
|
||||
" .loc 14 195 0\n"
|
||||
" mov.s32 %r50, %r39;\n"
|
||||
" mov.s32 %r51, 0;\n"
|
||||
" @!%p2 bra $Lt_3_25090;\n"
|
||||
" sub.s32 %r52, %r25, %r39;\n"
|
||||
" add.s32 %r53, %r52, 1;\n"
|
||||
" setp.le.s32 %p6, %r18, %r21;\n"
|
||||
" add.s32 %r54, %r25, 1;\n"
|
||||
" mov.s32 %r55, %r53;\n"
|
||||
"$Lt_3_17410:\n"
|
||||
" .loc 14 196 0\n"
|
||||
" mov.s32 %r56, %r18;\n"
|
||||
" @!%p6 bra $Lt_3_17666;\n"
|
||||
" sub.s32 %r57, %r21, %r18;\n"
|
||||
" add.s32 %r58, %r57, 1;\n"
|
||||
" setp.ge.s32 %p7, %r31, %r28;\n"
|
||||
" add.s32 %r59, %r21, 1;\n"
|
||||
" mov.s32 %r60, %r58;\n"
|
||||
"$Lt_3_18178:\n"
|
||||
" @!%p7 bra $Lt_3_18434;\n"
|
||||
" sub.s32 %r61, %r31, %r28;\n"
|
||||
" add.s32 %r62, %r61, 1;\n"
|
||||
" mul.lo.s32 %r63, %r56, %r5;\n"
|
||||
" mul.lo.s32 %r64, %r50, %r5;\n"
|
||||
" mul.lo.s32 %r65, %r64, %r1;\n"
|
||||
" add.s32 %r66, %r31, 1;\n"
|
||||
" add.s32 %r67, %r63, %r65;\n"
|
||||
" add.s32 %r68, %r67, %r28;\n"
|
||||
" add.s32 %r69, %r66, %r67;\n"
|
||||
" cvt.u64.s32 %rd25, %r68;\n"
|
||||
" mul.lo.u64 %rd26, %rd25, 4;\n"
|
||||
" add.u64 %rd27, %rd1, %rd26;\n"
|
||||
" mov.s32 %r70, %r62;\n"
|
||||
"$Lt_3_18946:\n"
|
||||
" .loc 14 201 0\n"
|
||||
" ld.global.s32 %r71, [%rd27+0];\n"
|
||||
" .loc 14 202 0\n"
|
||||
" ld.global.s32 %r72, [%rd27+4];\n"
|
||||
" .loc 14 206 0\n"
|
||||
" sub.s32 %r73, %r72, %r71;\n"
|
||||
" cvt.rn.f32.s32 %f16, %r73;\n"
|
||||
" mov.f32 %f17, 0f42800000; \n"
|
||||
" div.approx.f32 %f18, %f16, %f17;\n"
|
||||
" cvt.rpi.f32.f32 %f19, %f18;\n"
|
||||
" cvt.rzi.s32.f32 %r74, %f19;\n"
|
||||
" mov.u32 %r75, 0;\n"
|
||||
" setp.le.s32 %p8, %r74, %r75;\n"
|
||||
" @%p8 bra $Lt_3_19202;\n"
|
||||
" mov.s32 %r76, %r74;\n"
|
||||
" mov.s32 %r77, 0;\n"
|
||||
" setp.lt.s32 %p9, %r43, %r40;\n"
|
||||
" mul.lo.s32 %r78, %r74, 64;\n"
|
||||
" mov.s32 %r79, %r76;\n"
|
||||
"$Lt_3_19714:\n"
|
||||
" .loc 14 209 0\n"
|
||||
" sub.s32 %r80, %r73, %r77;\n"
|
||||
" mov.s32 %r81, 64;\n"
|
||||
" min.s32 %r82, %r80, %r81;\n"
|
||||
" setp.le.s32 %p10, %r82, %r32;\n"
|
||||
" @%p10 bra $Lt_3_19970;\n"
|
||||
" .loc 14 212 0\n"
|
||||
" ld.param.u64 %rd28, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
|
||||
" add.s32 %r83, %r77, %r32;\n"
|
||||
" add.s32 %r84, %r71, %r83;\n"
|
||||
" cvt.s64.s32 %rd29, %r84;\n"
|
||||
" mul.lo.u64 %rd30, %rd29, 4;\n"
|
||||
" add.u64 %rd31, %rd28, %rd30;\n"
|
||||
" ld.global.s32 %r85, [%rd31+0];\n"
|
||||
" .loc 14 213 0\n"
|
||||
" cvt.u64.s32 %rd32, %r32;\n"
|
||||
" mul.lo.u64 %rd33, %rd32, 4;\n"
|
||||
" add.u64 %rd34, %rd7, %rd33;\n"
|
||||
" st.shared.s32 [%rd34+0], %r85;\n"
|
||||
" mov.s32 %r86, %r85;\n"
|
||||
" mov.s32 %r87, 0;\n"
|
||||
" mov.s32 %r88, 0;\n"
|
||||
" mov.s32 %r89, 0;\n"
|
||||
" tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r86,%r87,%r88,%r89}];\n"
|
||||
" .loc 14 214 0\n"
|
||||
" mov.f32 %f24, %f20;\n"
|
||||
" mov.f32 %f25, %f21;\n"
|
||||
" mov.f32 %f26, %f22;\n"
|
||||
" .loc 14 215 0\n"
|
||||
" mul.lo.u64 %rd35, %rd32, 16;\n"
|
||||
" add.u64 %rd36, %rd6, %rd35;\n"
|
||||
" st.shared.f32 [%rd36+0], %f24;\n"
|
||||
" .loc 14 216 0\n"
|
||||
" st.shared.f32 [%rd36+4], %f25;\n"
|
||||
" .loc 14 217 0\n"
|
||||
" st.shared.f32 [%rd36+8], %f26;\n"
|
||||
"$Lt_3_19970:\n"
|
||||
" .loc 14 219 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" @!%p9 bra $Lt_3_20994;\n"
|
||||
" mov.u32 %r90, 0;\n"
|
||||
" setp.le.s32 %p11, %r82, %r90;\n"
|
||||
" @%p11 bra $Lt_3_20994;\n"
|
||||
" mov.s32 %r91, %r82;\n"
|
||||
" mov.u64 %rd37, 0;\n"
|
||||
" setp.lt.s32 %p12, %r43, %r35;\n"
|
||||
" selp.s32 %r92, 1, 0, %p12;\n"
|
||||
" mov.s64 %rd38, %rd7;\n"
|
||||
" mov.s32 %r93, 0;\n"
|
||||
" mov.s32 %r94, %r91;\n"
|
||||
"$Lt_3_21506:\n"
|
||||
" .loc 14 224 0\n"
|
||||
" ld.shared.s32 %r95, [%rd38+0];\n"
|
||||
" set.lt.u32.s32 %r96, %r43, %r95;\n"
|
||||
" neg.s32 %r97, %r96;\n"
|
||||
" set.lt.u32.s32 %r98, %r95, %r35;\n"
|
||||
" neg.s32 %r99, %r98;\n"
|
||||
" or.b32 %r100, %r92, %r99;\n"
|
||||
" or.b32 %r101, %r97, %r100;\n"
|
||||
" mov.u32 %r102, 0;\n"
|
||||
" setp.eq.s32 %p13, %r101, %r102;\n"
|
||||
" @%p13 bra $Lt_3_26370;\n"
|
||||
" .loc 14 226 0\n"
|
||||
" mul.lo.u64 %rd39, %rd37, 16;\n"
|
||||
" add.u64 %rd40, %rd6, %rd39;\n"
|
||||
" mov.f32 %f27, %f13;\n"
|
||||
" ld.shared.f32 %f28, [%rd40+0];\n"
|
||||
" sub.f32 %f29, %f27, %f28;\n"
|
||||
" .loc 14 227 0\n"
|
||||
" mov.f32 %f30, %f14;\n"
|
||||
" ld.shared.f32 %f31, [%rd40+4];\n"
|
||||
" sub.f32 %f32, %f30, %f31;\n"
|
||||
" .loc 14 228 0\n"
|
||||
" mov.f32 %f33, %f15;\n"
|
||||
" ld.shared.f32 %f34, [%rd40+8];\n"
|
||||
" sub.f32 %f35, %f33, %f34;\n"
|
||||
" .loc 14 226 0\n"
|
||||
" mul.f32 %f36, %f32, %f32;\n"
|
||||
" mad.f32 %f37, %f29, %f29, %f36;\n"
|
||||
" mad.f32 %f38, %f35, %f35, %f37;\n"
|
||||
" ld.param.f32 %f39, [__cudaparm_calc_neigh_list_cell_cell_size];\n"
|
||||
" mul.f32 %f40, %f39, %f39;\n"
|
||||
" setp.lt.f32 %p14, %f38, %f40;\n"
|
||||
" @!%p14 bra $Lt_3_26370;\n"
|
||||
" cvt.f64.f32 %fd1, %f38;\n"
|
||||
" mov.f64 %fd2, 0d3ee4f8b588e368f1; \n"
|
||||
" setp.gt.f64 %p15, %fd1, %fd2;\n"
|
||||
" @!%p15 bra $Lt_3_26370;\n"
|
||||
" ld.param.s32 %r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
|
||||
" setp.le.s32 %p16, %r103, %r51;\n"
|
||||
" @%p16 bra $Lt_3_22274;\n"
|
||||
" .loc 14 233 0\n"
|
||||
" st.global.s32 [%rd24+0], %r95;\n"
|
||||
" .loc 14 234 0\n"
|
||||
" mul.lo.u64 %rd41, %rd22, 4;\n"
|
||||
" add.u64 %rd24, %rd24, %rd41;\n"
|
||||
"$Lt_3_22274:\n"
|
||||
" .loc 14 236 0\n"
|
||||
" add.s32 %r51, %r51, 1;\n"
|
||||
"$Lt_3_26370:\n"
|
||||
"$L_3_14082:\n"
|
||||
"$Lt_3_21762:\n"
|
||||
" add.s32 %r93, %r93, 1;\n"
|
||||
" add.u64 %rd37, %rd37, 1;\n"
|
||||
" add.u64 %rd38, %rd38, 4;\n"
|
||||
" setp.ne.s32 %p17, %r82, %r93;\n"
|
||||
" @%p17 bra $Lt_3_21506;\n"
|
||||
"$Lt_3_20994:\n"
|
||||
"$Lt_3_20482:\n"
|
||||
" .loc 14 241 0\n"
|
||||
" bar.sync 0;\n"
|
||||
" add.s32 %r77, %r77, 64;\n"
|
||||
" setp.ne.s32 %p18, %r77, %r78;\n"
|
||||
" @%p18 bra $Lt_3_19714;\n"
|
||||
"$Lt_3_19202:\n"
|
||||
" add.s32 %r68, %r68, 1;\n"
|
||||
" add.u64 %rd27, %rd27, 4;\n"
|
||||
" setp.ne.s32 %p19, %r68, %r69;\n"
|
||||
" @%p19 bra $Lt_3_18946;\n"
|
||||
"$Lt_3_18434:\n"
|
||||
" add.s32 %r56, %r56, 1;\n"
|
||||
" setp.ne.s32 %p20, %r59, %r56;\n"
|
||||
" @%p20 bra $Lt_3_18178;\n"
|
||||
"$Lt_3_17666:\n"
|
||||
" add.s32 %r50, %r50, 1;\n"
|
||||
" setp.ne.s32 %p21, %r54, %r50;\n"
|
||||
" @%p21 bra $Lt_3_17410;\n"
|
||||
" bra.uni $Lt_3_16898;\n"
|
||||
"$Lt_3_25090:\n"
|
||||
"$Lt_3_16898:\n"
|
||||
" @!%p4 bra $Lt_3_24066;\n"
|
||||
" .loc 14 247 0\n"
|
||||
" st.global.s32 [%rd17+0], %r51;\n"
|
||||
"$Lt_3_24066:\n"
|
||||
" .loc 14 169 0\n"
|
||||
" add.s32 %r42, %r42, 1;\n"
|
||||
" add.u32 %r34, %r34, %r15;\n"
|
||||
" add.s32 %r37, %r37, %r15;\n"
|
||||
" cvt.rn.f32.s32 %f41, %r42;\n"
|
||||
" setp.lt.f32 %p22, %f41, %f4;\n"
|
||||
" @%p22 bra $Lt_3_15106;\n"
|
||||
"$Lt_3_14594:\n"
|
||||
" .loc 14 249 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_calc_neigh_list_cell:\n"
|
||||
" }\n"
|
||||
" .entry kernel_special (\n"
|
||||
" .param .u64 __cudaparm_kernel_special_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_host_nbor_list,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_tag,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_nspecial,\n"
|
||||
" .param .u64 __cudaparm_kernel_special_special,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_inum,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_nt,\n"
|
||||
" .param .s32 __cudaparm_kernel_special_nall)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<31>;\n"
|
||||
" .reg .u64 %rd<31>;\n"
|
||||
" .reg .pred %p<11>;\n"
|
||||
" .loc 14 254 0\n"
|
||||
"$LBB1_kernel_special:\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_special_nt];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_4_6146;\n"
|
||||
" .loc 14 262 0\n"
|
||||
" ld.param.u64 %rd1, [__cudaparm_kernel_special_nspecial];\n"
|
||||
" mul.lo.s32 %r7, %r5, 3;\n"
|
||||
" cvt.s64.s32 %rd2, %r7;\n"
|
||||
" mul.lo.u64 %rd3, %rd2, 4;\n"
|
||||
" add.u64 %rd4, %rd1, %rd3;\n"
|
||||
" ld.global.s32 %r8, [%rd4+0];\n"
|
||||
" .loc 14 263 0\n"
|
||||
" ld.global.s32 %r9, [%rd4+4];\n"
|
||||
" .loc 14 264 0\n"
|
||||
" ld.global.s32 %r10, [%rd4+8];\n"
|
||||
" ld.param.s32 %r11, [__cudaparm_kernel_special_inum];\n"
|
||||
" cvt.u64.s32 %rd5, %r11;\n"
|
||||
" cvt.u64.s32 %rd6, %r5;\n"
|
||||
" setp.le.s32 %p2, %r11, %r5;\n"
|
||||
" @%p2 bra $Lt_4_6914;\n"
|
||||
" .loc 14 267 0\n"
|
||||
" mov.s32 %r12, %r11;\n"
|
||||
" .loc 14 268 0\n"
|
||||
" ld.param.u64 %rd7, [__cudaparm_kernel_special_dev_nbor];\n"
|
||||
" add.u64 %rd8, %rd5, %rd6;\n"
|
||||
" mul.lo.u64 %rd9, %rd8, 4;\n"
|
||||
" add.u64 %rd10, %rd7, %rd9;\n"
|
||||
" bra.uni $Lt_4_6658;\n"
|
||||
"$Lt_4_6914:\n"
|
||||
" .loc 14 270 0\n"
|
||||
" sub.s32 %r12, %r6, %r11;\n"
|
||||
" .loc 14 271 0\n"
|
||||
" ld.param.u64 %rd11, [__cudaparm_kernel_special_host_nbor_list];\n"
|
||||
" mul.lo.u64 %rd12, %rd6, 4;\n"
|
||||
" add.u64 %rd13, %rd11, %rd12;\n"
|
||||
" mul.lo.u64 %rd14, %rd5, 4;\n"
|
||||
" sub.u64 %rd10, %rd13, %rd14;\n"
|
||||
"$Lt_4_6658:\n"
|
||||
" .loc 14 273 0\n"
|
||||
" ld.global.s32 %r13, [%rd10+0];\n"
|
||||
" .loc 14 274 0\n"
|
||||
" cvt.u64.s32 %rd15, %r12;\n"
|
||||
" mul.lo.u64 %rd16, %rd15, 4;\n"
|
||||
" add.u64 %rd10, %rd10, %rd16;\n"
|
||||
" .loc 14 275 0\n"
|
||||
" mul.lo.s32 %r14, %r12, %r13;\n"
|
||||
" cvt.s64.s32 %rd17, %r14;\n"
|
||||
" mul.lo.u64 %rd18, %rd17, 4;\n"
|
||||
" add.u64 %rd19, %rd10, %rd18;\n"
|
||||
" setp.le.u64 %p3, %rd19, %rd10;\n"
|
||||
" @%p3 bra $Lt_4_7170;\n"
|
||||
" mov.s32 %r15, 0;\n"
|
||||
" setp.gt.s32 %p4, %r10, %r15;\n"
|
||||
" ld.param.u64 %rd20, [__cudaparm_kernel_special_tag];\n"
|
||||
"$Lt_4_7682:\n"
|
||||
" .loc 14 278 0\n"
|
||||
" ld.global.s32 %r16, [%rd10+0];\n"
|
||||
" .loc 14 279 0\n"
|
||||
" cvt.u64.s32 %rd21, %r16;\n"
|
||||
" mul.lo.u64 %rd22, %rd21, 4;\n"
|
||||
" add.u64 %rd23, %rd20, %rd22;\n"
|
||||
" ld.global.s32 %r17, [%rd23+0];\n"
|
||||
" @!%p4 bra $Lt_4_7938;\n"
|
||||
" mov.s32 %r18, %r10;\n"
|
||||
" mul.lo.u64 %rd24, %rd6, 4;\n"
|
||||
" cvt.s64.s32 %rd25, %r6;\n"
|
||||
" mul.lo.u64 %rd26, %rd25, 4;\n"
|
||||
" ld.param.u64 %rd27, [__cudaparm_kernel_special_special];\n"
|
||||
" add.u64 %rd28, %rd27, %rd24;\n"
|
||||
" mov.s32 %r19, 0;\n"
|
||||
" mov.s32 %r20, %r18;\n"
|
||||
"$Lt_4_8450:\n"
|
||||
" ld.global.s32 %r21, [%rd28+0];\n"
|
||||
" setp.ne.s32 %p5, %r21, %r17;\n"
|
||||
" @%p5 bra $Lt_4_8706;\n"
|
||||
" .loc 14 289 0\n"
|
||||
" setp.le.s32 %p6, %r8, %r19;\n"
|
||||
" mov.s32 %r22, 3;\n"
|
||||
" mov.s32 %r23, 2;\n"
|
||||
" selp.s32 %r24, %r22, %r23, %p6;\n"
|
||||
" mov.s32 %r25, 2;\n"
|
||||
" mov.s32 %r26, 1;\n"
|
||||
" selp.s32 %r27, %r25, %r26, %p6;\n"
|
||||
" setp.le.s32 %p7, %r9, %r19;\n"
|
||||
" selp.s32 %r28, %r24, %r27, %p7;\n"
|
||||
" shl.b32 %r29, %r28, 30;\n"
|
||||
" xor.b32 %r16, %r16, %r29;\n"
|
||||
" .loc 14 290 0\n"
|
||||
" st.global.s32 [%rd10+0], %r16;\n"
|
||||
"$Lt_4_8706:\n"
|
||||
" add.s32 %r19, %r19, 1;\n"
|
||||
" add.u64 %rd28, %rd26, %rd28;\n"
|
||||
" setp.ne.s32 %p8, %r10, %r19;\n"
|
||||
" @%p8 bra $Lt_4_8450;\n"
|
||||
"$Lt_4_7938:\n"
|
||||
" .loc 14 277 0\n"
|
||||
" mul.lo.u64 %rd29, %rd15, 4;\n"
|
||||
" add.u64 %rd10, %rd10, %rd29;\n"
|
||||
" setp.gt.u64 %p9, %rd19, %rd10;\n"
|
||||
" @%p9 bra $Lt_4_7682;\n"
|
||||
"$Lt_4_7170:\n"
|
||||
"$Lt_4_6146:\n"
|
||||
" .loc 14 296 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_special:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,120 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/*************************************************************************
|
||||
Preprocessor Definitions
|
||||
|
||||
Note: It is assumed that constants with the same names are defined with
|
||||
the same values in all files.
|
||||
|
||||
ARCH
|
||||
Definition: Architecture number for accelerator
|
||||
MEM_THREADS
|
||||
Definition: Number of threads with sequential ids accessing memory
|
||||
simultaneously on multiprocessor
|
||||
WARP_SIZE:
|
||||
Definition: Number of threads guaranteed to be on the same instruction
|
||||
THREADS_PER_ATOM
|
||||
Definition: Default number of threads assigned per atom for pair styles
|
||||
Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
|
||||
THREADS_PER_CHARGE
|
||||
Definition: Default number of threads assigned per atom for pair styles
|
||||
with charge
|
||||
Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
|
||||
PPPM_MAX_SPLINE
|
||||
Definition: Maximum order for splines in PPPM
|
||||
PPPM_BLOCK_1D
|
||||
Definition: Thread block size for PPPM kernels
|
||||
Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
|
||||
PPPM_BLOCK_1D%32==0
|
||||
BLOCK_PAIR
|
||||
Definition: Default thread block size for pair styles
|
||||
Restrictions:
|
||||
MAX_SHARED_TYPES 8
|
||||
Definition: Max number of atom type params can be stored in shared memory
|
||||
Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
|
||||
BLOCK_CELL_2D
|
||||
Definition: Default block size in each dimension for cell list builds
|
||||
and matrix transpose
|
||||
BLOCK_CELL_ID
|
||||
Definition: Default block size for binning atoms in cell list builds
|
||||
BLOCK_NBOR_BUILD
|
||||
Definition: Default block size for neighbor list builds
|
||||
BLOCK_BIO_PAIR
|
||||
Definition: Default thread block size for "bio" pair styles
|
||||
MAX_BIO_SHARED_TYPES
|
||||
Definition: Max number of atom type params can be stored in shared memory
|
||||
Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
|
||||
MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
|
||||
|
||||
*************************************************************************/
|
||||
|
||||
#ifndef PAIR_GPU_DEV_KERNEL
|
||||
#define PAIR_GPU_DEV_KERNEL
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "nv_kernel_def.h"
|
||||
|
||||
#else
|
||||
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define ARCH 0
|
||||
#define DRIVER 0
|
||||
#define MEM_THREADS 16
|
||||
#define WARP_SIZE 1
|
||||
#define THREADS_PER_ATOM 1
|
||||
#define THREADS_PER_CHARGE 1
|
||||
#define BLOCK_PAIR 64
|
||||
#define MAX_SHARED_TYPES 8
|
||||
#define BLOCK_NBOR_BUILD 64
|
||||
#define BLOCK_BIO_PAIR 64
|
||||
|
||||
#endif
|
||||
|
||||
#define PPPM_MAX_SPLINE 8
|
||||
#define PPPM_BLOCK_1D 64
|
||||
#define BLOCK_CELL_2D 8
|
||||
#define BLOCK_CELL_ID 128
|
||||
#define MAX_BIO_SHARED_TYPES 128
|
||||
|
||||
// Set the first numel ints of mem to zero. Each thread/work-item clears the
// single element selected by GLOBAL_ID_X; ids at or beyond numel do nothing.
__kernel void kernel_zero(__global int *mem, int numel) {
  const int idx=GLOBAL_ID_X;

  // Nothing to clear for ids past the end of the buffer
  if (idx>=numel)
    return;

  mem[idx]=0;
}
|
||||
|
||||
// Copy the compile-time tuning constants of this kernel build into info.
// Slots 0..13 are written, so info must hold at least 14 ints. Every
// thread that runs the kernel stores the same values.
__kernel void kernel_info(__global int *info) {
  // Accelerator description
  info[0]=ARCH;
  info[1]=MEM_THREADS;
  info[2]=WARP_SIZE;

  // Threads assigned per atom (pair styles without / with charge)
  info[3]=THREADS_PER_ATOM;
  info[13]=THREADS_PER_CHARGE;

  // PPPM limits
  info[4]=PPPM_MAX_SPLINE;
  info[5]=PPPM_BLOCK_1D;

  // Pair-style block sizes and shared-memory type limits
  info[6]=BLOCK_PAIR;
  info[7]=MAX_SHARED_TYPES;
  info[11]=BLOCK_BIO_PAIR;
  info[12]=MAX_BIO_SHARED_TYPES;

  // Cell-list and neighbor-list build block sizes
  info[8]=BLOCK_CELL_2D;
  info[9]=BLOCK_CELL_ID;
  info[10]=BLOCK_NBOR_BUILD;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
/* PTX source (version 1.4, target sm_13) for the neighbor-list unpack
   kernel, stored as a C string. The .loc directives suggest this text is
   compiler output rather than hand-written code, so it should presumably be
   regenerated from its kernel source instead of edited here (TODO confirm
   against the build rules).

   The module defines one entry point, kernel_unpack(dev_nbor, dev_ij, inum).
   As read from the instructions below, thread ii = ctaid.x*ntid.x + tid.x,
   for ii < inum:
     - loads a count from dev_nbor[ii + inum];
     - loads an int offset from dev_nbor[ii + 2*inum];
     - copies that many consecutive ints starting at dev_ij[offset] into
       dev_nbor, beginning at element ii + 2*inum and advancing by inum
       elements per copied value (the first store overwrites the offset). */
const char * pair_gpu_nbor_kernel =
|
||||
" .version 1.4\n"
|
||||
" .target sm_13\n"
|
||||
" .entry kernel_unpack (\n"
|
||||
" .param .u64 __cudaparm_kernel_unpack_dev_nbor,\n"
|
||||
" .param .u64 __cudaparm_kernel_unpack_dev_ij,\n"
|
||||
" .param .s32 __cudaparm_kernel_unpack_inum)\n"
|
||||
" {\n"
|
||||
" .reg .u32 %r<11>;\n"
|
||||
" .reg .u64 %rd<27>;\n"
|
||||
" .reg .pred %p<5>;\n"
|
||||
" .loc 14 29 0\n"
|
||||
"$LBB1_kernel_unpack:\n"
|
||||
" cvt.s32.u16 %r1, %ctaid.x;\n"
|
||||
" cvt.s32.u16 %r2, %ntid.x;\n"
|
||||
" mul24.lo.s32 %r3, %r1, %r2;\n"
|
||||
" cvt.u32.u16 %r4, %tid.x;\n"
|
||||
" add.u32 %r5, %r3, %r4;\n"
|
||||
" ld.param.s32 %r6, [__cudaparm_kernel_unpack_inum];\n"
|
||||
" setp.le.s32 %p1, %r6, %r5;\n"
|
||||
" @%p1 bra $Lt_0_2050;\n"
|
||||
" .loc 14 35 0\n"
|
||||
" cvt.u64.s32 %rd1, %r6;\n"
|
||||
" ld.param.u64 %rd2, [__cudaparm_kernel_unpack_dev_nbor];\n"
|
||||
" cvt.u64.s32 %rd3, %r5;\n"
|
||||
" add.u64 %rd4, %rd3, %rd1;\n"
|
||||
" mul.lo.u64 %rd5, %rd4, 4;\n"
|
||||
" add.u64 %rd6, %rd2, %rd5;\n"
|
||||
" ld.global.s32 %r7, [%rd6+0];\n"
|
||||
" .loc 14 36 0\n"
|
||||
" mul.lo.u64 %rd7, %rd1, 4;\n"
|
||||
" add.u64 %rd8, %rd6, %rd7;\n"
|
||||
" mov.s64 %rd9, %rd8;\n"
|
||||
" .loc 14 37 0\n"
|
||||
" ld.param.u64 %rd10, [__cudaparm_kernel_unpack_dev_ij];\n"
|
||||
" ld.global.s32 %r8, [%rd8+0];\n"
|
||||
" cvt.u64.s32 %rd11, %r8;\n"
|
||||
" mul.lo.u64 %rd12, %rd11, 4;\n"
|
||||
" add.u64 %rd13, %rd10, %rd12;\n"
|
||||
" .loc 14 38 0\n"
|
||||
" cvt.u64.s32 %rd14, %r7;\n"
|
||||
" mul.lo.u64 %rd15, %rd14, 4;\n"
|
||||
" add.u64 %rd16, %rd15, %rd13;\n"
|
||||
" setp.le.u64 %p2, %rd16, %rd13;\n"
|
||||
" @%p2 bra $Lt_0_2562;\n"
|
||||
" add.u64 %rd17, %rd15, 3;\n"
|
||||
" shr.s64 %rd18, %rd17, 63;\n"
|
||||
" mov.s64 %rd19, 3;\n"
|
||||
" and.b64 %rd20, %rd18, %rd19;\n"
|
||||
" add.s64 %rd21, %rd20, %rd17;\n"
|
||||
" shr.s64 %rd22, %rd21, 2;\n"
|
||||
" mov.s64 %rd23, 1;\n"
|
||||
" max.s64 %rd24, %rd22, %rd23;\n"
|
||||
" mov.s64 %rd25, %rd24;\n"
|
||||
"$Lt_0_3074:\n"
|
||||
" .loc 14 41 0\n"
|
||||
" ld.global.s32 %r9, [%rd13+0];\n"
|
||||
" st.global.s32 [%rd9+0], %r9;\n"
|
||||
" .loc 14 42 0\n"
|
||||
" add.u64 %rd9, %rd7, %rd9;\n"
|
||||
" .loc 14 40 0\n"
|
||||
" add.u64 %rd13, %rd13, 4;\n"
|
||||
" setp.gt.u64 %p3, %rd16, %rd13;\n"
|
||||
" @%p3 bra $Lt_0_3074;\n"
|
||||
"$Lt_0_2562:\n"
|
||||
"$Lt_0_2050:\n"
|
||||
" .loc 14 45 0\n"
|
||||
" exit;\n"
|
||||
"$LDWend_kernel_unpack:\n"
|
||||
" }\n"
|
||||
;
|
|
@ -0,0 +1,71 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include "pair_gpu_nbor_shared.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "pair_gpu_nbor_cl.h"
|
||||
#else
|
||||
#include "pair_gpu_nbor_ptx.h"
|
||||
#include "pair_gpu_build_ptx.h"
|
||||
#endif
|
||||
|
||||
void PairGPUNborShared::clear() {
|
||||
if (_compiled) {
|
||||
if (_gpu_nbor) {
|
||||
k_cell_id.clear();
|
||||
k_cell_counts.clear();
|
||||
k_build_nbor.clear();
|
||||
k_transpose.clear();
|
||||
k_special.clear();
|
||||
delete build_program;
|
||||
} else {
|
||||
k_nbor.clear();
|
||||
delete nbor_program;
|
||||
}
|
||||
_compiled=false;
|
||||
}
|
||||
}
|
||||
|
||||
// Build the neighbor-list program for device dev and look up its kernels.
//   gpu_nbor == false : load pair_gpu_nbor_kernel and bind kernel_unpack
//   gpu_nbor == true  : load pair_gpu_build_kernel and bind the cell-list /
//                       neighbor-build kernels plus the neigh_tex texture.
//                       Under USE_OPENCL this path prints an error and
//                       terminates the process instead.
// Returns immediately when kernels are already compiled; the gpu_nbor value
// of that earlier call stays in effect.
void PairGPUNborShared::compile_kernels(UCL_Device &dev, const bool gpu_nbor) {
  if (_compiled)
    return;

  _gpu_nbor=gpu_nbor;
  const std::string build_flags("-cl-fast-relaxed-math -cl-mad-enable");

  if (gpu_nbor) {
    build_program=new UCL_Program(dev);
    #ifdef USE_OPENCL
    std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
    exit(1);
    #else
    build_program->load_string(pair_gpu_build_kernel,build_flags.c_str());
    #endif
    k_cell_id.set_function(*build_program,"calc_cell_id");
    k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
    k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
    k_transpose.set_function(*build_program,"transpose");
    k_special.set_function(*build_program,"kernel_special");
    neigh_tex.get_texture(*build_program,"neigh_tex");
  } else {
    nbor_program=new UCL_Program(dev);
    nbor_program->load_string(pair_gpu_nbor_kernel,build_flags.c_str());
    k_nbor.set_function(*nbor_program,"kernel_unpack");
  }

  _compiled=true;
}
|
|
@ -0,0 +1,58 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PAIR_GPU_NBOR_SHARED_H
|
||||
#define PAIR_GPU_NBOR_SHARED_H
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
|
||||
#include "geryon/ocl_kernel.h"
|
||||
#include "geryon/ocl_texture.h"
|
||||
using namespace ucl_opencl;
|
||||
|
||||
#else
|
||||
|
||||
#include "geryon/nvd_kernel.h"
|
||||
#include "geryon/nvd_texture.h"
|
||||
using namespace ucl_cudadr;
|
||||
|
||||
#endif
|
||||
|
||||
class PairGPUNborShared {
|
||||
public:
|
||||
PairGPUNborShared() : _compiled(false) {}
|
||||
~PairGPUNborShared() { clear(); }
|
||||
|
||||
/// Free all memory on host and device
|
||||
void clear();
|
||||
|
||||
/// Texture for cached position/type access with CUDA
|
||||
UCL_Texture neigh_tex;
|
||||
|
||||
/// Compile kernels for neighbor lists
|
||||
void compile_kernels(UCL_Device &dev, const bool gpu_nbor);
|
||||
|
||||
// ----------------------------- Kernels
|
||||
UCL_Program *nbor_program, *build_program;
|
||||
UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
|
||||
UCL_Kernel k_transpose, k_special;
|
||||
|
||||
private:
|
||||
bool _compiled, _gpu_nbor;
|
||||
};
|
||||
|
||||
#endif
|
|
@ -0,0 +1,326 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PPPM_GPU_KERNEL
|
||||
#define PPPM_GPU_KERNEL
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
#define numtyp double
|
||||
#define numtyp4 double4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifdef _SINGLE_DOUBLE
|
||||
#define numtyp float
|
||||
#define numtyp4 float4
|
||||
#define acctyp double
|
||||
#define acctyp4 double4
|
||||
#endif
|
||||
|
||||
#ifndef numtyp
|
||||
#define numtyp float
|
||||
#define numtyp4 float4
|
||||
#define acctyp float
|
||||
#define acctyp4 float4
|
||||
#endif
|
||||
|
||||
#ifdef NV_KERNEL
|
||||
|
||||
#include "geryon/ucl_nv_kernel.h"
|
||||
texture<float4> pos_tex;
|
||||
texture<float> q_tex;
|
||||
|
||||
#ifdef _DOUBLE_DOUBLE
|
||||
// Double-precision position fetch: element i is read straight from the pos
// array (no texture is involved in this variant).
__inline double4 fetch_pos(const int& i, const double4 *pos)
{
  return *(pos+i);
}
|
||||
// Double-precision charge fetch: element i is read straight from the q
// array (no texture is involved in this variant).
__inline double fetch_q(const int& i, const double *q)
{
  return *(q+i);
}
|
||||
|
||||
#else
|
||||
// Single-precision position fetch: element i is read through the pos_tex
// texture reference. The pos argument is not used; it only keeps the call
// signature the same as the double-precision variant.
__inline float4 fetch_pos(const int& i, const float4 *pos)
{
  const float4 texel=tex1Dfetch(pos_tex, i);
  return texel;
}
|
||||
// Single-precision charge fetch: element i is read through the q_tex
// texture reference. The q argument is not used; it only keeps the call
// signature the same as the double-precision variant.
__inline float fetch_q(const int& i, const float *q)
{
  const float texel=tex1Dfetch(q_tex, i);
  return texel;
}
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
|
||||
#define GLOBAL_ID_X get_global_id(0)
|
||||
#define THREAD_ID_X get_local_id(0)
|
||||
#define BLOCK_ID_X get_group_id(0)
|
||||
#define BLOCK_SIZE_X get_local_size(0)
|
||||
#define GLOBAL_SIZE_X get_global_size(0)
|
||||
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
|
||||
#define __inline inline
|
||||
|
||||
#define fetch_pos(i,y) x_[i]
|
||||
#define fetch_q(i,y) q_[i]
|
||||
#define MEM_THREADS 16
|
||||
|
||||
#endif
|
||||
|
||||
// Maximum order for spline
|
||||
#define PPPM_MAX_SPLINE 8
|
||||
// Thread block size for PPPM kernels
|
||||
// - Must be >=PPPM_MAX_SPLINE^2
|
||||
// - Must be a multiple of 32
|
||||
#define PPPM_BLOCK_1D 64
|
||||
// Number of threads per pencil for charge spread
|
||||
#define PENCIL_SIZE MEM_THREADS
|
||||
// Number of pencils per block for charge spread
|
||||
#define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)
|
||||
|
||||
/* Bin the charged local atoms into grid cells.

   One thread handles one atom. For every atom ii<nlocal whose scaled charge
   delvolinv*q is non-zero, the kernel
     - converts the position to grid units relative to (b_lo_x,b_lo_y,b_lo_z)
       using delxinv/delyinv/delzinv and truncates to the cell (nx,ny,nz);
     - atomically reserves the next free slot of that cell in counts;
     - stores in ans[atom_stride*slot+cell] the offsets (n+0.5-scaled coord)
       in x,y,z and the scaled charge in w.

   *error is set to 1 when an atom lies outside the nlocal_x*nlocal_y*nlocal_z
   cells and to 2 when a cell already holds max_atoms atoms; in both cases the
   atom is not stored.

   The parameter names x_ and q_ are required by the OpenCL fetch_pos /
   fetch_q macros, which index those names directly. */
__kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
                           const grdtyp delvolinv, const int nlocal,
                           __global int *counts, __global grdtyp4 *ans,
                           const grdtyp b_lo_x, const grdtyp b_lo_y,
                           const grdtyp b_lo_z, const grdtyp delxinv,
                           const grdtyp delyinv, const grdtyp delzinv,
                           const int nlocal_x, const int nlocal_y,
                           const int nlocal_z, const int atom_stride,
                           const int max_atoms, __global int *error) {
  // ii is the index of the atom handled by this thread
  int ii=GLOBAL_ID_X;

  // Resequence the atom indices to avoid collisions during atomic ops
  int nthreads=GLOBAL_SIZE_X;
  ii=mul24(ii,PPPM_BLOCK_1D);
  ii-=(ii/nthreads)*(nthreads-1);

  int nx,ny,nz;

  if (ii<nlocal) {
    numtyp4 p=fetch_pos(ii,x_);
    grdtyp4 delta;
    delta.w=delvolinv*fetch_q(ii,q_);

    // Atoms without charge contribute nothing and are not binned
    if (delta.w!=(grdtyp)0.0) {
      delta.x=(p.x-b_lo_x)*delxinv;
      nx=delta.x;
      delta.y=(p.y-b_lo_y)*delyinv;
      ny=delta.y;
      delta.z=(p.z-b_lo_z)*delzinv;
      nz=delta.z;

      // The explicit <0 tests are needed because truncation maps (-1,0) to 0
      if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 ||
          nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
        *error=1;
      else {
        delta.x=nx+(grdtyp)0.5-delta.x;
        delta.y=ny+(grdtyp)0.5-delta.y;
        delta.z=nz+(grdtyp)0.5-delta.z;

        int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
        int old=atom_add(counts+i, 1);
        // Use >= rather than ==: when several threads overflow the same cell
        // before any of them has undone its increment, the later ones see
        // old>max_atoms and must also back out instead of writing past the
        // last slot of the cell.
        if (old>=max_atoms) {
          *error=2;
          atom_add(counts+i, -1);
        } else
          ans[atom_stride*old+i]=delta;
      }
    }
  }
}
|
||||
|
||||
/* --------------------------- */
|
||||
|
||||
/* Accumulate the binned atom charges onto the grid stored in brick.

   Threads are grouped into "pencils" of PENCIL_SIZE consecutive threads;
   each block processes BLOCK_PENCILS pencils. A pencil is assigned one
   (ny,nz) grid line and sweeps it along x in chunks of PENCIL_SIZE points.
   For every chunk each thread
     1. sums, for its x cell, the contributions of the atoms binned in the
        surrounding y/z cells (counts / atoms, laid out with atom_stride
        between consecutive atoms of a cell) into ans[0..order-1][tid],
        evaluating the spline polynomials from rho_coeff by Horner's rule;
     2. adds those order values into the pencil's shared "front" buffer at
        x offsets 0..order-1, carrying the part that spills past the chunk
        over to the next chunk;
     3. writes the finished front value for its point to brick.

   Requirements visible in the code: order2+order threads must exist to load
   rho_coeff, order must not exceed PPPM_MAX_SPLINE or PENCIL_SIZE, and every
   thread of the block must run the kernel to the end (barriers are executed
   unconditionally inside the chunk loop). */
__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
                       __global grdtyp *brick, __global grdtyp *_rho_coeff,
                       const int atom_stride, const int npts_x,
                       const int npts_y, const int npts_z, const int nlocal_x,
                       const int nlocal_y, const int nlocal_z,
                       const int order_m_1, const int order, const int order2) {
  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
  __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE];
  __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];

  // Stage the spline coefficients in shared memory
  const int tid=THREAD_ID_X;
  const int ncoeff=order2+order;
  if (tid<ncoeff)
    rho_coeff[tid]=_rho_coeff[tid];

  // Pencil this thread belongs to and its slot inside the pencil
  const int pencil=tid/PENCIL_SIZE;
  const int slot=tid%PENCIL_SIZE;
  const int halo_slot=slot+PENCIL_SIZE;
  if (slot<order)
    front[pencil][halo_slot]=(grdtyp)0.0;

  __syncthreads();

  // Grid line (ny,nz) swept by this pencil
  const int line=BLOCK_ID_X*BLOCK_PENCILS+pencil;
  const int ny=line%npts_y;
  const int nz=line/npts_y;

  // Clip the stencil in y and z so only existing atom cells are visited
  const int y_start=(ny<order_m_1) ? order_m_1-ny : 0;
  const int z_start=(nz<order_m_1) ? order_m_1-nz : 0;
  int y_stop=order;
  if (ny>=nlocal_y)
    y_stop-=ny-nlocal_y+1;
  int z_stop=order;
  if (nz>=nlocal_z)
    z_stop-=nz-nlocal_z+1;
  const int z_stride=mul24(nlocal_x,nlocal_y);

  const int npasses=npts_x/PENCIL_SIZE+1;
  int nx=slot;
  int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
  for (int pass=0; pass<npasses; pass++, pt+=PENCIL_SIZE, nx+=PENCIL_SIZE) {
    for (int n=0; n<order; n++)
      ans[n][tid]=(grdtyp)0.0;

    if (nx<nlocal_x && nz<npts_z) {
      int z_pos=mul24(nz+z_start-order_m_1,z_stride);
      for (int m=z_start; m<z_stop; m++, z_pos+=z_stride) {
        int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
        for (int l=y_start; l<y_stop; l++, y_pos+=nlocal_x) {
          const int cell=z_pos+y_pos+nx;
          // Atoms of a cell sit at cell, cell+atom_stride, ...; since
          // cell<atom_stride is assumed by the layout, this bound stops
          // after counts[cell] entries
          const int row_end=mul24(counts[cell],atom_stride);
          for (int entry=cell; entry<row_end; entry+=atom_stride) {
            const grdtyp4 atom=atoms[entry];

            // y and z spline weights for stencil offsets l and m
            grdtyp wy=(grdtyp)0.0;
            grdtyp wz=(grdtyp)0.0;
            for (int k=ncoeff-1; k>=0; k-=order) {
              wy=rho_coeff[k-l]+wy*atom.y;
              wz=rho_coeff[k-m]+wz*atom.z;
            }
            const grdtyp yz_charge=atom.w*(wy*wz);

            // x spline weight for each of the order points this atom touches
            for (int n=0; n<order; n++) {
              grdtyp wx=(grdtyp)0.0;
              for (int k=order2+n; k>=n; k-=order)
                wx=rho_coeff[k]+wx*atom.x;
              ans[n][tid]+=yz_charge*wx;
            }
          }
        }
      }
    }

    __syncthreads();

    // Shift the spill-over of the previous chunk to the start of the buffer
    // and clear everything else
    if (slot>=order)
      front[pencil][slot]=(grdtyp)0.0;
    else {
      front[pencil][slot]=front[pencil][halo_slot];
      front[pencil][halo_slot]=(grdtyp)0.0;
    }

    // In step n every thread adds to a distinct element (slot+n), so a
    // barrier between steps is all that is needed to avoid conflicts
    for (int n=0; n<order; n++) {
      front[pencil][slot+n]+=ans[n][tid];
      __syncthreads();
    }

    if (nx<npts_x && nz<npts_z)
      brick[pt]=front[pencil][slot];
  }
}
|
||||
|
||||
/* Interpolate the grid values in brick back to the atoms.

   One thread handles atom ii = thread id + block id * block size. For an
   atom with non-zero scaled charge qs=qqrd2e_scale*q the kernel locates its
   grid cell exactly as particle_map does, evaluates the order spline weights
   per dimension from rho_coeff (Horner's rule) and accumulates
       ek.xyz -= qs * wz[n] * wy[m] * wx[l] * brick[...].xyz
   over the order^3 stencil starting at the atom's cell. The result is
   written to ans[ii]; atoms with zero charge get x=y=z=0.

   NOTE(review): ek.w is never assigned before ans[ii]=ek, so that component
   of the output is indeterminate - presumably the caller ignores it.

   The parameter names x_ and q_ are required by the OpenCL fetch_pos /
   fetch_q macros, which index those names directly. */
__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
                     const int nlocal, __global grdtyp4 *brick,
                     __global grdtyp *_rho_coeff, const int npts_x,
                     const int npts_yx, const grdtyp b_lo_x,
                     const grdtyp b_lo_y, const grdtyp b_lo_z,
                     const grdtyp delxinv, const grdtyp delyinv,
                     const grdtyp delzinv, const int order,
                     const int order2, const grdtyp qqrd2e_scale,
                     __global acctyp4 *ans) {
  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
  __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
  __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];

  // Stage the spline coefficients in shared memory
  const int tid=THREAD_ID_X;
  if (tid<order2+order)
    rho_coeff[tid]=_rho_coeff[tid];
  __syncthreads();

  const int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
  if (ii>=nlocal)
    return;

  const numtyp4 p=fetch_pos(ii,x_);
  const grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);

  acctyp4 ek;
  ek.x=(acctyp)0.0;
  ek.y=(acctyp)0.0;
  ek.z=(acctyp)0.0;

  if (qs!=(grdtyp)0.0) {
    // Position in grid units and the cell it truncates to
    const grdtyp tx=(p.x-b_lo_x)*delxinv;
    const grdtyp ty=(p.y-b_lo_y)*delyinv;
    const grdtyp tz=(p.z-b_lo_z)*delzinv;
    const int nx=tx;
    const int ny=ty;
    const int nz=tz;

    const grdtyp dx=nx+(grdtyp)0.5-tx;
    const grdtyp dy=ny+(grdtyp)0.5-ty;
    const grdtyp dz=nz+(grdtyp)0.5-tz;

    // x and y weights are reused for every z layer, so keep them in this
    // thread's column of shared memory
    for (int k=0; k<order; k++) {
      grdtyp wx=(grdtyp)0.0;
      grdtyp wy=(grdtyp)0.0;
      for (int l=order2+k; l>=k; l-=order) {
        wx=rho_coeff[l]+wx*dx;
        wy=rho_coeff[l]+wy*dy;
      }
      rho1d_0[k][tid]=wx;
      rho1d_1[k][tid]=wy;
    }

    int mz=mul24(nz,npts_yx)+nx;
    for (int n=0; n<order; n++, mz+=npts_yx) {
      // z weight for this layer
      grdtyp wz=(grdtyp)0.0;
      for (int k=order2+n; k>=n; k-=order)
        wz=rho_coeff[k]+wz*dz;
      const grdtyp z0=qs*wz;

      int my=mz+mul24(ny,npts_x);
      for (int m=0; m<order; m++, my+=npts_x) {
        const grdtyp y0=z0*rho1d_1[m][tid];
        for (int l=0; l<order; l++) {
          const grdtyp x0=y0*rho1d_0[l][tid];
          const grdtyp4 el=brick[my+l];
          ek.x-=x0*el.x;
          ek.y-=x0*el.y;
          ek.z-=x0*el.z;
        }
      }
    }
  }

  ans[ii]=ek;
}
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,405 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "pppm_gpu_cl.h"
|
||||
#else
|
||||
#include "pppm_f_gpu_ptx.h"
|
||||
#include "pppm_d_gpu_ptx.h"
|
||||
#endif
|
||||
#include "pppm_gpu_memory.h"
|
||||
#include <cassert>
|
||||
|
||||
#define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp, grdtyp, grdtyp4>
|
||||
|
||||
extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
|
||||
_max_bytes(0) {
|
||||
device=&pair_gpu_device;
|
||||
ans=new PairGPUAns<numtyp,acctyp>();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
PPPMGPUMemoryT::~PPPMGPUMemory() {
|
||||
clear(0.0);
|
||||
delete ans;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
int PPPMGPUMemoryT::bytes_per_atom() const {
|
||||
return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
|
||||
const int order, const int nxlo_out,
|
||||
const int nylo_out, const int nzlo_out,
|
||||
const int nxhi_out, const int nyhi_out,
|
||||
const int nzhi_out, double **rho_coeff,
|
||||
grdtyp **vd_brick, const double slab_volfactor,
|
||||
const int nx_pppm, const int ny_pppm,
|
||||
const int nz_pppm, int &flag) {
|
||||
_max_bytes=10;
|
||||
screen=_screen;
|
||||
bool success=true;
|
||||
|
||||
flag=device->init(*ans,nlocal,nall);
|
||||
if (flag!=0)
|
||||
return 0;
|
||||
if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
|
||||
flag=-5;
|
||||
return 0;
|
||||
}
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pppm_block();
|
||||
_pencil_size=device->num_mem_threads();
|
||||
_block_pencils=_block_size/_pencil_size;
|
||||
|
||||
compile_kernels(*ucl_device);
|
||||
|
||||
// Initialize timers for the selected GPU
|
||||
time_in.init(*ucl_device);
|
||||
time_in.zero();
|
||||
time_out.init(*ucl_device);
|
||||
time_out.zero();
|
||||
time_map.init(*ucl_device);
|
||||
time_map.zero();
|
||||
time_rho.init(*ucl_device);
|
||||
time_rho.zero();
|
||||
time_interp.init(*ucl_device);
|
||||
time_interp.zero();
|
||||
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
|
||||
_allocated=true;
|
||||
_max_bytes=0;
|
||||
_max_an_bytes=ans->gpu_bytes();
|
||||
|
||||
_order=order;
|
||||
_order_m_1=order-1;
|
||||
_order2=_order_m_1*_order;
|
||||
_nlower=-(_order-1)/2;
|
||||
_nupper=order/2;
|
||||
_nxlo_out=nxlo_out;
|
||||
_nylo_out=nylo_out;
|
||||
_nzlo_out=nzlo_out;
|
||||
_nxhi_out=nxhi_out;
|
||||
_nyhi_out=nyhi_out;
|
||||
_nzhi_out=nzhi_out;
|
||||
|
||||
_slab_volfactor=slab_volfactor;
|
||||
_nx_pppm=nx_pppm;
|
||||
_ny_pppm=ny_pppm;
|
||||
_nz_pppm=nz_pppm;
|
||||
|
||||
_max_brick_atoms=10;
|
||||
|
||||
// Get rho_coeff on device
|
||||
int n2lo=(1-order)/2;
|
||||
int numel=order*( order/2 - n2lo + 1 );
|
||||
success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
|
||||
UCL_SUCCESS);
|
||||
UCL_H_Vec<double> view;
|
||||
view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
|
||||
ucl_copy(d_rho_coeff,view,true);
|
||||
_max_bytes+=d_rho_coeff.row_bytes();
|
||||
|
||||
// Allocate storage for grid
|
||||
_npts_x=nxhi_out-nxlo_out+1;
|
||||
_npts_y=nyhi_out-nylo_out+1;
|
||||
_npts_z=nzhi_out-nzlo_out+1;
|
||||
_npts_yx=_npts_x*_npts_y;
|
||||
success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
*vd_brick=h_vd_brick.begin();
|
||||
_max_bytes+=d_brick.row_bytes();
|
||||
|
||||
// Allocate vector with count of atoms assigned to each grid point
|
||||
_nlocal_x=_npts_x+_nlower-_nupper;
|
||||
_nlocal_y=_npts_y+_nlower-_nupper;
|
||||
_nlocal_z=_npts_z+_nlower-_nupper;
|
||||
_nlocal_yx=_nlocal_x*_nlocal_y;
|
||||
_atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
|
||||
success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
|
||||
UCL_SUCCESS);
|
||||
_max_bytes+=d_brick_counts.row_bytes();
|
||||
|
||||
// Allocate storage for atoms assigned to each grid point
|
||||
success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
|
||||
*ucl_device)==UCL_SUCCESS);
|
||||
_max_bytes+=d_brick_atoms.row_bytes();
|
||||
|
||||
// Allocate error flags for checking out of bounds atoms
|
||||
success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
|
||||
success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
|
||||
UCL_SUCCESS);
|
||||
if (!success) {
|
||||
flag=-3;
|
||||
return 0;
|
||||
}
|
||||
|
||||
d_error_flag.zero();
|
||||
_max_bytes+=1;
|
||||
|
||||
_cpu_idle_time=0.0;
|
||||
|
||||
return h_brick.begin();
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
void PPPMGPUMemoryT::clear(const double cpu_time) {
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
_precompute_done=false;
|
||||
|
||||
d_brick.clear();
|
||||
h_brick.clear();
|
||||
h_vd_brick.clear();
|
||||
d_brick_counts.clear();
|
||||
h_error_flag.clear();
|
||||
d_error_flag.clear();
|
||||
d_brick_atoms.clear();
|
||||
|
||||
acc_timers();
|
||||
device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
|
||||
*ans,_max_bytes+_max_an_bytes,cpu_time,
|
||||
_cpu_idle_time,screen);
|
||||
|
||||
if (_compiled) {
|
||||
k_particle_map.clear();
|
||||
k_make_rho.clear();
|
||||
k_interp.clear();
|
||||
delete pppm_program;
|
||||
_compiled=false;
|
||||
}
|
||||
|
||||
time_in.clear();
|
||||
time_out.clear();
|
||||
time_map.clear();
|
||||
time_rho.clear();
|
||||
time_interp.clear();
|
||||
|
||||
device->clear();
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Charge assignment that can be performed asynchronously
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
void PPPMGPUMemoryT::_precompute(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *host_q, double *boxlo,
|
||||
const double delxinv, const double delyinv,
|
||||
const double delzinv) {
|
||||
acc_timers();
|
||||
if (nlocal==0) {
|
||||
zero_timers();
|
||||
return;
|
||||
}
|
||||
|
||||
ans->inum(nlocal);
|
||||
|
||||
if (ago==0) {
|
||||
resize_atom(nlocal,nall,success);
|
||||
resize_local(nlocal,success);
|
||||
if (!success)
|
||||
return;
|
||||
|
||||
double bytes=ans->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
_max_an_bytes=bytes;
|
||||
}
|
||||
|
||||
atom->cast_x_data(host_x,host_type);
|
||||
atom->cast_q_data(host_q);
|
||||
atom->add_x_data(host_x,host_type);
|
||||
atom->add_q_data();
|
||||
|
||||
time_map.start();
|
||||
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
|
||||
// Boxlo adjusted to be upper left brick and shift for even spline order
|
||||
double shift=0.0;
|
||||
if (_order % 2)
|
||||
shift=0.5;
|
||||
_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
|
||||
_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
|
||||
_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
|
||||
|
||||
_delxinv=delxinv;
|
||||
_delyinv=delyinv;
|
||||
_delzinv=delzinv;
|
||||
double delvolinv = delxinv*delyinv*delzinv;
|
||||
grdtyp f_delvolinv = delvolinv;
|
||||
|
||||
device->zero(d_brick_counts,d_brick_counts.numel());
|
||||
k_particle_map.set_size(GX,BX);
|
||||
k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
|
||||
&ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
|
||||
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
|
||||
&_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
|
||||
&_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
|
||||
time_map.stop();
|
||||
|
||||
time_rho.start();
|
||||
BX=block_size();
|
||||
|
||||
GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
|
||||
_block_pencils));
|
||||
k_make_rho.set_size(GX,BX);
|
||||
k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
|
||||
&d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
|
||||
&_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
|
||||
&_nlocal_z, &_order_m_1, &_order, &_order2);
|
||||
time_rho.stop();
|
||||
|
||||
time_out.start();
|
||||
ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
|
||||
ucl_copy(h_error_flag,d_error_flag,true);
|
||||
time_out.stop();
|
||||
|
||||
_precompute_done=true;
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Charge spreading stuff
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *host_q, double *boxlo,
|
||||
const double delxinv, const double delyinv,
|
||||
const double delzinv) {
|
||||
if (_precompute_done==false) {
|
||||
atom->acc_timers();
|
||||
_precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
|
||||
delyinv,delzinv);
|
||||
}
|
||||
|
||||
device->stop_host_timer();
|
||||
|
||||
if (!success || nlocal==0)
|
||||
return 0;
|
||||
|
||||
double t=MPI_Wtime();
|
||||
time_out.sync_stop();
|
||||
_cpu_idle_time+=MPI_Wtime()-t;
|
||||
|
||||
_precompute_done=false;
|
||||
|
||||
if (h_error_flag[0]==2) {
|
||||
// Not enough storage for atoms on the brick
|
||||
_max_brick_atoms*=2;
|
||||
d_error_flag.zero();
|
||||
d_brick_atoms.clear();
|
||||
d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device);
|
||||
_max_bytes+=d_brick_atoms.row_bytes();
|
||||
return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
|
||||
delxinv,delyinv,delzinv);
|
||||
}
|
||||
|
||||
return h_error_flag[0];
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Interpolation of grid data onto the atoms (interp kernel)
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
void PPPMGPUMemoryT::interp(const grdtyp qqrd2e_scale) {
|
||||
time_in.start();
|
||||
ucl_copy(d_brick,h_vd_brick,true);
|
||||
time_in.stop();
|
||||
|
||||
time_interp.start();
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
int BX=this->block_size();
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
|
||||
|
||||
int ainum=this->ans->inum();
|
||||
|
||||
k_interp.set_size(GX,BX);
|
||||
k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
|
||||
&d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
|
||||
&_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
|
||||
&_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
|
||||
time_interp.stop();
|
||||
|
||||
ans->copy_answers(false,false,false,false);
|
||||
device->add_ans_object(ans);
|
||||
}
|
||||
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
double PPPMGPUMemoryT::host_memory_usage() const {
|
||||
return device->atom.host_memory_usage()+
|
||||
sizeof(PPPMGPUMemory<numtyp,acctyp,grdtyp,grdtyp4>);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
|
||||
if (_compiled)
|
||||
return;
|
||||
|
||||
if (sizeof(grdtyp)==sizeof(double) && ucl_device->double_precision()==false)
|
||||
return;
|
||||
|
||||
std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
|
||||
std::string(OCL_PRECISION_COMPILE);
|
||||
#ifdef USE_OPENCL
|
||||
flags+=std::string(" -D grdtyp=")+ucl_template_name<grdtyp>()+" -D grdtyp4="+
|
||||
ucl_template_name<grdtyp>()+"4";
|
||||
#endif
|
||||
|
||||
pppm_program=new UCL_Program(dev);
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
|
||||
#else
|
||||
if (sizeof(grdtyp)==sizeof(float))
|
||||
pppm_program->load_string(pppm_f_gpu_kernel,flags.c_str());
|
||||
else
|
||||
pppm_program->load_string(pppm_d_gpu_kernel,flags.c_str());
|
||||
#endif
|
||||
|
||||
k_particle_map.set_function(*pppm_program,"particle_map");
|
||||
k_make_rho.set_function(*pppm_program,"make_rho");
|
||||
k_interp.set_function(*pppm_program,"interp");
|
||||
pos_tex.get_texture(*pppm_program,"pos_tex");
|
||||
q_tex.get_texture(*pppm_program,"q_tex");
|
||||
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
// Explicit instantiation for single-precision grid data
template class PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
|
||||
// Explicit instantiation for double-precision grid data
template class PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
|
||||
|
|
@ -0,0 +1,195 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#ifndef PPPM_GPU_MEMORY_H
|
||||
#define PPPM_GPU_MEMORY_H
|
||||
|
||||
#include "mpi.h"
|
||||
#include "pair_gpu_device.h"
|
||||
|
||||
#ifdef USE_OPENCL
|
||||
#include "geryon/ocl_texture.h"
|
||||
#else
|
||||
#include "geryon/nvd_texture.h"
|
||||
#endif
|
||||
|
||||
template <class numtyp, class acctyp> class PairGPUDevice;
|
||||
|
||||
template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
|
||||
class PPPMGPUMemory {
|
||||
public:
|
||||
PPPMGPUMemory();
|
||||
virtual ~PPPMGPUMemory();
|
||||
|
||||
/// Clear any previous data and set up for a new LAMMPS run
|
||||
/** Success will be:
|
||||
* - 0 if successfull
|
||||
* - -1 if fix gpu not found
|
||||
* - -2 if GPU could not be found
|
||||
* - -3 if there is an out of memory error
|
||||
* - -4 if the GPU library was not compiled for GPU
|
||||
* - -5 Double precision is not supported on card **/
|
||||
grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
|
||||
const int nxlo_out, const int nylo_out, const int nzlo_out,
|
||||
const int nxhi_out, const int nyhi_out, const int nzhi_out,
|
||||
double **rho_coeff, grdtyp **vd_brick,
|
||||
const double slab_volfactor, const int nx_pppm,
|
||||
const int ny_pppm, const int nz_pppm, int &success);
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
inline void resize_atom(const int inum, const int nall, bool &success) {
|
||||
if (atom->resize(nall, success)) {
|
||||
pos_tex.bind_float(atom->dev_x,4);
|
||||
q_tex.bind_float(atom->dev_q,1);
|
||||
}
|
||||
ans->resize(inum,success);
|
||||
}
|
||||
|
||||
/// Check if there is enough storage for local atoms and realloc if not
|
||||
inline void resize_local(const int inum, bool &success) {
|
||||
}
|
||||
|
||||
/// Clear all host and device data
|
||||
/** \note This is called at the beginning of the init() routine **/
|
||||
void clear(const double cpu_time);
|
||||
|
||||
/// Returns memory usage on device per atom
|
||||
int bytes_per_atom() const;
|
||||
|
||||
/// Total host memory used by library for pair style
|
||||
double host_memory_usage() const;
|
||||
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
ans->acc_timers();
|
||||
time_in.add_to_total();
|
||||
time_out.add_to_total();
|
||||
time_map.add_to_total();
|
||||
time_rho.add_to_total();
|
||||
time_interp.add_to_total();
|
||||
}
|
||||
}
|
||||
|
||||
/// Zero timers
|
||||
inline void zero_timers() {
|
||||
atom->zero_timers();
|
||||
ans->zero_timers();
|
||||
time_in.zero();
|
||||
time_out.zero();
|
||||
time_map.zero();
|
||||
time_rho.zero();
|
||||
time_interp.zero();
|
||||
}
|
||||
|
||||
/// Precomputations for charge assignment that can be done asynchronously
|
||||
inline void precompute(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *charge, double *boxlo, double *prd) {
|
||||
double delxinv=_nx_pppm/prd[0];
|
||||
double delyinv=_ny_pppm/prd[1];
|
||||
double delzinv=_nz_pppm/(prd[2]*_slab_volfactor);
|
||||
_precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv,
|
||||
delyinv,delzinv);
|
||||
}
|
||||
|
||||
/// Returns non-zero if out of bounds atoms
|
||||
int spread(const int ago, const int nlocal, const int nall, double **host_x,
|
||||
int *host_type, bool &success, double *charge, double *boxlo,
|
||||
const double delxinv, const double delyinv, const double delzinv);
|
||||
|
||||
void interp(const grdtyp qqrd2e_scale);
|
||||
|
||||
// -------------------------- DEVICE DATA -------------------------
|
||||
|
||||
/// Device Properties and Atom and Neighbor storage
|
||||
PairGPUDevice<numtyp,acctyp> *device;
|
||||
|
||||
/// Geryon device
|
||||
UCL_Device *ucl_device;
|
||||
|
||||
/// Device Timers
|
||||
UCL_Timer time_in, time_out, time_map, time_rho, time_interp;
|
||||
|
||||
/// LAMMPS pointer for screen output
|
||||
FILE *screen;
|
||||
|
||||
// --------------------------- ATOM DATA --------------------------
|
||||
|
||||
/// Atom Data
|
||||
PairGPUAtom<numtyp,acctyp> *atom;
|
||||
|
||||
|
||||
// --------------------------- GRID DATA --------------------------
|
||||
|
||||
UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
|
||||
UCL_D_Vec<grdtyp> d_brick;
|
||||
|
||||
// Count of number of atoms assigned to each grid point
|
||||
UCL_D_Vec<int> d_brick_counts;
|
||||
// Atoms assigned to each grid point
|
||||
UCL_D_Vec<grdtyp4> d_brick_atoms;
|
||||
|
||||
// Error checking for out of bounds atoms
|
||||
UCL_D_Vec<int> d_error_flag;
|
||||
UCL_H_Vec<int> h_error_flag;
|
||||
|
||||
// Number of grid points in brick (including ghost)
|
||||
int _npts_x, _npts_y, _npts_z, _npts_yx;
|
||||
|
||||
// Number of local grid points in brick
|
||||
int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
|
||||
|
||||
// -------------------------- SPLINE DATA -------------------------
|
||||
UCL_D_Vec<grdtyp> d_rho_coeff;
|
||||
int _order, _nlower, _nupper, _order_m_1, _order2;
|
||||
int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
|
||||
|
||||
// ------------------------ FORCE/ENERGY DATA -----------------------
|
||||
|
||||
PairGPUAns<numtyp,acctyp> *ans;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pppm_program;
|
||||
UCL_Kernel k_particle_map, k_make_rho, k_interp;
|
||||
inline int block_size() { return _block_size; }
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
UCL_Texture q_tex;
|
||||
|
||||
protected:
|
||||
bool _allocated, _compiled, _precompute_done;
|
||||
int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _cpu_idle_time;
|
||||
|
||||
grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv;
|
||||
|
||||
double _slab_volfactor;
|
||||
int _nx_pppm, _ny_pppm, _nz_pppm;
|
||||
|
||||
void compile_kernels(UCL_Device &dev);
|
||||
void _precompute(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *charge, double *boxlo, const double delxinv,
|
||||
const double delyinv, const double delzinv);
|
||||
};
|
||||
|
||||
#endif
|
||||
|
|
@ -0,0 +1,164 @@
|
|||
/* ----------------------------------------------------------------------
|
||||
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
|
||||
http://lammps.sandia.gov, Sandia National Laboratories
|
||||
Steve Plimpton, sjplimp@sandia.gov
|
||||
|
||||
Copyright (2003) Sandia Corporation. Under the terms of Contract
|
||||
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
|
||||
certain rights in this software. This software is distributed under
|
||||
the GNU General Public License.
|
||||
|
||||
See the README file in the top-level LAMMPS directory.
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
/* ----------------------------------------------------------------------
|
||||
Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <math.h>
|
||||
|
||||
#include "pppm_gpu_memory.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
// File-local instance with float grid data; used by the pppm_gpu_*_f functions
static PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF;
|
||||
// File-local instance with double grid data; used by the pppm_gpu_*_d functions
static PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Allocate memory on host and device and copy constants to device
|
||||
// ---------------------------------------------------------------------------
|
||||
// Shared initialization driver for both precisions.
// World rank 0 initializes first; after a world barrier the remaining
// processes sharing a GPU initialize one rank at a time, separated by GPU
// barriers. Progress messages go to `screen` on replica process 0 only.
// Returns whatever pppm.init() returned on this process (NULL if it was
// never called); `success` carries the init() status code.
template <class grdtyp, class memtyp>
grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                       FILE *screen, const int order, const int nxlo_out,
                       const int nylo_out, const int nzlo_out,
                       const int nxhi_out, const int nyhi_out,
                       const int nzhi_out, double **rho_coeff,
                       grdtyp **vd_brick, const double slab_volfactor,
                       const int nx_pppm, const int ny_pppm, const int nz_pppm,
                       int &success) {
  pppm.clear(0.0);

  const int first_gpu=pppm.device->first_device();
  const int last_gpu=pppm.device->last_device();
  const int world_me=pppm.device->world_me();
  const int gpu_rank=pppm.device->gpu_rank();
  const int procs_per_gpu=pppm.device->procs_per_gpu();

  pppm.device->init_message(screen,"pppm",first_gpu,last_gpu);

  // Only replica process 0 prints, and only when a screen is attached
  const bool message=(pppm.device->replica_me()==0 && screen);

  if (message) {
    fprintf(screen,"Initializing GPU and compiling on process 0...");
    fflush(screen);
  }

  success=0;
  grdtyp * host_brick=NULL;
  if (world_me==0) {
    host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
                         nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
                         slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
  }

  pppm.device->world_barrier();
  if (message)
    fprintf(screen,"Done.\n");

  for (int rank=0; rank<procs_per_gpu; rank++) {
    if (message) {
      if (first_gpu==last_gpu)
        fprintf(screen,"Initializing GPU %d on core %d...",first_gpu,rank);
      else
        fprintf(screen,"Initializing GPUs %d-%d on core %d...",first_gpu,
                last_gpu,rank);
      fflush(screen);
    }

    // World rank 0 was already initialized before the loop
    if (world_me!=0 && gpu_rank==rank) {
      host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
                           nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
                           vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
                           success);
    }

    pppm.device->gpu_barrier();
    if (message)
      fprintf(screen,"Done.\n");
  }
  if (message)
    fprintf(screen,"\n");
  return host_brick;
}
|
||||
|
||||
float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
|
||||
const int order, const int nxlo_out,
|
||||
const int nylo_out, const int nzlo_out,
|
||||
const int nxhi_out, const int nyhi_out,
|
||||
const int nzhi_out, double **rho_coeff,
|
||||
float **vd_brick, const double slab_volfactor,
|
||||
const int nx_pppm, const int ny_pppm, const int nz_pppm,
|
||||
int &success) {
|
||||
float *b=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out,
|
||||
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
|
||||
slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
|
||||
PPPMF.device->set_single_precompute(&PPPMF);
|
||||
return b;
|
||||
}
|
||||
|
||||
// Float-grid entry point: forward to PPPMF.clear().
void pppm_gpu_clear_f(const double cpu_time)
{
  PPPMF.clear(cpu_time);
}
|
||||
|
||||
int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *host_q, double *boxlo, const double delxinv,
|
||||
const double delyinv, const double delzinv) {
|
||||
return PPPMF.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
|
||||
delxinv,delyinv,delzinv);
|
||||
}
|
||||
|
||||
void pppm_gpu_interp_f(const float qqrd2e_scale) {
|
||||
return PPPMF.interp(qqrd2e_scale);
|
||||
}
|
||||
|
||||
double pppm_gpu_bytes_f() {
|
||||
return PPPMF.host_memory_usage();
|
||||
}
|
||||
|
||||
double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
|
||||
const int order, const int nxlo_out,
|
||||
const int nylo_out, const int nzlo_out,
|
||||
const int nxhi_out, const int nyhi_out,
|
||||
const int nzhi_out, double **rho_coeff,
|
||||
double **vd_brick, const double slab_volfactor,
|
||||
const int nx_pppm, const int ny_pppm,
|
||||
const int nz_pppm, int &success) {
|
||||
double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
|
||||
nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
|
||||
vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
|
||||
success);
|
||||
PPPMF.device->set_double_precompute(&PPPMD);
|
||||
return b;
|
||||
}
|
||||
|
||||
// Double-grid entry point: forward to PPPMD.clear().
void pppm_gpu_clear_d(const double cpu_time)
{
  PPPMD.clear(cpu_time);
}
|
||||
|
||||
int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
|
||||
double **host_x, int *host_type, bool &success,
|
||||
double *host_q, double *boxlo, const double delxinv,
|
||||
const double delyinv, const double delzinv) {
|
||||
return PPPMD.spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,
|
||||
delxinv,delyinv,delzinv);
|
||||
}
|
||||
|
||||
void pppm_gpu_interp_d(const double qqrd2e_scale) {
|
||||
return PPPMD.interp(qqrd2e_scale);
|
||||
}
|
||||
|
||||
double pppm_gpu_bytes_d() {
|
||||
return PPPMD.host_memory_usage();
|
||||
}
|
||||
|
Loading…
Reference in New Issue