git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6069 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-05-04 16:24:42 +00:00 · 2011-05-04 16:24:42 +00:00 · bf6bb59386
parent b8f1ff821f
commit bf6bb59386
29 changed files with 12387 additions and 0 deletions
--- a/lib/gpu/cmm_cut_gpu_ptx.h
+++ b/lib/gpu/cmm_cut_gpu_ptx.h
@ -0,0 +1,627 @@
+const char * cmm_cut_gpu_kernel =  /* PTX text for entries kernel_pair and kernel_pair_fast, joined by adjacent string-literal concatenation */
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 pos_tex;\n"  /* texture reference read by every tex.1d fetch in both entries */
+"	.entry kernel_pair (\n"  /* entry 1: coefficient rows are read from global memory via the lj1 / lj3 pointers */
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n"
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch)\n"
+"	{\n"
+"	.reg .u32 %r<33>;\n"
+"	.reg .u64 %rd<36>;\n"
+"	.reg .f32 %f<95>;\n"
+"	.reg .pred %p<10>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj84[16];\n"  /* 16 bytes: room for the four f32 values copied from sp_lj_in */
+"	.loc	14	87	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	91	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"  /* the next eight instructions copy sp_lj_in[0..3] to shared memory, unguarded by tid; this entry contains no bar.sync */
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+0], %f1;\n"
+"	.loc	14	92	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+4], %f2;\n"
+"	.loc	14	93	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+8], %f3;\n"
+"	.loc	14	94	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+12], %f4;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"  /* %r5 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_9474;\n"  /* go straight to exit when inum <= %r5 */
+"	.loc	14	105	0\n"
+"	mov.f32 	%f5, 0f00000000;     	\n"  /* zero the six accumulators that are stored later when vflag > 0 */
+"	mov.f32 	%f6, %f5;\n"
+"	mov.f32 	%f7, 0f00000000;     	\n"
+"	mov.f32 	%f8, %f7;\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"  /* NOTE(review): %f16 is not read again in this entry; the sixth accumulator used below is %f15 */
+"	.loc	14	108	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"  /* %r7 = 32-bit word at dev_nbor + 4 * %r5 */
+"	.loc	14	110	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"  /* %rd7 = 4 * nbor_pitch bytes, reused as the loop stride */
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"  /* %r9 = word located %rd7 bytes after the previous one */
+"	.loc	14	111	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"
+"	mov.s64 	%rd10, %rd9;\n"  /* %rd10 = loop cursor, starting a further %rd7 bytes on */
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n"  /* fetch the four-component texel at index %r7 */
+"	.loc	14	114	0\n"
+"	mov.f32 	%f21, %f17;\n"
+"	mov.f32 	%f22, %f18;\n"
+"	mov.f32 	%f23, %f19;\n"
+"	mov.f32 	%f24, %f20;\n"
+"	mul24.lo.s32 	%r14, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r14;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"  /* %rd13 = loop end address = %rd9 + 4 * mul24(%r9, nbor_pitch) */
+"	ld.param.s32 	%r15, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r16, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f25, 0f00000000;    	\n"  /* %f25..%f27 are the three values stored to ans; %f28 is stored to engv when eflag > 0 */
+"	mov.f32 	%f26, 0f00000000;    	\n"
+"	mov.f32 	%f27, 0f00000000;    	\n"
+"	mov.f32 	%f28, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_14594;\n"  /* empty address range: skip the loop */
+"	mov.s32 	%r17, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r17;\n"  /* %p3 = eflag > 0 */
+"	mov.s32 	%r18, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r18;\n"  /* %p4 = vflag > 0 */
+"	cvt.rzi.s32.f32 	%r19, %f24;\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r21, %r20, %r19;\n"  /* %r21 = lj_types * trunc(4th texel component), loop-invariant part of the row index */
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_lj1];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj84;\n"
+"$Lt_0_10498:\n"  /* loop head: one iteration per 32-bit word read through the cursor %rd10 */
+"	.loc	14	120	0\n"
+"	ld.global.s32 	%r22, [%rd10+0];\n"
+"	.loc	14	121	0\n"
+"	shr.s32 	%r23, %r22, 30;\n"
+"	cvt.s64.s32 	%rd16, %r23;\n"
+"	and.b64 	%rd17, %rd16, 3;\n"  /* the top two bits of the word select one of the four shared sp_lj values */
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	and.b32 	%r24, %r22, 1073741823;\n"  /* the low 30 bits (mask 0x3FFFFFFF) are the pos_tex index */
+"	mov.s32 	%r25, 0;\n"
+"	mov.s32 	%r26, 0;\n"
+"	mov.s32 	%r27, 0;\n"
+"	tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
+"	.loc	14	124	0\n"
+"	mov.f32 	%f34, %f30;\n"
+"	mov.f32 	%f35, %f31;\n"
+"	mov.f32 	%f36, %f32;\n"
+"	mov.f32 	%f37, %f33;\n"
+"	cvt.rzi.s32.f32 	%r28, %f37;\n"
+"	sub.f32 	%f38, %f22, %f35;\n"  /* %f38, %f39, %f40 = differences of the 2nd, 1st and 3rd texel components */
+"	sub.f32 	%f39, %f21, %f34;\n"
+"	sub.f32 	%f40, %f23, %f36;\n"
+"	mul.f32 	%f41, %f38, %f38;\n"
+"	mad.f32 	%f42, %f39, %f39, %f41;\n"
+"	mad.f32 	%f43, %f40, %f40, %f42;\n"  /* %f43 = sum of the three squared differences */
+"	add.s32 	%r29, %r28, %r21;\n"
+"	cvt.u64.s32 	%rd20, %r29;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd21, %rd14;\n"  /* %rd22 = lj1 + 16 * (%r28 + %r21): address of a 16-byte row */
+"	ld.global.f32 	%f44, [%rd22+0];\n"
+"	setp.gt.f32 	%p5, %f44, %f43;\n"
+"	@!%p5 bra 	$Lt_0_12802;\n"  /* skip this word unless row[0] > %f43 */
+"	rcp.approx.f32 	%f45, %f43;\n"  /* %f45 = approximate 1 / %f43 */
+"	ld.global.f32 	%f46, [%rd22+4];\n"
+"	mov.f32 	%f47, 0f40000000;    	\n"
+"	setp.eq.f32 	%p6, %f46, %f47;\n"  /* three-way branch on row[1]: first test row[1] == 2.0f */
+"	@!%p6 bra 	$Lt_0_11522;\n"
+"	.loc	14	139	0\n"
+"	mul.f32 	%f48, %f45, %f45;\n"
+"	mov.f32 	%f49, %f48;\n"  /* %f49 = %f45^2 */
+"	.loc	14	140	0\n"
+"	mul.f32 	%f50, %f48, %f48;\n"  /* %f50 = %f45^4 */
+"	bra.uni 	$Lt_0_11778;\n"
+"$Lt_0_11522:\n"
+"	mov.f32 	%f51, 0f3f800000;    	\n"
+"	setp.eq.f32 	%p7, %f46, %f51;\n"  /* second test: row[1] == 1.0f */
+"	@!%p7 bra 	$Lt_0_12034;\n"
+"	.loc	14	142	0\n"
+"	sqrt.approx.f32 	%f52, %f45;\n"
+"	mul.f32 	%f53, %f45, %f52;\n"
+"	mov.f32 	%f50, %f53;\n"  /* %f50 = %f45 * sqrt(%f45) */
+"	.loc	14	143	0\n"
+"	mul.f32 	%f49, %f53, %f53;\n"  /* %f49 = %f50^2 */
+"	bra.uni 	$Lt_0_11778;\n"
+"$Lt_0_12034:\n"
+"	.loc	14	145	0\n"
+"	mul.f32 	%f54, %f45, %f45;\n"
+"	mul.f32 	%f55, %f45, %f54;\n"
+"	mov.f32 	%f49, %f55;\n"  /* fall-through case: %f49 = %f50 = %f45^3 */
+"	.loc	14	146	0\n"
+"	mov.f32 	%f50, %f55;\n"
+"$Lt_0_11778:\n"
+"$Lt_0_11266:\n"
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	148	0\n"
+"	mul.f32 	%f56, %f45, %f29;\n"
+"	mul.f32 	%f57, %f49, %f56;\n"
+"	ld.global.v2.f32 	{%f58,%f59}, [%rd22+8];\n"
+"	mul.f32 	%f60, %f58, %f50;\n"
+"	sub.f32 	%f61, %f60, %f59;\n"
+"	mul.f32 	%f62, %f57, %f61;\n"  /* %f62 = %f45 * %f29 * %f49 * (row[2] * %f50 - row[3]) */
+"	.loc	14	150	0\n"
+"	mad.f32 	%f27, %f39, %f62, %f27;\n"  /* accumulate difference * %f62 into the three values later stored to ans */
+"	.loc	14	151	0\n"
+"	mad.f32 	%f26, %f38, %f62, %f26;\n"
+"	.loc	14	152	0\n"
+"	mad.f32 	%f25, %f40, %f62, %f25;\n"
+"	@!%p3 bra 	$Lt_0_12290;\n"  /* the %f28 update below runs only when eflag > 0 */
+"	.loc	14	154	0\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd24, %rd23, %rd21;\n"
+"	ld.global.v4.f32 	{%f63,%f64,%f65,_}, [%rd24+0];\n"  /* lj3 row at the same 16-byte offset; its 4th component is discarded */
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	154	0\n"
+"	mul.f32 	%f66, %f29, %f49;\n"
+"	mul.f32 	%f67, %f63, %f50;\n"
+"	sub.f32 	%f68, %f67, %f64;\n"
+"	mul.f32 	%f69, %f66, %f68;\n"
+"	sub.f32 	%f70, %f69, %f65;\n"
+"	add.f32 	%f28, %f28, %f70;\n"  /* %f28 += %f29 * %f49 * (lj3row[0] * %f50 - lj3row[1]) - lj3row[2] */
+"$Lt_0_12290:\n"
+"	@!%p4 bra 	$Lt_0_12802;\n"  /* the six accumulator updates below run only when vflag > 0 */
+"	.loc	14	157	0\n"
+"	mov.f32 	%f71, %f6;\n"
+"	mul.f32 	%f72, %f39, %f39;\n"
+"	mad.f32 	%f73, %f62, %f72, %f71;\n"
+"	mov.f32 	%f6, %f73;\n"  /* %f6 += %f62 * %f39 * %f39 */
+"	.loc	14	158	0\n"
+"	mov.f32 	%f74, %f8;\n"
+"	mad.f32 	%f75, %f62, %f41, %f74;\n"
+"	mov.f32 	%f8, %f75;\n"  /* %f8 += %f62 * %f38 * %f38 (reuses %f41) */
+"	.loc	14	159	0\n"
+"	mov.f32 	%f76, %f10;\n"
+"	mul.f32 	%f77, %f40, %f40;\n"
+"	mad.f32 	%f78, %f62, %f77, %f76;\n"
+"	mov.f32 	%f10, %f78;\n"  /* %f10 += %f62 * %f40 * %f40 */
+"	.loc	14	160	0\n"
+"	mov.f32 	%f79, %f12;\n"
+"	mul.f32 	%f80, %f38, %f39;\n"
+"	mad.f32 	%f81, %f62, %f80, %f79;\n"
+"	mov.f32 	%f12, %f81;\n"  /* %f12 += %f62 * %f38 * %f39 */
+"	.loc	14	161	0\n"
+"	mov.f32 	%f82, %f14;\n"
+"	mul.f32 	%f83, %f39, %f40;\n"
+"	mad.f32 	%f84, %f62, %f83, %f82;\n"
+"	mov.f32 	%f14, %f84;\n"  /* %f14 += %f62 * %f39 * %f40 */
+"	.loc	14	162	0\n"
+"	mul.f32 	%f85, %f38, %f40;\n"
+"	mad.f32 	%f15, %f62, %f85, %f15;\n"  /* %f15 += %f62 * %f38 * %f40 (updated in place) */
+"	mov.f32 	%f86, %f15;\n"
+"$Lt_0_12802:\n"
+"$Lt_0_10754:\n"
+"	.loc	14	118	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"  /* advance the cursor by 4 * nbor_pitch bytes */
+"	setp.gt.u64 	%p8, %rd13, %rd10;\n"
+"	@%p8 bra 	$Lt_0_10498;\n"  /* repeat while the cursor is below %rd13 */
+"	bra.uni 	$Lt_0_9986;\n"
+"$Lt_0_14594:\n"  /* loop-skipped path: %p3 and %p4 must still be set for the stores below */
+"	mov.s32 	%r30, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r30;\n"
+"	mov.s32 	%r31, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r31;\n"
+"$Lt_0_9986:\n"
+"	.loc	14	169	0\n"
+"	ld.param.u64 	%rd25, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd26, %rd25, %rd3;\n"  /* %rd26 = engv + 4 * %r5: output pointer */
+"	@!%p3 bra 	$Lt_0_13570;\n"
+"	.loc	14	171	0\n"
+"	st.global.f32 	[%rd26+0], %f28;\n"
+"	.loc	14	172	0\n"
+"	cvt.u64.s32 	%rd27, %r6;\n"
+"	mul.lo.u64 	%rd28, %rd27, 4;\n"
+"	add.u64 	%rd26, %rd26, %rd28;\n"  /* step the output pointer by 4 * inum bytes */
+"$Lt_0_13570:\n"
+"	@!%p4 bra 	$Lt_0_14082;\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f87, %f6;\n"  /* store the six accumulators, each 4 * inum bytes after the previous one */
+"	st.global.f32 	[%rd26+0], %f87;\n"
+"	.loc	14	177	0\n"
+"	cvt.u64.s32 	%rd29, %r6;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f88, %f8;\n"
+"	st.global.f32 	[%rd26+0], %f88;\n"
+"	.loc	14	177	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f89, %f10;\n"
+"	st.global.f32 	[%rd26+0], %f89;\n"
+"	.loc	14	177	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f90, %f12;\n"
+"	st.global.f32 	[%rd26+0], %f90;\n"
+"	.loc	14	177	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f91, %f14;\n"
+"	st.global.f32 	[%rd26+0], %f91;\n"
+"	add.u64 	%rd31, %rd30, %rd26;\n"
+"	st.global.f32 	[%rd31+0], %f15;\n"
+"$Lt_0_14082:\n"
+"	.loc	14	180	0\n"
+"	ld.param.u64 	%rd32, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd33, %rd2, 16;\n"
+"	add.u64 	%rd34, %rd32, %rd33;\n"
+"	mov.f32 	%f92, %f93;\n"  /* NOTE(review): %f93 is never written in this entry, so the 4th stored component is undefined */
+"	st.global.v4.f32 	[%rd34+0], {%f27,%f26,%f25,%f92};\n"  /* 16-byte store to ans + 16 * %r5 */
+"$Lt_0_9474:\n"
+"	.loc	14	182	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"  /* entry 2: same loop as kernel_pair, but coefficient rows are first copied into shared memory; takes no lj_types parameter */
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n"
+"	{\n"
+"	.reg .u32 %r<36>;\n"
+"	.reg .u64 %rd<48>;\n"
+"	.reg .f32 %f<102>;\n"
+"	.reg .pred %p<13>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj180[16];\n"
+"	.shared .align 16 .b8 __cuda_lj1208[1024];\n"  /* 1024 bytes = 64 rows of 16 bytes, filled from lj1_in */
+"	.shared .align 16 .b8 __cuda_lj31232[1024];\n"  /* same size, filled from lj3_in only when eflag > 0 */
+"	.loc	14	189	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 3;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_11778;\n"  /* only threads with tid.x <= 3 copy one sp_lj_in value each */
+"	.loc	14	196	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"
+"$Lt_1_11778:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_12290;\n"  /* only threads with tid.x <= 63 copy one 16-byte table row each */
+"	.loc	14	198	0\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	cvt.u64.s32 	%rd8, %r1;\n"
+"	mul.lo.u64 	%rd9, %rd8, 16;\n"
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd11, %rd10, %rd9;\n"
+"	add.u64 	%rd12, %rd9, %rd7;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd11+0];\n"
+"	st.shared.f32 	[%rd12+0], %f2;\n"
+"	st.shared.f32 	[%rd12+4], %f3;\n"
+"	st.shared.f32 	[%rd12+8], %f4;\n"
+"	st.shared.f32 	[%rd12+12], %f5;\n"
+"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	mov.u32 	%r5, 0;\n"
+"	setp.le.s32 	%p3, %r4, %r5;\n"
+"	@%p3 bra 	$Lt_1_12802;\n"  /* the lj3_in row is copied only when eflag > 0 */
+"	.loc	14	200	0\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd15, %rd14, %rd9;\n"
+"	add.u64 	%rd16, %rd9, %rd13;\n"
+"	ld.global.v4.f32 	{%f6,%f7,%f8,%f9}, [%rd15+0];\n"
+"	st.shared.f32 	[%rd16+0], %f6;\n"
+"	st.shared.f32 	[%rd16+4], %f7;\n"
+"	st.shared.f32 	[%rd16+8], %f8;\n"
+"	st.shared.f32 	[%rd16+12], %f9;\n"
+"$Lt_1_12802:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"$Lt_1_12290:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	.loc	14	203	0\n"
+"	bar.sync 	0;\n"  /* barrier between the shared-memory copies above and the reads in the loop below */
+"	cvt.s32.u16 	%r6, %ctaid.x;\n"
+"	cvt.s32.u16 	%r7, %ntid.x;\n"
+"	mul24.lo.s32 	%r8, %r6, %r7;\n"
+"	add.s32 	%r9, %r8, %r1;\n"  /* %r9 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p4, %r9, %r10;\n"
+"	@%p4 bra 	$Lt_1_13314;\n"  /* go straight to exit when %r9 >= inum (taken only after the barrier) */
+"	.loc	14	214	0\n"
+"	mov.f32 	%f10, 0f00000000;    	\n"  /* zero the six accumulators that are stored later when vflag > 0 */
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	mov.f32 	%f18, 0f00000000;    	\n"
+"	mov.f32 	%f19, %f18;\n"
+"	mov.f32 	%f20, 0f00000000;    	\n"
+"	mov.f32 	%f21, %f20;\n"  /* NOTE(review): %f21 is not read again in this entry; the sixth accumulator used below is %f20 */
+"	.loc	14	217	0\n"
+"	cvt.u64.s32 	%rd17, %r9;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd20, %rd19, %rd18;\n"
+"	ld.global.s32 	%r11, [%rd20+0];\n"  /* %r11 = 32-bit word at dev_nbor + 4 * %r9 */
+"	.loc	14	219	0\n"
+"	ld.param.s32 	%r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd21, %r12;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"  /* %rd22 = 4 * nbor_pitch bytes, reused as the loop stride */
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r13, [%rd23+0];\n"
+"	.loc	14	220	0\n"
+"	add.u64 	%rd24, %rd23, %rd22;\n"
+"	mov.s64 	%rd25, %rd24;\n"  /* %rd25 = loop cursor */
+"	mov.s32 	%r14, %r11;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	223	0\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mul24.lo.s32 	%r18, %r13, %r12;\n"
+"	cvt.s64.s32 	%rd26, %r18;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd24, %rd27;\n"  /* %rd28 = loop end address = %rd24 + 4 * mul24(%r13, nbor_pitch) */
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p5, %rd24, %rd28;\n"
+"	mov.f32 	%f30, 0f00000000;    	\n"
+"	mov.f32 	%f31, 0f00000000;    	\n"
+"	mov.f32 	%f32, 0f00000000;    	\n"
+"	mov.f32 	%f33, 0f00000000;    	\n"
+"	@%p5 bra 	$Lt_1_18434;\n"  /* empty address range: skip the loop */
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r21;\n"  /* %p6 = eflag > 0 */
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r22;\n"  /* %p7 = vflag > 0 */
+"	cvt.rzi.s32.f32 	%r23, %f29;\n"
+"	mov.s32 	%r24, 8;\n"
+"	mul24.lo.s32 	%r25, %r24, %r23;\n"  /* row stride is the literal 8 here: %r25 = 8 * trunc(4th texel component) */
+"	cvt.rn.f32.s32 	%f34, %r25;\n"  /* kept as a float; it is added to the other texel's 4th component before truncation */
+"$Lt_1_14338:\n"  /* loop head: one iteration per 32-bit word read through the cursor %rd25 */
+"	.loc	14	230	0\n"
+"	ld.global.s32 	%r26, [%rd25+0];\n"
+"	.loc	14	231	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"
+"	cvt.s64.s32 	%rd29, %r27;\n"
+"	and.b64 	%rd30, %rd29, 3;\n"  /* the top two bits of the word select one of the four shared sp_lj values */
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd32, %rd1, %rd31;\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	and.b32 	%r28, %r26, 1073741823;\n"  /* the low 30 bits (mask 0x3FFFFFFF) are the pos_tex index */
+"	mov.s32 	%r29, 0;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
+"	.loc	14	234	0\n"
+"	mov.f32 	%f40, %f36;\n"
+"	mov.f32 	%f41, %f37;\n"
+"	mov.f32 	%f42, %f38;\n"
+"	mov.f32 	%f43, %f39;\n"
+"	sub.f32 	%f44, %f27, %f41;\n"  /* %f44, %f45, %f46 = differences of the 2nd, 1st and 3rd texel components */
+"	sub.f32 	%f45, %f26, %f40;\n"
+"	sub.f32 	%f46, %f28, %f42;\n"
+"	mul.f32 	%f47, %f44, %f44;\n"
+"	mad.f32 	%f48, %f45, %f45, %f47;\n"
+"	mad.f32 	%f49, %f46, %f46, %f48;\n"  /* %f49 = sum of the three squared differences */
+"	add.f32 	%f50, %f34, %f43;\n"
+"	cvt.rzi.s32.f32 	%r32, %f50;\n"  /* shared-table row index = trunc(%f34 + 4th component of this texel) */
+"	cvt.u64.s32 	%rd33, %r32;\n"
+"	mul.lo.u64 	%rd34, %rd33, 16;\n"
+"	add.u64 	%rd35, %rd34, %rd7;\n"  /* %rd35 = address of that row in the shared lj1 copy */
+"	ld.shared.f32 	%f51, [%rd35+0];\n"
+"	setp.gt.f32 	%p8, %f51, %f49;\n"
+"	@!%p8 bra 	$Lt_1_16642;\n"  /* skip this word unless row[0] > %f49 */
+"	rcp.approx.f32 	%f52, %f49;\n"  /* %f52 = approximate 1 / %f49 */
+"	ld.shared.f32 	%f53, [%rd35+4];\n"
+"	mov.f32 	%f54, 0f40000000;    	\n"
+"	setp.eq.f32 	%p9, %f53, %f54;\n"  /* three-way branch on row[1]: first test row[1] == 2.0f */
+"	@!%p9 bra 	$Lt_1_15362;\n"
+"	.loc	14	248	0\n"
+"	mul.f32 	%f55, %f52, %f52;\n"
+"	mov.f32 	%f56, %f55;\n"  /* %f56 = %f52^2 */
+"	.loc	14	249	0\n"
+"	mul.f32 	%f57, %f55, %f55;\n"  /* %f57 = %f52^4 */
+"	bra.uni 	$Lt_1_15618;\n"
+"$Lt_1_15362:\n"
+"	mov.f32 	%f58, 0f3f800000;    	\n"
+"	.loc	14	234	0\n"
+"	ld.shared.f32 	%f53, [%rd35+4];\n"
+"	.loc	14	249	0\n"
+"	setp.eq.f32 	%p10, %f53, %f58;\n"  /* second test: row[1] == 1.0f */
+"	@!%p10 bra 	$Lt_1_15874;\n"
+"	.loc	14	251	0\n"
+"	sqrt.approx.f32 	%f59, %f52;\n"
+"	mul.f32 	%f60, %f52, %f59;\n"
+"	mov.f32 	%f57, %f60;\n"  /* %f57 = %f52 * sqrt(%f52) */
+"	.loc	14	252	0\n"
+"	mul.f32 	%f56, %f60, %f60;\n"  /* %f56 = %f57^2 */
+"	bra.uni 	$Lt_1_15618;\n"
+"$Lt_1_15874:\n"
+"	.loc	14	254	0\n"
+"	mul.f32 	%f61, %f52, %f52;\n"
+"	mul.f32 	%f62, %f52, %f61;\n"
+"	mov.f32 	%f56, %f62;\n"  /* fall-through case: %f56 = %f57 = %f52^3 */
+"	.loc	14	255	0\n"
+"	mov.f32 	%f57, %f62;\n"
+"$Lt_1_15618:\n"
+"$Lt_1_15106:\n"
+"	.loc	14	231	0\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	.loc	14	257	0\n"
+"	mul.f32 	%f63, %f52, %f35;\n"
+"	mul.f32 	%f64, %f56, %f63;\n"
+"	ld.shared.f32 	%f65, [%rd35+12];\n"
+"	ld.shared.f32 	%f66, [%rd35+8];\n"
+"	mul.f32 	%f67, %f66, %f57;\n"
+"	sub.f32 	%f68, %f67, %f65;\n"
+"	mul.f32 	%f69, %f64, %f68;\n"  /* %f69 = %f52 * %f35 * %f56 * (row[2] * %f57 - row[3]) */
+"	.loc	14	259	0\n"
+"	mad.f32 	%f32, %f45, %f69, %f32;\n"  /* accumulate difference * %f69 into the three values later stored to ans */
+"	.loc	14	260	0\n"
+"	mad.f32 	%f31, %f44, %f69, %f31;\n"
+"	.loc	14	261	0\n"
+"	mad.f32 	%f30, %f46, %f69, %f30;\n"
+"	@!%p6 bra 	$Lt_1_16130;\n"  /* the %f33 update below runs only when eflag > 0, the same condition under which the shared lj3 copy was filled */
+"	.loc	14	263	0\n"
+"	add.u64 	%rd36, %rd34, %rd13;\n"  /* %rd36 = address of the same row in the shared lj3 copy */
+"	ld.shared.f32 	%f70, [%rd36+8];\n"
+"	.loc	14	231	0\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	.loc	14	263	0\n"
+"	mul.f32 	%f71, %f35, %f56;\n"
+"	ld.shared.f32 	%f72, [%rd36+4];\n"
+"	ld.shared.f32 	%f73, [%rd36+0];\n"
+"	mul.f32 	%f74, %f73, %f57;\n"
+"	sub.f32 	%f75, %f74, %f72;\n"
+"	mul.f32 	%f76, %f71, %f75;\n"
+"	sub.f32 	%f77, %f76, %f70;\n"
+"	add.f32 	%f33, %f33, %f77;\n"  /* %f33 += %f35 * %f56 * (lj3row[0] * %f57 - lj3row[1]) - lj3row[2] */
+"$Lt_1_16130:\n"
+"	@!%p7 bra 	$Lt_1_16642;\n"  /* the six accumulator updates below run only when vflag > 0 */
+"	.loc	14	266	0\n"
+"	mov.f32 	%f78, %f11;\n"
+"	mul.f32 	%f79, %f45, %f45;\n"
+"	mad.f32 	%f80, %f69, %f79, %f78;\n"
+"	mov.f32 	%f11, %f80;\n"  /* %f11 += %f69 * %f45 * %f45 */
+"	.loc	14	267	0\n"
+"	mov.f32 	%f81, %f13;\n"
+"	mad.f32 	%f82, %f69, %f47, %f81;\n"
+"	mov.f32 	%f13, %f82;\n"  /* %f13 += %f69 * %f44 * %f44 (reuses %f47) */
+"	.loc	14	268	0\n"
+"	mov.f32 	%f83, %f15;\n"
+"	mul.f32 	%f84, %f46, %f46;\n"
+"	mad.f32 	%f85, %f69, %f84, %f83;\n"
+"	mov.f32 	%f15, %f85;\n"  /* %f15 += %f69 * %f46 * %f46 */
+"	.loc	14	269	0\n"
+"	mov.f32 	%f86, %f17;\n"
+"	mul.f32 	%f87, %f44, %f45;\n"
+"	mad.f32 	%f88, %f69, %f87, %f86;\n"
+"	mov.f32 	%f17, %f88;\n"  /* %f17 += %f69 * %f44 * %f45 */
+"	.loc	14	270	0\n"
+"	mov.f32 	%f89, %f19;\n"
+"	mul.f32 	%f90, %f45, %f46;\n"
+"	mad.f32 	%f91, %f69, %f90, %f89;\n"
+"	mov.f32 	%f19, %f91;\n"  /* %f19 += %f69 * %f45 * %f46 */
+"	.loc	14	271	0\n"
+"	mul.f32 	%f92, %f44, %f46;\n"
+"	mad.f32 	%f20, %f69, %f92, %f20;\n"  /* %f20 += %f69 * %f44 * %f46 (updated in place) */
+"	mov.f32 	%f93, %f20;\n"
+"$Lt_1_16642:\n"
+"$Lt_1_14594:\n"
+"	.loc	14	228	0\n"
+"	add.u64 	%rd25, %rd22, %rd25;\n"  /* advance the cursor by 4 * nbor_pitch bytes */
+"	setp.gt.u64 	%p11, %rd28, %rd25;\n"
+"	@%p11 bra 	$Lt_1_14338;\n"  /* repeat while the cursor is below %rd28 */
+"	bra.uni 	$Lt_1_13826;\n"
+"$Lt_1_18434:\n"  /* loop-skipped path: %p6 and %p7 must still be set for the stores below */
+"	mov.s32 	%r33, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r33;\n"
+"	mov.s32 	%r34, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r34;\n"
+"$Lt_1_13826:\n"
+"	.loc	14	278	0\n"
+"	ld.param.u64 	%rd37, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd38, %rd37, %rd18;\n"  /* %rd38 = engv + 4 * %r9: output pointer */
+"	@!%p6 bra 	$Lt_1_17410;\n"
+"	.loc	14	280	0\n"
+"	st.global.f32 	[%rd38+0], %f33;\n"
+"	.loc	14	281	0\n"
+"	cvt.u64.s32 	%rd39, %r10;\n"
+"	mul.lo.u64 	%rd40, %rd39, 4;\n"
+"	add.u64 	%rd38, %rd38, %rd40;\n"  /* step the output pointer by 4 * inum bytes */
+"$Lt_1_17410:\n"
+"	@!%p7 bra 	$Lt_1_17922;\n"
+"	.loc	14	285	0\n"
+"	mov.f32 	%f94, %f11;\n"  /* store the six accumulators, each 4 * inum bytes after the previous one */
+"	st.global.f32 	[%rd38+0], %f94;\n"
+"	.loc	14	286	0\n"
+"	cvt.u64.s32 	%rd41, %r10;\n"
+"	mul.lo.u64 	%rd42, %rd41, 4;\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	285	0\n"
+"	mov.f32 	%f95, %f13;\n"
+"	st.global.f32 	[%rd38+0], %f95;\n"
+"	.loc	14	286	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	285	0\n"
+"	mov.f32 	%f96, %f15;\n"
+"	st.global.f32 	[%rd38+0], %f96;\n"
+"	.loc	14	286	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	285	0\n"
+"	mov.f32 	%f97, %f17;\n"
+"	st.global.f32 	[%rd38+0], %f97;\n"
+"	.loc	14	286	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	285	0\n"
+"	mov.f32 	%f98, %f19;\n"
+"	st.global.f32 	[%rd38+0], %f98;\n"
+"	add.u64 	%rd43, %rd42, %rd38;\n"
+"	st.global.f32 	[%rd43+0], %f20;\n"
+"$Lt_1_17922:\n"
+"	.loc	14	289	0\n"
+"	ld.param.u64 	%rd44, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd45, %rd17, 16;\n"
+"	add.u64 	%rd46, %rd44, %rd45;\n"
+"	mov.f32 	%f99, %f100;\n"  /* NOTE(review): %f100 is never written in this entry, so the 4th stored component is undefined */
+"	st.global.v4.f32 	[%rd46+0], {%f32,%f31,%f30,%f99};\n"  /* 16-byte store to ans + 16 * %r9 */
+"$Lt_1_13314:\n"
+"	.loc	14	291	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/cmmc_long_gpu_ptx.h
+++ b/lib/gpu/cmmc_long_gpu_ptx.h
@ -0,0 +1,829 @@
+const char * cmmc_long_gpu_kernel = 
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 pos_tex;\n"
+"	.tex .u64 q_tex;\n"
+"	.entry kernel_pair (\n"
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n"
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_q_,\n"
+"		.param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
+"		.param .f32 __cudaparm_kernel_pair_g_ewald)\n"
+"	{\n"
+"	.reg .u32 %r<42>;\n"
+"	.reg .u64 %rd<38>;\n"
+"	.reg .f32 %f<156>;\n"
+"	.reg .pred %p<12>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj108[32];\n"
+"	.loc	14	107	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	111	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+0], %f1;\n"
+"	.loc	14	112	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+4], %f2;\n"
+"	.loc	14	113	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+8], %f3;\n"
+"	.loc	14	114	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+12], %f4;\n"
+"	.loc	14	115	0\n"
+"	ld.global.f32 	%f5, [%rd1+16];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+16], %f5;\n"
+"	.loc	14	116	0\n"
+"	ld.global.f32 	%f6, [%rd1+20];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+20], %f6;\n"
+"	.loc	14	117	0\n"
+"	ld.global.f32 	%f7, [%rd1+24];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+24], %f7;\n"
+"	.loc	14	118	0\n"
+"	ld.global.f32 	%f8, [%rd1+28];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+28], %f8;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_11778;\n"
+"	.loc	14	129	0\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	mov.f32 	%f17, 0f00000000;    	\n"
+"	mov.f32 	%f18, %f17;\n"
+"	mov.f32 	%f19, 0f00000000;    	\n"
+"	mov.f32 	%f20, %f19;\n"
+"	.loc	14	132	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"
+"	.loc	14	134	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"
+"	.loc	14	135	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"
+"	mov.s64 	%rd10, %rd9;\n"
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
+"	.loc	14	138	0\n"
+"	mov.f32 	%f25, %f21;\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.s32 	%r14, %r7;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	139	0\n"
+"	mov.f32 	%f33, %f29;\n"
+"	mul24.lo.s32 	%r18, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r18;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f34, 0f00000000;    	\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_18434;\n"
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r21;\n"
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r22;\n"
+"	cvt.rzi.s32.f32 	%r23, %f28;\n"
+"	ld.param.s32 	%r24, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r25, %r24, %r23;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_lj1];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj108;\n"
+"$Lt_0_12802:\n"
+"	.loc	14	143	0\n"
+"	ld.global.s32 	%r26, [%rd10+0];\n"
+"	.loc	14	146	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"
+"	cvt.s64.s32 	%rd16, %r27;\n"
+"	and.b64 	%rd17, %rd16, 3;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	147	0\n"
+"	mov.f32 	%f40, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f41, [%rd19+16];\n"
+"	sub.f32 	%f42, %f40, %f41;\n"
+"	and.b32 	%r28, %r26, 1073741823;\n"
+"	mov.s32 	%r29, %r28;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	mov.s32 	%r32, 0;\n"
+"	tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
+"	.loc	14	150	0\n"
+"	mov.f32 	%f47, %f43;\n"
+"	mov.f32 	%f48, %f44;\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	cvt.rzi.s32.f32 	%r33, %f50;\n"
+"	sub.f32 	%f51, %f26, %f48;\n"
+"	sub.f32 	%f52, %f25, %f47;\n"
+"	sub.f32 	%f53, %f27, %f49;\n"
+"	mul.f32 	%f54, %f51, %f51;\n"
+"	mad.f32 	%f55, %f52, %f52, %f54;\n"
+"	mad.f32 	%f56, %f53, %f53, %f55;\n"
+"	add.s32 	%r34, %r33, %r25;\n"
+"	cvt.u64.s32 	%rd20, %r34;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd21, %rd14;\n"
+"	ld.global.f32 	%f57, [%rd22+0];\n"
+"	setp.gt.f32 	%p5, %f57, %f56;\n"
+"	@!%p5 bra 	$Lt_0_16642;\n"
+"	rcp.approx.f32 	%f58, %f56;\n"
+"	ld.global.f32 	%f59, [%rd22+4];\n"
+"	setp.lt.f32 	%p6, %f56, %f59;\n"
+"	@!%p6 bra 	$Lt_0_13826;\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd24, %rd23, %rd21;\n"
+"	ld.global.f32 	%f60, [%rd24+0];\n"
+"	mov.f32 	%f61, 0f40000000;    	\n"
+"	setp.eq.f32 	%p7, %f60, %f61;\n"
+"	@!%p7 bra 	$Lt_0_14338;\n"
+"	.loc	14	166	0\n"
+"	mul.f32 	%f62, %f58, %f58;\n"
+"	mov.f32 	%f63, %f62;\n"
+"	mov.f32 	%f64, %f63;\n"
+"	.loc	14	167	0\n"
+"	mul.f32 	%f65, %f62, %f62;\n"
+"	mov.f32 	%f66, %f65;\n"
+"	bra.uni 	$Lt_0_14594;\n"
+"$Lt_0_14338:\n"
+"	mov.f32 	%f67, 0f3f800000;    	\n"
+"	setp.eq.f32 	%p8, %f60, %f67;\n"
+"	@!%p8 bra 	$Lt_0_14850;\n"
+"	.loc	14	169	0\n"
+"	sqrt.approx.f32 	%f68, %f58;\n"
+"	mul.f32 	%f69, %f58, %f68;\n"
+"	mov.f32 	%f65, %f69;\n"
+"	mov.f32 	%f66, %f65;\n"
+"	.loc	14	170	0\n"
+"	mul.f32 	%f63, %f69, %f69;\n"
+"	mov.f32 	%f64, %f63;\n"
+"	bra.uni 	$Lt_0_14594;\n"
+"$Lt_0_14850:\n"
+"	.loc	14	172	0\n"
+"	mul.f32 	%f70, %f58, %f58;\n"
+"	mul.f32 	%f71, %f58, %f70;\n"
+"	mov.f32 	%f63, %f71;\n"
+"	mov.f32 	%f64, %f63;\n"
+"	.loc	14	173	0\n"
+"	mov.f32 	%f65, %f71;\n"
+"	mov.f32 	%f66, %f65;\n"
+"$Lt_0_14594:\n"
+"$Lt_0_14082:\n"
+"	.loc	14	146	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	175	0\n"
+"	mul.f32 	%f72, %f39, %f63;\n"
+"	ld.global.v2.f32 	{%f73,%f74}, [%rd22+8];\n"
+"	mul.f32 	%f75, %f73, %f65;\n"
+"	sub.f32 	%f76, %f75, %f74;\n"
+"	mul.f32 	%f77, %f72, %f76;\n"
+"	bra.uni 	$Lt_0_13570;\n"
+"$Lt_0_13826:\n"
+"	.loc	14	177	0\n"
+"	mov.f32 	%f77, 0f00000000;    	\n"
+"$Lt_0_13570:\n"
+"	ld.param.f32 	%f78, [__cudaparm_kernel_pair_cut_coulsq];\n"
+"	setp.gt.f32 	%p9, %f78, %f56;\n"
+"	@!%p9 bra 	$Lt_0_15362;\n"
+"	.loc	14	184	0\n"
+"	sqrt.approx.f32 	%f79, %f56;\n"
+"	ld.param.f32 	%f80, [__cudaparm_kernel_pair_g_ewald];\n"
+"	mul.f32 	%f81, %f80, %f79;\n"
+"	mul.f32 	%f82, %f81, %f81;\n"
+"	mov.f32 	%f83, 0f3f800000;    	\n"
+"	mov.f32 	%f84, 0f3ea7ba05;    	\n"
+"	mad.f32 	%f85, %f84, %f81, %f83;\n"
+"	neg.f32 	%f86, %f82;\n"
+"	rcp.approx.f32 	%f87, %f85;\n"
+"	mov.f32 	%f88, 0f3fb8aa3b;    	\n"
+"	mul.f32 	%f89, %f86, %f88;\n"
+"	ex2.approx.f32 	%f90, %f89;\n"
+"	mov.f32 	%f91, 0f3e827906;    	\n"
+"	mov.f32 	%f92, 0fbe91a98e;    	\n"
+"	mov.f32 	%f93, 0f3fb5f0e3;    	\n"
+"	mov.f32 	%f94, 0fbfba00e3;    	\n"
+"	mov.f32 	%f95, 0f3f87dc22;    	\n"
+"	mad.f32 	%f96, %f95, %f87, %f94;\n"
+"	mad.f32 	%f97, %f87, %f96, %f93;\n"
+"	mad.f32 	%f98, %f87, %f97, %f92;\n"
+"	mad.f32 	%f99, %f87, %f98, %f91;\n"
+"	mul.f32 	%f100, %f87, %f99;\n"
+"	mul.f32 	%f101, %f90, %f100;\n"
+"	mov.f32 	%f102, %f101;\n"
+"	mov.s32 	%r35, %r28;\n"
+"	mov.s32 	%r36, 0;\n"
+"	mov.s32 	%r37, 0;\n"
+"	mov.s32 	%r38, 0;\n"
+"	tex.1d.v4.f32.s32 {%f103,%f104,%f105,%f106},[q_tex,{%r35,%r36,%r37,%r38}];\n"
+"	.loc	14	185	0\n"
+"	mov.f32 	%f107, %f103;\n"
+"	ld.param.f32 	%f108, [__cudaparm_kernel_pair_qqrd2e];\n"
+"	mul.f32 	%f109, %f108, %f33;\n"
+"	mul.f32 	%f110, %f109, %f107;\n"
+"	div.approx.f32 	%f111, %f110, %f79;\n"
+"	mov.f32 	%f112, %f111;\n"
+"	.loc	14	186	0\n"
+"	mov.f32 	%f113, 0f3f906ebb;   	\n"
+"	mul.f32 	%f114, %f81, %f113;\n"
+"	mad.f32 	%f115, %f90, %f114, %f101;\n"
+"	sub.f32 	%f116, %f115, %f42;\n"
+"	mul.f32 	%f117, %f111, %f116;\n"
+"	bra.uni 	$Lt_0_15106;\n"
+"$Lt_0_15362:\n"
+"	.loc	14	189	0\n"
+"	mov.f32 	%f112, 0f00000000;   	\n"
+"	mov.f32 	%f117, 0f00000000;   	\n"
+"$Lt_0_15106:\n"
+"	.loc	14	194	0\n"
+"	add.f32 	%f118, %f117, %f77;\n"
+"	mul.f32 	%f119, %f118, %f58;\n"
+"	mad.f32 	%f36, %f52, %f119, %f36;\n"
+"	.loc	14	195	0\n"
+"	mad.f32 	%f35, %f51, %f119, %f35;\n"
+"	.loc	14	196	0\n"
+"	mad.f32 	%f34, %f53, %f119, %f34;\n"
+"	@!%p3 bra 	$Lt_0_16130;\n"
+"	.loc	14	199	0\n"
+"	mov.f32 	%f120, %f102;\n"
+"	sub.f32 	%f121, %f120, %f42;\n"
+"	mad.f32 	%f37, %f112, %f121, %f37;\n"
+"	@!%p6 bra 	$Lt_0_16130;\n"
+"	.loc	14	201	0\n"
+"	ld.param.u64 	%rd25, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd26, %rd25, %rd21;\n"
+"	ld.global.v4.f32 	{_,%f122,%f123,%f124}, [%rd26+0];\n"
+"	mov.f32 	%f125, %f64;\n"
+"	.loc	14	146	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	201	0\n"
+"	mul.f32 	%f126, %f125, %f39;\n"
+"	mov.f32 	%f127, %f66;\n"
+"	mul.f32 	%f128, %f122, %f127;\n"
+"	sub.f32 	%f129, %f128, %f123;\n"
+"	mul.f32 	%f130, %f126, %f129;\n"
+"	sub.f32 	%f131, %f130, %f124;\n"
+"	add.f32 	%f38, %f38, %f131;\n"
+"$Lt_0_16130:\n"
+"$Lt_0_15618:\n"
+"	@!%p4 bra 	$Lt_0_16642;\n"
+"	.loc	14	206	0\n"
+"	mov.f32 	%f132, %f10;\n"
+"	mul.f32 	%f133, %f52, %f52;\n"
+"	mad.f32 	%f134, %f119, %f133, %f132;\n"
+"	mov.f32 	%f10, %f134;\n"
+"	.loc	14	207	0\n"
+"	mov.f32 	%f135, %f12;\n"
+"	mad.f32 	%f136, %f119, %f54, %f135;\n"
+"	mov.f32 	%f12, %f136;\n"
+"	.loc	14	208	0\n"
+"	mov.f32 	%f137, %f14;\n"
+"	mul.f32 	%f138, %f53, %f53;\n"
+"	mad.f32 	%f139, %f119, %f138, %f137;\n"
+"	mov.f32 	%f14, %f139;\n"
+"	.loc	14	209	0\n"
+"	mov.f32 	%f140, %f16;\n"
+"	mul.f32 	%f141, %f51, %f52;\n"
+"	mad.f32 	%f142, %f119, %f141, %f140;\n"
+"	mov.f32 	%f16, %f142;\n"
+"	.loc	14	210	0\n"
+"	mov.f32 	%f143, %f18;\n"
+"	mul.f32 	%f144, %f52, %f53;\n"
+"	mad.f32 	%f145, %f119, %f144, %f143;\n"
+"	mov.f32 	%f18, %f145;\n"
+"	.loc	14	211	0\n"
+"	mul.f32 	%f146, %f51, %f53;\n"
+"	mad.f32 	%f19, %f119, %f146, %f19;\n"
+"	mov.f32 	%f147, %f19;\n"
+"$Lt_0_16642:\n"
+"$Lt_0_13058:\n"
+"	.loc	14	142	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"
+"	setp.gt.u64 	%p10, %rd13, %rd10;\n"
+"	@%p10 bra 	$Lt_0_12802;\n"
+"	bra.uni 	$Lt_0_12290;\n"
+"$Lt_0_18434:\n"
+"	mov.s32 	%r39, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r39;\n"
+"	mov.s32 	%r40, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r40;\n"
+"$Lt_0_12290:\n"
+"	.loc	14	218	0\n"
+"	ld.param.u64 	%rd27, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd28, %rd27, %rd3;\n"
+"	@!%p3 bra 	$Lt_0_17410;\n"
+"	.loc	14	220	0\n"
+"	st.global.f32 	[%rd28+0], %f38;\n"
+"	.loc	14	221	0\n"
+"	cvt.u64.s32 	%rd29, %r6;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd28, %rd30, %rd28;\n"
+"	.loc	14	222	0\n"
+"	st.global.f32 	[%rd28+0], %f37;\n"
+"	.loc	14	223	0\n"
+"	add.u64 	%rd28, %rd30, %rd28;\n"
+"$Lt_0_17410:\n"
+"	@!%p4 bra 	$Lt_0_17922;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f148, %f10;\n"
+"	st.global.f32 	[%rd28+0], %f148;\n"
+"	.loc	14	228	0\n"
+"	cvt.u64.s32 	%rd31, %r6;\n"
+"	mul.lo.u64 	%rd32, %rd31, 4;\n"
+"	add.u64 	%rd28, %rd32, %rd28;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f149, %f12;\n"
+"	st.global.f32 	[%rd28+0], %f149;\n"
+"	.loc	14	228	0\n"
+"	add.u64 	%rd28, %rd32, %rd28;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f150, %f14;\n"
+"	st.global.f32 	[%rd28+0], %f150;\n"
+"	.loc	14	228	0\n"
+"	add.u64 	%rd28, %rd32, %rd28;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f151, %f16;\n"
+"	st.global.f32 	[%rd28+0], %f151;\n"
+"	.loc	14	228	0\n"
+"	add.u64 	%rd28, %rd32, %rd28;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f152, %f18;\n"
+"	st.global.f32 	[%rd28+0], %f152;\n"
+"	add.u64 	%rd33, %rd32, %rd28;\n"
+"	st.global.f32 	[%rd33+0], %f19;\n"
+"$Lt_0_17922:\n"
+"	.loc	14	231	0\n"
+"	ld.param.u64 	%rd34, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd35, %rd2, 16;\n"
+"	add.u64 	%rd36, %rd34, %rd35;\n"
+"	mov.f32 	%f153, %f154;\n"
+"	st.global.v4.f32 	[%rd36+0], {%f36,%f35,%f34,%f153};\n"
+"$Lt_0_11778:\n"
+"	.loc	14	233	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"  /* kernel_pair_fast: one thread per index < inum; walks that index's neighbor list and accumulates a 3-component force plus optional energy (eflag > 0) and six force*delta sums (vflag > 0, presumably the virial), using coefficient tables staged in shared memory */
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"  /* not read in this body; positions are fetched through pos_tex */
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"  /* global table of 16-byte entries; entries 0..63 are copied to __cuda_lj11312 */
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"  /* global table of 16-byte entries; entries 0..63 are copied to __cuda_lj3288 */
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"  /* eight floats copied to __cuda_sp_lj244 */
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"  /* neighbor data, rows nbor_pitch 4-byte words apart */
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"  /* output: one 16-byte vector per index */
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"  /* output: energy and vflag sums, successive values inum floats apart */
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"  /* energy terms are accumulated and stored when > 0 */
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"  /* six force*delta sums are accumulated and stored when > 0 */
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"  /* thread bound and engv stride */
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"  /* not read in this body */
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_q_,\n"  /* not read in this body; values are fetched through q_tex */
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"  /* Coulomb term is evaluated only while squared distance < cut_coulsq */
+"		.param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"  /* scale factor of the Coulomb prefactor */
+"		.param .f32 __cudaparm_kernel_pair_fast_g_ewald)\n"  /* multiplies the distance inside the erfc-style approximation */
+"	{\n"
+"	.reg .u32 %r<43>;\n"
+"	.reg .u64 %rd<49>;\n"
+"	.reg .f32 %f<159>;\n"
+"	.reg .pred %p<14>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj244[32];\n"  /* 8 floats, filled by threads tid.x <= 7 */
+"	.shared .align 16 .b8 __cuda_lj3288[1024];\n"  /* 64 entries x 16 bytes, filled by threads tid.x <= 63 */
+"	.shared .align 16 .b8 __cuda_lj11312[1024];\n"  /* 64 entries x 16 bytes, filled by threads tid.x <= 63 */
+"	.loc	14	242	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 7;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_13314;\n"  /* only threads 0..7 stage sp_lj */
+"	.loc	14	249	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"  /* sp_lj[tid] = sp_lj_in[tid] */
+"$Lt_1_13314:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_13826;\n"  /* only threads 0..63 stage the two coefficient tables */
+"	.loc	14	251	0\n"
+"	mov.u64 	%rd7, __cuda_lj3288;\n"
+"	mov.u64 	%rd8, __cuda_lj11312;\n"
+"	cvt.u64.s32 	%rd9, %r1;\n"
+"	mul.lo.u64 	%rd10, %rd9, 16;\n"
+"	ld.param.u64 	%rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd12, %rd11, %rd10;\n"
+"	add.u64 	%rd13, %rd10, %rd8;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd12+0];\n"
+"	st.shared.f32 	[%rd13+0], %f2;\n"  /* lj1[tid] = lj1_in[tid], stored component by component */
+"	st.shared.f32 	[%rd13+4], %f3;\n"
+"	st.shared.f32 	[%rd13+8], %f4;\n"
+"	st.shared.f32 	[%rd13+12], %f5;\n"
+"	.loc	14	252	0\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd15, %rd14, %rd10;\n"
+"	add.u64 	%rd16, %rd10, %rd7;\n"
+"	ld.global.v4.f32 	{%f6,%f7,%f8,%f9}, [%rd15+0];\n"
+"	st.shared.f32 	[%rd16+0], %f6;\n"  /* lj3[tid] = lj3_in[tid] */
+"	st.shared.f32 	[%rd16+4], %f7;\n"
+"	st.shared.f32 	[%rd16+8], %f8;\n"
+"	st.shared.f32 	[%rd16+12], %f9;\n"
+"$Lt_1_13826:\n"
+"	mov.u64 	%rd7, __cuda_lj3288;\n"
+"	mov.u64 	%rd8, __cuda_lj11312;\n"
+"	.loc	14	255	0\n"
+"	bar.sync 	0;\n"  /* barrier: the staged shared tables must be complete before any thread reads them */
+"	cvt.s32.u16 	%r4, %ctaid.x;\n"
+"	cvt.s32.u16 	%r5, %ntid.x;\n"
+"	mul24.lo.s32 	%r6, %r4, %r5;\n"
+"	add.s32 	%r7, %r6, %r1;\n"  /* r7 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p3, %r7, %r8;\n"
+"	@%p3 bra 	$Lt_1_14338;\n"  /* threads with index >= inum do no work */
+"	.loc	14	267	0\n"
+"	mov.f32 	%f10, 0f00000000;    	\n"  /* zero the vflag accumulators: f11, f13, f15, f17, f19 and f20 */
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	mov.f32 	%f18, 0f00000000;    	\n"
+"	mov.f32 	%f19, %f18;\n"
+"	mov.f32 	%f20, 0f00000000;    	\n"
+"	mov.f32 	%f21, %f20;\n"
+"	.loc	14	270	0\n"
+"	cvt.u64.s32 	%rd17, %r7;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd20, %rd19, %rd18;\n"
+"	ld.global.s32 	%r9, [%rd20+0];\n"  /* r9 = first neighbor-row word; used below as the texture index of this thread's own entry */
+"	.loc	14	272	0\n"
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd21, %r10;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"  /* rd22 = byte stride between neighbor rows */
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r11, [%rd23+0];\n"  /* r11 = second row word; used below as the neighbor count */
+"	.loc	14	273	0\n"
+"	add.u64 	%rd24, %rd23, %rd22;\n"  /* rd24 = address of the first neighbor word (third row) */
+"	mov.s64 	%rd25, %rd24;\n"  /* rd25 = running neighbor pointer */
+"	mov.s32 	%r12, %r9;\n"
+"	mov.s32 	%r13, 0;\n"
+"	mov.s32 	%r14, 0;\n"
+"	mov.s32 	%r15, 0;\n"
+"	tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r12,%r13,%r14,%r15}];\n"
+"	.loc	14	276	0\n"
+"	mov.f32 	%f26, %f22;\n"  /* f26, f27, f28 = own coordinates; f29 = 4th component (presumably the atom type) */
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mov.s32 	%r16, %r9;\n"
+"	mov.s32 	%r17, 0;\n"
+"	mov.s32 	%r18, 0;\n"
+"	mov.s32 	%r19, 0;\n"
+"	tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r16,%r17,%r18,%r19}];\n"
+"	.loc	14	277	0\n"
+"	mov.f32 	%f34, %f30;\n"  /* f34 = own q_tex value (presumably the charge) */
+"	mul24.lo.s32 	%r20, %r11, %r10;\n"
+"	cvt.s64.s32 	%rd26, %r20;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd24, %rd27;\n"  /* rd28 = end pointer = first neighbor + count * row stride */
+"	ld.param.s32 	%r21, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r22, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p4, %rd24, %rd28;\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"  /* f37, f36, f35 = force sums; f38 = Coulomb energy sum; f39 = LJ-style energy sum */
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	mov.f32 	%f39, 0f00000000;    	\n"
+"	@%p4 bra 	$Lt_1_20994;\n"  /* empty neighbor list: skip the loop */
+"	mov.s32 	%r23, 0;\n"
+"	setp.gt.s32 	%p5, %r22, %r23;\n"  /* p5 = eflag > 0 */
+"	mov.s32 	%r24, 0;\n"
+"	setp.gt.s32 	%p6, %r21, %r24;\n"  /* p6 = vflag > 0 */
+"	cvt.rzi.s32.f32 	%r25, %f29;\n"
+"	mov.s32 	%r26, 8;\n"
+"	mul24.lo.s32 	%r27, %r26, %r25;\n"
+"	cvt.rn.f32.s32 	%f40, %r27;\n"  /* f40 = 8 * int(own 4th component), kept as float; base of the table row index */
+"$Lt_1_15362:\n"  /* ---- neighbor loop ---- */
+"	.loc	14	282	0\n"
+"	ld.global.s32 	%r28, [%rd25+0];\n"  /* r28 = packed neighbor word */
+"	.loc	14	285	0\n"
+"	shr.s32 	%r29, %r28, 30;\n"
+"	cvt.s64.s32 	%rd29, %r29;\n"
+"	and.b64 	%rd30, %rd29, 3;\n"  /* k = top two bits of the neighbor word */
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd32, %rd1, %rd31;\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"  /* f41 = sp_lj[k], scale applied to the LJ-style term */
+"	.loc	14	286	0\n"
+"	mov.f32 	%f42, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f43, [%rd32+16];\n"
+"	sub.f32 	%f44, %f42, %f43;\n"  /* f44 = 1.0 - sp_lj[k+4], subtracted from the Coulomb terms */
+"	and.b32 	%r30, %r28, 1073741823;\n"  /* r30 = low 30 bits = neighbor index */
+"	mov.s32 	%r31, %r30;\n"
+"	mov.s32 	%r32, 0;\n"
+"	mov.s32 	%r33, 0;\n"
+"	mov.s32 	%r34, 0;\n"
+"	tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r31,%r32,%r33,%r34}];\n"
+"	.loc	14	289	0\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	mov.f32 	%f51, %f47;\n"
+"	mov.f32 	%f52, %f48;\n"
+"	sub.f32 	%f53, %f27, %f50;\n"  /* f53 = delta of 2nd component */
+"	sub.f32 	%f54, %f26, %f49;\n"  /* f54 = delta of 1st component */
+"	sub.f32 	%f55, %f28, %f51;\n"  /* f55 = delta of 3rd component */
+"	mul.f32 	%f56, %f53, %f53;\n"
+"	mad.f32 	%f57, %f54, %f54, %f56;\n"
+"	mad.f32 	%f58, %f55, %f55, %f57;\n"  /* f58 = squared distance r2 */
+"	add.f32 	%f59, %f40, %f52;\n"
+"	cvt.rzi.s32.f32 	%r35, %f59;\n"  /* table row = int(8 * own 4th component + neighbor 4th component) */
+"	cvt.u64.s32 	%rd33, %r35;\n"
+"	mul.lo.u64 	%rd34, %rd33, 16;\n"
+"	add.u64 	%rd35, %rd34, %rd8;\n"  /* rd35 = address of the lj1 row in shared memory */
+"	ld.shared.f32 	%f60, [%rd35+0];\n"
+"	setp.gt.f32 	%p7, %f60, %f58;\n"
+"	@!%p7 bra 	$Lt_1_19202;\n"  /* pair skipped unless r2 < lj1[row] component 0 */
+"	rcp.approx.f32 	%f61, %f58;\n"  /* f61 = 1 / r2 */
+"	ld.shared.f32 	%f62, [%rd35+4];\n"
+"	setp.lt.f32 	%p8, %f58, %f62;\n"  /* p8 = r2 < lj1[row] component 1: LJ-style term active */
+"	@!%p8 bra 	$Lt_1_16386;\n"
+"	add.u64 	%rd36, %rd34, %rd7;\n"
+"	ld.shared.f32 	%f63, [%rd36+0];\n"  /* f63 = lj3[row] component 0, selects the power pair below */
+"	mov.f32 	%f64, 0f40000000;    	\n"
+"	setp.eq.f32 	%p9, %f63, %f64;\n"
+"	@!%p9 bra 	$Lt_1_16898;\n"
+"	.loc	14	304	0\n"
+"	mul.f32 	%f65, %f61, %f61;\n"  /* selector == 2.0: f66 = (1/r2)^2, f68 = (1/r2)^4 */
+"	mov.f32 	%f66, %f65;\n"
+"	mov.f32 	%f67, %f66;\n"
+"	.loc	14	305	0\n"
+"	mul.f32 	%f68, %f65, %f65;\n"
+"	mov.f32 	%f69, %f68;\n"
+"	bra.uni 	$Lt_1_17154;\n"
+"$Lt_1_16898:\n"
+"	mov.f32 	%f70, 0f3f800000;    	\n"
+"	.loc	14	289	0\n"
+"	ld.shared.f32 	%f63, [%rd36+0];\n"
+"	.loc	14	305	0\n"
+"	setp.eq.f32 	%p10, %f63, %f70;\n"
+"	@!%p10 bra 	$Lt_1_17410;\n"
+"	.loc	14	307	0\n"
+"	sqrt.approx.f32 	%f71, %f61;\n"  /* selector == 1.0: f68 = (1/r2) * sqrt(1/r2), f66 = f68 squared */
+"	mul.f32 	%f72, %f61, %f71;\n"
+"	mov.f32 	%f68, %f72;\n"
+"	mov.f32 	%f69, %f68;\n"
+"	.loc	14	308	0\n"
+"	mul.f32 	%f66, %f72, %f72;\n"
+"	mov.f32 	%f67, %f66;\n"
+"	bra.uni 	$Lt_1_17154;\n"
+"$Lt_1_17410:\n"
+"	.loc	14	310	0\n"
+"	mul.f32 	%f73, %f61, %f61;\n"  /* any other selector: f66 = f68 = (1/r2)^3 */
+"	mul.f32 	%f74, %f61, %f73;\n"
+"	mov.f32 	%f66, %f74;\n"
+"	mov.f32 	%f67, %f66;\n"
+"	.loc	14	311	0\n"
+"	mov.f32 	%f68, %f74;\n"
+"	mov.f32 	%f69, %f68;\n"
+"$Lt_1_17154:\n"
+"$Lt_1_16642:\n"
+"	.loc	14	285	0\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"
+"	.loc	14	313	0\n"
+"	mul.f32 	%f75, %f41, %f66;\n"
+"	ld.shared.f32 	%f76, [%rd35+12];\n"
+"	ld.shared.f32 	%f77, [%rd35+8];\n"
+"	mul.f32 	%f78, %f77, %f68;\n"
+"	sub.f32 	%f79, %f78, %f76;\n"
+"	mul.f32 	%f80, %f75, %f79;\n"  /* f80 = sp_lj[k] * f66 * (lj1[row] comp 2 * f68 - lj1[row] comp 3) */
+"	bra.uni 	$Lt_1_16130;\n"
+"$Lt_1_16386:\n"
+"	.loc	14	315	0\n"
+"	mov.f32 	%f80, 0f00000000;    	\n"  /* LJ-style force term is zero when p8 is false */
+"$Lt_1_16130:\n"
+"	ld.param.f32 	%f81, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
+"	setp.gt.f32 	%p11, %f81, %f58;\n"  /* p11 = r2 < cut_coulsq: Coulomb term active */
+"	@!%p11 bra 	$Lt_1_17922;\n"
+"	.loc	14	322	0\n"
+"	sqrt.approx.f32 	%f82, %f58;\n"  /* f82 = r */
+"	ld.param.f32 	%f83, [__cudaparm_kernel_pair_fast_g_ewald];\n"
+"	mul.f32 	%f84, %f83, %f82;\n"  /* f84 = g_ewald * r */
+"	mul.f32 	%f85, %f84, %f84;\n"
+"	mov.f32 	%f86, 0f3f800000;    	\n"
+"	mov.f32 	%f87, 0f3ea7ba05;    	\n"  /* ~0.3275911 */
+"	mad.f32 	%f88, %f87, %f84, %f86;\n"
+"	neg.f32 	%f89, %f85;\n"
+"	rcp.approx.f32 	%f90, %f88;\n"  /* f90 = t = 1 / (1 + 0.3275911 * g_ewald * r) */
+"	mov.f32 	%f91, 0f3fb8aa3b;    	\n"  /* ~1.442695 = log2(e), turns ex2 into exp */
+"	mul.f32 	%f92, %f89, %f91;\n"
+"	ex2.approx.f32 	%f93, %f92;\n"  /* f93 = exp(-(g_ewald * r)^2) */
+"	mov.f32 	%f94, 0f3e827906;    	\n"  /* ~0.2548296 */
+"	mov.f32 	%f95, 0fbe91a98e;    	\n"  /* ~-0.2844967 */
+"	mov.f32 	%f96, 0f3fb5f0e3;    	\n"  /* ~1.4214137 */
+"	mov.f32 	%f97, 0fbfba00e3;    	\n"  /* ~-1.4531520 */
+"	mov.f32 	%f98, 0f3f87dc22;    	\n"  /* ~1.0614054 */
+"	mad.f32 	%f99, %f98, %f90, %f97;\n"  /* Horner evaluation of the 5-term polynomial in t */
+"	mad.f32 	%f100, %f90, %f99, %f96;\n"
+"	mad.f32 	%f101, %f90, %f100, %f95;\n"
+"	mad.f32 	%f102, %f90, %f101, %f94;\n"
+"	mul.f32 	%f103, %f90, %f102;\n"
+"	mul.f32 	%f104, %f93, %f103;\n"  /* f104 = t * poly(t) * exp(-x^2); the constants appear to match the Abramowitz-Stegun erfc(x) approximation */
+"	mov.f32 	%f105, %f104;\n"  /* f105 is written ONLY on this in-cutoff path */
+"	mov.s32 	%r36, %r30;\n"
+"	mov.s32 	%r37, 0;\n"
+"	mov.s32 	%r38, 0;\n"
+"	mov.s32 	%r39, 0;\n"
+"	tex.1d.v4.f32.s32 {%f106,%f107,%f108,%f109},[q_tex,{%r36,%r37,%r38,%r39}];\n"
+"	.loc	14	323	0\n"
+"	mov.f32 	%f110, %f106;\n"  /* f110 = neighbor q_tex value */
+"	ld.param.f32 	%f111, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
+"	mul.f32 	%f112, %f111, %f34;\n"
+"	mul.f32 	%f113, %f112, %f110;\n"
+"	div.approx.f32 	%f114, %f113, %f82;\n"  /* f114 = qqrd2e * q_own * q_neighbor / r */
+"	mov.f32 	%f115, %f114;\n"
+"	.loc	14	324	0\n"
+"	mov.f32 	%f116, 0f3f906ebb;   	\n"  /* ~1.1283792 = 2/sqrt(pi) */
+"	mul.f32 	%f117, %f84, %f116;\n"
+"	mad.f32 	%f118, %f93, %f117, %f104;\n"
+"	sub.f32 	%f119, %f118, %f44;\n"
+"	mul.f32 	%f120, %f114, %f119;\n"  /* f120 = prefactor * (erfc + 1.1283792 * x * exp(-x^2) - f44) */
+"	bra.uni 	$Lt_1_17666;\n"
+"$Lt_1_17922:\n"
+"	.loc	14	327	0\n"
+"	mov.f32 	%f115, 0f00000000;   	\n"  /* outside the Coulomb cutoff: prefactor and force term are zero */
+"	mov.f32 	%f120, 0f00000000;   	\n"
+"$Lt_1_17666:\n"
+"	.loc	14	332	0\n"
+"	add.f32 	%f121, %f120, %f80;\n"
+"	mul.f32 	%f122, %f121, %f61;\n"  /* f122 = (Coulomb term + LJ-style term) / r2 */
+"	mad.f32 	%f37, %f54, %f122, %f37;\n"  /* force sums: f37 += d1 * f122, f36 += d2 * f122, f35 += d3 * f122 */
+"	.loc	14	333	0\n"
+"	mad.f32 	%f36, %f53, %f122, %f36;\n"
+"	.loc	14	334	0\n"
+"	mad.f32 	%f35, %f55, %f122, %f35;\n"
+"	@!%p5 bra 	$Lt_1_18690;\n"
+"	.loc	14	337	0\n"
+"	@%p11 mov.f32 	%f123, %f105;\n"  /* FIX: Coulomb energy is now guarded by p11. f105 is undefined outside the cutoff, and the unguarded form computed 0 * (undefined - f44), which can put NaN into f38 */
+"	@%p11 sub.f32 	%f124, %f123, %f44;\n"
+"	@%p11 mad.f32 	%f38, %f115, %f124, %f38;\n"  /* f38 += prefactor * (erfc - f44), only when r2 < cut_coulsq */
+"	@!%p8 bra 	$Lt_1_18690;\n"
+"	.loc	14	339	0\n"
+"	add.u64 	%rd37, %rd34, %rd7;\n"  /* rd37 = address of the lj3 row in shared memory */
+"	ld.shared.f32 	%f125, [%rd37+12];\n"
+"	mov.f32 	%f126, %f67;\n"
+"	.loc	14	285	0\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"
+"	.loc	14	339	0\n"
+"	mul.f32 	%f127, %f126, %f41;\n"
+"	ld.shared.f32 	%f128, [%rd37+8];\n"
+"	ld.shared.f32 	%f129, [%rd37+4];\n"
+"	mov.f32 	%f130, %f69;\n"
+"	mul.f32 	%f131, %f129, %f130;\n"
+"	sub.f32 	%f132, %f131, %f128;\n"
+"	mul.f32 	%f133, %f127, %f132;\n"
+"	sub.f32 	%f134, %f133, %f125;\n"
+"	add.f32 	%f39, %f39, %f134;\n"  /* f39 += sp_lj[k] * f67 * (lj3 comp 1 * f69 - lj3 comp 2) - lj3 comp 3 */
+"$Lt_1_18690:\n"
+"$Lt_1_18178:\n"
+"	@!%p6 bra 	$Lt_1_19202;\n"
+"	.loc	14	344	0\n"
+"	mov.f32 	%f135, %f11;\n"
+"	mul.f32 	%f136, %f54, %f54;\n"
+"	mad.f32 	%f137, %f122, %f136, %f135;\n"
+"	mov.f32 	%f11, %f137;\n"  /* f11 += f122 * d1 * d1 */
+"	.loc	14	345	0\n"
+"	mov.f32 	%f138, %f13;\n"
+"	mad.f32 	%f139, %f122, %f56, %f138;\n"
+"	mov.f32 	%f13, %f139;\n"  /* f13 += f122 * d2 * d2 */
+"	.loc	14	346	0\n"
+"	mov.f32 	%f140, %f15;\n"
+"	mul.f32 	%f141, %f55, %f55;\n"
+"	mad.f32 	%f142, %f122, %f141, %f140;\n"
+"	mov.f32 	%f15, %f142;\n"  /* f15 += f122 * d3 * d3 */
+"	.loc	14	347	0\n"
+"	mov.f32 	%f143, %f17;\n"
+"	mul.f32 	%f144, %f53, %f54;\n"
+"	mad.f32 	%f145, %f122, %f144, %f143;\n"
+"	mov.f32 	%f17, %f145;\n"  /* f17 += f122 * d1 * d2 */
+"	.loc	14	348	0\n"
+"	mov.f32 	%f146, %f19;\n"
+"	mul.f32 	%f147, %f54, %f55;\n"
+"	mad.f32 	%f148, %f122, %f147, %f146;\n"
+"	mov.f32 	%f19, %f148;\n"  /* f19 += f122 * d1 * d3 */
+"	.loc	14	349	0\n"
+"	mul.f32 	%f149, %f53, %f55;\n"
+"	mad.f32 	%f20, %f122, %f149, %f20;\n"  /* f20 += f122 * d2 * d3 */
+"	mov.f32 	%f150, %f20;\n"
+"$Lt_1_19202:\n"
+"$Lt_1_15618:\n"
+"	.loc	14	281	0\n"
+"	add.u64 	%rd25, %rd22, %rd25;\n"  /* advance to the next neighbor row */
+"	setp.gt.u64 	%p12, %rd28, %rd25;\n"
+"	@%p12 bra 	$Lt_1_15362;\n"
+"	bra.uni 	$Lt_1_14850;\n"
+"$Lt_1_20994:\n"  /* empty-list path: the flag predicates still have to be set for the stores below */
+"	mov.s32 	%r40, 0;\n"
+"	setp.gt.s32 	%p5, %r22, %r40;\n"
+"	mov.s32 	%r41, 0;\n"
+"	setp.gt.s32 	%p6, %r21, %r41;\n"
+"$Lt_1_14850:\n"
+"	.loc	14	356	0\n"
+"	ld.param.u64 	%rd38, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd39, %rd38, %rd18;\n"  /* rd39 = engv + index * 4 */
+"	@!%p5 bra 	$Lt_1_19970;\n"
+"	.loc	14	358	0\n"
+"	st.global.f32 	[%rd39+0], %f39;\n"  /* LJ-style energy sum */
+"	.loc	14	359	0\n"
+"	cvt.u64.s32 	%rd40, %r8;\n"
+"	mul.lo.u64 	%rd41, %rd40, 4;\n"  /* stride between successive engv values = inum floats */
+"	add.u64 	%rd39, %rd41, %rd39;\n"
+"	.loc	14	360	0\n"
+"	st.global.f32 	[%rd39+0], %f38;\n"  /* Coulomb energy sum */
+"	.loc	14	361	0\n"
+"	add.u64 	%rd39, %rd41, %rd39;\n"
+"$Lt_1_19970:\n"
+"	@!%p6 bra 	$Lt_1_20482;\n"
+"	.loc	14	365	0\n"
+"	mov.f32 	%f151, %f11;\n"  /* six vflag sums stored in order f11, f13, f15, f17, f19, f20, each inum floats apart */
+"	st.global.f32 	[%rd39+0], %f151;\n"
+"	.loc	14	366	0\n"
+"	cvt.u64.s32 	%rd42, %r8;\n"
+"	mul.lo.u64 	%rd43, %rd42, 4;\n"
+"	add.u64 	%rd39, %rd43, %rd39;\n"
+"	.loc	14	365	0\n"
+"	mov.f32 	%f152, %f13;\n"
+"	st.global.f32 	[%rd39+0], %f152;\n"
+"	.loc	14	366	0\n"
+"	add.u64 	%rd39, %rd43, %rd39;\n"
+"	.loc	14	365	0\n"
+"	mov.f32 	%f153, %f15;\n"
+"	st.global.f32 	[%rd39+0], %f153;\n"
+"	.loc	14	366	0\n"
+"	add.u64 	%rd39, %rd43, %rd39;\n"
+"	.loc	14	365	0\n"
+"	mov.f32 	%f154, %f17;\n"
+"	st.global.f32 	[%rd39+0], %f154;\n"
+"	.loc	14	366	0\n"
+"	add.u64 	%rd39, %rd43, %rd39;\n"
+"	.loc	14	365	0\n"
+"	mov.f32 	%f155, %f19;\n"
+"	st.global.f32 	[%rd39+0], %f155;\n"
+"	add.u64 	%rd44, %rd43, %rd39;\n"
+"	st.global.f32 	[%rd44+0], %f20;\n"
+"$Lt_1_20482:\n"
+"	.loc	14	369	0\n"
+"	ld.param.u64 	%rd45, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd46, %rd17, 16;\n"
+"	add.u64 	%rd47, %rd45, %rd46;\n"  /* rd47 = ans + index * 16 */
+"	mov.f32 	%f156, %f157;\n"  /* NOTE(review): f157 is never written in this kernel, so the 4th stored component is undefined; presumably the consumer ignores it - confirm */
+"	st.global.v4.f32 	[%rd47+0], {%f37,%f36,%f35,%f156};\n"
+"$Lt_1_14338:\n"
+"	.loc	14	371	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/crml_gpu_ptx.h
+++ b/lib/gpu/crml_gpu_ptx.h
@ -0,0 +1,828 @@
+const char * crml_gpu_kernel = /* PTX text for the kernels that follow, assembled from adjacent C string literals */
+"	.version 1.4\n"  /* PTX ISA version declared for this module */
+"	.target sm_13\n"  /* target architecture declared for this module */
+"	.tex .u64 pos_tex;\n"  /* texture the kernels sample for 4-component position vectors */
+"	.tex .u64 q_tex;\n"  /* texture the kernels sample for the value multiplied into the Coulomb prefactor (presumably charge) */
+"	.entry kernel_pair (\n"  /* kernel_pair: one thread per index < inum; walks that index's neighbor list and accumulates a 3-component force plus optional energy (eflag > 0) and six force*delta sums (vflag > 0, presumably the virial); coefficients are read from the global lj1 table */
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"  /* not read in this body; positions are fetched through pos_tex */
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"  /* global table of 16-byte coefficient entries */
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"  /* row length used to form the lj1 index */
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"  /* eight floats copied to __cuda_sp_lj116 */
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"  /* neighbor data, rows nbor_pitch 4-byte words apart */
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"  /* output: one 16-byte vector per index */
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"  /* output: energy and vflag sums, successive values inum floats apart */
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"  /* energy terms are accumulated and stored when > 0 */
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"  /* six force*delta sums are accumulated and stored when > 0 */
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"  /* thread bound and engv stride */
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"  /* not read in this body */
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_q_,\n"  /* not read in this body; values are fetched through q_tex */
+"		.param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"  /* Coulomb term is evaluated only while squared distance < cut_coulsq */
+"		.param .f32 __cudaparm_kernel_pair_qqrd2e,\n"  /* scale factor of the Coulomb prefactor */
+"		.param .f32 __cudaparm_kernel_pair_g_ewald,\n"  /* multiplies the distance inside the erfc-style approximation */
+"		.param .f32 __cudaparm_kernel_pair_denom_lj,\n"  /* divisor of both switching factors */
+"		.param .f32 __cudaparm_kernel_pair_cut_bothsq,\n"  /* pair is skipped unless squared distance < cut_bothsq */
+"		.param .f32 __cudaparm_kernel_pair_cut_ljsq,\n"  /* LJ term is evaluated only while squared distance < cut_ljsq */
+"		.param .f32 __cudaparm_kernel_pair_cut_lj_innersq)\n"  /* switching is applied when squared distance > cut_lj_innersq */
+"	{\n"
+"	.reg .u32 %r<47>;\n"
+"	.reg .u64 %rd<37>;\n"
+"	.reg .f32 %f<170>;\n"
+"	.reg .pred %p<12>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj116[32];\n"  /* 8 floats; every thread writes all eight below, and no barrier is issued in this kernel */
+"	.loc	14	109	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	114	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+0], %f1;\n"  /* sp_lj[0..7] = sp_lj_in[0..7], unrolled */
+"	.loc	14	115	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+4], %f2;\n"
+"	.loc	14	116	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+8], %f3;\n"
+"	.loc	14	117	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+12], %f4;\n"
+"	.loc	14	118	0\n"
+"	ld.global.f32 	%f5, [%rd1+16];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+16], %f5;\n"
+"	.loc	14	119	0\n"
+"	ld.global.f32 	%f6, [%rd1+20];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+20], %f6;\n"
+"	.loc	14	120	0\n"
+"	ld.global.f32 	%f7, [%rd1+24];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+24], %f7;\n"
+"	.loc	14	121	0\n"
+"	ld.global.f32 	%f8, [%rd1+28];\n"
+"	st.shared.f32 	[__cuda_sp_lj116+28], %f8;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"  /* r5 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_11778;\n"  /* threads with index >= inum do no work */
+"	.loc	14	132	0\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"  /* zero the vflag accumulators: f10, f12, f14, f16, f18 and f19 */
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	mov.f32 	%f17, 0f00000000;    	\n"
+"	mov.f32 	%f18, %f17;\n"
+"	mov.f32 	%f19, 0f00000000;    	\n"
+"	mov.f32 	%f20, %f19;\n"
+"	.loc	14	135	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"  /* r7 = first neighbor-row word; used below as the texture index of this thread's own entry */
+"	.loc	14	137	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"  /* rd7 = byte stride between neighbor rows */
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"  /* r9 = second row word; used below as the neighbor count */
+"	.loc	14	138	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"  /* rd9 = address of the first neighbor word (third row) */
+"	mov.s64 	%rd10, %rd9;\n"  /* rd10 = running neighbor pointer */
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
+"	.loc	14	141	0\n"
+"	mov.f32 	%f25, %f21;\n"  /* f25, f26, f27 = own coordinates; f28 = 4th component (presumably the atom type) */
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.s32 	%r14, %r7;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	142	0\n"
+"	mov.f32 	%f33, %f29;\n"  /* f33 = own q_tex value (presumably the charge) */
+"	mul24.lo.s32 	%r18, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r18;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"  /* rd13 = end pointer = first neighbor + count * row stride */
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f34, 0f00000000;    	\n"  /* f36, f35, f34 = force sums; f37 = Coulomb energy sum; f38 = LJ energy sum */
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_17922;\n"  /* empty neighbor list: skip the loop */
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r21;\n"  /* p3 = eflag > 0 */
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r22;\n"  /* p4 = vflag > 0 */
+"	ld.param.f32 	%f39, [__cudaparm_kernel_pair_cut_bothsq];\n"
+"	mov.u64 	%rd14, __cuda_sp_lj116;\n"
+"$Lt_0_12802:\n"  /* ---- neighbor loop ---- */
+"	.loc	14	146	0\n"
+"	ld.global.s32 	%r23, [%rd10+0];\n"  /* r23 = packed neighbor word */
+"	.loc	14	149	0\n"
+"	shr.s32 	%r24, %r23, 30;\n"
+"	cvt.s64.s32 	%rd15, %r24;\n"
+"	and.b64 	%rd16, %rd15, 3;\n"  /* k = top two bits of the neighbor word */
+"	mul.lo.u64 	%rd17, %rd16, 4;\n"
+"	add.u64 	%rd18, %rd14, %rd17;\n"
+"	ld.shared.f32 	%f40, [%rd18+0];\n"  /* f40 = sp_lj[k], scale applied to the LJ term */
+"	.loc	14	150	0\n"
+"	mov.f32 	%f41, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f42, [%rd18+16];\n"
+"	sub.f32 	%f43, %f41, %f42;\n"  /* f43 = 1.0 - sp_lj[k+4], subtracted from the Coulomb terms */
+"	and.b32 	%r25, %r23, 1073741823;\n"  /* r25 = low 30 bits = neighbor index */
+"	mov.s32 	%r26, %r25;\n"
+"	mov.s32 	%r27, 0;\n"
+"	mov.s32 	%r28, 0;\n"
+"	mov.s32 	%r29, 0;\n"
+"	tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r26,%r27,%r28,%r29}];\n"
+"	.loc	14	153	0\n"
+"	mov.f32 	%f48, %f44;\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	mov.f32 	%f51, %f47;\n"
+"	sub.f32 	%f52, %f26, %f49;\n"  /* f52 = delta of 2nd component */
+"	sub.f32 	%f53, %f25, %f48;\n"  /* f53 = delta of 1st component */
+"	sub.f32 	%f54, %f27, %f50;\n"  /* f54 = delta of 3rd component */
+"	mul.f32 	%f55, %f52, %f52;\n"
+"	mad.f32 	%f56, %f53, %f53, %f55;\n"
+"	mad.f32 	%f57, %f54, %f54, %f56;\n"  /* f57 = squared distance r2 */
+"	setp.lt.f32 	%p5, %f57, %f39;\n"
+"	@!%p5 bra 	$Lt_0_16130;\n"  /* pair skipped unless r2 < cut_bothsq */
+"	ld.param.f32 	%f58, [__cudaparm_kernel_pair_cut_ljsq];\n"
+"	setp.lt.f32 	%p6, %f57, %f58;\n"  /* p6 = r2 < cut_ljsq: LJ term active */
+"	rcp.approx.f32 	%f59, %f57;\n"  /* f59 = 1 / r2 */
+"	@!%p6 bra 	$Lt_0_13826;\n"
+"	.loc	14	168	0\n"
+"	mul.f32 	%f60, %f59, %f59;\n"
+"	mul.f32 	%f61, %f59, %f60;\n"  /* f61 = (1/r2)^3 */
+"	mov.f32 	%f62, %f61;\n"
+"	.loc	14	169	0\n"
+"	cvt.rzi.s32.f32 	%r30, %f51;\n"
+"	cvt.rzi.s32.f32 	%r31, %f28;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_lj1];\n"
+"	ld.param.s32 	%r32, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r33, %r32, %r31;\n"
+"	add.s32 	%r34, %r30, %r33;\n"  /* table index = int(own 4th comp) * lj_types + int(neighbor 4th comp) */
+"	cvt.u64.s32 	%rd20, %r34;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd19, %rd21;\n"  /* rd22 = address of the lj1 entry in global memory */
+"	.loc	14	149	0\n"
+"	ld.shared.f32 	%f40, [%rd18+0];\n"
+"	.loc	14	169	0\n"
+"	mul.f32 	%f63, %f61, %f40;\n"
+"	ld.global.v2.f32 	{%f64,%f65}, [%rd22+0];\n"
+"	mul.f32 	%f66, %f64, %f61;\n"
+"	sub.f32 	%f67, %f66, %f65;\n"
+"	mul.f32 	%f68, %f63, %f67;\n"  /* f68 = sp_lj[k] * r^-6 * (lj1 comp 0 * r^-6 - lj1 comp 1) */
+"	ld.param.f32 	%f69, [__cudaparm_kernel_pair_cut_lj_innersq];\n"
+"	setp.gt.f32 	%p7, %f57, %f69;\n"  /* p7 = r2 > cut_lj_innersq: apply switching */
+"	@!%p7 bra 	$Lt_0_13570;\n"
+"	.loc	14	175	0\n"
+"	add.f32 	%f70, %f57, %f57;\n"
+"	sub.f32 	%f71, %f58, %f57;\n"  /* f71 = cut_ljsq - r2 */
+"	add.f32 	%f72, %f70, %f58;\n"
+"	mul.f32 	%f73, %f71, %f71;\n"
+"	mov.f32 	%f74, 0f40400000;    	\n"  /* 3.0 */
+"	mul.f32 	%f75, %f74, %f69;\n"
+"	sub.f32 	%f76, %f72, %f75;\n"
+"	ld.param.f32 	%f77, [__cudaparm_kernel_pair_denom_lj];\n"
+"	div.approx.f32 	%f78, %f76, %f77;\n"
+"	mul.f32 	%f79, %f73, %f78;\n"  /* f79 = (cut_ljsq - r2)^2 * (cut_ljsq + 2 r2 - 3 cut_lj_innersq) / denom_lj */
+"	mov.f32 	%f80, %f79;\n"
+"	.loc	14	178	0\n"
+"	mov.f32 	%f81, 0f41400000;    	\n"  /* 12.0 */
+"	mul.f32 	%f82, %f57, %f81;\n"
+"	mul.f32 	%f83, %f71, %f82;\n"
+"	sub.f32 	%f84, %f57, %f69;\n"
+"	mul.f32 	%f85, %f83, %f84;\n"
+"	div.approx.f32 	%f86, %f85, %f77;\n"  /* f86 = 12 r2 (cut_ljsq - r2) (r2 - cut_lj_innersq) / denom_lj */
+"	ld.global.v2.f32 	{%f87,%f88}, [%rd22+8];\n"
+"	mul.f32 	%f89, %f87, %f61;\n"
+"	sub.f32 	%f90, %f89, %f88;\n"
+"	mul.f32 	%f91, %f61, %f90;\n"
+"	mul.f32 	%f92, %f86, %f91;\n"
+"	mad.f32 	%f68, %f68, %f79, %f92;\n"  /* f68 = f68 * f79 + f86 * r^-6 * (lj1 comp 2 * r^-6 - lj1 comp 3) */
+"	bra.uni 	$Lt_0_13570;\n"
+"$Lt_0_13826:\n"
+"	.loc	14	181	0\n"
+"	mov.f32 	%f68, 0f00000000;    	\n"  /* LJ force term is zero when p6 is false */
+"$Lt_0_13570:\n"
+"	ld.param.f32 	%f93, [__cudaparm_kernel_pair_cut_coulsq];\n"
+"	setp.gt.f32 	%p8, %f93, %f57;\n"  /* p8 = r2 < cut_coulsq: Coulomb term active */
+"	@!%p8 bra 	$Lt_0_14850;\n"
+"	.loc	14	188	0\n"
+"	sqrt.approx.f32 	%f94, %f57;\n"  /* f94 = r */
+"	ld.param.f32 	%f95, [__cudaparm_kernel_pair_g_ewald];\n"
+"	mul.f32 	%f96, %f95, %f94;\n"  /* f96 = g_ewald * r */
+"	mul.f32 	%f97, %f96, %f96;\n"
+"	mov.f32 	%f98, 0f3f800000;    	\n"
+"	mov.f32 	%f99, 0f3ea7ba05;    	\n"  /* ~0.3275911 */
+"	mad.f32 	%f100, %f99, %f96, %f98;\n"
+"	neg.f32 	%f101, %f97;\n"
+"	rcp.approx.f32 	%f102, %f100;\n"  /* f102 = t = 1 / (1 + 0.3275911 * g_ewald * r) */
+"	mov.f32 	%f103, 0f3fb8aa3b;   	\n"  /* ~1.442695 = log2(e), turns ex2 into exp */
+"	mul.f32 	%f104, %f101, %f103;\n"
+"	ex2.approx.f32 	%f105, %f104;\n"  /* f105 = exp(-(g_ewald * r)^2) */
+"	mov.f32 	%f106, 0f3e827906;   	\n"  /* ~0.2548296 */
+"	mov.f32 	%f107, 0fbe91a98e;   	\n"  /* ~-0.2844967 */
+"	mov.f32 	%f108, 0f3fb5f0e3;   	\n"  /* ~1.4214137 */
+"	mov.f32 	%f109, 0fbfba00e3;   	\n"  /* ~-1.4531520 */
+"	mov.f32 	%f110, 0f3f87dc22;   	\n"  /* ~1.0614054 */
+"	mad.f32 	%f111, %f110, %f102, %f109;\n"  /* Horner evaluation of the 5-term polynomial in t */
+"	mad.f32 	%f112, %f102, %f111, %f108;\n"
+"	mad.f32 	%f113, %f102, %f112, %f107;\n"
+"	mad.f32 	%f114, %f102, %f113, %f106;\n"
+"	mul.f32 	%f115, %f102, %f114;\n"
+"	mul.f32 	%f116, %f105, %f115;\n"  /* f116 = t * poly(t) * exp(-x^2); the constants appear to match the Abramowitz-Stegun erfc(x) approximation */
+"	mov.f32 	%f117, %f116;\n"  /* f117 is written ONLY on this in-cutoff path */
+"	mov.s32 	%r35, %r25;\n"
+"	mov.s32 	%r36, 0;\n"
+"	mov.s32 	%r37, 0;\n"
+"	mov.s32 	%r38, 0;\n"
+"	tex.1d.v4.f32.s32 {%f118,%f119,%f120,%f121},[q_tex,{%r35,%r36,%r37,%r38}];\n"
+"	.loc	14	189	0\n"
+"	mov.f32 	%f122, %f118;\n"  /* f122 = neighbor q_tex value */
+"	ld.param.f32 	%f123, [__cudaparm_kernel_pair_qqrd2e];\n"
+"	mul.f32 	%f124, %f123, %f33;\n"
+"	mul.f32 	%f125, %f124, %f122;\n"
+"	div.approx.f32 	%f126, %f125, %f94;\n"  /* f126 = qqrd2e * q_own * q_neighbor / r */
+"	mov.f32 	%f127, %f126;\n"
+"	.loc	14	190	0\n"
+"	mov.f32 	%f128, 0f3f906ebb;   	\n"  /* ~1.1283792 = 2/sqrt(pi) */
+"	mul.f32 	%f129, %f96, %f128;\n"
+"	mad.f32 	%f130, %f105, %f129, %f116;\n"
+"	sub.f32 	%f131, %f130, %f43;\n"
+"	mul.f32 	%f132, %f126, %f131;\n"  /* f132 = prefactor * (erfc + 1.1283792 * x * exp(-x^2) - f43) */
+"	bra.uni 	$Lt_0_14594;\n"
+"$Lt_0_14850:\n"
+"	.loc	14	193	0\n"
+"	mov.f32 	%f127, 0f00000000;   	\n"  /* outside the Coulomb cutoff: prefactor and force term are zero */
+"	mov.f32 	%f132, 0f00000000;   	\n"
+"$Lt_0_14594:\n"
+"	.loc	14	198	0\n"
+"	add.f32 	%f133, %f132, %f68;\n"
+"	mul.f32 	%f134, %f133, %f59;\n"  /* f134 = (Coulomb term + LJ term) / r2 */
+"	mad.f32 	%f36, %f53, %f134, %f36;\n"  /* force sums: f36 += d1 * f134, f35 += d2 * f134, f34 += d3 * f134 */
+"	.loc	14	199	0\n"
+"	mad.f32 	%f35, %f52, %f134, %f35;\n"
+"	.loc	14	200	0\n"
+"	mad.f32 	%f34, %f54, %f134, %f34;\n"
+"	@!%p3 bra 	$Lt_0_15618;\n"
+"	.loc	14	203	0\n"
+"	@%p8 mov.f32 	%f135, %f117;\n"  /* FIX: Coulomb energy is now guarded by p8. f117 is undefined outside the cutoff, and the unguarded form computed 0 * (undefined - f43), which can put NaN into f37 */
+"	@%p8 sub.f32 	%f136, %f135, %f43;\n"
+"	@%p8 mad.f32 	%f37, %f127, %f136, %f37;\n"  /* f37 += prefactor * (erfc - f43), only when r2 < cut_coulsq */
+"	@!%p6 bra 	$Lt_0_15618;\n"
+"	.loc	14	205	0\n"
+"	cvt.rzi.s32.f32 	%r39, %f51;\n"  /* the lj1 entry address is recomputed here exactly as in the force path */
+"	cvt.rzi.s32.f32 	%r40, %f28;\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj1];\n"
+"	ld.param.s32 	%r41, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r42, %r41, %r40;\n"
+"	add.s32 	%r43, %r39, %r42;\n"
+"	cvt.u64.s32 	%rd24, %r43;\n"
+"	mul.lo.u64 	%rd25, %rd24, 16;\n"
+"	add.u64 	%rd22, %rd23, %rd25;\n"
+"	mov.f32 	%f137, %f62;\n"
+"	ld.global.v2.f32 	{%f138,%f139}, [%rd22+8];\n"
+"	mul.f32 	%f140, %f138, %f137;\n"
+"	sub.f32 	%f141, %f140, %f139;\n"
+"	mul.f32 	%f142, %f137, %f141;\n"  /* f142 = r^-6 * (lj1 comp 2 * r^-6 - lj1 comp 3) */
+"	mov.f32 	%f143, %f80;\n"  /* f80 is only defined when p7 held; the selp below discards the product otherwise */
+"	mul.f32 	%f144, %f143, %f142;\n"
+"	ld.param.f32 	%f145, [__cudaparm_kernel_pair_cut_lj_innersq];\n"
+"	setp.lt.f32 	%p9, %f145, %f57;\n"
+"	selp.f32 	%f142, %f144, %f142, %p9;\n"  /* use the switched value only when r2 > cut_lj_innersq */
+"	.loc	14	149	0\n"
+"	ld.shared.f32 	%f40, [%rd18+0];\n"
+"	.loc	14	208	0\n"
+"	mad.f32 	%f38, %f40, %f142, %f38;\n"  /* f38 += sp_lj[k] * f142 */
+"$Lt_0_15618:\n"
+"$Lt_0_15106:\n"
+"	@!%p4 bra 	$Lt_0_16130;\n"
+"	.loc	14	212	0\n"
+"	mov.f32 	%f146, %f10;\n"
+"	mul.f32 	%f147, %f53, %f53;\n"
+"	mad.f32 	%f148, %f134, %f147, %f146;\n"
+"	mov.f32 	%f10, %f148;\n"  /* f10 += f134 * d1 * d1 */
+"	.loc	14	213	0\n"
+"	mov.f32 	%f149, %f12;\n"
+"	mad.f32 	%f150, %f134, %f55, %f149;\n"
+"	mov.f32 	%f12, %f150;\n"  /* f12 += f134 * d2 * d2 */
+"	.loc	14	214	0\n"
+"	mov.f32 	%f151, %f14;\n"
+"	mul.f32 	%f152, %f54, %f54;\n"
+"	mad.f32 	%f153, %f134, %f152, %f151;\n"
+"	mov.f32 	%f14, %f153;\n"  /* f14 += f134 * d3 * d3 */
+"	.loc	14	215	0\n"
+"	mov.f32 	%f154, %f16;\n"
+"	mul.f32 	%f155, %f52, %f53;\n"
+"	mad.f32 	%f156, %f134, %f155, %f154;\n"
+"	mov.f32 	%f16, %f156;\n"  /* f16 += f134 * d1 * d2 */
+"	.loc	14	216	0\n"
+"	mov.f32 	%f157, %f18;\n"
+"	mul.f32 	%f158, %f53, %f54;\n"
+"	mad.f32 	%f159, %f134, %f158, %f157;\n"
+"	mov.f32 	%f18, %f159;\n"  /* f18 += f134 * d1 * d3 */
+"	.loc	14	217	0\n"
+"	mul.f32 	%f160, %f52, %f54;\n"
+"	mad.f32 	%f19, %f134, %f160, %f19;\n"  /* f19 += f134 * d2 * d3 */
+"	mov.f32 	%f161, %f19;\n"
+"$Lt_0_16130:\n"
+"$Lt_0_13058:\n"
+"	.loc	14	145	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"  /* advance to the next neighbor row */
+"	setp.gt.u64 	%p10, %rd13, %rd10;\n"
+"	@%p10 bra 	$Lt_0_12802;\n"
+"	bra.uni 	$Lt_0_12290;\n"
+"$Lt_0_17922:\n"  /* empty-list path: the flag predicates still have to be set for the stores below */
+"	mov.s32 	%r44, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r44;\n"
+"	mov.s32 	%r45, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r45;\n"
+"$Lt_0_12290:\n"
+"	.loc	14	224	0\n"
+"	ld.param.u64 	%rd26, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd27, %rd26, %rd3;\n"  /* rd27 = engv + index * 4 */
+"	@!%p3 bra 	$Lt_0_16898;\n"
+"	.loc	14	226	0\n"
+"	st.global.f32 	[%rd27+0], %f38;\n"  /* LJ energy sum */
+"	.loc	14	227	0\n"
+"	cvt.u64.s32 	%rd28, %r6;\n"
+"	mul.lo.u64 	%rd29, %rd28, 4;\n"  /* stride between successive engv values = inum floats */
+"	add.u64 	%rd27, %rd29, %rd27;\n"
+"	.loc	14	228	0\n"
+"	st.global.f32 	[%rd27+0], %f37;\n"  /* Coulomb energy sum */
+"	.loc	14	229	0\n"
+"	add.u64 	%rd27, %rd29, %rd27;\n"
+"$Lt_0_16898:\n"
+"	@!%p4 bra 	$Lt_0_17410;\n"
+"	.loc	14	233	0\n"
+"	mov.f32 	%f162, %f10;\n"  /* six vflag sums stored in order f10, f12, f14, f16, f18, f19, each inum floats apart */
+"	st.global.f32 	[%rd27+0], %f162;\n"
+"	.loc	14	234	0\n"
+"	cvt.u64.s32 	%rd30, %r6;\n"
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd27, %rd31, %rd27;\n"
+"	.loc	14	233	0\n"
+"	mov.f32 	%f163, %f12;\n"
+"	st.global.f32 	[%rd27+0], %f163;\n"
+"	.loc	14	234	0\n"
+"	add.u64 	%rd27, %rd31, %rd27;\n"
+"	.loc	14	233	0\n"
+"	mov.f32 	%f164, %f14;\n"
+"	st.global.f32 	[%rd27+0], %f164;\n"
+"	.loc	14	234	0\n"
+"	add.u64 	%rd27, %rd31, %rd27;\n"
+"	.loc	14	233	0\n"
+"	mov.f32 	%f165, %f16;\n"
+"	st.global.f32 	[%rd27+0], %f165;\n"
+"	.loc	14	234	0\n"
+"	add.u64 	%rd27, %rd31, %rd27;\n"
+"	.loc	14	233	0\n"
+"	mov.f32 	%f166, %f18;\n"
+"	st.global.f32 	[%rd27+0], %f166;\n"
+"	add.u64 	%rd32, %rd31, %rd27;\n"
+"	st.global.f32 	[%rd32+0], %f19;\n"
+"$Lt_0_17410:\n"
+"	.loc	14	237	0\n"
+"	ld.param.u64 	%rd33, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd34, %rd2, 16;\n"
+"	add.u64 	%rd35, %rd33, %rd34;\n"  /* rd35 = ans + index * 16 */
+"	mov.f32 	%f167, %f168;\n"  /* NOTE(review): f168 is never written in this kernel, so the 4th stored component is undefined; presumably the consumer ignores it - confirm */
+"	st.global.v4.f32 	[%rd35+0], {%f36,%f35,%f34,%f167};\n"
+"$Lt_0_11778:\n"
+"	.loc	14	239	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ljd_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_q_,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_g_ewald,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_denom_lj,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_bothsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_ljsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_lj_innersq)\n"
+"	{\n"
+"	.reg .u32 %r<40>;\n"
+"	.reg .u64 %rd<45>;\n"
+"	.reg .f32 %f<177>;\n"
+"	.reg .pred %p<13>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj260[32];\n"
+"	.shared .align 8 .b8 __cuda_ljd296[1024];\n"
+"	.loc	14	250	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	cvt.u64.s32 	%rd1, %r1;\n"
+"	mov.u32 	%r2, 7;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_12546;\n"
+"	.loc	14	256	0\n"
+"	mov.u64 	%rd2, __cuda_sp_lj260;\n"
+"	mul.lo.u64 	%rd3, %rd1, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd2;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"
+"$Lt_1_12546:\n"
+"	mov.u64 	%rd7, __cuda_ljd296;\n"
+"	mov.u64 	%rd2, __cuda_sp_lj260;\n"
+"	.loc	14	257	0\n"
+"	mul.lo.u64 	%rd8, %rd1, 8;\n"
+"	ld.param.u64 	%rd9, [__cudaparm_kernel_pair_fast_ljd_in];\n"
+"	add.u64 	%rd10, %rd9, %rd8;\n"
+"	add.u64 	%rd11, %rd8, %rd7;\n"
+"	ld.global.v2.f32 	{%f2,%f3}, [%rd10+0];\n"
+"	st.shared.f32 	[%rd11+0], %f2;\n"
+"	st.shared.f32 	[%rd11+4], %f3;\n"
+"	ld.global.v2.f32 	{%f4,%f5}, [%rd10+512];\n"
+"	.loc	14	258	0\n"
+"	st.shared.f32 	[%rd11+512], %f4;\n"
+"	st.shared.f32 	[%rd11+516], %f5;\n"
+"	.loc	14	261	0\n"
+"	bar.sync 	0;\n"
+"	cvt.s32.u16 	%r3, %ctaid.x;\n"
+"	cvt.s32.u16 	%r4, %ntid.x;\n"
+"	mul24.lo.s32 	%r5, %r3, %r4;\n"
+"	add.s32 	%r6, %r5, %r1;\n"
+"	ld.param.s32 	%r7, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p2, %r6, %r7;\n"
+"	@%p2 bra 	$Lt_1_13058;\n"
+"	.loc	14	273	0\n"
+"	mov.f32 	%f6, 0f00000000;     	\n"
+"	mov.f32 	%f7, %f6;\n"
+"	mov.f32 	%f8, 0f00000000;     	\n"
+"	mov.f32 	%f9, %f8;\n"
+"	mov.f32 	%f10, 0f00000000;    	\n"
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	.loc	14	276	0\n"
+"	cvt.u64.s32 	%rd12, %r6;\n"
+"	mul.lo.u64 	%rd13, %rd12, 4;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd15, %rd14, %rd13;\n"
+"	ld.global.s32 	%r8, [%rd15+0];\n"
+"	.loc	14	278	0\n"
+"	ld.param.s32 	%r9, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd16, %r9;\n"
+"	mul.lo.u64 	%rd17, %rd16, 4;\n"
+"	add.u64 	%rd18, %rd15, %rd17;\n"
+"	ld.global.s32 	%r10, [%rd18+0];\n"
+"	.loc	14	279	0\n"
+"	add.u64 	%rd19, %rd18, %rd17;\n"
+"	mov.s64 	%rd20, %rd19;\n"
+"	mov.s32 	%r11, %r8;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	mov.s32 	%r14, 0;\n"
+"	tex.1d.v4.f32.s32 {%f18,%f19,%f20,%f21},[pos_tex,{%r11,%r12,%r13,%r14}];\n"
+"	.loc	14	282	0\n"
+"	mov.f32 	%f22, %f18;\n"
+"	mov.f32 	%f23, %f19;\n"
+"	mov.f32 	%f24, %f20;\n"
+"	mov.f32 	%f25, %f21;\n"
+"	mov.s32 	%r15, %r8;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	mov.s32 	%r18, 0;\n"
+"	tex.1d.v4.f32.s32 {%f26,%f27,%f28,%f29},[q_tex,{%r15,%r16,%r17,%r18}];\n"
+"	.loc	14	283	0\n"
+"	mov.f32 	%f30, %f26;\n"
+"	mul24.lo.s32 	%r19, %r10, %r9;\n"
+"	cvt.s64.s32 	%rd21, %r19;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"
+"	add.u64 	%rd23, %rd19, %rd22;\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r21, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p3, %rd19, %rd23;\n"
+"	mov.f32 	%f31, 0f00000000;    	\n"
+"	mov.f32 	%f32, 0f00000000;    	\n"
+"	mov.f32 	%f33, 0f00000000;    	\n"
+"	mov.f32 	%f34, 0f00000000;    	\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	@%p3 bra 	$Lt_1_19202;\n"
+"	cvt.rzi.s32.f32 	%r22, %f25;\n"
+"	mov.s32 	%r23, 0;\n"
+"	setp.gt.s32 	%p4, %r21, %r23;\n"
+"	mov.s32 	%r24, 0;\n"
+"	setp.gt.s32 	%p5, %r20, %r24;\n"
+"	ld.param.f32 	%f36, [__cudaparm_kernel_pair_fast_cut_bothsq];\n"
+"$Lt_1_14082:\n"
+"	.loc	14	287	0\n"
+"	ld.global.s32 	%r25, [%rd20+0];\n"
+"	.loc	14	290	0\n"
+"	shr.s32 	%r26, %r25, 30;\n"
+"	cvt.s64.s32 	%rd24, %r26;\n"
+"	and.b64 	%rd25, %rd24, 3;\n"
+"	mul.lo.u64 	%rd26, %rd25, 4;\n"
+"	add.u64 	%rd27, %rd2, %rd26;\n"
+"	ld.shared.f32 	%f37, [%rd27+0];\n"
+"	.loc	14	291	0\n"
+"	mov.f32 	%f38, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f39, [%rd27+16];\n"
+"	sub.f32 	%f40, %f38, %f39;\n"
+"	and.b32 	%r27, %r25, 1073741823;\n"
+"	mov.s32 	%r28, %r27;\n"
+"	mov.s32 	%r29, 0;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
+"	.loc	14	294	0\n"
+"	mov.f32 	%f45, %f41;\n"
+"	mov.f32 	%f46, %f42;\n"
+"	mov.f32 	%f47, %f43;\n"
+"	mov.f32 	%f48, %f44;\n"
+"	sub.f32 	%f49, %f23, %f46;\n"
+"	sub.f32 	%f50, %f22, %f45;\n"
+"	sub.f32 	%f51, %f24, %f47;\n"
+"	mul.f32 	%f52, %f49, %f49;\n"
+"	mad.f32 	%f53, %f50, %f50, %f52;\n"
+"	mad.f32 	%f54, %f51, %f51, %f53;\n"
+"	setp.lt.f32 	%p6, %f54, %f36;\n"
+"	@!%p6 bra 	$Lt_1_17410;\n"
+"	ld.param.f32 	%f55, [__cudaparm_kernel_pair_fast_cut_ljsq];\n"
+"	setp.lt.f32 	%p7, %f54, %f55;\n"
+"	rcp.approx.f32 	%f56, %f54;\n"
+"	@!%p7 bra 	$Lt_1_15106;\n"
+"	.loc	14	309	0\n"
+"	cvt.rzi.s32.f32 	%r32, %f48;\n"
+"	cvt.u64.s32 	%rd28, %r22;\n"
+"	mul.lo.u64 	%rd29, %rd28, 8;\n"
+"	add.u64 	%rd30, %rd7, %rd29;\n"
+"	cvt.u64.s32 	%rd31, %r32;\n"
+"	mul.lo.u64 	%rd32, %rd31, 8;\n"
+"	add.u64 	%rd33, %rd7, %rd32;\n"
+"	ld.shared.f32 	%f57, [%rd30+0];\n"
+"	ld.shared.f32 	%f58, [%rd33+0];\n"
+"	mul.f32 	%f59, %f57, %f58;\n"
+"	.loc	14	310	0\n"
+"	ld.shared.f32 	%f60, [%rd30+4];\n"
+"	ld.shared.f32 	%f61, [%rd33+4];\n"
+"	add.f32 	%f62, %f60, %f61;\n"
+"	mov.f32 	%f63, 0f3f000000;    	\n"
+"	mul.f32 	%f64, %f62, %f63;\n"
+"	.loc	14	314	0\n"
+"	mul.f32 	%f65, %f64, %f64;\n"
+"	sqrt.approx.f32 	%f66, %f59;\n"
+"	mov.f32 	%f67, 0f40800000;    	\n"
+"	mul.f32 	%f68, %f66, %f67;\n"
+"	mul.f32 	%f69, %f65, %f56;\n"
+"	mul.f32 	%f70, %f69, %f69;\n"
+"	mul.f32 	%f71, %f69, %f70;\n"
+"	mul.f32 	%f72, %f68, %f71;\n"
+"	mov.f32 	%f73, %f72;\n"
+"	.loc	14	315	0\n"
+"	mul.f32 	%f74, %f71, %f72;\n"
+"	mov.f32 	%f75, %f74;\n"
+"	.loc	14	316	0\n"
+"	mov.f32 	%f76, 0f40c00000;    	\n"
+"	mul.f32 	%f77, %f72, %f76;\n"
+"	mov.f32 	%f78, 0f41400000;    	\n"
+"	mul.f32 	%f79, %f78, %f74;\n"
+"	sub.f32 	%f80, %f79, %f77;\n"
+"	.loc	14	290	0\n"
+"	ld.shared.f32 	%f37, [%rd27+0];\n"
+"	.loc	14	316	0\n"
+"	mul.f32 	%f81, %f37, %f80;\n"
+"	ld.param.f32 	%f82, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n"
+"	setp.gt.f32 	%p8, %f54, %f82;\n"
+"	@!%p8 bra 	$Lt_1_14850;\n"
+"	.loc	14	322	0\n"
+"	add.f32 	%f83, %f54, %f54;\n"
+"	sub.f32 	%f84, %f55, %f54;\n"
+"	add.f32 	%f85, %f83, %f55;\n"
+"	mul.f32 	%f86, %f84, %f84;\n"
+"	mov.f32 	%f87, 0f40400000;    	\n"
+"	mul.f32 	%f88, %f87, %f82;\n"
+"	sub.f32 	%f89, %f85, %f88;\n"
+"	ld.param.f32 	%f90, [__cudaparm_kernel_pair_fast_denom_lj];\n"
+"	div.approx.f32 	%f91, %f89, %f90;\n"
+"	mul.f32 	%f92, %f86, %f91;\n"
+"	mov.f32 	%f93, %f92;\n"
+"	.loc	14	325	0\n"
+"	mov.f32 	%f94, 0f41400000;    	\n"
+"	mul.f32 	%f95, %f54, %f94;\n"
+"	mul.f32 	%f96, %f84, %f95;\n"
+"	sub.f32 	%f97, %f54, %f82;\n"
+"	mul.f32 	%f98, %f96, %f97;\n"
+"	div.approx.f32 	%f99, %f98, %f90;\n"
+"	sub.f32 	%f100, %f74, %f72;\n"
+"	mul.f32 	%f101, %f99, %f100;\n"
+"	mad.f32 	%f81, %f81, %f92, %f101;\n"
+"	bra.uni 	$Lt_1_14850;\n"
+"$Lt_1_15106:\n"
+"	.loc	14	328	0\n"
+"	mov.f32 	%f81, 0f00000000;    	\n"
+"$Lt_1_14850:\n"
+"	ld.param.f32 	%f102, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
+"	setp.gt.f32 	%p9, %f102, %f54;\n"
+"	@!%p9 bra 	$Lt_1_16130;\n"
+"	.loc	14	335	0\n"
+"	sqrt.approx.f32 	%f103, %f54;\n"
+"	ld.param.f32 	%f104, [__cudaparm_kernel_pair_fast_g_ewald];\n"
+"	mul.f32 	%f105, %f104, %f103;\n"
+"	mul.f32 	%f106, %f105, %f105;\n"
+"	mov.f32 	%f107, 0f3f800000;   	\n"
+"	mov.f32 	%f108, 0f3ea7ba05;   	\n"
+"	mad.f32 	%f109, %f108, %f105, %f107;\n"
+"	neg.f32 	%f110, %f106;\n"
+"	rcp.approx.f32 	%f111, %f109;\n"
+"	mov.f32 	%f112, 0f3fb8aa3b;   	\n"
+"	mul.f32 	%f113, %f110, %f112;\n"
+"	ex2.approx.f32 	%f114, %f113;\n"
+"	mov.f32 	%f115, 0f3e827906;   	\n"
+"	mov.f32 	%f116, 0fbe91a98e;   	\n"
+"	mov.f32 	%f117, 0f3fb5f0e3;   	\n"
+"	mov.f32 	%f118, 0fbfba00e3;   	\n"
+"	mov.f32 	%f119, 0f3f87dc22;   	\n"
+"	mad.f32 	%f120, %f119, %f111, %f118;\n"
+"	mad.f32 	%f121, %f111, %f120, %f117;\n"
+"	mad.f32 	%f122, %f111, %f121, %f116;\n"
+"	mad.f32 	%f123, %f111, %f122, %f115;\n"
+"	mul.f32 	%f124, %f111, %f123;\n"
+"	mul.f32 	%f125, %f114, %f124;\n"
+"	mov.f32 	%f126, %f125;\n"
+"	mov.s32 	%r33, %r27;\n"
+"	mov.s32 	%r34, 0;\n"
+"	mov.s32 	%r35, 0;\n"
+"	mov.s32 	%r36, 0;\n"
+"	tex.1d.v4.f32.s32 {%f127,%f128,%f129,%f130},[q_tex,{%r33,%r34,%r35,%r36}];\n"
+"	.loc	14	336	0\n"
+"	mov.f32 	%f131, %f127;\n"
+"	ld.param.f32 	%f132, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
+"	mul.f32 	%f133, %f132, %f30;\n"
+"	mul.f32 	%f134, %f133, %f131;\n"
+"	div.approx.f32 	%f135, %f134, %f103;\n"
+"	mov.f32 	%f136, %f135;\n"
+"	.loc	14	337	0\n"
+"	mov.f32 	%f137, 0f3f906ebb;   	\n"
+"	mul.f32 	%f138, %f105, %f137;\n"
+"	mad.f32 	%f139, %f114, %f138, %f125;\n"
+"	sub.f32 	%f140, %f139, %f40;\n"
+"	mul.f32 	%f141, %f135, %f140;\n"
+"	bra.uni 	$Lt_1_15874;\n"
+"$Lt_1_16130:\n"
+"	.loc	14	340	0\n"
+"	mov.f32 	%f136, 0f00000000;   	\n"
+"	mov.f32 	%f141, 0f00000000;   	\n"
+"$Lt_1_15874:\n"
+"	.loc	14	345	0\n"
+"	add.f32 	%f142, %f141, %f81;\n"
+"	mul.f32 	%f143, %f142, %f56;\n"
+"	mad.f32 	%f33, %f50, %f143, %f33;\n"
+"	.loc	14	346	0\n"
+"	mad.f32 	%f32, %f49, %f143, %f32;\n"
+"	.loc	14	347	0\n"
+"	mad.f32 	%f31, %f51, %f143, %f31;\n"
+"	@!%p4 bra 	$Lt_1_16898;\n"
+"	.loc	14	350	0\n"
+"	mov.f32 	%f144, %f126;\n"
+"	sub.f32 	%f145, %f144, %f40;\n"
+"	mad.f32 	%f34, %f136, %f145, %f34;\n"
+"	@!%p7 bra 	$Lt_1_16898;\n"
+"	.loc	14	355	0\n"
+"	mov.f32 	%f146, %f75;\n"
+"	mov.f32 	%f147, %f73;\n"
+"	sub.f32 	%f148, %f146, %f147;\n"
+"	mov.f32 	%f149, %f93;\n"
+"	mul.f32 	%f150, %f149, %f148;\n"
+"	ld.param.f32 	%f151, [__cudaparm_kernel_pair_fast_cut_lj_innersq];\n"
+"	setp.lt.f32 	%p10, %f151, %f54;\n"
+"	selp.f32 	%f152, %f150, %f148, %p10;\n"
+"	.loc	14	290	0\n"
+"	ld.shared.f32 	%f37, [%rd27+0];\n"
+"	.loc	14	355	0\n"
+"	mad.f32 	%f35, %f37, %f152, %f35;\n"
+"$Lt_1_16898:\n"
+"$Lt_1_16386:\n"
+"	@!%p5 bra 	$Lt_1_17410;\n"
+"	.loc	14	359	0\n"
+"	mov.f32 	%f153, %f7;\n"
+"	mul.f32 	%f154, %f50, %f50;\n"
+"	mad.f32 	%f155, %f143, %f154, %f153;\n"
+"	mov.f32 	%f7, %f155;\n"
+"	.loc	14	360	0\n"
+"	mov.f32 	%f156, %f9;\n"
+"	mad.f32 	%f157, %f143, %f52, %f156;\n"
+"	mov.f32 	%f9, %f157;\n"
+"	.loc	14	361	0\n"
+"	mov.f32 	%f158, %f11;\n"
+"	mul.f32 	%f159, %f51, %f51;\n"
+"	mad.f32 	%f160, %f143, %f159, %f158;\n"
+"	mov.f32 	%f11, %f160;\n"
+"	.loc	14	362	0\n"
+"	mov.f32 	%f161, %f13;\n"
+"	mul.f32 	%f162, %f49, %f50;\n"
+"	mad.f32 	%f163, %f143, %f162, %f161;\n"
+"	mov.f32 	%f13, %f163;\n"
+"	.loc	14	363	0\n"
+"	mov.f32 	%f164, %f15;\n"
+"	mul.f32 	%f165, %f50, %f51;\n"
+"	mad.f32 	%f166, %f143, %f165, %f164;\n"
+"	mov.f32 	%f15, %f166;\n"
+"	.loc	14	364	0\n"
+"	mul.f32 	%f167, %f49, %f51;\n"
+"	mad.f32 	%f16, %f143, %f167, %f16;\n"
+"	mov.f32 	%f168, %f16;\n"
+"$Lt_1_17410:\n"
+"$Lt_1_14338:\n"
+"	.loc	14	286	0\n"
+"	add.u64 	%rd20, %rd17, %rd20;\n"
+"	setp.gt.u64 	%p11, %rd23, %rd20;\n"
+"	@%p11 bra 	$Lt_1_14082;\n"
+"	bra.uni 	$Lt_1_13570;\n"
+"$Lt_1_19202:\n"
+"	mov.s32 	%r37, 0;\n"
+"	setp.gt.s32 	%p4, %r21, %r37;\n"
+"	mov.s32 	%r38, 0;\n"
+"	setp.gt.s32 	%p5, %r20, %r38;\n"
+"$Lt_1_13570:\n"
+"	.loc	14	371	0\n"
+"	ld.param.u64 	%rd34, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd35, %rd34, %rd13;\n"
+"	@!%p4 bra 	$Lt_1_18178;\n"
+"	.loc	14	373	0\n"
+"	st.global.f32 	[%rd35+0], %f35;\n"
+"	.loc	14	374	0\n"
+"	cvt.u64.s32 	%rd36, %r7;\n"
+"	mul.lo.u64 	%rd37, %rd36, 4;\n"
+"	add.u64 	%rd35, %rd37, %rd35;\n"
+"	.loc	14	375	0\n"
+"	st.global.f32 	[%rd35+0], %f34;\n"
+"	.loc	14	376	0\n"
+"	add.u64 	%rd35, %rd37, %rd35;\n"
+"$Lt_1_18178:\n"
+"	@!%p5 bra 	$Lt_1_18690;\n"
+"	.loc	14	380	0\n"
+"	mov.f32 	%f169, %f7;\n"
+"	st.global.f32 	[%rd35+0], %f169;\n"
+"	.loc	14	381	0\n"
+"	cvt.u64.s32 	%rd38, %r7;\n"
+"	mul.lo.u64 	%rd39, %rd38, 4;\n"
+"	add.u64 	%rd35, %rd39, %rd35;\n"
+"	.loc	14	380	0\n"
+"	mov.f32 	%f170, %f9;\n"
+"	st.global.f32 	[%rd35+0], %f170;\n"
+"	.loc	14	381	0\n"
+"	add.u64 	%rd35, %rd39, %rd35;\n"
+"	.loc	14	380	0\n"
+"	mov.f32 	%f171, %f11;\n"
+"	st.global.f32 	[%rd35+0], %f171;\n"
+"	.loc	14	381	0\n"
+"	add.u64 	%rd35, %rd39, %rd35;\n"
+"	.loc	14	380	0\n"
+"	mov.f32 	%f172, %f13;\n"
+"	st.global.f32 	[%rd35+0], %f172;\n"
+"	.loc	14	381	0\n"
+"	add.u64 	%rd35, %rd39, %rd35;\n"
+"	.loc	14	380	0\n"
+"	mov.f32 	%f173, %f15;\n"
+"	st.global.f32 	[%rd35+0], %f173;\n"
+"	add.u64 	%rd40, %rd39, %rd35;\n"
+"	st.global.f32 	[%rd40+0], %f16;\n"
+"$Lt_1_18690:\n"
+"	.loc	14	384	0\n"
+"	ld.param.u64 	%rd41, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd42, %rd12, 16;\n"
+"	add.u64 	%rd43, %rd41, %rd42;\n"
+"	mov.f32 	%f174, %f175;\n"
+"	st.global.v4.f32 	[%rd43+0], {%f33,%f32,%f31,%f174};\n"
+"$Lt_1_13058:\n"
+"	.loc	14	386	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/gb_gpu_ptx.h
+++ b/lib/gpu/gb_gpu_ptx.h
--- a/lib/gpu/lj96_cut_gpu_ptx.h
+++ b/lib/gpu/lj96_cut_gpu_ptx.h
@ -0,0 +1,567 @@
+const char * lj96_cut_gpu_kernel = /* PTX module text built from the adjacent string literals that follow; it defines the entries kernel_pair and kernel_pair_fast */
+"	.version 1.4\n" /* PTX ISA version directive */
+"	.target sm_13\n" /* target architecture directive */
+"	.tex .u64 pos_tex;\n" /* texture handle sampled with tex.1d by both entries */
+"	.entry kernel_pair (\n" /* kernel_pair: one thread per index below inum; walks that index's neighbor list and accumulates sums that are stored to engv and ans */
+"		.param .u64 __cudaparm_kernel_pair_x_,\n" /* never loaded in this body */
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n" /* global table of 16-byte records; fields at +0, +4 and +8 are read */
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n" /* global table of 16-byte records; read only when eflag > 0 */
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n" /* row stride (in records) used to index lj1/lj3 */
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n" /* four floats copied into shared memory */
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n" /* neighbor data, rows nbor_pitch words apart */
+"		.param .u64 __cudaparm_kernel_pair_ans,\n" /* output: one 16-byte record per thread index */
+"		.param .u64 __cudaparm_kernel_pair_engv,\n" /* output: optional sums, stored inum floats apart */
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n" /* > 0 enables the %f28 sum and its store */
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n" /* > 0 enables the six-term sums and their stores */
+"		.param .s32 __cudaparm_kernel_pair_inum,\n" /* thread-index bound; also the engv stride */
+"		.param .s32 __cudaparm_kernel_pair_nall,\n" /* never loaded in this body */
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch)\n" /* row pitch of dev_nbor in 4-byte words */
+"	{\n"
+"	.reg .u32 %r<33>;\n"
+"	.reg .u64 %rd<36>;\n"
+"	.reg .f32 %f<87>;\n"
+"	.reg .pred %p<8>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj84[16];\n" /* shared copy of the four sp_lj_in floats */
+"	.loc	14	87	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	91	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n" /* every thread copies all four floats global -> shared (no tid guard, no barrier in this entry) */
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+0], %f1;\n"
+"	.loc	14	92	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+4], %f2;\n"
+"	.loc	14	93	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+8], %f3;\n"
+"	.loc	14	94	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+12], %f4;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n" /* r5 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_7938;\n" /* nothing to do when inum <= r5 */
+"	.loc	14	105	0\n"
+"	mov.f32 	%f5, 0f00000000;     	\n" /* zero the six vflag accumulators: %f6, %f8, %f10, %f12, %f14 and %f15 */
+"	mov.f32 	%f6, %f5;\n"
+"	mov.f32 	%f7, 0f00000000;     	\n"
+"	mov.f32 	%f8, %f7;\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	.loc	14	108	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n" /* r7 = dev_nbor[r5]; used below as the pos_tex index of this thread's item */
+"	.loc	14	110	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n" /* rd7 = byte distance between consecutive dev_nbor rows */
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n" /* r9 = word one row further; used below as the number of list entries */
+"	.loc	14	111	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n" /* rd9 = address of the first list entry (two rows below r7) */
+"	mov.s64 	%rd10, %rd9;\n" /* rd10 = running list pointer */
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n" /* fetch the 4-float texel of item r7 */
+"	.loc	14	114	0\n"
+"	mov.f32 	%f21, %f17;\n"
+"	mov.f32 	%f22, %f18;\n"
+"	mov.f32 	%f23, %f19;\n"
+"	mov.f32 	%f24, %f20;\n"
+"	mul24.lo.s32 	%r14, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r14;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n" /* rd13 = end pointer = rd9 + r9 * nbor_pitch * 4 */
+"	ld.param.s32 	%r15, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r16, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f25, 0f00000000;    	\n" /* %f25..%f27: sums stored to ans; %f28: sum stored to engv when eflag > 0 */
+"	mov.f32 	%f26, 0f00000000;    	\n"
+"	mov.f32 	%f27, 0f00000000;    	\n"
+"	mov.f32 	%f28, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_12034;\n" /* empty list: skip the loop */
+"	mov.s32 	%r17, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r17;\n" /* p3 = eflag > 0 */
+"	mov.s32 	%r18, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r18;\n" /* p4 = vflag > 0 */
+"	cvt.rzi.s32.f32 	%r19, %f24;\n" /* 4th texel component truncated to an integer */
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r21, %r20, %r19;\n" /* r21 = lj_types * r19: row offset into lj1/lj3 */
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_lj1];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj84;\n"
+"$Lt_0_8962:\n" /* loop head: one iteration per list entry */
+"	.loc	14	120	0\n"
+"	ld.global.s32 	%r22, [%rd10+0];\n" /* r22 = packed list entry */
+"	.loc	14	121	0\n"
+"	shr.s32 	%r23, %r22, 30;\n"
+"	cvt.s64.s32 	%rd16, %r23;\n"
+"	and.b64 	%rd17, %rd16, 3;\n" /* the top two bits of r22 select one of the four shared floats */
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n" /* f29 = selected sp_lj factor */
+"	and.b32 	%r24, %r22, 1073741823;\n" /* r24 = low 30 bits of r22 = texel index of the other item */
+"	mov.s32 	%r25, 0;\n"
+"	mov.s32 	%r26, 0;\n"
+"	mov.s32 	%r27, 0;\n"
+"	tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
+"	.loc	14	124	0\n"
+"	mov.f32 	%f34, %f30;\n"
+"	mov.f32 	%f35, %f31;\n"
+"	mov.f32 	%f36, %f32;\n"
+"	mov.f32 	%f37, %f33;\n"
+"	cvt.rzi.s32.f32 	%r28, %f37;\n"
+"	sub.f32 	%f38, %f22, %f35;\n" /* f38 = difference of 2nd components */
+"	sub.f32 	%f39, %f21, %f34;\n" /* f39 = difference of 1st components */
+"	sub.f32 	%f40, %f23, %f36;\n" /* f40 = difference of 3rd components */
+"	mul.f32 	%f41, %f38, %f38;\n"
+"	mad.f32 	%f42, %f39, %f39, %f41;\n"
+"	mad.f32 	%f43, %f40, %f40, %f42;\n" /* f43 = sum of the three squared differences */
+"	add.s32 	%r29, %r28, %r21;\n" /* record index = r21 + trunc(4th component of the other texel) */
+"	cvt.u64.s32 	%rd20, %r29;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd21, %rd14;\n"
+"	ld.global.f32 	%f44, [%rd22+8];\n" /* f44 = lj1 record field at +8, compared against f43 */
+"	setp.gt.f32 	%p5, %f44, %f43;\n"
+"	@!%p5 bra 	$Lt_0_10242;\n" /* entry contributes only when f43 < f44 */
+"	.loc	14	139	0\n"
+"	rcp.approx.f32 	%f45, %f43;\n" /* f45 = 1 / f43 */
+"	mul.f32 	%f46, %f45, %f45;\n"
+"	mul.f32 	%f47, %f45, %f46;\n" /* f47 = f45^3 */
+"	sqrt.approx.f32 	%f48, %f47;\n" /* f48 = sqrt(f47) */
+"	mul.f32 	%f49, %f45, %f47;\n" /* f49 = f45^4 */
+"	ld.global.v2.f32 	{%f50,%f51}, [%rd22+0];\n"
+"	mul.f32 	%f52, %f50, %f48;\n"
+"	sub.f32 	%f53, %f52, %f51;\n"
+"	mul.f32 	%f54, %f49, %f53;\n" /* f54 = f49 * (lj1[+0] * f48 - lj1[+4]) */
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	139	0\n"
+"	mul.f32 	%f55, %f29, %f54;\n" /* f55 = sp_lj factor * f54; used by every sum below except %f28 */
+"	.loc	14	141	0\n"
+"	mad.f32 	%f27, %f39, %f55, %f27;\n"
+"	.loc	14	142	0\n"
+"	mad.f32 	%f26, %f38, %f55, %f26;\n"
+"	.loc	14	143	0\n"
+"	mad.f32 	%f25, %f40, %f55, %f25;\n"
+"	@!%p3 bra 	$Lt_0_9730;\n" /* skip the %f28 update unless eflag > 0 */
+"	.loc	14	147	0\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd24, %rd23, %rd21;\n" /* lj3 record at the same index as the lj1 record */
+"	ld.global.v4.f32 	{%f56,%f57,%f58,_}, [%rd24+0];\n"
+"	mul.f32 	%f59, %f56, %f48;\n"
+"	sub.f32 	%f60, %f59, %f57;\n"
+"	mul.f32 	%f61, %f47, %f60;\n"
+"	sub.f32 	%f62, %f61, %f58;\n" /* f62 = f47 * (lj3[+0] * f48 - lj3[+4]) - lj3[+8] */
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	147	0\n"
+"	mad.f32 	%f28, %f29, %f62, %f28;\n" /* f28 += sp_lj factor * f62 */
+"$Lt_0_9730:\n"
+"	@!%p4 bra 	$Lt_0_10242;\n" /* skip the six-term sums unless vflag > 0 */
+"	.loc	14	150	0\n"
+"	mov.f32 	%f63, %f6;\n"
+"	mul.f32 	%f64, %f39, %f39;\n"
+"	mad.f32 	%f65, %f55, %f64, %f63;\n"
+"	mov.f32 	%f6, %f65;\n" /* f6 += f55 * f39 * f39 */
+"	.loc	14	151	0\n"
+"	mov.f32 	%f66, %f8;\n"
+"	mad.f32 	%f67, %f55, %f41, %f66;\n"
+"	mov.f32 	%f8, %f67;\n" /* f8 += f55 * f38 * f38 */
+"	.loc	14	152	0\n"
+"	mov.f32 	%f68, %f10;\n"
+"	mul.f32 	%f69, %f40, %f40;\n"
+"	mad.f32 	%f70, %f55, %f69, %f68;\n"
+"	mov.f32 	%f10, %f70;\n" /* f10 += f55 * f40 * f40 */
+"	.loc	14	153	0\n"
+"	mov.f32 	%f71, %f12;\n"
+"	mul.f32 	%f72, %f38, %f39;\n"
+"	mad.f32 	%f73, %f55, %f72, %f71;\n"
+"	mov.f32 	%f12, %f73;\n" /* f12 += f55 * f38 * f39 */
+"	.loc	14	154	0\n"
+"	mov.f32 	%f74, %f14;\n"
+"	mul.f32 	%f75, %f39, %f40;\n"
+"	mad.f32 	%f76, %f55, %f75, %f74;\n"
+"	mov.f32 	%f14, %f76;\n" /* f14 += f55 * f39 * f40 */
+"	.loc	14	155	0\n"
+"	mul.f32 	%f77, %f38, %f40;\n"
+"	mad.f32 	%f15, %f55, %f77, %f15;\n" /* f15 += f55 * f38 * f40 */
+"	mov.f32 	%f78, %f15;\n"
+"$Lt_0_10242:\n"
+"$Lt_0_9218:\n"
+"	.loc	14	118	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n" /* advance the list pointer by one row */
+"	setp.gt.u64 	%p6, %rd13, %rd10;\n"
+"	@%p6 bra 	$Lt_0_8962;\n" /* loop while the pointer is below the end pointer */
+"	bra.uni 	$Lt_0_8450;\n"
+"$Lt_0_12034:\n" /* empty-list path: compute the predicates the stores below test */
+"	mov.s32 	%r30, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r30;\n"
+"	mov.s32 	%r31, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r31;\n"
+"$Lt_0_8450:\n"
+"	.loc	14	162	0\n"
+"	ld.param.u64 	%rd25, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd26, %rd25, %rd3;\n" /* rd26 = engv + r5 * 4 */
+"	@!%p3 bra 	$Lt_0_11010;\n"
+"	.loc	14	164	0\n"
+"	st.global.f32 	[%rd26+0], %f28;\n" /* eflag > 0: store %f28, then step by inum floats */
+"	.loc	14	165	0\n"
+"	cvt.u64.s32 	%rd27, %r6;\n"
+"	mul.lo.u64 	%rd28, %rd27, 4;\n"
+"	add.u64 	%rd26, %rd26, %rd28;\n"
+"$Lt_0_11010:\n"
+"	@!%p4 bra 	$Lt_0_11522;\n"
+"	.loc	14	169	0\n"
+"	mov.f32 	%f79, %f6;\n" /* vflag > 0: store the six sums, each inum floats after the previous one */
+"	st.global.f32 	[%rd26+0], %f79;\n"
+"	.loc	14	170	0\n"
+"	cvt.u64.s32 	%rd29, %r6;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	169	0\n"
+"	mov.f32 	%f80, %f8;\n"
+"	st.global.f32 	[%rd26+0], %f80;\n"
+"	.loc	14	170	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	169	0\n"
+"	mov.f32 	%f81, %f10;\n"
+"	st.global.f32 	[%rd26+0], %f81;\n"
+"	.loc	14	170	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	169	0\n"
+"	mov.f32 	%f82, %f12;\n"
+"	st.global.f32 	[%rd26+0], %f82;\n"
+"	.loc	14	170	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	169	0\n"
+"	mov.f32 	%f83, %f14;\n"
+"	st.global.f32 	[%rd26+0], %f83;\n"
+"	add.u64 	%rd31, %rd30, %rd26;\n"
+"	st.global.f32 	[%rd31+0], %f15;\n"
+"$Lt_0_11522:\n"
+"	.loc	14	173	0\n"
+"	ld.param.u64 	%rd32, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd33, %rd2, 16;\n"
+"	add.u64 	%rd34, %rd32, %rd33;\n" /* rd34 = ans + r5 * 16 */
+"	mov.f32 	%f84, %f85;\n" /* NOTE(review): %f85 is never written in this entry, so the 4th lane stored below is undefined */
+"	st.global.v4.f32 	[%rd34+0], {%f27,%f26,%f25,%f84};\n"
+"$Lt_0_7938:\n"
+"	.loc	14	175	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n" /* kernel_pair_fast: same per-thread neighbor loop as kernel_pair, but lj1/lj3 records are first staged in shared memory and indexed with a fixed row stride of 8 */
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n" /* never loaded in this body */
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n" /* global source of the 64 staged 16-byte lj1 records */
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n" /* global source of the 64 staged 16-byte lj3 records (staged only when eflag > 0) */
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n" /* four floats copied into shared memory */
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n" /* neighbor data, rows nbor_pitch words apart */
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n" /* output: one 16-byte record per thread index */
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n" /* output: optional sums, stored inum floats apart */
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n" /* > 0 enables lj3 staging, the %f33 sum and its store */
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n" /* > 0 enables the six-term sums and their stores */
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n" /* thread-index bound; also the engv stride */
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n" /* never loaded in this body */
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n" /* row pitch of dev_nbor in 4-byte words */
+"	{\n"
+"	.reg .u32 %r<36>;\n"
+"	.reg .u64 %rd<48>;\n"
+"	.reg .f32 %f<93>;\n"
+"	.reg .pred %p<11>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj180[16];\n" /* shared copy of the four sp_lj_in floats */
+"	.shared .align 16 .b8 __cuda_lj1208[1024];\n" /* shared lj1 table: 64 records of 16 bytes */
+"	.shared .align 16 .b8 __cuda_lj31232[1024];\n" /* shared lj3 table: 64 records of 16 bytes */
+"	.loc	14	182	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 3;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_10242;\n" /* only threads 0..3 stage the sp_lj floats */
+"	.loc	14	189	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n" /* shared sp_lj[tid] = sp_lj_in[tid] */
+"$Lt_1_10242:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_10754;\n" /* only threads 0..63 stage table records */
+"	.loc	14	191	0\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	cvt.u64.s32 	%rd8, %r1;\n"
+"	mul.lo.u64 	%rd9, %rd8, 16;\n"
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd11, %rd10, %rd9;\n"
+"	add.u64 	%rd12, %rd9, %rd7;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd11+0];\n" /* copy lj1 record tid from global to shared */
+"	st.shared.f32 	[%rd12+0], %f2;\n"
+"	st.shared.f32 	[%rd12+4], %f3;\n"
+"	st.shared.f32 	[%rd12+8], %f4;\n"
+"	st.shared.f32 	[%rd12+12], %f5;\n"
+"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	mov.u32 	%r5, 0;\n"
+"	setp.le.s32 	%p3, %r4, %r5;\n"
+"	@%p3 bra 	$Lt_1_11266;\n" /* lj3 is staged only when eflag > 0 */
+"	.loc	14	193	0\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd15, %rd14, %rd9;\n"
+"	add.u64 	%rd16, %rd9, %rd13;\n"
+"	ld.global.v4.f32 	{%f6,%f7,%f8,%f9}, [%rd15+0];\n" /* copy lj3 record tid from global to shared */
+"	st.shared.f32 	[%rd16+0], %f6;\n"
+"	st.shared.f32 	[%rd16+4], %f7;\n"
+"	st.shared.f32 	[%rd16+8], %f8;\n"
+"	st.shared.f32 	[%rd16+12], %f9;\n"
+"$Lt_1_11266:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"$Lt_1_10754:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	.loc	14	196	0\n"
+"	bar.sync 	0;\n" /* barrier between the staging stores above and the shared loads in the loop */
+"	cvt.s32.u16 	%r6, %ctaid.x;\n"
+"	cvt.s32.u16 	%r7, %ntid.x;\n"
+"	mul24.lo.s32 	%r8, %r6, %r7;\n"
+"	add.s32 	%r9, %r8, %r1;\n" /* r9 = ctaid.x * ntid.x + tid.x */
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p4, %r9, %r10;\n"
+"	@%p4 bra 	$Lt_1_11778;\n" /* nothing to do when r9 >= inum */
+"	.loc	14	207	0\n"
+"	mov.f32 	%f10, 0f00000000;    	\n" /* zero the six vflag accumulators: %f11, %f13, %f15, %f17, %f19 and %f20 */
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	mov.f32 	%f18, 0f00000000;    	\n"
+"	mov.f32 	%f19, %f18;\n"
+"	mov.f32 	%f20, 0f00000000;    	\n"
+"	mov.f32 	%f21, %f20;\n"
+"	.loc	14	210	0\n"
+"	cvt.u64.s32 	%rd17, %r9;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd20, %rd19, %rd18;\n"
+"	ld.global.s32 	%r11, [%rd20+0];\n" /* r11 = dev_nbor[r9]; used below as the pos_tex index of this thread's item */
+"	.loc	14	212	0\n"
+"	ld.param.s32 	%r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd21, %r12;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n" /* rd22 = byte distance between consecutive dev_nbor rows */
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r13, [%rd23+0];\n" /* r13 = word one row further; used below as the number of list entries */
+"	.loc	14	213	0\n"
+"	add.u64 	%rd24, %rd23, %rd22;\n" /* rd24 = address of the first list entry */
+"	mov.s64 	%rd25, %rd24;\n" /* rd25 = running list pointer */
+"	mov.s32 	%r14, %r11;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n" /* fetch the 4-float texel of item r11 */
+"	.loc	14	216	0\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mul24.lo.s32 	%r18, %r13, %r12;\n"
+"	cvt.s64.s32 	%rd26, %r18;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd24, %rd27;\n" /* rd28 = end pointer = rd24 + r13 * nbor_pitch * 4 */
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p5, %rd24, %rd28;\n"
+"	mov.f32 	%f30, 0f00000000;    	\n" /* %f30..%f32: sums stored to ans; %f33: sum stored to engv when eflag > 0 */
+"	mov.f32 	%f31, 0f00000000;    	\n"
+"	mov.f32 	%f32, 0f00000000;    	\n"
+"	mov.f32 	%f33, 0f00000000;    	\n"
+"	@%p5 bra 	$Lt_1_15874;\n" /* empty list: skip the loop */
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r21;\n" /* p6 = eflag > 0 */
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r22;\n" /* p7 = vflag > 0 */
+"	cvt.rzi.s32.f32 	%r23, %f29;\n" /* 4th texel component truncated to an integer */
+"	mov.s32 	%r24, 8;\n"
+"	mul24.lo.s32 	%r25, %r24, %r23;\n"
+"	cvt.rn.f32.s32 	%f34, %r25;\n" /* f34 = 8 * r23 as a float: row offset into the shared tables */
+"$Lt_1_12802:\n" /* loop head: one iteration per list entry */
+"	.loc	14	223	0\n"
+"	ld.global.s32 	%r26, [%rd25+0];\n" /* r26 = packed list entry */
+"	.loc	14	224	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"
+"	cvt.s64.s32 	%rd29, %r27;\n"
+"	and.b64 	%rd30, %rd29, 3;\n" /* the top two bits of r26 select one of the four shared floats */
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd32, %rd1, %rd31;\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n" /* f35 = selected sp_lj factor */
+"	and.b32 	%r28, %r26, 1073741823;\n" /* r28 = low 30 bits of r26 = texel index of the other item */
+"	mov.s32 	%r29, 0;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f40, %f36;\n"
+"	mov.f32 	%f41, %f37;\n"
+"	mov.f32 	%f42, %f38;\n"
+"	mov.f32 	%f43, %f39;\n"
+"	sub.f32 	%f44, %f27, %f41;\n" /* f44 = difference of 2nd components */
+"	sub.f32 	%f45, %f26, %f40;\n" /* f45 = difference of 1st components */
+"	sub.f32 	%f46, %f28, %f42;\n" /* f46 = difference of 3rd components */
+"	mul.f32 	%f47, %f44, %f44;\n"
+"	mad.f32 	%f48, %f45, %f45, %f47;\n"
+"	mad.f32 	%f49, %f46, %f46, %f48;\n" /* f49 = sum of the three squared differences */
+"	add.f32 	%f50, %f34, %f43;\n"
+"	cvt.rzi.s32.f32 	%r32, %f50;\n" /* record index = trunc(f34 + 4th component of the other texel) */
+"	cvt.u64.s32 	%rd33, %r32;\n"
+"	mul.lo.u64 	%rd34, %rd33, 16;\n"
+"	add.u64 	%rd35, %rd34, %rd7;\n"
+"	ld.shared.f32 	%f51, [%rd35+8];\n" /* f51 = shared lj1 record field at +8, compared against f49 */
+"	setp.gt.f32 	%p8, %f51, %f49;\n"
+"	@!%p8 bra 	$Lt_1_14082;\n" /* entry contributes only when f49 < f51 */
+"	.loc	14	240	0\n"
+"	rcp.approx.f32 	%f52, %f49;\n" /* f52 = 1 / f49 */
+"	mul.f32 	%f53, %f52, %f52;\n"
+"	mul.f32 	%f54, %f52, %f53;\n" /* f54 = f52^3 */
+"	sqrt.approx.f32 	%f55, %f54;\n" /* f55 = sqrt(f54) */
+"	mul.f32 	%f56, %f52, %f54;\n" /* f56 = f52^4 */
+"	ld.shared.f32 	%f57, [%rd35+4];\n"
+"	ld.shared.f32 	%f58, [%rd35+0];\n"
+"	mul.f32 	%f59, %f58, %f55;\n"
+"	sub.f32 	%f60, %f59, %f57;\n"
+"	mul.f32 	%f61, %f56, %f60;\n" /* f61 = f56 * (lj1[+0] * f55 - lj1[+4]) */
+"	mul.f32 	%f61, %f35, %f61;\n" /* FIX: scale by the sp_lj factor, matching kernel_pair; previously only the eflag sum was scaled. The generating source presumably needs the same change. */
+"	.loc	14	242	0\n"
+"	mad.f32 	%f32, %f45, %f61, %f32;\n"
+"	.loc	14	243	0\n"
+"	mad.f32 	%f31, %f44, %f61, %f31;\n"
+"	.loc	14	244	0\n"
+"	mad.f32 	%f30, %f46, %f61, %f30;\n"
+"	@!%p6 bra 	$Lt_1_13570;\n" /* skip the %f33 update unless eflag > 0 */
+"	.loc	14	247	0\n"
+"	add.u64 	%rd36, %rd34, %rd13;\n" /* shared lj3 record at the same index as the lj1 record */
+"	ld.shared.f32 	%f62, [%rd36+4];\n"
+"	ld.shared.f32 	%f63, [%rd36+0];\n"
+"	mul.f32 	%f64, %f63, %f55;\n"
+"	sub.f32 	%f65, %f64, %f62;\n"
+"	mul.f32 	%f66, %f54, %f65;\n"
+"	.loc	14	248	0\n"
+"	ld.shared.f32 	%f67, [%rd36+8];\n"
+"	sub.f32 	%f68, %f66, %f67;\n" /* f68 = f54 * (lj3[+0] * f55 - lj3[+4]) - lj3[+8] */
+"	.loc	14	224	0\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	.loc	14	248	0\n"
+"	mad.f32 	%f33, %f35, %f68, %f33;\n" /* f33 += sp_lj factor * f68 */
+"$Lt_1_13570:\n"
+"	@!%p7 bra 	$Lt_1_14082;\n" /* skip the six-term sums unless vflag > 0 */
+"	.loc	14	251	0\n"
+"	mov.f32 	%f69, %f11;\n"
+"	mul.f32 	%f70, %f45, %f45;\n"
+"	mad.f32 	%f71, %f61, %f70, %f69;\n"
+"	mov.f32 	%f11, %f71;\n" /* f11 += f61 * f45 * f45 */
+"	.loc	14	252	0\n"
+"	mov.f32 	%f72, %f13;\n"
+"	mad.f32 	%f73, %f61, %f47, %f72;\n"
+"	mov.f32 	%f13, %f73;\n" /* f13 += f61 * f44 * f44 */
+"	.loc	14	253	0\n"
+"	mov.f32 	%f74, %f15;\n"
+"	mul.f32 	%f75, %f46, %f46;\n"
+"	mad.f32 	%f76, %f61, %f75, %f74;\n"
+"	mov.f32 	%f15, %f76;\n" /* f15 += f61 * f46 * f46 */
+"	.loc	14	254	0\n"
+"	mov.f32 	%f77, %f17;\n"
+"	mul.f32 	%f78, %f44, %f45;\n"
+"	mad.f32 	%f79, %f61, %f78, %f77;\n"
+"	mov.f32 	%f17, %f79;\n" /* f17 += f61 * f44 * f45 */
+"	.loc	14	255	0\n"
+"	mov.f32 	%f80, %f19;\n"
+"	mul.f32 	%f81, %f45, %f46;\n"
+"	mad.f32 	%f82, %f61, %f81, %f80;\n"
+"	mov.f32 	%f19, %f82;\n" /* f19 += f61 * f45 * f46 */
+"	.loc	14	256	0\n"
+"	mul.f32 	%f83, %f44, %f46;\n"
+"	mad.f32 	%f20, %f61, %f83, %f20;\n" /* f20 += f61 * f44 * f46 */
+"	mov.f32 	%f84, %f20;\n"
+"$Lt_1_14082:\n"
+"$Lt_1_13058:\n"
+"	.loc	14	221	0\n"
+"	add.u64 	%rd25, %rd22, %rd25;\n" /* advance the list pointer by one row */
+"	setp.gt.u64 	%p9, %rd28, %rd25;\n"
+"	@%p9 bra 	$Lt_1_12802;\n" /* loop while the pointer is below the end pointer */
+"	bra.uni 	$Lt_1_12290;\n"
+"$Lt_1_15874:\n" /* empty-list path: compute the predicates the stores below test */
+"	mov.s32 	%r33, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r33;\n"
+"	mov.s32 	%r34, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r34;\n"
+"$Lt_1_12290:\n"
+"	.loc	14	263	0\n"
+"	ld.param.u64 	%rd37, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd38, %rd37, %rd18;\n" /* rd38 = engv + r9 * 4 */
+"	@!%p6 bra 	$Lt_1_14850;\n"
+"	.loc	14	265	0\n"
+"	st.global.f32 	[%rd38+0], %f33;\n" /* eflag > 0: store %f33, then step by inum floats */
+"	.loc	14	266	0\n"
+"	cvt.u64.s32 	%rd39, %r10;\n"
+"	mul.lo.u64 	%rd40, %rd39, 4;\n"
+"	add.u64 	%rd38, %rd38, %rd40;\n"
+"$Lt_1_14850:\n"
+"	@!%p7 bra 	$Lt_1_15362;\n"
+"	.loc	14	270	0\n"
+"	mov.f32 	%f85, %f11;\n" /* vflag > 0: store the six sums, each inum floats after the previous one */
+"	st.global.f32 	[%rd38+0], %f85;\n"
+"	.loc	14	271	0\n"
+"	cvt.u64.s32 	%rd41, %r10;\n"
+"	mul.lo.u64 	%rd42, %rd41, 4;\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	270	0\n"
+"	mov.f32 	%f86, %f13;\n"
+"	st.global.f32 	[%rd38+0], %f86;\n"
+"	.loc	14	271	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	270	0\n"
+"	mov.f32 	%f87, %f15;\n"
+"	st.global.f32 	[%rd38+0], %f87;\n"
+"	.loc	14	271	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	270	0\n"
+"	mov.f32 	%f88, %f17;\n"
+"	st.global.f32 	[%rd38+0], %f88;\n"
+"	.loc	14	271	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	270	0\n"
+"	mov.f32 	%f89, %f19;\n"
+"	st.global.f32 	[%rd38+0], %f89;\n"
+"	add.u64 	%rd43, %rd42, %rd38;\n"
+"	st.global.f32 	[%rd43+0], %f20;\n"
+"$Lt_1_15362:\n"
+"	.loc	14	274	0\n"
+"	ld.param.u64 	%rd44, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd45, %rd17, 16;\n"
+"	add.u64 	%rd46, %rd44, %rd45;\n" /* rd46 = ans + r9 * 16 */
+"	mov.f32 	%f90, %f91;\n" /* NOTE(review): %f91 is never written in this entry, so the 4th lane stored below is undefined */
+"	st.global.v4.f32 	[%rd46+0], {%f32,%f31,%f30,%f90};\n"
+"$Lt_1_11778:\n"
+"	.loc	14	276	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/lj_cut_gpu_ptx.h
+++ b/lib/gpu/lj_cut_gpu_ptx.h
@ -0,0 +1,569 @@
+const char * lj_cut_gpu_kernel = 
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 pos_tex;\n"
+"	.entry kernel_pair (\n"
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n"
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch)\n"
+"	{\n"
+"	.reg .u32 %r<33>;\n"
+"	.reg .u64 %rd<36>;\n"
+"	.reg .f32 %f<86>;\n"
+"	.reg .pred %p<8>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj84[16];\n"
+"	.loc	14	87	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	91	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+0], %f1;\n"
+"	.loc	14	92	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+4], %f2;\n"
+"	.loc	14	93	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+8], %f3;\n"
+"	.loc	14	94	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj84+12], %f4;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_7938;\n"
+"	.loc	14	105	0\n"
+"	mov.f32 	%f5, 0f00000000;     	\n"
+"	mov.f32 	%f6, %f5;\n"
+"	mov.f32 	%f7, 0f00000000;     	\n"
+"	mov.f32 	%f8, %f7;\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	.loc	14	108	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"
+"	.loc	14	110	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"
+"	.loc	14	111	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"
+"	mov.s64 	%rd10, %rd9;\n"
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f17,%f18,%f19,%f20},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
+"	.loc	14	114	0\n"
+"	mov.f32 	%f21, %f17;\n"
+"	mov.f32 	%f22, %f18;\n"
+"	mov.f32 	%f23, %f19;\n"
+"	mov.f32 	%f24, %f20;\n"
+"	mul24.lo.s32 	%r14, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r14;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"
+"	ld.param.s32 	%r15, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r16, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f25, 0f00000000;    	\n"
+"	mov.f32 	%f26, 0f00000000;    	\n"
+"	mov.f32 	%f27, 0f00000000;    	\n"
+"	mov.f32 	%f28, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_12034;\n"
+"	mov.s32 	%r17, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r17;\n"
+"	mov.s32 	%r18, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r18;\n"
+"	cvt.rzi.s32.f32 	%r19, %f24;\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r21, %r20, %r19;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_lj1];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj84;\n"
+"$Lt_0_8962:\n"
+"	.loc	14	120	0\n"
+"	ld.global.s32 	%r22, [%rd10+0];\n"
+"	.loc	14	121	0\n"
+"	shr.s32 	%r23, %r22, 30;\n"
+"	cvt.s64.s32 	%rd16, %r23;\n"
+"	and.b64 	%rd17, %rd16, 3;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	and.b32 	%r24, %r22, 1073741823;\n"
+"	mov.s32 	%r25, 0;\n"
+"	mov.s32 	%r26, 0;\n"
+"	mov.s32 	%r27, 0;\n"
+"	tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[pos_tex,{%r24,%r25,%r26,%r27}];\n"
+"	.loc	14	124	0\n"
+"	mov.f32 	%f34, %f30;\n"
+"	mov.f32 	%f35, %f31;\n"
+"	mov.f32 	%f36, %f32;\n"
+"	mov.f32 	%f37, %f33;\n"
+"	cvt.rzi.s32.f32 	%r28, %f37;\n"
+"	sub.f32 	%f38, %f22, %f35;\n"
+"	sub.f32 	%f39, %f21, %f34;\n"
+"	sub.f32 	%f40, %f23, %f36;\n"
+"	mul.f32 	%f41, %f38, %f38;\n"
+"	mad.f32 	%f42, %f39, %f39, %f41;\n"
+"	mad.f32 	%f43, %f40, %f40, %f42;\n"
+"	add.s32 	%r29, %r28, %r21;\n"
+"	cvt.u64.s32 	%rd20, %r29;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd21, %rd14;\n"
+"	ld.global.f32 	%f44, [%rd22+8];\n"
+"	setp.gt.f32 	%p5, %f44, %f43;\n"
+"	@!%p5 bra 	$Lt_0_10242;\n"
+"	.loc	14	138	0\n"
+"	rcp.approx.f32 	%f45, %f43;\n"
+"	mul.f32 	%f46, %f45, %f45;\n"
+"	mul.f32 	%f47, %f45, %f46;\n"
+"	mul.f32 	%f48, %f45, %f47;\n"
+"	ld.global.v2.f32 	{%f49,%f50}, [%rd22+0];\n"
+"	mul.f32 	%f51, %f49, %f47;\n"
+"	sub.f32 	%f52, %f51, %f50;\n"
+"	mul.f32 	%f53, %f48, %f52;\n"
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	138	0\n"
+"	mul.f32 	%f54, %f29, %f53;\n"
+"	.loc	14	140	0\n"
+"	mad.f32 	%f27, %f39, %f54, %f27;\n"
+"	.loc	14	141	0\n"
+"	mad.f32 	%f26, %f38, %f54, %f26;\n"
+"	.loc	14	142	0\n"
+"	mad.f32 	%f25, %f40, %f54, %f25;\n"
+"	@!%p3 bra 	$Lt_0_9730;\n"
+"	.loc	14	146	0\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd24, %rd23, %rd21;\n"
+"	ld.global.v4.f32 	{%f55,%f56,%f57,_}, [%rd24+0];\n"
+"	mul.f32 	%f58, %f55, %f47;\n"
+"	sub.f32 	%f59, %f58, %f56;\n"
+"	mul.f32 	%f60, %f47, %f59;\n"
+"	sub.f32 	%f61, %f60, %f57;\n"
+"	.loc	14	121	0\n"
+"	ld.shared.f32 	%f29, [%rd19+0];\n"
+"	.loc	14	146	0\n"
+"	mad.f32 	%f28, %f29, %f61, %f28;\n"
+"$Lt_0_9730:\n"
+"	@!%p4 bra 	$Lt_0_10242;\n"
+"	.loc	14	149	0\n"
+"	mov.f32 	%f62, %f6;\n"
+"	mul.f32 	%f63, %f39, %f39;\n"
+"	mad.f32 	%f64, %f54, %f63, %f62;\n"
+"	mov.f32 	%f6, %f64;\n"
+"	.loc	14	150	0\n"
+"	mov.f32 	%f65, %f8;\n"
+"	mad.f32 	%f66, %f54, %f41, %f65;\n"
+"	mov.f32 	%f8, %f66;\n"
+"	.loc	14	151	0\n"
+"	mov.f32 	%f67, %f10;\n"
+"	mul.f32 	%f68, %f40, %f40;\n"
+"	mad.f32 	%f69, %f54, %f68, %f67;\n"
+"	mov.f32 	%f10, %f69;\n"
+"	.loc	14	152	0\n"
+"	mov.f32 	%f70, %f12;\n"
+"	mul.f32 	%f71, %f38, %f39;\n"
+"	mad.f32 	%f72, %f54, %f71, %f70;\n"
+"	mov.f32 	%f12, %f72;\n"
+"	.loc	14	153	0\n"
+"	mov.f32 	%f73, %f14;\n"
+"	mul.f32 	%f74, %f39, %f40;\n"
+"	mad.f32 	%f75, %f54, %f74, %f73;\n"
+"	mov.f32 	%f14, %f75;\n"
+"	.loc	14	154	0\n"
+"	mul.f32 	%f76, %f38, %f40;\n"
+"	mad.f32 	%f15, %f54, %f76, %f15;\n"
+"	mov.f32 	%f77, %f15;\n"
+"$Lt_0_10242:\n"
+"$Lt_0_9218:\n"
+"	.loc	14	118	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"
+"	setp.gt.u64 	%p6, %rd13, %rd10;\n"
+"	@%p6 bra 	$Lt_0_8962;\n"
+"	bra.uni 	$Lt_0_8450;\n"
+"$Lt_0_12034:\n"
+"	mov.s32 	%r30, 0;\n"
+"	setp.gt.s32 	%p3, %r16, %r30;\n"
+"	mov.s32 	%r31, 0;\n"
+"	setp.gt.s32 	%p4, %r15, %r31;\n"
+"$Lt_0_8450:\n"
+"	.loc	14	161	0\n"
+"	ld.param.u64 	%rd25, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd26, %rd25, %rd3;\n"
+"	@!%p3 bra 	$Lt_0_11010;\n"
+"	.loc	14	163	0\n"
+"	st.global.f32 	[%rd26+0], %f28;\n"
+"	.loc	14	164	0\n"
+"	cvt.u64.s32 	%rd27, %r6;\n"
+"	mul.lo.u64 	%rd28, %rd27, 4;\n"
+"	add.u64 	%rd26, %rd26, %rd28;\n"
+"$Lt_0_11010:\n"
+"	@!%p4 bra 	$Lt_0_11522;\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f78, %f6;\n"
+"	st.global.f32 	[%rd26+0], %f78;\n"
+"	.loc	14	169	0\n"
+"	cvt.u64.s32 	%rd29, %r6;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f79, %f8;\n"
+"	st.global.f32 	[%rd26+0], %f79;\n"
+"	.loc	14	169	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f80, %f10;\n"
+"	st.global.f32 	[%rd26+0], %f80;\n"
+"	.loc	14	169	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f81, %f12;\n"
+"	st.global.f32 	[%rd26+0], %f81;\n"
+"	.loc	14	169	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f82, %f14;\n"
+"	st.global.f32 	[%rd26+0], %f82;\n"
+"	add.u64 	%rd31, %rd30, %rd26;\n"
+"	st.global.f32 	[%rd31+0], %f15;\n"
+"$Lt_0_11522:\n"
+"	.loc	14	172	0\n"
+"	ld.param.u64 	%rd32, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd33, %rd2, 16;\n"
+"	add.u64 	%rd34, %rd32, %rd33;\n"
+"	mov.f32 	%f83, %f84;\n"
+"	st.global.v4.f32 	[%rd34+0], {%f27,%f26,%f25,%f83};\n"
+"$Lt_0_7938:\n"
+"	.loc	14	174	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch)\n"
+"	{\n"
+"	.reg .u32 %r<36>;\n"
+"	.reg .u64 %rd<48>;\n"
+"	.reg .f32 %f<93>;\n"
+"	.reg .pred %p<11>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj180[16];\n"
+"	.shared .align 16 .b8 __cuda_lj1208[1024];\n"
+"	.shared .align 16 .b8 __cuda_lj31232[1024];\n"
+"	.loc	14	181	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 3;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_10242;\n"
+"	.loc	14	188	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"
+"$Lt_1_10242:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj180;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_10754;\n"
+"	.loc	14	190	0\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	cvt.u64.s32 	%rd8, %r1;\n"
+"	mul.lo.u64 	%rd9, %rd8, 16;\n"
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd11, %rd10, %rd9;\n"
+"	add.u64 	%rd12, %rd9, %rd7;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd11+0];\n"
+"	st.shared.f32 	[%rd12+0], %f2;\n"
+"	st.shared.f32 	[%rd12+4], %f3;\n"
+"	st.shared.f32 	[%rd12+8], %f4;\n"
+"	st.shared.f32 	[%rd12+12], %f5;\n"
+"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	mov.u32 	%r5, 0;\n"
+"	setp.le.s32 	%p3, %r4, %r5;\n"
+"	@%p3 bra 	$Lt_1_11266;\n"
+"	.loc	14	192	0\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd15, %rd14, %rd9;\n"
+"	add.u64 	%rd16, %rd9, %rd13;\n"
+"	ld.global.v4.f32 	{%f6,%f7,%f8,%f9}, [%rd15+0];\n"
+"	st.shared.f32 	[%rd16+0], %f6;\n"
+"	st.shared.f32 	[%rd16+4], %f7;\n"
+"	st.shared.f32 	[%rd16+8], %f8;\n"
+"	st.shared.f32 	[%rd16+12], %f9;\n"
+"$Lt_1_11266:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"$Lt_1_10754:\n"
+"	mov.u64 	%rd13, __cuda_lj31232;\n"
+"	mov.u64 	%rd7, __cuda_lj1208;\n"
+"	.loc	14	195	0\n"
+"	bar.sync 	0;\n"
+"	cvt.s32.u16 	%r6, %ctaid.x;\n"
+"	cvt.s32.u16 	%r7, %ntid.x;\n"
+"	mul24.lo.s32 	%r8, %r6, %r7;\n"
+"	add.s32 	%r9, %r8, %r1;\n"
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p4, %r9, %r10;\n"
+"	@%p4 bra 	$Lt_1_11778;\n"
+"	.loc	14	206	0\n"
+"	mov.f32 	%f10, 0f00000000;    	\n"
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	mov.f32 	%f18, 0f00000000;    	\n"
+"	mov.f32 	%f19, %f18;\n"
+"	mov.f32 	%f20, 0f00000000;    	\n"
+"	mov.f32 	%f21, %f20;\n"
+"	.loc	14	209	0\n"
+"	cvt.u64.s32 	%rd17, %r9;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd20, %rd19, %rd18;\n"
+"	ld.global.s32 	%r11, [%rd20+0];\n"
+"	.loc	14	211	0\n"
+"	ld.param.s32 	%r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd21, %r12;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r13, [%rd23+0];\n"
+"	.loc	14	212	0\n"
+"	add.u64 	%rd24, %rd23, %rd22;\n"
+"	mov.s64 	%rd25, %rd24;\n"
+"	mov.s32 	%r14, %r11;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	215	0\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mul24.lo.s32 	%r18, %r13, %r12;\n"
+"	cvt.s64.s32 	%rd26, %r18;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd24, %rd27;\n"
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p5, %rd24, %rd28;\n"
+"	mov.f32 	%f30, 0f00000000;    	\n"
+"	mov.f32 	%f31, 0f00000000;    	\n"
+"	mov.f32 	%f32, 0f00000000;    	\n"
+"	mov.f32 	%f33, 0f00000000;    	\n"
+"	@%p5 bra 	$Lt_1_15874;\n"
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r21;\n"
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r22;\n"
+"	cvt.rzi.s32.f32 	%r23, %f29;\n"
+"	mov.s32 	%r24, 8;\n"
+"	mul24.lo.s32 	%r25, %r24, %r23;\n"
+"	cvt.rn.f32.s32 	%f34, %r25;\n"
+"$Lt_1_12802:\n"
+"	.loc	14	222	0\n"
+"	ld.global.s32 	%r26, [%rd25+0];\n"
+"	.loc	14	223	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"
+"	cvt.s64.s32 	%rd29, %r27;\n"
+"	and.b64 	%rd30, %rd29, 3;\n"
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd32, %rd1, %rd31;\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	and.b32 	%r28, %r26, 1073741823;\n"
+"	mov.s32 	%r29, 0;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	tex.1d.v4.f32.s32 {%f36,%f37,%f38,%f39},[pos_tex,{%r28,%r29,%r30,%r31}];\n"
+"	.loc	14	226	0\n"
+"	mov.f32 	%f40, %f36;\n"
+"	mov.f32 	%f41, %f37;\n"
+"	mov.f32 	%f42, %f38;\n"
+"	mov.f32 	%f43, %f39;\n"
+"	sub.f32 	%f44, %f27, %f41;\n"
+"	sub.f32 	%f45, %f26, %f40;\n"
+"	sub.f32 	%f46, %f28, %f42;\n"
+"	mul.f32 	%f47, %f44, %f44;\n"
+"	mad.f32 	%f48, %f45, %f45, %f47;\n"
+"	mad.f32 	%f49, %f46, %f46, %f48;\n"
+"	add.f32 	%f50, %f34, %f43;\n"
+"	cvt.rzi.s32.f32 	%r32, %f50;\n"
+"	cvt.u64.s32 	%rd33, %r32;\n"
+"	mul.lo.u64 	%rd34, %rd33, 16;\n"
+"	add.u64 	%rd35, %rd34, %rd7;\n"
+"	ld.shared.f32 	%f51, [%rd35+8];\n"
+"	setp.gt.f32 	%p8, %f51, %f49;\n"
+"	@!%p8 bra 	$Lt_1_14082;\n"
+"	.loc	14	238	0\n"
+"	rcp.approx.f32 	%f52, %f49;\n"
+"	mul.f32 	%f53, %f52, %f52;\n"
+"	mul.f32 	%f54, %f52, %f53;\n"
+"	.loc	14	223	0\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	.loc	14	238	0\n"
+"	mul.f32 	%f55, %f52, %f35;\n"
+"	mul.f32 	%f56, %f54, %f55;\n"
+"	ld.shared.f32 	%f57, [%rd35+4];\n"
+"	ld.shared.f32 	%f58, [%rd35+0];\n"
+"	mul.f32 	%f59, %f58, %f54;\n"
+"	sub.f32 	%f60, %f59, %f57;\n"
+"	mul.f32 	%f61, %f56, %f60;\n"
+"	.loc	14	240	0\n"
+"	mad.f32 	%f32, %f45, %f61, %f32;\n"
+"	.loc	14	241	0\n"
+"	mad.f32 	%f31, %f44, %f61, %f31;\n"
+"	.loc	14	242	0\n"
+"	mad.f32 	%f30, %f46, %f61, %f30;\n"
+"	@!%p6 bra 	$Lt_1_13570;\n"
+"	.loc	14	245	0\n"
+"	add.u64 	%rd36, %rd34, %rd13;\n"
+"	ld.shared.f32 	%f62, [%rd36+4];\n"
+"	ld.shared.f32 	%f63, [%rd36+0];\n"
+"	mul.f32 	%f64, %f63, %f54;\n"
+"	sub.f32 	%f65, %f64, %f62;\n"
+"	mul.f32 	%f66, %f54, %f65;\n"
+"	.loc	14	246	0\n"
+"	ld.shared.f32 	%f67, [%rd36+8];\n"
+"	sub.f32 	%f68, %f66, %f67;\n"
+"	.loc	14	223	0\n"
+"	ld.shared.f32 	%f35, [%rd32+0];\n"
+"	.loc	14	246	0\n"
+"	mad.f32 	%f33, %f35, %f68, %f33;\n"
+"$Lt_1_13570:\n"
+"	@!%p7 bra 	$Lt_1_14082;\n"
+"	.loc	14	249	0\n"
+"	mov.f32 	%f69, %f11;\n"
+"	mul.f32 	%f70, %f45, %f45;\n"
+"	mad.f32 	%f71, %f61, %f70, %f69;\n"
+"	mov.f32 	%f11, %f71;\n"
+"	.loc	14	250	0\n"
+"	mov.f32 	%f72, %f13;\n"
+"	mad.f32 	%f73, %f61, %f47, %f72;\n"
+"	mov.f32 	%f13, %f73;\n"
+"	.loc	14	251	0\n"
+"	mov.f32 	%f74, %f15;\n"
+"	mul.f32 	%f75, %f46, %f46;\n"
+"	mad.f32 	%f76, %f61, %f75, %f74;\n"
+"	mov.f32 	%f15, %f76;\n"
+"	.loc	14	252	0\n"
+"	mov.f32 	%f77, %f17;\n"
+"	mul.f32 	%f78, %f44, %f45;\n"
+"	mad.f32 	%f79, %f61, %f78, %f77;\n"
+"	mov.f32 	%f17, %f79;\n"
+"	.loc	14	253	0\n"
+"	mov.f32 	%f80, %f19;\n"
+"	mul.f32 	%f81, %f45, %f46;\n"
+"	mad.f32 	%f82, %f61, %f81, %f80;\n"
+"	mov.f32 	%f19, %f82;\n"
+"	.loc	14	254	0\n"
+"	mul.f32 	%f83, %f44, %f46;\n"
+"	mad.f32 	%f20, %f61, %f83, %f20;\n"
+"	mov.f32 	%f84, %f20;\n"
+"$Lt_1_14082:\n"
+"$Lt_1_13058:\n"
+"	.loc	14	220	0\n"
+"	add.u64 	%rd25, %rd22, %rd25;\n"
+"	setp.gt.u64 	%p9, %rd28, %rd25;\n"
+"	@%p9 bra 	$Lt_1_12802;\n"
+"	bra.uni 	$Lt_1_12290;\n"
+"$Lt_1_15874:\n"
+"	mov.s32 	%r33, 0;\n"
+"	setp.gt.s32 	%p6, %r20, %r33;\n"
+"	mov.s32 	%r34, 0;\n"
+"	setp.gt.s32 	%p7, %r19, %r34;\n"
+"$Lt_1_12290:\n"
+"	.loc	14	261	0\n"
+"	ld.param.u64 	%rd37, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd38, %rd37, %rd18;\n"
+"	@!%p6 bra 	$Lt_1_14850;\n"
+"	.loc	14	263	0\n"
+"	st.global.f32 	[%rd38+0], %f33;\n"
+"	.loc	14	264	0\n"
+"	cvt.u64.s32 	%rd39, %r10;\n"
+"	mul.lo.u64 	%rd40, %rd39, 4;\n"
+"	add.u64 	%rd38, %rd38, %rd40;\n"
+"$Lt_1_14850:\n"
+"	@!%p7 bra 	$Lt_1_15362;\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f85, %f11;\n"
+"	st.global.f32 	[%rd38+0], %f85;\n"
+"	.loc	14	269	0\n"
+"	cvt.u64.s32 	%rd41, %r10;\n"
+"	mul.lo.u64 	%rd42, %rd41, 4;\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f86, %f13;\n"
+"	st.global.f32 	[%rd38+0], %f86;\n"
+"	.loc	14	269	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f87, %f15;\n"
+"	st.global.f32 	[%rd38+0], %f87;\n"
+"	.loc	14	269	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f88, %f17;\n"
+"	st.global.f32 	[%rd38+0], %f88;\n"
+"	.loc	14	269	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f89, %f19;\n"
+"	st.global.f32 	[%rd38+0], %f89;\n"
+"	add.u64 	%rd43, %rd42, %rd38;\n"
+"	st.global.f32 	[%rd43+0], %f20;\n"
+"$Lt_1_15362:\n"
+"	.loc	14	272	0\n"
+"	ld.param.u64 	%rd44, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd45, %rd17, 16;\n"
+"	add.u64 	%rd46, %rd44, %rd45;\n"
+"	mov.f32 	%f90, %f91;\n"
+"	st.global.v4.f32 	[%rd46+0], {%f32,%f31,%f30,%f90};\n"
+"$Lt_1_11778:\n"
+"	.loc	14	274	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/lj_expand_gpu.cpp
+++ b/lib/gpu/lj_expand_gpu.cpp
@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
+------------------------------------------------------------------------- */
+
+#include <iostream>
+#include <cassert>
+#include <math.h>
+
+#include "lj_expand_gpu_memory.h"
+
+using namespace std;
+
+static LJE_GPU_Memory<PRECISION,ACC_PRECISION> LJEMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+// ---------------------------------------------------------------------------
+// Initialize the lj/expand GPU pair style.
+//   - world rank 0 runs LJEMF.init() first, followed by a world barrier
+//   - then, for each slot i in [0, procs_per_gpu), the process whose
+//     gpu_rank equals i (and that is not world rank 0) runs LJEMF.init(),
+//     each step being followed by a world barrier
+//   - gpu_mode is overwritten with the device's gpu_mode()
+// Progress text is written to screen only when screen is non-NULL and
+// replica_me()==0.
+// Returns the init status of the calling process (0 if init was never run
+// or reported success); on 0, estimate_gpu_overhead() is also invoked.
+// Note: the max_nbors argument is not forwarded; a fixed 300 is passed in
+// that position of LJEMF.init().
+// ---------------------------------------------------------------------------
+int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double **shift, double *special_lj,
+                 const int inum, const int nall, const int max_nbors,
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
+  LJEMF.clear();
+  gpu_mode=LJEMF.device->gpu_mode();
+
+  const double split=LJEMF.device->particle_split();
+  const int gpu_first=LJEMF.device->first_device();
+  const int gpu_last=LJEMF.device->last_device();
+  const int me=LJEMF.device->world_me();
+  const int my_gpu_rank=LJEMF.device->gpu_rank();
+  const int nprocs_per_gpu=LJEMF.device->procs_per_gpu();
+
+  LJEMF.device->init_message(screen,"lj/expand",gpu_first,gpu_last);
+
+  // Only one process per replica reports progress, and only if it has a
+  // stream to write to.
+  const bool verbose=(LJEMF.device->replica_me()==0 && screen);
+
+  if (verbose) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int status=0;
+
+  // Stage 1: world rank 0 initializes alone.
+  if (me==0) {
+    status=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                      offset, shift, special_lj, inum, nall, 300, maxspecial,
+                      cell_size, split, screen);
+  }
+  LJEMF.device->world_barrier();
+  if (verbose) {
+    fprintf(screen,"Done.\n");
+  }
+
+  // Stage 2: remaining processes initialize one gpu_rank slot at a time.
+  for (int slot=0; slot<nprocs_per_gpu; ++slot) {
+    if (verbose) {
+      if (gpu_first==gpu_last) {
+        fprintf(screen,"Initializing GPU %d on core %d...",gpu_first,slot);
+      } else {
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",gpu_first,
+                gpu_last,slot);
+      }
+      fflush(screen);
+    }
+
+    if (me!=0 && my_gpu_rank==slot) {
+      status=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, shift, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, split, screen);
+    }
+
+    LJEMF.device->world_barrier();
+    if (verbose) {
+      fprintf(screen,"Done.\n");
+    }
+  }
+
+  if (verbose) {
+    fprintf(screen,"\n");
+  }
+
+  if (status!=0) {
+    return status;
+  }
+  LJEMF.estimate_gpu_overhead();
+  return status;
+}
+
+// Forward to LJEMF.clear() on the file-local lj/expand GPU object.
+void lje_gpu_clear()
+{
+  LJEMF.clear();
+}
+
+// Compute entry point that takes the sub-domain bounds, tags and special
+// lists; every argument is forwarded unchanged to LJEMF.compute() and the
+// pointer it returns is handed back to the caller.
+int** lje_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
+  int **result=LJEMF.compute(ago, inum_full, nall, host_x, host_type,
+                             sublo, subhi, tag, nspecial, special,
+                             eflag, vflag, eatom, vatom,
+                             host_start, ilist, jnum, cpu_time, success);
+  return result;
+}
+			
+// Compute entry point that takes a caller-supplied neighbor list
+// (ilist, numj, firstneigh); every argument is forwarded unchanged to
+// LJEMF.compute().
+void lje_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  LJEMF.compute(ago, inum_full, nall,
+                host_x, host_type,
+                ilist, numj, firstneigh,
+                eflag, vflag, eatom, vatom,
+                host_start, cpu_time, success);
+}
+
+// Report LJEMF.host_memory_usage() as a double.
+double lje_gpu_bytes()
+{
+  const double usage=LJEMF.host_memory_usage();
+  return usage;
+}
+
+
--- a/lib/gpu/lj_expand_gpu_kernel.cu
+++ b/lib/gpu/lj_expand_gpu_kernel.cu
@ -0,0 +1,393 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
+------------------------------------------------------------------------- */
+
+#ifndef LJE_GPU_KERNEL
+#define LJE_GPU_KERNEL
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+texture<float4> pos_tex;
+
+// fetch_pos(i,pos): return the position record of atom i.
+#ifdef _DOUBLE_DOUBLE
+// Double precision build: read element i directly from the array argument.
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  const double4 p=pos[i];
+  return p;
+}
+#else
+// Otherwise: fetch element i through the pos_tex texture reference; the
+// pos argument is not used.
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  const float4 p=tex1Dfetch(pos_tex, i);
+  return p;
+}
+#endif
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+#define fetch_pos(i,y) x_[i]
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+
+#endif
+
+// A neighbor entry keeps a 2-bit selector above bit SBBITS; NEIGHMASK keeps
+// the low 30 bits that hold the neighbor index.
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+
+// Return the 2-bit selector (0..3) stored in bits 30-31 of j.
+__inline int sbmask(int j)
+{
+  const int shifted=j >> SBBITS;
+  return shifted & 3;
+}
+
+// ---------------------------------------------------------------------------
+// kernel_pair: lj/expand pair interaction with the coefficient tables read
+// from global memory.
+//
+//   x_          per-atom records; x,y,z are coordinates and w is converted
+//               to int and used as the type index
+//   lj1         table indexed by itype*lj_types+jtype: .x/.y are the force
+//               coefficients, .z is the threshold compared against the
+//               squared distance, .w is subtracted from the distance
+//   lj3         same indexing: .x/.y energy coefficients, .z subtracted from
+//               the pair energy
+//   lj_types    row stride of lj1/lj3
+//   sp_lj_in    4 scale factors, selected by sbmask() of the neighbor entry
+//   dev_nbor    per atom: index, neighbor count, then the neighbor column
+//               (when dev_nbor==dev_packed) or an offset into dev_packed
+//   ans         output: accumulated force per atom (x,y,z; w is zero)
+//   engv        output: energy (only if eflag>0) followed by the 6 virial
+//               terms (only if vflag>0), each at stride inum
+//   nall        not used by this kernel
+//   t_per_atom  number of threads that share one atom's neighbor list
+// ---------------------------------------------------------------------------
+__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
+                          __global numtyp4* lj3, const int lj_types,
+                          __global numtyp *sp_lj_in, __global int *dev_nbor,
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp sp_lj[4];
+  // Reduction scratch. Declared at kernel scope because OpenCL C only
+  // permits __local variables at kernel function scope; it is only touched
+  // when t_per_atom>1.
+  __local acctyp red_acc[6][BLOCK_PAIR];
+
+  // Every thread writes the same four values, so no barrier is placed
+  // between these stores and the reads in the neighbor loop.
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  // The whole 4-vector is stored to ans[] below; give w a defined value
+  // instead of writing an uninitialized component to global memory.
+  f.w=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      // Neighbors are stored in this atom's column, nbor_pitch apart.
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      // *nbor is an offset into dev_packed where numj entries are contiguous.
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      // r2inv holds the squared distance here; it is overwritten with the
+      // inverse squared shifted distance inside the branch below.
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (r2inv<lj1[mtype].z) {
+        numtyp r = sqrt(r2inv);
+        numtyp rshift = r - lj1[mtype].w;
+        numtyp rshiftsq = rshift*rshift;
+        r2inv = (numtyp) 1.0/rshiftsq;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+        force*=factor_lj/rshift/r;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+          energy+=factor_lj*(e-lj3[mtype].z);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+
+  // Reduce answers
+  // NOTE(review): there is no barrier between the reduction steps below;
+  // this presumably relies on the t_per_atom threads of one atom executing
+  // in lockstep (same warp/wavefront) -- confirm for every target device.
+  if (t_per_atom>1) {
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers (only the first thread of each atom's group writes)
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+// ---------------------------------------------------------------------------
+// kernel_pair_fast: lj/expand pair interaction with the coefficient tables
+// staged in __local memory.
+//
+// The first MAX_SHARED_TYPES*MAX_SHARED_TYPES threads of each block copy one
+// lj1 entry (and one lj3 entry when eflag>0) from the *_in arrays; the
+// tables are then indexed as MAX_SHARED_TYPES*itype+jtype.
+//
+//   x_          per-atom records; x,y,z are coordinates and w supplies the
+//               type index
+//   lj1_in      .x/.y force coefficients, .z threshold compared against the
+//               squared distance, .w subtracted from the distance
+//   lj3_in      .x/.y energy coefficients, .z subtracted from the pair energy
+//   sp_lj_in    4 scale factors, selected by sbmask() of the neighbor entry
+//   dev_nbor    per atom: index, neighbor count, then the neighbor column
+//               (when dev_nbor==dev_packed) or an offset into dev_packed
+//   ans         output: accumulated force per atom (x,y,z; w is zero)
+//   engv        output: energy (only if eflag>0) followed by the 6 virial
+//               terms (only if vflag>0), each at stride inum
+//   nall        not used by this kernel
+//   t_per_atom  number of threads that share one atom's neighbor list
+// ---------------------------------------------------------------------------
+__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
+                               __global numtyp4* lj3_in,
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
+                               __global acctyp4 *ans, __global acctyp *engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[4];
+  // Reduction scratch. Declared at kernel scope because OpenCL C only
+  // permits __local variables at kernel function scope; it is only touched
+  // when t_per_atom>1.
+  __local acctyp red_acc[6][BLOCK_PAIR];
+
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    if (eflag>0)
+      lj3[tid]=lj3_in[tid];
+  }
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  // The whole 4-vector is stored to ans[] below; give w a defined value
+  // instead of writing an uninitialized component to global memory.
+  f.w=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  // Make the staged tables visible to every thread of the block.
+  __syncthreads();
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      // Neighbors are stored in this atom's column, nbor_pitch apart.
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      // *nbor is an offset into dev_packed where numj entries are contiguous.
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    int iw=ix.w;
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      // r2inv holds the squared distance here; it is overwritten with the
+      // inverse squared shifted distance inside the branch below.
+      numtyp r2inv = delx*delx+dely*dely+delz*delz;
+
+      if (r2inv<lj1[mtype].z) {
+        numtyp r = sqrt(r2inv);
+        numtyp rshift = r - lj1[mtype].w;
+        numtyp rshiftsq = rshift*rshift;
+        // Cast the literal so single precision builds do not promote the
+        // division to double (matches kernel_pair).
+        r2inv = (numtyp) 1.0/rshiftsq;
+        numtyp r6inv = r2inv*r2inv*r2inv;
+        numtyp force = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+        force*=factor_lj/rshift/r;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+          energy+=factor_lj*(e-lj3[mtype].z);
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+
+  // Reduce answers
+  // NOTE(review): there is no barrier between the reduction steps below;
+  // this presumably relies on the t_per_atom threads of one atom executing
+  // in lockstep (same warp/wavefront) -- confirm for every target device.
+  if (t_per_atom>1) {
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers (only the first thread of each atom's group writes)
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+#endif
+
--- a/lib/gpu/lj_expand_gpu_memory.cpp
+++ b/lib/gpu/lj_expand_gpu_memory.cpp
@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
+------------------------------------------------------------------------- */
+
+#ifdef USE_OPENCL
+#include "lj_expand_gpu_cl.h"
+#else
+#include "lj_expand_gpu_ptx.h"
+#endif
+
+#include "lj_expand_gpu_memory.h"
+#include <cassert>
+#define LJE_GPU_MemoryT LJE_GPU_Memory<numtyp, acctyp> // shorthand used to qualify the member definitions below
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device; // declaration only; defined in another translation unit
+
+// Default constructor: builds the AtomicGPUMemory base and records that no
+// device tables have been allocated yet.
+template <class numtyp, class acctyp>
+LJE_GPU_MemoryT::LJE_GPU_Memory()
+  : AtomicGPUMemory<numtyp,acctyp>(),
+    _allocated(false)
+{
+}
+
+// Destructor: hands off to clear(), which does nothing if nothing is allocated.
+template <class numtyp, class acctyp>
+LJE_GPU_MemoryT::~LJE_GPU_Memory()
+{
+  this->clear();
+}
+ 
+// Per-atom memory estimate; delegates entirely to the base-class figure.
+template <class numtyp, class acctyp>
+int LJE_GPU_MemoryT::bytes_per_atom(const int max_nbors) const
+{
+  const int per_atom=this->bytes_per_atom_atomic(max_nbors);
+  return per_atom;
+}
+
+// ---------------------------------------------------------------------------
+// Initialize the base class, then build the device coefficient tables:
+//
+//   lj1   <- (host_lj1, host_lj2, host_cutsq, host_shift)
+//   lj3   <- (host_lj3, host_lj4, host_offset)
+//   sp_lj <- the 4 entries of host_special_lj
+//
+// Returns the non-zero code from init_atomic() if that fails, 0 otherwise.
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int LJE_GPU_MemoryT::init(const int ntypes, double **host_cutsq,
+                          double **host_lj1, double **host_lj2,
+                          double **host_lj3, double **host_lj4,
+                          double **host_offset, double **host_shift,
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen) {
+  int success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                                gpu_split,_screen,lj_expand_gpu_kernel);
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel; the tables
+  // are then sized max_shared_types x max_shared_types instead of
+  // ntypes x ntypes
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    shared_types=true;
+  }
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  const int write_size=lj_types*lj_types*32;
+  UCL_H_Vec<numtyp> host_write(write_size,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  // Zero the whole staging buffer, not just its first lj_types*lj_types
+  // elements: each device table holds lj_types*lj_types numtyp4 entries, so
+  // more of the buffer than that is staged to the device, and any slot the
+  // packing routine does not write would otherwise be uploaded uninitialized.
+  for (int i=0; i<write_size; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_cutsq,host_shift);
+
+  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+
+  // Wrap the caller's special-LJ array in a host view (no copy on the host
+  // side) and transfer its 4 values to the device
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+// Clear the device coefficient tables and the base-class data. Does nothing
+// unless init() has completed since the last clear().
+template <class numtyp, class acctyp>
+void LJE_GPU_MemoryT::clear()
+{
+  if (_allocated) {
+    _allocated=false;
+
+    lj1.clear();
+    lj3.clear();
+    sp_lj.clear();
+    this->clear_atomic();
+  }
+}
+
+// Host memory attributed to this pair style: the size of this object plus
+// whatever the base class reports.
+template <class numtyp, class acctyp>
+double LJE_GPU_MemoryT::host_memory_usage() const
+{
+  return sizeof(LJE_GPU_MemoryT)+this->host_memory_usage_atomic();
+}
+
+// ---------------------------------------------------------------------------
+// Launch the pair kernel on the device
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void LJE_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
+  // Block size; the grid is sized below so that every atom is covered
+  const int BX=this->block_size();
+
+  // The kernels receive the energy/virial switches as 0/1 integers
+  int eflag=(_eflag ? 1 : 0);
+  int vflag=(_vflag ? 1 : 0);
+
+  // Each block covers BX/_threads_per_atom atoms; round the block count up
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  // Scalar kernel arguments are handed over by address, so they have to
+  // live in local variables
+  int ainum=this->ans->inum();
+  int anall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  this->time_pair.start();
+  if (!shared_types) {
+    // General kernel: additionally receives the table dimension _lj_types
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(),
+                     &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
+  } else {
+    // Fast kernel, selected by init() when the type constants fit in
+    // shared memory
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(),
+                          &lj3.begin(), &sp_lj.begin(),
+                          &this->nbor->dev_nbor.begin(),
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch, 
+                          &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class LJE_GPU_Memory<PRECISION,ACC_PRECISION>; // explicit instantiation for the PRECISION/ACC_PRECISION types
--- a/lib/gpu/lj_expand_gpu_memory.h
+++ b/lib/gpu/lj_expand_gpu_memory.h
@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com
+------------------------------------------------------------------------- */
+
+#ifndef LJE_GPU_MEMORY_H
+#define LJE_GPU_MEMORY_H
+
+#include "atomic_gpu_memory.h"
+
+template <class numtyp, class acctyp>
+class LJE_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
+ public:
+  LJE_GPU_Memory();   // starts out with no device tables allocated
+  ~LJE_GPU_Memory();  // calls clear()
+
+  /// Initialize the base class and copy the per-type coefficient tables to the device
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param host_special_lj array of 4 special LJ values, copied to sp_lj
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double **host_shift, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
+
+  /// Clear the lj1, lj3 and sp_lj tables and the base-class data (no-op unless init() completed)
+  /** \note Said to be called at the beginning of init(); init() itself has no such call - TODO confirm init_atomic() does it **/
+  void clear();
+
+  /// Returns memory usage on device per atom (forwards to bytes_per_atom_atomic())
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style (base-class usage plus the size of this object)
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = shift
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
+  UCL_D_Vec<numtyp4> lj3;
+  /// Special LJ values (4 entries)
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// True when init() found that the type constants fit in shared memory; loop() then launches the fast kernel
+  bool shared_types;
+
+  /// Dimension of the lj1/lj3 tables (lj_types*lj_types entries each): ntypes, or max_shared_types when shared_types is set
+  int _lj_types;
+
+ private:
+  bool _allocated;  // set at the end of a successful init(), reset by clear()
+  void loop(const bool _eflag, const bool _vflag);  // launches k_pair_fast or k_pair
+};
+
+#endif
+
--- a/lib/gpu/ljc_cut_gpu_ptx.h
+++ b/lib/gpu/ljc_cut_gpu_ptx.h
@ -0,0 +1,711 @@
+const char * ljc_cut_gpu_kernel = 
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 pos_tex;\n"
+"	.tex .u64 q_tex;\n"
+"	.entry kernel_pair (\n"
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n"
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_q_,\n"
+"		.param .u64 __cudaparm_kernel_pair_cutsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_qqrd2e)\n"
+"	{\n"
+"	.reg .u32 %r<42>;\n"
+"	.reg .u64 %rd<39>;\n"
+"	.reg .f32 %f<113>;\n"
+"	.reg .pred %p<10>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj108[32];\n"
+"	.loc	14	99	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	103	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+0], %f1;\n"
+"	.loc	14	104	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+4], %f2;\n"
+"	.loc	14	105	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+8], %f3;\n"
+"	.loc	14	106	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+12], %f4;\n"
+"	.loc	14	107	0\n"
+"	ld.global.f32 	%f5, [%rd1+16];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+16], %f5;\n"
+"	.loc	14	108	0\n"
+"	ld.global.f32 	%f6, [%rd1+20];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+20], %f6;\n"
+"	.loc	14	109	0\n"
+"	ld.global.f32 	%f7, [%rd1+24];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+24], %f7;\n"
+"	.loc	14	110	0\n"
+"	ld.global.f32 	%f8, [%rd1+28];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+28], %f8;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_10242;\n"
+"	.loc	14	121	0\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	mov.f32 	%f17, 0f00000000;    	\n"
+"	mov.f32 	%f18, %f17;\n"
+"	mov.f32 	%f19, 0f00000000;    	\n"
+"	mov.f32 	%f20, %f19;\n"
+"	.loc	14	124	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"
+"	.loc	14	126	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"
+"	.loc	14	127	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"
+"	mov.s64 	%rd10, %rd9;\n"
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
+"	.loc	14	130	0\n"
+"	mov.f32 	%f25, %f21;\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.s32 	%r14, %r7;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	131	0\n"
+"	mov.f32 	%f33, %f29;\n"
+"	mul24.lo.s32 	%r18, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r18;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f34, 0f00000000;    	\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_15874;\n"
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r21;\n"
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r22;\n"
+"	cvt.rzi.s32.f32 	%r23, %f28;\n"
+"	ld.param.s32 	%r24, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r25, %r24, %r23;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_cutsq];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj108;\n"
+"$Lt_0_11266:\n"
+"	.loc	14	135	0\n"
+"	ld.global.s32 	%r26, [%rd10+0];\n"
+"	.loc	14	138	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"
+"	cvt.s64.s32 	%rd16, %r27;\n"
+"	and.b64 	%rd17, %rd16, 3;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	139	0\n"
+"	ld.shared.f32 	%f40, [%rd19+16];\n"
+"	and.b32 	%r28, %r26, 1073741823;\n"
+"	mov.s32 	%r29, %r28;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	mov.s32 	%r32, 0;\n"
+"	tex.1d.v4.f32.s32 {%f41,%f42,%f43,%f44},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
+"	.loc	14	142	0\n"
+"	mov.f32 	%f45, %f41;\n"
+"	mov.f32 	%f46, %f42;\n"
+"	mov.f32 	%f47, %f43;\n"
+"	mov.f32 	%f48, %f44;\n"
+"	cvt.rzi.s32.f32 	%r33, %f48;\n"
+"	sub.f32 	%f49, %f26, %f46;\n"
+"	sub.f32 	%f50, %f25, %f45;\n"
+"	sub.f32 	%f51, %f27, %f47;\n"
+"	mul.f32 	%f52, %f49, %f49;\n"
+"	mad.f32 	%f53, %f50, %f50, %f52;\n"
+"	add.s32 	%r34, %r33, %r25;\n"
+"	cvt.u64.s32 	%rd20, %r34;\n"
+"	mad.f32 	%f54, %f51, %f51, %f53;\n"
+"	mul.lo.u64 	%rd21, %rd20, 4;\n"
+"	add.u64 	%rd22, %rd14, %rd21;\n"
+"	ld.global.f32 	%f55, [%rd22+0];\n"
+"	setp.gt.f32 	%p5, %f55, %f54;\n"
+"	@!%p5 bra 	$Lt_0_14082;\n"
+"	mul.lo.u64 	%rd23, %rd20, 16;\n"
+"	rcp.approx.f32 	%f56, %f54;\n"
+"	ld.param.u64 	%rd24, [__cudaparm_kernel_pair_lj1];\n"
+"	add.u64 	%rd25, %rd24, %rd23;\n"
+"	ld.global.f32 	%f57, [%rd25+8];\n"
+"	setp.lt.f32 	%p6, %f54, %f57;\n"
+"	@!%p6 bra 	$Lt_0_12290;\n"
+"	.loc	14	157	0\n"
+"	mul.f32 	%f58, %f56, %f56;\n"
+"	mul.f32 	%f59, %f56, %f58;\n"
+"	mov.f32 	%f60, %f59;\n"
+"	.loc	14	138	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	158	0\n"
+"	mul.f32 	%f61, %f59, %f39;\n"
+"	ld.global.v2.f32 	{%f62,%f63}, [%rd25+0];\n"
+"	mul.f32 	%f64, %f62, %f59;\n"
+"	sub.f32 	%f65, %f64, %f63;\n"
+"	mul.f32 	%f66, %f61, %f65;\n"
+"	bra.uni 	$Lt_0_12034;\n"
+"$Lt_0_12290:\n"
+"	.loc	14	160	0\n"
+"	mov.f32 	%f66, 0f00000000;    	\n"
+"$Lt_0_12034:\n"
+"	ld.global.f32 	%f67, [%rd25+12];\n"
+"	setp.gt.f32 	%p7, %f67, %f54;\n"
+"	@!%p7 bra 	$Lt_0_12802;\n"
+"	mov.s32 	%r35, %r28;\n"
+"	mov.s32 	%r36, 0;\n"
+"	mov.s32 	%r37, 0;\n"
+"	mov.s32 	%r38, 0;\n"
+"	tex.1d.v4.f32.s32 {%f68,%f69,%f70,%f71},[q_tex,{%r35,%r36,%r37,%r38}];\n"
+"	.loc	14	163	0\n"
+"	mov.f32 	%f72, %f68;\n"
+"	ld.param.f32 	%f73, [__cudaparm_kernel_pair_qqrd2e];\n"
+"	mul.f32 	%f74, %f73, %f33;\n"
+"	mul.f32 	%f75, %f72, %f74;\n"
+"	sqrt.approx.f32 	%f76, %f56;\n"
+"	mul.f32 	%f77, %f75, %f76;\n"
+"	.loc	14	139	0\n"
+"	ld.shared.f32 	%f40, [%rd19+16];\n"
+"	.loc	14	163	0\n"
+"	mul.f32 	%f78, %f40, %f77;\n"
+"	bra.uni 	$Lt_0_12546;\n"
+"$Lt_0_12802:\n"
+"	.loc	14	165	0\n"
+"	mov.f32 	%f78, 0f00000000;    	\n"
+"$Lt_0_12546:\n"
+"	.loc	14	169	0\n"
+"	add.f32 	%f79, %f78, %f66;\n"
+"	mul.f32 	%f80, %f79, %f56;\n"
+"	mad.f32 	%f36, %f50, %f80, %f36;\n"
+"	.loc	14	170	0\n"
+"	mad.f32 	%f35, %f49, %f80, %f35;\n"
+"	.loc	14	171	0\n"
+"	mad.f32 	%f34, %f51, %f80, %f34;\n"
+"	@!%p3 bra 	$Lt_0_13570;\n"
+"	.loc	14	174	0\n"
+"	add.f32 	%f37, %f78, %f37;\n"
+"	@!%p6 bra 	$Lt_0_13570;\n"
+"	.loc	14	177	0\n"
+"	ld.param.u64 	%rd26, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd27, %rd26, %rd23;\n"
+"	mov.f32 	%f81, %f60;\n"
+"	ld.global.v4.f32 	{%f82,%f83,%f84,_}, [%rd27+0];\n"
+"	mul.f32 	%f85, %f82, %f81;\n"
+"	sub.f32 	%f86, %f85, %f83;\n"
+"	mul.f32 	%f87, %f81, %f86;\n"
+"	sub.f32 	%f88, %f87, %f84;\n"
+"	.loc	14	138	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	177	0\n"
+"	mad.f32 	%f38, %f39, %f88, %f38;\n"
+"$Lt_0_13570:\n"
+"$Lt_0_13058:\n"
+"	@!%p4 bra 	$Lt_0_14082;\n"
+"	.loc	14	181	0\n"
+"	mov.f32 	%f89, %f10;\n"
+"	mul.f32 	%f90, %f50, %f50;\n"
+"	mad.f32 	%f91, %f80, %f90, %f89;\n"
+"	mov.f32 	%f10, %f91;\n"
+"	.loc	14	182	0\n"
+"	mov.f32 	%f92, %f12;\n"
+"	mad.f32 	%f93, %f80, %f52, %f92;\n"
+"	mov.f32 	%f12, %f93;\n"
+"	.loc	14	183	0\n"
+"	mov.f32 	%f94, %f14;\n"
+"	mul.f32 	%f95, %f51, %f51;\n"
+"	mad.f32 	%f96, %f80, %f95, %f94;\n"
+"	mov.f32 	%f14, %f96;\n"
+"	.loc	14	184	0\n"
+"	mov.f32 	%f97, %f16;\n"
+"	mul.f32 	%f98, %f49, %f50;\n"
+"	mad.f32 	%f99, %f80, %f98, %f97;\n"
+"	mov.f32 	%f16, %f99;\n"
+"	.loc	14	185	0\n"
+"	mov.f32 	%f100, %f18;\n"
+"	mul.f32 	%f101, %f50, %f51;\n"
+"	mad.f32 	%f102, %f80, %f101, %f100;\n"
+"	mov.f32 	%f18, %f102;\n"
+"	.loc	14	186	0\n"
+"	mul.f32 	%f103, %f49, %f51;\n"
+"	mad.f32 	%f19, %f80, %f103, %f19;\n"
+"	mov.f32 	%f104, %f19;\n"
+"$Lt_0_14082:\n"
+"$Lt_0_11522:\n"
+"	.loc	14	134	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"
+"	setp.gt.u64 	%p8, %rd13, %rd10;\n"
+"	@%p8 bra 	$Lt_0_11266;\n"
+"	bra.uni 	$Lt_0_10754;\n"
+"$Lt_0_15874:\n"
+"	mov.s32 	%r39, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r39;\n"
+"	mov.s32 	%r40, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r40;\n"
+"$Lt_0_10754:\n"
+"	.loc	14	193	0\n"
+"	ld.param.u64 	%rd28, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd29, %rd28, %rd3;\n"
+"	@!%p3 bra 	$Lt_0_14850;\n"
+"	.loc	14	195	0\n"
+"	st.global.f32 	[%rd29+0], %f38;\n"
+"	.loc	14	196	0\n"
+"	cvt.u64.s32 	%rd30, %r6;\n"
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd29, %rd31, %rd29;\n"
+"	.loc	14	197	0\n"
+"	st.global.f32 	[%rd29+0], %f37;\n"
+"	.loc	14	198	0\n"
+"	add.u64 	%rd29, %rd31, %rd29;\n"
+"$Lt_0_14850:\n"
+"	@!%p4 bra 	$Lt_0_15362;\n"
+"	.loc	14	202	0\n"
+"	mov.f32 	%f105, %f10;\n"
+"	st.global.f32 	[%rd29+0], %f105;\n"
+"	.loc	14	203	0\n"
+"	cvt.u64.s32 	%rd32, %r6;\n"
+"	mul.lo.u64 	%rd33, %rd32, 4;\n"
+"	add.u64 	%rd29, %rd33, %rd29;\n"
+"	.loc	14	202	0\n"
+"	mov.f32 	%f106, %f12;\n"
+"	st.global.f32 	[%rd29+0], %f106;\n"
+"	.loc	14	203	0\n"
+"	add.u64 	%rd29, %rd33, %rd29;\n"
+"	.loc	14	202	0\n"
+"	mov.f32 	%f107, %f14;\n"
+"	st.global.f32 	[%rd29+0], %f107;\n"
+"	.loc	14	203	0\n"
+"	add.u64 	%rd29, %rd33, %rd29;\n"
+"	.loc	14	202	0\n"
+"	mov.f32 	%f108, %f16;\n"
+"	st.global.f32 	[%rd29+0], %f108;\n"
+"	.loc	14	203	0\n"
+"	add.u64 	%rd29, %rd33, %rd29;\n"
+"	.loc	14	202	0\n"
+"	mov.f32 	%f109, %f18;\n"
+"	st.global.f32 	[%rd29+0], %f109;\n"
+"	add.u64 	%rd34, %rd33, %rd29;\n"
+"	st.global.f32 	[%rd34+0], %f19;\n"
+"$Lt_0_15362:\n"
+"	.loc	14	206	0\n"
+"	ld.param.u64 	%rd35, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd36, %rd2, 16;\n"
+"	add.u64 	%rd37, %rd35, %rd36;\n"
+"	mov.f32 	%f110, %f111;\n"
+"	st.global.v4.f32 	[%rd37+0], {%f36,%f35,%f34,%f110};\n"
+"$Lt_0_10242:\n"
+"	.loc	14	208	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_q_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast__cutsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_qqrd2e)\n"
+"	{\n"
+"	.reg .u32 %r<45>;\n"
+"	.reg .u64 %rd<55>;\n"
+"	.reg .f32 %f<117>;\n"
+"	.reg .pred %p<13>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj244[32];\n"
+"	.shared .align 16 .b8 __cuda_lj1288[1024];\n"
+"	.shared .align 4 .b8 __cuda_cutsq1312[256];\n"
+"	.shared .align 16 .b8 __cuda_lj31568[1024];\n"
+"	.loc	14	217	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 7;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_12546;\n"
+"	.loc	14	225	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"
+"$Lt_1_12546:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_13058;\n"
+"	.loc	14	227	0\n"
+"	mov.u64 	%rd7, __cuda_lj1288;\n"
+"	mov.u64 	%rd8, __cuda_cutsq1312;\n"
+"	cvt.u64.s32 	%rd9, %r1;\n"
+"	mul.lo.u64 	%rd10, %rd9, 16;\n"
+"	ld.param.u64 	%rd11, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd12, %rd11, %rd10;\n"
+"	add.u64 	%rd13, %rd10, %rd7;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd12+0];\n"
+"	st.shared.f32 	[%rd13+0], %f2;\n"
+"	st.shared.f32 	[%rd13+4], %f3;\n"
+"	st.shared.f32 	[%rd13+8], %f4;\n"
+"	st.shared.f32 	[%rd13+12], %f5;\n"
+"	.loc	14	228	0\n"
+"	mul.lo.u64 	%rd14, %rd9, 4;\n"
+"	ld.param.u64 	%rd15, [__cudaparm_kernel_pair_fast__cutsq];\n"
+"	add.u64 	%rd16, %rd15, %rd14;\n"
+"	ld.global.f32 	%f6, [%rd16+0];\n"
+"	add.u64 	%rd17, %rd14, %rd8;\n"
+"	st.shared.f32 	[%rd17+0], %f6;\n"
+"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	mov.u32 	%r5, 0;\n"
+"	setp.le.s32 	%p3, %r4, %r5;\n"
+"	@%p3 bra 	$Lt_1_13570;\n"
+"	.loc	14	230	0\n"
+"	mov.u64 	%rd18, __cuda_lj31568;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd20, %rd19, %rd10;\n"
+"	add.u64 	%rd21, %rd10, %rd18;\n"
+"	ld.global.v4.f32 	{%f7,%f8,%f9,%f10}, [%rd20+0];\n"
+"	st.shared.f32 	[%rd21+0], %f7;\n"
+"	st.shared.f32 	[%rd21+4], %f8;\n"
+"	st.shared.f32 	[%rd21+8], %f9;\n"
+"	st.shared.f32 	[%rd21+12], %f10;\n"
+"$Lt_1_13570:\n"
+"	mov.u64 	%rd18, __cuda_lj31568;\n"
+"$Lt_1_13058:\n"
+"	mov.u64 	%rd7, __cuda_lj1288;\n"
+"	mov.u64 	%rd8, __cuda_cutsq1312;\n"
+"	mov.u64 	%rd18, __cuda_lj31568;\n"
+"	.loc	14	233	0\n"
+"	bar.sync 	0;\n"
+"	cvt.s32.u16 	%r6, %ctaid.x;\n"
+"	cvt.s32.u16 	%r7, %ntid.x;\n"
+"	mul24.lo.s32 	%r8, %r6, %r7;\n"
+"	add.s32 	%r9, %r8, %r1;\n"
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p4, %r9, %r10;\n"
+"	@%p4 bra 	$Lt_1_14082;\n"
+"	.loc	14	245	0\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	mov.f32 	%f17, 0f00000000;    	\n"
+"	mov.f32 	%f18, %f17;\n"
+"	mov.f32 	%f19, 0f00000000;    	\n"
+"	mov.f32 	%f20, %f19;\n"
+"	mov.f32 	%f21, 0f00000000;    	\n"
+"	mov.f32 	%f22, %f21;\n"
+"	.loc	14	248	0\n"
+"	cvt.u64.s32 	%rd22, %r9;\n"
+"	mul.lo.u64 	%rd23, %rd22, 4;\n"
+"	ld.param.u64 	%rd24, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd25, %rd24, %rd23;\n"
+"	ld.global.s32 	%r11, [%rd25+0];\n"
+"	.loc	14	250	0\n"
+"	ld.param.s32 	%r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd26, %r12;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd25, %rd27;\n"
+"	ld.global.s32 	%r13, [%rd28+0];\n"
+"	.loc	14	251	0\n"
+"	add.u64 	%rd29, %rd28, %rd27;\n"
+"	mov.s64 	%rd30, %rd29;\n"
+"	mov.s32 	%r14, %r11;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f23,%f24,%f25,%f26},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	254	0\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mov.f32 	%f30, %f26;\n"
+"	mov.s32 	%r18, %r11;\n"
+"	mov.s32 	%r19, 0;\n"
+"	mov.s32 	%r20, 0;\n"
+"	mov.s32 	%r21, 0;\n"
+"	tex.1d.v4.f32.s32 {%f31,%f32,%f33,%f34},[q_tex,{%r18,%r19,%r20,%r21}];\n"
+"	.loc	14	255	0\n"
+"	mov.f32 	%f35, %f31;\n"
+"	mul24.lo.s32 	%r22, %r13, %r12;\n"
+"	cvt.s64.s32 	%rd31, %r22;\n"
+"	mul.lo.u64 	%rd32, %rd31, 4;\n"
+"	add.u64 	%rd33, %rd29, %rd32;\n"
+"	ld.param.s32 	%r23, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r24, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p5, %rd29, %rd33;\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	mov.f32 	%f39, 0f00000000;    	\n"
+"	mov.f32 	%f40, 0f00000000;    	\n"
+"	@%p5 bra 	$Lt_1_19714;\n"
+"	mov.s32 	%r25, 0;\n"
+"	setp.gt.s32 	%p6, %r24, %r25;\n"
+"	mov.s32 	%r26, 0;\n"
+"	setp.gt.s32 	%p7, %r23, %r26;\n"
+"	cvt.rzi.s32.f32 	%r27, %f30;\n"
+"	mov.s32 	%r28, 8;\n"
+"	mul24.lo.s32 	%r29, %r28, %r27;\n"
+"	cvt.rn.f32.s32 	%f41, %r29;\n"
+"$Lt_1_15106:\n"
+"	.loc	14	260	0\n"
+"	ld.global.s32 	%r30, [%rd30+0];\n"
+"	.loc	14	263	0\n"
+"	shr.s32 	%r31, %r30, 30;\n"
+"	cvt.s64.s32 	%rd34, %r31;\n"
+"	and.b64 	%rd35, %rd34, 3;\n"
+"	mul.lo.u64 	%rd36, %rd35, 4;\n"
+"	add.u64 	%rd37, %rd1, %rd36;\n"
+"	ld.shared.f32 	%f42, [%rd37+0];\n"
+"	.loc	14	264	0\n"
+"	ld.shared.f32 	%f43, [%rd37+16];\n"
+"	and.b32 	%r32, %r30, 1073741823;\n"
+"	mov.s32 	%r33, %r32;\n"
+"	mov.s32 	%r34, 0;\n"
+"	mov.s32 	%r35, 0;\n"
+"	mov.s32 	%r36, 0;\n"
+"	tex.1d.v4.f32.s32 {%f44,%f45,%f46,%f47},[pos_tex,{%r33,%r34,%r35,%r36}];\n"
+"	.loc	14	267	0\n"
+"	mov.f32 	%f48, %f44;\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	mov.f32 	%f51, %f47;\n"
+"	sub.f32 	%f52, %f28, %f49;\n"
+"	sub.f32 	%f53, %f27, %f48;\n"
+"	sub.f32 	%f54, %f29, %f50;\n"
+"	mul.f32 	%f55, %f52, %f52;\n"
+"	mad.f32 	%f56, %f53, %f53, %f55;\n"
+"	mad.f32 	%f57, %f54, %f54, %f56;\n"
+"	add.f32 	%f58, %f41, %f51;\n"
+"	cvt.rzi.s32.f32 	%r37, %f58;\n"
+"	cvt.u64.s32 	%rd38, %r37;\n"
+"	mul.lo.u64 	%rd39, %rd38, 4;\n"
+"	add.u64 	%rd40, %rd8, %rd39;\n"
+"	ld.shared.f32 	%f59, [%rd40+0];\n"
+"	setp.gt.f32 	%p8, %f59, %f57;\n"
+"	@!%p8 bra 	$Lt_1_17922;\n"
+"	rcp.approx.f32 	%f60, %f57;\n"
+"	mul.lo.u64 	%rd41, %rd38, 16;\n"
+"	add.u64 	%rd42, %rd41, %rd7;\n"
+"	ld.shared.f32 	%f61, [%rd42+8];\n"
+"	setp.lt.f32 	%p9, %f57, %f61;\n"
+"	@!%p9 bra 	$Lt_1_16130;\n"
+"	.loc	14	281	0\n"
+"	mul.f32 	%f62, %f60, %f60;\n"
+"	mul.f32 	%f63, %f60, %f62;\n"
+"	mov.f32 	%f64, %f63;\n"
+"	.loc	14	263	0\n"
+"	ld.shared.f32 	%f42, [%rd37+0];\n"
+"	.loc	14	282	0\n"
+"	mul.f32 	%f65, %f63, %f42;\n"
+"	ld.shared.f32 	%f66, [%rd42+4];\n"
+"	ld.shared.f32 	%f67, [%rd42+0];\n"
+"	mul.f32 	%f68, %f67, %f63;\n"
+"	sub.f32 	%f69, %f68, %f66;\n"
+"	mul.f32 	%f70, %f65, %f69;\n"
+"	bra.uni 	$Lt_1_15874;\n"
+"$Lt_1_16130:\n"
+"	.loc	14	284	0\n"
+"	mov.f32 	%f70, 0f00000000;    	\n"
+"$Lt_1_15874:\n"
+"	ld.shared.f32 	%f71, [%rd42+12];\n"
+"	setp.gt.f32 	%p10, %f71, %f57;\n"
+"	@!%p10 bra 	$Lt_1_16642;\n"
+"	mov.s32 	%r38, %r32;\n"
+"	mov.s32 	%r39, 0;\n"
+"	mov.s32 	%r40, 0;\n"
+"	mov.s32 	%r41, 0;\n"
+"	tex.1d.v4.f32.s32 {%f72,%f73,%f74,%f75},[q_tex,{%r38,%r39,%r40,%r41}];\n"
+"	.loc	14	287	0\n"
+"	mov.f32 	%f76, %f72;\n"
+"	ld.param.f32 	%f77, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
+"	mul.f32 	%f78, %f77, %f35;\n"
+"	mul.f32 	%f79, %f76, %f78;\n"
+"	sqrt.approx.f32 	%f80, %f60;\n"
+"	mul.f32 	%f81, %f79, %f80;\n"
+"	.loc	14	264	0\n"
+"	ld.shared.f32 	%f43, [%rd37+16];\n"
+"	.loc	14	287	0\n"
+"	mul.f32 	%f82, %f43, %f81;\n"
+"	bra.uni 	$Lt_1_16386;\n"
+"$Lt_1_16642:\n"
+"	.loc	14	289	0\n"
+"	mov.f32 	%f82, 0f00000000;    	\n"
+"$Lt_1_16386:\n"
+"	.loc	14	293	0\n"
+"	add.f32 	%f83, %f82, %f70;\n"
+"	mul.f32 	%f84, %f83, %f60;\n"
+"	mad.f32 	%f38, %f53, %f84, %f38;\n"
+"	.loc	14	294	0\n"
+"	mad.f32 	%f37, %f52, %f84, %f37;\n"
+"	.loc	14	295	0\n"
+"	mad.f32 	%f36, %f54, %f84, %f36;\n"
+"	@!%p6 bra 	$Lt_1_17410;\n"
+"	.loc	14	298	0\n"
+"	add.f32 	%f39, %f82, %f39;\n"
+"	@!%p9 bra 	$Lt_1_17410;\n"
+"	.loc	14	300	0\n"
+"	add.u64 	%rd43, %rd41, %rd18;\n"
+"	mov.f32 	%f85, %f64;\n"
+"	ld.shared.f32 	%f86, [%rd43+4];\n"
+"	ld.shared.f32 	%f87, [%rd43+0];\n"
+"	mul.f32 	%f88, %f87, %f85;\n"
+"	sub.f32 	%f89, %f88, %f86;\n"
+"	mul.f32 	%f90, %f85, %f89;\n"
+"	.loc	14	301	0\n"
+"	ld.shared.f32 	%f91, [%rd43+8];\n"
+"	sub.f32 	%f92, %f90, %f91;\n"
+"	.loc	14	263	0\n"
+"	ld.shared.f32 	%f42, [%rd37+0];\n"
+"	.loc	14	301	0\n"
+"	mad.f32 	%f40, %f42, %f92, %f40;\n"
+"$Lt_1_17410:\n"
+"$Lt_1_16898:\n"
+"	@!%p7 bra 	$Lt_1_17922;\n"
+"	.loc	14	305	0\n"
+"	mov.f32 	%f93, %f12;\n"
+"	mul.f32 	%f94, %f53, %f53;\n"
+"	mad.f32 	%f95, %f84, %f94, %f93;\n"
+"	mov.f32 	%f12, %f95;\n"
+"	.loc	14	306	0\n"
+"	mov.f32 	%f96, %f14;\n"
+"	mad.f32 	%f97, %f84, %f55, %f96;\n"
+"	mov.f32 	%f14, %f97;\n"
+"	.loc	14	307	0\n"
+"	mov.f32 	%f98, %f16;\n"
+"	mul.f32 	%f99, %f54, %f54;\n"
+"	mad.f32 	%f100, %f84, %f99, %f98;\n"
+"	mov.f32 	%f16, %f100;\n"
+"	.loc	14	308	0\n"
+"	mov.f32 	%f101, %f18;\n"
+"	mul.f32 	%f102, %f52, %f53;\n"
+"	mad.f32 	%f103, %f84, %f102, %f101;\n"
+"	mov.f32 	%f18, %f103;\n"
+"	.loc	14	309	0\n"
+"	mov.f32 	%f104, %f20;\n"
+"	mul.f32 	%f105, %f53, %f54;\n"
+"	mad.f32 	%f106, %f84, %f105, %f104;\n"
+"	mov.f32 	%f20, %f106;\n"
+"	.loc	14	310	0\n"
+"	mul.f32 	%f107, %f52, %f54;\n"
+"	mad.f32 	%f21, %f84, %f107, %f21;\n"
+"	mov.f32 	%f108, %f21;\n"
+"$Lt_1_17922:\n"
+"$Lt_1_15362:\n"
+"	.loc	14	259	0\n"
+"	add.u64 	%rd30, %rd27, %rd30;\n"
+"	setp.gt.u64 	%p11, %rd33, %rd30;\n"
+"	@%p11 bra 	$Lt_1_15106;\n"
+"	bra.uni 	$Lt_1_14594;\n"
+"$Lt_1_19714:\n"
+"	mov.s32 	%r42, 0;\n"
+"	setp.gt.s32 	%p6, %r24, %r42;\n"
+"	mov.s32 	%r43, 0;\n"
+"	setp.gt.s32 	%p7, %r23, %r43;\n"
+"$Lt_1_14594:\n"
+"	.loc	14	317	0\n"
+"	ld.param.u64 	%rd44, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd45, %rd44, %rd23;\n"
+"	@!%p6 bra 	$Lt_1_18690;\n"
+"	.loc	14	319	0\n"
+"	st.global.f32 	[%rd45+0], %f40;\n"
+"	.loc	14	320	0\n"
+"	cvt.u64.s32 	%rd46, %r10;\n"
+"	mul.lo.u64 	%rd47, %rd46, 4;\n"
+"	add.u64 	%rd45, %rd47, %rd45;\n"
+"	.loc	14	321	0\n"
+"	st.global.f32 	[%rd45+0], %f39;\n"
+"	.loc	14	322	0\n"
+"	add.u64 	%rd45, %rd47, %rd45;\n"
+"$Lt_1_18690:\n"
+"	@!%p7 bra 	$Lt_1_19202;\n"
+"	.loc	14	326	0\n"
+"	mov.f32 	%f109, %f12;\n"
+"	st.global.f32 	[%rd45+0], %f109;\n"
+"	.loc	14	327	0\n"
+"	cvt.u64.s32 	%rd48, %r10;\n"
+"	mul.lo.u64 	%rd49, %rd48, 4;\n"
+"	add.u64 	%rd45, %rd49, %rd45;\n"
+"	.loc	14	326	0\n"
+"	mov.f32 	%f110, %f14;\n"
+"	st.global.f32 	[%rd45+0], %f110;\n"
+"	.loc	14	327	0\n"
+"	add.u64 	%rd45, %rd49, %rd45;\n"
+"	.loc	14	326	0\n"
+"	mov.f32 	%f111, %f16;\n"
+"	st.global.f32 	[%rd45+0], %f111;\n"
+"	.loc	14	327	0\n"
+"	add.u64 	%rd45, %rd49, %rd45;\n"
+"	.loc	14	326	0\n"
+"	mov.f32 	%f112, %f18;\n"
+"	st.global.f32 	[%rd45+0], %f112;\n"
+"	.loc	14	327	0\n"
+"	add.u64 	%rd45, %rd49, %rd45;\n"
+"	.loc	14	326	0\n"
+"	mov.f32 	%f113, %f20;\n"
+"	st.global.f32 	[%rd45+0], %f113;\n"
+"	add.u64 	%rd50, %rd49, %rd45;\n"
+"	st.global.f32 	[%rd50+0], %f21;\n"
+"$Lt_1_19202:\n"
+"	.loc	14	330	0\n"
+"	ld.param.u64 	%rd51, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd52, %rd22, 16;\n"
+"	add.u64 	%rd53, %rd51, %rd52;\n"
+"	mov.f32 	%f114, %f115;\n"
+"	st.global.v4.f32 	[%rd53+0], {%f38,%f37,%f36,%f114};\n"
+"$Lt_1_14082:\n"
+"	.loc	14	332	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/ljcl_cut_gpu_ptx.h
+++ b/lib/gpu/ljcl_cut_gpu_ptx.h
@ -0,0 +1,762 @@
+const char * ljcl_cut_gpu_kernel = /* PTX text (.version 1.4, .target sm_13) kept as one C string
+   assembled from adjacent literals. It declares the pos_tex and q_tex
+   textures and two entries, kernel_pair and kernel_pair_fast.
+   NOTE(review): the .loc directives suggest compiler output; presumably this
+   should be regenerated from the kernel source, not edited by hand -- confirm. */
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 pos_tex;\n"
+"	.tex .u64 q_tex;\n"
+"	.entry kernel_pair (\n"
+"		.param .u64 __cudaparm_kernel_pair_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj1,\n"
+"		.param .u64 __cudaparm_kernel_pair_lj3,\n"
+"		.param .s32 __cudaparm_kernel_pair_lj_types,\n"
+"		.param .u64 __cudaparm_kernel_pair_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_q_,\n"
+"		.param .f32 __cudaparm_kernel_pair_cut_coulsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_qqrd2e,\n"
+"		.param .f32 __cudaparm_kernel_pair_g_ewald)\n"
+"	{\n"
+"	.reg .u32 %r<42>;\n"
+"	.reg .u64 %rd<36>;\n"
+"	.reg .f32 %f<145>;\n"
+"	.reg .pred %p<10>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj108[32];\n"
+"	.loc	14	107	0\n"
+"$LBB1_kernel_pair:\n"
+"	.loc	14	111	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_pair_sp_lj_in];\n"  /* copy eight floats from sp_lj_in (global) into shared __cuda_sp_lj108 */
+"	ld.global.f32 	%f1, [%rd1+0];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+0], %f1;\n"
+"	.loc	14	112	0\n"
+"	ld.global.f32 	%f2, [%rd1+4];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+4], %f2;\n"
+"	.loc	14	113	0\n"
+"	ld.global.f32 	%f3, [%rd1+8];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+8], %f3;\n"
+"	.loc	14	114	0\n"
+"	ld.global.f32 	%f4, [%rd1+12];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+12], %f4;\n"
+"	.loc	14	115	0\n"
+"	ld.global.f32 	%f5, [%rd1+16];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+16], %f5;\n"
+"	.loc	14	116	0\n"
+"	ld.global.f32 	%f6, [%rd1+20];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+20], %f6;\n"
+"	.loc	14	117	0\n"
+"	ld.global.f32 	%f7, [%rd1+24];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+24], %f7;\n"
+"	.loc	14	118	0\n"
+"	ld.global.f32 	%f8, [%rd1+28];\n"
+"	st.shared.f32 	[__cuda_sp_lj108+28], %f8;\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"  /* r5 = ctaid.x*ntid.x + tid.x */
+"	ld.param.s32 	%r6, [__cudaparm_kernel_pair_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_10242;\n"  /* inum <= r5: jump straight to exit */
+"	.loc	14	129	0\n"
+"	mov.f32 	%f9, 0f00000000;     	\n"
+"	mov.f32 	%f10, %f9;\n"
+"	mov.f32 	%f11, 0f00000000;    	\n"
+"	mov.f32 	%f12, %f11;\n"
+"	mov.f32 	%f13, 0f00000000;    	\n"
+"	mov.f32 	%f14, %f13;\n"
+"	mov.f32 	%f15, 0f00000000;    	\n"
+"	mov.f32 	%f16, %f15;\n"
+"	mov.f32 	%f17, 0f00000000;    	\n"
+"	mov.f32 	%f18, %f17;\n"
+"	mov.f32 	%f19, 0f00000000;    	\n"
+"	mov.f32 	%f20, %f19;\n"
+"	.loc	14	132	0\n"
+"	cvt.u64.s32 	%rd2, %r5;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_dev_nbor];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd5+0];\n"  /* r7 = dev_nbor[r5]; used below as the pos_tex/q_tex fetch index */
+"	.loc	14	134	0\n"
+"	ld.param.s32 	%r8, [__cudaparm_kernel_pair_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 4;\n"
+"	add.u64 	%rd8, %rd5, %rd7;\n"
+"	ld.global.s32 	%r9, [%rd8+0];\n"  /* r9 = int stored nbor_pitch entries further on; it sets the loop length */
+"	.loc	14	135	0\n"
+"	add.u64 	%rd9, %rd8, %rd7;\n"  /* rd9 = one more pitch step on: first entry the loop reads */
+"	mov.s64 	%rd10, %rd9;\n"
+"	mov.s32 	%r10, %r7;\n"
+"	mov.s32 	%r11, 0;\n"
+"	mov.s32 	%r12, 0;\n"
+"	mov.s32 	%r13, 0;\n"
+"	tex.1d.v4.f32.s32 {%f21,%f22,%f23,%f24},[pos_tex,{%r10,%r11,%r12,%r13}];\n"
+"	.loc	14	138	0\n"
+"	mov.f32 	%f25, %f21;\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.s32 	%r14, %r7;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f29,%f30,%f31,%f32},[q_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	139	0\n"
+"	mov.f32 	%f33, %f29;\n"
+"	mul24.lo.s32 	%r18, %r9, %r8;\n"
+"	cvt.s64.s32 	%rd11, %r18;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd9, %rd12;\n"  /* rd13 = rd9 + 4*(r9*nbor_pitch): loop end address */
+"	ld.param.s32 	%r19, [__cudaparm_kernel_pair_vflag];\n"
+"	ld.param.s32 	%r20, [__cudaparm_kernel_pair_eflag];\n"
+"	setp.ge.u64 	%p2, %rd9, %rd13;\n"
+"	mov.f32 	%f34, 0f00000000;    	\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	@%p2 bra 	$Lt_0_15874;\n"  /* empty range (rd9 >= rd13): skip the loop */
+"	mov.s32 	%r21, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r21;\n"
+"	mov.s32 	%r22, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r22;\n"
+"	cvt.rzi.s32.f32 	%r23, %f28;\n"  /* r23 = 4th component of this thread's pos_tex fetch, truncated to int */
+"	ld.param.s32 	%r24, [__cudaparm_kernel_pair_lj_types];\n"
+"	mul.lo.s32 	%r25, %r24, %r23;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_lj1];\n"
+"	mov.u64 	%rd15, __cuda_sp_lj108;\n"
+"$Lt_0_11266:\n"  /* loop head: one packed int entry per pass, rd10 advances by 4*nbor_pitch bytes */
+"	.loc	14	143	0\n"
+"	ld.global.s32 	%r26, [%rd10+0];\n"
+"	.loc	14	146	0\n"
+"	shr.s32 	%r27, %r26, 30;\n"  /* top two bits of the entry select one of four shared slots (f39; f41 sits 16 bytes later) */
+"	cvt.s64.s32 	%rd16, %r27;\n"
+"	and.b64 	%rd17, %rd16, 3;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd15, %rd18;\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	147	0\n"
+"	mov.f32 	%f40, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f41, [%rd19+16];\n"
+"	sub.f32 	%f42, %f40, %f41;\n"  /* f42 = 1.0 - f41 */
+"	and.b32 	%r28, %r26, 1073741823;\n"  /* low 30 bits of the entry: pos_tex/q_tex fetch index */
+"	mov.s32 	%r29, %r28;\n"
+"	mov.s32 	%r30, 0;\n"
+"	mov.s32 	%r31, 0;\n"
+"	mov.s32 	%r32, 0;\n"
+"	tex.1d.v4.f32.s32 {%f43,%f44,%f45,%f46},[pos_tex,{%r29,%r30,%r31,%r32}];\n"
+"	.loc	14	150	0\n"
+"	mov.f32 	%f47, %f43;\n"
+"	mov.f32 	%f48, %f44;\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	cvt.rzi.s32.f32 	%r33, %f50;\n"
+"	sub.f32 	%f51, %f26, %f48;\n"
+"	sub.f32 	%f52, %f25, %f47;\n"
+"	sub.f32 	%f53, %f27, %f49;\n"
+"	mul.f32 	%f54, %f51, %f51;\n"
+"	mad.f32 	%f55, %f52, %f52, %f54;\n"
+"	mad.f32 	%f56, %f53, %f53, %f55;\n"  /* f56 = f52*f52 + f51*f51 + f53*f53 (sum of squared component differences) */
+"	add.s32 	%r34, %r33, %r25;\n"
+"	cvt.u64.s32 	%rd20, %r34;\n"
+"	mul.lo.u64 	%rd21, %rd20, 16;\n"
+"	add.u64 	%rd22, %rd21, %rd14;\n"  /* rd22 = lj1 + 16*(r33 + lj_types*r23) */
+"	ld.global.f32 	%f57, [%rd22+8];\n"
+"	setp.gt.f32 	%p5, %f57, %f56;\n"
+"	@!%p5 bra 	$Lt_0_14082;\n"  /* skip this entry unless f56 < float at rd22+8 */
+"	rcp.approx.f32 	%f58, %f56;\n"
+"	ld.global.f32 	%f59, [%rd22+12];\n"
+"	setp.lt.f32 	%p6, %f56, %f59;\n"  /* p6: f56 < float at rd22+12 */
+"	@!%p6 bra 	$Lt_0_12290;\n"
+"	.loc	14	165	0\n"
+"	mul.f32 	%f60, %f58, %f58;\n"
+"	mul.f32 	%f61, %f58, %f60;\n"
+"	mov.f32 	%f62, %f61;\n"
+"	.loc	14	146	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	166	0\n"
+"	mul.f32 	%f63, %f61, %f39;\n"
+"	ld.global.v2.f32 	{%f64,%f65}, [%rd22+0];\n"
+"	mul.f32 	%f66, %f64, %f61;\n"
+"	sub.f32 	%f67, %f66, %f65;\n"
+"	mul.f32 	%f68, %f63, %f67;\n"
+"	bra.uni 	$Lt_0_12034;\n"
+"$Lt_0_12290:\n"
+"	.loc	14	168	0\n"
+"	mov.f32 	%f68, 0f00000000;    	\n"
+"$Lt_0_12034:\n"
+"	ld.param.f32 	%f69, [__cudaparm_kernel_pair_cut_coulsq];\n"
+"	setp.gt.f32 	%p7, %f69, %f56;\n"  /* p7: f56 < cut_coulsq */
+"	@!%p7 bra 	$Lt_0_12802;\n"
+"	.loc	14	175	0\n"
+"	sqrt.approx.f32 	%f70, %f56;\n"
+"	ld.param.f32 	%f71, [__cudaparm_kernel_pair_g_ewald];\n"
+"	mul.f32 	%f72, %f71, %f70;\n"
+"	mul.f32 	%f73, %f72, %f72;\n"
+"	mov.f32 	%f74, 0f3f800000;    	\n"
+"	mov.f32 	%f75, 0f3ea7ba05;    	\n"
+"	mad.f32 	%f76, %f75, %f72, %f74;\n"
+"	neg.f32 	%f77, %f73;\n"
+"	rcp.approx.f32 	%f78, %f76;\n"
+"	mov.f32 	%f79, 0f3fb8aa3b;    	\n"  /* 0f3fb8aa3b is log2(e), so the ex2 below evaluates exp(-f73) */
+"	mul.f32 	%f80, %f77, %f79;\n"
+"	ex2.approx.f32 	%f81, %f80;\n"
+"	mov.f32 	%f82, 0f3e827906;    	\n"
+"	mov.f32 	%f83, 0fbe91a98e;    	\n"
+"	mov.f32 	%f84, 0f3fb5f0e3;    	\n"
+"	mov.f32 	%f85, 0fbfba00e3;    	\n"
+"	mov.f32 	%f86, 0f3f87dc22;    	\n"  /* NOTE(review): these five constants feed a Horner polynomial in f78; presumably an erfc approximation -- confirm against the kernel source */
+"	mad.f32 	%f87, %f86, %f78, %f85;\n"
+"	mad.f32 	%f88, %f78, %f87, %f84;\n"
+"	mad.f32 	%f89, %f78, %f88, %f83;\n"
+"	mad.f32 	%f90, %f78, %f89, %f82;\n"
+"	mul.f32 	%f91, %f78, %f90;\n"
+"	mul.f32 	%f92, %f81, %f91;\n"
+"	mov.f32 	%f93, %f92;\n"
+"	mov.s32 	%r35, %r28;\n"
+"	mov.s32 	%r36, 0;\n"
+"	mov.s32 	%r37, 0;\n"
+"	mov.s32 	%r38, 0;\n"
+"	tex.1d.v4.f32.s32 {%f94,%f95,%f96,%f97},[q_tex,{%r35,%r36,%r37,%r38}];\n"
+"	.loc	14	176	0\n"
+"	mov.f32 	%f98, %f94;\n"
+"	ld.param.f32 	%f99, [__cudaparm_kernel_pair_qqrd2e];\n"
+"	mul.f32 	%f100, %f99, %f33;\n"
+"	mul.f32 	%f101, %f100, %f98;\n"
+"	div.approx.f32 	%f102, %f101, %f70;\n"  /* f102 = qqrd2e * f33 * f98 / sqrt(f56), f33 and f98 being the two q_tex fetches */
+"	mov.f32 	%f103, %f102;\n"
+"	.loc	14	177	0\n"
+"	mov.f32 	%f104, 0f3f906ebb;   	\n"
+"	mul.f32 	%f105, %f72, %f104;\n"
+"	mad.f32 	%f106, %f81, %f105, %f92;\n"
+"	sub.f32 	%f107, %f106, %f42;\n"
+"	mul.f32 	%f108, %f102, %f107;\n"
+"	bra.uni 	$Lt_0_12546;\n"
+"$Lt_0_12802:\n"
+"	.loc	14	180	0\n"
+"	mov.f32 	%f103, 0f00000000;   	\n"
+"	mov.f32 	%f108, 0f00000000;   	\n"
+"$Lt_0_12546:\n"
+"	.loc	14	185	0\n"
+"	add.f32 	%f109, %f108, %f68;\n"
+"	mul.f32 	%f110, %f109, %f58;\n"  /* f110 = (f108 + f68) / f56; it scales the three differences accumulated into f36, f35, f34 */
+"	mad.f32 	%f36, %f52, %f110, %f36;\n"
+"	.loc	14	186	0\n"
+"	mad.f32 	%f35, %f51, %f110, %f35;\n"
+"	.loc	14	187	0\n"
+"	mad.f32 	%f34, %f53, %f110, %f34;\n"
+"	@!%p3 bra 	$Lt_0_13570;\n"  /* p3 is eflag > 0 */
+"	.loc	14	190	0\n"
+"	mov.f32 	%f111, %f93;\n"
+"	sub.f32 	%f112, %f111, %f42;\n"
+"	mad.f32 	%f37, %f103, %f112, %f37;\n"
+"	@!%p6 bra 	$Lt_0_13570;\n"
+"	.loc	14	193	0\n"
+"	ld.param.u64 	%rd23, [__cudaparm_kernel_pair_lj3];\n"
+"	add.u64 	%rd24, %rd23, %rd21;\n"
+"	mov.f32 	%f113, %f62;\n"
+"	ld.global.v4.f32 	{%f114,%f115,%f116,_}, [%rd24+0];\n"
+"	mul.f32 	%f117, %f114, %f113;\n"
+"	sub.f32 	%f118, %f117, %f115;\n"
+"	mul.f32 	%f119, %f113, %f118;\n"
+"	sub.f32 	%f120, %f119, %f116;\n"
+"	.loc	14	146	0\n"
+"	ld.shared.f32 	%f39, [%rd19+0];\n"
+"	.loc	14	193	0\n"
+"	mad.f32 	%f38, %f39, %f120, %f38;\n"
+"$Lt_0_13570:\n"
+"$Lt_0_13058:\n"
+"	@!%p4 bra 	$Lt_0_14082;\n"  /* p4 is vflag > 0; the six accumulators below are f10, f12, f14, f16, f18, f19 */
+"	.loc	14	197	0\n"
+"	mov.f32 	%f121, %f10;\n"
+"	mul.f32 	%f122, %f52, %f52;\n"
+"	mad.f32 	%f123, %f110, %f122, %f121;\n"
+"	mov.f32 	%f10, %f123;\n"
+"	.loc	14	198	0\n"
+"	mov.f32 	%f124, %f12;\n"
+"	mad.f32 	%f125, %f110, %f54, %f124;\n"
+"	mov.f32 	%f12, %f125;\n"
+"	.loc	14	199	0\n"
+"	mov.f32 	%f126, %f14;\n"
+"	mul.f32 	%f127, %f53, %f53;\n"
+"	mad.f32 	%f128, %f110, %f127, %f126;\n"
+"	mov.f32 	%f14, %f128;\n"
+"	.loc	14	200	0\n"
+"	mov.f32 	%f129, %f16;\n"
+"	mul.f32 	%f130, %f51, %f52;\n"
+"	mad.f32 	%f131, %f110, %f130, %f129;\n"
+"	mov.f32 	%f16, %f131;\n"
+"	.loc	14	201	0\n"
+"	mov.f32 	%f132, %f18;\n"
+"	mul.f32 	%f133, %f52, %f53;\n"
+"	mad.f32 	%f134, %f110, %f133, %f132;\n"
+"	mov.f32 	%f18, %f134;\n"
+"	.loc	14	202	0\n"
+"	mul.f32 	%f135, %f51, %f53;\n"
+"	mad.f32 	%f19, %f110, %f135, %f19;\n"
+"	mov.f32 	%f136, %f19;\n"
+"$Lt_0_14082:\n"
+"$Lt_0_11522:\n"
+"	.loc	14	142	0\n"
+"	add.u64 	%rd10, %rd7, %rd10;\n"  /* step to the next entry and loop while rd10 < rd13 */
+"	setp.gt.u64 	%p8, %rd13, %rd10;\n"
+"	@%p8 bra 	$Lt_0_11266;\n"
+"	bra.uni 	$Lt_0_10754;\n"
+"$Lt_0_15874:\n"  /* loop-skipped path: p3/p4 still have to be set for the stores below */
+"	mov.s32 	%r39, 0;\n"
+"	setp.gt.s32 	%p3, %r20, %r39;\n"
+"	mov.s32 	%r40, 0;\n"
+"	setp.gt.s32 	%p4, %r19, %r40;\n"
+"$Lt_0_10754:\n"
+"	.loc	14	209	0\n"
+"	ld.param.u64 	%rd25, [__cudaparm_kernel_pair_engv];\n"
+"	add.u64 	%rd26, %rd25, %rd3;\n"  /* rd26 = engv + 4*r5; every store below is followed by a 4*inum byte advance */
+"	@!%p3 bra 	$Lt_0_14850;\n"
+"	.loc	14	211	0\n"
+"	st.global.f32 	[%rd26+0], %f38;\n"
+"	.loc	14	212	0\n"
+"	cvt.u64.s32 	%rd27, %r6;\n"
+"	mul.lo.u64 	%rd28, %rd27, 4;\n"
+"	add.u64 	%rd26, %rd28, %rd26;\n"
+"	.loc	14	213	0\n"
+"	st.global.f32 	[%rd26+0], %f37;\n"
+"	.loc	14	214	0\n"
+"	add.u64 	%rd26, %rd28, %rd26;\n"
+"$Lt_0_14850:\n"
+"	@!%p4 bra 	$Lt_0_15362;\n"
+"	.loc	14	218	0\n"
+"	mov.f32 	%f137, %f10;\n"
+"	st.global.f32 	[%rd26+0], %f137;\n"
+"	.loc	14	219	0\n"
+"	cvt.u64.s32 	%rd29, %r6;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	218	0\n"
+"	mov.f32 	%f138, %f12;\n"
+"	st.global.f32 	[%rd26+0], %f138;\n"
+"	.loc	14	219	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	218	0\n"
+"	mov.f32 	%f139, %f14;\n"
+"	st.global.f32 	[%rd26+0], %f139;\n"
+"	.loc	14	219	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	218	0\n"
+"	mov.f32 	%f140, %f16;\n"
+"	st.global.f32 	[%rd26+0], %f140;\n"
+"	.loc	14	219	0\n"
+"	add.u64 	%rd26, %rd30, %rd26;\n"
+"	.loc	14	218	0\n"
+"	mov.f32 	%f141, %f18;\n"
+"	st.global.f32 	[%rd26+0], %f141;\n"
+"	add.u64 	%rd31, %rd30, %rd26;\n"
+"	st.global.f32 	[%rd31+0], %f19;\n"
+"$Lt_0_15362:\n"
+"	.loc	14	222	0\n"
+"	ld.param.u64 	%rd32, [__cudaparm_kernel_pair_ans];\n"
+"	mul.lo.u64 	%rd33, %rd2, 16;\n"
+"	add.u64 	%rd34, %rd32, %rd33;\n"
+"	mov.f32 	%f142, %f143;\n"  /* NOTE(review): f143 is never written in this entry, so the 4th stored component is undefined */
+"	st.global.v4.f32 	[%rd34+0], {%f36,%f35,%f34,%f142};\n"
+"$Lt_0_10242:\n"
+"	.loc	14	224	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair:\n"
+"	}\n"
+"	.entry kernel_pair_fast (\n"  /* second entry: same loop, but sp_lj, lj1 and (when eflag > 0) lj3 are first staged in shared memory */
+"		.param .u64 __cudaparm_kernel_pair_fast_x_,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj1_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_lj3_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_sp_lj_in,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_ans,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_engv,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_eflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_vflag,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_inum,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nall,\n"
+"		.param .s32 __cudaparm_kernel_pair_fast_nbor_pitch,\n"
+"		.param .u64 __cudaparm_kernel_pair_fast_q_,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_cut_coulsq,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_qqrd2e,\n"
+"		.param .f32 __cudaparm_kernel_pair_fast_g_ewald)\n"
+"	{\n"
+"	.reg .u32 %r<45>;\n"
+"	.reg .u64 %rd<48>;\n"
+"	.reg .f32 %f<148>;\n"
+"	.reg .pred %p<13>;\n"
+"	.shared .align 4 .b8 __cuda_sp_lj244[32];\n"
+"	.shared .align 16 .b8 __cuda_lj1288[1024];\n"
+"	.shared .align 16 .b8 __cuda_lj31312[1024];\n"
+"	.loc	14	233	0\n"
+"$LBB1_kernel_pair_fast:\n"
+"	cvt.s32.u16 	%r1, %tid.x;\n"
+"	mov.u32 	%r2, 7;\n"
+"	setp.gt.s32 	%p1, %r1, %r2;\n"
+"	@%p1 bra 	$Lt_1_12546;\n"  /* only threads with tid.x <= 7 copy one sp_lj_in float each */
+"	.loc	14	240	0\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	cvt.u64.s32 	%rd2, %r1;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	ld.param.u64 	%rd4, [__cudaparm_kernel_pair_fast_sp_lj_in];\n"
+"	add.u64 	%rd5, %rd4, %rd3;\n"
+"	ld.global.f32 	%f1, [%rd5+0];\n"
+"	add.u64 	%rd6, %rd3, %rd1;\n"
+"	st.shared.f32 	[%rd6+0], %f1;\n"
+"$Lt_1_12546:\n"
+"	mov.u64 	%rd1, __cuda_sp_lj244;\n"
+"	mov.u32 	%r3, 63;\n"
+"	setp.gt.s32 	%p2, %r1, %r3;\n"
+"	@%p2 bra 	$Lt_1_13058;\n"  /* only threads with tid.x <= 63 copy one 16-byte lj1_in element each */
+"	.loc	14	242	0\n"
+"	mov.u64 	%rd7, __cuda_lj1288;\n"
+"	cvt.u64.s32 	%rd8, %r1;\n"
+"	mul.lo.u64 	%rd9, %rd8, 16;\n"
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_pair_fast_lj1_in];\n"
+"	add.u64 	%rd11, %rd10, %rd9;\n"
+"	add.u64 	%rd12, %rd9, %rd7;\n"
+"	ld.global.v4.f32 	{%f2,%f3,%f4,%f5}, [%rd11+0];\n"
+"	st.shared.f32 	[%rd12+0], %f2;\n"
+"	st.shared.f32 	[%rd12+4], %f3;\n"
+"	st.shared.f32 	[%rd12+8], %f4;\n"
+"	st.shared.f32 	[%rd12+12], %f5;\n"
+"	ld.param.s32 	%r4, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	mov.u32 	%r5, 0;\n"
+"	setp.le.s32 	%p3, %r4, %r5;\n"
+"	@%p3 bra 	$Lt_1_13570;\n"  /* the lj3_in copy happens only when eflag > 0 */
+"	.loc	14	244	0\n"
+"	mov.u64 	%rd13, __cuda_lj31312;\n"
+"	ld.param.u64 	%rd14, [__cudaparm_kernel_pair_fast_lj3_in];\n"
+"	add.u64 	%rd15, %rd14, %rd9;\n"
+"	add.u64 	%rd16, %rd9, %rd13;\n"
+"	ld.global.v4.f32 	{%f6,%f7,%f8,%f9}, [%rd15+0];\n"
+"	st.shared.f32 	[%rd16+0], %f6;\n"
+"	st.shared.f32 	[%rd16+4], %f7;\n"
+"	st.shared.f32 	[%rd16+8], %f8;\n"
+"	st.shared.f32 	[%rd16+12], %f9;\n"
+"$Lt_1_13570:\n"
+"	mov.u64 	%rd13, __cuda_lj31312;\n"
+"$Lt_1_13058:\n"
+"	mov.u64 	%rd13, __cuda_lj31312;\n"
+"	mov.u64 	%rd7, __cuda_lj1288;\n"
+"	.loc	14	247	0\n"
+"	bar.sync 	0;\n"  /* barrier 0 sits between the shared-memory stores above and the shared loads in the loop */
+"	cvt.s32.u16 	%r6, %ctaid.x;\n"
+"	cvt.s32.u16 	%r7, %ntid.x;\n"
+"	mul24.lo.s32 	%r8, %r6, %r7;\n"
+"	add.s32 	%r9, %r8, %r1;\n"  /* r9 = ctaid.x*ntid.x + tid.x */
+"	ld.param.s32 	%r10, [__cudaparm_kernel_pair_fast_inum];\n"
+"	setp.ge.s32 	%p4, %r9, %r10;\n"
+"	@%p4 bra 	$Lt_1_14082;\n"  /* r9 >= inum: jump straight to exit */
+"	.loc	14	259	0\n"
+"	mov.f32 	%f10, 0f00000000;    	\n"
+"	mov.f32 	%f11, %f10;\n"
+"	mov.f32 	%f12, 0f00000000;    	\n"
+"	mov.f32 	%f13, %f12;\n"
+"	mov.f32 	%f14, 0f00000000;    	\n"
+"	mov.f32 	%f15, %f14;\n"
+"	mov.f32 	%f16, 0f00000000;    	\n"
+"	mov.f32 	%f17, %f16;\n"
+"	mov.f32 	%f18, 0f00000000;    	\n"
+"	mov.f32 	%f19, %f18;\n"
+"	mov.f32 	%f20, 0f00000000;    	\n"
+"	mov.f32 	%f21, %f20;\n"
+"	.loc	14	262	0\n"
+"	cvt.u64.s32 	%rd17, %r9;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	ld.param.u64 	%rd19, [__cudaparm_kernel_pair_fast_dev_nbor];\n"
+"	add.u64 	%rd20, %rd19, %rd18;\n"
+"	ld.global.s32 	%r11, [%rd20+0];\n"  /* r11 = dev_nbor[r9]; used below as the pos_tex/q_tex fetch index */
+"	.loc	14	264	0\n"
+"	ld.param.s32 	%r12, [__cudaparm_kernel_pair_fast_nbor_pitch];\n"
+"	cvt.u64.s32 	%rd21, %r12;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r13, [%rd23+0];\n"  /* r13 = int stored nbor_pitch entries further on; it sets the loop length */
+"	.loc	14	265	0\n"
+"	add.u64 	%rd24, %rd23, %rd22;\n"
+"	mov.s64 	%rd25, %rd24;\n"
+"	mov.s32 	%r14, %r11;\n"
+"	mov.s32 	%r15, 0;\n"
+"	mov.s32 	%r16, 0;\n"
+"	mov.s32 	%r17, 0;\n"
+"	tex.1d.v4.f32.s32 {%f22,%f23,%f24,%f25},[pos_tex,{%r14,%r15,%r16,%r17}];\n"
+"	.loc	14	268	0\n"
+"	mov.f32 	%f26, %f22;\n"
+"	mov.f32 	%f27, %f23;\n"
+"	mov.f32 	%f28, %f24;\n"
+"	mov.f32 	%f29, %f25;\n"
+"	mov.s32 	%r18, %r11;\n"
+"	mov.s32 	%r19, 0;\n"
+"	mov.s32 	%r20, 0;\n"
+"	mov.s32 	%r21, 0;\n"
+"	tex.1d.v4.f32.s32 {%f30,%f31,%f32,%f33},[q_tex,{%r18,%r19,%r20,%r21}];\n"
+"	.loc	14	269	0\n"
+"	mov.f32 	%f34, %f30;\n"
+"	mul24.lo.s32 	%r22, %r13, %r12;\n"
+"	cvt.s64.s32 	%rd26, %r22;\n"
+"	mul.lo.u64 	%rd27, %rd26, 4;\n"
+"	add.u64 	%rd28, %rd24, %rd27;\n"  /* rd28 = rd24 + 4*(r13*nbor_pitch): loop end address */
+"	ld.param.s32 	%r23, [__cudaparm_kernel_pair_fast_vflag];\n"
+"	ld.param.s32 	%r24, [__cudaparm_kernel_pair_fast_eflag];\n"
+"	setp.ge.u64 	%p5, %rd24, %rd28;\n"
+"	mov.f32 	%f35, 0f00000000;    	\n"
+"	mov.f32 	%f36, 0f00000000;    	\n"
+"	mov.f32 	%f37, 0f00000000;    	\n"
+"	mov.f32 	%f38, 0f00000000;    	\n"
+"	mov.f32 	%f39, 0f00000000;    	\n"
+"	@%p5 bra 	$Lt_1_19714;\n"  /* empty range (rd24 >= rd28): skip the loop */
+"	mov.s32 	%r25, 0;\n"
+"	setp.gt.s32 	%p6, %r24, %r25;\n"
+"	mov.s32 	%r26, 0;\n"
+"	setp.gt.s32 	%p7, %r23, %r26;\n"
+"	cvt.rzi.s32.f32 	%r27, %f29;\n"
+"	mov.s32 	%r28, 8;\n"
+"	mul24.lo.s32 	%r29, %r28, %r27;\n"
+"	cvt.rn.f32.s32 	%f40, %r29;\n"  /* f40 = (float)(8 * r27), r27 being the truncated 4th pos_tex component */
+"$Lt_1_15106:\n"  /* loop head: one packed int entry per pass, rd25 advances by 4*nbor_pitch bytes */
+"	.loc	14	274	0\n"
+"	ld.global.s32 	%r30, [%rd25+0];\n"
+"	.loc	14	277	0\n"
+"	shr.s32 	%r31, %r30, 30;\n"  /* top two bits of the entry select one of four shared slots (f41; f43 sits 16 bytes later) */
+"	cvt.s64.s32 	%rd29, %r31;\n"
+"	and.b64 	%rd30, %rd29, 3;\n"
+"	mul.lo.u64 	%rd31, %rd30, 4;\n"
+"	add.u64 	%rd32, %rd1, %rd31;\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"
+"	.loc	14	278	0\n"
+"	mov.f32 	%f42, 0f3f800000;    	\n"
+"	ld.shared.f32 	%f43, [%rd32+16];\n"
+"	sub.f32 	%f44, %f42, %f43;\n"  /* f44 = 1.0 - f43 */
+"	and.b32 	%r32, %r30, 1073741823;\n"  /* low 30 bits of the entry: pos_tex/q_tex fetch index */
+"	mov.s32 	%r33, %r32;\n"
+"	mov.s32 	%r34, 0;\n"
+"	mov.s32 	%r35, 0;\n"
+"	mov.s32 	%r36, 0;\n"
+"	tex.1d.v4.f32.s32 {%f45,%f46,%f47,%f48},[pos_tex,{%r33,%r34,%r35,%r36}];\n"
+"	.loc	14	281	0\n"
+"	mov.f32 	%f49, %f45;\n"
+"	mov.f32 	%f50, %f46;\n"
+"	mov.f32 	%f51, %f47;\n"
+"	mov.f32 	%f52, %f48;\n"
+"	sub.f32 	%f53, %f27, %f50;\n"
+"	sub.f32 	%f54, %f26, %f49;\n"
+"	sub.f32 	%f55, %f28, %f51;\n"
+"	mul.f32 	%f56, %f53, %f53;\n"
+"	mad.f32 	%f57, %f54, %f54, %f56;\n"
+"	mad.f32 	%f58, %f55, %f55, %f57;\n"  /* f58 = f54*f54 + f53*f53 + f55*f55 (sum of squared component differences) */
+"	add.f32 	%f59, %f40, %f52;\n"
+"	cvt.rzi.s32.f32 	%r37, %f59;\n"
+"	cvt.u64.s32 	%rd33, %r37;\n"
+"	mul.lo.u64 	%rd34, %rd33, 16;\n"
+"	add.u64 	%rd35, %rd34, %rd7;\n"  /* rd35 = __cuda_lj1288 + 16*(int)(f40 + f52) */
+"	ld.shared.f32 	%f60, [%rd35+8];\n"
+"	setp.gt.f32 	%p8, %f60, %f58;\n"
+"	@!%p8 bra 	$Lt_1_17922;\n"  /* skip this entry unless f58 < float at rd35+8 */
+"	rcp.approx.f32 	%f61, %f58;\n"
+"	ld.shared.f32 	%f62, [%rd35+12];\n"
+"	setp.lt.f32 	%p9, %f58, %f62;\n"  /* p9: f58 < float at rd35+12 */
+"	@!%p9 bra 	$Lt_1_16130;\n"
+"	.loc	14	295	0\n"
+"	mul.f32 	%f63, %f61, %f61;\n"
+"	mul.f32 	%f64, %f61, %f63;\n"
+"	mov.f32 	%f65, %f64;\n"
+"	.loc	14	277	0\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"
+"	.loc	14	296	0\n"
+"	mul.f32 	%f66, %f64, %f41;\n"
+"	ld.shared.f32 	%f67, [%rd35+4];\n"
+"	ld.shared.f32 	%f68, [%rd35+0];\n"
+"	mul.f32 	%f69, %f68, %f64;\n"
+"	sub.f32 	%f70, %f69, %f67;\n"
+"	mul.f32 	%f71, %f66, %f70;\n"
+"	bra.uni 	$Lt_1_15874;\n"
+"$Lt_1_16130:\n"
+"	.loc	14	298	0\n"
+"	mov.f32 	%f71, 0f00000000;    	\n"
+"$Lt_1_15874:\n"
+"	ld.param.f32 	%f72, [__cudaparm_kernel_pair_fast_cut_coulsq];\n"
+"	setp.gt.f32 	%p10, %f72, %f58;\n"  /* p10: f58 < cut_coulsq */
+"	@!%p10 bra 	$Lt_1_16642;\n"
+"	.loc	14	305	0\n"
+"	sqrt.approx.f32 	%f73, %f58;\n"
+"	ld.param.f32 	%f74, [__cudaparm_kernel_pair_fast_g_ewald];\n"
+"	mul.f32 	%f75, %f74, %f73;\n"
+"	mul.f32 	%f76, %f75, %f75;\n"
+"	mov.f32 	%f77, 0f3f800000;    	\n"
+"	mov.f32 	%f78, 0f3ea7ba05;    	\n"
+"	mad.f32 	%f79, %f78, %f75, %f77;\n"
+"	neg.f32 	%f80, %f76;\n"
+"	rcp.approx.f32 	%f81, %f79;\n"
+"	mov.f32 	%f82, 0f3fb8aa3b;    	\n"  /* 0f3fb8aa3b is log2(e), so the ex2 below evaluates exp(-f76) */
+"	mul.f32 	%f83, %f80, %f82;\n"
+"	ex2.approx.f32 	%f84, %f83;\n"
+"	mov.f32 	%f85, 0f3e827906;    	\n"
+"	mov.f32 	%f86, 0fbe91a98e;    	\n"
+"	mov.f32 	%f87, 0f3fb5f0e3;    	\n"
+"	mov.f32 	%f88, 0fbfba00e3;    	\n"
+"	mov.f32 	%f89, 0f3f87dc22;    	\n"  /* NOTE(review): these five constants feed a Horner polynomial in f81; presumably an erfc approximation -- confirm against the kernel source */
+"	mad.f32 	%f90, %f89, %f81, %f88;\n"
+"	mad.f32 	%f91, %f81, %f90, %f87;\n"
+"	mad.f32 	%f92, %f81, %f91, %f86;\n"
+"	mad.f32 	%f93, %f81, %f92, %f85;\n"
+"	mul.f32 	%f94, %f81, %f93;\n"
+"	mul.f32 	%f95, %f84, %f94;\n"
+"	mov.f32 	%f96, %f95;\n"
+"	mov.s32 	%r38, %r32;\n"
+"	mov.s32 	%r39, 0;\n"
+"	mov.s32 	%r40, 0;\n"
+"	mov.s32 	%r41, 0;\n"
+"	tex.1d.v4.f32.s32 {%f97,%f98,%f99,%f100},[q_tex,{%r38,%r39,%r40,%r41}];\n"
+"	.loc	14	306	0\n"
+"	mov.f32 	%f101, %f97;\n"
+"	ld.param.f32 	%f102, [__cudaparm_kernel_pair_fast_qqrd2e];\n"
+"	mul.f32 	%f103, %f102, %f34;\n"
+"	mul.f32 	%f104, %f103, %f101;\n"
+"	div.approx.f32 	%f105, %f104, %f73;\n"  /* f105 = qqrd2e * f34 * f101 / sqrt(f58), f34 and f101 being the two q_tex fetches */
+"	mov.f32 	%f106, %f105;\n"
+"	.loc	14	307	0\n"
+"	mov.f32 	%f107, 0f3f906ebb;   	\n"
+"	mul.f32 	%f108, %f75, %f107;\n"
+"	mad.f32 	%f109, %f84, %f108, %f95;\n"
+"	sub.f32 	%f110, %f109, %f44;\n"
+"	mul.f32 	%f111, %f105, %f110;\n"
+"	bra.uni 	$Lt_1_16386;\n"
+"$Lt_1_16642:\n"
+"	.loc	14	310	0\n"
+"	mov.f32 	%f106, 0f00000000;   	\n"
+"	mov.f32 	%f111, 0f00000000;   	\n"
+"$Lt_1_16386:\n"
+"	.loc	14	315	0\n"
+"	add.f32 	%f112, %f111, %f71;\n"
+"	mul.f32 	%f113, %f112, %f61;\n"  /* f113 = (f111 + f71) / f58; it scales the three differences accumulated into f37, f36, f35 */
+"	mad.f32 	%f37, %f54, %f113, %f37;\n"
+"	.loc	14	316	0\n"
+"	mad.f32 	%f36, %f53, %f113, %f36;\n"
+"	.loc	14	317	0\n"
+"	mad.f32 	%f35, %f55, %f113, %f35;\n"
+"	@!%p6 bra 	$Lt_1_17410;\n"  /* p6 is eflag > 0 */
+"	.loc	14	320	0\n"
+"	mov.f32 	%f114, %f96;\n"
+"	sub.f32 	%f115, %f114, %f44;\n"
+"	mad.f32 	%f38, %f106, %f115, %f38;\n"
+"	@!%p9 bra 	$Lt_1_17410;\n"
+"	.loc	14	322	0\n"
+"	add.u64 	%rd36, %rd34, %rd13;\n"
+"	mov.f32 	%f116, %f65;\n"
+"	ld.shared.f32 	%f117, [%rd36+4];\n"
+"	ld.shared.f32 	%f118, [%rd36+0];\n"
+"	mul.f32 	%f119, %f118, %f116;\n"
+"	sub.f32 	%f120, %f119, %f117;\n"
+"	mul.f32 	%f121, %f116, %f120;\n"
+"	.loc	14	323	0\n"
+"	ld.shared.f32 	%f122, [%rd36+8];\n"
+"	sub.f32 	%f123, %f121, %f122;\n"
+"	.loc	14	277	0\n"
+"	ld.shared.f32 	%f41, [%rd32+0];\n"
+"	.loc	14	323	0\n"
+"	mad.f32 	%f39, %f41, %f123, %f39;\n"
+"$Lt_1_17410:\n"
+"$Lt_1_16898:\n"
+"	@!%p7 bra 	$Lt_1_17922;\n"  /* p7 is vflag > 0; the six accumulators below are f11, f13, f15, f17, f19, f20 */
+"	.loc	14	327	0\n"
+"	mov.f32 	%f124, %f11;\n"
+"	mul.f32 	%f125, %f54, %f54;\n"
+"	mad.f32 	%f126, %f113, %f125, %f124;\n"
+"	mov.f32 	%f11, %f126;\n"
+"	.loc	14	328	0\n"
+"	mov.f32 	%f127, %f13;\n"
+"	mad.f32 	%f128, %f113, %f56, %f127;\n"
+"	mov.f32 	%f13, %f128;\n"
+"	.loc	14	329	0\n"
+"	mov.f32 	%f129, %f15;\n"
+"	mul.f32 	%f130, %f55, %f55;\n"
+"	mad.f32 	%f131, %f113, %f130, %f129;\n"
+"	mov.f32 	%f15, %f131;\n"
+"	.loc	14	330	0\n"
+"	mov.f32 	%f132, %f17;\n"
+"	mul.f32 	%f133, %f53, %f54;\n"
+"	mad.f32 	%f134, %f113, %f133, %f132;\n"
+"	mov.f32 	%f17, %f134;\n"
+"	.loc	14	331	0\n"
+"	mov.f32 	%f135, %f19;\n"
+"	mul.f32 	%f136, %f54, %f55;\n"
+"	mad.f32 	%f137, %f113, %f136, %f135;\n"
+"	mov.f32 	%f19, %f137;\n"
+"	.loc	14	332	0\n"
+"	mul.f32 	%f138, %f53, %f55;\n"
+"	mad.f32 	%f20, %f113, %f138, %f20;\n"
+"	mov.f32 	%f139, %f20;\n"
+"$Lt_1_17922:\n"
+"$Lt_1_15362:\n"
+"	.loc	14	273	0\n"
+"	add.u64 	%rd25, %rd22, %rd25;\n"  /* step to the next entry and loop while rd25 < rd28 */
+"	setp.gt.u64 	%p11, %rd28, %rd25;\n"
+"	@%p11 bra 	$Lt_1_15106;\n"
+"	bra.uni 	$Lt_1_14594;\n"
+"$Lt_1_19714:\n"  /* loop-skipped path: p6/p7 still have to be set for the stores below */
+"	mov.s32 	%r42, 0;\n"
+"	setp.gt.s32 	%p6, %r24, %r42;\n"
+"	mov.s32 	%r43, 0;\n"
+"	setp.gt.s32 	%p7, %r23, %r43;\n"
+"$Lt_1_14594:\n"
+"	.loc	14	339	0\n"
+"	ld.param.u64 	%rd37, [__cudaparm_kernel_pair_fast_engv];\n"
+"	add.u64 	%rd38, %rd37, %rd18;\n"  /* rd38 = engv + 4*r9; every store below is followed by a 4*inum byte advance */
+"	@!%p6 bra 	$Lt_1_18690;\n"
+"	.loc	14	341	0\n"
+"	st.global.f32 	[%rd38+0], %f39;\n"
+"	.loc	14	342	0\n"
+"	cvt.u64.s32 	%rd39, %r10;\n"
+"	mul.lo.u64 	%rd40, %rd39, 4;\n"
+"	add.u64 	%rd38, %rd40, %rd38;\n"
+"	.loc	14	343	0\n"
+"	st.global.f32 	[%rd38+0], %f38;\n"
+"	.loc	14	344	0\n"
+"	add.u64 	%rd38, %rd40, %rd38;\n"
+"$Lt_1_18690:\n"
+"	@!%p7 bra 	$Lt_1_19202;\n"
+"	.loc	14	348	0\n"
+"	mov.f32 	%f140, %f11;\n"
+"	st.global.f32 	[%rd38+0], %f140;\n"
+"	.loc	14	349	0\n"
+"	cvt.u64.s32 	%rd41, %r10;\n"
+"	mul.lo.u64 	%rd42, %rd41, 4;\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	348	0\n"
+"	mov.f32 	%f141, %f13;\n"
+"	st.global.f32 	[%rd38+0], %f141;\n"
+"	.loc	14	349	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	348	0\n"
+"	mov.f32 	%f142, %f15;\n"
+"	st.global.f32 	[%rd38+0], %f142;\n"
+"	.loc	14	349	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	348	0\n"
+"	mov.f32 	%f143, %f17;\n"
+"	st.global.f32 	[%rd38+0], %f143;\n"
+"	.loc	14	349	0\n"
+"	add.u64 	%rd38, %rd42, %rd38;\n"
+"	.loc	14	348	0\n"
+"	mov.f32 	%f144, %f19;\n"
+"	st.global.f32 	[%rd38+0], %f144;\n"
+"	add.u64 	%rd43, %rd42, %rd38;\n"
+"	st.global.f32 	[%rd43+0], %f20;\n"
+"$Lt_1_19202:\n"
+"	.loc	14	352	0\n"
+"	ld.param.u64 	%rd44, [__cudaparm_kernel_pair_fast_ans];\n"
+"	mul.lo.u64 	%rd45, %rd17, 16;\n"
+"	add.u64 	%rd46, %rd44, %rd45;\n"
+"	mov.f32 	%f145, %f146;\n"  /* NOTE(review): f146 is never written in this entry, so the 4th stored component is undefined */
+"	st.global.v4.f32 	[%rd46+0], {%f37,%f36,%f35,%f145};\n"
+"$Lt_1_14082:\n"
+"	.loc	14	354	0\n"
+"	exit;\n"
+"$LDWend_kernel_pair_fast:\n"
+"	}\n"
+;
--- a/lib/gpu/morse_gpu.cpp
+++ b/lib/gpu/morse_gpu.cpp
@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include <iostream>
+#include <cassert>
+#include <math.h>
+
+#include "morse_gpu_memory.h"
+
+using namespace std;
+
+static MOR_GPU_Memory<PRECISION,ACC_PRECISION> MORMF;  // file-scope instance that every mor_gpu_* function in this file forwards to
+
+// ---------------------------------------------------------------------------
+// Initialize the file-scope MORMF object and report progress on screen.
+//
+// MORMF.init() runs first on the process whose world_me is 0, followed by a
+// world barrier. Every other process then runs it when the loop counter
+// matches its gpu_rank, with a gpu barrier after each loop pass.
+//
+// gpu_mode is overwritten with MORMF.device->gpu_mode(). The return value is
+// the result of this process's MORMF.init() call (0 if it made no call);
+// MORMF.estimate_gpu_overhead() is invoked only when that value is 0.
+//
+// NOTE(review): max_nbors is not forwarded; the literal 300 is passed to
+// MORMF.init() in its place -- confirm this is intended.
+// ---------------------------------------------------------------------------
+int mor_gpu_init(const int ntypes, double **cutsq,
+                 double **host_lj1, double **host_lj2, double **host_lj3,
+                 double **host_lj4, double **offset, double *special_lj,
+                 const int inum, const int nall, const int max_nbors,
+                 const int maxspecial, const double cell_size, int &gpu_mode,
+                 FILE *screen) {
+  MORMF.clear();
+
+  // Query the device layer once, in the same order as before
+  gpu_mode=MORMF.device->gpu_mode();
+  double split=MORMF.device->particle_split();
+  int dev_first=MORMF.device->first_device();
+  int dev_last=MORMF.device->last_device();
+  const int me=MORMF.device->world_me();
+  const int my_gpu_rank=MORMF.device->gpu_rank();
+  const int nshare=MORMF.device->procs_per_gpu();
+
+  MORMF.device->init_message(screen,"morse",dev_first,dev_last);
+
+  // Progress text is printed only by replica 0 and only if a stream was given
+  const bool verbose=(MORMF.device->replica_me()==0 && screen!=NULL);
+
+  if (verbose) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  // Step 1: world process 0 goes first; everyone waits at the world barrier
+  int status=0;
+  if (me==0)
+    status=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                      host_lj4, offset, special_lj, inum, nall, 300,
+                      maxspecial, cell_size, split, screen);
+
+  MORMF.device->world_barrier();
+  if (verbose)
+    fprintf(screen,"Done.\n");
+
+  // Step 2: the remaining processes take turns, one gpu_rank per pass
+  const bool single_gpu=(dev_first==dev_last);
+  for (int core=0; core<nshare; ++core) {
+    if (verbose) {
+      if (single_gpu)
+        fprintf(screen,"Initializing GPU %d on core %d...",dev_first,core);
+      else
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",dev_first,
+                dev_last,core);
+      fflush(screen);
+    }
+
+    if (me!=0 && my_gpu_rank==core)
+      status=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                        host_lj4, offset, special_lj, inum, nall, 300,
+                        maxspecial, cell_size, split, screen);
+
+    MORMF.device->gpu_barrier();
+    if (verbose)
+      fprintf(screen,"Done.\n");
+  }
+  if (verbose)
+    fprintf(screen,"\n");
+
+  // A nonzero status is returned as-is, without the overhead estimate
+  if (status!=0)
+    return status;
+
+  MORMF.estimate_gpu_overhead();
+  return status;
+}
+
+// Forward to MORMF.clear() on the file-scope instance.
+void mor_gpu_clear()
+{
+  MORMF.clear();
+}
+
+// Forward to the MORMF.compute() overload that takes the sub-domain bounds,
+// tags and special lists, and return whatever pointer that call yields.
+// All reference arguments (host_start, success) are passed straight through.
+int** mor_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, int *tag, int **nspecial,
+                        int **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success) {
+  int **result=MORMF.compute(ago, inum_full, nall,
+                             host_x, host_type,
+                             sublo, subhi,
+                             tag, nspecial, special,
+                             eflag, vflag, eatom, vatom,
+                             host_start, ilist, jnum,
+                             cpu_time, success);
+  return result;
+}
+			
+// Forward to the MORMF.compute() overload that takes caller-supplied lists
+// (ilist, numj, firstneigh). Nothing is returned; host_start and success are
+// passed through by reference.
+void mor_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success) {
+  MORMF.compute(ago, inum_full, nall,
+                host_x, host_type,
+                ilist, numj, firstneigh,
+                eflag, vflag, eatom, vatom,
+                host_start, cpu_time, success);
+}
+
+// Return the value reported by MORMF.host_memory_usage().
+double mor_gpu_bytes()
+{
+  const double usage=MORMF.host_memory_usage();
+  return usage;
+}
+
+
--- a/lib/gpu/morse_gpu_kernel.cu
+++ b/lib/gpu/morse_gpu_kernel.cu
@ -0,0 +1,389 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef MORSE_GPU_KERNEL
+#define MORSE_GPU_KERNEL
+
+// Precision selection. 'numtyp*' are the types used for positions and
+// coefficients, 'acctyp*' the types used for the force/energy/virial
+// accumulators.
+
+// _DOUBLE_DOUBLE: double for both data and accumulators
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp2 double2
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+// _SINGLE_DOUBLE: float data, double accumulators
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+// Neither of the above was defined: float for both data and accumulators
+#ifndef numtyp
+#define numtyp float
+#define numtyp2 float2
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+texture<float4> pos_tex;
+
+// fetch_pos(i,pos): position/type record of atom i.
+#ifdef _DOUBLE_DOUBLE
+// Double precision: read the record directly from the pos array.
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  return pos[i];
+}
+#else
+// Single precision: read the record through the pos_tex texture reference;
+// the 'pos' argument is not used in this variant.
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  return tex1Dfetch(pos_tex, i);
+}
+#endif
+
+#else
+
+// OpenCL build: enable double precision and map the CUDA-style names used by
+// the kernels below onto the corresponding OpenCL built-ins.
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+// The second macro argument is ignored: the expansion always indexes the
+// kernel parameter named x_, so fetch_pos can only be used where x_ is in scope.
+#define fetch_pos(i,y) x_[i]
+// Second dimension of the per-thread reduction scratch array (red_acc)
+#define BLOCK_PAIR 64
+// The fast kernel keeps a MAX_SHARED_TYPES x MAX_SHARED_TYPES coefficient
+// table in local memory
+#define MAX_SHARED_TYPES 8
+
+#endif
+
+#define SBBITS 30
+#define NEIGHMASK 0x3FFFFFFF
+// Return the two-bit field stored in bits SBBITS and SBBITS+1 of a packed
+// neighbor index. The kernels use the result (0..3) to index the sp_lj
+// scaling table and strip the field again with NEIGHMASK.
+__inline int sbmask(int j) {
+  return (j >> SBBITS) & 3;
+}
+
+// General Morse pair kernel: per-type coefficients are read from global memory.
+//
+//   x_          per-atom records; .x/.y/.z are the coordinates and .w is
+//               converted to int and used as the atom type
+//   mor1        type-pair table: .x = squared cutoff (compared with r^2),
+//               .y = force prefactor, .z = r0, .w = alpha
+//   mor2        type-pair table: .x = energy prefactor, .y = energy offset
+//   lj_types    row length of the mor1/mor2 tables (index = itype*lj_types+jtype)
+//   sp_lj_in    four scaling factors selected by sbmask() of the neighbor index
+//   dev_nbor    row 0 = atom index, row 1 = neighbor count, row 2.. = either
+//               the neighbors themselves (when dev_nbor==dev_packed) or the
+//               offset of the atom's neighbor list inside dev_packed
+//   ans         per-atom force output (one acctyp4 per atom)
+//   engv        per-atom energy / virial output, fields stored with stride inum
+//   eflag,vflag >0 to accumulate energy / virial
+//   inum        number of atoms to process
+//   nall        not used by this kernel
+//   nbor_pitch  stride between consecutive rows of dev_nbor
+//   t_per_atom  number of threads that share the neighbor list of one atom
+__kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *mor1,
+                          __global numtyp2* mor2, const int lj_types, 
+                          __global numtyp *sp_lj_in, __global int *dev_nbor, 
+                          __global int *dev_packed, __global acctyp4 *ans,
+                          __global acctyp *engv, const int eflag,
+                          const int vflag, const int inum, const int nall,
+                          const int nbor_pitch, const int t_per_atom) {
+  // ii = atom slot handled by this thread, offset = rank of the thread among
+  // the t_per_atom threads working on that atom
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  // Every thread writes all four (identical) values, so each thread only
+  // ever reads data it has stored itself
+  __local numtyp sp_lj[4];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    // Set up [nbor,list_end) and the stride so that the t_per_atom threads
+    // of this atom visit interleaved neighbors
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      // Neighbors are stored in dev_nbor itself, one per row
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      // Row 2 holds the start of a contiguous list inside dev_packed
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    int itype=ix.w;
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      // The top two bits of the stored index select the scaling factor
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int jtype=jx.w;
+
+      // Squared separation (r holds r^2 until the cutoff test has passed)
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r = delx*delx+dely*dely+delz*delz;
+        
+      int mtype=itype*lj_types+jtype;
+      if (r<mor1[mtype].x) {
+        r=sqrt(r);
+        numtyp dexp=r-mor1[mtype].z;
+        dexp=exp(-mor1[mtype].w*dexp);
+        numtyp dm=dexp*dexp-dexp;
+        numtyp force = mor1[mtype].y*dm/r*factor_lj;
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          // dm-dexp == dexp*dexp-2*dexp. Written this way (instead of with
+          // the double literal 2.0) so the term is evaluated in numtyp
+          // precision and with the same expression as kernel_pair_fast.
+          numtyp e=mor2[mtype].x*(dm-dexp) - mor2[mtype].y;
+          energy+=e*factor_lj; 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers: sum the partial results of the t_per_atom threads of each
+  // atom into the thread with offset==0.
+  // NOTE(review): there is no barrier between the reduction steps, so this
+  // presumably relies on the cooperating threads executing in lockstep and
+  // on t_per_atom being a power of two - confirm.
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    // The scratch array is reused for the six virial components
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers: one thread per atom writes the requested fields. engv is
+  // laid out field-major with stride inum: [energy] then six virial terms.
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+// Morse pair kernel for the case where the coefficient tables fit in
+// shared/local memory. Same computation as kernel_pair, except that
+// mor1_in/mor2_in are first staged into MAX_SHARED_TYPES x MAX_SHARED_TYPES
+// local tables and the table row length is fixed at MAX_SHARED_TYPES.
+//
+//   x_          per-atom records; .x/.y/.z are the coordinates, .w the type
+//   mor1_in     type-pair table: .x = squared cutoff (compared with r^2),
+//               .y = force prefactor, .z = r0, .w = alpha
+//   mor2_in     type-pair table: .x = energy prefactor, .y = energy offset
+//   sp_lj_in    four scaling factors selected by sbmask() of the neighbor index
+//   dev_nbor    row 0 = atom index, row 1 = neighbor count, row 2.. = either
+//               the neighbors themselves (when dev_nbor==dev_packed) or the
+//               offset of the atom's neighbor list inside dev_packed
+//   ans         per-atom force output
+//   engv        per-atom energy / virial output, fields stored with stride inum
+//   eflag,vflag >0 to accumulate energy / virial
+//   inum        number of atoms to process
+//   nall        not used by this kernel
+//   nbor_pitch  stride between consecutive rows of dev_nbor
+//   t_per_atom  number of threads that share the neighbor list of one atom
+__kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *mor1_in,
+                               __global numtyp2* mor2_in, 
+                               __global numtyp* sp_lj_in,
+                               __global int *dev_nbor, __global int *dev_packed,
+                               __global acctyp4 *ans, __global acctyp *engv, 
+                               const int eflag, const int vflag, const int inum, 
+                               const int nall, const int nbor_pitch,
+                               const int t_per_atom) {
+  // ii = atom slot handled by this thread, offset = rank of the thread among
+  // the t_per_atom threads working on that atom
+  int tid=THREAD_ID_X;
+  int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom);
+  ii+=tid/t_per_atom;
+  int offset=tid%t_per_atom;
+
+  // Cooperative load: thread tid copies table entry tid, so the block needs
+  // at least MAX_SHARED_TYPES*MAX_SHARED_TYPES threads for the tables to be
+  // filled completely. mor2 is only needed for energies.
+  __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[4];
+  if (tid<4)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    mor1[tid]=mor1_in[tid];
+    if (eflag>0)
+      mor2[tid]=mor2_in[tid];
+  }
+  
+  acctyp energy=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0;
+  f.y=(acctyp)0;
+  f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+  
+  // All threads reach this barrier (it is outside the ii<inum test); it makes
+  // the staged tables visible before any thread reads them
+  __syncthreads();
+  
+  if (ii<inum) {
+    __global int *nbor=dev_nbor+ii;
+    int i=*nbor;
+    nbor+=nbor_pitch;
+    int numj=*nbor;
+    nbor+=nbor_pitch;
+
+    // Set up [nbor,list_end) and the stride so that the t_per_atom threads
+    // of this atom visit interleaved neighbors
+    int n_stride;
+    __global int *list_end;
+    if (dev_nbor==dev_packed) {
+      // Neighbors are stored in dev_nbor itself, one per row
+      list_end=nbor+mul24(numj,nbor_pitch);
+      nbor+=mul24(offset,nbor_pitch);
+      n_stride=mul24(t_per_atom,nbor_pitch);
+    } else {
+      // Row 2 holds the start of a contiguous list inside dev_packed
+      nbor=dev_packed+*nbor;
+      list_end=nbor+numj;
+      n_stride=t_per_atom;
+      nbor+=offset;
+    }
+  
+    numtyp4 ix=fetch_pos(i,x_); //x_[i];
+    int iw=ix.w;
+    // Pre-multiplied row offset into the local tables
+    int itype=mul24((int)MAX_SHARED_TYPES,iw);
+
+    numtyp factor_lj;
+    for ( ; nbor<list_end; nbor+=n_stride) {
+  
+      // The top two bits of the stored index select the scaling factor
+      int j=*nbor;
+      factor_lj = sp_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx=fetch_pos(j,x_); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Squared separation (r holds r^2 until the cutoff test has passed)
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp r = delx*delx+dely*dely+delz*delz;
+        
+      if (r<mor1[mtype].x) {
+        r=sqrt(r);
+        numtyp dexp=r-mor1[mtype].z;
+        dexp=exp(-mor1[mtype].w*dexp);
+        numtyp dm=dexp*dexp-dexp;
+        numtyp force = mor1[mtype].y*dm/r*factor_lj;
+      
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          // dm-dexp == dexp*dexp-2*dexp
+          numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
+          energy+=e*factor_lj; 
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+  } // if ii
+  
+  // Reduce answers: sum the partial results of the t_per_atom threads of each
+  // atom into the thread with offset==0.
+  // NOTE(review): there is no barrier between the reduction steps, so this
+  // presumably relies on the cooperating threads executing in lockstep and
+  // on t_per_atom being a power of two - confirm.
+  if (t_per_atom>1) {
+    __local acctyp red_acc[6][BLOCK_PAIR];
+    
+    red_acc[0][tid]=f.x;
+    red_acc[1][tid]=f.y;
+    red_acc[2][tid]=f.z;
+    red_acc[3][tid]=energy;
+
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      if (offset < s) {
+        for (int r=0; r<4; r++)
+          red_acc[r][tid] += red_acc[r][tid+s];
+      }
+    }
+    
+    f.x=red_acc[0][tid];
+    f.y=red_acc[1][tid];
+    f.z=red_acc[2][tid];
+    energy=red_acc[3][tid];
+
+    // The scratch array is reused for the six virial components
+    if (vflag>0) {
+      for (int r=0; r<6; r++)
+        red_acc[r][tid]=virial[r];
+
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        if (offset < s) {
+          for (int r=0; r<6; r++)
+            red_acc[r][tid] += red_acc[r][tid+s];
+        }
+      }
+    
+      for (int r=0; r<6; r++)
+        virial[r]=red_acc[r][tid];
+    }
+  }
+
+  // Store answers: one thread per atom writes the requested fields. engv is
+  // laid out field-major with stride inum: [energy] then six virial terms.
+  if (ii<inum && offset==0) {
+    __global acctyp *ap1=engv+ii;
+    if (eflag>0) {
+      *ap1=energy;
+      ap1+=inum;
+    }
+    if (vflag>0) {
+      for (int i=0; i<6; i++) {
+        *ap1=virial[i];
+        ap1+=inum;
+      }
+    }
+    ans[ii]=f;
+  } // if ii
+}
+
+#endif
+
--- a/lib/gpu/morse_gpu_memory.cpp
+++ b/lib/gpu/morse_gpu_memory.cpp
@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifdef USE_OPENCL
+#include "morse_gpu_cl.h"
+#else
+#include "morse_gpu_ptx.h"
+#endif
+
+#include "morse_gpu_memory.h"
+#include <cassert>
+#define MOR_GPU_MemoryT MOR_GPU_Memory<numtyp, acctyp>
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+// Construct an empty object: the base class is default-constructed and the
+// type tables are marked as not yet allocated.
+template <class numtyp, class acctyp>
+MOR_GPU_MemoryT::MOR_GPU_Memory() : AtomicGPUMemory<numtyp,acctyp>(), _allocated(false) {
+}
+
+// Release everything via clear(); clear() is a no-op when nothing is allocated.
+template <class numtyp, class acctyp>
+MOR_GPU_MemoryT::~MOR_GPU_Memory() { 
+  clear();
+}
+ 
+// Per-atom memory estimate; this style adds nothing of its own and returns
+// the base-class value bytes_per_atom_atomic(max_nbors).
+template <class numtyp, class acctyp>
+int MOR_GPU_MemoryT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+// Initialize the base class, choose between the general and the
+// shared-memory ("fast") kernel, and upload the per-type Morse coefficients.
+//
+//   ntypes          number of atom types in the host coefficient tables
+//   host_cutsq, host_morse1, host_r0, host_alpha
+//                   packed, in this order, into mor1.x/.y/.z/.w
+//   host_d0, host_offset
+//                   packed, in this order, into mor2.x/.y
+//   host_special_lj four scaling factors copied into sp_lj
+//   nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, _screen
+//                   forwarded unchanged to init_atomic()
+//
+// Returns 0 on success, otherwise the non-zero code from init_atomic().
+template <class numtyp, class acctyp>
+int MOR_GPU_MemoryT::init(const int ntypes, 
+                          double **host_cutsq, double **host_morse1, 
+                          double **host_r0, double **host_alpha, 
+                          double **host_d0, double **host_offset, 
+                          double *host_special_lj, const int nlocal,
+                          const int nall, const int max_nbors,
+                          const int maxspecial, const double cell_size,
+                          const double gpu_split, FILE *_screen) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,morse_gpu_kernel);
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel.
+  // The fast kernel stages a max_shared_types x max_shared_types table with
+  // one thread copying one entry, so the block must supply at least
+  // max_shared_types^2 threads or part of the table would never be loaded.
+  int types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (types<=max_shared_types &&
+      this->_block_size>=max_shared_types*max_shared_types) {
+    types=max_shared_types;
+    shared_types=true;
+  }
+  _types=types;
+
+  // Allocate a host write buffer for data initialization
+  const int write_size=types*types*32;
+  UCL_H_Vec<numtyp> host_write(write_size,*(this->ucl_device),
+                               UCL_WRITE_OPTIMIZED);
+
+  // Zero the whole staging buffer. The packed tables hold 4 (mor1) or 2
+  // (mor2) values per type pair, so clearing only types*types elements would
+  // leave most of the staged range with indeterminate contents.
+  // NOTE(review): presumably type_pack4/type_pack2 leave the padding entries
+  // (type index >= ntypes) untouched - confirm.
+  for (int i=0; i<write_size; i++)
+    host_write[i]=0.0;
+
+  mor1.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,types,mor1,host_write,host_cutsq,host_morse1,
+                         host_r0,host_alpha);
+
+  mor2.alloc(types*types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack2(ntypes,types,mor2,host_write,host_d0,host_offset);
+
+  // Wrap the caller's double array in a host view and copy it to the device
+  UCL_H_Vec<double> dview;
+  sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  dview.view(host_special_lj,4,*(this->ucl_device));
+  ucl_copy(sp_lj,dview,false);
+
+  _allocated=true;
+  this->_max_bytes=mor1.row_bytes()+mor2.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+// Free the coefficient tables and the base-class storage. Does nothing if
+// init() has not completed (or clear() has already run).
+template <class numtyp, class acctyp>
+void MOR_GPU_MemoryT::clear() {
+  if (_allocated) {
+    _allocated=false;
+
+    mor1.clear();
+    mor2.clear();
+    sp_lj.clear();
+    this->clear_atomic();
+  }
+}
+
+// Host memory estimate: the base-class figure plus the size of this object.
+template <class numtyp, class acctyp>
+double MOR_GPU_MemoryT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(MOR_GPU_Memory<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Launch the Morse pair kernel (forces, plus energies/virials on request)
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void MOR_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) {
+  // Kernel arguments are handed over by address, so the flags are first
+  // converted into plain ints
+  int eflag=_eflag ? 1 : 0;
+  int vflag=_vflag ? 1 : 0;
+
+  int ainum=this->ans->inum();
+  int anall=this->atom->nall();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  // A block of BX threads covers BX/_threads_per_atom atoms; round the
+  // number of blocks up so that every atom is assigned
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(ainum)/
+                               (BX/this->_threads_per_atom)));
+
+  this->time_pair.start();
+  if (!shared_types) {
+    // General kernel: coefficient tables stay in global memory and the table
+    // row length (_types) is passed explicitly
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->dev_x.begin(), &mor1.begin(), &mor2.begin(),
+                     &_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(),
+                     &this->_nbor_data->begin(), &this->ans->dev_ans.begin(),
+                     &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum,
+                     &anall, &nbor_pitch, &this->_threads_per_atom);
+  } else {
+    // Fast kernel: coefficient tables are staged in shared memory
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->dev_x.begin(), &mor1.begin(),
+                          &mor2.begin(), &sp_lj.begin(),
+                          &this->nbor->dev_nbor.begin(),
+                          &this->_nbor_data->begin(),
+                          &this->ans->dev_ans.begin(),
+                          &this->ans->dev_engv.begin(), &eflag, &vflag,
+                          &ainum, &anall, &nbor_pitch, 
+                          &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class MOR_GPU_Memory<PRECISION,ACC_PRECISION>;
+
--- a/lib/gpu/morse_gpu_memory.h
+++ b/lib/gpu/morse_gpu_memory.h
@ -0,0 +1,78 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef MOR_GPU_MEMORY_H
+#define MOR_GPU_MEMORY_H
+
+#include "atomic_gpu_memory.h"
+
+/// Device-side data and kernel launcher for the Morse pair style; built on
+/// AtomicGPUMemory, which supplies the atom/neighbor/answer handling.
+template <class numtyp, class acctyp>
+class MOR_GPU_Memory : public AtomicGPUMemory<numtyp, acctyp> {
+ public:
+  MOR_GPU_Memory();
+  ~MOR_GPU_Memory(); 
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param ntypes number of atom types in the host coefficient tables
+    * \param host_cutsq,host_morse1,host_r0,host_alpha per type-pair tables
+    *        packed into mor1 (.x,.y,.z,.w in that order)
+    * \param host_d0,host_offset per type-pair tables packed into mor2 (.x,.y)
+    * \param host_special_lj four special-bond scaling factors
+    * \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * 
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_morse1, double **host_r0, double **host_alpha,
+           double **host_d0, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors, 
+           const int maxspecial, const double cell_size, 
+           const double gpu_split, FILE *screen);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine
+    * NOTE(review): init() in morse_gpu_memory.cpp does not call clear()
+    * itself; presumably the reset happens inside init_atomic() - confirm. **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// mor1.x = cutsq, mor1.y = morse1, mor1.z = r0, mor1.w = alpha
+  UCL_D_Vec<numtyp4> mor1;
+  /// mor2.x = d0, mor2.y = offset
+  UCL_D_Vec<numtyp2> mor2;
+  /// Special LJ values
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Row length of the mor1/mor2 tables: the number of atom types, or the
+  /// padded shared-memory table size when the fast kernel is selected
+  int _types;
+
+ private:
+  /// True once init() has allocated the type tables
+  bool _allocated;
+  /// Launch the pair kernel; the flags request energy / virial accumulation
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+#endif
+
--- a/lib/gpu/nv_kernel_def.h
+++ b/lib/gpu/nv_kernel_def.h
@ -0,0 +1,55 @@
+/* ----------------------------------------------------------------------
+   LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+/*************************************************************************
+                 See pair_gpu_dev_kernel.cu for definitions
+                       of preprocessor constants
+*************************************************************************/
+
+#ifndef NV_KERNEL_DEF
+#define NV_KERNEL_DEF
+
+#include "geryon/ucl_nv_kernel.h"
+// ARCH is the CUDA compute capability the kernels are compiled for; it falls
+// back to 100 when the compiler does not define __CUDA_ARCH__.
+#ifdef __CUDA_ARCH__
+#define ARCH __CUDA_ARCH__
+#else
+#define ARCH 100
+#endif
+
+// Tuning constants for ARCH < 200.
+// BLOCK_PAIR sizes the per-thread reduction scratch array in the pair
+// kernels; MAX_SHARED_TYPES is the dimension of the coefficient tables the
+// fast kernels keep in shared memory.
+#if (ARCH < 200)
+
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 8
+#define BLOCK_NBOR_BUILD 64
+#define BLOCK_PAIR 64
+#define BLOCK_BIO_PAIR 64
+#define MAX_SHARED_TYPES 8
+
+#else
+
+// Tuning constants for ARCH >= 200: larger blocks and larger shared tables
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 8
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_PAIR 128
+#define BLOCK_BIO_PAIR 128
+#define MAX_SHARED_TYPES 11
+
+#endif
+
+#define WARP_SIZE 32
+
+#endif
--- a/lib/gpu/pair_gpu_ans.cpp
+++ b/lib/gpu/pair_gpu_ans.cpp
@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include "pair_gpu_ans.h"
+
+#define PairGPUAnsT PairGPUAns<numtyp,acctyp>
+
+// Construct an empty answer container. Nothing is allocated until init().
+template <class numtyp, class acctyp>
+PairGPUAnsT::PairGPUAns() : _allocated(false),_eflag(false),_vflag(false),
+                            _inum(0),_ilist(NULL),_newton(false) {
+  // bytes_per_atom() and host_memory_usage() read the members below, so
+  // give them the values init() would assign for a style with neither
+  // charge nor rotation instead of leaving them indeterminate until init()
+  // runs.
+  _charge=false;
+  _rot=false;
+  _other=false;
+  _e_fields=1;
+  _ev_fields=6+_e_fields;
+  _max_local=0;
+  _gpu_bytes=0;
+}
+
+// Per-atom storage estimate in bytes: 11 accumulator values, plus 4 more
+// when rotation is enabled and 1 more when charge is enabled.
+template <class numtyp, class acctyp>
+int PairGPUAnsT::bytes_per_atom() const { 
+  int total=11*sizeof(acctyp);
+  total+=_rot ? 4*sizeof(acctyp) : 0;
+  total+=_charge ? sizeof(acctyp) : 0;
+  return total;
+}
+
+// Allocate host and device buffers for 10% more than inum atoms. Returns
+// false if any allocation fails; once one fails the remaining ones are not
+// attempted.
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::alloc(const int inum) {
+  _max_local=static_cast<int>(static_cast<double>(inum)*1.10);
+
+  // Four values per atom, plus four more when rotation is enabled
+  int ans_elements=_rot ? 4+4 : 4;
+
+  // On a CPU device the "device" buffers are just views of the host ones,
+  // which removes the host/device transfers
+  const bool cpuview=(dev->device_type()==UCL_CPU);
+
+  // --------------------------   Host allocations
+  bool success=(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS);
+  if (success)
+    success=(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS);
+
+  // ---------------------------  Device allocations
+  if (!cpuview) {
+    if (success)
+      success=(dev_engv.alloc(_ev_fields*_max_local,*dev,
+                              UCL_WRITE_ONLY)==UCL_SUCCESS);
+    if (success)
+      success=(dev_ans.alloc(ans_elements*_max_local,
+                             *dev,UCL_WRITE_ONLY)==UCL_SUCCESS);
+  } else {
+    dev_engv.view(host_engv);
+    dev_ans.view(host_ans);
+  }
+  _gpu_bytes=dev_engv.row_bytes()+dev_ans.row_bytes();
+
+  // Marked allocated even on failure so that a later clear releases whatever
+  // was obtained
+  _allocated=true;
+  return success;
+}
+
+// Reset the container for a new run on device devi and allocate storage.
+//   inum    expected number of local atoms (1000 is used when 0 is passed)
+//   charge  true to reserve a second energy field
+//   rot     true to reserve room for the second 4-vector per atom
+// Returns the result of alloc().
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::init(const int inum, const bool charge, const bool rot,
+                       UCL_Device &devi) {
+  clear();
+
+  _charge=charge;
+  _rot=rot;
+  _other=_charge || _rot;
+  dev=&devi;
+
+  // One energy field, two with charge; the six virial terms follow
+  _e_fields=_charge ? 2 : 1;
+  _ev_fields=6+_e_fields;
+
+  // Initialize timers for the selected device
+  time_answer.init(*dev);
+  time_answer.zero();
+  _time_cast=0.0;
+  _time_cpu_idle=0.0;
+
+  // Never size the buffers for zero atoms
+  const int ef_inum=(inum==0) ? 1000 : inum;
+  return alloc(ef_inum);
+}
+  
+// Enable charge and/or rotation storage after init(). If either option is
+// newly switched on, the buffers are released and allocated again; returns
+// the result of that allocation, or true when nothing had to change.
+template <class numtyp, class acctyp>
+bool PairGPUAnsT::add_fields(const bool charge, const bool rot) {
+  bool changed=false;
+  if (charge && !_charge) {
+    // Charge adds one energy field
+    _charge=true;
+    _e_fields++;
+    _ev_fields++;
+    changed=true;
+  }
+  if (rot && !_rot) {
+    _rot=true;
+    changed=true;
+  }
+  if (!changed)
+    return true;
+
+  _other=_charge || _rot;
+  const int previous_max=_max_local;
+  clear_resize();
+  return alloc(previous_max);
+}
+
+// Release the four answer buffers (device first, then host) without touching
+// the timers or the bookkeeping fields. No-op when nothing is allocated.
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear_resize() {
+  if (_allocated) {
+    _allocated=false;
+
+    dev_ans.clear();
+    dev_engv.clear();
+    host_ans.clear();
+    host_engv.clear();
+  }
+}
+
+// Release all storage and return the bookkeeping fields to their empty
+// state. The device byte count is reset even when nothing is allocated.
+template <class numtyp, class acctyp>
+void PairGPUAnsT::clear() {
+  _gpu_bytes=0;
+  if (_allocated) {
+    time_answer.clear();
+    clear_resize();
+    _inum=0;
+    _ilist=NULL;
+    _eflag=false;
+    _vflag=false;
+  }
+}
+
+// Host memory estimate in bytes: (per-atom value count) * _max_local *
+// sizeof(acctyp), plus the size of this object. The per-atom count is
+// 4, +1 with charge, +4 with rotation, plus the _ev_fields energy/virial
+// fields.
+template <class numtyp, class acctyp>
+double PairGPUAnsT::host_memory_usage() const {
+  int per_atom=4;
+  per_atom+=_charge ? 1 : 0;
+  per_atom+=_rot ? 4 : 0;
+  per_atom+=_ev_fields;
+  return per_atom*(_max_local)*sizeof(acctyp)+
+         sizeof(PairGPUAns<numtyp,acctyp>);
+}
+  
+// Record which quantities were requested and start copying the results for
+// the current _inum atoms from the device buffers into the host buffers.
+// The copy is bracketed by the time_answer timer.
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom) {
+  time_answer.start();
+  _eflag=eflag;
+  _vflag=vflag;
+  _ef_atom=ef_atom;
+  _vf_atom=vf_atom;
+
+  // Number of energy/virial fields actually produced: drop the energy
+  // fields and/or the six virial fields that were not requested
+  const int csize=_ev_fields-(eflag ? 0 : _e_fields)-(vflag ? 0 : 6);
+  if (csize>0)
+    ucl_copy(host_engv,dev_engv,_inum*csize,true);
+
+  // Four values per atom, doubled when rotation data is present
+  if (!_rot)
+    ucl_copy(host_ans,dev_ans,_inum*4,true);
+  else
+    ucl_copy(host_ans,dev_ans,_inum*4*2,true);
+  time_answer.stop();
+}
+
+// Same as the four-argument copy_answers(), but first stores ilist so that
+// energy_virial() and get_answers() scatter entry i of the results to host
+// index ilist[i] instead of i. The pointer is kept, not copied.
+template <class numtyp, class acctyp>
+void PairGPUAnsT::copy_answers(const bool eflag, const bool vflag,
+                               const bool ef_atom, const bool vf_atom,
+                               int *ilist) {
+  _ilist=ilist;
+  copy_answers(eflag,vflag,ef_atom,vf_atom);
+}
+
+// Accumulate the copied energy/virial results on the host.
+//   eatom   per-atom energies; half of each atom's energy is added when
+//           per-atom energies were requested
+//   vatom   per-atom virials; half of each term is added when requested
+//   virial  six global virial terms; half of the summed terms is added
+// Returns half of the summed energy (0.0 if neither energy nor virial was
+// requested in the last copy_answers()).
+// host_engv is field-major with stride _inum: [energy][6 virial terms], with
+// the energy field absent when it was not requested.
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                  double *virial) {
+  if (!_eflag && !_vflag)
+    return 0.0;
+
+  double evdwl=0.0;
+  double virial_acc[6];
+  for (int k=0; k<6; k++) virial_acc[k]=0.0;
+
+  for (int i=0; i<_inum; i++) {
+    // Destination index: identity unless an ilist was supplied
+    const int dest=(_ilist==NULL) ? i : _ilist[i];
+    acctyp *ap=host_engv.begin()+i;
+    if (_eflag) {
+      evdwl+=*ap;
+      if (_ef_atom)
+        eatom[dest]+=*ap*0.5;
+      ap+=_inum;
+    }
+    if (_vflag) {
+      for (int j=0; j<6; j++) {
+        if (_vf_atom)
+          vatom[dest][j]+=*ap*0.5;
+        virial_acc[j]+=*ap;
+        ap+=_inum;
+      }
+    }
+  }
+  for (int j=0; j<6; j++)
+    virial[j]+=virial_acc[j]*0.5;
+
+  evdwl*=0.5;
+  return evdwl;
+}
+
+// Variant of energy_virial() for styles with a second (Coulomb) energy field.
+// Falls back to the three-argument version when charge is not enabled.
+//   eatom   per-atom energies; half of both energy fields is added when
+//           per-atom energies were requested
+//   vatom   per-atom virials; half of each term is added when requested
+//   virial  six global virial terms; half of the summed terms is added
+//   ecoul   half of the summed second energy field is added to it
+// Returns half of the summed first energy field.
+// host_engv is field-major with stride _inum: [evdwl][ecoul][6 virial terms],
+// with the two energy fields absent when energy was not requested.
+template <class numtyp, class acctyp>
+double PairGPUAnsT::energy_virial(double *eatom, double **vatom,
+                                   double *virial, double &ecoul) {
+  if (!_eflag && !_vflag)
+    return 0.0;
+
+  if (!_charge)
+    return energy_virial(eatom,vatom,virial);
+
+  double evdwl=0.0;
+  double _ecoul=0.0;
+  double virial_acc[6];
+  for (int k=0; k<6; k++) virial_acc[k]=0.0;
+
+  for (int i=0; i<_inum; i++) {
+    // Destination index: identity unless an ilist was supplied
+    const int dest=(_ilist==NULL) ? i : _ilist[i];
+    acctyp *ap=host_engv.begin()+i;
+    if (_eflag) {
+      evdwl+=*ap;
+      if (_ef_atom)
+        eatom[dest]+=*ap*0.5;
+      ap+=_inum;
+      _ecoul+=*ap;
+      if (_ef_atom)
+        eatom[dest]+=*ap*0.5;
+      ap+=_inum;
+    }
+    if (_vflag) {
+      for (int j=0; j<6; j++) {
+        if (_vf_atom)
+          vatom[dest][j]+=*ap*0.5;
+        virial_acc[j]+=*ap;
+        ap+=_inum;
+      }
+    }
+  }
+  for (int j=0; j<6; j++)
+    virial[j]+=virial_acc[j]*0.5;
+
+  evdwl*=0.5;
+  ecoul+=_ecoul*0.5;
+  return evdwl;
+}
+
+// Add the copied per-atom results to the host arrays.
+// host_ans holds _inum records of four values (the fourth is skipped) that
+// are added to f; when rotation is enabled a second run of _inum records
+// follows and is added to tor. Record i goes to row ilist[i] when an ilist
+// was supplied, otherwise to row i.
+template <class numtyp, class acctyp>
+void PairGPUAnsT::get_answers(double **f, double **tor) {
+  acctyp *ap=host_ans.begin();
+
+  for (int i=0; i<_inum; i++) {
+    const int dest=(_ilist==NULL) ? i : _ilist[i];
+    f[dest][0]+=ap[0];
+    f[dest][1]+=ap[1];
+    f[dest][2]+=ap[2];
+    ap+=4;
+  }
+
+  if (_rot) {
+    // ap now points just past the first block of records
+    for (int i=0; i<_inum; i++) {
+      const int dest=(_ilist==NULL) ? i : _ilist[i];
+      tor[dest][0]+=ap[0];
+      tor[dest][1]+=ap[1];
+      tor[dest][2]+=ap[2];
+      ap+=4;
+    }
+  }
+}
+
+template class PairGPUAns<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/pair_gpu_ans.h
+++ b/lib/gpu/pair_gpu_ans.h
@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_ANS_H
+#define PAIR_GPU_ANS_H
+
+#include <math.h>
+#include "mpi.h"
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_timer.h"
+#include "geryon/ocl_mat.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "geryon/nvd_timer.h"
+#include "geryon/nvd_mat.h"
+using namespace ucl_cudadr;
+
+#endif
+
+#include "pair_gpu_precision.h"
+
+template <class numtyp, class acctyp>  // only acctyp is used by the members declared in this class
+class PairGPUAns {  // device/host storage and timing for answers (force, torque, energy/virial) read back from the GPU
+ public:
+  PairGPUAns();
+  ~PairGPUAns() { clear(); }  // destructor releases everything through clear()
+
+  /// Current number of local atoms stored
+  inline int inum() const { return _inum; }
+  /// Set number of local atoms for future copy operations
+  inline void inum(const int n) { _inum=n; }
+  
+  /// Memory usage per atom in this class
+  int bytes_per_atom() const; 
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param inum Number of local atoms; \param charge Presumably true for styles with charge (confirm)
+    * \param rot True if torques are returned as well as forces; \param dev Geryon device **/
+  bool init(const int inum, const bool charge, const bool rot, UCL_Device &dev);
+  
+  /// Check if we have enough device storage and realloc if not
+  inline void resize(const int inum, bool &success) {
+    _inum=inum;
+    if (inum>_max_local) {
+      clear_resize();
+      success = success && alloc(inum);  // alloc() is not called when success is already false
+    }
+  }
+  
+  /// If already initialized by another LAMMPS style, add fields as necessary
+  /** \param charge Presumably true if the added style uses charge (confirm)
+    * \param rot True if torques are returned as well as forces **/
+  bool add_fields(const bool charge, const bool rot);
+  
+  /// Free all memory on host and device needed to realloc for more atoms
+  void clear_resize();
+
+  /// Free all memory on host and device
+  void clear();
+ 
+  /// Return the total amount of host memory used by class in bytes
+  double host_memory_usage() const;
+
+  /// Add copy times to timers
+  inline void acc_timers() {
+    time_answer.add_to_total();
+  }
+
+  /// Zero the copy timer (time_answer)
+  inline void zero_timers() {
+    time_answer.zero();
+  }
+
+  /// Return the total time for host/device data transfer
+  inline double transfer_time() {
+    return time_answer.total_seconds();
+  }
+  
+  /// Return the total time for data cast/pack
+  inline double cast_time() { return _time_cast; }
+  
+  /// Return number of bytes used on device
+  inline double gpu_bytes() { return _gpu_bytes; } 
+
+  // -------------------------COPY FROM GPU -------------------------------
+
+  /// Copy answers from device into read buffer asynchronously
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom);
+
+  /// Copy answers from device into read buffer asynchronously (ilist overload; presumably sets _ilist - confirm)
+  void copy_answers(const bool eflag, const bool vflag,
+                    const bool ef_atom, const bool vf_atom, int *ilist);
+  
+  /// Copy energy and virial data into LAMMPS memory
+  double energy_virial(double *eatom, double **vatom, double *virial);
+
+  /// Copy energy and virial data into LAMMPS memory; also adds the Coulomb energy to ecoul and returns evdwl
+  double energy_virial(double *eatom, double **vatom, double *virial,
+                       double &ecoul);
+
+  /// Add forces and torques from the GPU into a LAMMPS pointer
+  void get_answers(double **f, double **tor);
+  /// Stop the answer timer, then unpack energy/virial and forces; returns the energy_virial() result
+  inline double get_answers(double **f, double **tor, double *eatom, 
+                            double **vatom, double *virial, double &ecoul) {
+    double ta=MPI_Wtime();
+    time_answer.sync_stop();  // NOTE(review): presumably blocks until the device copy has finished - confirm
+    _time_cpu_idle+=MPI_Wtime()-ta;  // wall time spent inside sync_stop()
+    double ts=MPI_Wtime();
+    double evdw=energy_virial(eatom,vatom,virial,ecoul);
+    get_answers(f,tor);
+    _time_cast+=MPI_Wtime()-ts;  // wall time spent in energy_virial() plus get_answers()
+    return evdw;
+  }
+  
+  /// Return the time the CPU was idle waiting for GPU
+  inline double cpu_idle_time() { return _time_cpu_idle; }
+
+  // ------------------------------ DATA ----------------------------------
+
+  /// Force and possibly torque
+  UCL_D_Vec<acctyp> dev_ans;
+  /// Energy and virial per-atom storage
+  UCL_D_Vec<acctyp> dev_engv;
+  
+  /// Force and possibly torque data on host
+  UCL_H_Vec<acctyp> host_ans;
+  /// Energy/virial data on host
+  UCL_H_Vec<acctyp> host_engv;
+  
+  /// Device timers
+  UCL_Timer time_answer;
+  
+  /// Geryon device
+  UCL_Device *dev;
+
+ private:
+  bool alloc(const int inum);  // called from resize() after clear_resize()
+  
+  bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
+  int _max_local, _inum, _e_fields, _ev_fields;  // resize() reallocates when inum exceeds _max_local
+  int *_ilist;  // when non-NULL, get_answers(f,tor) writes record i to atom _ilist[i]
+  double _time_cast, _time_cpu_idle;  // both accumulated by the inline get_answers() above
+  
+  double _gpu_bytes;
+  
+  bool _newton;
+};
+
+#endif
+
--- a/lib/gpu/pair_gpu_atom_ptx.h
+++ b/lib/gpu/pair_gpu_atom_ptx.h
@ -0,0 +1,55 @@
+const char * pair_gpu_atom_kernel = // PTX text (one kernel: kernel_cast_x) stored as adjacent C string literals
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.entry kernel_cast_x (\n"  // packs three f64 coordinates and one s32 type per index into a 4 x f32 record
+"		.param .u64 __cudaparm_kernel_cast_x_x_type,\n"  // output: one 16-byte record per index
+"		.param .u64 __cudaparm_kernel_cast_x_x,\n"  // input: three f64 values per index
+"		.param .u64 __cudaparm_kernel_cast_x_type,\n"  // input: one s32 value per index
+"		.param .s32 __cudaparm_kernel_cast_x_nall)\n"  // indices >= nall do nothing
+"	{\n"
+"	.reg .u32 %r<10>;\n"
+"	.reg .u64 %rd<13>;\n"
+"	.reg .f32 %f<6>;\n"
+"	.reg .f64 %fd<5>;\n"
+"	.reg .pred %p<3>;\n"
+"	.loc	14	34	0\n"
+"$LBB1_kernel_cast_x:\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"  // %r5 = ctaid.x*ntid.x + tid.x
+"	ld.param.s32 	%r6, [__cudaparm_kernel_cast_x_nall];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_1026;\n"  // skip to exit when nall <= %r5
+"	.loc	14	39	0\n"
+"	cvt.u64.s32 	%rd1, %r5;\n"
+"	ld.param.u64 	%rd2, [__cudaparm_kernel_cast_x_type];\n"
+"	mul.lo.u64 	%rd3, %rd1, 4;\n"
+"	add.u64 	%rd4, %rd2, %rd3;\n"
+"	ld.global.s32 	%r7, [%rd4+0];\n"  // %r7 = type[%r5]
+"	cvt.rn.f32.s32 	%f1, %r7;\n"  // type is carried as f32 in the output record
+"	.loc	14	42	0\n"
+"	ld.param.u64 	%rd5, [__cudaparm_kernel_cast_x_x];\n"
+"	mul.lo.s32 	%r8, %r5, 3;\n"
+"	cvt.u64.s32 	%rd6, %r8;\n"
+"	mul.lo.u64 	%rd7, %rd6, 8;\n"
+"	add.u64 	%rd8, %rd5, %rd7;\n"  // %rd8 = &x[3*%r5]
+"	ld.global.f64 	%fd1, [%rd8+8];\n"
+"	cvt.rn.f32.f64 	%f2, %fd1;\n"
+"	.loc	14	43	0\n"
+"	ld.global.f64 	%fd2, [%rd8+16];\n"
+"	cvt.rn.f32.f64 	%f3, %fd2;\n"
+"	.loc	14	44	0\n"
+"	ld.param.u64 	%rd9, [__cudaparm_kernel_cast_x_x_type];\n"
+"	mul.lo.u64 	%rd10, %rd1, 16;\n"
+"	add.u64 	%rd11, %rd9, %rd10;\n"  // %rd11 = x_type + 16*%r5 bytes
+"	ld.global.f64 	%fd3, [%rd8+0];\n"
+"	cvt.rn.f32.f64 	%f4, %fd3;\n"
+"	st.global.v4.f32 	[%rd11+0], {%f4,%f2,%f3,%f1};\n"  // record = {x[3i], x[3i+1], x[3i+2], type[i]}
+"$Lt_0_1026:\n"
+"	.loc	14	46	0\n"
+"	exit;\n"
+"$LDWend_kernel_cast_x:\n"
+"	}\n"
+;
--- a/lib/gpu/pair_gpu_build_ptx.h
+++ b/lib/gpu/pair_gpu_build_ptx.h
@ -0,0 +1,765 @@
+const char * pair_gpu_build_kernel = // PTX text for the neighbor-build kernels, stored as adjacent C string literals
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.tex .u64 neigh_tex;\n"  // texture fetched by calc_cell_id and calc_neigh_list_cell below
+"	.entry transpose (\n"  // out[c*rows_in + r] = in[r*columns_in + c], staged through an 8x8 shared tile
+"		.param .u64 __cudaparm_transpose_out,\n"
+"		.param .u64 __cudaparm_transpose_in,\n"
+"		.param .s32 __cudaparm_transpose_columns_in,\n"
+"		.param .s32 __cudaparm_transpose_rows_in)\n"
+"	{\n"
+"	.reg .u16 %rh<4>;\n"
+"	.reg .u32 %r<30>;\n"
+"	.reg .u64 %rd<23>;\n"
+"	.reg .f32 %f<4>;\n"
+"	.reg .pred %p<4>;\n"
+"	.shared .align 4 .b8 __cuda_block24[288];\n"  // 288 bytes = 8 rows x 9 f32 (row pitch 9, see the *9 index math)
+"	.loc	14	62	0\n"
+"$LBB1_transpose:\n"
+"	mov.u16 	%rh1, %ctaid.x;\n"
+"	mul.wide.u16 	%r1, %rh1, 8;\n"  // %r1 = ctaid.x*8
+"	mov.u16 	%rh2, %ctaid.y;\n"
+"	mul.wide.u16 	%r2, %rh2, 8;\n"  // %r2 = ctaid.y*8
+"	cvt.u32.u16 	%r3, %tid.x;\n"
+"	add.u32 	%r4, %r1, %r3;\n"  // %r4 = input column
+"	cvt.u32.u16 	%r5, %tid.y;\n"
+"	add.u32 	%r6, %r2, %r5;\n"  // %r6 = input row
+"	ld.param.u32 	%r7, [__cudaparm_transpose_rows_in];\n"
+"	ld.param.u32 	%r8, [__cudaparm_transpose_columns_in];\n"
+"	set.lt.u32.u32 	%r9, %r4, %r8;\n"
+"	neg.s32 	%r10, %r9;\n"
+"	set.lt.u32.u32 	%r11, %r6, %r7;\n"
+"	neg.s32 	%r12, %r11;\n"
+"	and.b32 	%r13, %r10, %r12;\n"  // nonzero only if column < columns_in and row < rows_in (unsigned compares)
+"	mov.u32 	%r14, 0;\n"
+"	setp.eq.s32 	%p1, %r13, %r14;\n"
+"	@%p1 bra 	$Lt_0_2306;\n"  // out-of-range threads skip the load but still reach the barrier
+"	.loc	14	74	0\n"
+"	mov.u64 	%rd1, __cuda_block24;\n"
+"	ld.param.u64 	%rd2, [__cudaparm_transpose_in];\n"
+"	mul.lo.u32 	%r15, %r6, %r8;\n"
+"	add.u32 	%r16, %r4, %r15;\n"
+"	cvt.u64.u32 	%rd3, %r16;\n"
+"	mul.lo.u64 	%rd4, %rd3, 4;\n"
+"	add.u64 	%rd5, %rd2, %rd4;\n"
+"	ld.global.s32 	%r17, [%rd5+0];\n"  // %r17 = in[row*columns_in + column]
+"	cvt.rn.f32.s32 	%f1, %r17;\n"  // NOTE(review): s32 -> f32 here and f32 -> s32 below; values above 2^24 may not round-trip exactly
+"	cvt.u64.u32 	%rd6, %r3;\n"
+"	cvt.u64.u32 	%rd7, %r5;\n"
+"	mul.lo.u64 	%rd8, %rd7, 9;\n"
+"	add.u64 	%rd9, %rd6, %rd8;\n"  // tile index = tid.x + tid.y*9
+"	mul.lo.u64 	%rd10, %rd9, 4;\n"
+"	add.u64 	%rd11, %rd1, %rd10;\n"
+"	st.shared.f32 	[%rd11+0], %f1;\n"
+"$Lt_0_2306:\n"
+"	mov.u64 	%rd1, __cuda_block24;\n"
+"	.loc	14	76	0\n"
+"	bar.sync 	0;\n"
+"	add.u32 	%r18, %r1, %r5;\n"  // %r18 = ctaid.x*8 + tid.y (output row)
+"	add.u32 	%r19, %r2, %r3;\n"  // %r19 = ctaid.y*8 + tid.x (output column)
+"	set.lt.u32.u32 	%r20, %r18, %r8;\n"
+"	neg.s32 	%r21, %r20;\n"
+"	set.lt.u32.u32 	%r22, %r19, %r7;\n"
+"	neg.s32 	%r23, %r22;\n"
+"	and.b32 	%r24, %r21, %r23;\n"
+"	mov.u32 	%r25, 0;\n"
+"	setp.eq.s32 	%p2, %r24, %r25;\n"
+"	@%p2 bra 	$Lt_0_2818;\n"
+"	.loc	14	81	0\n"
+"	cvt.u64.u32 	%rd12, %r5;\n"
+"	cvt.u64.u32 	%rd13, %r3;\n"
+"	mul.lo.u64 	%rd14, %rd13, 9;\n"
+"	add.u64 	%rd15, %rd12, %rd14;\n"  // tile index = tid.y + tid.x*9 (x/y swapped relative to the store)
+"	mul.lo.u64 	%rd16, %rd15, 4;\n"
+"	add.u64 	%rd17, %rd1, %rd16;\n"
+"	ld.shared.f32 	%f2, [%rd17+0];\n"
+"	cvt.rzi.s32.f32 	%r26, %f2;\n"
+"	ld.param.u64 	%rd18, [__cudaparm_transpose_out];\n"
+"	mul.lo.u32 	%r27, %r18, %r7;\n"
+"	add.u32 	%r28, %r19, %r27;\n"
+"	cvt.u64.u32 	%rd19, %r28;\n"
+"	mul.lo.u64 	%rd20, %rd19, 4;\n"
+"	add.u64 	%rd21, %rd18, %rd20;\n"
+"	st.global.s32 	[%rd21+0], %r26;\n"  // out[%r18*rows_in + %r19]
+"$Lt_0_2818:\n"
+"	.loc	14	82	0\n"
+"	exit;\n"
+"$LDWend_transpose:\n"
+"	}\n"
+"	.entry calc_cell_id (\n"  // per index: cell_id = ix + iy*ncellx + iz*ncellx*ncelly, particle_id = index
+"		.param .u64 __cudaparm_calc_cell_id_pos,\n"  // never loaded in this PTX; coordinates are fetched from neigh_tex
+"		.param .u64 __cudaparm_calc_cell_id_cell_id,\n"
+"		.param .u64 __cudaparm_calc_cell_id_particle_id,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxlo0,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxlo1,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxlo2,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxhi0,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxhi1,\n"
+"		.param .f32 __cudaparm_calc_cell_id_boxhi2,\n"
+"		.param .f32 __cudaparm_calc_cell_id_cell_size,\n"
+"		.param .s32 __cudaparm_calc_cell_id_ncellx,\n"
+"		.param .s32 __cudaparm_calc_cell_id_ncelly,\n"
+"		.param .s32 __cudaparm_calc_cell_id_nall)\n"
+"	{\n"
+"	.reg .u16 %rh<4>;\n"
+"	.reg .u32 %r<20>;\n"
+"	.reg .u64 %rd<8>;\n"
+"	.reg .f32 %f<35>;\n"
+"	.reg .f64 %fd<11>;\n"
+"	.reg .pred %p<3>;\n"
+"	.loc	14	88	0\n"
+"$LBB1_calc_cell_id:\n"
+"	cvt.u32.u16 	%r1, %tid.x;\n"
+"	mov.u16 	%rh1, %ctaid.x;\n"
+"	mov.u16 	%rh2, %ntid.x;\n"
+"	mul.wide.u16 	%r2, %rh1, %rh2;\n"
+"	add.u32 	%r3, %r1, %r2;\n"  // %r3 = tid.x + ctaid.x*ntid.x
+"	ld.param.s32 	%r4, [__cudaparm_calc_cell_id_nall];\n"
+"	setp.le.s32 	%p1, %r4, %r3;\n"
+"	@%p1 bra 	$Lt_1_1026;\n"  // nothing to do when nall <= %r3
+"	mov.s32 	%r5, %r3;\n"
+"	mov.s32 	%r6, 0;\n"
+"	mov.s32 	%r7, 0;\n"
+"	mov.s32 	%r8, 0;\n"
+"	tex.1d.v4.f32.s32 {%f1,%f2,%f3,%f4},[neigh_tex,{%r5,%r6,%r7,%r8}];\n"  // 4 x f32 texel at index %r3; %f4 is not used
+"	.loc	14	92	0\n"
+"	mov.f32 	%f5, %f1;\n"
+"	mov.f32 	%f6, %f2;\n"
+"	mov.f32 	%f7, %f3;\n"
+"	.loc	14	105	0\n"
+"	ld.param.f32 	%f8, [__cudaparm_calc_cell_id_cell_size];\n"
+"	neg.f32 	%f9, %f8;\n"  // lower clamp bound = -cell_size
+"	ld.param.f32 	%f10, [__cudaparm_calc_cell_id_boxlo0];\n"
+"	ld.param.f32 	%f11, [__cudaparm_calc_cell_id_boxlo2];\n"
+"	ld.param.f32 	%f12, [__cudaparm_calc_cell_id_boxlo1];\n"
+"	ld.param.u32 	%r9, [__cudaparm_calc_cell_id_ncellx];\n"
+"	ld.param.u32 	%r10, [__cudaparm_calc_cell_id_ncelly];\n"
+"	ld.param.f32 	%f13, [__cudaparm_calc_cell_id_boxhi2];\n"
+"	sub.f32 	%f14, %f13, %f11;\n"
+"	add.f32 	%f15, %f8, %f14;\n"  // upper clamp bound = (boxhi2-boxlo2) + cell_size
+"	sub.f32 	%f16, %f7, %f11;\n"
+"	max.f32 	%f17, %f9, %f16;\n"
+"	min.f32 	%f18, %f15, %f17;\n"  // third component minus boxlo2, clamped to [-cell_size, extent+cell_size]
+"	div.approx.f32 	%f19, %f18, %f8;\n"
+"	cvt.f64.f32 	%fd1, %f19;\n"
+"	mov.f64 	%fd2, 0d3ff0000000000000;	\n"  // 1.0
+"	add.f64 	%fd3, %fd1, %fd2;\n"
+"	cvt.rzi.u32.f64 	%r11, %fd3;\n"  // iz = trunc(clamped/cell_size + 1.0)
+"	mul.lo.u32 	%r12, %r9, %r11;\n"
+"	mul.lo.u32 	%r13, %r10, %r12;\n"  // iz*ncellx*ncelly
+"	ld.param.f32 	%f20, [__cudaparm_calc_cell_id_boxhi1];\n"
+"	sub.f32 	%f21, %f20, %f12;\n"
+"	add.f32 	%f22, %f8, %f21;\n"
+"	sub.f32 	%f23, %f6, %f12;\n"
+"	max.f32 	%f24, %f9, %f23;\n"
+"	min.f32 	%f25, %f22, %f24;\n"
+"	div.approx.f32 	%f26, %f25, %f8;\n"
+"	cvt.f64.f32 	%fd4, %f26;\n"
+"	mov.f64 	%fd5, 0d3ff0000000000000;	\n"
+"	add.f64 	%fd6, %fd4, %fd5;\n"
+"	cvt.rzi.u32.f64 	%r14, %fd6;\n"  // iy, same clamp/divide/+1 sequence with boxlo1/boxhi1
+"	mul.lo.u32 	%r15, %r9, %r14;\n"
+"	add.u32 	%r16, %r13, %r15;\n"  // iz*ncellx*ncelly + iy*ncellx
+"	ld.param.f32 	%f27, [__cudaparm_calc_cell_id_boxhi0];\n"
+"	sub.f32 	%f28, %f27, %f10;\n"
+"	add.f32 	%f29, %f8, %f28;\n"
+"	sub.f32 	%f30, %f5, %f10;\n"
+"	max.f32 	%f31, %f9, %f30;\n"
+"	min.f32 	%f32, %f29, %f31;\n"
+"	div.approx.f32 	%f33, %f32, %f8;\n"
+"	cvt.f64.f32 	%fd7, %f33;\n"
+"	mov.f64 	%fd8, 0d3ff0000000000000;	\n"
+"	add.f64 	%fd9, %fd7, %fd8;\n"
+"	cvt.rzi.u32.f64 	%r17, %fd9;\n"  // ix, same sequence with boxlo0/boxhi0
+"	add.u32 	%r18, %r16, %r17;\n"  // final cell id
+"	.loc	14	109	0\n"
+"	cvt.u64.s32 	%rd1, %r3;\n"
+"	mul.lo.u64 	%rd2, %rd1, 4;\n"
+"	ld.param.u64 	%rd3, [__cudaparm_calc_cell_id_cell_id];\n"
+"	add.u64 	%rd4, %rd3, %rd2;\n"
+"	st.global.u32 	[%rd4+0], %r18;\n"  // cell_id[%r3] = cell id
+"	.loc	14	110	0\n"
+"	ld.param.u64 	%rd5, [__cudaparm_calc_cell_id_particle_id];\n"
+"	add.u64 	%rd6, %rd5, %rd2;\n"
+"	st.global.s32 	[%rd6+0], %r3;\n"  // particle_id[%r3] = %r3
+"$Lt_1_1026:\n"
+"	.loc	14	112	0\n"
+"	exit;\n"
+"$LDWend_calc_cell_id:\n"
+"	}\n"
+"	.entry kernel_calc_cell_counts (\n"  // fills cell_counts[] from cell_id[] by comparing each entry with its predecessor
+"		.param .u64 __cudaparm_kernel_calc_cell_counts_cell_id,\n"
+"		.param .u64 __cudaparm_kernel_calc_cell_counts_cell_counts,\n"
+"		.param .s32 __cudaparm_kernel_calc_cell_counts_nall,\n"
+"		.param .s32 __cudaparm_kernel_calc_cell_counts_ncell)\n"
+"	{\n"
+"	.reg .u16 %rh<4>;\n"
+"	.reg .u32 %r<31>;\n"
+"	.reg .u64 %rd<15>;\n"
+"	.reg .pred %p<13>;\n"
+"	.loc	14	115	0\n"
+"$LBB1_kernel_calc_cell_counts:\n"
+"	mov.u16 	%rh1, %ctaid.x;\n"
+"	mov.u16 	%rh2, %ntid.x;\n"
+"	mul.wide.u16 	%r1, %rh1, %rh2;\n"
+"	cvt.u32.u16 	%r2, %tid.x;\n"
+"	add.u32 	%r3, %r2, %r1;\n"  // %r3 = tid.x + ctaid.x*ntid.x
+"	ld.param.s32 	%r4, [__cudaparm_kernel_calc_cell_counts_nall];\n"
+"	setp.gt.s32 	%p1, %r4, %r3;\n"
+"	@!%p1 bra 	$Lt_2_7426;\n"  // exit unless %r3 < nall
+"	.loc	14	118	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_calc_cell_counts_cell_id];\n"
+"	cvt.u64.s32 	%rd2, %r3;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	add.u64 	%rd4, %rd1, %rd3;\n"
+"	ld.global.s32 	%r5, [%rd4+0];\n"  // %r5 = cell_id[%r3]
+"	mov.u32 	%r6, 0;\n"
+"	setp.ne.s32 	%p2, %r3, %r6;\n"
+"	@%p2 bra 	$Lt_2_7938;\n"  // the block below runs only for index 0
+"	add.s32 	%r7, %r5, 1;\n"
+"	mov.u32 	%r8, 0;\n"
+"	setp.le.s32 	%p3, %r7, %r8;\n"
+"	@%p3 bra 	$Lt_2_8450;\n"
+"	mov.s32 	%r9, %r7;\n"
+"	ld.param.u64 	%rd5, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
+"	mov.s32 	%r10, 0;\n"
+"	mov.s32 	%r11, %r9;\n"
+"$Lt_2_8962:\n"
+"	.loc	14	123	0\n"
+"	mov.s32 	%r12, 0;\n"
+"	st.global.s32 	[%rd5+0], %r12;\n"  // cell_counts[0 .. %r5] = 0
+"	add.s32 	%r10, %r10, 1;\n"
+"	add.u64 	%rd5, %rd5, 4;\n"
+"	setp.ne.s32 	%p4, %r7, %r10;\n"
+"	@%p4 bra 	$Lt_2_8962;\n"
+"$Lt_2_8450:\n"
+"$Lt_2_7938:\n"
+"	sub.s32 	%r13, %r4, 1;\n"
+"	setp.ne.s32 	%p5, %r3, %r13;\n"
+"	@%p5 bra 	$Lt_2_9474;\n"  // the block below runs only for index nall-1
+"	.loc	14	126	0\n"
+"	add.s32 	%r7, %r5, 1;\n"
+"	mov.s32 	%r14, %r7;\n"
+"	ld.param.s32 	%r15, [__cudaparm_kernel_calc_cell_counts_ncell];\n"
+"	setp.gt.s32 	%p6, %r7, %r15;\n"
+"	@%p6 bra 	$Lt_2_9986;\n"
+"	sub.s32 	%r16, %r15, %r5;\n"
+"	add.s32 	%r17, %r15, 1;\n"
+"	ld.param.u64 	%rd6, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
+"	cvt.u64.s32 	%rd7, %r7;\n"
+"	mul.lo.u64 	%rd8, %rd7, 4;\n"
+"	add.u64 	%rd9, %rd6, %rd8;\n"
+"	mov.s32 	%r18, %r16;\n"
+"$Lt_2_10498:\n"
+"	.loc	14	127	0\n"
+"	st.global.s32 	[%rd9+0], %r4;\n"  // cell_counts[%r5+1 .. ncell] = nall
+"	add.s32 	%r14, %r14, 1;\n"
+"	add.u64 	%rd9, %rd9, 4;\n"
+"	setp.ne.s32 	%p7, %r17, %r14;\n"
+"	@%p7 bra 	$Lt_2_10498;\n"
+"$Lt_2_9986:\n"
+"$Lt_2_9474:\n"
+"	selp.s32 	%r19, 1, 0, %p1;\n"
+"	mov.s32 	%r20, 0;\n"
+"	set.gt.u32.s32 	%r21, %r3, %r20;\n"
+"	neg.s32 	%r22, %r21;\n"
+"	and.b32 	%r23, %r19, %r22;\n"
+"	mov.u32 	%r24, 0;\n"
+"	setp.eq.s32 	%p8, %r23, %r24;\n"
+"	@%p8 bra 	$Lt_2_11010;\n"  // the block below needs %r3 > 0 (and %r3 < nall)
+"	.loc	14	131	0\n"
+"	ld.global.s32 	%r25, [%rd4+-4];\n"  // %r25 = cell_id[%r3-1]
+"	setp.eq.s32 	%p9, %r5, %r25;\n"
+"	@%p9 bra 	$Lt_2_11522;\n"  // same cell as predecessor: nothing to write
+"	.loc	14	133	0\n"
+"	add.s32 	%r26, %r25, 1;\n"
+"	mov.s32 	%r27, %r26;\n"
+"	setp.gt.s32 	%p10, %r26, %r5;\n"
+"	@%p10 bra 	$Lt_2_12034;\n"
+"	sub.s32 	%r28, %r5, %r25;\n"
+"	add.s32 	%r7, %r5, 1;\n"
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_calc_cell_counts_cell_counts];\n"
+"	cvt.u64.s32 	%rd11, %r26;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd10, %rd12;\n"
+"	mov.s32 	%r29, %r28;\n"
+"$Lt_2_12546:\n"
+"	.loc	14	134	0\n"
+"	st.global.s32 	[%rd13+0], %r3;\n"  // cell_counts[%r25+1 .. %r5] = %r3
+"	add.s32 	%r27, %r27, 1;\n"
+"	add.u64 	%rd13, %rd13, 4;\n"
+"	setp.ne.s32 	%p11, %r7, %r27;\n"
+"	@%p11 bra 	$Lt_2_12546;\n"
+"$Lt_2_12034:\n"
+"$Lt_2_11522:\n"
+"$Lt_2_11010:\n"
+"$Lt_2_7426:\n"
+"	.loc	14	138	0\n"
+"	exit;\n"
+"$LDWend_kernel_calc_cell_counts:\n"
+"	}\n"
+"	.entry calc_neigh_list_cell (\n"  // one thread block per cell: scans that cell and its adjacent cells and writes per-particle neighbor lists
+"		.param .u64 __cudaparm_calc_neigh_list_cell_pos,\n"  // never loaded in this PTX; coordinates are fetched from neigh_tex
+"		.param .u64 __cudaparm_calc_neigh_list_cell_cell_particle_id,\n"
+"		.param .u64 __cudaparm_calc_neigh_list_cell_cell_counts,\n"
+"		.param .u64 __cudaparm_calc_neigh_list_cell_nbor_list,\n"  // written for particle ids < inum
+"		.param .u64 __cudaparm_calc_neigh_list_cell_host_nbor_list,\n"  // written for particle ids >= inum (and < nt)
+"		.param .s32 __cudaparm_calc_neigh_list_cell_neigh_bin_size,\n"  // stores stop once the per-particle counter reaches this value
+"		.param .f32 __cudaparm_calc_neigh_list_cell_cell_size,\n"  // its square is the squared-distance cutoff
+"		.param .s32 __cudaparm_calc_neigh_list_cell_ncellx,\n"
+"		.param .s32 __cudaparm_calc_neigh_list_cell_ncelly,\n"
+"		.param .s32 __cudaparm_calc_neigh_list_cell_ncellz,\n"
+"		.param .s32 __cudaparm_calc_neigh_list_cell_inum,\n"
+"		.param .s32 __cudaparm_calc_neigh_list_cell_nt,\n"
+"		.param .s32 __cudaparm_calc_neigh_list_cell_nall)\n"
+"	{\n"
+"	.reg .u32 %r<105>;\n"
+"	.reg .u64 %rd<43>;\n"
+"	.reg .f32 %f<43>;\n"
+"	.reg .f64 %fd<4>;\n"
+"	.reg .pred %p<24>;\n"
+"	.shared .align 16 .b8 __cuda_pos_sh480[1024];\n"  // 1024 bytes = 64 entries x 16 bytes (three f32 stored per entry)
+"	.shared .align 4 .b8 __cuda_cell_list_sh1504[256];\n"  // 256 bytes = 64 s32 particle ids
+"	.loc	14	148	0\n"
+"$LBB1_calc_neigh_list_cell:\n"
+"	.loc	14	160	0\n"
+"	ld.param.u32 	%r1, [__cudaparm_calc_neigh_list_cell_ncelly];\n"
+"	cvt.u32.u16 	%r2, %ctaid.y;\n"
+"	rem.u32 	%r3, %r2, %r1;\n"  // %r3 = ctaid.y % ncelly (cell y index)
+"	div.u32 	%r4, %r2, %r1;\n"  // %r4 = ctaid.y / ncelly (cell z index)
+"	ld.param.s32 	%r5, [__cudaparm_calc_neigh_list_cell_ncellx];\n"
+"	mul.lo.s32 	%r6, %r5, %r3;\n"
+"	mul.lo.s32 	%r7, %r5, %r4;\n"
+"	mul.lo.s32 	%r8, %r7, %r1;\n"
+"	cvt.s32.u16 	%r9, %ctaid.x;\n"  // %r9 = ctaid.x (cell x index)
+"	ld.param.u64 	%rd1, [__cudaparm_calc_neigh_list_cell_cell_counts];\n"
+"	add.s32 	%r10, %r6, %r8;\n"
+"	add.s32 	%r11, %r9, %r10;\n"  // %r11 = x + y*ncellx + z*ncellx*ncelly
+"	cvt.u64.s32 	%rd2, %r11;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	add.u64 	%rd4, %rd1, %rd3;\n"
+"	ld.global.s32 	%r12, [%rd4+0];\n"  // %r12 = cell_counts[cell]
+"	.loc	14	161	0\n"
+"	ld.global.s32 	%r13, [%rd4+4];\n"  // %r13 = cell_counts[cell+1]
+"	.loc	14	169	0\n"
+"	sub.s32 	%r14, %r13, %r12;\n"
+"	cvt.u32.u16 	%r15, %ntid.x;\n"
+"	cvt.rn.f32.u32 	%f1, %r15;\n"
+"	cvt.rn.f32.s32 	%f2, %r14;\n"
+"	div.approx.f32 	%f3, %f2, %f1;\n"
+"	cvt.rpi.f32.f32 	%f4, %f3;\n"  // %f4 = ceil((%r13-%r12)/ntid.x) = number of outer passes
+"	mov.f32 	%f5, 0f00000000;     	\n"
+"	setp.gt.f32 	%p1, %f4, %f5;\n"
+"	@!%p1 bra 	$Lt_3_14594;\n"  // nothing to do when the pass count is not positive
+"	sub.s32 	%r16, %r3, 1;\n"
+"	mov.s32 	%r17, 0;\n"
+"	max.s32 	%r18, %r16, %r17;\n"  // y range start = max(y-1, 0)
+"	sub.s32 	%r19, %r1, 1;\n"
+"	add.s32 	%r20, %r3, 1;\n"
+"	min.s32 	%r21, %r19, %r20;\n"  // y range end = min(ncelly-1, y+1)
+"	ld.param.s32 	%r22, [__cudaparm_calc_neigh_list_cell_ncellz];\n"
+"	sub.s32 	%r23, %r22, 1;\n"
+"	add.s32 	%r24, %r4, 1;\n"
+"	min.s32 	%r25, %r23, %r24;\n"  // z range end = min(ncellz-1, z+1)
+"	sub.s32 	%r26, %r9, 1;\n"
+"	mov.s32 	%r27, 0;\n"
+"	max.s32 	%r28, %r26, %r27;\n"  // x range start = max(x-1, 0)
+"	add.s32 	%r29, %r9, 1;\n"
+"	sub.s32 	%r30, %r5, 1;\n"
+"	min.s32 	%r31, %r29, %r30;\n"  // x range end = min(x+1, ncellx-1)
+"	cvt.s32.u16 	%r32, %tid.x;\n"
+"	add.s32 	%r33, %r12, %r32;\n"
+"	mov.u32 	%r34, 0;\n"
+"	ld.param.s32 	%r35, [__cudaparm_calc_neigh_list_cell_inum];\n"
+"	cvt.u64.s32 	%rd5, %r35;\n"
+"	sub.s32 	%r36, %r4, 1;\n"
+"	mov.s32 	%r37, %r33;\n"
+"	mov.s32 	%r38, 0;\n"
+"	max.s32 	%r39, %r36, %r38;\n"  // z range start = max(z-1, 0)
+"	setp.ge.s32 	%p2, %r25, %r39;\n"
+"	ld.param.s32 	%r40, [__cudaparm_calc_neigh_list_cell_nt];\n"
+"	ld.param.s32 	%r41, [__cudaparm_calc_neigh_list_cell_nall];\n"
+"	mov.s32 	%r42, 0;\n"
+"	mov.u64 	%rd6, __cuda_pos_sh480;\n"
+"	mov.u64 	%rd7, __cuda_cell_list_sh1504;\n"
+"$Lt_3_15106:\n"  // outer pass loop: each pass advances the per-thread slot by ntid.x
+"	.loc	14	171	0\n"
+"	mov.s32 	%r43, %r41;\n"  // particle id defaults to nall when this thread has no slot in the cell
+"	setp.ge.s32 	%p3, %r37, %r13;\n"
+"	@%p3 bra 	$Lt_3_15362;\n"
+"	.loc	14	177	0\n"
+"	ld.param.u64 	%rd8, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
+"	add.u32 	%r44, %r33, %r34;\n"
+"	cvt.u64.s32 	%rd9, %r44;\n"
+"	mul.lo.u64 	%rd10, %rd9, 4;\n"
+"	add.u64 	%rd11, %rd8, %rd10;\n"
+"	ld.global.s32 	%r43, [%rd11+0];\n"  // %r43 = cell_particle_id[%r12 + tid.x + pass offset]
+"$Lt_3_15362:\n"
+"	setp.lt.s32 	%p4, %r43, %r40;\n"
+"	@!%p4 bra 	$Lt_3_15874;\n"  // coordinates are fetched only when %r43 < nt
+"	mov.s32 	%r45, %r43;\n"
+"	mov.s32 	%r46, 0;\n"
+"	mov.s32 	%r47, 0;\n"
+"	mov.s32 	%r48, 0;\n"
+"	tex.1d.v4.f32.s32 {%f6,%f7,%f8,%f9},[neigh_tex,{%r45,%r46,%r47,%r48}];\n"
+"	.loc	14	180	0\n"
+"	mov.f32 	%f10, %f6;\n"
+"	mov.f32 	%f11, %f7;\n"
+"	mov.f32 	%f12, %f8;\n"
+"	mov.f32 	%f13, %f10;\n"
+"	mov.f32 	%f14, %f11;\n"
+"	mov.f32 	%f15, %f12;\n"  // %f13..%f15 hold this thread's own coordinates
+"$Lt_3_15874:\n"
+"	cvt.u64.s32 	%rd12, %r43;\n"
+"	mul.lo.u64 	%rd13, %rd12, 4;\n"
+"	setp.ge.s32 	%p5, %r43, %r35;\n"
+"	@%p5 bra 	$Lt_3_16642;\n"  // ids >= inum use the host_nbor_list layout below
+"	.loc	14	183	0\n"
+"	mov.s32 	%r49, %r35;\n"  // entry stride = inum
+"	.loc	14	184	0\n"
+"	ld.param.u64 	%rd14, [__cudaparm_calc_neigh_list_cell_nbor_list];\n"
+"	add.u64 	%rd15, %rd12, %rd5;\n"
+"	mul.lo.u64 	%rd16, %rd15, 4;\n"
+"	add.u64 	%rd17, %rd14, %rd16;\n"  // %rd17 = &nbor_list[%r43 + inum] (count slot)
+"	.loc	14	186	0\n"
+"	add.u64 	%rd18, %rd13, %rd14;\n"
+"	st.global.s32 	[%rd18+0], %r43;\n"  // nbor_list[%r43] = %r43
+"	bra.uni 	$Lt_3_16386;\n"
+"$Lt_3_16642:\n"
+"	.loc	14	188	0\n"
+"	sub.s32 	%r49, %r40, %r35;\n"  // entry stride = nt - inum
+"	.loc	14	189	0\n"
+"	ld.param.u64 	%rd19, [__cudaparm_calc_neigh_list_cell_host_nbor_list];\n"
+"	add.u64 	%rd20, %rd19, %rd13;\n"
+"	mul.lo.u64 	%rd21, %rd5, 4;\n"
+"	sub.u64 	%rd17, %rd20, %rd21;\n"  // %rd17 = &host_nbor_list[%r43 - inum] (count slot)
+"$Lt_3_16386:\n"
+"	cvt.u64.s32 	%rd22, %r49;\n"
+"	mul.lo.u64 	%rd23, %rd22, 4;\n"
+"	add.u64 	%rd24, %rd17, %rd23;\n"  // %rd24 = first neighbor slot, one stride past the count slot
+"	.loc	14	195	0\n"
+"	mov.s32 	%r50, %r39;\n"
+"	mov.s32 	%r51, 0;\n"  // %r51 = neighbor counter, reset every pass
+"	@!%p2 bra 	$Lt_3_25090;\n"
+"	sub.s32 	%r52, %r25, %r39;\n"
+"	add.s32 	%r53, %r52, 1;\n"
+"	setp.le.s32 	%p6, %r18, %r21;\n"
+"	add.s32 	%r54, %r25, 1;\n"
+"	mov.s32 	%r55, %r53;\n"
+"$Lt_3_17410:\n"  // loop over z of the scanned cells (%r50)
+"	.loc	14	196	0\n"
+"	mov.s32 	%r56, %r18;\n"
+"	@!%p6 bra 	$Lt_3_17666;\n"
+"	sub.s32 	%r57, %r21, %r18;\n"
+"	add.s32 	%r58, %r57, 1;\n"
+"	setp.ge.s32 	%p7, %r31, %r28;\n"
+"	add.s32 	%r59, %r21, 1;\n"
+"	mov.s32 	%r60, %r58;\n"
+"$Lt_3_18178:\n"  // loop over y of the scanned cells (%r56)
+"	@!%p7 bra 	$Lt_3_18434;\n"
+"	sub.s32 	%r61, %r31, %r28;\n"
+"	add.s32 	%r62, %r61, 1;\n"
+"	mul.lo.s32 	%r63, %r56, %r5;\n"
+"	mul.lo.s32 	%r64, %r50, %r5;\n"
+"	mul.lo.s32 	%r65, %r64, %r1;\n"
+"	add.s32 	%r66, %r31, 1;\n"
+"	add.s32 	%r67, %r63, %r65;\n"
+"	add.s32 	%r68, %r67, %r28;\n"  // linear index of the first scanned cell in this row
+"	add.s32 	%r69, %r66, %r67;\n"  // one past the last scanned cell in this row
+"	cvt.u64.s32 	%rd25, %r68;\n"
+"	mul.lo.u64 	%rd26, %rd25, 4;\n"
+"	add.u64 	%rd27, %rd1, %rd26;\n"
+"	mov.s32 	%r70, %r62;\n"
+"$Lt_3_18946:\n"  // loop over x of the scanned cells (%r68)
+"	.loc	14	201	0\n"
+"	ld.global.s32 	%r71, [%rd27+0];\n"  // %r71 = cell_counts[scanned cell]
+"	.loc	14	202	0\n"
+"	ld.global.s32 	%r72, [%rd27+4];\n"  // %r72 = cell_counts[scanned cell + 1]
+"	.loc	14	206	0\n"
+"	sub.s32 	%r73, %r72, %r71;\n"
+"	cvt.rn.f32.s32 	%f16, %r73;\n"
+"	mov.f32 	%f17, 0f42800000;    	\n"  // 64.0f
+"	div.approx.f32 	%f18, %f16, %f17;\n"
+"	cvt.rpi.f32.f32 	%f19, %f18;\n"
+"	cvt.rzi.s32.f32 	%r74, %f19;\n"  // %r74 = ceil((%r72-%r71)/64) chunks of up to 64 entries
+"	mov.u32 	%r75, 0;\n"
+"	setp.le.s32 	%p8, %r74, %r75;\n"
+"	@%p8 bra 	$Lt_3_19202;\n"
+"	mov.s32 	%r76, %r74;\n"
+"	mov.s32 	%r77, 0;\n"
+"	setp.lt.s32 	%p9, %r43, %r40;\n"
+"	mul.lo.s32 	%r78, %r74, 64;\n"
+"	mov.s32 	%r79, %r76;\n"
+"$Lt_3_19714:\n"  // chunk loop: %r77 = offset of the current chunk within the scanned cell
+"	.loc	14	209	0\n"
+"	sub.s32 	%r80, %r73, %r77;\n"
+"	mov.s32 	%r81, 64;\n"
+"	min.s32 	%r82, %r80, %r81;\n"  // %r82 = entries in this chunk = min(remaining, 64)
+"	setp.le.s32 	%p10, %r82, %r32;\n"
+"	@%p10 bra 	$Lt_3_19970;\n"  // only threads with tid.x < %r82 stage an entry
+"	.loc	14	212	0\n"
+"	ld.param.u64 	%rd28, [__cudaparm_calc_neigh_list_cell_cell_particle_id];\n"
+"	add.s32 	%r83, %r77, %r32;\n"
+"	add.s32 	%r84, %r71, %r83;\n"
+"	cvt.s64.s32 	%rd29, %r84;\n"
+"	mul.lo.u64 	%rd30, %rd29, 4;\n"
+"	add.u64 	%rd31, %rd28, %rd30;\n"
+"	ld.global.s32 	%r85, [%rd31+0];\n"
+"	.loc	14	213	0\n"
+"	cvt.u64.s32 	%rd32, %r32;\n"
+"	mul.lo.u64 	%rd33, %rd32, 4;\n"
+"	add.u64 	%rd34, %rd7, %rd33;\n"
+"	st.shared.s32 	[%rd34+0], %r85;\n"  // shared id list[tid.x] = staged particle id
+"	mov.s32 	%r86, %r85;\n"
+"	mov.s32 	%r87, 0;\n"
+"	mov.s32 	%r88, 0;\n"
+"	mov.s32 	%r89, 0;\n"
+"	tex.1d.v4.f32.s32 {%f20,%f21,%f22,%f23},[neigh_tex,{%r86,%r87,%r88,%r89}];\n"
+"	.loc	14	214	0\n"
+"	mov.f32 	%f24, %f20;\n"
+"	mov.f32 	%f25, %f21;\n"
+"	mov.f32 	%f26, %f22;\n"
+"	.loc	14	215	0\n"
+"	mul.lo.u64 	%rd35, %rd32, 16;\n"
+"	add.u64 	%rd36, %rd6, %rd35;\n"
+"	st.shared.f32 	[%rd36+0], %f24;\n"
+"	.loc	14	216	0\n"
+"	st.shared.f32 	[%rd36+4], %f25;\n"
+"	.loc	14	217	0\n"
+"	st.shared.f32 	[%rd36+8], %f26;\n"  // staged coordinates at 16-byte pitch; bytes 12..15 are not written
+"$Lt_3_19970:\n"
+"	.loc	14	219	0\n"
+"	bar.sync 	0;\n"  // staged chunk is complete before any thread reads it
+"	@!%p9 bra 	$Lt_3_20994;\n"  // threads whose id is >= nt skip the comparison loop
+"	mov.u32 	%r90, 0;\n"
+"	setp.le.s32 	%p11, %r82, %r90;\n"
+"	@%p11 bra 	$Lt_3_20994;\n"
+"	mov.s32 	%r91, %r82;\n"
+"	mov.u64 	%rd37, 0;\n"
+"	setp.lt.s32 	%p12, %r43, %r35;\n"
+"	selp.s32 	%r92, 1, 0, %p12;\n"  // %r92 = (own id < inum)
+"	mov.s64 	%rd38, %rd7;\n"
+"	mov.s32 	%r93, 0;\n"
+"	mov.s32 	%r94, %r91;\n"
+"$Lt_3_21506:\n"  // loop over the staged entries of this chunk
+"	.loc	14	224	0\n"
+"	ld.shared.s32 	%r95, [%rd38+0];\n"  // %r95 = staged particle id
+"	set.lt.u32.s32 	%r96, %r43, %r95;\n"
+"	neg.s32 	%r97, %r96;\n"
+"	set.lt.u32.s32 	%r98, %r95, %r35;\n"
+"	neg.s32 	%r99, %r98;\n"
+"	or.b32 	%r100, %r92, %r99;\n"
+"	or.b32 	%r101, %r97, %r100;\n"  // keep the pair if own id < staged id, or own id < inum, or staged id < inum
+"	mov.u32 	%r102, 0;\n"
+"	setp.eq.s32 	%p13, %r101, %r102;\n"
+"	@%p13 bra 	$Lt_3_26370;\n"
+"	.loc	14	226	0\n"
+"	mul.lo.u64 	%rd39, %rd37, 16;\n"
+"	add.u64 	%rd40, %rd6, %rd39;\n"
+"	mov.f32 	%f27, %f13;\n"
+"	ld.shared.f32 	%f28, [%rd40+0];\n"
+"	sub.f32 	%f29, %f27, %f28;\n"
+"	.loc	14	227	0\n"
+"	mov.f32 	%f30, %f14;\n"
+"	ld.shared.f32 	%f31, [%rd40+4];\n"
+"	sub.f32 	%f32, %f30, %f31;\n"
+"	.loc	14	228	0\n"
+"	mov.f32 	%f33, %f15;\n"
+"	ld.shared.f32 	%f34, [%rd40+8];\n"
+"	sub.f32 	%f35, %f33, %f34;\n"
+"	.loc	14	226	0\n"
+"	mul.f32 	%f36, %f32, %f32;\n"
+"	mad.f32 	%f37, %f29, %f29, %f36;\n"
+"	mad.f32 	%f38, %f35, %f35, %f37;\n"  // %f38 = squared distance between own and staged coordinates
+"	ld.param.f32 	%f39, [__cudaparm_calc_neigh_list_cell_cell_size];\n"
+"	mul.f32 	%f40, %f39, %f39;\n"
+"	setp.lt.f32 	%p14, %f38, %f40;\n"
+"	@!%p14 bra 	$Lt_3_26370;\n"  // reject unless squared distance < cell_size^2
+"	cvt.f64.f32 	%fd1, %f38;\n"
+"	mov.f64 	%fd2, 0d3ee4f8b588e368f1;	\n"  // lower bound on squared distance (about 1e-5)
+"	setp.gt.f64 	%p15, %fd1, %fd2;\n"
+"	@!%p15 bra 	$Lt_3_26370;\n"
+"	ld.param.s32 	%r103, [__cudaparm_calc_neigh_list_cell_neigh_bin_size];\n"
+"	setp.le.s32 	%p16, %r103, %r51;\n"
+"	@%p16 bra 	$Lt_3_22274;\n"  // no store once the counter has reached neigh_bin_size
+"	.loc	14	233	0\n"
+"	st.global.s32 	[%rd24+0], %r95;\n"  // append the staged id to this particle's list
+"	.loc	14	234	0\n"
+"	mul.lo.u64 	%rd41, %rd22, 4;\n"
+"	add.u64 	%rd24, %rd24, %rd41;\n"  // next slot is one stride further
+"$Lt_3_22274:\n"
+"	.loc	14	236	0\n"
+"	add.s32 	%r51, %r51, 1;\n"  // counter also advances when the store was skipped
+"$Lt_3_26370:\n"
+"$L_3_14082:\n"
+"$Lt_3_21762:\n"
+"	add.s32 	%r93, %r93, 1;\n"
+"	add.u64 	%rd37, %rd37, 1;\n"
+"	add.u64 	%rd38, %rd38, 4;\n"
+"	setp.ne.s32 	%p17, %r82, %r93;\n"
+"	@%p17 bra 	$Lt_3_21506;\n"
+"$Lt_3_20994:\n"
+"$Lt_3_20482:\n"
+"	.loc	14	241	0\n"
+"	bar.sync 	0;\n"  // all reads of the staged chunk finish before it is overwritten
+"	add.s32 	%r77, %r77, 64;\n"
+"	setp.ne.s32 	%p18, %r77, %r78;\n"
+"	@%p18 bra 	$Lt_3_19714;\n"
+"$Lt_3_19202:\n"
+"	add.s32 	%r68, %r68, 1;\n"
+"	add.u64 	%rd27, %rd27, 4;\n"
+"	setp.ne.s32 	%p19, %r68, %r69;\n"
+"	@%p19 bra 	$Lt_3_18946;\n"
+"$Lt_3_18434:\n"
+"	add.s32 	%r56, %r56, 1;\n"
+"	setp.ne.s32 	%p20, %r59, %r56;\n"
+"	@%p20 bra 	$Lt_3_18178;\n"
+"$Lt_3_17666:\n"
+"	add.s32 	%r50, %r50, 1;\n"
+"	setp.ne.s32 	%p21, %r54, %r50;\n"
+"	@%p21 bra 	$Lt_3_17410;\n"
+"	bra.uni 	$Lt_3_16898;\n"
+"$Lt_3_25090:\n"
+"$Lt_3_16898:\n"
+"	@!%p4 bra 	$Lt_3_24066;\n"
+"	.loc	14	247	0\n"
+"	st.global.s32 	[%rd17+0], %r51;\n"  // write the final counter to the count slot (only when own id < nt)
+"$Lt_3_24066:\n"
+"	.loc	14	169	0\n"
+"	add.s32 	%r42, %r42, 1;\n"
+"	add.u32 	%r34, %r34, %r15;\n"
+"	add.s32 	%r37, %r37, %r15;\n"
+"	cvt.rn.f32.s32 	%f41, %r42;\n"
+"	setp.lt.f32 	%p22, %f41, %f4;\n"
+"	@%p22 bra 	$Lt_3_15106;\n"  // next pass while pass index < %f4
+"$Lt_3_14594:\n"
+"	.loc	14	249	0\n"
+"	exit;\n"
+"$LDWend_calc_neigh_list_cell:\n"
+"	}\n"
+"	.entry kernel_special (\n"  // per index: XORs a 2-bit code into bits 30..31 of neighbor entries whose tag appears in special[]
+"		.param .u64 __cudaparm_kernel_special_dev_nbor,\n"  // list storage used for indices < inum
+"		.param .u64 __cudaparm_kernel_special_host_nbor_list,\n"  // list storage used for indices >= inum
+"		.param .u64 __cudaparm_kernel_special_tag,\n"
+"		.param .u64 __cudaparm_kernel_special_nspecial,\n"  // three s32 values per index
+"		.param .u64 __cudaparm_kernel_special_special,\n"  // entry k for an index is read at special[index + k*nt]
+"		.param .s32 __cudaparm_kernel_special_inum,\n"
+"		.param .s32 __cudaparm_kernel_special_nt,\n"
+"		.param .s32 __cudaparm_kernel_special_nall)\n"  // never loaded in this PTX
+"	{\n"
+"	.reg .u32 %r<31>;\n"
+"	.reg .u64 %rd<31>;\n"
+"	.reg .pred %p<11>;\n"
+"	.loc	14	254	0\n"
+"$LBB1_kernel_special:\n"
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"  // %r5 = ctaid.x*ntid.x + tid.x
+"	ld.param.s32 	%r6, [__cudaparm_kernel_special_nt];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_4_6146;\n"  // exit when nt <= %r5
+"	.loc	14	262	0\n"
+"	ld.param.u64 	%rd1, [__cudaparm_kernel_special_nspecial];\n"
+"	mul.lo.s32 	%r7, %r5, 3;\n"
+"	cvt.s64.s32 	%rd2, %r7;\n"
+"	mul.lo.u64 	%rd3, %rd2, 4;\n"
+"	add.u64 	%rd4, %rd1, %rd3;\n"
+"	ld.global.s32 	%r8, [%rd4+0];\n"  // %r8 = nspecial[3*%r5]
+"	.loc	14	263	0\n"
+"	ld.global.s32 	%r9, [%rd4+4];\n"  // %r9 = nspecial[3*%r5+1]
+"	.loc	14	264	0\n"
+"	ld.global.s32 	%r10, [%rd4+8];\n"  // %r10 = nspecial[3*%r5+2], used as the special[] entry count
+"	ld.param.s32 	%r11, [__cudaparm_kernel_special_inum];\n"
+"	cvt.u64.s32 	%rd5, %r11;\n"
+"	cvt.u64.s32 	%rd6, %r5;\n"
+"	setp.le.s32 	%p2, %r11, %r5;\n"
+"	@%p2 bra 	$Lt_4_6914;\n"
+"	.loc	14	267	0\n"
+"	mov.s32 	%r12, %r11;\n"  // entry stride = inum
+"	.loc	14	268	0\n"
+"	ld.param.u64 	%rd7, [__cudaparm_kernel_special_dev_nbor];\n"
+"	add.u64 	%rd8, %rd5, %rd6;\n"
+"	mul.lo.u64 	%rd9, %rd8, 4;\n"
+"	add.u64 	%rd10, %rd7, %rd9;\n"  // %rd10 = &dev_nbor[inum + %r5] (count slot)
+"	bra.uni 	$Lt_4_6658;\n"
+"$Lt_4_6914:\n"
+"	.loc	14	270	0\n"
+"	sub.s32 	%r12, %r6, %r11;\n"  // entry stride = nt - inum
+"	.loc	14	271	0\n"
+"	ld.param.u64 	%rd11, [__cudaparm_kernel_special_host_nbor_list];\n"
+"	mul.lo.u64 	%rd12, %rd6, 4;\n"
+"	add.u64 	%rd13, %rd11, %rd12;\n"
+"	mul.lo.u64 	%rd14, %rd5, 4;\n"
+"	sub.u64 	%rd10, %rd13, %rd14;\n"  // %rd10 = &host_nbor_list[%r5 - inum] (count slot)
+"$Lt_4_6658:\n"
+"	.loc	14	273	0\n"
+"	ld.global.s32 	%r13, [%rd10+0];\n"  // %r13 = number of neighbor entries
+"	.loc	14	274	0\n"
+"	cvt.u64.s32 	%rd15, %r12;\n"
+"	mul.lo.u64 	%rd16, %rd15, 4;\n"
+"	add.u64 	%rd10, %rd10, %rd16;\n"  // advance to the first neighbor entry
+"	.loc	14	275	0\n"
+"	mul.lo.s32 	%r14, %r12, %r13;\n"
+"	cvt.s64.s32 	%rd17, %r14;\n"
+"	mul.lo.u64 	%rd18, %rd17, 4;\n"
+"	add.u64 	%rd19, %rd10, %rd18;\n"  // %rd19 = end address = first entry + count*stride entries
+"	setp.le.u64 	%p3, %rd19, %rd10;\n"
+"	@%p3 bra 	$Lt_4_7170;\n"
+"	mov.s32 	%r15, 0;\n"
+"	setp.gt.s32 	%p4, %r10, %r15;\n"
+"	ld.param.u64 	%rd20, [__cudaparm_kernel_special_tag];\n"
+"$Lt_4_7682:\n"  // loop over this index's neighbor entries
+"	.loc	14	278	0\n"
+"	ld.global.s32 	%r16, [%rd10+0];\n"  // %r16 = neighbor entry
+"	.loc	14	279	0\n"
+"	cvt.u64.s32 	%rd21, %r16;\n"
+"	mul.lo.u64 	%rd22, %rd21, 4;\n"
+"	add.u64 	%rd23, %rd20, %rd22;\n"
+"	ld.global.s32 	%r17, [%rd23+0];\n"  // %r17 = tag[neighbor entry], loaded once per entry
+"	@!%p4 bra 	$Lt_4_7938;\n"
+"	mov.s32 	%r18, %r10;\n"
+"	mul.lo.u64 	%rd24, %rd6, 4;\n"
+"	cvt.s64.s32 	%rd25, %r6;\n"
+"	mul.lo.u64 	%rd26, %rd25, 4;\n"
+"	ld.param.u64 	%rd27, [__cudaparm_kernel_special_special];\n"
+"	add.u64 	%rd28, %rd27, %rd24;\n"
+"	mov.s32 	%r19, 0;\n"
+"	mov.s32 	%r20, %r18;\n"
+"$Lt_4_8450:\n"  // loop k = 0 .. %r10-1 over special[%r5 + k*nt]
+"	ld.global.s32 	%r21, [%rd28+0];\n"
+"	setp.ne.s32 	%p5, %r21, %r17;\n"
+"	@%p5 bra 	$Lt_4_8706;\n"
+"	.loc	14	289	0\n"
+"	setp.le.s32 	%p6, %r8, %r19;\n"
+"	mov.s32 	%r22, 3;\n"
+"	mov.s32 	%r23, 2;\n"
+"	selp.s32 	%r24, %r22, %r23, %p6;\n"
+"	mov.s32 	%r25, 2;\n"
+"	mov.s32 	%r26, 1;\n"
+"	selp.s32 	%r27, %r25, %r26, %p6;\n"
+"	setp.le.s32 	%p7, %r9, %r19;\n"
+"	selp.s32 	%r28, %r24, %r27, %p7;\n"  // code = 1 + (k >= %r8) + (k >= %r9)
+"	shl.b32 	%r29, %r28, 30;\n"
+"	xor.b32 	%r16, %r16, %r29;\n"  // code goes into bits 30..31; %r16 keeps the change for later k
+"	.loc	14	290	0\n"
+"	st.global.s32 	[%rd10+0], %r16;\n"
+"$Lt_4_8706:\n"
+"	add.s32 	%r19, %r19, 1;\n"
+"	add.u64 	%rd28, %rd26, %rd28;\n"  // next special[] entry is nt elements further
+"	setp.ne.s32 	%p8, %r10, %r19;\n"
+"	@%p8 bra 	$Lt_4_8450;\n"
+"$Lt_4_7938:\n"
+"	.loc	14	277	0\n"
+"	mul.lo.u64 	%rd29, %rd15, 4;\n"
+"	add.u64 	%rd10, %rd10, %rd29;\n"  // next neighbor entry is one stride further
+"	setp.gt.u64 	%p9, %rd19, %rd10;\n"
+"	@%p9 bra 	$Lt_4_7682;\n"
+"$Lt_4_7170:\n"
+"$Lt_4_6146:\n"
+"	.loc	14	296	0\n"
+"	exit;\n"
+"$LDWend_kernel_special:\n"
+"	}\n"
+;
--- a/lib/gpu/pair_gpu_dev_kernel.cu
+++ b/lib/gpu/pair_gpu_dev_kernel.cu
@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+   LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+/*************************************************************************
+                           Preprocessor Definitions
+                           
+  Note: It is assumed that constants with the same names are defined with
+  the same values in all files.
+  
+  ARCH
+     Definition:   Architecture number for accelerator
+  MEM_THREADS
+     Definition:   Number of threads with sequential ids accessing memory
+                   simultaneously on multiprocessor
+  WARP_SIZE
+     Definition:   Number of threads guaranteed to be on the same instruction
+  THREADS_PER_ATOM
+     Definition:   Default number of threads assigned per atom for pair styles
+     Restrictions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE
+  THREADS_PER_CHARGE
+     Definition:   Default number of threads assigned per atom for pair styles
+                   with charge
+     Restrictions: Must be power of 2; THREADS_PER_CHARGE<=WARP_SIZE
+  PPPM_MAX_SPLINE
+     Definition:   Maximum order for splines in PPPM
+  PPPM_BLOCK_1D    
+     Definition:   Thread block size for PPPM kernels
+     Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE
+                   PPPM_BLOCK_1D%32==0 
+  BLOCK_PAIR
+     Definition:   Default thread block size for pair styles
+     Restrictions:
+  MAX_SHARED_TYPES
+     Definition:   Max number of atom type params that can be stored in shared memory
+     Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR
+  BLOCK_CELL_2D 
+     Definition:   Default block size in each dimension for cell list builds
+                   and matrix transpose
+  BLOCK_CELL_ID    
+     Definition:   Default block size for binning atoms in cell list builds
+  BLOCK_NBOR_BUILD 
+     Definition:   Default block size for neighbor list builds
+  BLOCK_BIO_PAIR
+     Definition:   Default thread block size for "bio" pair styles
+  MAX_BIO_SHARED_TYPES
+     Definition:   Max number of atom type params that can be stored in shared memory
+     Restrictions:  MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 &&
+                    MAX_BIO_SHARED_TYPES>=BLOCK_BIO_PAIR
+
+*************************************************************************/
+
+#ifndef PAIR_GPU_DEV_KERNEL
+#define PAIR_GPU_DEV_KERNEL
+
+#ifdef NV_KERNEL
+
+#include "nv_kernel_def.h"
+
+#else
+
+#define GLOBAL_ID_X get_global_id(0)
+#define ARCH 0
+#define DRIVER 0
+#define MEM_THREADS 16
+#define WARP_SIZE 1
+#define THREADS_PER_ATOM 1
+#define THREADS_PER_CHARGE 1
+#define BLOCK_PAIR 64
+#define MAX_SHARED_TYPES 8
+#define BLOCK_NBOR_BUILD 64
+#define BLOCK_BIO_PAIR 64
+
+#endif
+
+#define PPPM_MAX_SPLINE 8
+#define PPPM_BLOCK_1D 64
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+#define MAX_BIO_SHARED_TYPES 128
+
+/* Zero the first numel integers of mem, one element per work-item.
+   Work-items whose global index is at or beyond numel do nothing. */
+__kernel void kernel_zero(__global int *mem, int numel) {
+  const int idx=GLOBAL_ID_X;
+
+  if (idx>=numel)
+    return;
+  mem[idx]=0;
+}
+
+/* Report the compile-time configuration constants to the host.
+   info must have room for at least 14 integers; the slot written by each
+   statement is noted alongside it. */
+__kernel void kernel_info(__global int *info) {
+  __global int *slot=info;
+
+  *slot++=ARCH;                  /* info[0]  */
+  *slot++=MEM_THREADS;           /* info[1]  */
+  *slot++=WARP_SIZE;             /* info[2]  */
+  *slot++=THREADS_PER_ATOM;      /* info[3]  */
+  *slot++=PPPM_MAX_SPLINE;       /* info[4]  */
+  *slot++=PPPM_BLOCK_1D;         /* info[5]  */
+  *slot++=BLOCK_PAIR;            /* info[6]  */
+  *slot++=MAX_SHARED_TYPES;      /* info[7]  */
+  *slot++=BLOCK_CELL_2D;         /* info[8]  */
+  *slot++=BLOCK_CELL_ID;         /* info[9]  */
+  *slot++=BLOCK_NBOR_BUILD;      /* info[10] */
+  *slot++=BLOCK_BIO_PAIR;        /* info[11] */
+  *slot++=MAX_BIO_SHARED_TYPES;  /* info[12] */
+  *slot=THREADS_PER_CHARGE;      /* info[13] */
+}
+
+#endif
+
--- a/lib/gpu/pair_gpu_nbor_ptx.h
+++ b/lib/gpu/pair_gpu_nbor_ptx.h
@ -0,0 +1,71 @@
+/* PTX (version 1.4, target sm_13) for the kernel_unpack entry point, stored
+   as a C string so it can be loaded at run time.
+
+   Reading of the PTX below, one thread per index ii=ctaid.x*ntid.x+tid.x:
+     - threads with ii>=inum exit immediately;
+     - numj   = dev_nbor[ii+inum]
+     - offset = dev_nbor[ii+2*inum]
+     - numj integers are copied from dev_ij+offset into dev_nbor, starting at
+       element ii+2*inum and advancing by inum elements per copy.
+   NOTE(review): the text looks compiler-generated (.loc directives refer to
+   an original source file); presumably it should be regenerated rather than
+   edited by hand -- confirm against the build scripts. */
+const char * pair_gpu_nbor_kernel = 
+"	.version 1.4\n"
+"	.target sm_13\n"
+"	.entry kernel_unpack (\n"
+"		.param .u64 __cudaparm_kernel_unpack_dev_nbor,\n"
+"		.param .u64 __cudaparm_kernel_unpack_dev_ij,\n"
+"		.param .s32 __cudaparm_kernel_unpack_inum)\n"
+"	{\n"
+"	.reg .u32 %r<11>;\n"
+"	.reg .u64 %rd<27>;\n"
+"	.reg .pred %p<5>;\n"
+"	.loc	14	29	0\n"
+"$LBB1_kernel_unpack:\n"
+/* %r5 = ii; skip everything when inum<=ii */
+"	cvt.s32.u16 	%r1, %ctaid.x;\n"
+"	cvt.s32.u16 	%r2, %ntid.x;\n"
+"	mul24.lo.s32 	%r3, %r1, %r2;\n"
+"	cvt.u32.u16 	%r4, %tid.x;\n"
+"	add.u32 	%r5, %r3, %r4;\n"
+"	ld.param.s32 	%r6, [__cudaparm_kernel_unpack_inum];\n"
+"	setp.le.s32 	%p1, %r6, %r5;\n"
+"	@%p1 bra 	$Lt_0_2050;\n"
+"	.loc	14	35	0\n"
+/* %r7 = dev_nbor[ii+inum] (number of entries to copy) */
+"	cvt.u64.s32 	%rd1, %r6;\n"
+"	ld.param.u64 	%rd2, [__cudaparm_kernel_unpack_dev_nbor];\n"
+"	cvt.u64.s32 	%rd3, %r5;\n"
+"	add.u64 	%rd4, %rd3, %rd1;\n"
+"	mul.lo.u64 	%rd5, %rd4, 4;\n"
+"	add.u64 	%rd6, %rd2, %rd5;\n"
+"	ld.global.s32 	%r7, [%rd6+0];\n"
+"	.loc	14	36	0\n"
+/* %rd9 = destination cursor, starting at &dev_nbor[ii+2*inum] */
+"	mul.lo.u64 	%rd7, %rd1, 4;\n"
+"	add.u64 	%rd8, %rd6, %rd7;\n"
+"	mov.s64 	%rd9, %rd8;\n"
+"	.loc	14	37	0\n"
+/* %rd13 = source cursor, dev_ij + dev_nbor[ii+2*inum] */
+"	ld.param.u64 	%rd10, [__cudaparm_kernel_unpack_dev_ij];\n"
+"	ld.global.s32 	%r8, [%rd8+0];\n"
+"	cvt.u64.s32 	%rd11, %r8;\n"
+"	mul.lo.u64 	%rd12, %rd11, 4;\n"
+"	add.u64 	%rd13, %rd10, %rd12;\n"
+"	.loc	14	38	0\n"
+/* %rd16 = one past the last source element; skip the loop if empty */
+"	cvt.u64.s32 	%rd14, %r7;\n"
+"	mul.lo.u64 	%rd15, %rd14, 4;\n"
+"	add.u64 	%rd16, %rd15, %rd13;\n"
+"	setp.le.u64 	%p2, %rd16, %rd13;\n"
+"	@%p2 bra 	$Lt_0_2562;\n"
+"	add.u64 	%rd17, %rd15, 3;\n"
+"	shr.s64 	%rd18, %rd17, 63;\n"
+"	mov.s64 	%rd19, 3;\n"
+"	and.b64 	%rd20, %rd18, %rd19;\n"
+"	add.s64 	%rd21, %rd20, %rd17;\n"
+"	shr.s64 	%rd22, %rd21, 2;\n"
+"	mov.s64 	%rd23, 1;\n"
+"	max.s64 	%rd24, %rd22, %rd23;\n"
+"	mov.s64 	%rd25, %rd24;\n"
+"$Lt_0_3074:\n"
+/* copy loop: contiguous reads from dev_ij, writes strided by inum */
+"	.loc	14	41	0\n"
+"	ld.global.s32 	%r9, [%rd13+0];\n"
+"	st.global.s32 	[%rd9+0], %r9;\n"
+"	.loc	14	42	0\n"
+"	add.u64 	%rd9, %rd7, %rd9;\n"
+"	.loc	14	40	0\n"
+"	add.u64 	%rd13, %rd13, 4;\n"
+"	setp.gt.u64 	%p3, %rd16, %rd13;\n"
+"	@%p3 bra 	$Lt_0_3074;\n"
+"$Lt_0_2562:\n"
+"$Lt_0_2050:\n"
+"	.loc	14	45	0\n"
+"	exit;\n"
+"$LDWend_kernel_unpack:\n"
+"	}\n"
+;
--- a/lib/gpu/pair_gpu_nbor_shared.cpp
+++ b/lib/gpu/pair_gpu_nbor_shared.cpp
@ -0,0 +1,71 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include "pair_gpu_nbor_shared.h"
+
+#ifdef USE_OPENCL
+#include "pair_gpu_nbor_cl.h"
+#else
+#include "pair_gpu_nbor_ptx.h"
+#include "pair_gpu_build_ptx.h"
+#endif
+  
+// Release the kernels and the program object created by compile_kernels().
+// Does nothing when no kernels are currently compiled.
+void PairGPUNborShared::clear() {
+  if (!_compiled)
+    return;
+
+  if (!_gpu_nbor) {
+    // Host-built neighbor lists: only the unpack kernel was compiled
+    k_nbor.clear();
+    delete nbor_program;
+  } else {
+    // Device-built neighbor lists: release every build kernel
+    k_cell_id.clear();
+    k_cell_counts.clear();
+    k_build_nbor.clear();
+    k_transpose.clear();
+    k_special.clear();
+    delete build_program;
+  }
+
+  _compiled=false;
+}
+
+// Compile the neighbor-list kernels for dev, once. gpu_nbor selects the
+// device-side build kernels; otherwise only the unpack kernel is compiled.
+// The choice is remembered so clear() releases the matching objects.
+void PairGPUNborShared::compile_kernels(UCL_Device &dev, const bool gpu_nbor) {
+  if (_compiled)
+    return;
+
+  const std::string flags("-cl-fast-relaxed-math -cl-mad-enable");
+  _gpu_nbor=gpu_nbor;
+
+  if (gpu_nbor) {
+    build_program=new UCL_Program(dev);
+    #ifdef USE_OPENCL
+    std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n";
+    exit(1);
+    #else
+    build_program->load_string(pair_gpu_build_kernel,flags.c_str());
+    #endif
+    k_cell_id.set_function(*build_program,"calc_cell_id");
+    k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts");
+    k_build_nbor.set_function(*build_program,"calc_neigh_list_cell");
+    k_transpose.set_function(*build_program,"transpose");
+    k_special.set_function(*build_program,"kernel_special");
+    neigh_tex.get_texture(*build_program,"neigh_tex");
+  } else {
+    nbor_program=new UCL_Program(dev);
+    nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str());
+    k_nbor.set_function(*nbor_program,"kernel_unpack");
+  }
+
+  _compiled=true;
+}
--- a/lib/gpu/pair_gpu_nbor_shared.h
+++ b/lib/gpu/pair_gpu_nbor_shared.h
@ -0,0 +1,58 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PAIR_GPU_NBOR_SHARED_H
+#define PAIR_GPU_NBOR_SHARED_H
+
+#ifdef USE_OPENCL
+
+#include "geryon/ocl_kernel.h"
+#include "geryon/ocl_texture.h"
+using namespace ucl_opencl;
+
+#else
+
+#include "geryon/nvd_kernel.h"
+#include "geryon/nvd_texture.h"
+using namespace ucl_cudadr;
+
+#endif
+
+/// Kernels and program objects used for neighbor-list handling
+/** compile_kernels() builds either the unpack kernel (nbor_program) or the
+  * device build kernels (build_program), depending on its gpu_nbor argument;
+  * clear() releases whichever set was built. **/
+class PairGPUNborShared {
+ public:
+  /// Start with nothing compiled; pointers and flags get defined values so
+  /// no member is ever read while indeterminate
+  PairGPUNborShared() : nbor_program(0), build_program(0), _compiled(false),
+                        _gpu_nbor(false) {}
+  ~PairGPUNborShared() { clear(); }
+ 
+  /// Release the compiled kernels and the program that owns them
+  void clear();
+
+  /// Texture for cached position/type access with CUDA
+  UCL_Texture neigh_tex;
+
+  /// Compile kernels for neighbor lists
+  /** Has no effect if kernels are already compiled **/
+  void compile_kernels(UCL_Device &dev, const bool gpu_nbor);
+
+  // ----------------------------- Kernels
+  /// Exactly one of these is allocated while kernels are compiled
+  UCL_Program *nbor_program, *build_program;
+  UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor;
+  UCL_Kernel k_transpose, k_special;
+
+ private:
+  /// _compiled: kernels are loaded; _gpu_nbor: which set was loaded
+  bool _compiled, _gpu_nbor;
+};
+
+#endif
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@ -0,0 +1,326 @@
+/* ----------------------------------------------------------------------
+   LAMMPS-Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PPPM_GPU_KERNEL
+#define PPPM_GPU_KERNEL
+
+#ifdef _DOUBLE_DOUBLE
+#define numtyp double
+#define numtyp4 double4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifdef _SINGLE_DOUBLE
+#define numtyp float
+#define numtyp4 float4
+#define acctyp double
+#define acctyp4 double4
+#endif
+
+#ifndef numtyp
+#define numtyp float
+#define numtyp4 float4
+#define acctyp float
+#define acctyp4 float4
+#endif
+
+#ifdef NV_KERNEL
+
+#include "geryon/ucl_nv_kernel.h"
+texture<float4> pos_tex;
+texture<float> q_tex;
+
+#ifdef _DOUBLE_DOUBLE
+// Double-precision build: read the position directly from the array
+// argument (pos_tex is not used on this path).
+__inline double4 fetch_pos(const int& i, const double4 *pos)
+{
+  return pos[i];
+}
+// Double-precision build: read the charge directly from the array argument
+// (q_tex is not used on this path).
+__inline double fetch_q(const int& i, const double *q)
+{
+  return q[i];
+}
+
+#else
+// Single-precision build: fetch element i through the pos_tex texture
+// reference; the pos argument is ignored.
+__inline float4 fetch_pos(const int& i, const float4 *pos)
+{
+  return tex1Dfetch(pos_tex, i);
+}
+// Single-precision build: fetch element i through the q_tex texture
+// reference; the q argument is ignored.
+__inline float fetch_q(const int& i, const float *q)
+{
+  return tex1Dfetch(q_tex, i);
+}
+
+#endif
+
+#else
+
+#pragma OPENCL EXTENSION cl_khr_fp64: enable
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define GLOBAL_SIZE_X get_global_size(0)
+#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
+#define __inline inline
+
+#define fetch_pos(i,y) x_[i]
+#define fetch_q(i,y) q_[i]
+#define MEM_THREADS 16
+
+#endif
+
+// Maximum order for spline
+#define PPPM_MAX_SPLINE 8
+// Thread block size for PPPM kernels
+// - Must be >=PPPM_MAX_SPLINE^2
+// - Must be a multiple of 32
+#define PPPM_BLOCK_1D 64
+// Number of threads per pencil for charge spread
+#define PENCIL_SIZE MEM_THREADS
+// Number of pencils per block for charge spread
+#define BLOCK_PENCILS (PPPM_BLOCK_1D/PENCIL_SIZE)
+
+// Assign each local atom to a grid cell for charge spreading.
+//
+// One work-item handles at most one atom. For an atom whose scaled charge
+// (delvolinv*q) is nonzero:
+//  - grid coordinates are computed relative to (b_lo_x,b_lo_y,b_lo_z) using
+//    the inverse spacings delxinv/delyinv/delzinv;
+//  - if the atom lies outside [0,nlocal_x) x [0,nlocal_y) x [0,nlocal_z),
+//    *error is set to 1 and nothing is stored;
+//  - otherwise counts[cell] is incremented atomically and the atom's offset
+//    from the cell centre (x,y,z) and scaled charge (w) are stored in
+//    ans[atom_stride*slot+cell], where slot is the cell's previous count;
+//  - if the cell already holds max_atoms entries, *error is set to 2, the
+//    increment is undone and nothing is stored.
+__kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
+                           const grdtyp delvolinv, const int nlocal, 
+                           __global int *counts, __global grdtyp4 *ans, 
+                           const grdtyp b_lo_x, const grdtyp b_lo_y,
+                           const grdtyp b_lo_z, const grdtyp delxinv,
+                           const grdtyp delyinv, const grdtyp delzinv,
+                           const int nlocal_x, const int nlocal_y,
+                           const int nlocal_z, const int atom_stride,
+                           const int max_atoms, __global int *error) {
+  // ii is the index of the atom handled by this work-item
+  int ii=GLOBAL_ID_X;
+
+  // Resequence the atom indices to avoid collisions during atomic ops
+  int nthreads=GLOBAL_SIZE_X;
+  ii=mul24(ii,PPPM_BLOCK_1D);
+  ii-=(ii/nthreads)*(nthreads-1);
+
+  int nx,ny,nz;
+
+  if (ii<nlocal) {
+    numtyp4 p=fetch_pos(ii,x_);
+    grdtyp4 delta;
+    delta.w=delvolinv*fetch_q(ii,q_);
+    
+    // Atoms without charge contribute nothing and are skipped entirely
+    if (delta.w!=(grdtyp)0.0) {
+      delta.x=(p.x-b_lo_x)*delxinv;
+      nx=delta.x;
+      delta.y=(p.y-b_lo_y)*delyinv;
+      ny=delta.y;
+      delta.z=(p.z-b_lo_z)*delzinv;
+      nz=delta.z;
+
+      // The sign test uses the unrounded value because the int conversion
+      // above truncates toward zero
+      if (delta.x<(grdtyp)0 || delta.y<(grdtyp)0 || delta.z<(grdtyp)0 || 
+          nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
+        *error=1;
+      else {
+        delta.x=nx+(grdtyp)0.5-delta.x;
+        delta.y=ny+(grdtyp)0.5-delta.y;
+        delta.z=nz+(grdtyp)0.5-delta.z;
+      
+        int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
+        int old=atom_add(counts+i, 1);
+        // Must be >=, not ==: between another work-item's increment of a
+        // full cell and its compensating decrement, the value returned here
+        // can exceed max_atoms; storing with such a slot would write past
+        // the space reserved for this cell.
+        if (old>=max_atoms) {
+          *error=2;
+          atom_add(counts+i, -1);
+        } else
+          ans[atom_stride*old+i]=delta;
+      }
+    }
+  }
+}
+
+/* --------------------------- */
+
+// Accumulate the binned atoms' charge contributions into the grid (brick).
+//
+// The work-group is divided into BLOCK_PENCILS pencils of PENCIL_SIZE
+// work-items. Each pencil handles one (ny,nz) row of the brick and walks
+// along x in chunks of PENCIL_SIZE points.
+//   counts      - number of atoms stored for each cell
+//   atoms       - per-cell atom records; entries for a cell are atom_stride
+//                 apart, the first one at the cell index itself
+//   brick       - output grid, npts_x*npts_y*npts_z values
+//   _rho_coeff  - polynomial coefficients, order2+order values are read
+// NOTE(review): the barriers below are executed by every work-item of the
+// group; keep them outside any divergent branch when modifying this code.
+__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
+                       __global grdtyp *brick, __global grdtyp *_rho_coeff,
+                       const int atom_stride, const int npts_x,
+                       const int npts_y, const int npts_z, const int nlocal_x,
+                       const int nlocal_y, const int nlocal_z,
+                       const int order_m_1, const int order, const int order2) {
+  // Local copy of the coefficients
+  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
+  // Per-pencil accumulation line; the PPPM_MAX_SPLINE extra entries hold
+  // contributions that spill into the next chunk along x
+  __local grdtyp front[BLOCK_PENCILS][PENCIL_SIZE+PPPM_MAX_SPLINE];
+  // ans[n][tid]: this work-item's contribution to the point n places ahead
+  __local grdtyp ans[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
+  
+  int tid=THREAD_ID_X;
+  if (tid<order2+order)
+    rho_coeff[tid]=_rho_coeff[tid];
+    
+  // pid: pencil within the work-group; fid: position within the pencil
+  int pid=tid/PENCIL_SIZE;
+  int fid=tid%PENCIL_SIZE;
+  int fid_halo=PENCIL_SIZE+fid;
+  if (fid<order) 
+    front[pid][fid_halo]=(grdtyp)0.0;
+
+  __syncthreads();
+
+  // Map the global pencil index onto a (ny,nz) row of the brick
+  int bt=BLOCK_ID_X*BLOCK_PENCILS+pid;
+  int ny=bt%npts_y;
+  int nz=bt/npts_y;
+  // Clip the range of neighboring cells in y and z so that only cells
+  // inside [0,nlocal_y) and [0,nlocal_z) are visited
+  int y_start=0;
+  int z_start=0;
+  int y_stop=order;
+  int z_stop=order;
+  if (ny<order_m_1)
+    y_start=order_m_1-ny;
+  if (nz<order_m_1)
+    z_start=order_m_1-nz;
+  if (ny>=nlocal_y)
+    y_stop-=ny-nlocal_y+1;
+  if (nz>=nlocal_z)
+    z_stop-=nz-nlocal_z+1;
+  int z_stride=mul24(nlocal_x,nlocal_y);
+
+  int loop_count=npts_x/PENCIL_SIZE+1;
+  int nx=fid;
+  // pt: index of this work-item's current output point in brick
+  int pt=mul24(nz,mul24(npts_y,npts_x))+mul24(ny,npts_x)+nx;
+  for (int i=0 ; i<loop_count; i++) {
+    for (int n=0; n<order; n++)
+      ans[n][tid]=(grdtyp)0.0;
+    if (nx<nlocal_x && nz<npts_z) {
+      int z_pos=mul24(nz+z_start-order_m_1,z_stride);
+      for (int m=z_start; m<z_stop; m++) {
+        int y_pos=mul24(ny+y_start-order_m_1,nlocal_x);
+        for (int l=y_start; l<y_stop; l++) {
+          // pos: cell index; its atoms sit at pos, pos+atom_stride, ...
+          int pos=z_pos+y_pos+nx;
+          int natoms=mul24(counts[pos],atom_stride);
+          for (int row=pos; row<natoms; row+=atom_stride) {
+            grdtyp4 delta=atoms[row];
+      
+            // Horner evaluation of the y and z polynomials for this cell
+            grdtyp rho1d_1=(grdtyp)0.0;
+            grdtyp rho1d_2=(grdtyp)0.0;
+            for (int k=order2+order-1; k > -1; k-=order) {
+              rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y;
+              rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z;
+            }
+            delta.w*=rho1d_1*rho1d_2;
+
+            // x polynomial for each of the order points this atom touches
+            for (int n=0; n<order; n++) {
+              grdtyp rho1d_0=(grdtyp)0.0;
+              for (int k=order2+n; k>=n; k-=order)
+                rho1d_0=rho_coeff[k]+rho1d_0*delta.x;
+              ans[n][tid]+=delta.w*rho1d_0;
+            }
+          }
+          y_pos+=nlocal_x;
+        }
+        z_pos+=z_stride;
+      }
+    }
+    
+    // Wait until the previous chunk's front values have been written out
+    __syncthreads();
+    // Carry the spill-over of the previous chunk into the start of this
+    // one and reset everything else
+    if (fid<order) {
+      front[pid][fid]=front[pid][fid_halo];
+      front[pid][fid_halo]=(grdtyp)0.0;
+    } else 
+      front[pid][fid]=(grdtyp)0.0;
+    
+    // One offset per step, separated by barriers, so that no two work-items
+    // of a pencil update the same front entry in the same step
+    for (int n=0; n<order; n++) {
+      front[pid][fid+n]+=ans[n][tid];
+      __syncthreads();
+    }
+
+    if (nx<npts_x && nz<npts_z)
+      brick[pt]=front[pid][fid];
+    pt+=PENCIL_SIZE;
+    nx+=PENCIL_SIZE;
+  }
+}
+
+// Interpolate the grid values in brick back onto the atoms.
+//
+// One work-item per atom. For an atom with nonzero scaled charge
+// (qqrd2e_scale*q) the x, y and z components of the order^3 surrounding
+// brick entries are combined with polynomial weights (coefficients from
+// _rho_coeff) and subtracted into the x, y and z components of ans[ii].
+// Atoms with zero scaled charge get zero x, y and z components. The w
+// component of ans[ii] is never assigned by this kernel.
+__kernel void interp(__global numtyp4 *x_, __global numtyp *q_,
+                     const int nlocal, __global grdtyp4 *brick,
+                     __global grdtyp *_rho_coeff, const int npts_x,
+                     const int npts_yx, const grdtyp b_lo_x,
+                     const grdtyp b_lo_y, const grdtyp b_lo_z,
+                     const grdtyp delxinv,  const grdtyp delyinv,
+                     const grdtyp delzinv, const int order,
+                     const int order2, const grdtyp qqrd2e_scale, 
+                     __global acctyp4 *ans) {
+  __local grdtyp rho_coeff[PPPM_MAX_SPLINE*PPPM_MAX_SPLINE];
+  // Per-work-item x and y weights, one column per work-item
+  __local grdtyp rho1d_0[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
+  __local grdtyp rho1d_1[PPPM_MAX_SPLINE][PPPM_BLOCK_1D];
+
+  const int tid=THREAD_ID_X;
+  if (tid<order2+order)
+    rho_coeff[tid]=_rho_coeff[tid];
+  __syncthreads();
+
+  // No barrier follows, so surplus work-items can leave here
+  const int ii=tid+BLOCK_ID_X*BLOCK_SIZE_X;
+  if (ii>=nlocal)
+    return;
+
+  numtyp4 p=fetch_pos(ii,x_);
+  const grdtyp qs=qqrd2e_scale*fetch_q(ii,q_);
+
+  acctyp4 ek;
+  ek.x=(acctyp)0.0;
+  ek.y=(acctyp)0.0;
+  ek.z=(acctyp)0.0;
+
+  if (qs!=(grdtyp)0.0) {
+    // Grid coordinates of the atom and the cell that contains it
+    const grdtyp tx=(p.x-b_lo_x)*delxinv;
+    const grdtyp ty=(p.y-b_lo_y)*delyinv;
+    const grdtyp tz=(p.z-b_lo_z)*delzinv;
+    const int nx=tx;
+    const int ny=ty;
+    const int nz=tz;
+
+    // Offsets from the cell centre
+    const grdtyp dx=nx+(grdtyp)0.5-tx;
+    const grdtyp dy=ny+(grdtyp)0.5-ty;
+    const grdtyp dz=nz+(grdtyp)0.5-tz;
+
+    // x and y weights (Horner form), kept in local memory for the loops below
+    for (int k=0; k<order; k++) {
+      grdtyp wx=(grdtyp)0.0;
+      grdtyp wy=(grdtyp)0.0;
+      for (int l=order2+k; l>=k; l-=order) {
+        wx=rho_coeff[l]+wx*dx;
+        wy=rho_coeff[l]+wy*dy;
+      }
+      rho1d_0[k][tid]=wx;
+      rho1d_1[k][tid]=wy;
+    }
+
+    int mz=mul24(nz,npts_yx)+nx;
+    for (int n=0; n<order; n++, mz+=npts_yx) {
+      // z weight for this plane
+      grdtyp wz=(grdtyp)0.0;
+      for (int k=order2+n; k>=n; k-=order)
+        wz=rho_coeff[k]+wz*dz;
+      const grdtyp z0=qs*wz;
+
+      int my=mz+mul24(ny,npts_x);
+      for (int m=0; m<order; m++, my+=npts_x) {
+        const grdtyp y0=z0*rho1d_1[m][tid];
+        for (int l=0; l<order; l++) {
+          const grdtyp x0=y0*rho1d_0[l][tid];
+          const grdtyp4 el=brick[my+l];
+          ek.x-=x0*el.x;
+          ek.y-=x0*el.y;
+          ek.z-=x0*el.z;
+        }
+      }
+    }
+  }
+
+  ans[ii]=ek;
+}
+
+#endif
+
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@ -0,0 +1,405 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifdef USE_OPENCL
+#include "pppm_gpu_cl.h"
+#else
+#include "pppm_f_gpu_ptx.h"
+#include "pppm_d_gpu_ptx.h"
+#endif
+#include "pppm_gpu_memory.h"
+#include <cassert>
+
+#define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp, grdtyp, grdtyp4>
+
+extern PairGPUDevice<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+// Construct an unallocated object bound to the global pair_gpu_device and
+// owning a fresh answer container.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+PPPMGPUMemoryT::PPPMGPUMemory() : _allocated(false), _compiled(false),
+                                  _max_bytes(0) {
+  // spread() tests this flag on its first call; neither init() nor the
+  // initializer list sets it, so give it a defined value here.
+  _precompute_done=false;
+  device=&pair_gpu_device;
+  ans=new PairGPUAns<numtyp,acctyp>();
+}
+
+// Release everything init() set up (clear() is a no-op when nothing is
+// allocated) and then the answer container created by the constructor.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+PPPMGPUMemoryT::~PPPMGPUMemory() {
+  // 0.0 is passed as the cpu_time reported by clear()
+  clear(0.0);
+  delete ans;
+}
+
+// Per-atom byte count: the atom storage's figure plus the answer
+// container's figure plus 1.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+int PPPMGPUMemoryT::bytes_per_atom() const {
+  return device->atom.bytes_per_atom()+ans->bytes_per_atom()+1;
+}
+
+// Set up the device, kernels, timers and grid storage for a run.
+//
+// Returns a pointer to the host-side density brick (h_brick) on success and
+// 0 on failure. flag receives the status code: the value returned by
+// device->init() if that is nonzero, -5 if grdtyp is double but the device
+// lacks double precision, -3 if any allocation fails, otherwise it keeps
+// the 0 returned by device->init(). *vd_brick is pointed at the host buffer
+// that interp() later uploads to the device.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
+                              const int order, const int nxlo_out,
+                              const int nylo_out, const int nzlo_out,
+                              const int nxhi_out, const int nyhi_out,
+                              const int nzhi_out, double **rho_coeff,
+                              grdtyp **vd_brick, const double slab_volfactor, 
+                              const int nx_pppm, const int ny_pppm,
+                              const int nz_pppm, int &flag) {
+  _max_bytes=10;
+  screen=_screen;
+  bool success=true;
+
+  flag=device->init(*ans,nlocal,nall);
+  if (flag!=0)
+    return 0;
+  if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
+    flag=-5;
+    return 0;
+  }
+  
+  ucl_device=device->gpu;
+  atom=&device->atom;
+
+  _block_size=device->pppm_block();
+  _pencil_size=device->num_mem_threads();
+  _block_pencils=_block_size/_pencil_size;
+
+  compile_kernels(*ucl_device);
+
+  // Initialize timers for the selected GPU
+  time_in.init(*ucl_device);
+  time_in.zero();
+  time_out.init(*ucl_device);
+  time_out.zero();
+  time_map.init(*ucl_device);
+  time_map.zero();
+  time_rho.init(*ucl_device);
+  time_rho.zero();
+  time_interp.init(*ucl_device);
+  time_interp.zero();
+
+  pos_tex.bind_float(atom->dev_x,4);
+  q_tex.bind_float(atom->dev_q,1);
+
+  // From here on clear() has resources to release, including on the
+  // failure returns below
+  _allocated=true;
+  _max_bytes=0;
+  _max_an_bytes=ans->gpu_bytes();
+  
+  _order=order;
+  _order_m_1=order-1;
+  _order2=_order_m_1*_order;
+  _nlower=-(_order-1)/2;
+  _nupper=order/2;
+  _nxlo_out=nxlo_out;
+  _nylo_out=nylo_out;
+  _nzlo_out=nzlo_out;
+  _nxhi_out=nxhi_out;
+  _nyhi_out=nyhi_out;
+  _nzhi_out=nzhi_out;
+
+  _slab_volfactor=slab_volfactor;
+  _nx_pppm=nx_pppm;
+  _ny_pppm=ny_pppm;
+  _nz_pppm=nz_pppm;
+
+  // Initial per-cell capacity; spread() doubles it when a cell overflows
+  _max_brick_atoms=10;
+
+  // Get rho_coeff on device
+  int n2lo=(1-order)/2;
+  int numel=order*( order/2 - n2lo + 1 );
+  success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
+                      UCL_SUCCESS);
+  if (!success) {
+    // Never copy into a device buffer whose allocation failed
+    flag=-3;
+    return 0;
+  }
+  UCL_H_Vec<double> view;
+  view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
+  ucl_copy(d_rho_coeff,view,true);
+  _max_bytes+=d_rho_coeff.row_bytes();
+  
+  // Allocate storage for grid
+  _npts_x=nxhi_out-nxlo_out+1;
+  _npts_y=nyhi_out-nylo_out+1;
+  _npts_z=nzhi_out-nzlo_out+1;
+  _npts_yx=_npts_x*_npts_y;
+  success=success && (d_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
+                      UCL_SUCCESS);
+  success=success && (h_brick.alloc(_npts_x*_npts_y*_npts_z,*ucl_device)==
+                      UCL_SUCCESS);
+  success=success && (h_vd_brick.alloc(_npts_x*_npts_y*_npts_z*4,*ucl_device)==
+                      UCL_SUCCESS);
+  *vd_brick=h_vd_brick.begin();
+  _max_bytes+=d_brick.row_bytes();
+
+  // Allocate vector with count of atoms assigned to each grid point
+  _nlocal_x=_npts_x+_nlower-_nupper;
+  _nlocal_y=_npts_y+_nlower-_nupper;
+  _nlocal_z=_npts_z+_nlower-_nupper;
+  _nlocal_yx=_nlocal_x*_nlocal_y;
+  _atom_stride=_nlocal_x*_nlocal_y*_nlocal_z;
+  success=success && (d_brick_counts.alloc(_atom_stride,*ucl_device)==
+                      UCL_SUCCESS);
+  _max_bytes+=d_brick_counts.row_bytes();
+
+  // Allocate storage for atoms assigned to each grid point
+  success=success && (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,
+                                          *ucl_device)==UCL_SUCCESS);
+  _max_bytes+=d_brick_atoms.row_bytes();
+
+  // Allocate error flags for checking out of bounds atoms
+  success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
+  success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
+                                         UCL_SUCCESS);
+  if (!success) {
+    flag=-3;
+    return 0;
+  }
+  
+  d_error_flag.zero();
+  _max_bytes+=1;
+  
+  _cpu_idle_time=0.0;
+
+  return h_brick.begin();
+}
+
+// Release all storage, timers and kernels set up by init() and report the
+// accumulated timings through the device. cpu_time is passed through to the
+// timing report. Does nothing if init() has not allocated anything.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+void PPPMGPUMemoryT::clear(const double cpu_time) {
+  if (!_allocated)
+    return;
+  _allocated=false;
+  _precompute_done=false;
+  
+  d_brick.clear();
+  h_brick.clear();
+  h_vd_brick.clear();
+  d_brick_counts.clear();
+  h_error_flag.clear();
+  d_error_flag.clear();
+  d_brick_atoms.clear();
+  // Allocated by init() like the buffers above; release it here as well so
+  // no device allocation is still held when device->clear() runs below
+  d_rho_coeff.clear();
+  
+  acc_timers();
+  device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
+                              *ans,_max_bytes+_max_an_bytes,cpu_time,
+                              _cpu_idle_time,screen);
+
+  if (_compiled) {
+    k_particle_map.clear();
+    k_make_rho.clear();
+    k_interp.clear();
+    delete pppm_program;
+    _compiled=false;
+  }
+
+  time_in.clear();
+  time_out.clear();
+  time_map.clear();
+  time_rho.clear();
+  time_interp.clear();
+
+  device->clear();
+}
+
+// ---------------------------------------------------------------------------
+// Charge assignment that can be performed asynchronously
+// ---------------------------------------------------------------------------
+// Copy atom data to the device, bin the atoms onto the grid (particle_map),
+// build the density brick (make_rho) and start the copy of the brick and the
+// error flag back to the host. Sets _precompute_done once everything has
+// been queued. Returns early, leaving _precompute_done untouched, when
+// nlocal is 0 or a resize fails.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+void PPPMGPUMemoryT::_precompute(const int ago, const int nlocal, const int nall,
+                                 double **host_x, int *host_type, bool &success,
+                                 double *host_q, double *boxlo,
+                                 const double delxinv, const double delyinv,
+                                 const double delzinv) {
+  acc_timers();
+  if (nlocal==0) {
+    zero_timers();
+    return;
+  }
+
+  ans->inum(nlocal);
+
+  if (ago==0) {
+    resize_atom(nlocal,nall,success);
+    resize_local(nlocal,success);
+    if (!success)
+      return;
+
+    // Track the high-water mark of the answer storage
+    const double an_bytes=ans->gpu_bytes();
+    if (an_bytes>_max_an_bytes)
+      _max_an_bytes=an_bytes;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+
+  // ---- Bin the atoms onto the grid
+  time_map.start();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int map_BX=block_size();
+  const int map_GX=
+    static_cast<int>(ceil(static_cast<double>(ans->inum())/map_BX));
+  int ainum=ans->inum();
+
+  // Boxlo adjusted to be upper left brick and shift for even spline order
+  const double shift=(_order % 2) ? 0.5 : 0.0;
+  _brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
+  _brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
+  _brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
+
+  _delxinv=delxinv;
+  _delyinv=delyinv;
+  _delzinv=delzinv;
+  grdtyp f_delvolinv=delxinv*delyinv*delzinv;
+
+  device->zero(d_brick_counts,d_brick_counts.numel());
+  k_particle_map.set_size(map_GX,map_BX);
+  k_particle_map.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &f_delvolinv,
+                     &ainum, &d_brick_counts.begin(), &d_brick_atoms.begin(),
+                     &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv,
+                     &_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z,
+                     &_atom_stride, &_max_brick_atoms, &d_error_flag.begin());
+  time_map.stop();
+
+  // ---- Build the density brick, one pencil per (y,z) row
+  time_rho.start();
+  const int rho_BX=block_size();
+  const int rho_GX=
+    static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/
+                          _block_pencils));
+  k_make_rho.set_size(rho_GX,rho_BX);
+  k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
+                 &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
+                 &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
+                 &_nlocal_z, &_order_m_1, &_order, &_order2);
+  time_rho.stop();
+
+  // ---- Queue the copies back to the host
+  time_out.start();
+  ucl_copy(h_brick,d_brick,_npts_yx*_npts_z,true);
+  ucl_copy(h_error_flag,d_error_flag,true);
+  time_out.stop();
+
+  _precompute_done=true;
+}
+
+// ---------------------------------------------------------------------------
+// Charge spreading stuff
+// ---------------------------------------------------------------------------
+// Finish the charge assignment started by _precompute() (running it first
+// if it has not been done) and wait for the results on the host.
+//
+// Returns the error flag written by the particle_map kernel (0 when no
+// error was flagged), or 0 when success is false or nlocal is 0. When the
+// kernel reports that a grid cell overflowed (flag 2), the per-cell storage
+// is doubled and the whole assignment is repeated. success is set to false
+// if that re-allocation fails.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
+                           double **host_x, int *host_type, bool &success,
+                           double *host_q, double *boxlo, 
+                           const double delxinv, const double delyinv,
+                           const double delzinv) {
+  if (_precompute_done==false) {
+    atom->acc_timers();
+    _precompute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv,
+                delyinv,delzinv);
+  }
+
+  device->stop_host_timer();
+  
+  if (!success || nlocal==0)
+    return 0;
+    
+  // Time spent blocked on the device is accounted as CPU idle time
+  double t=MPI_Wtime();
+  time_out.sync_stop();
+  _cpu_idle_time+=MPI_Wtime()-t;
+
+  _precompute_done=false;
+
+  if (h_error_flag[0]==2) {
+    // Not enough storage for atoms on the brick
+    _max_brick_atoms*=2;
+    d_error_flag.zero();
+    d_brick_atoms.clear();
+    if (d_brick_atoms.alloc(_atom_stride*_max_brick_atoms,*ucl_device)!=
+        UCL_SUCCESS) {
+      // Do not rerun the kernels against a buffer that does not exist
+      success=false;
+      return 0;
+    }
+    _max_bytes+=d_brick_atoms.row_bytes();
+    return spread(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo, 
+                  delxinv,delyinv,delzinv);
+  }
+  
+  return h_error_flag[0];
+}
+
+// ---------------------------------------------------------------------------
+// Interpolation of grid values back to the atoms
+// ---------------------------------------------------------------------------
+// Upload the host vd_brick to the device, run the interp kernel over the
+// atoms counted by ans->inum(), then queue the answer copy and register the
+// answer object with the device.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+void PPPMGPUMemoryT::interp(const grdtyp qqrd2e_scale) {
+  // Host -> device transfer of the grid data
+  time_in.start();
+  ucl_copy(d_brick,h_vd_brick,true);
+  time_in.stop();
+
+  time_interp.start();
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=block_size();
+  const int GX=static_cast<int>(ceil(static_cast<double>(ans->inum())/BX));
+  int ainum=ans->inum();
+
+  k_interp.set_size(GX,BX);
+  k_interp.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
+               &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx,
+               &_brick_x, &_brick_y, &_brick_z, &_delxinv, &_delyinv, &_delzinv,
+               &_order, &_order2, &qqrd2e_scale, &ans->dev_ans.begin());
+
+  time_interp.stop();
+
+  ans->copy_answers(false,false,false,false);
+  device->add_ans_object(ans);
+}
+
+
+// Host memory estimate: the atom storage's own figure plus the size of this
+// object. Host buffers such as h_brick and h_vd_brick are not included in
+// the sum.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+double PPPMGPUMemoryT::host_memory_usage() const {
+  return device->atom.host_memory_usage()+
+         sizeof(PPPMGPUMemory<numtyp,acctyp,grdtyp,grdtyp4>);
+}
+
+// Compile the PPPM program for dev and look up its kernels and textures.
+// Does nothing if already compiled, or if grdtyp is double and dev does not
+// support double precision.
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
+  if (_compiled)
+    return;
+
+  // Query the device we are asked to compile for rather than the ucl_device
+  // member, so the check does not depend on init() having set that member
+  if (sizeof(grdtyp)==sizeof(double) && dev.double_precision()==false)
+    return;
+
+  std::string flags="-cl-fast-relaxed-math -cl-mad-enable "+
+                    std::string(OCL_PRECISION_COMPILE);
+  #ifdef USE_OPENCL
+  // The OpenCL source is shared between precisions; the grid types are
+  // supplied on the compiler command line
+  flags+=std::string(" -D grdtyp=")+ucl_template_name<grdtyp>()+" -D grdtyp4="+
+         ucl_template_name<grdtyp>()+"4";
+  #endif
+
+  pppm_program=new UCL_Program(dev);
+  
+  #ifdef USE_OPENCL
+  pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
+  #else
+  // Separate programs exist for float and double grids
+  if (sizeof(grdtyp)==sizeof(float))
+    pppm_program->load_string(pppm_f_gpu_kernel,flags.c_str());
+  else
+    pppm_program->load_string(pppm_d_gpu_kernel,flags.c_str());
+  #endif
+
+  k_particle_map.set_function(*pppm_program,"particle_map");
+  k_make_rho.set_function(*pppm_program,"make_rho");
+  k_interp.set_function(*pppm_program,"interp");
+  pos_tex.get_texture(*pppm_program,"pos_tex");
+  q_tex.get_texture(*pppm_program,"q_tex");
+
+  _compiled=true;
+}
+
+template class PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4>;
+template class PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4>;
+
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@ -0,0 +1,195 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#ifndef PPPM_GPU_MEMORY_H
+#define PPPM_GPU_MEMORY_H
+
+#include "mpi.h"
+#include "pair_gpu_device.h"
+
+#ifdef USE_OPENCL
+#include "geryon/ocl_texture.h"
+#else
+#include "geryon/nvd_texture.h"
+#endif
+
+template <class numtyp, class acctyp> class PairGPUDevice;
+
+template <class numtyp, class acctyp, class grdtyp, class grdtyp4>
+class PPPMGPUMemory {  // storage, kernels, textures and timers for the PPPM GPU routines
+ public:
+  PPPMGPUMemory();
+  virtual ~PPPMGPUMemory();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param success is set to one of:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -2 if GPU could not be found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 if double precision is not supported on card **/
+  grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
+                const int nxlo_out, const int nylo_out, const int nzlo_out,
+                const int nxhi_out, const int nyhi_out, const int nzhi_out,
+                double **rho_coeff, grdtyp **vd_brick, 
+                const double slab_volfactor, const int nx_pppm, 
+                const int ny_pppm, const int nz_pppm, int &success);  // NOTE(review): returned pointer is presumably the host grid brick - confirm
+
+  /// Check if there is enough storage for atom arrays and realloc if not
+  /** \param inum size requested for the answer storage (ans->resize)
+    * \param nall size requested for the atom storage (atom->resize)
+    * \param success set to false if insufficient memory **/
+  inline void resize_atom(const int inum, const int nall, bool &success) {
+    if (atom->resize(nall, success)) {  // textures are re-bound only when resize() returns true
+      pos_tex.bind_float(atom->dev_x,4);
+      q_tex.bind_float(atom->dev_q,1);
+    }
+    ans->resize(inum,success);
+  }
+
+  /// Storage check for local atoms; the body is currently empty (no-op)
+  inline void resize_local(const int inum, bool &success) {
+  }
+  
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear(const double cpu_time);
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom() const;
+
+  /// Host memory used: size of this object plus the device atom storage
+  double host_memory_usage() const;
+
+  /// Accumulate timers (skipped unless device->time_device() is true)
+  inline void acc_timers() {
+    if (device->time_device()) {
+      ans->acc_timers();
+      time_in.add_to_total();
+      time_out.add_to_total();
+      time_map.add_to_total();
+      time_rho.add_to_total();
+      time_interp.add_to_total();
+    }
+  }
+
+  /// Zero the atom, answer and the five local timers
+  inline void zero_timers() {
+    atom->zero_timers();
+    ans->zero_timers();
+    time_in.zero();
+    time_out.zero();
+    time_map.zero();
+    time_rho.zero();
+    time_interp.zero();
+  }
+
+  /// Precomputations for charge assignment that can be done asynchronously
+  inline void precompute(const int ago, const int nlocal, const int nall,
+                         double **host_x, int *host_type, bool &success,
+                         double *charge, double *boxlo, double *prd) {
+    double delxinv=_nx_pppm/prd[0];  // presumably grid points per unit box length in x
+    double delyinv=_ny_pppm/prd[1];  // same for y
+    double delzinv=_nz_pppm/(prd[2]*_slab_volfactor);  // z length is scaled by the slab factor
+    _precompute(ago,nlocal,nall,host_x,host_type,success,charge,boxlo,delxinv,
+                delyinv,delzinv);
+  }
+
+  /// Returns non-zero if there are out of bounds atoms
+  int spread(const int ago, const int nlocal, const int nall, double **host_x,
+             int *host_type, bool &success, double *charge, double *boxlo,
+             const double delxinv, const double delyinv, const double delzinv);
+  /// Interpolation step taking the scale factor qqrd2e_scale (defined in the .cpp)
+  void interp(const grdtyp qqrd2e_scale);
+
+  // -------------------------- DEVICE DATA ------------------------- 
+
+  /// Device Properties and Atom and Neighbor storage
+  PairGPUDevice<numtyp,acctyp> *device;
+
+  /// Geryon device
+  UCL_Device *ucl_device;
+
+  /// Device Timers
+  UCL_Timer time_in, time_out, time_map, time_rho, time_interp;
+
+  /// LAMMPS pointer for screen output
+  FILE *screen;
+
+  // --------------------------- ATOM DATA --------------------------
+
+  /// Atom Data (dev_x and dev_q are bound to the textures in resize_atom())
+  PairGPUAtom<numtyp,acctyp> *atom;
+
+
+  // --------------------------- GRID DATA --------------------------
+  // Grid bricks of grdtyp: UCL_H_Vec (h_*) and UCL_D_Vec (d_*), presumably host/device
+  UCL_H_Vec<grdtyp> h_brick, h_vd_brick;
+  UCL_D_Vec<grdtyp> d_brick;
+  
+  // Count of number of atoms assigned to each grid point
+  UCL_D_Vec<int> d_brick_counts;
+  // Atoms assigned to each grid point
+  UCL_D_Vec<grdtyp4> d_brick_atoms;
+  
+  // Error checking for out of bounds atoms
+  UCL_D_Vec<int> d_error_flag;
+  UCL_H_Vec<int> h_error_flag;
+  
+  // Number of grid points in brick (including ghost)
+  int _npts_x, _npts_y, _npts_z, _npts_yx;
+  
+  // Number of local grid points in brick
+  int _nlocal_x, _nlocal_y, _nlocal_z, _nlocal_yx, _atom_stride;
+  
+  // -------------------------- SPLINE DATA -------------------------
+  UCL_D_Vec<grdtyp> d_rho_coeff;
+  int _order, _nlower, _nupper, _order_m_1, _order2;
+  int _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out;
+
+  // ------------------------ FORCE/ENERGY DATA -----------------------
+  /// Answer storage; resized together with the atom data in resize_atom()
+  PairGPUAns<numtyp,acctyp> *ans;
+
+  // ------------------------- DEVICE KERNELS -------------------------
+  UCL_Program *pppm_program;
+  UCL_Kernel k_particle_map, k_make_rho, k_interp;
+  inline int block_size() { return _block_size; }  // accessor for the protected _block_size
+
+  // --------------------------- TEXTURES -----------------------------
+  UCL_Texture pos_tex;  // bound to atom->dev_x via bind_float(...,4)
+  UCL_Texture q_tex;  // bound to atom->dev_q via bind_float(...,1)
+
+ protected:
+  bool _allocated, _compiled, _precompute_done;  // _compiled is set at the end of compile_kernels()
+  int _block_size, _block_pencils, _pencil_size, _max_brick_atoms, _max_atoms;
+  double  _max_bytes, _max_an_bytes;
+  double _cpu_idle_time;
+  
+  grdtyp _brick_x, _brick_y, _brick_z, _delxinv, _delyinv, _delzinv; // grid-typed scalars handed to the kernels
+
+  double _slab_volfactor;  // multiplies prd[2] in precompute()
+  int _nx_pppm, _ny_pppm, _nz_pppm;  // numerators of the inverse spacings in precompute()
+  /// Build pppm_program and look up kernels/textures; returns early if _compiled
+  void compile_kernels(UCL_Device &dev);
+  void _precompute(const int ago, const int nlocal, const int nall,
+                   double **host_x, int *host_type, bool &success,
+                   double *charge, double *boxlo, const double delxinv,
+                   const double delyinv, const double delzinv);  // worker behind precompute(); takes ready-made inverse spacings
+};
+
+#endif
+
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@ -0,0 +1,164 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under 
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+ 
+/* ----------------------------------------------------------------------
+   Contributing authors: Mike Brown (ORNL), brownw@ornl.gov
+------------------------------------------------------------------------- */
+
+#include <iostream>
+#include <cassert>
+#include <math.h>
+
+#include "pppm_gpu_memory.h"
+
+using namespace std;
+
+static PPPMGPUMemory<PRECISION,ACC_PRECISION,float,_lgpu_float4> PPPMF;  // file-local instance with a float grid
+static PPPMGPUMemory<PRECISION,ACC_PRECISION,double,_lgpu_double4> PPPMD;  // file-local instance with a double grid
+
+// ---------------------------------------------------------------------------
+// Shared initialization driver for the float and double grid wrappers.
+// World rank 0 runs pppm.init() first; after a world barrier the remaining
+// ranks run it one gpu_rank at a time, separated by gpu barriers.
+// Progress messages go to screen on replica 0 when screen is non-NULL.
+// Returns the pointer produced by pppm.init() on this rank; success carries
+// the status code written by pppm.init() (it is reset to 0 beforehand).
+// ---------------------------------------------------------------------------
+template <class grdtyp, class memtyp>
+grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
+                       FILE *screen, const int order, const int nxlo_out, 
+                       const int nylo_out, const int nzlo_out,
+                       const int nxhi_out, const int nyhi_out,
+                       const int nzhi_out, double **rho_coeff,
+                       grdtyp **vd_brick, const double slab_volfactor,
+                       const int nx_pppm, const int ny_pppm, const int nz_pppm,
+                       int &success) {
+  pppm.clear(0.0);
+
+  const int gpu_lo=pppm.device->first_device();
+  const int gpu_hi=pppm.device->last_device();
+  const int my_world_rank=pppm.device->world_me();
+  const int my_gpu_rank=pppm.device->gpu_rank();
+  const int ranks_per_gpu=pppm.device->procs_per_gpu();
+
+  pppm.device->init_message(screen,"pppm",gpu_lo,gpu_hi);
+
+  // Only replica 0 reports progress, and only if there is somewhere to print
+  const bool verbose=(pppm.device->replica_me()==0 && screen);
+
+  if (verbose) {
+    fprintf(screen,"Initializing GPU and compiling on process 0...");
+    fflush(screen);
+  }
+
+  // First pass: world rank 0 initializes on its own
+  success=0;
+  grdtyp *brick=NULL;
+  if (my_world_rank==0)
+    brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
+                    nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
+                    slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
+
+  pppm.device->world_barrier();
+  if (verbose)
+    fprintf(screen,"Done.\n");
+
+  // Second pass: every other rank takes its turn, ordered by gpu_rank
+  const bool single_gpu=(gpu_hi==gpu_lo);
+  for (int turn=0; turn<ranks_per_gpu; ++turn) {
+    if (verbose) {
+      if (single_gpu)
+        fprintf(screen,"Initializing GPU %d on core %d...",gpu_lo,turn);
+      else
+        fprintf(screen,"Initializing GPUs %d-%d on core %d...",gpu_lo,
+                gpu_hi,turn);
+      fflush(screen);
+    }
+
+    if (my_world_rank!=0 && my_gpu_rank==turn)
+      brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out,
+                      nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,
+                      slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success);
+
+    pppm.device->gpu_barrier();
+    if (verbose)
+      fprintf(screen,"Done.\n");
+  }
+
+  if (verbose)
+    fprintf(screen,"\n");
+  return brick;
+}
+
+/// Float grid entry point: initialize PPPMF for a new run
+/** Forwards all arguments to pppm_gpu_init() with the PPPMF instance, then
+  * registers PPPMF with its device through set_single_precompute().
+  * \param success receives the status code written during initialization
+  * \return the grid pointer produced by pppm_gpu_init() **/
+float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
+                        const int order, const int nxlo_out, 
+                        const int nylo_out, const int nzlo_out,
+                        const int nxhi_out, const int nyhi_out,
+                        const int nzhi_out, double **rho_coeff,
+                        float **vd_brick, const double slab_volfactor,
+                        const int nx_pppm, const int ny_pppm, const int nz_pppm,
+                        int &success) {
+  float *host_brick=pppm_gpu_init(PPPMF,nlocal,nall,screen,order,
+                                  nxlo_out,nylo_out,nzlo_out,
+                                  nxhi_out,nyhi_out,nzhi_out,
+                                  rho_coeff,vd_brick,slab_volfactor,
+                                  nx_pppm,ny_pppm,nz_pppm,success);
+
+  PPPMF.device->set_single_precompute(&PPPMF);
+  return host_brick;
+}
+
+/// Float grid entry point: forward cpu_time to PPPMF.clear()
+void pppm_gpu_clear_f(const double cpu_time) {
+  PPPMF.clear(cpu_time);
+}
+
+/// Float grid entry point: forward all arguments to PPPMF.spread()
+/** \return the value returned by PPPMF.spread() **/
+int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
+                     double **host_x, int *host_type, bool &success,
+                     double *host_q, double *boxlo, const double delxinv,
+                     const double delyinv, const double delzinv) {
+  const int spread_status=PPPMF.spread(ago,nlocal,nall,host_x,host_type,
+                                       success,host_q,boxlo,
+                                       delxinv,delyinv,delzinv);
+  return spread_status;
+}
+
+/// Float grid entry point: forward qqrd2e_scale to PPPMF.interp()
+void pppm_gpu_interp_f(const float qqrd2e_scale) {
+  PPPMF.interp(qqrd2e_scale);
+}
+
+/// Float grid entry point: host memory usage reported by PPPMF
+double pppm_gpu_bytes_f() {
+  const double host_bytes=PPPMF.host_memory_usage();
+  return host_bytes;
+}
+
+/// Double grid entry point: initialize PPPMD for a new run
+/** Forwards all arguments to pppm_gpu_init() with the PPPMD instance, then
+  * registers PPPMD with its device through set_double_precompute().
+  * \param success receives the status code written during initialization
+  * \return the grid pointer produced by pppm_gpu_init() **/
+double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
+                         const int order, const int nxlo_out, 
+                         const int nylo_out, const int nzlo_out,
+                         const int nxhi_out, const int nyhi_out,
+                         const int nzhi_out, double **rho_coeff,
+                         double **vd_brick, const double slab_volfactor,
+                         const int nx_pppm, const int ny_pppm,
+                         const int nz_pppm, int &success) {
+  double *b=pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out,
+                          nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
+                          vd_brick,slab_volfactor,nx_pppm,ny_pppm,nz_pppm,
+                          success);
+  // Register through PPPMD's own device pointer (mirroring the float wrapper,
+  // which uses PPPMF.device) so the double path never depends on the state
+  // of the single precision instance.
+  PPPMD.device->set_double_precompute(&PPPMD);
+  return b;
+}
+
+/// Double grid entry point: forward cpu_time to PPPMD.clear()
+void pppm_gpu_clear_d(const double cpu_time) {
+  PPPMD.clear(cpu_time);
+}
+
+/// Double grid entry point: forward all arguments to PPPMD.spread()
+/** \return the value returned by PPPMD.spread() **/
+int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall,
+                      double **host_x, int *host_type, bool &success,
+                      double *host_q, double *boxlo, const double delxinv,
+                      const double delyinv, const double delzinv) {
+  const int spread_status=PPPMD.spread(ago,nlocal,nall,host_x,host_type,
+                                       success,host_q,boxlo,
+                                       delxinv,delyinv,delzinv);
+  return spread_status;
+}
+
+/// Double grid entry point: forward qqrd2e_scale to PPPMD.interp()
+void pppm_gpu_interp_d(const double qqrd2e_scale) {
+  PPPMD.interp(qqrd2e_scale);
+}
+
+/// Double grid entry point: host memory usage reported by PPPMD
+double pppm_gpu_bytes_d() {
+  const double host_bytes=PPPMD.host_memory_usage();
+  return host_bytes;
+}
+