diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp index fc88136e8c..04dd343af6 100644 --- a/src/KOKKOS/fft3d_kokkos.cpp +++ b/src/KOKKOS/fft3d_kokkos.cpp @@ -38,7 +38,8 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int scaled, int permute, int *nbuf, int usecollective) : + int scaled, int permute, int *nbuf, int usecollective, + int usecuda_aware) : Pointers(lmp) { int nthreads = lmp->kokkos->nthreads; @@ -70,7 +71,7 @@ FFT3dKokkos::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - scaled,permute,nbuf,usecollective,nthreads); + scaled,permute,nbuf,usecollective,nthreads,usecuda_aware); if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); } @@ -368,6 +369,7 @@ void FFT3dKokkos::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in, 2 = permute twice = slow->fast, fast->mid, mid->slow nbuf returns size of internal storage buffers used by FFT usecollective use collective MPI operations for remapping data + usecuda_aware use CUDA-Aware MPI or not ------------------------------------------------------------------------- */ template @@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int scaled, int permute, int *nbuf, int usecollective, - int nthreads) + int nthreads, int usecuda_aware) { struct fft_plan_3d_kokkos *plan; int me,nprocs; @@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl plan->pre_plan = remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, first_ilo,first_ihi,first_jlo,first_jhi, - first_klo,first_khi,2,0,0,FFT_PRECISION,0); + first_klo,first_khi,2,0,0,FFT_PRECISION, + usecollective,usecuda_aware); if (plan->pre_plan == NULL) return NULL; } @@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl first_klo,first_khi, second_ilo,second_ihi,second_jlo,second_jhi, second_klo,second_khi,2,1,0,FFT_PRECISION, - usecollective); + usecollective,usecuda_aware); if (plan->mid1_plan == NULL) return NULL; // 1d FFTs along mid axis @@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl second_jlo,second_jhi,second_klo,second_khi, second_ilo,second_ihi, third_jlo,third_jhi,third_klo,third_khi, - third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); + third_ilo,third_ihi,2,1,0,FFT_PRECISION, + usecollective,usecuda_aware); if (plan->mid2_plan == NULL) return NULL; // 1d FFTs along slow axis @@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos* FFT3dKokkos::fft_3d_create_pl third_klo,third_khi,third_ilo,third_ihi, third_jlo,third_jhi, out_klo,out_khi,out_ilo,out_ihi, - out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); + out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION, + usecollective,usecuda_aware); if (plan->post_plan == NULL) return NULL; } diff --git a/src/KOKKOS/fft3d_kokkos.h b/src/KOKKOS/fft3d_kokkos.h index ffdd2a38ce..192d2cb7da 100644 --- a/src/KOKKOS/fft3d_kokkos.h +++ b/src/KOKKOS/fft3d_kokkos.h @@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers { FFT3dKokkos(class LAMMPS *, MPI_Comm, int,int,int,int,int,int,int,int,int,int,int,int,int,int,int, - int,int,int *,int); + int,int,int *,int,int); ~FFT3dKokkos(); void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int); void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int); @@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers { struct fft_plan_3d_kokkos *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int *, int, int); + int, int, int *, int, int, int); void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos *); diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index 0940bda7ac..68a72c9a53 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -840,21 +840,23 @@ void PPPMKokkos::allocate() // 2nd FFT returns data in 3d brick decomposition // remap takes data from 3d brick to FFT decomposition + int collective_flag = 0; // not yet supported in Kokkos version + int cuda_aware_flag = lmp->kokkos->cuda_aware_flag; int tmp; fft1 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,cuda_aware_flag); fft2 = new FFT3dKokkos(lmp,world,nx_pppm,ny_pppm,nz_pppm, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp,collective_flag); + 0,0,&tmp,collective_flag,cuda_aware_flag); remap = new RemapKokkos(lmp,world, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,FFT_PRECISION,collective_flag); + 1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag); // create ghost grid object for rho and electric field communication diff --git a/src/KOKKOS/remap_kokkos.cpp b/src/KOKKOS/remap_kokkos.cpp index e94a4ddd9f..604b438a89 100644 --- a/src/KOKKOS/remap_kokkos.cpp +++ b/src/KOKKOS/remap_kokkos.cpp @@ -38,12 +38,14 @@ RemapKokkos::RemapKokkos(LAMMPS *lmp, MPI_Comm comm, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, int nqty, int permute, int memory, - int precision, int usecollective) : Pointers(lmp) + int precision, int usecollective, + int usecuda_aware) : Pointers(lmp) { plan = remap_3d_create_plan_kokkos(comm, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, - nqty,permute,memory,precision,usecollective); + nqty,permute,memory,precision,usecollective, + usecuda_aware); if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan"); } @@ -119,20 +121,36 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d // post all recvs into scratch space + FFT_SCALAR* v_scratch = d_scratch.data(); + if (!plan->usecuda_aware) { + plan->h_scratch = Kokkos::create_mirror_view(d_scratch); + v_scratch = plan->h_scratch.data(); + } + for (irecv = 0; irecv < plan->nrecv; irecv++) { - FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv]; + FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv]; MPI_Irecv(scratch,plan->recv_size[irecv], MPI_FFT_SCALAR,plan->recv_proc[irecv],0, plan->comm,&plan->request[irecv]); } + FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data(); + if (!plan->usecuda_aware) { + plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf); + v_sendbuf = plan->h_sendbuf.data(); + } + // send all messages to other procs for (isend = 0; isend < plan->nsend; isend++) { int in_offset = plan->send_offset[isend]; plan->pack(d_in,in_offset, plan->d_sendbuf,0,&plan->packplan[isend]); - MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR, + + if (!plan->usecuda_aware) + Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf); + + MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR, plan->send_proc[isend],0,plan->comm); } @@ -161,6 +179,9 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d int scratch_offset = plan->recv_bufloc[irecv]; int out_offset = plan->recv_offset[irecv]; + if (!plan->usecuda_aware) + Kokkos::deep_copy(d_scratch,plan->h_scratch); + plan->unpack(d_scratch,scratch_offset, d_out,out_offset,&plan->unpackplan[irecv]); } @@ -189,6 +210,7 @@ void RemapKokkos::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d 1 = single precision (4 bytes per datum) 2 = double precision (8 bytes per datum) usecollective whether to use collective MPI or point-to-point + usecuda_aware whether to use CUDA-Aware MPI or not ------------------------------------------------------------------------- */ template @@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat int in_klo, int in_khi, int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_klo, int out_khi, - int nqty, int permute, int memory, int precision, int usecollective) + int nqty, int permute, int memory, int precision, + int usecollective, int usecuda_aware) { struct remap_plan_3d_kokkos *plan; @@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos* RemapKokkos::remap_3d_creat plan = new struct remap_plan_3d_kokkos; if (plan == NULL) return NULL; plan->usecollective = usecollective; + plan->usecuda_aware = usecuda_aware; // store parameters in local data structs diff --git a/src/KOKKOS/remap_kokkos.h b/src/KOKKOS/remap_kokkos.h index 9f1334be26..d6e4ab828e 100644 --- a/src/KOKKOS/remap_kokkos.h +++ b/src/KOKKOS/remap_kokkos.h @@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos { typedef DeviceType device_type; typedef FFTArrayTypes FFT_AT; typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends + FFT_HAT::t_FFT_SCALAR_1d h_sendbuf; // host buffer for MPI sends typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs + FFT_HAT::t_FFT_SCALAR_1d h_scratch; // host scratch buffer for MPI recvs void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); // which pack function to use void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); @@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos { int usecollective; // use collective or point-to-point MPI int commringlen; // length of commringlist int *commringlist; // ranks on communication ring of this plan + int usecuda_aware; // use CUDA-Aware MPI or not }; template @@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers { typedef FFTArrayTypes FFT_AT; RemapKokkos(class LAMMPS *); RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, - int,int,int,int,int,int,int,int,int,int,int); + int,int,int,int,int,int,int,int,int,int,int,int); ~RemapKokkos(); void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d); @@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers { struct remap_plan_3d_kokkos *remap_3d_create_plan_kokkos(MPI_Comm, int, int, int, int, int, int, int, int, int, int, int, int, - int, int, int, int, int); + int, int, int, int, int, int); void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos *); };