Allow Kokkos remap to turn off Cuda-aware MPI

This commit is contained in:
Stan Moore 2020-05-01 12:09:32 -06:00
parent 5c2f0ecc65
commit f8226508f4
5 changed files with 53 additions and 19 deletions

View File

@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
int in_klo, int in_khi, int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi, int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective) : int scaled, int permute, int *nbuf, int usecollective,
int usecuda_aware) :
Pointers(lmp) Pointers(lmp)
{ {
int nthreads = lmp->kokkos->nthreads; int nthreads = lmp->kokkos->nthreads;
@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow, plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
scaled,permute,nbuf,usecollective,nthreads); scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan"); if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
} }
@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
2 = permute twice = slow->fast, fast->mid, mid->slow 2 = permute twice = slow->fast, fast->mid, mid->slow
nbuf returns size of internal storage buffers used by FFT nbuf returns size of internal storage buffers used by FFT
usecollective use collective MPI operations for remapping data usecollective use collective MPI operations for remapping data
usecuda_aware use CUDA-Aware MPI or not
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi, int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective, int scaled, int permute, int *nbuf, int usecollective,
int nthreads) int nthreads, int usecuda_aware)
{ {
struct fft_plan_3d_kokkos<DeviceType> *plan; struct fft_plan_3d_kokkos<DeviceType> *plan;
int me,nprocs; int me,nprocs;
@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
plan->pre_plan = plan->pre_plan =
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
first_ilo,first_ihi,first_jlo,first_jhi, first_ilo,first_ihi,first_jlo,first_jhi,
first_klo,first_khi,2,0,0,FFT_PRECISION,0); first_klo,first_khi,2,0,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->pre_plan == NULL) return NULL; if (plan->pre_plan == NULL) return NULL;
} }
@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
first_klo,first_khi, first_klo,first_khi,
second_ilo,second_ihi,second_jlo,second_jhi, second_ilo,second_ihi,second_jlo,second_jhi,
second_klo,second_khi,2,1,0,FFT_PRECISION, second_klo,second_khi,2,1,0,FFT_PRECISION,
usecollective); usecollective,usecuda_aware);
if (plan->mid1_plan == NULL) return NULL; if (plan->mid1_plan == NULL) return NULL;
// 1d FFTs along mid axis // 1d FFTs along mid axis
@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
second_jlo,second_jhi,second_klo,second_khi, second_jlo,second_jhi,second_klo,second_khi,
second_ilo,second_ihi, second_ilo,second_ihi,
third_jlo,third_jhi,third_klo,third_khi, third_jlo,third_jhi,third_klo,third_khi,
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective); third_ilo,third_ihi,2,1,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->mid2_plan == NULL) return NULL; if (plan->mid2_plan == NULL) return NULL;
// 1d FFTs along slow axis // 1d FFTs along slow axis
@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
third_klo,third_khi,third_ilo,third_ihi, third_klo,third_khi,third_ilo,third_ihi,
third_jlo,third_jhi, third_jlo,third_jhi,
out_klo,out_khi,out_ilo,out_ihi, out_klo,out_khi,out_ilo,out_ihi,
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0); out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->post_plan == NULL) return NULL; if (plan->post_plan == NULL) return NULL;
} }

View File

@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {
FFT3dKokkos(class LAMMPS *, MPI_Comm, FFT3dKokkos(class LAMMPS *, MPI_Comm,
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int, int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
int,int,int *,int); int,int,int *,int,int);
~FFT3dKokkos(); ~FFT3dKokkos();
void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int); void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int); void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int, struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int *, int, int); int, int, int *, int, int, int);
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *); void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);

View File

@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
// 2nd FFT returns data in 3d brick decomposition // 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition // remap takes data from 3d brick to FFT decomposition
int collective_flag = 0; // not yet supported in Kokkos version
int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
int tmp; int tmp;
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm, fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp,collective_flag); 0,0,&tmp,collective_flag,cuda_aware_flag);
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm, fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp,collective_flag); 0,0,&tmp,collective_flag,cuda_aware_flag);
remap = new RemapKokkos<DeviceType>(lmp,world, remap = new RemapKokkos<DeviceType>(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION,collective_flag); 1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);
// create ghost grid object for rho and electric field communication // create ghost grid object for rho and electric field communication

View File

@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi, int out_klo, int out_khi,
int nqty, int permute, int memory, int nqty, int permute, int memory,
int precision, int usecollective) : Pointers(lmp) int precision, int usecollective,
int usecuda_aware) : Pointers(lmp)
{ {
plan = remap_3d_create_plan_kokkos(comm, plan = remap_3d_create_plan_kokkos(comm,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi, in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi, out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
nqty,permute,memory,precision,usecollective); nqty,permute,memory,precision,usecollective,
usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan"); if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
} }
@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
// post all recvs into scratch space // post all recvs into scratch space
FFT_SCALAR* v_scratch = d_scratch.data();
if (!plan->usecuda_aware) {
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
v_scratch = plan->h_scratch.data();
}
for (irecv = 0; irecv < plan->nrecv; irecv++) { for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv]; FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv], MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0, MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]); plan->comm,&plan->request[irecv]);
} }
FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
if (!plan->usecuda_aware) {
plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
v_sendbuf = plan->h_sendbuf.data();
}
// send all messages to other procs // send all messages to other procs
for (isend = 0; isend < plan->nsend; isend++) { for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend]; int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset, plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]); plan->d_sendbuf,0,&plan->packplan[isend]);
MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,
if (!plan->usecuda_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm); plan->send_proc[isend],0,plan->comm);
} }
@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
int scratch_offset = plan->recv_bufloc[irecv]; int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv]; int out_offset = plan->recv_offset[irecv];
if (!plan->usecuda_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);
plan->unpack(d_scratch,scratch_offset, plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]); d_out,out_offset,&plan->unpackplan[irecv]);
} }
@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
1 = single precision (4 bytes per datum) 1 = single precision (4 bytes per datum)
2 = double precision (8 bytes per datum) 2 = double precision (8 bytes per datum)
usecollective whether to use collective MPI or point-to-point usecollective whether to use collective MPI or point-to-point
usecuda_aware whether to use CUDA-Aware MPI or not
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
int in_klo, int in_khi, int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi, int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi, int out_klo, int out_khi,
int nqty, int permute, int memory, int precision, int usecollective) int nqty, int permute, int memory, int precision,
int usecollective, int usecuda_aware)
{ {
struct remap_plan_3d_kokkos<DeviceType> *plan; struct remap_plan_3d_kokkos<DeviceType> *plan;
@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
plan = new struct remap_plan_3d_kokkos<DeviceType>; plan = new struct remap_plan_3d_kokkos<DeviceType>;
if (plan == NULL) return NULL; if (plan == NULL) return NULL;
plan->usecollective = usecollective; plan->usecollective = usecollective;
plan->usecuda_aware = usecuda_aware;
// store parameters in local data structs // store parameters in local data structs

View File

@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
typedef DeviceType device_type; typedef DeviceType device_type;
typedef FFTArrayTypes<DeviceType> FFT_AT; typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
FFT_HAT::t_FFT_SCALAR_1d h_sendbuf; // host buffer for MPI sends
typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
FFT_HAT::t_FFT_SCALAR_1d h_scratch; // host scratch buffer for MPI recvs
void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which pack function to use // which pack function to use
void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *); void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
int usecollective; // use collective or point-to-point MPI int usecollective; // use collective or point-to-point MPI
int commringlen; // length of commringlist int commringlen; // length of commringlist
int *commringlist; // ranks on communication ring of this plan int *commringlist; // ranks on communication ring of this plan
int usecuda_aware; // use CUDA-Aware MPI or not
}; };
template<class DeviceType> template<class DeviceType>
@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
typedef FFTArrayTypes<DeviceType> FFT_AT; typedef FFTArrayTypes<DeviceType> FFT_AT;
RemapKokkos(class LAMMPS *); RemapKokkos(class LAMMPS *);
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int, RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
int,int,int,int,int,int,int,int,int,int,int); int,int,int,int,int,int,int,int,int,int,int,int);
~RemapKokkos(); ~RemapKokkos();
void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d); void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm, struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int, int, int, int, int, int, int, int,
int, int, int, int, int); int, int, int, int, int, int);
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *); void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
}; };