Allow Kokkos remap to turn off Cuda-aware MPI

This commit is contained in:
Stan Moore 2020-05-01 12:09:32 -06:00
parent 5c2f0ecc65
commit f8226508f4
5 changed files with 53 additions and 19 deletions

View File

@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective) :
int scaled, int permute, int *nbuf, int usecollective,
int usecuda_aware) :
Pointers(lmp)
{
int nthreads = lmp->kokkos->nthreads;
@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
scaled,permute,nbuf,usecollective,nthreads);
scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
}
@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
2 = permute twice = slow->fast, fast->mid, mid->slow
nbuf returns size of internal storage buffers used by FFT
usecollective use collective MPI operations for remapping data
usecuda_aware use CUDA-Aware MPI or not
------------------------------------------------------------------------- */
template<class DeviceType>
@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int scaled, int permute, int *nbuf, int usecollective,
int nthreads)
int nthreads, int usecuda_aware)
{
struct fft_plan_3d_kokkos<DeviceType> *plan;
int me,nprocs;
@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
plan->pre_plan =
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
first_ilo,first_ihi,first_jlo,first_jhi,
first_klo,first_khi,2,0,0,FFT_PRECISION,0);
first_klo,first_khi,2,0,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->pre_plan == NULL) return NULL;
}
@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
first_klo,first_khi,
second_ilo,second_ihi,second_jlo,second_jhi,
second_klo,second_khi,2,1,0,FFT_PRECISION,
usecollective);
usecollective,usecuda_aware);
if (plan->mid1_plan == NULL) return NULL;
// 1d FFTs along mid axis
@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
second_jlo,second_jhi,second_klo,second_khi,
second_ilo,second_ihi,
third_jlo,third_jhi,third_klo,third_khi,
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
third_ilo,third_ihi,2,1,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->mid2_plan == NULL) return NULL;
// 1d FFTs along slow axis
@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
third_klo,third_khi,third_ilo,third_ihi,
third_jlo,third_jhi,
out_klo,out_khi,out_ilo,out_ihi,
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
usecollective,usecuda_aware);
if (plan->post_plan == NULL) return NULL;
}

View File

@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {
FFT3dKokkos(class LAMMPS *, MPI_Comm,
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
int,int,int *,int);
int,int,int *,int,int);
~FFT3dKokkos();
void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
int, int, int, int, int,
int, int, int, int, int, int, int,
int, int, int *, int, int);
int, int, int *, int, int, int);
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);

View File

@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
// 2nd FFT returns data in 3d brick decomposition
// remap takes data from 3d brick to FFT decomposition
int collective_flag = 0; // not yet supported in Kokkos version
int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
int tmp;
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
0,0,&tmp,collective_flag);
0,0,&tmp,collective_flag,cuda_aware_flag);
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
0,0,&tmp,collective_flag);
0,0,&tmp,collective_flag,cuda_aware_flag);
remap = new RemapKokkos<DeviceType>(lmp,world,
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
1,0,0,FFT_PRECISION,collective_flag);
1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);
// create ghost grid object for rho and electric field communication

View File

@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory,
int precision, int usecollective) : Pointers(lmp)
int precision, int usecollective,
int usecuda_aware) : Pointers(lmp)
{
plan = remap_3d_create_plan_kokkos(comm,
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
nqty,permute,memory,precision,usecollective);
nqty,permute,memory,precision,usecollective,
usecuda_aware);
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
}
@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
// post all recvs into scratch space
FFT_SCALAR* v_scratch = d_scratch.data();
if (!plan->usecuda_aware) {
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
v_scratch = plan->h_scratch.data();
}
for (irecv = 0; irecv < plan->nrecv; irecv++) {
FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv];
FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
MPI_Irecv(scratch,plan->recv_size[irecv],
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
plan->comm,&plan->request[irecv]);
}
FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
if (!plan->usecuda_aware) {
plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
v_sendbuf = plan->h_sendbuf.data();
}
// send all messages to other procs
for (isend = 0; isend < plan->nsend; isend++) {
int in_offset = plan->send_offset[isend];
plan->pack(d_in,in_offset,
plan->d_sendbuf,0,&plan->packplan[isend]);
MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,
if (!plan->usecuda_aware)
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
plan->send_proc[isend],0,plan->comm);
}
@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
int scratch_offset = plan->recv_bufloc[irecv];
int out_offset = plan->recv_offset[irecv];
if (!plan->usecuda_aware)
Kokkos::deep_copy(d_scratch,plan->h_scratch);
plan->unpack(d_scratch,scratch_offset,
d_out,out_offset,&plan->unpackplan[irecv]);
}
@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
1 = single precision (4 bytes per datum)
2 = double precision (8 bytes per datum)
usecollective whether to use collective MPI or point-to-point
usecuda_aware whether to use CUDA-Aware MPI or not
------------------------------------------------------------------------- */
template<class DeviceType>
@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
int in_klo, int in_khi,
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
int out_klo, int out_khi,
int nqty, int permute, int memory, int precision, int usecollective)
int nqty, int permute, int memory, int precision,
int usecollective, int usecuda_aware)
{
struct remap_plan_3d_kokkos<DeviceType> *plan;
@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
plan = new struct remap_plan_3d_kokkos<DeviceType>;
if (plan == NULL) return NULL;
plan->usecollective = usecollective;
plan->usecuda_aware = usecuda_aware;
// store parameters in local data structs

View File

@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
typedef DeviceType device_type;
typedef FFTArrayTypes<DeviceType> FFT_AT;
typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
FFT_HAT::t_FFT_SCALAR_1d h_sendbuf; // host buffer for MPI sends
typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
FFT_HAT::t_FFT_SCALAR_1d h_scratch; // host scratch buffer for MPI recvs
void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
// which pack function to use
void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
int usecollective; // use collective or point-to-point MPI
int commringlen; // length of commringlist
int *commringlist; // ranks on communication ring of this plan
int usecuda_aware; // use CUDA-Aware MPI or not
};
template<class DeviceType>
@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
typedef FFTArrayTypes<DeviceType> FFT_AT;
RemapKokkos(class LAMMPS *);
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
int,int,int,int,int,int,int,int,int,int,int);
int,int,int,int,int,int,int,int,int,int,int,int);
~RemapKokkos();
void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
int, int, int, int, int, int,
int, int, int, int, int, int,
int, int, int, int, int);
int, int, int, int, int, int);
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
};