forked from lijiext/lammps
Allow Kokkos remap to turn off Cuda-aware MPI
This commit is contained in:
parent
5c2f0ecc65
commit
f8226508f4
|
@ -38,7 +38,8 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
|
|||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int scaled, int permute, int *nbuf, int usecollective) :
|
||||
int scaled, int permute, int *nbuf, int usecollective,
|
||||
int usecuda_aware) :
|
||||
Pointers(lmp)
|
||||
{
|
||||
int nthreads = lmp->kokkos->nthreads;
|
||||
|
@ -70,7 +71,7 @@ FFT3dKokkos<DeviceType>::FFT3dKokkos(LAMMPS *lmp, MPI_Comm comm, int nfast, int
|
|||
plan = fft_3d_create_plan_kokkos(comm,nfast,nmid,nslow,
|
||||
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
|
||||
scaled,permute,nbuf,usecollective,nthreads);
|
||||
scaled,permute,nbuf,usecollective,nthreads,usecuda_aware);
|
||||
if (plan == NULL) error->one(FLERR,"Could not create 3d FFT plan");
|
||||
}
|
||||
|
||||
|
@ -368,6 +369,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
|
|||
2 = permute twice = slow->fast, fast->mid, mid->slow
|
||||
nbuf returns size of internal storage buffers used by FFT
|
||||
usecollective use collective MPI operations for remapping data
|
||||
usecuda_aware use CUDA-Aware MPI or not
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -378,7 +380,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
|
|||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int scaled, int permute, int *nbuf, int usecollective,
|
||||
int nthreads)
|
||||
int nthreads, int usecuda_aware)
|
||||
{
|
||||
struct fft_plan_3d_kokkos<DeviceType> *plan;
|
||||
int me,nprocs;
|
||||
|
@ -435,7 +437,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
|
|||
plan->pre_plan =
|
||||
remapKK->remap_3d_create_plan_kokkos(comm,in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
first_ilo,first_ihi,first_jlo,first_jhi,
|
||||
first_klo,first_khi,2,0,0,FFT_PRECISION,0);
|
||||
first_klo,first_khi,2,0,0,FFT_PRECISION,
|
||||
usecollective,usecuda_aware);
|
||||
if (plan->pre_plan == NULL) return NULL;
|
||||
}
|
||||
|
||||
|
@ -460,7 +463,7 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
|
|||
first_klo,first_khi,
|
||||
second_ilo,second_ihi,second_jlo,second_jhi,
|
||||
second_klo,second_khi,2,1,0,FFT_PRECISION,
|
||||
usecollective);
|
||||
usecollective,usecuda_aware);
|
||||
if (plan->mid1_plan == NULL) return NULL;
|
||||
|
||||
// 1d FFTs along mid axis
|
||||
|
@ -500,7 +503,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
|
|||
second_jlo,second_jhi,second_klo,second_khi,
|
||||
second_ilo,second_ihi,
|
||||
third_jlo,third_jhi,third_klo,third_khi,
|
||||
third_ilo,third_ihi,2,1,0,FFT_PRECISION,usecollective);
|
||||
third_ilo,third_ihi,2,1,0,FFT_PRECISION,
|
||||
usecollective,usecuda_aware);
|
||||
if (plan->mid2_plan == NULL) return NULL;
|
||||
|
||||
// 1d FFTs along slow axis
|
||||
|
@ -527,7 +531,8 @@ struct fft_plan_3d_kokkos<DeviceType>* FFT3dKokkos<DeviceType>::fft_3d_create_pl
|
|||
third_klo,third_khi,third_ilo,third_ihi,
|
||||
third_jlo,third_jhi,
|
||||
out_klo,out_khi,out_ilo,out_ihi,
|
||||
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,0);
|
||||
out_jlo,out_jhi,2,(permute+1)%3,0,FFT_PRECISION,
|
||||
usecollective,usecuda_aware);
|
||||
if (plan->post_plan == NULL) return NULL;
|
||||
}
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ class FFT3dKokkos : protected Pointers {
|
|||
|
||||
FFT3dKokkos(class LAMMPS *, MPI_Comm,
|
||||
int,int,int,int,int,int,int,int,int,int,int,int,int,int,int,
|
||||
int,int,int *,int);
|
||||
int,int,int *,int,int);
|
||||
~FFT3dKokkos();
|
||||
void compute(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, int);
|
||||
void timing1d(typename FFT_AT::t_FFT_SCALAR_1d, int, int);
|
||||
|
@ -95,7 +95,7 @@ class FFT3dKokkos : protected Pointers {
|
|||
struct fft_plan_3d_kokkos<DeviceType> *fft_3d_create_plan_kokkos(MPI_Comm, int, int, int,
|
||||
int, int, int, int, int,
|
||||
int, int, int, int, int, int, int,
|
||||
int, int, int *, int, int);
|
||||
int, int, int *, int, int, int);
|
||||
|
||||
void fft_3d_destroy_plan_kokkos(struct fft_plan_3d_kokkos<DeviceType> *);
|
||||
|
||||
|
|
|
@ -840,21 +840,23 @@ void PPPMKokkos<DeviceType>::allocate()
|
|||
// 2nd FFT returns data in 3d brick decomposition
|
||||
// remap takes data from 3d brick to FFT decomposition
|
||||
|
||||
int collective_flag = 0; // not yet supported in Kokkos version
|
||||
int cuda_aware_flag = lmp->kokkos->cuda_aware_flag;
|
||||
int tmp;
|
||||
|
||||
fft1 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
0,0,&tmp,collective_flag);
|
||||
0,0,&tmp,collective_flag,cuda_aware_flag);
|
||||
|
||||
fft2 = new FFT3dKokkos<DeviceType>(lmp,world,nx_pppm,ny_pppm,nz_pppm,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
0,0,&tmp,collective_flag);
|
||||
0,0,&tmp,collective_flag,cuda_aware_flag);
|
||||
remap = new RemapKokkos<DeviceType>(lmp,world,
|
||||
nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
|
||||
nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
|
||||
1,0,0,FFT_PRECISION,collective_flag);
|
||||
1,0,0,FFT_PRECISION,collective_flag,cuda_aware_flag);
|
||||
|
||||
// create ghost grid object for rho and electric field communication
|
||||
|
||||
|
|
|
@ -38,12 +38,14 @@ RemapKokkos<DeviceType>::RemapKokkos(LAMMPS *lmp, MPI_Comm comm,
|
|||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int nqty, int permute, int memory,
|
||||
int precision, int usecollective) : Pointers(lmp)
|
||||
int precision, int usecollective,
|
||||
int usecuda_aware) : Pointers(lmp)
|
||||
{
|
||||
plan = remap_3d_create_plan_kokkos(comm,
|
||||
in_ilo,in_ihi,in_jlo,in_jhi,in_klo,in_khi,
|
||||
out_ilo,out_ihi,out_jlo,out_jhi,out_klo,out_khi,
|
||||
nqty,permute,memory,precision,usecollective);
|
||||
nqty,permute,memory,precision,usecollective,
|
||||
usecuda_aware);
|
||||
if (plan == NULL) error->one(FLERR,"Could not create 3d remap plan");
|
||||
}
|
||||
|
||||
|
@ -119,20 +121,36 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
|
|||
|
||||
// post all recvs into scratch space
|
||||
|
||||
FFT_SCALAR* v_scratch = d_scratch.data();
|
||||
if (!plan->usecuda_aware) {
|
||||
plan->h_scratch = Kokkos::create_mirror_view(d_scratch);
|
||||
v_scratch = plan->h_scratch.data();
|
||||
}
|
||||
|
||||
for (irecv = 0; irecv < plan->nrecv; irecv++) {
|
||||
FFT_SCALAR* scratch = d_scratch.data() + plan->recv_bufloc[irecv];
|
||||
FFT_SCALAR* scratch = v_scratch + plan->recv_bufloc[irecv];
|
||||
MPI_Irecv(scratch,plan->recv_size[irecv],
|
||||
MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
|
||||
plan->comm,&plan->request[irecv]);
|
||||
}
|
||||
|
||||
FFT_SCALAR* v_sendbuf = plan->d_sendbuf.data();
|
||||
if (!plan->usecuda_aware) {
|
||||
plan->h_sendbuf = Kokkos::create_mirror_view(plan->d_sendbuf);
|
||||
v_sendbuf = plan->h_sendbuf.data();
|
||||
}
|
||||
|
||||
// send all messages to other procs
|
||||
|
||||
for (isend = 0; isend < plan->nsend; isend++) {
|
||||
int in_offset = plan->send_offset[isend];
|
||||
plan->pack(d_in,in_offset,
|
||||
plan->d_sendbuf,0,&plan->packplan[isend]);
|
||||
MPI_Send(plan->d_sendbuf.data(),plan->send_size[isend],MPI_FFT_SCALAR,
|
||||
|
||||
if (!plan->usecuda_aware)
|
||||
Kokkos::deep_copy(plan->h_sendbuf,plan->d_sendbuf);
|
||||
|
||||
MPI_Send(v_sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
|
||||
plan->send_proc[isend],0,plan->comm);
|
||||
}
|
||||
|
||||
|
@ -161,6 +179,9 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
|
|||
int scratch_offset = plan->recv_bufloc[irecv];
|
||||
int out_offset = plan->recv_offset[irecv];
|
||||
|
||||
if (!plan->usecuda_aware)
|
||||
Kokkos::deep_copy(d_scratch,plan->h_scratch);
|
||||
|
||||
plan->unpack(d_scratch,scratch_offset,
|
||||
d_out,out_offset,&plan->unpackplan[irecv]);
|
||||
}
|
||||
|
@ -189,6 +210,7 @@ void RemapKokkos<DeviceType>::remap_3d_kokkos(typename FFT_AT::t_FFT_SCALAR_1d d
|
|||
1 = single precision (4 bytes per datum)
|
||||
2 = double precision (8 bytes per datum)
|
||||
usecollective whether to use collective MPI or point-to-point
|
||||
usecuda_aware whether to use CUDA-Aware MPI or not
|
||||
------------------------------------------------------------------------- */
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -198,7 +220,8 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
|
|||
int in_klo, int in_khi,
|
||||
int out_ilo, int out_ihi, int out_jlo, int out_jhi,
|
||||
int out_klo, int out_khi,
|
||||
int nqty, int permute, int memory, int precision, int usecollective)
|
||||
int nqty, int permute, int memory, int precision,
|
||||
int usecollective, int usecuda_aware)
|
||||
{
|
||||
|
||||
struct remap_plan_3d_kokkos<DeviceType> *plan;
|
||||
|
@ -216,6 +239,7 @@ struct remap_plan_3d_kokkos<DeviceType>* RemapKokkos<DeviceType>::remap_3d_creat
|
|||
plan = new struct remap_plan_3d_kokkos<DeviceType>;
|
||||
if (plan == NULL) return NULL;
|
||||
plan->usecollective = usecollective;
|
||||
plan->usecuda_aware = usecuda_aware;
|
||||
|
||||
// store parameters in local data structs
|
||||
|
||||
|
|
|
@ -28,7 +28,9 @@ struct remap_plan_3d_kokkos {
|
|||
typedef DeviceType device_type;
|
||||
typedef FFTArrayTypes<DeviceType> FFT_AT;
|
||||
typename FFT_AT::t_FFT_SCALAR_1d d_sendbuf; // buffer for MPI sends
|
||||
FFT_HAT::t_FFT_SCALAR_1d h_sendbuf; // host buffer for MPI sends
|
||||
typename FFT_AT::t_FFT_SCALAR_1d d_scratch; // scratch buffer for MPI recvs
|
||||
FFT_HAT::t_FFT_SCALAR_1d h_scratch; // host scratch buffer for MPI recvs
|
||||
void (*pack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
|
||||
// which pack function to use
|
||||
void (*unpack)(typename FFT_AT::t_FFT_SCALAR_1d_um, int, typename FFT_AT::t_FFT_SCALAR_1d_um, int, struct pack_plan_3d *);
|
||||
|
@ -51,6 +53,7 @@ struct remap_plan_3d_kokkos {
|
|||
int usecollective; // use collective or point-to-point MPI
|
||||
int commringlen; // length of commringlist
|
||||
int *commringlist; // ranks on communication ring of this plan
|
||||
int usecuda_aware; // use CUDA-Aware MPI or not
|
||||
};
|
||||
|
||||
template<class DeviceType>
|
||||
|
@ -60,7 +63,7 @@ class RemapKokkos : protected Pointers {
|
|||
typedef FFTArrayTypes<DeviceType> FFT_AT;
|
||||
RemapKokkos(class LAMMPS *);
|
||||
RemapKokkos(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
|
||||
int,int,int,int,int,int,int,int,int,int,int);
|
||||
int,int,int,int,int,int,int,int,int,int,int,int);
|
||||
~RemapKokkos();
|
||||
void perform(typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d, typename FFT_AT::t_FFT_SCALAR_1d);
|
||||
|
||||
|
@ -70,7 +73,7 @@ class RemapKokkos : protected Pointers {
|
|||
struct remap_plan_3d_kokkos<DeviceType> *remap_3d_create_plan_kokkos(MPI_Comm,
|
||||
int, int, int, int, int, int,
|
||||
int, int, int, int, int, int,
|
||||
int, int, int, int, int);
|
||||
int, int, int, int, int, int);
|
||||
void remap_3d_destroy_plan_kokkos(struct remap_plan_3d_kokkos<DeviceType> *);
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue