!
! Copyright (C) 1996-2016 The SIESTA group
!  This file is distributed under the terms of the
!  GNU General Public License: see COPYING in the top directory
!  or http://www.gnu.org/copyleft/gpl.txt.
! See Docs/Contributors.txt for a list of contributors.
!
!******************************************************************************
! MODULE m_fft
! 3-D fast complex Fourier transform
! Written by J.D.Gale (July 1999)
!******************************************************************************
!
!   PUBLIC procedures available from this module:
! fft : 3-D complex FFT. Old version of J.D.Gale
!
!   PUBLIC parameters, types, and variables available from this module:
! none
!
!******************************************************************************
!
!   USED module procedures:
! use sys,          only: die                ! Termination routine
! use alloc,        only: de_alloc           ! De-allocation routine
! use alloc,        only: re_alloc           ! Re-allocation routine
! use parallelsubs, only: HowManyMeshPerNode
! use gpfa_fft,     only: gpfa               ! 1-D FFT routine
! use gpfa_fft,     only: setgpfa=>setgpfa_check ! Sets up gpfa tables
! use m_timer,      only: timer_start   ! Start counting CPU time
! use m_timer,      only: timer_stop    ! Stop counting CPU time
!
!   USED MPI procedures:
! use mpi_siesta
!
!   USED module parameters:
! use precision,    only: dp        ! Real double precision type
! use precision,    only: grid_p    ! Real precision of mesh arrays
! use parallel,     only: Node, Nodes, ProcessorY
! use mesh,         only: nsm
!
!   EXTERNAL procedures used:
! timer : CPU time counter
!
!******************************************************************************

      MODULE m_fft

      use precision,    only : dp, grid_p
      use parallel,     only : Node, Nodes, ProcessorY
      use parallelsubs, only : HowManyMeshPerNode
      use sys,          only : die
      use alloc,        only : re_alloc, de_alloc
      use mesh,         only : nsm
      use gpfa_fft,     only : gpfa        ! 1-D FFT routine
      use gpfa_fft,     only : setgpfa=>setgpfa_check ! Sets gpfa
      use m_timer,      only : timer_start ! Start counting CPU time
      use m_timer,      only : timer_stop  ! Stop counting CPU time
#ifdef MPI
      use mpi_siesta
#endif

      implicit none

      PUBLIC::
     .  fft          ! 3D complex FFT

      PRIVATE      ! Nothing is declared public beyond this point

      CONTAINS

!******************************************************************************

      subroutine fft( f, nMesh, isn )
C
C Parallel FFT based on the GPFA FFT routine of Temperton
C
C On input :
C
C real(grid_p) f()      : contains the data to be Fourier
C                         transformed, stored as interleaved
C                         real/imaginary pairs
C integer      nMesh(3) : contains the global dimensions of the grid
C integer      isn      : direction of the transform; for isn > 0
C                         the result is scaled by 1/(n1*n2*n3)
C
C On exit :
C
C real(grid_p) f()      : contains the Fourier transformed data
C
C Julian Gale, July 1999
C
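C Illustrative use (comment only, not part of the module; the mesh
C dimensions are just an example and are assumed commensurate with
C nsm, with f holding the local grid as interleaved real/imaginary
C pairs):
C
C     nMesh = (/ 24, 24, 36 /)
C     call fft( f, nMesh, -1 )   ! transform, no scaling applied
C     call fft( f, nMesh, +1 )   ! opposite direction, result scaled
C                                ! by 1/(24*24*36)
C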

      implicit none
C
C Passed arguments
C
      real(grid_p)
     .  f(*)
      integer
     .  nMesh(3), isn
C
C Local variables
C
      integer
     .  maxmaxtrigs, ntrigs,
     .  n1, n2, n3, n2l, n3l, CMeshG(3), CMeshL(3), i, n,
     .  nf, nft, ng, nmeshl, ProcessorZ, IOffSet, OldMesh(3)
      integer, save ::
     .  maxtrigs
      logical
     .  LocalPoints, lredimension
      real(dp)
     .  scale
      real(dp), dimension(:,:), pointer, save :: trigs => null()
#ifdef MPI
      integer
     .  n1lf, nrem, Py, Pz
      real(grid_p), dimension(:), pointer :: ft
#endif

      save OldMesh
      data OldMesh/0,0,0/

      call timer( 'fft', 1 )
C
C Set mesh size variables
C
      n1 = nMesh(1)
      n2 = nMesh(2)
      n3 = nMesh(3)
      CMeshG(1) = n1/nsm
      CMeshG(2) = n2/nsm
      CMeshG(3) = n3/nsm
      call HowManyMeshPerNode(CMeshG, Node, Nodes, nmeshl, CMeshL)
      n2l = CMeshL(2)*nsm
      n3l = CMeshL(3)*nsm

      n  = n1*n2l*n3l
      ng = n1*n2*n3
      nf = 2*ng      ! size(f)
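C
C n  = number of locally stored grid points (complete rows along X
C      for the local Y/Z sub-block); ng = number of global grid
C      points.  f holds interleaved (real,imaginary) pairs, so 2*n
C      values are stored locally.
C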

C
C Set logical as to whether there are any locally stored points
C
      LocalPoints = (n.gt.0)

C
C Work out processor grid size
C
      ProcessorZ = Nodes/ProcessorY
      if (ProcessorY*ProcessorZ.ne.Nodes)
     $  call die('ERROR: ProcessorY must be a factor of the' //
     $       ' number of processors!')

C
C Allocate trigs array
C
      if (.not.associated(trigs)) then
        maxtrigs = 256
        nullify( trigs )
        call re_alloc( trigs, 1, maxtrigs, 1, 3, name='trigs',
     &                 routine='fft' )
      endif
C
C Initialise the tables for the FFT if the mesh has changed
C
   10 lredimension = .false.
      maxmaxtrigs = maxtrigs
      do i=1,3
        if (OldMesh(i).ne.nMesh(i)) then
          call setgpfa(trigs(:,i), maxtrigs, ntrigs, nMesh(i))
          if (ntrigs.gt.maxmaxtrigs) then
            lredimension = .true.
            maxmaxtrigs = ntrigs
          else
            OldMesh(i) = nMesh(i)
          endif
        endif
      enddo
      if (lredimension) then
C
C Resize FFT array for trig factors and set OldMesh to 0 to force
C recalculation
C
        maxtrigs = maxmaxtrigs
        call re_alloc( trigs, 1, maxtrigs, 1, 3, name='trigs',
     &                 routine='fft' )
        OldMesh(1:3) = 0
        goto 10
      endif

C
C FFT in X direction
C
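C In the gpfa calling convention assumed here, the arguments after
C the trigs table are: increment within a single transform, increment
C between successive transforms, transform length, number of
C transforms, and the sign of the exponent.  Real and imaginary parts
C are interleaved in f, hence the increments of 2 and 2*n1 below.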
      if (LocalPoints) then
        call gpfa(f(1:nf),f(2:nf),trigs(:,1),2,2*n1,n1,n2l*n3l,-isn)
      endif
#ifdef MPI
C***********************
C  2-D Processor Grid  *
C***********************
      n1lf = n1/ProcessorY
      nrem = n1 - n1lf*ProcessorY
      Py = (Node/ProcessorZ) + 1
      Pz = Node - (Py - 1)*ProcessorZ + 1
      if (Py.le.nrem) n1lf = n1lf + 1
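C
C Example (illustrative only): with 4 nodes and ProcessorY = 2,
C ProcessorZ = 2 and the mapping is
C   Node 0 -> (Py,Pz) = (1,1),  Node 1 -> (1,2),
C   Node 2 -> (Py,Pz) = (2,1),  Node 3 -> (2,2).
C If n1 = 5 then n1lf = 2 and nrem = 1, so processors with Py = 1
C hold 3 values of the X index and those with Py = 2 hold 2.
C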
C
C Allocate local memory
C
      nullify( ft )
      call re_alloc( ft, 1, 2*n1lf*n2*n3l, name='ft', routine='fft' )
C
C Redistribute data to be distributed by X and Z
C
      call redistribXZ(f,n1,n2l,n3l,ft,n1lf,n2,1,nsm,Node,Nodes)
C
C FFT in Y direction
C
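C Each pass of the loop below transforms one local Z-plane of ft,
C which holds n1lf*n2 interleaved complex values, hence the offset
C of 2*n1lf*n2 between consecutive planes.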
      if (LocalPoints) then
        nft = size(ft)
        do i=0,n3l-1
          IOffSet=2*n1lf*n2*i
          call gpfa(ft(IOffSet+1:nft),ft(IOffSet+2:nft),trigs(:,2),
     .              2*n1lf,2,n2,n1lf,-isn)
        enddo
      endif
C
C Redistribute data back to original form
C
      call redistribXZ(f,n1,n2l,n3l,ft,n1lf,n2,-1,nsm,Node,Nodes)
C
C Free local memory ready for re-use
C
      call de_alloc( ft, name='ft', routine='fft' )
C
C Find new distributed x dimension
C
      n1lf = n1/ProcessorZ
      nrem = n1 - n1lf*ProcessorZ
      if (Pz.le.nrem) n1lf = n1lf + 1
C
C Allocate local memory
C
      nullify( ft )   ! AG
      call re_alloc( ft, 1, 2*n1lf*n2l*n3, name='ft', routine='fft' )
C
C Redistribute data to be distributed by X and Y
C
      call redistribXY(f,n1,n2l,n3l,ft,n1lf,n3,1,nsm,Node,Nodes)
C
C FFT in Z direction
C
      if (LocalPoints) then
        nft = size(ft)
        call gpfa(ft(1:nft),ft(2:nft),trigs(:,3),
     $            2*n1lf*n2l,2,n3,n1lf*n2l,-isn)
      endif
C
C Redistribute data to be distributed by Z again
C
      call redistribXY(f,n1,n2l,n3l,ft,n1lf,n3,-1,nsm,Node,Nodes)
C
C Free local memory
C
      call de_alloc( ft, name='ft', routine='fft' )
#else
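C
C Serial case: the whole grid is held locally, so the Y and Z
C transforms are done in place with suitable strides and no
C redistribution of the data is needed.
C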
C
C FFT in Y direction
C
      do i=0,n3-1
        IOffSet=2*n1*n2*i
        call gpfa(f(IOffSet+1:nf),f(IOffSet+2:nf),trigs(:,2),
     .            2*n1,2,n2,n1,-isn)
      enddo
C
C FFT in Z direction
C
      call gpfa(f(1:nf),f(2:nf),trigs(:,3),2*n1*n2,2,n3,n1*n2,-isn)
#endif

C
C Scale values
C
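C The 1/(n1*n2*n3) factor is applied only for isn > 0, so a call
C with isn < 0 followed by one with isn > 0 recovers the original
C data (assuming, as is usual, that gpfa applies no normalisation
C of its own).
C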
      if (LocalPoints.and.isn.gt.0) then
        scale=1.0_dp/dble(ng)
        do i=1,2*n
          f(i)=f(i)*scale
        enddo
      endif

      call timer( 'fft', 2 )

      return
      end subroutine fft

!******************************************************************************

#ifdef MPI
      subroutine redistribXZ(f,n1,n2l,n3l,ft,n1lf,n2,idir,nsm,
     .                       Node,Nodes)
C
C This routine redistributes the data over the Nodes as needed
C for the FFT routines between the arrays f and ft
C
C Array f is distributed in the Y/Z direction while ft is distributed
C in the X/Z direction
C
C idir = direction of redistribution : > 0 => f->ft
C                                      < 0 => ft->f
C
C The processor grid is used to split the communicator according to Z
C
C Julian Gale, July 1999
C
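C
C Layout sketch (descriptive only): for idir > 0, f(2,n1,n2l,n3l)
C holds complete X-rows for the local Y/Z sub-block, while
C ft(2,n1lf,n2,n3l) holds complete Y-columns, with the X index k
C distributed round-robin over the Z-communicator:
C   k = ZNode+1, ZNode+1+ZNodes, ZNode+1+2*ZNodes, ...
C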

      implicit none

C
C Passed arguments
C
      integer
     .  n1, n2, n2l, n1lf, n3l, Node, Nodes, idir, nsm
      real(grid_p)
     .  f(2,n1,n2l,n3l), ft(2,n1lf,n2,n3l)
C
C Local variables
C
      integer
     .  BlockSizeY, BlockSizeYMax, jmin, jmax, jloc, n1lmax, NRemY,
     .  i, j, k, jl, kl, BNode, INode, SNode, jminS, jmaxS

      integer
     .  MPIerror, RequestR, RequestS, Status(MPI_Status_Size)

      integer, save ::
     .  ProcessorZ, Py, Pz, ZCommunicator, ZNode, ZNodes

      logical, save :: firsttimeZ = .true.

      real(grid_p), dimension(:,:,:,:), pointer :: ftmp,ftmp2

      if (firsttimeZ) then
C
C Determine processor grid coordinates
C
        ProcessorZ = Nodes/ProcessorY
        if (ProcessorY*ProcessorZ.ne.Nodes)
     $    call die('ERROR: ProcessorY must be a factor of the' //
     $         ' number of processors!')
        Py = (Node/ProcessorZ) + 1
        Pz = Node - (Py - 1)*ProcessorZ + 1
C
C Group processors into subsets by Z
C
        call MPI_Comm_Split(MPI_Comm_World, Pz, Py, ZCommunicator,
     .                      MPIerror)
        call MPI_Comm_Rank(ZCommunicator,ZNode,MPIerror)
        call MPI_Comm_Size(ZCommunicator,ZNodes,MPIerror)
        firsttimeZ = .false.
      endif

      BlockSizeY = ((n2/nsm)/ProcessorY)*nsm
      NRemY = (n2 - ProcessorY*BlockSizeY)/nsm
      if (NRemY.gt.0) then
        BlockSizeYMax = BlockSizeY + nsm
      else
        BlockSizeYMax = BlockSizeY
      endif
      n1lmax = ((n1-1)/ProcessorY) + 1
C
C Work out local dimensions
C
      jmin = (Py-1)*BlockSizeY + nsm*min(Py-1,NRemY) + 1
      jmax = jmin + BlockSizeY - 1
      if (Py-1.lt.NRemY) jmax = jmax + nsm
      jmax = min(jmax,n2)
      jloc = jmax - jmin + 1
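C
C Worked example (illustrative only): with n2 = 20, nsm = 2 and
C ProcessorY = 3, BlockSizeY = 6, NRemY = 1 and BlockSizeYMax = 8,
C so the local Y ranges jmin:jmax per processor row are 1:8, 9:14
C and 15:20.
C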
C
C Allocate local memory and initialise
C
      nullify( ftmp )
      call re_alloc( ftmp, 1, 2, 1, n1lmax, 1, BlockSizeYMax, 1, n3l,
     &               name='ftmp', routine='redistribXZ' )
      nullify( ftmp2 )
      call re_alloc( ftmp2, 1, 2, 1, n1lmax, 1, BlockSizeYMax, 1, n3l,
     &               name='ftmp2', routine='redistribXZ' )

      do i = 1,n3l
        do j = 1,BlockSizeYMax
          do k = 1,n1lmax
            ftmp(1,k,j,i) = 0.0_grid_p
            ftmp(2,k,j,i) = 0.0_grid_p
            ftmp2(1,k,j,i) = 0.0_grid_p
            ftmp2(2,k,j,i) = 0.0_grid_p
          enddo
        enddo
      enddo

      if (idir.gt.0) then
C***********
C F -> FT  *
C***********
C
C Handle transfer of terms which are purely local
C
        do i = 1,n3l
          do j = jmin,jmax
            jl = j - jmin + 1
            kl = 0
            do k = 1+ZNode, n1, ZNodes
              kl = kl + 1
              ft(1,kl,j,i) = f(1,k,jl,i)
              ft(2,kl,j,i) = f(2,k,jl,i)
            enddo
          enddo
        enddo

C
C Loop over all Node-Node vectors exchanging local data
C
        do INode = 1,ProcessorY-1
          BNode = (ZNode+INode)
          BNode = mod(BNode,ProcessorY)
          SNode = (ZNode-INode)
          SNode = mod(SNode+ProcessorY,ProcessorY)
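C
C On step INode this node sends BNode the part of its local Y-block
C belonging to BNode's X-slice, and receives from SNode the part of
C its own X-slice stored in SNode's Y-block; after ProcessorY-1
C steps all pairs of nodes in the Z-communicator have exchanged data.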
C
C Collect data to send
C
          do i = 1,n3l
            do jl = 1,jloc
              kl = 0
              do k = 1+BNode, n1, ZNodes
                kl = kl + 1
                ftmp(1,kl,jl,i) = f(1,k,jl,i)
                ftmp(2,kl,jl,i) = f(2,k,jl,i)
              enddo
            enddo
          enddo

C
C Exchange data - send to right and receive from left
C
          call MPI_IRecv(ftmp2(1,1,1,1),2*n1lmax*BlockSizeYMax*n3l,
     .      MPI_grid_real,SNode,1,ZCommunicator,RequestR,MPIerror)
          call MPI_ISend(ftmp(1,1,1,1),2*n1lmax*BlockSizeYMax*n3l,
     .      MPI_grid_real,BNode,1,ZCommunicator,RequestS,MPIerror)
C
C Wait for receive to complete
C
          call MPI_Wait(RequestR,Status,MPIerror)
C
C Place received data into correct array
C
          jminS = SNode*BlockSizeY + nsm*min(SNode,NRemY) + 1
          jmaxS = jminS + BlockSizeY - 1
          if (SNode.lt.NRemY) jmaxS = jmaxS + nsm
          jmaxS = min(jmaxS,n2)
          do i = 1,n3l
            do j = jminS,jmaxS
              jl = j - jminS + 1
              kl = 0
              do k = 1+ZNode, n1, ZNodes
                kl = kl + 1
                ft(1,kl,j,i) = ftmp2(1,kl,jl,i)
                ft(2,kl,j,i) = ftmp2(2,kl,jl,i)
              enddo
            enddo
          enddo

C
C Wait for send to complete
C
          call MPI_Wait(RequestS,Status,MPIerror)
        enddo
      elseif (idir.lt.0) then
C***********
C FT -> F  *
C***********
C
C Handle transfer of terms which are purely local
C
        do i = 1,n3l
          do j = jmin,jmax
            jl = j - jmin + 1
            kl = 0
            do k = 1+ZNode, n1, ZNodes
              kl = kl + 1
              f(1,k,jl,i) = ft(1,kl,j,i)
              f(2,k,jl,i) = ft(2,kl,j,i)
            enddo
          enddo
        enddo

C
C Loop over all Node-Node vectors exchanging local data
C
        do INode = 1,ProcessorY-1
          BNode = (ZNode+INode)
          BNode = mod(BNode,ProcessorY)
          SNode = (ZNode-INode)
          SNode = mod(SNode+ProcessorY,ProcessorY)
C
C Collect data to send
C
          jminS = SNode*BlockSizeY + nsm*min(SNode,NRemY) + 1
          jmaxS = jminS + BlockSizeY - 1
          if (SNode.lt.NRemY) jmaxS = jmaxS + nsm
          jmaxS = min(jmaxS,n2)
          do i = 1,n3l
            do j = jminS,jmaxS
              jl = j - jminS + 1
              kl = 0
              do k = 1+ZNode, n1, ZNodes
                kl = kl + 1
                ftmp(1,kl,jl,i) = ft(1,kl,j,i)
                ftmp(2,kl,jl,i) = ft(2,kl,j,i)
              enddo
            enddo
          enddo

C
C Exchange data - send to right and receive from left
C
          call MPI_IRecv(ftmp2(1,1,1,1),2*n1lmax*BlockSizeYMax*n3l,
     .      MPI_grid_real,BNode,1,ZCommunicator,RequestR,MPIerror)
          call MPI_ISend(ftmp(1,1,1,1),2*n1lmax*BlockSizeYMax*n3l,
     .      MPI_grid_real,SNode,1,ZCommunicator,RequestS,MPIerror)
C
C Wait for receive to complete
C
          call MPI_Wait(RequestR,Status,MPIerror)
C
C Place received data into correct array
C
          do i = 1,n3l
            do jl = 1,jloc
              kl = 0
              do k = 1+BNode, n1, ZNodes
                kl = kl + 1
                f(1,k,jl,i) = ftmp2(1,kl,jl,i)
                f(2,k,jl,i) = ftmp2(2,kl,jl,i)
              enddo
            enddo
          enddo

C
C Wait for send to complete
C
          call MPI_Wait(RequestS,Status,MPIerror)
        enddo
      endif
C
C Free local memory
C
      call de_alloc( ftmp2, name='ftmp2', routine='redistribXZ' )
      call de_alloc( ftmp, name='ftmp', routine='redistribXZ' )

      end subroutine redistribXZ

!******************************************************************************

      subroutine redistribXY(f,n1,n2l,n3l,ft,n1lf,n3,idir,nsm,
     .                       Node,Nodes)
C
C This routine redistributes the data over the Nodes as needed
C for the FFT routines between the arrays f and ft
C
C Array f is distributed in the Y/Z direction while ft is distributed
C in the X/Y direction
C
C idir = direction of redistribution : > 0 => f->ft
C                                      < 0 => ft->f
C
C The processor grid is used to split the communicator according to Y
C
C Currently written for simplicity rather than maximum efficiency;
C the communication pattern could be improved later.
C
C Julian Gale, July 1999
C
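C
C Layout sketch (descriptive only): for idir > 0, f(2,n1,n2l,n3l)
C holds complete X-rows for the local Y/Z sub-block, while
C ft(2,n1lf,n2l,n3) holds complete Z-columns, with the X index k
C distributed round-robin over the Y-communicator:
C   k = YNode+1, YNode+1+YNodes, YNode+1+2*YNodes, ...
C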

      implicit none

C
C Passed arguments
C
      integer
     .  n1, n3, n2l, n1lf, n3l, nsm, Node, Nodes, idir
      real(grid_p)
     .  f(2,n1,n2l,n3l), ft(2,n1lf,n2l,n3)
C
C Local variables
C
      integer
     .  i, j, k, il, kl, BNode, BlockSizeZ, imin, imax, iloc,
     .  INode, n1lmax, SNode, iminS, imaxS, NRemZ, BlockSizeZMax

      integer
     .  MPIerror, RequestR, RequestS, Status(MPI_Status_Size)

      integer, save ::
     .  ProcessorZ, Py, Pz, YCommunicator, YNode, YNodes

      logical, save :: firsttimeY = .true.

      real(grid_p), dimension(:,:,:,:), pointer :: ftmp,ftmp2

      if (firsttimeY) then
C
C Determine processor grid coordinates
C
        ProcessorZ = Nodes/ProcessorY
        if (ProcessorY*ProcessorZ.ne.Nodes)
     $    call die('ERROR: ProcessorY must be a factor of the' //
     $         ' number of processors!')
        Py = (Node/ProcessorZ) + 1
        Pz = Node - (Py - 1)*ProcessorZ + 1
C
C Group processors into subsets by Y
C
        call MPI_Comm_Split(MPI_Comm_World, Py, Pz, YCommunicator,
     .                      MPIerror)
        call MPI_Comm_Rank(YCommunicator,YNode,MPIerror)
        call MPI_Comm_Size(YCommunicator,YNodes,MPIerror)
        firsttimeY = .false.
      endif

      BlockSizeZ = ((n3/nsm)/ProcessorZ)*nsm
      NRemZ = (n3 - ProcessorZ*BlockSizeZ)/nsm
      if (NRemZ.gt.0) then
        BlockSizeZMax = BlockSizeZ + nsm
      else
        BlockSizeZMax = BlockSizeZ
      endif
      n1lmax = ((n1-1)/ProcessorZ) + 1
C
C Allocate local memory and initialise
C
      nullify( ftmp )
      call re_alloc( ftmp, 1, 2, 1, n1lmax, 1, n2l, 1, BlockSizeZMax,
     &               name='ftmp', routine='redistribXY' )
      nullify( ftmp2 )
      call re_alloc( ftmp2, 1, 2, 1, n1lmax, 1, n2l, 1, BlockSizeZMax,
     &               name='ftmp2', routine='redistribXY' )

      do i = 1,BlockSizeZMax
        do j = 1,n2l
          do k = 1,n1lmax
            ftmp(1,k,j,i) = 0.0_grid_p
            ftmp(2,k,j,i) = 0.0_grid_p
            ftmp2(1,k,j,i) = 0.0_grid_p
            ftmp2(2,k,j,i) = 0.0_grid_p
          enddo
        enddo
      enddo

C
C Work out local dimensions
C
      imin = (Pz-1)*BlockSizeZ + nsm*min(Pz-1,NRemZ) + 1
      imax = imin + BlockSizeZ - 1
      if (Pz-1.lt.NRemZ) imax = imax + nsm
      imax = min(imax,n3)
      iloc = imax - imin + 1

      if (idir.gt.0) then
C***********
C F -> FT  *
C***********
C
C Handle transfer of terms which are purely local
C
        do i = imin,imax
          il = i - imin + 1
          do j = 1,n2l
            kl = 0
            do k = 1+YNode, n1, YNodes
              kl = kl + 1
              ft(1,kl,j,i) = f(1,k,j,il)
              ft(2,kl,j,i) = f(2,k,j,il)
            enddo
          enddo
        enddo

C
C Loop over all Node-Node vectors exchanging local data
C
        do INode = 1,ProcessorZ-1
          BNode = (YNode+INode)
          BNode = mod(BNode,ProcessorZ)
          SNode = (YNode-INode)
          SNode = mod(SNode+ProcessorZ,ProcessorZ)
C
C Collect data to send
C
          do il = 1,iloc
            do j = 1,n2l
              kl = 0
              do k = 1+BNode, n1, YNodes
                kl = kl + 1
                ftmp(1,kl,j,il) = f(1,k,j,il)
                ftmp(2,kl,j,il) = f(2,k,j,il)
              enddo
            enddo
          enddo

C
C Exchange data - send to right and receive from left
C
          call MPI_IRecv(ftmp2(1,1,1,1),2*n1lmax*n2l*BlockSizeZMax,
     .      MPI_grid_real,SNode,1,YCommunicator,RequestR,MPIerror)
          call MPI_ISend(ftmp(1,1,1,1),2*n1lmax*n2l*BlockSizeZMax,
     .      MPI_grid_real,BNode,1,YCommunicator,RequestS,MPIerror)
C
C Wait for receive to complete
C
          call MPI_Wait(RequestR,Status,MPIerror)
C
C Place received data into correct array
C
          iminS = SNode*BlockSizeZ + nsm*min(SNode,NRemZ) + 1
          imaxS = iminS + BlockSizeZ - 1
          if (SNode.lt.NRemZ) imaxS = imaxS + nsm
          imaxS = min(imaxS,n3)
          do i = iminS,imaxS
            il = i - iminS + 1
            do j = 1,n2l
              kl = 0
              do k = 1+YNode, n1, YNodes
                kl = kl + 1
                ft(1,kl,j,i) = ftmp2(1,kl,j,il)
                ft(2,kl,j,i) = ftmp2(2,kl,j,il)
              enddo
            enddo
          enddo

C
C Wait for send to complete
C
          call MPI_Wait(RequestS,Status,MPIerror)
        enddo
      elseif (idir.lt.0) then
C***********
C FT -> F  *
C***********
C
C Handle transfer of terms which are purely local
C
        do i = imin,imax
          il = i - imin + 1
          do j = 1,n2l
            kl = 0
            do k = 1+YNode, n1, YNodes
              kl = kl + 1
              f(1,k,j,il) = ft(1,kl,j,i)
              f(2,k,j,il) = ft(2,kl,j,i)
            enddo
          enddo
        enddo

C
C Loop over all Node-Node vectors exchanging local data
C
        do INode = 1,ProcessorZ-1
          BNode = (YNode+INode)
          BNode = mod(BNode,ProcessorZ)
          SNode = (YNode-INode)
          SNode = mod(SNode+ProcessorZ,ProcessorZ)
C
C Collect data to send
C
          iminS = SNode*BlockSizeZ + nsm*min(SNode,NRemZ) + 1
          imaxS = iminS + BlockSizeZ - 1
          if (SNode.lt.NRemZ) imaxS = imaxS + nsm
          imaxS = min(imaxS,n3)
          do i = iminS,imaxS
            il = i - iminS + 1
            do j = 1,n2l
              kl = 0
              do k = 1+YNode, n1, YNodes
                kl = kl + 1
                ftmp(1,kl,j,il) = ft(1,kl,j,i)
                ftmp(2,kl,j,il) = ft(2,kl,j,i)
              enddo
            enddo
          enddo

C
C Exchange data - send to right and receive from left
C
          call MPI_IRecv(ftmp2(1,1,1,1),2*n1lmax*n2l*BlockSizeZMax,
     .      MPI_grid_real,BNode,1,YCommunicator,RequestR,MPIerror)
          call MPI_ISend(ftmp(1,1,1,1),2*n1lmax*n2l*BlockSizeZMax,
     .      MPI_grid_real,SNode,1,YCommunicator,RequestS,MPIerror)
C
C Wait for receive to complete
C
          call MPI_Wait(RequestR,Status,MPIerror)
C
C Place received data into correct array
C
          do il = 1,iloc
            do j = 1,n2l
              kl = 0
              do k = 1+BNode, n1, YNodes
                kl = kl + 1
                f(1,k,j,il) = ftmp2(1,kl,j,il)
                f(2,k,j,il) = ftmp2(2,kl,j,il)
              enddo
            enddo
          enddo

C
C Wait for send to complete
C
          call MPI_Wait(RequestS,Status,MPIerror)
        enddo
      endif
C
C Free local memory
C
      call de_alloc( ftmp2, name='ftmp2', routine='redistribXY' )
      call de_alloc( ftmp, name='ftmp', routine='redistribXY' )

      return
      end subroutine redistribXY
#endif

!******************************************************************************

      END MODULE m_fft