forked from lijiext/lammps
447 lines
15 KiB
C++
447 lines
15 KiB
C++
//@HEADER
|
|
// ************************************************************************
|
|
//
|
|
// Kokkos v. 2.0
|
|
// Copyright (2014) Sandia Corporation
|
|
//
|
|
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
|
// the U.S. Government retains certain rights in this software.
|
|
//
|
|
// Redistribution and use in source and binary forms, with or without
|
|
// modification, are permitted provided that the following conditions are
|
|
// met:
|
|
//
|
|
// 1. Redistributions of source code must retain the above copyright
|
|
// notice, this list of conditions and the following disclaimer.
|
|
//
|
|
// 2. Redistributions in binary form must reproduce the above copyright
|
|
// notice, this list of conditions and the following disclaimer in the
|
|
// documentation and/or other materials provided with the distribution.
|
|
//
|
|
// 3. Neither the name of the Corporation nor the names of the
|
|
// contributors may be used to endorse or promote products derived from
|
|
// this software without specific prior written permission.
|
|
//
|
|
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
|
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
|
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
//
|
|
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
|
|
//
|
|
// ************************************************************************
|
|
//@HEADER
|
|
|
|
#include <cmath>
|
|
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <cstring>
|
|
|
|
#include <utility>
|
|
#include <string>
|
|
#include <vector>
|
|
#include <sstream>
|
|
#include <iostream>
|
|
#include <iomanip>
|
|
|
|
#include <Kokkos_Core.hpp>
|
|
|
|
#include <WrapMPI.hpp>
|
|
#include <fenl.hpp>
|
|
|
|
// For vtune
|
|
#include <sys/types.h>
|
|
#include <unistd.h>
|
|
|
|
//----------------------------------------------------------------------------
|
|
|
|
enum { CMD_USE_THREADS = 0
|
|
, CMD_USE_NUMA
|
|
, CMD_USE_CORE_PER_NUMA
|
|
, CMD_USE_CUDA
|
|
, CMD_USE_ROCM
|
|
, CMD_USE_OPENMP
|
|
, CMD_USE_CUDA_DEV
|
|
, CMD_USE_FIXTURE_X
|
|
, CMD_USE_FIXTURE_Y
|
|
, CMD_USE_FIXTURE_Z
|
|
, CMD_USE_FIXTURE_BEGIN
|
|
, CMD_USE_FIXTURE_END
|
|
, CMD_USE_FIXTURE_QUADRATIC
|
|
, CMD_USE_ATOMIC
|
|
, CMD_USE_TRIALS
|
|
, CMD_VTUNE
|
|
, CMD_PRINT
|
|
, CMD_ECHO
|
|
, CMD_ERROR
|
|
, CMD_COUNT };
|
|
|
|
void print_cmdline( std::ostream & s , const int cmd[] )
|
|
{
|
|
if ( cmd[ CMD_USE_THREADS ] ) {
|
|
s << " Threads(" << cmd[ CMD_USE_THREADS ]
|
|
<< ") NUMA(" << cmd[ CMD_USE_NUMA ]
|
|
<< ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ]
|
|
<< ")" ;
|
|
}
|
|
if ( cmd[ CMD_USE_OPENMP ] ) {
|
|
s << " OpenMP(" << cmd[ CMD_USE_OPENMP ]
|
|
<< ") NUMA(" << cmd[ CMD_USE_NUMA ]
|
|
<< ") CORE_PER_NUMA(" << cmd[ CMD_USE_CORE_PER_NUMA ]
|
|
<< ")" ;
|
|
}
|
|
if ( cmd[ CMD_USE_FIXTURE_X ] ) {
|
|
s << " Fixture(" << cmd[ CMD_USE_FIXTURE_X ]
|
|
<< "x" << cmd[ CMD_USE_FIXTURE_Y ]
|
|
<< "x" << cmd[ CMD_USE_FIXTURE_Z ]
|
|
<< ")" ;
|
|
}
|
|
if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
|
|
s << " Fixture( " << cmd[ CMD_USE_FIXTURE_BEGIN ]
|
|
<< " .. " << cmd[ CMD_USE_FIXTURE_END ]
|
|
<< " )" ;
|
|
}
|
|
if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) {
|
|
s << " Quadratic-Element" ;
|
|
}
|
|
if ( cmd[ CMD_USE_CUDA ] ) {
|
|
s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ;
|
|
}
|
|
if ( cmd[ CMD_USE_ROCM ] ) {
|
|
s << " ROCM" ;
|
|
}
|
|
if ( cmd[ CMD_USE_ATOMIC ] ) {
|
|
s << " ATOMIC" ;
|
|
}
|
|
if ( cmd[ CMD_USE_TRIALS ] ) {
|
|
s << " TRIALS(" << cmd[ CMD_USE_TRIALS ] << ")" ;
|
|
}
|
|
if ( cmd[ CMD_VTUNE ] ) {
|
|
s << " VTUNE" ;
|
|
}
|
|
if ( cmd[ CMD_PRINT ] ) {
|
|
s << " PRINT" ;
|
|
}
|
|
s << std::endl ;
|
|
}
|
|
|
|
void print_perf_value( std::ostream & s , const std::vector<size_t> & widths, const Kokkos::Example::FENL::Perf & perf )
|
|
{
|
|
int i=0;
|
|
s << std::setw(widths[i++]) << perf.global_elem_count << " ,";
|
|
s << std::setw(widths[i++]) << perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << perf.newton_iter_count << " ,";
|
|
s << std::setw(widths[i++]) << perf.cg_iter_count << " ,";
|
|
s << std::setw(widths[i++]) << perf.map_ratio << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.fill_node_set * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.scan_node_count * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.fill_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.sort_graph_entries * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.fill_element_graph * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.create_sparse_matrix * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.fill_time * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( perf.bc_time * 1000.0 ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( ( perf.matvec_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i++]) << ( ( perf.cg_time * 1000.0 ) / perf.cg_iter_count ) / perf.global_node_count << " ,";
|
|
s << std::setw(widths[i]) << perf.error_max;
|
|
s << std::endl ;
|
|
}
|
|
|
|
template< class Device , Kokkos::Example::BoxElemPart::ElemOrder ElemOrder >
|
|
void run( MPI_Comm comm , const int cmd[] )
|
|
{
|
|
int comm_rank = 0 ;
|
|
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
MPI_Comm_rank( comm , & comm_rank );
|
|
#else
|
|
comm = 0 ;
|
|
#endif
|
|
|
|
|
|
if ( 0 == comm_rank ) {
|
|
if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; }
|
|
else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; }
|
|
else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; }
|
|
else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; }
|
|
|
|
if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; }
|
|
else { std::cout << " , LINEAR-ELEMENT" ; }
|
|
|
|
if ( cmd[ CMD_USE_ATOMIC ] ) { std::cout << " , USING ATOMICS" ; }
|
|
}
|
|
|
|
std::vector< std::pair<std::string,std::string> > headers;
|
|
|
|
|
|
headers.push_back(std::make_pair("ELEMS","count"));
|
|
headers.push_back(std::make_pair("NODES","count"));
|
|
headers.push_back(std::make_pair("NEWTON","iter"));
|
|
headers.push_back(std::make_pair("CG","iter"));
|
|
headers.push_back(std::make_pair("MAP_RATIO","ratio"));
|
|
headers.push_back(std::make_pair("SET_FILL/NODE","millisec"));
|
|
headers.push_back(std::make_pair("SCAN/NODE","millisec"));
|
|
headers.push_back(std::make_pair("GRAPH_FILL/NODE","millisec"));
|
|
headers.push_back(std::make_pair("SORT/NODE","millisec"));
|
|
headers.push_back(std::make_pair("ELEM_GRAPH_FILL/NODE","millisec"));
|
|
headers.push_back(std::make_pair("MATRIX_CREATE/NODE","millisec"));
|
|
headers.push_back(std::make_pair("MATRIX_FILL/NODE","millisec"));
|
|
headers.push_back(std::make_pair("BOUNDARY/NODE","millisec"));
|
|
headers.push_back(std::make_pair("MAT_VEC/ITER/ROW","millisec"));
|
|
headers.push_back(std::make_pair("CG/ITER/ROW","millisec"));
|
|
headers.push_back(std::make_pair("ERROR","ratio"));
|
|
|
|
// find print widths
|
|
size_t min_width = 10;
|
|
std::vector< size_t > widths(headers.size());
|
|
for (size_t i=0, ie=headers.size(); i<ie; ++i)
|
|
widths[i] = std::max(min_width, headers[i].first.size()+1);
|
|
|
|
// print column headers
|
|
if ( 0 == comm_rank ) {
|
|
std::cout << std::endl ;
|
|
for (size_t i=0; i<headers.size(); ++i)
|
|
std::cout << std::setw(widths[i]) << headers[i].first << " ,";
|
|
std::cout << "\b\b " << std::endl;
|
|
for (size_t i=0; i<headers.size(); ++i)
|
|
std::cout << std::setw(widths[i]) << headers[i].second << " ,";
|
|
std::cout << "\b\b " << std::endl;
|
|
|
|
std::cout << std::scientific;
|
|
std::cout.precision(3);
|
|
}
|
|
|
|
if ( cmd[ CMD_USE_FIXTURE_BEGIN ] ) {
|
|
for ( int i = cmd[CMD_USE_FIXTURE_BEGIN] ; i < cmd[CMD_USE_FIXTURE_END] * 2 ; i *= 2 ) {
|
|
int nelem[3] ;
|
|
nelem[0] = std::max( 1 , (int) cbrt( ((double) i) / 2.0 ) );
|
|
nelem[1] = 1 + nelem[0] ;
|
|
nelem[2] = 2 * nelem[0] ;
|
|
|
|
const Kokkos::Example::FENL::Perf perf =
|
|
cmd[ CMD_USE_FIXTURE_QUADRATIC ]
|
|
? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
|
|
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
|
|
: Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
|
|
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
|
|
;
|
|
|
|
if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
|
|
}
|
|
}
|
|
else {
|
|
int nelem[3] = { cmd[ CMD_USE_FIXTURE_X ] ,
|
|
cmd[ CMD_USE_FIXTURE_Y ] ,
|
|
cmd[ CMD_USE_FIXTURE_Z ] };
|
|
|
|
const Kokkos::Example::FENL::Perf perf =
|
|
cmd[ CMD_USE_FIXTURE_QUADRATIC ]
|
|
? Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemQuadratic >
|
|
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
|
|
: Kokkos::Example::FENL::fenl< Device , Kokkos::Example::BoxElemPart::ElemLinear >
|
|
( comm , cmd[CMD_PRINT], cmd[CMD_USE_TRIALS], cmd[CMD_USE_ATOMIC], nelem )
|
|
;
|
|
|
|
if ( 0 == comm_rank ) print_perf_value( std::cout , widths, perf );
|
|
}
|
|
}
|
|
|
|
//----------------------------------------------------------------------------
|
|
|
|
int main( int argc , char ** argv )
|
|
{
|
|
int comm_rank = 0 ;
|
|
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
MPI_Init( & argc , & argv );
|
|
MPI_Comm comm = MPI_COMM_WORLD ;
|
|
MPI_Comm_rank( comm , & comm_rank );
|
|
#else
|
|
MPI_Comm comm = 0 ;
|
|
(void) comm ; // suppress warning
|
|
#endif
|
|
|
|
int cmdline[ CMD_COUNT ] ;
|
|
|
|
for ( int i = 0 ; i < CMD_COUNT ; ++i ) cmdline[i] = 0 ;
|
|
|
|
if ( 0 == comm_rank ) {
|
|
for ( int i = 1 ; i < argc ; ++i ) {
|
|
if ( 0 == strcasecmp( argv[i] , "threads" ) ) {
|
|
cmdline[ CMD_USE_THREADS ] = atoi( argv[++i] );
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "openmp" ) ) {
|
|
cmdline[ CMD_USE_OPENMP ] = atoi( argv[++i] );
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "cores" ) ) {
|
|
sscanf( argv[++i] , "%dx%d" ,
|
|
cmdline + CMD_USE_NUMA ,
|
|
cmdline + CMD_USE_CORE_PER_NUMA );
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "cuda" ) ) {
|
|
cmdline[ CMD_USE_CUDA ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "cuda-dev" ) ) {
|
|
cmdline[ CMD_USE_CUDA ] = 1 ;
|
|
cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) {
|
|
cmdline[ CMD_USE_ROCM ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) {
|
|
sscanf( argv[++i] , "%dx%dx%d" ,
|
|
cmdline + CMD_USE_FIXTURE_X ,
|
|
cmdline + CMD_USE_FIXTURE_Y ,
|
|
cmdline + CMD_USE_FIXTURE_Z );
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "fixture-range" ) ) {
|
|
sscanf( argv[++i] , "%d..%d" ,
|
|
cmdline + CMD_USE_FIXTURE_BEGIN ,
|
|
cmdline + CMD_USE_FIXTURE_END );
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "fixture-quadratic" ) ) {
|
|
cmdline[ CMD_USE_FIXTURE_QUADRATIC ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "atomic" ) ) {
|
|
cmdline[ CMD_USE_ATOMIC ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "trials" ) ) {
|
|
cmdline[ CMD_USE_TRIALS ] = atoi( argv[++i] ) ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "vtune" ) ) {
|
|
cmdline[ CMD_VTUNE ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "print" ) ) {
|
|
cmdline[ CMD_PRINT ] = 1 ;
|
|
}
|
|
else if ( 0 == strcasecmp( argv[i] , "echo" ) ) {
|
|
cmdline[ CMD_ECHO ] = 1 ;
|
|
}
|
|
else {
|
|
cmdline[ CMD_ERROR ] = 1 ;
|
|
|
|
std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
|
|
}
|
|
}
|
|
|
|
if ( cmdline[ CMD_ECHO ] && 0 == comm_rank ) { print_cmdline( std::cout , cmdline ); }
|
|
}
|
|
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
MPI_Bcast( cmdline , CMD_COUNT , MPI_INT , 0 , comm );
|
|
#endif
|
|
|
|
if ( cmdline[ CMD_VTUNE ] ) {
|
|
std::stringstream cmd;
|
|
pid_t my_os_pid=getpid();
|
|
const std::string vtune_loc =
|
|
"/usr/local/intel/vtune_amplifier_xe_2013/bin64/amplxe-cl";
|
|
const std::string output_dir = "./vtune/vtune.";
|
|
const int p_rank = comm_rank;
|
|
cmd << vtune_loc
|
|
<< " -collect hotspots -result-dir " << output_dir << p_rank
|
|
<< " -target-pid " << my_os_pid << " &";
|
|
if (p_rank == 0)
|
|
std::cout << cmd.str() << std::endl;
|
|
system(cmd.str().c_str());
|
|
system("sleep 10");
|
|
}
|
|
|
|
if ( ! cmdline[ CMD_ERROR ] && ! cmdline[ CMD_ECHO ] ) {
|
|
|
|
if ( ! cmdline[ CMD_USE_TRIALS ] ) { cmdline[ CMD_USE_TRIALS ] = 1 ; }
|
|
|
|
if ( ! cmdline[ CMD_USE_FIXTURE_X ] && ! cmdline[ CMD_USE_FIXTURE_BEGIN ] ) {
|
|
cmdline[ CMD_USE_FIXTURE_X ] = 2 ;
|
|
cmdline[ CMD_USE_FIXTURE_Y ] = 2 ;
|
|
cmdline[ CMD_USE_FIXTURE_Z ] = 2 ;
|
|
}
|
|
|
|
#if defined( KOKKOS_ENABLE_THREADS )
|
|
|
|
if ( cmdline[ CMD_USE_THREADS ] ) {
|
|
|
|
if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
|
|
Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] ,
|
|
cmdline[ CMD_USE_NUMA ] ,
|
|
cmdline[ CMD_USE_CORE_PER_NUMA ] );
|
|
}
|
|
else {
|
|
Kokkos::Threads::initialize( cmdline[ CMD_USE_THREADS ] );
|
|
}
|
|
|
|
run< Kokkos::Threads , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
|
|
|
|
Kokkos::Threads::finalize();
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined( KOKKOS_ENABLE_OPENMP )
|
|
|
|
if ( cmdline[ CMD_USE_OPENMP ] ) {
|
|
|
|
if ( cmdline[ CMD_USE_NUMA ] && cmdline[ CMD_USE_CORE_PER_NUMA ] ) {
|
|
Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] ,
|
|
cmdline[ CMD_USE_NUMA ] ,
|
|
cmdline[ CMD_USE_CORE_PER_NUMA ] );
|
|
}
|
|
else {
|
|
Kokkos::OpenMP::initialize( cmdline[ CMD_USE_OPENMP ] );
|
|
}
|
|
|
|
run< Kokkos::OpenMP , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
|
|
|
|
Kokkos::OpenMP::finalize();
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined( KOKKOS_ENABLE_CUDA )
|
|
if ( cmdline[ CMD_USE_CUDA ] ) {
|
|
// Use the last device:
|
|
|
|
Kokkos::HostSpace::execution_space::initialize();
|
|
Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( cmdline[ CMD_USE_CUDA_DEV ] ) );
|
|
|
|
run< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
|
|
|
|
Kokkos::Cuda::finalize();
|
|
Kokkos::HostSpace::execution_space::finalize();
|
|
}
|
|
|
|
#endif
|
|
|
|
#if defined( KOKKOS_ENABLE_ROCM )
|
|
if ( cmdline[ CMD_USE_ROCM ] ) {
|
|
// Use the last device:
|
|
|
|
Kokkos::HostSpace::execution_space::initialize();
|
|
Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) );
|
|
|
|
run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline );
|
|
|
|
Kokkos::Experimental::ROCm::finalize();
|
|
Kokkos::HostSpace::execution_space::finalize();
|
|
}
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
#if defined( KOKKOS_ENABLE_MPI )
|
|
MPI_Finalize();
|
|
#endif
|
|
|
|
return cmdline[ CMD_ERROR ] ? -1 : 0 ;
|
|
}
|
|
|