Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(DO NOT MERGE) - Tpetra: performance improvements to CrsMatrix::copyAndPermute - working branch #13598

Draft
wants to merge 5 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@
#include "Thyra_TpetraLinearOp.hpp"
#include "Tpetra_CrsMatrix.hpp"

// Instrumentation globals used for ad-hoc timing of the Jacobian path.
// They are only declared here; the definitions live in another translation unit.

// Set to true on entry to the Jacobian-only branch of evalModelImpl_basic and
// reset to false right after the Jacobian evaluate() call returns.
extern bool in_eval_J;
// Running total of wall-clock time (differences of Teuchos::Time::wallTime())
// spent in the Jacobian-only branch of evalModelImpl_basic.
extern double timer_evalJ;
// NOTE(review): not read or written in the visible part of this file;
// presumably accumulated elsewhere — confirm before relying on it.
extern double timer_capsg;

// Constructors/Initializers/Accessors

template<typename Scalar>
Expand Down Expand Up @@ -1569,7 +1573,8 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs<Scalar> &inArgs,
else if(Teuchos::is_null(f_out) && !Teuchos::is_null(W_out)) {

PANZER_FUNC_TIME_MONITOR("panzer::ModelEvaluator::evalModel(J)");

double time_ = Teuchos::Time::wallTime();
in_eval_J = true;
// only add auxiliary global data if Jacobian is being formed
ae_inargs.addGlobalEvaluationData(nonParamGlobalEvaluationData_);

Expand All @@ -1582,6 +1587,8 @@ evalModelImpl_basic(const Thyra::ModelEvaluatorBase::InArgs<Scalar> &inArgs,
thGhostedContainer->initializeMatrix(0.0);

ae_tm_.template getAsObject<panzer::Traits::Jacobian>()->evaluate(ae_inargs);
in_eval_J = false;
timer_evalJ += -time_ + Teuchos::Time::wallTime();
}

// HACK: set A to null before calling responses to avoid touching the
Expand Down
187 changes: 163 additions & 24 deletions packages/panzer/mini-em/example/BlockPrec/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,12 @@

#include <string>
#include <iostream>

#include <set>
#include <vector>
#include <unordered_map>
#include <algorithm>
#include <chrono>
#include <thread>

template <class Scalar>
void writeToExodus(double time_stamp,
Expand Down Expand Up @@ -93,6 +98,29 @@ using mini_em::physicsType, mini_em::MAXWELL, mini_em::DARCY;
using mini_em::solverType, mini_em::AUGMENTATION, mini_em::MUELU, mini_em::ML, mini_em::CG, mini_em::GMRES;
using mini_em::linearAlgebraType, mini_em::linAlgTpetra, mini_em::linAlgEpetra;

// ---- Implementation selection ------------------------------------------------
// Raw value of the "new-impl" command-line option: 0 = old code path only,
// 1 = new code path only, 2 = both.
int panzer_impl_inp{0};
// Flags derived from panzer_impl_inp once the command line has been parsed.
bool panzer_impl_old{true};
bool panzer_impl_new{false};

// ---- Ad-hoc timing instrumentation -------------------------------------------
// These have external linkage on purpose: other translation units refer to
// some of them through `extern` declarations.

// Wall-clock time of one main_() run, and time spent evaluating the Jacobian.
double timer_main{0.0};
double timer_evalJ{0.0};
// True while a Jacobian evaluation is in progress.
bool in_eval_J{false};

// NOTE(review): the places that update the following are not visible in this
// file; the names suggest further timed regions — confirm against their users.
double timer_capsg{0.0};
double timer_MV{0.0};
double timer_ICI{0.0};
bool in_eval_MV{false};

// ---- Repeated-run driver -----------------------------------------------------
// Number of times main() repeats the whole run ("num-repeat-runs" option).
int numRepeatRuns{1};
// Index of the run currently executing.
int repeat{0};

/// Combine one value per rank into a single value using Teuchos::reduceAll.
///
/// \tparam T        value type being reduced
/// \param comm      communicator to reduce over; must be non-null because it is
///                  dereferenced unconditionally
/// \param localVal  this rank's contribution; it is only read, so it is taken by
///                  const reference and const objects or temporaries are accepted
/// \param red       reduction operation to apply (e.g. Teuchos::REDUCE_MAX)
/// \return the value written by Teuchos::reduceAll
template<class T>
static T parallel_reduce(Teuchos::RCP<const Teuchos::MpiComm<int> > comm, const T& localVal, Teuchos::EReductionType red) {
  // Value-initialise so the out-argument never refers to an indeterminate value.
  T globalVal{};
  Teuchos::reduceAll<int, T>(*comm, red, localVal, Teuchos::outArg(globalVal));
  return globalVal;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class blockedLinObjFactory, bool useTpetra>
int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
Expand All @@ -108,16 +136,18 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
if (comm->getSize() > 1) {
out->setOutputToRootOnly(0);
}

Teuchos::RCP<Teuchos::StackedTimer> stacked_timer;
bool use_stacked_timer;
Teuchos::RCP<Teuchos::TimeMonitor> timer;
bool use_stacked_timer, use_timer;
std::string test_name = "MiniEM 3D RefMaxwell";

// Figure of merit data for acceptance testing
bool print_fom;
size_t fom_num_cells;

{

// defaults for command-line options
int x_elements=-1,y_elements=-1,z_elements=-1,basis_order=1;
int workset_size=2000;
Expand All @@ -138,12 +168,14 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
bool resetSolver = false;
bool doSolveTimings = false;
bool matrixFree = false;
bool use_timer_test = false;
int numReps = 0;
linearAlgebraType linAlgebraValues[2] = {linAlgTpetra, linAlgEpetra};
const char * linAlgebraNames[2] = {"Tpetra", "Epetra"};
linearAlgebraType linAlgebra = linAlgTpetra;
clp.setOption<linearAlgebraType>("linAlgebra",&linAlgebra,2,linAlgebraValues,linAlgebraNames);
use_stacked_timer = true;
use_stacked_timer = false;
use_timer = true;
print_fom = true;
clp.setOption("x-elements",&x_elements);
clp.setOption("y-elements",&y_elements);
Expand All @@ -166,6 +198,8 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
clp.setOption("resetSolver","no-resetSolver",&resetSolver,"update the solver in every timestep");
clp.setOption("doSolveTimings","no-doSolveTimings",&doSolveTimings,"repeat the first solve \"numTimeSteps\" times");
clp.setOption("stacked-timer","no-stacked-timer",&use_stacked_timer,"Run with or without stacked timer output");
clp.setOption("timer","no-timer",&use_timer,"Run with or without timer output");
clp.setOption("new-impl",&panzer_impl_inp,"Run without (0) or with (1) new tpetra code, or both old & new (2)");
clp.setOption("test-name", &test_name, "Name of test (for Watchr output)");
clp.setOption("print-fom","no-print-fom",&print_fom,"print the figure of merit for acceptance testing");
#ifdef HAVE_TEUCHOS_STACKTRACE
Expand All @@ -184,12 +218,31 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
case Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL: break;
}

switch (panzer_impl_inp) {
case 0:
panzer_impl_new = false;
panzer_impl_old = true;
break;
case 1:
panzer_impl_new = true;
panzer_impl_old = false;
break;
case 2:
panzer_impl_new = true;
panzer_impl_old = true;
break;
default:
return EXIT_FAILURE;
}

std::cout << "P" << comm->getRank() << ": [dbg] panzer_impl_old= " << panzer_impl_old << " panzer_impl_new= " << panzer_impl_new << std::endl;


#ifdef HAVE_TEUCHOS_STACKTRACE
if (stacktrace)
Teuchos::print_stack_on_segfault();
#endif


if (use_stacked_timer) {
stacked_timer = rcp(new Teuchos::StackedTimer("Mini-EM"));
Teuchos::RCP<Teuchos::FancyOStream> verbose_out = Teuchos::rcp(new Teuchos::FancyOStream(Teuchos::rcpFromRef(std::cout)));
Expand All @@ -199,6 +252,10 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
Teuchos::TimeMonitor::setStackedTimer(stacked_timer);

Teuchos::TimeMonitor tM(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: Total Time")));
Teuchos::Time mainTimer("mainTimer", true);

std::cout << "panzer_impl_new= " << panzer_impl_new << std::endl;
std::cout << "panzer_impl_old= " << panzer_impl_old << std::endl;

if (doSolveTimings) {
numReps = numTimeSteps;
Expand Down Expand Up @@ -251,7 +308,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
physicsEqSet.set("Integration Order", 2*basis_order);

RCP<panzer_stk::STK_Interface> mesh;
int dim;
int dim=3;
Teuchos::RCP<panzer_stk::STK_MeshFactory> mesh_factory;
{
Teuchos::TimeMonitor tMmesh(*Teuchos::TimeMonitor::getNewTimer(std::string("Mini-EM: build mesh")));
Expand Down Expand Up @@ -589,7 +646,7 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
auxOutArgs.set_W_op(aux_W_op);
auxPhysicsME->evalModel(auxInArgs, auxOutArgs);
}

// setup a response library to write to the mesh
RCP<panzer::ResponseLibrary<panzer::Traits> > stkIOResponseLibrary
= buildSTKIOResponseLibrary(physicsBlocks,linObjFactory,wkstContainer,dofManager,cm_factory,mesh,
Expand Down Expand Up @@ -726,13 +783,35 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])

// Collect FOM data before everything goes out of scope
fom_num_cells = mesh->getEntityCounts(dim);

mainTimer.stop();
timer_main = mainTimer.totalElapsedTime();
if (comm->getRank() == 0) {
std::cout << "mainTimer(run: " << repeat << "/" << numRepeatRuns << ") = " << timer_main << std::endl;
}

if (use_timer) {
if (comm->getRank() == 0) std::cout << "summarize...\n";
tM.summarize();
if (comm->getRank() == 0) std::cout << "report...\n";
auto params = rcp(new Teuchos::ParameterList());
params->set("Report format", "Table"); // (default), "YAML" </li>
params->set("YAML style", "spacious"); // (default), "compact" </li>
params->set("How to merge timer sets", "Union"); // "Intersection"); // (default), "Union" </li>
params->set("alwaysWriteLocal", true); // , false (default) </li>
params->set("writeGlobalStats", true); // (default), false </li>
params->set("writeZeroTimers", true); // : true (default), false </li>

tM.report(std::cout, "panzer", params);
}
}

// Output timer data
if (use_stacked_timer) {
stacked_timer->stop("Mini-EM");
Teuchos::StackedTimer::OutputOptions options;
options.output_fraction = options.output_histogram = options.output_minmax = true;
// options.output_fraction = options.output_minmax = options.align_columns = true;
stacked_timer->report(*out, comm, options);
auto xmlOut = stacked_timer->reportWatchrXML(test_name + ' ' + std::to_string(comm->getSize()) + " ranks", comm);
if(xmlOut.length())
Expand All @@ -756,10 +835,9 @@ int main_(Teuchos::CommandLineProcessor &clp, int argc,char * argv[])
*out << "=================================\n\n";
}

} else {
} else if (use_timer) {
Teuchos::TimeMonitor::summarize(*out,false,true,false,Teuchos::Union,"",true);
}

return EXIT_SUCCESS;
}

Expand All @@ -777,6 +855,8 @@ int main(int argc,char * argv[]){
const char * solverNames[5] = {"Augmentation", "MueLu", "ML", "CG", "GMRES"};
solverType solver = MUELU;
clp.setOption<solverType>("solver",&solver,5,solverValues,solverNames,"Solver that is used");
clp.setOption("num-repeat-runs",&numRepeatRuns);

// bool useComplex = false;
// clp.setOption("complex","real",&useComplex);
clp.recogniseAllOptions(false);
Expand All @@ -792,30 +872,89 @@ int main(int argc,char * argv[]){
// TEUCHOS_ASSERT(!useComplex);
}

int retVal;
Teuchos::RCP<const Teuchos::MpiComm<int> > comm
= Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(Teuchos::DefaultComm<int>::getComm());

int retVal=0;
std::vector<double> timer_evalJ_vec(numRepeatRuns), timer_capsg_vec(numRepeatRuns), timer_main_vec(numRepeatRuns);
// ==========================================================================================================================
for (repeat=0; repeat < numRepeatRuns; ++repeat) {
// ==========================================================================================================================

in_eval_J = false;
timer_main = 0.0;
timer_evalJ = 0.0;
timer_capsg = 0.0;

if (linAlgebra == linAlgTpetra) {

// if (useComplex) {
// #if defined(HAVE_TPETRA_COMPLEX_DOUBLE)
// typedef typename panzer::BlockedTpetraLinearObjFactory<panzer::Traits,std::complex<double>,int,panzer::GlobalOrdinal> blockedLinObjFactory;
// retVal = main_<std::complex<double>,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv);
// #else
// std::cout << std::endl
// << "WARNING" << std::endl
// << "Tpetra was compiled without Scalar=std::complex<double>." << std::endl << std::endl;
// return EXIT_FAILURE;
// #endif
// } else {
typedef typename panzer::BlockedTpetraLinearObjFactory<panzer::Traits,double,int,panzer::GlobalOrdinal> blockedLinObjFactory;
retVal = main_<double,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv);
// }
// #if defined(HAVE_TPETRA_COMPLEX_DOUBLE)
// typedef typename panzer::BlockedTpetraLinearObjFactory<panzer::Traits,std::complex<double>,int,panzer::GlobalOrdinal> blockedLinObjFactory;
// retVal = main_<std::complex<double>,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv);
// #else
// std::cout << std::endl
// << "WARNING" << std::endl
// << "Tpetra was compiled without Scalar=std::complex<double>." << std::endl << std::endl;
// return EXIT_FAILURE;
// #endif
// } else {
typedef typename panzer::BlockedTpetraLinearObjFactory<panzer::Traits,double,int,panzer::GlobalOrdinal> blockedLinObjFactory;
retVal = main_<double,int,panzer::GlobalOrdinal,blockedLinObjFactory,true>(clp, argc, argv);
// }
#ifdef PANZER_HAVE_EPETRA_STACK
} else if (linAlgebra == linAlgEpetra) {
// TEUCHOS_ASSERT(!useComplex);
typedef typename panzer::BlockedEpetraLinearObjFactory<panzer::Traits,int> blockedLinObjFactory;
retVal = main_<double,int,int,blockedLinObjFactory,false>(clp, argc, argv);
#endif
} else
} else {
TEUCHOS_ASSERT(false);
}

if (1) {
timer_main = parallel_reduce(comm, timer_main, Teuchos::REDUCE_MAX);
timer_evalJ = parallel_reduce(comm, timer_evalJ, Teuchos::REDUCE_MAX);
timer_capsg = parallel_reduce(comm, timer_capsg, Teuchos::REDUCE_MAX);
if (!comm->getRank()) {
std::cout << "[TIMER] repeat= " << repeat << " timer_evalJ= " << timer_evalJ << std::endl;
std::cout << "[TIMER] repeat= " << repeat << " timer_capsg= " << timer_capsg << std::endl;
timer_main_vec[repeat] = timer_main;
timer_evalJ_vec[repeat] = timer_evalJ;
timer_capsg_vec[repeat] = timer_capsg;
}
}


// ==========================================================================================================================
} //for (int repeat=0; repeat < numRepeatRuns; ++repeat) {
// ==========================================================================================================================

// Compute summary statistics of `vec` into the caller-provided array:
//   MinMaxAve[0] = minimum, MinMaxAve[1] = maximum, MinMaxAve[2] = arithmetic mean.
// An empty input yields all zeros rather than +/- sentinel extremes, so the
// report printed from these values stays meaningful when no runs were recorded.
// Nothing is captured: the lambda works only on its arguments.
auto minMaxAve = [] (const std::vector<double>& vec, double MinMaxAve[3]) {
  if (vec.empty()) {
    MinMaxAve[0] = 0.0;
    MinMaxAve[1] = 0.0;
    MinMaxAve[2] = 0.0;
    return;
  }
  // Seed from the first element so no numeric_limits sentinel is needed.
  double lowest = vec.front();
  double highest = vec.front();
  double total = 0.0;
  for (const double v : vec) {
    lowest = std::min(lowest, v);
    highest = std::max(highest, v);
    total += v;
  }
  MinMaxAve[0] = lowest;
  MinMaxAve[1] = highest;
  // Divide once at the end instead of once per element.
  MinMaxAve[2] = total / static_cast<double>(vec.size());
};

if (!comm->getRank()) {
double MinMaxAve[3][3];
minMaxAve(timer_main_vec, MinMaxAve[0]);
minMaxAve(timer_evalJ_vec, MinMaxAve[1]);
minMaxAve(timer_capsg_vec, MinMaxAve[2]);
auto pr = [&](int j, const std::string& name) {
std::cout << "[TIMER] " << name << " AVE(" << numRepeatRuns << " runs): " << MinMaxAve[j][2]
<< " MIN: " << MinMaxAve[j][0] << " MAX: " << MinMaxAve[j][1]
<< " PAR-IMB: " << MinMaxAve[j][1]/(MinMaxAve[j][0] == 0.0 ? 1.0 : MinMaxAve[j][0]) << std::endl;
};
pr(0, "timer_main");
pr(1, "timer_evalJ");
pr(2, "timer_capsg");
}

Kokkos::finalize();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<ParameterList name="Mesh">
<Parameter name="Source" type="string" value="Inline Mesh" />
<ParameterList name="Inline Mesh">
<Parameter name="final time" type="double" value="5e-9"/>
<Parameter name="final time" type="double" value="5.0e-9"/>
<Parameter name="Mesh Dimension" type="int" value="3"/>
<Parameter name="Mesh Type" type="string" value="quad"/>
<Parameter name="CFL" type="double" value="4.0"/>
Expand Down
Loading
Loading