diff --git a/include/general.hh b/include/general.hh
index dcf4388..573ae6c 100644
--- a/include/general.hh
+++ b/include/general.hh
@@ -185,6 +185,8 @@ inline void multitask_sync_barrier(void)
 #endif
 }
+extern size_t global_mem_high_mark, local_mem_high_mark; // defined in main.cc, updated by memory_report()
+
 namespace CONFIG
 {
 extern int MPI_thread_support;
diff --git a/include/memory_stat.hh b/include/memory_stat.hh
new file mode 100644
index 0000000..f0561f2
--- /dev/null
+++ b/include/memory_stat.hh
@@ -0,0 +1,125 @@
+/*
+ * Author: David Robert Nadeau
+ * Site: http://NadeauSoftware.com/
+ * License: Creative Commons Attribution 3.0 Unported License
+ * http://creativecommons.org/licenses/by/3.0/deed.en_US
+ */
+#pragma once
+
+/* System headers stay at global scope: including them inside a namespace would nest their declarations in it. */
+#if defined(_WIN32)
+#include <windows.h>
+#include <psapi.h>
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+#include <unistd.h>
+#include <sys/resource.h>
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <mach/mach.h>
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) || (defined(__sun__) || defined(__sun) || defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+#include <fcntl.h>
+#include <procfs.h>
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__)
+#include <stdio.h>
+
+#endif
+
+#else
+#error "Cannot define getPeakRSS( ) or getCurrentRSS( ) for an unknown OS."
+#endif
+
+namespace memory
+{
+
+/**
+ * Returns the peak (maximum so far) resident set size (physical
+ * memory use) measured in bytes, or zero if the value cannot be
+ * determined on this OS.
+ */
+inline size_t getPeakRSS( )
+{
+#if defined(_WIN32)
+    /* Windows -------------------------------------------------- */
+    PROCESS_MEMORY_COUNTERS info;
+    if ( !GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ) )
+        return (size_t)0L; /* Can't query? info would be uninitialised. */
+    return (size_t)info.PeakWorkingSetSize;
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) || (defined(__sun__) || defined(__sun) || defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+    /* AIX and Solaris ------------------------------------------ */
+    struct psinfo psinfo;
+    int fd = -1;
+    if ( (fd = open( "/proc/self/psinfo", O_RDONLY )) == -1 )
+        return (size_t)0L; /* Can't open? */
+    if ( read( fd, &psinfo, sizeof(psinfo) ) != sizeof(psinfo) )
+    {
+        close( fd );
+        return (size_t)0L; /* Can't read? */
+    }
+    close( fd );
+    return (size_t)psinfo.pr_rssize * (size_t)1024; /* widen before scaling to bytes */
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
+    /* BSD, Linux, and OSX -------------------------------------- */
+    struct rusage rusage;
+    if ( getrusage( RUSAGE_SELF, &rusage ) != 0 )
+        return (size_t)0L; /* Can't query? */
+#if defined(__APPLE__) && defined(__MACH__)
+    return (size_t)rusage.ru_maxrss; /* used unscaled on OSX */
+#else
+    return (size_t)rusage.ru_maxrss * (size_t)1024; /* scaled by 1024 elsewhere; multiply in size_t, not long */
+#endif
+
+#else
+    /* Unknown OS ----------------------------------------------- */
+    return (size_t)0L; /* Unsupported. */
+#endif
+}
+
+
+
+/**
+ * Returns the current resident set size (physical memory use) measured
+ * in bytes, or zero if the value cannot be determined on this OS.
+ */
+inline size_t getCurrentRSS( )
+{
+#if defined(_WIN32)
+    /* Windows -------------------------------------------------- */
+    PROCESS_MEMORY_COUNTERS info;
+    if ( !GetProcessMemoryInfo( GetCurrentProcess( ), &info, sizeof(info) ) )
+        return (size_t)0L; /* Can't query? info would be uninitialised. */
+    return (size_t)info.WorkingSetSize;
+
+#elif defined(__APPLE__) && defined(__MACH__)
+    /* OSX ------------------------------------------------------ */
+    struct mach_task_basic_info info;
+    mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+    if ( task_info( mach_task_self( ), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount ) != KERN_SUCCESS )
+        return (size_t)0L; /* Can't access? */
+    return (size_t)info.resident_size;
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) || defined(__gnu_linux__)
+    /* Linux ---------------------------------------------------- */
+    long rss = 0L; /* page count parsed from the second field of statm */
+    FILE* fp = NULL;
+    if ( (fp = fopen( "/proc/self/statm", "r" )) == NULL )
+        return (size_t)0L; /* Can't open? */
+    if ( fscanf( fp, "%*s%ld", &rss ) != 1 )
+    {
+        fclose( fp );
+        return (size_t)0L; /* Can't read? */
+    }
+    fclose( fp );
+    return (size_t)rss * (size_t)sysconf( _SC_PAGESIZE);
+
+#else
+    /* AIX, BSD, Solaris, and Unknown OS ------------------------ */
+    return (size_t)0L; /* Unsupported. */
+#endif
+}
+
+} // namespace memory
diff --git a/src/grid_fft.cc b/src/grid_fft.cc
index 0668029..f2767af 100644
--- a/src/grid_fft.cc
+++ b/src/grid_fft.cc
@@ -19,6 +19,38 @@
 #include
 #include
+#include "memory_stat.hh"
+
+/**
+ * Logs a new memory high mark when resident memory has grown noticeably.
+ *
+ * Stores this task's current resident set size in local_mem_high_mark, takes
+ * the maximum over all tasks when built with USE_MPI, and writes a message to
+ * music::ilog if that maximum exceeds global_mem_high_mark by more than 10%,
+ * in which case global_mem_high_mark is updated to it.
+ *
+ * With USE_MPI this calls MPI_Allreduce on MPI_COMM_WORLD, so every task has to
+ * reach the call.
+ */
+void memory_report(void)
+{
+    local_mem_high_mark = memory::getCurrentRSS();
+
+    // The reduction uses MPI_UNSIGNED_LONG_LONG, so both buffers are declared as
+    // unsigned long long; size_t is not guaranteed to have that width.
+    unsigned long long task_mem = local_mem_high_mark;
+    unsigned long long curr_mem_high_mark = task_mem;
+#if defined(USE_MPI)
+    MPI_Allreduce(&task_mem, &curr_mem_high_mark, 1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+#endif
+
+    // only report growth of more than 10% over the last reported mark
+    if( curr_mem_high_mark > 1.1*global_mem_high_mark ){
+        music::ilog << "----mem-> new memory high mark: " << curr_mem_high_mark/(1ull<<20) << " MBytes / task" << std::endl;
+        global_mem_high_mark = static_cast<size_t>(curr_mem_high_mark);
+    }
+}
+
 template
 void Grid_FFT::allocate(void)
 {
@@ -175,6 +207,7 @@ void Grid_FFT::allocate(void)
 #endif //// of #ifdef #else USE_MPI ////////////////////////////////////////////////////////////////////////////////////
     }
     ballocated_ = true;
+    memory_report();
 }
 template
diff --git a/src/ic_generator.cc b/src/ic_generator.cc
index 72bdecf..5308fae 100644
--- a/src/ic_generator.cc
+++ b/src/ic_generator.cc
@@ -354,10 +354,10 @@ int run( config_file& the_config )
     // phi = - delta / k^2
     music::ilog << "-------------------------------------------------------------------------------" << std::endl;
-    music::ilog << "Generating LPT fields...." << std::endl;
+    music::ilog << "\n>>> Generating LPT fields.... <<<\n" << std::endl;
     double wtime = get_wtime();
-    music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(1) term" << std::flush;
+    music::ilog << std::setw(40) << std::setfill('.') << std::left << ">> Computing phi(1) term" << std::endl;
     phi.FourierTransformForward(false);
     phi.assign_function_of_grids_kdep([&](auto k, auto wn) {
@@ -368,7 +368,7 @@ int run( config_file& the_config )
     phi.zero_DC_mode();
-    music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl;
+    music::ilog << "----cpu-> phi(1) took " << get_wtime() - wtime << "s" << std::endl;
     //======================================================================
     //... compute 2LPT displacement potential ....
@@ -379,7 +379,7 @@ int run( config_file& the_config ) phi2.FourierTransformForward(false); wtime = get_wtime(); - music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(2) term" << std::flush; + music::ilog << std::setw(40) << std::setfill('.') << std::left << ">> Computing phi(2) term" << std::endl; Conv.convolve_SumOfHessians(phi, {0, 0}, phi, {1, 1}, {2, 2}, op::assign_to(phi2)); Conv.convolve_Hessians(phi, {1, 1}, phi, {2, 2}, op::add_to(phi2)); Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, op::subtract_from(phi2)); @@ -398,7 +398,7 @@ int run( config_file& the_config ) } phi2.apply_InverseLaplacian(); - music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl; + music::ilog << "----cpu-> phi(2) took " << get_wtime() - wtime << "s" << std::endl; if (bAddExternalTides) { @@ -419,19 +419,18 @@ int run( config_file& the_config ) //... phi3 = phi3a - 10/7 phi3b //... 3a term ... wtime = get_wtime(); - music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3a) term" << std::flush; + music::ilog << std::setw(40) << std::setfill('.') << std::left << ">> Computing phi(3a) term" << std::endl; Conv.convolve_Hessians(phi, {0, 0}, phi, {1, 1}, phi, {2, 2}, op::assign_to(phi3)); Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 2}, phi, {1, 2}, op::multiply_add_to(phi3,2.0)); Conv.convolve_Hessians(phi, {1, 2}, phi, {1, 2}, phi, {0, 0}, op::subtract_from(phi3)); Conv.convolve_Hessians(phi, {0, 2}, phi, {0, 2}, phi, {1, 1}, op::subtract_from(phi3)); Conv.convolve_Hessians(phi, {0, 1}, phi, {0, 1}, phi, {2, 2}, op::subtract_from(phi3)); // phi3a.apply_InverseLaplacian(); - music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl; + music::ilog << "----cpu-> phi(3a) took " << get_wtime() - wtime << "s" << std::endl; //... 3b term ... 
wtime = get_wtime(); - music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing phi(3b) term" << std::flush; - // phi3b.FourierTransformForward(false); + music::ilog << std::setw(40) << std::setfill('.') << std::left << ">> Computing phi(3b) term" << std::endl; Conv.convolve_SumOfHessians(phi, {0, 0}, phi2, {1, 1}, {2, 2}, op::multiply_add_to(phi3,-5.0/7.0)); Conv.convolve_SumOfHessians(phi, {1, 1}, phi2, {2, 2}, {0, 0}, op::multiply_add_to(phi3,-5.0/7.0)); Conv.convolve_SumOfHessians(phi, {2, 2}, phi2, {0, 0}, {1, 1}, op::multiply_add_to(phi3,-5.0/7.0)); @@ -439,12 +438,11 @@ int run( config_file& the_config ) Conv.convolve_Hessians(phi, {0, 2}, phi2, {0, 2}, op::multiply_add_to(phi3,+10.0/7.0)); Conv.convolve_Hessians(phi, {1, 2}, phi2, {1, 2}, op::multiply_add_to(phi3,+10.0/7.0)); phi3.apply_InverseLaplacian(); - //phi3b *= 0.5; // factor 1/2 from definition of phi(3b)! - music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl; + music::ilog << "----cpu-> phi(3b) took " << get_wtime() - wtime << "s" << std::endl; //... transversal term ... wtime = get_wtime(); - music::ilog << std::setw(40) << std::setfill('.') << std::left << "Computing A(3) term" << std::flush; + music::ilog << std::setw(40) << std::setfill('.') << std::left << ">> Computing A(3) term" << std::endl; for (int idim = 0; idim < 3; ++idim) { // cyclic rotations of indices @@ -457,7 +455,7 @@ int run( config_file& the_config ) Conv.convolve_DifferenceOfHessians(phi2, {idimp, idimpp}, phi, {idimp, idimp}, {idimpp, idimpp}, op::subtract_from(*A3[idim])); A3[idim]->apply_InverseLaplacian(); } - music::ilog << std::setw(20) << std::setfill(' ') << std::right << "took " << get_wtime() - wtime << "s" << std::endl; + music::ilog << "----cpu-> A(3) took " << get_wtime() - wtime << "s" << std::endl; } ///... 
scale all potentials with respective growth factors
diff --git a/src/main.cc b/src/main.cc
index eb38c10..381b170 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -43,8 +43,11 @@ bool FFTW_threads_ok = false;
 int num_threads = 1;
 }
+size_t global_mem_high_mark, local_mem_high_mark; // memory marks maintained by memory_report()
 #include "system_stat.hh"
+#include "memory_stat.hh"
+
 #include
 #include
@@ -76,6 +79,8 @@ int main( int argc, char** argv )
     music::logger::set_level(music::log_level::debug);
 #endif
+    global_mem_high_mark = local_mem_high_mark = 0;
+
     //------------------------------------------------------------------------------
     // initialise MPI
     //------------------------------------------------------------------------------
@@ -259,13 +264,25 @@ int main( int argc, char** argv )
     ic_generator::reset();
     ///////////////////////////////////////////////////////////////////////
+    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    unsigned long long peak_mem = memory::getPeakRSS(); // unsigned long long to match MPI_UNSIGNED_LONG_LONG below
+#if defined(USE_MPI)
+    unsigned long long peak_mem_max{0};
+    MPI_Allreduce(&peak_mem, &peak_mem_max, 1, MPI_UNSIGNED_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+    peak_mem = peak_mem_max;
+#endif
+
+    if( peak_mem > (1ull<<30) ) // floating-point division so e.g. 1.9 GBytes is not reported as 1
+        music::ilog << "----mem-> peak memory usage was " << static_cast<double>(peak_mem) / (1ull<<30) << " GBytes / task" << std::endl;
+    else
+        music::ilog << "----mem-> peak memory usage was " << peak_mem /(1ull<<20) << " MBytes / task" << std::endl;
+
 #if defined(USE_MPI)
     MPI_Barrier(MPI_COMM_WORLD);
     MPI_Finalize();
 #endif
-
-    music::ilog << "-------------------------------------------------------------------------------" << std::endl;
+    music::ilog << "Done. Have a nice day!\n" << std::endl;
     return 0;