1
0
Fork 0
mirror of https://github.com/cosmo-sims/MUSIC.git synced 2024-09-19 17:03:46 +02:00

WIP refactoring. Got rid of FFTW2, some of the old single/double precision templating,...

This commit is contained in:
Oliver Hahn 2023-02-14 17:12:19 -08:00
parent 265981d93e
commit 6b32a7d6e4
27 changed files with 3654 additions and 4400 deletions

View file

@ -1,4 +1,21 @@
cmake_minimum_required(VERSION 3.9) # This file is part of MUSIC2
# A software package to generate ICs for cosmological simulations
# Copyright (C) 2023 by Oliver Hahn
#
# monofonIC is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# monofonIC is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
cmake_minimum_required(VERSION 3.11)
set(PRGNAME MUSIC) set(PRGNAME MUSIC)
project(MUSIC) project(MUSIC)
@ -27,12 +44,10 @@ mark_as_advanced(CMAKE_CXX_FLAGS_DEBUGSANADD CMAKE_CXX_FLAGS_DEBUGSANUNDEF)
mark_as_advanced(CMAKE_C_FLAGS_DEBUGSANADD CMAKE_C_FLAGS_DEBUGSANUNDEF) mark_as_advanced(CMAKE_C_FLAGS_DEBUGSANADD CMAKE_C_FLAGS_DEBUGSANUNDEF)
mark_as_advanced(CMAKE_EXECUTABLE_FORMAT CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT) mark_as_advanced(CMAKE_EXECUTABLE_FORMAT CMAKE_OSX_ARCHITECTURES CMAKE_OSX_DEPLOYMENT_TARGET CMAKE_OSX_SYSROOT)
# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -march=native -Wall -pedantic -DCMAKE_BUILD")
find_package(PkgConfig REQUIRED) find_package(PkgConfig REQUIRED)
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}") set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}")
option(MUSIC_ENABLE_SINGLE_PRECISION "Enable Single Precision Mode" OFF)
######################################################################################################################## ########################################################################################################################
# OpenMP # OpenMP
@ -52,12 +67,44 @@ find_package(Threads REQUIRED)
if(POLICY CMP0074) if(POLICY CMP0074)
cmake_policy(SET CMP0074 NEW) cmake_policy(SET CMP0074 NEW)
endif() endif()
find_package(FFTW3 COMPONENTS SINGLE DOUBLE THREADS) if(ENABLE_MPI)
find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS MPI)
else()
find_package(FFTW3 COMPONENTS SINGLE DOUBLE LONGDOUBLE OPENMP THREADS)
endif(ENABLE_MPI)
mark_as_advanced(FFTW3_SINGLE_MPI_LIBRARY FFTW3_SINGLE_OPENMP_LIBRARY FFTW3_SINGLE_SERIAL_LIBRARY FFTW3_SINGLE_THREADS_LIBRARY)
mark_as_advanced(FFTW3_DOUBLE_MPI_LIBRARY FFTW3_DOUBLE_OPENMP_LIBRARY FFTW3_DOUBLE_SERIAL_LIBRARY FFTW3_DOUBLE_THREADS_LIBRARY)
mark_as_advanced(FFTW3_LONGDOUBLE_MPI_LIBRARY FFTW3_LONGDOUBLE_OPENMP_LIBRARY FFTW3_LONGDOUBLE_SERIAL_LIBRARY FFTW3_LONGDOUBLE_THREADS_LIBRARY)
mark_as_advanced(FFTW3_INCLUDE_DIR FFTW3_MPI_INCLUDE_DIR)
mark_as_advanced(pkgcfg_lib_PC_FFTW_fftw3)
######################################################################################################################## ########################################################################################################################
# TIRPC, needed only for Tipsy format # TIRPC, needed only for Tipsy format
find_package(TIRPC) find_package(TIRPC)
########################################################################################################################
# GSL
find_package(GSL REQUIRED)
mark_as_advanced(pkgcfg_lib_GSL_gsl pkgcfg_lib_GSL_gslcblas pkgcfg_lib_GSL_m)
########################################################################################################################
# HDF5
find_package(HDF5)
if( HDF5_FOUND )
mark_as_advanced(HDF5_C_LIBRARY_dl HDF5_C_LIBRARY_hdf5 HDF5_C_LIBRARY_m HDF5_C_LIBRARY_pthread HDF5_C_LIBRARY_z HDF5_C_LIBRARY_sz)
endif()
########################################################################################################################
# floating point precision
set (
CODE_PRECISION "DOUBLE"
CACHE STRING "Floating point type used for internal computations and FFTs"
)
set_property (
CACHE CODE_PRECISION
PROPERTY STRINGS FLOAT DOUBLE LONGDOUBLE
)
######################################################################################################################## ########################################################################################################################
# Add a custom command that produces version.cc, plus # Add a custom command that produces version.cc, plus
# a dummy output that's not actually produced, in order # a dummy output that's not actually produced, in order
@ -68,16 +115,6 @@ ADD_CUSTOM_COMMAND(
COMMAND ${CMAKE_COMMAND} -P COMMAND ${CMAKE_COMMAND} -P
${CMAKE_CURRENT_SOURCE_DIR}/version.cmake) ${CMAKE_CURRENT_SOURCE_DIR}/version.cmake)
########################################################################################################################
# GSL
find_package(GSL REQUIRED)
########################################################################################################################
# HDF5
find_package(HDF5)
######################################################################################################################## ########################################################################################################################
# INCLUDES # INCLUDES
include_directories(${PROJECT_SOURCE_DIR}/src) include_directories(${PROJECT_SOURCE_DIR}/src)
@ -112,40 +149,38 @@ list (APPEND SOURCES
# target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho) # target_include_directories(${PRGNAME} PRIVATE ${PROJECT_SOURCE_DIR}/external/panphasia_ho)
endif(ENABLE_PANPHASIA) endif(ENABLE_PANPHASIA)
# project configuration header
configure_file(
${PROJECT_SOURCE_DIR}/src/cmake_config.hh.in
${PROJECT_SOURCE_DIR}/src/cmake_config.hh
)
add_executable(${PRGNAME} ${SOURCES} ${PLUGINS}) add_executable(${PRGNAME} ${SOURCES} ${PLUGINS})
set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 11) set_target_properties(${PRGNAME} PROPERTIES CXX_STANDARD 14)
if(FFTW3_FOUND) if(CODE_PRECISION STREQUAL "FLOAT")
target_compile_options(${PRGNAME} PRIVATE "-DFFTW3") if(FFTW3_SINGLE_THREADS_FOUND)
target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_THREADS)
if( MUSIC_ENABLE_SINGLE_PRECISION ) target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
target_compile_options(${PRGNAME} PRIVATE "-DSINGLE_PRECISION") endif()
if (FFTW3_SINGLE_THREADS_FOUND) target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_SINGLE_SERIAL)
target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_THREADS_LIBRARY}) elseif(CODE_PRECISION STREQUAL "DOUBLE")
target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS") if(FFTW3_DOUBLE_THREADS_FOUND)
elseif(FFTW3_SINGLE_SERIAL_FOUND) target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_THREADS)
target_link_libraries(${PRGNAME} ${FFTW3_SINGLE_SERIAL_LIBRARY}) target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support") endif()
else() target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_DOUBLE_SERIAL)
message( FATAL "chose compilation in single precision, but FFTW3 not found for single precision") elseif(CODE_PRECISION STREQUAL "LONGDOUBLE")
endif() if(FFTW3_LONGDOUBLE_THREADS_FOUND)
else(MUSIC_ENABLE_SINGLE_PRECISION) target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_THREADS)
if (FFTW3_DOUBLE_THREADS_FOUND) target_compile_definitions(${PRGNAME} PRIVATE "USE_FFTW_THREADS")
target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_THREADS_LIBRARY}) endif()
target_compile_options(${PRGNAME} PRIVATE "-DUSE_FFTW_THREADS") target_link_libraries(${PRGNAME} PRIVATE FFTW3::FFTW3_LONGDOUBLE_SERIAL)
elseif(FFTW3_DOUBLE_SERIAL_FOUND) endif()
target_link_libraries(${PRGNAME} ${FFTW3_DOUBLE_SERIAL_LIBRARY})
message( WARNING "using serial version of FFTW3 -- this will most likely cause a very slow version of MUSIC. Rec: install FFTW3 with thread support")
else()
message( FATAL "chose compilation in double precision, but FFTW3 not found for double precision")
endif()
endif(MUSIC_ENABLE_SINGLE_PRECISION)
endif(FFTW3_FOUND)
if(HDF5_FOUND) if(HDF5_FOUND)
# target_link_libraries(${PRGNAME} ${HDF5_C_LIBRARY_DIRS}) target_link_libraries(${PRGNAME} PRIVATE ${HDF5_LIBRARIES})
target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIRS}) target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIRS})
target_compile_options(${PRGNAME} PRIVATE "-DHAVE_HDF5") target_compile_options(${PRGNAME} PRIVATE "-DHAVE_HDF5")
target_compile_options(${PRGNAME} PRIVATE "-DH5_USE_16_API") target_compile_options(${PRGNAME} PRIVATE "-DH5_USE_16_API")
@ -161,11 +196,5 @@ if(ENABLE_PANPHASIA)
target_compile_options(${PRGNAME} PRIVATE "-DHAVE_PANPHASIA") target_compile_options(${PRGNAME} PRIVATE "-DHAVE_PANPHASIA")
endif(ENABLE_PANPHASIA) endif(ENABLE_PANPHASIA)
target_link_libraries(${PRGNAME} ${FFTW3_LIBRARIES}) target_link_libraries(${PRGNAME} PRIVATE GSL::gsl)
target_include_directories(${PRGNAME} PRIVATE ${FFTW3_INCLUDE_DIRS})
target_link_libraries(${PRGNAME} ${GSL_LIBRARIES})
target_include_directories(${PRGNAME} PRIVATE ${GSL_INCLUDE_DIR})
target_link_libraries(${PRGNAME} ${HDF5_LIBRARIES})
target_include_directories(${PRGNAME} PRIVATE ${HDF5_INCLUDE_DIR})

View file

@ -230,3 +230,14 @@ find_package_handle_standard_args(FFTW3
VERSION_VAR FFTW3_VERSION_STRING VERSION_VAR FFTW3_VERSION_STRING
HANDLE_COMPONENTS HANDLE_COMPONENTS
) )
if(FFTW3_FOUND)
foreach(component ${FFTW3_FIND_COMPONENTS})
if(NOT TARGET FFTW3::FFTW3_${component})
add_library(FFTW3::FFTW3_${component} UNKNOWN IMPORTED)
set_target_properties(FFTW3::FFTW3_${component} PROPERTIES
IMPORTED_LOCATION "${FFTW3_${component}_LIBRARY}"
INTERFACE_INCLUDE_DIRECTORIES ${FFTW3_INCLUDE_DIR})
endif()
endforeach()
endif()

View file

@ -130,7 +130,7 @@ protected:
double fftnorm = 1.0/N; double fftnorm = 1.0/N;
fftw_complex in[N], out[N]; complex_t in[N], out[N];
fftw_plan p,ip; fftw_plan p,ip;
//... perform anti-ringing correction from Hamilton (2000) //... perform anti-ringing correction from Hamilton (2000)

25
src/cmake_config.hh.in Normal file
View file

@ -0,0 +1,25 @@
#pragma once
#define USE_PRECISION_${CODE_PRECISION}
#ifdef __cplusplus
constexpr char CMAKE_BUILDTYPE_STR[] = "${CMAKE_BUILD_TYPE}";
#if defined(USE_PRECISION_FLOAT)
constexpr char CMAKE_PRECISION_STR[] = "single";
#elif defined(USE_PRECISION_DOUBLE)
constexpr char CMAKE_PRECISION_STR[] = "double";
#elif defined(USE_PRECISION_LONGDOUBLE)
constexpr char CMAKE_PRECISION_STR[] = "long double";
#endif
// These variables are autogenerated and compiled
// into the library by the version.cmake script. do not touch!
extern "C"
{
extern const char *GIT_TAG;
extern const char *GIT_REV;
extern const char *GIT_BRANCH;
}
#endif // __cplusplus

View file

@ -267,7 +267,7 @@ constraint_set::constraint_set( config_file& cf, transfer_function *ptf )
} }
void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw ) void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, complex_t* cw )
{ {
double lsub = nx*dx; double lsub = nx*dx;
double dk = 2.0*M_PI/lsub, d3k=dk*dk*dk; double dk = 2.0*M_PI/lsub, d3k=dk*dk*dk;
@ -374,7 +374,7 @@ void constraint_set::wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t
void constraint_set::wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 ) void constraint_set::wnoise_constr_corr( double dx, complex_t* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 )
{ {
size_t nconstr = cset_.size(); size_t nconstr = cset_.size();
size_t nzp=nz/2+1; size_t nzp=nz/2+1;

View file

@ -7,84 +7,84 @@
Copyright (C) 2010 Oliver Hahn Copyright (C) 2010 Oliver Hahn
*/ */
#pragma once
#ifndef __CONSTRAINTS_HH
#define __CONSTRAINTS_HH
#include <vector> #include <vector>
#include <complex> #include <complex>
#include <gsl/gsl_linalg.h> #include <gsl/gsl_linalg.h>
#include "general.hh" #include <general.hh>
#include "config_file.hh" #include <config_file.hh>
#include "transfer_function.hh" #include <transfer_function.hh>
#include "cosmology.hh" #include <cosmology.hh>
//! matrix class serving as a gsl wrapper //! matrix class serving as a gsl wrapper
class matrix class matrix
{ {
protected: protected:
gsl_matrix * m_; gsl_matrix *m_;
//double *data_; // double *data_;
size_t M_, N_; size_t M_, N_;
public: public:
matrix( size_t M, size_t N ) matrix(size_t M, size_t N)
: M_(M), N_(N) : M_(M), N_(N)
{ {
m_ = gsl_matrix_alloc(M_,N_); m_ = gsl_matrix_alloc(M_, N_);
} }
matrix( size_t N ) matrix(size_t N)
: M_(N), N_(N) : M_(N), N_(N)
{ {
m_ = gsl_matrix_alloc(M_,N_); m_ = gsl_matrix_alloc(M_, N_);
} }
matrix( const matrix& o ) matrix(const matrix &o)
{ {
M_ = o.M_; M_ = o.M_;
N_ = o.N_; N_ = o.N_;
m_ = gsl_matrix_alloc(M_,N_); m_ = gsl_matrix_alloc(M_, N_);
gsl_matrix_memcpy(m_, o.m_ ); gsl_matrix_memcpy(m_, o.m_);
} }
~matrix() ~matrix()
{ {
gsl_matrix_free( m_ ); gsl_matrix_free(m_);
} }
double& operator()( size_t i, size_t j ) double &operator()(size_t i, size_t j)
{ return *gsl_matrix_ptr( m_, i, j ); }
const double& operator()( size_t i, size_t j ) const
{ return *gsl_matrix_const_ptr( m_, i, j ); }
matrix& operator=( const matrix& o )
{ {
gsl_matrix_free( m_ ); return *gsl_matrix_ptr(m_, i, j);
}
const double &operator()(size_t i, size_t j) const
{
return *gsl_matrix_const_ptr(m_, i, j);
}
matrix &operator=(const matrix &o)
{
gsl_matrix_free(m_);
M_ = o.M_; M_ = o.M_;
N_ = o.N_; N_ = o.N_;
m_ = gsl_matrix_alloc(M_,N_); m_ = gsl_matrix_alloc(M_, N_);
gsl_matrix_memcpy(m_, o.m_ ); gsl_matrix_memcpy(m_, o.m_);
return *this; return *this;
} }
matrix &invert()
matrix& invert()
{ {
if( M_!=N_ ) if (M_ != N_)
throw std::runtime_error("Attempt to invert a non-square matrix!"); throw std::runtime_error("Attempt to invert a non-square matrix!");
int s; int s;
gsl_matrix* im = gsl_matrix_alloc(M_,N_); gsl_matrix *im = gsl_matrix_alloc(M_, N_);
gsl_permutation * p = gsl_permutation_alloc (M_); gsl_permutation *p = gsl_permutation_alloc(M_);
gsl_linalg_LU_decomp( m_, p, &s ); gsl_linalg_LU_decomp(m_, p, &s);
gsl_linalg_LU_invert( m_, p, im ); gsl_linalg_LU_invert(m_, p, im);
gsl_matrix_memcpy(m_, im); gsl_matrix_memcpy(m_, im);
@ -94,20 +94,23 @@ public:
} }
}; };
//! class to impose constraints on the white noise field (van de Weygaert & Bertschinger 1996) //! class to impose constraints on the white noise field (van de Weygaert & Bertschinger 1996)
class constraint_set class constraint_set
{ {
public: public:
enum constr_type{ halo, peak }; enum constr_type
{
halo,
peak
};
protected: protected:
struct constraint
struct constraint{ {
constr_type type; constr_type type;
double x,y,z; double x, y, z;
double gx,gy,gz; double gx, gy, gz;
double Rg, Rg2; double Rg, Rg2;
double gRg, gRg2; double gRg, gRg2;
double sigma; double sigma;
@ -121,48 +124,31 @@ protected:
double dplus0_; double dplus0_;
unsigned constr_level_; unsigned constr_level_;
inline std::complex<double> eval_constr(size_t icon, double kx, double ky, double kz)
inline std::complex<double> eval_constr( size_t icon, double kx, double ky, double kz )
{ {
double re, im, kdotx, k2; double re, im, kdotx, k2;
kdotx = cset_[icon].gx*kx+cset_[icon].gy*ky+cset_[icon].gz*kz; kdotx = cset_[icon].gx * kx + cset_[icon].gy * ky + cset_[icon].gz * kz;
k2 = kx*kx+ky*ky+kz*kz; k2 = kx * kx + ky * ky + kz * kz;
re = im = exp(-k2*cset_[icon].gRg2/2.0); re = im = exp(-k2 * cset_[icon].gRg2 / 2.0);
re *= cos( kdotx ); re *= cos(kdotx);
im *= sin( kdotx ); im *= sin(kdotx);
return std::complex<double>(re,im); return std::complex<double>(re, im);
} }
#if defined(FFTW3) && defined(SINGLE_PRECISION)
//! apply constraints to the white noise //! apply constraints to the white noise
void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftwf_complex* cw ); void wnoise_constr_corr(double dx, size_t nx, size_t ny, size_t nz, std::vector<double> &g0, matrix &cinv, complex_t *cw);
//! measure sigma for each constraint in the unconstrained noise //! measure sigma for each constraint in the unconstrained noise
void wnoise_constr_corr( double dx, fftwf_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 ); void wnoise_constr_corr(double dx, complex_t *cw, size_t nx, size_t ny, size_t nz, std::vector<double> &g0);
#else
//! apply constraints to the white noise
void wnoise_constr_corr( double dx, size_t nx, size_t ny, size_t nz, std::vector<double>& g0, matrix& cinv, fftw_complex* cw );
//! measure sigma for each constraint in the unconstrained noise
void wnoise_constr_corr( double dx, fftw_complex* cw, size_t nx, size_t ny, size_t nz, std::vector<double>& g0 );
#endif
//! compute the covariance between the constraints //! compute the covariance between the constraints
void icov_constr( double dx, size_t nx, size_t ny, size_t nz, matrix& cij ); void icov_constr(double dx, size_t nx, size_t ny, size_t nz, matrix &cij);
public: public:
//! constructor //! constructor
constraint_set( config_file& cf, transfer_function *ptf ); constraint_set(config_file &cf, transfer_function *ptf);
//! destructor //! destructor
~constraint_set() ~constraint_set()
@ -171,252 +157,151 @@ public:
delete pcosmo_; delete pcosmo_;
} }
template <typename rng>
template< typename rng > void apply(unsigned ilevel, int x0[], int lx[], rng *wnoise)
void apply( unsigned ilevel, int x0[], int lx[], rng* wnoise )
{ {
if( cset_.size() == 0 || constr_level_ != ilevel ) if (cset_.size() == 0 || constr_level_ != ilevel)
return; return;
unsigned nlvl = 1<<ilevel; unsigned nlvl = 1 << ilevel;
double boxlength = pcf_->get_value<double>("setup","boxlength"); double boxlength = pcf_->get_value<double>("setup", "boxlength");
//... compute constraint coordinates for grid //... compute constraint coordinates for grid
for( size_t i=0; i<cset_.size(); ++i ) for (size_t i = 0; i < cset_.size(); ++i)
{ {
cset_[i].gx = cset_[i].x * (double)nlvl; cset_[i].gx = cset_[i].x * (double)nlvl;
cset_[i].gy = cset_[i].y * (double)nlvl; cset_[i].gy = cset_[i].y * (double)nlvl;
cset_[i].gz = cset_[i].z * (double)nlvl; cset_[i].gz = cset_[i].z * (double)nlvl;
cset_[i].gRg = cset_[i].Rg/boxlength * (double)nlvl; cset_[i].gRg = cset_[i].Rg / boxlength * (double)nlvl;
cset_[i].gRg2 = cset_[i].gRg*cset_[i].gRg; cset_[i].gRg2 = cset_[i].gRg * cset_[i].gRg;
if(cset_[i].gRg > 0.5*lx[0]) if (cset_[i].gRg > 0.5 * lx[0])
music::wlog.Print("Constraint %d appears to be too large scale",i); music::wlog.Print("Constraint %d appears to be too large scale", i);
} }
std::vector<double> g0; std::vector<double> g0;
// unsigned levelmax = pcf_->get_value<unsigned>("setup","levelmax"); // unsigned levelmax = pcf_->get_value<unsigned>("setup","levelmax");
unsigned levelmin = pcf_->get_value<unsigned>("setup","levelmin_TF"); unsigned levelmin = pcf_->get_value<unsigned>("setup", "levelmin_TF");
bool bperiodic = ilevel==levelmin;
double dx = pcf_->get_value<double>("setup","boxlength")/(1<<ilevel);
bool bperiodic = ilevel == levelmin;
double dx = pcf_->get_value<double>("setup", "boxlength") / (1 << ilevel);
music::ilog.Print("Computing constrained realization..."); music::ilog.Print("Computing constrained realization...");
if( bperiodic ) if (bperiodic)
{ {
//... we are operating on the periodic coarse grid //... we are operating on the periodic coarse grid
size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz+2; size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
fftw_real * w = new fftw_real[nx*ny*nzp]; real_t *w = new real_t[nx * ny * nzp];
complex_t *cw = reinterpret_cast<complex_t *>(w);
fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
#ifdef FFTW3 double fftnorm = 1.0 / sqrt(nx * ny * nz);
#ifdef SINGLE_PRECISION
fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
fftwf_plan p = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
#else
fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
fftw_plan p = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
#endif
#else
fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
rfftwnd_plan p = rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
#endif
double fftnorm = 1.0/sqrt(nx*ny*nz); #pragma omp parallel for
for (int i = 0; i < (int)nx; i++)
#pragma omp parallel for for (int j = 0; j < (int)ny; j++)
for( int i=0; i<(int)nx; i++ ) for (int k = 0; k < (int)nz; k++)
for( int j=0; j<(int)ny; j++ )
for( int k=0; k<(int)nz; k++ )
{ {
size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k; size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
w[q] = (*wnoise)((x0[0]+i)%nx,(x0[1]+j)%ny,(x0[2]+k)%nz)*fftnorm; w[q] = (*wnoise)((x0[0] + i) % nx, (x0[1] + j) % ny, (x0[2] + k) % nz) * fftnorm;
} }
#ifdef FFTW3 FFTW_API(execute)(p);
#ifdef SINGLE_PRECISION wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
fftwf_execute( p );
#else
fftw_execute( p );
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
#else
rfftwnd_one_real_to_complex( p, w, NULL );
#endif
#endif
wnoise_constr_corr( dx, cw, nx, ny, nz, g0 );
matrix c(2,2); matrix c(2, 2);
icov_constr( dx, nx, ny, nz, c ); icov_constr(dx, nx, ny, nz, c);
wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
wnoise_constr_corr( dx, nx, ny, nz, g0, c, cw ); FFTW_API(execute)(ip);
#ifdef FFTW3 #pragma omp parallel for
#ifdef SINGLE_PRECISION for (int i = 0; i < (int)nx; i++)
fftwf_execute( ip ); for (int j = 0; j < (int)ny; j++)
#else for (int k = 0; k < (int)nz; k++)
fftw_execute( ip );
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
#else
rfftwnd_one_complex_to_real( ip, cw, NULL );
#endif
#endif
#pragma omp parallel for
for( int i=0; i<(int)nx; i++ )
for( int j=0; j<(int)ny; j++ )
for( int k=0; k<(int)nz; k++ )
{ {
size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k; size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
(*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k)) = w[q]*fftnorm; (*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) = w[q] * fftnorm;
} }
music::ilog.Print("Applied constraints to level %d.",ilevel); music::ilog.Print("Applied constraints to level %d.", ilevel);
delete[] w; delete[] w;
FFTW_API(destroy_plan)(p);
#ifdef FFTW3 FFTW_API(destroy_plan)(ip);
#ifdef SINGLE_PRECISION }
fftwf_destroy_plan(p); else
#else {
fftw_destroy_plan(p);
#endif
#else
fftwnd_destroy_plan(p);
#endif
}else{
//... we are operating on a refinement grid, not necessarily the finest //... we are operating on a refinement grid, not necessarily the finest
size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz+2; size_t nx = lx[0], ny = lx[1], nz = lx[2], nzp = nz + 2;
fftw_real * w = new fftw_real[nx*ny*nzp]; real_t *w = new real_t[nx * ny * nzp];
complex_t *cw = reinterpret_cast<complex_t *>(w);
fftw_plan_t p = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cw, w, FFTW_ESTIMATE);
#ifdef FFTW3 double fftnorm = 1.0 / sqrt(nx * ny * nz);
#ifdef SINGLE_PRECISION
fftwf_complex * cw = reinterpret_cast<fftwf_complex*> (w);
fftwf_plan p = fftwf_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = fftwf_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
#else
fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
fftw_plan p = fftw_plan_dft_r2c_3d( nx, ny, nz, w, cw, FFTW_ESTIMATE),
ip = fftw_plan_dft_c2r_3d( nx, ny, nz, cw, w, FFTW_ESTIMATE);
#endif
#else
fftw_complex * cw = reinterpret_cast<fftw_complex*> (w);
rfftwnd_plan p = rfftw3d_create_plan( nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
ip = rfftw3d_create_plan( nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
#endif
double fftnorm = 1.0/sqrt(nx*ny*nz); int il = nx / 4, ir = 3 * nx / 4, jl = ny / 4, jr = 3 * ny / 4, kl = nz / 4, kr = 3 * nz / 4;
int il = nx/4, ir = 3*nx/4, jl=ny/4, jr = 3*ny/4, kl = nz/4, kr = 3*nz/4; #pragma omp parallel for
for (int i = 0; i < (int)nx; i++)
#pragma omp parallel for for (int j = 0; j < (int)ny; j++)
for( int i=0; i<(int)nx; i++ ) for (int k = 0; k < (int)nz; k++)
for( int j=0; j<(int)ny; j++ )
for( int k=0; k<(int)nz; k++ )
{ {
size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k; size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
if( i>=il && i<ir && j>=jl && j<jr && k>=kl && k<kr ) if (i >= il && i < ir && j >= jl && j < jr && k >= kl && k < kr)
w[q] = (*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k))*fftnorm; w[q] = (*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) * fftnorm;
else else
w[q] = 0.0; w[q] = 0.0;
} }
int nlvl05 = 1<<(ilevel-1); int nlvl05 = 1 << (ilevel - 1);
int xs = nlvl05-x0[0], ys = nlvl05-x0[1], zs = nlvl05-x0[2]; int xs = nlvl05 - x0[0], ys = nlvl05 - x0[1], zs = nlvl05 - x0[2];
for( size_t i=0; i<cset_.size(); ++i ) for (size_t i = 0; i < cset_.size(); ++i)
{ {
cset_[i].gx -= xs; cset_[i].gx -= xs;
cset_[i].gy -= ys; cset_[i].gy -= ys;
cset_[i].gz -= zs; cset_[i].gz -= zs;
} }
#ifdef FFTW3 FFTW_API(execute)(p);
#ifdef SINGLE_PRECISION
fftwf_execute( p );
#else
fftw_execute( p );
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), p, w, NULL );
#else
rfftwnd_one_real_to_complex( p, w, NULL );
#endif
#endif
wnoise_constr_corr( dx, cw, nx, ny, nz, g0 );
matrix c(2,2); wnoise_constr_corr(dx, cw, nx, ny, nz, g0);
icov_constr( dx, nx, ny, nz, c );
matrix c(2, 2);
icov_constr(dx, nx, ny, nz, c);
wnoise_constr_corr( dx, nx, ny, nz, g0, c, cw ); wnoise_constr_corr(dx, nx, ny, nz, g0, c, cw);
#ifdef FFTW3 FFTW_API(execute)(ip);
#ifdef SINGLE_PRECISION
fftwf_execute( ip );
#else
fftw_execute( ip );
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), ip, cw, NULL );
#else
rfftwnd_one_complex_to_real( ip, cw, NULL );
#endif
#endif
#pragma omp parallel for #pragma omp parallel for
for( int i=0; i<(int)nx; i++ ) for (int i = 0; i < (int)nx; i++)
for( int j=0; j<(int)ny; j++ ) for (int j = 0; j < (int)ny; j++)
for( int k=0; k<(int)nz; k++ ) for (int k = 0; k < (int)nz; k++)
{ {
size_t q = ((size_t)i*ny+(size_t)j)*nzp+(size_t)k; size_t q = ((size_t)i * ny + (size_t)j) * nzp + (size_t)k;
if( i>=il && i<ir && j>=jl && j<jr && k>=kl && k<kr ) if (i >= il && i < ir && j >= jl && j < jr && k >= kl && k < kr)
(*wnoise)((x0[0]+i),(x0[1]+j),(x0[2]+k)) = w[q]*fftnorm; (*wnoise)((x0[0] + i), (x0[1] + j), (x0[2] + k)) = w[q] * fftnorm;
} }
music::ilog.Print("Applied constraints to level %d.", ilevel);
music::ilog.Print("Applied constraints to level %d.",ilevel);
delete[] w; delete[] w;
FFTW_API(destroy_plan)(p);
#ifdef FFTW3 FFTW_API(destroy_plan)(ip);
#ifdef SINGLE_PRECISION
fftwf_destroy_plan(p);
#else
fftw_destroy_plan(p);
#endif
#else
fftwnd_destroy_plan(p);
#endif
} }
} }
}; };
#endif // __CONSTRAINTS_HH

View file

@ -7,13 +7,9 @@
*/ */
#include "general.hh" #include <general.hh>
#include "densities.hh" #include <densities.hh>
#include "convolution_kernel.hh" #include <convolution_kernel.hh>
#if defined(FFTW3) && defined(SINGLE_PRECISION)
typedef fftw_complex fftwf_complex;
#endif
namespace convolution namespace convolution
{ {
@ -25,7 +21,6 @@ get_kernel_map()
return kernel_map; return kernel_map;
} }
template <typename real_t>
void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip) void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
{ {
//return; //return;
@ -34,49 +29,26 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
double fftnormp = 1.0/sqrt((double)cparam_.nx * (double)cparam_.ny * (double)cparam_.nz); double fftnormp = 1.0/sqrt((double)cparam_.nx * (double)cparam_.ny * (double)cparam_.nz);
double fftnorm = pow(2.0 * M_PI, 1.5) / sqrt(cparam_.lx * cparam_.ly * cparam_.lz) * fftnormp; double fftnorm = pow(2.0 * M_PI, 1.5) / sqrt(cparam_.lx * cparam_.ly * cparam_.lz) * fftnormp;
fftw_complex *cdata; complex_t *cdata;
[[maybe_unused]] fftw_complex *ckernel; [[maybe_unused]] complex_t *ckernel;
fftw_real *data; real_t *data;
data = reinterpret_cast<fftw_real *>(pd); data = reinterpret_cast<real_t *>(pd);
cdata = reinterpret_cast<fftw_complex *>(data); cdata = reinterpret_cast<complex_t *>(data);
ckernel = reinterpret_cast<fftw_complex *>(pk->get_ptr()); ckernel = reinterpret_cast<complex_t *>(pk->get_ptr());
std::cout << " - Performing density convolution... (" std::cout << " - Performing density convolution... ("
<< cparam_.nx << ", " << cparam_.ny << ", " << cparam_.nz << ")\n"; << cparam_.nx << ", " << cparam_.ny << ", " << cparam_.nz << ")\n";
music::ulog.Print("Performing kernel convolution on (%5d,%5d,%5d) grid", cparam_.nx, cparam_.ny, cparam_.nz); music::ulog.Print("Performing kernel convolution on (%5d,%5d,%5d) grid", cparam_.nx, cparam_.ny, cparam_.nz);
music::ulog.Print("Performing forward FFT..."); music::ulog.Print("Performing forward FFT...");
#ifdef FFTW3
#ifdef SINGLE_PRECISION
fftwf_plan plan, iplan;
plan = fftwf_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
iplan = fftwf_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
fftwf_execute(plan); fftw_plan_t plan, iplan;
#else plan = FFTW_API(plan_dft_r2c_3d)(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
fftw_plan plan, iplan; iplan = FFTW_API(plan_dft_c2r_3d)(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
plan = fftw_plan_dft_r2c_3d(cparam_.nx, cparam_.ny, cparam_.nz, data, cdata, FFTW_ESTIMATE);
iplan = fftw_plan_dft_c2r_3d(cparam_.nx, cparam_.ny, cparam_.nz, cdata, data, FFTW_ESTIMATE);
fftw_execute(plan); FFTW_API(execute)(plan);
#endif
#else
rfftwnd_plan iplan, plan;
plan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
iplan = rfftw3d_create_plan(cparam_.nx, cparam_.ny, cparam_.nz,
FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
#else
rfftwnd_one_real_to_complex(plan, data, NULL);
#endif
#endif
//..... need a phase shift for baryons for SPH //..... need a phase shift for baryons for SPH
double dstag = 0.0; double dstag = 0.0;
@ -163,27 +135,9 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
music::ulog.Print("Performing backward FFT..."); music::ulog.Print("Performing backward FFT...");
#ifdef FFTW3 FFTW_API(execute)(iplan);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_execute(iplan); FFTW_API(destroy_plan)(iplan);
fftwf_destroy_plan(plan);
fftwf_destroy_plan(iplan);
#else
fftw_execute(iplan);
fftw_destroy_plan(plan);
fftw_destroy_plan(iplan);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
#else
rfftwnd_one_complex_to_real(iplan, cdata, NULL);
#endif
rfftwnd_destroy_plan(plan);
rfftwnd_destroy_plan(iplan);
#endif
// set the DC mode here to avoid a possible truncation error in single precision // set the DC mode here to avoid a possible truncation error in single precision
{ {
@ -196,14 +150,12 @@ void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip)
} }
} }
template void perform<double>(kernel *pk, void *pd, bool shift, bool fix, bool flip); void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip);
template void perform<float>(kernel *pk, void *pd, bool shift, bool fix, bool flip);
/*****************************************************************************************/ /*****************************************************************************************/
/*** SPECIFIC KERNEL IMPLEMENTATIONS *********************************************/ /*** SPECIFIC KERNEL IMPLEMENTATIONS *********************************************/
/*****************************************************************************************/ /*****************************************************************************************/
template <typename real_t>
class kernel_k : public kernel class kernel_k : public kernel
{ {
protected: protected:
@ -298,8 +250,4 @@ public:
/**************************************************************************************/ /**************************************************************************************/
/**************************************************************************************/ /**************************************************************************************/
namespace convolution::kernel_creator_concrete<convolution::kernel_k> creator_kd("tf_kernel_k");
{
convolution::kernel_creator_concrete<convolution::kernel_k<double>> creator_kd("tf_kernel_k_double");
convolution::kernel_creator_concrete<convolution::kernel_k<float>> creator_kf("tf_kernel_k_float");
} // namespace

View file

@ -112,7 +112,6 @@ struct kernel_creator_concrete : public kernel_creator
}; };
//! actual implementation of the FFT convolution (independent of the actual kernel) //! actual implementation of the FFT convolution (independent of the actual kernel)
template <typename real_t>
void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip); void perform(kernel *pk, void *pd, bool shift, bool fix, bool flip);
} //namespace convolution } //namespace convolution

View file

@ -13,246 +13,199 @@
#include "mg_operators.hh" #include "mg_operators.hh"
#include "general.hh" #include "general.hh"
#define ACC(i,j,k) ((*u.get_grid((ilevel)))((i),(j),(k))) #define ACC(i, j, k) ((*u.get_grid((ilevel)))((i), (j), (k)))
#define SQR(x) ((x)*(x)) #define SQR(x) ((x) * (x))
#if defined(FFTW3) && defined(SINGLE_PRECISION) void compute_LLA_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
#define fftw_complex fftwf_complex
#endif
void compute_LLA_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order )
{ {
fnew = u; fnew = u;
for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel ) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0,ilevel), h2 = h*h, h2_4 = 0.25*h2; double h = pow(2.0, ilevel), h2 = h * h, h2_4 = 0.25 * h2;
meshvar_bnd *pvar = fnew.get_grid(ilevel); meshvar_bnd *pvar = fnew.get_grid(ilevel);
if (order == 2)
if( order == 2 )
{ {
#pragma omp parallel for //reduction(+:sum_corr,sum,sum2) #pragma omp parallel for // reduction(+:sum_corr,sum,sum2)
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
double D[3][3]; double D[3][3];
D[0][0] = (ACC(ix-1,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+1,iy,iz)) * h2; D[0][0] = (ACC(ix - 1, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 1, iy, iz)) * h2;
D[1][1] = (ACC(ix,iy-1,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+1,iz)) * h2; D[1][1] = (ACC(ix, iy - 1, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 1, iz)) * h2;
D[2][2] = (ACC(ix,iy,iz-1)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+1)) * h2; D[2][2] = (ACC(ix, iy, iz - 1) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 1)) * h2;
D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4; D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4; D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4; D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
D[0][0] += 1.0; D[0][0] += 1.0;
D[1][1] += 1.0; D[1][1] += 1.0;
D[2][2] += 1.0; D[2][2] += 1.0;
double det = D[0][0]*D[1][1]*D[2][2] double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
- D[0][0]*D[1][2]*D[2][1]
- D[1][0]*D[0][1]*D[2][2]
+ D[1][0]*D[0][2]*D[1][2]
+ D[2][0]*D[0][1]*D[1][2]
- D[2][0]*D[0][2]*D[1][1];
(*pvar)(ix,iy,iz) = 1.0/det-1.0;
(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
} }
} }
else if ( order == 4 ) else if (order == 4)
{ {
#pragma omp parallel for #pragma omp parallel for
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
double D[3][3]; double D[3][3];
D[0][0] = (-ACC(ix-2,iy,iz)+16.*ACC(ix-1,iy,iz)-30.0*ACC(ix,iy,iz)+16.*ACC(ix+1,iy,iz)-ACC(ix+2,iy,iz)) * h2/12.0; D[0][0] = (-ACC(ix - 2, iy, iz) + 16. * ACC(ix - 1, iy, iz) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix + 1, iy, iz) - ACC(ix + 2, iy, iz)) * h2 / 12.0;
D[1][1] = (-ACC(ix,iy-2,iz)+16.*ACC(ix,iy-1,iz)-30.0*ACC(ix,iy,iz)+16.*ACC(ix,iy+1,iz)-ACC(ix,iy+2,iz)) * h2/12.0; D[1][1] = (-ACC(ix, iy - 2, iz) + 16. * ACC(ix, iy - 1, iz) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix, iy + 1, iz) - ACC(ix, iy + 2, iz)) * h2 / 12.0;
D[2][2] = (-ACC(ix,iy,iz-2)+16.*ACC(ix,iy,iz-1)-30.0*ACC(ix,iy,iz)+16.*ACC(ix,iy,iz+1)-ACC(ix,iy,iz+2)) * h2/12.0; D[2][2] = (-ACC(ix, iy, iz - 2) + 16. * ACC(ix, iy, iz - 1) - 30.0 * ACC(ix, iy, iz) + 16. * ACC(ix, iy, iz + 1) - ACC(ix, iy, iz + 2)) * h2 / 12.0;
D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4;
D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4;
D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4;
D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
D[0][0] += 1.0; D[0][0] += 1.0;
D[1][1] += 1.0; D[1][1] += 1.0;
D[2][2] += 1.0; D[2][2] += 1.0;
double det = D[0][0]*D[1][1]*D[2][2] double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
- D[0][0]*D[1][2]*D[2][1]
- D[1][0]*D[0][1]*D[2][2]
+ D[1][0]*D[0][2]*D[1][2]
+ D[2][0]*D[0][1]*D[1][2]
- D[2][0]*D[0][2]*D[1][1];
(*pvar)(ix,iy,iz) = 1.0/det-1.0;
(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
} }
} }
else if ( order == 6 ) else if (order == 6)
{ {
h2_4/=36.; h2_4 /= 36.;
h2/=180.; h2 /= 180.;
#pragma omp parallel for #pragma omp parallel for
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
double D[3][3]; double D[3][3];
D[0][0] = (2.*ACC(ix-3,iy,iz)-27.*ACC(ix-2,iy,iz)+270.*ACC(ix-1,iy,iz)-490.0*ACC(ix,iy,iz)+270.*ACC(ix+1,iy,iz)-27.*ACC(ix+2,iy,iz)+2.*ACC(ix+3,iy,iz)) * h2; D[0][0] = (2. * ACC(ix - 3, iy, iz) - 27. * ACC(ix - 2, iy, iz) + 270. * ACC(ix - 1, iy, iz) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix + 1, iy, iz) - 27. * ACC(ix + 2, iy, iz) + 2. * ACC(ix + 3, iy, iz)) * h2;
D[1][1] = (2.*ACC(ix,iy-3,iz)-27.*ACC(ix,iy-2,iz)+270.*ACC(ix,iy-1,iz)-490.0*ACC(ix,iy,iz)+270.*ACC(ix,iy+1,iz)-27.*ACC(ix,iy+2,iz)+2.*ACC(ix,iy+3,iz)) * h2; D[1][1] = (2. * ACC(ix, iy - 3, iz) - 27. * ACC(ix, iy - 2, iz) + 270. * ACC(ix, iy - 1, iz) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix, iy + 1, iz) - 27. * ACC(ix, iy + 2, iz) + 2. * ACC(ix, iy + 3, iz)) * h2;
D[2][2] = (2.*ACC(ix,iy,iz-3)-27.*ACC(ix,iy,iz-2)+270.*ACC(ix,iy,iz-1)-490.0*ACC(ix,iy,iz)+270.*ACC(ix,iy,iz+1)-27.*ACC(ix,iy,iz+2)+2.*ACC(ix,iy,iz+3)) * h2; D[2][2] = (2. * ACC(ix, iy, iz - 3) - 27. * ACC(ix, iy, iz - 2) + 270. * ACC(ix, iy, iz - 1) - 490.0 * ACC(ix, iy, iz) + 270. * ACC(ix, iy, iz + 1) - 27. * ACC(ix, iy, iz + 2) + 2. * ACC(ix, iy, iz + 3)) * h2;
//.. this is actually 8th order accurate //.. this is actually 8th order accurate
D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz)) D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_4;
-8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz) D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_4;
+ ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz)) D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_4;
+1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_4;
D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
-8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
+ ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
+1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_4;
D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
-8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
+ ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
+1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_4;
D[0][0] += 1.0; D[0][0] += 1.0;
D[1][1] += 1.0; D[1][1] += 1.0;
D[2][2] += 1.0; D[2][2] += 1.0;
double det = D[0][0]*D[1][1]*D[2][2] double det = D[0][0] * D[1][1] * D[2][2] - D[0][0] * D[1][2] * D[2][1] - D[1][0] * D[0][1] * D[2][2] + D[1][0] * D[0][2] * D[1][2] + D[2][0] * D[0][1] * D[1][2] - D[2][0] * D[0][2] * D[1][1];
- D[0][0]*D[1][2]*D[2][1]
- D[1][0]*D[0][1]*D[2][2]
+ D[1][0]*D[0][2]*D[1][2]
+ D[2][0]*D[0][1]*D[1][2]
- D[2][0]*D[0][2]*D[1][1];
(*pvar)(ix,iy,iz) = 1.0/det-1.0;
(*pvar)(ix, iy, iz) = 1.0 / det - 1.0;
} }
}
}else else
throw std::runtime_error("compute_LLA_density : invalid operator order specified"); throw std::runtime_error("compute_LLA_density : invalid operator order specified");
} }
} }
void compute_Lu_density(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
void compute_Lu_density( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order )
{ {
fnew = u; fnew = u;
for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel ) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0,ilevel), h2 = h*h; double h = pow(2.0, ilevel), h2 = h * h;
meshvar_bnd *pvar = fnew.get_grid(ilevel); meshvar_bnd *pvar = fnew.get_grid(ilevel);
#pragma omp parallel for #pragma omp parallel for
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
double D[3][3]; double D[3][3];
D[0][0] = 1.0 + (ACC(ix-1,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+1,iy,iz)) * h2; D[0][0] = 1.0 + (ACC(ix - 1, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 1, iy, iz)) * h2;
D[1][1] = 1.0 + (ACC(ix,iy-1,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+1,iz)) * h2; D[1][1] = 1.0 + (ACC(ix, iy - 1, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 1, iz)) * h2;
D[2][2] = 1.0 + (ACC(ix,iy,iz-1)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+1)) * h2; D[2][2] = 1.0 + (ACC(ix, iy, iz - 1) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 1)) * h2;
(*pvar)(ix,iy,iz) = -(D[0][0]+D[1][1]+D[2][2] - 3.0);
(*pvar)(ix, iy, iz) = -(D[0][0] + D[1][1] + D[2][2] - 3.0);
} }
} }
} }
void compute_2LPT_source_FFT(config_file &cf_, const grid_hierarchy &u, grid_hierarchy &fnew)
void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hierarchy& fnew )
{ {
if( u.levelmin() != u.levelmax() ) if (u.levelmin() != u.levelmax())
throw std::runtime_error("FFT 2LPT can only be run in Unigrid mode!"); throw std::runtime_error("FFT 2LPT can only be run in Unigrid mode!");
fnew = u; fnew = u;
size_t nx,ny,nz,nzp; size_t nx, ny, nz, nzp;
nx = u.get_grid(u.levelmax())->size(0); nx = u.get_grid(u.levelmax())->size(0);
ny = u.get_grid(u.levelmax())->size(1); ny = u.get_grid(u.levelmax())->size(1);
nz = u.get_grid(u.levelmax())->size(2); nz = u.get_grid(u.levelmax())->size(2);
nzp = 2*(nz/2+1); nzp = 2 * (nz / 2 + 1);
//... copy data .................................................. //... copy data ..................................................
fftw_real *data = new fftw_real[nx*ny*nzp]; real_t *data = new real_t[nx * ny * nzp];
fftw_complex *cdata = reinterpret_cast<fftw_complex*> (data); complex_t *cdata = reinterpret_cast<complex_t *>(data);
fftw_complex *cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33; complex_t *cdata_11, *cdata_12, *cdata_13, *cdata_22, *cdata_23, *cdata_33;
fftw_real *data_11, *data_12, *data_13, *data_22, *data_23, *data_33; real_t *data_11, *data_12, *data_13, *data_22, *data_23, *data_33;
data_11 = new fftw_real[nx*ny*nzp]; cdata_11 = reinterpret_cast<fftw_complex*> (data_11); data_11 = new real_t[nx * ny * nzp];
data_12 = new fftw_real[nx*ny*nzp]; cdata_12 = reinterpret_cast<fftw_complex*> (data_12); cdata_11 = reinterpret_cast<complex_t *>(data_11);
data_13 = new fftw_real[nx*ny*nzp]; cdata_13 = reinterpret_cast<fftw_complex*> (data_13); data_12 = new real_t[nx * ny * nzp];
data_22 = new fftw_real[nx*ny*nzp]; cdata_22 = reinterpret_cast<fftw_complex*> (data_22); cdata_12 = reinterpret_cast<complex_t *>(data_12);
data_23 = new fftw_real[nx*ny*nzp]; cdata_23 = reinterpret_cast<fftw_complex*> (data_23); data_13 = new real_t[nx * ny * nzp];
data_33 = new fftw_real[nx*ny*nzp]; cdata_33 = reinterpret_cast<fftw_complex*> (data_33); cdata_13 = reinterpret_cast<complex_t *>(data_13);
data_22 = new real_t[nx * ny * nzp];
cdata_22 = reinterpret_cast<complex_t *>(data_22);
data_23 = new real_t[nx * ny * nzp];
cdata_23 = reinterpret_cast<complex_t *>(data_23);
data_33 = new real_t[nx * ny * nzp];
cdata_33 = reinterpret_cast<complex_t *>(data_33);
#pragma omp parallel for #pragma omp parallel for
for( int i=0; i<(int)nx; ++i ) for (int i = 0; i < (int)nx; ++i)
for( size_t j=0; j<ny; ++j ) for (size_t j = 0; j < ny; ++j)
for( size_t k=0; k<nz; ++k ) for (size_t k = 0; k < nz; ++k)
{ {
size_t idx = ((size_t)i*ny+j)*nzp+k; size_t idx = ((size_t)i * ny + j) * nzp + k;
data[idx] = (*u.get_grid(u.levelmax()))(i,j,k); data[idx] = (*u.get_grid(u.levelmax()))(i, j, k);
} }
//... perform FFT and Poisson solve................................ //... perform FFT and Poisson solve................................
#ifdef FFTW3
#ifdef SINGLE_PRECISION fftw_plan_t
fftwf_plan plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
plan = fftwf_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE), iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE),
iplan = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE), ip11 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_11, data_11, FFTW_ESTIMATE),
ip11 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE), ip12 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_12, data_12, FFTW_ESTIMATE),
ip12 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE), ip13 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_13, data_13, FFTW_ESTIMATE),
ip13 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE), ip22 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_22, data_22, FFTW_ESTIMATE),
ip22 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE), ip23 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_23, data_23, FFTW_ESTIMATE),
ip23 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE), ip33 = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata_33, data_33, FFTW_ESTIMATE);
ip33 = fftwf_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
fftwf_execute(plan); FFTW_API(execute)
(plan);
#else double kfac = 2.0 * M_PI;
double norm = 1.0 / ((double)(nx * ny * nz));
fftw_plan #pragma omp parallel for
plan = fftw_plan_dft_r2c_3d(nx,ny,nz, data, cdata, FFTW_ESTIMATE), for (int i = 0; i < (int)nx; ++i)
iplan = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata, data, FFTW_ESTIMATE), for (size_t j = 0; j < ny; ++j)
ip11 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_11, data_11, FFTW_ESTIMATE), for (size_t l = 0; l < nz / 2 + 1; ++l)
ip12 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_12, data_12, FFTW_ESTIMATE),
ip13 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_13, data_13, FFTW_ESTIMATE),
ip22 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_22, data_22, FFTW_ESTIMATE),
ip23 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_23, data_23, FFTW_ESTIMATE),
ip33 = fftw_plan_dft_c2r_3d(nx,ny,nz, cdata_33, data_33, FFTW_ESTIMATE);
fftw_execute(plan);
#endif
double kfac = 2.0*M_PI;
double norm = 1.0/((double)(nx*ny*nz));
#pragma omp parallel for
for( int i=0; i<(int)nx; ++i )
for( size_t j=0; j<ny; ++j )
for( size_t l=0; l<nz/2+1; ++l )
{ {
int ii = i; if(ii>(int)nx/2) ii-=nx; int ii = i;
int jj = (int)j; if(jj>(int)ny/2) jj-=ny; if (ii > (int)nx / 2)
ii -= nx;
int jj = (int)j;
if (jj > (int)ny / 2)
jj -= ny;
double ki = (double)ii; double ki = (double)ii;
double kj = (double)jj; double kj = (double)jj;
double kk = (double)l; double kk = (double)l;
@ -262,30 +215,29 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
k[1] = (double)kj * kfac; k[1] = (double)kj * kfac;
k[2] = (double)kk * kfac; k[2] = (double)kk * kfac;
size_t idx = ((size_t)i*ny+j)*nzp/2+l; size_t idx = ((size_t)i * ny + j) * nzp / 2 + l;
//double re = cdata[idx][0]; // double re = cdata[idx][0];
//double im = cdata[idx][1]; // double im = cdata[idx][1];
cdata_11[idx][0] = -k[0]*k[0] * cdata[idx][0] * norm; cdata_11[idx][0] = -k[0] * k[0] * cdata[idx][0] * norm;
cdata_11[idx][1] = -k[0]*k[0] * cdata[idx][1] * norm; cdata_11[idx][1] = -k[0] * k[0] * cdata[idx][1] * norm;
cdata_12[idx][0] = -k[0]*k[1] * cdata[idx][0] * norm; cdata_12[idx][0] = -k[0] * k[1] * cdata[idx][0] * norm;
cdata_12[idx][1] = -k[0]*k[1] * cdata[idx][1] * norm; cdata_12[idx][1] = -k[0] * k[1] * cdata[idx][1] * norm;
cdata_13[idx][0] = -k[0]*k[2] * cdata[idx][0] * norm; cdata_13[idx][0] = -k[0] * k[2] * cdata[idx][0] * norm;
cdata_13[idx][1] = -k[0]*k[2] * cdata[idx][1] * norm; cdata_13[idx][1] = -k[0] * k[2] * cdata[idx][1] * norm;
cdata_22[idx][0] = -k[1]*k[1] * cdata[idx][0] * norm; cdata_22[idx][0] = -k[1] * k[1] * cdata[idx][0] * norm;
cdata_22[idx][1] = -k[1]*k[1] * cdata[idx][1] * norm; cdata_22[idx][1] = -k[1] * k[1] * cdata[idx][1] * norm;
cdata_23[idx][0] = -k[1]*k[2] * cdata[idx][0] * norm; cdata_23[idx][0] = -k[1] * k[2] * cdata[idx][0] * norm;
cdata_23[idx][1] = -k[1]*k[2] * cdata[idx][1] * norm; cdata_23[idx][1] = -k[1] * k[2] * cdata[idx][1] * norm;
cdata_33[idx][0] = -k[2]*k[2] * cdata[idx][0] * norm; cdata_33[idx][0] = -k[2] * k[2] * cdata[idx][0] * norm;
cdata_33[idx][1] = -k[2]*k[2] * cdata[idx][1] * norm; cdata_33[idx][1] = -k[2] * k[2] * cdata[idx][1] * norm;
if (i == (int)nx / 2 || j == ny / 2 || l == nz / 2)
if( i==(int)nx/2||j==ny/2||l==nz/2)
{ {
cdata_11[idx][0] = 0.0; cdata_11[idx][0] = 0.0;
cdata_11[idx][1] = 0.0; cdata_11[idx][1] = 0.0;
@ -305,7 +257,6 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
cdata_33[idx][0] = 0.0; cdata_33[idx][0] = 0.0;
cdata_33[idx][1] = 0.0; cdata_33[idx][1] = 0.0;
} }
} }
delete[] data; delete[] data;
@ -316,174 +267,37 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
cdata_23[0][0] = 0.0; cdata_23[0][1] = 0.0; cdata_23[0][0] = 0.0; cdata_23[0][1] = 0.0;
cdata_33[0][0] = 0.0; cdata_33[0][1] = 0.0;*/ cdata_33[0][0] = 0.0; cdata_33[0][1] = 0.0;*/
FFTW_API(execute)(ip11);
FFTW_API(execute)(ip12);
FFTW_API(execute)(ip13);
FFTW_API(execute)(ip22);
FFTW_API(execute)(ip23);
FFTW_API(execute)(ip33);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_execute(ip11); FFTW_API(destroy_plan)(iplan);
fftwf_execute(ip12); FFTW_API(destroy_plan)(ip11);
fftwf_execute(ip13); FFTW_API(destroy_plan)(ip12);
fftwf_execute(ip22); FFTW_API(destroy_plan)(ip13);
fftwf_execute(ip23); FFTW_API(destroy_plan)(ip22);
fftwf_execute(ip33); FFTW_API(destroy_plan)(ip23);
FFTW_API(destroy_plan)(ip33);
fftwf_destroy_plan(plan); //... copy data ..........................................
fftwf_destroy_plan(iplan); #pragma omp parallel for
fftwf_destroy_plan(ip11); for (int i = 0; i < (int)nx; ++i)
fftwf_destroy_plan(ip12); for (size_t j = 0; j < ny; ++j)
fftwf_destroy_plan(ip13); for (size_t k = 0; k < nz; ++k)
fftwf_destroy_plan(ip22);
fftwf_destroy_plan(ip23);
fftwf_destroy_plan(ip33);
#else
fftw_execute(ip11);
fftw_execute(ip12);
fftw_execute(ip13);
fftw_execute(ip22);
fftw_execute(ip23);
fftw_execute(ip33);
fftw_destroy_plan(plan);
fftw_destroy_plan(iplan);
fftw_destroy_plan(ip11);
fftw_destroy_plan(ip12);
fftw_destroy_plan(ip13);
fftw_destroy_plan(ip22);
fftw_destroy_plan(ip23);
fftw_destroy_plan(ip33);
#endif
//#endif
#else
rfftwnd_plan
plan = rfftw3d_create_plan( nx,ny,nz,
FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE|FFTW_IN_PLACE),
iplan = rfftw3d_create_plan( nx,ny,nz,
FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE|FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex( omp_get_max_threads(), plan, data, NULL );
#else
rfftwnd_one_real_to_complex( plan, data, NULL );
#endif
//#endif
//double fac = -1.0/(nx*ny*nz);
double kfac = 2.0*M_PI;
double norm = 1.0/((double)(nx*ny*nz));
#pragma omp parallel for
for( int i=0; i<(int)nx; ++i )
for( size_t j=0; j<ny; ++j )
for( size_t l=0; l<nz/2+1; ++l )
{ {
int ii = (int)i; if(ii>(int)(nx/2)) ii-=(int)nx; size_t ii = ((size_t)i * ny + j) * nzp + k;
int jj = (int)j; if(jj>(int)(ny/2)) jj-=(int)ny; (*fnew.get_grid(u.levelmax()))(i, j, k) = ((data_11[ii] * data_22[ii] - data_12[ii] * data_12[ii]) +
double ki = (double)ii; (data_11[ii] * data_33[ii] - data_13[ii] * data_13[ii]) +
double kj = (double)jj; (data_22[ii] * data_33[ii] - data_23[ii] * data_23[ii]));
double kk = (double)l;
double k[3];
k[0] = (double)ki * kfac;
k[1] = (double)kj * kfac;
k[2] = (double)kk * kfac;
size_t idx = ((size_t)i*ny+j)*nzp/2+l;
//double re = cdata[idx].re;
//double im = cdata[idx].im;
cdata_11[idx].re = -k[0]*k[0] * cdata[idx].re * norm;
cdata_11[idx].im = -k[0]*k[0] * cdata[idx].im * norm;
cdata_12[idx].re = -k[0]*k[1] * cdata[idx].re * norm;
cdata_12[idx].im = -k[0]*k[1] * cdata[idx].im * norm;
cdata_13[idx].re = -k[0]*k[2] * cdata[idx].re * norm;
cdata_13[idx].im = -k[0]*k[2] * cdata[idx].im * norm;
cdata_22[idx].re = -k[1]*k[1] * cdata[idx].re * norm;
cdata_22[idx].im = -k[1]*k[1] * cdata[idx].im * norm;
cdata_23[idx].re = -k[1]*k[2] * cdata[idx].re * norm;
cdata_23[idx].im = -k[1]*k[2] * cdata[idx].im * norm;
cdata_33[idx].re = -k[2]*k[2] * cdata[idx].re * norm;
cdata_33[idx].im = -k[2]*k[2] * cdata[idx].im * norm;
if( i==(int)(nx/2)||j==ny/2||l==nz/2)
{
cdata_11[idx].re = 0.0;
cdata_11[idx].im = 0.0;
cdata_12[idx].re = 0.0;
cdata_12[idx].im = 0.0;
cdata_13[idx].re = 0.0;
cdata_13[idx].im = 0.0;
cdata_22[idx].re = 0.0;
cdata_22[idx].im = 0.0;
cdata_23[idx].re = 0.0;
cdata_23[idx].im = 0.0;
cdata_33[idx].re = 0.0;
cdata_33[idx].im = 0.0;
}
}
delete[] data;
/*cdata_11[0].re = 0.0; cdata_11[0].im = 0.0;
cdata_12[0].re = 0.0; cdata_12[0].im = 0.0;
cdata_13[0].re = 0.0; cdata_13[0].im = 0.0;
cdata_22[0].re = 0.0; cdata_22[0].im = 0.0;
cdata_23[0].re = 0.0; cdata_23[0].im = 0.0;
cdata_33[0].re = 0.0; cdata_33[0].im = 0.0;*/
#ifndef SINGLETHREAD_FFTW
//rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_11, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_12, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_13, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_22, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_23, NULL );
rfftwnd_threads_one_complex_to_real( omp_get_max_threads(), iplan, cdata_33, NULL );
#else
//rfftwnd_one_complex_to_real( iplan, cdata, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_11, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_12, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_13, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_22, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_23, NULL );
rfftwnd_one_complex_to_real(iplan, cdata_33, NULL );
#endif
rfftwnd_destroy_plan(plan);
rfftwnd_destroy_plan(iplan);
#endif
//... copy data ..........................................
#pragma omp parallel for
for( int i=0; i<(int)nx; ++i )
for( size_t j=0; j<ny; ++j )
for( size_t k=0; k<nz; ++k )
{
size_t ii = ((size_t)i*ny+j)*nzp+k;
(*fnew.get_grid(u.levelmax()))(i,j,k) = (( data_11[ii]*data_22[ii]-data_12[ii]*data_12[ii] ) +
( data_11[ii]*data_33[ii]-data_13[ii]*data_13[ii] ) +
( data_22[ii]*data_33[ii]-data_23[ii]*data_23[ii] ) );
//(*fnew.get_grid(u.levelmax()))(i,j,k) = //(*fnew.get_grid(u.levelmax()))(i,j,k) =
} }
//delete[] data; // delete[] data;
delete[] data_11; delete[] data_11;
delete[] data_12; delete[] data_12;
delete[] data_13; delete[] data_13;
@ -492,131 +306,96 @@ void compute_2LPT_source_FFT( config_file& cf_, const grid_hierarchy& u, grid_hi
delete[] data_33; delete[] data_33;
} }
void compute_2LPT_source( const grid_hierarchy& u, grid_hierarchy& fnew, unsigned order ) void compute_2LPT_source(const grid_hierarchy &u, grid_hierarchy &fnew, unsigned order)
{ {
fnew = u; fnew = u;
fnew.zero(); fnew.zero();
for( unsigned ilevel=u.levelmin(); ilevel<=u.levelmax(); ++ilevel ) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0,ilevel), h2 = h*h, h2_4 = 0.25*h2; double h = pow(2.0, ilevel), h2 = h * h, h2_4 = 0.25 * h2;
meshvar_bnd *pvar = fnew.get_grid(ilevel); meshvar_bnd *pvar = fnew.get_grid(ilevel);
if ( order == 2 ) if (order == 2)
{ {
#pragma omp parallel for #pragma omp parallel for
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
double D[3][3]; double D[3][3];
D[0][0] = (ACC(ix-2,iy,iz)-2.0*ACC(ix,iy,iz)+ACC(ix+2,iy,iz)) * h2_4; D[0][0] = (ACC(ix - 2, iy, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix + 2, iy, iz)) * h2_4;
D[1][1] = (ACC(ix,iy-2,iz)-2.0*ACC(ix,iy,iz)+ACC(ix,iy+2,iz)) * h2_4; D[1][1] = (ACC(ix, iy - 2, iz) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy + 2, iz)) * h2_4;
D[2][2] = (ACC(ix,iy,iz-2)-2.0*ACC(ix,iy,iz)+ACC(ix,iy,iz+2)) * h2_4; D[2][2] = (ACC(ix, iy, iz - 2) - 2.0 * ACC(ix, iy, iz) + ACC(ix, iy, iz + 2)) * h2_4;
D[0][1] = D[1][0] = (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) * h2_4;
D[0][2] = D[2][0] = (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) * h2_4;
D[1][2] = D[2][1] = (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) * h2_4;
D[0][1] = D[1][0] = (ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))*h2_4; (*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - D[0][1] * D[0][1] + D[0][0] * D[2][2] - D[0][2] * D[0][2] + D[1][1] * D[2][2] - D[1][2] * D[1][2]);
D[0][2] = D[2][0] = (ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))*h2_4;
D[1][2] = D[2][1] = (ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))*h2_4;
(*pvar)(ix,iy,iz) = ( D[0][0]*D[1][1] - D[0][1]*D[0][1]
+ D[0][0]*D[2][2] - D[0][2]*D[0][2]
+ D[1][1]*D[2][2] - D[1][2]*D[1][2] );
} }
} }
else if ( order == 4 || order == 6 ) else if (order == 4 || order == 6)
{ {
double h2_144 = h2 / 144.; double h2_144 = h2 / 144.;
#pragma omp parallel for #pragma omp parallel for
for( int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix ) for (int ix = 0; ix < (int)(*u.get_grid(ilevel)).size(0); ++ix)
for( int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy ) for (int iy = 0; iy < (int)(*u.get_grid(ilevel)).size(1); ++iy)
for( int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz ) for (int iz = 0; iz < (int)(*u.get_grid(ilevel)).size(2); ++iz)
{ {
//.. this is actually 8th order accurate //.. this is actually 8th order accurate
double D[3][3]; double D[3][3];
D[0][0] = ((ACC(ix-4,iy,iz)+ACC(ix+4,iy,iz)) D[0][0] = ((ACC(ix - 4, iy, iz) + ACC(ix + 4, iy, iz)) - 16. * (ACC(ix - 3, iy, iz) + ACC(ix + 3, iy, iz)) + 64. * (ACC(ix - 2, iy, iz) + ACC(ix + 2, iy, iz)) + 16. * (ACC(ix - 1, iy, iz) + ACC(ix + 1, iy, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
- 16. * (ACC(ix-3,iy,iz)+ACC(ix+3,iy,iz))
+ 64. * (ACC(ix-2,iy,iz)+ACC(ix+2,iy,iz))
+ 16. * (ACC(ix-1,iy,iz)+ACC(ix+1,iy,iz))
- 130.* ACC(ix,iy,iz) ) * h2_144;
D[1][1] = ((ACC(ix,iy-4,iz)+ACC(ix,iy+4,iz)) D[1][1] = ((ACC(ix, iy - 4, iz) + ACC(ix, iy + 4, iz)) - 16. * (ACC(ix, iy - 3, iz) + ACC(ix, iy + 3, iz)) + 64. * (ACC(ix, iy - 2, iz) + ACC(ix, iy + 2, iz)) + 16. * (ACC(ix, iy - 1, iz) + ACC(ix, iy + 1, iz)) - 130. * ACC(ix, iy, iz)) * h2_144;
- 16. * (ACC(ix,iy-3,iz)+ACC(ix,iy+3,iz))
+ 64. * (ACC(ix,iy-2,iz)+ACC(ix,iy+2,iz))
+ 16. * (ACC(ix,iy-1,iz)+ACC(ix,iy+1,iz))
- 130.* ACC(ix,iy,iz) ) * h2_144;
D[2][2] = ((ACC(ix,iy,iz-4)+ACC(ix,iy,iz+4)) D[2][2] = ((ACC(ix, iy, iz - 4) + ACC(ix, iy, iz + 4)) - 16. * (ACC(ix, iy, iz - 3) + ACC(ix, iy, iz + 3)) + 64. * (ACC(ix, iy, iz - 2) + ACC(ix, iy, iz + 2)) + 16. * (ACC(ix, iy, iz - 1) + ACC(ix, iy, iz + 1)) - 130. * ACC(ix, iy, iz)) * h2_144;
- 16. * (ACC(ix,iy,iz-3)+ACC(ix,iy,iz+3))
+ 64. * (ACC(ix,iy,iz-2)+ACC(ix,iy,iz+2))
+ 16. * (ACC(ix,iy,iz-1)+ACC(ix,iy,iz+1))
- 130.* ACC(ix,iy,iz) ) * h2_144;
D[0][1] = D[1][0] = (64.*(ACC(ix-1,iy-1,iz)-ACC(ix-1,iy+1,iz)-ACC(ix+1,iy-1,iz)+ACC(ix+1,iy+1,iz))
-8.*(ACC(ix-2,iy-1,iz)-ACC(ix+2,iy-1,iz)-ACC(ix-2,iy+1,iz)+ACC(ix+2,iy+1,iz)
+ ACC(ix-1,iy-2,iz)-ACC(ix-1,iy+2,iz)-ACC(ix+1,iy-2,iz)+ACC(ix+1,iy+2,iz))
+1.*(ACC(ix-2,iy-2,iz)-ACC(ix-2,iy+2,iz)-ACC(ix+2,iy-2,iz)+ACC(ix+2,iy+2,iz)))*h2_144;
D[0][2] = D[2][0] = (64.*(ACC(ix-1,iy,iz-1)-ACC(ix-1,iy,iz+1)-ACC(ix+1,iy,iz-1)+ACC(ix+1,iy,iz+1))
-8.*(ACC(ix-2,iy,iz-1)-ACC(ix+2,iy,iz-1)-ACC(ix-2,iy,iz+1)+ACC(ix+2,iy,iz+1)
+ ACC(ix-1,iy,iz-2)-ACC(ix-1,iy,iz+2)-ACC(ix+1,iy,iz-2)+ACC(ix+1,iy,iz+2))
+1.*(ACC(ix-2,iy,iz-2)-ACC(ix-2,iy,iz+2)-ACC(ix+2,iy,iz-2)+ACC(ix+2,iy,iz+2)))*h2_144;
D[1][2] = D[2][1] = (64.*(ACC(ix,iy-1,iz-1)-ACC(ix,iy-1,iz+1)-ACC(ix,iy+1,iz-1)+ACC(ix,iy+1,iz+1))
-8.*(ACC(ix,iy-2,iz-1)-ACC(ix,iy+2,iz-1)-ACC(ix,iy-2,iz+1)+ACC(ix,iy+2,iz+1)
+ ACC(ix,iy-1,iz-2)-ACC(ix,iy-1,iz+2)-ACC(ix,iy+1,iz-2)+ACC(ix,iy+1,iz+2))
+1.*(ACC(ix,iy-2,iz-2)-ACC(ix,iy-2,iz+2)-ACC(ix,iy+2,iz-2)+ACC(ix,iy+2,iz+2)))*h2_144;
(*pvar)(ix,iy,iz) = ( D[0][0]*D[1][1] - SQR( D[0][1] )
+ D[0][0]*D[2][2] - SQR( D[0][2] )
+ D[1][1]*D[2][2] - SQR( D[1][2] ) );
}
D[0][1] = D[1][0] = (64. * (ACC(ix - 1, iy - 1, iz) - ACC(ix - 1, iy + 1, iz) - ACC(ix + 1, iy - 1, iz) + ACC(ix + 1, iy + 1, iz)) - 8. * (ACC(ix - 2, iy - 1, iz) - ACC(ix + 2, iy - 1, iz) - ACC(ix - 2, iy + 1, iz) + ACC(ix + 2, iy + 1, iz) + ACC(ix - 1, iy - 2, iz) - ACC(ix - 1, iy + 2, iz) - ACC(ix + 1, iy - 2, iz) + ACC(ix + 1, iy + 2, iz)) + 1. * (ACC(ix - 2, iy - 2, iz) - ACC(ix - 2, iy + 2, iz) - ACC(ix + 2, iy - 2, iz) + ACC(ix + 2, iy + 2, iz))) * h2_144;
D[0][2] = D[2][0] = (64. * (ACC(ix - 1, iy, iz - 1) - ACC(ix - 1, iy, iz + 1) - ACC(ix + 1, iy, iz - 1) + ACC(ix + 1, iy, iz + 1)) - 8. * (ACC(ix - 2, iy, iz - 1) - ACC(ix + 2, iy, iz - 1) - ACC(ix - 2, iy, iz + 1) + ACC(ix + 2, iy, iz + 1) + ACC(ix - 1, iy, iz - 2) - ACC(ix - 1, iy, iz + 2) - ACC(ix + 1, iy, iz - 2) + ACC(ix + 1, iy, iz + 2)) + 1. * (ACC(ix - 2, iy, iz - 2) - ACC(ix - 2, iy, iz + 2) - ACC(ix + 2, iy, iz - 2) + ACC(ix + 2, iy, iz + 2))) * h2_144;
D[1][2] = D[2][1] = (64. * (ACC(ix, iy - 1, iz - 1) - ACC(ix, iy - 1, iz + 1) - ACC(ix, iy + 1, iz - 1) + ACC(ix, iy + 1, iz + 1)) - 8. * (ACC(ix, iy - 2, iz - 1) - ACC(ix, iy + 2, iz - 1) - ACC(ix, iy - 2, iz + 1) + ACC(ix, iy + 2, iz + 1) + ACC(ix, iy - 1, iz - 2) - ACC(ix, iy - 1, iz + 2) - ACC(ix, iy + 1, iz - 2) + ACC(ix, iy + 1, iz + 2)) + 1. * (ACC(ix, iy - 2, iz - 2) - ACC(ix, iy - 2, iz + 2) - ACC(ix, iy + 2, iz - 2) + ACC(ix, iy + 2, iz + 2))) * h2_144;
(*pvar)(ix, iy, iz) = (D[0][0] * D[1][1] - SQR(D[0][1]) + D[0][0] * D[2][2] - SQR(D[0][2]) + D[1][1] * D[2][2] - SQR(D[1][2]));
}
} }
else else
throw std::runtime_error("compute_2LPT_source : invalid operator order specified"); throw std::runtime_error("compute_2LPT_source : invalid operator order specified");
} }
//.. subtract global mean so the multi-grid poisson solver behaves well //.. subtract global mean so the multi-grid poisson solver behaves well
for( int i=fnew.levelmax(); i>(int)fnew.levelmin(); --i ) for (int i = fnew.levelmax(); i > (int)fnew.levelmin(); --i)
mg_straight().restrict( (*fnew.get_grid(i)), (*fnew.get_grid(i-1)) ); mg_straight().restrict((*fnew.get_grid(i)), (*fnew.get_grid(i - 1)));
long double sum = 0.0; long double sum = 0.0;
int nx,ny,nz; int nx, ny, nz;
nx = fnew.get_grid(fnew.levelmin())->size(0); nx = fnew.get_grid(fnew.levelmin())->size(0);
ny = fnew.get_grid(fnew.levelmin())->size(1); ny = fnew.get_grid(fnew.levelmin())->size(1);
nz = fnew.get_grid(fnew.levelmin())->size(2); nz = fnew.get_grid(fnew.levelmin())->size(2);
for( int ix=0; ix<nx; ++ix ) for (int ix = 0; ix < nx; ++ix)
for( int iy=0; iy<ny; ++iy ) for (int iy = 0; iy < ny; ++iy)
for( int iz=0; iz<nz; ++iz ) for (int iz = 0; iz < nz; ++iz)
sum += (*fnew.get_grid(fnew.levelmin()))(ix,iy,iz); sum += (*fnew.get_grid(fnew.levelmin()))(ix, iy, iz);
sum /= (double)((size_t)nx*(size_t)ny*(size_t)nz); sum /= (double)((size_t)nx * (size_t)ny * (size_t)nz);
for( unsigned i=fnew.levelmin(); i<=fnew.levelmax(); ++i ) for (unsigned i = fnew.levelmin(); i <= fnew.levelmax(); ++i)
{ {
nx = fnew.get_grid(i)->size(0); nx = fnew.get_grid(i)->size(0);
ny = fnew.get_grid(i)->size(1); ny = fnew.get_grid(i)->size(1);
nz = fnew.get_grid(i)->size(2); nz = fnew.get_grid(i)->size(2);
for( int ix=0; ix<nx; ++ix ) for (int ix = 0; ix < nx; ++ix)
for( int iy=0; iy<ny; ++iy ) for (int iy = 0; iy < ny; ++iy)
for( int iz=0; iz<nz; ++iz ) for (int iz = 0; iz < nz; ++iz)
(*fnew.get_grid(i))(ix,iy,iz) -= sum; (*fnew.get_grid(i))(ix, iy, iz) -= sum;
} }
} }
#undef SQR #undef SQR
#undef ACC #undef ACC

View file

@ -38,28 +38,15 @@ void fft_coarsen(m1 &v, m2 &V)
size_t nxf = v.size(0), nyf = v.size(1), nzf = v.size(2), nzfp = nzf + 2; size_t nxf = v.size(0), nyf = v.size(1), nzf = v.size(2), nzfp = nzf + 2;
size_t nxF = V.size(0), nyF = V.size(1), nzF = V.size(2), nzFp = nzF + 2; size_t nxF = V.size(0), nyF = V.size(1), nzF = V.size(2), nzFp = nzF + 2;
fftw_real *rcoarse = new fftw_real[nxF * nyF * nzFp]; real_t *rcoarse = new real_t[nxF * nyF * nzFp];
fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse); complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
fftw_real *rfine = new fftw_real[nxf * nyf * nzfp]; real_t *rfine = new real_t[nxf * nyf * nzfp];
fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine); complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
#ifdef FFTW3 fftw_plan_t
#ifdef SINGLE_PRECISION pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
fftwf_plan ipc = FFTW_API(plan_dft_c2r_3d)(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
ipc = fftwf_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
#else
fftw_plan
pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
ipc = fftw_plan_dft_c2r_3d(nxF, nyF, nzF, ccoarse, rcoarse, FFTW_ESTIMATE);
#endif
#else
rfftwnd_plan
pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
ipc = rfftw3d_create_plan(nxF, nyF, nzF, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nxf; i++) for (int i = 0; i < (int)nxf; i++)
@ -70,19 +57,7 @@ void fft_coarsen(m1 &v, m2 &V)
rfine[q] = v(i, j, k); rfine[q] = v(i, j, k);
} }
#ifdef FFTW3 FFTW_API(execute)(pf);
#ifdef SINGLE_PRECISION
fftwf_execute(pf);
#else
fftw_execute(pf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
#else
rfftwnd_one_real_to_complex(pf, rfine, NULL);
#endif
#endif
double fftnorm = 1.0 / ((double)nxF * (double)nyF * (double)nzF); double fftnorm = 1.0 / ((double)nxF * (double)nyF * (double)nzF);
@ -125,19 +100,7 @@ void fft_coarsen(m1 &v, m2 &V)
delete[] rfine; delete[] rfine;
#ifdef FFTW3 FFTW_API(execute)(ipc);
#ifdef SINGLE_PRECISION
fftwf_execute(ipc);
#else
fftw_execute(ipc);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
#else
rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
#endif
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nxF; i++) for (int i = 0; i < (int)nxF; i++)
@ -150,18 +113,8 @@ void fft_coarsen(m1 &v, m2 &V)
delete[] rcoarse; delete[] rcoarse;
#ifdef FFTW3 FFTW_API(destroy_plan)(pf);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(ipc);
fftwf_destroy_plan(pf);
fftwf_destroy_plan(ipc);
#else
fftw_destroy_plan(pf);
fftw_destroy_plan(ipc);
#endif
#else
rfftwnd_destroy_plan(pf);
rfftwnd_destroy_plan(ipc);
#endif
} }
template <typename m1, typename m2> template <typename m1, typename m2>
@ -191,14 +144,14 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
size_t nxc = nxf / 2, nyc = nyf / 2, nzc = nzf / 2, nzcp = nzf / 2 + 2; size_t nxc = nxf / 2, nyc = nyf / 2, nzc = nzf / 2, nzcp = nzf / 2 + 2;
fftw_real *rcoarse = new fftw_real[nxc * nyc * nzcp]; real_t *rcoarse = new real_t[nxc * nyc * nzcp];
fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse); complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
fftw_real *rfine = new fftw_real[nxf * nyf * nzfp]; real_t *rfine = new real_t[nxf * nyf * nzfp];
fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine); complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
// copy coarse data to rcoarse[.] // copy coarse data to rcoarse[.]
memset(rcoarse, 0, sizeof(fftw_real) * nxc * nyc * nzcp); memset(rcoarse, 0, sizeof(real_t) * nxc * nyc * nzcp);
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nxc; ++i) for (int i = 0; i < (int)nxc; ++i)
@ -221,36 +174,13 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
rfine[q] = v(i, j, k); rfine[q] = v(i, j, k);
} }
#ifdef FFTW3
#ifdef SINGLE_PRECISION
fftwf_plan
pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
pf = fftwf_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
ipf = fftwf_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
fftwf_execute(pc);
fftwf_execute(pf);
#else
fftw_plan
pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
pf = fftw_plan_dft_r2c_3d(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
ipf = fftw_plan_dft_c2r_3d(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
fftw_execute(pc);
fftw_execute(pf);
#endif
#else
rfftwnd_plan
pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
pf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
ipf = rfftw3d_create_plan(nxf, nyf, nzf, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW fftw_plan_t
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL); pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE),
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL); pf = FFTW_API(plan_dft_r2c_3d)(nxf, nyf, nzf, rfine, cfine, FFTW_ESTIMATE),
#else ipf = FFTW_API(plan_dft_c2r_3d)(nxf, nyf, nzf, cfine, rfine, FFTW_ESTIMATE);
rfftwnd_one_real_to_complex(pc, rcoarse, NULL); FFTW_API(execute)(pc);
rfftwnd_one_real_to_complex(pf, rfine, NULL); FFTW_API(execute)(pf);
#endif
#endif
/*************************************************/ /*************************************************/
//.. perform actual interpolation //.. perform actual interpolation
@ -300,28 +230,11 @@ void fft_interpolate(m1 &V, m2 &v, bool from_basegrid = false)
/*************************************************/ /*************************************************/
#ifdef FFTW3 FFTW_API(execute)(ipf);
#ifdef SINGLE_PRECISION
fftwf_execute(ipf); FFTW_API(destroy_plan)(pf);
fftwf_destroy_plan(pf); FFTW_API(destroy_plan)(pc);
fftwf_destroy_plan(pc); FFTW_API(destroy_plan)(ipf);
fftwf_destroy_plan(ipf);
#else
fftw_execute(ipf);
fftw_destroy_plan(pf);
fftw_destroy_plan(pc);
fftw_destroy_plan(ipf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
#else
rfftwnd_one_complex_to_real(ipf, cfine, NULL);
#endif
fftwnd_destroy_plan(pf);
fftwnd_destroy_plan(pc);
fftwnd_destroy_plan(ipf);
#endif
// copy back and normalize // copy back and normalize
#pragma omp parallel for #pragma omp parallel for
@ -349,8 +262,6 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
levelmin = cf.get_value_safe<unsigned>("setup", "levelmin_TF", levelminPoisson); levelmin = cf.get_value_safe<unsigned>("setup", "levelmin_TF", levelminPoisson);
levelmax = cf.get_value<unsigned>("setup", "levelmax"); levelmax = cf.get_value<unsigned>("setup", "levelmax");
bool kspace = cf.get_value<bool>("setup", "kspace_TF");
bool fix = cf.get_value_safe<bool>("setup","fix_mode_amplitude",false); bool fix = cf.get_value_safe<bool>("setup","fix_mode_amplitude",false);
bool flip = cf.get_value_safe<bool>("setup","flip_mode_amplitude",false); bool flip = cf.get_value_safe<bool>("setup","flip_mode_amplitude",false);
@ -360,30 +271,10 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
music::ulog.Print("Running unigrid density convolution..."); music::ulog.Print("Running unigrid density convolution...");
//... select the transfer function to be used //... select the transfer function to be used
convolution::kernel_creator *the_kernel_creator; convolution::kernel_creator *the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k"];
if (kspace) std::cout << " - Using k-space transfer function kernel.\n";
{ music::ulog.Print("Using k-space transfer function kernel.");
std::cout << " - Using k-space transfer function kernel.\n";
music::ulog.Print("Using k-space transfer function kernel.");
#ifdef SINGLE_PRECISION
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
#else
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
#endif
}
else
{
std::cout << " - Using real-space transfer function kernel.\n";
music::ulog.Print("Using real-space transfer function kernel.");
#ifdef SINGLE_PRECISION
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_float"];
#else
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_real_double"];
#endif
}
//... initialize convolution kernel //... initialize convolution kernel
convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type); convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
@ -402,7 +293,7 @@ void GenerateDensityUnigrid(config_file &cf, transfer_function *ptf, tf_type typ
the_tf_kernel->fetch_kernel(levelmin, false); the_tf_kernel->fetch_kernel(levelmin, false);
//... perform convolution //... perform convolution
convolution::perform<real_t>(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip); convolution::perform(the_tf_kernel, reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
//... clean up kernel //... clean up kernel
delete the_tf_kernel; delete the_tf_kernel;
@ -451,17 +342,11 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
unsigned nbase = 1 << levelmin; unsigned nbase = 1 << levelmin;
convolution::kernel_creator *the_kernel_creator; convolution::kernel_creator *the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k"];
std::cout << " - Using k-space transfer function kernel.\n"; std::cout << " - Using k-space transfer function kernel.\n";
music::ulog.Print("Using k-space transfer function kernel."); music::ulog.Print("Using k-space transfer function kernel.");
#ifdef SINGLE_PRECISION
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_float"];
#else
the_kernel_creator = convolution::get_kernel_map()["tf_kernel_k_double"];
#endif
convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type); convolution::kernel *the_tf_kernel = the_kernel_creator->create(cf, ptf, refh, type);
/***** PERFORM CONVOLUTIONS *****/ /***** PERFORM CONVOLUTIONS *****/
@ -475,7 +360,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
top = new DensityGrid<real_t>(nbase, nbase, nbase); top = new DensityGrid<real_t>(nbase, nbase, nbase);
music::ilog.Print("Performing noise convolution on level %3d", levelmin); music::ilog.Print("Performing noise convolution on level %3d", levelmin);
rand.load(*top, levelmin); rand.load(*top, levelmin);
convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip); convolution::perform(the_tf_kernel->fetch_kernel(levelmin, false), reinterpret_cast<void *>(top->get_data_ptr()), shift, fix, flip);
delta.create_base_hierarchy(levelmin); delta.create_base_hierarchy(levelmin);
top->copy(*delta.get_grid(levelmin)); top->copy(*delta.get_grid(levelmin));
@ -506,7 +391,7 @@ void GenerateDensityHierarchy(config_file &cf, transfer_function *ptf, tf_type t
// load white noise for patch // load white noise for patch
rand.load(*fine, levelmin + i); rand.load(*fine, levelmin + i);
convolution::perform<real_t>(the_tf_kernel->fetch_kernel(levelmin + i, true), convolution::perform(the_tf_kernel->fetch_kernel(levelmin + i, true),
reinterpret_cast<void *>(fine->get_data_ptr()), shift, fix, flip); reinterpret_cast<void *>(fine->get_data_ptr()), shift, fix, flip);
if( fourier_splicing ){ if( fourier_splicing ){

View file

@ -11,12 +11,13 @@
#ifndef __FD_SCHEMES_HH #ifndef __FD_SCHEMES_HH
#define __FD_SCHEMES_HH #define __FD_SCHEMES_HH
#include <general.hh>
#include <vector> #include <vector>
#include <stdexcept> #include <stdexcept>
//! abstract implementation of the Poisson/Force scheme //! abstract implementation of the Poisson/Force scheme
template< class L, class G, typename real_t=double > template< class L, class G>
class scheme class scheme
{ {
public: public:
@ -57,10 +58,9 @@ public:
}; };
//! base class for finite difference gradients //! base class for finite difference gradients
template< int nextent, typename T > template< int nextent>
class gradient class gradient
{ {
typedef T real_t;
std::vector<real_t> m_stencil; std::vector<real_t> m_stencil;
const unsigned nl; const unsigned nl;
public: public:
@ -110,20 +110,21 @@ public:
}; };
//! base class for finite difference stencils //! base class for finite difference stencils
template< int nextent, typename real_t > template< int nextent>
class base_stencil class base_stencil
{ {
protected: protected:
std::vector<real_t> m_stencil; static constexpr size_t nl{2*nextent+1};
const unsigned nl; std::array<real_t,nl*nl*nl> m_stencil;
public: public:
bool m_modsource; bool m_modsource;
public: public:
base_stencil( bool amodsource = false ) explicit base_stencil( bool amodsource = false )
: nl( 2*nextent+1 ), m_modsource( amodsource ) : m_modsource( amodsource )
{ {
m_stencil.assign(nl*nl*nl,(real_t)0.0); m_stencil.fill( (real_t)0.0 );
} }
real_t& operator()(int i, int j, int k) real_t& operator()(int i, int j, int k)
@ -176,8 +177,7 @@ public:
//... Implementation of the Gradient schemes............................................ //... Implementation of the Gradient schemes............................................
template< typename real_t > class deriv_2P : public gradient<1>
class deriv_2P : public gradient<1,real_t>
{ {
public: public:
@ -194,8 +194,7 @@ public:
//... Implementation of the Laplacian schemes.......................................... //... Implementation of the Laplacian schemes..........................................
//! 7-point, 2nd order finite difference Laplacian //! 7-point, 2nd order finite difference Laplacian
template< typename real_t > class stencil_7P : public base_stencil<1>
class stencil_7P : public base_stencil<1,real_t>
{ {
public: public:
@ -214,7 +213,7 @@ public:
inline real_t apply( const C& c, const int i, const int j, const int k ) const inline real_t apply( const C& c, const int i, const int j, const int k ) const
{ {
//return c(i-1,j,k)+c(i+1,j,k)+c(i,j-1,k)+c(i,j+1,k)+c(i,j,k-1)+c(i,j,k+1)-6.0*c(i,j,k); //return c(i-1,j,k)+c(i+1,j,k)+c(i,j-1,k)+c(i,j+1,k)+c(i,j,k-1)+c(i,j,k+1)-6.0*c(i,j,k);
return (double)c(i-1,j,k)+(double)c(i+1,j,k)+(double)c(i,j-1,k)+(double)c(i,j+1,k)+(double)c(i,j,k-1)+(double)c(i,j,k+1)-6.0*(double)c(i,j,k); return (real_t)c(i-1,j,k)+(real_t)c(i+1,j,k)+(real_t)c(i,j-1,k)+(real_t)c(i,j+1,k)+(real_t)c(i,j,k-1)+(real_t)c(i,j,k+1)-6.0*(real_t)c(i,j,k);
} }
template< class C > template< class C >
@ -230,8 +229,7 @@ public:
}; };
//! 13-point, 4th order finite difference Laplacian //! 13-point, 4th order finite difference Laplacian
template< typename real_t > class stencil_13P : public base_stencil<2>
class stencil_13P : public base_stencil<2,real_t>
{ {
public: public:
@ -279,8 +277,7 @@ public:
//! 19-point, 6th order finite difference Laplacian //! 19-point, 6th order finite difference Laplacian
template< typename real_t > class stencil_19P : public base_stencil<3>
class stencil_19P : public base_stencil<3,real_t>
{ {
public: public:
@ -339,7 +336,6 @@ public:
//! flux operator for the 4th order FD Laplacian //! flux operator for the 4th order FD Laplacian
template< typename real_t >
class Laplace_flux_O4 class Laplace_flux_O4
{ {
public: public:
@ -354,7 +350,7 @@ public:
template< class C > template< class C >
inline double apply_x( int idir, const C& c, const int i, const int j, const int k ) inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/12.0; double fac = -((real_t)idir)/12.0;
return fac*(-c(i-2,j,k)+15.0*c(i-1,j,k)-15.0*c(i,j,k)+c(i+1,j,k)); return fac*(-c(i-2,j,k)+15.0*c(i-1,j,k)-15.0*c(i,j,k)+c(i+1,j,k));
} }
@ -369,7 +365,7 @@ public:
template< class C > template< class C >
inline double apply_y( int idir, const C& c, const int i, const int j, const int k ) inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/12.0; double fac = -((real_t)idir)/12.0;
return fac*(-c(i,j-2,k)+15.0*c(i,j-1,k)-15.0*c(i,j,k)+c(i,j+1,k)); return fac*(-c(i,j-2,k)+15.0*c(i,j-1,k)-15.0*c(i,j,k)+c(i,j+1,k));
} }
@ -384,7 +380,7 @@ public:
template< class C > template< class C >
inline double apply_z( int idir, const C& c, const int i, const int j, const int k ) inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/12.0; double fac = -((real_t)idir)/12.0;
return fac*(-c(i,j,k-2)+15.0*c(i,j,k-1)-15.0*c(i,j,k)+c(i,j,k+1)); return fac*(-c(i,j,k-2)+15.0*c(i,j,k-1)-15.0*c(i,j,k)+c(i,j,k+1));
} }
@ -392,7 +388,6 @@ public:
//! flux operator for the 6th order FD Laplacian //! flux operator for the 6th order FD Laplacian
template< typename real_t >
class Laplace_flux_O6 class Laplace_flux_O6
{ {
public: public:
@ -408,7 +403,7 @@ public:
template< class C > template< class C >
inline double apply_x( int idir, const C& c, const int i, const int j, const int k ) inline double apply_x( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/180.0; real_t fac = -((real_t)idir)/180.0;
return fac*(2.*c(i-3,j,k)-25.*c(i-2,j,k)+245.*c(i-1,j,k)-245.0*c(i,j,k)+25.*c(i+1,j,k)-2.*c(i+2,j,k)); return fac*(2.*c(i-3,j,k)-25.*c(i-2,j,k)+245.*c(i-1,j,k)-245.0*c(i,j,k)+25.*c(i+1,j,k)-2.*c(i+2,j,k));
} }
@ -423,7 +418,7 @@ public:
template< class C > template< class C >
inline double apply_y( int idir, const C& c, const int i, const int j, const int k ) inline double apply_y( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/180.0; real_t fac = -((real_t)idir)/180.0;
return fac*(2.*c(i,j-3,k)-25.*c(i,j-2,k)+245.*c(i,j-1,k)-245.0*c(i,j,k)+25.*c(i,j+1,k)-2.*c(i,j+2,k)); return fac*(2.*c(i,j-3,k)-25.*c(i,j-2,k)+245.*c(i,j-1,k)-245.0*c(i,j,k)+25.*c(i,j+1,k)-2.*c(i,j+2,k));
} }
@ -438,7 +433,7 @@ public:
template< class C > template< class C >
inline double apply_z( int idir, const C& c, const int i, const int j, const int k ) inline double apply_z( int idir, const C& c, const int i, const int j, const int k )
{ {
double fac = -((double)idir)/180.0; real_t fac = -((real_t)idir)/180.0;
return fac*(2.*c(i,j,k-3)-25.*c(i,j,k-2)+245.*c(i,j,k-1)-245.0*c(i,j,k)+25.*c(i,j,k+1)-2.*c(i,j,k+2)); return fac*(2.*c(i,j,k-3)-25.*c(i,j,k-2)+245.*c(i,j,k-1)-245.0*c(i,j,k)+25.*c(i,j,k+1)-2.*c(i,j,k+2));
} }

View file

@ -8,75 +8,56 @@
*/ */
#ifndef __GENERAL_HH #pragma once
#define __GENERAL_HH
#include "logger.hh" #include <logger.hh>
#include <config_file.hh>
#include <cassert> #include <cassert>
#include "omp.h" #include <omp.h>
#ifdef WITH_MPI #include <fftw3.h>
#ifdef MANNO
#include <mpi.h> // include CMake controlled configuration settings
#else #include "cmake_config.hh"
#include <mpi++.h>
#endif #if defined(USE_PRECISION_FLOAT)
#else using real_t = float;
#include <time.h> using complex_t = fftwf_complex;
#define FFTW_PREFIX fftwf
#elif defined(USE_PRECISION_DOUBLE)
using real_t = double;
using complex_t = fftw_complex;
#define FFTW_PREFIX fftw
#elif defined(USE_PRECISION_LONGDOUBLE)
using real_t = long double;
using complex_t = fftwl_complex;
#define FFTW_PREFIX fftwl
#endif #endif
#ifdef FFTW3 #define FFTW_GEN_NAME_PRIM(a, b) a##_##b
#include <fftw3.h> #define FFTW_GEN_NAME(a, b) FFTW_GEN_NAME_PRIM(a, b)
#if defined(SINGLE_PRECISION) #define FFTW_API(x) FFTW_GEN_NAME(FFTW_PREFIX, x)
typedef float fftw_real;
#else
typedef double fftw_real;
#endif
#else using fftw_plan_t = FFTW_GEN_NAME(FFTW_PREFIX, plan);
#if defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
#include <srfftw.h>
#include <srfftw_threads.h>
#elif defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
#include <srfftw.h>
#elif not defined(SINGLE_PRECISION) and not defined(SINGLETHREAD_FFTW)
#include <drfftw.h>
#include <drfftw_threads.h>
#elif not defined(SINGLE_PRECISION) and defined(SINGLETHREAD_FFTW)
#include <drfftw.h>
#endif
#endif
#ifdef SINGLE_PRECISION #define RE(x) ((x)[0])
typedef float real_t; #define IM(x) ((x)[1])
#else
typedef double real_t;
#endif
#include <vector>
#include <array> #include <array>
using vec3_t = std::array<real_t,3>; using vec3_t = std::array<real_t,3>;
#ifdef FFTW3 namespace CONFIG
#define RE(x) ((x)[0]) {
#define IM(x) ((x)[1]) // extern int MPI_thread_support;
#else // extern int MPI_task_rank;
#define RE(x) ((x).re) // extern int MPI_task_size;
#define IM(x) ((x).im) // extern bool MPI_ok;
#endif // extern bool MPI_threads_ok;
extern bool FFTW_threads_ok;
#if defined(FFTW3) && defined(SINGLE_PRECISION) extern int num_threads;
#define fftw_complex fftwf_complex } // namespace CONFIG
#endif
#include <vector>
#include "config_file.hh"
//#include "mesh.hh"
//! compute square of argument //! compute square of argument
template< typename T > template< typename T >
@ -180,6 +161,3 @@ inline bool is_number(const std::string& s)
return true; return true;
} }
#endif

View file

@ -13,6 +13,8 @@
#include <iomanip> #include <iomanip>
#include <math.h> #include <math.h>
#include <thread>
#include <gsl/gsl_rng.h> #include <gsl/gsl_rng.h>
#include <gsl/gsl_randist.h> #include <gsl/gsl_randist.h>
#include <gsl/gsl_integration.h> #include <gsl/gsl_integration.h>
@ -26,25 +28,40 @@ extern "C"
} }
#endif #endif
#include "general.hh" #include <exception>
#include "defaults.hh" #include <cfenv>
#include "output.hh"
#include "config_file.hh" #include <general.hh>
#include <defaults.hh>
#include <output.hh>
#include "poisson.hh" #include <config_file.hh>
#include "mg_solver.hh"
#include "fd_schemes.hh"
#include "random.hh"
#include "densities.hh"
#include "convolution_kernel.hh" #include <poisson.hh>
#include "cosmology.hh" #include <mg_solver.hh>
#include "transfer_function.hh" #include <fd_schemes.hh>
#include <random.hh>
#include <densities.hh>
#include <convolution_kernel.hh>
#include <cosmology.hh>
#include <transfer_function.hh>
#define THE_CODE_NAME "music!" #define THE_CODE_NAME "music!"
#define THE_CODE_VERSION "2.0a" #define THE_CODE_VERSION "2.0a"
// initialise with "default" values
namespace CONFIG{
// int MPI_thread_support = -1;
// int MPI_task_rank = 0;
// int MPI_task_size = 1;
// bool MPI_ok = false;
// bool MPI_threads_ok = false;
bool FFTW_threads_ok = false;
int num_threads = 1;
}
namespace music namespace music
{ {
@ -87,11 +104,6 @@ void splash(void)
#if defined(CMAKE_BUILD) #if defined(CMAKE_BUILD)
music::ilog.Print("Version built from git rev.: %s, tag: %s, branch: %s", GIT_REV, GIT_TAG, GIT_BRANCH); music::ilog.Print("Version built from git rev.: %s, tag: %s, branch: %s", GIT_REV, GIT_TAG, GIT_BRANCH);
#endif
#if defined(SINGLE_PRECISION)
music::ilog.Print("Version was compiled for single precision.");
#else
music::ilog.Print("Version was compiled for double precision.");
#endif #endif
std::cout << "\n\n"; std::cout << "\n\n";
} }
@ -294,6 +306,50 @@ void add_constant_value( grid_hierarchy &u, const double val )
} }
} }
#include <system_stat.hh>
void output_system_info()
{
std::feclearexcept(FE_ALL_EXCEPT);
//------------------------------------------------------------------------------
// Write code configuration to screen
//------------------------------------------------------------------------------
// hardware related infos
music::ilog << std::setw(32) << std::left << "CPU vendor string" << " : " << SystemStat::Cpu().get_CPUstring() << std::endl;
// multi-threading related infos
music::ilog << std::setw(32) << std::left << "Available HW threads / task" << " : " << std::thread::hardware_concurrency() << " (" << CONFIG::num_threads << " used)" << std::endl;
// memory related infos
SystemStat::Memory mem;
unsigned availpmem = mem.get_AvailMem()/1024/1024;
unsigned usedpmem = mem.get_UsedMem()/1024/1024;
unsigned maxpmem = availpmem, minpmem = availpmem;
unsigned maxupmem = usedpmem, minupmem = usedpmem;
music::ilog << std::setw(32) << std::left << "Total system memory (phys)" << " : " << mem.get_TotalMem()/1024/1024 << " Mb" << std::endl;
music::ilog << std::setw(32) << std::left << "Used system memory (phys)" << " : " << "Max: " << maxupmem << " Mb, Min: " << minupmem << " Mb" << std::endl;
music::ilog << std::setw(32) << std::left << "Available system memory (phys)" << " : " << "Max: " << maxpmem << " Mb, Min: " << minpmem << " Mb" << std::endl;
// Kernel related infos
SystemStat::Kernel kern;
auto kinfo = kern.get_kernel_info();
music::ilog << std::setw(32) << std::left << "OS/Kernel version" << " : " << kinfo.kernel << " version " << kinfo.major << "." << kinfo.minor << " build " << kinfo.build_number << std::endl;
// FFTW related infos
music::ilog << std::setw(32) << std::left << "FFTW version" << " : " << FFTW_API(version) << std::endl;
music::ilog << std::setw(32) << std::left << "FFTW supports multi-threading" << " : " << (CONFIG::FFTW_threads_ok? "yes" : "no") << std::endl;
music::ilog << std::setw(32) << std::left << "FFTW mode" << " : ";
#if defined(FFTW_MODE_PATIENT)
music::ilog << "FFTW_PATIENT" << std::endl;
#elif defined(FFTW_MODE_MEASURE)
music::ilog << "FFTW_MEASURE" << std::endl;
#else
music::ilog << "FFTW_ESTIMATE" << std::endl;
#endif
}
/*****************************************************************************************************/ /*****************************************************************************************************/
/*****************************************************************************************************/ /*****************************************************************************************************/
/*****************************************************************************************************/ /*****************************************************************************************************/
@ -342,25 +398,6 @@ int main(int argc, const char *argv[])
music::ulog.Print("Running %s, version %s", THE_CODE_NAME, THE_CODE_VERSION); music::ulog.Print("Running %s, version %s", THE_CODE_NAME, THE_CODE_VERSION);
music::ulog.Print("Log is for run started %s", asctime(localtime(&ltime))); music::ulog.Print("Log is for run started %s", asctime(localtime(&ltime)));
#ifdef FFTW3
music::ulog.Print("Code was compiled using FFTW version 3.x");
#else
music::ulog.Print("Code was compiled using FFTW version 2.x");
#endif
#ifdef SINGLETHREAD_FFTW
music::ulog.Print("Code was compiled for single-threaded FFTW");
#else
music::ulog.Print("Code was compiled for multi-threaded FFTW");
music::ulog.Print("Running with a maximum of %d OpenMP threads", omp_get_max_threads());
#endif
#ifdef SINGLE_PRECISION
music::ulog.Print("Code was compiled for single precision.");
#else
music::ulog.Print("Code was compiled for double precision.");
#endif
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
//... read and interpret config file //... read and interpret config file
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -369,6 +406,13 @@ int main(int argc, const char *argv[])
bool force_shift(false); bool force_shift(false);
double boxlength; double boxlength;
//------------------------------------------------------------------------------
//... init multi-threading
//------------------------------------------------------------------------------
CONFIG::FFTW_threads_ok = FFTW_API(init_threads)();
CONFIG::num_threads = cf.get_value_safe<unsigned>("execution", "NumThreads",std::thread::hardware_concurrency());
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
//... initialize some parameters about grid set-up //... initialize some parameters about grid set-up
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -403,24 +447,6 @@ int main(int argc, const char *argv[])
else else
music::ilog.Print("Using real space sampled transfer functions..."); music::ilog.Print("Using real space sampled transfer functions...");
//------------------------------------------------------------------------------
//... initialize multithread FFTW
//------------------------------------------------------------------------------
#if not defined(SINGLETHREAD_FFTW)
#ifdef FFTW3
#ifdef SINGLE_PRECISION
fftwf_init_threads();
fftwf_plan_with_nthreads(omp_get_max_threads());
#else
fftw_init_threads();
fftw_plan_with_nthreads(omp_get_max_threads());
#endif
#else
fftw_threads_init();
#endif
#endif
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
//... initialize cosmology //... initialize cosmology
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
@ -1373,13 +1399,8 @@ int main(int argc, const char *argv[])
delete the_transfer_function_plugin; delete the_transfer_function_plugin;
delete the_poisson_solver; delete the_poisson_solver;
#if defined(FFTW3) and not defined(SINGLETHREAD_FFTW) if( CONFIG::FFTW_threads_ok )
#ifdef SINGLE_PRECISION FFTW_API(cleanup_threads)();
fftwf_cleanup_threads();
#else
fftw_cleanup_threads();
#endif
#endif
//------------------------------------------------------------------------------ //------------------------------------------------------------------------------
//... we are done ! //... we are done !

View file

@ -290,12 +290,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -312,12 +312,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -338,12 +338,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -359,12 +359,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -384,12 +384,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0; coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -405,12 +405,12 @@ struct cubic_interp
{ {
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -717,13 +717,13 @@ struct interp_O5_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_x(-1,*u,ix+1,iy+1,iz+1);
fine_flux /= 4.0; fine_flux /= 4.0;
coarse_flux = Laplace_flux_O4<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -758,12 +758,12 @@ struct interp_O5_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_x(+1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_x(+1,*u,ix,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -798,12 +798,12 @@ struct interp_O5_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_y(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -838,12 +838,12 @@ struct interp_O5_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_y(+1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_y(+1,*u,ix+1,iy,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -880,12 +880,12 @@ struct interp_O5_fluxcorr
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O4<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O4().apply_z(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O4<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0; coarse_flux = Laplace_flux_O4().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -920,12 +920,12 @@ struct interp_O5_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O4<real_t>().apply_z(+1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O4().apply_z(+1,*u,ix+1,iy+1,iz);
coarse_flux = Laplace_flux_O4<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O4().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1027,13 +1027,13 @@ struct interp_O7_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_x(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_x(-1,*u,ix+1,iy+1,iz+1);
fine_flux /= 4.0; fine_flux /= 4.0;
coarse_flux = Laplace_flux_O6<real_t>().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O6().apply_x(-1,*utop,ixtop+1,iytop,iztop)/2.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1074,12 +1074,12 @@ struct interp_O7_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_x(+1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_x(+1,*u,ix,iy+1,iz+1);
coarse_flux = Laplace_flux_O6<real_t>().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O6().apply_x(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1119,12 +1119,12 @@ struct interp_O7_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_y(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_y(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O6<real_t>().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0; coarse_flux = Laplace_flux_O6().apply_y(-1,*utop,ixtop,iytop+1,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1164,12 +1164,12 @@ struct interp_O7_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_y(+1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O6().apply_y(+1,*u,ix+1,iy,iz+1);
coarse_flux = Laplace_flux_O6<real_t>().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O6().apply_y(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1210,12 +1210,12 @@ struct interp_O7_fluxcorr
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy,iz+1); fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy,iz+1); fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix,iy+1,iz+1);
fine_flux += Laplace_flux_O6<real_t>().apply_z(-1,*u,ix+1,iy+1,iz+1); fine_flux += Laplace_flux_O6().apply_z(-1,*u,ix+1,iy+1,iz+1);
coarse_flux = Laplace_flux_O6<real_t>().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0; coarse_flux = Laplace_flux_O6().apply_z(-1,*utop,ixtop,iytop,iztop+1)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;
@ -1255,12 +1255,12 @@ struct interp_O7_fluxcorr
} }
fine_flux = 0.0; fine_flux = 0.0;
fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy,iz); fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy,iz); fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix,iy+1,iz); fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix,iy+1,iz);
fine_flux += Laplace_flux_O6<real_t>().apply_z(+1,*u,ix+1,iy+1,iz); fine_flux += Laplace_flux_O6().apply_z(+1,*u,ix+1,iy+1,iz);
coarse_flux = Laplace_flux_O6<real_t>().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0; coarse_flux = Laplace_flux_O6().apply_z(+1,*utop,ixtop,iytop,iztop)/2.0;
fine_flux /= 4.0; fine_flux /= 4.0;
dflux = coarse_flux - fine_flux; dflux = coarse_flux - fine_flux;

View file

@ -8,30 +8,36 @@
*/ */
#ifndef __MG_SOLVER_HH #pragma once
#define __MG_SOLVER_HH
#include <cmath> #include <cmath>
#include <iostream> #include <iostream>
#include "mg_operators.hh" #include <mg_operators.hh>
#include "mg_interp.hh" #include <mg_interp.hh>
#include "mesh.hh" #include <mesh.hh>
#define BEGIN_MULTIGRID_NAMESPACE namespace multigrid { #define BEGIN_MULTIGRID_NAMESPACE \
namespace multigrid \
{
#define END_MULTIGRID_NAMESPACE } #define END_MULTIGRID_NAMESPACE }
BEGIN_MULTIGRID_NAMESPACE BEGIN_MULTIGRID_NAMESPACE
//! options for multigrid smoothing operation //! options for multigrid smoothing operation
namespace opt { namespace opt
enum smtype { sm_jacobi, sm_gauss_seidel, sm_sor }; {
enum smtype
{
sm_jacobi,
sm_gauss_seidel,
sm_sor
};
} }
//! actual implementation of FAS adaptive multigrid solver //! actual implementation of FAS adaptive multigrid solver
template< class S, class I, class O, typename T=double > template <class S, class I, class O>
class solver class solver
{ {
public: public:
@ -40,228 +46,213 @@ public:
typedef I interp; typedef I interp;
protected: protected:
scheme m_scheme; //!< finite difference scheme scheme m_scheme; //!< finite difference scheme
mgop m_gridop; //!< grid prolongation and restriction operator mgop m_gridop; //!< grid prolongation and restriction operator
unsigned m_npresmooth, //!< number of pre sweeps unsigned m_npresmooth, //!< number of pre sweeps
m_npostsmooth; //!< number of post sweeps m_npostsmooth; //!< number of post sweeps
opt::smtype m_smoother; //!< smoothing method to be applied opt::smtype m_smoother; //!< smoothing method to be applied
unsigned m_ilevelmin; //!< index of the top grid level unsigned m_ilevelmin; //!< index of the top grid level
const static bool m_bperiodic = true; //!< flag whether top grid is periodic const static bool m_bperiodic = true; //!< flag whether top grid is periodic
std::vector<double> m_residu_ini; //!< vector of initial residuals for each level std::vector<double> m_residu_ini; //!< vector of initial residuals for each level
bool m_is_ini; //!< bool that is true for first iteration bool m_is_ini; //!< bool that is true for first iteration
GridHierarchy<T> *m_pu, //!< pointer to GridHierarchy for solution u GridHierarchy<real_t>
*m_pf, //!< pointer to GridHierarchy for right-hand-side *m_pu, //!< pointer to GridHierarchy for solution u
*m_pfsave; //!< pointer to saved state of right-hand-side (unused) *m_pf, //!< pointer to GridHierarchy for right-hand-side
*m_pfsave; //!< pointer to saved state of right-hand-side (unused)
const MeshvarBnd<T> *m_pubnd; const MeshvarBnd<real_t> *m_pubnd;
//! compute residual for a level //! compute residual for a level
double compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& unew, int ilevel ); double compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &unew, int ilevel);
//! compute residuals for entire grid hierarchy //! compute residuals for entire grid hierarchy
double compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& uhnew, bool verbose ); double compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &uhnew, bool verbose);
//! compute residuals for entire grid hierarchy //! compute residuals for entire grid hierarchy
double compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose ); double compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose);
protected: protected:
//! Jacobi smoothing //! Jacobi smoothing
void Jacobi( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f ); void Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
//! Gauss-Seidel smoothing //! Gauss-Seidel smoothing
void GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f ); void GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
//! Successive-Overrelaxation smoothing //! Successive-Overrelaxation smoothing
void SOR( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f ); void SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f);
//! main two-grid (V-cycle) for multi-grid iterations //! main two-grid (V-cycle) for multi-grid iterations
void twoGrid( unsigned ilevel ); void twoGrid(unsigned ilevel);
//! apply boundary conditions //! apply boundary conditions
void setBC( unsigned ilevel ); void setBC(unsigned ilevel);
//! make top grid periodic boundary conditions //! make top grid periodic boundary conditions
void make_periodic( MeshvarBnd<T> *u ); void make_periodic(MeshvarBnd<real_t> *u);
//void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<T>& coarse, MeshvarBnd<T>& fine ); // void interp_coarse_fine_cubic( unsigned ilevel, MeshvarBnd<real_t>& coarse, MeshvarBnd<real_t>& fine );
public: public:
//! constructor //! constructor
solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth ); solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth);
//! destructor //! destructor
~solver() ~solver()
{ }
//! solve Poisson's equation
double solve( GridHierarchy<T>& u, double accuracy, double h=-1.0, bool verbose=false );
//! solve Poisson's equation
double solve( GridHierarchy<T>& u, double accuracy, bool verbose=false )
{ {
return this->solve ( u, accuracy, -1.0, verbose );
} }
//! solve Poisson's equation
double solve(GridHierarchy<real_t> &u, double accuracy, double h = -1.0, bool verbose = false);
//! solve Poisson's equation
double solve(GridHierarchy<real_t> &u, double accuracy, bool verbose = false)
{
return this->solve(u, accuracy, -1.0, verbose);
}
}; };
template <class S, class I, class O>
template< class S, class I, class O, typename T > solver<S, I, O>::solver(GridHierarchy<real_t> &f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth)
solver<S,I,O,T>::solver( GridHierarchy<T>& f, opt::smtype smoother, unsigned npresmooth, unsigned npostsmooth ) : m_scheme(), m_gridop(), m_npresmooth(npresmooth), m_npostsmooth(npostsmooth),
: m_scheme(), m_gridop(), m_npresmooth( npresmooth ), m_npostsmooth( npostsmooth ), m_smoother(smoother), m_ilevelmin(f.levelmin()), m_is_ini(true), m_pf(&f)
m_smoother( smoother ), m_ilevelmin( f.levelmin() ), m_is_ini( true ), m_pf( &f )
{ {
m_is_ini = true; m_is_ini = true;
} }
template <class S, class I, class O>
template< class S, class I, class O, typename T > void solver<S, I, O>::Jacobi(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
void solver<S,I,O,T>::Jacobi( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f )
{ {
int int
nx = u->size(0), nx = u->size(0),
ny = u->size(1), ny = u->size(1),
nz = u->size(2); nz = u->size(2);
double double
c0 = -1.0/m_scheme.ccoeff(), c0 = -1.0 / m_scheme.ccoeff(),
h2 = h*h; h2 = h * h;
MeshvarBnd<T> uold(*u); MeshvarBnd<real_t> uold(*u);
double alpha = 0.95, ialpha = 1.0-alpha; double alpha = 0.95, ialpha = 1.0 - alpha;
#pragma omp parallel for
for( int ix=0; ix<nx; ++ix )
for( int iy=0; iy<ny; ++iy )
for( int iz=0; iz<nz; ++iz )
(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
#pragma omp parallel for
for (int ix = 0; ix < nx; ++ix)
for (int iy = 0; iy < ny; ++iy)
for (int iz = 0; iz < nz; ++iz)
(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
} }
template< class S, class I, class O, typename T > template <class S, class I, class O>
void solver<S,I,O,T>::SOR( T h, MeshvarBnd<T> *u, const MeshvarBnd<T>* f ) void solver<S, I, O>::SOR(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
{ {
int int
nx = u->size(0), nx = u->size(0),
ny = u->size(1), ny = u->size(1),
nz = u->size(2); nz = u->size(2);
double double
c0 = -1.0/m_scheme.ccoeff(), c0 = -1.0 / m_scheme.ccoeff(),
h2 = h*h; h2 = h * h;
MeshvarBnd<T> uold(*u); MeshvarBnd<real_t> uold(*u);
double double
alpha = 1.2, alpha = 1.2,
//alpha = 2 / (1 + 4 * atan(1.0) / double(u->size(0)))-1.0, //.. ideal alpha // alpha = 2 / (1 + 4 * atan(1.0) / double(u->size(0)))-1.0, //.. ideal alpha
ialpha = 1.0-alpha; ialpha = 1.0 - alpha;
#pragma omp parallel for
for( int ix=0; ix<nx; ++ix )
for( int iy=0; iy<ny; ++iy )
for( int iz=0; iz<nz; ++iz )
if( (ix+iy+iz)%2==0 )
(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( uold, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
#pragma omp parallel for
for( int ix=0; ix<nx; ++ix )
for( int iy=0; iy<ny; ++iy )
for( int iz=0; iz<nz; ++iz )
if( (ix+iy+iz)%2!=0 )
(*u)(ix,iy,iz) = ialpha * uold(ix,iy,iz) + alpha * (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
#pragma omp parallel for
for (int ix = 0; ix < nx; ++ix)
for (int iy = 0; iy < ny; ++iy)
for (int iz = 0; iz < nz; ++iz)
if ((ix + iy + iz) % 2 == 0)
(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(uold, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
#pragma omp parallel for
for (int ix = 0; ix < nx; ++ix)
for (int iy = 0; iy < ny; ++iy)
for (int iz = 0; iz < nz; ++iz)
if ((ix + iy + iz) % 2 != 0)
(*u)(ix, iy, iz) = ialpha * uold(ix, iy, iz) + alpha * (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
} }
template< class S, class I, class O, typename T > template <class S, class I, class O>
void solver<S,I,O,T>::GaussSeidel( T h, MeshvarBnd<T>* u, const MeshvarBnd<T>* f ) void solver<S, I, O>::GaussSeidel(real_t h, MeshvarBnd<real_t> *u, const MeshvarBnd<real_t> *f)
{ {
int int
nx = u->size(0), nx = u->size(0),
ny = u->size(1), ny = u->size(1),
nz = u->size(2); nz = u->size(2);
T real_t
c0 = -1.0/m_scheme.ccoeff(), c0 = -1.0 / m_scheme.ccoeff(),
h2 = h*h; h2 = h * h;
for( int color=0; color < 2; ++color )
#pragma omp parallel for
for( int ix=0; ix<nx; ++ix )
for( int iy=0; iy<ny; ++iy )
for( int iz=0; iz<nz; ++iz )
if( (ix+iy+iz)%2 == color )
(*u)(ix,iy,iz) = (m_scheme.rhs( *u, ix, iy, iz ) + h2 * (*f)(ix,iy,iz))*c0;
for (int color = 0; color < 2; ++color)
#pragma omp parallel for
for (int ix = 0; ix < nx; ++ix)
for (int iy = 0; iy < ny; ++iy)
for (int iz = 0; iz < nz; ++iz)
if ((ix + iy + iz) % 2 == color)
(*u)(ix, iy, iz) = (m_scheme.rhs(*u, ix, iy, iz) + h2 * (*f)(ix, iy, iz)) * c0;
} }
template <class S, class I, class O>
template< class S, class I, class O, typename T > void solver<S, I, O>::twoGrid(unsigned ilevel)
void solver<S,I,O,T>::twoGrid( unsigned ilevel )
{ {
MeshvarBnd<T> *uf, *uc, *ff, *fc; MeshvarBnd<real_t> *uf, *uc, *ff, *fc;
double double
h = 1.0/(1<<ilevel), h = 1.0 / (1 << ilevel),
c0 = -1.0/m_scheme.ccoeff(), c0 = -1.0 / m_scheme.ccoeff(),
h2 = h*h; h2 = h * h;
uf = m_pu->get_grid(ilevel); uf = m_pu->get_grid(ilevel);
ff = m_pf->get_grid(ilevel); ff = m_pf->get_grid(ilevel);
uc = m_pu->get_grid(ilevel-1); uc = m_pu->get_grid(ilevel - 1);
fc = m_pf->get_grid(ilevel-1); fc = m_pf->get_grid(ilevel - 1);
int int
nx = uf->size(0), nx = uf->size(0),
ny = uf->size(1), ny = uf->size(1),
nz = uf->size(2); nz = uf->size(2);
if( m_bperiodic && ilevel <= m_ilevelmin) if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic( uf ); make_periodic(uf);
else if(!m_bperiodic) else if (!m_bperiodic)
setBC( ilevel ); setBC(ilevel);
//... do smoothing sweeps with specified solver //... do smoothing sweeps with specified solver
for( unsigned i=0; i<m_npresmooth; ++i ){ for (unsigned i = 0; i < m_npresmooth; ++i)
{
if( ilevel > m_ilevelmin ) if (ilevel > m_ilevelmin)
interp().interp_coarse_fine(ilevel,*uc,*uf); interp().interp_coarse_fine(ilevel, *uc, *uf);
if( m_smoother == opt::sm_gauss_seidel ) if (m_smoother == opt::sm_gauss_seidel)
GaussSeidel( h, uf, ff ); GaussSeidel(h, uf, ff);
else if( m_smoother == opt::sm_jacobi ) else if (m_smoother == opt::sm_jacobi)
Jacobi( h, uf, ff); Jacobi(h, uf, ff);
else if( m_smoother == opt::sm_sor ) else if (m_smoother == opt::sm_sor)
SOR( h, uf, ff ); SOR(h, uf, ff);
if( m_bperiodic && ilevel <= m_ilevelmin ) if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic( uf ); make_periodic(uf);
} }
m_gridop.restrict(*uf, *uc);
m_gridop.restrict( *uf, *uc );
//... essential!! //... essential!!
if( m_bperiodic && ilevel <= m_ilevelmin ) if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic( uc ); make_periodic(uc);
else if( ilevel > m_ilevelmin ) else if (ilevel > m_ilevelmin)
interp().interp_coarse_fine(ilevel,*uc,*uf); interp().interp_coarse_fine(ilevel, *uc, *uf);
//.................................................................... //....................................................................
//... we now use hard-coded restriction+operatore app, see below //... we now use hard-coded restriction+operatore app, see below
@ -283,243 +274,231 @@ void solver<S,I,O,T>::twoGrid( unsigned ilevel )
//.................................................................... //....................................................................
int int
oxp = uf->offset(0), oxp = uf->offset(0),
oyp = uf->offset(1), oyp = uf->offset(1),
ozp = uf->offset(2); ozp = uf->offset(2);
meshvar_bnd tLu(*uc,false); meshvar_bnd tLu(*uc, false);
#pragma omp parallel for #pragma omp parallel for
for( int ix=0; ix<nx/2; ++ix ) for (int ix = 0; ix < nx / 2; ++ix)
{ {
int iix=2*ix; int iix = 2 * ix;
for( int iy=0,iiy=0; iy<ny/2; ++iy,iiy+=2 ) for (int iy = 0, iiy = 0; iy < ny / 2; ++iy, iiy += 2)
for (int iz = 0, iiz = 0; iz < nz / 2; ++iz, iiz += 2)
for( int iz=0,iiz=0; iz<nz/2; ++iz,iiz+=2 ) tLu(ix + oxp, iy + oyp, iz + ozp) = 0.125 * (m_scheme.apply((*uf), iix, iiy, iiz) + m_scheme.apply((*uf), iix, iiy, iiz + 1) + m_scheme.apply((*uf), iix, iiy + 1, iiz) + m_scheme.apply((*uf), iix, iiy + 1, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy, iiz) + m_scheme.apply((*uf), iix + 1, iiy, iiz + 1) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz) + m_scheme.apply((*uf), iix + 1, iiy + 1, iiz + 1)) / h2;
tLu(ix+oxp,iy+oyp,iz+ozp) = 0.125 * (
m_scheme.apply( (*uf), iix, iiy, iiz )
+m_scheme.apply( (*uf), iix, iiy, iiz+1 )
+m_scheme.apply( (*uf), iix, iiy+1, iiz )
+m_scheme.apply( (*uf), iix, iiy+1, iiz+1 )
+m_scheme.apply( (*uf), iix+1, iiy, iiz )
+m_scheme.apply( (*uf), iix+1, iiy, iiz+1 )
+m_scheme.apply( (*uf), iix+1, iiy+1, iiz )
+m_scheme.apply( (*uf), iix+1, iiy+1, iiz+1 )
)/h2;
} }
//... restrict source term //... restrict source term
m_gridop.restrict( *ff, *fc ); m_gridop.restrict(*ff, *fc);
int oi, oj, ok; int oi, oj, ok;
oi = ff->offset(0); oi = ff->offset(0);
oj = ff->offset(1); oj = ff->offset(1);
ok = ff->offset(2); ok = ff->offset(2);
#pragma omp parallel for #pragma omp parallel for
for( int ix=oi; ix<oi+(int)ff->size(0)/2; ++ix ) for (int ix = oi; ix < oi + (int)ff->size(0) / 2; ++ix)
for( int iy=oj; iy<oj+(int)ff->size(1)/2; ++iy ) for (int iy = oj; iy < oj + (int)ff->size(1) / 2; ++iy)
for( int iz=ok; iz<ok+(int)ff->size(2)/2; ++iz ) for (int iz = ok; iz < ok + (int)ff->size(2) / 2; ++iz)
(*fc)(ix,iy,iz) += ((tLu( ix, iy, iz ) - (m_scheme.apply( *uc, ix, iy, iz )/(4.0*h2)))); (*fc)(ix, iy, iz) += ((tLu(ix, iy, iz) - (m_scheme.apply(*uc, ix, iy, iz) / (4.0 * h2))));
tLu.deallocate(); tLu.deallocate();
meshvar_bnd ucsave(*uc,true); meshvar_bnd ucsave(*uc, true);
//... have we reached the end of the recursion or do we need to go up one level? //... have we reached the end of the recursion or do we need to go up one level?
if( ilevel == 1 ) if (ilevel == 1)
if( m_bperiodic ) if (m_bperiodic)
(*uc)(0,0,0) = 0.0; (*uc)(0, 0, 0) = 0.0;
else else
(*uc)(0,0,0) = (m_scheme.rhs( (*uc), 0, 0, 0 ) + 4.0 * h2 * (*fc)(0,0,0))*c0; (*uc)(0, 0, 0) = (m_scheme.rhs((*uc), 0, 0, 0) + 4.0 * h2 * (*fc)(0, 0, 0)) * c0;
else else
twoGrid( ilevel-1 ); twoGrid(ilevel - 1);
meshvar_bnd cc(*uc,false); meshvar_bnd cc(*uc, false);
//... compute correction on coarse grid
//... compute correction on coarse grid #pragma omp parallel for
#pragma omp parallel for for (int ix = 0; ix < (int)cc.size(0); ++ix)
for( int ix=0; ix<(int)cc.size(0); ++ix ) for (int iy = 0; iy < (int)cc.size(1); ++iy)
for( int iy=0; iy<(int)cc.size(1); ++iy ) for (int iz = 0; iz < (int)cc.size(2); ++iz)
for( int iz=0; iz<(int)cc.size(2); ++iz ) cc(ix, iy, iz) = (*uc)(ix, iy, iz) - ucsave(ix, iy, iz);
cc(ix,iy,iz) = (*uc)(ix,iy,iz) - ucsave(ix,iy,iz);
ucsave.deallocate(); ucsave.deallocate();
if( m_bperiodic && ilevel <= m_ilevelmin ) if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic( &cc ); make_periodic(&cc);
m_gridop.prolong_add( cc, *uf ); m_gridop.prolong_add(cc, *uf);
//... interpolate and apply coarse-fine boundary conditions on fine level //... interpolate and apply coarse-fine boundary conditions on fine level
if( m_bperiodic && ilevel <= m_ilevelmin ) if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic( uf ); make_periodic(uf);
else if(!m_bperiodic) else if (!m_bperiodic)
setBC( ilevel ); setBC(ilevel);
//... do smoothing sweeps with specified solver //... do smoothing sweeps with specified solver
for( unsigned i=0; i<m_npostsmooth; ++i ){ for (unsigned i = 0; i < m_npostsmooth; ++i)
{
if( ilevel > m_ilevelmin ) if (ilevel > m_ilevelmin)
interp().interp_coarse_fine(ilevel,*uc,*uf); interp().interp_coarse_fine(ilevel, *uc, *uf);
if( m_smoother == opt::sm_gauss_seidel ) if (m_smoother == opt::sm_gauss_seidel)
GaussSeidel( h, uf, ff ); GaussSeidel(h, uf, ff);
else if( m_smoother == opt::sm_jacobi ) else if (m_smoother == opt::sm_jacobi)
Jacobi( h, uf, ff); Jacobi(h, uf, ff);
else if( m_smoother == opt::sm_sor ) else if (m_smoother == opt::sm_sor)
SOR( h, uf, ff ); SOR(h, uf, ff);
if( m_bperiodic && ilevel <= m_ilevelmin )
make_periodic( uf );
if (m_bperiodic && ilevel <= m_ilevelmin)
make_periodic(uf);
} }
} }
template< class S, class I, class O, typename T > template <class S, class I, class O>
double solver<S,I,O,T>::compute_error( const MeshvarBnd<T>& u, const MeshvarBnd<T>& f, int ilevel ) double solver<S, I, O>::compute_error(const MeshvarBnd<real_t> &u, const MeshvarBnd<real_t> &f, int ilevel)
{ {
int int
nx = u.size(0), nx = u.size(0),
ny = u.size(1), ny = u.size(1),
nz = u.size(2); nz = u.size(2);
double err = 0.0, err2 = 0.0; double err = 0.0, err2 = 0.0;
size_t count = 0; size_t count = 0;
double h = 1.0/(1ul<<ilevel), h2=h*h; double h = 1.0 / (1ul << ilevel), h2 = h * h;
#pragma omp parallel for reduction(+:err,count) #pragma omp parallel for reduction(+ \
for( int ix=0; ix<nx; ++ix ) : err, count)
for( int iy=0; iy<ny; ++iy ) for (int ix = 0; ix < nx; ++ix)
for( int iz=0; iz<nz; ++iz ) for (int iy = 0; iy < ny; ++iy)
if( true )//fabs(unew(ix,iy,iz)) > 0.0 )//&& u(ix,iy,iz) != unew(ix,iy,iz) ) for (int iz = 0; iz < nz; ++iz)
if (true) // fabs(unew(ix,iy,iz)) > 0.0 )//&& u(ix,iy,iz) != unew(ix,iy,iz) )
{ {
//err += fabs(1.0 - (double)u(ix,iy,iz)/(double)unew(ix,iy,iz)); // err += fabs(1.0 - (double)u(ix,iy,iz)/(double)unew(ix,iy,iz));
/*err += fabs(((double)m_scheme.apply( u, ix, iy, iz )/h2 + (double)(f(ix,iy,iz)) )); /*err += fabs(((double)m_scheme.apply( u, ix, iy, iz )/h2 + (double)(f(ix,iy,iz)) ));
err2 += fabs((double)f(ix,iy,iz));*/ err2 += fabs((double)f(ix,iy,iz));*/
err += fabs( (double)m_scheme.apply( u, ix, iy, iz )/h2/(double)(f(ix,iy,iz)) + 1.0 ); err += fabs((double)m_scheme.apply(u, ix, iy, iz) / h2 / (double)(f(ix, iy, iz)) + 1.0);
++count; ++count;
} }
if( count != 0 ) if (count != 0)
err /= count; err /= count;
return err; return err;
} }
template< class S, class I, class O, typename T > template <class S, class I, class O>
double solver<S,I,O,T>::compute_error( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose ) double solver<S, I, O>::compute_error(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
{ {
double maxerr = 0.0; double maxerr = 0.0;
for( unsigned ilevel=uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel ) for (unsigned ilevel = uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel)
{ {
int int
nx = uh.get_grid(ilevel)->size(0), nx = uh.get_grid(ilevel)->size(0),
ny = uh.get_grid(ilevel)->size(1), ny = uh.get_grid(ilevel)->size(1),
nz = uh.get_grid(ilevel)->size(2); nz = uh.get_grid(ilevel)->size(2);
double err = 0.0, mean_res = 0.0; double err = 0.0, mean_res = 0.0;
size_t count = 0; size_t count = 0;
double h = 1.0/(1ul<<ilevel), h2=h*h; double h = 1.0 / (1ul << ilevel), h2 = h * h;
#pragma omp parallel for reduction(+:err,count) #pragma omp parallel for reduction(+ \
for( int ix=0; ix<nx; ++ix ) : err, count)
for( int iy=0; iy<ny; ++iy ) for (int ix = 0; ix < nx; ++ix)
for( int iz=0; iz<nz; ++iz ) for (int iy = 0; iy < ny; ++iy)
{ for (int iz = 0; iz < nz; ++iz)
double res = (double)m_scheme.apply( *uh.get_grid(ilevel), ix, iy, iz ) + h2 * (double)((*fh.get_grid(ilevel))(ix,iy,iz)); {
double val = (*uh.get_grid(ilevel))( ix, iy, iz ); double res = (double)m_scheme.apply(*uh.get_grid(ilevel), ix, iy, iz) + h2 * (double)((*fh.get_grid(ilevel))(ix, iy, iz));
double val = (*uh.get_grid(ilevel))(ix, iy, iz);
if( fabs(val) > 0.0 ) if (fabs(val) > 0.0)
{ {
err += fabs( res/val ); err += fabs(res / val);
mean_res += fabs(res); mean_res += fabs(res);
++count; ++count;
} }
} }
if( count != 0 ) if (count != 0)
{ {
err /= count; err /= count;
mean_res /= count; mean_res /= count;
} }
if( verbose ) if (verbose)
std::cout << " Level " << std::setw(6) << ilevel << ", Error = " << err << std::endl; std::cout << " Level " << std::setw(6) << ilevel << ", Error = " << err << std::endl;
music::dlog.Print("[mg] level %3d, residual %g, rel. error %g",ilevel, mean_res, err); music::dlog.Print("[mg] level %3d, residual %g, rel. error %g", ilevel, mean_res, err);
maxerr = std::max(maxerr,err);
maxerr = std::max(maxerr, err);
} }
return maxerr; return maxerr;
} }
template< class S, class I, class O, typename T > template <class S, class I, class O>
double solver<S,I,O,T>::compute_RMS_resid( const GridHierarchy<T>& uh, const GridHierarchy<T>& fh, bool verbose ) double solver<S, I, O>::compute_RMS_resid(const GridHierarchy<real_t> &uh, const GridHierarchy<real_t> &fh, bool verbose)
{ {
if( m_is_ini ) if (m_is_ini)
m_residu_ini.assign( uh.levelmax()+1, 0.0 ); m_residu_ini.assign(uh.levelmax() + 1, 0.0);
double maxerr=0.0; double maxerr = 0.0;
for( unsigned ilevel=uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel ) for (unsigned ilevel = uh.levelmin(); ilevel <= uh.levelmax(); ++ilevel)
{ {
int int
nx = uh.get_grid(ilevel)->size(0), nx = uh.get_grid(ilevel)->size(0),
ny = uh.get_grid(ilevel)->size(1), ny = uh.get_grid(ilevel)->size(1),
nz = uh.get_grid(ilevel)->size(2); nz = uh.get_grid(ilevel)->size(2);
double h = 1.0/(1<<ilevel), h2=h*h; double h = 1.0 / (1 << ilevel), h2 = h * h;
double sum = 0.0, sumd2 = 0.0; double sum = 0.0, sumd2 = 0.0;
size_t count = 0; size_t count = 0;
#pragma omp parallel for reduction(+:sum,sumd2,count) #pragma omp parallel for reduction(+ \
for( int ix=0; ix<nx; ++ix ) : sum, sumd2, count)
for( int iy=0; iy<ny; ++iy ) for (int ix = 0; ix < nx; ++ix)
for( int iz=0; iz<nz; ++iz ) for (int iy = 0; iy < ny; ++iy)
for (int iz = 0; iz < nz; ++iz)
{ {
double d = (double)(*fh.get_grid(ilevel))(ix,iy,iz); double d = (double)(*fh.get_grid(ilevel))(ix, iy, iz);
sumd2 += d*d; sumd2 += d * d;
double r = ((double)m_scheme.apply( *uh.get_grid(ilevel), ix, iy, iz )/h2 + (double)(*fh.get_grid(ilevel))(ix,iy,iz)); double r = ((double)m_scheme.apply(*uh.get_grid(ilevel), ix, iy, iz) / h2 + (double)(*fh.get_grid(ilevel))(ix, iy, iz));
sum += r*r; sum += r * r;
++count; ++count;
} }
if( m_is_ini ) if (m_is_ini)
m_residu_ini[ilevel] = sqrt(sum)/count; m_residu_ini[ilevel] = sqrt(sum) / count;
double err_abs = sqrt(sum/count); double err_abs = sqrt(sum / count);
double err_rel = err_abs / sqrt(sumd2/count); double err_rel = err_abs / sqrt(sumd2 / count);
if( verbose && !m_is_ini ) if (verbose && !m_is_ini)
std::cout << " Level " << std::setw(6) << ilevel << ", Error = " << err_rel << std::endl; std::cout << " Level " << std::setw(6) << ilevel << ", Error = " << err_rel << std::endl;
music::dlog.Print("[mg] level %3d, rms residual %g, rel. error %g",ilevel, err_abs, err_rel); music::dlog.Print("[mg] level %3d, rms residual %g, rel. error %g", ilevel, err_abs, err_rel);
if( err_rel > maxerr ) if (err_rel > maxerr)
maxerr = err_rel; maxerr = err_rel;
} }
if( m_is_ini ) if (m_is_ini)
m_is_ini = false; m_is_ini = false;
return maxerr; return maxerr;
} }
template <class S, class I, class O>
template< class S, class I, class O, typename T > double solver<S, I, O>::solve(GridHierarchy<real_t> &uh, double acc, double h, bool verbose)
double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool verbose )
{ {
double err, maxerr = 1e30; double err, maxerr = 1e30;
@ -529,151 +508,139 @@ double solver<S,I,O,T>::solve( GridHierarchy<T>& uh, double acc, double h, bool
m_pu = &uh; m_pu = &uh;
//err = compute_RMS_resid( *m_pu, *m_pf, fullverbose ); // err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
//... iterate ...// //... iterate ...//
while (true) while (true)
{ {
music::ulog.Print("Performing multi-grid V-cycle..."); music::ulog.Print("Performing multi-grid V-cycle...");
twoGrid( uh.levelmax() ); twoGrid(uh.levelmax());
//err = compute_RMS_resid( *m_pu, *m_pf, fullverbose ); // err = compute_RMS_resid( *m_pu, *m_pf, fullverbose );
err = compute_error( *m_pu, *m_pf, fullverbose ); err = compute_error(*m_pu, *m_pf, fullverbose);
++niter; ++niter;
if( fullverbose ){ if (fullverbose)
music::ulog.Print(" multigrid iteration %3d, maximum RMS residual = %g", niter, err ); {
music::ulog.Print(" multigrid iteration %3d, maximum RMS residual = %g", niter, err);
std::cout << " - Step No. " << std::setw(3) << niter << ", Max Err = " << err << std::endl; std::cout << " - Step No. " << std::setw(3) << niter << ", Max Err = " << err << std::endl;
std::cout << " ---------------------------------------------------\n"; std::cout << " ---------------------------------------------------\n";
} }
if( err < maxerr ) if (err < maxerr)
maxerr = err; maxerr = err;
if( (niter > 1) && ((err < acc) || (niter > 20)) ) if ((niter > 1) && ((err < acc) || (niter > 20)))
break; break;
} }
if( err > acc ) if (err > acc)
{ {
std::cout << "Error : no convergence in Poisson solver" << std::endl; std::cout << "Error : no convergence in Poisson solver" << std::endl;
music::elog.Print("No convergence in Poisson solver, final error: %g.",err); music::elog.Print("No convergence in Poisson solver, final error: %g.", err);
} }
else if( verbose ) else if (verbose)
{ {
std::cout << " - Converged in " << niter << " steps to " << maxerr << std::endl; std::cout << " - Converged in " << niter << " steps to " << maxerr << std::endl;
music::ulog.Print("Poisson solver converged to max. error of %g in %d steps.",err,niter); music::ulog.Print("Poisson solver converged to max. error of %g in %d steps.", err, niter);
} }
//.. make sure that the RHS does not contain the FAS corrections any more //.. make sure that the RHS does not contain the FAS corrections any more
for( int i=m_pf->levelmax(); i>0; --i ) for (int i = m_pf->levelmax(); i > 0; --i)
m_gridop.restrict( *m_pf->get_grid(i), *m_pf->get_grid(i-1) ); m_gridop.restrict(*m_pf->get_grid(i), *m_pf->get_grid(i - 1));
return err; return err;
} }
// TODO: this only works for 2nd order! (but actually not needed)
template <class S, class I, class O>
//TODO: this only works for 2nd order! (but actually not needed) void solver<S, I, O>::setBC(unsigned ilevel)
template< class S, class I, class O, typename T >
void solver<S,I,O,T>::setBC( unsigned ilevel )
{ {
//... set only on level before additional refinement starts //... set only on level before additional refinement starts
if( ilevel == m_ilevelmin ) if (ilevel == m_ilevelmin)
{ {
MeshvarBnd<T> *u = m_pu->get_grid(ilevel); MeshvarBnd<real_t> *u = m_pu->get_grid(ilevel);
int int
nx = u->size(0), nx = u->size(0),
ny = u->size(1), ny = u->size(1),
nz = u->size(2); nz = u->size(2);
for( int iy=0; iy<ny; ++iy ) for (int iy = 0; iy < ny; ++iy)
for( int iz=0; iz<nz; ++iz ) for (int iz = 0; iz < nz; ++iz)
{ {
(*u)(-1,iy,iz) = 2.0*(*m_pubnd)(-1,iy,iz) - (*u)(0,iy,iz); (*u)(-1, iy, iz) = 2.0 * (*m_pubnd)(-1, iy, iz) - (*u)(0, iy, iz);
(*u)(nx,iy,iz) = 2.0*(*m_pubnd)(nx,iy,iz) - (*u)(nx-1,iy,iz);; (*u)(nx, iy, iz) = 2.0 * (*m_pubnd)(nx, iy, iz) - (*u)(nx - 1, iy, iz);
;
} }
for( int ix=0; ix<nx; ++ix ) for (int ix = 0; ix < nx; ++ix)
for( int iz=0; iz<nz; ++iz ) for (int iz = 0; iz < nz; ++iz)
{ {
(*u)(ix,-1,iz) = 2.0*(*m_pubnd)(ix,-1,iz) - (*u)(ix,0,iz); (*u)(ix, -1, iz) = 2.0 * (*m_pubnd)(ix, -1, iz) - (*u)(ix, 0, iz);
(*u)(ix,ny,iz) = 2.0*(*m_pubnd)(ix,ny,iz) - (*u)(ix,ny-1,iz); (*u)(ix, ny, iz) = 2.0 * (*m_pubnd)(ix, ny, iz) - (*u)(ix, ny - 1, iz);
} }
for( int ix=0; ix<nx; ++ix ) for (int ix = 0; ix < nx; ++ix)
for( int iy=0; iy<ny; ++iy ) for (int iy = 0; iy < ny; ++iy)
{ {
(*u)(ix,iy,-1) = 2.0*(*m_pubnd)(ix,iy,-1) - (*u)(ix,iy,0); (*u)(ix, iy, -1) = 2.0 * (*m_pubnd)(ix, iy, -1) - (*u)(ix, iy, 0);
(*u)(ix,iy,nz) = 2.0*(*m_pubnd)(ix,iy,nz) - (*u)(ix,iy,nz-1); (*u)(ix, iy, nz) = 2.0 * (*m_pubnd)(ix, iy, nz) - (*u)(ix, iy, nz - 1);
} }
} }
} }
//... enforce periodic boundary conditions //... enforce periodic boundary conditions
template< class S, class I, class O, typename T > template <class S, class I, class O>
void solver<S,I,O,T>::make_periodic( MeshvarBnd<T> *u ) void solver<S, I, O>::make_periodic(MeshvarBnd<real_t> *u)
{ {
int int
nx = u->size(0), nx = u->size(0),
ny = u->size(1), ny = u->size(1),
nz = u->size(2); nz = u->size(2);
int nb = u->m_nbnd; int nb = u->m_nbnd;
// if( u->offset(0) == 0 )
for (int iy = -nb; iy < ny + nb; ++iy)
for (int iz = -nb; iz < nz + nb; ++iz)
{
int iiy((iy + ny) % ny), iiz((iz + nz) % nz);
//if( u->offset(0) == 0 ) for (int i = -nb; i < 0; ++i)
for( int iy=-nb; iy<ny+nb; ++iy )
for( int iz=-nb; iz<nz+nb; ++iz )
{ {
int iiy( (iy+ny)%ny ), iiz( (iz+nz)%nz ); (*u)(i, iy, iz) = (*u)(nx + i, iiy, iiz);
(*u)(nx - 1 - i, iy, iz) = (*u)(-1 - i, iiy, iiz);
for( int i=-nb; i<0; ++i )
{
(*u)(i,iy,iz) = (*u)(nx+i,iiy,iiz);
(*u)(nx-1-i,iy,iz) = (*u)(-1-i,iiy,iiz);
}
} }
}
//if( u->offset(1) == 0 ) // if( u->offset(1) == 0 )
for( int ix=-nb; ix<nx+nb; ++ix ) for (int ix = -nb; ix < nx + nb; ++ix)
for( int iz=-nb; iz<nz+nb; ++iz ) for (int iz = -nb; iz < nz + nb; ++iz)
{
int iix((ix + nx) % nx), iiz((iz + nz) % nz);
for (int i = -nb; i < 0; ++i)
{ {
int iix( (ix+nx)%nx ), iiz( (iz+nz)%nz ); (*u)(ix, i, iz) = (*u)(iix, ny + i, iiz);
(*u)(ix, ny - 1 - i, iz) = (*u)(iix, -1 - i, iiz);
for( int i=-nb; i<0; ++i )
{
(*u)(ix,i,iz) = (*u)(iix,ny+i,iiz);
(*u)(ix,ny-1-i,iz) = (*u)(iix,-1-i,iiz);
}
} }
}
//if( u->offset(2) == 0 ) // if( u->offset(2) == 0 )
for( int ix=-nb; ix<nx+nb; ++ix ) for (int ix = -nb; ix < nx + nb; ++ix)
for( int iy=-nb; iy<ny+nb; ++iy ) for (int iy = -nb; iy < ny + nb; ++iy)
{
int iix((ix + nx) % nx), iiy((iy + ny) % ny);
for (int i = -nb; i < 0; ++i)
{ {
int iix( (ix+nx)%nx ), iiy( (iy+ny)%ny ); (*u)(ix, iy, i) = (*u)(iix, iiy, nz + i);
(*u)(ix, iy, nz - 1 - i) = (*u)(iix, iiy, -1 - i);
for( int i=-nb; i<0; ++i )
{
(*u)(ix,iy,i) = (*u)(iix,iiy,nz+i);
(*u)(ix,iy,nz-1-i) = (*u)(iix,iiy,-1-i);
}
} }
}
} }
END_MULTIGRID_NAMESPACE END_MULTIGRID_NAMESPACE
#endif

View file

@ -230,19 +230,11 @@ protected:
HDFCreateFile(filename); HDFCreateFile(filename);
write_sim_header(filename, the_sim_header); write_sim_header(filename, the_sim_header);
#ifdef SINGLE_PRECISION
//... create full array in file //... create full array in file
HDFHyperslabWriter3Ds<float> *slab_writer = new HDFHyperslabWriter3Ds<float>(filename, enzoname, nsz); HDFHyperslabWriter3Ds<real_t> *slab_writer = new HDFHyperslabWriter3Ds<real_t>(filename, enzoname, nsz);
//... create buffer //... create buffer
float *data_buf = new float[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]]; real_t *data_buf = new real_t[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
#else
//... create full array in file
HDFHyperslabWriter3Ds<double> *slab_writer = new HDFHyperslabWriter3Ds<double>(filename, enzoname, nsz);
//... create buffer
double *data_buf = new double[slices_in_slab * (size_t)ng[0] * (size_t)ng[1]];
#endif
//... write slice by slice //... write slice by slice
size_t slices_written = 0; size_t slices_written = 0;

View file

@ -1390,7 +1390,5 @@ public:
namespace namespace
{ {
output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2"); output_plugin_creator_concrete<gadget2_output_plugin<float>> creator1("gadget2");
#ifndef SINGLE_PRECISION
output_plugin_creator_concrete<gadget2_output_plugin<double>> creator2("gadget2_double"); output_plugin_creator_concrete<gadget2_output_plugin<double>> creator2("gadget2_double");
#endif
} }

File diff suppressed because it is too large Load diff

View file

@ -1573,8 +1573,6 @@ public:
namespace{ namespace{
output_plugin_creator_concrete< gadget_tetmesh_output_plugin<float> > creator1("gadget_tetmesh"); output_plugin_creator_concrete< gadget_tetmesh_output_plugin<float> > creator1("gadget_tetmesh");
#ifndef SINGLE_PRECISION
output_plugin_creator_concrete< gadget_tetmesh_output_plugin<double> > creator2("gadget_tetmesh_double"); output_plugin_creator_concrete< gadget_tetmesh_output_plugin<double> > creator2("gadget_tetmesh_double");
#endif
} }

View file

@ -1107,9 +1107,7 @@ int tipsy_output_plugin<double>::xdr_dump( XDR *xdrs, double*p )
namespace{ namespace{
output_plugin_creator_concrete< tipsy_output_plugin<float> > creator1("tipsy"); output_plugin_creator_concrete< tipsy_output_plugin<float> > creator1("tipsy");
//#ifndef SINGLE_PRECISION
output_plugin_creator_concrete< tipsy_output_plugin<double> > creator2("tipsy_double"); output_plugin_creator_concrete< tipsy_output_plugin<double> > creator2("tipsy_double");
//#endif
} }

View file

@ -1396,9 +1396,7 @@ int tipsy_output_plugin_res < double >::xdr_dump (XDR * xdrs, double *p)
namespace namespace
{ {
output_plugin_creator_concrete< tipsy_output_plugin_res<float> >creator1 ("tipsy_resample"); output_plugin_creator_concrete< tipsy_output_plugin_res<float> >creator1 ("tipsy_resample");
#ifndef SINGLE_PRECISION
output_plugin_creator_concrete< tipsy_output_plugin_res<double> >creator2 ("tipsy_double_resample"); output_plugin_creator_concrete< tipsy_output_plugin_res<double> >creator2 ("tipsy_double_resample");
#endif
} }
#endif // defined(HAVE_TIRPC) #endif // defined(HAVE_TIRPC)

View file

@ -40,8 +40,8 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
seedtable[(res - 1 - j) * res + (res - 1 - i)] = 0x7fffffff * gsl_rng_uniform(random_generator); seedtable[(res - 1 - j) * res + (res - 1 - i)] = 0x7fffffff * gsl_rng_uniform(random_generator);
} }
fftw_real *rnoise = new fftw_real[res * res * (res + 2)]; real_t *rnoise = new real_t[res * res * (res + 2)];
fftw_complex *knoise = reinterpret_cast<fftw_complex *>(rnoise); complex_t *knoise = reinterpret_cast<complex_t *>(rnoise);
double fnorm = 1. / sqrt(res * res * res); double fnorm = 1. / sqrt(res * res * res);
@ -126,26 +126,9 @@ void rapid_proto_ngenic_rng(size_t res, long baseseed, music_wnoise_generator<T>
delete[] seedtable; delete[] seedtable;
//... perform FT to real space //... perform FT to real space
fftw_plan_t plan = FFTW_API(plan_dft_c2r_3d)(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
#ifdef FFTW3 FFTW_API(execute)(plan);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_plan plan = fftwf_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
fftwf_execute(plan);
fftwf_destroy_plan(plan);
#else
fftw_plan plan = fftw_plan_dft_c2r_3d(res, res, res, knoise, rnoise, FFTW_ESTIMATE);
fftw_execute(plan);
fftw_destroy_plan(plan);
#endif
#else
rfftwnd_plan plan = rfftw3d_create_plan(res, res, res, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), plan, knoise, NULL);
#else
rfftwnd_one_complex_to_real(plan, knoise, NULL);
#endif
rfftwnd_destroy_plan(plan);
#endif
// copy to array that holds the random numbers // copy to array that holds the random numbers
@ -443,37 +426,25 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
ncubes_ = 1; ncubes_ = 1;
baseseed_ = -2; baseseed_ = -2;
if (sizeof(fftw_real) != sizeof(T)) if (sizeof(real_t) != sizeof(T))
{ {
music::elog.Print("type mismatch with fftw_real in k-space averaging"); music::elog.Print("type mismatch with real_t in k-space averaging");
throw std::runtime_error("type mismatch with fftw_real in k-space averaging"); throw std::runtime_error("type mismatch with real_t in k-space averaging");
} }
fftw_real real_t
*rfine = new fftw_real[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)], *rfine = new real_t[(size_t)rc.res_ * (size_t)rc.res_ * 2 * ((size_t)rc.res_ / 2 + 1)],
*rcoarse = new fftw_real[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)]; *rcoarse = new real_t[(size_t)res_ * (size_t)res_ * 2 * ((size_t)res_ / 2 + 1)];
fftw_complex complex_t
*ccoarse = reinterpret_cast<fftw_complex *>(rcoarse), *ccoarse = reinterpret_cast<complex_t *>(rcoarse),
*cfine = reinterpret_cast<fftw_complex *>(rfine); *cfine = reinterpret_cast<complex_t *>(rfine);
int nx(rc.res_), ny(rc.res_), nz(rc.res_), nxc(res_), nyc(res_), nzc(res_); int nx(rc.res_), ny(rc.res_), nz(rc.res_), nxc(res_), nyc(res_), nzc(res_);
#ifdef FFTW3
#ifdef SINGLE_PRECISION
fftwf_plan
pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
ipc = fftwf_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
#else
fftw_plan
pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
ipc = fftw_plan_dft_c2r_3d(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
#endif
#else fftw_plan_t
rfftwnd_plan pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE), ipc = FFTW_API(plan_dft_c2r_3d)(nxc, nyc, nzc, ccoarse, rcoarse, FFTW_ESTIMATE);
ipc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nx; i++) for (int i = 0; i < nx; i++)
@ -484,19 +455,7 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
rfine[q] = rc(i, j, k); rfine[q] = rc(i, j, k);
} }
#ifdef FFTW3 FFTW_API(execute)(pf);
#ifdef SINGLE_PRECISION
fftwf_execute(pf);
#else
fftw_execute(pf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
#else
rfftwnd_one_real_to_complex(pf, rfine, NULL);
#endif
#endif
double fftnorm = 1.0 / ((double)nxc * (double)nyc * (double)nzc); double fftnorm = 1.0 / ((double)nxc * (double)nyc * (double)nzc);
@ -532,19 +491,9 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
} }
delete[] rfine; delete[] rfine;
#ifdef FFTW3
#ifdef SINGLE_PRECISION FFTW_API(execute)(ipc);
fftwf_execute(ipc);
#else
fftw_execute(ipc);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipc, ccoarse, NULL);
#else
rfftwnd_one_complex_to_real(ipc, ccoarse, NULL);
#endif
#endif
rnums_.push_back(new Meshvar<T>(res_, 0, 0, 0)); rnums_.push_back(new Meshvar<T>(res_, 0, 0, 0));
cubemap_[0] = 0; // map all to single array cubemap_[0] = 0; // map all to single array
@ -563,18 +512,8 @@ music_wnoise_generator<T>::music_wnoise_generator(/*const*/ music_wnoise_generat
delete[] rcoarse; delete[] rcoarse;
#ifdef FFTW3 FFTW_API(destroy_plan)(pf);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(ipc);
fftwf_destroy_plan(pf);
fftwf_destroy_plan(ipc);
#else
fftw_destroy_plan(pf);
fftw_destroy_plan(ipc);
#endif
#else
rfftwnd_destroy_plan(pf);
rfftwnd_destroy_plan(ipc);
#endif
double rmean, rvar; double rmean, rvar;
rmean = sum / count; rmean = sum / count;
@ -617,24 +556,12 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
size_t nx = lx[0], ny = lx[1], nz = lx[2], size_t nx = lx[0], ny = lx[1], nz = lx[2],
nxc = lx[0] / 2, nyc = lx[1] / 2, nzc = lx[2] / 2; nxc = lx[0] / 2, nyc = lx[1] / 2, nzc = lx[2] / 2;
fftw_real *rfine = new fftw_real[nx * ny * (nz + 2l)]; real_t *rfine = new real_t[nx * ny * (nz + 2l)];
fftw_complex *cfine = reinterpret_cast<fftw_complex *>(rfine); complex_t *cfine = reinterpret_cast<complex_t *>(rfine);
#ifdef FFTW3 fftw_plan_t
#ifdef SINGLE_PRECISION pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
fftwf_plan ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
#else
fftw_plan
pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfine, cfine, FFTW_ESTIMATE),
ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfine, rfine, FFTW_ESTIMATE);
#endif
#else
rfftwnd_plan
pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nx; i++) for (int i = 0; i < (int)nx; i++)
@ -646,18 +573,10 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
} }
// this->free_all_mem(); // temporarily free memory, allocate again later // this->free_all_mem(); // temporarily free memory, allocate again later
fftw_real *rcoarse = new fftw_real[nxc * nyc * (nzc + 2)]; real_t *rcoarse = new real_t[nxc * nyc * (nzc + 2)];
fftw_complex *ccoarse = reinterpret_cast<fftw_complex *>(rcoarse); complex_t *ccoarse = reinterpret_cast<complex_t *>(rcoarse);
#ifdef FFTW3 fftw_plan pc = FFTW_API(plan_dft_r2c_3d)(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
#ifdef SINGLE_PRECISION
fftwf_plan pc = fftwf_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
#else
fftw_plan pc = fftw_plan_dft_r2c_3d(nxc, nyc, nzc, rcoarse, ccoarse, FFTW_ESTIMATE);
#endif
#else
rfftwnd_plan pc = rfftw3d_create_plan(nxc, nyc, nzc, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nxc; i++) for (int i = 0; i < (int)nxc; i++)
@ -667,23 +586,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
size_t q = ((size_t)i * (size_t)nyc + (size_t)j) * (size_t)(nzc + 2) + (size_t)k; size_t q = ((size_t)i * (size_t)nyc + (size_t)j) * (size_t)(nzc + 2) + (size_t)k;
rcoarse[q] = rc(x0[0] / 2 + i, x0[1] / 2 + j, x0[2] / 2 + k); rcoarse[q] = rc(x0[0] / 2 + i, x0[1] / 2 + j, x0[2] / 2 + k);
} }
#ifdef FFTW3
#ifdef SINGLE_PRECISION FFTW_API(execute)(pc);
fftwf_execute(pc); FFTW_API(execute)(pf);
fftwf_execute(pf);
#else
fftw_execute(pc);
fftw_execute(pf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pc, rcoarse, NULL);
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), pf, rfine, NULL);
#else
rfftwnd_one_real_to_complex(pc, rcoarse, NULL);
rfftwnd_one_real_to_complex(pf, rfine, NULL);
#endif
#endif
double fftnorm = 1.0 / ((double)nx * (double)ny * (double)nz); double fftnorm = 1.0 / ((double)nx * (double)ny * (double)nz);
double sqrt8 = sqrt(8.0); double sqrt8 = sqrt(8.0);
@ -747,19 +652,8 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
IM(cfine[q]) *= fftnorm; IM(cfine[q]) *= fftnorm;
} }
#ifdef FFTW3
#ifdef SINGLE_PRECISION FFTW_API(execute)(ipf);
fftwf_execute(ipf);
#else
fftw_execute(ipf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), ipf, cfine, NULL);
#else
rfftwnd_one_complex_to_real(ipf, cfine, NULL);
#endif
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < (int)nx; i++) for (int i = 0; i < (int)nx; i++)
@ -772,21 +666,9 @@ music_wnoise_generator<T>::music_wnoise_generator(music_wnoise_generator<T> &rc,
delete[] rfine; delete[] rfine;
#ifdef FFTW3 FFTW_API(destroy_plan)(pf);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(pc);
fftwf_destroy_plan(pf); FFTW_API(destroy_plan)(ipf);
fftwf_destroy_plan(pc);
fftwf_destroy_plan(ipf);
#else
fftw_destroy_plan(pf);
fftw_destroy_plan(pc);
fftw_destroy_plan(ipf);
#endif
#else
fftwnd_destroy_plan(pf);
fftwnd_destroy_plan(pc);
fftwnd_destroy_plan(ipf);
#endif
} }

View file

@ -238,63 +238,25 @@ public:
void RNG_panphasia::forward_transform_field(real_t *field, int nx, int ny, int nz) void RNG_panphasia::forward_transform_field(real_t *field, int nx, int ny, int nz)
{ {
fftw_real *rfield = reinterpret_cast<fftw_real *>(field); real_t *rfield = reinterpret_cast<real_t *>(field);
fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field); complex_t *cfield = reinterpret_cast<complex_t *>(field);
#ifdef FFTW3 fftw_plan_t pf = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
#ifdef SINGLE_PRECISION
fftwf_plan pf = fftwf_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
#else
fftw_plan pf = fftw_plan_dft_r2c_3d(nx, ny, nz, rfield, cfield, FFTW_ESTIMATE);
#endif
#else
rfftwnd_plan pf = rfftw3d_create_plan(nx, ny, nz, FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#ifdef FFTW3 FFTW_API(execute)(pf);
#ifdef SINGLE_PRECISION
fftwf_execute(pf); FFTW_API(destroy_plan)(pf);
#else
fftw_execute(pf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(num_threads_, pf, rfield, NULL);
#else
rfftwnd_one_real_to_complex(pf, rfield, NULL);
#endif
#endif
} }
void RNG_panphasia::backward_transform_field(real_t *field, int nx, int ny, int nz) void RNG_panphasia::backward_transform_field(real_t *field, int nx, int ny, int nz)
{ {
fftw_real *rfield = reinterpret_cast<fftw_real *>(field); real_t *rfield = reinterpret_cast<real_t *>(field);
fftw_complex *cfield = reinterpret_cast<fftw_complex *>(field); complex_t *cfield = reinterpret_cast<complex_t *>(field);
#ifdef FFTW3 fftw_plan_t ipf = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
#ifdef SINGLE_PRECISION FFTW_API(execute)(ipf);
fftwf_plan ipf = fftwf_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE); FFTW_API(destroy_plan(ipf));
#else
fftw_plan ipf = fftw_plan_dft_c2r_3d(nx, ny, nz, cfield, rfield, FFTW_ESTIMATE);
#endif
#else
rfftwnd_plan ipf = rfftw3d_create_plan(nx, ny, nz, FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#endif
#ifdef FFTW3
#ifdef SINGLE_PRECISION
fftwf_execute(ipf);
#else
fftw_execute(ipf);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(num_threads_, ipf, cfield, NULL);
#else
rfftwnd_one_complex_to_real(ipf, cfield, NULL);
#endif
#endif
} }
#include <sys/time.h> #include <sys/time.h>
@ -309,8 +271,8 @@ inline double get_wtime(void)
void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R) void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
{ {
fftw_real *pr0, *pr1, *pr2, *pr3, *pr4; real_t *pr0, *pr1, *pr2, *pr3, *pr4;
fftw_complex *pc0, *pc1, *pc2, *pc3, *pc4; complex_t *pc0, *pc1, *pc2, *pc3, *pc4;
// determine resolution and offset so that we can do proper resampling // determine resolution and offset so that we can do proper resampling
int ileft[3], ileft_corner[3], nx[3], nxremap[3]; int ileft[3], ileft_corner[3], nx[3], nxremap[3];
@ -379,17 +341,17 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
size_t ngp = nxremap[0] * nxremap[1] * (nxremap[2] + 2); size_t ngp = nxremap[0] * nxremap[1] * (nxremap[2] + 2);
pr0 = new fftw_real[ngp]; pr0 = new real_t[ngp];
pr1 = new fftw_real[ngp]; pr1 = new real_t[ngp];
pr2 = new fftw_real[ngp]; pr2 = new real_t[ngp];
pr3 = new fftw_real[ngp]; pr3 = new real_t[ngp];
pr4 = new fftw_real[ngp]; pr4 = new real_t[ngp];
pc0 = reinterpret_cast<fftw_complex *>(pr0); pc0 = reinterpret_cast<complex_t *>(pr0);
pc1 = reinterpret_cast<fftw_complex *>(pr1); pc1 = reinterpret_cast<complex_t *>(pr1);
pc2 = reinterpret_cast<fftw_complex *>(pr2); pc2 = reinterpret_cast<complex_t *>(pr2);
pc3 = reinterpret_cast<fftw_complex *>(pr3); pc3 = reinterpret_cast<complex_t *>(pr3);
pc4 = reinterpret_cast<fftw_complex *>(pr4); pc4 = reinterpret_cast<complex_t *>(pr4);
music::ilog.Print("calculating PANPHASIA random numbers for level %d...", level); music::ilog.Print("calculating PANPHASIA random numbers for level %d...", level);
clear_panphasia_thread_states(); clear_panphasia_thread_states();
@ -782,7 +744,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
{ {
music::ulog.Print("Remapping fields from dimension %d -> %d", nxremap[0], nx_m[0]); music::ulog.Print("Remapping fields from dimension %d -> %d", nxremap[0], nx_m[0]);
memset(pr1, 0, ngp * sizeof(fftw_real)); memset(pr1, 0, ngp * sizeof(real_t));
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nxremap[0]; i++) for (int i = 0; i < nxremap[0]; i++)
@ -812,7 +774,7 @@ void RNG_panphasia::fill_grid(int level, DensityGrid<real_t> &R)
} }
} }
memcpy(pr0, pr1, ngp * sizeof(fftw_real)); memcpy(pr0, pr1, ngp * sizeof(real_t));
} }
// if (level == 9) // if (level == 9)

View file

@ -10,8 +10,8 @@
/****** ABSTRACT FACTORY PATTERN IMPLEMENTATION *******/ /****** ABSTRACT FACTORY PATTERN IMPLEMENTATION *******/
#include "poisson.hh" #include <poisson.hh>
#include "Numerics.hh" #include <Numerics.hh>
std::map<std::string, poisson_plugin_creator *> & std::map<std::string, poisson_plugin_creator *> &
get_poisson_plugin_map() get_poisson_plugin_map()
@ -40,23 +40,18 @@ void print_poisson_plugins()
/****** CALL IMPLEMENTATIONS OF POISSON SOLVER CLASSES ******/ /****** CALL IMPLEMENTATIONS OF POISSON SOLVER CLASSES ******/
#include "mg_solver.hh" #include <mg_solver.hh>
#include "fd_schemes.hh" #include <fd_schemes.hh>
#ifdef SINGLE_PRECISION
typedef multigrid::solver<stencil_7P<float>, interp_O3_fluxcorr, mg_straight, float> poisson_solver_O2; typedef multigrid::solver<stencil_7P, interp_O3_fluxcorr, mg_straight> poisson_solver_O2;
typedef multigrid::solver<stencil_13P<float>, interp_O5_fluxcorr, mg_straight, float> poisson_solver_O4; typedef multigrid::solver<stencil_13P, interp_O5_fluxcorr, mg_straight> poisson_solver_O4;
typedef multigrid::solver<stencil_19P<float>, interp_O7_fluxcorr, mg_straight, float> poisson_solver_O6; typedef multigrid::solver<stencil_19P, interp_O7_fluxcorr, mg_straight> poisson_solver_O6;
#else
typedef multigrid::solver<stencil_7P<double>, interp_O3_fluxcorr, mg_straight, double> poisson_solver_O2;
typedef multigrid::solver<stencil_13P<double>, interp_O5_fluxcorr, mg_straight, double> poisson_solver_O4;
typedef multigrid::solver<stencil_19P<double>, interp_O7_fluxcorr, mg_straight, double> poisson_solver_O6;
#endif
/**************************************************************************************/ /**************************************************************************************/
/**************************************************************************************/ /**************************************************************************************/
double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u) real_t multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
{ {
music::ulog.Print("Initializing multi-grid Poisson solver..."); music::ulog.Print("Initializing multi-grid Poisson solver...");
@ -68,11 +63,11 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
std::cout << " - Invoking multi-grid Poisson solver..." << std::endl; std::cout << " - Invoking multi-grid Poisson solver..." << std::endl;
} }
double acc = 1e-5, err; real_t acc = 1e-5, err;
std::string ps_smoother_name; std::string ps_smoother_name;
unsigned ps_presmooth, ps_postsmooth, order; unsigned ps_presmooth, ps_postsmooth, order;
acc = cf_.get_value_safe<double>("poisson", "accuracy", acc); acc = cf_.get_value_safe<real_t>("poisson", "accuracy", acc);
ps_presmooth = cf_.get_value_safe<unsigned>("poisson", "pre_smooth", 3); ps_presmooth = cf_.get_value_safe<unsigned>("poisson", "pre_smooth", 3);
ps_postsmooth = cf_.get_value_safe<unsigned>("poisson", "post_smooth", 3); ps_postsmooth = cf_.get_value_safe<unsigned>("poisson", "post_smooth", 3);
ps_smoother_name = cf_.get_value_safe<std::string>("poisson", "smoother", "gs"); ps_smoother_name = cf_.get_value_safe<std::string>("poisson", "smoother", "gs");
@ -102,12 +97,12 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
<< " reverting to \'gs\' (Gauss-Seidel)" << std::endl; << " reverting to \'gs\' (Gauss-Seidel)" << std::endl;
} }
double tstart, tend; real_t tstart, tend;
#ifndef SINGLETHREAD_FFTW #ifndef SINGLETHREAD_FFTW
tstart = omp_get_wtime(); tstart = omp_get_wtime();
#else #else
tstart = (double)clock() / CLOCKS_PER_SEC; tstart = (real_t)clock() / CLOCKS_PER_SEC;
#endif #endif
//----- run Poisson solver -----// //----- run Poisson solver -----//
@ -142,7 +137,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
if (verbosity > 1) if (verbosity > 1)
std::cout << " - Poisson solver took " << tend - tstart << "s with " << omp_get_max_threads() << " threads." << std::endl; std::cout << " - Poisson solver took " << tend - tstart << "s with " << omp_get_max_threads() << " threads." << std::endl;
#else #else
tend = (double)clock() / CLOCKS_PER_SEC; tend = (real_t)clock() / CLOCKS_PER_SEC;
if (verbosity > 1) if (verbosity > 1)
std::cout << " - Poisson solver took " << tend - tstart << "s." << std::endl; std::cout << " - Poisson solver took " << tend - tstart << "s." << std::endl;
@ -151,7 +146,7 @@ double multigrid_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
return err; return err;
} }
double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du) real_t multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
{ {
Du = u; Du = u;
@ -176,7 +171,7 @@ double multigrid_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hiera
return 0.0; return 0.0;
} }
double multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du) real_t multigrid_poisson_plugin::gradient_add(int dir, grid_hierarchy &u, grid_hierarchy &Du)
{ {
// Du = u; // Du = u;
@ -207,7 +202,7 @@ void multigrid_poisson_plugin::implementation::gradient_O2(int dir, grid_hierarc
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
if (dir == 0) if (dir == 0)
@ -241,7 +236,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O2(int dir, grid_hie
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
if (dir == 0) if (dir == 0)
@ -275,7 +270,7 @@ void multigrid_poisson_plugin::implementation::gradient_O4(int dir, grid_hierarc
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
h /= 12.0; h /= 12.0;
@ -311,7 +306,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O4(int dir, grid_hie
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
h /= 12.0; h /= 12.0;
@ -347,7 +342,7 @@ void multigrid_poisson_plugin::implementation::gradient_O6(int dir, grid_hierarc
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
h /= 60.; h /= 60.;
@ -385,7 +380,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel) for (unsigned ilevel = u.levelmin(); ilevel <= u.levelmax(); ++ilevel)
{ {
double h = pow(2.0, ilevel); real_t h = pow(2.0, ilevel);
meshvar_bnd *pvar = Du.get_grid(ilevel); meshvar_bnd *pvar = Du.get_grid(ilevel);
h /= 60.; h /= 60.;
@ -421,7 +416,7 @@ void multigrid_poisson_plugin::implementation::gradient_add_O6(int dir, grid_hie
/**************************************************************************************/ /**************************************************************************************/
#include "general.hh" #include "general.hh"
double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u) real_t fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
{ {
music::ulog.Print("Entering k-space Poisson solver..."); music::ulog.Print("Entering k-space Poisson solver...");
@ -446,8 +441,8 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
nzp = 2 * (nz / 2 + 1); nzp = 2 * (nz / 2 + 1);
//... copy data .................................................. //... copy data ..................................................
fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp]; real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data); complex_t *cdata = reinterpret_cast<complex_t *>(data);
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nx; ++i) for (int i = 0; i < nx; ++i)
@ -461,37 +456,14 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
//... perform FFT and Poisson solve................................ //... perform FFT and Poisson solve................................
music::ulog.Print("Performing forward transform."); music::ulog.Print("Performing forward transform.");
#ifdef FFTW3 fftw_plan_t
#ifdef SINGLE_PRECISION plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
fftwf_plan iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
fftwf_execute(plan); FFTW_API(execute)(plan);
#else
fftw_plan
plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
fftw_execute(plan); real_t kfac = 2.0 * M_PI;
#endif real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
#else
rfftwnd_plan
plan = rfftw3d_create_plan(nx, ny, nz,
FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
iplan = rfftw3d_create_plan(nx, ny, nz,
FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
#else
rfftwnd_one_real_to_complex(plan, data, NULL);
#endif
#endif
double kfac = 2.0 * M_PI;
double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nx; ++i) for (int i = 0; i < nx; ++i)
@ -504,11 +476,11 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
int jj = j; int jj = j;
if (jj > ny / 2) if (jj > ny / 2)
jj -= ny; jj -= ny;
double ki = (double)ii; real_t ki = (real_t)ii;
double kj = (double)jj; real_t kj = (real_t)jj;
double kk = (double)k; real_t kk = (real_t)k;
double kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk); real_t kk2 = kfac * kfac * (ki * ki + kj * kj + kk * kk);
size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k; size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
@ -521,26 +493,9 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
music::ulog.Print("Performing backward transform."); music::ulog.Print("Performing backward transform.");
#ifdef FFTW3 FFTW_API(execute)(iplan);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_execute(iplan); FFTW_API(destroy_plan)(iplan);
fftwf_destroy_plan(plan);
fftwf_destroy_plan(iplan);
#else
fftw_execute(iplan);
fftw_destroy_plan(plan);
fftw_destroy_plan(iplan);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
#else
rfftwnd_one_complex_to_real(iplan, cdata, NULL);
#endif
rfftwnd_destroy_plan(plan);
rfftwnd_destroy_plan(iplan);
#endif
//... copy data .......................................... //... copy data ..........................................
#pragma omp parallel for #pragma omp parallel for
@ -596,7 +551,7 @@ double fft_poisson_plugin::solve(grid_hierarchy &f, grid_hierarchy &u)
return 0.0; return 0.0;
} }
double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du) real_t fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &Du)
{ {
music::ulog.Print("Computing a gradient in k-space...\n"); music::ulog.Print("Computing a gradient in k-space...\n");
@ -612,8 +567,8 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
nzp = 2 * (nz / 2 + 1); nzp = 2 * (nz / 2 + 1);
//... copy data .................................................. //... copy data ..................................................
fftw_real *data = new fftw_real[(size_t)nx * (size_t)ny * (size_t)nzp]; real_t *data = new real_t[(size_t)nx * (size_t)ny * (size_t)nzp];
fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data); complex_t *cdata = reinterpret_cast<complex_t *>(data);
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nx; ++i) for (int i = 0; i < nx; ++i)
@ -625,38 +580,14 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
} }
//... perform FFT and Poisson solve................................ //... perform FFT and Poisson solve................................
fftw_plan_t
plan = FFTW_API(plan_dft_r2c_3d)(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
iplan = FFTW_API(plan_dft_c2r_3d)(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
#ifdef FFTW3 FFTW_API(execute)(plan);
#ifdef SINGLE_PRECISION
fftwf_plan
plan = fftwf_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
iplan = fftwf_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
fftwf_execute(plan); real_t fac = -1.0 / (real_t)((size_t)nx * (size_t)ny * (size_t)nz);
#else real_t kfac = 2.0 * M_PI;
fftw_plan
plan = fftw_plan_dft_r2c_3d(nx, ny, nz, data, cdata, FFTW_ESTIMATE),
iplan = fftw_plan_dft_c2r_3d(nx, ny, nz, cdata, data, FFTW_ESTIMATE);
fftw_execute(plan);
#endif
#else
rfftwnd_plan
plan = rfftw3d_create_plan(nx, ny, nz,
FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE),
iplan = rfftw3d_create_plan(nx, ny, nz,
FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
#else
rfftwnd_one_real_to_complex(plan, data, NULL);
#endif
#endif
double fac = -1.0 / (double)((size_t)nx * (size_t)ny * (size_t)nz);
double kfac = 2.0 * M_PI;
bool do_glass = cf_.get_value_safe<bool>("output", "glass", false); bool do_glass = cf_.get_value_safe<bool>("output", "glass", false);
bool deconvolve_cic = do_glass | cf_.get_value_safe<bool>("output", "glass_cicdeconvolve", false); bool deconvolve_cic = do_glass | cf_.get_value_safe<bool>("output", "glass_cicdeconvolve", false);
@ -671,98 +602,54 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
{ {
size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k; size_t idx = (size_t)(i * ny + j) * (size_t)(nzp / 2) + (size_t)k;
int ii = i; int ii = i;
if (ii > nx / 2) if (ii > nx / 2) ii -= nx;
ii -= nx;
int jj = j; int jj = j;
if (jj > ny / 2) if (jj > ny / 2) jj -= ny;
jj -= ny;
const double ki = (double)ii;
const double kj = (double)jj;
const double kk = (double)k;
const double kkdir[3] = {kfac * ki, kfac * kj, kfac * kk}; const real_t ki{(real_t)ii};
const double kdir = kkdir[dir]; const real_t kj{(real_t)jj};
const real_t kk{(real_t)k};
const real_t kkdir[3] = {kfac * ki, kfac * kj, kfac * kk};
const real_t kdir = kkdir[dir];
double re = RE(cdata[idx]); real_t re = RE(cdata[idx]);
double im = IM(cdata[idx]); real_t im = IM(cdata[idx]);
RE(cdata[idx]) = fac * im * kdir; RE(cdata[idx]) = fac * im * kdir;
IM(cdata[idx]) = -fac * re * kdir; IM(cdata[idx]) = -fac * re * kdir;
#ifdef FFTW3
if (deconvolve_cic) if (deconvolve_cic)
{ {
double dfx, dfy, dfz; real_t dfx, dfy, dfz;
dfx = M_PI * ki / (double)nx; dfx = M_PI * ki / (real_t)nx;
dfx = (i != 0) ? sin(dfx) / dfx : 1.0; dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
dfy = M_PI * kj / (double)ny; dfy = M_PI * kj / (real_t)ny;
dfy = (j != 0) ? sin(dfy) / dfy : 1.0; dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
dfz = M_PI * kk / (double)nz; dfz = M_PI * kk / (real_t)nz;
dfz = (k != 0) ? sin(dfz) / dfz : 1.0; dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
dfx = 1.0 / (dfx * dfy * dfz); dfx = 1.0 / (dfx * dfy * dfz);
dfx = dfx * dfx; dfx = dfx * dfx;
cdata[idx][0] *= dfx; RE(cdata[idx]) *= dfx;
cdata[idx][1] *= dfx; IM(cdata[idx]) *= dfx;
} }
#else
if (deconvolve_cic)
{
double dfx, dfy, dfz;
dfx = M_PI * ki / (double)nx;
dfx = (i != 0) ? sin(dfx) / dfx : 1.0;
dfy = M_PI * kj / (double)ny;
dfy = (j != 0) ? sin(dfy) / dfy : 1.0;
dfz = M_PI * kk / (double)nz;
dfz = (k != 0) ? sin(dfz) / dfz : 1.0;
dfx = 1.0 / (dfx * dfy * dfz);
dfx = dfx * dfx;
cdata[idx].re *= dfx;
cdata[idx].im *= dfx;
}
#endif
if( (dir == 0 && i==nx/2) || (dir == 1 && j==ny/2) || (dir == 2 && k==nz/2) ) if( (dir == 0 && i==nx/2) || (dir == 1 && j==ny/2) || (dir == 2 && k==nz/2) )
{ {
#ifdef FFTW3 RE(cdata[idx]) = 0.0;
cdata[idx][0] = 0.0; IM(cdata[idx]) = 0.0;
cdata[idx][1] = 0.0;
#else
cdata[idx].re = 0.0;
cdata[idx].im = 0.0;
#endif
} }
} }
RE(cdata[0]) = 0.0; RE(cdata[0]) = 0.0;
IM(cdata[0]) = 0.0; IM(cdata[0]) = 0.0;
#ifdef FFTW3 FFTW_API(execute)(iplan);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_execute(iplan); FFTW_API(destroy_plan)(iplan);
fftwf_destroy_plan(plan);
fftwf_destroy_plan(iplan);
#else
fftw_execute(iplan);
fftw_destroy_plan(plan);
fftw_destroy_plan(iplan);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
#else
rfftwnd_one_complex_to_real(iplan, cdata, NULL);
#endif
rfftwnd_destroy_plan(plan);
rfftwnd_destroy_plan(iplan);
#endif
//... copy data .......................................... //... copy data ..........................................
double dmax = 0.0; real_t dmax = 0.0;
for (int i = 0; i < nx; ++i) for (int i = 0; i < nx; ++i)
for (int j = 0; j < ny; ++j) for (int j = 0; j < ny; ++j)
for (int k = 0; k < nz; ++k) for (int k = 0; k < nz; ++k)
@ -784,35 +671,35 @@ double fft_poisson_plugin::gradient(int dir, grid_hierarchy &u, grid_hierarchy &
/**************************************************************************************/ /**************************************************************************************/
template <int order> template <int order>
double poisson_hybrid_kernel(int idir, int i, int j, int k, int n) real_t poisson_hybrid_kernel(int idir, int i, int j, int k, int n)
{ {
return 1.0; return 1.0;
} }
template <> template <>
inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n) inline real_t poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
{ {
if (i == 0 && j == 0 && k == 0) if (i == 0 && j == 0 && k == 0)
return 0.0; return 0.0;
double real_t
ki(M_PI * (double)i / (double)n), ki(M_PI * (real_t)i / (real_t)n),
kj(M_PI * (double)j / (double)n), kj(M_PI * (real_t)j / (real_t)n),
kk(M_PI * (double)k / (double)n), kk(M_PI * (real_t)k / (real_t)n),
kr(sqrt(ki * ki + kj * kj + kk * kk)); kr(sqrt(ki * ki + kj * kj + kk * kk));
double grad = 1.0, laplace = 1.0; real_t grad = 1.0, laplace = 1.0;
if (idir == 0) if (idir == 0)
grad = sin(ki); grad = std::sin(ki);
else if (idir == 1) else if (idir == 1)
grad = sin(kj); grad = std::sin(kj);
else else
grad = sin(kk); grad = std::sin(kk);
laplace = 2.0 * ((-cos(ki) + 1.0) + (-cos(kj) + 1.0) + (-cos(kk) + 1.0)); laplace = 2.0 * ((-std::cos(ki) + 1.0) + (-std::cos(kj) + 1.0) + (-std::cos(kk) + 1.0));
double kgrad = 1.0; real_t kgrad = 1.0;
if (idir == 0) if (idir == 0)
kgrad = ki; kgrad = ki;
else if (idir == 1) else if (idir == 1)
@ -824,30 +711,30 @@ inline double poisson_hybrid_kernel<2>(int idir, int i, int j, int k, int n)
} }
template <> template <>
inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n) inline real_t poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
{ {
if (i == 0 && j == 0 && k == 0) if (i == 0 && j == 0 && k == 0)
return 0.0; return 0.0;
double real_t
ki(M_PI * (double)i / (double)n), ki(M_PI * (real_t)i / (real_t)n),
kj(M_PI * (double)j / (double)n), kj(M_PI * (real_t)j / (real_t)n),
kk(M_PI * (double)k / (double)n), kk(M_PI * (real_t)k / (real_t)n),
kr(sqrt(ki * ki + kj * kj + kk * kk)); kr(sqrt(ki * ki + kj * kj + kk * kk));
double grad = 1.0, laplace = 1.0; real_t grad = 1.0, laplace = 1.0;
if (idir == 0) if (idir == 0)
grad = 0.166666666667 * (-sin(2. * ki) + 8. * sin(ki)); grad = 0.166666666667 * (-std::sin(2. * ki) + 8. * std::sin(ki));
else if (idir == 1) else if (idir == 1)
grad = 0.166666666667 * (-sin(2. * kj) + 8. * sin(kj)); grad = 0.166666666667 * (-std::sin(2. * kj) + 8. * std::sin(kj));
else if (idir == 2) else if (idir == 2)
grad = 0.166666666667 * (-sin(2. * kk) + 8. * sin(kk)); grad = 0.166666666667 * (-std::sin(2. * kk) + 8. * std::sin(kk));
laplace = 0.1666666667 * ((cos(2 * ki) - 16. * cos(ki) + 15.) + (cos(2 * kj) - 16. * cos(kj) + 15.) + (cos(2 * kk) - 16. * cos(kk) + 15.)); laplace = 0.1666666667 * ((std::cos(2 * ki) - 16. * std::cos(ki) + 15.) + (std::cos(2 * kj) - 16. * std::cos(kj) + 15.) + (std::cos(2 * kk) - 16. * std::cos(kk) + 15.));
double kgrad = 1.0; real_t kgrad = 1.0;
if (idir == 0) if (idir == 0)
kgrad = ki; kgrad = ki;
else if (idir == 1) else if (idir == 1)
@ -859,29 +746,29 @@ inline double poisson_hybrid_kernel<4>(int idir, int i, int j, int k, int n)
} }
template <> template <>
inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n) inline real_t poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
{ {
double real_t
ki(M_PI * (double)i / (double)n), ki(M_PI * (real_t)i / (real_t)n),
kj(M_PI * (double)j / (double)n), kj(M_PI * (real_t)j / (real_t)n),
kk(M_PI * (double)k / (double)n), kk(M_PI * (real_t)k / (real_t)n),
kr(sqrt(ki * ki + kj * kj + kk * kk)); kr(sqrt(ki * ki + kj * kj + kk * kk));
if (i == 0 && j == 0 && k == 0) if (i == 0 && j == 0 && k == 0)
return 0.0; return 0.0;
double grad = 1.0, laplace = 1.0; real_t grad = 1.0, laplace = 1.0;
if (idir == 0) if (idir == 0)
grad = 0.0333333333333 * (sin(3. * ki) - 9. * sin(2. * ki) + 45. * sin(ki)); grad = 0.0333333333333 * (std::sin(3. * ki) - 9. * std::sin(2. * ki) + 45. * std::sin(ki));
else if (idir == 1) else if (idir == 1)
grad = 0.0333333333333 * (sin(3. * kj) - 9. * sin(2. * kj) + 45. * sin(kj)); grad = 0.0333333333333 * (std::sin(3. * kj) - 9. * std::sin(2. * kj) + 45. * std::sin(kj));
else if (idir == 2) else if (idir == 2)
grad = 0.0333333333333 * (sin(3. * kk) - 9. * sin(2. * kk) + 45. * sin(kk)); grad = 0.0333333333333 * (std::sin(3. * kk) - 9. * std::sin(2. * kk) + 45. * std::sin(kk));
laplace = 0.01111111111111 * ((-2. * cos(3.0 * ki) + 27. * cos(2. * ki) - 270. * cos(ki) + 245.) + (-2. * cos(3.0 * kj) + 27. * cos(2. * kj) - 270. * cos(kj) + 245.) + (-2. * cos(3.0 * kk) + 27. * cos(2. * kk) - 270. * cos(kk) + 245.)); laplace = 0.01111111111111 * ((-2. * std::cos(3.0 * ki) + 27. * std::cos(2. * ki) - 270. * std::cos(ki) + 245.) + (-2. * std::cos(3.0 * kj) + 27. * std::cos(2. * kj) - 270. * std::cos(kj) + 245.) + (-2. * std::cos(3.0 * kk) + 27. * std::cos(2. * kk) - 270. * std::cos(kk) + 245.));
double kgrad = 1.0; real_t kgrad = 1.0;
if (idir == 0) if (idir == 0)
kgrad = ki; kgrad = ki;
else if (idir == 1) else if (idir == 1)
@ -896,42 +783,19 @@ inline double poisson_hybrid_kernel<6>(int idir, int i, int j, int k, int n)
} }
template <int order> template <int order>
void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic) void do_poisson_hybrid(real_t *data, int idir, int nxp, int nyp, int nzp, bool periodic, bool deconvolve_cic)
{ {
double fftnorm = 1.0 / ((double)nxp * (double)nyp * (double)nzp); real_t fftnorm = 1.0 / ((real_t)nxp * (real_t)nyp * (real_t)nzp);
fftw_complex *cdata = reinterpret_cast<fftw_complex *>(data); complex_t *cdata = reinterpret_cast<complex_t *>(data);
if (deconvolve_cic) if (deconvolve_cic)
music::ilog.Print("CIC deconvolution step is enabled."); music::ilog.Print("CIC deconvolution step is enabled.");
#ifdef FFTW3 fftw_plan_t iplan, plan;
#ifdef SINGLE_PRECISION plan = FFTW_API(plan_dft_r2c_3d)(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
fftwf_plan iplan, plan; iplan = FFTW_API(plan_dft_c2r_3d)(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
plan = fftwf_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE); FFTW_API(execute)(plan);
iplan = fftwf_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
fftwf_execute(plan);
#else
fftw_plan iplan, plan;
plan = fftw_plan_dft_r2c_3d(nxp, nyp, nzp, data, cdata, FFTW_ESTIMATE);
iplan = fftw_plan_dft_c2r_3d(nxp, nyp, nzp, cdata, data, FFTW_ESTIMATE);
fftw_execute(plan);
#endif
#else
rfftwnd_plan iplan, plan;
plan = rfftw3d_create_plan(nxp, nyp, nzp,
FFTW_REAL_TO_COMPLEX, FFTW_ESTIMATE | FFTW_IN_PLACE);
iplan = rfftw3d_create_plan(nxp, nyp, nzp,
FFTW_COMPLEX_TO_REAL, FFTW_ESTIMATE | FFTW_IN_PLACE);
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_real_to_complex(omp_get_max_threads(), plan, data, NULL);
#else
rfftwnd_one_real_to_complex(plan, data, NULL);
#endif
#endif
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nxp; ++i) for (int i = 0; i < nxp; ++i)
@ -948,22 +812,22 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
kj -= nyp; kj -= nyp;
//... apply hybrid correction //... apply hybrid correction
double dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2); real_t dk = poisson_hybrid_kernel<order>(idir, ki, kj, k, nxp / 2);
fftw_real re = RE(cdata[ii]), im = IM(cdata[ii]); real_t re = RE(cdata[ii]), im = IM(cdata[ii]);
RE(cdata[ii]) = -im * dk * fftnorm; RE(cdata[ii]) = -im * dk * fftnorm;
IM(cdata[ii]) = re * dk * fftnorm; IM(cdata[ii]) = re * dk * fftnorm;
if (deconvolve_cic) if (deconvolve_cic)
{ {
double dfx, dfy, dfz; real_t dfx, dfy, dfz;
dfx = M_PI * ki / (double)nxp; dfx = M_PI * ki / (real_t)nxp;
dfx = (i != 0) ? sin(dfx) / dfx : 1.0; dfx = (i != 0) ? std::sin(dfx) / dfx : 1.0;
dfy = M_PI * kj / (double)nyp; dfy = M_PI * kj / (real_t)nyp;
dfy = (j != 0) ? sin(dfy) / dfy : 1.0; dfy = (j != 0) ? std::sin(dfy) / dfy : 1.0;
dfz = M_PI * kk / (double)nzp; dfz = M_PI * kk / (real_t)nzp;
dfz = (k != 0) ? sin(dfz) / dfz : 1.0; dfz = (k != 0) ? std::sin(dfz) / dfz : 1.0;
dfx = 1.0 / (dfx * dfy * dfz); dfx = 1.0 / (dfx * dfy * dfz);
dfx = dfx * dfx; dfx = dfx * dfx;
@ -981,26 +845,9 @@ void do_poisson_hybrid(fftw_real *data, int idir, int nxp, int nyp, int nzp, boo
RE(cdata[0]) = 0.0; RE(cdata[0]) = 0.0;
IM(cdata[0]) = 0.0; IM(cdata[0]) = 0.0;
#ifdef FFTW3 FFTW_API(execute)(iplan);
#ifdef SINGLE_PRECISION FFTW_API(destroy_plan)(plan);
fftwf_execute(iplan); FFTW_API(destroy_plan)(iplan);
fftwf_destroy_plan(plan);
fftwf_destroy_plan(iplan);
#else
fftw_execute(iplan);
fftw_destroy_plan(plan);
fftw_destroy_plan(iplan);
#endif
#else
#ifndef SINGLETHREAD_FFTW
rfftwnd_threads_one_complex_to_real(omp_get_max_threads(), iplan, cdata, NULL);
#else
rfftwnd_one_complex_to_real(iplan, cdata, NULL);
#endif
rfftwnd_destroy_plan(plan);
rfftwnd_destroy_plan(iplan);
#endif
} }
template <typename T> template <typename T>
@ -1008,7 +855,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
{ {
int nx = f.size(0), ny = f.size(1), nz = f.size(2), nxp, nyp, nzp; int nx = f.size(0), ny = f.size(1), nz = f.size(2), nxp, nyp, nzp;
fftw_real *data; real_t *data;
int xo = 0, yo = 0, zo = 0; int xo = 0, yo = 0, zo = 0;
int nmax = std::max(nx, std::max(ny, nz)); int nmax = std::max(nx, std::max(ny, nz));
@ -1018,12 +865,12 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
if (!periodic) if (!periodic)
{ {
nxp = nmax + 2 * boundary; // 2*nmax; nxp = nmax + 2 * boundary;
nyp = nmax + 2 * boundary; // 2*nmax; nyp = nmax + 2 * boundary;
nzp = nmax + 2 * boundary; // 2*nmax; nzp = nmax + 2 * boundary;
xo = boundary; // nmax/2; xo = boundary;
yo = boundary; // nmax/2; yo = boundary;
zo = boundary; // nmax/2; zo = boundary;
} }
else else
{ {
@ -1032,17 +879,11 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
nzp = nmax; nzp = nmax;
} }
data = new fftw_real[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)]; data = new real_t[(size_t)nxp * (size_t)nyp * (size_t)(nzp + 2)];
if (idir == 0) if (idir == 0)
std::cout << " - Performing hybrid Poisson step... (" << nxp << ", " << nyp << ", " << nzp << ")\n"; std::cout << " - Performing hybrid Poisson step... (" << nxp << ", " << nyp << ", " << nzp << ")\n";
// size_t N = (size_t)nxp*(size_t)nyp*2*((size_t)nzp/2+1);
// #pragma omp parallel for
// for( size_t i=0; i<N; ++i )
// data[i]=0.0;
#pragma omp parallel for #pragma omp parallel for
for (int i = 0; i < nxp; ++i) for (int i = 0; i < nxp; ++i)
for (int j = 0; j < nyp; ++j) for (int j = 0; j < nyp; ++j)
@ -1097,7 +938,7 @@ void poisson_hybrid(T &f, int idir, int order, bool periodic, bool deconvolve_ci
/**************************************************************************************/ /**************************************************************************************/
/**************************************************************************************/ /**************************************************************************************/
template void poisson_hybrid<MeshvarBnd<double>>(MeshvarBnd<double> &f, int idir, int order, bool periodic, bool deconvolve_cic); template void poisson_hybrid<MeshvarBnd<real_t>>(MeshvarBnd<real_t> &f, int idir, int order, bool periodic, bool deconvolve_cic);
template void poisson_hybrid<MeshvarBnd<float>>(MeshvarBnd<float> &f, int idir, int order, bool periodic, bool deconvolve_cic); template void poisson_hybrid<MeshvarBnd<float>>(MeshvarBnd<float> &f, int idir, int order, bool periodic, bool deconvolve_cic);
namespace namespace

File diff suppressed because one or more lines are too long

194
src/system_stat.hh Normal file
View file

@ -0,0 +1,194 @@
// This file is part of MUSIC2
// A software package to generate ICs for cosmological simulations
// Copyright (C) 2020-23 by Oliver Hahn
//
// MUSIC2 is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// MUSIC2 is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
#pragma once
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#include <mach/mach.h>
#include <mach/vm_statistics.h>
#include <mach/mach_types.h>
#include <mach/mach_init.h>
#include <mach/mach_host.h>
#include <unistd.h>
#elif __linux__
#include <cstring>
#include <cstdio>
#include <strings.h>
#endif
#include <string>
namespace SystemStat
{
class Cpu
{
public:
Cpu() {}
std::string get_CPUstring() const
{
#ifdef __APPLE__
char buffer[1024];
size_t size = sizeof(buffer);
if (sysctlbyname("machdep.cpu.brand_string", &buffer, &size, NULL, 0) < 0)
{
return "";
}
return std::string(buffer);
#elif __linux__
std::string str = "";
FILE *cpuinfo = fopen("/proc/cpuinfo", "rb");
char *arg = 0;
size_t size = 0;
while (getdelim(&arg, &size, '\n', cpuinfo) != -1)
{
if (strncmp(arg, "model name", 10) == 0)
{
str = std::string(arg + 13);
break;
}
}
free(arg);
fclose(cpuinfo);
//remove newline characters from string
str.erase(std::remove(str.begin(), str.end(), '\n'), str.end());
return str;
#endif
}
};
class Memory
{
private:
size_t total;
size_t avail;
size_t used;
public:
Memory()
: total(0), avail(0), used(0)
{
this->get_statistics();
}
size_t get_TotalMem() const { return this->total; }
size_t get_AvailMem() const { return this->avail; }
size_t get_UsedMem() const { return this->used; }
void update() { this->get_statistics(); }
protected:
int get_statistics(void)
{
#ifdef __APPLE__
int64_t pagesize = int64_t(getpagesize());
int mib[2] = {CTL_HW, HW_MEMSIZE};
size_t length = sizeof(size_t);
sysctl(mib, 2, &this->total, &length, nullptr, 0);
vm_statistics64 vmstat;
natural_t mcount = HOST_VM_INFO64_COUNT;
if (host_statistics64(mach_host_self(), HOST_VM_INFO64, reinterpret_cast<host_info64_t>(&vmstat), &mcount) == KERN_SUCCESS)
{
#if 1 // count inactive as available
this->avail = (int64_t(vmstat.free_count) +
int64_t(vmstat.inactive_count)) *
pagesize;
this->used = (int64_t(vmstat.active_count) +
int64_t(vmstat.wire_count)) *
pagesize;
#else // count inactive as unavailable
this->avail = int64_t(vmstat.free_count) * pagesize;
this->used = (int64_t(vmstat.active_count) +
int64_t(vmstat.inactive_count) +
int64_t(vmstat.wire_count)) *
pagesize;
#endif
}
#elif __linux__
FILE *fd;
char buf[1024];
if ((fd = fopen("/proc/meminfo", "r")))
{
while (1)
{
if (fgets(buf, 500, fd) != buf)
break;
if (bcmp(buf, "MemTotal", 8) == 0)
{
this->total = atoll(buf + 10) * 1024; // in Mb
}
if (strncmp(buf, "Committed_AS", 12) == 0)
{
this->used = atoll(buf + 14) * 1024; // in Mb
}
// if(strncmp(buf, "SwapTotal", 9) == 0)
// {
// *SwapTotal = atoll(buf + 11);
// }
// if(strncmp(buf, "SwapFree", 8) == 0)
// {
// *SwapFree = atoll(buf + 10);
// }
}
fclose(fd);
}
this->avail = this->total - this->used;
#endif
return 0;
}
};
#include <cstdlib>
#include <string>
#include <sys/utsname.h>
class Kernel
{
public:
struct info_t
{
std::string kernel;
std::uint32_t major;
std::uint32_t minor;
std::uint32_t patch;
std::uint32_t build_number;
};
Kernel() {}
info_t get_kernel_info()
{
utsname uts;
uname(&uts);
char *marker = uts.release;
const std::uint32_t major = std::strtoul(marker, &marker, 10);
const std::uint32_t minor = std::strtoul(marker + 1, &marker, 10);
const std::uint32_t patch = std::strtoul(marker + 1, &marker, 10);
const std::uint32_t build_number = std::strtoul(marker + 1, nullptr, 10);
std::string kernel = uts.sysname;
return {kernel, major, minor, patch, build_number};
}
};
} /* namespace SystemStat */

View file

@ -294,10 +294,10 @@ protected:
double fftnorm = 1.0/N; double fftnorm = 1.0/N;
fftw_complex *in, *out; complex_t *in, *out;
in = new fftw_complex[N]; in = new complex_t[N];
out = new fftw_complex[N]; out = new complex_t[N];
//... perform anti-ringing correction from Hamilton (2000) //... perform anti-ringing correction from Hamilton (2000)
k0r0 = krgood( mu, q, dlnr, k0r0 ); k0r0 = krgood( mu, q, dlnr, k0r0 );
@ -341,24 +341,10 @@ protected:
} }
ofsk.close(); ofsk.close();
#ifdef FFTW3 fftw_plan_t p,ip;
#ifdef SINGLE_PRECISION p = FFTW_API(plan_dft_1d)(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
fftwf_plan p,ip; ip = FFTW_API(plan_dft_1d)(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
p = fftwf_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE); FFTW_API(execute)(p);
ip = fftwf_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
fftwf_execute(p);
#else
fftw_plan p,ip;
p = fftw_plan_dft_1d(N, in, out, FFTW_FORWARD, FFTW_ESTIMATE);
ip = fftw_plan_dft_1d(N, out, in, FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_execute(p);
#endif
#else
fftw_plan p,ip;
p = fftw_create_plan(N, FFTW_FORWARD, FFTW_ESTIMATE);
ip = fftw_create_plan(N, FFTW_BACKWARD, FFTW_ESTIMATE);
fftw_one(p, in, out);
#endif
//... compute the Hankel transform by convolution with the Bessel function //... compute the Hankel transform by convolution with the Bessel function
for( unsigned i=0; i<N; ++i ) for( unsigned i=0; i<N; ++i )
@ -430,15 +416,7 @@ protected:
} }
#ifdef FFTW3 FFTW_API(execute)(ip);
#ifdef SINGLE_PRECISION
fftwf_execute(ip);
#else
fftw_execute(ip);
#endif
#else
fftw_one(ip, out, in);
#endif
rr.assign(N,0.0); rr.assign(N,0.0);
TT.assign(N,0.0); TT.assign(N,0.0);
@ -482,13 +460,8 @@ protected:
delete[] in; delete[] in;
delete[] out; delete[] out;
#if defined(FFTW3) && defined(SINGLE_PRECISION) FFTW_API(destroy_plan)(p);
fftwf_destroy_plan(p); FFTW_API(destroy_plan)(ip);
fftwf_destroy_plan(ip);
#else
fftw_destroy_plan(p);
fftw_destroy_plan(ip);
#endif
} }
std::vector<real_t> m_xtable,m_ytable,m_dytable; std::vector<real_t> m_xtable,m_ytable,m_dytable;
double m_xmin, m_xmax, m_dx, m_rdx; double m_xmin, m_xmax, m_dx, m_rdx;