1
0
Fork 0
mirror of https://github.com/cosmo-sims/MUSIC.git synced 2024-09-19 17:03:46 +02:00

Speeded up transfer function calculation significantly.

Added an approximate log10 algorithm.
Use spherical symmetry of problem to avoid computing duplicate values.
This commit is contained in:
Oliver Hahn 2010-08-14 16:52:25 -07:00
parent ae0d833155
commit 785601d4ae
6 changed files with 156 additions and 34 deletions

View file

@ -41,6 +41,36 @@
real_t integrate( double (* func) (double x, void * params), double a, double b, void *params=NULL);
typedef __attribute__((__may_alias__)) int aint;
inline float fast_log2 (float val)
{
//if( sizeof(int) != sizeof(float) )
// throw std::runtime_error("fast_log2 will fail on this system!!");
aint * const exp_ptr = reinterpret_cast <aint *> (&val);
aint x = *exp_ptr;
const int log_2 = ((x >> 23) & 255) - 128;
x &= ~(255 << 23);
x += 127 << 23;
*exp_ptr = x;
val = ((-1.0f/3) * val + 2) * val - 2.0f/3; // (1)
return (val + log_2);
}
inline float fast_log (const float &val)
{
return (fast_log2 (val) * 0.69314718f);
}
inline float fast_log10 (const float &val)
{
return (fast_log2 (val) * 0.3010299956639812f);
}
struct Base_interp
{
int n, mm, jsav, cor, dj;

View file

@ -159,7 +159,7 @@ namespace convolution{
bool
bperiodic = cparam_.pcf->getValueSafe<bool>("setup","periodic_TF",true);
const int ref_fac = (int)(pow(2,cparam_.coarse_fact)+0.5), ql = -ref_fac/2+1, qr=ql+ref_fac, rf8=pow(ref_fac,3);
const int ref_fac = (int)(pow(2,cparam_.coarse_fact)+0.5), ql = -ref_fac/2+1, qr=ql+ref_fac, rf8=(int)pow(ref_fac,3);
std::cout << " - Computing transfer function kernel...\n";
@ -179,13 +179,15 @@ namespace convolution{
int nzp = cparam_.nz/2+1;
double kmax = 0.5*M_PI/std::max(cparam_.nx,std::max(cparam_.ny,cparam_.nz));
T0 = tfr->compute_real(0.0);
//... obtain a grid-version of the real space transfer function
if( bperiodic )
{
#pragma omp parallel for
for( int i=0; i<cparam_.nx; ++i )
for( int j=0; j<cparam_.ny; ++j )
for( int k=0; k<cparam_.nz; ++k )
{
#pragma omp parallel for //schedule(static,4)
for( int i=0; i<=cparam_.nx/2; ++i )
for( int j=0; j<=cparam_.ny/2; ++j )
for( int k=0; k<=cparam_.nz/2; ++k )
{
int iix(i), iiy(j), iiz(k);
double rr[3], rr2;
@ -194,7 +196,25 @@ namespace convolution{
if( iiy > (int)cparam_.ny/2 ) iiy -= cparam_.ny;
if( iiz > (int)cparam_.nz/2 ) iiz -= cparam_.nz;
unsigned idx = (i*cparam_.ny + j) * 2*(cparam_.nz/2+1) + k;
//... speed up 8x by copying data to other octants
int idx[8];
int nx=cparam_.nx, ny = cparam_.ny, nz = cparam_.nz;
idx[0] = ((i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (k);
idx[1] = ((nx-i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (k);
idx[2] = ((i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (k);
idx[3] = ((nx-i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (k);
idx[4] = ((i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (nz-k);
idx[5] = ((nx-i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (nz-k);
idx[6] = ((i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (nz-k);
idx[7] = ((nx-i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (nz-k);
if(i==0||i==nx/2){ idx[1]=idx[3]=idx[5]=idx[7]=-1;}
if(j==0||j==ny/2){ idx[2]=idx[3]=idx[6]=idx[7]=-1;}
if(k==0||k==nz/2){ idx[4]=idx[5]=idx[6]=idx[7]=-1;}
double val = 0.0;
for( int ii=-1; ii<=1; ++ii )
for( int jj=-1; jj<=1; ++jj )
@ -204,11 +224,11 @@ namespace convolution{
rr[1] = ((double)iiy ) * dy + jj*boxlength;
rr[2] = ((double)iiz ) * dz + kk*boxlength;
if( rr[0] > -boxlength && rr[0] <= boxlength
&& rr[1] > -boxlength && rr[1] <= boxlength
&& rr[2] > -boxlength && rr[2] <= boxlength )
if( rr[0] > -boxlength && rr[0] < boxlength
&& rr[1] > -boxlength && rr[1] < boxlength
&& rr[2] > -boxlength && rr[2] < boxlength )
{
if( ref_fac > 1 )
if( ref_fac > 1 )//&& fabs(tfr->get_grad(rr2)*dx/T0) > 1e-4 )
{
double rrr[3];
register double rrr2[3];
@ -225,19 +245,25 @@ namespace convolution{
rrr[2] = rr[2]+(double)kkk*0.5*dx - 0.25*dx;
rrr2[2]= rrr[2]*rrr[2];
rr2 = rrr2[0]+rrr2[1]+rrr2[2];
kdata_[idx] += (fftw_real)(tfr->compute_real(rr2)/rf8);
val += tfr->compute_real(rr2)/rf8;
}
}
}
}else{
rr2 = rr[0]*rr[0]+rr[1]*rr[1]+rr[2]*rr[2];
val += tfr->compute_real(rr2);
kdata_[idx] += (fftw_real)tfr->compute_real(rr2);
}
}
}
kdata_[idx] *= fac;
val *= fac;
for(int q=0;q<8;++q)
if(idx[q]!=-1)
kdata_[idx[q]] = val;
}
}else{
#pragma omp parallel for

View file

@ -865,7 +865,7 @@ class refinement_hierarchy
std::vector<double> x0_,y0_,z0_,xl_,yl_,zl_;
std::vector<unsigned> ox_,oy_,oz_,oax_,oay_,oaz_;
std::vector<unsigned> nx_,ny_,nz_;
unsigned levelmin_, levelmax_, padding_;
unsigned levelmin_, levelmax_, levelmin_tf_, padding_;
config_file& cf_;
bool align_top_;
double x0ref_[3], lxref_[3];
@ -885,6 +885,7 @@ public:
//... query the parameter data we need
levelmin_ = cf_.getValue<unsigned>("setup","levelmin");
levelmax_ = cf_.getValue<unsigned>("setup","levelmax");
levelmin_tf_= cf_.getValueSafe<unsigned>("setup","levelmin_TF",levelmin_);
padding_ = cf_.getValue<unsigned>("setup","padding");
align_top_ = cf_.getValue<bool>("setup","align_top");
@ -899,7 +900,9 @@ public:
temp = cf_.getValue<std::string>( "setup", "ref_extent" );
sscanf( temp.c_str(), "%lf,%lf,%lf", &lxref_[0],&lxref_[1],&lxref_[2] );
unsigned ncoarse = (unsigned)pow(2,levelmin_);
unsigned
ncoarse = (unsigned)pow(2,levelmin_);
//ncoarse_tf = (unsigned)pow(2,levelmin_tf_);
//... determine shift

View file

@ -215,8 +215,8 @@ public:
//double dmean = coarsemean-finemean;
/*//... subtract the mean difference caused by interpolation
#pragma omp parallel for
//... subtract the mean difference caused by interpolation
/*#pragma omp parallel for
for( int i=0; i<nx; ++i )
for( int j=0; j<ny; ++j )
for( int k=0; k<nz; ++k )

View file

@ -143,7 +143,6 @@ protected:
//! initialize member variables and allocate memory
void initialize( void )
{
std::cerr << " - Generating random numbers w/ sample cube size of " << cubesize_ << std::endl;
ncubes_ = std::max((int)((double)res_/cubesize_),1);
if( res_ < cubesize_ )
@ -152,6 +151,8 @@ protected:
cubesize_ = res_;
}
std::cout << " - Generating random numbers w/ sample cube size of " << cubesize_ << std::endl;
rnums_.assign( ncubes_*ncubes_*ncubes_, NULL );
}
@ -382,6 +383,7 @@ public:
random_numbers( unsigned res, unsigned cubesize, long baseseed, int *x0, int *lx )
: res_( res ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
{
std::cout << " - Generating random numbers (1) with seed " << baseseed << std::endl;
initialize();
fill_subvolume( x0, lx );
}
@ -390,6 +392,8 @@ public:
random_numbers( unsigned res, unsigned cubesize, long baseseed, bool zeromean=true )
: res_( res ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
{
std::cout << " - Generating random numbers (2) with seed " << baseseed << std::endl;
double mean = 0.0;
initialize();
mean = fill_all();
@ -482,7 +486,7 @@ public:
#ifdef RAND_DEBUG
(*rnums_[0])(kk,jj,ii) = 0.0;
#else
(*rnums_[0])(kk,jj,ii) = in_float[q++];
(*rnums_[0])(kk,jj,ii) = -in_float[q++];
#endif
}
@ -512,7 +516,7 @@ public:
#ifdef RAND_DEBUG
(*rnums_[0])(kk,jj,ii) = 0.0;
#else
(*rnums_[0])(kk,jj,ii) = in_double[q++];
(*rnums_[0])(kk,jj,ii) = -in_double[q++];
#endif
}
@ -535,6 +539,47 @@ public:
}
//... create constrained new field
random_numbers( random_numbers<T>& rc, unsigned cubesize, long baseseed, bool zeromean=true )
: res_( 2*rc.res_ ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
{
double mean = 0.0;
initialize();
mean = fill_all();
if( zeromean )
{
for(unsigned i=0; i<res_; ++i )
for( unsigned j=0; j<res_; ++j )
for( unsigned k=0; k<res_; ++k )
(*this)(i,j,k) -= mean;
}
std::cout << " - Generating a constrained random number set with seed " << baseseed << "...\n";
double fac = 1./sqrt(8.0);
for( unsigned i=0,ii=0; i<res_; i+=2,++ii )
for( unsigned j=0,jj=0; j<res_; j+=2,++jj )
for( unsigned k=0,kk=0; k<res_; k+=2,++kk )
{
double topval = rc(ii,jj,kk);
double locmean = 0.125*((*this)(i,j,k)+(*this)(i+1,j,k)+(*this)(i,j+1,k)+(*this)(i,j,k+1)+
(*this)(i+1,j+1,k)+(*this)(i+1,j,k+1)+(*this)(i,j+1,k+1)+(*this)(i+1,j+1,k+1));
double dif = fac*topval-locmean;
(*this)(i,j,k) += dif;
(*this)(i+1,j,k) += dif;
(*this)(i,j+1,k) += dif;
(*this)(i,j,k+1) += dif;
(*this)(i+1,j+1,k) += dif;
(*this)(i+1,j,k+1) += dif;
(*this)(i,j+1,k+1) += dif;
(*this)(i+1,j+1,k+1) += dif;
}
}
//... copy construct by averaging down
explicit random_numbers( /*const*/ random_numbers <T>& rc )
{

View file

@ -355,8 +355,8 @@ protected:
fftw_destroy_plan(p);
fftw_destroy_plan(ip);
}
std::vector<real_t> m_xtable,m_ytable;
double m_xmin, m_xmax, m_dx;
std::vector<real_t> m_xtable,m_ytable,m_dytable;
double m_xmin, m_xmax, m_dx, m_rdx;
public:
TransferFunction_real( transfer_function *tf, real_t nspec, real_t pnorm, real_t dplus, real_t rmin, real_t rmax, real_t knymax, unsigned nr )
{
@ -399,16 +399,12 @@ public:
{
if( r[i] > rmin && r[i] < rmax )
{
xsp.push_back( log10(r[i]) );
xsp.push_back( r[i] );//log10(r[i]) );
ysp.push_back( T[i]*r[i]*r[i] );
}
}
accp = gsl_interp_accel_alloc ();
//... spline interpolation is only marginally slower here
@ -417,12 +413,11 @@ public:
//... set up everything for spline interpolation
gsl_spline_init (splinep, &xsp[0], &ysp[0], xsp.size() );
//.. build lookup table using spline interpolation
m_xmin = log10(rmin);
m_xmax = log10(rmax);
m_dx = (m_xmax-m_xmin)/nr;
m_rdx = 1.0/m_dx;
for(unsigned i=0; i<nr; ++i )
{
@ -430,6 +425,14 @@ public:
m_ytable.push_back( gsl_spline_eval(splinep, (m_xtable.back()), accp) );
}
for(unsigned i=0; i<nr-1; ++i )
{
real_t dy,dr;
dy = m_ytable[i+1]/pow(m_xtable[i+1],2)-m_ytable[i]/pow(m_xtable[i],2);
dr = pow(10.0,m_xtable[i+1])-pow(10.0,m_xtable[i]);
m_dytable.push_back(dy/dr);
}
//... DEBUG: output of spline interpolated real space transfer function
if(false)
{
@ -457,7 +460,20 @@ public:
~TransferFunction_real()
{ }
inline real_t get_grad( real_t r2 ) const
{
double r = 0.5*fast_log10(r2);
double ii = (r-m_xmin)*m_rdx;
int i = (int)ii;
i=std::max(0,i);
i=std::min(i, (int)m_xtable.size()-2);
return (real_t)m_dytable[i];
}
//... fast version
inline real_t compute_real( real_t r2 ) const
{
const double EPS = 1e-8;
@ -466,20 +482,22 @@ public:
if( r2 <Reps2 )
return Tr0_;
double r=0.5*log10(r2);
double ii = (r-m_xmin)/m_dx;
double r = 0.5*fast_log10(r2);
//float r = 0.5f*log10f(r2);
double ii = (r-m_xmin)*m_rdx;
int i = (int)ii;
if( i < 0 ) i = 0;
else if( i >= (int)m_xtable.size()-1 ) i = m_xtable.size()-2;
i=std::max(0,i);
i=std::min(i, (int)m_xtable.size()-2);
double y1,y2;
y1 = m_ytable[i];
y2 = m_ytable[i+1];
//divide by r**2 because r^2 T is tabulated
return (real_t)((y1 + (y2-y1)*(ii-(double)i))/r2);
}
};