Speeded up transfer function calculation significantly.

Added an approximate log10 algorithm. Use spherical symmetry of problem to avoid computing duplicate values.
2024-09-19 17:03:46 +02:00 · 2010-08-14 16:52:25 -07:00 · 2010-08-14 16:52:25 -07:00 · 785601d4ae
commit 785601d4ae
parent ae0d833155
6 changed files with 156 additions and 34 deletions
--- a/Numerics.hh
+++ b/Numerics.hh
@ -41,6 +41,36 @@

 real_t integrate( double (* func) (double x, void * params), double a, double b, void *params=NULL);

+typedef __attribute__((__may_alias__)) int aint;
+
+inline float fast_log2 (float val)
+{
+	//if( sizeof(int) != sizeof(float) )
+	//	throw std::runtime_error("fast_log2 will fail on this system!!");
+	aint * const    exp_ptr = reinterpret_cast <aint *> (&val);
+	aint            x = *exp_ptr;
+	const int      log_2 = ((x >> 23) & 255) - 128;
+	x &= ~(255 << 23);
+	x += 127 << 23;
+	*exp_ptr = x;
+	
+	val = ((-1.0f/3) * val + 2) * val - 2.0f/3;   // (1)
+	
+	return (val + log_2);
+} 
+
+inline float fast_log (const float &val)
+{
+	return (fast_log2 (val) * 0.69314718f);
+} 
+
+inline float fast_log10 (const float &val)
+{
+	return (fast_log2 (val) * 0.3010299956639812f);
+} 
+
+
+
 struct Base_interp
 {
 	int n, mm, jsav, cor, dj; 
--- a/convolution_kernel.cc
+++ b/convolution_kernel.cc
@ -159,7 +159,7 @@ namespace convolution{
 		bool
 			bperiodic	= cparam_.pcf->getValueSafe<bool>("setup","periodic_TF",true);
 		
-		const int ref_fac = (int)(pow(2,cparam_.coarse_fact)+0.5), ql = -ref_fac/2+1, qr=ql+ref_fac, rf8=pow(ref_fac,3);
+		const int ref_fac = (int)(pow(2,cparam_.coarse_fact)+0.5), ql = -ref_fac/2+1, qr=ql+ref_fac, rf8=(int)pow(ref_fac,3);
 				
 		std::cout << "   - Computing transfer function kernel...\n";
 		
@ -179,13 +179,15 @@ namespace convolution{
 		int nzp = cparam_.nz/2+1;
 		double kmax = 0.5*M_PI/std::max(cparam_.nx,std::max(cparam_.ny,cparam_.nz));
 		
+		T0 = tfr->compute_real(0.0);
+		
 		//... obtain a grid-version of the real space transfer function
 		if( bperiodic  )
-		{			
-			#pragma omp parallel for 
-			for( int i=0; i<cparam_.nx; ++i )
-				for( int j=0; j<cparam_.ny; ++j )
-					for( int k=0; k<cparam_.nz; ++k )
+		{		
+			#pragma omp parallel for //schedule(static,4)
+			for( int i=0; i<=cparam_.nx/2; ++i )
+				for( int j=0; j<=cparam_.ny/2; ++j )
+					for( int k=0; k<=cparam_.nz/2; ++k )
 					{
 						int iix(i), iiy(j), iiz(k);
 						double rr[3], rr2;
@ -194,7 +196,25 @@ namespace convolution{
 						if( iiy > (int)cparam_.ny/2 ) iiy -= cparam_.ny;
 						if( iiz > (int)cparam_.nz/2 ) iiz -= cparam_.nz;
 						
-						unsigned idx = (i*cparam_.ny + j) * 2*(cparam_.nz/2+1) + k;
+						
+						//... speed up 8x by copying data to other octants
+						int idx[8];
+						int nx=cparam_.nx, ny = cparam_.ny, nz = cparam_.nz;
+						
+						idx[0] = ((i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (k);
+						idx[1] = ((nx-i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (k);
+						idx[2] = ((i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (k);
+						idx[3] = ((nx-i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (k);
+						idx[4] = ((i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (nz-k);
+						idx[5] = ((nx-i)*cparam_.ny + (j)) * 2*(cparam_.nz/2+1) + (nz-k);
+						idx[6] = ((i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (nz-k);
+						idx[7] = ((nx-i)*cparam_.ny + (ny-j)) * 2*(cparam_.nz/2+1) + (nz-k);
+						
+						if(i==0||i==nx/2){ idx[1]=idx[3]=idx[5]=idx[7]=-1;}
+						if(j==0||j==ny/2){ idx[2]=idx[3]=idx[6]=idx[7]=-1;}
+						if(k==0||k==nz/2){ idx[4]=idx[5]=idx[6]=idx[7]=-1;}
+						
+						double val = 0.0;
 						
 						for( int ii=-1; ii<=1; ++ii )
 							for( int jj=-1; jj<=1; ++jj )
@ -204,11 +224,11 @@ namespace convolution{
 									rr[1] = ((double)iiy ) * dy + jj*boxlength;
 									rr[2] = ((double)iiz ) * dz + kk*boxlength;
 									
-									if( rr[0] > -boxlength && rr[0] <= boxlength
-									 && rr[1] > -boxlength && rr[1] <= boxlength
-									 && rr[2] > -boxlength && rr[2] <= boxlength )
+									if( rr[0] > -boxlength && rr[0] < boxlength
+									 && rr[1] > -boxlength && rr[1] < boxlength
+									 && rr[2] > -boxlength && rr[2] < boxlength )
 									{
-										if( ref_fac > 1 )
+										if( ref_fac > 1 )//&& fabs(tfr->get_grad(rr2)*dx/T0) > 1e-4 )
 										{
 											double rrr[3];
 											register double rrr2[3];
@ -225,19 +245,25 @@ namespace convolution{
 														rrr[2] = rr[2]+(double)kkk*0.5*dx - 0.25*dx;
 														rrr2[2]= rrr[2]*rrr[2];
 														rr2 = rrr2[0]+rrr2[1]+rrr2[2];
-														kdata_[idx] += (fftw_real)(tfr->compute_real(rr2)/rf8);	
+														val += tfr->compute_real(rr2)/rf8;
 													}
 												}
 											}
 											
 										}else{
 											rr2 = rr[0]*rr[0]+rr[1]*rr[1]+rr[2]*rr[2];
+											val += tfr->compute_real(rr2);	
 											kdata_[idx] += (fftw_real)tfr->compute_real(rr2);
 										}
 									}
 								}
 						
-						kdata_[idx] *= fac;	
+						val *= fac;
+						
+						for(int q=0;q<8;++q)
+							if(idx[q]!=-1)  
+								kdata_[idx[q]] = val;
+						
 					}
 		}else{
 			#pragma omp parallel for
--- a/mesh.hh
+++ b/mesh.hh
@ -865,7 +865,7 @@ class refinement_hierarchy
 	std::vector<double> x0_,y0_,z0_,xl_,yl_,zl_;
 	std::vector<unsigned> ox_,oy_,oz_,oax_,oay_,oaz_;
 	std::vector<unsigned> nx_,ny_,nz_;
-	unsigned levelmin_, levelmax_, padding_;
+	unsigned levelmin_, levelmax_, levelmin_tf_, padding_;
 	config_file& cf_;
 	bool align_top_;
 	double x0ref_[3], lxref_[3];
@ -885,6 +885,7 @@ public:
 		//... query the parameter data we need
 		levelmin_	= cf_.getValue<unsigned>("setup","levelmin");
 		levelmax_	= cf_.getValue<unsigned>("setup","levelmax");
+		levelmin_tf_= cf_.getValueSafe<unsigned>("setup","levelmin_TF",levelmin_);
 		padding_	= cf_.getValue<unsigned>("setup","padding");
 		align_top_	= cf_.getValue<bool>("setup","align_top");
 		
@ -899,7 +900,9 @@ public:
 		temp		= cf_.getValue<std::string>( "setup", "ref_extent" );
 		sscanf( temp.c_str(), "%lf,%lf,%lf", &lxref_[0],&lxref_[1],&lxref_[2] );
 		
-		unsigned ncoarse = (unsigned)pow(2,levelmin_);
+		unsigned 
+			ncoarse = (unsigned)pow(2,levelmin_);
+			//ncoarse_tf = (unsigned)pow(2,levelmin_tf_);
 		
 		//... determine shift

--- a/mg_operators.hh
+++ b/mg_operators.hh
@ -215,8 +215,8 @@ public:
 		
 		//double dmean = coarsemean-finemean;
 		
-		/*//... subtract the mean difference caused by interpolation
-		#pragma omp parallel for
+		//... subtract the mean difference caused by interpolation
+		/*#pragma omp parallel for
 		for( int i=0; i<nx; ++i )
 			for( int j=0; j<ny; ++j )
 				for( int k=0; k<nz; ++k )
--- a/random.hh
+++ b/random.hh
@ -143,7 +143,6 @@ protected:
 	//! initialize member variables and allocate memory
 	void initialize( void )
 	{
-		std::cerr << " - Generating random numbers w/ sample cube size of " << cubesize_ << std::endl;
 		
 		ncubes_ = std::max((int)((double)res_/cubesize_),1);
 		if( res_ < cubesize_ )
@ -152,6 +151,8 @@ protected:
 			cubesize_ = res_;
 		}
 		
+		std::cout << " - Generating random numbers w/ sample cube size of " << cubesize_ << std::endl;
+		
 		rnums_.assign( ncubes_*ncubes_*ncubes_, NULL );
 	}
 	
@ -382,6 +383,7 @@ public:
 	random_numbers( unsigned res, unsigned cubesize, long baseseed, int *x0, int *lx )
 	: res_( res ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
 	{
+		std::cout << " - Generating random numbers (1) with seed " << baseseed << std::endl;
 		initialize();
 		fill_subvolume( x0, lx );
 	}
@ -390,6 +392,8 @@ public:
 	random_numbers( unsigned res, unsigned cubesize, long baseseed, bool zeromean=true )
 	: res_( res ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
 	{
+		std::cout << " - Generating random numbers (2) with seed " << baseseed << std::endl;
+		
 		double mean = 0.0;
 		initialize();
 		mean = fill_all();
@ -482,7 +486,7 @@ public:
 #ifdef RAND_DEBUG
 						(*rnums_[0])(kk,jj,ii) = 0.0;
 #else
-						(*rnums_[0])(kk,jj,ii) = in_float[q++];
+						(*rnums_[0])(kk,jj,ii) = -in_float[q++];
 #endif
 						
 					}
@ -512,7 +516,7 @@ public:
 #ifdef RAND_DEBUG						
 						(*rnums_[0])(kk,jj,ii) = 0.0;
 #else
-						(*rnums_[0])(kk,jj,ii) = in_double[q++];
+						(*rnums_[0])(kk,jj,ii) = -in_double[q++];
 #endif
 						
 					}
@ -535,6 +539,47 @@ public:
 		
 	}
 	
+	//... create constrained new field
+	random_numbers( random_numbers<T>& rc, unsigned cubesize, long baseseed, bool zeromean=true )
+	: res_( 2*rc.res_ ), cubesize_( cubesize ), ncubes_( 1 ), baseseed_( baseseed )
+	{
+		double mean = 0.0;
+		initialize();
+		mean = fill_all();
+		
+		if( zeromean )
+		{
+			for(unsigned i=0; i<res_; ++i )
+				for( unsigned j=0; j<res_; ++j )
+					for( unsigned k=0; k<res_; ++k )
+						(*this)(i,j,k) -= mean;
+		}
+		
+		std::cout << " - Generating a constrained random number set with seed " << baseseed << "...\n";
+		
+		double fac = 1./sqrt(8.0);
+		
+		for( unsigned i=0,ii=0; i<res_; i+=2,++ii )
+			for( unsigned j=0,jj=0; j<res_; j+=2,++jj )
+				for( unsigned k=0,kk=0; k<res_; k+=2,++kk )
+				{
+					double topval = rc(ii,jj,kk);
+					double locmean = 0.125*((*this)(i,j,k)+(*this)(i+1,j,k)+(*this)(i,j+1,k)+(*this)(i,j,k+1)+
+											 (*this)(i+1,j+1,k)+(*this)(i+1,j,k+1)+(*this)(i,j+1,k+1)+(*this)(i+1,j+1,k+1));
+					double dif = fac*topval-locmean;
+					
+					(*this)(i,j,k) += dif;
+					(*this)(i+1,j,k) += dif;
+					(*this)(i,j+1,k) += dif;
+					(*this)(i,j,k+1) += dif;
+					(*this)(i+1,j+1,k) += dif;
+					(*this)(i+1,j,k+1) += dif;
+					(*this)(i,j+1,k+1) += dif;
+					(*this)(i+1,j+1,k+1) += dif;
+					
+				}
+	}
+
 	//... copy construct by averaging down
 	explicit random_numbers( /*const*/ random_numbers <T>& rc )
 	{
--- a/transfer_function.hh
+++ b/transfer_function.hh
@ -355,8 +355,8 @@ protected:
 		fftw_destroy_plan(p);
 		fftw_destroy_plan(ip);
 	}
-	std::vector<real_t> m_xtable,m_ytable;
-	double m_xmin, m_xmax, m_dx;
+	std::vector<real_t> m_xtable,m_ytable,m_dytable;
+	double m_xmin, m_xmax, m_dx, m_rdx;
 public:
 	TransferFunction_real( transfer_function *tf, real_t nspec, real_t pnorm, real_t dplus, real_t rmin, real_t rmax, real_t knymax, unsigned nr )
 	{
@ -399,16 +399,12 @@ public:
 		{
 			if( r[i] > rmin && r[i] < rmax )
 			{
-				xsp.push_back( log10(r[i]) );
+				xsp.push_back( r[i] );//log10(r[i]) );
 				ysp.push_back( T[i]*r[i]*r[i] );
 			}
 			
 		}
 		
-		
-		
-		
-		
 		accp = gsl_interp_accel_alloc ();

 		//... spline interpolation is only marginally slower here
@ -417,12 +413,11 @@ public:
 		//... set up everything for spline interpolation
 		gsl_spline_init (splinep, &xsp[0], &ysp[0], xsp.size() );
 		
-
-		
 		//.. build lookup table using spline interpolation
 		m_xmin = log10(rmin);
 		m_xmax = log10(rmax);
 		m_dx   = (m_xmax-m_xmin)/nr;
+		m_rdx  = 1.0/m_dx;
 		
 		for(unsigned i=0; i<nr; ++i )
 		{
@ -430,6 +425,14 @@ public:
 			m_ytable.push_back( gsl_spline_eval(splinep, (m_xtable.back()), accp) );
 		}
 		
+		for(unsigned i=0; i<nr-1; ++i )
+		{
+			real_t dy,dr;
+			dy = m_ytable[i+1]/pow(m_xtable[i+1],2)-m_ytable[i]/pow(m_xtable[i],2);
+			dr = pow(10.0,m_xtable[i+1])-pow(10.0,m_xtable[i]);
+			m_dytable.push_back(dy/dr);
+		}
+		
 		//... DEBUG: output of spline interpolated real space transfer function
 		if(false)
 		{
@ -457,7 +460,20 @@ public:
 	
 	~TransferFunction_real()
 	{ }
+
+	inline real_t get_grad( real_t r2 ) const
+	{
+		double r = 0.5*fast_log10(r2);
+		double ii = (r-m_xmin)*m_rdx;
+		int i = (int)ii;
+		
+		i=std::max(0,i);
+		i=std::min(i, (int)m_xtable.size()-2);
+		
+		return (real_t)m_dytable[i];
+	}
 	
+	//... fast version
 	inline real_t compute_real( real_t r2 ) const
 	{
 		const double EPS = 1e-8;
@ -466,20 +482,22 @@ public:
 		if( r2 <Reps2 )
 			return Tr0_;
 		
-		double r=0.5*log10(r2);
-		double ii = (r-m_xmin)/m_dx;
+		double r = 0.5*fast_log10(r2);
+		//float r = 0.5f*log10f(r2);
+		
+		double ii = (r-m_xmin)*m_rdx;
 		int i = (int)ii;
 		
-		if( i < 0 ) i = 0;
-		else if( i >= (int)m_xtable.size()-1 ) i = m_xtable.size()-2;
+		i=std::max(0,i);
+		i=std::min(i, (int)m_xtable.size()-2);
 		
 		double y1,y2;
 		y1 = m_ytable[i];
 		y2 = m_ytable[i+1];
 		
+		//divide by r**2 because r^2 T is tabulated
 		return (real_t)((y1 + (y2-y1)*(ii-(double)i))/r2);
 	}
-
 };