some clean up, reduced number of convolutions

2024-09-19 17:03:45 +02:00 · 2019-05-12 18:48:01 +02:00 · 2019-05-12 18:48:01 +02:00 · f61dfcf74d
commit f61dfcf74d
parent 80244b8b04
2 changed files with 102 additions and 172 deletions
--- a/include/grid_fft.hh
+++ b/include/grid_fft.hh
@ -52,6 +52,7 @@ void pad_insertf( kdep_functor kfunc, Grid_FFT<data_t> &fp ){
    //size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2};
    size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};
    #pragma omp parallel for
    for (size_t i = 0; i < 2*fp.size(0)/3; ++i)
    {
        size_t ip = (i > nhalf[0]) ? i + dn[0] : i;
@ -954,6 +955,24 @@ public:
            }, res, op );
    }
    template< typename opp >
    void convolve_SumHessians( Grid_FFT<data_t> & inl, const std::array<int,2>& d2l, Grid_FFT<data_t> & inr, const std::array<int,2>& d2r1, 
                               const std::array<int,2>& d2r2, Grid_FFT<data_t> & res, opp op ){
        // transform to FS in case fields are not
        inl.FourierTransformForward();
        inr.FourierTransformForward();
        // perform convolution of Hessians
        this->convolve2__(
            [&]( size_t i, size_t j, size_t k ) -> ccomplex_t{
                auto kk = inl.template get_k<real_t>(i,j,k);
                return -kk[d2l[0]] * kk[d2l[1]] * inl.kelem(i,j,k);// / phifac;
            },
            [&]( size_t i, size_t j, size_t k ){
                auto kk = inr.template get_k<real_t>(i,j,k);
                return (-kk[d2r1[0]] * kk[d2r1[1]] -kk[d2r2[0]] * kk[d2r2[1]]) * inr.kelem(i,j,k);
            }, res, op );
    }
    template< typename kfunc1, typename kfunc2, typename opp >
    void convolve2__( kfunc1 kf1, kfunc2 kf2, Grid_FFT<data_t> & res, opp op )
    {
@ -968,6 +987,8 @@ public:
        //... convolve
        f1p_->FourierTransformBackward();
        f2p_->FourierTransformBackward();
        #pragma omp parallel for
        for (size_t i = 0; i < f1p_->ntot_; ++i){
            (*f2p_).relem(i) *= (*f1p_).relem(i);
        }
--- a/src/main.cc
+++ b/src/main.cc
@ -175,43 +175,6 @@ int main( int argc, char** argv )
    phi.zero_DC_mode();
    //======================================================================
    //... compute 2LPT displacement potential
    Grid_FFT<real_t>
        phi_xx({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi_xy({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi_xz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi_yy({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi_yz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi_zz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    phi_xx.FourierTransformForward(false);
    phi_xy.FourierTransformForward(false);
    phi_xz.FourierTransformForward(false);
    phi_yy.FourierTransformForward(false);
    phi_yz.FourierTransformForward(false);
    phi_zz.FourierTransformForward(false);
    #pragma omp parallel for
    for (size_t i = 0; i < phi.size(0); ++i)
    {
        for (size_t j = 0; j < phi.size(1); ++j)
        {
            for (size_t k = 0; k < phi.size(2); ++k)
            {
                auto kk = phi.get_k<real_t>(i,j,k);
                size_t idx = phi.get_idx(i,j,k);
                phi_xx.kelem(idx) = -kk[0] * kk[0] * phi.kelem(idx) / phifac;
                phi_xy.kelem(idx) = -kk[0] * kk[1] * phi.kelem(idx) / phifac;
                phi_xz.kelem(idx) = -kk[0] * kk[2] * phi.kelem(idx) / phifac;
                phi_yy.kelem(idx) = -kk[1] * kk[1] * phi.kelem(idx) / phifac;
                phi_yz.kelem(idx) = -kk[1] * kk[2] * phi.kelem(idx) / phifac;
                phi_zz.kelem(idx) = -kk[2] * kk[2] * phi.kelem(idx) / phifac;
            }
        }
    }
    auto assign_op = []( ccomplex_t res, ccomplex_t val ) -> ccomplex_t{ return res; };
    auto add_op = []( ccomplex_t res, ccomplex_t val ) -> ccomplex_t{ return val+res; };
    auto sub_op = []( ccomplex_t res, ccomplex_t val ) -> ccomplex_t{ return val-res; };
@ -225,34 +188,6 @@ int main( int argc, char** argv )
    Conv.convolve_Hessians( phi, {0,2}, phi, {0,2}, phi2, sub_op );
    Conv.convolve_Hessians( phi, {1,2}, phi, {1,2}, phi2, sub_op );
    // Conv.convolve2__( 
    //     [&]( size_t i, size_t j, size_t k ){
    //         auto kk = phi.get_k<real_t>(i,j,k);
    //         return -kk[0] * kk[0] * phi.kelem(i,j,k) / phifac;
    //     },
    //     [&]( size_t i, size_t j, size_t k ){
    //         auto kk = phi.get_k<real_t>(i,j,k);
    //         return -kk[1] * kk[1] * phi.kelem(i,j,k) / phifac;
    //     }, phi2, assign_op );
    // Conv.convolve2__( 
    //     [&]( size_t i, size_t j, size_t k ){
    //         auto kk = phi.get_k<real_t>(i,j,k);
    //         return -kk[0] * kk[0] * phi.kelem(i,j,k) / phifac;
    //     },
    //     [&]( size_t i, size_t j, size_t k ){
    //         auto kk = phi.get_k<real_t>(i,j,k);
    //         return -kk[2] * kk[2] * phi.kelem(i,j,k) / phifac;
    //     }, phi2, assign_op );
    //Conv.convolve2(phi_xx,phi_yy,phi2,assign_op);
    // Conv.convolve2(phi_xx,phi_zz,phi2,add_op);
    // Conv.convolve2(phi_yy,phi_zz,phi2,add_op);
    // Conv.convolve2(phi_xy,phi_xy,phi2,sub_op);
    // Conv.convolve2(phi_xz,phi_xz,phi2,sub_op);
    // Conv.convolve2(phi_yz,phi_yz,phi2,sub_op);
 #else 
    phi2.FourierTransformBackward();
    phi_xx.FourierTransformBackward();
@ -286,52 +221,16 @@ int main( int argc, char** argv )
    //======================================================================
    //... compute 3LPT displacement potential
    Grid_FFT<real_t>
        phi2_xx({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi2_xy({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi2_xz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi2_yy({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi2_yz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}),
        phi2_zz({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
    phi2_xx.FourierTransformForward(false);
    phi2_xy.FourierTransformForward(false);
    phi2_xz.FourierTransformForward(false);
    phi2_yy.FourierTransformForward(false);
    phi2_yz.FourierTransformForward(false);
    phi2_zz.FourierTransformForward(false);
    #pragma omp parallel for
    for (size_t i = 0; i < phi2.size(0); ++i)
    {
        for (size_t j = 0; j < phi2.size(1); ++j)
        {
            for (size_t k = 0; k < phi2.size(2); ++k)
            {
                auto kk = phi2.get_k<real_t>(i,j,k);
                size_t idx = phi2.get_idx(i,j,k);
                phi2_xx.kelem(idx) = -kk[0] * kk[0] * phi2.kelem(idx) / phifac;
                phi2_xy.kelem(idx) = -kk[0] * kk[1] * phi2.kelem(idx) / phifac;
                phi2_xz.kelem(idx) = -kk[0] * kk[2] * phi2.kelem(idx) / phifac;
                phi2_yy.kelem(idx) = -kk[1] * kk[1] * phi2.kelem(idx) / phifac;
                phi2_yz.kelem(idx) = -kk[1] * kk[2] * phi2.kelem(idx) / phifac;
                phi2_zz.kelem(idx) = -kk[2] * kk[2] * phi2.kelem(idx) / phifac;
            }
        }
    }
    #if 1
    auto sub2_op = []( ccomplex_t res, ccomplex_t val ) -> ccomplex_t{ return val-2.0*res; };
-    Conv.convolve2(phi_xx,phi2_yy,phi3a,assign_op);
+
-    Conv.convolve2(phi_xx,phi2_zz,phi3a,add_op);
+    Conv.convolve_SumHessians( phi, {0,0}, phi2, {1,1}, {2,2}, phi3a, assign_op );
-    Conv.convolve2(phi_yy,phi2_xx,phi3a,add_op);
+    Conv.convolve_SumHessians( phi, {1,1}, phi2, {2,2}, {0,0}, phi3a, add_op );
-    Conv.convolve2(phi_yy,phi2_zz,phi3a,add_op);
+    Conv.convolve_SumHessians( phi, {2,2}, phi2, {0,0}, {1,1}, phi3a, add_op );
-    Conv.convolve2(phi_zz,phi2_xx,phi3a,add_op);
+    Conv.convolve_Hessians( phi, {0,1}, phi2, {0,1}, phi3a, sub2_op );
-    Conv.convolve2(phi_zz,phi2_yy,phi3a,add_op);
+    Conv.convolve_Hessians( phi, {0,2}, phi2, {0,2}, phi3a, sub2_op );
-    Conv.convolve2(phi_xy,phi2_xy,phi3a,sub2_op);
+    Conv.convolve_Hessians( phi, {1,2}, phi2, {1,2}, phi3a, sub2_op );
    Conv.convolve2(phi_xz,phi2_xz,phi3a,sub2_op);
    Conv.convolve2(phi_yz,phi2_yz,phi3a,sub2_op);
    phi3a.apply_function_k_dep([&](auto x, auto k) {
        return 0.5 * x;
@ -378,7 +277,7 @@ int main( int argc, char** argv )
    phi3a.FourierTransformForward();
    phi3a.apply_function_k_dep([&](auto x, auto k) {
        real_t kmod2 = k.norm_squared();
-        return x * (-1.0 / kmod2) * phifac;
+        return x * (-1.0 / kmod2) * phifac / phifac / phifac;
    });
    phi3a.zero_DC_mode();
@ -391,83 +290,46 @@ int main( int argc, char** argv )
    ///////////////////////////////////////////////////////////////////////
    // we store the densities here if we compute them
    Grid_FFT<real_t> &delta = phi_xx;
    Grid_FFT<real_t> &delta2 = phi_xy;
    Grid_FFT<real_t> &delta3a = phi_xz;
    Grid_FFT<real_t> &delta3b = phi_yy;
    Grid_FFT<real_t> &delta3 = phi_yz;
    // we store displacements and velocities here if we compute them
    Grid_FFT<real_t> &Psix = phi_xx;
    Grid_FFT<real_t> &Psiy = phi_xy;
    Grid_FFT<real_t> &Psiz = phi_xz;
    Grid_FFT<real_t> &Vx   = phi_yy;
    Grid_FFT<real_t> &Vy   = phi_yz;
    Grid_FFT<real_t> &Vz   = phi_zz;
    const bool compute_densities = true;
    phi_xx.FourierTransformForward(false);
    phi_xy.FourierTransformForward(false);
    phi_xz.FourierTransformForward(false);
    phi_yy.FourierTransformForward(false);
    phi_yz.FourierTransformForward(false);
    phi_zz.FourierTransformForward(false);
    if( compute_densities ){
-        phi.FourierTransformForward();
+        Grid_FFT<real_t> delta   = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-        phi2.FourierTransformForward();
+        Grid_FFT<real_t> delta2  = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-        phi3a.FourierTransformForward();
+        Grid_FFT<real_t> delta3a = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-        phi3b.FourierTransformForward();
+        Grid_FFT<real_t> delta3b = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
-    }
+        Grid_FFT<real_t> delta3  = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        delta.FourierTransformForward(false);
        delta2.FourierTransformForward(false);
        delta3a.FourierTransformForward(false);
        delta3b.FourierTransformForward(false);
        delta3.FourierTransformForward(false);
-    #pragma omp parallel for
+        #pragma omp parallel for
-    for (size_t i = 0; i < phi.size(0); ++i)
+        for (size_t i = 0; i < phi.size(0); ++i)
    {
        for (size_t j = 0; j < phi.size(1); ++j)
        {
-            for (size_t k = 0; k < phi.size(2); ++k)
+            for (size_t j = 0; j < phi.size(1); ++j)
            {
-                auto kk = phi.get_k<real_t>(i,j,k);
+                for (size_t k = 0; k < phi.size(2); ++k)
-                size_t idx = phi.get_idx(i,j,k);
+                {
-                auto laplace = -kk.norm_squared();
+                    auto kk = phi.get_k<real_t>(i,j,k);
                    size_t idx = phi.get_idx(i,j,k);
                    auto laplace = -kk.norm_squared();
-                // scale potentials with respective order growth factors
+                    // scale potentials with respective order growth factors
-                phi.kelem(idx)   *= g1;
+                    phi.kelem(idx)   *= g1;
-                phi2.kelem(idx)  *= g2;
+                    phi2.kelem(idx)  *= g2;
-                phi3a.kelem(idx) *= g3a;
+                    phi3a.kelem(idx) *= g3a;
-                phi3b.kelem(idx) *= g3b;
+                    phi3b.kelem(idx) *= g3b;
                if( compute_densities ){
                    // compute densities associated to respective potentials as well
                    delta.kelem(idx) = laplace * phi.kelem(idx) / phifac;
                    delta2.kelem(idx) = laplace * phi2.kelem(idx) / phifac;
                    delta3a.kelem(idx) = laplace * phi3a.kelem(idx) / phifac;
                    delta3b.kelem(idx) = laplace * phi3b.kelem(idx) / phifac;
                    delta3.kelem(idx) = delta3a.kelem(idx) + delta3b.kelem(idx);
-                }else{
+                
                    auto phitot = phi.kelem(idx) + ((LPTorder>1)?phi2.kelem(idx):0.0) + ((LPTorder>2)? phi3a.kelem(idx) + phi3b.kelem(idx) : 0.0);
                    auto phitot_v = vfac1 * phi.kelem(idx) + ((LPTorder>1)? vfac2 * phi2.kelem(idx) : 0.0) + ((LPTorder>2)? vfac3 * (phi3a.kelem(idx) + phi3b.kelem(idx)) : 0.0);
                    Psix.kelem(idx) = ccomplex_t(0.0,1.0) * kk[0]* boxlen * ( phitot );
                    Psiy.kelem(idx) = ccomplex_t(0.0,1.0) * kk[1]* boxlen * ( phitot );
                    Psiz.kelem(idx) = ccomplex_t(0.0,1.0) * kk[2]* boxlen * ( phitot );
                    Vx.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[0]* boxlen * ( phitot_v );
                    Vy.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[1]* boxlen * ( phitot_v );
                    Vz.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[2]* boxlen * ( phitot_v );
                }
            }
        }
    }
    ///////////////////////////////////////////////////////////////////////
    //... write output .....
    if( compute_densities ){
        phi.FourierTransformBackward();
        phi2.FourierTransformBackward();
        phi3a.FourierTransformBackward();
@ -491,6 +353,52 @@ int main( int argc, char** argv )
        delta3b.Write_to_HDF5(fname_hdf5, "delta3b");
        delta3.Write_to_HDF5(fname_hdf5, "delta3");
    }else{
        // we store displacements and velocities here if we compute them
        Grid_FFT<real_t> Psix = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Grid_FFT<real_t> Psiy = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Grid_FFT<real_t> Psiz = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Grid_FFT<real_t> Vx   = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Grid_FFT<real_t> Vy   = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Grid_FFT<real_t> Vz   = Grid_FFT<real_t>({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
        Psix.FourierTransformForward(false);
        Psiy.FourierTransformForward(false);
        Psiz.FourierTransformForward(false);
        Vx.FourierTransformForward(false);
        Vy.FourierTransformForward(false);
        Vz.FourierTransformForward(false);
        #pragma omp parallel for
        for (size_t i = 0; i < phi.size(0); ++i)
        {
            for (size_t j = 0; j < phi.size(1); ++j)
            {
                for (size_t k = 0; k < phi.size(2); ++k)
                {
                    auto kk = phi.get_k<real_t>(i,j,k);
                    size_t idx = phi.get_idx(i,j,k);
                    auto laplace = -kk.norm_squared();
                    // scale potentials with respective order growth factors
                    phi.kelem(idx)   *= g1;
                    phi2.kelem(idx)  *= g2;
                    phi3a.kelem(idx) *= g3a;
                    phi3b.kelem(idx) *= g3b;
                    auto phitot = phi.kelem(idx) + ((LPTorder>1)?phi2.kelem(idx):0.0) + ((LPTorder>2)? phi3a.kelem(idx) + phi3b.kelem(idx) : 0.0);
                    auto phitot_v = vfac1 * phi.kelem(idx) + ((LPTorder>1)? vfac2 * phi2.kelem(idx) : 0.0) + ((LPTorder>2)? vfac3 * (phi3a.kelem(idx) + phi3b.kelem(idx)) : 0.0);
                    Psix.kelem(idx) = ccomplex_t(0.0,1.0) * kk[0]* boxlen * ( phitot );
                    Psiy.kelem(idx) = ccomplex_t(0.0,1.0) * kk[1]* boxlen * ( phitot );
                    Psiz.kelem(idx) = ccomplex_t(0.0,1.0) * kk[2]* boxlen * ( phitot );
                    Vx.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[0]* boxlen * ( phitot_v );
                    Vy.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[1]* boxlen * ( phitot_v );
                    Vz.kelem(idx)   = ccomplex_t(0.0,1.0) * kk[2]* boxlen * ( phitot_v );
                }
            }
        }
        Psix.FourierTransformBackward();
        Psiy.FourierTransformBackward();
        Psiz.FourierTransformBackward();
@ -518,9 +426,10 @@ int main( int argc, char** argv )
        the_output_plugin->finalize();
 		delete the_output_plugin;
    }
-    
+
    ///////////////////////////////////////////////////////////////////////
 #if defined(USE_MPI)