From edcfbbab588b0bc38877e35e09d2be353c1619e5 Mon Sep 17 00:00:00 2001
From: Oliver Hahn
Date: Wed, 11 Nov 2020 16:26:49 +0100
Subject: [PATCH] added new Fourier copy-interpolation routine

---
 include/grid_fft.hh |  15 ++--
 src/grid_fft.cc     | 204 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 214 insertions(+), 5 deletions(-)

diff --git a/include/grid_fft.hh b/include/grid_fft.hh
index a49e1a6..608705a 100644
--- a/include/grid_fft.hh
+++ b/include/grid_fft.hh
@@ -134,7 +134,7 @@ public:
     //! set all field elements to zero
     void zero() noexcept
     {
-#pragma omp parallel for
+        #pragma omp parallel for
         for (size_t i = 0; i < ntot_; ++i)
             data_[i] = 0.0;
     }
@@ -155,8 +155,8 @@ public:
         assert(this->n_[1] == g.n_[1]);
         assert(this->n_[2] == g.n_[2]);
 
-// now we can copy all the data over
-#pragma omp parallel for
+        // now we can copy all the data over
+        #pragma omp parallel for
         for (size_t i = 0; i < ntot_; ++i)
             data_[i] = g.data_[i];
     }
@@ -357,6 +357,7 @@ public:
         return val;
     }
 
+
     inline ccomplex_t gradient( const int idim, std::array<size_t, 3> ijk ) const
     {
         if( bdistributed ){
@@ -791,13 +792,17 @@ public:
         }
     }
 
+    //! perform a backwards Fourier transform
     void FourierTransformBackward(bool do_transform = true);
 
+    //! perform a forwards Fourier transform
     void FourierTransformForward(bool do_transform = true);
 
-    void ApplyNorm(void);
+    //! perform a copy operation between two FFT grids that might not be of the same size
+    void FourierInterpolateCopyTo( grid_fft_t &grid_to );
 
-    void FillRandomReal(unsigned long int seed = 123456ul);
+    //! normalise field
+    void ApplyNorm(void);
 
     void Write_to_HDF5(std::string fname, std::string datasetname) const;
 
diff --git a/src/grid_fft.cc b/src/grid_fft.cc
index 8de61fc..a60a428 100644
--- a/src/grid_fft.cc
+++ b/src/grid_fft.cc
@@ -185,6 +185,7 @@ void Grid_FFT<data_t, bdistributed>::ApplyNorm(void)
         data_[i] *= fft_norm_fac_;
 }
 
+//! Perform a forward Fourier transformation
 template <typename data_t, bool bdistributed>
 void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
 {
@@ -217,6 +218,7 @@ void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
     }
 }
 
+//! Perform a backward Fourier transformation
 template <typename data_t, bool bdistributed>
 void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
 {
@@ -248,6 +250,208 @@ void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
     }
 }
 
+//! Perform a copy to another field, not necessarily the same size, using Fourier interpolation
+template <typename data_t, bool bdistributed>
+void Grid_FFT<data_t, bdistributed>::FourierInterpolateCopyTo( grid_fft_t &grid_to )
+{
+    grid_fft_t &grid_from = *this;
+
+    //... transform both grids to Fourier space
+    grid_from.FourierTransformForward(true);
+    grid_to.FourierTransformForward(false);
+
+    // if grids are the same size, we can copy directly without the need for interpolation
+    if( grid_from.n_[0] == grid_to.n_[0]
+     && grid_from.n_[1] == grid_to.n_[1]
+     && grid_from.n_[2] == grid_to.n_[2] )
+    {
+        grid_to.copy_from( grid_from );
+        return;
+    }
+
+    // zero the target grid so that modes not filled from the source remain zero
+    grid_to.zero();
+
+    // if not running MPI, we can do without sending and receiving
+#if !defined(USE_MPI)
+
+    // determine effective Nyquist modes representable by both fields and their locations in array
+    size_t fny0_left  = std::min(grid_from.n_[0] / 2, grid_to.n_[0] / 2);
+    size_t fny0_right = std::max(grid_from.n_[0] - grid_to.n_[0] / 2, grid_from.n_[0] / 2);
+    size_t fny1_left  = std::min(grid_from.n_[1] / 2, grid_to.n_[1] / 2);
+    size_t fny1_right = std::max(grid_from.n_[1] - grid_to.n_[1] / 2, grid_from.n_[1] / 2);
+    size_t fny2_left  = std::min(grid_from.n_[2] / 2, grid_to.n_[2] / 2);
+    // size_t fny2_right = std::max(grid_from.n_[2] - grid_to.n_[2] / 2, grid_from.n_[2] / 2);
+
+    const size_t fny0_left_recv  = fny0_left;
+    const size_t fny0_right_recv = (fny0_right + grid_to.n_[0]) - grid_from.n_[0];
+    const size_t fny1_left_recv  = fny1_left;
+    const size_t fny1_right_recv = (fny1_right + grid_to.n_[1]) - grid_from.n_[1];
+    const size_t fny2_left_recv  = fny2_left;
+    // const size_t fny2_right_recv = (fny2_right + grid_to.n_[2]) - grid_from.n_[2];
+
+    #pragma omp parallel for
+    for( size_t i=0; i<grid_to.size(0); ++i )
+    {
+        if( i < fny0_left_recv || i > fny0_right_recv)
+        {
+            size_t isend = (i < fny0_left_recv)? i : (i + grid_from.n_[0]) - grid_to.n_[0];
+
+            // copy data slice into new grid, avoiding modes that do not exist in both
+            for( size_t j=0; j<grid_to.size(1); ++j )
+            {
+                if( j < fny1_left_recv || j > fny1_right_recv )
+                {
+                    const size_t jsend = (j < fny1_left_recv)? j : (j + grid_from.n_[1]) - grid_to.n_[1];
+
+                    for( size_t k=0; k<fny2_left_recv; ++k )
+                    {
+                        grid_to.kelem(i,j,k) = grid_from.kelem(isend,jsend,k);
+                    }
+                }
+            }
+        }
+    }
+
+#else
+    //... grids are MPI-distributed, so slices need to be communicated between tasks
+    double tstart = get_wtime();
+
+    std::vector<ptrdiff_t> offsets_send, offsets_recv;
+
+    // this should use bisection, not linear search
+    auto get_task = [](ptrdiff_t index, const std::vector<ptrdiff_t> &offsets, const int ntasks) -> int
+    {
+        int itask = 0;
+        while (itask < ntasks - 1 && offsets[itask + 1] <= index)
+            ++itask;
+        return itask;
+    };
+
+    int ntasks(MPI::get_size());
+
+    offsets_send.assign(ntasks, 0);
+    offsets_recv.assign(ntasks, 0);
+
+    MPI_Allgather(&grid_from.local_1_start_, 1, MPI_LONG_LONG, &offsets_send[0], 1,
+                  MPI_LONG_LONG, MPI_COMM_WORLD);
+    MPI_Allgather(&grid_to.local_1_start_, 1, MPI_LONG_LONG, &offsets_recv[0], 1,
+                  MPI_LONG_LONG, MPI_COMM_WORLD);
+
+    MPI_Datatype datatype =
+        (typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX
+        : (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX
+        : (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX
+        : MPI_BYTE;
+
+    const size_t slicesz = grid_from.size(1) * grid_from.size(3);
+
+    // determine effective Nyquist modes representable by both fields and their locations in array
+    // (dimensions 0 and 1 are swapped here since the distributed transforms use a transposed layout)
+    size_t fny0_left  = std::min(grid_from.n_[1] / 2, grid_to.n_[1] / 2);
+    size_t fny0_right = std::max(grid_from.n_[1] - grid_to.n_[1] / 2, grid_from.n_[1] / 2);
+    size_t fny1_left  = std::min(grid_from.n_[0] / 2, grid_to.n_[0] / 2);
+    size_t fny1_right = std::max(grid_from.n_[0] - grid_to.n_[0] / 2, grid_from.n_[0] / 2);
+    size_t fny2_left  = std::min(grid_from.n_[2] / 2, grid_to.n_[2] / 2);
+    // size_t fny2_right = std::max(grid_from.n_[2] - grid_to.n_[2] / 2, grid_from.n_[2] / 2);
+
+    const size_t fny0_left_recv  = fny0_left;
+    const size_t fny0_right_recv = (fny0_right + grid_to.n_[1]) - grid_from.n_[1];
+    const size_t fny1_left_recv  = fny1_left;
+    const size_t fny1_right_recv = (fny1_right + grid_to.n_[0]) - grid_from.n_[0];
+    const size_t fny2_left_recv  = fny2_left;
+    // const size_t fny2_right_recv = (fny2_right + grid_to.n_[2]) - grid_from.n_[2];
+
+    //--- send data from buffer ---------------------------------------------------
+
+    std::vector<MPI_Request> req;
+    MPI_Request temp_req;
+
+    for (size_t i = 0; i < grid_from.size(0); ++i)
+    {
+        size_t iglobal_send = i + offsets_send[CONFIG::MPI_task_rank];
+
+        if (iglobal_send < fny0_left)
+        {
+            size_t iglobal_recv = iglobal_send;
+            int sendto = get_task(iglobal_recv, offsets_recv, CONFIG::MPI_task_size);
+            MPI_Isend(&grid_from.kelem(i * slicesz), (int)slicesz, datatype, sendto,
+                      (int)iglobal_recv, MPI_COMM_WORLD, &temp_req);
+            req.push_back(temp_req);
+            // std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal_send << " to task " << sendto << ", size = " << slicesz << std::endl;
+        }
+        if (iglobal_send > fny0_right)
+        {
+            size_t iglobal_recv = (iglobal_send + grid_to.n_[1]) - grid_from.n_[1];
+            int sendto = get_task(iglobal_recv, offsets_recv, CONFIG::MPI_task_size);
+            MPI_Isend(&grid_from.kelem(i * slicesz), (int)slicesz, datatype, sendto,
+                      (int)iglobal_recv, MPI_COMM_WORLD, &temp_req);
+            req.push_back(temp_req);
+            // std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal_send << " to task " << sendto << ", size = " << slicesz << std::endl;
+        }
+    }
+
+    //--- receive data ------------------------------------------------------------
+    #pragma omp parallel
+    {
+        MPI_Status status;
+        ccomplex_t * recvbuf = new ccomplex_t[ slicesz ];
+
+        #pragma omp for
+        for( size_t i=0; i<grid_to.size(0); ++i )
+        {
+            size_t iglobal_recv = i + offsets_recv[CONFIG::MPI_task_rank];
+
+            if (iglobal_recv < fny0_left_recv || iglobal_recv > fny0_right_recv)
+            {
+                size_t iglobal_send = (iglobal_recv < fny0_left_recv)? iglobal_recv : (iglobal_recv + grid_from.n_[1]) - grid_to.n_[1];
+
+                int recvfrom = get_task(iglobal_send, offsets_send, CONFIG::MPI_task_size);
+
+                // receive data slice and check for MPI errors when in debug mode
+                status.MPI_ERROR = MPI_SUCCESS;
+                MPI_Recv(&recvbuf[0], (int)slicesz, datatype, recvfrom, (int)iglobal_recv, MPI_COMM_WORLD, &status);
+                assert(status.MPI_ERROR == MPI_SUCCESS);
+
+                // copy data slice into new grid, avoiding modes that do not exist in both
+                for( size_t j=0; j<grid_to.size(1); ++j )
+                {
+                    if( j < fny1_left_recv || j > fny1_right_recv )
+                    {
+                        const size_t jsend = (j < fny1_left_recv)? j : (j + grid_from.n_[0]) - grid_to.n_[0];
+
+                        for( size_t k=0; k<fny2_left_recv; ++k )
+                        {
+                            grid_to.kelem(i,j,k) = recvbuf[jsend * grid_from.size(3) + k];
+                        }
+                    }
+                }
+            }
+        }
+
+        delete[] recvbuf;
+    }
+
+    //--- wait for all sends to complete -------------------------------------------
+    MPI_Status status;
+    for (size_t i = 0; i < req.size(); ++i)
+    {
+        // need to preset status, since MPI_Wait does not necessarily modify it on success
+        status.MPI_ERROR = MPI_SUCCESS;
+        MPI_Wait(&req[i], &status);
+        // std::cout << "task " << CONFIG::MPI_task_rank << " : request No" << i << " -> ok!" << std::endl;
+        assert(status.MPI_ERROR == MPI_SUCCESS);
+    }
+
+    MPI_Barrier(MPI_COMM_WORLD);
+
+    music::dlog.Print("[MPI] Completed scatter for Fourier interpolation/copy, took %fs\n",
+                      get_wtime() - tstart);
+#endif //defined(USE_MPI)
+}
+
 
 #define H5_USE_16_API
 #include <hdf5.h>
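
Usage note: FourierInterpolateCopyTo() transforms the source grid forward itself and leaves the target grid in Fourier space, so the caller has to transform the target backward afterwards. A minimal sketch of upsampling a field, assuming the Grid_FFT<real_t> constructor taking the grid dimensions and box lengths used elsewhere in the codebase (the grid sizes and box length below are placeholder values, not from this patch):

#include "grid_fft.hh"

void example_fourier_upsample()
{
    const double boxlen = 100.0; // placeholder box size in code units

    // source grid at 128^3 and target grid at 256^3 resolution (placeholder sizes)
    Grid_FFT<real_t> coarse({128, 128, 128}, {boxlen, boxlen, boxlen});
    Grid_FFT<real_t> fine({256, 256, 256}, {boxlen, boxlen, boxlen});

    // ... fill `coarse` with data in real space ...

    // transforms `coarse` to Fourier space, copies all modes representable
    // on both grids into `fine`, and leaves the remaining modes of `fine` zero
    coarse.FourierInterpolateCopyTo(fine);

    // `fine` is left in Fourier space by the copy, so transform it back
    fine.FourierTransformBackward();
}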
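
The Nyquist-mode bookkeeping is easiest to check in one dimension. The sketch below (illustration only, not part of the patch) applies the same fny*_left/right logic to a single dimension of size n_from = 8 interpolated onto n_to = 6: modes below the common Nyquist cut are copied with unchanged index, negative frequencies are copied with a shifted index, and everything else, including the Nyquist plane itself, stays zero:

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t n_from = 8, n_to = 6; // example sizes (downsampling)

    // in source indexing, modes below fny_left and above fny_right exist on both grids
    const std::size_t fny_left  = std::min(n_from / 2, n_to / 2);
    const std::size_t fny_right = std::max(n_from - n_to / 2, n_from / 2);

    // the same cut expressed in target-array indices
    const std::size_t fny_left_recv  = fny_left;
    const std::size_t fny_right_recv = (fny_right + n_to) - n_from;

    for (std::size_t i = 0; i < n_to; ++i)
    {
        if (i < fny_left_recv || i > fny_right_recv)
        {
            // positive frequencies keep their index, negative ones are shifted
            const std::size_t isend = (i < fny_left_recv) ? i : (i + n_from) - n_to;
            std::printf("to[%zu] <- from[%zu]\n", i, isend);
        }
        else
            std::printf("to[%zu] <- 0 (mode not representable on both grids)\n", i);
    }
    return 0;
}

For these sizes the sketch prints to[0..2] <- from[0..2] for the positive frequencies and to[4] <- from[6], to[5] <- from[7] for the negative ones, while the target Nyquist slot to[3] stays zero, which is exactly the behaviour of the triple loops in the routine above.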