diff --git a/include/convolution.hh b/include/convolution.hh
new file mode 100644
index 0000000..bc35818
--- /dev/null
+++ b/include/convolution.hh
@@ -0,0 +1,579 @@
+#pragma once
+
+#include <array>
+
+#include <general.hh>
+#include <grid_fft.hh>
+
+//! convolution class, respecting Orszag's 3/2 rule
+template< typename data_t >
+class OrszagConvolver
+{
+protected:
+    Grid_FFT<data_t> *f1p_, *f2p_;
+#ifdef USE_MPI
+    Grid_FFT<data_t> *fMPIbuf_;
+#endif
+    std::array<size_t, 3> np_;
+    std::array<real_t, 3> length_;
+
+    ccomplex_t *crecvbuf_;
+    real_t *recvbuf_;
+    ptrdiff_t *offsets_;
+    ptrdiff_t *offsetsp_;
+    ptrdiff_t *sizes_;
+    ptrdiff_t *sizesp_;
+
+private:
+    int get_task( ptrdiff_t index, const ptrdiff_t *offsets, const ptrdiff_t *sizes, const int ntasks ) const
+    {
+        int itask = 0;
+        while( itask < ntasks-1 && offsets[itask+1] <= index ) ++itask;
+        return itask;
+    }
+
+    // void pad_insert( const Grid_FFT<data_t> & f, Grid_FFT<data_t> & fp );
+    // void unpad( const Grid_FFT<data_t> & fp, Grid_FFT< data_t > & f );
+
+public:
+
+
+    OrszagConvolver( const std::array<size_t, 3> &N, const std::array<real_t, 3> &L )
+        : np_({3*N[0]/2, 3*N[1]/2, 3*N[2]/2}), length_(L)
+    {
+        //... create temporaries
+        f1p_ = new Grid_FFT<data_t>(np_, length_, kspace_id);
+        f2p_ = new Grid_FFT<data_t>(np_, length_, kspace_id);
+
+#if defined(USE_MPI)
+        fMPIbuf_ = new Grid_FFT<data_t>(N, length_, kspace_id);
+        size_t maxslicesz = f1p_->sizes_[1] * f1p_->sizes_[3] * 2;
+
+        crecvbuf_ = new ccomplex_t[maxslicesz / 2];
+        recvbuf_ = reinterpret_cast<real_t *>(&crecvbuf_[0]);
+
+        int ntasks(MPI_Get_size());
+
+        offsets_ = new ptrdiff_t[ntasks];
+        offsetsp_ = new ptrdiff_t[ntasks];
+        sizes_ = new ptrdiff_t[ntasks];
+        sizesp_ = new ptrdiff_t[ntasks];
+
+        size_t tsize = N[0], tsizep = f1p_->size(0);
+
+        MPI_Allgather(&fMPIbuf_->local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+        MPI_Allgather(&f1p_->local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+        MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG,
+                      MPI_COMM_WORLD);
+        MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG,
+                      MPI_COMM_WORLD);
+#endif
+    }
+
+    ~OrszagConvolver()
+    {
+        delete f1p_;
+        delete f2p_;
+#if defined(USE_MPI)
+        delete fMPIbuf_;
+        delete[] crecvbuf_;
+        delete[] offsets_;
+        delete[] offsetsp_;
+        delete[] sizes_;
+        delete[] sizesp_;
+#endif
+    }
+
+    template< typename opp >
+    void convolve_Hessians( Grid_FFT<data_t> & inl, const std::array<int, 2>& d2l, Grid_FFT<data_t> & inr, const std::array<int, 2>& d2r, Grid_FFT<data_t> & res, opp op ){
+        // transform to FS in case fields are not
+        inl.FourierTransformForward();
+        inr.FourierTransformForward();
+        // perform convolution of Hessians
+        this->convolve2(
+            [&]( size_t i, size_t j, size_t k ) -> ccomplex_t {
+                auto kk = inl.template get_k<real_t>(i,j,k);
+                return -kk[d2l[0]] * kk[d2l[1]] * inl.kelem(i,j,k);
+            },
+            [&]( size_t i, size_t j, size_t k ){
+                auto kk = inr.template get_k<real_t>(i,j,k);
+                return -kk[d2r[0]] * kk[d2r[1]] * inr.kelem(i,j,k);
+            }, res, op );
+    }
+
+    template< typename opp >
+    void convolve_SumHessians( Grid_FFT<data_t> & inl, const std::array<int, 2>& d2l, Grid_FFT<data_t> & inr, const std::array<int, 2>& d2r1,
+                               const std::array<int, 2>& d2r2, Grid_FFT<data_t> & res, opp op ){
+        // transform to FS in case fields are not
+        inl.FourierTransformForward();
+        inr.FourierTransformForward();
+        // perform convolution of Hessians
+        this->convolve2(
+            [&]( size_t i, size_t j, size_t k ) -> ccomplex_t {
+                auto kk = inl.template get_k<real_t>(i,j,k);
+                return -kk[d2l[0]] * kk[d2l[1]] * inl.kelem(i,j,k);
+            },
+            [&]( size_t i, size_t j, size_t k ){
+                auto kk = inr.template get_k<real_t>(i,j,k);
+                return (-kk[d2r1[0]] * kk[d2r1[1]] - kk[d2r2[0]] * kk[d2r2[1]]) * inr.kelem(i,j,k);
+            }, res, op );
+    }
+
+    template< typename kfunc1, typename kfunc2, typename opp >
+    void convolve2( kfunc1 kf1, kfunc2 kf2, Grid_FFT<data_t> & res, opp op )
+    {
+        //... prepare data 1
+        f1p_->FourierTransformForward(false);
+        this->pad_insert( kf1, *f1p_ );
+
+        //... prepare data 2
+        f2p_->FourierTransformForward(false);
+        this->pad_insert( kf2, *f2p_ );
+
+        //... convolve
+        f1p_->FourierTransformBackward();
+        f2p_->FourierTransformBackward();
+
+        #pragma omp parallel for
+        for (size_t i = 0; i < f1p_->ntot_; ++i){
+            (*f2p_).relem(i) *= (*f1p_).relem(i);
+        }
+        f2p_->FourierTransformForward();
+        //... copy data back
+        res.FourierTransformForward();
+        unpad(*f2p_, res, op);
+    }
+
+    //... inplace interface
+    /*void convolve3( const Grid_FFT<data_t> & f1, const Grid_FFT<data_t> & f2, const Grid_FFT<data_t> & f3, Grid_FFT<data_t> & res )
+    {
+        convolve2( f1, f2, res );
+        convolve2( res, f3, res );
+    }*/
+
+
+private:
+    template <typename kdep_functor>
+    void pad_insert( kdep_functor kfunc, Grid_FFT<data_t> &fp ){
+        assert( fp.space_ == kspace_id );
+
+
+        size_t dn[3] = {
+            fp.n_[0]/3,// fp.n_[0] - f.n_[0],
+            fp.n_[1]/3,// fp.n_[1] - f.n_[1],
+            fp.n_[2]/3// fp.n_[2] - f.n_[2],
+        };
+        const double rfac = std::pow(1.5,1.5);//std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(f.n_[0] * f.n_[1] * f.n_[2]);
+
+        fp.zero();
+
+        #if !defined(USE_MPI) ////////////////////////////////////////////////////////////////////////////////////
+
+        //size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2};
+        size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};
+
+        #pragma omp parallel for
+        for (size_t i = 0; i < 2*fp.size(0)/3; ++i)
+        {
+            size_t ip = (i > nhalf[0]) ? i + dn[0] : i;
+            for (size_t j = 0; j < 2*fp.size(1)/3; ++j)
+            {
+                size_t jp = (j > nhalf[1]) ? j + dn[1] : j;
+                for (size_t k = 0; k < 2*fp.size(2)/3; ++k)
+                {
+                    size_t kp = (k > nhalf[2]) ? k + dn[2] : k;
+                    // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue;
+                    //fp.kelem(ip, jp, kp) = f.kelem(i, j, k) * rfac;
+                    fp.kelem(ip, jp, kp) = kfunc(i, j, k) * rfac;
+                }
+            }
+        }
+
+        #else /// then USE_MPI is defined ////////////////////////////////////////////////////////////
+
+        throw std::runtime_error("need to implement buffering before sending for MPI");
+
+        MPI_Barrier(MPI_COMM_WORLD);
+
+        /////////////////////////////////////////////////////////////////////
+        size_t maxslicesz = fp.sizes_[1] * fp.sizes_[3] * 2;
+
+        std::vector<ccomplex_t> crecvbuf_(maxslicesz / 2, 0);
+        real_t *recvbuf_ = reinterpret_cast<real_t *>(&crecvbuf_[0]);
+
+        std::vector<ptrdiff_t>
+            offsets_(CONFIG::MPI_task_size, 0),
+            offsetsp_(CONFIG::MPI_task_size, 0),
+            sizes_(CONFIG::MPI_task_size, 0),
+            sizesp_(CONFIG::MPI_task_size, 0);
+
+        size_t tsize = f.size(0), tsizep = fp.size(0);
+
+        MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+        MPI_Allgather(&fp.local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1,
+                      MPI_LONG_LONG, MPI_COMM_WORLD);
+        MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG,
+                      MPI_COMM_WORLD);
+        MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG,
+                      MPI_COMM_WORLD);
+        /////////////////////////////////////////////////////////////////////
+
+        double tstart = get_wtime();
+        csoca::dlog << "[MPI] Started scatter for convolution" << std::endl;
+
+        //... collect offsets
+
+        assert(f.space_ == kspace_id);
+
+        size_t nf[3] = {f.size(0), f.size(1), f.size(2)};
+        size_t nfp[3] = {fp.size(0), fp.size(1), fp.size(2)};
+        size_t fny[3] = {f.n_[1] / 2, f.n_[0] / 2, f.n_[2] / 2};
+
+        //... 
local size must be divisible by 2, otherwise this gets too complicated + assert(f.n_[1] % 2 == 0); + + size_t slicesz = f.size(1) * f.size(3); //*2; + + // comunicate + if (typeid(data_t) == typeid(real_t)) + slicesz *= 2; // then sizeof(real_t) gives only half of a complex + + MPI_Datatype datatype = + (typeid(data_t) == typeid(float)) + ? MPI_FLOAT + : (typeid(data_t) == typeid(double)) + ? MPI_DOUBLE + : (typeid(data_t) == typeid(std::complex)) + ? MPI_COMPLEX + : (typeid(data_t) == typeid(std::complex)) + ? MPI_DOUBLE_COMPLEX + : MPI_INT; + + MPI_Status status; + + std::vector req; + MPI_Request temp_req; + + // fill MPI send buffer + fMPIbuf_.FourierTransformForward(false); + #pragma omp parallel for + for (size_t i = 0; i < 2*fp.size(0)/3; ++i) + { + size_t ip = (i > nhalf[0]) ? i + dn[0] : i; + for (size_t j = 0; j < 2*fp.size(1)/3; ++j) + { + size_t jp = (j > nhalf[1]) ? j + dn[1] : j; + for (size_t k = 0; k < 2*fp.size(2)/3; ++k) + { + size_t kp = (k > nhalf[2]) ? k + dn[2] : k; + // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue; + //fp.kelem(ip, jp, kp) = f.kelem(i, j, k) * rfac; + fMPIbuf_.kelem(ip, jp, kp) = kfunc(i, j, k) * rfac; + } + } + } + + + // send data from buffer + for (size_t i = 0; i < nf[0]; ++i) + { + size_t iglobal = i + offsets_[CONFIG::MPI_task_rank]; + + if (iglobal < fny[0]) + { + int sendto = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size); + MPI_Isend(&fMPIbuf_.kelem(i * slicesz), (int)slicesz, datatype, sendto, + (int)iglobal, MPI_COMM_WORLD, &temp_req); + req.push_back(temp_req); + // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": + // Isend #" << iglobal << " to task " << sendto << std::endl; + } + if (iglobal > fny[0]) + { + int sendto = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size); + MPI_Isend(&fMPIbuf_.kelem(i * slicesz), (int)slicesz, datatype, sendto, + (int)(iglobal + fny[0]), MPI_COMM_WORLD, &temp_req); + req.push_back(temp_req); + // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": + // Isend #" << iglobal+fny[0] << " to task " << sendto << std::endl; + } + } + + for (size_t i = 0; i < nfp[0]; ++i) + { + size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank]; + + if (iglobal < fny[0] || iglobal > 2 * fny[0]) + { + int recvfrom = 0; + if (iglobal <= fny[0]) + recvfrom = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size); + else + recvfrom = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size); + + // ofs << "task " << CONFIG::MPI_task_rank << " : receive #" << iglobal << " from task " + // << recvfrom << std::endl; + + MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal, + MPI_COMM_WORLD, &status); + // ofs << "---> ok! " << (bool)(status.Get_error()==MPI::SUCCESS) << + // std::endl; + + assert(status.MPI_ERROR == MPI_SUCCESS); + + for (size_t j = 0; j < nf[1]; ++j) + { + if (j < fny[1]) + { + size_t jp = j; + for (size_t k = 0; k < nf[2]; ++k) + { + // size_t kp = (k>fny[2])? k+fny[2] : k; + if (k < fny[2]) + fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; + else if (k > fny[2]) + fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; + } + } + + else if (j > fny[1]) + { + size_t jp = j + fny[1]; + for (size_t k = 0; k < nf[2]; ++k) + { + // size_t kp = (k>fny[2])? 
k+fny[2] : k; + // fp.kelem(i,jp,kp) = crecvbuf_[j*f.sizes_[3]+k]; + if (k < fny[2]) + fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; + else if (k > fny[2]) + fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; + } + } + } + } + } + + for (size_t i = 0; i < req.size(); ++i) + { + // need to set status as wait does not necessarily modify it + // c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php + status.MPI_ERROR = MPI_SUCCESS; + // ofs << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl; + MPI_Wait(&req[i], &status); + // ofs << "---> ok!" << std::endl; + assert(status.MPI_ERROR == MPI_SUCCESS); + } + + // usleep(1000); + + MPI_Barrier(MPI_COMM_WORLD); + + // std::cerr << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! <<<<<" + // << std::endl; ofs << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! + // <<<<<" << std::endl; + csoca::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n", + get_wtime() - tstart); + + #endif /// end of ifdef/ifndef USE_MPI /////////////////////////////////////////////////////////////// + } + + + template + void unpad(const Grid_FFT &fp, Grid_FFT &f, operator_t op ) + { + // assert(fp.n_[0] == 3 * f.n_[0] / 2); + // assert(fp.n_[1] == 3 * f.n_[1] / 2); + // assert(fp.n_[2] == 3 * f.n_[2] / 2); + + // make sure we're in Fourier space... + assert( fp.space_ == kspace_id ); + f.FourierTransformForward(); + + size_t dn[3] = { + fp.n_[0] - f.n_[0], + fp.n_[1] - f.n_[1], + fp.n_[2] - f.n_[2], + }; + + const double rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(f.n_[0] * f.n_[1] * f.n_[2]); + + #if !defined(USE_MPI) //////////////////////////////////////////////////////////////////////////////////// + + size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2}; + + for (size_t i = 0; i < f.size(0); ++i) + { + size_t ip = (i > nhalf[0]) ? i + dn[0] : i; + for (size_t j = 0; j < f.size(1); ++j) + { + size_t jp = (j > nhalf[1]) ? j + dn[1] : j; + for (size_t k = 0; k < f.size(2); ++k) + { + size_t kp = (k > nhalf[2]) ? 
k + dn[2] : k; + // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue; + f.kelem(i, j, k) = op(fp.kelem(ip, jp, kp) / rfac, f.kelem(i, j, k)); + } + } + } + + #else /// then USE_MPI is defined ////////////////////////////////////////////////////////////// + + ///////////////////////////////////////////////////////////////////// + size_t maxslicesz = fp.sizes_[1] * fp.sizes_[3] * 2; + + std::vector crecvbuf_(maxslicesz / 2,0); + real_t* recvbuf_ = reinterpret_cast(&crecvbuf_[0]); + + std::vector + offsets_(CONFIG::MPI_task_size, 0), + offsetsp_(CONFIG::MPI_task_size, 0), + sizes_(CONFIG::MPI_task_size, 0), + sizesp_(CONFIG::MPI_task_size, 0); + + size_t tsize = f.size(0), tsizep = fp.size(0); + + MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1, + MPI_LONG_LONG, MPI_COMM_WORLD); + MPI_Allgather(&fp.local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1, + MPI_LONG_LONG, MPI_COMM_WORLD); + MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG, + MPI_COMM_WORLD); + MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG, + MPI_COMM_WORLD); + ///////////////////////////////////////////////////////////////////// + + double tstart = get_wtime(); + + csoca::ilog << "[MPI] Started gather for convolution"; + + MPI_Barrier(MPI_COMM_WORLD); + + size_t nf[3] = {f.size(0), f.size(1), f.size(2)}; + size_t nfp[4] = {fp.size(0), fp.size(1), fp.size(2), fp.size(3)}; + size_t fny[3] = {f.n_[1] / 2, f.n_[0] / 2, f.n_[2] / 2}; + + size_t slicesz = fp.size(1) * fp.size(3); + + if (typeid(data_t) == typeid(real_t)) + slicesz *= 2; // then sizeof(real_t) gives only half of a complex + + MPI_Datatype datatype = + (typeid(data_t) == typeid(float)) + ? MPI_FLOAT + : (typeid(data_t) == typeid(double)) + ? MPI_DOUBLE + : (typeid(data_t) == typeid(std::complex)) + ? MPI_COMPLEX + : (typeid(data_t) == typeid(std::complex)) + ? MPI_DOUBLE_COMPLEX + : MPI_INT; + + MPI_Status status; + + //... local size must be divisible by 2, otherwise this gets too complicated + // assert( tsize%2 == 0 ); + + f.zero(); + + std::vector req; + MPI_Request temp_req; + + for (size_t i = 0; i < nfp[0]; ++i) + { + size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank]; + + //... sending + if (iglobal < fny[0]) + { + int sendto = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size); + + MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal, + MPI_COMM_WORLD, &temp_req); + req.push_back(temp_req); + } + else if (iglobal > 2 * fny[0]) + { + int sendto = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size); + MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal, + MPI_COMM_WORLD, &temp_req); + req.push_back(temp_req); + } + } + + for (size_t i = 0; i < nf[0]; ++i) + { + size_t iglobal = i + offsets_[CONFIG::MPI_task_rank]; + + int recvfrom = 0; + if (iglobal < fny[0]) + { + recvfrom = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size); + MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal, + MPI_COMM_WORLD, &status); + } + else if (iglobal > fny[0]) + { + recvfrom = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size); + MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, + (int)(iglobal + fny[0]), MPI_COMM_WORLD, &status); + } + else + continue; + + assert(status.MPI_ERROR == MPI_SUCCESS); + + for (size_t j = 0; j < nf[1]; ++j) + { + + if (j < fny[1]) + { + size_t jp = j; + for (size_t k = 0; k < nf[2]; ++k) + { + // size_t kp = (k>fny[2])? 
k+fny[2] : k; + // f.kelem(i,j,k) = crecvbuf_[jp*nfp[3]+kp]; + if (k < fny[2]) + f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k],f.kelem(i, j, k)); + else if (k > fny[2]) + f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k + fny[2]], f.kelem(i, j, k)); + } + } + if (j > fny[1]) + { + size_t jp = j + fny[1]; + for (size_t k = 0; k < nf[2]; ++k) + { + // size_t kp = (k>fny[2])? k+fny[2] : k; + // f.kelem(i,j,k) = crecvbuf_[jp*nfp[3]+kp]; + if (k < fny[2]) + f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k], f.kelem(i, j, k)); + else if (k > fny[2]) + f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k + fny[2]], f.kelem(i, j, k)); + } + } + } + } + + for (size_t i = 0; i < req.size(); ++i) + { + // need to preset status as wait does not necessarily modify it to reflect + // success c.f. + // http://www.open-mpi.org/community/lists/devel/2007/04/1402.php + status.MPI_ERROR = MPI_SUCCESS; + + MPI_Wait(&req[i], &status); + assert(status.MPI_ERROR == MPI_SUCCESS); + } + + MPI_Barrier(MPI_COMM_WORLD); + + csoca::ilog.Print("[MPI] Completed gather for convolution, took %fs", get_wtime() - tstart); + + #endif /// end of ifdef/ifndef USE_MPI ////////////////////////////////////////////////////////////// + } + + +}; diff --git a/include/general.hh b/include/general.hh index f8b2578..0461d76 100644 --- a/include/general.hh +++ b/include/general.hh @@ -72,6 +72,17 @@ inline int MPI_Get_size( void ){ #endif #endif +// get task based on offsets +template +int get_task(ptrdiff_t index, const array_type &offsets, const array_type& sizes, + const int ntasks ) +{ + int itask = 0; + while (itask < ntasks - 1 && offsets[itask + 1] <= index) + ++itask; + return itask; +} + namespace CONFIG { extern int MPI_thread_support; diff --git a/include/grid_fft.hh b/include/grid_fft.hh index 3b37c5c..4e9fedf 100644 --- a/include/grid_fft.hh +++ b/include/grid_fft.hh @@ -14,231 +14,6 @@ enum space_t rspace_id }; -template< typename data_t > -class Grid_FFT; - -template -int get_task(ptrdiff_t index, const array_type &offsets, const array_type& sizes, - const int ntasks ) -{ - int itask = 0; - while (itask < ntasks - 1 && offsets[itask + 1] <= index) - ++itask; - return itask; -} - -// template -// void unpad(const Grid_FFT &fp, Grid_FFT &f, operator_t op ); - -template -void pad_insert(const Grid_FFT &f, Grid_FFT &fp); - -template -void pad_insertf( kdep_functor kfunc, Grid_FFT &fp ){ - assert( fp.space_ == kspace_id ); - - - size_t dn[3] = { - fp.n_[0]/3,// fp.n_[0] - f.n_[0], - fp.n_[1]/3,// fp.n_[1] - f.n_[1], - fp.n_[2]/3// fp.n_[2] - f.n_[2], - }; - const double rfac = std::pow(1.5,1.5);//std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(f.n_[0] * f.n_[1] * f.n_[2]); - - fp.zero(); - -#if !defined(USE_MPI) //////////////////////////////////////////////////////////////////////////////////// - - //size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2}; - size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3}; - - #pragma omp parallel for - for (size_t i = 0; i < 2*fp.size(0)/3; ++i) - { - size_t ip = (i > nhalf[0]) ? i + dn[0] : i; - for (size_t j = 0; j < 2*fp.size(1)/3; ++j) - { - size_t jp = (j > nhalf[1]) ? j + dn[1] : j; - for (size_t k = 0; k < 2*fp.size(2)/3; ++k) - { - size_t kp = (k > nhalf[2]) ? 
k + dn[2] : k; - // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue; - //fp.kelem(ip, jp, kp) = f.kelem(i, j, k) * rfac; - fp.kelem(ip, jp, kp) = kfunc(i, j, k) * rfac; - } - } - } - -#else /// then USE_MPI is defined //////////////////////////////////////////////////////////// - - throw std::runtime_error("need to implement buffering before sending for MPI"); - - MPI_Barrier(MPI_COMM_WORLD); - - ///////////////////////////////////////////////////////////////////// - size_t maxslicesz = fp.sizes_[1] * fp.sizes_[3] * 2; - - std::vector crecvbuf_(maxslicesz / 2, 0); - real_t *recvbuf_ = reinterpret_cast(&crecvbuf_[0]); - - std::vector - offsets_(CONFIG::MPI_task_size, 0), - offsetsp_(CONFIG::MPI_task_size, 0), - sizes_(CONFIG::MPI_task_size, 0), - sizesp_(CONFIG::MPI_task_size, 0); - - size_t tsize = f.size(0), tsizep = fp.size(0); - - MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&fp.local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - ///////////////////////////////////////////////////////////////////// - - double tstart = get_wtime(); - csoca::dlog << "[MPI] Started scatter for convolution" << std::endl; - - //... collect offsets - - assert(f.space_ == kspace_id); - - size_t nf[3] = {f.size(0), f.size(1), f.size(2)}; - size_t nfp[3] = {fp.size(0), fp.size(1), fp.size(2)}; - size_t fny[3] = {f.n_[1] / 2, f.n_[0] / 2, f.n_[2] / 2}; - - //... local size must be divisible by 2, otherwise this gets too complicated - assert(f.n_[1] % 2 == 0); - - size_t slicesz = f.size(1) * f.size(3); //*2; - - // comunicate - if (typeid(data_t) == typeid(real_t)) - slicesz *= 2; // then sizeof(real_t) gives only half of a complex - - MPI_Datatype datatype = - (typeid(data_t) == typeid(float)) - ? MPI_FLOAT - : (typeid(data_t) == typeid(double)) - ? MPI_DOUBLE - : (typeid(data_t) == typeid(std::complex)) - ? MPI_COMPLEX - : (typeid(data_t) == typeid(std::complex)) - ? 
MPI_DOUBLE_COMPLEX - : MPI_INT; - - MPI_Status status; - - std::vector req; - MPI_Request temp_req; - - for (size_t i = 0; i < nf[0]; ++i) - { - size_t iglobal = i + offsets_[CONFIG::MPI_task_rank]; - - if (iglobal < fny[0]) - { - int sendto = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Isend(&f.kelem(i * slicesz), (int)slicesz, datatype, sendto, - (int)iglobal, MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": - // Isend #" << iglobal << " to task " << sendto << std::endl; - } - if (iglobal > fny[0]) - { - int sendto = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Isend(&f.kelem(i * slicesz), (int)slicesz, datatype, sendto, - (int)(iglobal + fny[0]), MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": - // Isend #" << iglobal+fny[0] << " to task " << sendto << std::endl; - } - } - - for (size_t i = 0; i < nfp[0]; ++i) - { - size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank]; - - if (iglobal < fny[0] || iglobal > 2 * fny[0]) - { - int recvfrom = 0; - if (iglobal <= fny[0]) - recvfrom = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size); - else - recvfrom = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size); - - // ofs << "task " << CONFIG::MPI_task_rank << " : receive #" << iglobal << " from task " - // << recvfrom << std::endl; - - MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal, - MPI_COMM_WORLD, &status); - // ofs << "---> ok! " << (bool)(status.Get_error()==MPI::SUCCESS) << - // std::endl; - - assert(status.MPI_ERROR == MPI_SUCCESS); - - for (size_t j = 0; j < nf[1]; ++j) - { - if (j < fny[1]) - { - size_t jp = j; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? k+fny[2] : k; - if (k < fny[2]) - fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; - else if (k > fny[2]) - fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; - } - } - - else if (j > fny[1]) - { - size_t jp = j + fny[1]; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? k+fny[2] : k; - // fp.kelem(i,jp,kp) = crecvbuf_[j*f.sizes_[3]+k]; - if (k < fny[2]) - fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; - else if (k > fny[2]) - fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; - } - } - } - } - } - - for (size_t i = 0; i < req.size(); ++i) - { - // need to set status as wait does not necessarily modify it - // c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php - status.MPI_ERROR = MPI_SUCCESS; - // ofs << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl; - MPI_Wait(&req[i], &status); - // ofs << "---> ok!" << std::endl; - assert(status.MPI_ERROR == MPI_SUCCESS); - } - - // usleep(1000); - - MPI_Barrier(MPI_COMM_WORLD); - - // std::cerr << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! <<<<<" - // << std::endl; ofs << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! 
- // <<<<<" << std::endl; - csoca::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n", - get_wtime() - tstart); - -#endif /// end of ifdef/ifndef USE_MPI /////////////////////////////////////////////////////////////// -} - - - template class Grid_FFT @@ -439,8 +214,7 @@ public: double compute_2norm(void) { real_t sum1{0.0}; -#pragma omp parallel for reduction(+ \ - : sum1) +#pragma omp parallel for reduction(+ : sum1) for (size_t i = 0; i < sizes_[0]; ++i) { for (size_t j = 0; j < sizes_[1]; ++j) @@ -462,8 +236,7 @@ public: double std(void) { real_t sum1{0.0}, sum2{0.0}; -#pragma omp parallel for reduction(+ \ - : sum1, sum2) +#pragma omp parallel for reduction(+ : sum1, sum2) for (size_t i = 0; i < sizes_[0]; ++i) { for (size_t j = 0; j < sizes_[1]; ++j) @@ -485,8 +258,7 @@ public: double mean(void) { real_t sum1{0.0}; -#pragma omp parallel for reduction(+ \ - : sum1) +#pragma omp parallel for reduction(+ : sum1) for (size_t i = 0; i < sizes_[0]; ++i) { for (size_t j = 0; j < sizes_[1]; ++j) @@ -667,379 +439,3 @@ public: } } }; - - - -template -void unpad(const Grid_FFT &fp, Grid_FFT &f, operator_t op ) -{ - // assert(fp.n_[0] == 3 * f.n_[0] / 2); - // assert(fp.n_[1] == 3 * f.n_[1] / 2); - // assert(fp.n_[2] == 3 * f.n_[2] / 2); - - size_t dn[3] = { - fp.n_[0] - f.n_[0], - fp.n_[1] - f.n_[1], - fp.n_[2] - f.n_[2], - }; - - const double rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(f.n_[0] * f.n_[1] * f.n_[2]); - -#if !defined(USE_MPI) //////////////////////////////////////////////////////////////////////////////////// - - size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2}; - - for (size_t i = 0; i < f.size(0); ++i) - { - size_t ip = (i > nhalf[0]) ? i + dn[0] : i; - for (size_t j = 0; j < f.size(1); ++j) - { - size_t jp = (j > nhalf[1]) ? j + dn[1] : j; - for (size_t k = 0; k < f.size(2); ++k) - { - size_t kp = (k > nhalf[2]) ? 
k + dn[2] : k; - // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue; - f.kelem(i, j, k) = op(fp.kelem(ip, jp, kp) / rfac, f.kelem(i, j, k)); - } - } - } - -#else /// then USE_MPI is defined ////////////////////////////////////////////////////////////// - - ///////////////////////////////////////////////////////////////////// - size_t maxslicesz = fp.sizes_[1] * fp.sizes_[3] * 2; - - std::vector crecvbuf_(maxslicesz / 2,0); - real_t* recvbuf_ = reinterpret_cast(&crecvbuf_[0]); - - std::vector - offsets_(CONFIG::MPI_task_size, 0), - offsetsp_(CONFIG::MPI_task_size, 0), - sizes_(CONFIG::MPI_task_size, 0), - sizesp_(CONFIG::MPI_task_size, 0); - - size_t tsize = f.size(0), tsizep = fp.size(0); - - MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&fp.local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - ///////////////////////////////////////////////////////////////////// - - double tstart = get_wtime(); - - csoca::ilog << "[MPI] Started gather for convolution"; - - MPI_Barrier(MPI_COMM_WORLD); - - size_t nf[3] = {f.size(0), f.size(1), f.size(2)}; - size_t nfp[4] = {fp.size(0), fp.size(1), fp.size(2), fp.size(3)}; - size_t fny[3] = {f.n_[1] / 2, f.n_[0] / 2, f.n_[2] / 2}; - - size_t slicesz = fp.size(1) * fp.size(3); - - if (typeid(data_t) == typeid(real_t)) - slicesz *= 2; // then sizeof(real_t) gives only half of a complex - - MPI_Datatype datatype = - (typeid(data_t) == typeid(float)) - ? MPI_FLOAT - : (typeid(data_t) == typeid(double)) - ? MPI_DOUBLE - : (typeid(data_t) == typeid(std::complex)) - ? MPI_COMPLEX - : (typeid(data_t) == typeid(std::complex)) - ? MPI_DOUBLE_COMPLEX - : MPI_INT; - - MPI_Status status; - - //... local size must be divisible by 2, otherwise this gets too complicated - // assert( tsize%2 == 0 ); - - f.zero(); - - std::vector req; - MPI_Request temp_req; - - for (size_t i = 0; i < nfp[0]; ++i) - { - size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank]; - - //... sending - if (iglobal < fny[0]) - { - int sendto = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size); - - MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal, - MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - } - else if (iglobal > 2 * fny[0]) - { - int sendto = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size); - MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal, - MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - } - } - - for (size_t i = 0; i < nf[0]; ++i) - { - size_t iglobal = i + offsets_[CONFIG::MPI_task_rank]; - - int recvfrom = 0; - if (iglobal < fny[0]) - { - recvfrom = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal, - MPI_COMM_WORLD, &status); - } - else if (iglobal > fny[0]) - { - recvfrom = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, - (int)(iglobal + fny[0]), MPI_COMM_WORLD, &status); - } - else - continue; - - assert(status.MPI_ERROR == MPI_SUCCESS); - - for (size_t j = 0; j < nf[1]; ++j) - { - - if (j < fny[1]) - { - size_t jp = j; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? 
k+fny[2] : k; - // f.kelem(i,j,k) = crecvbuf_[jp*nfp[3]+kp]; - if (k < fny[2]) - f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k],f.kelem(i, j, k)); - else if (k > fny[2]) - f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k + fny[2]], f.kelem(i, j, k)); - } - } - if (j > fny[1]) - { - size_t jp = j + fny[1]; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? k+fny[2] : k; - // f.kelem(i,j,k) = crecvbuf_[jp*nfp[3]+kp]; - if (k < fny[2]) - f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k], f.kelem(i, j, k)); - else if (k > fny[2]) - f.kelem(i, j, k) = op(crecvbuf_[jp * nfp[3] + k + fny[2]], f.kelem(i, j, k)); - } - } - } - } - - for (size_t i = 0; i < req.size(); ++i) - { - // need to preset status as wait does not necessarily modify it to reflect - // success c.f. - // http://www.open-mpi.org/community/lists/devel/2007/04/1402.php - status.MPI_ERROR = MPI_SUCCESS; - - MPI_Wait(&req[i], &status); - assert(status.MPI_ERROR == MPI_SUCCESS); - } - - MPI_Barrier(MPI_COMM_WORLD); - - csoca::ilog.Print("[MPI] Completed gather for convolution, took %fs", get_wtime() - tstart); - -#endif /// end of ifdef/ifndef USE_MPI ////////////////////////////////////////////////////////////// -} - - - - -template< typename data_t > -class OrszagConvolver -{ -protected: - Grid_FFT *f1p_, *f2p_; - std::array np_; - std::array length_; - - ccomplex_t *crecvbuf_; - real_t *recvbuf_; - ptrdiff_t *offsets_; - ptrdiff_t *offsetsp_; - ptrdiff_t *sizes_; - ptrdiff_t *sizesp_; - -private: - int get_task( ptrdiff_t index, const ptrdiff_t *offsets, const ptrdiff_t *sizes, const int ntasks ) const - { - int itask = 0; - while( itask < ntasks-1 && offsets[itask+1] <= index ) ++itask; - return itask; - } - - // void pad_insert( const Grid_FFT & f, Grid_FFT & fp ); - // void unpad( const Grid_FFT & fp, Grid_FFT< data_t > & f ); - -public: - - - OrszagConvolver( const std::array &N, const std::array &L ) - : np_({3*N[0]/2,3*N[1]/2,3*N[2]/2}), length_(L) - { - //... 
create temporaries - f1p_ = new Grid_FFT(np_, length_, kspace_id); - f2p_ = new Grid_FFT(np_, length_, kspace_id); - -#if defined(USE_MPI) - size_t maxslicesz = f1p_->sizes_[1] * f1p_->sizes_[3] * 2; - - crecvbuf_ = new ccomplex_t[maxslicesz / 2]; - recvbuf_ = reinterpret_cast(&crecvbuf_[0]); - - int ntasks(MPI_Get_size()); - - offsets_ = new ptrdiff_t[ntasks]; - offsetsp_ = new ptrdiff_t[ntasks]; - sizes_ = new ptrdiff_t[ntasks]; - sizesp_ = new ptrdiff_t[ntasks]; - - size_t tsize = N[0], tsizep = f1p_->size(0); - - MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&f1p_->local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); -#endif - } - - ~OrszagConvolver() - { - delete f1p_; - delete f2p_; -#if defined(USE_MPI) - delete[] crecvbuf_; - delete[] offsets_; - delete[] offsetsp_; - delete[] sizes_; - delete[] sizesp_; -#endif - } - - template< typename opp > - void convolve_Hessians( Grid_FFT & inl, const std::array& d2l, Grid_FFT & inr, const std::array& d2r, Grid_FFT & res, opp op ){ - // transform to FS in case fields are not - inl.FourierTransformForward(); - inr.FourierTransformForward(); - // perform convolution of Hessians - this->convolve2__( - [&]( size_t i, size_t j, size_t k ) -> ccomplex_t{ - auto kk = inl.template get_k(i,j,k); - return -kk[d2l[0]] * kk[d2l[1]] * inl.kelem(i,j,k);// / phifac; - }, - [&]( size_t i, size_t j, size_t k ){ - auto kk = inr.template get_k(i,j,k); - return -kk[d2r[0]] * kk[d2r[1]] * inr.kelem(i,j,k);// / phifac; - }, res, op ); - } - - template< typename opp > - void convolve_SumHessians( Grid_FFT & inl, const std::array& d2l, Grid_FFT & inr, const std::array& d2r1, - const std::array& d2r2, Grid_FFT & res, opp op ){ - // transform to FS in case fields are not - inl.FourierTransformForward(); - inr.FourierTransformForward(); - // perform convolution of Hessians - this->convolve2__( - [&]( size_t i, size_t j, size_t k ) -> ccomplex_t{ - auto kk = inl.template get_k(i,j,k); - return -kk[d2l[0]] * kk[d2l[1]] * inl.kelem(i,j,k);// / phifac; - }, - [&]( size_t i, size_t j, size_t k ){ - auto kk = inr.template get_k(i,j,k); - return (-kk[d2r1[0]] * kk[d2r1[1]] -kk[d2r2[0]] * kk[d2r2[1]]) * inr.kelem(i,j,k); - }, res, op ); - } - - template< typename kfunc1, typename kfunc2, typename opp > - void convolve2__( kfunc1 kf1, kfunc2 kf2, Grid_FFT & res, opp op ) - { - //... prepare data 1 - f1p_->FourierTransformForward(false); - pad_insertf( kf1, *f1p_ ); - - //... prepare data 1 - f2p_->FourierTransformForward(false); - pad_insertf( kf2, *f2p_ ); - - //... convolve - f1p_->FourierTransformBackward(); - f2p_->FourierTransformBackward(); - - #pragma omp parallel for - for (size_t i = 0; i < f1p_->ntot_; ++i){ - (*f2p_).relem(i) *= (*f1p_).relem(i); - } - f2p_->FourierTransformForward(); - //... copy data back - res.FourierTransformForward(); - unpad(*f2p_, res, op); - } - - //... inplace interface - template - void convolve2( Grid_FFT & f1, Grid_FFT & f2, Grid_FFT & res, opp op)// = []( ccomplex_t convres, ccomplex_t res ) -> ccomplex_t{ return convres; } ) - { - #if 1 - // constexpr real_t fac{ std::pow(1.5,1.5) }; - constexpr real_t fac{ 1.0 }; - - //... copy data 1 - f1.FourierTransformForward(); - f1p_->FourierTransformForward(false); - pad_insert(f1, *f1p_); - //... 
copy data 2 - f2.FourierTransformForward(); - f2p_->FourierTransformForward(false); - pad_insert(f2, *f2p_); - //... convolve - f1p_->FourierTransformBackward(); - f2p_->FourierTransformBackward(); - for (size_t i = 0; i < f1p_->ntot_; ++i){ - (*f2p_).relem(i) *= fac * (*f1p_).relem(i); - } - f2p_->FourierTransformForward(); - //... copy data back - res.FourierTransformForward(); - unpad(*f2p_, res, op); - #else - res.FourierTransformBackward(); - f1.FourierTransformBackward(); - f2.FourierTransformBackward(); - - for (size_t i = 0; i < res.ntot_; ++i){ - res.relem(i) = op(f1.relem(i)*f2.relem(i),res.relem(i)); - } - - #endif - } - - //... inplace interface - /*void convolve3( const Grid_FFT & f1, const Grid_FFT & f2, const Grid_FFT & f3, Grid_FFT & res ) - { - convolve2( f1, f2, res ); - convolve2( res, f3, res ); - }*/ -}; diff --git a/src/grid_fft.cc b/src/grid_fft.cc index 7eaa5e9..f2bcb8a 100644 --- a/src/grid_fft.cc +++ b/src/grid_fft.cc @@ -587,226 +587,8 @@ void Grid_FFT::ComputePowerSpectrum(std::vector &bin_k, std::vec } } - - -template -void pad_insert(const Grid_FFT &f, Grid_FFT &fp) -{ - - // assert(fp.n_[0] == 3 * f.n_[0] / 2); - // assert(fp.n_[1] == 3 * f.n_[1] / 2); - // assert(fp.n_[2] == 3 * f.n_[2] / 2); - - assert( f.space_ == kspace_id ); - assert( fp.space_ == kspace_id ); - - - size_t dn[3] = { - fp.n_[0] - f.n_[0], - fp.n_[1] - f.n_[1], - fp.n_[2] - f.n_[2], - }; - const double rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(f.n_[0] * f.n_[1] * f.n_[2]); - - fp.zero(); - -#if !defined(USE_MPI) //////////////////////////////////////////////////////////////////////////////////// - - size_t nhalf[3] = {f.n_[0] / 2, f.n_[1] / 2, f.n_[2] / 2}; - - for (size_t i = 0; i < f.size(0); ++i) - { - size_t ip = (i > nhalf[0]) ? i + dn[0] : i; - for (size_t j = 0; j < f.size(1); ++j) - { - size_t jp = (j > nhalf[1]) ? j + dn[1] : j; - for (size_t k = 0; k < f.size(2); ++k) - { - size_t kp = (k > nhalf[2]) ? k + dn[2] : k; - // if( i==nhalf[0]||j==nhalf[1]||k==nhalf[2]) continue; - fp.kelem(ip, jp, kp) = f.kelem(i, j, k) * rfac; - } - } - } - - -#else /// then USE_MPI is defined //////////////////////////////////////////////////////////// - - MPI_Barrier(MPI_COMM_WORLD); - - ///////////////////////////////////////////////////////////////////// - size_t maxslicesz = fp.sizes_[1] * fp.sizes_[3] * 2; - - std::vector crecvbuf_(maxslicesz / 2, 0); - real_t *recvbuf_ = reinterpret_cast(&crecvbuf_[0]); - - std::vector - offsets_(CONFIG::MPI_task_size, 0), - offsetsp_(CONFIG::MPI_task_size, 0), - sizes_(CONFIG::MPI_task_size, 0), - sizesp_(CONFIG::MPI_task_size, 0); - - size_t tsize = f.size(0), tsizep = fp.size(0); - - MPI_Allgather(&f.local_1_start_, 1, MPI_LONG_LONG, &offsets_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&fp.local_1_start_, 1, MPI_LONG_LONG, &offsetsp_[0], 1, - MPI_LONG_LONG, MPI_COMM_WORLD); - MPI_Allgather(&tsize, 1, MPI_LONG_LONG, &sizes_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - MPI_Allgather(&tsizep, 1, MPI_LONG_LONG, &sizesp_[0], 1, MPI_LONG_LONG, - MPI_COMM_WORLD); - ///////////////////////////////////////////////////////////////////// - - double tstart = get_wtime(); - csoca::dlog << "[MPI] Started scatter for convolution" << std::endl; - - //... collect offsets - - assert(f.space_ == kspace_id); - - size_t nf[3] = {f.size(0), f.size(1), f.size(2)}; - size_t nfp[3] = {fp.size(0), fp.size(1), fp.size(2)}; - size_t fny[3] = {f.n_[1] / 2, f.n_[0] / 2, f.n_[2] / 2}; - - //... 
local size must be divisible by 2, otherwise this gets too complicated - assert(f.n_[1] % 2 == 0); - - size_t slicesz = f.size(1) * f.size(3); //*2; - - // comunicate - if (typeid(data_t) == typeid(real_t)) - slicesz *= 2; // then sizeof(real_t) gives only half of a complex - - MPI_Datatype datatype = - (typeid(data_t) == typeid(float)) - ? MPI_FLOAT - : (typeid(data_t) == typeid(double)) - ? MPI_DOUBLE - : (typeid(data_t) == typeid(std::complex)) - ? MPI_COMPLEX - : (typeid(data_t) == typeid(std::complex)) - ? MPI_DOUBLE_COMPLEX - : MPI_INT; - - MPI_Status status; - - std::vector req; - MPI_Request temp_req; - - for (size_t i = 0; i < nf[0]; ++i) - { - size_t iglobal = i + offsets_[CONFIG::MPI_task_rank]; - - if (iglobal < fny[0]) - { - int sendto = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Isend(&f.kelem(i * slicesz), (int)slicesz, datatype, sendto, - (int)iglobal, MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": - // Isend #" << iglobal << " to task " << sendto << std::endl; - } - if (iglobal > fny[0]) - { - int sendto = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size); - MPI_Isend(&f.kelem(i * slicesz), (int)slicesz, datatype, sendto, - (int)(iglobal + fny[0]), MPI_COMM_WORLD, &temp_req); - req.push_back(temp_req); - // ofs << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": - // Isend #" << iglobal+fny[0] << " to task " << sendto << std::endl; - } - } - - for (size_t i = 0; i < nfp[0]; ++i) - { - size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank]; - - if (iglobal < fny[0] || iglobal > 2 * fny[0]) - { - int recvfrom = 0; - if (iglobal <= fny[0]) - recvfrom = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size); - else - recvfrom = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size); - - // ofs << "task " << CONFIG::MPI_task_rank << " : receive #" << iglobal << " from task " - // << recvfrom << std::endl; - - MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal, - MPI_COMM_WORLD, &status); - // ofs << "---> ok! " << (bool)(status.Get_error()==MPI::SUCCESS) << - // std::endl; - - assert(status.MPI_ERROR == MPI_SUCCESS); - - for (size_t j = 0; j < nf[1]; ++j) - { - if (j < fny[1]) - { - size_t jp = j; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? k+fny[2] : k; - if (k < fny[2]) - fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; - else if (k > fny[2]) - fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; - } - } - - else if (j > fny[1]) - { - size_t jp = j + fny[1]; - for (size_t k = 0; k < nf[2]; ++k) - { - // size_t kp = (k>fny[2])? k+fny[2] : k; - // fp.kelem(i,jp,kp) = crecvbuf_[j*f.sizes_[3]+k]; - if (k < fny[2]) - fp.kelem(i, jp, k) = crecvbuf_[j * f.sizes_[3] + k]; - else if (k > fny[2]) - fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * f.sizes_[3] + k]; - } - } - } - } - } - - for (size_t i = 0; i < req.size(); ++i) - { - // need to set status as wait does not necessarily modify it - // c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php - status.MPI_ERROR = MPI_SUCCESS; - // ofs << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl; - MPI_Wait(&req[i], &status); - // ofs << "---> ok!" << std::endl; - assert(status.MPI_ERROR == MPI_SUCCESS); - } - - // usleep(1000); - - MPI_Barrier(MPI_COMM_WORLD); - - // std::cerr << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! 
<<<<<"
-    // << std::endl; ofs << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed!
-    // <<<<<" << std::endl;
-    csoca::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n",
-                      get_wtime() - tstart);
-
-#endif /// end of ifdef/ifndef USE_MPI ///////////////////////////////////////////////////////////////
-}
-
-
-
 /********************************************************************************************/
 
 template class Grid_FFT<real_t>;
 template class Grid_FFT<ccomplex_t>;
-
-// template void unpad<real_t>(const Grid_FFT<real_t> &fp, Grid_FFT<real_t> &f);
-// template void unpad<ccomplex_t>(const Grid_FFT<ccomplex_t> &fp, Grid_FFT<ccomplex_t> &f);
-
-template void pad_insert<real_t>(const Grid_FFT<real_t> &f, Grid_FFT<real_t> &fp);
-template void pad_insert<ccomplex_t>(const Grid_FFT<ccomplex_t> &f, Grid_FFT<ccomplex_t> &fp);
diff --git a/src/main.cc b/src/main.cc
index 0661764..5d79bbd 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -8,6 +8,7 @@
 
 #include
 #include
+#include <convolution.hh>
 #include
 #include
 
@@ -181,8 +182,7 @@ int main( int argc, char** argv )
 
     #if 1
     // phi_xx * phi_yy
-    Conv.convolve_Hessians( phi, {0,0}, phi, {1,1}, phi2, assign_op );
-    Conv.convolve_Hessians( phi, {0,0}, phi, {2,2}, phi2, add_op );
+    Conv.convolve_SumHessians( phi, {0,0}, phi, {1,1}, {2,2}, phi2, assign_op );
     Conv.convolve_Hessians( phi, {1,1}, phi, {2,2}, phi2, add_op );
     Conv.convolve_Hessians( phi, {0,1}, phi, {0,1}, phi2, sub_op );
     Conv.convolve_Hessians( phi, {0,2}, phi, {0,2}, phi2, sub_op );
@@ -204,9 +204,11 @@
             {
                 size_t idx = phi2.get_idx(i, j, k);
 
-                phi2.relem(idx) = phi_xx.relem(idx)*phi_yy.relem(idx)-phi_xy.relem(idx)*phi_xy.relem(idx)
-                                 +phi_xx.relem(idx)*phi_zz.relem(idx)-phi_xz.relem(idx)*phi_xz.relem(idx)
-                                 +phi_yy.relem(idx)*phi_zz.relem(idx)-phi_yz.relem(idx)*phi_yz.relem(idx);
+                phi2.relem(idx) = phi_xx.relem(idx)*(phi_yy.relem(idx)+phi_zz.relem(idx))
+                                 +phi_yy.relem(idx)*phi_zz.relem(idx)
+                                 -phi_xy.relem(idx)*phi_xy.relem(idx)
+                                 -phi_xz.relem(idx)*phi_xz.relem(idx)
+                                 -phi_yz.relem(idx)*phi_yz.relem(idx);
             }
         }
     }
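
Usage sketch: the snippet below shows how the new OrszagConvolver interface is meant to be driven to build the 2LPT source term, mirroring the main.cc hunk above. The wrapper function build_2lpt_source, the operator definitions and the include paths are illustrative assumptions (the actual assign_op/add_op/sub_op used in main.cc are not shown in this diff), and the final {1,2} call completes the sum but lies outside the visible hunk; only the Conv.* calls correspond to lines in the patch.

    // Illustrative sketch only -- assumes <convolution.hh> / <grid_fft.hh> provide
    // Grid_FFT, real_t and ccomplex_t as used throughout this diff.
    #include <array>
    #include <convolution.hh>
    #include <grid_fft.hh>

    // Build the 2LPT source  sum over i<j of ( phi,ii * phi,jj - phi,ij^2 )  into phi2.
    // (hypothetical helper; main.cc does this inline)
    void build_2lpt_source(Grid_FFT<real_t> &phi, Grid_FFT<real_t> &phi2,
                           const std::array<size_t, 3> &N, const std::array<real_t, 3> &L)
    {
        OrszagConvolver<real_t> Conv(N, L); // allocates the 3/2-padded work grids once

        // op( convolution_result, current_value ) is applied in k-space inside unpad();
        // these definitions are assumed, consistent with the commented hint in the old inplace interface
        auto assign_op = [](ccomplex_t c, ccomplex_t)   -> ccomplex_t { return c; };
        auto add_op    = [](ccomplex_t c, ccomplex_t r) -> ccomplex_t { return r + c; };
        auto sub_op    = [](ccomplex_t c, ccomplex_t r) -> ccomplex_t { return r - c; };

        Conv.convolve_SumHessians(phi, {0,0}, phi, {1,1}, {2,2}, phi2, assign_op); // phi_xx*(phi_yy+phi_zz)
        Conv.convolve_Hessians(phi, {1,1}, phi, {2,2}, phi2, add_op);              // + phi_yy*phi_zz
        Conv.convolve_Hessians(phi, {0,1}, phi, {0,1}, phi2, sub_op);              // - phi_xy^2
        Conv.convolve_Hessians(phi, {0,2}, phi, {0,2}, phi2, sub_op);              // - phi_xz^2
        Conv.convolve_Hessians(phi, {1,2}, phi, {1,2}, phi2, sub_op);              // - phi_yz^2 (outside the shown hunk)
    }

Because the padded work grids are allocated once in the constructor, successive terms reuse the same FFT buffers; only the result field accumulates, term by term, through the op functors.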