mirror of
https://github.com/cosmo-sims/monofonIC.git
synced 2024-09-19 17:03:45 +02:00
Merge branch 'develop' of https://bitbucket.org/ohahn/monofonic into amrex
This commit is contained in:
commit
dd66859b12
7 changed files with 378 additions and 470 deletions
|
@ -427,19 +427,18 @@ public:
|
||||||
// }
|
// }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <typename kdep_functor>
|
|
||||||
void pad_insert(kdep_functor kfunc, Grid_FFT<data_t> &fp)
|
|
||||||
{
|
|
||||||
assert(fp.space_ == kspace_id);
|
|
||||||
|
|
||||||
|
template <typename kdep_functor>
|
||||||
|
void pad_insert( kdep_functor kfunc, Grid_FFT<data_t> &fp)
|
||||||
|
{
|
||||||
const real_t rfac = std::pow(1.5, 1.5);
|
const real_t rfac = std::pow(1.5, 1.5);
|
||||||
|
|
||||||
|
#if !defined(USE_MPI)
|
||||||
|
const size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};
|
||||||
|
|
||||||
fp.zero();
|
fp.zero();
|
||||||
|
|
||||||
#if !defined(USE_MPI) ////////////////////////////////////////////////////////////////////////////////////
|
#pragma omp parallel for
|
||||||
const size_t nhalf[3] = {fp.n_[0] / 3, fp.n_[1] / 3, fp.n_[2] / 3};
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < 2 * fp.size(0) / 3; ++i)
|
for (size_t i = 0; i < 2 * fp.size(0) / 3; ++i)
|
||||||
{
|
{
|
||||||
size_t ip = (i > nhalf[0]) ? i + nhalf[0] : i;
|
size_t ip = (i > nhalf[0]) ? i + nhalf[0] : i;
|
||||||
|
@ -453,37 +452,10 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
#else
|
||||||
#else /// then USE_MPI is defined ////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
fbuf_->FourierTransformForward(false);
|
fbuf_->FourierTransformForward(false);
|
||||||
/////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
double tstart = get_wtime();
|
#pragma omp parallel for
|
||||||
music::dlog << "[MPI] Started scatter for convolution" << std::endl;
|
|
||||||
|
|
||||||
//... collect offsets
|
|
||||||
|
|
||||||
assert(fbuf_->space_ == kspace_id);
|
|
||||||
|
|
||||||
size_t nf[3] = {fbuf_->size(0), fbuf_->size(1), fbuf_->size(2)};
|
|
||||||
size_t nfp[3] = {fp.size(0), fp.size(1), fp.size(2)};
|
|
||||||
size_t fny[3] = {fbuf_->n_[1] / 2, fbuf_->n_[0] / 2, fbuf_->n_[2] / 2};
|
|
||||||
|
|
||||||
//... local size must be divisible by 2, otherwise this gets too complicated
|
|
||||||
assert(fbuf_->n_[1] % 2 == 0);
|
|
||||||
size_t slicesz = fbuf_->size(1) * fbuf_->size(3);
|
|
||||||
|
|
||||||
MPI_Datatype datatype =
|
|
||||||
(typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX
|
|
||||||
: (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX
|
|
||||||
: (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX
|
|
||||||
: MPI_BYTE;
|
|
||||||
|
|
||||||
// fill MPI send buffer with results of kfunc
|
|
||||||
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < fbuf_->size(0); ++i)
|
for (size_t i = 0; i < fbuf_->size(0); ++i)
|
||||||
{
|
{
|
||||||
for (size_t j = 0; j < fbuf_->size(1); ++j)
|
for (size_t j = 0; j < fbuf_->size(1); ++j)
|
||||||
|
@ -495,125 +467,13 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Status status;
|
fbuf_->FourierInterpolateCopyTo( fp );
|
||||||
|
|
||||||
std::vector<MPI_Request> req;
|
#endif //defined(USE_MPI)
|
||||||
MPI_Request temp_req;
|
|
||||||
|
|
||||||
// send data from buffer
|
|
||||||
for (size_t i = 0; i < nf[0]; ++i)
|
|
||||||
{
|
|
||||||
size_t iglobal = i + offsets_[CONFIG::MPI_task_rank];
|
|
||||||
|
|
||||||
if (iglobal <= fny[0])
|
|
||||||
{
|
|
||||||
int sendto = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size);
|
|
||||||
MPI_Isend(&fbuf_->kelem(i * slicesz), (int)slicesz, datatype, sendto,
|
|
||||||
(int)iglobal, MPI_COMM_WORLD, &temp_req);
|
|
||||||
req.push_back(temp_req);
|
|
||||||
// std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal << " to task " << sendto << ", size = " << slicesz << std::endl;
|
|
||||||
}
|
|
||||||
if (iglobal >= fny[0])
|
|
||||||
{
|
|
||||||
int sendto = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size);
|
|
||||||
MPI_Isend(&fbuf_->kelem(i * slicesz), (int)slicesz, datatype, sendto,
|
|
||||||
(int)(iglobal + fny[0]), MPI_COMM_WORLD, &temp_req);
|
|
||||||
req.push_back(temp_req);
|
|
||||||
// std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal+fny[0] << " to task " << sendto << ", size = " << slicesz<< std::endl;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < nfp[0]; ++i)
|
|
||||||
{
|
|
||||||
size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank];
|
|
||||||
|
|
||||||
if (iglobal <= fny[0] || iglobal >= 2 * fny[0])
|
|
||||||
{
|
|
||||||
int recvfrom = 0;
|
|
||||||
if (iglobal <= fny[0])
|
|
||||||
recvfrom = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size);
|
|
||||||
else
|
|
||||||
recvfrom = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size);
|
|
||||||
|
|
||||||
// std::cout << "task " << CONFIG::MPI_task_rank << " : receive #" << iglobal << " from task "
|
|
||||||
// << recvfrom << ", size = " << slicesz << ", " << crecvbuf_ << ", " << datatype << std::endl;
|
|
||||||
status.MPI_ERROR = MPI_SUCCESS;
|
|
||||||
|
|
||||||
MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal,
|
|
||||||
MPI_COMM_WORLD, &status);
|
|
||||||
// std::cout << "---> ok! " << (bool)(status.MPI_ERROR==MPI_SUCCESS) << std::endl;
|
|
||||||
|
|
||||||
assert(status.MPI_ERROR == MPI_SUCCESS);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < nf[1]; ++j)
|
|
||||||
{
|
|
||||||
if (j <= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j;
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
fp.kelem(i, jp, k) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (k <= fny[2])
|
|
||||||
fp.kelem(i, jp, k) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
if (k >= fny[2])
|
|
||||||
fp.kelem(i, jp, k + nf[2] / 2) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (j >= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j + fny[1];
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
fp.kelem(i, jp, k) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (k <= fny[2])
|
|
||||||
fp.kelem(i, jp, k) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
if (k >= fny[2])
|
|
||||||
fp.kelem(i, jp, k + fny[2]) = crecvbuf_[j * fbuf_->sizes_[3] + k];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (size_t i = 0; i < req.size(); ++i)
|
|
||||||
{
|
|
||||||
// need to set status as wait does not necessarily modify it
|
|
||||||
// c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php
|
|
||||||
status.MPI_ERROR = MPI_SUCCESS;
|
|
||||||
// std::cout << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl;
|
|
||||||
MPI_Wait(&req[i], &status);
|
|
||||||
// std::cout << "---> ok!" << std::endl;
|
|
||||||
assert(status.MPI_ERROR == MPI_SUCCESS);
|
|
||||||
}
|
|
||||||
|
|
||||||
// usleep(1000);
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
// std::cerr << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed! <<<<<"
|
|
||||||
// << std::endl; ofs << ">>>>> task " << CONFIG::MPI_task_rank << " all transfers completed!
|
|
||||||
// <<<<<" << std::endl;
|
|
||||||
music::dlog.Print("[MPI] Completed scatter for convolution, took %fs\n",
|
|
||||||
get_wtime() - tstart);
|
|
||||||
|
|
||||||
#endif /// end of ifdef/ifndef USE_MPI ///////////////////////////////////////////////////////////////
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename operator_t>
|
template <typename operator_t>
|
||||||
void unpad(const Grid_FFT<data_t> &fp, operator_t output_op)
|
void unpad( Grid_FFT<data_t> &fp, operator_t output_op)
|
||||||
{
|
{
|
||||||
const real_t rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(fbuf_->n_[0] * fbuf_->n_[1] * fbuf_->n_[2]);
|
const real_t rfac = std::sqrt(fp.n_[0] * fp.n_[1] * fp.n_[2]) / std::sqrt(fbuf_->n_[0] * fbuf_->n_[1] * fbuf_->n_[2]);
|
||||||
|
|
||||||
|
@ -624,7 +484,7 @@ private:
|
||||||
fbuf_->FourierTransformForward(false);
|
fbuf_->FourierTransformForward(false);
|
||||||
size_t nhalf[3] = {fbuf_->n_[0] / 2, fbuf_->n_[1] / 2, fbuf_->n_[2] / 2};
|
size_t nhalf[3] = {fbuf_->n_[0] / 2, fbuf_->n_[1] / 2, fbuf_->n_[2] / 2};
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t i = 0; i < fbuf_->size(0); ++i)
|
for (size_t i = 0; i < fbuf_->size(0); ++i)
|
||||||
{
|
{
|
||||||
size_t ip = (i > nhalf[0]) ? i + nhalf[0] : i;
|
size_t ip = (i > nhalf[0]) ? i + nhalf[0] : i;
|
||||||
|
@ -643,193 +503,6 @@ private:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//... copy data back
|
|
||||||
#pragma omp parallel for
|
|
||||||
for (size_t i = 0; i < fbuf_->ntot_; ++i)
|
|
||||||
{
|
|
||||||
output_op(i, (*fbuf_)[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#else /// then USE_MPI is defined //////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
/////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
double tstart = get_wtime();
|
|
||||||
|
|
||||||
music::dlog << "[MPI] Started gather for convolution";
|
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
|
||||||
|
|
||||||
size_t nf[3] = {fbuf_->size(0), fbuf_->size(1), fbuf_->size(2)};
|
|
||||||
size_t nfp[4] = {fp.size(0), fp.size(1), fp.size(2), fp.size(3)};
|
|
||||||
size_t fny[3] = {fbuf_->n_[1] / 2, fbuf_->n_[0] / 2, fbuf_->n_[2] / 2};
|
|
||||||
|
|
||||||
size_t slicesz = fp.size(1) * fp.size(3);
|
|
||||||
|
|
||||||
MPI_Datatype datatype =
|
|
||||||
(typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX
|
|
||||||
: (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX
|
|
||||||
: (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX
|
|
||||||
: MPI_BYTE;
|
|
||||||
|
|
||||||
MPI_Status status;
|
|
||||||
|
|
||||||
//... local size must be divisible by 2, otherwise this gets too complicated
|
|
||||||
// assert( tsize%2 == 0 );
|
|
||||||
|
|
||||||
std::vector<MPI_Request> req;
|
|
||||||
MPI_Request temp_req;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < nfp[0]; ++i)
|
|
||||||
{
|
|
||||||
size_t iglobal = i + offsetsp_[CONFIG::MPI_task_rank];
|
|
||||||
|
|
||||||
//... sending
|
|
||||||
if (iglobal <= fny[0])
|
|
||||||
{
|
|
||||||
int sendto = get_task(iglobal, offsets_, sizes_, CONFIG::MPI_task_size);
|
|
||||||
|
|
||||||
MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal,
|
|
||||||
MPI_COMM_WORLD, &temp_req);
|
|
||||||
req.push_back(temp_req);
|
|
||||||
}
|
|
||||||
else if (iglobal >= 2 * fny[0])
|
|
||||||
{
|
|
||||||
int sendto = get_task(iglobal - fny[0], offsets_, sizes_, CONFIG::MPI_task_size);
|
|
||||||
MPI_Isend(&fp.kelem(i * slicesz), (int)slicesz, datatype, sendto, (int)iglobal,
|
|
||||||
MPI_COMM_WORLD, &temp_req);
|
|
||||||
req.push_back(temp_req);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fbuf_->zero();
|
|
||||||
|
|
||||||
for (size_t i = 0; i < nf[0]; ++i)
|
|
||||||
{
|
|
||||||
size_t iglobal = i + offsets_[CONFIG::MPI_task_rank];
|
|
||||||
int recvfrom = 0;
|
|
||||||
if (iglobal <= fny[0])
|
|
||||||
{
|
|
||||||
real_t wi = (iglobal == fny[0]) ? 0.0 : 1.0;
|
|
||||||
|
|
||||||
recvfrom = get_task(iglobal, offsetsp_, sizesp_, CONFIG::MPI_task_size);
|
|
||||||
MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom, (int)iglobal,
|
|
||||||
MPI_COMM_WORLD, &status);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < nf[1]; ++j)
|
|
||||||
{
|
|
||||||
real_t wj = (j == fny[1]) ? 0.0 : 1.0;
|
|
||||||
if (j <= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j;
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
real_t w = wi * wj * wk;
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
real_t w = wi * wj * wk;
|
|
||||||
if (k < fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
if (k > fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
|
|
||||||
// if (w < 1.0)
|
|
||||||
// {
|
|
||||||
// fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (j >= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j + fny[1];
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
real_t w = wi * wj * wk;
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
real_t w = wi * wj * wk;
|
|
||||||
if (k < fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
if (k > fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
|
|
||||||
// if (w < 1.0)
|
|
||||||
// {
|
|
||||||
// fbuf_->kelem(i, j, k) = std::real(fbuf_->kelem(i, j, k));
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (iglobal >= fny[0])
|
|
||||||
{
|
|
||||||
real_t wi = (iglobal == fny[0]) ? 0.0 : 1.0;
|
|
||||||
|
|
||||||
recvfrom = get_task(iglobal + fny[0], offsetsp_, sizesp_, CONFIG::MPI_task_size);
|
|
||||||
MPI_Recv(&recvbuf_[0], (int)slicesz, datatype, recvfrom,
|
|
||||||
(int)(iglobal + fny[0]), MPI_COMM_WORLD, &status);
|
|
||||||
|
|
||||||
for (size_t j = 0; j < nf[1]; ++j)
|
|
||||||
{
|
|
||||||
real_t wj = (j == fny[1]) ? 0.0 : 1.0;
|
|
||||||
if (j <= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j;
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
const real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
const real_t w = wi * wj * wk;
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
real_t w = wi * wj * wk;
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (k < fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
if (k > fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (j >= fny[1])
|
|
||||||
{
|
|
||||||
size_t jp = j + fny[1];
|
|
||||||
for (size_t k = 0; k < nf[2]; ++k)
|
|
||||||
{
|
|
||||||
const real_t wk = (k == fny[2]) ? 0.0 : 1.0;
|
|
||||||
const real_t w = wi * wj * wk;
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
|
||||||
{
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (k < fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k] / rfac;
|
|
||||||
if (k > fny[2])
|
|
||||||
fbuf_->kelem(i, j, k) += w * crecvbuf_[jp * nfp[3] + k + fny[2]] / rfac;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//... copy data back
|
//... copy data back
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t i = 0; i < fbuf_->ntot_; ++i)
|
for (size_t i = 0; i < fbuf_->ntot_; ++i)
|
||||||
|
@ -837,21 +510,18 @@ private:
|
||||||
output_op(i, (*fbuf_)[i]);
|
output_op(i, (*fbuf_)[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < req.size(); ++i)
|
#else /// then USE_MPI is defined //////////////////////////////////////////////////////////////
|
||||||
{
|
|
||||||
// need to preset status as wait does not necessarily modify it to reflect
|
|
||||||
// success c.f.
|
|
||||||
// http://www.open-mpi.org/community/lists/devel/2007/04/1402.php
|
|
||||||
status.MPI_ERROR = MPI_SUCCESS;
|
|
||||||
|
|
||||||
MPI_Wait(&req[i], &status);
|
fp.FourierInterpolateCopyTo( *fbuf_ );
|
||||||
assert(status.MPI_ERROR == MPI_SUCCESS);
|
|
||||||
|
//... copy data back
|
||||||
|
#pragma omp parallel for
|
||||||
|
for (size_t i = 0; i < fbuf_->ntot_; ++i)
|
||||||
|
{
|
||||||
|
|
||||||
|
output_op(i, (*fbuf_)[i] / rfac);
|
||||||
}
|
}
|
||||||
|
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
#endif //defined(USE_MPI)
|
||||||
|
|
||||||
music::dlog.Print("[MPI] Completed gather for convolution, took %fs", get_wtime() - tstart);
|
|
||||||
|
|
||||||
#endif /// end of ifdef/ifndef USE_MPI //////////////////////////////////////////////////////////////
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
|
@ -134,7 +134,7 @@ public:
|
||||||
//! set all field elements to zero
|
//! set all field elements to zero
|
||||||
void zero() noexcept
|
void zero() noexcept
|
||||||
{
|
{
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t i = 0; i < ntot_; ++i)
|
for (size_t i = 0; i < ntot_; ++i)
|
||||||
data_[i] = 0.0;
|
data_[i] = 0.0;
|
||||||
}
|
}
|
||||||
|
@ -155,8 +155,8 @@ public:
|
||||||
assert(this->n_[1] == g.n_[1]);
|
assert(this->n_[1] == g.n_[1]);
|
||||||
assert(this->n_[2] == g.n_[2]);
|
assert(this->n_[2] == g.n_[2]);
|
||||||
|
|
||||||
// now we can copy all the data over
|
// now we can copy all the data over
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t i = 0; i < ntot_; ++i)
|
for (size_t i = 0; i < ntot_; ++i)
|
||||||
data_[i] = g.data_[i];
|
data_[i] = g.data_[i];
|
||||||
}
|
}
|
||||||
|
@ -357,6 +357,7 @@ public:
|
||||||
return val;
|
return val;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline ccomplex_t gradient( const int idim, std::array<size_t,3> ijk ) const
|
inline ccomplex_t gradient( const int idim, std::array<size_t,3> ijk ) const
|
||||||
{
|
{
|
||||||
if( bdistributed ){
|
if( bdistributed ){
|
||||||
|
@ -791,13 +792,17 @@ public:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! perform a backwards Fourier transform
|
||||||
void FourierTransformBackward(bool do_transform = true);
|
void FourierTransformBackward(bool do_transform = true);
|
||||||
|
|
||||||
|
//! perform a forwards Fourier transform
|
||||||
void FourierTransformForward(bool do_transform = true);
|
void FourierTransformForward(bool do_transform = true);
|
||||||
|
|
||||||
void ApplyNorm(void);
|
//! perform a copy operation between to FFT grids that might not be of the same size
|
||||||
|
void FourierInterpolateCopyTo( grid_fft_t &grid_to );
|
||||||
|
|
||||||
void FillRandomReal(unsigned long int seed = 123456ul);
|
//! normalise field
|
||||||
|
void ApplyNorm(void);
|
||||||
|
|
||||||
void Write_to_HDF5(std::string fname, std::string datasetname) const;
|
void Write_to_HDF5(std::string fname, std::string datasetname) const;
|
||||||
|
|
||||||
|
|
263
src/grid_fft.cc
263
src/grid_fft.cc
|
@ -99,9 +99,9 @@ void Grid_FFT<data_t, bdistributed>::allocate(void)
|
||||||
|
|
||||||
if (typeid(data_t) == typeid(real_t))
|
if (typeid(data_t) == typeid(real_t))
|
||||||
{
|
{
|
||||||
cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2] / 2 + 1, MPI_COMM_WORLD,
|
cmplxsz = FFTW_API(mpi_local_size_3d_transposed)(n_[0], n_[1], n_[2], MPI_COMM_WORLD,
|
||||||
&local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
|
&local_0_size_, &local_0_start_, &local_1_size_, &local_1_start_);
|
||||||
ntot_ = 2 * cmplxsz;
|
ntot_ = local_0_size_ * n_[1] * (n_[2]+2);
|
||||||
data_ = (data_t *)FFTW_API(malloc)(ntot_ * sizeof(real_t));
|
data_ = (data_t *)FFTW_API(malloc)(ntot_ * sizeof(real_t));
|
||||||
cdata_ = reinterpret_cast<ccomplex_t *>(data_);
|
cdata_ = reinterpret_cast<ccomplex_t *>(data_);
|
||||||
plan_ = FFTW_API(mpi_plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_,
|
plan_ = FFTW_API(mpi_plan_dft_r2c_3d)(n_[0], n_[1], n_[2], (real_t *)data_, (complex_t *)data_,
|
||||||
|
@ -185,6 +185,7 @@ void Grid_FFT<data_t, bdistributed>::ApplyNorm(void)
|
||||||
data_[i] *= fft_norm_fac_;
|
data_[i] *= fft_norm_fac_;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Perform a backwards Fourier transformation
|
||||||
template <typename data_t, bool bdistributed>
|
template <typename data_t, bool bdistributed>
|
||||||
void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
|
void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
|
||||||
{
|
{
|
||||||
|
@ -217,6 +218,7 @@ void Grid_FFT<data_t, bdistributed>::FourierTransformForward(bool do_transform)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Perform a forward Fourier transformation
|
||||||
template <typename data_t, bool bdistributed>
|
template <typename data_t, bool bdistributed>
|
||||||
void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
|
void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
|
||||||
{
|
{
|
||||||
|
@ -248,6 +250,229 @@ void Grid_FFT<data_t, bdistributed>::FourierTransformBackward(bool do_transform)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//! Perform a copy to another field, not necessarily the same size, using Fourier interpolation
|
||||||
|
template <typename data_t, bool bdistributed>
|
||||||
|
void Grid_FFT<data_t, bdistributed>::FourierInterpolateCopyTo( grid_fft_t &grid_to )
|
||||||
|
{
|
||||||
|
grid_fft_t &grid_from = *this;
|
||||||
|
|
||||||
|
//... transform both grids to Fourier space
|
||||||
|
grid_from.FourierTransformForward(true);
|
||||||
|
grid_to.FourierTransformForward(false);
|
||||||
|
|
||||||
|
// if grids are same size, we directly copy without the need for interpolation
|
||||||
|
if( grid_from.n_[0] == grid_to.n_[0]
|
||||||
|
&& grid_from.n_[1] == grid_to.n_[1]
|
||||||
|
&& grid_from.n_[2] == grid_to.n_[2] )
|
||||||
|
{
|
||||||
|
grid_to.copy_from( grid_from );
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// set to zero so that regions without data are zeroed
|
||||||
|
grid_to.zero();
|
||||||
|
|
||||||
|
// if not running MPI, then can do without sending and receiving
|
||||||
|
#if !defined(USE_MPI)
|
||||||
|
|
||||||
|
// determine effective Nyquist modes representable by both fields and their locations in array
|
||||||
|
size_t fny0_left = std::min(grid_from.n_[0] / 2, grid_to.n_[0] / 2);
|
||||||
|
size_t fny0_right = std::max(grid_from.n_[0] - grid_to.n_[0] / 2, grid_from.n_[0] / 2);
|
||||||
|
size_t fny1_left = std::min(grid_from.n_[1] / 2, grid_to.n_[1] / 2);
|
||||||
|
size_t fny1_right = std::max(grid_from.n_[1] - grid_to.n_[1] / 2, grid_from.n_[1] / 2);
|
||||||
|
size_t fny2_left = std::min(grid_from.n_[2] / 2, grid_to.n_[2] / 2);
|
||||||
|
// size_t fny2_right = std::max(grid_from.n_[2] - grid_to.n_[2] / 2, grid_from.n_[2] / 2);
|
||||||
|
|
||||||
|
const size_t fny0_left_recv = fny0_left;
|
||||||
|
const size_t fny0_right_recv = (fny0_right + grid_to.n_[0]) - grid_from.n_[0];
|
||||||
|
const size_t fny1_left_recv = fny1_left;
|
||||||
|
const size_t fny1_right_recv = (fny1_right + grid_to.n_[1]) - grid_from.n_[1];
|
||||||
|
const size_t fny2_left_recv = fny2_left;
|
||||||
|
// const size_t fny2_right_recv = (fny2_right + grid_to.n_[2]) - grid_from.n_[2];
|
||||||
|
|
||||||
|
#pragma omp parallel for
|
||||||
|
for( size_t i=0; i<grid_to.size(0); ++i )
|
||||||
|
{
|
||||||
|
if (i < fny0_left_recv || i > fny0_right_recv)
|
||||||
|
{
|
||||||
|
size_t isend = (i < fny0_left_recv)? i : (i + grid_from.n_[0]) - grid_to.n_[0];
|
||||||
|
|
||||||
|
// copy data slice into new grid, avoiding modes that do not exist in both
|
||||||
|
for( size_t j=0; j<grid_to.size(1); ++j )
|
||||||
|
{
|
||||||
|
if( j < fny1_left_recv || j > fny1_right_recv )
|
||||||
|
{
|
||||||
|
const size_t jsend = (j < fny1_left_recv)? j : (j + grid_from.n_[1]) - grid_to.n_[1];
|
||||||
|
|
||||||
|
for( size_t k=0; k<fny2_left_recv; ++k )
|
||||||
|
{
|
||||||
|
grid_to.kelem(i,j,k) = grid_from.kelem(isend,jsend,k);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// if they are not the same size, we use Fourier interpolation to upscale/downscale
|
||||||
|
double tstart = get_wtime();
|
||||||
|
music::dlog << "[MPI] Started scatter for Fourier interpolation/copy" << std::endl;
|
||||||
|
|
||||||
|
//... determine communication offsets
|
||||||
|
std::vector<ptrdiff_t> offsets_send, offsets_recv, sizes_send, sizes_recv;
|
||||||
|
|
||||||
|
// this should use bisection, not linear search
|
||||||
|
auto get_task = [](ptrdiff_t index, const std::vector<ptrdiff_t> &offsets, const int ntasks) -> int
|
||||||
|
{
|
||||||
|
int itask = 0;
|
||||||
|
while (itask < ntasks - 1 && offsets[itask + 1] <= index)
|
||||||
|
++itask;
|
||||||
|
return itask;
|
||||||
|
};
|
||||||
|
|
||||||
|
int ntasks(MPI::get_size());
|
||||||
|
|
||||||
|
offsets_send.assign(ntasks+1, 0);
|
||||||
|
sizes_send.assign(ntasks, 0);
|
||||||
|
offsets_recv.assign(ntasks+1, 0);
|
||||||
|
sizes_recv.assign(ntasks, 0);
|
||||||
|
|
||||||
|
MPI_Allgather(&grid_from.local_1_size_, 1, MPI_LONG_LONG, &sizes_send[0], 1,
|
||||||
|
MPI_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
MPI_Allgather(&grid_to.local_1_size_, 1, MPI_LONG_LONG, &sizes_recv[0], 1,
|
||||||
|
MPI_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
MPI_Allgather(&grid_from.local_1_start_, 1, MPI_LONG_LONG, &offsets_send[0], 1,
|
||||||
|
MPI_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
MPI_Allgather(&grid_to.local_1_start_, 1, MPI_LONG_LONG, &offsets_recv[0], 1,
|
||||||
|
MPI_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
for( int i=0; i< CONFIG::MPI_task_size; i++ ){
|
||||||
|
if( offsets_send[i+1] < offsets_send[i] + sizes_send[i] ) offsets_send[i+1] = offsets_send[i] + sizes_send[i];
|
||||||
|
if( offsets_recv[i+1] < offsets_recv[i] + sizes_recv[i] ) offsets_recv[i+1] = offsets_recv[i] + sizes_recv[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
const MPI_Datatype datatype =
|
||||||
|
(typeid(data_t) == typeid(float)) ? MPI_C_FLOAT_COMPLEX
|
||||||
|
: (typeid(data_t) == typeid(double)) ? MPI_C_DOUBLE_COMPLEX
|
||||||
|
: (typeid(data_t) == typeid(long double)) ? MPI_C_LONG_DOUBLE_COMPLEX
|
||||||
|
: MPI_BYTE;
|
||||||
|
|
||||||
|
const size_t slicesz = grid_from.size(1) * grid_from.size(3);
|
||||||
|
|
||||||
|
// determine effective Nyquist modes representable by both fields and their locations in array
|
||||||
|
const size_t fny0_left = std::min(grid_from.n_[1] / 2, grid_to.n_[1] / 2);
|
||||||
|
const size_t fny0_right = std::max(grid_from.n_[1] - grid_to.n_[1] / 2, grid_from.n_[1] / 2);
|
||||||
|
const size_t fny1_left = std::min(grid_from.n_[0] / 2, grid_to.n_[0] / 2);
|
||||||
|
const size_t fny1_right = std::max(grid_from.n_[0] - grid_to.n_[0] / 2, grid_from.n_[0] / 2);
|
||||||
|
const size_t fny2_left = std::min(grid_from.n_[2] / 2, grid_to.n_[2] / 2);
|
||||||
|
// size_t fny2_right = std::max(grid_from.n_[2] - grid_to.n_[2] / 2, grid_from.n_[2] / 2);
|
||||||
|
|
||||||
|
const size_t fny0_left_recv = fny0_left;
|
||||||
|
const size_t fny0_right_recv = (fny0_right + grid_to.n_[1]) - grid_from.n_[1];
|
||||||
|
const size_t fny1_left_recv = fny1_left;
|
||||||
|
const size_t fny1_right_recv = (fny1_right + grid_to.n_[0]) - grid_from.n_[0];
|
||||||
|
const size_t fny2_left_recv = fny2_left;
|
||||||
|
// const size_t fny2_right_recv = (fny2_right + grid_to.n_[2]) - grid_from.n_[2];
|
||||||
|
|
||||||
|
//--- send data from buffer ---------------------------------------------------
|
||||||
|
|
||||||
|
std::vector<MPI_Request> req;
|
||||||
|
MPI_Request temp_req;
|
||||||
|
|
||||||
|
for (size_t i = 0; i < grid_from.size(0); ++i)
|
||||||
|
{
|
||||||
|
size_t iglobal_send = i + offsets_send[CONFIG::MPI_task_rank];
|
||||||
|
|
||||||
|
if (iglobal_send < fny0_left)
|
||||||
|
{
|
||||||
|
size_t iglobal_recv = iglobal_send;
|
||||||
|
int sendto = get_task(iglobal_recv, offsets_recv, CONFIG::MPI_task_size);
|
||||||
|
MPI_Isend(&grid_from.kelem(i * slicesz), (int)slicesz, datatype, sendto,
|
||||||
|
(int)iglobal_recv, MPI_COMM_WORLD, &temp_req);
|
||||||
|
req.push_back(temp_req);
|
||||||
|
// std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal_send << " to task " << sendto << ", size = " << slicesz << std::endl;
|
||||||
|
}
|
||||||
|
if (iglobal_send > fny0_right)
|
||||||
|
{
|
||||||
|
size_t iglobal_recv = (iglobal_send + grid_to.n_[1]) - grid_from.n_[1];
|
||||||
|
int sendto = get_task(iglobal_recv, offsets_recv, CONFIG::MPI_task_size);
|
||||||
|
MPI_Isend(&grid_from.kelem(i * slicesz), (int)slicesz, datatype, sendto,
|
||||||
|
(int)iglobal_recv, MPI_COMM_WORLD, &temp_req);
|
||||||
|
req.push_back(temp_req);
|
||||||
|
// std::cout << "task " << CONFIG::MPI_task_rank << " : added request No" << req.size()-1 << ": Isend #" << iglobal_send << " to task " << sendto << ", size = " << slicesz<< std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//--- receive data ------------------------------------------------------------
|
||||||
|
#pragma omp parallel if(CONFIG::MPI_threads_ok)
|
||||||
|
{
|
||||||
|
MPI_Status status;
|
||||||
|
ccomplex_t * recvbuf = new ccomplex_t[ slicesz ];
|
||||||
|
|
||||||
|
#pragma omp for schedule(dynamic)
|
||||||
|
for( size_t i=0; i<grid_to.size(0); ++i )
|
||||||
|
{
|
||||||
|
size_t iglobal_recv = i + offsets_recv[CONFIG::MPI_task_rank];
|
||||||
|
|
||||||
|
if (iglobal_recv < fny0_left_recv || iglobal_recv > fny0_right_recv)
|
||||||
|
{
|
||||||
|
size_t iglobal_send = (iglobal_recv < fny0_left_recv)? iglobal_recv : (iglobal_recv + grid_from.n_[1]) - grid_to.n_[1];
|
||||||
|
|
||||||
|
int recvfrom = get_task(iglobal_send, offsets_send, CONFIG::MPI_task_size);
|
||||||
|
|
||||||
|
//#pragma omp critical // need critical region here if we do "MPI_THREAD_FUNNELED",
|
||||||
|
{
|
||||||
|
// receive data slice and check for MPI errors when in debug mode
|
||||||
|
status.MPI_ERROR = MPI_SUCCESS;
|
||||||
|
MPI_Recv(&recvbuf[0], (int)slicesz, datatype, recvfrom, (int)iglobal_recv, MPI_COMM_WORLD, &status);
|
||||||
|
assert(status.MPI_ERROR == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy data slice into new grid, avoiding modes that do not exist in both
|
||||||
|
for( size_t j=0; j<grid_to.size(1); ++j )
|
||||||
|
{
|
||||||
|
if( j < fny1_left_recv || j > fny1_right_recv )
|
||||||
|
{
|
||||||
|
const size_t jsend = (j < fny1_left_recv)? j : (j + grid_from.n_[0]) - grid_to.n_[0];
|
||||||
|
|
||||||
|
for( size_t k=0; k<fny2_left_recv; ++k )
|
||||||
|
{
|
||||||
|
grid_to.kelem(i,j,k) = recvbuf[jsend * grid_from.sizes_[3] + k];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
delete[] recvbuf;
|
||||||
|
}
|
||||||
|
MPI_Barrier( MPI_COMM_WORLD );
|
||||||
|
|
||||||
|
MPI_Status status;
|
||||||
|
for (size_t i = 0; i < req.size(); ++i)
|
||||||
|
{
|
||||||
|
// need to set status as wait does not necessarily modify it
|
||||||
|
// c.f. http://www.open-mpi.org/community/lists/devel/2007/04/1402.php
|
||||||
|
status.MPI_ERROR = MPI_SUCCESS;
|
||||||
|
// std::cout << "task " << CONFIG::MPI_task_rank << " : checking request No" << i << std::endl;
|
||||||
|
int flag(1);
|
||||||
|
MPI_Test(&req[i], &flag, &status);
|
||||||
|
if( !flag ){
|
||||||
|
std::cout << "task " << CONFIG::MPI_task_rank << " : request No" << i << " unsuccessful" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Wait(&req[i], &status);
|
||||||
|
// std::cout << "---> ok!" << std::endl;
|
||||||
|
assert(status.MPI_ERROR == MPI_SUCCESS);
|
||||||
|
}
|
||||||
|
|
||||||
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
music::dlog.Print("[MPI] Completed scatter for Fourier interpolation/copy, took %fs\n",
|
||||||
|
get_wtime() - tstart);
|
||||||
|
#endif //defined(USE_MPI)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#define H5_USE_16_API
|
#define H5_USE_16_API
|
||||||
#include <hdf5.h>
|
#include <hdf5.h>
|
||||||
|
|
||||||
|
@ -586,7 +811,29 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(USE_MPI)
|
||||||
|
std::vector<size_t> sizes0(MPI::get_size(), 0);
|
||||||
|
std::vector<size_t> offsets0(MPI::get_size()+1, 0);
|
||||||
|
|
||||||
|
MPI_Allgather((this->space_==kspace_id)? &this->local_1_start_ : &this->local_0_start_, 1,
|
||||||
|
MPI_UNSIGNED_LONG_LONG, &offsets0[0], 1, MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
MPI_Allgather((this->space_==kspace_id)? &this->local_1_size_ : &this->local_0_size_, 1,
|
||||||
|
MPI_UNSIGNED_LONG_LONG, &sizes0[0], 1, MPI_UNSIGNED_LONG_LONG, MPI_COMM_WORLD);
|
||||||
|
|
||||||
|
for( int i=0; i< CONFIG::MPI_task_size; i++ ){
|
||||||
|
if( offsets0[i+1] < offsets0[i] + sizes0[i] ) offsets0[i+1] = offsets0[i] + sizes0[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(USE_MPI)
|
||||||
|
auto loc_count = size(0), glob_count = size(0);
|
||||||
|
MPI_Allreduce( &loc_count, &glob_count, 1, MPI_UNSIGNED_LONG_LONG, MPI_SUM, MPI_COMM_WORLD );
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(USE_MPI) && !defined(USE_MPI_IO)
|
#if defined(USE_MPI) && !defined(USE_MPI_IO)
|
||||||
|
|
||||||
for (int itask = 0; itask < mpi_size; ++itask)
|
for (int itask = 0; itask < mpi_size; ++itask)
|
||||||
{
|
{
|
||||||
MPI_Barrier(MPI_COMM_WORLD);
|
MPI_Barrier(MPI_COMM_WORLD);
|
||||||
|
@ -603,7 +850,7 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
count[i] = size(i);
|
count[i] = size(i);
|
||||||
|
|
||||||
#if defined(USE_MPI)
|
#if defined(USE_MPI)
|
||||||
count[0] *= mpi_size;
|
count[0] = glob_count;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
if (typeid(data_t) == typeid(float))
|
if (typeid(data_t) == typeid(float))
|
||||||
|
@ -654,7 +901,7 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
plist_id = H5Pcreate(H5P_DATASET_XFER);
|
plist_id = H5Pcreate(H5P_DATASET_XFER);
|
||||||
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
|
H5Pset_dxpl_mpio(plist_id, H5FD_MPIO_COLLECTIVE);
|
||||||
#else
|
#else
|
||||||
plist_id = H5P_DEFAULT;
|
plist_id = H5P_DEFAULT;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
memspace = H5Screate_simple(3, count, NULL);
|
memspace = H5Screate_simple(3, count, NULL);
|
||||||
|
@ -663,9 +910,9 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
for (size_t i = 0; i < size(0); ++i)
|
for (size_t i = 0; i < size(0); ++i)
|
||||||
{
|
{
|
||||||
#if defined(USE_MPI)
|
#if defined(USE_MPI)
|
||||||
offset[0] = mpi_rank * size(0) + i;
|
offset[0] = offsets0[mpi_rank] + i;
|
||||||
#else
|
#else
|
||||||
offset[0] = i;
|
offset[0] = i;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
for (size_t j = 0; j < size(1); ++j)
|
for (size_t j = 0; j < size(1); ++j)
|
||||||
|
@ -703,7 +950,7 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
for (int i = 0; i < 3; ++i)
|
for (int i = 0; i < 3; ++i)
|
||||||
count[i] = size(i);
|
count[i] = size(i);
|
||||||
#if defined(USE_MPI)
|
#if defined(USE_MPI)
|
||||||
count[0] *= mpi_size;
|
count[0] = glob_count;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(USE_MPI) && !defined(USE_MPI_IO)
|
#if defined(USE_MPI) && !defined(USE_MPI_IO)
|
||||||
|
@ -738,7 +985,7 @@ void Grid_FFT<data_t, bdistributed>::Write_to_HDF5(std::string fname, std::strin
|
||||||
for (size_t i = 0; i < size(0); ++i)
|
for (size_t i = 0; i < size(0); ++i)
|
||||||
{
|
{
|
||||||
#if defined(USE_MPI)
|
#if defined(USE_MPI)
|
||||||
offset[0] = mpi_rank * size(0) + i;
|
offset[0] = offsets0[mpi_rank] + i;//mpi_rank * size(0) + i;
|
||||||
#else
|
#else
|
||||||
offset[0] = i;
|
offset[0] = i;
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -187,6 +187,19 @@ int run( config_file& the_config )
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
// Create arrays
|
// Create arrays
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
|
|
||||||
|
// white noise field
|
||||||
|
Grid_FFT<real_t> wnoise({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
||||||
|
|
||||||
|
//... Fill the wnoise grid with a Gaussian white noise field, we do this first since the RNG might need extra memory
|
||||||
|
music::ilog << "-------------------------------------------------------------------------------" << std::endl;
|
||||||
|
music::ilog << "Generating white noise field...." << std::endl;
|
||||||
|
|
||||||
|
the_random_number_generator->Fill_Grid(wnoise);
|
||||||
|
|
||||||
|
wnoise.FourierTransformForward();
|
||||||
|
|
||||||
|
//... Next, declare LPT related arrays, allocated only as needed by order
|
||||||
Grid_FFT<real_t> phi({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
Grid_FFT<real_t> phi({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
||||||
Grid_FFT<real_t> phi2({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}, false); // do not allocate these unless needed
|
Grid_FFT<real_t> phi2({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}, false); // do not allocate these unless needed
|
||||||
Grid_FFT<real_t> phi3({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}, false); // ..
|
Grid_FFT<real_t> phi3({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen}, false); // ..
|
||||||
|
@ -197,24 +210,12 @@ int run( config_file& the_config )
|
||||||
//... array [.] access to components of A3:
|
//... array [.] access to components of A3:
|
||||||
std::array<Grid_FFT<real_t> *, 3> A3({&A3x, &A3y, &A3z});
|
std::array<Grid_FFT<real_t> *, 3> A3({&A3x, &A3y, &A3z});
|
||||||
|
|
||||||
// white noise field
|
|
||||||
Grid_FFT<real_t> wnoise({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
|
||||||
|
|
||||||
// temporary storage of additional data
|
// temporary storage of additional data
|
||||||
Grid_FFT<real_t> tmp({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
Grid_FFT<real_t> tmp({ngrid, ngrid, ngrid}, {boxlen, boxlen, boxlen});
|
||||||
|
|
||||||
//--------------------------------------------------------------------
|
|
||||||
// Fill the grid with a Gaussian white noise field
|
|
||||||
//--------------------------------------------------------------------
|
|
||||||
music::ilog << "-------------------------------------------------------------------------------" << std::endl;
|
|
||||||
music::ilog << "Generating white noise field...." << std::endl;
|
|
||||||
|
|
||||||
the_random_number_generator->Fill_Grid(wnoise);
|
|
||||||
|
|
||||||
wnoise.FourierTransformForward();
|
|
||||||
|
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
// Use externally specified large scale modes from constraints in case
|
// Use externally specified large scale modes from constraints in case
|
||||||
|
// TODO: move to separate routine
|
||||||
//--------------------------------------------------------------------
|
//--------------------------------------------------------------------
|
||||||
if( bAddConstrainedModes ){
|
if( bAddConstrainedModes ){
|
||||||
Grid_FFT<real_t,false> cwnoise({8,8,8}, {boxlen,boxlen,boxlen});
|
Grid_FFT<real_t,false> cwnoise({8,8,8}, {boxlen,boxlen,boxlen});
|
||||||
|
|
|
@ -74,8 +74,9 @@ int main( int argc, char** argv )
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
|
|
||||||
#if defined(USE_MPI)
|
#if defined(USE_MPI)
|
||||||
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &CONFIG::MPI_thread_support);
|
int thread_wanted = MPI_THREAD_MULTIPLE; // MPI_THREAD_FUNNELED
|
||||||
CONFIG::MPI_threads_ok = CONFIG::MPI_thread_support >= MPI_THREAD_FUNNELED;
|
MPI_Init_thread(&argc, &argv, thread_wanted, &CONFIG::MPI_thread_support);
|
||||||
|
CONFIG::MPI_threads_ok = CONFIG::MPI_thread_support >= thread_wanted;
|
||||||
MPI_Comm_rank(MPI_COMM_WORLD, &CONFIG::MPI_task_rank);
|
MPI_Comm_rank(MPI_COMM_WORLD, &CONFIG::MPI_task_rank);
|
||||||
MPI_Comm_size(MPI_COMM_WORLD, &CONFIG::MPI_task_size);
|
MPI_Comm_size(MPI_COMM_WORLD, &CONFIG::MPI_task_size);
|
||||||
CONFIG::MPI_ok = true;
|
CONFIG::MPI_ok = true;
|
||||||
|
@ -168,7 +169,7 @@ int main( int argc, char** argv )
|
||||||
omp_set_num_threads(CONFIG::num_threads);
|
omp_set_num_threads(CONFIG::num_threads);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// std::feclearexcept(FE_ALL_EXCEPT);
|
std::feclearexcept(FE_ALL_EXCEPT);
|
||||||
|
|
||||||
//------------------------------------------------------------------------------
|
//------------------------------------------------------------------------------
|
||||||
// Write code configuration to screen
|
// Write code configuration to screen
|
||||||
|
|
|
@ -37,7 +37,7 @@ public:
|
||||||
real_t boxsize = cf_.get_value<double>("setup", "BoxLength");
|
real_t boxsize = cf_.get_value<double>("setup", "BoxLength");
|
||||||
real_t omegab = pcc->cosmo_param_["Omega_b"];
|
real_t omegab = pcc->cosmo_param_["Omega_b"];
|
||||||
real_t omegam = pcc->cosmo_param_["Omega_m"];
|
real_t omegam = pcc->cosmo_param_["Omega_m"];
|
||||||
real_t omegal = pcc->cosmo_param_["Omega_L"];
|
real_t omegal = pcc->cosmo_param_["Omega_DE"];
|
||||||
|
|
||||||
|
|
||||||
out_eulerian_ = cf_.get_value_safe<bool>("output", "generic_out_eulerian",false);
|
out_eulerian_ = cf_.get_value_safe<bool>("output", "generic_out_eulerian",false);
|
||||||
|
|
|
@ -148,13 +148,12 @@ private:
|
||||||
protected:
|
protected:
|
||||||
std::string descriptor_string_;
|
std::string descriptor_string_;
|
||||||
int num_threads_;
|
int num_threads_;
|
||||||
int levelmin_, levelmin_final_, levelmax_, ngrid_;
|
int levelmin_, levelmin_final_, levelmax_, ngrid_, ngrid_panphasia_;
|
||||||
bool incongruent_fields_;
|
bool incongruent_fields_;
|
||||||
double inter_grid_phase_adjustment_;
|
double inter_grid_phase_adjustment_;
|
||||||
// double translation_phase_;
|
// double translation_phase_;
|
||||||
pan_state_ *lstate;
|
pan_state_ *lstate;
|
||||||
int grid_p_, grid_m_;
|
int grid_p_;
|
||||||
double grid_rescale_fac_;
|
|
||||||
int coordinate_system_shift_[3];
|
int coordinate_system_shift_[3];
|
||||||
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
|
int ix_abs_[3], ix_per_[3], ix_rel_[3], level_p_, lextra_;
|
||||||
|
|
||||||
|
@ -170,32 +169,29 @@ protected:
|
||||||
|
|
||||||
void initialize_for_grid_structure(void)
|
void initialize_for_grid_structure(void)
|
||||||
{
|
{
|
||||||
clear_panphasia_thread_states();
|
|
||||||
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_);
|
|
||||||
|
|
||||||
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
|
// if ngrid is not a multiple of i_base, then we need to enlarge and then sample down
|
||||||
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
|
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
|
||||||
|
|
||||||
grid_p_ = pdescriptor_->i_base;
|
grid_p_ = pdescriptor_->i_base;
|
||||||
grid_m_ = largest_power_two_lte(grid_p_);
|
|
||||||
|
|
||||||
lextra_ = (log10((double)ngrid_ / (double)pdescriptor_->i_base) + 0.001) / log10(2.0);
|
lextra_ = (log10((double)ngrid_ / (double)grid_p_) + 0.001) / log10(2.0);
|
||||||
int ratio = 1 << lextra_;
|
|
||||||
grid_rescale_fac_ = 1.0;
|
ngrid_panphasia_ = (1 << lextra_) * grid_p_;
|
||||||
|
|
||||||
|
if( ngrid_panphasia_ < ngrid_ ){
|
||||||
|
lextra_++;
|
||||||
|
ngrid_panphasia_*=2;
|
||||||
|
}
|
||||||
|
assert( ngrid_panphasia_ >= ngrid_ );
|
||||||
|
|
||||||
|
clear_panphasia_thread_states();
|
||||||
|
|
||||||
|
music::ilog.Print("PANPHASIA: using grid size %lld (level=%d)",ngrid_panphasia_, lextra_);
|
||||||
|
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_, ngrid_panphasia_ );
|
||||||
|
|
||||||
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
|
coordinate_system_shift_[0] = -pcf_->get_value_safe<int>("setup", "shift_x", 0);
|
||||||
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
|
coordinate_system_shift_[1] = -pcf_->get_value_safe<int>("setup", "shift_y", 0);
|
||||||
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
|
coordinate_system_shift_[2] = -pcf_->get_value_safe<int>("setup", "shift_z", 0);
|
||||||
|
|
||||||
incongruent_fields_ = false;
|
|
||||||
if (ngrid_ != ratio * pdescriptor_->i_base)
|
|
||||||
{
|
|
||||||
incongruent_fields_ = true;
|
|
||||||
ngrid_ = 2 * ratio * pdescriptor_->i_base;
|
|
||||||
grid_rescale_fac_ = (double)ngrid_ / (1 << levelmin_);
|
|
||||||
music::ilog << "PANPHASIA: will use a higher resolution (using Fourier interpolation)" << std::endl;
|
|
||||||
music::ilog << " (" << grid_m_ << " -> " << grid_p_ << ") * 2**ref to be compatible with PANPHASIA" << std::endl;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::unique_ptr<panphasia_descriptor> pdescriptor_;
|
std::unique_ptr<panphasia_descriptor> pdescriptor_;
|
||||||
|
@ -243,28 +239,17 @@ public:
|
||||||
auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
|
auto dsinc = [](real_t x) { return (std::fabs(x) > 1e-16) ? (x * std::cos(x) - std::sin(x)) / (x * x) : 0.0; };
|
||||||
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
|
const real_t sqrt3{std::sqrt(3.0)}, sqrt27{std::sqrt(27.0)};
|
||||||
|
|
||||||
// make sure we're in the right space
|
// we will overwrite 'g', we can deallocate it while we prepare the panphasia field
|
||||||
Grid_FFT<real_t> &g0 = g;
|
g.reset();
|
||||||
g0.FourierTransformBackward(false);
|
|
||||||
|
|
||||||
// temporaries
|
|
||||||
Grid_FFT<real_t> g1(g.n_, g.length_);
|
|
||||||
Grid_FFT<real_t> g2(g.n_, g.length_);
|
|
||||||
Grid_FFT<real_t> g3(g.n_, g.length_);
|
|
||||||
Grid_FFT<real_t> g4(g.n_, g.length_);
|
|
||||||
|
|
||||||
clear_panphasia_thread_states();
|
clear_panphasia_thread_states();
|
||||||
music::ilog.Print("PANPHASIA: running with %d threads", num_threads_);
|
|
||||||
|
|
||||||
ngrid_ = pcf_->get_value<size_t>("setup", "GridRes");
|
// temporaries
|
||||||
|
Grid_FFT<real_t> g0({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
||||||
grid_p_ = pdescriptor_->i_base;
|
Grid_FFT<real_t> g1({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
||||||
// grid_m_ = largest_power_two_lte(grid_p_);
|
Grid_FFT<real_t> g2({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
||||||
if (ngrid_ % grid_p_ != 0)
|
Grid_FFT<real_t> g3({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
||||||
{
|
Grid_FFT<real_t> g4({size_t(ngrid_panphasia_),size_t(ngrid_panphasia_),size_t(ngrid_panphasia_)}, g.length_);
|
||||||
music::elog << "Grid resolution " << ngrid_ << " is not divisible by PANPHASIA descriptor length " << grid_p_ << std::endl;
|
|
||||||
throw std::runtime_error("Chosen [setup] / GridRes is not compatible with PANPHASIA descriptor length!");
|
|
||||||
}
|
|
||||||
|
|
||||||
double t1 = get_wtime();
|
double t1 = get_wtime();
|
||||||
// double tp = t1;
|
// double tp = t1;
|
||||||
|
@ -285,22 +270,17 @@ public:
|
||||||
std::memset(descriptor, 0, 100);
|
std::memset(descriptor, 0, 100);
|
||||||
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
||||||
|
|
||||||
start_panphasia_(&lstate[mythread], descriptor, &ngrid_, &verbosity);
|
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
||||||
|
|
||||||
{
|
{
|
||||||
panphasia_descriptor d(descriptor_string_);
|
panphasia_descriptor d(descriptor_string_);
|
||||||
|
|
||||||
const int lextra = (log10((double)ngrid_ / (double)d.i_base) + 0.001) / log10(2.0);
|
int level_p = d.wn_level_base + lextra_;
|
||||||
int level_p = d.wn_level_base + lextra;
|
|
||||||
const int ratio = 1 << lextra;
|
|
||||||
|
|
||||||
lstate[mythread].layer_min = 0;
|
lstate[mythread].layer_min = 0;
|
||||||
lstate[mythread].layer_max = level_p;
|
lstate[mythread].layer_max = level_p;
|
||||||
lstate[mythread].indep_field = 1;
|
lstate[mythread].indep_field = 1;
|
||||||
|
|
||||||
_unused(ratio);
|
|
||||||
assert(ngrid_ == ratio * d.i_base);
|
|
||||||
|
|
||||||
long long ix_rel[3];
|
long long ix_rel[3];
|
||||||
ix_rel[0] = 0; //ileft_corner_p[0];
|
ix_rel[0] = 0; //ileft_corner_p[0];
|
||||||
ix_rel[1] = 0; //ileft_corner_p[1];
|
ix_rel[1] = 0; //ileft_corner_p[1];
|
||||||
|
@ -317,26 +297,28 @@ public:
|
||||||
pan_state_ *ps = &lstate[mythread];
|
pan_state_ *ps = &lstate[mythread];
|
||||||
|
|
||||||
#pragma omp for //nowait
|
#pragma omp for //nowait
|
||||||
for (size_t i = 0; i < g.size(0); i += 2)
|
for (size_t i = 0; i < g0.size(0); i += 2)
|
||||||
{
|
{
|
||||||
for (size_t j = 0; j < g.size(1); j += 2)
|
const int ixmax(std::min<int>(2,g0.size(0)-i));
|
||||||
|
for (size_t j = 0; j < g0.size(1); j += 2)
|
||||||
{
|
{
|
||||||
for (size_t k = 0; k < g.size(2); k += 2)
|
const int iymax(std::min<int>(2,g0.size(1)-j));
|
||||||
|
for (size_t k = 0; k < g0.size(2); k += 2)
|
||||||
{
|
{
|
||||||
|
const int izmax(std::min<int>(2,g0.size(2)-k));
|
||||||
|
|
||||||
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
||||||
|
for (int ix = 0; ix < ixmax; ++ix)
|
||||||
for (int ix = 0; ix < 2; ++ix)
|
|
||||||
{
|
{
|
||||||
for (int iy = 0; iy < 2; ++iy)
|
for (int iy = 0; iy < iymax; ++iy)
|
||||||
{
|
{
|
||||||
for (int iz = 0; iz < 2; ++iz)
|
for (int iz = 0; iz < izmax; ++iz)
|
||||||
{
|
{
|
||||||
int ilocal = i + ix;
|
int ilocal = i + ix;
|
||||||
int jlocal = j + iy;
|
int jlocal = j + iy;
|
||||||
int klocal = k + iz;
|
int klocal = k + iz;
|
||||||
|
|
||||||
int iglobal = ilocal + g.local_0_start_;
|
int iglobal = ilocal + g0.local_0_start_;
|
||||||
int jglobal = jlocal;
|
int jglobal = jlocal;
|
||||||
int kglobal = klocal;
|
int kglobal = klocal;
|
||||||
|
|
||||||
|
@ -373,9 +355,9 @@ public:
|
||||||
{
|
{
|
||||||
auto kvec = g0.get_k<real_t>(i, j, k);
|
auto kvec = g0.get_k<real_t>(i, j, k);
|
||||||
|
|
||||||
auto argx = 0.5 * M_PI * kvec[0] / g.kny_[0];
|
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
||||||
auto argy = 0.5 * M_PI * kvec[1] / g.kny_[1];
|
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
||||||
auto argz = 0.5 * M_PI * kvec[2] / g.kny_[2];
|
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
||||||
|
|
||||||
auto fx = real_t(sinc(argx));
|
auto fx = real_t(sinc(argx));
|
||||||
auto gx = ccomplex_t(0.0, dsinc(argx));
|
auto gx = ccomplex_t(0.0, dsinc(argx));
|
||||||
|
@ -423,22 +405,17 @@ public:
|
||||||
std::memset(descriptor, 0, 100);
|
std::memset(descriptor, 0, 100);
|
||||||
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
std::memcpy(descriptor, descriptor_string_.c_str(), descriptor_string_.size());
|
||||||
|
|
||||||
start_panphasia_(&lstate[mythread], descriptor, &ngrid_, &verbosity);
|
start_panphasia_(&lstate[mythread], descriptor, &ngrid_panphasia_, &verbosity);
|
||||||
|
|
||||||
{
|
{
|
||||||
panphasia_descriptor d(descriptor_string_);
|
panphasia_descriptor d(descriptor_string_);
|
||||||
|
|
||||||
const int lextra = (log10((double)ngrid_ / (double)d.i_base) + 0.001) / log10(2.0);
|
int level_p = d.wn_level_base + lextra_;
|
||||||
int level_p = d.wn_level_base + lextra;
|
|
||||||
const int ratio = 1 << lextra;
|
|
||||||
|
|
||||||
lstate[mythread].layer_min = 0;
|
lstate[mythread].layer_min = 0;
|
||||||
lstate[mythread].layer_max = level_p;
|
lstate[mythread].layer_max = level_p;
|
||||||
lstate[mythread].indep_field = 1;
|
lstate[mythread].indep_field = 1;
|
||||||
|
|
||||||
_unused(ratio);
|
|
||||||
assert(ngrid_ == ratio * d.i_base);
|
|
||||||
|
|
||||||
long long ix_rel[3];
|
long long ix_rel[3];
|
||||||
ix_rel[0] = 0; //ileft_corner_p[0];
|
ix_rel[0] = 0; //ileft_corner_p[0];
|
||||||
ix_rel[1] = 0; //ileft_corner_p[1];
|
ix_rel[1] = 0; //ileft_corner_p[1];
|
||||||
|
@ -460,22 +437,26 @@ public:
|
||||||
#pragma omp for //nowait
|
#pragma omp for //nowait
|
||||||
for (size_t i = 0; i < g1.size(0); i += 2)
|
for (size_t i = 0; i < g1.size(0); i += 2)
|
||||||
{
|
{
|
||||||
|
const int ixmax(std::min<int>(2,g1.size(0)-i));
|
||||||
for (size_t j = 0; j < g1.size(1); j += 2)
|
for (size_t j = 0; j < g1.size(1); j += 2)
|
||||||
{
|
{
|
||||||
|
const int iymax(std::min<int>(2,g1.size(1)-j));
|
||||||
for (size_t k = 0; k < g1.size(2); k += 2)
|
for (size_t k = 0; k < g1.size(2); k += 2)
|
||||||
{
|
{
|
||||||
|
const int izmax(std::min<int>(2,g1.size(2)-k));
|
||||||
|
|
||||||
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
// ARJ - added inner set of loops to speed up evaluation of Panphasia
|
||||||
for (int ix = 0; ix < 2; ++ix)
|
for (int ix = 0; ix < ixmax; ++ix)
|
||||||
{
|
{
|
||||||
for (int iy = 0; iy < 2; ++iy)
|
for (int iy = 0; iy < iymax; ++iy)
|
||||||
{
|
{
|
||||||
for (int iz = 0; iz < 2; ++iz)
|
for (int iz = 0; iz < izmax; ++iz)
|
||||||
{
|
{
|
||||||
int ilocal = i + ix;
|
int ilocal = i + ix;
|
||||||
int jlocal = j + iy;
|
int jlocal = j + iy;
|
||||||
int klocal = k + iz;
|
int klocal = k + iz;
|
||||||
|
|
||||||
int iglobal = ilocal + g.local_0_start_;
|
int iglobal = ilocal + g1.local_0_start_;
|
||||||
int jglobal = jlocal;
|
int jglobal = jlocal;
|
||||||
int kglobal = klocal;
|
int kglobal = klocal;
|
||||||
|
|
||||||
|
@ -505,19 +486,19 @@ public:
|
||||||
g4.FourierTransformForward();
|
g4.FourierTransformForward();
|
||||||
|
|
||||||
#pragma omp parallel for
|
#pragma omp parallel for
|
||||||
for (size_t i = 0; i < g1.size(0); i++)
|
for (size_t i = 0; i < g0.size(0); i++)
|
||||||
{
|
{
|
||||||
for (size_t j = 0; j < g1.size(1); j++)
|
for (size_t j = 0; j < g0.size(1); j++)
|
||||||
{
|
{
|
||||||
for (size_t k = 0; k < g1.size(2); k++)
|
for (size_t k = 0; k < g0.size(2); k++)
|
||||||
{
|
{
|
||||||
if (!g1.is_nyquist_mode(i, j, k))
|
if (!g0.is_nyquist_mode(i, j, k))
|
||||||
{
|
{
|
||||||
auto kvec = g1.get_k<real_t>(i, j, k);
|
auto kvec = g0.get_k<real_t>(i, j, k);
|
||||||
|
|
||||||
auto argx = 0.5 * M_PI * kvec[0] / g.kny_[0];
|
auto argx = 0.5 * M_PI * kvec[0] / g0.kny_[0];
|
||||||
auto argy = 0.5 * M_PI * kvec[1] / g.kny_[1];
|
auto argy = 0.5 * M_PI * kvec[1] / g0.kny_[1];
|
||||||
auto argz = 0.5 * M_PI * kvec[2] / g.kny_[2];
|
auto argz = 0.5 * M_PI * kvec[2] / g0.kny_[2];
|
||||||
|
|
||||||
auto fx = real_t(sinc(argx));
|
auto fx = real_t(sinc(argx));
|
||||||
auto gx = ccomplex_t(0.0, dsinc(argx));
|
auto gx = ccomplex_t(0.0, dsinc(argx));
|
||||||
|
@ -529,19 +510,22 @@ public:
|
||||||
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
auto y1(g1.kelem(i, j, k)), y2(g2.kelem(i, j, k)), y3(g3.kelem(i, j, k)), y4(g4.kelem(i, j, k));
|
||||||
|
|
||||||
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
|
g0.kelem(i, j, k) += real_t(3.0) * (y1 * gx * gy * fz + y2 * fx * gy * gz + y3 * gx * fy * gz) + sqrt27 * y4 * gx * gy * gz;
|
||||||
|
|
||||||
// fix the overall minus w.r.t. the monofonic noise definition
|
|
||||||
g0.kelem(i, j, k ) *= -1.0;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// music::ilog.Print("\033[31mtiming [build panphasia field2]: %f s\033[0m", get_wtime() - tp);
|
g1.reset();
|
||||||
// tp = get_wtime();
|
g2.reset();
|
||||||
|
g3.reset();
|
||||||
|
g4.reset();
|
||||||
|
|
||||||
|
g.allocate();
|
||||||
|
g0.FourierInterpolateCopyTo( g );
|
||||||
|
|
||||||
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
|
music::ilog.Print("time for calculating PANPHASIA field : %f s, %f µs/cell", get_wtime() - t1,
|
||||||
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
|
1e6 * (get_wtime() - t1) / g.global_size(0) / g.global_size(1) / g.global_size(2));
|
||||||
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g0.mean(), g0.std());
|
music::ilog.Print("PANPHASIA k-space statistices: mean Re = %f, std = %f", g.mean(), g.std());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue