diff --git a/source/source_basis/module_pw/CMakeLists.txt b/source/source_basis/module_pw/CMakeLists.txt index 912772e0573..282558817c4 100644 --- a/source/source_basis/module_pw/CMakeLists.txt +++ b/source/source_basis/module_pw/CMakeLists.txt @@ -43,6 +43,25 @@ add_library( ${objects} ) +add_executable( + MODULE_PW_simd_bench + test_serial/pw_simd_bench.cpp +) + +target_link_libraries( + MODULE_PW_simd_bench + parameter + ${math_libs} + planewave + device + base + Threads::Threads +) + +if(USE_OPENMP) + target_link_libraries(MODULE_PW_simd_bench OpenMP::OpenMP_CXX) +endif() + if (USE_DSP) target_link_libraries(planewave PRIVATE ${MTBLAS_FFT_DIR}/libmtblas/lib/libmtfft.a) diff --git a/source/source_basis/module_pw/pw_gatherscatter.h b/source/source_basis/module_pw/pw_gatherscatter.h index 207320f4268..0c4b5fc0fb7 100644 --- a/source/source_basis/module_pw/pw_gatherscatter.h +++ b/source/source_basis/module_pw/pw_gatherscatter.h @@ -1,10 +1,49 @@ #include "pw_basis.h" #include "source_base/global_function.h" #include "source_base/timer.h" +#include #include namespace ModulePW { +namespace detail +{ +template +inline void copy_complex_buffer(const std::complex* in, std::complex* out, const int count) +{ + if (count <= 0) + { + return; + } + + std::copy_n(in, count, out); +} + +// Top-level transform copies own the OpenMP parallel region; gather/scatter +// loops call the non-parallel helper inside their existing parallel regions. 
+template +inline void copy_complex_buffer_parallel(const std::complex* in, std::complex* out, const int count) +{ + constexpr int chunk_size = 1024; + if (count <= chunk_size) + { + copy_complex_buffer(in, out, count); + return; + } + +#ifdef _OPENMP +#pragma omp parallel for schedule(static) + for (int offset = 0; offset < count; offset += chunk_size) + { + const int chunk_count = std::min(chunk_size, count - offset); + std::copy_n(in + offset, chunk_count, out + offset); + } +#else + copy_complex_buffer(in, out, count); +#endif +} +} // namespace detail + /** * @brief gather planes and scatter sticks * @param in: (nplane,fftny,fftnx) @@ -21,19 +60,18 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const const int nst_ = this->nst; const int nz_ = this->nz; const int* istot2ixy_ = this->istot2ixy; + ModuleBase::timer::start(this->classname, "gatherp_copy_serial"); #ifdef _OPENMP #pragma omp parallel for #endif for(int is = 0 ; is < nst_ ; ++is) { int ixy = istot2ixy_[is]; - std::complex *outp = &out[is*nz_]; - std::complex *inp = &in[ixy*nz_]; - for(int iz = 0 ; iz < nz_ ; ++iz) - { - outp[iz] = inp[iz]; - } + std::complex* outp = &out[is*nz_]; + const std::complex* inp = &in[ixy*nz_]; + detail::copy_complex_buffer(inp, outp, nz_); } + ModuleBase::timer::end(this->classname, "gatherp_copy_serial"); return; } @@ -44,19 +82,18 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const const int nstot_gps = this->nstot; const int nplane_gps = this->nplane; const int* istot2ixy_gps = this->istot2ixy; + ModuleBase::timer::start(this->classname, "gatherp_copy_pack"); #ifdef _OPENMP #pragma omp parallel for #endif for (int istot = 0; istot < nstot_gps; ++istot) { int ixy = istot2ixy_gps[istot]; - std::complex *outp = &out[istot * nplane_gps]; - std::complex *inp = &in[ixy * nplane_gps]; - for (int iz = 0; iz < nplane_gps; ++iz) - { - outp[iz] = inp[iz]; - } + std::complex* outp = &out[istot * nplane_gps]; + const std::complex* 
inp = &in[ixy * nplane_gps]; + detail::copy_complex_buffer(inp, outp, nplane_gps); } + ModuleBase::timer::end(this->classname, "gatherp_copy_pack"); //exchange data //(nplane,nstot) to (numz[ip],ns, poolnproc) @@ -80,6 +117,7 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const const int* numz_gps = this->numz; const int* startg_gps = this->startg; const int* startz_gps = this->startz; + ModuleBase::timer::start(this->classname, "gatherp_copy_unpack"); #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif @@ -90,14 +128,12 @@ void PW_Basis::gatherp_scatters(std::complex* in, std::complex* out) const int nzip = numz_gps[ip]; std::complex *outp0 = &out[startz_gps[ip]]; std::complex *inp0 = &in[startg_gps[ip]]; - std::complex *outp = &outp0[is * nz_gps]; - std::complex *inp = &inp0[is * nzip ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } + std::complex* outp = &outp0[is * nz_gps]; + const std::complex* inp = &inp0[is * nzip ]; + detail::copy_complex_buffer(inp, outp, nzip); } } + ModuleBase::timer::end(this->classname, "gatherp_copy_unpack"); #endif return; } @@ -118,6 +154,7 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const const int nst_ = this->nst; const int nz_ = this->nz; const int* istot2ixy_ = this->istot2ixy; + ModuleBase::timer::start(this->classname, "gathers_zero_serial"); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif @@ -125,20 +162,20 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const { out[i] = std::complex(0, 0); } + ModuleBase::timer::end(this->classname, "gathers_zero_serial"); + ModuleBase::timer::start(this->classname, "gathers_copy_serial"); #ifdef _OPENMP #pragma omp parallel for #endif for(int is = 0 ; is < nst_ ; ++is) { int ixy = istot2ixy_[is]; - std::complex *outp = &out[ixy*nz_]; - std::complex *inp = &in[is*nz_]; - for(int iz = 0 ; iz < nz_ ; ++iz) - { - outp[iz] = inp[iz]; - } + std::complex* outp = 
&out[ixy*nz_]; + const std::complex* inp = &in[is*nz_]; + detail::copy_complex_buffer(inp, outp, nz_); } + ModuleBase::timer::end(this->classname, "gathers_copy_serial"); return; } @@ -152,6 +189,7 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const const int* numz_ = this->numz; const int* startg_ = this->startg; const int* startz_ = this->startz; + ModuleBase::timer::start(this->classname, "gathers_copy_pack"); #ifdef _OPENMP #pragma omp parallel for collapse(2) #endif @@ -162,14 +200,12 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const int nzip = numz_[ip]; std::complex *outp0 = &out[startg_[ip]]; std::complex *inp0 = &in[startz_[ip]]; - std::complex *outp = &outp0[is * nzip]; - std::complex *inp = &inp0[is * nz_ ]; - for (int izip = 0; izip < nzip; ++izip) - { - outp[izip] = inp[izip]; - } + std::complex* outp = &outp0[is * nzip]; + const std::complex* inp = &inp0[is * nz_ ]; + detail::copy_complex_buffer(inp, outp, nzip); } } + ModuleBase::timer::end(this->classname, "gathers_copy_pack"); //exchange data //(numz[ip],ns, poolnproc) to (nplane,nstot) @@ -187,6 +223,7 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const } const int nrxx_gsp = this->nrxx; + ModuleBase::timer::start(this->classname, "gathers_zero_mpi"); #ifdef _OPENMP #pragma omp parallel for schedule(static) #endif @@ -194,10 +231,12 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const { out[i] = std::complex(0, 0); } + ModuleBase::timer::end(this->classname, "gathers_zero_mpi"); //change (nplane,nstot) to (nplane fftnxy) const int nstot = this->nstot; const int nplane = this->nplane; const int* istot2ixy = this->istot2ixy; + ModuleBase::timer::start(this->classname, "gathers_copy_unpack"); #ifdef _OPENMP #pragma omp parallel for #endif @@ -205,13 +244,11 @@ void PW_Basis::gathers_scatterp(std::complex* in, std::complex* out) const { int ixy = istot2ixy[istot]; //int ixy = (ixy / fftny)*ny + ixy 
% fftny; - std::complex *outp = &out[ixy * nplane]; - std::complex *inp = &in[istot * nplane]; - for (int iz = 0; iz < nplane; ++iz) - { - outp[iz] = inp[iz]; - } + std::complex* outp = &out[ixy * nplane]; + const std::complex* inp = &in[istot * nplane]; + detail::copy_complex_buffer(inp, outp, nplane); } + ModuleBase::timer::end(this->classname, "gathers_copy_unpack"); #endif return; } diff --git a/source/source_basis/module_pw/pw_transform.cpp b/source/source_basis/module_pw/pw_transform.cpp index 220b353e9d4..dc867e0cd86 100644 --- a/source/source_basis/module_pw/pw_transform.cpp +++ b/source/source_basis/module_pw/pw_transform.cpp @@ -34,13 +34,7 @@ void PW_Basis::real2recip(const std::complex* in, const int npw_ = this->npw; const int nxyz_ = this->nxyz; const int* ig2isz_ = this->ig2isz; -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ir = 0; ir < nrxx_; ++ir) - { - this->fft_bundle.get_auxr_data()[ir] = in[ir]; - } + detail::copy_complex_buffer_parallel(in, this->fft_bundle.get_auxr_data(), nrxx_); this->fft_bundle.fftxyfor(fft_bundle.get_auxr_data(), fft_bundle.get_auxr_data()); this->gatherp_scatters(this->fft_bundle.get_auxr_data(), this->fft_bundle.get_auxg_data()); @@ -199,13 +193,7 @@ void PW_Basis::recip2real(const std::complex* in, } else { -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ir = 0; ir < nrxx_; ++ir) - { - out[ir] = this->fft_bundle.get_auxr_data()[ir]; - } + detail::copy_complex_buffer_parallel(this->fft_bundle.get_auxr_data(), out, nrxx_); } ModuleBase::timer::end(this->classname, "recip2real"); } @@ -340,4 +328,4 @@ template void PW_Basis::recip2real(const std::complex* in, std::complex* out, const bool add, const double factor) const; -} // namespace ModulePW \ No newline at end of file +} // namespace ModulePW diff --git a/source/source_basis/module_pw/pw_transform_k.cpp b/source/source_basis/module_pw/pw_transform_k.cpp index a09aa2b686f..8c45e3d9b22 100644 --- 
a/source/source_basis/module_pw/pw_transform_k.cpp +++ b/source/source_basis/module_pw/pw_transform_k.cpp @@ -33,13 +33,7 @@ void PW_Basis_K::real2recip(const std::complex* in, assert(this->gamma_only == false); auto* auxr = this->fft_bundle.get_auxr_data(); -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ir = 0; ir < this->nrxx; ++ir) - { - auxr[ir] = in[ir]; - } + detail::copy_complex_buffer_parallel(in, auxr, this->nrxx); this->fft_bundle.fftxyfor(fft_bundle.get_auxr_data(), fft_bundle.get_auxr_data()); this->gatherp_scatters(this->fft_bundle.get_auxr_data(), this->fft_bundle.get_auxg_data()); @@ -200,13 +194,7 @@ void PW_Basis_K::recip2real(const std::complex* in, } else { -#ifdef _OPENMP -#pragma omp parallel for schedule(static) -#endif - for (int ir = 0; ir < this->nrxx; ++ir) - { - out[ir] = auxr[ir]; - } + detail::copy_complex_buffer_parallel(auxr, out, this->nrxx); } ModuleBase::timer::end(this->classname, "recip2real"); } diff --git a/source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp b/source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp index 84932bae2ff..039ea7c089a 100644 --- a/source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp +++ b/source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp @@ -2,6 +2,9 @@ #include "source_base/global_function.h" #include "source_base/constants.h" #include "source_base/matrix3.h" +#include +#include +#include /************************************************ * serial unit test of functions in pw_basis.cpp @@ -27,6 +30,7 @@ #define private public #include "../pw_basis_k.h" #include "../pw_basis.h" +#include "../pw_gatherscatter.h" #undef private #undef protected @@ -188,4 +192,99 @@ TEST_F(PWBasisKTEST, CollectLocalPW) EXPECT_EQ(basis_k.npwk_max,2721); } +TEST_F(PWBasisKTEST, ComplexTransformRoundTrip) +{ + ModulePW::PW_Basis_K basis_k(device_flag, precision_double); + double lat0 = 2.0; + ModuleBase::Matrix3 latvec(1.0,0.0,1.0, + 0.0,2.0,0.0, + 
0.0,0.0,2.0); + double gridecut = 30.0; + const bool gamma_only_in = false; + const double gk_ecut_in = 20.0; + const int nks_in = 1; + const ModuleBase::Vector3 kvec_d_in[1] = { {0.0, 0.0, 0.0} }; + const int distribution_type_in = 2; + const bool xprime_in = false; + + basis_k.initgrids(lat0, latvec, gridecut); + basis_k.initparameters(gamma_only_in, gk_ecut_in, nks_in, kvec_d_in, distribution_type_in, xprime_in); + ASSERT_NO_THROW(basis_k.setuptransform()); + ASSERT_NE(basis_k.npwk, nullptr); + ASSERT_GT(basis_k.npwk[0], 0); + + // Use reciprocal-space input because arbitrary real-space data is projected + // by the plane-wave cutoff and is not exactly recoverable. + std::vector> recip_in(basis_k.npwk[0]); + std::vector> real_space(basis_k.nrxx); + std::vector> recip_out(basis_k.npwk[0]); + for (int ig = 0; ig < basis_k.npwk[0]; ++ig) + { + const double real_part = (ig % 17 - 8) / 11.0; + const double imag_part = (ig % 19 - 9) / 13.0; + recip_in[ig] = std::complex(real_part, imag_part); + } + + basis_k.recip2real(recip_in.data(), real_space.data(), 0); + basis_k.real2recip(real_space.data(), recip_out.data(), 0); + + for (int ig = 0; ig < basis_k.npwk[0]; ++ig) + { + EXPECT_NEAR(recip_in[ig].real(), recip_out[ig].real(), 1e-10); + EXPECT_NEAR(recip_in[ig].imag(), recip_out[ig].imag(), 1e-10); + } +} + +TEST_F(PWBasisKTEST, CopyComplexBufferTimerBenchmark) +{ + if (std::getenv("ABACUS_PW_SIMD_TIMER_TEST") == nullptr) + { + GTEST_SKIP() << "Set ABACUS_PW_SIMD_TIMER_TEST=1 to run the copy timer benchmark."; + } + const int count = 1 << 20; + const int repeats = 64; + std::vector> src(count); + std::vector> copy_n_dst(count); + std::vector> scalar_dst(count); + + for (int i = 0; i < count; ++i) + { + src[i] = std::complex((i % 97) / 17.0, (i % 89) / 19.0); + } + + volatile double checksum = 0.0; + + const auto copy_n_start = std::chrono::steady_clock::now(); + for (int repeat = 0; repeat < repeats; ++repeat) + { + ModulePW::detail::copy_complex_buffer(src.data(), 
copy_n_dst.data(), count); + checksum += copy_n_dst[repeat].real(); + } + const auto copy_n_end = std::chrono::steady_clock::now(); + + const auto scalar_start = std::chrono::steady_clock::now(); + for (int repeat = 0; repeat < repeats; ++repeat) + { + for (int i = 0; i < count; ++i) + { + scalar_dst[i] = src[i]; + } + checksum += scalar_dst[repeat].imag(); + } + const auto scalar_end = std::chrono::steady_clock::now(); + + const double copy_n_time = std::chrono::duration(copy_n_end - copy_n_start).count(); + const double scalar_time = std::chrono::duration(scalar_end - scalar_start).count(); + const double bytes_moved = static_cast(count) * sizeof(std::complex) * repeats; + const double gib = bytes_moved / (1024.0 * 1024.0 * 1024.0); + + std::cout << "PW_SIMD_TEST copy_n_helper " << copy_n_time << " s, " + << gib / copy_n_time << " GiB/s\n"; + std::cout << "PW_SIMD_TEST scalar_loop " << scalar_time << " s, " + << gib / scalar_time << " GiB/s\n"; + std::cout << "PW_SIMD_TEST speedup copy_n/scalar " << scalar_time / copy_n_time + << ", checksum " << checksum << "\n"; + + ASSERT_EQ(copy_n_dst, scalar_dst); +} diff --git a/source/source_basis/module_pw/test_serial/pw_basis_test.cpp b/source/source_basis/module_pw/test_serial/pw_basis_test.cpp index ea678b9d97c..57ac8f06554 100644 --- a/source/source_basis/module_pw/test_serial/pw_basis_test.cpp +++ b/source/source_basis/module_pw/test_serial/pw_basis_test.cpp @@ -2,6 +2,7 @@ #include "source_base/global_function.h" #include "source_base/constants.h" #include "source_base/matrix3.h" +#include /************************************************ * serial unit test of functions in pw_basis.cpp @@ -362,3 +363,41 @@ TEST_F(PWBasisTEST,CollectUniqgg) pwb.collect_uniqgg(); EXPECT_EQ(pwb.ngg,78); } + +TEST_F(PWBasisTEST,ComplexTransformRoundTrip) +{ + double lat0 = 2.0; + ModuleBase::Matrix3 latvec(1.0,0.0,1.0, + 0.0,2.0,0.0, + 0.0,0.0,2.0); + double gridecut = 30.0; + bool gamma_only_in = false; + double pwecut_in = 20.0; + int 
distribution_type_in = 2; + bool xprime_in = false; + + pwb.initgrids(lat0, latvec, gridecut); + pwb.initparameters(gamma_only_in, pwecut_in, distribution_type_in, xprime_in); + ASSERT_NO_THROW(pwb.setuptransform()); + + // Use reciprocal-space input because arbitrary real-space data is projected + // by the plane-wave cutoff and is not exactly recoverable. + std::vector> recip_in(pwb.npw); + std::vector> real_space(pwb.nrxx); + std::vector> recip_out(pwb.npw); + for (int ig = 0; ig < pwb.npw; ++ig) + { + const double real_part = (ig % 11 - 5) / 7.0; + const double imag_part = (ig % 13 - 6) / 9.0; + recip_in[ig] = std::complex(real_part, imag_part); + } + + pwb.recip2real(recip_in.data(), real_space.data()); + pwb.real2recip(real_space.data(), recip_out.data()); + + for (int ig = 0; ig < pwb.npw; ++ig) + { + EXPECT_NEAR(recip_in[ig].real(), recip_out[ig].real(), 1e-10); + EXPECT_NEAR(recip_in[ig].imag(), recip_out[ig].imag(), 1e-10); + } +} diff --git a/source/source_basis/module_pw/test_serial/pw_simd_bench.cpp b/source/source_basis/module_pw/test_serial/pw_simd_bench.cpp new file mode 100644 index 00000000000..000bd0bde6f --- /dev/null +++ b/source/source_basis/module_pw/test_serial/pw_simd_bench.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "source_base/matrix3.h" +#include "source_base/timer.h" + +#include "../pw_basis.h" +#include "../pw_basis_k.h" + +namespace +{ + +using Clock = std::chrono::steady_clock; + +template +double measure_seconds(Func&& func) +{ + const auto start = Clock::now(); + func(); + const auto end = Clock::now(); + return std::chrono::duration(end - start).count(); +} + +void print_metric(const std::string& name, const double value) +{ + std::cout << "METRIC " << name << " " << std::fixed << std::setprecision(9) << value << '\n'; +} + +void print_timer_metric(const std::string& class_name, const std::string& timer_name) +{ + const auto class_it = 
ModuleBase::timer::timer_pool.find(class_name); + if (class_it == ModuleBase::timer::timer_pool.end()) + { + return; + } + const auto timer_it = class_it->second.find(timer_name); + if (timer_it == class_it->second.end()) + { + return; + } + print_metric("timer." + class_name + "." + timer_name + ".seconds", timer_it->second.cpu_second); + print_metric("timer." + class_name + "." + timer_name + ".calls", static_cast(timer_it->second.calls)); +} + +double run_pw_basis_roundtrip(ModulePW::PW_Basis& basis, + const int npw, + const int nrxx, + const int repeat) +{ + std::vector> recip_in(npw); + std::vector> real_space(nrxx); + std::vector> recip_out(npw); + for (int ig = 0; ig < npw; ++ig) + { + recip_in[ig] = std::complex((ig % 17 - 8) / 13.0, (ig % 19 - 9) / 17.0); + } + + for (int warmup = 0; warmup < 3; ++warmup) + { + basis.recip2real(recip_in.data(), real_space.data()); + basis.real2recip(real_space.data(), recip_out.data()); + } + + return measure_seconds([&]() { + for (int i = 0; i < repeat; ++i) + { + basis.recip2real(recip_in.data(), real_space.data()); + basis.real2recip(real_space.data(), recip_out.data()); + } + }); +} + +double run_pw_basis_k_roundtrip(ModulePW::PW_Basis_K& basis, + const int npw, + const int nrxx, + const int ik, + const int repeat) +{ + std::vector> recip_in(npw); + std::vector> real_space(nrxx); + std::vector> recip_out(npw); + for (int ig = 0; ig < npw; ++ig) + { + recip_in[ig] = std::complex((ig % 17 - 8) / 13.0, (ig % 19 - 9) / 17.0); + } + + for (int warmup = 0; warmup < 3; ++warmup) + { + basis.recip2real(recip_in.data(), real_space.data(), ik); + basis.real2recip(real_space.data(), recip_out.data(), ik); + } + + return measure_seconds([&]() { + for (int i = 0; i < repeat; ++i) + { + basis.recip2real(recip_in.data(), real_space.data(), ik); + basis.real2recip(real_space.data(), recip_out.data(), ik); + } + }); +} + +void bench_pw_basis_medium() +{ + ModuleBase::timer::timer_pool.clear(); + ModulePW::PW_Basis basis; + 
basis.initgrids(2.0, ModuleBase::Matrix3(1.0, 0.0, 1.0, + 0.0, 2.0, 0.0, + 0.0, 0.0, 2.0), + 30.0); + basis.initparameters(false, 20.0, 2, false); + basis.setuptransform(); + + const int repeat = 4096; + const double elapsed = run_pw_basis_roundtrip(basis, basis.npw, basis.nrxx, repeat); + print_metric("PW_Basis.medium.roundtrip.wall", elapsed); + print_metric("PW_Basis.medium.roundtrip.ms_per_op", elapsed / repeat * 1000.0); + print_metric("PW_Basis.medium.nrxx", static_cast(basis.nrxx)); + print_metric("PW_Basis.medium.npw", static_cast(basis.npw)); + + print_timer_metric("PW_Basis", "real2recip"); + print_timer_metric("PW_Basis", "recip2real"); + print_timer_metric("PW_Basis", "gatherp_copy_serial"); + print_timer_metric("PW_Basis", "gathers_copy_serial"); +} + +void bench_pw_basis_large() +{ + ModuleBase::timer::timer_pool.clear(); + ModulePW::PW_Basis basis; + basis.initgrids(2.0, ModuleBase::Matrix3(2.0, 0.0, 0.0, + 0.0, 2.0, 0.0, + 0.0, 0.0, 2.0), + 40.0); + basis.initparameters(false, 25.0, 2, false); + basis.setuptransform(); + + const int repeat = 2048; + const double elapsed = run_pw_basis_roundtrip(basis, basis.npw, basis.nrxx, repeat); + print_metric("PW_Basis.large.roundtrip.wall", elapsed); + print_metric("PW_Basis.large.roundtrip.ms_per_op", elapsed / repeat * 1000.0); + print_metric("PW_Basis.large.nrxx", static_cast(basis.nrxx)); + print_metric("PW_Basis.large.npw", static_cast(basis.npw)); + + print_timer_metric("PW_Basis", "real2recip"); + print_timer_metric("PW_Basis", "recip2real"); + print_timer_metric("PW_Basis", "gatherp_copy_serial"); + print_timer_metric("PW_Basis", "gathers_copy_serial"); +} + +void bench_pw_basis_k_medium() +{ + ModuleBase::timer::timer_pool.clear(); + ModulePW::PW_Basis_K basis("cpu", "double"); + const ModuleBase::Vector3 kvec_d[1] = {{0.0, 0.0, 0.0}}; + basis.initgrids(2.0, ModuleBase::Matrix3(1.0, 0.0, 1.0, + 0.0, 2.0, 0.0, + 0.0, 0.0, 2.0), + 30.0); + basis.initparameters(false, 20.0, 1, kvec_d, 2, false); + 
basis.setuptransform(); + + const int repeat = 4096; + const int ik = 0; + const double elapsed = run_pw_basis_k_roundtrip(basis, basis.npwk[ik], basis.nrxx, ik, repeat); + print_metric("PW_Basis_K.medium.roundtrip.wall", elapsed); + print_metric("PW_Basis_K.medium.roundtrip.ms_per_op", elapsed / repeat * 1000.0); + print_metric("PW_Basis_K.medium.nrxx", static_cast(basis.nrxx)); + print_metric("PW_Basis_K.medium.npw", static_cast(basis.npwk[ik])); + + print_timer_metric("PW_Basis_K", "real2recip"); + print_timer_metric("PW_Basis_K", "recip2real"); + print_timer_metric("PW_Basis_K", "gatherp_copy_serial"); + print_timer_metric("PW_Basis_K", "gathers_copy_serial"); +} + +} // namespace + +int main() +{ + ModuleBase::timer::enable(); + + bench_pw_basis_medium(); + bench_pw_basis_large(); + bench_pw_basis_k_medium(); + + std::ofstream nullstream("/dev/null"); + ModuleBase::timer::finish(nullstream, false, false); + return 0; +} diff --git a/work_docs/feat_simd_optimization_process_report_2026-06-06.md b/work_docs/feat_simd_optimization_process_report_2026-06-06.md new file mode 100644 index 00000000000..b9bf6e3fd5d --- /dev/null +++ b/work_docs/feat_simd_optimization_process_report_2026-06-06.md @@ -0,0 +1,284 @@ +# `feat/simd` 优化过程报告 + +## 1. 总览 + +本报告结合 `feat/simd` 相对 `develop` 的提交历史与 **2026-06-06** 采集的性能数据,还原并总结这条优化分支的演进过程。 + +该分支的目标并非修改 `module_pw` 的数学结果,而是降低 transform 路径中复数缓冲区重复搬运的开销,重点覆盖以下阶段: + +- gather planes / scatter sticks +- gather sticks / scatter planes +- `real2recip` 与 `recip2real` 周围的顶层缓冲区拷贝 + +最终性能呈”部分显著成功、部分仍需验证”的状态: + +- `PW_Basis` 路径:约 **1.5x** 的明确提升 +- `PW_Basis_K.medium`:功能正确,但未稳定跑出性能收益 + +这条分支的价值不仅在于最终加速比,更在于其演进过程本身——它展示了一条从”先试一个低层 copy 优化”起步,逐步推进到”可维护、可测试、可量化”的完整优化路径。 + +## 2. 
按 commit 展开的优化过程 + +### `7c58a45ce` `have a try` + +这条分支的首次优化尝试，直接瞄准 `pw_gatherscatter.h` 中高频重复的复数拷贝循环。 + +#### 背景与问题 + +原始实现按 `std::complex` 逐元素复制：每轮循环只处理一个复数，且循环在多个 gather/scatter 路径里反复出现。对编译器而言，这种写法未能明确表达“底层其实是连续交错的实部/虚部标量流”，在 transform 多次调用下，拷贝开销容易成为热点。 + +#### 改动 + +在 `gatherp_scatters` 与 `gathers_scatterp` 的每个内部循环中（[pw_gatherscatter.h](../source/source_basis/module_pw/pw_gatherscatter.h)），将原始逐元素复数拷贝改写为： + +- 复数缓冲区通过 `reinterpret_cast` 转为连续交错的标量数组 +- 以 `2 * count` 个标量取代 `count` 个复数对象进行复制 +- 引入 `__restrict__` 指针消除别名冲突 +- 对 GCC 构建增加 `#pragma GCC ivdep` 提示向量化 + +```cpp +// 原始实现：每轮一个复数 +for (int iz = 0; iz < nz_; ++iz) { outp[iz] = inp[iz]; } + +// 优化后：连续标量流 + __restrict__ + ivdep +T* __restrict__ outp_r = reinterpret_cast<T*>(outp); +const T* __restrict__ inp_r = reinterpret_cast<const T*>(inp); +#pragma GCC ivdep +for (int iz = 0; iz < 2 * nz_; ++iz) { outp_r[iz] = inp_r[iz]; } +``` + +核心思路：将复数拷贝重写为更接近“连续标量流”的形式，降低编译器向量化门槛。 + +#### 局限 + +方向正确，但版本不成熟： + +- 优化逻辑在多个位置重复展开 +- 依赖 GCC 特定 pragma，可移植性一般 +- 仅覆盖 gather/scatter 内部循环，未触及 `pw_transform.cpp` / `pw_transform_k.cpp` 的顶层 copy +- 缺乏针对性的正确性测试 + +该 commit 定位为“验证思路是否值得推进”的试探性步骤，而非最终形态。 + +### `c268969f9` `refine complex buffer copies in module_pw` + +本次分支最核心的一次重构：将前一个提交的“局部尝试”抽象为可复用机制。 + +#### 背景与问题 + +第一轮尝试后，面临两个主要问题： + +1. 低层 copy 优化代码在多个位置重复，维护成本高 +2. 
优化范围不完整，未覆盖整个 transform 拷贝链路 + +#### 改动 + +在 [pw_gatherscatter.h](../source/source_basis/module_pw/pw_gatherscatter.h) 中引入两个 helper（第 12–44 行）： + +- `detail::copy_complex_buffer` — 基于 `std::copy_n` 的串行复数缓冲区拷贝 +- `detail::copy_complex_buffer_parallel` — 按 1024 元素分块、配合 OpenMP `#pragma omp parallel for` 的并行版本 + +```cpp +// 串行版本（pw_gatherscatter.h:12-20） +template <typename T> +inline void copy_complex_buffer(const std::complex<T>* in, + std::complex<T>* out, const int count) { + if (count <= 0) return; + std::copy_n(in, count, out); +} + +// 并行版本（pw_gatherscatter.h:25-44）：大于 chunk_size 时启动 OpenMP +template <typename T> +inline void copy_complex_buffer_parallel(const std::complex<T>* in, + std::complex<T>* out, const int count) { + constexpr int chunk_size = 1024; + if (count <= chunk_size) { copy_complex_buffer(in, out, count); return; } + #pragma omp parallel for schedule(static) + for (int offset = 0; offset < count; offset += chunk_size) { + int chunk_count = std::min(chunk_size, count - offset); + std::copy_n(in + offset, chunk_count, out + offset); + } +} +``` + +并将 helper 应用于以下路径： + +| 方法 | 位置 | 调用 | +|---|---|---| +| `PW_Basis::gatherp_scatters` | [pw_gatherscatter.h:72,94,133](../source/source_basis/module_pw/pw_gatherscatter.h#L72) | `copy_complex_buffer` | +| `PW_Basis::gathers_scatterp` | [pw_gatherscatter.h:176,205,249](../source/source_basis/module_pw/pw_gatherscatter.h#L176) | `copy_complex_buffer` | +| `PW_Basis::real2recip` | [pw_transform.cpp:37](../source/source_basis/module_pw/pw_transform.cpp#L37) | `copy_complex_buffer_parallel` | +| `PW_Basis::recip2real` | [pw_transform.cpp:196](../source/source_basis/module_pw/pw_transform.cpp#L196) | `copy_complex_buffer_parallel` | +| `PW_Basis_K::real2recip` | [pw_transform_k.cpp:36](../source/source_basis/module_pw/pw_transform_k.cpp#L36) | `copy_complex_buffer_parallel` | +| `PW_Basis_K::recip2real` | [pw_transform_k.cpp:197](../source/source_basis/module_pw/pw_transform_k.cpp#L197) | `copy_complex_buffer_parallel` | + +至此，优化目标不再局限于局部 
gather/scatter 循环，而是扩展到 FFT transform 前后整条复数缓冲区搬运链的 **6 个调用点**。 + +#### 为何是关键转折 + +最终 benchmark 印证了这一设计的重要性：`PW_Basis` 端到端加速比明显高于单个 gather/scatter timer 的提升。这说明最终收益并非仅来自局部循环，而是来自 **gather/scatter 内部 copy** 与 **transform 顶层 staging copy** 两端的共同改善。 + +“沿整条 copy 链统一优化”的思路，正是从这个 commit 开始建立的。 + +#### 仍缺什么 + +性能路径已经收拢，但正确性保障不足。copy-heavy 优化极易出现“看起来能跑，但数值细节有偏差”的问题，需要补充有针对性的 round-trip 测试。 + +### `754fe85bb` `add module_pw complex transform round-trip tests` + +补上优化过程中最缺的一块：针对 transform 正确性的专项测试。 + +#### 背景与问题 + +复数拷贝优化存在几个典型风险： + +- 实部和虚部次序被破坏 +- 拷贝长度正确但布局错误 +- round-trip 表面上能工作，但数值发生微小漂移 + +若无专门测试，这些问题往往在后期才暴露。 + +#### 改动 + +为两条路径分别增加串行单元测试： + +- `PWBasisTEST.ComplexTransformRoundTrip` — [test_serial/pw_basis_test.cpp:367-403](../source/source_basis/module_pw/test_serial/pw_basis_test.cpp#L367) +- `PWBasisKTEST.ComplexTransformRoundTrip` — [test_serial/pw_basis_k_test.cpp:195-236](../source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp#L195) + +测试流程：构造确定性的 reciprocal-space 输入 → 调用 `recip2real` → 再调用 `real2recip` → 对恢复的 reciprocal-space 数据逐元素比较。 + +```cpp +// pw_basis_test.cpp:367-403 — 简化示意 +std::vector<std::complex<double>> recip_in(pwb.npw); +// 用确定性序列 (ig % 11 - 5) / 7.0 + i * (ig % 13 - 6) / 9.0 填充 +pwb.recip2real(recip_in.data(), real_space.data()); +pwb.real2recip(real_space.data(), recip_out.data()); +for (int ig = 0; ig < pwb.npw; ++ig) { + EXPECT_NEAR(recip_in[ig].real(), recip_out[ig].real(), 1e-10); + EXPECT_NEAR(recip_in[ig].imag(), recip_out[ig].imag(), 1e-10); +} +``` + +选择 reciprocal-space 作为输入源是刻意的：若直接取任意 real-space 数据做 round-trip，plane-wave cutoff 投影本身即可能导致数据无法精确恢复，从而制造与优化无关的假失败（详见 [pw_basis_test.cpp:383-384](../source/source_basis/module_pw/test_serial/pw_basis_test.cpp#L383) 注释）。 + +#### 为何重要 + +这是整条优化链中“建立安全护栏”的一步。只有将 round-trip 正确性明确固定下来，后续的性能改写才不会演变成“为了更快，悄悄牺牲数值行为”。 + +### `25ebe2e30` `document module_pw copy helpers and tests` + +改动量不大，但让分支从“能跑”走向“容易理解和维护”。 + +#### 背景与问题 + +helper 与测试补上之后，若设计意图缺乏说明，后续维护者仍可能误判： + +- 为何 helper 要按交错标量流的思路来写 +- 为何测试从 
reciprocal-space 输入开始，而非任意 real-space 数组 +- 为何同时保留串行 helper 和顶层 parallel helper + +#### 改动 + +补充几类说明性注释（详见 [pw_gatherscatter.h:22-23](../source/source_basis/module_pw/pw_gatherscatter.h#L22)、[pw_basis_test.cpp:383-384](../source/source_basis/module_pw/test_serial/pw_basis_test.cpp#L383)）： + +- helper 设计意图：让编译器更容易处理连续的实部/虚部数据 +- round-trip 测试起点：reciprocal-space 输入才是合理选择 +- 并行策略：顶层 transform copy 拥有自己的 OpenMP 区域；gather/scatter 内部循环则在已有并行区中调用非并行 helper + +#### 价值 + +它本身不直接提升性能，但降低了后续维护中“误删优化”“误改测试”“看不懂所以回退”的风险。对于底层性能优化而言，可解释性不是附属品，而是长期稳定性的组成部分。 + +### `f3a0b6b4c` `Merge branch 'deepmodeling:develop' into feat/simd` + +该 merge commit 并非 SIMD 优化的实现部分，但在演进过程中承担了“集成与收敛”的角色。 + +#### 意义 + +- 将分支与当时的 `develop` 同步 +- 确保 SIMD 改动能与主线近期演化共存 +- 降低后续合并时大规模冲突的风险 + +因此，它应被理解为“将性能优化保持在主线可集成状态”的必要步骤，而非单独的性能改进。 + +### `3245d2d31` `remove pragma GCC ivdep and use std::copy_n` + +这是分支最后一个关键的“收口”与“工程化”提交。 + +#### 背景与问题 + +此前 helper 仍较依赖 GCC 风格： + +- `reinterpret_cast` 到标量流 +- `#pragma GCC ivdep` + +这种写法虽可能有效，但存在明显问题：编译器耦合强、可读性弱、维护者理解成本高。 + +#### 改动 + +**标准库化** — 将 helper 调整为更可移植的实现（[pw_gatherscatter.h:12-44](../source/source_basis/module_pw/pw_gatherscatter.h#L12)）： + +- `copy_complex_buffer` → 内层使用 `std::copy_n`，移除 `reinterpret_cast`、`__restrict__` 与 `#pragma GCC ivdep` +- `copy_complex_buffer_parallel` → 按 1024 元素分块，在大 buffer 上以 OpenMP `#pragma omp parallel for` 调度 `std::copy_n` +- 显式补充 `#include <algorithm>` + +```cpp +// 最终版：std::copy_n 替代手工循环（pw_gatherscatter.h:19） +std::copy_n(in, count, out); +``` + +**可观测性** — 在 copy-sensitive 路径补充 `ModuleBase::timer` 时间戳（[pw_gatherscatter.h](../source/source_basis/module_pw/pw_gatherscatter.h) 的 6 处 
start/end:[L63-L74](../source/source_basis/module_pw/pw_gatherscatter.h#L63)、[L85-L96](../source/source_basis/module_pw/pw_gatherscatter.h#L85)、[L120-L136](../source/source_basis/module_pw/pw_gatherscatter.h#L120)、[L157-L178](../source/source_basis/module_pw/pw_gatherscatter.h#L157)、[L192-L208](../source/source_basis/module_pw/pw_gatherscatter.h#L192)、[L239-L251](../source/source_basis/module_pw/pw_gatherscatter.h#L239));并在 `real2recip`/`recip2real` 顶层([pw_transform.cpp:30-66](../source/source_basis/module_pw/pw_transform.cpp#L30)、[pw_transform_k.cpp:32-68](../source/source_basis/module_pw/pw_transform_k.cpp#L32))也统一添加 timer。 + +**验证加强**: + +- 在 [pw_basis_k_test.cpp:213-214](../source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp#L213) 增加 `npwk` 合法性检查:`ASSERT_NE(basis_k.npwk, nullptr)` 与 `ASSERT_GT(basis_k.npwk[0], 0)` +- 在 [pw_basis_k_test.cpp:238-274](../source/source_basis/module_pw/test_serial/pw_basis_k_test.cpp#L238) 增加可选开启的 copy benchmark(`CopyComplexBufferTimerBenchmark`,通过环境变量 `ABACUS_PW_SIMD_TIMER_TEST` 控制) + +#### 为何重要 + +这是分支从”手工调优尝试”走向”标准库驱动、便于测量、便于维护”的关键一步: + +- 不再依赖 GCC 特有 pragma 表达优化意图 +- 代码语义更清晰,更接近标准 C++ +- 性能热点变得可测量,而非仅凭感觉判断 + +本次 `feat/simd` 与 `develop` 的性能对比之所以能顺利完成,正是因为这个提交将 timer 与 helper 结构整理到了适合 benchmark 的状态。 + +## 3. 
性能结果到提交的映射 + +最终 benchmark 可反推各阶段的贡献分布。 + +### `PW_Basis` 路径 + +端到端约 **1.53x–1.55x** 的提升,表明以下提交的组合有效: + +- `7c58a45ce` — 首次优化尝试 +- `c268969f9` — 核心抽象与链路扩展 +- `3245d2d31` — 工程化收口 + +尤其值得关注的是:gather/scatter copy timer 有提升,而 `real2recip` / `recip2real` 顶层 timer([pw_transform.cpp:30-66](../source/source_basis/module_pw/pw_transform.cpp#L30)、[pw_transform_k.cpp:32-68](../source/source_basis/module_pw/pw_transform_k.cpp#L32))提升更明显。这恰好印证了 `c268969f9` 的核心思路——收益来自整条 transform copy 链的统一优化,而非局部循环的孤立改进。 + +### `PW_Basis_K` 路径 + +`PW_Basis_K.medium` 的结果更为复杂: + +- `754fe85bb` 保证了正确性 +- `25ebe2e30` 和 `3245d2d31` 提升了可解释性与可测量性 +- 但当前串行中等规模基准未给出净加速结果 + +这并不意味着分支没有价值,而是揭示了明显的证据分层:对 `PW_Basis` 收益明确,对本次测试的 `PW_Basis_K` case 尚不充分。从优化过程本身来看,这同样是有价值的——分支不仅做了优化,还通过测试和 timer 明确暴露了”哪些地方收益明显、哪些地方还没跑出来”。 + +## 4. 总结 + +`feat/simd` 展示了一条完整的性能优化路径: + +1. **试探** — 将热点复数拷贝循环改写为利于向量化的低层形式 +2. **抽象** — 将局部技巧封装为 helper,扩展至整条 transform 拷贝链路 +3. **验证** — 增加 round-trip 测试,确保优化不破坏数值正确性 +4. **解释** — 补充注释与设计说明,确保可维护性 +5. **工程化** — 以 `std::copy_n` 替换编译器特定 pragma,提升可移植性 +6. **可观测** — 补充 timer,使性能结论可量化验证 + +与 `develop` 对比后,该分支在 `PW_Basis` 路径上的成功是明确的:带来了可重复、可解释的性能提升。在 `PW_Basis_K.medium` 测试口径下,它保持了正确性,但尚未取得稳定加速的充分证据。 + +这条分支最值得肯定的,不仅是它让某些热点更快了,更在于它将优化本身做成了一个完整的工程过程——有实验、有抽象、有测试、有注释、有 timer,也有对收益边界的诚实呈现。 diff --git a/work_docs/module_pw_simd_perf_compare_2026-06-06.md b/work_docs/module_pw_simd_perf_compare_2026-06-06.md new file mode 100644 index 00000000000..16e082b930b --- /dev/null +++ b/work_docs/module_pw_simd_perf_compare_2026-06-06.md @@ -0,0 +1,142 @@ +# `feat/simd` 与 `develop` 在 `module_pw` 上的性能对比报告 + +## 1. 
对比范围 + +本报告对比了 **2026-06-06** 当天 `feat/simd` 与 `develop` 两个分支在 `module_pw` 模块上的性能表现。 + +- 优化分支:`feat/simd` +- 基线分支:`develop` +- 对比重点:SIMD 优化涉及到的复数缓冲区拷贝路径 +- 基准入口:`source/source_basis/module_pw/test_serial/pw_simd_bench.cpp` +- 构建选项:`-DENABLE_MPI=OFF -DUSE_OPENMP=OFF -DUSE_ELPA=OFF -DBUILD_TESTING=OFF` + +为了保证对比公平,`develop` 分支是在独立临时 worktree `/home/aunixt/abacus-develop-develop` 中测得,临时只补充了两类内容: + +1. 与 `feat/simd` 完全一致的 `MODULE_PW_simd_bench` 基准程序 +2. 与优化分支同名的 `ModuleBase::timer` 计时点,用于记录相同的 gather/scatter 拷贝阶段 + +对 `develop` 没有回移植任何 SIMD 优化逻辑,基线分支的算法行为保持不变。 + +## 2. 基准设计 + +本次基准使用 reciprocal-space round-trip transform 作为统一测试口径。 + +- `PW_Basis.medium` + - 晶格:`Matrix3(1,0,1; 0,2,0; 0,0,2)` + - `gridecut=30.0`,`pwecut=20.0` + - `nrxx=320`,`npw=49` + - 重复次数:`4096` +- `PW_Basis.large` + - 晶格:`Matrix3(2,0,0; 0,2,0; 0,0,2)` + - `gridecut=40.0`,`pwecut=25.0` + - `nrxx=729`,`npw=147` + - 重复次数:`2048` +- `PW_Basis_K.medium` + - 单个 k 点:`{0,0,0}` + - 几何参数与 `PW_Basis.medium` 相同 + - `nrxx=320`,`npwk=49` + - 重复次数:`4096` + +两个分支都在同一台机器上各运行 **3 次**,最终报告采用 **3 次结果的中位数** 作为比较依据。 + +## 3. 
中位数结果 + +### 3.1 端到端 round-trip 耗时 + +| 用例 | 指标 | `develop` 中位数 | `feat/simd` 中位数 | 加速比 | +|---|---:|---:|---:|---:| +| `PW_Basis.medium` | ms/op | 0.001819651 | 0.001192664 | **1.526x** | +| `PW_Basis.large` | ms/op | 0.004269503 | 0.002761381 | **1.546x** | +| `PW_Basis_K.medium` | ms/op | 0.001135719 | 0.001236483 | **0.919x** | + +### 3.2 与拷贝路径相关的 timer 分解结果 + +| 用例 | Timer 名称 | `develop` 中位数 (s) | `feat/simd` 中位数 (s) | 加速比 | +|---|---|---:|---:|---:| +| `PW_Basis.medium` | `real2recip` | 0.002785 | 0.001803 | **1.545x** | +| `PW_Basis.medium` | `recip2real` | 0.004378 | 0.002761 | **1.586x** | +| `PW_Basis.medium` | `gatherp_copy_serial` | 0.000396 | 0.000311 | **1.273x** | +| `PW_Basis.medium` | `gathers_copy_serial` | 0.000528 | 0.000341 | **1.548x** | +| `PW_Basis.large` | `real2recip` | 0.003565 | 0.002367 | **1.506x** | +| `PW_Basis.large` | `recip2real` | 0.005033 | 0.003151 | **1.597x** | +| `PW_Basis.large` | `gatherp_copy_serial` | 0.000361 | 0.000264 | **1.367x** | +| `PW_Basis.large` | `gathers_copy_serial` | 0.000333 | 0.000293 | **1.137x** | +| `PW_Basis_K.medium` | `real2recip` | 0.001949 | 0.002108 | 0.925x | +| `PW_Basis_K.medium` | `recip2real` | 0.002479 | 0.002634 | 0.941x | +| `PW_Basis_K.medium` | `gatherp_copy_serial` | 0.000342 | 0.000354 | 0.966x | +| `PW_Basis_K.medium` | `gathers_copy_serial` | 0.000342 | 0.000309 | 1.107x | + +## 4. 
结果解读 + +### 4.1 哪些部分获得了明显提升 + +在本次测试口径下,`feat/simd` 在 `PW_Basis` 路径上表现出比较稳定的收益,端到端 round-trip 性能大约提升 **1.5 倍**。这种提升不仅体现在总耗时上,也体现在顶层 transform timer 上: + +- `PW_Basis.medium`:`real2recip` 与 `recip2real` 分别提升了约 **1.55x** 和 **1.59x** +- `PW_Basis.large`:`real2recip` 与 `recip2real` 提升约 **1.51x 到 1.60x** + +这与分支中的实现修改是一致的。`pw_gatherscatter.h`、`pw_transform.cpp` 和 `pw_transform_k.cpp` 的改动并没有改变 FFT 的数学流程,而是降低了 transform 前后重复复数缓冲区搬运的成本。 + +### 4.2 性能收益主要来自哪里 + +从 copy-phase timer 可以直接看到,优化分支确实改善了串行 gather/scatter 阶段的拷贝开销: + +- `gatherp_copy_serial`:提升约 **1.27x 到 1.37x** +- `gathers_copy_serial`:提升约 **1.14x 到 1.55x** + +而端到端加速比(约 1.53x–1.55x)整体上高于这些局部 timer(仅 `PW_Basis.medium` 的 `gathers_copy_serial` 与之相当,约 1.55x),说明收益不只来自 gather/scatter 内部循环,还来自顶层 transform 中的连续缓冲区拷贝优化,具体包括: + +- `PW_Basis::real2recip` +- `PW_Basis::recip2real` +- `PW_Basis_K::real2recip` +- `PW_Basis_K::recip2real` + +也正因为覆盖了更完整的 copy 链路,所以整体 transform 路径的收益会大于单个 gather/scatter 子阶段。 + +### 4.3 哪些部分在本次基准中没有体现提升 + +`PW_Basis_K.medium` 在这组串行微基准下没有表现出净提升,反而中位数上出现了轻微回退: + +- `develop`:`0.001135719 ms/op` +- `feat/simd`:`0.001236483 ms/op` +- 比值:`0.919x` + +内部 timer 也说明这个 case 当前更接近噪声区间: + +- `gatherp_copy_serial` 基本持平 +- `gathers_copy_serial` 有小幅提升 +- 顶层 `real2recip` / `recip2real` 略慢于基线 + +因此,从现有证据出发,更准确的结论应当是: + +- `PW_Basis`:SIMD/copy 重构收益明确 +- `PW_Basis_K.medium`:在这组基准下,收益暂时没有被证明出来 + +## 5. 原始三轮数据 + +### 5.1 `feat/simd` + +| 轮次 | `PW_Basis.medium` ms/op | `PW_Basis.large` ms/op | `PW_Basis_K.medium` ms/op | +|---|---:|---:|---:| +| 1 | 0.001192664 | 0.002739758 | 0.001266641 | +| 2 | 0.001468912 | 0.002972134 | 0.001236483 | +| 3 | 0.001141960 | 0.002761381 | 0.001045821 | + +### 5.2 `develop` + +| 轮次 | `PW_Basis.medium` ms/op | `PW_Basis.large` ms/op | `PW_Basis_K.medium` ms/op | +|---|---:|---:|---:| +| 1 | 0.001819651 | 0.004574592 | 0.001130419 | +| 2 | 0.001776332 | 0.004053817 | 0.001147463 | +| 3 | 0.002436848 | 0.004269503 | 0.001135719 | + +## 6. 
最终结论 + +从 `module_pw` 这次对比来看,`feat/simd` 在 `PW_Basis` 路径上带来了**明确且可重复的性能提升**,在本次串行 round-trip 微基准中,中位数加速比约为 **1.53x 到 1.55x**。 + +但在当前这组 `PW_Basis_K.medium` 基准中,并没有观察到同样明确的收益。因此更客观的总结应该是: + +- `PW_Basis`:优化成功,收益已经被清楚证明 +- `PW_Basis_K.medium`:功能正确性保持不变,但当前基准下性能收益尚未建立 + +如果后续还需要继续补充性能论证,下一步更值得做的是增加更大的 `PW_Basis_K` 用例,或者在 MPI / OpenMP 配置下继续测试,以便观察更大问题规模下 helper 开销被摊薄之后,是否能更充分体现 SIMD/copy 优化的价值。