From 2d4ef286cdb928dc5a9e42a1586e8978c40c2bbc Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 3 Jun 2026 11:38:48 +0800 Subject: [PATCH 1/2] Optimize cube output formatting --- source/module_io/write_cube.cpp | 311 +++++++++++++++++++++++++++++--- 1 file changed, 290 insertions(+), 21 deletions(-) diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp index dbb4e469cfc..d3368074910 100644 --- a/source/module_io/write_cube.cpp +++ b/source/module_io/write_cube.cpp @@ -1,9 +1,266 @@ #include "module_base/element_name.h" +#include "module_base/global_variable.h" #include "module_io/cube_io.h" #include "module_parameter/parameter.h" -#include #include "module_hamilt_pw/hamilt_pwdft/parallel_grid.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _OPENMP +#include +#endif + +namespace +{ +int choose_cube_rows_per_chunk(const int nz, const int precision) +{ + constexpr std::size_t target_chunk_bytes = 8 * 1024 * 1024; + const std::size_t approx_bytes_per_value = static_cast(std::max(precision + 9, 16)); + const std::size_t approx_line_breaks = static_cast(std::max(nz / 6, 1)); + const std::size_t approx_bytes_per_row = static_cast(std::max(nz, 1)) * approx_bytes_per_value + + approx_line_breaks + 1; + const std::size_t rows = target_chunk_bytes / std::max(approx_bytes_per_row, 1); + return static_cast(std::max(rows, 1)); +} + +// Estimates buffer size needed for a chunk of cube data +std::size_t estimate_cube_chunk_size(const int row_count, + const int nz, + const int ndata_line, + const int precision) +{ + const std::size_t bytes_per_value = static_cast(std::max(precision + 16, 32)); + const int inner_line_breaks = (ndata_line > 0 && nz > 0) ? (nz - 1) / ndata_line : 0; + return static_cast(row_count) + * (static_cast(nz) * bytes_per_value + + static_cast(inner_line_breaks + 1)); +} + +void format_cube_data_rows_to_buffer(const std::vector& data, + const int row_begin, + const int row_end, + const int nz, + const int precision, + const int ndata_line, + std::string& buffer) +{ + const int row_count = row_end - row_begin; + buffer.resize(estimate_cube_chunk_size(row_count, nz, ndata_line, precision)); + if (buffer.empty()) + { + return; + } + + char* ptr = &buffer[0]; + char* end = ptr + buffer.size(); + + const auto ensure_space = [&buffer, &ptr, &end](const std::size_t required) { + const std::size_t used = static_cast(ptr - &buffer[0]); + if (static_cast(end - ptr) >= required) + { + return; + } + + const std::size_t grown_size = std::max(buffer.size() + required, buffer.size() * 2 + 1); + buffer.resize(grown_size); + ptr = &buffer[0] + used; + end = &buffer[0] + buffer.size(); + }; + + for (int ixy = row_begin; ixy < row_end; ++ixy) + { + for (int iz = 0; iz < nz; ++iz) + { + ensure_space(static_cast(std::max(precision + 16, 32))); + int written = std::snprintf(ptr, + static_cast(end - ptr), + " %.*e", + precision, + data[ixy * nz + iz]); + if (written < 0) + { + written = 0; + } + if (written >= end - ptr) + { + ensure_space(static_cast(written) + 1); + written = std::snprintf(ptr, + static_cast(end - ptr), + " %.*e", + precision, + data[ixy * nz + iz]); + if (written < 0) + { + written = 0; + } + } + ptr += written; + + if ((iz + 1) % ndata_line == 0 && iz != nz - 1) + { + ensure_space(1); + *ptr++ = '\n'; + } + } + ensure_space(1); + *ptr++ = '\n'; + } + + buffer.resize(static_cast(ptr - &buffer[0])); +} + +std::string format_cube_data_chunk(const std::vector& data, + const int row_begin, + const int row_end, + const int nz, + const int precision, + const int ndata_line) +{ + std::string data_buffer; + format_cube_data_rows_to_buffer(data, row_begin, row_end, nz, precision, ndata_line, data_buffer); + return data_buffer; +} + +std::vector format_cube_data_chunk_parallel(const std::vector& data, + const int row_begin, + const int row_end, + const int nz, + const int precision, + const int ndata_line) +{ +#ifdef _OPENMP + const int row_count = row_end - row_begin; + const int max_threads = std::max(omp_get_max_threads(), 1); + const int subchunk_count = std::min(max_threads, row_count); + if (subchunk_count <= 1) + { + return std::vector{format_cube_data_chunk(data, row_begin, row_end, nz, precision, ndata_line)}; + } + + std::vector subchunk_buffers(subchunk_count); + int actual_threads = 1; + +#pragma omp parallel num_threads(subchunk_count) + { + const int thread_id = omp_get_thread_num(); + const int thread_count = omp_get_num_threads(); +#pragma omp single + { + actual_threads = thread_count; + } + + const int sub_begin = row_begin + row_count * thread_id / thread_count; + const int sub_end = row_begin + row_count * (thread_id + 1) / thread_count; + format_cube_data_rows_to_buffer(data, + sub_begin, + sub_end, + nz, + precision, + ndata_line, + subchunk_buffers[thread_id]); + } + subchunk_buffers.resize(static_cast(actual_threads)); + return subchunk_buffers; +#else + return std::vector{format_cube_data_chunk(data, row_begin, row_end, nz, precision, ndata_line)}; +#endif +} + +class AsyncCubeWriter +{ + public: + explicit AsyncCubeWriter(std::ofstream& ofs, const std::size_t max_queue_size) + : ofs_(ofs), max_queue_size_(std::max(max_queue_size, 1)), worker_(&AsyncCubeWriter::run, this) + { + } + + ~AsyncCubeWriter() + { + finish(); + } + + void push(std::vector&& buffers) + { + std::unique_lock lock(mutex_); + cv_not_full_.wait(lock, [this] { return queue_.size() < max_queue_size_ || finished_; }); + if (finished_) + { + return; + } + queue_.push_back(std::move(buffers)); + cv_not_empty_.notify_one(); + } + + void finish() + { + { + std::lock_guard lock(mutex_); + if (finished_) + { + return; + } + finished_ = true; + } + cv_not_empty_.notify_one(); + cv_not_full_.notify_all(); + if (worker_.joinable()) + { + worker_.join(); + } + } + + private: + void run() + { + while (true) + { + std::vector buffers; + { + std::unique_lock lock(mutex_); + cv_not_empty_.wait(lock, [this] { return finished_ || !queue_.empty(); }); + if (queue_.empty()) + { + if (finished_) + { + break; + } + continue; + } + + buffers = std::move(queue_.front()); + queue_.pop_front(); + cv_not_full_.notify_one(); + } + + for (const std::string& buffer : buffers) + { + ofs_.write(buffer.data(), buffer.size()); + } + } + } + + std::ofstream& ofs_; + const std::size_t max_queue_size_; + std::deque> queue_; + std::mutex mutex_; + std::condition_variable cv_not_empty_; + std::condition_variable cv_not_full_; + bool finished_ = false; + std::thread worker_; +}; +} // namespace + void ModuleIO::write_vdata_palgrid( const Parallel_Grid& pgrid, const double* const data, @@ -123,6 +380,7 @@ void ModuleIO::write_vdata_palgrid( } } write_cube(fn, comment, ucell->nat, { 0.0, 0.0, 0.0 }, nx, ny, nz, dx, dy, dz, atom_type, atom_charge, atom_pos, data_xyz_full, precision); + end = time(nullptr); ModuleBase::GlobalFunc::OUT_TIME("write_vdata_palgrid", start, end); } @@ -158,38 +416,49 @@ void ModuleIO::write_cube(const std::string& file, assert(atom_pos.size() >= natom); for (int i = 0;i < natom;++i) { assert(atom_pos[i].size() >= 3); } assert(data.size() >= nx * ny * nz); + assert(precision >= 0); + assert(ndata_line > 0); std::ofstream ofs(file); - for (int i = 0;i < 2;++i) { ofs << comment[i] << "\n"; } + std::ostringstream header_stream; + for (int i = 0;i < 2;++i) { header_stream << comment[i] << "\n"; } - ofs << std::fixed; - ofs << std::setprecision(1); // as before + header_stream << std::fixed; + header_stream << std::setprecision(1); // as before - ofs << natom << " " << origin[0] << " " << origin[1] << " " << origin[2] << " \n"; + header_stream << natom << " " << origin[0] << " " << origin[1] << " " << origin[2] << " \n"; - ofs << std::setprecision(6); //as before - ofs << nx << " " << dx[0] << " " << dx[1] << " " << dx[2] << "\n"; - ofs << ny << " " << dy[0] << " " << dy[1] << " " << dy[2] << "\n"; - ofs << nz << " " << dz[0] << " " << dz[1] << " " << dz[2] << "\n"; + header_stream << std::setprecision(6); //as before + header_stream << nx << " " << dx[0] << " " << dx[1] << " " << dx[2] << "\n"; + header_stream << ny << " " << dy[0] << " " << dy[1] << " " << dy[2] << "\n"; + header_stream << nz << " " << dz[0] << " " << dz[1] << " " << dz[2] << "\n"; for (int i = 0;i < natom;++i) { - ofs << " " << atom_type[i] << " " << atom_charge[i] << " " << atom_pos[i][0] << " " << atom_pos[i][1] << " " << atom_pos[i][2] << "\n"; + header_stream << " " << atom_type[i] << " " << atom_charge[i] << " " << atom_pos[i][0] << " " << atom_pos[i][1] << " " << atom_pos[i][2] << "\n"; } + const std::string header_buffer = header_stream.str(); + + ofs.write(header_buffer.data(), header_buffer.size()); - ofs.unsetf(std::ofstream::fixed); - ofs << std::setprecision(precision); - ofs << std::scientific; const int nxy = nx * ny; - for (int ixy = 0; ixy < nxy; ++ixy) + const int rows_per_chunk = choose_cube_rows_per_chunk(nz, precision); + AsyncCubeWriter async_writer(ofs, 2); + for (int row_begin = 0; row_begin < nxy; row_begin += rows_per_chunk) { - for (int iz = 0;iz < nz;++iz) - { - ofs << " " << data[ixy * nz + iz]; - if ((iz + 1) % ndata_line == 0 && iz != nz - 1) { ofs << "\n"; } - } - ofs << "\n"; + const int row_end = std::min(row_begin + rows_per_chunk, nxy); + + std::vector data_buffers = format_cube_data_chunk_parallel(data, + row_begin, + row_end, + nz, + precision, + ndata_line); + async_writer.push(std::move(data_buffers)); } + + async_writer.finish(); + ofs.close(); -} \ No newline at end of file +} From 6a9a0e44c702fe9db9794fbae5e27d7906781aa3 Mon Sep 17 00:00:00 2001 From: dyzheng Date: Wed, 3 Jun 2026 16:35:53 +0800 Subject: [PATCH 2/2] Optimize cube file data formatting and writing --- source/module_io/write_cube.cpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/source/module_io/write_cube.cpp b/source/module_io/write_cube.cpp index d3368074910..a603aaefcf1 100644 --- a/source/module_io/write_cube.cpp +++ b/source/module_io/write_cube.cpp @@ -17,8 +17,20 @@ #include #include -#ifdef _OPENMP +#if defined(_OPENMP) +#if defined(__has_include) +#if __has_include() #include +#define ABACUS_WRITE_CUBE_USE_OPENMP 1 +#endif +#else +#include +#define ABACUS_WRITE_CUBE_USE_OPENMP 1 +#endif +#endif + +#ifndef ABACUS_WRITE_CUBE_USE_OPENMP +#define ABACUS_WRITE_CUBE_USE_OPENMP 0 #endif namespace @@ -139,7 +151,7 @@ std::vector format_cube_data_chunk_parallel(const std::vector