diff --git a/.gitattributes b/.gitattributes index 9b8c3bc74e9..035167aaf71 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,3 +1,10 @@ +# Shell scripts and the bash-parsed integration-test case lists must keep LF +# endings so they run under bash, including MSYS2/Git-Bash on Windows where +# core.autocrlf may rewrite them to CRLF (which breaks `#!/bin/bash` and adds +# stray \r to parsed lines such as the case names in CASES_*.txt). +*.sh text eol=lf +CASES_*.txt text eol=lf + .gitattributes export-ignore .gitignore export-ignore .gitmodules export-ignore diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c6ece90f9c..3123d07dac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -268,7 +268,7 @@ if(ENABLE_ASAN) set(CMAKE_BUILD_TYPE "RelWithDebInfo") endif() -if(NOT CMAKE_BUILD_TYPE) +if(NOT CMAKE_BUILD_TYPE AND NOT MSVC) add_compile_options(-O3 -g) endif() @@ -289,6 +289,14 @@ if(ENABLE_NATIVE_OPTIMIZATION) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -mtune=native") endif() +# Windows (native build, e.g. MinGW-w64 or MSVC) portability defines: +# _USE_MATH_DEFINES - expose M_PI and friends from <cmath> +# NOMINMAX - stop <windows.h> defining min()/max() macros +# _CRT_SECURE_NO_WARNINGS - silence CRT "use _s function" deprecations +if(WIN32) + add_compile_definitions(_USE_MATH_DEFINES NOMINMAX _CRT_SECURE_NO_WARNINGS) +endif() + if(ENABLE_LCAO) find_package(Cereal REQUIRED) include_directories(${CEREAL_INCLUDE_DIR}) @@ -610,8 +618,13 @@ elseif(NOT USE_SW) find_package(Lapack REQUIRED) include_directories(${FFTW3_INCLUDE_DIRS}) list(APPEND math_libs FFTW3::FFTW3 LAPACK::LAPACK BLAS::BLAS) - find_package(ScaLAPACK REQUIRED) - list(APPEND math_libs ScaLAPACK::ScaLAPACK) + # ScaLAPACK is a distributed-memory library and is only needed for the + # MPI build. A serial build (e.g. the native Windows serial version) + # must not require it. 
+ if(ENABLE_MPI) + find_package(ScaLAPACK REQUIRED) + list(APPEND math_libs ScaLAPACK::ScaLAPACK) + endif() if(USE_OPENMP) list(APPEND math_libs FFTW3::FFTW3_OMP) endif() @@ -869,7 +882,10 @@ if (USE_SW) list(APPEND math_libs gfortran) endif() -list(APPEND math_libs m) +# libm exists on Linux and MinGW-w64 but not in the MSVC CRT. +if(NOT MSVC) + list(APPEND math_libs m) +endif() target_link_libraries(${ABACUS_BIN_NAME} ${math_libs}) install(PROGRAMS ${ABACUS_BIN_PATH} @@ -877,8 +893,12 @@ install(PROGRAMS ${ABACUS_BIN_PATH} # DESTINATION ${CMAKE_INSTALL_BINDIR} ) -# Create a symbolic link 'abacus' pointing to the actual executable -install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${ABACUS_BIN_NAME} ${CMAKE_INSTALL_PREFIX}/bin/abacus WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/bin)") +# Create a symbolic link 'abacus' pointing to the actual executable. +# Skipped on Windows: symlink creation needs elevated/developer-mode +# privileges there and the executable carries an .exe suffix anyway. +if(NOT WIN32) + install(CODE "execute_process(COMMAND ${CMAKE_COMMAND} -E create_symlink ${ABACUS_BIN_NAME} ${CMAKE_INSTALL_PREFIX}/bin/abacus WORKING_DIRECTORY ${CMAKE_INSTALL_PREFIX}/bin)") +endif() if(ENABLE_COVERAGE) coverage_evaluate() diff --git a/cmake/FindBlas.cmake b/cmake/FindBlas.cmake index a3c7f75069d..c75fc1dc9f1 100644 --- a/cmake/FindBlas.cmake +++ b/cmake/FindBlas.cmake @@ -5,7 +5,16 @@ if(DEFINED BLAS_LIBRARY) set(BLAS_LIBRARIES ${BLAS_LIBRARY}) endif() +# Delegate to CMake's builtin FindBLAS module. On case-insensitive +# filesystems (Windows, macOS) this file "FindBlas.cmake" and the builtin +# "FindBLAS.cmake" resolve to the same name, so a plain find_package(BLAS) +# recurses into this very file. Temporarily remove our module directory +# from CMAKE_MODULE_PATH so the builtin module is used instead. Harmless +# no-op on case-sensitive filesystems. 
+set(_abacus_blas_saved_module_path "${CMAKE_MODULE_PATH}") +list(REMOVE_ITEM CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") find_package(BLAS REQUIRED) +set(CMAKE_MODULE_PATH "${_abacus_blas_saved_module_path}") if(NOT TARGET BLAS::BLAS) add_library(BLAS::BLAS UNKNOWN IMPORTED) diff --git a/cmake/FindLapack.cmake b/cmake/FindLapack.cmake index 15c3976d64c..1e4d0fc9a61 100644 --- a/cmake/FindLapack.cmake +++ b/cmake/FindLapack.cmake @@ -6,8 +6,18 @@ if(DEFINED LAPACK_LIBRARY) set(LAPACK_LIBRARIES ${LAPACK_LIBRARY}) endif() +# find_package(Blas) must resolve to our cmake/FindBlas.cmake wrapper, so +# leave CMAKE_MODULE_PATH intact for it. find_package(Blas REQUIRED) + +# Delegate to CMake's builtin FindLAPACK module. As with FindBlas, the names +# "FindLapack.cmake" and builtin "FindLAPACK.cmake" collide on +# case-insensitive filesystems, so drop our module directory from +# CMAKE_MODULE_PATH around the call to avoid infinite recursion. +set(_abacus_lapack_saved_module_path "${CMAKE_MODULE_PATH}") +list(REMOVE_ITEM CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") find_package(LAPACK REQUIRED) +set(CMAKE_MODULE_PATH "${_abacus_lapack_saved_module_path}") if(NOT TARGET LAPACK::LAPACK) add_library(LAPACK::LAPACK UNKNOWN IMPORTED) diff --git a/source/source_base/fs_compat.h b/source/source_base/fs_compat.h new file mode 100644 index 00000000000..3e3a58e0dca --- /dev/null +++ b/source/source_base/fs_compat.h @@ -0,0 +1,48 @@ +#ifndef MODULEBASE_FS_COMPAT_H +#define MODULEBASE_FS_COMPAT_H + +//========================================================== +// Small filesystem-portability helpers. +// +// The POSIX `mkdir(path, mode)` takes a permission-mode argument that +// does not exist in the Windows CRT (`_mkdir`/MinGW `mkdir` take only a +// path). This header provides a single cross-platform directory-creation +// helper so call sites stay identical on every platform. 
+//========================================================== + +#include +#include + +#ifdef _WIN32 +#include // _mkdir +#else +#include // mkdir +#include +#endif + +namespace ModuleBase +{ + +/** + * @brief Create a single directory, portably. + * + * @param path directory path to create + * @return 0 on success; -1 on failure with `errno` set (e.g. EEXIST when + * the directory already exists), matching POSIX `mkdir` semantics. + * + * On Windows the permission mode is not applicable and is ignored; on + * POSIX systems the directory is created with mode 0755 (subject to umask), + * preserving the previous behaviour of the call sites. + */ +inline int make_directory(const std::string& path) +{ +#ifdef _WIN32 + return _mkdir(path.c_str()); +#else + return mkdir(path.c_str(), 0755); +#endif +} + +} // namespace ModuleBase + +#endif // MODULEBASE_FS_COMPAT_H diff --git a/source/source_base/global_file.cpp b/source/source_base/global_file.cpp index 939c2ed998c..28c7bd9945d 100644 --- a/source/source_base/global_file.cpp +++ b/source/source_base/global_file.cpp @@ -8,11 +8,9 @@ #ifdef __MPI #include #endif -#include -#include -#include #include #include +#include "source_base/fs_compat.h" #include "global_function.h" #include "global_variable.h" #include "source_base/parallel_common.h" @@ -57,7 +55,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = mkdir(global_out_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_out_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE DIR : " << global_out_dir << std::endl; @@ -95,7 +93,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = mkdir(global_stru_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_stru_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE STRU DIR : " << global_stru_dir << std::endl; @@ -135,7 +133,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = 
mkdir(global_matrix_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_matrix_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE MATRIX DIR : " << global_matrix_dir << std::endl; @@ -174,7 +172,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = mkdir(global_wfc_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_wfc_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE WFC DIR : " << global_wfc_dir << std::endl; @@ -213,7 +211,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = mkdir(global_mlkedf_descriptor_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_mlkedf_descriptor_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE MLKEDF DESCRIPTOR DIR : " << global_mlkedf_descriptor_dir << std::endl; @@ -254,7 +252,7 @@ void ModuleBase::Global_File::make_dir_out( { if(rank==times) { - int ret = mkdir(global_deepks_label_elec_dir.c_str(), 0755); + int ret = ModuleBase::make_directory(global_deepks_label_elec_dir); if ( ret == 0 || errno == EEXIST ) { std::cout << " MAKE THE DEEPKS LABELS (ELEC) DIR : " << global_deepks_label_elec_dir << std::endl; diff --git a/source/source_base/global_function.cpp b/source/source_base/global_function.cpp index 98e6e1ebf9f..0806dcf5dc3 100644 --- a/source/source_base/global_function.cpp +++ b/source/source_base/global_function.cpp @@ -16,10 +16,8 @@ #include #include #include -#include -#include #include -#include +#include "source_base/fs_compat.h" namespace ModuleBase { @@ -68,7 +66,7 @@ void MAKE_DIR(const std::string &fn) // ModuleBase::TITLE("global_function","MAKE_DIR"); if (GlobalV::MY_RANK == 0) { - int ret = mkdir(fn.c_str(), 0755); + int ret = ModuleBase::make_directory(fn); if (ret != 0 && errno != EEXIST) { ModuleBase::WARNING_QUIT("MAKE_DIR", fn); diff --git a/source/source_base/module_container/base/core/cpu_allocator.cpp 
b/source/source_base/module_container/base/core/cpu_allocator.cpp index 2027402430c..2581babb18c 100644 --- a/source/source_base/module_container/base/core/cpu_allocator.cpp +++ b/source/source_base/module_container/base/core/cpu_allocator.cpp @@ -1,28 +1,48 @@ #include +#include +#ifdef _WIN32 +#include // _aligned_malloc / _aligned_free +#endif + namespace base { namespace core { // Allocate a block of CPU memory with the given size and default alignment. +// On Windows the aligned allocator family is used consistently so that every +// pointer handed out by this class can be released through free() below +// (_aligned_malloc memory must not be released with ::operator delete). void *CPUAllocator::allocate(size_t size) { this->allocated_size_ = size; +#ifdef _WIN32 + return _aligned_malloc(size, alignof(std::max_align_t)); +#else return ::operator new(size); +#endif } // Allocate a block of CPU memory with the given size and alignment. void *CPUAllocator::allocate(size_t size, size_t alignment) { this->allocated_size_ = size; void *ptr = nullptr; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); // posix_memalign has no Windows CRT equivalent +#else if (posix_memalign(&ptr, alignment, size) != 0) { ptr = nullptr; } +#endif return ptr; } // Free a block of CPU memory that was previously allocated by this allocator. void CPUAllocator::free(void *ptr) { this->allocated_size_ = 0; +#ifdef _WIN32 + _aligned_free(ptr); +#else ::operator delete(ptr); +#endif } // Get the type of device used by the TensorBuffer. diff --git a/source/source_base/module_fft/fft_base.h b/source/source_base/module_fft/fft_base.h index 1fcbc514129..a39c529d1e6 100644 --- a/source/source_base/module_fft/fft_base.h +++ b/source/source_base/module_fft/fft_base.h @@ -2,6 +2,22 @@ #define FFT_BASE_H #include + +// These FFT virtuals are declared weak so the ELF linker can resolve the +// unused single-precision (FFT_CPU) vtable slots to null when +// ENABLE_FLOAT_FFTW is off. 
MinGW/PE has no working equivalent: weak template +// members there either collide ("multiple definition") or leave null vtable +// slots that crash on dispatch. On Windows we therefore drop the attribute and +// rely on the build defining the symbols (ENABLE_FLOAT_FFTW=ON supplies the +// real FFT_CPU methods; the float CPU path is unused otherwise). +// Linux/ELF behaviour is unchanged -- ABACUS_FFT_WEAK expands to exactly +// __attribute__((weak)) there. +#if defined(_WIN32) +#define ABACUS_FFT_WEAK +#else +#define ABACUS_FFT_WEAK __attribute__((weak)) +#endif + namespace ModuleBase { template @@ -16,7 +32,7 @@ class FFT_BASE * * The function is used to initialize the fft parameters. */ - virtual __attribute__((weak)) void initfft(int nx_in, + virtual ABACUS_FFT_WEAK void initfft(int nx_in, int ny_in, int nz_in, int lixy_in, @@ -27,7 +43,7 @@ class FFT_BASE bool gamma_only_in, bool xprime_in = true); - virtual __attribute__((weak)) void initfft(int nx_in, int ny_in, int nz_in); + virtual ABACUS_FFT_WEAK void initfft(int nx_in, int ny_in, int nz_in); /** * @brief Setup the fft plan and data as pure virtual function. @@ -72,11 +88,11 @@ class FFT_BASE * FFT_BASE is an abstract class,the function will be override, * The attribute weak is used to avoid define the function. */ - virtual __attribute__((weak)) FPTYPE* get_rspace_data() const; + virtual ABACUS_FFT_WEAK FPTYPE* get_rspace_data() const; - virtual __attribute__((weak)) std::complex* get_auxr_data() const; + virtual ABACUS_FFT_WEAK std::complex* get_auxr_data() const; - virtual __attribute__((weak)) std::complex* get_auxg_data() const; + virtual ABACUS_FFT_WEAK std::complex* get_auxg_data() const; /** * @brief Get the auxiliary real space data in 3D @@ -85,7 +101,7 @@ class FFT_BASE * While the FFT_BASE is an abstract class,the function will be override, * The attribute weak is used to avoid define the function. 
*/ - virtual __attribute__((weak)) std::complex* get_auxr_3d_data() const; + virtual ABACUS_FFT_WEAK std::complex* get_auxr_3d_data() const; // forward fft in x-y direction @@ -100,10 +116,10 @@ class FFT_BASE * determined by the xprime flag).Notably, the Y axis operates in * "many-many-FFT" mode. */ - virtual __attribute__((weak)) void fftxyfor(std::complex* in, + virtual ABACUS_FFT_WEAK void fftxyfor(std::complex* in, std::complex* out) const; - virtual __attribute__((weak)) void fftxybac(std::complex* in, + virtual ABACUS_FFT_WEAK void fftxybac(std::complex* in, std::complex* out) const; /** @@ -115,10 +131,10 @@ class FFT_BASE * It involves only one axis, z. The FFT is applied only once. * Notably, the Z axis operates in many FFT with nz*ns. */ - virtual __attribute__((weak)) void fftzfor(std::complex* in, + virtual ABACUS_FFT_WEAK void fftzfor(std::complex* in, std::complex* out) const; - virtual __attribute__((weak)) void fftzbac(std::complex* in, + virtual ABACUS_FFT_WEAK void fftzbac(std::complex* in, std::complex* out) const; /** @@ -129,10 +145,10 @@ class FFT_BASE * This function performs the forward FFT in the x-y direction * with real to complex.There is no difference between fftxyfor. */ - virtual __attribute__((weak)) void fftxyr2c(FPTYPE* in, + virtual ABACUS_FFT_WEAK void fftxyr2c(FPTYPE* in, std::complex* out) const; - virtual __attribute__((weak)) void fftxyc2r(std::complex* in, + virtual ABACUS_FFT_WEAK void fftxyc2r(std::complex* in, FPTYPE* out) const; /** @@ -144,10 +160,10 @@ class FFT_BASE * It involves three axes, x, y, and z. The FFT is applied multiple times * for fft3D_forward. 
*/ - virtual __attribute__((weak)) void fft3D_forward(std::complex* in, + virtual ABACUS_FFT_WEAK void fft3D_forward(std::complex* in, std::complex* out) const; - virtual __attribute__((weak)) void fft3D_backward(std::complex* in, + virtual ABACUS_FFT_WEAK void fft3D_backward(std::complex* in, std::complex* out) const; protected: @@ -156,6 +172,44 @@ class FFT_BASE int nz = 0; }; +#if defined(_WIN32) +// On Linux the non-pure base virtuals above are __attribute__((weak)) and the +// ELF linker resolves their (never-used) vtable slots to null. MinGW/PE has no +// such fallback, so define trivial bodies for them here -- they are never +// executed (FFT_BASE is abstract; every backend overrides what it actually +// uses, and the unoverridden slots, e.g. fft3D_* on the CPU backend, are not +// called). This block is compiled only on Windows; Linux keeps the upstream +// weak declarations unchanged. +template +void FFT_BASE::initfft(int, int, int, int, int, int, int, int, bool, bool) {} +template +void FFT_BASE::initfft(int, int, int) {} +template +FPTYPE* FFT_BASE::get_rspace_data() const { return nullptr; } +template +std::complex* FFT_BASE::get_auxr_data() const { return nullptr; } +template +std::complex* FFT_BASE::get_auxg_data() const { return nullptr; } +template +std::complex* FFT_BASE::get_auxr_3d_data() const { return nullptr; } +template +void FFT_BASE::fftxyfor(std::complex*, std::complex*) const {} +template +void FFT_BASE::fftxybac(std::complex*, std::complex*) const {} +template +void FFT_BASE::fftzfor(std::complex*, std::complex*) const {} +template +void FFT_BASE::fftzbac(std::complex*, std::complex*) const {} +template +void FFT_BASE::fftxyr2c(FPTYPE*, std::complex*) const {} +template +void FFT_BASE::fftxyc2r(std::complex*, FPTYPE*) const {} +template +void FFT_BASE::fft3D_forward(std::complex*, std::complex*) const {} +template +void FFT_BASE::fft3D_backward(std::complex*, std::complex*) const {} +#endif // _WIN32 + template FFT_BASE::FFT_BASE(); 
template FFT_BASE::FFT_BASE(); template FFT_BASE::~FFT_BASE(); diff --git a/source/source_base/module_fft/fft_cpu.h b/source/source_base/module_fft/fft_cpu.h index f33fecd74b8..ec47768d8e9 100644 --- a/source/source_base/module_fft/fft_cpu.h +++ b/source/source_base/module_fft/fft_cpu.h @@ -40,14 +40,14 @@ class FFT_CPU : public FFT_BASE bool gamma_only_in, bool xprime_in = true) override; - __attribute__((weak)) + ABACUS_FFT_WEAK void setupFFT() override; // void initplan(const unsigned int& flag = 0); - __attribute__((weak)) + ABACUS_FFT_WEAK void cleanFFT() override; - __attribute__((weak)) + ABACUS_FFT_WEAK void clear() override; /** @@ -58,13 +58,13 @@ class FFT_CPU : public FFT_BASE * which is used in the CPU fft.Use the weak attribute * to avoid defining float while without flag ENABLE_FLOAT_FFTW. */ - __attribute__((weak)) + ABACUS_FFT_WEAK FPTYPE* get_rspace_data() const override; - __attribute__((weak)) + ABACUS_FFT_WEAK std::complex* get_auxr_data() const override; - __attribute__((weak)) + ABACUS_FFT_WEAK std::complex* get_auxg_data() const override; /** @@ -75,27 +75,27 @@ class FFT_CPU : public FFT_BASE * The function details can be found in FFT_BASE, * and the function interfaces can be found in FFT_BUNDLE. 
*/ - __attribute__((weak)) + ABACUS_FFT_WEAK void fftxyfor(std::complex* in, std::complex* out) const override; - __attribute__((weak)) + ABACUS_FFT_WEAK void fftxybac(std::complex* in, std::complex* out) const override; - __attribute__((weak)) + ABACUS_FFT_WEAK void fftzfor(std::complex* in, std::complex* out) const override; - __attribute__((weak)) + ABACUS_FFT_WEAK void fftzbac(std::complex* in, std::complex* out) const override; - __attribute__((weak)) + ABACUS_FFT_WEAK void fftxyr2c(FPTYPE* in, std::complex* out) const override; - __attribute__((weak)) + ABACUS_FFT_WEAK void fftxyc2r(std::complex* in, FPTYPE* out) const override; private: diff --git a/source/source_esolver/esolver_ks_lcao.cpp b/source/source_esolver/esolver_ks_lcao.cpp index 08499df2bdf..fc0ecddc8a2 100644 --- a/source/source_esolver/esolver_ks_lcao.cpp +++ b/source/source_esolver/esolver_ks_lcao.cpp @@ -151,8 +151,14 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) } // 9) for each ionic step, the overlap must be rebuilt - // since it depends on ionic positions - this->deepks.build_overlap(ucell, orb_, pv, gd, *(two_center_bundle_.overlap_orb_alpha), PARAM.inp); + // since it depends on ionic positions. + // overlap_orb_alpha is only built when DeePKS is enabled (descriptor + // orbitals); guard the dereference so non-DeePKS runs don't form a + // reference from a null unique_ptr (undefined behaviour). 
+ if (two_center_bundle_.overlap_orb_alpha) + { + this->deepks.build_overlap(ucell, orb_, pv, gd, *(two_center_bundle_.overlap_orb_alpha), PARAM.inp); + } // 10) prepare sc calculation init_deltaspin_lcao(ucell, PARAM.inp, &(this->pv), this->kv, this->p_hamilt, this->psi, this->dmat.dm, this->pelec); diff --git a/source/source_io/module_output/binstream.cpp b/source/source_io/module_output/binstream.cpp index 64a936aa22a..2b9960ede71 100644 --- a/source/source_io/module_output/binstream.cpp +++ b/source/source_io/module_output/binstream.cpp @@ -1,17 +1,36 @@ #include +#include #include "binstream.h" +namespace +{ +// Binstream is always a *binary* stream. On Windows, fopen mode "r"/"w"/"a" +// opens in text mode, which translates CRLF and treats 0x1A as EOF, corrupting +// binary data (e.g. wavefunction / charge files) -> "Some data didn't be read". +// Append "b" if the caller didn't, so binary mode is always used. On POSIX the +// "b" flag is a harmless no-op, so the Linux behaviour is unchanged. +std::string ensure_binary_mode(const char* op) +{ + std::string mode(op ? op : ""); + if (mode.find('b') == std::string::npos) + { + mode += 'b'; + } + return mode; +} +} // namespace + /** * @brief Construct a new Binstream:: Binstream object - * - * @param filename + * + * @param filename * @param op "r": read * "a": add - * "w": write + * "w": write */ Binstream::Binstream(const std::string filename,const char *op) { - fileptr=fopen(filename.c_str(),op); + fileptr=fopen(filename.c_str(),ensure_binary_mode(op).c_str()); } Binstream::~Binstream() @@ -30,7 +49,7 @@ void Binstream:: close() // open a file void Binstream::open(const std::string filename,const char *op) { - fileptr=fopen(filename.c_str(),op); + fileptr=fopen(filename.c_str(),ensure_binary_mode(op).c_str()); } // ! 
operator diff --git a/source/source_io/module_parameter/input_conv.h b/source/source_io/module_parameter/input_conv.h index 58f8db919ed..99db21591cb 100644 --- a/source/source_io/module_parameter/input_conv.h +++ b/source/source_io/module_parameter/input_conv.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -70,11 +70,11 @@ void parse_expression(const std::string& fn, std::vector<T>& vec) str.push_back(section); } - // Compile the regular expression - regex_t reg; - regcomp(&reg, pattern.c_str(), REG_EXTENDED); - regmatch_t pmatch[1]; - const size_t nmatch = 1; + // Compile the regular expression. std::regex (ECMAScript grammar) is + // portable; the previous POSIX implementation did not build on + // Windows/MinGW. The pattern is plain enough to behave identically here. + const std::regex reg(pattern); + std::smatch match; // Loop over each section and apply regex to extract numbers for (size_t i = 0; i < str.size(); ++i) @@ -83,29 +83,31 @@ void parse_expression(const std::string& fn, std::vector<T>& vec) { continue; } - int status = regexec(&reg, str[i].c_str(), nmatch, pmatch, 0); + + // Extract the first matched substring (mirrors the old regexec call) std::string sub_str = ""; + if (std::regex_search(str[i], match, reg)) + { + sub_str = match[0].str(); + } - // Extract the matched substring - for (size_t j = pmatch[0].rm_so; j != pmatch[0].rm_eo; ++j) + // A token that matches nothing is invalid input. Fail fast instead of + // feeding an empty string to the parsers below, which would push an + // indeterminate value into vec. 
+ if (sub_str.empty()) { - sub_str += str[i][j]; + ModuleBase::WARNING_QUIT("Input_Conv::parse_expression", + "invalid token in expression: \"" + str[i] + "\""); } // Check if the substring contains multiplication (e.g., "2*3.14") - std::string sub_pattern("\\*"); - regex_t sub_reg; - regcomp(&sub_reg, sub_pattern.c_str(), REG_EXTENDED); - regmatch_t sub_pmatch[1]; - const size_t sub_nmatch = 1; - - if (regexec(&sub_reg, sub_str.c_str(), sub_nmatch, sub_pmatch, 0) == 0) + if (sub_str.find('*') != std::string::npos) { size_t pos = sub_str.find("*"); int num = stoi(sub_str.substr(0, pos)); assert(num >= 0); - T occ = stof(sub_str.substr(pos + 1, sub_str.size())); - + T occ = static_cast<T>(stof(sub_str.substr(pos + 1, sub_str.size()))); + // Add the value to the vector `num` times for (size_t k = 0; k != num; k++) { @@ -114,18 +116,20 @@ void parse_expression(const std::string& fn, std::vector<T>& vec) } else { - // Handle scientific notation and convert to T + // Handle scientific notation and convert to T. Initialize occ and + // check the extraction so a malformed token fails fast rather than + // pushing an indeterminate value. std::stringstream convert; convert << sub_str; - T occ; - convert >> occ; + T occ{}; + if (!(convert >> occ)) + { + ModuleBase::WARNING_QUIT("Input_Conv::parse_expression", + "failed to parse number: \"" + sub_str + "\""); + } vec.emplace_back(occ); } - - regfree(&sub_reg); } - - regfree(&reg); } #ifdef __LCAO diff --git a/source/source_io/module_restart/restart.cpp b/source/source_io/module_restart/restart.cpp index c960215a2a6..26c939bc627 100644 --- a/source/source_io/module_restart/restart.cpp +++ b/source/source_io/module_restart/restart.cpp @@ -1,7 +1,20 @@ #include "restart.h" #include +#include +#ifdef _WIN32 +#include // open/read/write/close (_open ...) on Windows +// The POSIX owner-permission bits are not defined by the Windows CRT; map them +// to the MSVCRT read/write mode bits so open(..., O_CREAT, mode) still works. 
+#ifndef S_IRUSR +#define S_IRUSR _S_IREAD +#endif +#ifndef S_IWUSR +#define S_IWUSR _S_IWRITE +#endif +#else #include +#endif #include #include diff --git a/source/source_psi/psi_initializer.cpp b/source/source_psi/psi_initializer.cpp index 2432c39e0f3..bed67ccd1c3 100644 --- a/source/source_psi/psi_initializer.cpp +++ b/source/source_psi/psi_initializer.cpp @@ -88,6 +88,22 @@ void psi_initializer::random_t(T* psi, const int iw_start, const int iw_end, // then distribute the data to all processors in the pool stick_to_pool(stickrr.data(), ir, tmprr.data()); stick_to_pool(stickarg.data(), ir, tmparg.data()); +#else + // Serial build: there is no pool to distribute to, so copy + // the stick directly into the gathered arrays, mirroring the + // rank-0 branch of stick_to_pool(). Without this, tmprr/tmparg + // stay zero-initialized and the seeded (pw_seed>0) random + // wavefunctions become all-zero, which later trips + // Gram-Schmidt with "psi_norm <= 0.0". + { + const int is = this->ixy2is_[ir]; + const int nz_loc = this->pw_wfc_->nz; + for (int iz = 0; iz < nz_loc; ++iz) + { + tmprr[is * nz_loc + iz] = stickrr[iz]; + tmparg[is * nz_loc + iz] = stickarg[iz]; + } + } #endif } // then for each g-component, initialize the wavefunction value diff --git a/source/source_pw/module_pwdft/structure_factor.cpp b/source/source_pw/module_pwdft/structure_factor.cpp index 6b342eaca87..f0db1acd22b 100644 --- a/source/source_pw/module_pwdft/structure_factor.cpp +++ b/source/source_pw/module_pwdft/structure_factor.cpp @@ -282,6 +282,17 @@ void Structure_Factor::bspline_sf(const int norder, #ifdef __MPI pgrid.zpiece_to_all(zpiece, iz, tmpr); + #else + // Serial build: the whole real-space grid is local, so there is no + // pool to scatter to. zpiece_to_all() is MPI-only, which otherwise + // leaves tmpr uninitialized -> garbage structure factor and a wrong + // total energy. 
Fill tmpr directly, using the SAME real-space layout + // as zpiece_to_all's serial path: rho[ir*nczp + znow], i.e. xy index + // outer and z innermost (nczp == nz, znow == iz when serial). + for(int ir = 0; ir < rho_basis->nxy; ir++) + { + tmpr[ir*rho_basis->nz + iz] = zpiece[ir]; + } #endif } diff --git a/tests/integrate/Autotest.sh b/tests/integrate/Autotest.sh index dc466a096fc..9e5250b511e 100755 --- a/tests/integrate/Autotest.sh +++ b/tests/integrate/Autotest.sh @@ -44,7 +44,12 @@ done # number of OpenMP threads if [[ -z "$nt" ]]; then - nt=$(expr `nproc` / ${np}) + if [ "$np" -le 0 ] 2>/dev/null; then + # serial build (no MPI launcher): use all cores for OpenMP + nt=$(nproc) + else + nt=$(expr `nproc` / ${np}) + fi fi export OMP_NUM_THREADS=${nt} @@ -251,7 +256,12 @@ for dir in $testdir; do TIMEFORMAT='[----------] Time elapsed: %R seconds' #parallel test time { - if [ "$case" = "282_NO_RPA" ]; then + if [ "$np" -le 0 ] 2>/dev/null; then + # serial build: run the binary directly, no MPI launcher. + # This lets a serial ABACUS (ENABLE_MPI=OFF, e.g. the native + # Windows build) reuse this harness unchanged. + $abacus > log.txt + elif [ "$case" = "282_NO_RPA" ]; then mpirun -np 1 $abacus > log.txt else mpirun -np $np $abacus > log.txt diff --git a/toolchain/build_abacus_windows.sh b/toolchain/build_abacus_windows.sh new file mode 100644 index 00000000000..7df3faf1a0c --- /dev/null +++ b/toolchain/build_abacus_windows.sh @@ -0,0 +1,177 @@ +#!/bin/bash -e +# Build ABACUS natively on Windows (MSYS2 / MinGW-w64). +# +# Windows counterpart of build_abacus_gnu.sh. Run it from the "MSYS2 MinGW +# 64-bit" shell after ./toolchain_windows.sh has installed the prerequisites. +# +# By default it builds the most capable supported configuration: MPI + LCAO +# (plane-wave and numerical-atomic-orbital bases) with OpenBLAS + FFTW + +# ScaLAPACK. ELPA / PEXSI / hybrid functionals (LibRI) / DeePKS / GPU are not +# available on Windows yet and stay OFF. 
+# +# Override the configuration from the environment, e.g.: +# ENABLE_MPI=OFF ./build_abacus_windows.sh # serial +# ENABLE_LCAO=OFF ./build_abacus_windows.sh # plane-wave only +# ENABLE_MPI=OFF ENABLE_LCAO=OFF ./build_abacus_windows.sh # serial PW (Phase 1) +ENABLE_MPI=${ENABLE_MPI:-ON} +ENABLE_LCAO=${ENABLE_LCAO:-ON} + +ABACUS_DIR=.. +TOOL=$(pwd) +INSTALL_DIR=$TOOL/install +[ -f "$INSTALL_DIR/setup" ] && source "$INSTALL_DIR/setup" +cd $ABACUS_DIR +ABACUS_DIR=$(pwd) +MINGW_PREFIX=${MINGW_PREFIX:-/mingw64} + +BUILD_DIR=build_abacus_windows +rm -rf $BUILD_DIR + +PREFIX=$ABACUS_DIR +LAPACK=${OPENBLAS_ROOT:-$MINGW_PREFIX}/lib # OpenBLAS supplies both BLAS and LAPACK +FFTW3=${FFTW_ROOT:-$MINGW_PREFIX} + +NUM_JOBS="$(nproc)" +# Cap the *default* parallelism by available RAM. Several heavy -O3 template +# TUs (e.g. source_cell/module_symmetry/symmetry.cpp, read_pp_upf201.cpp) can +# each peak around 3 GB in cc1plus, and ninja tends to schedule them together; +# on a many-core box -j nproc then exhausts memory and the build dies with +# "cc1plus.exe: out of memory" (seen even on a 31 GB / 20-core machine at +# -j 20). Budget ~3 GB per job. An explicit -j below always overrides this. +if [ -r /proc/meminfo ]; then + mem_gb=$(awk '/^MemTotal:/ {printf "%d", $2/1024/1024}' /proc/meminfo) + if [ -n "$mem_gb" ] && [ "$mem_gb" -ge 1 ]; then + mem_jobs=$(( mem_gb / 3 )); [ "$mem_jobs" -lt 1 ] && mem_jobs=1 + [ "$mem_jobs" -lt "$NUM_JOBS" ] && NUM_JOBS=$mem_jobs + fi +fi +while [[ $# -gt 0 ]]; do + case $1 in + -j) + if [[ -n "$2" && "$2" =~ ^[0-9]+$ ]]; then NUM_JOBS="${2}"; shift 2 + else echo "ERROR: -j requires a number argument"; exit 1; fi ;; + -j[0-9]*) NUM_JOBS="${1#-j}"; shift ;; + *) echo "ERROR: Unsupported argument: $1" >&2; echo "Usage: $0 [-j N|-jN]" >&2; exit 1 ;; + esac +done +echo "Building with -j ${NUM_JOBS} (override with -j N; lower it if cc1plus runs out of memory)." + +# MPI on Windows is MS-MPI (mingw-w64-x86_64-msmpi). Point FindMPI at it. 
+MPI_ARGS=() +if [ "$ENABLE_MPI" = "ON" ]; then + MPI_ARGS=(-DMPI_CXX_INCLUDE_PATH=$MINGW_PREFIX/include + -DMPI_CXX_LIBRARIES=$MINGW_PREFIX/lib/libmsmpi.dll.a) +fi + +# Notes on the non-default options: +# * USE_ELPA/PEXSI/LIBRI/MLALGO/CUDA = OFF -> not available on Windows yet. +# When ENABLE_MPI=ON the LCAO solver is ScaLAPACK (found automatically); +# when serial it is LAPACK (DiagoLapack). +# * BLA_VENDOR=OpenBLAS -> let CMake's FindBLAS/FindLAPACK pick OpenBLAS. +# * ENABLE_FLOAT_FFTW=ON -> make FFT_CPU concrete (vtable) on PE. +# * COMMIT_INFO=OFF -> skip the git/sh build-stamp step. +# * CMAKE_CXX_FLAGS "-include .." -> MSYS2 ships a very new GCC whose libstdc++ +# dropped transitive standard headers; force-include the common ones so the +# existing sources build unchanged. (Not Windows-specific; tied to GCC>=15.) +cmake -B $BUILD_DIR -G Ninja -DCMAKE_INSTALL_PREFIX=$PREFIX \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=g++ \ + -DENABLE_MPI=$ENABLE_MPI \ + -DENABLE_LCAO=$ENABLE_LCAO \ + -DUSE_OPENMP=OFF \ + -DUSE_ELPA=OFF \ + -DENABLE_PEXSI=OFF \ + -DENABLE_LIBRI=OFF \ + -DENABLE_MLALGO=OFF \ + -DUSE_CUDA=OFF \ + -DBUILD_TESTING=OFF \ + -DCOMMIT_INFO=OFF \ + -DBLA_VENDOR=OpenBLAS \ + -DENABLE_FLOAT_FFTW=ON \ + -DLAPACK_DIR=$LAPACK \ + -DFFTW3_DIR=$FFTW3 \ + -DCMAKE_PREFIX_PATH=$MINGW_PREFIX \ + "${MPI_ARGS[@]}" \ + -DCMAKE_CXX_FLAGS="-include cstdint -include cstring -include algorithm" + +cmake --build $BUILD_DIR -j "${NUM_JOBS}" + +# Provide a generic `abacus` command, matching the Linux toolchain (which +# symlinks `abacus` -> abacus_). Native Windows symlinks need elevated +# privileges, so instead copy the built binary to abacus.exe; a bare `abacus` +# then resolves to it in the MSYS2 shell (and in cmd/PowerShell). The glob +# matches the configured target (abacus_basic_para.exe, abacus_pw_ser.exe, ...) +# but not the abacus.exe copy itself (no underscore). 
+# Pick the first abacus_*.exe in the build tree with a plain glob instead of
+# parsing `ls` output. An unmatched glob is left as a literal pattern, hence
+# the explicit regular-file test.
+built_exe=""
+for candidate in "${ABACUS_DIR}/${BUILD_DIR}"/abacus_*.exe; do
+  if [ -f "$candidate" ]; then built_exe=$candidate; break; fi
+done
+if [ -n "$built_exe" ]; then
+  cp -f "$built_exe" "${ABACUS_DIR}/${BUILD_DIR}/abacus.exe"
+  echo "Created generic launcher: ${ABACUS_DIR}/${BUILD_DIR}/abacus.exe -> $(basename "$built_exe")"
+else
+  echo "WARNING: no abacus_*.exe found in ${BUILD_DIR}; 'abacus' command not created."
+fi
+
+# Bundle the dependent MinGW / OpenBLAS / FFTW / ScaLAPACK runtime DLLs next to
+# the binary. Windows searches the *application directory* before PATH, so this
+# makes abacus.exe self-contained and, crucially, lets it find its DLLs even
+# when launched by a process that does not propagate PATH to its children --
+# which is exactly what MS-MPI's mpiexec does when the test harness redirects
+# stdout to a file ("error while loading shared libraries"). System DLLs
+# (msmpi.dll in System32, kernel32, ...) resolve on their own and are skipped.
+if [ -n "$built_exe" ]; then
+  echo "Bundling dependent DLLs into ${BUILD_DIR}/ ..."
+  # Third ldd field is the resolved path; keep only those matching the prefix.
+  ldd "${ABACUS_DIR}/${BUILD_DIR}/abacus.exe" 2>/dev/null \
+    | awk -v p="$MINGW_PREFIX" '$3 ~ p {print $3}' | sort -u \
+    | while read -r dll; do cp -f "$dll" "${ABACUS_DIR}/${BUILD_DIR}/"; done
+fi
+
+# When MPI is on, drop an `mpirun` shim next to the binary so the shared test
+# harness (which invokes `mpirun -np N`) drives MS-MPI unchanged. MS-MPI ships
+# only `mpiexec`; the shim forwards to it and pins the (OpenMP-threaded) BLAS to
+# one thread per rank -- otherwise each rank's multithreaded OpenBLAS
+# oversubscribes the cores and its buffer allocator fails under several ranks.
+if [ "$ENABLE_MPI" = "ON" ]; then
+  cat << 'SHIM' > "${ABACUS_DIR}/${BUILD_DIR}/mpirun"
+#!/bin/bash
+# mpirun -> mpiexec shim for native Windows (MS-MPI). See build_abacus_windows.sh.
+export OMP_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+exec mpiexec "$@"
+SHIM
+  chmod +x "${ABACUS_DIR}/${BUILD_DIR}/mpirun"
+  echo "Created mpirun->mpiexec shim: ${ABACUS_DIR}/${BUILD_DIR}/mpirun"
+fi
+
+# generate abacus_env.sh: sourcing it puts the MinGW runtime DLLs (via the
+# toolchain setup) and the binary directory on PATH, so `abacus` runs directly.
+# MSYS2's OpenBLAS is OpenMP-threaded, so OMP_NUM_THREADS (not the often-cited
+# OPENBLAS_NUM_THREADS) is what actually caps its threads; pin it to 1 so that
+# `mpiexec -n N abacus` doesn't oversubscribe and trip OpenBLAS's buffer
+# allocator. (Both are set; OPENBLAS_NUM_THREADS alone has no effect here.)
+# The delimiter is unquoted: ${...} is expanded now, \$... when the file is sourced.
+cat << EOF > "${TOOL}/abacus_env.sh"
+#!/bin/bash
+[ -f "${INSTALL_DIR}/setup" ] && source "${INSTALL_DIR}/setup"
+export PATH="${ABACUS_DIR}/${BUILD_DIR}":\${PATH}
+# MS-MPI's mpiexec lives in its own Bin dir (MSMPI_BIN), which the MinGW PATH
+# does not inherit; add it so \`mpiexec\` and the mpirun shim resolve.
+[ -n "\$MSMPI_BIN" ] && export PATH="\$(cygpath -u "\$MSMPI_BIN")":\${PATH}
+export OMP_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+EOF
+
+# Name shown in the summary below; say so explicitly when the build left no
+# binary behind instead of printing an empty name.
+if [ -n "$built_exe" ]; then
+  built_name=$(basename "$built_exe")
+else
+  built_name="<none found>"
+fi
+
+cat << EOF
+========================== usage =========================
+Done! Binary: ${built_name} in ${ABACUS_DIR}/${BUILD_DIR}/
+Run it from a MinGW bash shell:
+ bash
+ source ${TOOL}/abacus_env.sh
+ abacus # serial run
+ mpiexec -n 4 abacus # parallel run (MS-MPI)
+
+Run the standard test suite (the mpirun->mpiexec shim makes the existing
+harness work unchanged):
+ cd ${ABACUS_DIR}/tests/01_PW
+ bash ../integrate/Autotest.sh -a abacus # MPI (default np=4)
+ bash ../integrate/Autotest.sh -a abacus -n 0 # serial (no launcher)
+==========================================================
+EOF
diff --git a/toolchain/toolchain_windows.sh b/toolchain/toolchain_windows.sh
new file mode 100644
index 00000000000..84b575da232
--- /dev/null
+++ b/toolchain/toolchain_windows.sh
@@ -0,0 +1,70 @@
+#!/bin/bash -e
+# Toolchain setup for a NATIVE Windows build of ABACUS via MSYS2 / MinGW-w64.
+#
+# This is the Windows counterpart of toolchain_gnu.sh / toolchain_intel.sh.
+# On Linux those scripts build the dependency stack from source; on Windows the
+# MinGW-w64 dependencies are provided by the MSYS2 distribution, so here we just
+# install them with pacman and record their location for build_abacus_windows.sh.
+#
+# Scope: PW + LCAO, serial and MPI (MS-MPI + ScaLAPACK). ELPA / PEXSI / hybrid
+# functionals (LibRI) / DeePKS / LibXC / GPU are intentionally omitted because
+# they have no reliable native-Windows build yet; they remain ordinary ABACUS
+# feature switches for the future.
+#
+# Usage: open the "MSYS2 MinGW 64-bit" shell and run:
+#   ./toolchain_windows.sh
+# then:
+#   ./build_abacus_windows.sh
+
+if ! command -v pacman >/dev/null 2>&1; then
+  echo "ERROR: pacman not found. Run this inside an MSYS2 shell (https://www.msys2.org)."
+  exit 1
+fi
+
+echo "[*] Installing MinGW-w64 build prerequisites via pacman ..."
+# MinGW-w64 packages, listed without the "mingw-w64-x86_64-" prefix that is
+# prepended to every name when the list is handed to pacman.
+mingw_pkgs=(
+  gcc gcc-fortran
+  cmake ninja
+  openblas fftw
+  cereal
+  msmpi scalapack
+)
+pacman -S --needed --noconfirm "${mingw_pkgs[@]/#/mingw-w64-x86_64-}"
+
+# Notes:
+# * cereal    : header-only serialization, required by the LCAO build.
+# * msmpi     : MS-MPI headers + import lib for the MPI build. The MS-MPI
+#               *runtime* (msmpi.dll, mpiexec) is a separate Microsoft
+#               redistributable that must be installed system-wide to run
+#               parallel jobs: https://www.microsoft.com/download/details.aspx?id=105289
+# * scalapack : distributed eigensolver used by the LCAO MPI build (no ELPA).
+
+# 'bc' (a base MSYS tool, not a MinGW package) is used by the integration-test
+# harness tests/integrate/tools/catch_properties.sh; install it so the existing
+# serial test flow (Autotest.sh -n 0) works out of the box.
+pacman -S --needed --noconfirm bc
+
+# MinGW-w64 installs everything under the /mingw64 prefix. Record it in a setup
+# file with the same variable names the build_abacus_*.sh scripts expect, so the
+# build step is uniform with the Linux toolchain.
+script_dir=$(dirname "$0")
+TOOL=$(cd "$script_dir" && pwd)
+INSTALL_DIR="${TOOL}/install" && mkdir -p "$INSTALL_DIR"
+cat <<'EOF' > "$INSTALL_DIR/setup"
+# Native Windows (MSYS2/MinGW-w64) prerequisites live under /mingw64.
+export MINGW_PREFIX="${MINGW_PREFIX:-/mingw64}"
+export OPENBLAS_ROOT="$MINGW_PREFIX" # OpenBLAS provides BLAS *and* LAPACK
+export FFTW_ROOT="$MINGW_PREFIX"
+export PATH="$MINGW_PREFIX/bin:$PATH"
+EOF
+
+cat <