diff --git a/src/serialbox/core/CMakeLists.txt b/src/serialbox/core/CMakeLists.txt index d06eb733..15b3d131 100644 --- a/src/serialbox/core/CMakeLists.txt +++ b/src/serialbox/core/CMakeLists.txt @@ -60,6 +60,8 @@ set(SOURCES archive/NetCDFArchive.h archive/MockArchive.cpp archive/MockArchive.h + archive/ZarrArchive.cpp + archive/ZarrArchive.h ) add_library(SerialboxObjects OBJECT ${SOURCES}) diff --git a/src/serialbox/core/archive/ArchiveFactory.cpp b/src/serialbox/core/archive/ArchiveFactory.cpp index faf3d3f9..61b0cce0 100644 --- a/src/serialbox/core/archive/ArchiveFactory.cpp +++ b/src/serialbox/core/archive/ArchiveFactory.cpp @@ -18,6 +18,7 @@ #include "serialbox/core/archive/BinaryArchive.h" #include "serialbox/core/archive/MockArchive.h" #include "serialbox/core/archive/NetCDFArchive.h" +#include "serialbox/core/archive/ZarrArchive.h" namespace serialbox { @@ -28,6 +29,8 @@ std::unique_ptr ArchiveFactory::create(const std::string& name, OpenMod return std::make_unique(mode, directory, prefix); } else if(name == MockArchive::Name) { return std::make_unique(mode); + } else if(name == ZarrArchive::Name) { + return std::make_unique(mode, directory, prefix); #ifdef SERIALBOX_HAS_NETCDF } else if(name == NetCDFArchive::Name) { return std::make_unique(mode, directory, prefix); @@ -43,7 +46,7 @@ std::unique_ptr ArchiveFactory::create(const std::string& name, OpenMod } std::vector ArchiveFactory::registeredArchives() { - std::vector archives{BinaryArchive::Name, MockArchive::Name + std::vector archives{BinaryArchive::Name, MockArchive::Name, ZarrArchive::Name #ifdef SERIALBOX_HAS_NETCDF , NetCDFArchive::Name @@ -57,6 +60,8 @@ std::string ArchiveFactory::archiveFromExtension(std::string filename) { if(extension == ".dat" || extension == ".bin") return BinaryArchive::Name; + else if(extension == ".zarr") + return ZarrArchive::Name; #ifdef SERIALBOX_HAS_NETCDF else if(extension == ".nc") return NetCDFArchive::Name; @@ -78,6 +83,8 @@ void 
ArchiveFactory::writeToFile(std::string filename, const StorageView& storag if(archiveName == BinaryArchive::Name) { BinaryArchive::writeToFile(filename, storageView); + } else if(archiveName == ZarrArchive::Name) { + ZarrArchive::writeToFile(filename, storageView, fieldname); } #ifdef SERIALBOX_HAS_NETCDF else if(archiveName == NetCDFArchive::Name) { @@ -103,6 +110,8 @@ void ArchiveFactory::readFromFile(std::string filename, StorageView& storageView if(archiveName == BinaryArchive::Name) { BinaryArchive::readFromFile(filename, storageView); + } else if(archiveName == ZarrArchive::Name) { + ZarrArchive::readFromFile(filename, storageView, fieldname); } #ifdef SERIALBOX_HAS_NETCDF else if(archiveName == NetCDFArchive::Name) { diff --git a/src/serialbox/core/archive/ArchiveFactory.h b/src/serialbox/core/archive/ArchiveFactory.h index 8975afcc..22e2d240 100644 --- a/src/serialbox/core/archive/ArchiveFactory.h +++ b/src/serialbox/core/archive/ArchiveFactory.h @@ -50,6 +50,7 @@ class ArchiveFactory { /// ------------- | -------- /// .dat, .bin | Binary /// .nc | NetCDF + /// .zarr | Zarr (directory) /// static std::string archiveFromExtension(std::string filename); diff --git a/src/serialbox/core/archive/ZarrArchive.cpp b/src/serialbox/core/archive/ZarrArchive.cpp new file mode 100644 index 00000000..3d9c76f5 --- /dev/null +++ b/src/serialbox/core/archive/ZarrArchive.cpp @@ -0,0 +1,478 @@ +//===-- serialbox/core/archive/ZarrArchive.cpp --------------------------------------*- C++ -*-===// +// +// S E R I A L B O X +// +// This file is distributed under terms of BSD license. +// See LICENSE.txt for more information +// +//===------------------------------------------------------------------------------------------===// +// +/// \file +/// This file implements the archive based on the Zarr storage format (v2). 
+/// +//===------------------------------------------------------------------------------------------===// + +#include "serialbox/core/archive/ZarrArchive.h" +#include "serialbox/core/Logging.h" +#include "serialbox/core/Unreachable.h" +#include "serialbox/core/Version.h" +#include +#include +#include +#include +#include + +namespace serialbox { + +//===------------------------------------------------------------------------------------------===// +// Constants +//===------------------------------------------------------------------------------------------===// + +const std::string ZarrArchive::Name = "Zarr"; + +const int ZarrArchive::Version = 0; + +//===------------------------------------------------------------------------------------------===// +// Helpers +//===------------------------------------------------------------------------------------------===// + +char ZarrArchive::nativeEndianChar() { + const uint16_t test = 0x0102; + return (*reinterpret_cast(&test) == 0x01) ? '>' : '<'; +} + +std::string ZarrArchive::typeIDtoZarrDtype(TypeID type) { + const char endian = nativeEndianChar(); + switch(type) { + case TypeID::Boolean: + return "|b1"; + case TypeID::Int32: + return std::string(1, endian) + "i4"; + case TypeID::Int64: + return std::string(1, endian) + "i8"; + case TypeID::Float32: + return std::string(1, endian) + "f4"; + case TypeID::Float64: + return std::string(1, endian) + "f8"; + default: + throw Exception("ZarrArchive: cannot convert type '%s' to Zarr dtype", + TypeUtil::toString(type)); + } +} + +std::vector ZarrArchive::storageViewToBuffer(const StorageView& storageView) { + const int bytesPerElement = storageView.bytesPerElement(); + std::vector buffer(storageView.sizeInBytes()); + Byte* dst = buffer.data(); + + if(storageView.isMemCopyable()) { + std::memcpy(dst, storageView.originPtr(), buffer.size()); + } else { + for(auto it = storageView.begin(), end = storageView.end(); it != end; + ++it, dst += bytesPerElement) + std::memcpy(dst, 
it.ptr(), bytesPerElement); + } + return buffer; +} + +void ZarrArchive::bufferToStorageView(const std::vector& buffer, StorageView& storageView) { + const int bytesPerElement = storageView.bytesPerElement(); + const Byte* src = buffer.data(); + + if(storageView.isMemCopyable()) { + std::memcpy(storageView.originPtr(), src, buffer.size()); + } else { + for(auto it = storageView.begin(), end = storageView.end(); it != end; + ++it, src += bytesPerElement) + std::memcpy(it.ptr(), src, bytesPerElement); + } +} + +//===------------------------------------------------------------------------------------------===// +// Path helpers +//===------------------------------------------------------------------------------------------===// + +std::filesystem::path ZarrArchive::fieldDirectory(const std::string& field) const { + return directory_ / (prefix_ + "_" + field + ".zarr"); +} + +std::filesystem::path ZarrArchive::chunkFile(const std::string& field, int saveId, + std::size_t numDataDims) const { + // Zarr v2 chunk key: indices separated by '.' + // First index is the save id; the remaining numDataDims indices are all 0. + std::string key = std::to_string(saveId); + for(std::size_t i = 0; i < numDataDims; ++i) + key += ".0"; + return fieldDirectory(field) / key; +} + +//===------------------------------------------------------------------------------------------===// +// Zarr metadata +//===------------------------------------------------------------------------------------------===// + +void ZarrArchive::writeZarrayMetadata(const std::filesystem::path& fieldDir, + const StorageView& storageView, int numSaves) const { + // Build active dims (skip size-0 dims) + std::vector activeDims; + for(int d : storageView.dims()) + if(d > 0) + activeDims.push_back(d); + + // Shape: [num_saves, d0, d1, ...] + json::json shape = json::json::array(); + shape.push_back(numSaves); + for(int d : activeDims) + shape.push_back(d); + + // Chunks: [1, d0, d1, ...] 
— one chunk per save + json::json chunks = json::json::array(); + chunks.push_back(1); + for(int d : activeDims) + chunks.push_back(d); + + json::json zarray; + zarray["zarr_format"] = 2; + zarray["shape"] = shape; + zarray["chunks"] = chunks; + zarray["dtype"] = typeIDtoZarrDtype(storageView.type()); + zarray["compressor"] = nullptr; + zarray["fill_value"] = 0; + zarray["order"] = "C"; + zarray["filters"] = nullptr; + + std::ofstream fs(fieldDir / ".zarray", std::ios::out | std::ios::trunc); + if(!fs.is_open()) + throw Exception("ZarrArchive: cannot open .zarray metadata file in '%s'", + fieldDir.string()); + fs << zarray.dump(2) << "\n"; +} + +json::json ZarrArchive::readZarrayMetadata(const std::filesystem::path& fieldDir) const { + std::filesystem::path zarrayPath = fieldDir / ".zarray"; + if(!std::filesystem::exists(zarrayPath)) + throw Exception("ZarrArchive: .zarray not found in '%s'", fieldDir.string()); + + std::ifstream fs(zarrayPath.string()); + json::json j; + fs >> j; + return j; +} + +//===------------------------------------------------------------------------------------------===// +// Constructor +//===------------------------------------------------------------------------------------------===// + +ZarrArchive::ZarrArchive(OpenModeKind mode, const std::string& directory, + const std::string& prefix) + : mode_(mode), directory_(directory), prefix_(prefix) { + + LOG(info) << "Creating ZarrArchive (mode = " << mode_ << ") from directory " << directory_; + + metaDatafile_ = directory_ / ("ArchiveMetaData-" + prefix_ + ".json"); + + try { + bool isDir = std::filesystem::is_directory(directory_); + + switch(mode_) { + case OpenModeKind::Read: + if(!isDir) + throw Exception("no such directory: '%s'", directory_.string()); + break; + case OpenModeKind::Write: + case OpenModeKind::Append: + if(!isDir) + std::filesystem::create_directories(directory_); + break; + } + } catch(std::filesystem::filesystem_error& e) { + throw Exception(e.what()); + } + + 
readMetaDataFromJson(); + + if(mode_ == OpenModeKind::Write) + clear(); +} + +//===------------------------------------------------------------------------------------------===// +// MetaData +//===------------------------------------------------------------------------------------------===// + +void ZarrArchive::updateMetaData() { writeMetaDataToJson(); } + +void ZarrArchive::writeMetaDataToJson() { + LOG(info) << "Update MetaData of Zarr Archive"; + + json_.clear(); + + json_["serialbox_version"] = + 100 * SERIALBOX_VERSION_MAJOR + 10 * SERIALBOX_VERSION_MINOR + SERIALBOX_VERSION_PATCH; + json_["archive_name"] = ZarrArchive::Name; + json_["archive_version"] = ZarrArchive::Version; + + for(auto it = fieldMap_.begin(), end = fieldMap_.end(); it != end; ++it) + json_["field_map"][it->first] = it->second; + + std::ofstream fs(metaDatafile_.string(), std::ios::out | std::ios::trunc); + if(!fs.is_open()) + throw Exception("ZarrArchive: cannot open file: %s", metaDatafile_.string()); + + fs << json_.dump(2) << "\n"; + fs.close(); +} + +void ZarrArchive::readMetaDataFromJson() { + LOG(info) << "Reading MetaData for Zarr archive ..."; + + if(!std::filesystem::exists(metaDatafile_)) { + if(mode_ != OpenModeKind::Read) + return; + throw Exception("archive meta data not found in directory '%s'", directory_.string()); + } + + std::ifstream fs(metaDatafile_.string(), std::ios::in); + fs >> json_; + fs.close(); + + int serialboxVersion = json_["serialbox_version"]; + std::string archiveName = json_["archive_name"]; + int archiveVersion = json_["archive_version"]; + + if(!Version::isCompatible(serialboxVersion)) + throw Exception("serialbox version of Zarr archive (%s) is not compatible with the version " + "of the library (%s)", + Version::toString(serialboxVersion), SERIALBOX_VERSION_STRING); + + if(archiveName != ZarrArchive::Name) + throw Exception("archive is not a Zarr archive"); + + if(archiveVersion > ZarrArchive::Version) + throw Exception("Zarr archive version (%i) 
does not match the version of the library (%i)", + archiveVersion, ZarrArchive::Version); + + if(json_.count("field_map")) { + fieldMap_.clear(); + for(auto it = json_["field_map"].begin(); it != json_["field_map"].end(); ++it) + fieldMap_.insert({it.key(), static_cast(it.value())}); + } +} + +//===------------------------------------------------------------------------------------------===// +// Writing +//===------------------------------------------------------------------------------------------===// + +FieldID ZarrArchive::write(const StorageView& storageView, const std::string& field, + const std::shared_ptr info) { + if(mode_ == OpenModeKind::Read) + throw Exception("Archive is not initialized with OpenModeKind set to 'Write' or 'Append'"); + + LOG(info) << "Attempting to write field \"" << field << "\" to Zarr archive ..."; + + // Collect active (non-zero) data dimensions + std::vector activeDims; + for(int d : storageView.dims()) + if(d > 0) + activeDims.push_back(d); + const std::size_t numDataDims = activeDims.size(); + + std::filesystem::path fieldDir = fieldDirectory(field); + + FieldID fieldID{field, 0}; + auto it = fieldMap_.find(field); + + if(it != fieldMap_.end()) { + // Subsequent save — increment id and update .zarray shape + it->second++; + fieldID.id = it->second; + writeZarrayMetadata(fieldDir, storageView, fieldID.id + 1); + } else { + // First save — create the Zarr array directory and initial metadata + try { + std::filesystem::create_directories(fieldDir); + } catch(std::filesystem::filesystem_error& e) { + throw Exception(e.what()); + } + fieldMap_.insert({field, 0}); + writeZarrayMetadata(fieldDir, storageView, 1); + } + + // Serialize data to a contiguous buffer and write the chunk file + std::vector buffer = storageViewToBuffer(storageView); + + std::filesystem::path chunkPath = chunkFile(field, fieldID.id, numDataDims); + std::ofstream fs(chunkPath.string(), std::ios::out | std::ios::binary | std::ios::trunc); + if(!fs.is_open()) + 
throw Exception("ZarrArchive: cannot open chunk file: %s", chunkPath.string()); + fs.write(buffer.data(), static_cast(buffer.size())); + fs.close(); + + // Update archive-level metadata + updateMetaData(); + + LOG(info) << "Successfully wrote field \"" << fieldID.name << "\" (id = " << fieldID.id + << ") to " << chunkPath.filename(); + return fieldID; +} + +void ZarrArchive::writeToFile(std::string zarrPath, const StorageView& storageView, + const std::string& field) { + std::filesystem::path fieldDir(zarrPath); + + // Collect active data dimensions + std::vector activeDims; + for(int d : storageView.dims()) + if(d > 0) + activeDims.push_back(d); + const std::size_t numDataDims = activeDims.size(); + + // Create directory + std::filesystem::create_directories(fieldDir); + + // Write .zarray (single save, no leading time dimension) + json::json shape = json::json::array(); + json::json chunks = json::json::array(); + for(int d : activeDims) { + shape.push_back(d); + chunks.push_back(d); + } + + json::json zarray; + zarray["zarr_format"] = 2; + zarray["shape"] = shape; + zarray["chunks"] = chunks; + zarray["dtype"] = typeIDtoZarrDtype(storageView.type()); + zarray["compressor"] = nullptr; + zarray["fill_value"] = 0; + zarray["order"] = "C"; + zarray["filters"] = nullptr; + + std::ofstream mfs((fieldDir / ".zarray").string(), std::ios::out | std::ios::trunc); + if(!mfs.is_open()) + throw Exception("ZarrArchive: cannot create .zarray in '%s'", fieldDir.string()); + mfs << zarray.dump(2) << "\n"; + mfs.close(); + + // Build chunk key: "0.0...0" + std::string key = "0"; + for(std::size_t i = 1; i < numDataDims; ++i) + key += ".0"; + + std::vector buffer = storageViewToBuffer(storageView); + std::filesystem::path chunkPath = fieldDir / key; + std::ofstream fs(chunkPath.string(), std::ios::out | std::ios::binary | std::ios::trunc); + if(!fs.is_open()) + throw Exception("ZarrArchive: cannot open chunk file: %s", chunkPath.string()); + fs.write(buffer.data(), 
static_cast(buffer.size())); + fs.close(); +} + +//===------------------------------------------------------------------------------------------===// +// Reading +//===------------------------------------------------------------------------------------------===// + +void ZarrArchive::read(StorageView& storageView, const FieldID& fieldID, + std::shared_ptr info) const { + LOG(info) << "Attempting to read field \"" << fieldID.name << "\" (id = " << fieldID.id + << ") via ZarrArchive ..."; + + auto it = fieldMap_.find(fieldID.name); + if(it == fieldMap_.end()) + throw Exception("no field '%s' registered in ZarrArchive", fieldID.name); + + if(fieldID.id > it->second) + throw Exception("invalid id '%i' of field '%s'", fieldID.id, fieldID.name); + + // Collect active data dimensions + std::size_t numDataDims = 0; + for(int d : storageView.dims()) + if(d > 0) + numDataDims++; + + std::filesystem::path chunkPath = chunkFile(fieldID.name, fieldID.id, numDataDims); + if(!std::filesystem::exists(chunkPath)) + throw Exception("ZarrArchive: chunk file not found: %s", chunkPath.string()); + + const std::size_t expectedBytes = storageView.sizeInBytes(); + std::vector buffer(expectedBytes); + + std::ifstream fs(chunkPath.string(), std::ios::in | std::ios::binary); + if(!fs.is_open()) + throw Exception("ZarrArchive: cannot open chunk file: %s", chunkPath.string()); + fs.read(buffer.data(), static_cast(expectedBytes)); + fs.close(); + + bufferToStorageView(buffer, storageView); + + LOG(info) << "Successfully read field \"" << fieldID.name << "\" (id = " << fieldID.id << ")"; +} + +void ZarrArchive::readFromFile(std::string zarrPath, StorageView& storageView, + const std::string& field) { + std::filesystem::path fieldDir(zarrPath); + + if(!std::filesystem::exists(fieldDir)) + throw Exception("ZarrArchive: Zarr store not found: %s", zarrPath); + + // Collect active data dimensions + std::size_t numDataDims = 0; + for(int d : storageView.dims()) + if(d > 0) + numDataDims++; + + // 
Build chunk key for a single-save store: "0.0...0" + std::string key = "0"; + for(std::size_t i = 1; i < numDataDims; ++i) + key += ".0"; + + std::filesystem::path chunkPath = fieldDir / key; + if(!std::filesystem::exists(chunkPath)) + throw Exception("ZarrArchive: chunk file not found: %s", chunkPath.string()); + + const std::size_t expectedBytes = storageView.sizeInBytes(); + std::vector buffer(expectedBytes); + + std::ifstream fs(chunkPath.string(), std::ios::in | std::ios::binary); + if(!fs.is_open()) + throw Exception("ZarrArchive: cannot open chunk file: %s", chunkPath.string()); + fs.read(buffer.data(), static_cast(expectedBytes)); + fs.close(); + + bufferToStorageView(buffer, storageView); +} + +//===------------------------------------------------------------------------------------------===// +// Misc +//===------------------------------------------------------------------------------------------===// + +void ZarrArchive::clear() { + std::filesystem::directory_iterator end; + for(std::filesystem::directory_iterator it(directory_); it != end; ++it) { + const auto& p = it->path(); + if(std::filesystem::is_directory(p) && + boost::algorithm::starts_with(p.filename().string(), prefix_ + "_") && + p.extension() == ".zarr") { + std::filesystem::remove_all(p); + } + } + fieldMap_.clear(); +} + +std::ostream& ZarrArchive::toStream(std::ostream& stream) const { + stream << "ZarrArchive = {\n"; + stream << " directory: " << directory_.string() << "\n"; + stream << " mode: " << mode_ << "\n"; + stream << " prefix: " << prefix_ << "\n"; + stream << " fieldMap = {\n"; + for(auto it = fieldMap_.begin(), end = fieldMap_.end(); it != end; ++it) + stream << " " << it->first << ": " << it->second << "\n"; + stream << " }\n"; + stream << "}\n"; + return stream; +} + +std::unique_ptr ZarrArchive::create(OpenModeKind mode, const std::string& directory, + const std::string& prefix) { + return std::make_unique(mode, directory, prefix); +} + +} // namespace serialbox diff --git 
a/src/serialbox/core/archive/ZarrArchive.h b/src/serialbox/core/archive/ZarrArchive.h new file mode 100644 index 00000000..b20cbf31 --- /dev/null +++ b/src/serialbox/core/archive/ZarrArchive.h @@ -0,0 +1,180 @@ +//===-- serialbox/core/archive/ZarrArchive.h ----------------------------------------*- C++ -*-===// +// +// S E R I A L B O X +// +// This file is distributed under terms of BSD license. +// See LICENSE.txt for more information +// +//===------------------------------------------------------------------------------------------===// +// +/// \file +/// This file implements the archive based on the Zarr storage format (v2). +/// +//===------------------------------------------------------------------------------------------===// + +#ifndef SERIALBOX_CORE_ARCHIVE_ZARRARCHIVE_H +#define SERIALBOX_CORE_ARCHIVE_ZARRARCHIVE_H + +#include "serialbox/core/Json.h" +#include "serialbox/core/archive/Archive.h" +#include +#include +#include +#include + +namespace serialbox { + +/// \brief Archive based on the Zarr storage format (version 2) +/// +/// Each field is stored as a Zarr v2 array in a subdirectory named +/// `_.zarr/` within the archive directory. The first +/// dimension of the array corresponds to the save index (FieldID::id), +/// allowing multiple saves of the same field in a single Zarr store. +/// Data is stored without compression in native byte order. +/// +/// Directory layout: +/// \code +/// / +/// ArchiveMetaData-.json -- serialbox metadata +/// _.zarr/ +/// .zarray -- Zarr array metadata (JSON) +/// 0.0.0...0 -- chunk for save 0 +/// 1.0.0...0 -- chunk for save 1 +/// ... +/// _.zarr/ +/// ... 
+/// \endcode +/// +/// \see https://zarr.readthedocs.io/en/stable/spec/v2.html +/// +/// \ingroup core +class ZarrArchive : public Archive { +public: + /// \brief Name of the Zarr archive + static const std::string Name; + + /// \brief Revision of the Zarr archive + static const int Version; + + /// \brief Initialize the archive + /// + /// \param mode Policy to open files in the archive + /// \param directory Directory to write/read files. If the archive is opened in 'Read' mode, + /// the directory is expected to supply an 'ArchiveMetaData-prefix.json'. + /// In 'Write' mode, existing field directories matching the pattern + /// '_*.zarr' will be removed and recreated. + /// The 'Append' mode will open existing metadata if present. + /// \param prefix Prefix of all field directories, followed by an underscore and fieldname + ZarrArchive(OpenModeKind mode, const std::string& directory, const std::string& prefix); + + /// \brief Load meta-data from JSON file + void readMetaDataFromJson(); + + /// \brief Convert meta-data to JSON and serialize to file + void writeMetaDataToJson(); + + /// \name Archive implementation + /// \see Archive + /// @{ + virtual FieldID write(const StorageView& storageView, const std::string& fieldID, + const std::shared_ptr info) override; + + virtual void read(StorageView& storageView, const FieldID& fieldID, + std::shared_ptr info) const override; + + virtual void updateMetaData() override; + + virtual OpenModeKind mode() const override { return mode_; } + + virtual std::string directory() const override { return directory_.string(); } + + virtual std::string prefix() const override { return prefix_; } + + virtual std::string name() const override { return ZarrArchive::Name; } + + virtual std::string metaDataFile() const override { return metaDatafile_.string(); } + + virtual std::ostream& toStream(std::ostream& stream) const override; + + virtual void clear() override; + + virtual bool isReadingThreadSafe() const override { return 
false; } + + virtual bool isWritingThreadSafe() const override { return false; } + + virtual bool isSlicedReadingSupported() const override { return false; } + + /// @} + + /// \brief Create a ZarrArchive + static std::unique_ptr create(OpenModeKind mode, const std::string& directory, + const std::string& prefix); + + /// \brief Directly write a field (given by `storageView`) to a Zarr store directory + /// + /// Creates a single-save Zarr array at the given path. The path should either + /// not exist or be an existing Zarr store directory. + /// + /// \param zarrPath Path to the Zarr store directory (conventionally ending in .zarr) + /// \param storageView StorageView of the field + /// \param field Name of the field (used as Zarr variable name) + static void writeToFile(std::string zarrPath, const StorageView& storageView, + const std::string& field); + + /// \brief Directly read a field (given by `storageView`) from a Zarr store directory + /// + /// \param zarrPath Path to the Zarr store directory + /// \param storageView StorageView of the field + /// \param field Name of the field + static void readFromFile(std::string zarrPath, StorageView& storageView, + const std::string& field); + +private: + OpenModeKind mode_; + std::filesystem::path directory_; + std::string prefix_; + std::filesystem::path metaDatafile_; + + /// Maps field name to the maximum save id written so far + std::unordered_map fieldMap_; + json::json json_; + + /// \brief Return the path to the Zarr array directory for a given field + std::filesystem::path fieldDirectory(const std::string& field) const; + + /// \brief Return the path to a chunk file for a given field and save id + /// + /// Chunk naming follows Zarr v2: each dimension index separated by '.'. + /// The first index is the save id; all remaining indices are 0. 
+ std::filesystem::path chunkFile(const std::string& field, int saveId, + std::size_t numDataDims) const; + + /// \brief Write or update the .zarray metadata file for a field + /// + /// \param fieldDir Path to the Zarr array directory + /// \param storageView StorageView describing the field shape and type + /// \param numSaves Current total number of saves (length of first dimension) + void writeZarrayMetadata(const std::filesystem::path& fieldDir, const StorageView& storageView, + int numSaves) const; + + /// \brief Read the .zarray metadata from a Zarr array directory + json::json readZarrayMetadata(const std::filesystem::path& fieldDir) const; + + /// \brief Copy data from the StorageView into a contiguous flat buffer (column-major iteration) + static std::vector storageViewToBuffer(const StorageView& storageView); + + /// \brief Copy data from a contiguous flat buffer into the StorageView (column-major iteration) + static void bufferToStorageView(const std::vector& buffer, StorageView& storageView); + + /// \brief Return the Zarr dtype string for the given TypeID + /// + /// Uses the system's native byte order prefix ('<' for little-endian, '>' for big-endian). 
+ static std::string typeIDtoZarrDtype(TypeID type); + + /// \brief Return the system byte-order character ('>' or '<') + static char nativeEndianChar(); +}; + +} // namespace serialbox + +#endif diff --git a/test/serialbox/core/CMakeLists.txt b/test/serialbox/core/CMakeLists.txt index f9e61afd..05a9a831 100644 --- a/test/serialbox/core/CMakeLists.txt +++ b/test/serialbox/core/CMakeLists.txt @@ -28,11 +28,12 @@ set(SOURCES UnittestUpgradeArchive.cpp UnittestVersion.cpp - # archive/ - archive/UnittestArchiveFactory.cpp + # archive/ + archive/UnittestArchiveFactory.cpp archive/UnittestBinaryArchive.cpp archive/UnittestNetCDFArchive.cpp archive/UnittestMockArchive.cpp + archive/UnittestZarrArchive.cpp # frontend/gridtools/ frontend/gridtools/UnittestStorageView.cpp diff --git a/test/serialbox/core/archive/UnittestZarrArchive.cpp b/test/serialbox/core/archive/UnittestZarrArchive.cpp new file mode 100644 index 00000000..c3fe3285 --- /dev/null +++ b/test/serialbox/core/archive/UnittestZarrArchive.cpp @@ -0,0 +1,439 @@ +//===-- serialbox/core/archive/UnittestZarrArchive.cpp ------------------------------*- C++ -*-===// +// +// S E R I A L B O X +// +// This file is distributed under terms of BSD license. +// See LICENSE.txt for more information +// +//===------------------------------------------------------------------------------------------===// +// +/// \file +/// This file contains the unittests for the Zarr Archive. 
+/// +//===------------------------------------------------------------------------------------------===// + +#include "serialbox/core/archive/ZarrArchive.h" +#include "utility/SerializerTestBase.h" +#include "utility/Storage.h" +#include +#include +#include + +using namespace serialbox; +using namespace unittest; + +//===------------------------------------------------------------------------------------------===// +// Utility tests +//===------------------------------------------------------------------------------------------===// + +namespace { + +class ZarrArchiveUtilityTest : public SerializerUnittestBase {}; + +} // anonymous namespace + +TEST_F(ZarrArchiveUtilityTest, Construction) { + + // ----------------------------------------------------------------------------------------------- + // Writing + // ----------------------------------------------------------------------------------------------- + + // Open fresh archive and write meta data to disk + { + ZarrArchive b(OpenModeKind::Write, this->directory->path().string(), "field"); + b.updateMetaData(); + + EXPECT_TRUE(boost::algorithm::starts_with(b.name(), "Zarr")); + EXPECT_EQ(b.mode(), OpenModeKind::Write); + EXPECT_EQ(b.prefix(), "field"); + } + + // Create directory if not already existent + { + ZarrArchive b(OpenModeKind::Write, (this->directory->path() / "this-dir-is-created").string(), + "field"); + EXPECT_TRUE(std::filesystem::exists(this->directory->path() / "this-dir-is-created")); + } + + // ----------------------------------------------------------------------------------------------- + // Reading + // ----------------------------------------------------------------------------------------------- + { + ZarrArchive b(OpenModeKind::Read, this->directory->path().string(), "field"); + b.updateMetaData(); + } + + // Throw Exception: Directory does not exist + { + EXPECT_THROW(ZarrArchive(OpenModeKind::Read, (this->directory->path() / "not-a-dir").string(), + "field"), + Exception); + } + + // 
----------------------------------------------------------------------------------------------- + // Appending + // ----------------------------------------------------------------------------------------------- + + { + EXPECT_NO_THROW(ZarrArchive(OpenModeKind::Append, this->directory->path().string(), "field")); + } + + // Create directory if not already existent + { + ZarrArchive b(OpenModeKind::Append, (this->directory->path() / "this-dir-is-created").string(), + "field"); + } + + // Create directories if not already existent + { + ZarrArchive b(OpenModeKind::Append, (this->directory->path() / "nest1" / "nest2").string(), + "field"); + EXPECT_TRUE(std::filesystem::exists(this->directory->path() / "nest1" / "nest2")); + } +} + +TEST_F(ZarrArchiveUtilityTest, MetaData) { + using Storage = Storage; + + Storage u_0_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random); + + ZarrArchive archiveWrite(OpenModeKind::Write, this->directory->path().string(), "field"); + + auto sv_u_0_input = u_0_input.toStorageView(); + archiveWrite.write(sv_u_0_input, "u", nullptr); + archiveWrite.updateMetaData(); + + // Read meta data file to get in-memory copy + std::ifstream ifs(archiveWrite.metaDataFile()); + json::json j; + ifs >> j; + ifs.close(); + + std::string filename = archiveWrite.metaDataFile(); + auto toFile = [&filename](const json::json& jsonNode) -> void { + std::ofstream ofs(filename, std::ios::out | std::ios::trunc); + ofs << jsonNode.dump(4); + }; + + // ----------------------------------------------------------------------------------------------- + // Invalid serialbox version + // ----------------------------------------------------------------------------------------------- + { + json::json corrupted = j; + corrupted["serialbox_version"] = 100 * (SERIALBOX_VERSION_MAJOR + 1) + + 10 * SERIALBOX_VERSION_MINOR + SERIALBOX_VERSION_PATCH; + toFile(corrupted); + + ASSERT_THROW(ZarrArchive(OpenModeKind::Read, this->directory->path().string(), 
"field"), + Exception); + } + + // ----------------------------------------------------------------------------------------------- + // Not a Zarr archive + // ----------------------------------------------------------------------------------------------- + { + json::json corrupted = j; + corrupted["archive_name"] = "not-ZarrArchive"; + toFile(corrupted); + + ASSERT_THROW(ZarrArchive(OpenModeKind::Read, this->directory->path().string(), "field"), + Exception); + } + + // ----------------------------------------------------------------------------------------------- + // Invalid Zarr archive version + // ----------------------------------------------------------------------------------------------- + { + json::json corrupted = j; + corrupted["archive_version"] = ZarrArchive::Version + 1; + toFile(corrupted); + + ASSERT_THROW(ZarrArchive(OpenModeKind::Read, this->directory->path().string(), "field"), + Exception); + } + + // ----------------------------------------------------------------------------------------------- + // MetaData not found + // ----------------------------------------------------------------------------------------------- + { + std::filesystem::remove(filename); + ASSERT_THROW(ZarrArchive(OpenModeKind::Read, this->directory->path().string(), "field"), + Exception); + } +} + +TEST_F(ZarrArchiveUtilityTest, ZarrayMetadata) { + using Storage = Storage; + + Storage u_0_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random); + Storage u_1_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random); + + ZarrArchive archiveWrite(OpenModeKind::Write, this->directory->path().string(), "field"); + + auto sv0 = u_0_input.toStorageView(); + auto sv1 = u_1_input.toStorageView(); + archiveWrite.write(sv0, "u", nullptr); + archiveWrite.write(sv1, "u", nullptr); + + // Check .zarray file was created with correct shape + std::filesystem::path fieldDir = + std::filesystem::path(archiveWrite.directory()) / "field_u.zarr"; + 
ASSERT_TRUE(std::filesystem::exists(fieldDir / ".zarray"));
+
+  std::ifstream zfs((fieldDir / ".zarray").string());
+  json::json zarray;
+  zfs >> zarray;
+
+  EXPECT_EQ(zarray["zarr_format"], 2);
+  EXPECT_EQ(zarray["order"], "C");
+  EXPECT_EQ(zarray["compressor"], nullptr);
+  // shape[0] should be num_saves = 2
+  ASSERT_GE(zarray["shape"].size(), 1u);
+  EXPECT_EQ(zarray["shape"][0], 2);
+  // chunks[0] should be 1
+  ASSERT_GE(zarray["chunks"].size(), 1u);
+  EXPECT_EQ(zarray["chunks"][0], 1);
+}
+
+/// Streams a ZarrArchive that holds one written field into a stringstream and checks that the
+/// text starts with "ZarrArchive" and mentions the directory, mode, prefix and fieldMap entries.
+TEST_F(ZarrArchiveUtilityTest, toString) {
+  // NOTE(review): element type presumed to be double - confirm against the sibling archive tests.
+  using Storage = Storage<double>;
+  std::stringstream ss;
+
+  Storage storage(Storage::ColMajor, {5, 1, 1});
+
+  ZarrArchive archive(OpenModeKind::Write, this->directory->path().string(), "field");
+  StorageView sv = storage.toStorageView();
+  archive.write(sv, "storage", nullptr);
+
+  ss << archive;
+
+  // Capture the rendered text once; every check below inspects the same string.
+  const std::string text = ss.str();
+
+  EXPECT_TRUE(boost::algorithm::starts_with(text, "ZarrArchive"));
+  EXPECT_NE(text.find("directory"), std::string::npos);
+  EXPECT_NE(text.find("mode"), std::string::npos);
+  EXPECT_NE(text.find("prefix"), std::string::npos);
+  EXPECT_NE(text.find("fieldMap"), std::string::npos);
+}
+
+/// Round-trips a single storage through the static ZarrArchive::writeToFile / readFromFile
+/// helpers and checks that reading from a store path that was never written throws Exception.
+TEST_F(ZarrArchiveUtilityTest, writeAndRead) {
+  // NOTE(review): element type presumed to be double - confirm against the sibling archive tests.
+  using Storage = Storage<double>;
+  Storage storage_input(Storage::ColMajor, {5, 2, 5}, Storage::random);
+  Storage storage_output(Storage::ColMajor, {5, 2, 5});
+
+  auto sv_input = storage_input.toStorageView();
+  auto sv_output = storage_output.toStorageView();
+
+  // Write and read from Zarr store directory
+  std::string zarrPath = (this->directory->path() / "test.zarr").string();
+  ZarrArchive::writeToFile(zarrPath, sv_input, "field");
+  ZarrArchive::readFromFile(zarrPath, sv_output, "field");
+
+  ASSERT_TRUE(Storage::verify(storage_input, storage_output));
+
+  // Read from non-existing store -> Exception
+  ASSERT_THROW(ZarrArchive::readFromFile(
+                   (this->directory->path() / "does-not-exist.zarr").string(), sv_output, "field"),
+               Exception);
+}
+
+//===------------------------------------------------------------------------------------------===//
+// Read/Write tests
+//===------------------------------------------------------------------------------------------===//
+
+namespace {
+
+/// Typed fixture for the read/write round-trip test below; gtest instantiates the test once for
+/// every entry of TestTypes and exposes the current entry as TypeParam.
+template <class T>
+class ZarrArchiveReadWriteTest : public SerializerUnittestBase {};
+
+// NOTE(review): the element-type list is presumed - confirm it against the other archive
+// unittests before relying on it.
+using TestTypes = testing::Types<double, float, int, std::int64_t>;
+
+} // anonymous namespace
+
+TYPED_TEST_CASE(ZarrArchiveReadWriteTest, TestTypes);
+
+/// Writes several saves of the fields "u", "v", "storage_2d" and "storage_7d" through a
+/// Write-mode ZarrArchive, reads every save back through a Read-mode archive, verifies that
+/// output and input storages match, and finally checks that re-opening the archive in Write mode
+/// leaves none of the per-field "<prefix>_<name>.zarr" directories behind.
+TYPED_TEST(ZarrArchiveReadWriteTest, WriteAndRead) {
+
+  // -----------------------------------------------------------------------------------------------
+  // Preparation
+  // -----------------------------------------------------------------------------------------------
+  using Storage = Storage<TypeParam>;
+
+  // Prepare input data
+  Storage u_0_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random);
+  Storage u_1_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random);
+  Storage u_2_input(Storage::RowMajor, {5, 6, 7}, {{2, 2}, {4, 2}, {4, 5}}, Storage::random);
+
+  Storage v_0_input(Storage::ColMajor, {5, 1, 1}, Storage::random);
+  Storage v_1_input(Storage::ColMajor, {5, 1, 1}, Storage::random);
+  Storage v_2_input(Storage::ColMajor, {5, 1, 1}, Storage::random);
+
+  Storage storage_2d_0_input(Storage::ColMajor, {26, 23}, {{2, 2}, {4, 2}}, Storage::random);
+  Storage storage_2d_1_input(Storage::ColMajor, {26, 23}, {{2, 2}, {4, 2}}, Storage::random);
+
+  Storage storage_7d_0_input(Storage::ColMajor, {2, 2, 2, 2, 2, 2, 2}, Storage::random);
+  Storage storage_7d_1_input(Storage::ColMajor, {2, 2, 2, 2, 2, 2, 2}, Storage::random);
+
+  // Prepare output
+  Storage u_0_output(Storage::RowMajor, {5, 6, 7});
+  Storage u_1_output(Storage::RowMajor, {5, 6, 7});
+  Storage u_2_output(Storage::RowMajor, {5, 6, 7});
+
+  // Note: the v outputs are RowMajor although the v inputs above are ColMajor.
+  Storage v_0_output(Storage::RowMajor, {5, 1, 1});
+  Storage v_1_output(Storage::RowMajor, {5, 1, 1});
+  Storage v_2_output(Storage::RowMajor, {5, 1, 1});
+
+  Storage storage_2d_0_output(Storage::ColMajor, {26, 23}, {{2, 2}, {4, 2}});
+  Storage storage_2d_1_output(Storage::ColMajor, {26, 23}, {{2, 2}, {4, 2}});
+
+  Storage storage_7d_0_output(Storage::ColMajor, {2, 2, 2, 2, 2, 2, 2});
+  Storage storage_7d_1_output(Storage::ColMajor, {2, 2, 2, 2, 2, 2, 2});
+
+  // -----------------------------------------------------------------------------------------------
+  // Writing (data and meta-data)
+  // -----------------------------------------------------------------------------------------------
+  {
+    ZarrArchive archiveWrite(OpenModeKind::Write, this->directory->path().string(), "field");
+
+    EXPECT_STREQ(archiveWrite.directory().c_str(), this->directory->path().string().c_str());
+
+    // Each write of an already known field name is expected to return the next id (0, 1, 2, ...)
+    // for that name.
+
+    // u: id = 0
+    {
+      auto sv = u_0_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "u", nullptr);
+      ASSERT_EQ(fieldID.name, "u");
+      ASSERT_EQ(fieldID.id, 0);
+    }
+
+    // u: id = 1
+    {
+      auto sv = u_1_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "u", nullptr);
+      ASSERT_EQ(fieldID.name, "u");
+      ASSERT_EQ(fieldID.id, 1);
+    }
+
+    // u: id = 2
+    {
+      auto sv = u_2_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "u", nullptr);
+      ASSERT_EQ(fieldID.name, "u");
+      ASSERT_EQ(fieldID.id, 2);
+    }
+
+    // v: id = 0
+    {
+      auto sv = v_0_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "v", nullptr);
+      ASSERT_EQ(fieldID.name, "v");
+      ASSERT_EQ(fieldID.id, 0);
+    }
+
+    // v: id = 1
+    {
+      auto sv = v_1_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "v", nullptr);
+      ASSERT_EQ(fieldID.name, "v");
+      ASSERT_EQ(fieldID.id, 1);
+    }
+
+    // v: id = 2
+    {
+      auto sv = v_2_input.toStorageView();
+      FieldID fieldID = archiveWrite.write(sv, "v", nullptr);
+      ASSERT_EQ(fieldID.name, "v");
+      ASSERT_EQ(fieldID.id, 2);
+    }
+
+    // storage 2d
+    auto sv_2d_0 = storage_2d_0_input.toStorageView();
+    archiveWrite.write(sv_2d_0, "storage_2d", nullptr);
+
+    auto sv_2d_1 = storage_2d_1_input.toStorageView();
+    archiveWrite.write(sv_2d_1, "storage_2d", nullptr);
+
+    // storage 7d
+    auto sv_7d_0 = storage_7d_0_input.toStorageView();
+    archiveWrite.write(sv_7d_0, "storage_7d", nullptr);
+
+    auto sv_7d_1 = storage_7d_1_input.toStorageView();
+    archiveWrite.write(sv_7d_1, "storage_7d", nullptr);
+  }
+
+  // -----------------------------------------------------------------------------------------------
+  // Reading
+  // -----------------------------------------------------------------------------------------------
+  {
+    ZarrArchive archiveRead(OpenModeKind::Read, this->directory->path().string(), "field");
+    EXPECT_STREQ(archiveRead.directory().c_str(), this->directory->path().string().c_str());
+
+    // u
+    auto sv_u_0 = u_0_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_u_0, FieldID{"u", 0}, nullptr));
+
+    auto sv_u_1 = u_1_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_u_1, FieldID{"u", 1}, nullptr));
+
+    auto sv_u_2 = u_2_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_u_2, FieldID{"u", 2}, nullptr));
+
+    // v
+    auto sv_v_0 = v_0_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_v_0, FieldID{"v", 0}, nullptr));
+
+    auto sv_v_1 = v_1_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_v_1, FieldID{"v", 1}, nullptr));
+
+    auto sv_v_2 = v_2_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_v_2, FieldID{"v", 2}, nullptr));
+
+    // Check exceptional cases: writing through a Read-mode archive, reading a save id that was
+    // never written and reading an unknown field name must each throw.
+    ASSERT_THROW(archiveRead.write(sv_u_2, "u", nullptr), Exception);
+    ASSERT_THROW(archiveRead.read(sv_u_2, FieldID{"u", 1024}, nullptr), Exception);
+    ASSERT_THROW(archiveRead.read(sv_u_2, FieldID{"not-a-field", 0}, nullptr), Exception);
+
+    // storage 2d
+    auto sv_2d_0 = storage_2d_0_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_2d_0, FieldID{"storage_2d", 0}, nullptr));
+
+    auto sv_2d_1 = storage_2d_1_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_2d_1, FieldID{"storage_2d", 1}, nullptr));
+
+    // storage 7d
+    auto sv_7d_0 = storage_7d_0_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_7d_0, FieldID{"storage_7d", 0}, nullptr));
+
+    auto sv_7d_1 = storage_7d_1_output.toStorageView();
+    ASSERT_NO_THROW(archiveRead.read(sv_7d_1, FieldID{"storage_7d", 1}, nullptr));
+  }
+
+  // -----------------------------------------------------------------------------------------------
+  // Validation
+  // -----------------------------------------------------------------------------------------------
+  ASSERT_TRUE(Storage::verify(u_0_output, u_0_input));
+  ASSERT_TRUE(Storage::verify(u_1_output, u_1_input));
+  ASSERT_TRUE(Storage::verify(u_2_output, u_2_input));
+
+  ASSERT_TRUE(Storage::verify(v_0_output, v_0_input));
+  ASSERT_TRUE(Storage::verify(v_1_output, v_1_input));
+  ASSERT_TRUE(Storage::verify(v_2_output, v_2_input));
+
+  ASSERT_TRUE(Storage::verify(storage_2d_0_output, storage_2d_0_input));
+  ASSERT_TRUE(Storage::verify(storage_2d_1_output, storage_2d_1_input));
+
+  ASSERT_TRUE(Storage::verify(storage_7d_0_output, storage_7d_0_input));
+  ASSERT_TRUE(Storage::verify(storage_7d_1_output, storage_7d_1_input));
+
+  // -----------------------------------------------------------------------------------------------
+  // Cleanup (open in Write mode removes existing field directories)
+  // -----------------------------------------------------------------------------------------------
+  {
+    ZarrArchive archiveWrite(OpenModeKind::Write, this->directory->path().string(), "field");
+    EXPECT_FALSE(std::filesystem::exists(this->directory->path() / "field_u.zarr"));
+    EXPECT_FALSE(std::filesystem::exists(this->directory->path() / "field_v.zarr"));
+    EXPECT_FALSE(std::filesystem::exists(this->directory->path() / "field_storage_2d.zarr"));
+    EXPECT_FALSE(std::filesystem::exists(this->directory->path() / "field_storage_7d.zarr"));
+  }
+}
diff --git a/test_data/ser_data.zip b/test_data/ser_data.zip
new file mode 100644
index 00000000..764ea6f0
Binary files /dev/null and b/test_data/ser_data.zip differ