diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 835948d..04b889f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,3 +135,30 @@ jobs: - run: cargo clippy -p pcf-compact --all-targets -- -D warnings - run: cargo build -p pcf-compact --verbose - run: cargo test -p pcf-compact --verbose + + pcf-dcp: + name: pcf-dcp profile + runs-on: ubuntu-latest + defaults: + run: + working-directory: . + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + - run: cargo fmt -p pcf-dcp -- --check + - run: cargo clippy -p pcf-dcp --all-targets -- -D warnings + - run: cargo build -p pcf-dcp --verbose + - run: cargo test -p pcf-dcp --verbose + - name: Regenerate the spec test vector + run: cargo run -p pcf-dcp --example gen_testvector -- pcf_dcp_testvector.bin + - name: Inspect generated test vector (spec Section 17 is 700 bytes) + run: | + ls -l pcf_dcp_testvector.bin + test "$(wc -c < pcf_dcp_testvector.bin)" = "700" + - uses: actions/upload-artifact@v4 + with: + name: pcf-dcp-testvector + path: pcf_dcp_testvector.bin diff --git a/.github/workflows/release-prepare.yml b/.github/workflows/release-prepare.yml index a9e9013..95bfe21 100644 --- a/.github/workflows/release-prepare.yml +++ b/.github/workflows/release-prepare.yml @@ -76,13 +76,16 @@ jobs: sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PFS-MS-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-SIG-v1.0/Cargo.toml + sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-DCP-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' tools/pcf-debug/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' tools/pcf-compact/Cargo.toml # path-dep version pins on pcf sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { 
path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PFS-MS-v1.0/Cargo.toml sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PCF-SIG-v1.0/Cargo.toml + sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PCF-DCP-v1.0/Cargo.toml sed -i 's|pcf = { path = "\.\./\.\./reference/PCF-v1.0", version = "[^"]*" }|pcf = { path = "../../reference/PCF-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml sed -i 's|pcf-sig = { path = "\.\./\.\./reference/PCF-SIG-v1.0", version = "[^"]*" }|pcf-sig = { path = "../../reference/PCF-SIG-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml + sed -i 's|pcf-dcp = { path = "\.\./\.\./reference/PCF-DCP-v1.0", version = "[^"]*" }|pcf-dcp = { path = "../../reference/PCF-DCP-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml sed -i 's|pcf = { path = "\.\./\.\./reference/PCF-v1.0", version = "[^"]*" }|pcf = { path = "../../reference/PCF-v1.0", version = "'"$NEW"'" }|' tools/pcf-compact/Cargo.toml - name: Bump TypeScript packages diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90c9928..be1a07c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -132,6 +132,19 @@ jobs: if: needs.resolve.outputs.dry_run != 'true' run: sleep 45 + - name: cargo publish pcf-dcp + shell: bash + run: | + if [ "${{ needs.resolve.outputs.dry_run }}" = "true" ]; then + cargo publish -p pcf-dcp --allow-dirty --dry-run + else + cargo publish -p pcf-dcp --allow-dirty --token "${{ steps.cargo-auth.outputs.token }}" + fi + + - name: Wait for crates.io index + if: needs.resolve.outputs.dry_run != 'true' + run: sleep 45 + - name: cargo publish pcf-debug shell: bash run: | diff --git a/Cargo.toml b/Cargo.toml index 8f2f4c7..6bd8fea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "reference/PCF-v1.0", 
"reference/PFS-MS-v1.0", "reference/PCF-SIG-v1.0", + "reference/PCF-DCP-v1.0", "tools/pcf-debug", "tools/pcf-compact", ] diff --git a/reference/PCF-DCP-v1.0/Cargo.toml b/reference/PCF-DCP-v1.0/Cargo.toml new file mode 100644 index 0000000..6034449 --- /dev/null +++ b/reference/PCF-DCP-v1.0/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "pcf-dcp" +version = "0.0.8" +edition = "2021" +description = "Reference implementation of PCF-DCP v1.0, the PCF Dynamic Container Partition profile" +license = "MIT OR Apache-2.0" +repository = "https://github.com/kduma-OSS/Partitioned-Container-Format" +homepage = "https://github.com/kduma-OSS/Partitioned-Container-Format" +readme = "README.md" +keywords = ["pcf", "dcp", "container", "deduplication", "fragmentation"] +categories = ["encoding", "filesystem"] + +# This crate is a *reference* implementation of the PCF-DCP profile. Like the +# `pcf` crate it builds on, it favours a direct, auditable mapping onto the +# written specification (`specs/PCF-DCP-spec-v1.0.txt`) over raw performance. + +[[bin]] +name = "dcp" +path = "src/bin/dcp.rs" + +[dependencies] +# The PCF-DCP profile is layered strictly above PCF v1.0; every byte container +# operation goes through the reference PCF crate. The arena reuses PCF's Table +# Block, Partition Entry, and table-hash primitives directly. +pcf = { path = "../PCF-v1.0", version = "0.0.8" } diff --git a/reference/PCF-DCP-v1.0/README.md b/reference/PCF-DCP-v1.0/README.md new file mode 100644 index 0000000..0fab1d8 --- /dev/null +++ b/reference/PCF-DCP-v1.0/README.md @@ -0,0 +1,120 @@ +# pcf-dcp — PCF Dynamic Container Partition (reference implementation) + +Reference reader/writer for **PCF-DCP v1.0**, an application-level profile that +adds *dynamic*, fragmentable, dedup-friendly sub-partitions to the +[Partitioned Container Format](../PCF-v1.0) without modifying the PCF byte +container. 
+ +This crate mirrors the written specification (`specs/PCF-DCP-spec-v1.0.txt`) +field-for-field and is intended as the *normative* implementation against which +language ports are checked. It favours auditability over performance. + +## Model at a glance + +PCF-DCP defines one new PCF partition type: + +| Type | Name | Holds | +|--------------|-----------------|----------------------------------------------------| +| `0xAAAC0001` | `DCP_CONTAINER` | An *arena*: a header, an inner partition table, fragment tables, and data extents | + +A DCP container's bytes are an **arena** addressed by arena-relative offsets: + +``` +arena: +[ DCP Header (24 B) | data extents | Fragment Tables | Inner Table Block(s) ] +``` + +* **DCP Header** — `"PDCP"` magic, profile version, `inner_table_offset`, + `arena_used` (a bump pointer). +* **Inner Table Block** — a chain of reused PCF Table Blocks (74 B header + + 141 B entries), byte-for-byte identical to the top-level table, listing the + *inner* partitions. Two entry fields are reinterpreted: `start_offset` points + at the partition's Fragment Table, and `max_length` equals `used_bytes`. +* **Fragment Table** — per inner partition, a chain of 9-byte block headers each + followed by 18-byte **Fragment Entries**. Each entry names one extent + `(offset, length, kind, flags)`. The logical content of an inner partition is + the concatenation of its DATA extents. + +A generic PCF reader sees a DCP file as **one opaque partition**; only a +DCP-aware reader looks inside. A DCP file is always a conforming PCF v1.0 file. + +## Why a profile + +PCF stores each partition as a contiguous, statically-reserved region. PCF-DCP +makes each *inner* partition grow, shrink, and be edited in the middle without +relocating its neighbours, by describing it as a list of extents rather than one +range. 
This buys: + +* **Fragmentation / random edits** — append, insert, overwrite, delete, and + truncate are edits of the Fragment Table (copy-on-write for shared bytes); no + data is moved. +* **Deduplication** — two extents may name the same arena bytes; identical + chunks are stored once. The per-extent `SHARED` flag makes safe in-place + editing explicit. +* **Hash / signature stability** — an inner partition's `data_hash` covers its + *logical content*, so fragmentation, dedup, compaction, and promotion all + leave the hash (and any PCF-SIG signature over it) unchanged. + +## Library example + +```rust +use std::io::Cursor; +use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, HashAlgo}; + +let mut arena = Arena::new(); +arena.add_inner(0x10, [0xA1; 16], "A", b"Hello, World!", HashAlgo::Sha256, Chunker::Fixed(7))?; +arena.add_inner(0x10, [0xB2; 16], "B", b"World!", HashAlgo::Sha256, Chunker::Whole)?; + +let mut w = DcpWriter::new(); +w.add_container([0xDC; 16], "dcp", arena)?; +let image = w.to_image()?; + +let mut r = DcpReader::open(Cursor::new(image))?; +r.verify()?; +assert_eq!(r.read_inner(&[0xB2; 16])?, b"World!"); +# Ok::<(), pcf_dcp::Error>(()) +``` + +## Promotion / demotion + +`DcpWriter::promote` moves an inner partition out to a top-level PCF partition +(dynamic → fixed); `demote` moves a top-level partition into a container +(fixed → dynamic). Both preserve `uid`, `partition_type`, `label`, +`data_hash_algo_id`, and `data_hash` — the **promotion invariant**, identical to +the set of fields a PCF-SIG signature protects. + +## Command-line tool + +The `dcp` binary inspects and rewrites DCP files; every mutating command +re-verifies before writing: + +``` +dcp info +dcp dedup [--fixed N] [--trailer] +dcp defrag [--trailer] +dcp promote [--trailer] +dcp demote [--trailer] +``` + +UIDs are 32 hex digits, or `0xNN` for a uid of 16 identical bytes (e.g. `0xDC`). 
+ +## Build & test + +``` +cargo test -p pcf-dcp +cargo run -p pcf-dcp --example gen_testvector -- /tmp/dcp.bin # the 700-byte vector +cargo run -p pcf-dcp --bin dcp -- info /tmp/dcp.bin +``` + +The example reproduces the byte-exact 700-byte test vector from Section 17 of +the specification. + +## Relationship to `pcf` + +This crate is layered strictly above [`pcf`](../PCF-v1.0): every container byte +operation goes through the reference PCF crate, and the arena reuses PCF's Table +Block, Partition Entry, and table-hash primitives directly. + +## Licence + +MIT OR Apache-2.0. diff --git a/reference/PCF-DCP-v1.0/examples/gen_testvector.rs b/reference/PCF-DCP-v1.0/examples/gen_testvector.rs new file mode 100644 index 0000000..dd585ce --- /dev/null +++ b/reference/PCF-DCP-v1.0/examples/gen_testvector.rs @@ -0,0 +1,57 @@ +//! Generates the canonical PCF-DCP v1.0 test-vector file used in spec +//! Section 17. +//! +//! Run with: `cargo run --example gen_testvector -- ` +//! (defaults to ./pcf_dcp_testvector.bin). Everything is fixed and +//! deterministic so that ports can reproduce the file byte-for-byte. + +use std::io::Cursor; + +use pcf::Container; +use pcf_dcp::{build_reference_vector, DcpReader}; + +fn main() { + let path = std::env::args() + .nth(1) + .unwrap_or_else(|| "pcf_dcp_testvector.bin".to_string()); + + let image = build_reference_vector().expect("build reference vector"); + std::fs::write(&path, &image).expect("write file"); + + // It is a conforming PCF v1.0 file ... + let mut pcf = Container::open(Cursor::new(image.clone())).expect("pcf open"); + pcf.verify().expect("pcf verify"); + + // ... and a conforming DCP file. 
+ let mut dcp = DcpReader::open(Cursor::new(image.clone())).expect("dcp open"); + dcp.verify().expect("dcp verify"); + + eprintln!("wrote {} ({} bytes)", path, image.len()); + for c in dcp.containers().expect("containers") { + let arena = dcp.open_arena(&c).expect("arena"); + eprintln!( + " container {:<6} type=0x{:08X} used={} inners={}", + c.label_string().unwrap_or_default(), + c.partition_type, + c.used_bytes, + arena.len() + ); + for info in arena.inners() { + let n = info.data_hash_algo.digest_len(); + let hex: String = info.data_hash[..n] + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + let shared = info.extents.iter().filter(|e| e.shared).count(); + eprintln!( + " inner {:<3} type=0x{:08X} used={} extents={} shared={} data_hash={}", + info.label, + info.partition_type, + info.used_bytes, + info.extents.len(), + shared, + hex + ); + } + } +} diff --git a/reference/PCF-DCP-v1.0/src/arena.rs b/reference/PCF-DCP-v1.0/src/arena.rs new file mode 100644 index 0000000..9a2e232 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/arena.rs @@ -0,0 +1,881 @@ +//! The DCP arena: the in-memory model of one DCP container and its canonical +//! byte serialisation. +//! +//! An [`Arena`] holds a byte pool (`blob`) plus a list of inner partitions, +//! each of which owns a list of [`Frag`]s. A `Frag` addresses a byte range in +//! the pool; two `Frag`s addressing the *same* range share that extent +//! (deduplication, spec Section 10.2). Editing operations (append, overwrite, +//! insert, delete, truncate) work purely on the fragment list and append new +//! bytes to the pool, never overwriting bytes a `SHARED` extent still names +//! (copy-on-write, spec Section 10.1). +//! +//! [`Arena::to_bytes`] always emits the *canonical* layout used by the spec's +//! test vector (Section 17): `DCP Header || data extents || Fragment Tables || +//! Inner Table Block(s)`, with each distinct extent emitted exactly once. 
+ +use std::collections::HashMap; + +use pcf::{ + compute_table_hash, decode_label, encode_label, HashAlgo, PartitionEntry, TableBlockHeader, + ENTRY_SIZE, NIL_UID, TABLE_HEADER_SIZE, UID_SIZE, +}; + +use crate::consts::*; +use crate::error::{Error, Result}; +use crate::fragment::{walk_fragment_table, FragTableHeader, FragmentEntry}; +use crate::header::{read_header, DcpHeader}; + +/// How a Writer splits an inner partition's content into extents +/// (spec Section 10.2; chunking is writer-side policy). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Chunker { + /// One extent for the whole content. + Whole, + /// Fixed-size chunks of `n` bytes (the final chunk may be shorter). `n == 0` + /// is treated as [`Chunker::Whole`]. + Fixed(usize), +} + +impl Chunker { + fn split<'a>(&self, content: &'a [u8]) -> Vec<&'a [u8]> { + match *self { + Chunker::Whole => { + if content.is_empty() { + Vec::new() + } else { + vec![content] + } + } + Chunker::Fixed(0) => Chunker::Whole.split(content), + Chunker::Fixed(n) => content.chunks(n).collect(), + } + } +} + +/// One extent reference inside an inner partition. `offset`/`length` address +/// [`Arena::blob`]; `shared` is the on-disk SHARED flag (bit 0 of `flags`). +#[derive(Debug, Clone, Copy)] +struct Frag { + offset: u64, + length: u64, + kind: u8, + shared: bool, +} + +/// One inner partition. 
+#[derive(Debug, Clone)] +struct Inner { + partition_type: u32, + uid: [u8; UID_SIZE], + label: [u8; 32], + data_hash_algo: HashAlgo, + frags: Vec, +} + +impl Inner { + fn logical_len(&self) -> u64 { + self.frags + .iter() + .filter(|f| f.kind == KIND_DATA) + .map(|f| f.length) + .sum() + } + + fn content(&self, blob: &[u8]) -> Vec { + let mut out = Vec::with_capacity(self.logical_len() as usize); + for f in &self.frags { + if f.kind == KIND_DATA { + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + out.extend_from_slice(&blob[a..b]); + } + } + out + } + + fn data_hash(&self, blob: &[u8]) -> [u8; 64] { + self.data_hash_algo.compute(&self.content(blob)) + } +} + +/// A read-only view of one extent, for tooling (`dcp info`, tests). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct ExtentInfo { + /// Arena/pool-relative offset of the extent. + pub extent_offset: u64, + /// Length of the extent in bytes. + pub extent_length: u64, + /// Extent kind (`1` = DATA). + pub kind: u8, + /// Whether the SHARED flag is set. + pub shared: bool, +} + +/// A read-only view of one inner partition, for tooling and verification. +#[derive(Debug, Clone)] +pub struct InnerInfo { + /// Application partition type. + pub partition_type: u32, + /// 16-byte uid (unique file-wide). + pub uid: [u8; UID_SIZE], + /// Decoded label. + pub label: String, + /// Logical content length (= `used_bytes`). + pub used_bytes: u64, + /// Hash algorithm protecting the logical content. + pub data_hash_algo: HashAlgo, + /// The 64-byte data-hash field over the logical content. + pub data_hash: [u8; 64], + /// The partition's extents in logical order. + pub extents: Vec, +} + +/// The in-memory model of one DCP container. 
+#[derive(Debug, Clone)] +pub struct Arena { + profile_version_major: u8, + profile_version_minor: u8, + flags: u16, + inner_table_algo: HashAlgo, + blob: Vec, + inners: Vec, +} + +impl Default for Arena { + fn default() -> Self { + Self::new() + } +} + +impl Arena { + // ---- construction ----------------------------------------------------- + + /// A fresh, empty arena (profile v1.0, SHA-256 inner table hashing). + pub fn new() -> Self { + Arena { + profile_version_major: PROFILE_VERSION_MAJOR, + profile_version_minor: PROFILE_VERSION_MINOR, + flags: 0, + inner_table_algo: HashAlgo::Sha256, + blob: Vec::new(), + inners: Vec::new(), + } + } + + /// Choose the hash algorithm used for inner Table Blocks (default + /// SHA-256). A Writer SHOULD keep this cryptographic (spec Section 9.2). + pub fn with_inner_table_algo(mut self, algo: HashAlgo) -> Self { + self.inner_table_algo = algo; + self + } + + /// Parse an arena from its on-disk bytes (spec Sections 6–8). The byte + /// pool is the arena itself, so every parsed extent offset is + /// arena-relative and indexes directly into it. + pub fn parse(bytes: &[u8]) -> Result { + let header = read_header(bytes)?; + if header.profile_version_major != PROFILE_VERSION_MAJOR { + return Err(Error::UnsupportedProfileMajor(header.profile_version_major)); + } + let arena_used = header.arena_used; + + let mut inners = Vec::new(); + let mut inner_table_algo = HashAlgo::Sha256; + let mut first_block = true; + let mut off = header.inner_table_offset; + let mut budget = bytes.len() / TABLE_HEADER_SIZE as usize + 1; + while off != ARENA_NONE { + if budget == 0 { + return Err(Error::OffsetOutOfRange); + } + budget -= 1; + let base = off as usize; + let hb: [u8; 74] = bytes + .get(base..base + TABLE_HEADER_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? 
+ .try_into() + .unwrap(); + let h = TableBlockHeader::from_bytes(&hb)?; + if first_block { + inner_table_algo = h.table_hash_algo; + first_block = false; + } + for i in 0..h.partition_count as u64 { + let eo = base + TABLE_HEADER_SIZE as usize + (i * ENTRY_SIZE) as usize; + let eb: [u8; 141] = bytes + .get(eo..eo + ENTRY_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + let entry = PartitionEntry::from_bytes(&eb)?; + let on_disk = walk_fragment_table(bytes, entry.start_offset)?; + let frags = on_disk + .iter() + .map(|fe| Frag { + offset: fe.extent_offset, + length: fe.extent_length, + kind: fe.kind, + shared: fe.is_shared(), + }) + .collect(); + inners.push(Inner { + partition_type: entry.partition_type, + uid: entry.uid, + label: entry.label, + data_hash_algo: entry.data_hash_algo, + frags, + }); + } + off = h.next_table_offset; + } + + let blob = bytes.to_vec(); + let arena = Arena { + profile_version_major: header.profile_version_major, + profile_version_minor: header.profile_version_minor, + flags: header.flags, + inner_table_algo, + blob, + inners, + }; + // Bound every extent by the declared arena_used. + for inner in &arena.inners { + for f in &inner.frags { + let end = f + .offset + .checked_add(f.length) + .ok_or(Error::OffsetOutOfRange)?; + if end > arena_used { + return Err(Error::OffsetOutOfRange); + } + } + } + Ok(arena) + } + + // ---- read-only views -------------------------------------------------- + + /// Number of inner partitions. + pub fn len(&self) -> usize { + self.inners.len() + } + + /// Whether the arena has no inner partitions. + pub fn is_empty(&self) -> bool { + self.inners.is_empty() + } + + /// The uids of all inner partitions, in stored order. 
+ pub fn uids(&self) -> Vec<[u8; UID_SIZE]> { + self.inners.iter().map(|i| i.uid).collect() + } + + fn index_of(&self, uid: &[u8; UID_SIZE]) -> Result { + self.inners + .iter() + .position(|i| &i.uid == uid) + .ok_or(Error::NotFound) + } + + /// A read-only view of one inner partition. + pub fn inner_info(&self, uid: &[u8; UID_SIZE]) -> Result { + let inner = &self.inners[self.index_of(uid)?]; + Ok(self.view(inner)) + } + + /// Read-only views of every inner partition, in stored order. + pub fn inners(&self) -> Vec { + self.inners.iter().map(|i| self.view(i)).collect() + } + + fn view(&self, inner: &Inner) -> InnerInfo { + InnerInfo { + partition_type: inner.partition_type, + uid: inner.uid, + label: decode_label(&inner.label).unwrap_or_default(), + used_bytes: inner.logical_len(), + data_hash_algo: inner.data_hash_algo, + data_hash: inner.data_hash(&self.blob), + extents: inner + .frags + .iter() + .map(|f| ExtentInfo { + extent_offset: f.offset, + extent_length: f.length, + kind: f.kind, + shared: f.shared, + }) + .collect(), + } + } + + /// Reconstruct an inner partition's logical content (spec Section 8.3), + /// checking its length and (when algorithmic) its stored data hash. + pub fn content(&self, uid: &[u8; UID_SIZE]) -> Result> { + let inner = &self.inners[self.index_of(uid)?]; + let bytes = inner.content(&self.blob); + let declared = inner.logical_len(); + if bytes.len() as u64 != declared { + return Err(Error::LengthMismatch { + expected: declared, + got: bytes.len() as u64, + }); + } + Ok(bytes) + } + + // ---- builder ---------------------------------------------------------- + + /// Add an inner partition whose `content` is split by `chunker` into + /// extents, deduplicating against extents already present (spec Section + /// 10.2). Sharing sets the SHARED flag on the new and aliased entries + /// (rule F1, spec Section 8.4). 
+ #[allow(clippy::too_many_arguments)] + pub fn add_inner( + &mut self, + partition_type: u32, + uid: [u8; UID_SIZE], + label: &str, + content: &[u8], + data_hash_algo: HashAlgo, + chunker: Chunker, + ) -> Result<()> { + if partition_type == 0 { + return Err(Error::ReservedType); + } + if partition_type == DCP_CONTAINER_TYPE { + return Err(Error::NestedContainer); + } + if uid == NIL_UID { + return Err(Error::NilUid); + } + if self.inners.iter().any(|i| i.uid == uid) { + return Err(Error::DuplicateUid); + } + let label = encode_label(label).map_err(Error::Pcf)?; + + let mut frags: Vec = Vec::new(); + for chunk in chunker.split(content) { + // Deduplicate against extents already present in other inner + // partitions AND against earlier chunks of this same partition. + let hit = self + .find_extent(chunk) + .or_else(|| find_local(&self.blob, &frags, chunk)); + match hit { + Some((offset, length)) => { + self.mark_shared(offset, length); + for f in &mut frags { + if f.offset == offset && f.length == length { + f.shared = true; + } + } + frags.push(Frag { + offset, + length, + kind: KIND_DATA, + shared: true, + }); + } + None => { + let offset = self.blob.len() as u64; + self.blob.extend_from_slice(chunk); + frags.push(Frag { + offset, + length: chunk.len() as u64, + kind: KIND_DATA, + shared: false, + }); + } + } + } + self.inners.push(Inner { + partition_type, + uid, + label, + data_hash_algo, + frags, + }); + Ok(()) + } + + /// Find an existing DATA extent whose bytes equal `chunk`, returning its + /// `(offset, length)`. Realises content-defined sharing for `add_inner` + /// and `dedup`. 
+ fn find_extent(&self, chunk: &[u8]) -> Option<(u64, u64)> { + if chunk.is_empty() { + return None; + } + for inner in &self.inners { + for f in &inner.frags { + if f.kind == KIND_DATA && f.length == chunk.len() as u64 { + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + if &self.blob[a..b] == chunk { + return Some((f.offset, f.length)); + } + } + } + } + None + } + + /// Set the SHARED flag on every live fragment that references exactly the + /// `(offset, length)` extent (rule F1). + fn mark_shared(&mut self, offset: u64, length: u64) { + for inner in &mut self.inners { + for f in &mut inner.frags { + if f.offset == offset && f.length == length { + f.shared = true; + } + } + } + } + + // ---- logical edits (copy-on-write) ------------------------------------ + + /// Append `bytes` to the end of an inner partition's logical content. + pub fn append(&mut self, uid: &[u8; UID_SIZE], bytes: &[u8]) -> Result<()> { + let idx = self.index_of(uid)?; + if bytes.is_empty() { + return Ok(()); + } + let offset = self.blob.len() as u64; + self.blob.extend_from_slice(bytes); + self.inners[idx].frags.push(Frag { + offset, + length: bytes.len() as u64, + kind: KIND_DATA, + shared: false, + }); + Ok(()) + } + + /// Overwrite the logical range `[pos, pos+len)` with `bytes` (which need not + /// be the same length: this is delete-then-insert). The replaced bytes go + /// into a fresh private extent, leaving any SHARED bytes untouched. + pub fn overwrite( + &mut self, + uid: &[u8; UID_SIZE], + pos: u64, + len: u64, + bytes: &[u8], + ) -> Result<()> { + self.delete(uid, pos, len)?; + self.insert(uid, pos, bytes) + } + + /// Insert `bytes` at logical position `pos` (`pos == content length` + /// appends). The new bytes form a fresh private extent. 
+ pub fn insert(&mut self, uid: &[u8; UID_SIZE], pos: u64, bytes: &[u8]) -> Result<()> { + let idx = self.index_of(uid)?; + let total = self.inners[idx].logical_len(); + if pos > total { + return Err(Error::PositionOutOfRange); + } + if bytes.is_empty() { + return Ok(()); + } + let split = self.split_at(idx, pos); + let offset = self.blob.len() as u64; + self.blob.extend_from_slice(bytes); + self.inners[idx].frags.insert( + split, + Frag { + offset, + length: bytes.len() as u64, + kind: KIND_DATA, + shared: false, + }, + ); + Ok(()) + } + + /// Delete the logical range `[pos, pos+len)`, dropping the covered + /// fragments without moving any bytes (spec Section 10.1). + pub fn delete(&mut self, uid: &[u8; UID_SIZE], pos: u64, len: u64) -> Result<()> { + let idx = self.index_of(uid)?; + let total = self.inners[idx].logical_len(); + let end = pos.checked_add(len).ok_or(Error::PositionOutOfRange)?; + if end > total { + return Err(Error::PositionOutOfRange); + } + if len == 0 { + return Ok(()); + } + let lo = self.split_at(idx, pos); + let hi = self.split_at(idx, end); + self.inners[idx].frags.drain(lo..hi); + Ok(()) + } + + /// Truncate the partition's logical content to `new_len` bytes. + pub fn truncate(&mut self, uid: &[u8; UID_SIZE], new_len: u64) -> Result<()> { + let idx = self.index_of(uid)?; + let total = self.inners[idx].logical_len(); + if new_len > total { + return Err(Error::PositionOutOfRange); + } + let cut = self.split_at(idx, new_len); + self.inners[idx].frags.truncate(cut); + Ok(()) + } + + /// Ensure a fragment boundary exists at logical position `pos` in inner + /// `idx`, splitting the straddling fragment if needed. Returns the index of + /// the first fragment at-or-after `pos`. Splitting never copies bytes: both + /// halves keep the parent's `shared` flag and address the same pool bytes. 
+ fn split_at(&mut self, idx: usize, pos: u64) -> usize { + let frags = &mut self.inners[idx].frags; + let mut logical = 0u64; + let mut i = 0; + while i < frags.len() { + let flen = frags[i].length; + if logical == pos { + return i; + } + if pos < logical + flen { + // Split fragment i at (pos - logical). + let head = pos - logical; + let f = frags[i]; + let left = Frag { + offset: f.offset, + length: head, + kind: f.kind, + shared: f.shared, + }; + let right = Frag { + offset: f.offset + head, + length: flen - head, + kind: f.kind, + shared: f.shared, + }; + frags[i] = left; + frags.insert(i + 1, right); + return i + 1; + } + logical += flen; + i += 1; + } + frags.len() + } + + // ---- promotion support ------------------------------------------------ + + /// Remove an inner partition, returning the pieces a promotion needs: its + /// type, label, hash algorithm, and reconstructed logical content. The uid + /// is the caller's; the data hash is recomputed from the content (and is, + /// by construction, identical to the inner entry's — the promotion + /// invariant, spec Section 10.4). + pub fn remove_inner( + &mut self, + uid: &[u8; UID_SIZE], + ) -> Result<(u32, String, HashAlgo, Vec)> { + let idx = self.index_of(uid)?; + let content = self.content(uid)?; + let inner = self.inners.remove(idx); + let label = decode_label(&inner.label).unwrap_or_default(); + Ok((inner.partition_type, label, inner.data_hash_algo, content)) + } + + // ---- deduplication and compaction ------------------------------------- + + /// Re-chunk every inner partition with `chunker` and deduplicate identical + /// extents across the whole arena (spec Section 10.2). Logical content and + /// every `data_hash` are preserved. Returns the number of bytes the pool + /// shrank by once re-serialised (an estimate of dedup savings). 
+ pub fn dedup(&mut self, chunker: Chunker) -> u64 { + let before = self.canonical_extent_bytes(); + // Rebuild the pool from each partition's logical content, re-chunking + // and sharing identical chunks. A fresh arena guarantees a clean pool. + let mut rebuilt = Arena { + profile_version_major: self.profile_version_major, + profile_version_minor: self.profile_version_minor, + flags: self.flags, + inner_table_algo: self.inner_table_algo, + blob: Vec::new(), + inners: Vec::new(), + }; + for inner in &self.inners { + let content = inner.content(&self.blob); + // add_inner cannot fail here: inputs already passed validation. + let _ = rebuilt.add_inner( + inner.partition_type, + inner.uid, + &decode_label(&inner.label).unwrap_or_default(), + &content, + inner.data_hash_algo, + chunker, + ); + } + *self = rebuilt; + let after = self.canonical_extent_bytes(); + before.saturating_sub(after) + } + + /// Compact the arena (spec Section 10.3): drop unreferenced pool bytes and + /// normalise the SHARED flag, clearing it on any extent now referenced + /// exactly once (rule F2). Returns the number of dead pool bytes reclaimed. + pub fn compact(&mut self) -> u64 { + // Reference count by distinct (offset, length) extent. + let mut refcount: HashMap<(u64, u64), u32> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + *refcount.entry((f.offset, f.length)).or_insert(0) += 1; + } + } + // Normalise SHARED: an extent referenced once is private again. + for inner in &mut self.inners { + for f in &mut inner.frags { + let rc = refcount[&(f.offset, f.length)]; + if rc <= 1 { + f.shared = false; + } + } + } + // Sweep: copy each distinct live extent once into a fresh pool, in + // first-reference order, and rewrite offsets. 
+ let dead_before = self.blob.len() as u64 - self.live_extent_bytes(&refcount); + let mut newpool: Vec = Vec::new(); + let mut remap: HashMap<(u64, u64), u64> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + remap.entry((f.offset, f.length)).or_insert_with(|| { + let at = newpool.len() as u64; + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + newpool.extend_from_slice(&self.blob[a..b]); + at + }); + } + } + for inner in &mut self.inners { + for f in &mut inner.frags { + f.offset = remap[&(f.offset, f.length)]; + } + } + self.blob = newpool; + dead_before + } + + fn live_extent_bytes(&self, refcount: &HashMap<(u64, u64), u32>) -> u64 { + refcount + .keys() + .map(|&(_, len)| len) + .sum::() + .min(self.blob.len() as u64) + } + + /// Total bytes of the distinct extents that [`Self::to_bytes`] would emit. + fn canonical_extent_bytes(&self) -> u64 { + let mut seen: HashMap<(u64, u64), ()> = HashMap::new(); + let mut total = 0u64; + for inner in &self.inners { + for f in &inner.frags { + if seen.insert((f.offset, f.length), ()).is_none() { + total += f.length; + } + } + } + total + } + + // ---- canonical serialisation ------------------------------------------ + + /// Serialise the arena into its canonical on-disk layout (spec Section 17): + /// `DCP Header || data extents || Fragment Tables || Inner Table Block(s)`, + /// each distinct extent emitted once. The returned bytes are a complete DCP + /// arena ready to become a PCF partition's data. + pub fn to_bytes(&self) -> Vec { + // --- 1. distinct extents, first-reference order -------------------- + let mut ext_order: Vec<(u64, u64)> = Vec::new(); + let mut ext_index: HashMap<(u64, u64), usize> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + let key = (f.offset, f.length); + ext_index.entry(key).or_insert_with(|| { + ext_order.push(key); + ext_order.len() - 1 + }); + } + } + + // --- 2. 
lay out extents right after the header --------------------- + let mut cur = DCP_HEADER_SIZE; + let mut ext_arena_off: Vec = Vec::with_capacity(ext_order.len()); + for &(_, len) in &ext_order { + ext_arena_off.push(cur); + cur += len; + } + + // --- 3. Fragment Tables (one chain per inner) ---------------------- + let mut frag_off: Vec = Vec::with_capacity(self.inners.len()); + for inner in &self.inners { + frag_off.push(cur); + cur += fragtable_span(inner.frags.len()); + } + + // --- 4. Inner Table Block(s) --------------------------------------- + let inner_table_offset = cur; + let counts = block_counts(self.inners.len()); + let mut block_off: Vec = Vec::with_capacity(counts.len()); + for &c in &counts { + block_off.push(cur); + cur += TABLE_HEADER_SIZE + c as u64 * ENTRY_SIZE; + } + let arena_used = cur; + + // --- serialise into a zeroed buffer -------------------------------- + let mut buf = vec![0u8; arena_used as usize]; + + let header = DcpHeader { + profile_version_major: self.profile_version_major, + profile_version_minor: self.profile_version_minor, + flags: self.flags, + inner_table_offset, + arena_used, + }; + buf[0..24].copy_from_slice(&header.to_bytes()); + + for (i, &(boff, len)) in ext_order.iter().enumerate() { + let dst = ext_arena_off[i] as usize; + let (a, b) = (boff as usize, (boff + len) as usize); + buf[dst..dst + len as usize].copy_from_slice(&self.blob[a..b]); + } + + for (ii, inner) in self.inners.iter().enumerate() { + write_fragment_table( + &mut buf, + frag_off[ii], + &inner.frags, + &ext_index, + &ext_arena_off, + ); + } + + let entries: Vec = self + .inners + .iter() + .enumerate() + .map(|(ii, inner)| { + let used = inner.logical_len(); + PartitionEntry { + partition_type: inner.partition_type, + uid: inner.uid, + label: inner.label, + start_offset: frag_off[ii], + max_length: used, + used_bytes: used, + data_hash_algo: inner.data_hash_algo, + data_hash: inner.data_hash(&self.blob), + } + }) + .collect(); + + let mut idx = 0usize; 
+ for (b, &c) in counts.iter().enumerate() { + let next = if b + 1 < counts.len() { + block_off[b + 1] + } else { + 0 + }; + let slice = &entries[idx..idx + c]; + let th = compute_table_hash(self.inner_table_algo, next, slice); + let bh = TableBlockHeader { + partition_count: c as u8, + next_table_offset: next, + table_hash_algo: self.inner_table_algo, + table_hash: th, + }; + let bo = block_off[b] as usize; + buf[bo..bo + 74].copy_from_slice(&bh.to_bytes()); + for (j, e) in slice.iter().enumerate() { + let eo = bo + 74 + j * ENTRY_SIZE as usize; + buf[eo..eo + ENTRY_SIZE as usize].copy_from_slice(&e.to_bytes()); + } + idx += c; + } + + buf + } +} + +/// Find an extent among `frags` whose pool bytes equal `chunk`, for +/// intra-partition deduplication while a partition is being built. +fn find_local(blob: &[u8], frags: &[Frag], chunk: &[u8]) -> Option<(u64, u64)> { + if chunk.is_empty() { + return None; + } + for f in frags { + if f.kind == KIND_DATA && f.length == chunk.len() as u64 { + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + if &blob[a..b] == chunk { + return Some((f.offset, f.length)); + } + } + } + None +} + +/// On-disk span of an inner partition's Fragment Table chain holding `n` +/// extents, split into blocks of at most 255 entries. +fn fragtable_span(n: usize) -> u64 { + let mut span = 0u64; + for c in block_counts(n) { + span += FRAGTABLE_HEADER_SIZE + c as u64 * FRAGMENT_ENTRY_SIZE; + } + span +} + +/// Split `n` items into blocks of at most 255; always at least one block (an +/// empty block when `n == 0`). +fn block_counts(n: usize) -> Vec { + if n == 0 { + return vec![0]; + } + let mut out = Vec::new(); + let mut rem = n; + while rem > 0 { + let c = rem.min(MAX_ENTRIES_PER_BLOCK); + out.push(c); + rem -= c; + } + out +} + +/// Write one inner partition's Fragment Table chain at `start`. 
+fn write_fragment_table( + buf: &mut [u8], + start: u64, + frags: &[Frag], + ext_index: &HashMap<(u64, u64), usize>, + ext_arena_off: &[u64], +) { + let counts = block_counts(frags.len()); + let mut block_start = start; + let mut idx = 0usize; + for (b, &c) in counts.iter().enumerate() { + let span = FRAGTABLE_HEADER_SIZE + c as u64 * FRAGMENT_ENTRY_SIZE; + let next = if b + 1 < counts.len() { + block_start + span + } else { + 0 + }; + let bs = block_start as usize; + let fh = FragTableHeader { + next_fragtable_offset: next, + fragment_count: c as u8, + }; + buf[bs..bs + 9].copy_from_slice(&fh.to_bytes()); + for j in 0..c { + let f = &frags[idx + j]; + let arena_off = ext_arena_off[ext_index[&(f.offset, f.length)]]; + let fe = FragmentEntry { + extent_offset: arena_off, + extent_length: f.length, + kind: f.kind, + flags: if f.shared { FLAG_SHARED } else { 0 }, + }; + let eo = bs + 9 + j * FRAGMENT_ENTRY_SIZE as usize; + buf[eo..eo + FRAGMENT_ENTRY_SIZE as usize].copy_from_slice(&fe.to_bytes()); + } + block_start += span; + idx += c; + } +} diff --git a/reference/PCF-DCP-v1.0/src/bin/dcp.rs b/reference/PCF-DCP-v1.0/src/bin/dcp.rs new file mode 100644 index 0000000..0ace1ed --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/bin/dcp.rs @@ -0,0 +1,269 @@ +//! `dcp` — a small command-line tool for DCP containers. +//! +//! Subcommands (arguments parsed by hand, in the style of the other reference +//! tools): +//! +//! ```text +//! dcp info +//! dcp dedup [--fixed N] [--trailer] +//! dcp defrag [--trailer] +//! dcp promote [--trailer] +//! dcp demote [--trailer] +//! ``` +//! +//! UIDs are given as 32 hex digits (16 bytes), or as `0xNN` to mean a uid of 16 +//! identical bytes (e.g. `0xDC` = 16×0xDC), matching the test vector's notation. +//! Every mutating command rewrites the file and then re-verifies it. 
+ +use std::io::Cursor; +use std::process::ExitCode; + +use pcf_dcp::{Chunker, DcpReader, DcpWriter, UID_SIZE}; + +fn main() -> ExitCode { + let args: Vec = std::env::args().skip(1).collect(); + if args.is_empty() { + usage(); + return ExitCode::FAILURE; + } + let cmd = args[0].as_str(); + let rest = &args[1..]; + let result = match cmd { + "info" => cmd_info(rest), + "dedup" => cmd_dedup(rest), + "defrag" => cmd_defrag(rest), + "promote" => cmd_promote(rest), + "demote" => cmd_demote(rest), + "-h" | "--help" | "help" => { + usage(); + return ExitCode::SUCCESS; + } + other => Err(format!("unknown command '{other}'")), + }; + match result { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + eprintln!("dcp: {e}"); + ExitCode::FAILURE + } + } +} + +fn usage() { + eprintln!( + "usage:\n dcp info \n dcp dedup [--fixed N] [--trailer]\n \ + dcp defrag [--trailer]\n dcp promote [--trailer]\n \ + dcp demote [--trailer]" + ); +} + +// ---- commands ------------------------------------------------------------- + +fn cmd_info(args: &[String]) -> Result<(), String> { + let path = args.first().ok_or("info: missing ")?; + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut r = DcpReader::open(Cursor::new(bytes)).map_err(de)?; + r.verify().map_err(de)?; + let containers = r.containers().map_err(de)?; + println!("{}: {} DCP container(s)", path, containers.len()); + for c in containers { + let arena = r.open_arena(&c).map_err(de)?; + println!( + " container {} (uid {}) used={} inner={}", + c.label_string().unwrap_or_default(), + hex(&c.uid), + c.used_bytes, + arena.len() + ); + for info in arena.inners() { + let n = info.data_hash_algo.digest_len(); + let dh: String = info.data_hash[..n] + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + let shared = info.extents.iter().filter(|e| e.shared).count(); + println!( + " inner {} (uid {}) type=0x{:08X} used={} extents={} shared={} algo={:?} data_hash={}", + info.label, + hex(&info.uid), + 
info.partition_type, + info.used_bytes, + info.extents.len(), + shared, + info.data_hash_algo, + dh + ); + } + } + Ok(()) +} + +fn cmd_dedup(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 1)?; + let path = &opts.positional[0]; + let chunker = match opts.fixed { + Some(n) => Chunker::Fixed(n), + None => Chunker::Whole, + }; + let mut w = open_writer(path, opts.trailer)?; + let containers = container_uids(path)?; + let mut saved = 0u64; + for uid in &containers { + saved += w.dedup(uid, chunker).map_err(de)?; + } + commit(path, &w)?; + println!( + "deduplicated {} container(s); ~{} bytes saved", + containers.len(), + saved + ); + Ok(()) +} + +fn cmd_defrag(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 1)?; + let path = &opts.positional[0]; + let mut w = open_writer(path, opts.trailer)?; + let containers = container_uids(path)?; + let mut reclaimed = 0u64; + for uid in &containers { + reclaimed += w.defrag(uid).map_err(de)?; + } + commit(path, &w)?; + println!( + "defragmented {} container(s); {} dead bytes reclaimed", + containers.len(), + reclaimed + ); + Ok(()) +} + +fn cmd_promote(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 3)?; + let path = &opts.positional[0]; + let cuid = parse_uid(&opts.positional[1])?; + let iuid = parse_uid(&opts.positional[2])?; + let mut w = open_writer(path, opts.trailer)?; + w.promote(&cuid, &iuid).map_err(de)?; + commit(path, &w)?; + println!( + "promoted inner {} out of container {}", + hex(&iuid), + hex(&cuid) + ); + Ok(()) +} + +fn cmd_demote(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 3)?; + let path = &opts.positional[0]; + let puid = parse_uid(&opts.positional[1])?; + let cuid = parse_uid(&opts.positional[2])?; + let mut w = open_writer(path, opts.trailer)?; + w.demote(&puid, &cuid).map_err(de)?; + commit(path, &w)?; + println!( + "demoted partition {} into container {}", + hex(&puid), + hex(&cuid) + ); + Ok(()) +} + +// ---- 
helpers -------------------------------------------------------------- + +fn open_writer(path: &str, trailer: bool) -> Result { + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut w = DcpWriter::open(Cursor::new(bytes)).map_err(de)?; + w.set_trailer(trailer); + Ok(w) +} + +fn container_uids(path: &str) -> Result, String> { + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut r = DcpReader::open(Cursor::new(bytes)).map_err(de)?; + Ok(r.containers() + .map_err(de)? + .into_iter() + .map(|c| c.uid) + .collect()) +} + +fn commit(path: &str, w: &DcpWriter) -> Result<(), String> { + let image = w.to_image().map_err(de)?; + // Re-verify before overwriting the file on disk. + let mut r = DcpReader::open(Cursor::new(image.clone())).map_err(de)?; + r.verify().map_err(de)?; + std::fs::write(path, &image).map_err(|e| format!("write {path}: {e}"))?; + Ok(()) +} + +fn de(e: E) -> String { + e.to_string() +} + +fn hex(uid: &[u8; UID_SIZE]) -> String { + uid.iter().map(|b| format!("{b:02x}")).collect() +} + +/// Parse a uid: either 32 hex digits, or `0xNN` meaning 16 identical bytes. +fn parse_uid(s: &str) -> Result<[u8; UID_SIZE], String> { + if let Some(rest) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { + if rest.len() == 2 { + let b = u8::from_str_radix(rest, 16).map_err(|_| format!("bad uid byte '{s}'"))?; + return Ok([b; UID_SIZE]); + } + } + let clean: String = s + .chars() + .filter(|c| !c.is_whitespace() && *c != '-') + .collect(); + if clean.len() != 32 { + return Err(format!("uid '{s}' must be 32 hex digits or 0xNN")); + } + let mut uid = [0u8; UID_SIZE]; + for (i, byte) in uid.iter_mut().enumerate() { + *byte = u8::from_str_radix(&clean[i * 2..i * 2 + 2], 16) + .map_err(|_| format!("bad hex in uid '{s}'"))?; + } + Ok(uid) +} + +/// Parsed options common to the subcommands. 
+struct Opts { + positional: Vec, + fixed: Option, + trailer: bool, +} + +impl Opts { + fn parse(args: &[String], need: usize) -> Result { + let mut positional = Vec::new(); + let mut fixed = None; + let mut trailer = false; + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--trailer" => trailer = true, + "--fixed" => { + i += 1; + let n = args.get(i).ok_or("--fixed needs a value")?; + fixed = Some(n.parse().map_err(|_| format!("bad --fixed value '{n}'"))?); + } + other => positional.push(other.to_string()), + } + i += 1; + } + if positional.len() < need { + return Err(format!( + "expected {need} positional argument(s), got {}", + positional.len() + )); + } + Ok(Opts { + positional, + fixed, + trailer, + }) + } +} diff --git a/reference/PCF-DCP-v1.0/src/consts.rs b/reference/PCF-DCP-v1.0/src/consts.rs new file mode 100644 index 0000000..4173650 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/consts.rs @@ -0,0 +1,59 @@ +//! On-disk constants defined by PCF-DCP v1.0. +//! +//! Every value here is normative and corresponds directly to a figure in the +//! specification (`specs/PCF-DCP-spec-v1.0.txt`, Appendix A and B). + +/// PCF partition type carrying one DCP arena (spec Appendix B). A generic PCF +/// reader sees this as one opaque, typed partition. +pub const DCP_CONTAINER_TYPE: u32 = 0xAAAC_0001; + +/// First value of the block reserved by this profile for future partition +/// types (spec Appendix B). +pub const DCP_TYPE_RESERVED_LO: u32 = 0xAAAC_0000; + +/// Last value of the block reserved by this profile (spec Appendix B). +pub const DCP_TYPE_RESERVED_HI: u32 = 0xAAAC_00FF; + +/// 4-byte magic at the start of a DCP arena (spec Section 6): `"PDCP"`. +pub const DCP_MAGIC: [u8; 4] = [0x50, 0x44, 0x43, 0x50]; + +/// PCF-DCP profile version implemented by this crate (major, spec Section 14). +pub const PROFILE_VERSION_MAJOR: u8 = 1; + +/// PCF-DCP profile version implemented by this crate (minor, spec Section 14). 
+pub const PROFILE_VERSION_MINOR: u8 = 0; + +/// Fixed size of the DCP Header, in bytes (spec Section 6). +pub const DCP_HEADER_SIZE: u64 = 24; + +/// Fixed size of a Fragment Table block header, in bytes (spec Section 8.1). +pub const FRAGTABLE_HEADER_SIZE: u64 = 9; + +/// Fixed size of one Fragment Entry, in bytes (spec Section 8.2). +pub const FRAGMENT_ENTRY_SIZE: u64 = 18; + +/// Fragment Entry kind: RESERVED / INVALID guard (spec Section 8.2). MUST NOT +/// appear in a live entry. +pub const KIND_INVALID: u8 = 0; +/// Fragment Entry kind: DATA — literal content bytes (the only kind defined in +/// v1.0). +pub const KIND_DATA: u8 = 1; +/// Fragment Entry kind: HOLE (RESERVED for sparse content; MUST NOT be written +/// in v1.0). +pub const KIND_HOLE: u8 = 2; +/// Fragment Entry kind: REF (RESERVED for cross-container references; MUST NOT +/// be written in v1.0). +pub const KIND_REF: u8 = 3; + +/// Fragment Entry `flags` bit 0: SHARED — the extent's bytes MUST NOT be +/// overwritten in place; edits must be copy-on-write (spec Section 8.4). +pub const FLAG_SHARED: u8 = 0x01; + +/// The arena-relative offset value reserved as "none" / chain terminator +/// (spec Appendix B). +pub const ARENA_NONE: u64 = 0; + +/// Maximum number of entries a single (inner) Table Block can hold, and the +/// maximum number of Fragment Entries a single Fragment Table block can hold +/// (both counts are a `u8`). +pub const MAX_ENTRIES_PER_BLOCK: usize = 255; diff --git a/reference/PCF-DCP-v1.0/src/error.rs b/reference/PCF-DCP-v1.0/src/error.rs new file mode 100644 index 0000000..499991e --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/error.rs @@ -0,0 +1,101 @@ +//! Error type shared across the crate. + +use std::fmt; + +/// All ways a PCF-DCP operation can fail. +#[derive(Debug)] +pub enum Error { + /// Underlying PCF container error. + Pcf(pcf::Error), + /// Underlying I/O failure. 
+ Io(std::io::Error), + + // ----- Malformed arena (spec Sections 6, 8, 13) ------------------------ + /// The arena did not begin with the `"PDCP"` magic (spec Section 6). + BadDcpMagic, + /// The arena's `profile_version_major` is not implemented by this crate. + UnsupportedProfileMajor(u8), + /// A Fragment Entry carried a `kind` this version does not implement + /// (HOLE/REF/unknown), rendering the inner partition unreadable. + BadFragmentKind(u8), + /// An extent's `[offset, offset+length)` range escapes `[0, arena_used)`. + OffsetOutOfRange, + /// Reconstructed logical content length did not match the inner entry's + /// `used_bytes` (spec Section 8.3), or a stored data hash did not verify. + LengthMismatch { + /// The `used_bytes` the inner entry declared. + expected: u64, + /// The length actually reconstructed from the Fragment Table. + got: u64, + }, + /// A stored hash (inner `table_hash` or inner `data_hash`) did not verify. + HashMismatch, + + // ----- Logical-model violations (spec Sections 2.1, 7.2, 13) ----------- + /// No inner partition (or top-level partition) with the requested uid. + NotFound, + /// A uid is used by more than one partition file-wide (spec Section 2.1). + DuplicateUid, + /// An inner partition is itself a DCP container; nesting is forbidden in + /// v1.0 (spec Appendix B, "Nesting"). + NestedContainer, + /// A partition uid is the PCF NIL uid. + NilUid, + /// A partition type is the PCF reserved type `0x00000000`. + ReservedType, + /// A top-level partition expected to be a DCP container is not one. + NotADcpContainer, + /// A logical edit addressed a position beyond the partition's content. 
+ PositionOutOfRange, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Error::Pcf(e) => write!(f, "pcf error: {e}"), + Error::Io(e) => write!(f, "i/o error: {e}"), + Error::BadDcpMagic => write!(f, "arena does not begin with \"PDCP\" magic"), + Error::UnsupportedProfileMajor(v) => { + write!(f, "unsupported PCF-DCP profile major version {v}") + } + Error::BadFragmentKind(k) => write!(f, "unsupported fragment kind {k}"), + Error::OffsetOutOfRange => write!(f, "extent range escapes the arena"), + Error::LengthMismatch { expected, got } => { + write!(f, "logical length mismatch: expected {expected}, got {got}") + } + Error::HashMismatch => write!(f, "stored hash does not verify"), + Error::NotFound => write!(f, "no partition with that uid"), + Error::DuplicateUid => write!(f, "uid is not unique file-wide"), + Error::NestedContainer => write!(f, "an inner partition may not be a DCP container"), + Error::NilUid => write!(f, "uid is the NIL uid"), + Error::ReservedType => write!(f, "partition type is the reserved type 0x00000000"), + Error::NotADcpContainer => write!(f, "partition is not a DCP container"), + Error::PositionOutOfRange => write!(f, "logical position is past end of content"), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Error::Pcf(e) => Some(e), + Error::Io(e) => Some(e), + _ => None, + } + } +} + +impl From for Error { + fn from(e: pcf::Error) -> Self { + Error::Pcf(e) + } +} + +impl From for Error { + fn from(e: std::io::Error) -> Self { + Error::Io(e) + } +} + +/// Convenience alias. +pub type Result = std::result::Result; diff --git a/reference/PCF-DCP-v1.0/src/fragment.rs b/reference/PCF-DCP-v1.0/src/fragment.rs new file mode 100644 index 0000000..24ae32c --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/fragment.rs @@ -0,0 +1,166 @@ +//! 
The Fragment Table: its 9-byte block header and 18-byte entries +//! (spec Section 8). + +use crate::consts::{ + ARENA_NONE, FLAG_SHARED, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, KIND_DATA, +}; +use crate::error::{Error, Result}; + +/// One Fragment Entry: a single extent of an inner partition (spec Section 8.2). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FragmentEntry { + /// Arena-relative start of the extent's bytes. + pub extent_offset: u64, + /// Length of the extent in bytes. + pub extent_length: u64, + /// Extent kind (`1` = DATA; `0` invalid; `2`/`3` reserved). + pub kind: u8, + /// `flags` byte (bit 0 = SHARED; others reserved 0). + pub flags: u8, +} + +impl FragmentEntry { + /// Serialise to the on-disk 18-byte layout. + pub fn to_bytes(&self) -> [u8; 18] { + let mut b = [0u8; 18]; + b[0..8].copy_from_slice(&self.extent_offset.to_le_bytes()); + b[8..16].copy_from_slice(&self.extent_length.to_le_bytes()); + b[16] = self.kind; + b[17] = self.flags; + b + } + + /// Parse from the on-disk 18-byte layout. + pub fn from_bytes(b: &[u8; 18]) -> Self { + FragmentEntry { + extent_offset: u64::from_le_bytes(b[0..8].try_into().unwrap()), + extent_length: u64::from_le_bytes(b[8..16].try_into().unwrap()), + kind: b[16], + flags: b[17], + } + } + + /// Whether this entry's `kind` is DATA (the only v1.0 content kind). + pub fn is_data(&self) -> bool { + self.kind == KIND_DATA + } + + /// Whether the SHARED flag (bit 0) is set. + pub fn is_shared(&self) -> bool { + self.flags & FLAG_SHARED != 0 + } +} + +/// The 9-byte header that begins each Fragment Table block (spec Section 8.1). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FragTableHeader { + /// Arena-relative offset of the next Fragment Table block of this + /// partition, or 0 if this is the last block. + pub next_fragtable_offset: u64, + /// Number of Fragment Entries packed immediately after this header. 
+ pub fragment_count: u8, +} + +impl FragTableHeader { + /// Serialise to the on-disk 9-byte layout. + pub fn to_bytes(&self) -> [u8; 9] { + let mut b = [0u8; 9]; + b[0..8].copy_from_slice(&self.next_fragtable_offset.to_le_bytes()); + b[8] = self.fragment_count; + b + } + + /// Parse from the on-disk 9-byte layout. + pub fn from_bytes(b: &[u8; 9]) -> Self { + FragTableHeader { + next_fragtable_offset: u64::from_le_bytes(b[0..8].try_into().unwrap()), + fragment_count: b[8], + } + } +} + +/// Walk an inner partition's Fragment Table chain starting at arena-relative +/// `first_off`, returning its Fragment Entries in logical order across the +/// whole chain (spec Section 8.3). `first_off == 0` yields an empty list. +pub fn walk_fragment_table(arena: &[u8], first_off: u64) -> Result> { + let mut out = Vec::new(); + let mut off = first_off; + // A simple cycle guard: a well-formed chain only ever moves forward, but a + // corrupt file could loop. Bound the walk by the arena length. + let mut budget = arena.len() / FRAGTABLE_HEADER_SIZE as usize + 1; + while off != ARENA_NONE { + if budget == 0 { + return Err(Error::OffsetOutOfRange); + } + budget -= 1; + let base = off as usize; + let hb: [u8; 9] = arena + .get(base..base + FRAGTABLE_HEADER_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + let h = FragTableHeader::from_bytes(&hb); + let mut eo = base + FRAGTABLE_HEADER_SIZE as usize; + for _ in 0..h.fragment_count { + let eb: [u8; 18] = arena + .get(eo..eo + FRAGMENT_ENTRY_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + out.push(FragmentEntry::from_bytes(&eb)); + eo += FRAGMENT_ENTRY_SIZE as usize; + } + off = h.next_fragtable_offset; + } + Ok(out) +} + +/// Reconstruct the logical content of a partition from its Fragment Entries +/// (spec Section 8.3): concatenate the bytes of its DATA extents in order. 
+/// +/// `arena_used` bounds every extent range; a reserved (non-DATA) kind makes the +/// partition unreadable to a v1.0 reader (spec Section 8.2). +pub fn reconstruct(arena: &[u8], frags: &[FragmentEntry], arena_used: u64) -> Result> { + let mut out = Vec::new(); + for f in frags { + if !f.is_data() { + return Err(Error::BadFragmentKind(f.kind)); + } + let end = f + .extent_offset + .checked_add(f.extent_length) + .ok_or(Error::OffsetOutOfRange)?; + if end > arena_used || end > arena.len() as u64 { + return Err(Error::OffsetOutOfRange); + } + out.extend_from_slice(&arena[f.extent_offset as usize..end as usize]); + } + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn entry_roundtrip() { + let e = FragmentEntry { + extent_offset: 31, + extent_length: 6, + kind: KIND_DATA, + flags: FLAG_SHARED, + }; + assert_eq!(FragmentEntry::from_bytes(&e.to_bytes()), e); + assert!(e.is_data()); + assert!(e.is_shared()); + } + + #[test] + fn header_roundtrip() { + let h = FragTableHeader { + next_fragtable_offset: 0, + fragment_count: 2, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); + } +} diff --git a/reference/PCF-DCP-v1.0/src/header.rs b/reference/PCF-DCP-v1.0/src/header.rs new file mode 100644 index 0000000..07c4403 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/header.rs @@ -0,0 +1,83 @@ +//! The fixed 24-byte DCP Header at arena offset 0 (spec Section 6). + +use crate::consts::{DCP_HEADER_SIZE, DCP_MAGIC}; +use crate::error::{Error, Result}; + +/// Parsed DCP Header. All offsets it carries are arena-relative. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DcpHeader { + /// PCF-DCP profile major version (MUST be implemented by the reader). + pub profile_version_major: u8, + /// PCF-DCP profile minor version (a reader SHOULD accept a higher value). + pub profile_version_minor: u8, + /// Reserved; MUST be 0 in v1.0. 
+ pub flags: u16, + /// Arena-relative offset of the first Inner Table Block (0 = no inner + /// partitions). + pub inner_table_offset: u64, + /// Bump pointer: arena-relative offset of the first free byte. Every stored + /// structure and extent lies within `[0, arena_used)`. + pub arena_used: u64, +} + +impl DcpHeader { + /// Serialise to the on-disk 24-byte layout. + pub fn to_bytes(&self) -> [u8; 24] { + let mut b = [0u8; 24]; + b[0..4].copy_from_slice(&DCP_MAGIC); + b[4] = self.profile_version_major; + b[5] = self.profile_version_minor; + b[6..8].copy_from_slice(&self.flags.to_le_bytes()); + b[8..16].copy_from_slice(&self.inner_table_offset.to_le_bytes()); + b[16..24].copy_from_slice(&self.arena_used.to_le_bytes()); + b + } + + /// Parse from the on-disk 24-byte layout, validating the magic. + pub fn from_bytes(b: &[u8; 24]) -> Result { + if b[0..4] != DCP_MAGIC { + return Err(Error::BadDcpMagic); + } + Ok(DcpHeader { + profile_version_major: b[4], + profile_version_minor: b[5], + flags: u16::from_le_bytes([b[6], b[7]]), + inner_table_offset: u64::from_le_bytes(b[8..16].try_into().unwrap()), + arena_used: u64::from_le_bytes(b[16..24].try_into().unwrap()), + }) + } +} + +/// Read a DCP Header from the start of an arena byte slice. +pub(crate) fn read_header(arena: &[u8]) -> Result { + let fixed: [u8; 24] = arena + .get(0..DCP_HEADER_SIZE as usize) + .ok_or(Error::BadDcpMagic)? 
+ .try_into() + .unwrap(); + DcpHeader::from_bytes(&fixed) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn header_roundtrip() { + let h = DcpHeader { + profile_version_major: 1, + profile_version_minor: 0, + flags: 0, + inner_table_offset: 109, + arena_used: 465, + }; + assert_eq!(DcpHeader::from_bytes(&h.to_bytes()).unwrap(), h); + } + + #[test] + fn rejects_bad_magic() { + let mut b = [0u8; 24]; + b[0..4].copy_from_slice(b"XXXX"); + assert!(matches!(DcpHeader::from_bytes(&b), Err(Error::BadDcpMagic))); + } +} diff --git a/reference/PCF-DCP-v1.0/src/lib.rs b/reference/PCF-DCP-v1.0/src/lib.rs new file mode 100644 index 0000000..3710f4a --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/lib.rs @@ -0,0 +1,69 @@ +//! # `pcf-dcp` — PCF Dynamic Container Partition (reference implementation) +//! +//! This crate is the reference reader/writer for **PCF-DCP v1.0**, an +//! application-level profile that adds *dynamic*, fragmentable, dedup-friendly +//! sub-partitions to [PCF v1.0](../pcf/index.html) without changing the PCF +//! byte container. It mirrors the written specification +//! (`specs/PCF-DCP-spec-v1.0.txt`) field-for-field and favours auditability +//! over performance. +//! +//! ## Layout at a glance +//! +//! One new PCF partition type is defined: +//! +//! * **`DCP_CONTAINER`** (type `0xAAAC0001`) — a partition whose bytes are an +//! *arena*: a [`DcpHeader`], a chain of reused PCF Table Blocks listing +//! *inner* partitions, a [`FragmentEntry`] table per inner partition, and the +//! data extents those fragments name. +//! +//! Each inner partition's logical content is the concatenation of its DATA +//! extents (spec Section 8.3); its `data_hash` covers that logical content, so +//! fragmentation, deduplication, compaction, and promotion all leave the hash +//! (and any PCF-SIG signature over it) unchanged. +//! +//! A generic PCF reader sees a DCP file as one opaque, typed partition; only a +//! DCP-aware reader looks inside. 
A DCP file is always a conforming PCF v1.0 +//! file. +//! +//! ## Example +//! +//! ``` +//! use std::io::Cursor; +//! use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, HashAlgo}; +//! +//! // Build a container with two inner partitions that share an extent. +//! let mut arena = Arena::new(); +//! arena.add_inner(0x10, [0xA1; 16], "A", b"Hello, World!", HashAlgo::Sha256, Chunker::Fixed(7))?; +//! arena.add_inner(0x10, [0xB2; 16], "B", b"World!", HashAlgo::Sha256, Chunker::Whole)?; +//! +//! let mut w = DcpWriter::new(); +//! w.add_container([0xDC; 16], "dcp", arena)?; +//! let image = w.to_image()?; +//! +//! // Read it back: a valid PCF file whose inner content reconstructs exactly. +//! let mut r = DcpReader::open(Cursor::new(image))?; +//! r.verify()?; +//! assert_eq!(r.read_inner(&[0xB2; 16])?, b"World!"); +//! # Ok::<(), pcf_dcp::Error>(()) +//! ``` + +mod arena; +pub mod consts; +mod error; +mod fragment; +mod header; +mod reader; +mod vector; +mod writer; + +pub use arena::{Arena, Chunker, ExtentInfo, InnerInfo}; +pub use consts::*; +pub use error::{Error, Result}; +pub use fragment::{reconstruct, walk_fragment_table, FragTableHeader, FragmentEntry}; +pub use header::DcpHeader; +pub use reader::{DcpReader, InnerLocation, Resolved}; +pub use vector::build_reference_vector; +pub use writer::DcpWriter; + +// Re-export underlying PCF primitives used across the DCP API surface. +pub use pcf::{HashAlgo, PartitionEntry, UID_SIZE}; diff --git a/reference/PCF-DCP-v1.0/src/reader.rs b/reference/PCF-DCP-v1.0/src/reader.rs new file mode 100644 index 0000000..0f7296b --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/reader.rs @@ -0,0 +1,222 @@ +//! [`DcpReader`]: reading DCP containers from a PCF file. +//! +//! The reader works entirely through the high-level [`pcf::Container`] API +//! (`open`, `entries`, `read_partition_data`, `verify`). Because +//! `Container::open` resolves a file [`pcf::Trailer`] and exposes the table +//! 
head itself, a DCP file written in trailer mode (append-only host) reads +//! back transparently here — this code never assumes the header's +//! `partition_table_offset` is a real offset (spec Section 2, "Compatibility +//! with the PCF File Trailer"). + +use std::collections::HashSet; +use std::io::{Read, Seek, Write}; + +use pcf::{Container, PartitionEntry, UID_SIZE}; + +use crate::arena::{Arena, InnerInfo}; +use crate::consts::DCP_CONTAINER_TYPE; +use crate::error::{Error, Result}; + +/// An inner partition together with the container that holds it. +#[derive(Debug, Clone)] +pub struct InnerLocation { + /// uid of the enclosing DCP container partition. + pub container_uid: [u8; UID_SIZE], + /// The inner partition's metadata and extents. + pub info: InnerInfo, +} + +/// The result of resolving a uid against the flattened partition set +/// (top-level ∪ inner), per the Opt-B scope of spec Section 2.1. +#[derive(Debug, Clone)] +pub enum Resolved { + /// A top-level PCF partition. + TopLevel(PartitionEntry), + /// An inner partition inside a DCP container. + Inner(InnerLocation), +} + +/// A reader for DCP containers layered over a PCF file. +pub struct DcpReader { + container: Container, +} + +impl DcpReader { + /// Open a PCF file for DCP-aware reading. + pub fn open(storage: S) -> Result { + Ok(Self { + container: Container::open(storage)?, + }) + } + + /// Borrow the underlying PCF container (e.g. to inspect non-DCP + /// partitions). + pub fn container(&mut self) -> &mut Container { + &mut self.container + } + + /// All top-level entries, in chain order. + pub fn entries(&mut self) -> Result> { + Ok(self.container.entries()?) + } + + /// The top-level DCP container entries (`partition_type == + /// DCP_CONTAINER_TYPE`). + pub fn containers(&mut self) -> Result> { + Ok(self + .container + .entries()? + .into_iter() + .filter(|e| e.partition_type == DCP_CONTAINER_TYPE) + .collect()) + } + + /// Parse the arena of a DCP container entry. 
+ pub fn open_arena(&mut self, entry: &PartitionEntry) -> Result { + if entry.partition_type != DCP_CONTAINER_TYPE { + return Err(Error::NotADcpContainer); + } + let data = self.container.read_partition_data(entry)?; + Arena::parse(&data) + } + + /// Every inner partition across every DCP container, in file order. + pub fn inner_partitions(&mut self) -> Result> { + let mut out = Vec::new(); + for c in self.containers()? { + let arena = self.open_arena(&c)?; + for info in arena.inners() { + out.push(InnerLocation { + container_uid: c.uid, + info, + }); + } + } + Ok(out) + } + + /// Resolve a uid against the flattened set top-level ∪ inner (spec Section + /// 2.1). Top-level entries are checked first. + pub fn resolve_uid(&mut self, uid: &[u8; UID_SIZE]) -> Result { + if let Some(e) = self + .container + .entries()? + .into_iter() + .find(|e| &e.uid == uid) + { + return Ok(Resolved::TopLevel(e)); + } + for loc in self.inner_partitions()? { + if &loc.info.uid == uid { + return Ok(Resolved::Inner(loc)); + } + } + Err(Error::NotFound) + } + + /// Reconstruct an inner partition's logical content by uid, searching every + /// DCP container. + pub fn read_inner(&mut self, uid: &[u8; UID_SIZE]) -> Result> { + for c in self.containers()? { + let arena = self.open_arena(&c)?; + if arena.uids().iter().any(|u| u == uid) { + return arena.content(uid); + } + } + Err(Error::NotFound) + } + + /// Full DCP-aware verification: + /// + /// 1. PCF integrity (`Container::verify`): every table block and partition + /// data hash, and per-entry conformance. + /// 2. Per container: valid `"PDCP"` magic and supported profile major (via + /// `Arena::parse`), each inner Table Block's `table_hash` (recomputed by + /// `verify_inner_table_hashes`), reconstruction length and (when algorithmic) + /// `data_hash`, no nested container, and file-wide uid uniqueness. 
+ pub fn verify(&mut self) -> Result<()> { + self.container.verify()?; + + let mut seen: HashSet<[u8; UID_SIZE]> = HashSet::new(); + // Top-level uids participate in the file-wide namespace too. + for e in self.container.entries()? { + if !seen.insert(e.uid) { + return Err(Error::DuplicateUid); + } + } + + for c in self.containers()? { + // Verify the inner Table Block hashes the same way PCF does. + let data = self.container.read_partition_data(&c)?; + verify_inner_table_hashes(&data)?; + + let arena = Arena::parse(&data)?; + for info in arena.inners() { + if info.partition_type == DCP_CONTAINER_TYPE { + return Err(Error::NestedContainer); + } + if !seen.insert(info.uid) { + return Err(Error::DuplicateUid); + } + // Reconstruct and check length + data hash. + let content = arena.content(&info.uid)?; + if content.len() as u64 != info.used_bytes { + return Err(Error::LengthMismatch { + expected: info.used_bytes, + got: content.len() as u64, + }); + } + if !info.data_hash_algo.verify(&content, &info.data_hash) { + return Err(Error::HashMismatch); + } + } + } + Ok(()) + } +} + +/// Walk the inner Table Block chain in an arena and recompute each block's +/// `table_hash`, exactly as PCF does for the top-level table (spec Section +/// 9.2). The inner table is the primary integrity anchor for the inner entries +/// because the container's own PCF `data_hash_algo` is normally 0. +fn verify_inner_table_hashes(arena: &[u8]) -> Result<()> { + use pcf::{ + compute_table_hash, PartitionEntry, TableBlockHeader, ENTRY_SIZE, TABLE_HEADER_SIZE, + }; + + let header = crate::header::read_header(arena)?; + let mut off = header.inner_table_offset; + let mut budget = arena.len() / TABLE_HEADER_SIZE as usize + 1; + while off != 0 { + if budget == 0 { + return Err(Error::OffsetOutOfRange); + } + budget -= 1; + let base = off as usize; + let hb: [u8; 74] = arena + .get(base..base + TABLE_HEADER_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? 
+ .try_into() + .unwrap(); + let h = TableBlockHeader::from_bytes(&hb)?; + let mut entries = Vec::with_capacity(h.partition_count as usize); + for i in 0..h.partition_count as u64 { + let eo = base + TABLE_HEADER_SIZE as usize + (i * ENTRY_SIZE) as usize; + let eb: [u8; 141] = arena + .get(eo..eo + ENTRY_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + entries.push(PartitionEntry::from_bytes(&eb)?); + } + if h.table_hash_algo.verifies() { + let computed = compute_table_hash(h.table_hash_algo, h.next_table_offset, &entries); + let n = h.table_hash_algo.digest_len(); + if computed[..n] != h.table_hash[..n] { + return Err(Error::HashMismatch); + } + } + off = h.next_table_offset; + } + Ok(()) +} diff --git a/reference/PCF-DCP-v1.0/src/vector.rs b/reference/PCF-DCP-v1.0/src/vector.rs new file mode 100644 index 0000000..599db6f --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/vector.rs @@ -0,0 +1,43 @@ +//! The canonical PCF-DCP v1.0 test vector (spec Section 17). + +use pcf::HashAlgo; + +use crate::arena::{Arena, Chunker}; +use crate::error::Result; +use crate::writer::DcpWriter; + +/// Build the byte-exact 700-byte reference file from spec Section 17. +/// +/// The file is one DCP container ("dcp", uid 16×0xDC, unsealed) holding two +/// inner partitions: +/// +/// * **A** ("Hello, World!", 13 B) stored as two extents — `"Hello, "` (7 B, +/// private) and `"World!"` (6 B, shared) — via fixed-7 chunking. +/// * **B** ("World!", 6 B) stored as one extent that *deduplicates* onto A's +/// second extent; both references carry SHARED = 1. +/// +/// Building the same logical container and emitting the canonical layout MUST +/// reproduce these exact bytes. 
+pub fn build_reference_vector() -> Result> { + let mut arena = Arena::new(); + arena.add_inner( + 0x0000_0010, + [0xA1u8; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + )?; + arena.add_inner( + 0x0000_0010, + [0xB2u8; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + )?; + + let mut w = DcpWriter::new(); + w.add_container([0xDCu8; 16], "dcp", arena)?; + w.to_image() +} diff --git a/reference/PCF-DCP-v1.0/src/writer.rs b/reference/PCF-DCP-v1.0/src/writer.rs new file mode 100644 index 0000000..5a85eba --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/writer.rs @@ -0,0 +1,255 @@ +//! [`DcpWriter`]: building and rewriting PCF files that carry DCP containers. +//! +//! The writer keeps the whole file as an in-memory list of top-level partitions +//! (plain partitions and DCP containers) and emits a fresh, canonical PCF image +//! on demand. Every mutating operation — adding a container, promotion, +//! demotion, dedup, defrag — is a logical edit of that list followed by a +//! rebuild. This is deliberately simple and always correct for a reference +//! implementation; the resulting file is a fully conforming PCF v1.0 file. + +use std::io::{Cursor, Read, Seek, Write}; + +use pcf::{decode_label, Container, HashAlgo, UID_SIZE}; + +use crate::arena::{Arena, Chunker}; +use crate::consts::DCP_CONTAINER_TYPE; +use crate::error::{Error, Result}; + +/// The body of a top-level partition. +enum Body { + /// An ordinary partition's raw bytes. + Plain(Vec), + /// A DCP container's arena. + Container(Arena), +} + +/// One top-level partition. +struct TopPart { + partition_type: u32, + uid: [u8; UID_SIZE], + label: String, + data_hash_algo: HashAlgo, + body: Body, +} + +/// A writer that assembles a PCF file containing DCP containers. 
+pub struct DcpWriter { + parts: Vec, + table_hash_algo: HashAlgo, + trailer: bool, +} + +impl Default for DcpWriter { + fn default() -> Self { + Self::new() + } +} + +impl DcpWriter { + /// A new, empty writer (top-level table hashed with SHA-256). + pub fn new() -> Self { + DcpWriter { + parts: Vec::new(), + table_hash_algo: HashAlgo::Sha256, + trailer: false, + } + } + + /// Load an existing PCF file into the writer's model, classifying each + /// top-level partition as a plain partition or a DCP container. + pub fn open(storage: S) -> Result { + let mut c = Container::open(storage)?; + let mut parts = Vec::new(); + for e in c.entries()? { + let data = c.read_partition_data(&e)?; + let label = decode_label(&e.label).unwrap_or_default(); + let body = if e.partition_type == DCP_CONTAINER_TYPE { + Body::Container(Arena::parse(&data)?) + } else { + Body::Plain(data) + }; + parts.push(TopPart { + partition_type: e.partition_type, + uid: e.uid, + label, + data_hash_algo: e.data_hash_algo, + body, + }); + } + Ok(DcpWriter { + parts, + table_hash_algo: HashAlgo::Sha256, + trailer: false, + }) + } + + /// Finalise emitted images in trailer mode (append-only host). Off by + /// default; passes through to [`pcf::Container::finalize_with_trailer`]. + pub fn set_trailer(&mut self, on: bool) { + self.trailer = on; + } + + // ---- top-level construction ------------------------------------------- + + /// Add a DCP container partition holding `arena` (data hash algo 0, + /// unsealed; spec Section 9). + pub fn add_container(&mut self, uid: [u8; UID_SIZE], label: &str, arena: Arena) -> Result<()> { + self.ensure_unique(&uid)?; + self.parts.push(TopPart { + partition_type: DCP_CONTAINER_TYPE, + uid, + label: label.to_string(), + data_hash_algo: HashAlgo::None, + body: Body::Container(arena), + }); + Ok(()) + } + + /// Add an ordinary top-level partition. 
+ pub fn add_plain( + &mut self, + partition_type: u32, + uid: [u8; UID_SIZE], + label: &str, + data: Vec, + data_hash_algo: HashAlgo, + ) -> Result<()> { + self.ensure_unique(&uid)?; + self.parts.push(TopPart { + partition_type, + uid, + label: label.to_string(), + data_hash_algo, + body: Body::Plain(data), + }); + Ok(()) + } + + fn ensure_unique(&self, uid: &[u8; UID_SIZE]) -> Result<()> { + if self.parts.iter().any(|p| &p.uid == uid) { + return Err(Error::DuplicateUid); + } + Ok(()) + } + + fn container_mut(&mut self, uid: &[u8; UID_SIZE]) -> Result<&mut Arena> { + for p in &mut self.parts { + if &p.uid == uid { + return match &mut p.body { + Body::Container(a) => Ok(a), + Body::Plain(_) => Err(Error::NotADcpContainer), + }; + } + } + Err(Error::NotFound) + } + + /// Borrow a container's arena for inspection or in-place editing. + pub fn arena_mut(&mut self, container_uid: &[u8; UID_SIZE]) -> Result<&mut Arena> { + self.container_mut(container_uid) + } + + // ---- migration: promotion / demotion ---------------------------------- + + /// Promote an inner partition out of its DCP container to a top-level PCF + /// partition (dynamic → fixed), preserving uid, type, label, hash algorithm + /// and `data_hash` (the promotion invariant, spec Section 10.4). The inner + /// partition is removed from the arena (a MOVE, keeping uids unique). + pub fn promote( + &mut self, + container_uid: &[u8; UID_SIZE], + inner_uid: &[u8; UID_SIZE], + ) -> Result<()> { + let (ptype, label, algo, content) = { + let arena = self.container_mut(container_uid)?; + arena.remove_inner(inner_uid)? + }; + // The inner uid is now free file-wide; add it as a top-level partition. 
+ self.parts.push(TopPart { + partition_type: ptype, + uid: *inner_uid, + label, + data_hash_algo: algo, + body: Body::Plain(content), + }); + Ok(()) + } + + /// Demote a top-level partition into a DCP container as an inner partition + /// (fixed → dynamic), preserving uid, type, label, hash algorithm and + /// `data_hash`. The content becomes a single DATA extent. + pub fn demote( + &mut self, + part_uid: &[u8; UID_SIZE], + container_uid: &[u8; UID_SIZE], + ) -> Result<()> { + let pos = self + .parts + .iter() + .position(|p| &p.uid == part_uid) + .ok_or(Error::NotFound)?; + if self.parts[pos].partition_type == DCP_CONTAINER_TYPE { + return Err(Error::NestedContainer); + } + let (ptype, label, algo, content) = { + let p = &self.parts[pos]; + let content = match &p.body { + Body::Plain(b) => b.clone(), + Body::Container(_) => return Err(Error::NestedContainer), + }; + (p.partition_type, p.label.clone(), p.data_hash_algo, content) + }; + let arena = self.container_mut(container_uid)?; + arena.add_inner(ptype, *part_uid, &label, &content, algo, Chunker::Whole)?; + self.parts.remove(pos); + Ok(()) + } + + // ---- container-level maintenance -------------------------------------- + + /// Re-chunk and deduplicate a container's inner partitions (spec Section + /// 10.2). Returns estimated bytes saved. + pub fn dedup(&mut self, container_uid: &[u8; UID_SIZE], chunker: Chunker) -> Result { + Ok(self.container_mut(container_uid)?.dedup(chunker)) + } + + /// Compact / defragment a container's arena, reclaiming dead bytes and + /// normalising the SHARED flag (spec Section 10.3). Returns bytes reclaimed. + pub fn defrag(&mut self, container_uid: &[u8; UID_SIZE]) -> Result { + Ok(self.container_mut(container_uid)?.compact()) + } + + // ---- serialisation ---------------------------------------------------- + + /// Build a fresh, canonical PCF image of the whole file. 
The first table + /// block is sized to hold every partition (a single block, no overflow), + /// matching the spec's canonical test-vector layout. + pub fn to_image(&self) -> Result> { + let cap = self.parts.len().max(1) as u32; + let mut c = Container::create_with(Cursor::new(Vec::new()), cap, self.table_hash_algo)?; + for p in &self.parts { + let data = match &p.body { + Body::Plain(b) => b.clone(), + Body::Container(a) => a.to_bytes(), + }; + c.add_partition( + p.partition_type, + p.uid, + &p.label, + &data, + 0, + p.data_hash_algo, + )?; + } + if self.trailer { + c.finalize_with_trailer()?; + } + Ok(c.into_storage().into_inner()) + } + + /// Write the image to any [`Write`] sink. + pub fn write_to(&self, mut out: W) -> Result<()> { + out.write_all(&self.to_image()?)?; + Ok(()) + } +} diff --git a/reference/PCF-DCP-v1.0/testdata/canonical.bin b/reference/PCF-DCP-v1.0/testdata/canonical.bin new file mode 100644 index 0000000..834aea4 Binary files /dev/null and b/reference/PCF-DCP-v1.0/testdata/canonical.bin differ diff --git a/reference/PCF-DCP-v1.0/tests/coverage.rs b/reference/PCF-DCP-v1.0/tests/coverage.rs new file mode 100644 index 0000000..a6236ae --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/coverage.rs @@ -0,0 +1,228 @@ +//! Error paths and edge cases (spec Sections 8, 13). + +use std::io::Cursor; + +use pcf::HashAlgo; +use pcf_dcp::{ + build_reference_vector, Arena, Chunker, DcpReader, Error, FragTableHeader, FragmentEntry, +}; + +#[test] +fn bad_magic_is_rejected() { + let mut bytes = build_reference_vector().unwrap(); + // Corrupt the arena magic (file offset 0x00EB). + bytes[0xEB] = b'X'; + // The PCF layer is still valid; the DCP arena parse must fail. 
+ let mut c = pcf::Container::open(Cursor::new(bytes)).unwrap(); + let e = c.entries().unwrap().into_iter().next().unwrap(); + let data = c.read_partition_data(&e).unwrap(); + assert!(matches!(Arena::parse(&data), Err(Error::BadDcpMagic))); +} + +#[test] +fn unsupported_profile_major_is_rejected() { + let mut a = Arena::new(); + a.add_inner(0x10, [1; 16], "x", b"hi", HashAlgo::Sha256, Chunker::Whole) + .unwrap(); + let mut bytes = a.to_bytes(); + bytes[4] = 2; // profile_version_major + assert!(matches!( + Arena::parse(&bytes), + Err(Error::UnsupportedProfileMajor(2)) + )); +} + +#[test] +fn reserved_and_nil_and_nested_are_rejected() { + let mut a = Arena::new(); + assert!(matches!( + a.add_inner(0, [1; 16], "x", b"", HashAlgo::None, Chunker::Whole), + Err(Error::ReservedType) + )); + assert!(matches!( + a.add_inner( + 0xAAAC_0001, + [1; 16], + "x", + b"", + HashAlgo::None, + Chunker::Whole + ), + Err(Error::NestedContainer) + )); + assert!(matches!( + a.add_inner(0x10, [0; 16], "x", b"", HashAlgo::None, Chunker::Whole), + Err(Error::NilUid) + )); +} + +#[test] +fn duplicate_uid_within_arena_is_rejected() { + let mut a = Arena::new(); + a.add_inner(0x10, [1; 16], "x", b"a", HashAlgo::None, Chunker::Whole) + .unwrap(); + assert!(matches!( + a.add_inner(0x10, [1; 16], "y", b"b", HashAlgo::None, Chunker::Whole), + Err(Error::DuplicateUid) + )); +} + +#[test] +fn bad_fragment_kind_renders_partition_unreadable() { + // Hand-build a fragment entry with a reserved kind and walk it. 
+ let fe = FragmentEntry { + extent_offset: 24, + extent_length: 1, + kind: 2, // HOLE (reserved) + flags: 0, + }; + assert!(!fe.is_data()); + let frags = vec![fe]; + let arena = vec![0u8; 64]; + assert!(matches!( + pcf_dcp::reconstruct(&arena, &frags, 64), + Err(Error::BadFragmentKind(2)) + )); +} + +#[test] +fn offset_out_of_range_is_rejected() { + let fe = FragmentEntry { + extent_offset: 60, + extent_length: 100, // runs past arena_used + kind: 1, + flags: 0, + }; + assert!(matches!( + pcf_dcp::reconstruct(&[0u8; 64], &[fe], 64), + Err(Error::OffsetOutOfRange) + )); +} + +#[test] +fn empty_inner_is_allowed() { + let mut a = Arena::new(); + a.add_inner( + 0x10, + [1; 16], + "empty", + b"", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let info = a.inner_info(&[1; 16]).unwrap(); + assert_eq!(info.used_bytes, 0); + assert_eq!(info.extents.len(), 0); + assert_eq!(a.content(&[1; 16]).unwrap(), b""); + // Round-trips through serialise/parse. + let bytes = a.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.content(&[1; 16]).unwrap(), b""); +} + +#[test] +fn many_inners_chain_the_inner_table() { + // More than 255 inner partitions force a multi-block inner table. + let mut a = Arena::new(); + for i in 0..300u32 { + let mut uid = [0u8; 16]; + uid[0..4].copy_from_slice(&i.to_le_bytes()); + uid[15] = 1; // keep non-NIL even when i == 0 + a.add_inner( + 0x10, + uid, + "n", + &i.to_le_bytes(), + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + } + assert_eq!(a.len(), 300); + let bytes = a.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.len(), 300); + // Spot-check a late partition. + let mut uid = [0u8; 16]; + uid[0..4].copy_from_slice(&299u32.to_le_bytes()); + uid[15] = 1; + assert_eq!(parsed.content(&uid).unwrap(), 299u32.to_le_bytes()); + + // The whole thing is a valid PCF + DCP file. 
+ let mut w = pcf_dcp::DcpWriter::new(); + w.add_container([0xDC; 16], "big", a).unwrap(); + let image = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); +} + +#[test] +fn many_extents_chain_the_fragment_table() { + // More than 255 extents in one partition force a multi-block fragment table. + let mut a = Arena::new(); + let content = vec![0xAB; 300]; + a.add_inner( + 0x10, + [1; 16], + "frag", + &content, + HashAlgo::Sha256, + Chunker::Fixed(1), + ) + .unwrap(); + let info = a.inner_info(&[1; 16]).unwrap(); + // Fixed(1) with identical bytes deduplicates to a single shared extent, so + // assert the *logical* length instead, then force distinct extents. + assert_eq!(info.used_bytes, 300); + + let mut b = Arena::new(); + let distinct: Vec = (0..300u32).map(|i| i as u8).collect(); + // 300 distinct-ish single-byte chunks; some repeat (values wrap mod 256), + // but the fragment list still has 300 entries. + b.add_inner( + 0x10, + [2; 16], + "frag2", + &distinct, + HashAlgo::Sha256, + Chunker::Fixed(1), + ) + .unwrap(); + let bytes = b.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.content(&[2; 16]).unwrap(), distinct); +} + +#[test] +fn fragtable_header_count_bounds() { + let h = FragTableHeader { + next_fragtable_offset: 7, + fragment_count: 255, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); +} + +#[test] +fn verify_detects_global_uid_collision() { + // A top-level partition sharing a uid with an inner partition is a file-wide + // collision (spec Section 2.1). + let mut a = Arena::new(); + a.add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = pcf_dcp::DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", a).unwrap(); + // Add a top-level plain partition with the SAME uid as the inner one. 
+ w.add_plain(0x10, [0xB2; 16], "dup", b"x".to_vec(), HashAlgo::Sha256) + .unwrap(); + let image = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + assert!(matches!(r.verify(), Err(Error::DuplicateUid))); +} diff --git a/reference/PCF-DCP-v1.0/tests/roundtrip.rs b/reference/PCF-DCP-v1.0/tests/roundtrip.rs new file mode 100644 index 0000000..8b559de --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/roundtrip.rs @@ -0,0 +1,258 @@ +//! End-to-end round-trips: build, edit, dedup/defrag, promote/demote. + +use std::io::Cursor; + +use pcf::HashAlgo; +use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, Resolved}; + +fn build_two_inner_file() -> Vec { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", arena).unwrap(); + w.to_image().unwrap() +} + +#[test] +fn edits_reconstruct_correctly() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "f", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + + arena.append(&[1; 16], b"!!").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!!!"); + + arena.insert(&[1; 16], 5, b"XYZ").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HelloXYZ, World!!!"); + + arena.delete(&[1; 16], 5, 3).unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!!!"); + + arena.overwrite(&[1; 16], 0, 5, b"HOWDY").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HOWDY, World!!!"); + + arena.truncate(&[1; 16], 5).unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HOWDY"); +} + +#[test] +fn cow_does_not_disturb_shared_bytes() { + // A and B share "World!"; overwriting A's copy must not change B. 
+ let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + // Overwrite the "World!" part of A (logical [7,13)). + arena.overwrite(&[0xA1; 16], 7, 6, b"PLANET").unwrap(); + assert_eq!(arena.content(&[0xA1; 16]).unwrap(), b"Hello, PLANET"); + assert_eq!(arena.content(&[0xB2; 16]).unwrap(), b"World!"); +} + +#[test] +fn dedup_then_defrag_preserve_content() { + // Two inners with no initial sharing; dedup should fold the identical chunk. + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "A", + b"abcabc", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + arena + .add_inner( + 0x10, + [2; 16], + "B", + b"abcabc", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let h1 = arena.inner_info(&[1; 16]).unwrap().data_hash; + + let saved = arena.dedup(Chunker::Fixed(3)); + assert!(saved > 0, "identical chunks should dedup"); + // Content and hash unchanged. + assert_eq!(arena.content(&[1; 16]).unwrap(), b"abcabc"); + assert_eq!(arena.content(&[2; 16]).unwrap(), b"abcabc"); + assert_eq!(arena.inner_info(&[1; 16]).unwrap().data_hash, h1); + + arena.compact(); + assert_eq!(arena.content(&[2; 16]).unwrap(), b"abcabc"); +} + +#[test] +fn defrag_clears_shared_when_no_longer_aliased() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + // Remove B, so "World!" is referenced only by A now. 
+ arena.remove_inner(&[0xB2; 16]).unwrap(); + arena.compact(); + let a = arena.inner_info(&[0xA1; 16]).unwrap(); + assert!( + a.extents.iter().all(|e| !e.shared), + "F2: shared cleared at compaction" + ); + assert_eq!(arena.content(&[0xA1; 16]).unwrap(), b"Hello, World!"); +} + +#[test] +fn promote_preserves_uid_and_data_hash() { + let image = build_two_inner_file(); + let mut w = DcpWriter::open(Cursor::new(image)).unwrap(); + + // data_hash of inner B before promotion. + let before = { + let bytes = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(bytes)).unwrap(); + let inner = r + .inner_partitions() + .unwrap() + .into_iter() + .find(|l| l.info.uid == [0xB2; 16]) + .unwrap(); + inner.info.data_hash + }; + + w.promote(&[0xDC; 16], &[0xB2; 16]).unwrap(); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + match r.resolve_uid(&[0xB2; 16]).unwrap() { + Resolved::TopLevel(e) => { + assert_eq!(e.uid, [0xB2; 16]); + assert_eq!( + e.data_hash, before, + "promotion invariant: data_hash unchanged" + ); + assert_eq!(e.used_bytes, 6); + } + _ => panic!("B should now be top-level"), + } + // The remaining inner partition A still reconstructs as "Hello, World!". + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); +} + +#[test] +fn demote_then_promote_is_identity_for_content() { + let image = build_two_inner_file(); + let mut w = DcpWriter::open(Cursor::new(image)).unwrap(); + w.promote(&[0xDC; 16], &[0xB2; 16]).unwrap(); + // Now B is top-level; demote it back into the container. + w.demote(&[0xB2; 16], &[0xDC; 16]).unwrap(); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); + // And it is an inner partition again. 
+ assert!(matches!( + r.resolve_uid(&[0xB2; 16]).unwrap(), + Resolved::Inner(_) + )); +} + +#[test] +fn trailer_mode_reads_back_identically() { + // Build the same file in trailer mode (append-only host); the reader must + // resolve the table head from the trailer and expose every inner partition. + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", arena).unwrap(); + w.set_trailer(true); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); + assert_eq!(r.inner_partitions().unwrap().len(), 2); +} diff --git a/reference/PCF-DCP-v1.0/tests/spec_compliance.rs b/reference/PCF-DCP-v1.0/tests/spec_compliance.rs new file mode 100644 index 0000000..bbcd0be --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/spec_compliance.rs @@ -0,0 +1,190 @@ +//! Conformance tests tying the implementation to specific sections of +//! `specs/PCF-DCP-spec-v1.0.txt`, culminating in the byte-exact Section 17 +//! test vector. + +use std::io::Cursor; + +use pcf::{Container, HashAlgo}; +use pcf_dcp::{ + build_reference_vector, Arena, Chunker, DcpHeader, DcpReader, FragTableHeader, FragmentEntry, + DCP_CONTAINER_TYPE, DCP_HEADER_SIZE, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, +}; + +/// The canonical 700-byte file, byte-for-byte equal to the spec's Section 17 +/// hex dump (verified during development). 
+const CANONICAL: &[u8] = include_bytes!("../testdata/canonical.bin"); + +#[test] +fn structure_sizes_match_appendix_a() { + assert_eq!(DCP_HEADER_SIZE, 24); + assert_eq!(FRAGTABLE_HEADER_SIZE, 9); + assert_eq!(FRAGMENT_ENTRY_SIZE, 18); + assert_eq!(DCP_CONTAINER_TYPE, 0xAAAC_0001); +} + +#[test] +fn header_roundtrip_and_magic() { + let h = DcpHeader { + profile_version_major: 1, + profile_version_minor: 0, + flags: 0, + inner_table_offset: 109, + arena_used: 465, + }; + let b = h.to_bytes(); + assert_eq!(&b[0..4], b"PDCP"); + assert_eq!(DcpHeader::from_bytes(&b).unwrap(), h); +} + +#[test] +fn fragment_records_roundtrip() { + let e = FragmentEntry { + extent_offset: 31, + extent_length: 6, + kind: 1, + flags: 1, + }; + assert_eq!(FragmentEntry::from_bytes(&e.to_bytes()), e); + let h = FragTableHeader { + next_fragtable_offset: 0, + fragment_count: 2, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); +} + +#[test] +fn reconstruction_equals_logical_content() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "x", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!"); + // Two extents, total used_bytes 13. + let info = arena.inner_info(&[1; 16]).unwrap(); + assert_eq!(info.used_bytes, 13); + assert_eq!(info.extents.len(), 2); +} + +#[test] +fn data_hash_is_invariant_under_fragmentation() { + // The same content chunked differently yields the same data_hash (it covers + // logical content only — spec Section 8.3 / 9.1). 
+ let mk = |c: Chunker| { + let mut a = Arena::new(); + a.add_inner(0x10, [7; 16], "x", b"abcdefghij", HashAlgo::Sha256, c) + .unwrap(); + a.inner_info(&[7; 16]).unwrap().data_hash + }; + assert_eq!(mk(Chunker::Whole), mk(Chunker::Fixed(3))); + assert_eq!(mk(Chunker::Whole), HashAlgo::Sha256.compute(b"abcdefghij")); +} + +#[test] +fn dedup_sets_shared_on_all_aliases_rule_f1() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + + let a = arena.inner_info(&[0xA1; 16]).unwrap(); + let b = arena.inner_info(&[0xB2; 16]).unwrap(); + // A: "Hello, " private, "World!" shared. + assert!(!a.extents[0].shared); + assert!(a.extents[1].shared); + // B: single extent, shared, deduplicated onto A's second extent. + assert_eq!(b.extents.len(), 1); + assert!(b.extents[0].shared); + // B's data_hash equals a standalone SHA-256("World!") — promotion invariant. + assert_eq!(b.data_hash, HashAlgo::Sha256.compute(b"World!")); +} + +#[test] +fn canonical_vector_is_byte_exact_700() { + let image = build_reference_vector().unwrap(); + assert_eq!(image.len(), 700, "spec Section 17 total file size"); + assert_eq!( + image, CANONICAL, + "must reproduce the Section 17 bytes exactly" + ); +} + +#[test] +fn canonical_vector_key_offsets() { + let image = build_reference_vector().unwrap(); + // Top-level: file header partition_table_offset = 20, one entry of type DCP. + assert_eq!(&image[0..8], &pcf::MAGIC); + // Arena begins at file offset 0x00EB (235). + assert_eq!(&image[0xEB..0xEF], b"PDCP"); + assert_eq!(image[0xEF], 1); // profile_version_major + assert_eq!(image[0xF0], 0); // profile_version_minor (the spec dump's 01 was a typo) + // inner_table_offset = 109 (arena-rel), arena_used = 465. 
+ assert_eq!( + u64::from_le_bytes(image[0xF3..0xFB].try_into().unwrap()), + 109 + ); + assert_eq!( + u64::from_le_bytes(image[0xFB..0x103].try_into().unwrap()), + 465 + ); + // Shared flags: A[1] at 0x013C and B[0] at 0x0157 are 1; A[0] at 0x012A is 0. + assert_eq!(image[0x012A], 0); + assert_eq!(image[0x013C], 1); + assert_eq!(image[0x0157], 1); +} + +#[test] +fn canonical_vector_is_valid_pcf() { + // A generic PCF reader sees one valid partition and the table hash verifies. + let image = build_reference_vector().unwrap(); + let mut c = Container::open(Cursor::new(image)).unwrap(); + c.verify().unwrap(); + let entries = c.entries().unwrap(); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].partition_type, DCP_CONTAINER_TYPE); + assert_eq!(entries[0].used_bytes, 465); + assert_eq!(entries[0].data_hash_algo, HashAlgo::None); +} + +#[test] +fn canonical_vector_is_valid_dcp() { + let image = build_reference_vector().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); +} + +#[test] +fn parse_roundtrips_canonical_arena_byte_exact() { + // Parsing the canonical arena and re-serialising reproduces it exactly, + // because the test vector is already in canonical layout. 
+ let mut c = Container::open(Cursor::new(CANONICAL.to_vec())).unwrap(); + let entry = c.entries().unwrap().into_iter().next().unwrap(); + let data = c.read_partition_data(&entry).unwrap(); + let arena = Arena::parse(&data).unwrap(); + assert_eq!(arena.to_bytes(), data); +} diff --git a/specs/PCF-DCP-spec-v1.0.txt b/specs/PCF-DCP-spec-v1.0.txt new file mode 100644 index 0000000..5c51234 --- /dev/null +++ b/specs/PCF-DCP-spec-v1.0.txt @@ -0,0 +1,1356 @@ +=============================================================================== + PCF-DCP -- Dynamic Container Partition Profile + Specification Version 1.0 +=============================================================================== + +Status of This Document + + This document specifies version 1.0 of PCF-DCP, an application-level + profile that uses the Partitioned Container Format (PCF) version 1.0 to + store a set of inner partitions that may each GROW, SHRINK, and be MUTATED + IN THE MIDDLE without relocating their neighbours, inside a single PCF + partition that acts as a self-contained arena. + + PCF-DCP does NOT modify, extend, or fork PCF. A PCF-DCP file is a fully + conforming PCF v1.0 file. All structures defined here live inside the data + region of one PCF partition (the "DCP container") and inside the + application-defined portions of PCF entries. This profile is layered + strictly above the PCF specification; where the two appear to conflict, the + PCF specification governs the byte container and this document governs only + the interpretation of the DCP container partition's contents. + + The profile version described here is major version 1, minor version 0. + + +------------------------------------------------------------------------------- +Table of Contents +------------------------------------------------------------------------------- + + 1. Introduction + 2. Relationship to PCF + 2.1 Relationship to PCF-SIG + 2.2 Compatibility with the PCF File Trailer + 3. 
Conventions and Terminology + 3.1 Requirement Keywords + 3.2 Terminology + 3.3 Data Types and Byte Order + 4. Profile Model Overview + 4.1 The DCP Container Partition + 4.2 The Arena and Relative Offsets + 4.3 Free Space and the Bump Allocator + 4.4 No Nested Containers + 5. Partition Types and Reserved Values + 6. DCP Header + 7. Inner Partition Table + 7.1 Reuse of the PCF Table Block + 7.2 Reinterpretation of Inner Partition Entry Fields + 8. Fragment Table + 8.1 Fragment Table Header + 8.2 Fragment Entry + 8.3 Logical Content Reconstruction + 8.4 The SHARED Flag and Copy-on-Write + 9. Integrity Model + 9.1 Inner Partition Data Hash + 9.2 Inner Table Block Hash + 9.3 Fragment-Table Protection + 9.4 Container Data Hash and Sealed Mode + 10. Operations (Informative) + 10.1 In-Place Editing via the Fragment Table + 10.2 Deduplication + 10.3 Compaction and Defragmentation + 10.4 Promotion and the Promotion Invariant + 11. Reader Algorithms (Informative) + 12. Writer Algorithms (Informative) + 13. Conformance and Validation + 14. Versioning + 15. Future Considerations (Informative) + 16. Assumptions and Design Decisions (Informative) + 17. Test Vectors + Appendix A. Field Layout Summary + Appendix B. Type and Constant Registry + + +------------------------------------------------------------------------------- +1. Introduction +------------------------------------------------------------------------------- + + PCF stores each partition as a CONTIGUOUS prefix of a pre-allocated region + (PCF Section 6): its used data occupies [start_offset, start_offset + + used_bytes), and growth beyond max_length requires relocating the whole + partition. This is simple and fast, but it offers no way to grow a partition + that is boxed in by its neighbours, and no way to insert or delete bytes in + the middle of a partition without rewriting everything after the edit. + + PCF-DCP adds exactly that capability, without touching the container format. 
+ It defines ONE new application partition type, the DCP CONTAINER (type + 0xAAAC0001). The data region of a DCP container is a self-contained ARENA + holding any number of INNER PARTITIONS. Each inner partition's bytes are + described not by a single contiguous range but by a FRAGMENT TABLE: an + ordered list of EXTENTS (arbitrary (offset, length) slices of the arena) + whose concatenation is the partition's logical content. Because content is + addressed indirectly through extents, an inner partition can: + + - GROW by appending a new extent anywhere in free arena space; + - SHRINK by dropping or trimming trailing extents; + - be MUTATED IN THE MIDDLE -- overwrite, insert, or delete a byte range + -- by splitting the affected extents and rewriting only the changed + bytes into a fresh extent, never moving the unchanged bytes and never + disturbing other inner partitions. + + When the DCP container is the last region in the file, its arena can grow + toward the end of the file without bound, giving the inner partitions + effectively unlimited room to expand. + + A central design requirement is the PROMOTION INVARIANT: an inner partition + can be promoted to an ordinary top-level PCF partition, and an ordinary + partition can be demoted into a DCP container, with its PCF identity and + integrity (uid, type, label, used_bytes, data_hash) byte-for-byte preserved. + This is what makes the profile composable with cryptographic signing + (Section 2.1): a signature over an inner partition's content survives + promotion, defragmentation, and arena relocation. + + Two further capabilities fall out of the extent model almost for free and + are specified here as OPTIONAL: extents MAY be SHARED between inner + partitions (deduplication, Section 10.2), and the arena MAY be compacted to + reclaim free space and defragment partitions (Section 10.3). 
A per-extent + SHARED flag (Section 8.4) makes safe copy-on-write editing explicit so that + private (unshared) extents can still be overwritten cheaply in place. + + +------------------------------------------------------------------------------- +2. Relationship to PCF +------------------------------------------------------------------------------- + + A PCF-DCP file MUST be a conforming PCF v1.0 file (PCF Section 12). In + particular: + + - The 20-byte PCF File Header is present at offset 0 with the exact PCF + magic and version_major = 1, version_minor = 0. + + - The DCP container is a normal PCF partition with its own PCF Partition + Entry: a unique 16-byte PCF uid, a start_offset, a max_length, a + used_bytes, and a data_hash. Its type is the application value + 0xAAAC0001, permitted by PCF Section 7.1 (any value in + 0x00000001..0xFFFFFFFE is available to the application). + + - The PCF partition table is a chain of PCF Table Blocks linked by + next_table_offset, terminated by 0. + + A DCP container's PCF data_hash_algo_id is normally 0 (none, Section 9.4): + the container's integrity is carried INSIDE the arena, by the inner table + and inner data hashes, so that appending to one inner partition costs O(the + change) rather than O(the whole container). + + A generic PCF reader that knows nothing of this profile sees a valid file. + It traverses the top-level Table Block chain, enumerates the DCP container + as one ordinary partition, and verifies every top-level table_hash. It does + not descend into the arena and assigns the arena bytes no meaning -- exactly + as PCF is content-agnostic about any partition (PCF Section 1). Reconstructing + the inner partitions is the job of a DCP-aware reader (Section 11). The + container partition does not, and need not, appear on any top-level chain as + anything other than one opaque partition. + + PCF-DCP constrains, but does not change, how these PCF facilities are used. 
+ The DCP container reuses PCF's own Table Block (74 bytes) and Partition Entry + (141 bytes) structures, byte-for-byte, INSIDE the arena to describe the inner + partitions (Section 7). The only genuinely new structure is the Fragment + Table (Section 8); everything else is PCF reused recursively. + +2.1 Relationship to PCF-SIG + + PCF-SIG (a sibling profile defining cryptographic signatures over PCF + partitions) signs a partition by committing to a fixed set of its PCF entry + fields. That set is exactly {uid, type, label, used_bytes, data_hash_algo_id, + data_hash}; it deliberately EXCLUDES start_offset and max_length, so that a + signature survives any change to a partition's physical placement or + reservation (PCF-SIG "relocation stability"). + + This set is, field-for-field, the PROMOTION INVARIANT of this profile + (Section 10.4): promotion, demotion, defragmentation, and arena relocation + change only start_offset, max_length, and physical layout, never any of the + six protected fields. Consequently a PCF-SIG signature over an inner + partition remains valid whether the partition currently lives inside a DCP + container or has been promoted to the top level. + + Discoverability. PCF-SIG locates a partition to verify by resolving its uid + over the set of partitions it can enumerate. A plain PCF-SIG verifier + enumerates only the top-level Table Block chain and therefore CANNOT see a + partition that currently lives inside a DCP container; it reports such a uid + as missing (a per-entry condition, not a malformed file). A DCP-aware + reader removes this limitation by extending the resolvable set to include + every inner partition of every DCP container (Section 11): the set of + partitions resolvable by uid becomes (top-level partitions) UNION (the inner + partitions of all DCP containers). This is a reader-side extension of the + search scope; it changes no PCF-SIG byte and no PCF-SIG rule. 
+ + To be signable in place, an inner partition MUST carry a CRYPTOGRAPHIC + data_hash_algo_id (PCF IDs 16, 17, or 18), as required by PCF-SIG. The DCP + container itself, carrying data_hash_algo_id = 0, is not signable as a blob + unless it is sealed (Section 9.4). + + uid uniqueness. Because uids are resolved across the union above, a Writer + MUST keep every inner partition's PCF uid unique across the WHOLE FILE, not + merely within one container (PCF requires uniqueness among live partitions + in a file; this profile reaffirms it across the container boundary). + Promotion MUST therefore be a MOVE -- the inner entry is removed as the + top-level entry is added -- never a copy that would leave two live entries + sharing a uid. + +2.2 Compatibility with the PCF File Trailer + + PCF v1.0 defines an OPTIONAL File Trailer: when the PCF File Header's + partition_table_offset holds the all-ones sentinel, the top-level + partition-table head is recorded in a fixed trailer at the end of the file + (and the table chain MAY be backward-linked). This lets append-only writers + commit without rewriting the header. PCF-DCP is fully compatible with both + header-pointer and trailer-mode host files: + + - A DCP-aware Reader MUST locate the top-level table head through the PCF + layer (which resolves the trailer when present); it MUST NOT assume the + File Header's partition_table_offset is a real offset. In trailer mode + that field is the sentinel, not the table position. Once the top-level + partitions are enumerated, locating and reading a DCP container is + unchanged: the container is one ordinary PCF partition. + + - Whether the host file uses a header pointer or a trailer is invisible to + the arena: the DCP Header, Inner Table Block chain, and Fragment Tables + are addressed by ARENA-RELATIVE offsets within the container's data + (Section 3.3) and never reference the enclosing file's layout. + + - The arena itself does NOT contain a PCF trailer. 
The inner table is + always located by inner_table_offset in the DCP Header (Section 6); the + inner Table Block chain is forward-linked (next_table_offset), because a + DCP Writer rewrites the arena as a whole (Section 4.3, 10.3) and so has + no append-only motive to invert it. + + A Writer MAY publish a finished DCP file in trailer mode (e.g. when the DCP + file is itself appended into a larger append-only host); doing so changes no + arena byte and leaves every inner partition, data_hash, and signature intact. + The test vector in Section 17 is given in classic header-pointer form. + + +------------------------------------------------------------------------------- +3. Conventions and Terminology +------------------------------------------------------------------------------- + +3.1 Requirement Keywords + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in RFC 2119. + +3.2 Terminology + + DCP container A single PCF partition of type 0xAAAC0001 whose data region + is a DCP arena. Often just "the container". + + Arena The data region of a DCP container: the byte range + [start_offset, start_offset + used_bytes) of the container's + PCF entry, holding the DCP Header, the inner table, the + fragment tables, and the data extents. + + Inner A partition stored inside the arena, described by a PCF + partition Partition Entry in the inner table (Section 7) and by a + Fragment Table (Section 8). Its logical content is the + concatenation of its extents. + + Inner table The PCF Table Block chain INSIDE the arena that lists the + inner partitions. Distinct from the file's top-level table. + + Fragment A per-inner-partition structure: a chain of Fragment Table + Table blocks listing that partition's extents in logical order. + + Extent A slice of the arena, an (offset, length) pair of bytes, + referenced by one Fragment Entry. 
Extents MAY be SHARED by + more than one Fragment Entry (Section 10.2). + + Logical The byte stream obtained by concatenating an inner + content partition's DATA extents in fragment-table order (Section + 8.3). The unit that data_hash and signatures commit to. + + Promotion Moving an inner partition out of its container to become an + ordinary top-level PCF partition (Section 10.4). Demotion is + the reverse. + + Bump pointer The arena offset arena_used (Section 6): the next free byte + for the default append-only allocator. + + This document additionally uses, unchanged, the PCF terms File, Reader, + Writer, Partition, Partition Table, Table Block, Entry, and UID. A "PCF uid" + is the 16-byte PCF partition identifier. + +3.3 Data Types and Byte Order + + PCF-DCP uses the same conventions as PCF Section 2.3. All multi-byte + integers are unsigned and LITTLE-ENDIAN (u8, u16, u32, u64). Byte arrays + (magics, uid, label, hashes) are stored in file order and are not subject + to endianness conversion. + + ARENA-RELATIVE OFFSETS. Every offset stored INSIDE the arena -- the DCP + Header's pointers, an inner entry's start_offset, a Fragment Table's + next_fragtable_offset, and a Fragment Entry's extent_offset -- is a byte + offset RELATIVE TO THE START OF THE ARENA, i.e. relative to the DCP + container's PCF start_offset. The absolute file offset of an arena byte at + arena-relative offset r is (container.start_offset + r). Because the DCP + Header occupies arena offset 0, the relative value 0 is never a valid + pointer to any inner structure and is used as a chain terminator, mirroring + PCF's use of absolute offset 0 (PCF Section 4). + + +------------------------------------------------------------------------------- +4. Profile Model Overview +------------------------------------------------------------------------------- + +4.1 The DCP Container Partition + + A DCP container is one PCF partition. Everything this profile defines lives + in its arena. 
A file MAY contain several DCP containers and MAY freely mix + them with ordinary PCF partitions and with partitions of other profiles; + each DCP container is independent. + +4.2 The Arena and Relative Offsets + + The arena is laid out as: + + arena offset 0: + +======================================+ + | DCP Header | 24 bytes (Section 6) + +======================================+ + | Inner Table Block chain | reused PCF Table Blocks + + | (lists the inner partitions) | reused PCF Partition Entries + +======================================+ (Section 7) + | Fragment Tables (one chain per | Section 8 + | inner partition) | + +======================================+ + | Data extents (the actual bytes, | referenced by Fragment Entries + | possibly shared, possibly out of | (Section 8.2) + | logical order, possibly fragmented) | + +======================================+ arena_used + | Free arena tail (bump space) | grows; to EOF if container is + +======================================+ the last region in the file + + As in PCF (PCF Section 3), the relative placement of these regions WITHIN + the arena is the Writer's responsibility; only the DCP Header's fixed + position at arena offset 0 is mandated. A Reader locates every structure by + following the pointers, never by assuming an order. + +4.3 Free Space and the Bump Allocator + + Like PCF (PCF Section 5.3, A3), PCF-DCP does NOT encode free space. The DCP + Header stores arena_used, a single bump pointer: the arena bytes + [arena_used, container.max_length - container's reserved tail) are free, and + the default allocation policy is to hand out new extents at arena_used and + advance it. Reuse of the space freed by deleted or rewritten extents is a + Writer-side policy that leaves no on-disk trace; it is reclaimed in bulk by + compaction (Section 10.3). A Reader never needs free-space information. 
+ +4.4 No Nested Containers + + In version 1.0 a DCP container MUST NOT contain another DCP container: an + inner partition's type MUST NOT be 0xAAAC0001. Inner partitions are leaves. + This keeps reconstruction non-recursive and bounds reader complexity. (A + future major version MAY relax this; Section 15.) + + +------------------------------------------------------------------------------- +5. Partition Types and Reserved Values +------------------------------------------------------------------------------- + + PCF-DCP assigns the following PCF type value: + + Type Name Meaning + ----------- ------------- ----------------------------------------- + 0xAAAC0001 DCP_CONTAINER A partition whose data region is a DCP + arena (Section 4). + ----------- ------------- ----------------------------------------- + + The block 0xAAAC0000..0xAAAC00FF is reserved by this profile for future + related types, continuing the existing convention (0xAAAA* = PFS-MS, + 0xAAAB* = PCF-SIG). Inner partitions carry whatever application type their + content requires (including the PCF RAW type 0xFFFFFFFF), with the single + restriction of Section 4.4 (no inner DCP_CONTAINER). + + The PCF reserved values retain their PCF meaning at every level: type + 0x00000000 and the NIL PCF uid MUST NOT be used for any live partition, + inner or top-level (PCF Section 7). The all-zero arena-relative offset is + reserved as a chain terminator (Section 3.3). + + +------------------------------------------------------------------------------- +6. DCP Header +------------------------------------------------------------------------------- + + The arena MUST begin, at arena offset 0, with the following 24-byte header. 
+ + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 4 bytes dcp_magic = 0x50 0x44 0x43 0x50 ("PDCP") + 4 1 u8 profile_version_major = 1 + 5 1 u8 profile_version_minor = 0 + 6 2 u16 flags (reserved; MUST be 0) + 8 8 u64 inner_table_offset (rel; first Inner Table Block) + 16 8 u64 arena_used (bump pointer; next free rel) + ------ ---- ----- -------------------------------------------------- + Total: 24 bytes + + dcp_magic + MUST be the four bytes "PDCP" (0x50 0x44 0x43 0x50). A Reader MUST treat + a partition typed 0xAAAC0001 whose arena does not begin with this magic + as malformed. + + profile_version_major, profile_version_minor + The PCF-DCP profile version of this container (Section 14). A Reader MUST + reject a container whose profile_version_major it does not implement and + SHOULD accept a higher profile_version_minor, ignoring features it does + not understand. + + flags + Reserved in version 1.0; a Writer MUST set all bits to 0 and a Reader + MUST ignore them. (Reserved for future, layout-neutral hints introduced + by a profile minor bump.) + + inner_table_offset + Arena-relative offset of the first Inner Table Block (Section 7). When + non-zero it MUST be >= 24 (it cannot point into the DCP Header). The value 0 means a + container with no inner table; a Writer SHOULD instead emit an empty + Inner Table Block (partition_count = 0) and point at it, but a Reader + MUST treat inner_table_offset == 0 as "no inner partitions". + + arena_used + The bump pointer: arena-relative offset of the first free byte (Section + 4.3). MUST be <= the container's PCF used_bytes. Every stored structure + and extent lies within [0, arena_used). + + This profile defines no other fixed arena-wide field. In particular it + defines no arena-wide checksum; integrity is provided by the inner table + and inner data hashes (Section 9), and OPTIONALLY by sealing the whole + container (Section 9.4). 
+ + +------------------------------------------------------------------------------- +7. Inner Partition Table +------------------------------------------------------------------------------- + +7.1 Reuse of the PCF Table Block + + The inner partitions are listed by a chain of PCF Table Blocks stored INSIDE + the arena, byte-for-byte identical in layout to the top-level partition + table (PCF Section 5). The chain begins at inner_table_offset and is + followed via each block's next_table_offset until it reaches 0; each block + is a 74-byte Table Block Header (PCF Section 5.1) followed by partition_count + 141-byte Partition Entries (PCF Section 5.2). + + All offsets within these blocks are ARENA-RELATIVE (Section 3.3): a block's + next_table_offset is an arena-relative offset (0 = end of chain), and each + entry's start_offset is an arena-relative offset (Section 7.2). + + A block's table_hash is computed exactly as in PCF Section 8.4, over the + block's own [block, block + 74 + partition_count * 141) bytes with the + table_hash field treated as zero. Because the container's own PCF + data_hash_algo_id is normally 0, the inner table_hash is the primary + integrity anchor for the inner table; a Writer SHOULD therefore choose a + CRYPTOGRAPHIC table_hash_algo_id (PCF IDs 16, 17, 18) for inner blocks + (Section 9.2). + +7.2 Reinterpretation of Inner Partition Entry Fields + + An inner Partition Entry uses the identical 141-byte PCF layout, but two + fields are REINTERPRETED for the arena. This reinterpretation applies ONLY + to entries in an inner table; a generic PCF reader never parses an inner + table (Section 2) and so never observes it. + + type, uid, label, data_hash_algo_id, data_hash + Used with their normal PCF meaning. uid MUST be unique across the + whole file (Section 2.1). data_hash covers the partition's LOGICAL + content (Section 9.1), not any contiguous file range. 
+ + start_offset (REINTERPRETED) + The ARENA-RELATIVE offset of this inner partition's first Fragment + Table block (Section 8), NOT the offset of contiguous data. MUST be + >= 24. + + used_bytes + The length of the partition's LOGICAL content: the sum of its DATA + extent lengths (Section 8.3). This is the value data_hash covers. + + max_length (REINTERPRETED) + There is no contiguous reservation inside an arena; growth is by + appending extents (Section 10.1). A Writer MUST set max_length equal + to used_bytes, and a DCP-aware Reader MUST ignore max_length for inner + entries (it locates content via the Fragment Table, never via + start_offset + used_bytes). The equality keeps PCF's used_bytes <= + max_length invariant trivially satisfied (PCF Section 5.2). + + All other PCF entry rules apply unchanged (non-NIL uid, non-zero type, + valid label, zero-filled field tails). + + +------------------------------------------------------------------------------- +8. Fragment Table +------------------------------------------------------------------------------- + + Each inner partition has exactly one Fragment Table, located by its entry's + (reinterpreted) start_offset. A Fragment Table is a singly linked chain of + one or more Fragment Table blocks; it lists the partition's extents in + LOGICAL order (the order in which they concatenate to form the content). + +8.1 Fragment Table Header + + Each Fragment Table block begins with the following 9-byte header. + + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 8 u64 next_fragtable_offset (rel; 0 = last block) + 8 1 u8 fragment_count (0..255 in THIS block) + ------ ---- ----- -------------------------------------------------- + Total: 9 bytes + + next_fragtable_offset + Arena-relative offset of the next Fragment Table block of THIS partition, + or 0 if this is the last block. A Reader MUST stop when it reads 0. 
+ + fragment_count + Number of Fragment Entries packed immediately after this header, 0..255. + A partition needing more than 255 extents chains further blocks, so the + number of extents per partition is unbounded (mirroring PCF's overflow + chain, PCF Section 5.3). The Fragment Entries follow immediately: + + fragment[i] at block + 9 + i * 18 (0 <= i < fragment_count) + +8.2 Fragment Entry + + Each Fragment Entry is a fixed-size 18-byte record. + + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 8 u64 extent_offset (rel; start of the extent's bytes) + 8 8 u64 extent_length (length of the extent in bytes) + 16 1 u8 kind (1 = DATA; see below) + 17 1 u8 flags (bit0 = SHARED; others reserved 0) + ------ ---- ----- -------------------------------------------------- + Total: 18 bytes + + extent_offset, extent_length + The extent is the arena byte range [extent_offset, extent_offset + + extent_length). The whole range MUST lie within [0, arena_used): its exclusive end MUST NOT exceed arena_used. Extents + MAY appear in any physical position, MAY be out of logical order + physically, and MAY overlap or coincide with extents of other entries + (sharing, Section 10.2). extent_length MAY be 0 (an empty extent + contributes no bytes; a Writer SHOULD avoid emitting them). + + kind + Extent kind. Defined and RESERVED values: + + 0 RESERVED / INVALID. MUST NOT appear in a live Fragment Entry; like + PCF type 0, it guards against zero-filled records. + 1 DATA. The extent's bytes are literal content (the only kind + defined in version 1.0). + 2 HOLE (RESERVED). Intended for sparse content (a run of zero bytes + with no backing storage). MUST NOT be written in version 1.0. + 3 REF (RESERVED). Intended for a reference to bytes outside this + container (cross-container sharing). MUST NOT be written in version + 1.0. 
+ + A Reader encountering a kind it does not implement MUST treat the + affected inner partition as unreadable, but MUST NOT treat the container + or the file as malformed on that basis alone. + + flags + Bit 0 (0x01) is SHARED (Section 8.4). All other bits are reserved and + MUST be 0 in version 1.0. + +8.3 Logical Content Reconstruction + + The logical content of an inner partition is the concatenation, in + fragment-table order across the whole chain, of the bytes of its DATA + extents: + + content = extent[0].bytes || extent[1].bytes || ... || extent[m-1].bytes + + where extent[i].bytes = arena[extent_offset .. extent_offset + extent_length) + for the i-th DATA extent encountered while walking the Fragment Table chain + (block by block, entry by entry). RESERVED kinds (HOLE, REF) have no + version-1.0 contribution and render the partition unreadable to a v1.0 + reader (Section 8.2). + + The length of content MUST equal the inner entry's used_bytes. Sharing, + physical order, and fragmentation are invisible to this reconstruction: + content depends only on which bytes are named, in which logical order. + +8.4 The SHARED Flag and Copy-on-Write + + The SHARED flag (flags bit 0) is a WRITER CONTRACT that makes safe in-place + editing explicit. It is purely advisory to Readers, which reconstruct + content identically regardless of its value. + + SHARED set + The bytes referenced by this extent MUST NOT be overwritten in place. + Any modification of a logical range that this extent covers MUST be + performed COPY-ON-WRITE: split the extent and write the changed bytes + into a fresh, privately owned extent (Section 10.1), leaving the + shared bytes untouched. + + SHARED clear + The extent is privately owned -- no other live Fragment Entry, in any + inner partition, references any of its bytes -- and a Writer MAY + overwrite those bytes in place. + + Maintenance rules: + + F1. 
When a Writer creates a Fragment Entry that references (in whole or + in part) arena bytes already referenced by another live Fragment + Entry -- that is, at the moment it shares an extent (Section 10.2) + -- it MUST set SHARED on the new entry AND on every existing live + entry that references any of those bytes. + + F2. Writers only ever SET the SHARED flag. The flag is CLEARED only by + compaction (Section 10.3), which has the global view needed to + prove that an extent has become privately referenced again. + + This asymmetry makes errors safe: a stale-SET flag (an extent marked SHARED + that is in fact no longer aliased) costs only an unnecessary copy-on-write + and is harmless; a stale-CLEAR flag would be dangerous (a Writer might + overwrite shared bytes), and rule F2 makes it unreachable. The flag is a + boolean "is aliased", NOT a reference count; no counts are stored on disk, + and liveness/aliasing is recomputed globally only at compaction. + + +------------------------------------------------------------------------------- +9. Integrity Model +------------------------------------------------------------------------------- + + Integrity layers cleanly, exactly as PCF intends (PCF Section 8.5): + + - each inner partition's data_hash protects its logical content; + - each inner Table Block's table_hash protects the inner entries; + - the container's own PCF data_hash optionally protects the whole arena + (sealed mode); + - the top-level table_hash protects the container's PCF entry, and the + PCF header anchors the top-level chain. + +9.1 Inner Partition Data Hash + + An inner entry's data_hash is computed, using the PCF Hash Algorithm + Registry and field encoding (PCF Sections 8.1, 8.2), over the partition's + LOGICAL content (Section 8.3) -- the reconstructed byte stream, NOT any + contiguous arena range. This is the natural generalisation of PCF Section + 8.3 from a contiguous prefix to a fragmented stream. 
+ + Because data_hash commits to logical content only, it is INVARIANT under + every physical change that preserves content: fragmentation and + defragmentation, extent sharing and un-sharing, arena relocation, and + promotion/demotion. This invariance is the basis of the promotion invariant + (Section 10.4) and of PCF-SIG compatibility (Section 2.1). + + An inner partition intended to be signed by PCF-SIG MUST use a cryptographic + data_hash_algo_id (PCF IDs 16, 17, 18); a value of 0 (none) is permitted for + unsigned inner partitions but then provides no verification. + +9.2 Inner Table Block Hash + + Each inner Table Block carries its own table_hash, computed exactly as PCF + Section 8.4 over the block's header-plus-entries region within the arena. + Because the container normally carries data_hash_algo_id = 0, the inner + table_hash is the only integrity protection for the inner entries; a Writer + SHOULD use a cryptographic algorithm for it (PCF IDs 16, 17, 18). A Writer + MUST keep each inner table_hash consistent after any change to an inner + entry (the PCF hash cascade, PCF Section 8.5, applied within the arena). + +9.3 Fragment-Table Protection + + The Fragment Tables are not covered by any dedicated hash. They do not need + one: any corruption of a Fragment Table that changes which bytes an inner + partition reconstructs changes its logical content and is therefore detected + by the partition's data_hash (Section 9.1). A corruption that leaves the + reconstructed content byte-identical (for example, redirecting an extent to + an identical run of bytes elsewhere) is by definition harmless, because the + content -- the thing this profile and PCF-SIG commit to -- is unchanged. + +9.4 Container Data Hash and Sealed Mode + + By default a DCP container sets data_hash_algo_id = 0 so that incremental + edits do not force an O(arena) re-hash of the container's PCF entry. 
A + Writer MAY instead SEAL a finalized container by assigning it a cryptographic + PCF data_hash over its whole arena [start_offset, start_offset + used_bytes), + exactly as for any PCF partition (PCF Section 8.3). A sealed container is + itself signable as a blob by PCF-SIG, at the cost of recomputing the + container data_hash on every change; sealing is therefore appropriate only + for finalized containers. + + +------------------------------------------------------------------------------- +10. Operations (Informative) +------------------------------------------------------------------------------- + + The following is illustrative, not normative. All operations edit only the + acting inner partition's Fragment Table and append bytes/extents to free + arena space; no other inner partition is affected, and no unchanged bytes + are moved. + +10.1 In-Place Editing via the Fragment Table + + Let an inner partition have logical content C reconstructed from its extents. + "Split at logical position p" means: find the extent covering p and, if p + falls inside it, replace that one Fragment Entry with two entries describing + the same bytes [.., p) and [p, ..). Splitting changes no data bytes. + + Append N bytes + Write N bytes at arena_used; append a DATA Fragment Entry + (extent_offset = old arena_used, length = N, flags = 0); advance + arena_used by N; increase used_bytes by N. + + Overwrite logical [p, p+n) + Split at p and at p+n. Write the n replacement bytes to free arena + space as a new private extent and replace the covered middle entries + with that one entry. If any covered extent had SHARED set, this + copy-on-write is REQUIRED (Section 8.4); the shared bytes are left + intact. used_bytes is unchanged. + + Insert N bytes at logical p + Split at p. Write the N new bytes to free arena space; insert a DATA + entry between the two halves. used_bytes increases by N. + + Delete logical [p, p+n) + Split at p and at p+n; drop the middle entries. 
No data bytes are + moved or freed in place; the orphaned bytes are reclaimed by + compaction. used_bytes decreases by n. + + Truncate to length L + Split at L; drop all entries after it. used_bytes becomes L. + + After any of these, the Writer recomputes the inner entry's data_hash over + the new logical content and the enclosing inner Table Block's table_hash + (Section 9). Removing an entire inner partition is the PCF entry-removal + procedure (PCF Section 11.4) applied to the inner table; its extents become + free arena space. max_length always tracks used_bytes (Section 7.2). + +10.2 Deduplication + + Two Fragment Entries -- in the same or different inner partitions -- MAY + reference the same extent bytes. To deduplicate an identical run, a Writer + points a second Fragment Entry at the existing extent instead of writing the + bytes again, and applies maintenance rule F1 (Section 8.4): it sets SHARED + on the new entry and on every existing entry that aliases those bytes. No + on-disk reference count is kept; sharing is invisible to reconstruction and + to data_hash (Sections 8.3, 9.1). + + Chunking strategy (fixed-size, content-defined, or none) and any in-memory + index of chunk hashes are entirely Writer-side and outside this + specification. A Reader never needs to know that any extent is shared. + +10.3 Compaction and Defragmentation + + Compaction reclaims free arena space and MAY defragment partitions; it is a + full rewrite of the arena, intended for a finalized container. A + conforming compaction MUST PRESERVE SHARING: it collects the set of live + extents (the union over all inner partitions' Fragment Tables), copies each + distinct extent once, and rewrites every Fragment Entry to the new offset + via a (old -> new) map. This mark-and-sweep is also where the SHARED flag is + normalised: an extent that the global scan proves to be referenced exactly + once MAY have its SHARED flag cleared on the surviving entry (the only + permitted clearing, rule F2).
A defragmenting compactor MAY additionally + rewrite a partition as a single contiguous extent. + + Because compaction preserves every partition's logical content, every inner + data_hash is unchanged; only inner table_hashes (and, if sealed, the + container data_hash) are recomputed. A naive compactor that is not + sharing-aware still produces a CORRECT container -- it merely copies shared + bytes more than once, losing the deduplication. + +10.4 Promotion and the Promotion Invariant + + Promotion moves an inner partition out to the top level: + + P1. Materialise the partition's logical content (Section 8.3) as a + contiguous region at some free file offset S. + P2. Build a top-level PCF Partition Entry that COPIES the inner entry's + type, uid, label, used_bytes, data_hash_algo_id, and data_hash + verbatim, and sets start_offset = S and max_length = used_bytes. + P3. Add that entry to a top-level Table Block (PCF Section 11.3) and + remove the inner entry from the inner table (PCF Section 11.4) -- a + single MOVE, so the uid is never duplicated (Section 2.1). + P4. Recompute the affected top-level and inner table_hashes. + + The six fields {type, uid, label, used_bytes, data_hash_algo_id, data_hash} + are byte-identical before and after; only start_offset and max_length + change, and only physical bytes move. Because data_hash commits to logical + content (Section 9.1) and the materialised contiguous bytes ARE that same + content, the promoted partition's data_hash verifies unchanged. This is the + PROMOTION INVARIANT, and it is exactly the set of fields a PCF-SIG signature + protects (Section 2.1), so a signature over the partition survives promotion. + + Demotion (top-level partition -> inner partition of a DCP container) is the + reverse: copy the same six fields into a new inner entry, build a Fragment + Table for the bytes (typically one extent), and remove the top-level entry. + The same invariant holds. 
+ + +------------------------------------------------------------------------------- +11. Reader Algorithms (Informative) +------------------------------------------------------------------------------- + + The following pseudocode is illustrative, not normative. + +11.1 Open and enumerate inner partitions + + read and verify the 20-byte PCF header (PCF 11.1) + for each top-level partition P (PCF 11.1): + if P.type != 0xAAAC0001: handle as a normal/other-profile partition + else: treat P as a DCP container: + A = P.start_offset // arena base (absolute) + read 24-byte DCP Header at A+0; verify dcp_magic == "PDCP" + if profile_version_major unsupported: reject this container + it = header.inner_table_offset + while it != 0: // inner Table Block chain + read 74-byte block header at A+it; verify table_hash (PCF 8.4) + for i in 0 .. partition_count-1: + e = read 141-byte entry at A+it+74+i*141 + register inner partition e (key: e.uid) // Section 2.1 + it = block.next_table_offset + +11.2 Reconstruct an inner partition's content + + e = the inner entry + ft = e.start_offset // arena-rel: first FragTable + out = [] + while ft != 0: + read 9-byte FragTable header at A+ft + for j in 0 .. fragment_count-1: + g = read 18-byte Fragment Entry at A+ft+9+j*18 + if g.kind != 1 (DATA): partition unreadable to v1.0 (Section 8.2) + out.append( arena[g.extent_offset .. +g.extent_length) ) + ft = header.next_fragtable_offset + content = concat(out) + assert len(content) == e.used_bytes + if e.data_hash_algo_id != 0: verify hash(content) == e.data_hash + +11.3 Resolve a partition by uid (PCF-SIG-compatible scope, Section 2.1) + + resolvable = (all top-level partitions) + UNION (inner partitions of every DCP container) + find the unique entry whose uid matches; reconstruct (11.2) if inner. + +11.4 Promotion read-back + + a promoted partition is an ordinary top-level partition; no DCP awareness + is needed to read or to PCF-SIG-verify it. 
Its uid and data_hash equal + those it had inside the container (Section 10.4). + + +------------------------------------------------------------------------------- +12. Writer Algorithms (Informative) +------------------------------------------------------------------------------- + + The following pseudocode is illustrative, not normative. + +12.1 Create a container + + allocate a top-level partition entry, type = 0xAAAC0001, + data_hash_algo_id = 0 (unsealed), unique uid + arena: write DCP Header (magic, version, flags=0, + inner_table_offset -> empty Inner Table Block, arena_used) + maintain the top-level table_hash (PCF 8.5) + +12.2 Add an inner partition + + write the content as one or more DATA extents at arena_used (advance it) + build a Fragment Table listing those extents (flags = 0 unless sharing) + build an inner Partition Entry (Section 7.2): start_offset -> the + Fragment Table, used_bytes = logical length, max_length = used_bytes, + data_hash over logical content (cryptographic if to be signed) + append the entry to an inner Table Block (PCF 11.3), recompute its + table_hash; update arena_used in the DCP Header + +12.3 Edit / append / dedup + + perform the Section 10.1 / 10.2 operation on the partition's Fragment + Table; honour the SHARED flag (Section 8.4); recompute the inner + entry's data_hash and its block's table_hash + +12.4 Finalize + + OPTIONALLY compact the arena, preserving sharing (Section 10.3) + OPTIONALLY seal the container with a cryptographic PCF data_hash + (Section 9.4) if it is to be signed as a blob + + +------------------------------------------------------------------------------- +13. Conformance and Validation +------------------------------------------------------------------------------- + + A conforming PCF-DCP Reader MUST: + + R1. Be a conforming PCF Reader (PCF Section 12, C1..C8). In particular it + validates the file as PCF independently of this profile. + R2. 
For each partition of type 0xAAAC0001, verify dcp_magic and reject a + container whose profile_version_major it does not implement. + R3. Traverse the inner Table Block chain from inner_table_offset to 0, + verifying each inner table_hash unless its algo id is 0 (PCF 8.4), + interpreting all in-arena offsets as arena-relative (Section 3.3). + R4. Reconstruct an inner partition's content by concatenating its DATA + extents in fragment-table order (Section 8.3), and verify that the + length equals used_bytes and (unless algo id 0) that the content + matches data_hash. + R5. Treat an inner partition as unreadable, but NOT the file as + malformed, if it uses a reserved kind or a hash algorithm the Reader + does not implement (Sections 8.2, 9.1). + R6. When resolving partitions by uid (e.g. for PCF-SIG), include inner + partitions in scope (Section 2.1, 11.3) and treat a uid that is + non-unique across the whole file as malformed. + R7. Treat as malformed: a missing/incorrect dcp_magic; an inner entry + with type 0x00000000, NIL uid, or type 0xAAAC0001 (Section 4.4); an + arena pointer or extent that falls outside [0, arena_used); a + reconstructed length that disagrees with used_bytes. + + A conforming PCF-DCP Writer MUST: + + W1. Be a conforming PCF Writer (PCF Section 12, W1..W5) at every level + (top-level entries and inner entries alike). + W2. Begin each arena with a valid 24-byte DCP Header (Section 6) and + keep arena_used accurate as the bump pointer. + W3. Store every in-arena offset as arena-relative (Section 3.3); set + each inner entry's start_offset to its Fragment Table and its + max_length equal to used_bytes (Section 7.2). + W4. Give every inner partition a file-globally-unique, non-NIL uid + (Section 2.1) and never type an inner partition 0xAAAC0001. + W5. Maintain the hash cascade within the arena: recompute an inner + entry's data_hash over its logical content and its block's + table_hash after any change (Section 9). + W6. 
Honour the SHARED flag (Section 8.4): perform copy-on-write for any + extent marked SHARED, set SHARED on all aliases when sharing an + extent (F1), and clear it only during sharing-preserving compaction + (F2). Compaction MUST preserve sharing (Section 10.3). + W7. Use a cryptographic data_hash_algo_id for any inner partition that + is to be signed by PCF-SIG (Section 2.1), and treat promotion as a + MOVE that preserves the six protected fields (Section 10.4). + + As in PCF, the format TRUSTS the Writer for physical layout. A DCP-aware + Reader is NOT required to validate that extents do not overlap unintentionally, + that the arena has no gaps, or that free space is minimal; such a container + is not, by those facts alone, non-conforming (PCF Section 12). Intentional + extent overlap is the deduplication mechanism (Section 10.2). + + +------------------------------------------------------------------------------- +14. Versioning +------------------------------------------------------------------------------- + + PCF-DCP carries its own profile version in every DCP Header + (profile_version_major, profile_version_minor), independent of the PCF + container version (which remains 1.0). + + A profile MAJOR change denotes an incompatible change to the arena + layout or to reconstruction semantics (for example, changing the + Fragment Entry size, or allowing nested containers). A Reader MUST reject + a container whose profile_version_major it does not implement. + + A profile MINOR change denotes a backward-compatible addition that does + not alter any existing byte layout -- for example, assigning meaning to a + reserved DCP Header flag bit, defining a reserved Fragment Entry flags + bit, or activating a reserved kind value (HOLE, REF). A Reader + implementing major M MUST read containers with the same M and an equal or + lower minor, and SHOULD accept a higher minor, ignoring features it does + not understand. 
+ + Different DCP containers in one file MAY declare different profile minor + versions. This document defines profile version 1.0. + + +------------------------------------------------------------------------------- +15. Future Considerations (Informative) +------------------------------------------------------------------------------- + + Sparse content (HOLE). kind = 2 is reserved for runs of implicit zero bytes + with no backing extent, enabling sparse inner partitions; activating it is a + minor bump because it does not change any existing layout. + + Cross-container references (REF). kind = 3 is reserved for an extent that + names bytes outside the current container (for example, another container's + arena or a top-level partition), enabling deduplication across container + boundaries. Its exact reference encoding is left to a future minor version. + + Nested containers. Section 4.4 forbids inner DCP containers in v1.0 to keep + reconstruction non-recursive; a future MAJOR version could permit nesting. + + Cryptographic signatures. An inner partition with a cryptographic data_hash + is directly signable in place by PCF-SIG once a reader includes inner + partitions in uid scope (Section 2.1); alternatively a sealed container + (Section 9.4) is signable as a blob. Either way the entry layouts here never + need to change, following PCF Section 13. + + Finalization. A finalized container SHOULD be compacted (Section 10.3) to + reclaim free arena space; the whole file MAY additionally be compacted with + the PCF compaction operation (PCF Section 11.5). + + Performance envelope. The extent/copy-on-write model is optimal for + write-once, append, and snapshot workloads. Heavy random in-place rewriting + remains correct but incurs the usual copy-on-write costs (fragmentation and + write amplification, as in copy-on-write filesystems), repaid by compaction. + + +------------------------------------------------------------------------------- +16. 
Assumptions and Design Decisions (Informative) +------------------------------------------------------------------------------- + + D1. The profile changes nothing in PCF. It uses one application type + (0xAAAC0001) from the reserved block 0xAAAC00xx, permitted by PCF + Section 7, and reuses PCF's Table Block and Partition Entry structures + verbatim inside the arena. + + D2. Content is addressed indirectly through extents, so an inner partition + can grow, shrink, and be edited in the middle without relocating + neighbours or moving unchanged bytes -- the capability PCF's contiguous + model lacks (PCF Section 6, A4). + + D3. All in-arena offsets are arena-relative, so the whole arena -- and thus + every inner partition -- can be relocated as one block (e.g. by + compaction) without rewriting any in-arena pointer. + + D4. Free space is derived from a single bump pointer (arena_used), echoing + PCF's "free space is derived" decision (PCF A3, A9); reuse of freed + extents is Writer policy, reclaimed in bulk by compaction. + + D5. data_hash commits to LOGICAL content (the concatenation of extents), + not to any physical range, making it invariant under fragmentation, + sharing, relocation, and promotion. This single decision yields both + the promotion invariant and zero-cost deduplication. + + D6. The six PCF-SIG-protected fields equal the six fields preserved by + promotion/demotion/compaction, so signatures survive all of them. A + reader-side extension of uid scope (Section 2.1) makes inner partitions + signable in place without changing a single PCF-SIG byte. + + D7. Sharing is a boolean per-extent flag (SHARED), not a reference count: + Writers only set it (when aliasing), compaction alone clears it (with a + global view). Stale-set is safe (extra copy-on-write); stale-clear is + made unreachable. No counts live on disk. + + D8. 
The Fragment Table needs no hash of its own: any corruption that + changes reconstructed content is caught by data_hash, and any that does + not is harmless. + + D9. The container is unsealed (data_hash_algo_id = 0) by default so edits + cost O(change), with an OPTIONAL sealed mode (O(arena) re-hash) for a + finalized, blob-signable container. + + D10. Inner partitions are leaves (no nested containers in v1.0), bounding + reader complexity; HOLE and REF kinds are reserved for a later minor + bump. + + D11. Type 0x00000000, the NIL uid, the invalid kind 0, and the all-zero + arena offset all act as guards against accidental zero-filled records + and as chain terminators, mirroring PCF's guards (PCF A11, A15). + + +------------------------------------------------------------------------------- +17. Test Vectors +------------------------------------------------------------------------------- + + This section provides a complete, byte-exact reference file so that + independent implementations can verify conformance. The file is a DCP + container holding TWO inner partitions, and it demonstrates fragmentation, + deduplication via a shared extent, and the SHARED flag: + + * Inner "A": type 0x00000010, uid = 16 x 0xA1, logical content the ASCII + string "Hello, World!" (13 bytes) stored as TWO extents -- "Hello, " + (7 bytes, PRIVATE) and "World!" (6 bytes, SHARED) -- protected by + SHA-256. + * Inner "B": type 0x00000010, uid = 16 x 0xB2, logical content "World!" + (6 bytes), stored as ONE extent that is the SAME arena bytes as A's + second extent (deduplication). Both references carry SHARED = 1. + Protected by SHA-256. + + The DCP container itself is one top-level PCF partition: type 0xAAAC0001, + uid = 16 x 0xDC, label "dcp", data_hash_algo_id = 0 (unsealed). Both the + top-level Table Block and the inner Table Block are hashed with SHA-256. All + multi-byte integers are little-endian. Total file size is 700 bytes. 
An + implementation that builds the same logical container and emits this exact + canonical layout MUST produce these exact bytes. Note that B's data_hash + equals SHA-256("World!"), the SAME value a standalone PCF partition of + "World!" would carry -- demonstrating the promotion invariant (Section 10.4). + + Top-level structure (absolute file offsets): + + 0x0000..0x0014 File Header (PCF, 20 bytes) + 0x0014..0x005E Top-level Table Block header (PCF, 74 bytes) + 0x005E..0x00EB Top-level Partition Entry "dcp" (PCF, 141 bytes) + 0x00EB..0x02BC DCP arena (the container's data) (465 bytes) + + Arena structure (arena-relative offsets; add 0x00EB for absolute): + + rel 0x000..0x018 DCP Header (24 bytes) + rel 0x018..0x01F extent "Hello, " (7 bytes) + rel 0x01F..0x025 extent "World!" (6 bytes, shared) + rel 0x025..0x052 Fragment Table A (2 entries) (45 bytes) + rel 0x052..0x06D Fragment Table B (1 entry) (27 bytes) + rel 0x06D..0x1D1 Inner Table Block (2 entries) (356 bytes) + arena_used = 0x1D1 (465) + + ---- File Header (offset 0x0000, 20 bytes) ----------------------- + 0000 89 4B 50 52 54 0D 0A 1A magic = 89 'K' 'P' 'R' 'T' 0D 0A 1A + 0008 01 00 version_major = 1 + 000A 00 00 version_minor = 0 + 000C 14 00 00 00 00 00 00 00 partition_table_offset = 20 + + ---- Top-level Table Block (offset 0x0014, header 74 bytes) ------ + 0014 01 partition_count = 1 + 0015 00 00 00 00 00 00 00 00 next_table_offset = 0 (end of chain) + 001D 10 table_hash_algo = 16 (SHA-256) + 001E 22 E5 05 13 61 0D A1 6E 02 A2 F7 C6 12 01 B5 04 table_hash (SHA-256 of this 74-byte header + 002E D8 19 FB CD 05 FF C0 5E 2F B2 3D 06 33 86 C9 53 with this field zeroed, plus the 1 entry) + 003E 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 004E 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Top-level Partition Entry 0 "dcp" (offset 0x005E, 141 bytes) - + 005E 01 00 AC AA type = 0xAAAC0001 (DCP_CONTAINER) + 0062 DC DC DC DC DC DC DC DC DC DC DC DC DC DC DC DC uid 
= 16 x 0xDC + 0072 64 63 70 00 00 00 00 00 00 00 00 00 00 00 00 00 label[0..16] = "dcp" then NUL padding + 0082 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label[16..32] = NUL padding + 0092 EB 00 00 00 00 00 00 00 start_offset = 235 (arena base) + 009A D1 01 00 00 00 00 00 00 max_length = 465 + 00A2 D1 01 00 00 00 00 00 00 used_bytes = 465 (= arena_used) + 00AA 00 data_hash_algo = 0 (none; unsealed) + 00AB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 data_hash = all zero (algo 0) + 00BB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + 00CB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + 00DB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + + ==== DCP ARENA begins at 0x00EB (arena offset 0) ==== + + ---- DCP Header (arena 0x000 / file 0x00EB, 24 bytes) ------------ + 00EB 50 44 43 50 dcp_magic = "PDCP" + 00EF 01 profile_version_major = 1 + 00F0 00 profile_version_minor = 0 + 00F1 00 00 flags = 0 + 00F3 6D 00 00 00 00 00 00 00 inner_table_offset = 109 (arena-rel) + 00FB D1 01 00 00 00 00 00 00 arena_used = 465 (arena-rel) + + ---- Data extents (arena 0x018..0x025 / file 0x0103..0x0110) ----- + 0103 48 65 6C 6C 6F 2C 20 extent "Hello, " (7 bytes, A private) + 010A 57 6F 72 6C 64 21 extent "World!" (6 bytes, A+B shared) + (the two extents happen to be physically adjacent here, but each is + described by its own Fragment Entry; "World!" is referenced twice.) 
+ + ---- Fragment Table A (arena 0x025 / file 0x0110, 45 bytes) ------ + 0110 00 00 00 00 00 00 00 00 next_fragtable_offset = 0 (last) + 0118 02 fragment_count = 2 + 0119 18 00 00 00 00 00 00 00 [0] extent_offset = 24 + 0121 07 00 00 00 00 00 00 00 [0] extent_length = 7 + 0129 01 [0] kind = 1 (DATA) + 012A 00 [0] flags = 0 (private) + 012B 1F 00 00 00 00 00 00 00 [1] extent_offset = 31 + 0133 06 00 00 00 00 00 00 00 [1] extent_length = 6 + 013B 01 [1] kind = 1 (DATA) + 013C 01 [1] flags = 1 (SHARED) + + ---- Fragment Table B (arena 0x052 / file 0x013D, 27 bytes) ------ + 013D 00 00 00 00 00 00 00 00 next_fragtable_offset = 0 (last) + 0145 01 fragment_count = 1 + 0146 1F 00 00 00 00 00 00 00 [0] extent_offset = 31 (same as A[1]) + 014E 06 00 00 00 00 00 00 00 [0] extent_length = 6 + 0156 01 [0] kind = 1 (DATA) + 0157 01 [0] flags = 1 (SHARED) + + ---- Inner Table Block (arena 0x06D / file 0x0158, header 74 B) -- + 0158 02 partition_count = 2 + 0159 00 00 00 00 00 00 00 00 next_table_offset = 0 (end of chain) + 0161 10 table_hash_algo = 16 (SHA-256) + 0162 BE 19 BB 14 7C 68 18 51 CA B7 01 C4 BF 9D 6C 62 table_hash (SHA-256 of this 74-byte header + 0172 82 C0 CB 53 4F 06 BA 97 07 AB EF 01 AD 47 22 1D with this field zeroed, plus the 2 entries) + 0182 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 0192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Inner Partition Entry 0 "A" (arena 0x0B7 / file 0x01A2) ----- + 01A2 10 00 00 00 type = 0x00000010 + 01A6 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 uid = 16 x 0xA1 + 01B6 41 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label = "A" then NUL padding + 01C6 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label (cont.) 
+ 01D6 25 00 00 00 00 00 00 00 start_offset = 37 (arena-rel -> FragTable A) + 01DE 0D 00 00 00 00 00 00 00 max_length = 13 (= used_bytes) + 01E6 0D 00 00 00 00 00 00 00 used_bytes = 13 (logical content length) + 01EE 10 data_hash_algo = 16 (SHA-256) + 01EF DF FD 60 21 BB 2B D5 B0 AF 67 62 90 80 9E C3 A5 data_hash = SHA-256("Hello, World!") + 01FF 31 91 DD 81 C7 F7 0A 4B 28 68 8A 36 21 82 98 6F (significant bytes continue) + 020F 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 021F 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Inner Partition Entry 1 "B" (arena 0x144 / file 0x022F) ----- + 022F 10 00 00 00 type = 0x00000010 + 0233 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 uid = 16 x 0xB2 + 0243 42 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label = "B" then NUL padding + 0253 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label (cont.) + 0263 52 00 00 00 00 00 00 00 start_offset = 82 (arena-rel -> FragTable B) + 026B 06 00 00 00 00 00 00 00 max_length = 6 (= used_bytes) + 0273 06 00 00 00 00 00 00 00 used_bytes = 6 + 027B 10 data_hash_algo = 16 (SHA-256) + 027C 51 4B 6B B7 C8 46 EC FB 8D 2D 29 EF 0B 5C 79 B6 data_hash = SHA-256("World!") + 028C 3E 6A E8 38 F1 23 DA 93 6F E8 27 FD A6 54 27 6C (significant bytes continue) + 029C 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 02AC 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + Plain hex dump of the complete 700-byte file (16 bytes per row): + + 0000 89 4B 50 52 54 0D 0A 1A 01 00 00 00 14 00 00 00 .KPRT........... + 0010 00 00 00 00 01 00 00 00 00 00 00 00 00 10 22 E5 ..............". + 0020 05 13 61 0D A1 6E 02 A2 F7 C6 12 01 B5 04 D8 19 ..a..n.......... + 0030 FB CD 05 FF C0 5E 2F B2 3D 06 33 86 C9 53 00 00 .....^/.=.3..S.. + 0040 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0050 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 00 ................ 
+ 0060 AC AA DC DC DC DC DC DC DC DC DC DC DC DC DC DC ................ + 0070 DC DC 64 63 70 00 00 00 00 00 00 00 00 00 00 00 ..dcp........... + 0080 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0090 00 00 EB 00 00 00 00 00 00 00 D1 01 00 00 00 00 ................ + 00A0 00 00 D1 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00B0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00D0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00E0 00 00 00 00 00 00 00 00 00 00 00 50 44 43 50 01 ...........PDCP. + 00F0 00 00 00 6D 00 00 00 00 00 00 00 D1 01 00 00 00 ...m............ + 0100 00 00 00 48 65 6C 6C 6F 2C 20 57 6F 72 6C 64 21 ...Hello, World! + 0110 00 00 00 00 00 00 00 00 02 18 00 00 00 00 00 00 ................ + 0120 00 07 00 00 00 00 00 00 00 01 00 1F 00 00 00 00 ................ + 0130 00 00 00 06 00 00 00 00 00 00 00 01 01 00 00 00 ................ + 0140 00 00 00 00 00 01 1F 00 00 00 00 00 00 00 06 00 ................ + 0150 00 00 00 00 00 00 01 01 02 00 00 00 00 00 00 00 ................ + 0160 00 10 BE 19 BB 14 7C 68 18 51 CA B7 01 C4 BF 9D ......|h.Q...... + 0170 6C 62 82 C0 CB 53 4F 06 BA 97 07 AB EF 01 AD 47 lb...SO........G + 0180 22 1D 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "............... + 0190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 01A0 00 00 10 00 00 00 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 ................ + 01B0 A1 A1 A1 A1 A1 A1 41 00 00 00 00 00 00 00 00 00 ......A......... + 01C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 01D0 00 00 00 00 00 00 25 00 00 00 00 00 00 00 0D 00 ......%......... + 01E0 00 00 00 00 00 00 0D 00 00 00 00 00 00 00 10 DF ................ + 01F0 FD 60 21 BB 2B D5 B0 AF 67 62 90 80 9E C3 A5 31 .`!.+...gb.....1 + 0200 91 DD 81 C7 F7 0A 4B 28 68 8A 36 21 82 98 6F 00 ......K(h.6!..o. 
+ 0210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0220 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 ................ + 0230 00 00 00 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 ................ + 0240 B2 B2 B2 42 00 00 00 00 00 00 00 00 00 00 00 00 ...B............ + 0250 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0260 00 00 00 52 00 00 00 00 00 00 00 06 00 00 00 00 ...R............ + 0270 00 00 00 06 00 00 00 00 00 00 00 10 51 4B 6B B7 ............QKk. + 0280 C8 46 EC FB 8D 2D 29 EF 0B 5C 79 B6 3E 6A E8 38 .F...-)..\y.>j.8 + 0290 F1 23 DA 93 6F E8 27 FD A6 54 27 6C 00 00 00 00 .#..o.'..T'l.... + 02A0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 02B0 00 00 00 00 00 00 00 00 00 00 00 00 ............ + + Verification notes: + + - The file is a conforming PCF v1.0 file: a generic PCF reader sees one + partition ("dcp", type 0xAAAC0001, used_bytes 465, data_hash algo 0) + and the top-level table_hash verifies. + - Reconstructing inner "A" concatenates extent [24,31) "Hello, " and + extent [31,37) "World!" -> "Hello, World!", whose SHA-256 is + df fd 60 21 ... 98 6f, matching A.data_hash; length 13 = used_bytes. + - Reconstructing inner "B" reads extent [31,37) "World!" -> SHA-256 + 51 4b 6b b7 ... 27 6c, matching B.data_hash; length 6 = used_bytes. + These are the SAME arena bytes A references at logical offset 7: a + shared extent referenced by two entries -- A[1] and B[0] -- each with + SHARED = 1 (flags bytes at 0x013C and 0x0157). A's other extent + "Hello, " is private (flags = 0 at 0x012A). + - Promoting "B" would write "World!" contiguously, set the top-level + entry's start_offset/max_length, and keep uid (16 x 0xB2) and data_hash + (51 4b 6b b7 ...) byte-identical -- the promotion invariant. + + +------------------------------------------------------------------------------- +Appendix A. 
Field Layout Summary +------------------------------------------------------------------------------- + + DCP Header (24 bytes, at arena offset 0) + 0 4 bytes dcp_magic = "PDCP" (50 44 43 50) + 4 1 u8 profile_version_major = 1 + 5 1 u8 profile_version_minor = 0 + 6 2 u16 flags (reserved, = 0) + 8 8 u64 inner_table_offset (arena-rel; 0 = none) + 16 8 u64 arena_used (bump pointer, arena-rel) + + Inner Table Block (reused PCF Table Block; offsets arena-relative) + Table Block Header (74 B) partition_count, next_table_offset (arena-rel, + 0 = end), table_hash_algo_id, table_hash + Inner Partition Entry (141 B; PCF layout, two fields reinterpreted) + 0 4 u32 type (NOT 0, NOT 0xAAAC0001) + 4 16 bytes uid (unique file-wide, non-NIL) + 20 32 bytes label + 52 8 u64 start_offset (arena-rel -> Fragment Table) + 60 8 u64 max_length (= used_bytes) + 68 8 u64 used_bytes (logical content length) + 76 1 u8 data_hash_algo_id (crypto if to be signed) + 77 64 bytes data_hash (over logical content) + + Fragment Table Header (9 bytes; offsets arena-relative) + 0 8 u64 next_fragtable_offset (arena-rel; 0 = last block) + 8 1 u8 fragment_count (0..255 in this block) + + Fragment Entry (18 bytes, at block + 9 + i*18) + 0 8 u64 extent_offset (arena-rel) + 8 8 u64 extent_length + 16 1 u8 kind (1 = DATA; 0 invalid; + 2 HOLE / 3 REF reserved) + 17 1 u8 flags (bit0 = SHARED; others 0) + + Logical content = concatenation of DATA extents in fragment-table order. + data_hash and used_bytes describe that logical content (Sections 8.3, 9.1). + + Container facilities used unchanged from PCF + File Header (20 B) magic, version 1.0, partition_table_offset + Top-level Table Block lists the DCP container (type 0xAAAC0001) among + ordinary partitions + DCP container entry start_offset = arena base, used_bytes = + arena_used, data_hash_algo = 0 (unsealed) or + crypto (sealed, Section 9.4) + + +------------------------------------------------------------------------------- +Appendix B. 
Type and Constant Registry +------------------------------------------------------------------------------- + + PCF partition types used by PCF-DCP + 0xAAAC0001 DCP_CONTAINER (arena holding inner partitions) + 0xAAAC0000..0xAAAC00FF reserved by this profile for future types + + Magics + "PDCP" = 0x50 0x44 0x43 0x50 (DCP Header) + + Fragment Entry kind + 0 = INVALID (reserved guard) 1 = DATA + 2 = HOLE (reserved) 3 = REF (reserved) + + Fragment Entry flags + bit 0 = SHARED (no in-place overwrite; copy-on-write required) + bits 1..7 reserved (MUST be 0) + + Reinterpreted inner-entry fields (inside an arena only) + start_offset -> arena-relative offset of the Fragment Table + max_length -> equal to used_bytes (no contiguous reservation) + + Reserved arena value + arena-relative offset 0 = chain terminator / "none" + + Structure sizes + DCP Header 24 bytes + Fragment Table Header 9 bytes + Fragment Entry 18 bytes + (reused) Table Block Header 74 bytes (PCF) + (reused) Partition Entry 141 bytes (PCF) + + Limits + Entries per (inner) Table Block <= 255 (PCF u8 partition_count) + Inner partitions per container unbounded (inner table chain) + Extents per Fragment Table block <= 255 (u8 fragment_count) + Extents per inner partition unbounded (Fragment Table chain) + Nesting inner partitions are leaves (no inner + DCP_CONTAINER in v1.0) + + Hash algorithms as PCF Section 8.1 (SHA-256 = 16 default/RECOMMENDED + for inner data_hash and inner table_hash) + + Profile version major 1, minor 0 (PCF container version: 1.0) + +=============================================================================== + End of PCF-DCP Specification v1.0 +=============================================================================== diff --git a/tools/pcf-debug/Cargo.toml b/tools/pcf-debug/Cargo.toml index dd5eab9..9c8ab42 100644 --- a/tools/pcf-debug/Cargo.toml +++ b/tools/pcf-debug/Cargo.toml @@ -18,3 +18,4 @@ path = "src/main.rs" [dependencies] pcf = { path = "../../reference/PCF-v1.0", 
version = "0.0.8" } pcf-sig = { path = "../../reference/PCF-SIG-v1.0", version = "0.0.8" } +pcf-dcp = { path = "../../reference/PCF-DCP-v1.0", version = "0.0.8" } diff --git a/tools/pcf-debug/README.md b/tools/pcf-debug/README.md index b4eda26..fb3a318 100644 --- a/tools/pcf-debug/README.md +++ b/tools/pcf-debug/README.md @@ -82,6 +82,10 @@ pcf-debug fs.pcf decode - **Diagnostics** — gaps, overlaps, truncated regions, chain cycles, and hash mismatches, by severity. - **Decoded partitions** — field trees produced by the plugin decoders. + *Container* decoders also decode what they contain: a `DCP_CONTAINER` + (`0xAAAC0001`) reconstructs each inner partition's logical content and routes + it back through the registry, nesting the result under a *decoded inner + partitions* group (e.g. an inner `PFS_NODE` is shown as a full PFS field tree). ## Writing a decoder plugin @@ -126,6 +130,11 @@ The first decoder whose `matches` returns true wins; `raw` is always last and matches everything. `decode` must be infallible — on malformed input, return the fields you could read plus `warnings`. +A *container* decoder may also override the optional `children` method to return +the sub-partitions it holds (each as a `DecodedChild` carrying a reconstructed +content blob). The pipeline decodes those recursively and nests them under the +parent — see `dcp-container` (`src/plugin/dcp.rs`). + The built-in `pfs-node` and `pfs-session` decoders (`src/plugin/pfs.rs`) are a complete worked example covering the PFS-MS record formats. diff --git a/tools/pcf-debug/src/lib.rs b/tools/pcf-debug/src/lib.rs index 68a35dc..0a264de 100644 --- a/tools/pcf-debug/src/lib.rs +++ b/tools/pcf-debug/src/lib.rs @@ -18,9 +18,14 @@ pub mod model; pub mod plugin; pub mod render; -use plugin::{DecoderRegistry, PartitionMeta}; +use plugin::{Decoded, DecodedChild, DecoderRegistry, FieldNode, FieldValue, PartitionMeta}; use render::Report; +/// Maximum container nesting depth followed by [`decode_recursive`]. 
DCP forbids +/// nesting, so the real depth is at most 2; this is a guard against pathological +/// or hostile inputs. +pub const MAX_DECODE_DEPTH: usize = 8; + /// Read a partition's used bytes from the file image, or an empty slice when the /// region is out of bounds or empty. fn partition_bytes(data: &[u8], entry: &pcf::PartitionEntry, in_bounds: bool) -> Vec { @@ -49,8 +54,81 @@ pub fn build_report(data: &[u8], verify: bool, registry: &DecoderRegistry) -> Re uid: &e.uid, label: &label, }; - decoded.push((e.uid, registry.decode(&meta, &bytes))); + decoded.push((e.uid, decode_recursive(registry, &meta, &bytes))); } } Report { layout, decoded } } + +/// Decode `data`, then recursively decode and nest any sub-partitions a +/// container decoder surfaces (e.g. the inner partitions of a DCP container). +/// The nested decodes appear as a `"decoded inner partitions"` group at the end +/// of the field tree. +pub fn decode_recursive(registry: &DecoderRegistry, meta: &PartitionMeta, data: &[u8]) -> Decoded { + let mut dec = registry.decode(meta, data); + attach_inner_decodes(registry, meta, data, &mut dec); + dec +} + +/// Append a `"decoded inner partitions"` group to `dec` for every sub-partition +/// the matching container decoder reports, decoding each recursively. A no-op +/// for non-container partitions. Useful when `dec` was produced by a forced +/// decoder (`--decoder`) and should still gain its nested children. 
+pub fn attach_inner_decodes( + registry: &DecoderRegistry, + meta: &PartitionMeta, + data: &[u8], + dec: &mut Decoded, +) { + attach_at_depth(registry, meta, data, dec, 0); +} + +fn attach_at_depth( + registry: &DecoderRegistry, + meta: &PartitionMeta, + data: &[u8], + dec: &mut Decoded, + depth: usize, +) { + if depth >= MAX_DECODE_DEPTH { + return; + } + let kids = registry.children(meta, data); + if kids.is_empty() { + return; + } + let mut group = FieldNode::group("decoded inner partitions"); + for ch in kids { + let cmeta = PartitionMeta { + partition_type: ch.partition_type, + uid: &ch.uid, + label: &ch.label, + }; + let mut cdec = registry.decode(&cmeta, &ch.data); + attach_at_depth(registry, &cmeta, &ch.data, &mut cdec, depth + 1); + group.push(child_to_field(&ch, cdec)); + } + dec.fields.push(group); +} + +/// Wrap one child's decoded field tree as a single named group, carrying its +/// uid/type as a note and preserving any decoder warnings as a sub-group. +fn child_to_field(child: &DecodedChild, dec: Decoded) -> FieldNode { + let mut node = FieldNode::group(format!("content[{}] -> {}", child.label, dec.format_name)) + .with_note(format!( + "uid {} type 0x{:08X}", + render::uid_hex(&child.uid), + child.partition_type + )); + for f in dec.fields { + node.push(f); + } + if !dec.warnings.is_empty() { + let mut warns = FieldNode::group("warnings"); + for msg in dec.warnings { + warns.push(FieldNode::leaf("warning", FieldValue::Text(msg), (0, 0))); + } + node.push(warns); + } + node +} diff --git a/tools/pcf-debug/src/main.rs b/tools/pcf-debug/src/main.rs index c412c2b..21af8b9 100644 --- a/tools/pcf-debug/src/main.rs +++ b/tools/pcf-debug/src/main.rs @@ -160,10 +160,14 @@ fn filter_decode( label: &label, }; let dec = match &opts.decoder { - Some(name) => registry - .decode_with(name, &meta, &bytes) - .unwrap_or_else(|| registry.decode(&meta, &bytes)), - None => registry.decode(&meta, &bytes), + Some(name) => { + let mut d = registry + .decode_with(name, 
&meta, &bytes) + .unwrap_or_else(|| registry.decode(&meta, &bytes)); + pcf_debug::attach_inner_decodes(registry, &meta, &bytes, &mut d); + d + } + None => pcf_debug::decode_recursive(registry, &meta, &bytes), }; decoded.push((e.uid, dec)); } diff --git a/tools/pcf-debug/src/plugin/dcp.rs b/tools/pcf-debug/src/plugin/dcp.rs new file mode 100644 index 0000000..8796840 --- /dev/null +++ b/tools/pcf-debug/src/plugin/dcp.rs @@ -0,0 +1,423 @@ +//! Decoder for PCF-DCP containers (see `specs/PCF-DCP-spec-v1.0.txt`): +//! `DCP_CONTAINER` (partition type `0xAAAC0001`, arena magic `"PDCP"`). +//! +//! The decoder mirrors the spec's byte tables field-for-field — DCP Header, +//! Inner Table Block chain, and a Fragment Table per inner partition — and +//! reports spec violations as warnings rather than failing. Parsing is inline +//! (it does not depend on the `pcf-dcp` reader), but it borrows the profile's +//! constants from the `pcf-dcp` crate so the two never drift. + +use pcf::{HashAlgo, ENTRY_SIZE, TABLE_HEADER_SIZE}; +use pcf_dcp::{ + DCP_CONTAINER_TYPE, DCP_HEADER_SIZE, DCP_MAGIC, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, + KIND_DATA, +}; + +use super::{ + le_u16, le_u32, le_u64, uid_at, Decoded, DecodedChild, FieldNode, FieldValue, PartitionDecoder, + PartitionMeta, +}; + +fn kind_name(kind: u8) -> &'static str { + match kind { + 0 => "INVALID (reserved)", + 1 => "DATA", + 2 => "HOLE (reserved)", + 3 => "REF (reserved)", + _ => "unknown", + } +} + +fn hash_algo_name(id: u8) -> &'static str { + match HashAlgo::from_id(id) { + Ok(a) => crate::model::algo_name(a), + Err(_) => "unknown", + } +} + +/// Render a `<64-byte hash>` pair, truncated to the digest length. 
+fn hash_field(data: &[u8], algo_off: usize, hash_off: usize) -> FieldNode { + let id = data.get(algo_off).copied().unwrap_or(0); + let len = HashAlgo::from_id(id).map(|a| a.digest_len()).unwrap_or(0); + let bytes = data + .get(hash_off..hash_off + 64) + .map(|b| b[..len.min(64)].to_vec()) + .unwrap_or_default(); + FieldNode::group("data_hash") + .child(FieldNode::leaf( + "algo_id", + FieldValue::Enum { + raw: id as u64, + name: hash_algo_name(id).into(), + }, + (algo_off as u64, algo_off as u64 + 1), + )) + .child(FieldNode::leaf( + "hash", + FieldValue::Bytes(bytes), + (hash_off as u64, hash_off as u64 + 64), + )) +} + +pub struct DcpContainerDecoder; + +impl PartitionDecoder for DcpContainerDecoder { + fn name(&self) -> &'static str { + "dcp-container" + } + + fn matches(&self, meta: &PartitionMeta, data: &[u8]) -> bool { + meta.partition_type == DCP_CONTAINER_TYPE || data.get(0..4) == Some(&DCP_MAGIC) + } + + fn decode(&self, _meta: &PartitionMeta, data: &[u8]) -> Decoded { + let mut warnings = Vec::new(); + let mut fields = Vec::new(); + + if (data.len() as u64) < DCP_HEADER_SIZE { + warnings.push(format!( + "arena is {} bytes; DCP Header needs at least {DCP_HEADER_SIZE}", + data.len() + )); + } + + // ---- DCP Header --------------------------------------------------- + let magic_ok = data.get(0..4) == Some(&DCP_MAGIC); + if !magic_ok { + warnings.push("dcp_magic is not \"PDCP\"".into()); + } + let mut header = FieldNode::group("DCP Header"); + header.push( + FieldNode::leaf( + "dcp_magic", + FieldValue::Text(ascii4(data.get(0..4).unwrap_or(&[]))), + (0, 4), + ) + .with_note(if magic_ok { + "magic OK" + } else { + "expected \"PDCP\"" + }), + ); + let major = data.get(4).copied().unwrap_or(0); + if major != 1 { + warnings.push(format!( + "profile_version_major is {major} (v1.0 reader expects 1)" + )); + } + header.push(FieldNode::leaf( + "profile_version_major", + FieldValue::U64(major as u64), + (4, 5), + )); + header.push(FieldNode::leaf( + 
"profile_version_minor", + FieldValue::U64(data.get(5).copied().unwrap_or(0) as u64), + (5, 6), + )); + let flags = le_u16(data, 6).unwrap_or(0); + if flags != 0 { + warnings.push(format!("flags is {flags:#06x}; v1.0 requires 0")); + } + header.push(FieldNode::leaf( + "flags", + FieldValue::U64(flags as u64), + (6, 8), + )); + let inner_table_offset = le_u64(data, 8).unwrap_or(0); + header.push(FieldNode::leaf( + "inner_table_offset", + FieldValue::U64(inner_table_offset), + (8, 16), + )); + let arena_used = le_u64(data, 16).unwrap_or(0); + header.push(FieldNode::leaf( + "arena_used", + FieldValue::U64(arena_used), + (16, 24), + )); + fields.push(header); + + // ---- Inner Table Block chain -------------------------------------- + let mut inner_group = FieldNode::group("Inner Table Block(s)"); + let mut frag_offsets: Vec<(String, u64)> = Vec::new(); // (label, start_offset) + let mut off = inner_table_offset; + let mut block_idx = 0usize; + let mut budget = data.len() / TABLE_HEADER_SIZE as usize + 1; + while off != 0 { + if budget == 0 { + warnings.push("inner table chain does not terminate".into()); + break; + } + budget -= 1; + let base = off as usize; + if base + TABLE_HEADER_SIZE as usize > data.len() { + warnings.push(format!("inner Table Block at {off} runs past end of arena")); + break; + } + let count = data[base]; + let next = le_u64(data, base + 1).unwrap_or(0); + let th_algo = data.get(base + 9).copied().unwrap_or(0); + let mut block = FieldNode::group(format!("block[{block_idx}] @ {off}")); + block.push(FieldNode::leaf( + "partition_count", + FieldValue::U64(count as u64), + (base as u64, base as u64 + 1), + )); + block.push(FieldNode::leaf( + "next_table_offset", + FieldValue::U64(next), + (base as u64 + 1, base as u64 + 9), + )); + block.push( + hash_field(data, base + 9, base + 10) + .with_note(format!("table_hash ({})", hash_algo_name(th_algo))), + ); + + for i in 0..count as usize { + let eo = base + TABLE_HEADER_SIZE as usize + i * ENTRY_SIZE as 
usize; + if eo + ENTRY_SIZE as usize > data.len() { + warnings.push(format!("inner entry {i} runs past end of arena")); + break; + } + let ptype = le_u32(data, eo).unwrap_or(0); + let uid = uid_at(data, eo + 4).unwrap_or([0; 16]); + let label = label32(data, eo + 20); + let start_offset = le_u64(data, eo + 52).unwrap_or(0); + let max_length = le_u64(data, eo + 60).unwrap_or(0); + let used_bytes = le_u64(data, eo + 68).unwrap_or(0); + + if ptype == DCP_CONTAINER_TYPE { + warnings.push(format!( + "inner entry \"{label}\" is itself a DCP container (nesting forbidden)" + )); + } + if max_length != used_bytes { + warnings.push(format!( + "inner entry \"{label}\": max_length ({max_length}) != used_bytes ({used_bytes}) (spec 7.2)" + )); + } + frag_offsets.push((label.clone(), start_offset)); + + let mut entry = FieldNode::group(format!("inner[{label}]")); + entry.push(FieldNode::leaf( + "type", + FieldValue::U64(ptype as u64), + (eo as u64, eo as u64 + 4), + )); + entry.push(FieldNode::leaf( + "uid", + FieldValue::Uid(uid), + (eo as u64 + 4, eo as u64 + 20), + )); + entry.push(FieldNode::leaf( + "label", + FieldValue::Text(label), + (eo as u64 + 20, eo as u64 + 52), + )); + entry.push( + FieldNode::leaf( + "start_offset", + FieldValue::U64(start_offset), + (eo as u64 + 52, eo as u64 + 60), + ) + .with_note("reinterpreted -> Fragment Table"), + ); + entry.push( + FieldNode::leaf( + "max_length", + FieldValue::U64(max_length), + (eo as u64 + 60, eo as u64 + 68), + ) + .with_note("reinterpreted = used_bytes"), + ); + entry.push(FieldNode::leaf( + "used_bytes", + FieldValue::U64(used_bytes), + (eo as u64 + 68, eo as u64 + 76), + )); + entry.push(hash_field(data, eo + 76, eo + 77)); + block.push(entry); + } + inner_group.push(block); + off = next; + block_idx += 1; + } + fields.push(inner_group); + + // ---- Fragment Tables, one chain per inner partition --------------- + let mut frag_group = FieldNode::group("Fragment Tables"); + let mut total_extents = 0usize; + let mut 
shared_extents = 0usize; + for (label, start) in &frag_offsets { + let mut inner = FieldNode::group(format!("frags[{label}] @ {start}")); + let mut foff = *start; + let mut fbudget = data.len() / FRAGTABLE_HEADER_SIZE as usize + 1; + let mut chain_idx = 0usize; + while foff != 0 { + if fbudget == 0 { + warnings.push(format!("fragment table for \"{label}\" does not terminate")); + break; + } + fbudget -= 1; + let base = foff as usize; + if base + FRAGTABLE_HEADER_SIZE as usize > data.len() { + warnings.push(format!( + "fragment table for \"{label}\" runs past end of arena" + )); + break; + } + let next = le_u64(data, base).unwrap_or(0); + let fcount = data[base + 8]; + let mut blk = FieldNode::group(format!("block[{chain_idx}] @ {foff}")); + blk.push(FieldNode::leaf( + "next_fragtable_offset", + FieldValue::U64(next), + (base as u64, base as u64 + 8), + )); + blk.push(FieldNode::leaf( + "fragment_count", + FieldValue::U64(fcount as u64), + (base as u64 + 8, base as u64 + 9), + )); + for i in 0..fcount as usize { + let xo = + base + FRAGTABLE_HEADER_SIZE as usize + i * FRAGMENT_ENTRY_SIZE as usize; + if xo + FRAGMENT_ENTRY_SIZE as usize > data.len() { + warnings.push(format!( + "fragment {i} of \"{label}\" runs past end of arena" + )); + break; + } + let ext_off = le_u64(data, xo).unwrap_or(0); + let ext_len = le_u64(data, xo + 8).unwrap_or(0); + let kind = data.get(xo + 16).copied().unwrap_or(0); + let eflags = data.get(xo + 17).copied().unwrap_or(0); + let shared = eflags & 1 != 0; + total_extents += 1; + if shared { + shared_extents += 1; + } + if kind != KIND_DATA { + warnings.push(format!( + "fragment {i} of \"{label}\" has kind {kind} ({}) — unreadable in v1.0", + kind_name(kind) + )); + } + if eflags & !1 != 0 { + warnings.push(format!( + "fragment {i} of \"{label}\" has reserved flag bits set" + )); + } + let mut frag = FieldNode::group(format!("extent[{i}]")); + frag.push(FieldNode::leaf( + "extent_offset", + FieldValue::U64(ext_off), + (xo as u64, xo as u64 
+ 8), + )); + frag.push(FieldNode::leaf( + "extent_length", + FieldValue::U64(ext_len), + (xo as u64 + 8, xo as u64 + 16), + )); + frag.push(FieldNode::leaf( + "kind", + FieldValue::Enum { + raw: kind as u64, + name: kind_name(kind).into(), + }, + (xo as u64 + 16, xo as u64 + 17), + )); + frag.push(FieldNode::leaf( + "flags", + FieldValue::Flags { + raw: eflags as u64, + set: if shared { + vec!["SHARED".into()] + } else { + Vec::new() + }, + }, + (xo as u64 + 17, xo as u64 + 18), + )); + blk.push(frag); + } + inner.push(blk); + foff = next; + chain_idx += 1; + } + frag_group.push(inner); + } + fields.push(frag_group); + + // ---- Summary ------------------------------------------------------ + let mut summary = FieldNode::group("summary"); + summary.push(FieldNode::leaf( + "inner_partitions", + FieldValue::U64(frag_offsets.len() as u64), + (0, 0), + )); + summary.push(FieldNode::leaf( + "extents", + FieldValue::U64(total_extents as u64), + (0, 0), + )); + summary.push(FieldNode::leaf( + "shared_extents", + FieldValue::U64(shared_extents as u64), + (0, 0), + )); + fields.push(summary); + + Decoded { + format_name: "DCP_CONTAINER".into(), + fields, + warnings, + } + } + + /// The inner partitions of the DCP container, each with its reconstructed + /// logical content, so the pipeline can decode them recursively (spec + /// Sections 7–8). Defensive: a malformed arena or an inner partition whose + /// content cannot be reconstructed (reserved fragment kind, length + /// mismatch) is simply omitted — `decode` already surfaces the structural + /// detail and any warnings. 
+ fn children(&self, _meta: &PartitionMeta, data: &[u8]) -> Vec { + let arena = match pcf_dcp::Arena::parse(data) { + Ok(a) => a, + Err(_) => return Vec::new(), + }; + arena + .inners() + .into_iter() + .filter_map(|info| { + arena.content(&info.uid).ok().map(|content| DecodedChild { + partition_type: info.partition_type, + uid: info.uid, + label: info.label, + data: content, + }) + }) + .collect() + } +} + +/// Render a 4-byte magic as ASCII (non-printable bytes shown as `\xNN`). +fn ascii4(b: &[u8]) -> String { + b.iter() + .map(|&c| { + if (0x20..0x7f).contains(&c) { + (c as char).to_string() + } else { + format!("\\x{c:02x}") + } + }) + .collect() +} + +/// Decode a 32-byte label field (read until the first NUL). +fn label32(data: &[u8], off: usize) -> String { + let bytes = data.get(off..off + 32).unwrap_or(&[]); + let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + String::from_utf8_lossy(&bytes[..end]).into_owned() +} diff --git a/tools/pcf-debug/src/plugin/mod.rs b/tools/pcf-debug/src/plugin/mod.rs index 6aad704..145ea3c 100644 --- a/tools/pcf-debug/src/plugin/mod.rs +++ b/tools/pcf-debug/src/plugin/mod.rs @@ -11,10 +11,12 @@ //! (shared-library) backend could be added behind a feature without reworking //! any decoder. +mod dcp; mod pcfsig; mod pfs; mod raw; +pub use dcp::DcpContainerDecoder; pub use pcfsig::{PcfSigKeyDecoder, PcfSigSignatureDecoder}; pub use pfs::{PfsNodeDecoder, PfsSessionDecoder}; pub use raw::RawDecoder; @@ -111,6 +113,22 @@ pub struct Decoded { pub warnings: Vec, } +/// A sub-partition surfaced by a *container* decoder (e.g. the inner partitions +/// of a DCP container) whose reconstructed logical content should itself be +/// decoded. Returned by [`PartitionDecoder::children`] and decoded recursively +/// by [`crate::decode_recursive`]. +#[derive(Debug, Clone)] +pub struct DecodedChild { + /// The sub-partition's application type. + pub partition_type: u32, + /// The sub-partition's 16-byte uid. 
+ pub uid: [u8; 16], + /// The sub-partition's decoded label. + pub label: String, + /// The sub-partition's reconstructed logical content. + pub data: Vec, +} + /// A plugin that turns partition bytes into a field tree. pub trait PartitionDecoder { /// Stable identifier, used for `--decoder` selection and HTML anchors. @@ -123,6 +141,14 @@ pub trait PartitionDecoder { /// Full decode. Must never panic: on malformed input it returns whatever /// fields it could read plus `warnings`. fn decode(&self, meta: &PartitionMeta, data: &[u8]) -> Decoded; + + /// Sub-partitions contained within this partition whose reconstructed + /// content should itself be decoded (e.g. the inner partitions of a DCP + /// container). The default is none; only container-like decoders override + /// it. Must never panic: on malformed input it returns an empty list. + fn children(&self, _meta: &PartitionMeta, _data: &[u8]) -> Vec { + Vec::new() + } } /// An ordered set of decoders. The first decoder whose `matches` returns true @@ -141,6 +167,7 @@ impl DecoderRegistry { Box::new(PfsSessionDecoder), Box::new(PcfSigKeyDecoder), Box::new(PcfSigSignatureDecoder), + Box::new(DcpContainerDecoder), Box::new(RawDecoder), ], } @@ -168,6 +195,17 @@ impl DecoderRegistry { RawDecoder.decode(meta, data) } + /// The sub-partitions of `data`, as reported by the first matching decoder + /// (mirrors [`Self::decode`]). Empty for non-container partitions. + pub fn children(&self, meta: &PartitionMeta, data: &[u8]) -> Vec { + for d in &self.decoders { + if d.matches(meta, data) { + return d.children(meta, data); + } + } + Vec::new() + } + /// Decode with a specific decoder by name, if present. pub fn decode_with(&self, name: &str, meta: &PartitionMeta, data: &[u8]) -> Option { self.decoders diff --git a/tools/pcf-debug/tests/decode_dcp.rs b/tools/pcf-debug/tests/decode_dcp.rs new file mode 100644 index 0000000..0c43c05 --- /dev/null +++ b/tools/pcf-debug/tests/decode_dcp.rs @@ -0,0 +1,211 @@ +//! 
Tests for the PCF-DCP container decoder, both directly (with synthesised +//! bytes) and through the full walk → registry → decode pipeline using the +//! canonical 700-byte test vector from `reference/PCF-DCP-v1.0/testdata/`. + +use pcf_debug::build_report; +use pcf_debug::plugin::{ + DcpContainerDecoder, Decoded, DecoderRegistry, FieldNode, FieldValue, PartitionDecoder, + PartitionMeta, +}; + +const CANONICAL: &[u8] = include_bytes!("../../../reference/PCF-DCP-v1.0/testdata/canonical.bin"); + +const DCP_CONTAINER_TYPE: u32 = 0xAAAC_0001; + +/// Find a (possibly nested) field by name. +fn find<'a>(fields: &'a [FieldNode], name: &str) -> Option<&'a FieldNode> { + for f in fields { + if f.name == name { + return Some(f); + } + if let Some(hit) = find(&f.children, name) { + return Some(hit); + } + } + None +} + +fn find_decoded<'a>( + report: &'a pcf_debug::render::Report, + format_name: &str, +) -> Option<&'a Decoded> { + report + .decoded + .iter() + .find(|(_, d)| d.format_name == format_name) + .map(|(_, d)| d) +} + +#[test] +fn registry_routes_dcp_type_to_dedicated_decoder() { + let r = DecoderRegistry::with_builtins(); + assert!(r.names().contains(&"dcp-container")); +} + +#[test] +fn dcp_decoder_on_canonical_vector() { + let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").expect("canonical vector has a DCP_CONTAINER"); + + assert!( + dcp.warnings.is_empty(), + "clean container has no warnings: {:?}", + dcp.warnings + ); + + // DCP Header. + let magic = find(&dcp.fields, "dcp_magic").unwrap(); + assert_eq!(magic.value, FieldValue::Text("PDCP".into())); + assert_eq!(magic.note.as_deref(), Some("magic OK")); + + let ito = find(&dcp.fields, "inner_table_offset").unwrap(); + assert_eq!(ito.value, FieldValue::U64(109)); + let used = find(&dcp.fields, "arena_used").unwrap(); + assert_eq!(used.value, FieldValue::U64(465)); + + // Inner partition A: two extents, reinterpreted start_offset. 
+ let inner_a = find(&dcp.fields, "inner[A]").unwrap(); + let start = find(&inner_a.children, "start_offset").unwrap(); + assert_eq!(start.value, FieldValue::U64(37)); + assert_eq!( + start.note.as_deref(), + Some("reinterpreted -> Fragment Table") + ); + + // Summary: 2 inner partitions, 3 extent references, 2 of them shared. + let inner_count = find(&dcp.fields, "inner_partitions").unwrap(); + assert_eq!(inner_count.value, FieldValue::U64(2)); + let extents = find(&dcp.fields, "extents").unwrap(); + assert_eq!(extents.value, FieldValue::U64(3)); + let shared = find(&dcp.fields, "shared_extents").unwrap(); + assert_eq!(shared.value, FieldValue::U64(2)); +} + +#[test] +fn dcp_decoder_flags_shared_extent() { + let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap(); + // Fragment B's only extent is SHARED. + let frags_b = find(&dcp.fields, "frags[B] @ 82").unwrap(); + let flags = find(&frags_b.children, "flags").unwrap(); + match &flags.value { + FieldValue::Flags { raw, set } => { + assert_eq!(*raw, 1); + assert_eq!(set, &vec!["SHARED".to_string()]); + } + other => panic!("flags has wrong shape: {other:?}"), + } +} + +#[test] +fn dcp_decoder_warns_on_bad_magic() { + let mut bytes = vec![0u8; 24]; + bytes[..4].copy_from_slice(b"XDCP"); + let uid = [0u8; 16]; + let meta = PartitionMeta { + partition_type: DCP_CONTAINER_TYPE, + uid: &uid, + label: "dcp", + }; + let d: Decoded = DcpContainerDecoder.decode(&meta, &bytes); + assert!(d.warnings.iter().any(|w| w.contains("magic"))); +} + +#[test] +fn dcp_decoder_matches_by_magic_without_type() { + let mut bytes = vec![0u8; 24]; + bytes[..4].copy_from_slice(b"PDCP"); + let uid = [0u8; 16]; + let meta = PartitionMeta { + partition_type: 0xFFFF_FFFF, + uid: &uid, + label: "raw", + }; + assert!(DcpContainerDecoder.matches(&meta, &bytes)); +} + +/// Find the first (possibly nested) field whose name contains `needle`. 
+fn find_contains<'a>(fields: &'a [FieldNode], needle: &str) -> Option<&'a FieldNode> { + for f in fields { + if f.name.contains(needle) { + return Some(f); + } + if let Some(hit) = find_contains(&f.children, needle) { + return Some(hit); + } + } + None +} + +#[test] +fn recursively_decodes_inner_partition_content() { + // The pipeline reconstructs each inner partition's logical content and + // decodes it, nesting the result under the DCP container. + let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap(); + + let group = find(&dcp.fields, "decoded inner partitions") + .expect("container has a decoded-inner-partitions group"); + // A's content "Hello, World!" (13 B) and B's "World!" (6 B) are raw text. + let a = find_contains(&group.children, "content[A]").unwrap(); + assert!(a.name.contains("-> RAW"), "A decodes as RAW: {}", a.name); + let a_size = find(&a.children, "size").unwrap(); + assert_eq!(a_size.value, FieldValue::U64(13)); + let b = find_contains(&group.children, "content[B]").unwrap(); + let b_size = find(&b.children, "size").unwrap(); + assert_eq!(b_size.value, FieldValue::U64(6)); +} + +#[test] +fn recursive_decode_routes_inner_to_matching_decoder() { + use pcf_dcp::{Arena, Chunker, DcpWriter, HashAlgo}; + + // An inner partition typed as PFS_NODE (0xAAAA0001) must route, after + // reconstruction, to the PFS node decoder — not the raw fallback. 
+ let mut node = b"PFSN".to_vec(); + node.extend_from_slice(&[0u8; 60]); // pad past the fixed prefix + let mut arena = Arena::new(); + arena + .add_inner( + 0xAAAA_0001, + [0x0A; 16], + "node", + &node, + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", arena).unwrap(); + let image = w.to_image().unwrap(); + + let report = build_report(&image, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap(); + let group = find(&dcp.fields, "decoded inner partitions").unwrap(); + let child = find_contains(&group.children, "content[node]").unwrap(); + assert!( + child.name.contains("-> PFS_NODE"), + "inner routed to PFS decoder, got: {}", + child.name + ); + // The container's own warnings are unaffected by the inner's warnings, + // which are nested under the child instead. + assert!( + dcp.warnings.is_empty(), + "container warnings: {:?}", + dcp.warnings + ); +} + +#[test] +fn registry_reports_no_children_for_leaf_partitions() { + // A non-container partition yields no children (default trait impl). + let uid = [0u8; 16]; + let meta = PartitionMeta { + partition_type: 0xFFFF_FFFF, + uid: &uid, + label: "raw", + }; + let registry = DecoderRegistry::with_builtins(); + assert!(registry.children(&meta, b"plain bytes").is_empty()); +}