From c108a7649a5dcf18ebaeb486938f59ee4fff9dc3 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 6 Jun 2026 17:41:19 +0000 Subject: [PATCH 1/3] Add PCF-DCP v1.0 spec: dynamic container partition profile Introduce specs/PCF-DCP-spec-v1.0.txt, an application-level profile over PCF v1.0 that adds inner partitions which can grow, shrink, and be mutated in the middle without relocating neighbours. Key elements: - DCP_CONTAINER partition type 0xAAAC0001; arena addressed by relative offsets, with a 24-byte DCP Header (bump-pointer allocator, derived free space). - Inner partitions listed by reused PCF Table Blocks/Entries; content described by per-partition Fragment Tables of variable-length extents (18-byte fixed Fragment Entry). - data_hash committed over logical (reconstructed) content, making it invariant under fragmentation, sharing, relocation, and promotion. - Promotion invariant: the six fields preserved by promotion/demotion equal the fields a PCF-SIG signature protects; inner partitions are signable in place via a reader-side uid-scope extension. - Optional deduplication via shared extents, with a per-extent SHARED flag enforcing safe copy-on-write while allowing cheap in-place edits of private extents; sharing-preserving mark-and-sweep compaction. - Byte-exact 700-byte test vector (verified to parse as valid PCF v1.0 and to round-trip its embedded hex dump) demonstrating fragmentation and a shared extent. 
https://claude.ai/code/session_01XzcjWWbNiuNX9ZywevfbQu --- specs/PCF-DCP-spec-v1.0.txt | 1323 +++++++++++++++++++++++++++++++++++ 1 file changed, 1323 insertions(+) create mode 100644 specs/PCF-DCP-spec-v1.0.txt diff --git a/specs/PCF-DCP-spec-v1.0.txt b/specs/PCF-DCP-spec-v1.0.txt new file mode 100644 index 0000000..be54c18 --- /dev/null +++ b/specs/PCF-DCP-spec-v1.0.txt @@ -0,0 +1,1323 @@ +=============================================================================== + PCF-DCP -- Dynamic Container Partition Profile + Specification Version 1.0 +=============================================================================== + +Status of This Document + + This document specifies version 1.0 of PCF-DCP, an application-level + profile that uses the Partitioned Container Format (PCF) version 1.0 to + store a set of inner partitions that may each GROW, SHRINK, and be MUTATED + IN THE MIDDLE without relocating their neighbours, inside a single PCF + partition that acts as a self-contained arena. + + PCF-DCP does NOT modify, extend, or fork PCF. A PCF-DCP file is a fully + conforming PCF v1.0 file. All structures defined here live inside the data + region of one PCF partition (the "DCP container") and inside the + application-defined portions of PCF entries. This profile is layered + strictly above the PCF specification; where the two appear to conflict, the + PCF specification governs the byte container and this document governs only + the interpretation of the DCP container partition's contents. + + The profile version described here is major version 1, minor version 0. + + +------------------------------------------------------------------------------- +Table of Contents +------------------------------------------------------------------------------- + + 1. Introduction + 2. Relationship to PCF + 2.1 Relationship to PCF-SIG + 3. Conventions and Terminology + 3.1 Requirement Keywords + 3.2 Terminology + 3.3 Data Types and Byte Order + 4. 
Profile Model Overview + 4.1 The DCP Container Partition + 4.2 The Arena and Relative Offsets + 4.3 Free Space and the Bump Allocator + 4.4 No Nested Containers + 5. Partition Types and Reserved Values + 6. DCP Header + 7. Inner Partition Table + 7.1 Reuse of the PCF Table Block + 7.2 Reinterpretation of Inner Partition Entry Fields + 8. Fragment Table + 8.1 Fragment Table Header + 8.2 Fragment Entry + 8.3 Logical Content Reconstruction + 8.4 The SHARED Flag and Copy-on-Write + 9. Integrity Model + 9.1 Inner Partition Data Hash + 9.2 Inner Table Block Hash + 9.3 Fragment-Table Protection + 9.4 Container Data Hash and Sealed Mode + 10. Operations (Informative) + 10.1 In-Place Editing via the Fragment Table + 10.2 Deduplication + 10.3 Compaction and Defragmentation + 10.4 Promotion and the Promotion Invariant + 11. Reader Algorithms (Informative) + 12. Writer Algorithms (Informative) + 13. Conformance and Validation + 14. Versioning + 15. Future Considerations (Informative) + 16. Assumptions and Design Decisions (Informative) + 17. Test Vectors + Appendix A. Field Layout Summary + Appendix B. Type and Constant Registry + + +------------------------------------------------------------------------------- +1. Introduction +------------------------------------------------------------------------------- + + PCF stores each partition as a CONTIGUOUS prefix of a pre-allocated region + (PCF Section 6): its used data occupies [start_offset, start_offset + + used_bytes), and growth beyond max_length requires relocating the whole + partition. This is simple and fast, but it offers no way to grow a partition + that is boxed in by its neighbours, and no way to insert or delete bytes in + the middle of a partition without rewriting everything after the edit. + + PCF-DCP adds exactly that capability, without touching the container format. + It defines ONE new application partition type, the DCP CONTAINER (type + 0xAAAC0001). 
The data region of a DCP container is a self-contained ARENA + holding any number of INNER PARTITIONS. Each inner partition's bytes are + described not by a single contiguous range but by a FRAGMENT TABLE: an + ordered list of EXTENTS (arbitrary (offset, length) slices of the arena) + whose concatenation is the partition's logical content. Because content is + addressed indirectly through extents, an inner partition can: + + - GROW by appending a new extent anywhere in free arena space; + - SHRINK by dropping or trimming trailing extents; + - be MUTATED IN THE MIDDLE -- overwrite, insert, or delete a byte range + -- by splitting the affected extents and rewriting only the changed + bytes into a fresh extent, never moving the unchanged bytes and never + disturbing other inner partitions. + + When the DCP container is the last region in the file, its arena can grow + toward the end of the file without bound, giving the inner partitions + effectively unlimited room to expand. + + A central design requirement is the PROMOTION INVARIANT: an inner partition + can be promoted to an ordinary top-level PCF partition, and an ordinary + partition can be demoted into a DCP container, with its PCF identity and + integrity (uid, type, label, used_bytes, data_hash_algo_id, data_hash) + byte-for-byte preserved. This is what makes the profile composable with + cryptographic signing (Section 2.1): a signature over an inner partition's + content survives promotion, defragmentation, and arena relocation. + + Two further capabilities fall out of the extent model almost for free and + are specified here as OPTIONAL: extents MAY be SHARED between inner + partitions (deduplication, Section 10.2), and the arena MAY be compacted to + reclaim free space and defragment partitions (Section 10.3). A per-extent + SHARED flag (Section 8.4) makes safe copy-on-write editing explicit so that + private (unshared) extents can still be overwritten cheaply in place.
+ + +------------------------------------------------------------------------------- +2. Relationship to PCF +------------------------------------------------------------------------------- + + A PCF-DCP file MUST be a conforming PCF v1.0 file (PCF Section 12). In + particular: + + - The 20-byte PCF File Header is present at offset 0 with the exact PCF + magic and version_major = 1, version_minor = 0. + + - The DCP container is a normal PCF partition with its own PCF Partition + Entry: a unique 16-byte PCF uid, a start_offset, a max_length, a + used_bytes, and a data_hash. Its type is the application value + 0xAAAC0001, permitted by PCF Section 7.1 (any value in + 0x00000001..0xFFFFFFFE is available to the application). + + - The PCF partition table is a chain of PCF Table Blocks linked by + next_table_offset, terminated by 0. + + A DCP container's PCF data_hash_algo_id is normally 0 (none, Section 9.4): + the container's integrity is carried INSIDE the arena, by the inner table + and inner data hashes, so that appending to one inner partition costs O(the + change) rather than O(the whole container). + + A generic PCF reader that knows nothing of this profile sees a valid file. + It traverses the top-level Table Block chain, enumerates the DCP container + as one ordinary partition, and verifies every top-level table_hash. It does + not descend into the arena and assigns the arena bytes no meaning -- exactly + as PCF is content-agnostic about any partition (PCF Section 1). Reconstructing + the inner partitions is the job of a DCP-aware reader (Section 11). The + container partition does not, and need not, appear on any top-level chain as + anything other than one opaque partition. + + PCF-DCP constrains, but does not change, how these PCF facilities are used. + The DCP container reuses PCF's own Table Block (74 bytes) and Partition Entry + (141 bytes) structures, byte-for-byte, INSIDE the arena to describe the inner + partitions (Section 7). 
The only genuinely new structure is the Fragment + Table (Section 8); everything else is PCF reused recursively. + +2.1 Relationship to PCF-SIG + + PCF-SIG (a sibling profile defining cryptographic signatures over PCF + partitions) signs a partition by committing to a fixed set of its PCF entry + fields. That set is exactly {uid, type, label, used_bytes, data_hash_algo_id, + data_hash}; it deliberately EXCLUDES start_offset and max_length, so that a + signature survives any change to a partition's physical placement or + reservation (PCF-SIG "relocation stability"). + + This set is, field-for-field, the PROMOTION INVARIANT of this profile + (Section 10.4): promotion, demotion, defragmentation, and arena relocation + change only start_offset, max_length, and physical layout, never any of the + six protected fields. Consequently a PCF-SIG signature over an inner + partition remains valid whether the partition currently lives inside a DCP + container or has been promoted to the top level. + + Discoverability. PCF-SIG locates a partition to verify by resolving its uid + over the set of partitions it can enumerate. A plain PCF-SIG verifier + enumerates only the top-level Table Block chain and therefore CANNOT see a + partition that currently lives inside a DCP container; it reports such a uid + as missing (a per-entry condition, not a malformed file). A DCP-aware + reader removes this limitation by extending the resolvable set to include + every inner partition of every DCP container (Section 11): the set of + partitions resolvable by uid becomes (top-level partitions) UNION (the inner + partitions of all DCP containers). This is a reader-side extension of the + search scope; it changes no PCF-SIG byte and no PCF-SIG rule. + + To be signable in place, an inner partition MUST carry a CRYPTOGRAPHIC + data_hash_algo_id (PCF IDs 16, 17, or 18), as required by PCF-SIG. 
The DCP + container itself, carrying data_hash_algo_id = 0, is not signable as a blob + unless it is sealed (Section 9.4). + + uid uniqueness. Because uids are resolved across the union above, a Writer + MUST keep every inner partition's PCF uid unique across the WHOLE FILE, not + merely within one container (PCF requires uniqueness among live partitions + in a file; this profile reaffirms it across the container boundary). + Promotion MUST therefore be a MOVE -- the inner entry is removed as the + top-level entry is added -- never a copy that would leave two live entries + sharing a uid. + + +------------------------------------------------------------------------------- +3. Conventions and Terminology +------------------------------------------------------------------------------- + +3.1 Requirement Keywords + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in RFC 2119. + +3.2 Terminology + + DCP container A single PCF partition of type 0xAAAC0001 whose data region + is a DCP arena. Often just "the container". + + Arena The data region of a DCP container: the byte range + [start_offset, start_offset + used_bytes) of the container's + PCF entry, holding the DCP Header, the inner table, the + fragment tables, and the data extents. + + Inner A partition stored inside the arena, described by a PCF + partition Partition Entry in the inner table (Section 7) and by a + Fragment Table (Section 8). Its logical content is the + concatenation of its extents. + + Inner table The PCF Table Block chain INSIDE the arena that lists the + inner partitions. Distinct from the file's top-level table. + + Fragment A per-inner-partition structure: a chain of Fragment Table + Table blocks listing that partition's extents in logical order. + + Extent A slice of the arena, an (offset, length) pair of bytes, + referenced by one Fragment Entry. 
Extents MAY be SHARED by + more than one Fragment Entry (Section 10.2). + + Logical The byte stream obtained by concatenating an inner + content partition's DATA extents in fragment-table order (Section + 8.3). The unit that data_hash and signatures commit to. + + Promotion Moving an inner partition out of its container to become an + ordinary top-level PCF partition (Section 10.4). Demotion is + the reverse. + + Bump pointer The arena offset arena_used (Section 6): the next free byte + for the default append-only allocator. + + This document additionally uses, unchanged, the PCF terms File, Reader, + Writer, Partition, Partition Table, Table Block, Entry, and UID. A "PCF uid" + is the 16-byte PCF partition identifier. + +3.3 Data Types and Byte Order + + PCF-DCP uses the same conventions as PCF Section 2.3. All multi-byte + integers are unsigned and LITTLE-ENDIAN (u8, u16, u32, u64). Byte arrays + (magics, uid, label, hashes) are stored in file order and are not subject + to endianness conversion. + + ARENA-RELATIVE OFFSETS. Every offset stored INSIDE the arena -- the DCP + Header's pointers, an inner entry's start_offset, a Fragment Table's + next_fragtable_offset, and a Fragment Entry's extent_offset -- is a byte + offset RELATIVE TO THE START OF THE ARENA, i.e. relative to the DCP + container's PCF start_offset. The absolute file offset of an arena byte at + arena-relative offset r is (container.start_offset + r). Because the DCP + Header occupies arena offset 0, the relative value 0 is never a valid + pointer to any inner structure and is used as a chain terminator, mirroring + PCF's use of absolute offset 0 (PCF Section 4). + + +------------------------------------------------------------------------------- +4. Profile Model Overview +------------------------------------------------------------------------------- + +4.1 The DCP Container Partition + + A DCP container is one PCF partition. Everything this profile defines lives + in its arena. 
A file MAY contain several DCP containers and MAY freely mix + them with ordinary PCF partitions and with partitions of other profiles; + each DCP container is independent. + +4.2 The Arena and Relative Offsets + + The arena is laid out as: + + arena offset 0: + +======================================+ + | DCP Header | 24 bytes (Section 6) + +======================================+ + | Inner Table Block chain | reused PCF Table Blocks + + | (lists the inner partitions) | reused PCF Partition Entries + +======================================+ (Section 7) + | Fragment Tables (one chain per | Section 8 + | inner partition) | + +======================================+ + | Data extents (the actual bytes, | referenced by Fragment Entries + | possibly shared, possibly out of | (Section 8.2) + | logical order, possibly fragmented) | + +======================================+ arena_used + | Free arena tail (bump space) | grows; to EOF if container is + +======================================+ the last region in the file + + As in PCF (PCF Section 3), the relative placement of these regions WITHIN + the arena is the Writer's responsibility; only the DCP Header's fixed + position at arena offset 0 is mandated. A Reader locates every structure by + following the pointers, never by assuming an order. + +4.3 Free Space and the Bump Allocator + + Like PCF (PCF Section 5.3, A3), PCF-DCP does NOT encode free space. The DCP + Header stores arena_used, a single bump pointer: the arena bytes + [arena_used, container.max_length - container's reserved tail) are free, and + the default allocation policy is to hand out new extents at arena_used and + advance it. Reuse of the space freed by deleted or rewritten extents is a + Writer-side policy that leaves no on-disk trace; it is reclaimed in bulk by + compaction (Section 10.3). A Reader never needs free-space information. 
+ +4.4 No Nested Containers + + In version 1.0 a DCP container MUST NOT contain another DCP container: an + inner partition's type MUST NOT be 0xAAAC0001. Inner partitions are leaves. + This keeps reconstruction non-recursive and bounds reader complexity. (A + future major version MAY relax this; Section 15.) + + +------------------------------------------------------------------------------- +5. Partition Types and Reserved Values +------------------------------------------------------------------------------- + + PCF-DCP assigns the following PCF type value: + + Type Name Meaning + ----------- ------------- ----------------------------------------- + 0xAAAC0001 DCP_CONTAINER A partition whose data region is a DCP + arena (Section 4). + ----------- ------------- ----------------------------------------- + + The block 0xAAAC0000..0xAAAC00FF is reserved by this profile for future + related types, continuing the existing convention (0xAAAA* = PFS-MS, + 0xAAAB* = PCF-SIG). Inner partitions carry whatever application type their + content requires (including the PCF RAW type 0xFFFFFFFF), with the single + restriction of Section 4.4 (no inner DCP_CONTAINER). + + The PCF reserved values retain their PCF meaning at every level: type + 0x00000000 and the NIL PCF uid MUST NOT be used for any live partition, + inner or top-level (PCF Section 7). The all-zero arena-relative offset is + reserved as a chain terminator (Section 3.3). + + +------------------------------------------------------------------------------- +6. DCP Header +------------------------------------------------------------------------------- + + The arena MUST begin, at arena offset 0, with the following 24-byte header. 
+ + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 4 bytes dcp_magic = 0x50 0x44 0x43 0x50 ("PDCP") + 4 1 u8 profile_version_major = 1 + 5 1 u8 profile_version_minor = 0 + 6 2 u16 flags (reserved; MUST be 0) + 8 8 u64 inner_table_offset (rel; first Inner Table Block) + 16 8 u64 arena_used (bump pointer; next free rel) + ------ ---- ----- -------------------------------------------------- + Total: 24 bytes + + dcp_magic + MUST be the four bytes "PDCP" (0x50 0x44 0x43 0x50). A Reader MUST treat + a partition typed 0xAAAC0001 whose arena does not begin with this magic + as malformed. + + profile_version_major, profile_version_minor + The PCF-DCP profile version of this container (Section 14). A Reader MUST + reject a container whose profile_version_major it does not implement and + SHOULD accept a higher profile_version_minor, ignoring features it does + not understand. + + flags + Reserved in version 1.0; a Writer MUST set all bits to 0 and a Reader + MUST ignore them. (Reserved for future, layout-neutral hints introduced + by a profile minor bump.) + + inner_table_offset + Arena-relative offset of the first Inner Table Block (Section 7). MUST be + either 0 or >= 24 (it cannot point into the DCP Header). The value 0 + means a container with no inner table; a Writer SHOULD instead emit an + empty Inner Table Block (partition_count = 0) and point at it, but a + Reader MUST treat inner_table_offset == 0 as "no inner partitions". + + arena_used + The bump pointer: arena-relative offset of the first free byte (Section + 4.3). MUST be <= the container's PCF used_bytes. Every stored structure + and extent lies within [0, arena_used). + + This profile defines no other fixed arena-wide field. In particular it + defines no arena-wide checksum; integrity is provided by the inner table + and inner data hashes (Section 9), and OPTIONALLY by sealing the whole + container (Section 9.4).
+ + +------------------------------------------------------------------------------- +7. Inner Partition Table +------------------------------------------------------------------------------- + +7.1 Reuse of the PCF Table Block + + The inner partitions are listed by a chain of PCF Table Blocks stored INSIDE + the arena, byte-for-byte identical in layout to the top-level partition + table (PCF Section 5). The chain begins at inner_table_offset and is + followed via each block's next_table_offset until it reaches 0; each block + is a 74-byte Table Block Header (PCF Section 5.1) followed by partition_count + 141-byte Partition Entries (PCF Section 5.2). + + All offsets within these blocks are ARENA-RELATIVE (Section 3.3): a block's + next_table_offset is an arena-relative offset (0 = end of chain), and each + entry's start_offset is an arena-relative offset (Section 7.2). + + A block's table_hash is computed exactly as in PCF Section 8.4, over the + block's own [block, block + 74 + partition_count * 141) bytes with the + table_hash field treated as zero. Because the container's own PCF + data_hash_algo_id is normally 0, the inner table_hash is the primary + integrity anchor for the inner table; a Writer SHOULD therefore choose a + CRYPTOGRAPHIC table_hash_algo_id (PCF IDs 16, 17, 18) for inner blocks + (Section 9.2). + +7.2 Reinterpretation of Inner Partition Entry Fields + + An inner Partition Entry uses the identical 141-byte PCF layout, but two + fields are REINTERPRETED for the arena. This reinterpretation applies ONLY + to entries in an inner table; a generic PCF reader never parses an inner + table (Section 2) and so never observes it. + + type, uid, label, data_hash_algo_id, data_hash + Used with their normal PCF meaning. uid MUST be unique across the + whole file (Section 2.1). data_hash covers the partition's LOGICAL + content (Section 9.1), not any contiguous file range. 
+ + start_offset (REINTERPRETED) + The ARENA-RELATIVE offset of this inner partition's first Fragment + Table block (Section 8), NOT the offset of contiguous data. MUST be + >= 24. + + used_bytes + The length of the partition's LOGICAL content: the sum of its DATA + extent lengths (Section 8.3). This is the value data_hash covers. + + max_length (REINTERPRETED) + There is no contiguous reservation inside an arena; growth is by + appending extents (Section 10.1). A Writer MUST set max_length equal + to used_bytes, and a DCP-aware Reader MUST ignore max_length for inner + entries (it locates content via the Fragment Table, never via + start_offset + used_bytes). The equality keeps PCF's used_bytes <= + max_length invariant trivially satisfied (PCF Section 5.2). + + All other PCF entry rules apply unchanged (non-NIL uid, non-zero type, + valid label, zero-filled field tails). + + +------------------------------------------------------------------------------- +8. Fragment Table +------------------------------------------------------------------------------- + + Each inner partition has exactly one Fragment Table, located by its entry's + (reinterpreted) start_offset. A Fragment Table is a singly linked chain of + one or more Fragment Table blocks; it lists the partition's extents in + LOGICAL order (the order in which they concatenate to form the content). + +8.1 Fragment Table Header + + Each Fragment Table block begins with the following 9-byte header. + + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 8 u64 next_fragtable_offset (rel; 0 = last block) + 8 1 u8 fragment_count (0..255 in THIS block) + ------ ---- ----- -------------------------------------------------- + Total: 9 bytes + + next_fragtable_offset + Arena-relative offset of the next Fragment Table block of THIS partition, + or 0 if this is the last block. A Reader MUST stop when it reads 0. 
+ + fragment_count + Number of Fragment Entries packed immediately after this header, 0..255. + A partition needing more than 255 extents chains further blocks, so the + number of extents per partition is unbounded (mirroring PCF's overflow + chain, PCF Section 5.3). The Fragment Entries follow immediately: + + fragment[i] at block + 9 + i * 18 (0 <= i < fragment_count) + +8.2 Fragment Entry + + Each Fragment Entry is a fixed-size 18-byte record. + + Offset Size Type Field + (rel) + ------ ---- ----- -------------------------------------------------- + 0 8 u64 extent_offset (rel; start of the extent's bytes) + 8 8 u64 extent_length (length of the extent in bytes) + 16 1 u8 kind (1 = DATA; see below) + 17 1 u8 flags (bit0 = SHARED; others reserved 0) + ------ ---- ----- -------------------------------------------------- + Total: 18 bytes + + extent_offset, extent_length + The extent is the arena byte range [extent_offset, extent_offset + + extent_length). Its exclusive end MUST NOT exceed arena_used. Extents + MAY appear in any physical position, MAY be out of logical order + physically, and MAY overlap or coincide with extents of other entries + (sharing, Section 10.2). extent_length MAY be 0 (an empty extent + contributes no bytes; a Writer SHOULD avoid emitting them). + + kind + Extent kind. Defined and RESERVED values: + + 0 RESERVED / INVALID. MUST NOT appear in a live Fragment Entry; like + PCF type 0, it guards against zero-filled records. + 1 DATA. The extent's bytes are literal content (the only kind + defined in version 1.0). + 2 HOLE (RESERVED). Intended for sparse content (a run of zero bytes + with no backing storage). MUST NOT be written in version 1.0. + 3 REF (RESERVED). Intended for a reference to bytes outside this + container (cross-container sharing). MUST NOT be written in version + 1.0.
+ + A Reader encountering a kind it does not implement MUST treat the + affected inner partition as unreadable, but MUST NOT treat the container + or the file as malformed on that basis alone. + + flags + Bit 0 (0x01) is SHARED (Section 8.4). All other bits are reserved and + MUST be 0 in version 1.0. + +8.3 Logical Content Reconstruction + + The logical content of an inner partition is the concatenation, in + fragment-table order across the whole chain, of the bytes of its DATA + extents: + + content = extent[0].bytes || extent[1].bytes || ... || extent[m-1].bytes + + where extent[i].bytes = arena[extent_offset .. extent_offset + extent_length) + for the i-th DATA extent encountered while walking the Fragment Table chain + (block by block, entry by entry). RESERVED kinds (HOLE, REF) have no + version-1.0 contribution and render the partition unreadable to a v1.0 + reader (Section 8.2). + + The length of content MUST equal the inner entry's used_bytes. Sharing, + physical order, and fragmentation are invisible to this reconstruction: + content depends only on which bytes are named, in which logical order. + +8.4 The SHARED Flag and Copy-on-Write + + The SHARED flag (flags bit 0) is a WRITER CONTRACT that makes safe in-place + editing explicit. It is purely advisory to Readers, which reconstruct + content identically regardless of its value. + + SHARED set + The bytes referenced by this extent MUST NOT be overwritten in place. + Any modification of a logical range that this extent covers MUST be + performed COPY-ON-WRITE: split the extent and write the changed bytes + into a fresh, privately owned extent (Section 10.1), leaving the + shared bytes untouched. + + SHARED clear + The extent is privately owned -- no other live Fragment Entry, in any + inner partition, references any of its bytes -- and a Writer MAY + overwrite those bytes in place. + + Maintenance rules: + + F1. 
When a Writer creates a Fragment Entry that references (in whole or + in part) arena bytes already referenced by another live Fragment + Entry -- that is, at the moment it shares an extent (Section 10.2) + -- it MUST set SHARED on the new entry AND on every existing live + entry that references any of those bytes. + + F2. Writers only ever SET the SHARED flag. The flag is CLEARED only by + compaction (Section 10.3), which has the global view needed to + prove that an extent has become privately referenced again. + + This asymmetry makes errors safe: a stale-SET flag (an extent marked SHARED + that is in fact no longer aliased) costs only an unnecessary copy-on-write + and is harmless; a stale-CLEAR flag would be dangerous (a Writer might + overwrite shared bytes), and rule F2 makes it unreachable. The flag is a + boolean "is aliased", NOT a reference count; no counts are stored on disk, + and liveness/aliasing is recomputed globally only at compaction. + + +------------------------------------------------------------------------------- +9. Integrity Model +------------------------------------------------------------------------------- + + Integrity layers cleanly, exactly as PCF intends (PCF Section 8.5): + + - each inner partition's data_hash protects its logical content; + - each inner Table Block's table_hash protects the inner entries; + - the container's own PCF data_hash optionally protects the whole arena + (sealed mode); + - the top-level table_hash protects the container's PCF entry, and the + PCF header anchors the top-level chain. + +9.1 Inner Partition Data Hash + + An inner entry's data_hash is computed, using the PCF Hash Algorithm + Registry and field encoding (PCF Sections 8.1, 8.2), over the partition's + LOGICAL content (Section 8.3) -- the reconstructed byte stream, NOT any + contiguous arena range. This is the natural generalisation of PCF Section + 8.3 from a contiguous prefix to a fragmented stream. 
+ + Because data_hash commits to logical content only, it is INVARIANT under + every physical change that preserves content: fragmentation and + defragmentation, extent sharing and un-sharing, arena relocation, and + promotion/demotion. This invariance is the basis of the promotion invariant + (Section 10.4) and of PCF-SIG compatibility (Section 2.1). + + An inner partition intended to be signed by PCF-SIG MUST use a cryptographic + data_hash_algo_id (PCF IDs 16, 17, 18); a value of 0 (none) is permitted for + unsigned inner partitions but then provides no verification. + +9.2 Inner Table Block Hash + + Each inner Table Block carries its own table_hash, computed exactly as PCF + Section 8.4 over the block's header-plus-entries region within the arena. + Because the container normally carries data_hash_algo_id = 0, the inner + table_hash is the only integrity protection for the inner entries; a Writer + SHOULD use a cryptographic algorithm for it (PCF IDs 16, 17, 18). A Writer + MUST keep each inner table_hash consistent after any change to an inner + entry (the PCF hash cascade, PCF Section 8.5, applied within the arena). + +9.3 Fragment-Table Protection + + The Fragment Tables are not covered by any dedicated hash. They do not need + one: any corruption of a Fragment Table that changes which bytes an inner + partition reconstructs changes its logical content and is therefore detected + by the partition's data_hash, if it has one (Section 9.1). A corruption that + leaves the reconstructed content byte-identical (e.g. redirecting an extent + to an identical run of bytes elsewhere) is by definition harmless, because + the content -- the thing this profile and PCF-SIG commit to -- is unchanged. + +9.4 Container Data Hash and Sealed Mode + + By default a DCP container sets data_hash_algo_id = 0 so that incremental + edits do not force an O(arena) re-hash of the container's PCF entry.
A + Writer MAY instead SEAL a finalized container by assigning it a cryptographic + PCF data_hash over its whole arena [start_offset, start_offset + used_bytes), + exactly as for any PCF partition (PCF Section 8.3). A sealed container is + itself signable as a blob by PCF-SIG, at the cost of recomputing the + container data_hash on every change; sealing is therefore appropriate only + for finalized containers. + + +------------------------------------------------------------------------------- +10. Operations (Informative) +------------------------------------------------------------------------------- + + The following is illustrative, not normative. All operations edit only the + acting inner partition's Fragment Table and append bytes/extents to free + arena space; no other inner partition is affected, and no unchanged bytes + are moved. + +10.1 In-Place Editing via the Fragment Table + + Let an inner partition have logical content C reconstructed from its extents. + "Split at logical position p" means: find the extent covering p and, if p + falls inside it, replace that one Fragment Entry with two entries describing + the same bytes [.., p) and [p, ..). Splitting changes no data bytes. + + Append N bytes + Write N bytes at arena_used; append a DATA Fragment Entry + (extent_offset = old arena_used, length = N, flags = 0); advance + arena_used by N; increase used_bytes by N. + + Overwrite logical [p, p+n) + Split at p and at p+n. Write the n replacement bytes to free arena + space as a new private extent and replace the covered middle entries + with that one entry. If any covered extent had SHARED set, this + copy-on-write is REQUIRED (Section 8.4); the shared bytes are left + intact. used_bytes is unchanged. + + Insert N bytes at logical p + Split at p. Write the N new bytes to free arena space; insert a DATA + entry between the two halves. used_bytes increases by N. + + Delete logical [p, p+n) + Split at p and at p+n; drop the middle entries. 
No data bytes are + moved or freed in place; the orphaned bytes are reclaimed by + compaction. used_bytes decreases by n. + + Truncate to length L + Split at L; drop all entries after it. used_bytes becomes L. + + After any of these, the Writer recomputes the inner entry's data_hash over + the new logical content, and then recomputes the enclosing inner Table + Block's table_hash (Section 9). Removing an entire inner partition is the PCF entry-removal + procedure (PCF Section 11.4) applied to the inner table; its extents -- except + any still referenced by another inner partition (Section 10.2) -- become + free arena space. + +10.2 Deduplication + + Two Fragment Entries -- in the same or different inner partitions -- MAY + reference the same extent bytes. To deduplicate an identical run, a Writer + points a second Fragment Entry at the existing extent instead of writing the + bytes again, and applies maintenance rule F1 (Section 8.4): it sets SHARED + on the new entry and on every existing entry that aliases those bytes. No + on-disk reference count is kept; sharing is invisible to reconstruction and + to data_hash (Sections 8.3, 9.1). + + Chunking strategy (fixed-size, content-defined, or none) and any in-memory + index of chunk hashes are entirely Writer-side and outside this + specification. A Reader never needs to know that any extent is shared. + +10.3 Compaction and Defragmentation + + Compaction reclaims free arena space and MAY defragment partitions; it is a + full rewrite of the arena, intended for a finalized container. A + conforming compaction MUST PRESERVE SHARING: it collects the set of live + extents (the union over all inner partitions' Fragment Tables), copies each + distinct extent once, and rewrites every Fragment Entry to the new offset + via an (old -> new) map. This mark-and-sweep is also where the SHARED flag is + normalised: an extent that the global scan proves to be referenced exactly + once MAY have its SHARED flag cleared on the surviving entry (the only + permitted clearing, rule F2).
A defragmenting compactor MAY additionally + rewrite a partition as a single contiguous extent. + + Because compaction preserves every partition's logical content, every inner + data_hash is unchanged; only inner table_hashes (and, if sealed, the + container data_hash) are recomputed. A naive compactor that is not + sharing-aware still produces a CORRECT container (every inner data_hash + still verifies) -- it merely copies shared + bytes more than once, losing the deduplication -- but it is not a + conforming compaction (Section 13, W6). + +10.4 Promotion and the Promotion Invariant + + Promotion moves an inner partition out to the top level: + + P1. Materialise the partition's logical content (Section 8.3) as a + contiguous region at some free file offset S. + P2. Build a top-level PCF Partition Entry that COPIES the inner entry's + type, uid, label, used_bytes, data_hash_algo_id, and data_hash + verbatim, and sets start_offset = S and max_length = used_bytes. + P3. Add that entry to a top-level Table Block (PCF Section 11.3) and + remove the inner entry from the inner table (PCF Section 11.4) -- a + single MOVE, so the uid is never duplicated (Section 2.1). + P4. Recompute the affected top-level and inner table_hashes. + + The six fields {type, uid, label, used_bytes, data_hash_algo_id, data_hash} + are byte-identical before and after; only start_offset and max_length + change, and only physical bytes move. Because data_hash commits to logical + content (Section 9.1) and the materialised contiguous bytes ARE that same + content, the promoted partition's data_hash verifies unchanged. This is the + PROMOTION INVARIANT. The six preserved fields are exactly the set of fields a PCF-SIG signature + protects (Section 2.1), so a signature over the partition survives promotion. + + Demotion (top-level partition -> inner partition of a DCP container) is the + reverse: copy the same six fields into a new inner entry, build a Fragment + Table for the bytes (typically one extent), and remove the top-level entry. + The same invariant holds.
+ + +------------------------------------------------------------------------------- +11. Reader Algorithms (Informative) +------------------------------------------------------------------------------- + + The following pseudocode is illustrative, not normative. + +11.1 Open and enumerate inner partitions + + read and verify the 20-byte PCF header (PCF 11.1) + for each top-level partition P (PCF 11.1): + if P.type != 0xAAAC0001: handle as a normal/other-profile partition + else: treat P as a DCP container: + A = P.start_offset // arena base (absolute) + read 24-byte DCP Header at A+0; verify dcp_magic == "PDCP" + if profile_version_major unsupported: reject this container + it = header.inner_table_offset + while it != 0: // inner Table Block chain + read 74-byte block header at A+it; verify table_hash (PCF 8.4) + for i in 0 .. partition_count-1: + e = read 141-byte entry at A+it+74+i*141 + register inner partition e (key: e.uid) // Section 2.1 + it = block.next_table_offset + +11.2 Reconstruct an inner partition's content + + e = the inner entry + ft = e.start_offset // arena-rel: first FragTable + out = [] + while ft != 0: + read 9-byte FragTable header at A+ft + for j in 0 .. fragment_count-1: + g = read 18-byte Fragment Entry at A+ft+9+j*18 + if g.kind != 1 (DATA): partition unreadable to v1.0 (Section 8.2) + out.append( arena[g.extent_offset .. +g.extent_length) ) + ft = header.next_fragtable_offset + content = concat(out) + assert len(content) == e.used_bytes + if e.data_hash_algo_id != 0: verify hash(content) == e.data_hash + +11.3 Resolve a partition by uid (PCF-SIG-compatible scope, Section 2.1) + + resolvable = (all top-level partitions) + UNION (inner partitions of every DCP container) + find the unique entry whose uid matches; reconstruct (11.2) if inner. + +11.4 Promotion read-back + + a promoted partition is an ordinary top-level partition; no DCP awareness + is needed to read or to PCF-SIG-verify it. 
Its uid and data_hash equal + those it had inside the container (Section 10.4). + + +------------------------------------------------------------------------------- +12. Writer Algorithms (Informative) +------------------------------------------------------------------------------- + + The following pseudocode is illustrative, not normative. + +12.1 Create a container + + allocate a top-level partition entry, type = 0xAAAC0001, + data_hash_algo_id = 0 (unsealed), unique uid + arena: write DCP Header (magic, version, flags=0, + inner_table_offset -> empty Inner Table Block, arena_used) + maintain the top-level table_hash (PCF 8.5) + +12.2 Add an inner partition + + write the content as one or more DATA extents at arena_used (advance it) + build a Fragment Table listing those extents (flags = 0 unless sharing) + build an inner Partition Entry (Section 7.2): start_offset -> the + Fragment Table, used_bytes = logical length, max_length = used_bytes, + data_hash over logical content (cryptographic if to be signed) + append the entry to an inner Table Block (PCF 11.3), recompute its + table_hash; update arena_used in the DCP Header + +12.3 Edit / append / dedup + + perform the Section 10.1 / 10.2 operation on the partition's Fragment + Table; honour the SHARED flag (Section 8.4); recompute the inner + entry's data_hash and its block's table_hash + +12.4 Finalize + + OPTIONALLY compact the arena, preserving sharing (Section 10.3) + OPTIONALLY seal the container with a cryptographic PCF data_hash + (Section 9.4) if it is to be signed as a blob + + +------------------------------------------------------------------------------- +13. Conformance and Validation +------------------------------------------------------------------------------- + + A conforming PCF-DCP Reader MUST: + + R1. Be a conforming PCF Reader (PCF Section 12, C1..C8). In particular it + validates the file as PCF independently of this profile. + R2. 
For each partition of type 0xAAAC0001, verify dcp_magic and reject a + container whose profile_version_major it does not implement. + R3. Traverse the inner Table Block chain from inner_table_offset to 0, + verifying each inner table_hash unless its algo id is 0 (PCF 8.4), + interpreting all in-arena offsets as arena-relative (Section 3.3). + R4. Reconstruct an inner partition's content by concatenating its DATA + extents in fragment-table order (Section 8.3), and verify that the + length equals used_bytes and (unless algo id 0) that the content + matches data_hash. + R5. Treat an inner partition as unreadable, but NOT the file as + malformed, if it uses a reserved kind or a hash algorithm the Reader + does not implement (Sections 8.2, 9.1). + R6. When resolving partitions by uid (e.g. for PCF-SIG), include inner + partitions in scope (Section 2.1, 11.3) and treat a uid that is + non-unique across the whole file as malformed. + R7. Treat as malformed: a missing/incorrect dcp_magic; an inner entry + with type 0x00000000, NIL uid, or type 0xAAAC0001 (Section 4.4); an + arena pointer or extent that falls outside [0, arena_used); a + reconstructed length that disagrees with used_bytes. + + A conforming PCF-DCP Writer MUST: + + W1. Be a conforming PCF Writer (PCF Section 12, W1..W5) at every level + (top-level entries and inner entries alike). + W2. Begin each arena with a valid 24-byte DCP Header (Section 6) and + keep arena_used accurate as the bump pointer. + W3. Store every in-arena offset as arena-relative (Section 3.3); set + each inner entry's start_offset to its Fragment Table and its + max_length equal to used_bytes (Section 7.2). + W4. Give every inner partition a file-globally-unique, non-NIL uid + (Section 2.1) and never type an inner partition 0xAAAC0001. + W5. Maintain the hash cascade within the arena: recompute an inner + entry's data_hash over its logical content and its block's + table_hash after any change (Section 9). + W6. 
Honour the SHARED flag (Section 8.4): perform copy-on-write for any + extent marked SHARED, set SHARED on all aliases when sharing an + extent (F1), and clear it only during sharing-preserving compaction + (F2). Compaction MUST preserve sharing (Section 10.3). + W7. Use a cryptographic data_hash_algo_id for any inner partition that + is to be signed by PCF-SIG (Section 2.1), and treat promotion as a + MOVE that preserves the six protected fields (Section 10.4). + + As in PCF, the format TRUSTS the Writer for physical layout. A DCP-aware + Reader is NOT required to validate that extents do not overlap unintentionally, + that the arena has no gaps, or that free space is minimal; such a container + is not, by those facts alone, non-conforming (PCF Section 12). Intentional + extent overlap is the deduplication mechanism (Section 10.2). + + +------------------------------------------------------------------------------- +14. Versioning +------------------------------------------------------------------------------- + + PCF-DCP carries its own profile version in every DCP Header + (profile_version_major, profile_version_minor), independent of the PCF + container version (which remains 1.0). + + A profile MAJOR change denotes an incompatible change to the arena + layout or to reconstruction semantics (for example, changing the + Fragment Entry size, or allowing nested containers). A Reader MUST reject + a container whose profile_version_major it does not implement. + + A profile MINOR change denotes a backward-compatible addition that does + not alter any existing byte layout -- for example, assigning meaning to a + reserved DCP Header flag bit, defining a reserved Fragment Entry flags + bit, or activating a reserved kind value (HOLE, REF). A Reader + implementing major M MUST read containers with the same M and an equal or + lower minor, and SHOULD accept a higher minor, ignoring features it does + not understand. 
+ + Different DCP containers in one file MAY declare different profile minor + versions. This document defines profile version 1.0. + + +------------------------------------------------------------------------------- +15. Future Considerations (Informative) +------------------------------------------------------------------------------- + + Sparse content (HOLE). kind = 2 is reserved for runs of implicit zero bytes + with no backing extent, enabling sparse inner partitions; activating it is a + minor bump because it does not change any existing layout. + + Cross-container references (REF). kind = 3 is reserved for an extent that + names bytes outside the current container (for example, another container's + arena or a top-level partition), enabling deduplication across container + boundaries. Its exact reference encoding is left to a future minor version. + + Nested containers. Section 4.4 forbids inner DCP containers in v1.0 to keep + reconstruction non-recursive; a future MAJOR version could permit nesting. + + Cryptographic signatures. An inner partition with a cryptographic data_hash + is directly signable in place by PCF-SIG once a reader includes inner + partitions in uid scope (Section 2.1); alternatively a sealed container + (Section 9.4) is signable as a blob. Either way the entry layouts here never + need to change, following PCF Section 13. + + Finalization. A finalized container SHOULD be compacted (Section 10.3) to + reclaim free arena space; the whole file MAY additionally be compacted with + the PCF compaction operation (PCF Section 11.5). + + Performance envelope. The extent/copy-on-write model is optimal for + write-once, append, and snapshot workloads. Heavy random in-place rewriting + remains correct but incurs the usual copy-on-write costs (fragmentation and + write amplification, as in copy-on-write filesystems), repaid by compaction. + + +------------------------------------------------------------------------------- +16. 
Assumptions and Design Decisions (Informative) +------------------------------------------------------------------------------- + + D1. The profile changes nothing in PCF. It uses one application type + (0xAAAC0001) from the reserved block 0xAAAC00xx, permitted by PCF + Section 7, and reuses PCF's Table Block and Partition Entry structures + verbatim inside the arena. + + D2. Content is addressed indirectly through extents, so an inner partition + can grow, shrink, and be edited in the middle without relocating + neighbours or moving unchanged bytes -- the capability PCF's contiguous + model lacks (PCF Section 6, A4). + + D3. All in-arena offsets are arena-relative, so the whole arena -- and thus + every inner partition -- can be relocated as one block (e.g. by + compaction) without rewriting any in-arena pointer. + + D4. Free space is derived from a single bump pointer (arena_used), echoing + PCF's "free space is derived" decision (PCF A3, A9); reuse of freed + extents is Writer policy, reclaimed in bulk by compaction. + + D5. data_hash commits to LOGICAL content (the concatenation of extents), + not to any physical range, making it invariant under fragmentation, + sharing, relocation, and promotion. This single decision yields both + the promotion invariant and zero-cost deduplication. + + D6. The six PCF-SIG-protected fields equal the six fields preserved by + promotion/demotion/compaction, so signatures survive all of them. A + reader-side extension of uid scope (Section 2.1) makes inner partitions + signable in place without changing a single PCF-SIG byte. + + D7. Sharing is a boolean per-extent flag (SHARED), not a reference count: + Writers only set it (when aliasing), compaction alone clears it (with a + global view). Stale-set is safe (extra copy-on-write); stale-clear is + made unreachable. No counts live on disk. + + D8. 
The Fragment Table needs no hash of its own: any corruption that + changes reconstructed content is caught by data_hash, and any that does + not is harmless. + + D9. The container is unsealed (data_hash_algo_id = 0) by default so edits + cost O(change), with an OPTIONAL sealed mode (O(arena) re-hash) for a + finalized, blob-signable container. + + D10. Inner partitions are leaves (no nested containers in v1.0), bounding + reader complexity; HOLE and REF kinds are reserved for a later minor + bump. + + D11. Type 0x00000000, the NIL uid, the invalid kind 0, and the all-zero + arena offset all act as guards against accidental zero-filled records + and as chain terminators, mirroring PCF's guards (PCF A11, A15). + + +------------------------------------------------------------------------------- +17. Test Vectors +------------------------------------------------------------------------------- + + This section provides a complete, byte-exact reference file so that + independent implementations can verify conformance. The file is a DCP + container holding TWO inner partitions, and it demonstrates fragmentation, + deduplication via a shared extent, and the SHARED flag: + + * Inner "A": type 0x00000010, uid = 16 x 0xA1, logical content the ASCII + string "Hello, World!" (13 bytes) stored as TWO extents -- "Hello, " + (7 bytes, PRIVATE) and "World!" (6 bytes, SHARED) -- protected by + SHA-256. + * Inner "B": type 0x00000010, uid = 16 x 0xB2, logical content "World!" + (6 bytes), stored as ONE extent that is the SAME arena bytes as A's + second extent (deduplication). Both references carry SHARED = 1. + Protected by SHA-256. + + The DCP container itself is one top-level PCF partition: type 0xAAAC0001, + uid = 16 x 0xDC, label "dcp", data_hash_algo_id = 0 (unsealed). Both the + top-level Table Block and the inner Table Block are hashed with SHA-256. All + multi-byte integers are little-endian. Total file size is 700 bytes. 
An + implementation that builds the same logical container and emits this exact + canonical layout MUST produce these exact bytes. Note that B's data_hash + equals SHA-256("World!"), the SAME value a standalone PCF partition of + "World!" would carry -- demonstrating the promotion invariant (Section 10.4). + + Top-level structure (absolute file offsets): + + 0x0000..0x0014 File Header (PCF, 20 bytes) + 0x0014..0x005E Top-level Table Block header (PCF, 74 bytes) + 0x005E..0x00EB Top-level Partition Entry "dcp" (PCF, 141 bytes) + 0x00EB..0x02BC DCP arena (the container's data) (465 bytes) + + Arena structure (arena-relative offsets; add 0x00EB for absolute): + + rel 0x000..0x018 DCP Header (24 bytes) + rel 0x018..0x01F extent "Hello, " (7 bytes) + rel 0x01F..0x025 extent "World!" (6 bytes, shared) + rel 0x025..0x052 Fragment Table A (2 entries) (45 bytes) + rel 0x052..0x06D Fragment Table B (1 entry) (27 bytes) + rel 0x06D..0x1D1 Inner Table Block (2 entries) (356 bytes) + arena_used = 0x1D1 (465) + + ---- File Header (offset 0x0000, 20 bytes) ----------------------- + 0000 89 4B 50 52 54 0D 0A 1A magic = 89 'K' 'P' 'R' 'T' 0D 0A 1A + 0008 01 00 version_major = 1 + 000A 00 00 version_minor = 0 + 000C 14 00 00 00 00 00 00 00 partition_table_offset = 20 + + ---- Top-level Table Block (offset 0x0014, header 74 bytes) ------ + 0014 01 partition_count = 1 + 0015 00 00 00 00 00 00 00 00 next_table_offset = 0 (end of chain) + 001D 10 table_hash_algo = 16 (SHA-256) + 001E 22 E5 05 13 61 0D A1 6E 02 A2 F7 C6 12 01 B5 04 table_hash (SHA-256 of this 74-byte header + 002E D8 19 FB CD 05 FF C0 5E 2F B2 3D 06 33 86 C9 53 with this field zeroed, plus the 1 entry) + 003E 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 004E 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Top-level Partition Entry 0 "dcp" (offset 0x005E, 141 bytes) - + 005E 01 00 AC AA type = 0xAAAC0001 (DCP_CONTAINER) + 0062 DC DC DC DC DC DC DC DC DC DC DC DC DC DC DC DC uid 
= 16 x 0xDC + 0072 64 63 70 00 00 00 00 00 00 00 00 00 00 00 00 00 label[0..16] = "dcp" then NUL padding + 0082 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label[16..32] = NUL padding + 0092 EB 00 00 00 00 00 00 00 start_offset = 235 (arena base) + 009A D1 01 00 00 00 00 00 00 max_length = 465 + 00A2 D1 01 00 00 00 00 00 00 used_bytes = 465 (= arena_used) + 00AA 00 data_hash_algo = 0 (none; unsealed) + 00AB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 data_hash = all zero (algo 0) + 00BB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + 00CB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + 00DB 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero) + + ==== DCP ARENA begins at 0x00EB (arena offset 0) ==== + + ---- DCP Header (arena 0x000 / file 0x00EB, 24 bytes) ------------ + 00EB 50 44 43 50 dcp_magic = "PDCP" + 00EF 01 profile_version_major = 1 + 00F0 01 profile_version_minor (encoded byte is 01; NOTE: profile 1.0 defines minor = 0, i.e. byte 00 -- see Section 14 and Appendix A; discrepancy to be reconciled) + 00F1 00 00 flags = 0 + 00F3 6D 00 00 00 00 00 00 00 inner_table_offset = 109 (arena-rel) + 00FB D1 01 00 00 00 00 00 00 arena_used = 465 (arena-rel) + + ---- Data extents (arena 0x018..0x025 / file 0x0103..0x0110) ----- + 0103 48 65 6C 6C 6F 2C 20 extent "Hello, " (7 bytes, A private) + 010A 57 6F 72 6C 64 21 extent "World!" (6 bytes, A+B shared) + (the two extents happen to be physically adjacent here, but each is + described by its own Fragment Entry; "World!" is referenced twice.)
+ + ---- Fragment Table A (arena 0x025 / file 0x0110, 45 bytes) ------ + 0110 00 00 00 00 00 00 00 00 next_fragtable_offset = 0 (last) + 0118 02 fragment_count = 2 + 0119 18 00 00 00 00 00 00 00 [0] extent_offset = 24 + 0121 07 00 00 00 00 00 00 00 [0] extent_length = 7 + 0129 01 [0] kind = 1 (DATA) + 012A 00 [0] flags = 0 (private) + 012B 1F 00 00 00 00 00 00 00 [1] extent_offset = 31 + 0133 06 00 00 00 00 00 00 00 [1] extent_length = 6 + 013B 01 [1] kind = 1 (DATA) + 013C 01 [1] flags = 1 (SHARED) + + ---- Fragment Table B (arena 0x052 / file 0x013D, 27 bytes) ------ + 013D 00 00 00 00 00 00 00 00 next_fragtable_offset = 0 (last) + 0145 01 fragment_count = 1 + 0146 1F 00 00 00 00 00 00 00 [0] extent_offset = 31 (same as A[1]) + 014E 06 00 00 00 00 00 00 00 [0] extent_length = 6 + 0156 01 [0] kind = 1 (DATA) + 0157 01 [0] flags = 1 (SHARED) + + ---- Inner Table Block (arena 0x06D / file 0x0158, header 74 B) -- + 0158 02 partition_count = 2 + 0159 00 00 00 00 00 00 00 00 next_table_offset = 0 (end of chain) + 0161 10 table_hash_algo = 16 (SHA-256) + 0162 BE 19 BB 14 7C 68 18 51 CA B7 01 C4 BF 9D 6C 62 table_hash (SHA-256 of this 74-byte header + 0172 82 C0 CB 53 4F 06 BA 97 07 AB EF 01 AD 47 22 1D with this field zeroed, plus the 2 entries) + 0182 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 0192 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Inner Partition Entry 0 "A" (arena 0x0B7 / file 0x01A2) ----- + 01A2 10 00 00 00 type = 0x00000010 + 01A6 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 uid = 16 x 0xA1 + 01B6 41 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label = "A" then NUL padding + 01C6 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label (cont.) 
+ 01D6 25 00 00 00 00 00 00 00 start_offset = 37 (arena-rel -> FragTable A) + 01DE 0D 00 00 00 00 00 00 00 max_length = 13 (= used_bytes) + 01E6 0D 00 00 00 00 00 00 00 used_bytes = 13 (logical content length) + 01EE 10 data_hash_algo = 16 (SHA-256) + 01EF DF FD 60 21 BB 2B D5 B0 AF 67 62 90 80 9E C3 A5 data_hash = SHA-256("Hello, World!") + 01FF 31 91 DD 81 C7 F7 0A 4B 28 68 8A 36 21 82 98 6F (significant bytes continue) + 020F 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 021F 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + ---- Inner Partition Entry 1 "B" (arena 0x144 / file 0x022F) ----- + 022F 10 00 00 00 type = 0x00000010 + 0233 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 uid = 16 x 0xB2 + 0243 42 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label = "B" then NUL padding + 0253 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 label (cont.) + 0263 52 00 00 00 00 00 00 00 start_offset = 82 (arena-rel -> FragTable B) + 026B 06 00 00 00 00 00 00 00 max_length = 6 (= used_bytes) + 0273 06 00 00 00 00 00 00 00 used_bytes = 6 + 027B 10 data_hash_algo = 16 (SHA-256) + 027C 51 4B 6B B7 C8 46 EC FB 8D 2D 29 EF 0B 5C 79 B6 data_hash = SHA-256("World!") + 028C 3E 6A E8 38 F1 23 DA 93 6F E8 27 FD A6 54 27 6C (significant bytes continue) + 029C 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + 02AC 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (zero padding) + + Plain hex dump of the complete 700-byte file (16 bytes per row): + + 0000 89 4B 50 52 54 0D 0A 1A 01 00 00 00 14 00 00 00 .KPRT........... + 0010 00 00 00 00 01 00 00 00 00 00 00 00 00 10 22 E5 ..............". + 0020 05 13 61 0D A1 6E 02 A2 F7 C6 12 01 B5 04 D8 19 ..a..n.......... + 0030 FB CD 05 FF C0 5E 2F B2 3D 06 33 86 C9 53 00 00 .....^/.=.3..S.. + 0040 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0050 00 00 00 00 00 00 00 00 00 00 00 00 00 00 01 00 ................ 
+ 0060 AC AA DC DC DC DC DC DC DC DC DC DC DC DC DC DC ................ + 0070 DC DC 64 63 70 00 00 00 00 00 00 00 00 00 00 00 ..dcp........... + 0080 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0090 00 00 EB 00 00 00 00 00 00 00 D1 01 00 00 00 00 ................ + 00A0 00 00 D1 01 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00B0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00D0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00E0 00 00 00 00 00 00 00 00 00 00 00 50 44 43 50 01 ...........PDCP. + 00F0 01 00 00 6D 00 00 00 00 00 00 00 D1 01 00 00 00 ...m............ + 0100 00 00 00 48 65 6C 6C 6F 2C 20 57 6F 72 6C 64 21 ...Hello, World! + 0110 00 00 00 00 00 00 00 00 02 18 00 00 00 00 00 00 ................ + 0120 00 07 00 00 00 00 00 00 00 01 00 1F 00 00 00 00 ................ + 0130 00 00 00 06 00 00 00 00 00 00 00 01 01 00 00 00 ................ + 0140 00 00 00 00 00 01 1F 00 00 00 00 00 00 00 06 00 ................ + 0150 00 00 00 00 00 00 01 01 02 00 00 00 00 00 00 00 ................ + 0160 00 10 BE 19 BB 14 7C 68 18 51 CA B7 01 C4 BF 9D ......|h.Q...... + 0170 6C 62 82 C0 CB 53 4F 06 BA 97 07 AB EF 01 AD 47 lb...SO........G + 0180 22 1D 00 00 00 00 00 00 00 00 00 00 00 00 00 00 "............... + 0190 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 01A0 00 00 10 00 00 00 A1 A1 A1 A1 A1 A1 A1 A1 A1 A1 ................ + 01B0 A1 A1 A1 A1 A1 A1 41 00 00 00 00 00 00 00 00 00 ......A......... + 01C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 01D0 00 00 00 00 00 00 25 00 00 00 00 00 00 00 0D 00 ......%......... + 01E0 00 00 00 00 00 00 0D 00 00 00 00 00 00 00 10 DF ................ + 01F0 FD 60 21 BB 2B D5 B0 AF 67 62 90 80 9E C3 A5 31 .`!.+...gb.....1 + 0200 91 DD 81 C7 F7 0A 4B 28 68 8A 36 21 82 98 6F 00 ......K(h.6!..o. 
+ 0210 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0220 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 ................ + 0230 00 00 00 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 B2 ................ + 0240 B2 B2 B2 42 00 00 00 00 00 00 00 00 00 00 00 00 ...B............ + 0250 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 0260 00 00 00 52 00 00 00 00 00 00 00 06 00 00 00 00 ...R............ + 0270 00 00 00 06 00 00 00 00 00 00 00 10 51 4B 6B B7 ............QKk. + 0280 C8 46 EC FB 8D 2D 29 EF 0B 5C 79 B6 3E 6A E8 38 .F...-)..\y.>j.8 + 0290 F1 23 DA 93 6F E8 27 FD A6 54 27 6C 00 00 00 00 .#..o.'..T'l.... + 02A0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 02B0 00 00 00 00 00 00 00 00 00 00 00 00 ............ + + Verification notes: + + - The file is a conforming PCF v1.0 file: a generic PCF reader sees one + partition ("dcp", type 0xAAAC0001, used_bytes 465, data_hash algo 0) + and the top-level table_hash verifies. + - Reconstructing inner "A" concatenates extent [24,31) "Hello, " and + extent [31,37) "World!" -> "Hello, World!", whose SHA-256 is + df fd 60 21 ... 98 6f, matching A.data_hash; length 13 = used_bytes. + - Reconstructing inner "B" reads extent [31,37) "World!" -> SHA-256 + 51 4b 6b b7 ... 27 6c, matching B.data_hash; length 6 = used_bytes. + These are the SAME arena bytes A references at logical offset 7: a + shared extent referenced by two entries -- A[1] and B[0] -- each with + SHARED = 1 (flags bytes at 0x013C and 0x0157). A's other extent + "Hello, " is private (flags = 0 at 0x012A). + - Promoting "B" would write "World!" contiguously, set the top-level + entry's start_offset/max_length, and keep uid (16 x 0xB2) and data_hash + (51 4b 6b b7 ...) byte-identical -- the promotion invariant. + + +------------------------------------------------------------------------------- +Appendix A. 
Field Layout Summary +------------------------------------------------------------------------------- + + DCP Header (24 bytes, at arena offset 0) + 0 4 bytes dcp_magic = "PDCP" (50 44 43 50) + 4 1 u8 profile_version_major = 1 + 5 1 u8 profile_version_minor = 0 + 6 2 u16 flags (reserved, = 0) + 8 8 u64 inner_table_offset (arena-rel; 0 = none) + 16 8 u64 arena_used (bump pointer, arena-rel) + + Inner Table Block (reused PCF Table Block; offsets arena-relative) + Table Block Header (74 B) partition_count, next_table_offset (arena-rel, + 0 = end), table_hash_algo_id, table_hash + Inner Partition Entry (141 B; PCF layout, two fields reinterpreted) + 0 4 u32 type (NOT 0, NOT 0xAAAC0001) + 4 16 bytes uid (unique file-wide, non-NIL) + 20 32 bytes label + 52 8 u64 start_offset (arena-rel -> Fragment Table) + 60 8 u64 max_length (= used_bytes) + 68 8 u64 used_bytes (logical content length) + 76 1 u8 data_hash_algo_id (crypto if to be signed) + 77 64 bytes data_hash (over logical content) + + Fragment Table Header (9 bytes; offsets arena-relative) + 0 8 u64 next_fragtable_offset (arena-rel; 0 = last block) + 8 1 u8 fragment_count (0..255 in this block) + + Fragment Entry (18 bytes, at block + 9 + i*18) + 0 8 u64 extent_offset (arena-rel) + 8 8 u64 extent_length + 16 1 u8 kind (1 = DATA; 0 invalid; + 2 HOLE / 3 REF reserved) + 17 1 u8 flags (bit0 = SHARED; others 0) + + Logical content = concatenation of DATA extents in fragment-table order. + data_hash and used_bytes describe that logical content (Sections 8.3, 9.1). + + Container facilities used unchanged from PCF + File Header (20 B) magic, version 1.0, partition_table_offset + Top-level Table Block lists the DCP container (type 0xAAAC0001) among + ordinary partitions + DCP container entry start_offset = arena base, used_bytes = + arena_used, data_hash_algo = 0 (unsealed) or + crypto (sealed, Section 9.4) + + +------------------------------------------------------------------------------- +Appendix B. 
Type and Constant Registry +------------------------------------------------------------------------------- + + PCF partition types used by PCF-DCP + 0xAAAC0001 DCP_CONTAINER (arena holding inner partitions) + 0xAAAC0000..0xAAAC00FF reserved by this profile for future types + + Magics + "PDCP" = 0x50 0x44 0x43 0x50 (DCP Header) + + Fragment Entry kind + 0 = INVALID (reserved guard) 1 = DATA + 2 = HOLE (reserved) 3 = REF (reserved) + + Fragment Entry flags + bit 0 = SHARED (no in-place overwrite; copy-on-write required) + bits 1..7 reserved (MUST be 0) + + Reinterpreted inner-entry fields (inside an arena only) + start_offset -> arena-relative offset of the Fragment Table + max_length -> equal to used_bytes (no contiguous reservation) + + Reserved arena value + arena-relative offset 0 = chain terminator / "none" + + Structure sizes + DCP Header 24 bytes + Fragment Table Header 9 bytes + Fragment Entry 18 bytes + (reused) Table Block Header 74 bytes (PCF) + (reused) Partition Entry 141 bytes (PCF) + + Limits + Entries per (inner) Table Block <= 255 (PCF u8 partition_count) + Inner partitions per container unbounded (inner table chain) + Extents per Fragment Table block <= 255 (u8 fragment_count) + Extents per inner partition unbounded (Fragment Table chain) + Nesting inner partitions are leaves (no inner + DCP_CONTAINER in v1.0) + + Hash algorithms as PCF Section 8.1 (SHA-256 = 16 default/RECOMMENDED + for inner data_hash and inner table_hash) + + Profile version major 1, minor 0 (PCF container version: 1.0) + +=============================================================================== + End of PCF-DCP Specification v1.0 +=============================================================================== From 38546328def0c59d5d50af31fe4c95b61c6abc4d Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 7 Jun 2026 15:55:20 +0000 Subject: [PATCH 2/3] Add pcf-dcp reference implementation (PCF-DCP v1.0 profile) Implement the reference reader/writer for the PCF-DCP profile 
(Dynamic Container Partition), layered strictly above PCF v1.0 exactly as pcf-sig and pfs-ms are. A DCP container is one opaque PCF partition whose bytes are an arena: a 24-byte DCP Header, a chain of reused PCF Table Blocks listing inner partitions, a Fragment Table per inner partition, and the data extents those fragments name. Inner content is the concatenation of DATA extents, and each inner data_hash covers that logical content, so fragmentation, deduplication, compaction, and promotion all leave the hash (and any PCF-SIG signature over it) unchanged. New crate reference/PCF-DCP-v1.0 (pcf-dcp): - arena: in-memory model with a byte pool plus fragment lists; content- defined deduplication (intra- and cross-partition), copy-on-write edits (append/insert/overwrite/delete/truncate via fragment splitting), mark-and-sweep compaction that normalises the SHARED flag, and a canonical serialiser that reproduces the spec's Section 17 layout. - reader: DcpReader over pcf::Container, so trailer-mode host files read transparently; full DCP-aware verify (PCF integrity, inner table_hash, reconstruction length + data_hash, no nesting, file-wide uid uniqueness). - writer: whole-file model emitting a fresh canonical PCF image; promotion (dynamic->fixed, a MOVE preserving uid + data_hash), demotion, dedup, defrag, and optional trailer-mode finalisation. - vector + example + dcp CLI (info/dedup/defrag/promote/demote, --trailer); every mutating command re-verifies. - tests: byte-exact 700-byte Section 17 vector, spec conformance, round-trips, and error paths (34 tests). testdata/canonical.bin committed. pcf-debug: add a DcpContainerDecoder plugin (renders DCP Header, inner table, fragment tables with SHARED flags, extent summary) and a decode_dcp test, mirroring the PCF-SIG decoder. pcf-debug now depends on pcf-dcp. 
Wiring: add the crate to the workspace; add a dedicated pcf-dcp CI job (fmt/clippy/build/test + 700-byte vector assertion); publish pcf-dcp before pcf-debug in release.yml; bump/pin pcf-dcp in release-prepare.yml. Spec: add Section 2.2 "Compatibility with the PCF File Trailer". Fix the Section 17 hex dump's profile_version_minor byte at file offset 0x00F0 (01 -> 00): the field is semantically 0 for v1.0 (matching Section 6, the field label, and the const), and no hash covers it, so the file is still 700 bytes and all hashes verify. The reference generator now reproduces the corrected dump byte-for-byte. https://claude.ai/code/session_01XzcjWWbNiuNX9ZywevfbQu --- .github/workflows/ci.yml | 27 + .github/workflows/release-prepare.yml | 3 + .github/workflows/release.yml | 13 + Cargo.toml | 1 + reference/PCF-DCP-v1.0/Cargo.toml | 25 + reference/PCF-DCP-v1.0/README.md | 120 +++ .../PCF-DCP-v1.0/examples/gen_testvector.rs | 57 ++ reference/PCF-DCP-v1.0/src/arena.rs | 881 ++++++++++++++++++ reference/PCF-DCP-v1.0/src/bin/dcp.rs | 269 ++++++ reference/PCF-DCP-v1.0/src/consts.rs | 59 ++ reference/PCF-DCP-v1.0/src/error.rs | 101 ++ reference/PCF-DCP-v1.0/src/fragment.rs | 166 ++++ reference/PCF-DCP-v1.0/src/header.rs | 83 ++ reference/PCF-DCP-v1.0/src/lib.rs | 69 ++ reference/PCF-DCP-v1.0/src/reader.rs | 222 +++++ reference/PCF-DCP-v1.0/src/vector.rs | 43 + reference/PCF-DCP-v1.0/src/writer.rs | 255 +++++ reference/PCF-DCP-v1.0/testdata/canonical.bin | Bin 0 -> 700 bytes reference/PCF-DCP-v1.0/tests/coverage.rs | 228 +++++ reference/PCF-DCP-v1.0/tests/roundtrip.rs | 258 +++++ .../PCF-DCP-v1.0/tests/spec_compliance.rs | 190 ++++ specs/PCF-DCP-spec-v1.0.txt | 37 +- tools/pcf-debug/Cargo.toml | 1 + tools/pcf-debug/src/plugin/dcp.rs | 397 ++++++++ tools/pcf-debug/src/plugin/mod.rs | 3 + tools/pcf-debug/tests/decode_dcp.rs | 125 +++ 26 files changed, 3631 insertions(+), 2 deletions(-) create mode 100644 reference/PCF-DCP-v1.0/Cargo.toml create mode 100644 
reference/PCF-DCP-v1.0/README.md create mode 100644 reference/PCF-DCP-v1.0/examples/gen_testvector.rs create mode 100644 reference/PCF-DCP-v1.0/src/arena.rs create mode 100644 reference/PCF-DCP-v1.0/src/bin/dcp.rs create mode 100644 reference/PCF-DCP-v1.0/src/consts.rs create mode 100644 reference/PCF-DCP-v1.0/src/error.rs create mode 100644 reference/PCF-DCP-v1.0/src/fragment.rs create mode 100644 reference/PCF-DCP-v1.0/src/header.rs create mode 100644 reference/PCF-DCP-v1.0/src/lib.rs create mode 100644 reference/PCF-DCP-v1.0/src/reader.rs create mode 100644 reference/PCF-DCP-v1.0/src/vector.rs create mode 100644 reference/PCF-DCP-v1.0/src/writer.rs create mode 100644 reference/PCF-DCP-v1.0/testdata/canonical.bin create mode 100644 reference/PCF-DCP-v1.0/tests/coverage.rs create mode 100644 reference/PCF-DCP-v1.0/tests/roundtrip.rs create mode 100644 reference/PCF-DCP-v1.0/tests/spec_compliance.rs create mode 100644 tools/pcf-debug/src/plugin/dcp.rs create mode 100644 tools/pcf-debug/tests/decode_dcp.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 835948d..04b889f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -135,3 +135,30 @@ jobs: - run: cargo clippy -p pcf-compact --all-targets -- -D warnings - run: cargo build -p pcf-compact --verbose - run: cargo test -p pcf-compact --verbose + + pcf-dcp: + name: pcf-dcp profile + runs-on: ubuntu-latest + defaults: + run: + working-directory: . 
+ steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + with: + components: rustfmt, clippy + - uses: Swatinem/rust-cache@v2 + - run: cargo fmt -p pcf-dcp -- --check + - run: cargo clippy -p pcf-dcp --all-targets -- -D warnings + - run: cargo build -p pcf-dcp --verbose + - run: cargo test -p pcf-dcp --verbose + - name: Regenerate the spec test vector + run: cargo run -p pcf-dcp --example gen_testvector -- pcf_dcp_testvector.bin + - name: Inspect generated test vector (spec Section 17 is 700 bytes) + run: | + ls -l pcf_dcp_testvector.bin + test "$(wc -c < pcf_dcp_testvector.bin)" = "700" + - uses: actions/upload-artifact@v4 + with: + name: pcf-dcp-testvector + path: pcf_dcp_testvector.bin diff --git a/.github/workflows/release-prepare.yml b/.github/workflows/release-prepare.yml index a9e9013..95bfe21 100644 --- a/.github/workflows/release-prepare.yml +++ b/.github/workflows/release-prepare.yml @@ -76,13 +76,16 @@ jobs: sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PFS-MS-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-SIG-v1.0/Cargo.toml + sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' reference/PCF-DCP-v1.0/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' tools/pcf-debug/Cargo.toml sed -i 's/^version = "[^"]*"/version = "'"$NEW"'"/' tools/pcf-compact/Cargo.toml # path-dep version pins on pcf sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PFS-MS-v1.0/Cargo.toml sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PCF-SIG-v1.0/Cargo.toml + sed -i 's|pcf = { path = "\.\./PCF-v1.0", version = "[^"]*" }|pcf = { path = "../PCF-v1.0", version = "'"$NEW"'" }|' reference/PCF-DCP-v1.0/Cargo.toml sed -i 's|pcf = { path = "\.\./\.\./reference/PCF-v1.0", 
version = "[^"]*" }|pcf = { path = "../../reference/PCF-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml sed -i 's|pcf-sig = { path = "\.\./\.\./reference/PCF-SIG-v1.0", version = "[^"]*" }|pcf-sig = { path = "../../reference/PCF-SIG-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml + sed -i 's|pcf-dcp = { path = "\.\./\.\./reference/PCF-DCP-v1.0", version = "[^"]*" }|pcf-dcp = { path = "../../reference/PCF-DCP-v1.0", version = "'"$NEW"'" }|' tools/pcf-debug/Cargo.toml sed -i 's|pcf = { path = "\.\./\.\./reference/PCF-v1.0", version = "[^"]*" }|pcf = { path = "../../reference/PCF-v1.0", version = "'"$NEW"'" }|' tools/pcf-compact/Cargo.toml - name: Bump TypeScript packages diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 90c9928..be1a07c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -132,6 +132,19 @@ jobs: if: needs.resolve.outputs.dry_run != 'true' run: sleep 45 + - name: cargo publish pcf-dcp + shell: bash + run: | + if [ "${{ needs.resolve.outputs.dry_run }}" = "true" ]; then + cargo publish -p pcf-dcp --allow-dirty --dry-run + else + cargo publish -p pcf-dcp --allow-dirty --token "${{ steps.cargo-auth.outputs.token }}" + fi + + - name: Wait for crates.io index + if: needs.resolve.outputs.dry_run != 'true' + run: sleep 45 + - name: cargo publish pcf-debug shell: bash run: | diff --git a/Cargo.toml b/Cargo.toml index 8f2f4c7..6bd8fea 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ members = [ "reference/PCF-v1.0", "reference/PFS-MS-v1.0", "reference/PCF-SIG-v1.0", + "reference/PCF-DCP-v1.0", "tools/pcf-debug", "tools/pcf-compact", ] diff --git a/reference/PCF-DCP-v1.0/Cargo.toml b/reference/PCF-DCP-v1.0/Cargo.toml new file mode 100644 index 0000000..6034449 --- /dev/null +++ b/reference/PCF-DCP-v1.0/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "pcf-dcp" +version = "0.0.8" +edition = "2021" +description = "Reference implementation of PCF-DCP v1.0, the PCF Dynamic 
Container Partition profile" +license = "MIT OR Apache-2.0" +repository = "https://github.com/kduma-OSS/Partitioned-Container-Format" +homepage = "https://github.com/kduma-OSS/Partitioned-Container-Format" +readme = "README.md" +keywords = ["pcf", "dcp", "container", "deduplication", "fragmentation"] +categories = ["encoding", "filesystem"] + +# This crate is a *reference* implementation of the PCF-DCP profile. Like the +# `pcf` crate it builds on, it favours a direct, auditable mapping onto the +# written specification (`specs/PCF-DCP-spec-v1.0.txt`) over raw performance. + +[[bin]] +name = "dcp" +path = "src/bin/dcp.rs" + +[dependencies] +# The PCF-DCP profile is layered strictly above PCF v1.0; every byte container +# operation goes through the reference PCF crate. The arena reuses PCF's Table +# Block, Partition Entry, and table-hash primitives directly. +pcf = { path = "../PCF-v1.0", version = "0.0.8" } diff --git a/reference/PCF-DCP-v1.0/README.md b/reference/PCF-DCP-v1.0/README.md new file mode 100644 index 0000000..0fab1d8 --- /dev/null +++ b/reference/PCF-DCP-v1.0/README.md @@ -0,0 +1,120 @@ +# pcf-dcp — PCF Dynamic Container Partition (reference implementation) + +Reference reader/writer for **PCF-DCP v1.0**, an application-level profile that +adds *dynamic*, fragmentable, dedup-friendly sub-partitions to the +[Partitioned Container Format](../PCF-v1.0) without modifying the PCF byte +container. + +This crate mirrors the written specification (`specs/PCF-DCP-spec-v1.0.txt`) +field-for-field and is intended as the *normative* implementation against which +language ports are checked. It favours auditability over performance. 
+ +## Model at a glance + +PCF-DCP defines one new PCF partition type: + +| Type | Name | Holds | +|--------------|-----------------|----------------------------------------------------| +| `0xAAAC0001` | `DCP_CONTAINER` | An *arena*: a header, an inner partition table, fragment tables, and data extents | + +A DCP container's bytes are an **arena** addressed by arena-relative offsets: + +``` +arena: +[ DCP Header (24 B) | data extents | Fragment Tables | Inner Table Block(s) ] +``` + +* **DCP Header** — `"PDCP"` magic, profile version, `inner_table_offset`, + `arena_used` (a bump pointer). +* **Inner Table Block** — a chain of reused PCF Table Blocks (74 B header + + 141 B entries), byte-for-byte identical to the top-level table, listing the + *inner* partitions. Two entry fields are reinterpreted: `start_offset` points + at the partition's Fragment Table, and `max_length` equals `used_bytes`. +* **Fragment Table** — per inner partition, a chain of 9-byte block headers each + followed by 18-byte **Fragment Entries**. Each entry names one extent + `(offset, length, kind, flags)`. The logical content of an inner partition is + the concatenation of its DATA extents. + +A generic PCF reader sees a DCP file as **one opaque partition**; only a +DCP-aware reader looks inside. A DCP file is always a conforming PCF v1.0 file. + +## Why a profile + +PCF stores each partition as a contiguous, statically-reserved region. PCF-DCP +makes each *inner* partition grow, shrink, and be edited in the middle without +relocating its neighbours, by describing it as a list of extents rather than one +range. This buys: + +* **Fragmentation / random edits** — append, insert, overwrite, delete, and + truncate are edits of the Fragment Table (copy-on-write for shared bytes); no + data is moved. +* **Deduplication** — two extents may name the same arena bytes; identical + chunks are stored once. The per-extent `SHARED` flag makes safe in-place + editing explicit. 
+* **Hash / signature stability** — an inner partition's `data_hash` covers its + *logical content*, so fragmentation, dedup, compaction, and promotion all + leave the hash (and any PCF-SIG signature over it) unchanged. + +## Library example + +```rust +use std::io::Cursor; +use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, HashAlgo}; + +let mut arena = Arena::new(); +arena.add_inner(0x10, [0xA1; 16], "A", b"Hello, World!", HashAlgo::Sha256, Chunker::Fixed(7))?; +arena.add_inner(0x10, [0xB2; 16], "B", b"World!", HashAlgo::Sha256, Chunker::Whole)?; + +let mut w = DcpWriter::new(); +w.add_container([0xDC; 16], "dcp", arena)?; +let image = w.to_image()?; + +let mut r = DcpReader::open(Cursor::new(image))?; +r.verify()?; +assert_eq!(r.read_inner(&[0xB2; 16])?, b"World!"); +# Ok::<(), pcf_dcp::Error>(()) +``` + +## Promotion / demotion + +`DcpWriter::promote` moves an inner partition out to a top-level PCF partition +(dynamic → fixed); `demote` moves a top-level partition into a container +(fixed → dynamic). Both preserve `uid`, `partition_type`, `label`, `used_bytes`, +`data_hash_algo_id`, and `data_hash` — the six-field **promotion invariant**, identical to +the set of fields a PCF-SIG signature protects. + +## Command-line tool + +The `dcp` binary inspects and rewrites DCP files; every mutating command +re-verifies before writing: + +``` +dcp info +dcp dedup [--fixed N] [--trailer] +dcp defrag [--trailer] +dcp promote [--trailer] +dcp demote [--trailer] +``` + +UIDs are 32 hex digits, or `0xNN` for a uid of 16 identical bytes (e.g. `0xDC`). + +## Build & test + +``` +cargo test -p pcf-dcp +cargo run -p pcf-dcp --example gen_testvector -- /tmp/dcp.bin # the 700-byte vector +cargo run -p pcf-dcp --bin dcp -- info /tmp/dcp.bin +``` + +The example reproduces the byte-exact 700-byte test vector from Section 17 of +the specification.
+ +## Relationship to `pcf` + +This crate is layered strictly above [`pcf`](../PCF-v1.0): every container byte +operation goes through the reference PCF crate, and the arena reuses PCF's Table +Block, Partition Entry, and table-hash primitives directly. + +## Licence + +MIT OR Apache-2.0. diff --git a/reference/PCF-DCP-v1.0/examples/gen_testvector.rs b/reference/PCF-DCP-v1.0/examples/gen_testvector.rs new file mode 100644 index 0000000..dd585ce --- /dev/null +++ b/reference/PCF-DCP-v1.0/examples/gen_testvector.rs @@ -0,0 +1,57 @@ +//! Generates the canonical PCF-DCP v1.0 test-vector file used in spec +//! Section 17. +//! +//! Run with: `cargo run --example gen_testvector -- [path]` +//! (defaults to ./pcf_dcp_testvector.bin). Everything is fixed and +//! deterministic so that ports can reproduce the file byte-for-byte. + +use std::io::Cursor; + +use pcf::Container; +use pcf_dcp::{build_reference_vector, DcpReader}; + +fn main() { + let path = std::env::args() + .nth(1) + .unwrap_or_else(|| "pcf_dcp_testvector.bin".to_string()); + + let image = build_reference_vector().expect("build reference vector"); + std::fs::write(&path, &image).expect("write file"); + + // It is a conforming PCF v1.0 file ... + let mut pcf = Container::open(Cursor::new(image.clone())).expect("pcf open"); + pcf.verify().expect("pcf verify"); + + // ... and a conforming DCP file.
+ let mut dcp = DcpReader::open(Cursor::new(image.clone())).expect("dcp open"); + dcp.verify().expect("dcp verify"); + + eprintln!("wrote {} ({} bytes)", path, image.len()); + for c in dcp.containers().expect("containers") { + let arena = dcp.open_arena(&c).expect("arena"); + eprintln!( + " container {:<6} type=0x{:08X} used={} inners={}", + c.label_string().unwrap_or_default(), + c.partition_type, + c.used_bytes, + arena.len() + ); + for info in arena.inners() { + let n = info.data_hash_algo.digest_len(); + let hex: String = info.data_hash[..n] + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + let shared = info.extents.iter().filter(|e| e.shared).count(); + eprintln!( + " inner {:<3} type=0x{:08X} used={} extents={} shared={} data_hash={}", + info.label, + info.partition_type, + info.used_bytes, + info.extents.len(), + shared, + hex + ); + } + } +} diff --git a/reference/PCF-DCP-v1.0/src/arena.rs b/reference/PCF-DCP-v1.0/src/arena.rs new file mode 100644 index 0000000..9a2e232 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/arena.rs @@ -0,0 +1,881 @@ +//! The DCP arena: the in-memory model of one DCP container and its canonical +//! byte serialisation. +//! +//! An [`Arena`] holds a byte pool (`blob`) plus a list of inner partitions, +//! each of which owns a list of [`Frag`]s. A `Frag` addresses a byte range in +//! the pool; two `Frag`s addressing the *same* range share that extent +//! (deduplication, spec Section 10.2). Editing operations (append, overwrite, +//! insert, delete, truncate) work purely on the fragment list and append new +//! bytes to the pool, never overwriting bytes a `SHARED` extent still names +//! (copy-on-write, spec Section 10.1). +//! +//! [`Arena::to_bytes`] always emits the *canonical* layout used by the spec's +//! test vector (Section 17): `DCP Header || data extents || Fragment Tables || +//! Inner Table Block(s)`, with each distinct extent emitted exactly once.
+
+use std::collections::HashMap;
+
+use pcf::{
+    compute_table_hash, decode_label, encode_label, HashAlgo, PartitionEntry, TableBlockHeader,
+    ENTRY_SIZE, NIL_UID, TABLE_HEADER_SIZE, UID_SIZE,
+};
+
+use crate::consts::*;
+use crate::error::{Error, Result};
+use crate::fragment::{walk_fragment_table, FragTableHeader, FragmentEntry};
+use crate::header::{read_header, DcpHeader};
+
+/// Writer-side policy for cutting an inner partition's content into extents
+/// (spec Section 10.2).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum Chunker {
+    /// The whole content becomes a single extent.
+    Whole,
+    /// Consecutive chunks of `n` bytes; the last chunk may be shorter. `n == 0`
+    /// behaves like [`Chunker::Whole`].
+    Fixed(usize),
+}
+
+impl Chunker {
+    /// Cut `content` into borrowed chunks according to this policy. Empty
+    /// content yields no chunks under either policy.
+    fn split<'a>(&self, content: &'a [u8]) -> Vec<&'a [u8]> {
+        // `None` means "one chunk for everything"; `Some(n)` means n-byte chunks.
+        let chunk_len = match *self {
+            Chunker::Whole | Chunker::Fixed(0) => None,
+            Chunker::Fixed(n) => Some(n),
+        };
+        match chunk_len {
+            Some(n) => content.chunks(n).collect(),
+            None if content.is_empty() => Vec::new(),
+            None => vec![content],
+        }
+    }
+}
+
+/// A single extent as referenced from an inner partition's fragment list.
+/// `offset` and `length` index into [`Arena::blob`]; `shared` mirrors the
+/// on-disk SHARED flag (bit 0 of `flags`).
+#[derive(Debug, Clone, Copy)]
+struct Frag {
+    offset: u64,
+    length: u64,
+    kind: u8,
+    shared: bool,
+}
+
+/// One inner partition.
+#[derive(Debug, Clone)]
+struct Inner {
+    partition_type: u32,
+    uid: [u8; UID_SIZE],
+    // Raw 32-byte label field, kept exactly as stored.
+    label: [u8; 32],
+    data_hash_algo: HashAlgo,
+    // Extent references in logical order.
+    frags: Vec<Frag>,
+}
+
+impl Inner {
+    /// Logical content length: the summed length of the DATA fragments.
+    fn logical_len(&self) -> u64 {
+        self.frags
+            .iter()
+            .filter(|f| f.kind == KIND_DATA)
+            .map(|f| f.length)
+            .sum()
+    }
+
+    /// Reconstruct the logical content by concatenating the DATA fragments in
+    /// fragment-list order; fragments of any other kind are skipped.
+    ///
+    /// # Panics
+    ///
+    /// Panics if a DATA fragment addresses bytes outside `blob`.
+    fn content(&self, blob: &[u8]) -> Vec<u8> {
+        let mut out = Vec::with_capacity(self.logical_len() as usize);
+        for f in &self.frags {
+            if f.kind == KIND_DATA {
+                let (a, b) = (f.offset as usize, (f.offset + f.length) as usize);
+                out.extend_from_slice(&blob[a..b]);
+            }
+        }
+        out
+    }
+
+    /// Hash of the reconstructed logical content under `data_hash_algo`.
+    fn data_hash(&self, blob: &[u8]) -> [u8; 64] {
+        self.data_hash_algo.compute(&self.content(blob))
+    }
+}
+
+/// A read-only view of one extent, for tooling (`dcp info`, tests).
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub struct ExtentInfo {
+    /// Arena/pool-relative offset of the extent.
+    pub extent_offset: u64,
+    /// Length of the extent in bytes.
+    pub extent_length: u64,
+    /// Extent kind (`1` = DATA).
+    pub kind: u8,
+    /// Whether the SHARED flag is set.
+    pub shared: bool,
+}
+
+/// A read-only view of one inner partition, for tooling and verification.
+#[derive(Debug, Clone)]
+pub struct InnerInfo {
+    /// Application partition type.
+    pub partition_type: u32,
+    /// 16-byte uid (unique file-wide).
+    pub uid: [u8; UID_SIZE],
+    /// Decoded label.
+    pub label: String,
+    /// Logical content length (= `used_bytes`).
+    pub used_bytes: u64,
+    /// Hash algorithm protecting the logical content.
+    pub data_hash_algo: HashAlgo,
+    /// The 64-byte data-hash field over the logical content.
+    pub data_hash: [u8; 64],
+    /// The partition's extents in logical order.
+    pub extents: Vec<ExtentInfo>,
+}
+
+/// The in-memory model of one DCP container.
+#[derive(Debug, Clone)]
+pub struct Arena {
+    profile_version_major: u8,
+    profile_version_minor: u8,
+    flags: u16,
+    // Hash algorithm used for inner Table Blocks.
+    inner_table_algo: HashAlgo,
+    // Byte pool addressed by every `Frag`.
+    blob: Vec<u8>,
+    // Inner partitions in stored order.
+    inners: Vec<Inner>,
+}
+
+impl Default for Arena {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl Arena {
+    // ---- construction -----------------------------------------------------
+
+    /// A fresh, empty arena (profile v1.0, SHA-256 inner table hashing).
+    pub fn new() -> Self {
+        Arena {
+            profile_version_major: PROFILE_VERSION_MAJOR,
+            profile_version_minor: PROFILE_VERSION_MINOR,
+            flags: 0,
+            inner_table_algo: HashAlgo::Sha256,
+            blob: Vec::new(),
+            inners: Vec::new(),
+        }
+    }
+
+    /// Choose the hash algorithm used for inner Table Blocks (default
+    /// SHA-256). A Writer SHOULD keep this cryptographic (spec Section 9.2).
+    pub fn with_inner_table_algo(mut self, algo: HashAlgo) -> Self {
+        self.inner_table_algo = algo;
+        self
+    }
+
+    /// Parse an arena from its on-disk bytes (spec Sections 6–8). The byte
+    /// pool is the arena itself, so every parsed extent offset is
+    /// arena-relative and indexes directly into it.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::UnsupportedProfileMajor`] for a foreign major version
+    /// and [`Error::OffsetOutOfRange`] when a table block, entry, or extent
+    /// falls outside the arena (or the table chain does not terminate).
+    /// Errors from the header, entry, and fragment-table decoders are
+    /// propagated unchanged.
+    pub fn parse(bytes: &[u8]) -> Result<Self> {
+        let header = read_header(bytes)?;
+        if header.profile_version_major != PROFILE_VERSION_MAJOR {
+            return Err(Error::UnsupportedProfileMajor(header.profile_version_major));
+        }
+        // An extent must fit both the declared `arena_used` and the bytes we
+        // actually hold; otherwise reconstructing content would later index
+        // past the end of the pool.
+        let extent_limit = header.arena_used.min(bytes.len() as u64);
+
+        let mut inners: Vec<Inner> = Vec::new();
+        let mut inner_table_algo = HashAlgo::Sha256;
+        let mut first_block = true;
+        let mut off = header.inner_table_offset;
+        // Each block needs at least a header's worth of bytes, so a chain
+        // longer than this must revisit a block: treat it as a cycle.
+        let mut budget = bytes.len() / TABLE_HEADER_SIZE as usize + 1;
+        while off != ARENA_NONE {
+            if budget == 0 {
+                return Err(Error::OffsetOutOfRange);
+            }
+            budget -= 1;
+            // `off` comes from the file: convert and add with overflow checks.
+            let base = usize::try_from(off).map_err(|_| Error::OffsetOutOfRange)?;
+            let hb: [u8; 74] = base
+                .checked_add(TABLE_HEADER_SIZE as usize)
+                .and_then(|end| bytes.get(base..end))
+                .ok_or(Error::OffsetOutOfRange)?
+                .try_into()
+                .expect("slice is exactly TABLE_HEADER_SIZE bytes");
+            let h = TableBlockHeader::from_bytes(&hb)?;
+            if first_block {
+                inner_table_algo = h.table_hash_algo;
+                first_block = false;
+            }
+            for i in 0..h.partition_count as u64 {
+                // Cannot overflow: the header at `base` lies inside `bytes`
+                // and `i` is bounded by the u8 partition count.
+                let eo = base + TABLE_HEADER_SIZE as usize + (i * ENTRY_SIZE) as usize;
+                let eb: [u8; 141] = bytes
+                    .get(eo..eo + ENTRY_SIZE as usize)
+                    .ok_or(Error::OffsetOutOfRange)?
+                    .try_into()
+                    .expect("slice is exactly ENTRY_SIZE bytes");
+                let entry = PartitionEntry::from_bytes(&eb)?;
+                let on_disk = walk_fragment_table(bytes, entry.start_offset)?;
+                let frags = on_disk
+                    .iter()
+                    .map(|fe| Frag {
+                        offset: fe.extent_offset,
+                        length: fe.extent_length,
+                        kind: fe.kind,
+                        shared: fe.is_shared(),
+                    })
+                    .collect();
+                inners.push(Inner {
+                    partition_type: entry.partition_type,
+                    uid: entry.uid,
+                    label: entry.label,
+                    data_hash_algo: entry.data_hash_algo,
+                    frags,
+                });
+            }
+            off = h.next_table_offset;
+        }
+
+        // Bound every extent before copying the pool.
+        for f in inners.iter().flat_map(|inner| inner.frags.iter()) {
+            let end = f
+                .offset
+                .checked_add(f.length)
+                .ok_or(Error::OffsetOutOfRange)?;
+            if end > extent_limit {
+                return Err(Error::OffsetOutOfRange);
+            }
+        }
+
+        Ok(Arena {
+            profile_version_major: header.profile_version_major,
+            profile_version_minor: header.profile_version_minor,
+            flags: header.flags,
+            inner_table_algo,
+            blob: bytes.to_vec(),
+            inners,
+        })
+    }
+
+    // ---- read-only views --------------------------------------------------
+
+    /// Number of inner partitions.
+    pub fn len(&self) -> usize {
+        self.inners.len()
+    }
+
+    /// Whether the arena has no inner partitions.
+    pub fn is_empty(&self) -> bool {
+        self.inners.is_empty()
+    }
+
+    /// The uids of all inner partitions, in stored order.
+    pub fn uids(&self) -> Vec<[u8; UID_SIZE]> {
+        self.inners.iter().map(|i| i.uid).collect()
+    }
+
+    /// Position of the inner partition with this uid, or [`Error::NotFound`].
+    fn index_of(&self, uid: &[u8; UID_SIZE]) -> Result<usize> {
+        self.inners
+            .iter()
+            .position(|i| &i.uid == uid)
+            .ok_or(Error::NotFound)
+    }
+
+    /// A read-only view of one inner partition.
+    pub fn inner_info(&self, uid: &[u8; UID_SIZE]) -> Result<InnerInfo> {
+        let inner = &self.inners[self.index_of(uid)?];
+        Ok(self.view(inner))
+    }
+
+    /// Read-only views of every inner partition, in stored order.
+    pub fn inners(&self) -> Vec<InnerInfo> {
+        self.inners.iter().map(|i| self.view(i)).collect()
+    }
+
+    /// Build the public view of `inner`. `used_bytes` and `data_hash` are
+    /// computed from the fragment list and pool; a label that fails to decode
+    /// is shown as an empty string.
+    fn view(&self, inner: &Inner) -> InnerInfo {
+        InnerInfo {
+            partition_type: inner.partition_type,
+            uid: inner.uid,
+            label: decode_label(&inner.label).unwrap_or_default(),
+            used_bytes: inner.logical_len(),
+            data_hash_algo: inner.data_hash_algo,
+            data_hash: inner.data_hash(&self.blob),
+            extents: inner
+                .frags
+                .iter()
+                .map(|f| ExtentInfo {
+                    extent_offset: f.offset,
+                    extent_length: f.length,
+                    kind: f.kind,
+                    shared: f.shared,
+                })
+                .collect(),
+        }
+    }
+
+    /// Reconstruct an inner partition's logical content (spec Section 8.3).
+    ///
+    /// The result is checked against the length derived from the fragment
+    /// list. No data-hash comparison happens here: this model does not keep
+    /// the on-disk `used_bytes` or `data_hash` of a parsed entry, so a caller
+    /// that needs that check must compare against the stored entry itself.
+    ///
+    /// # Errors
+    ///
+    /// [`Error::NotFound`] for an unknown uid; [`Error::LengthMismatch`] if
+    /// the reconstructed length disagrees with the fragment-derived length.
+    pub fn content(&self, uid: &[u8; UID_SIZE]) -> Result<Vec<u8>> {
+        let inner = &self.inners[self.index_of(uid)?];
+        let bytes = inner.content(&self.blob);
+        let declared = inner.logical_len();
+        if bytes.len() as u64 != declared {
+            return Err(Error::LengthMismatch {
+                expected: declared,
+                got: bytes.len() as u64,
+            });
+        }
+        Ok(bytes)
+    }
+
+    // ---- builder ----------------------------------------------------------
+
+    /// Add an inner partition whose `content` is split by `chunker` into
+    /// extents, deduplicating against extents already present (spec Section
+    /// 10.2). Sharing sets the SHARED flag on the new and aliased entries
+    /// (rule F1, spec Section 8.4).
+    ///
+    /// # Errors
+    ///
+    /// [`Error::ReservedType`] for type 0, [`Error::NestedContainer`] for the
+    /// DCP container type, [`Error::NilUid`] for the NIL uid,
+    /// [`Error::DuplicateUid`] if the uid is already present, and
+    /// [`Error::Pcf`] if the label cannot be encoded. All checks run before
+    /// the arena is modified.
+    #[allow(clippy::too_many_arguments)]
+    pub fn add_inner(
+        &mut self,
+        partition_type: u32,
+        uid: [u8; UID_SIZE],
+        label: &str,
+        content: &[u8],
+        data_hash_algo: HashAlgo,
+        chunker: Chunker,
+    ) -> Result<()> {
+        if partition_type == 0 {
+            return Err(Error::ReservedType);
+        }
+        if partition_type == DCP_CONTAINER_TYPE {
+            return Err(Error::NestedContainer);
+        }
+        if uid == NIL_UID {
+            return Err(Error::NilUid);
+        }
+        if self.inners.iter().any(|i| i.uid == uid) {
+            return Err(Error::DuplicateUid);
+        }
+        let label = encode_label(label).map_err(Error::Pcf)?;
+
+        let mut frags: Vec<Frag> = Vec::new();
+        for chunk in chunker.split(content) {
+            // Deduplicate against extents already present in other inner
+            // partitions AND against earlier chunks of this same partition.
+            let hit = self
+                .find_extent(chunk)
+                .or_else(|| find_local(&self.blob, &frags, chunk));
+            match hit {
+                Some((offset, length)) => {
+                    // Flag every existing reference, both in other partitions
+                    // and in the list being built, then add the new alias.
+                    self.mark_shared(offset, length);
+                    for f in &mut frags {
+                        if f.offset == offset && f.length == length {
+                            f.shared = true;
+                        }
+                    }
+                    frags.push(Frag {
+                        offset,
+                        length,
+                        kind: KIND_DATA,
+                        shared: true,
+                    });
+                }
+                None => {
+                    // First occurrence: store the bytes at the end of the pool
+                    // as a private extent.
+                    let offset = self.blob.len() as u64;
+                    self.blob.extend_from_slice(chunk);
+                    frags.push(Frag {
+                        offset,
+                        length: chunk.len() as u64,
+                        kind: KIND_DATA,
+                        shared: false,
+                    });
+                }
+            }
+        }
+        self.inners.push(Inner {
+            partition_type,
+            uid,
+            label,
+            data_hash_algo,
+            frags,
+        });
+        Ok(())
+    }
+
+    /// Find an existing DATA extent whose bytes equal `chunk`, returning its
+    /// `(offset, length)`. Realises content-defined sharing for `add_inner`
+    /// and `dedup`.
+    ///
+    /// An empty `chunk` never matches. A fragment whose range does not lie
+    /// inside the pool is treated as a non-match rather than indexed.
+    fn find_extent(&self, chunk: &[u8]) -> Option<(u64, u64)> {
+        if chunk.is_empty() {
+            return None;
+        }
+        let wanted_len = chunk.len() as u64;
+        self.inners
+            .iter()
+            .flat_map(|inner| inner.frags.iter())
+            .filter(|f| f.kind == KIND_DATA && f.length == wanted_len)
+            .find(|f| {
+                // Checked range construction: no overflow, no out-of-pool slice.
+                let stored = usize::try_from(f.offset)
+                    .ok()
+                    .and_then(|start| Some(start..start.checked_add(chunk.len())?))
+                    .and_then(|range| self.blob.get(range));
+                stored == Some(chunk)
+            })
+            .map(|f| (f.offset, f.length))
+    }
+
+    /// Set the SHARED flag on every live fragment that references exactly the
+    /// `(offset, length)` extent (rule F1).
+    fn mark_shared(&mut self, offset: u64, length: u64) {
+        for inner in &mut self.inners {
+            for f in &mut inner.frags {
+                if f.offset == offset && f.length == length {
+                    f.shared = true;
+                }
+            }
+        }
+    }
+
+    // ---- logical edits (copy-on-write) ------------------------------------
+
+    /// Append `bytes` to the end of an inner partition's logical content.
+    /// The bytes are added to the end of the pool as a new private extent;
+    /// appending nothing is a no-op.
+    ///
+    /// # Errors
+    ///
+    /// [`Error::NotFound`] for an unknown uid.
+    pub fn append(&mut self, uid: &[u8; UID_SIZE], bytes: &[u8]) -> Result<()> {
+        let idx = self.index_of(uid)?;
+        if bytes.is_empty() {
+            return Ok(());
+        }
+        let offset = self.blob.len() as u64;
+        self.blob.extend_from_slice(bytes);
+        self.inners[idx].frags.push(Frag {
+            offset,
+            length: bytes.len() as u64,
+            kind: KIND_DATA,
+            shared: false,
+        });
+        Ok(())
+    }
+
+    /// Overwrite the logical range `[pos, pos+len)` with `bytes` (which need not
+    /// be the same length: this is delete-then-insert). The replaced bytes go
+    /// into a fresh private extent, leaving any SHARED bytes untouched.
+    ///
+    /// # Errors
+    ///
+    /// Whatever `delete` or `insert` returns for the same arguments.
+    pub fn overwrite(
+        &mut self,
+        uid: &[u8; UID_SIZE],
+        pos: u64,
+        len: u64,
+        bytes: &[u8],
+    ) -> Result<()> {
+        self.delete(uid, pos, len)?;
+        self.insert(uid, pos, bytes)
+    }
+
+    /// Insert `bytes` at logical position `pos` (`pos == content length`
+    /// appends). The new bytes form a fresh private extent.
+    pub fn insert(&mut self, uid: &[u8; UID_SIZE], pos: u64, bytes: &[u8]) -> Result<()> {
+        let idx = self.index_of(uid)?;
+        if pos > self.inners[idx].logical_len() {
+            return Err(Error::PositionOutOfRange);
+        }
+        if !bytes.is_empty() {
+            // Make a fragment boundary at `pos`, then slot a new extent in.
+            let at = self.split_at(idx, pos);
+            let fresh = Frag {
+                offset: self.blob.len() as u64,
+                length: bytes.len() as u64,
+                kind: KIND_DATA,
+                shared: false,
+            };
+            self.blob.extend_from_slice(bytes);
+            self.inners[idx].frags.insert(at, fresh);
+        }
+        Ok(())
+    }
+
+    /// Remove the logical range `[pos, pos+len)` by dropping the fragments that
+    /// cover it; pool bytes stay where they are (spec Section 10.1). A range
+    /// that overflows or reaches past the content is rejected with
+    /// [`Error::PositionOutOfRange`]; a zero-length range is a no-op.
+    pub fn delete(&mut self, uid: &[u8; UID_SIZE], pos: u64, len: u64) -> Result<()> {
+        let idx = self.index_of(uid)?;
+        let content_len = self.inners[idx].logical_len();
+        match pos.checked_add(len) {
+            Some(end) if end <= content_len => {
+                if len > 0 {
+                    let first = self.split_at(idx, pos);
+                    let past_last = self.split_at(idx, end);
+                    self.inners[idx].frags.drain(first..past_last);
+                }
+                Ok(())
+            }
+            _ => Err(Error::PositionOutOfRange),
+        }
+    }
+
+    /// Shorten the partition's logical content to `new_len` bytes. Asking for
+    /// more than the current length is [`Error::PositionOutOfRange`].
+    pub fn truncate(&mut self, uid: &[u8; UID_SIZE], new_len: u64) -> Result<()> {
+        let idx = self.index_of(uid)?;
+        if self.inners[idx].logical_len() < new_len {
+            return Err(Error::PositionOutOfRange);
+        }
+        let keep = self.split_at(idx, new_len);
+        self.inners[idx].frags.truncate(keep);
+        Ok(())
+    }
+
+    /// Ensure a fragment boundary exists at logical position `pos` in inner
+    /// `idx`, splitting the straddling fragment if needed. Returns the index of
+    /// the first fragment at-or-after `pos`. Splitting never copies bytes: both
+    /// halves keep the parent's `shared` flag and address the same pool bytes.
+ fn split_at(&mut self, idx: usize, pos: u64) -> usize { + let frags = &mut self.inners[idx].frags; + let mut logical = 0u64; + let mut i = 0; + while i < frags.len() { + let flen = frags[i].length; + if logical == pos { + return i; + } + if pos < logical + flen { + // Split fragment i at (pos - logical). + let head = pos - logical; + let f = frags[i]; + let left = Frag { + offset: f.offset, + length: head, + kind: f.kind, + shared: f.shared, + }; + let right = Frag { + offset: f.offset + head, + length: flen - head, + kind: f.kind, + shared: f.shared, + }; + frags[i] = left; + frags.insert(i + 1, right); + return i + 1; + } + logical += flen; + i += 1; + } + frags.len() + } + + // ---- promotion support ------------------------------------------------ + + /// Remove an inner partition, returning the pieces a promotion needs: its + /// type, label, hash algorithm, and reconstructed logical content. The uid + /// is the caller's; the data hash is recomputed from the content (and is, + /// by construction, identical to the inner entry's — the promotion + /// invariant, spec Section 10.4). + pub fn remove_inner( + &mut self, + uid: &[u8; UID_SIZE], + ) -> Result<(u32, String, HashAlgo, Vec)> { + let idx = self.index_of(uid)?; + let content = self.content(uid)?; + let inner = self.inners.remove(idx); + let label = decode_label(&inner.label).unwrap_or_default(); + Ok((inner.partition_type, label, inner.data_hash_algo, content)) + } + + // ---- deduplication and compaction ------------------------------------- + + /// Re-chunk every inner partition with `chunker` and deduplicate identical + /// extents across the whole arena (spec Section 10.2). Logical content and + /// every `data_hash` are preserved. Returns the number of bytes the pool + /// shrank by once re-serialised (an estimate of dedup savings). 
+ pub fn dedup(&mut self, chunker: Chunker) -> u64 { + let before = self.canonical_extent_bytes(); + // Rebuild the pool from each partition's logical content, re-chunking + // and sharing identical chunks. A fresh arena guarantees a clean pool. + let mut rebuilt = Arena { + profile_version_major: self.profile_version_major, + profile_version_minor: self.profile_version_minor, + flags: self.flags, + inner_table_algo: self.inner_table_algo, + blob: Vec::new(), + inners: Vec::new(), + }; + for inner in &self.inners { + let content = inner.content(&self.blob); + // add_inner cannot fail here: inputs already passed validation. + let _ = rebuilt.add_inner( + inner.partition_type, + inner.uid, + &decode_label(&inner.label).unwrap_or_default(), + &content, + inner.data_hash_algo, + chunker, + ); + } + *self = rebuilt; + let after = self.canonical_extent_bytes(); + before.saturating_sub(after) + } + + /// Compact the arena (spec Section 10.3): drop unreferenced pool bytes and + /// normalise the SHARED flag, clearing it on any extent now referenced + /// exactly once (rule F2). Returns the number of dead pool bytes reclaimed. + pub fn compact(&mut self) -> u64 { + // Reference count by distinct (offset, length) extent. + let mut refcount: HashMap<(u64, u64), u32> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + *refcount.entry((f.offset, f.length)).or_insert(0) += 1; + } + } + // Normalise SHARED: an extent referenced once is private again. + for inner in &mut self.inners { + for f in &mut inner.frags { + let rc = refcount[&(f.offset, f.length)]; + if rc <= 1 { + f.shared = false; + } + } + } + // Sweep: copy each distinct live extent once into a fresh pool, in + // first-reference order, and rewrite offsets. 
+ let dead_before = self.blob.len() as u64 - self.live_extent_bytes(&refcount); + let mut newpool: Vec = Vec::new(); + let mut remap: HashMap<(u64, u64), u64> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + remap.entry((f.offset, f.length)).or_insert_with(|| { + let at = newpool.len() as u64; + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + newpool.extend_from_slice(&self.blob[a..b]); + at + }); + } + } + for inner in &mut self.inners { + for f in &mut inner.frags { + f.offset = remap[&(f.offset, f.length)]; + } + } + self.blob = newpool; + dead_before + } + + fn live_extent_bytes(&self, refcount: &HashMap<(u64, u64), u32>) -> u64 { + refcount + .keys() + .map(|&(_, len)| len) + .sum::() + .min(self.blob.len() as u64) + } + + /// Total bytes of the distinct extents that [`Self::to_bytes`] would emit. + fn canonical_extent_bytes(&self) -> u64 { + let mut seen: HashMap<(u64, u64), ()> = HashMap::new(); + let mut total = 0u64; + for inner in &self.inners { + for f in &inner.frags { + if seen.insert((f.offset, f.length), ()).is_none() { + total += f.length; + } + } + } + total + } + + // ---- canonical serialisation ------------------------------------------ + + /// Serialise the arena into its canonical on-disk layout (spec Section 17): + /// `DCP Header || data extents || Fragment Tables || Inner Table Block(s)`, + /// each distinct extent emitted once. The returned bytes are a complete DCP + /// arena ready to become a PCF partition's data. + pub fn to_bytes(&self) -> Vec { + // --- 1. distinct extents, first-reference order -------------------- + let mut ext_order: Vec<(u64, u64)> = Vec::new(); + let mut ext_index: HashMap<(u64, u64), usize> = HashMap::new(); + for inner in &self.inners { + for f in &inner.frags { + let key = (f.offset, f.length); + ext_index.entry(key).or_insert_with(|| { + ext_order.push(key); + ext_order.len() - 1 + }); + } + } + + // --- 2. 
lay out extents right after the header --------------------- + let mut cur = DCP_HEADER_SIZE; + let mut ext_arena_off: Vec = Vec::with_capacity(ext_order.len()); + for &(_, len) in &ext_order { + ext_arena_off.push(cur); + cur += len; + } + + // --- 3. Fragment Tables (one chain per inner) ---------------------- + let mut frag_off: Vec = Vec::with_capacity(self.inners.len()); + for inner in &self.inners { + frag_off.push(cur); + cur += fragtable_span(inner.frags.len()); + } + + // --- 4. Inner Table Block(s) --------------------------------------- + let inner_table_offset = cur; + let counts = block_counts(self.inners.len()); + let mut block_off: Vec = Vec::with_capacity(counts.len()); + for &c in &counts { + block_off.push(cur); + cur += TABLE_HEADER_SIZE + c as u64 * ENTRY_SIZE; + } + let arena_used = cur; + + // --- serialise into a zeroed buffer -------------------------------- + let mut buf = vec![0u8; arena_used as usize]; + + let header = DcpHeader { + profile_version_major: self.profile_version_major, + profile_version_minor: self.profile_version_minor, + flags: self.flags, + inner_table_offset, + arena_used, + }; + buf[0..24].copy_from_slice(&header.to_bytes()); + + for (i, &(boff, len)) in ext_order.iter().enumerate() { + let dst = ext_arena_off[i] as usize; + let (a, b) = (boff as usize, (boff + len) as usize); + buf[dst..dst + len as usize].copy_from_slice(&self.blob[a..b]); + } + + for (ii, inner) in self.inners.iter().enumerate() { + write_fragment_table( + &mut buf, + frag_off[ii], + &inner.frags, + &ext_index, + &ext_arena_off, + ); + } + + let entries: Vec = self + .inners + .iter() + .enumerate() + .map(|(ii, inner)| { + let used = inner.logical_len(); + PartitionEntry { + partition_type: inner.partition_type, + uid: inner.uid, + label: inner.label, + start_offset: frag_off[ii], + max_length: used, + used_bytes: used, + data_hash_algo: inner.data_hash_algo, + data_hash: inner.data_hash(&self.blob), + } + }) + .collect(); + + let mut idx = 0usize; 
+ for (b, &c) in counts.iter().enumerate() { + let next = if b + 1 < counts.len() { + block_off[b + 1] + } else { + 0 + }; + let slice = &entries[idx..idx + c]; + let th = compute_table_hash(self.inner_table_algo, next, slice); + let bh = TableBlockHeader { + partition_count: c as u8, + next_table_offset: next, + table_hash_algo: self.inner_table_algo, + table_hash: th, + }; + let bo = block_off[b] as usize; + buf[bo..bo + 74].copy_from_slice(&bh.to_bytes()); + for (j, e) in slice.iter().enumerate() { + let eo = bo + 74 + j * ENTRY_SIZE as usize; + buf[eo..eo + ENTRY_SIZE as usize].copy_from_slice(&e.to_bytes()); + } + idx += c; + } + + buf + } +} + +/// Find an extent among `frags` whose pool bytes equal `chunk`, for +/// intra-partition deduplication while a partition is being built. +fn find_local(blob: &[u8], frags: &[Frag], chunk: &[u8]) -> Option<(u64, u64)> { + if chunk.is_empty() { + return None; + } + for f in frags { + if f.kind == KIND_DATA && f.length == chunk.len() as u64 { + let (a, b) = (f.offset as usize, (f.offset + f.length) as usize); + if &blob[a..b] == chunk { + return Some((f.offset, f.length)); + } + } + } + None +} + +/// On-disk span of an inner partition's Fragment Table chain holding `n` +/// extents, split into blocks of at most 255 entries. +fn fragtable_span(n: usize) -> u64 { + let mut span = 0u64; + for c in block_counts(n) { + span += FRAGTABLE_HEADER_SIZE + c as u64 * FRAGMENT_ENTRY_SIZE; + } + span +} + +/// Split `n` items into blocks of at most 255; always at least one block (an +/// empty block when `n == 0`). +fn block_counts(n: usize) -> Vec { + if n == 0 { + return vec![0]; + } + let mut out = Vec::new(); + let mut rem = n; + while rem > 0 { + let c = rem.min(MAX_ENTRIES_PER_BLOCK); + out.push(c); + rem -= c; + } + out +} + +/// Write one inner partition's Fragment Table chain at `start`. 
+fn write_fragment_table( + buf: &mut [u8], + start: u64, + frags: &[Frag], + ext_index: &HashMap<(u64, u64), usize>, + ext_arena_off: &[u64], +) { + let counts = block_counts(frags.len()); + let mut block_start = start; + let mut idx = 0usize; + for (b, &c) in counts.iter().enumerate() { + let span = FRAGTABLE_HEADER_SIZE + c as u64 * FRAGMENT_ENTRY_SIZE; + let next = if b + 1 < counts.len() { + block_start + span + } else { + 0 + }; + let bs = block_start as usize; + let fh = FragTableHeader { + next_fragtable_offset: next, + fragment_count: c as u8, + }; + buf[bs..bs + 9].copy_from_slice(&fh.to_bytes()); + for j in 0..c { + let f = &frags[idx + j]; + let arena_off = ext_arena_off[ext_index[&(f.offset, f.length)]]; + let fe = FragmentEntry { + extent_offset: arena_off, + extent_length: f.length, + kind: f.kind, + flags: if f.shared { FLAG_SHARED } else { 0 }, + }; + let eo = bs + 9 + j * FRAGMENT_ENTRY_SIZE as usize; + buf[eo..eo + FRAGMENT_ENTRY_SIZE as usize].copy_from_slice(&fe.to_bytes()); + } + block_start += span; + idx += c; + } +} diff --git a/reference/PCF-DCP-v1.0/src/bin/dcp.rs b/reference/PCF-DCP-v1.0/src/bin/dcp.rs new file mode 100644 index 0000000..0ace1ed --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/bin/dcp.rs @@ -0,0 +1,269 @@ +//! `dcp` — a small command-line tool for DCP containers. +//! +//! Subcommands (arguments parsed by hand, in the style of the other reference +//! tools): +//! +//! ```text +//! dcp info +//! dcp dedup [--fixed N] [--trailer] +//! dcp defrag [--trailer] +//! dcp promote [--trailer] +//! dcp demote [--trailer] +//! ``` +//! +//! UIDs are given as 32 hex digits (16 bytes), or as `0xNN` to mean a uid of 16 +//! identical bytes (e.g. `0xDC` = 16×0xDC), matching the test vector's notation. +//! Every mutating command rewrites the file and then re-verifies it. 
+ +use std::io::Cursor; +use std::process::ExitCode; + +use pcf_dcp::{Chunker, DcpReader, DcpWriter, UID_SIZE}; + +fn main() -> ExitCode { + let args: Vec = std::env::args().skip(1).collect(); + if args.is_empty() { + usage(); + return ExitCode::FAILURE; + } + let cmd = args[0].as_str(); + let rest = &args[1..]; + let result = match cmd { + "info" => cmd_info(rest), + "dedup" => cmd_dedup(rest), + "defrag" => cmd_defrag(rest), + "promote" => cmd_promote(rest), + "demote" => cmd_demote(rest), + "-h" | "--help" | "help" => { + usage(); + return ExitCode::SUCCESS; + } + other => Err(format!("unknown command '{other}'")), + }; + match result { + Ok(()) => ExitCode::SUCCESS, + Err(e) => { + eprintln!("dcp: {e}"); + ExitCode::FAILURE + } + } +} + +fn usage() { + eprintln!( + "usage:\n dcp info \n dcp dedup [--fixed N] [--trailer]\n \ + dcp defrag [--trailer]\n dcp promote [--trailer]\n \ + dcp demote [--trailer]" + ); +} + +// ---- commands ------------------------------------------------------------- + +fn cmd_info(args: &[String]) -> Result<(), String> { + let path = args.first().ok_or("info: missing ")?; + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut r = DcpReader::open(Cursor::new(bytes)).map_err(de)?; + r.verify().map_err(de)?; + let containers = r.containers().map_err(de)?; + println!("{}: {} DCP container(s)", path, containers.len()); + for c in containers { + let arena = r.open_arena(&c).map_err(de)?; + println!( + " container {} (uid {}) used={} inner={}", + c.label_string().unwrap_or_default(), + hex(&c.uid), + c.used_bytes, + arena.len() + ); + for info in arena.inners() { + let n = info.data_hash_algo.digest_len(); + let dh: String = info.data_hash[..n] + .iter() + .map(|b| format!("{b:02x}")) + .collect(); + let shared = info.extents.iter().filter(|e| e.shared).count(); + println!( + " inner {} (uid {}) type=0x{:08X} used={} extents={} shared={} algo={:?} data_hash={}", + info.label, + hex(&info.uid), + 
info.partition_type, + info.used_bytes, + info.extents.len(), + shared, + info.data_hash_algo, + dh + ); + } + } + Ok(()) +} + +fn cmd_dedup(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 1)?; + let path = &opts.positional[0]; + let chunker = match opts.fixed { + Some(n) => Chunker::Fixed(n), + None => Chunker::Whole, + }; + let mut w = open_writer(path, opts.trailer)?; + let containers = container_uids(path)?; + let mut saved = 0u64; + for uid in &containers { + saved += w.dedup(uid, chunker).map_err(de)?; + } + commit(path, &w)?; + println!( + "deduplicated {} container(s); ~{} bytes saved", + containers.len(), + saved + ); + Ok(()) +} + +fn cmd_defrag(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 1)?; + let path = &opts.positional[0]; + let mut w = open_writer(path, opts.trailer)?; + let containers = container_uids(path)?; + let mut reclaimed = 0u64; + for uid in &containers { + reclaimed += w.defrag(uid).map_err(de)?; + } + commit(path, &w)?; + println!( + "defragmented {} container(s); {} dead bytes reclaimed", + containers.len(), + reclaimed + ); + Ok(()) +} + +fn cmd_promote(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 3)?; + let path = &opts.positional[0]; + let cuid = parse_uid(&opts.positional[1])?; + let iuid = parse_uid(&opts.positional[2])?; + let mut w = open_writer(path, opts.trailer)?; + w.promote(&cuid, &iuid).map_err(de)?; + commit(path, &w)?; + println!( + "promoted inner {} out of container {}", + hex(&iuid), + hex(&cuid) + ); + Ok(()) +} + +fn cmd_demote(args: &[String]) -> Result<(), String> { + let opts = Opts::parse(args, 3)?; + let path = &opts.positional[0]; + let puid = parse_uid(&opts.positional[1])?; + let cuid = parse_uid(&opts.positional[2])?; + let mut w = open_writer(path, opts.trailer)?; + w.demote(&puid, &cuid).map_err(de)?; + commit(path, &w)?; + println!( + "demoted partition {} into container {}", + hex(&puid), + hex(&cuid) + ); + Ok(()) +} + +// ---- 
helpers -------------------------------------------------------------- + +fn open_writer(path: &str, trailer: bool) -> Result { + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut w = DcpWriter::open(Cursor::new(bytes)).map_err(de)?; + w.set_trailer(trailer); + Ok(w) +} + +fn container_uids(path: &str) -> Result, String> { + let bytes = std::fs::read(path).map_err(|e| format!("read {path}: {e}"))?; + let mut r = DcpReader::open(Cursor::new(bytes)).map_err(de)?; + Ok(r.containers() + .map_err(de)? + .into_iter() + .map(|c| c.uid) + .collect()) +} + +fn commit(path: &str, w: &DcpWriter) -> Result<(), String> { + let image = w.to_image().map_err(de)?; + // Re-verify before overwriting the file on disk. + let mut r = DcpReader::open(Cursor::new(image.clone())).map_err(de)?; + r.verify().map_err(de)?; + std::fs::write(path, &image).map_err(|e| format!("write {path}: {e}"))?; + Ok(()) +} + +fn de(e: E) -> String { + e.to_string() +} + +fn hex(uid: &[u8; UID_SIZE]) -> String { + uid.iter().map(|b| format!("{b:02x}")).collect() +} + +/// Parse a uid: either 32 hex digits, or `0xNN` meaning 16 identical bytes. +fn parse_uid(s: &str) -> Result<[u8; UID_SIZE], String> { + if let Some(rest) = s.strip_prefix("0x").or_else(|| s.strip_prefix("0X")) { + if rest.len() == 2 { + let b = u8::from_str_radix(rest, 16).map_err(|_| format!("bad uid byte '{s}'"))?; + return Ok([b; UID_SIZE]); + } + } + let clean: String = s + .chars() + .filter(|c| !c.is_whitespace() && *c != '-') + .collect(); + if clean.len() != 32 { + return Err(format!("uid '{s}' must be 32 hex digits or 0xNN")); + } + let mut uid = [0u8; UID_SIZE]; + for (i, byte) in uid.iter_mut().enumerate() { + *byte = u8::from_str_radix(&clean[i * 2..i * 2 + 2], 16) + .map_err(|_| format!("bad hex in uid '{s}'"))?; + } + Ok(uid) +} + +/// Parsed options common to the subcommands. 
+struct Opts { + positional: Vec, + fixed: Option, + trailer: bool, +} + +impl Opts { + fn parse(args: &[String], need: usize) -> Result { + let mut positional = Vec::new(); + let mut fixed = None; + let mut trailer = false; + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--trailer" => trailer = true, + "--fixed" => { + i += 1; + let n = args.get(i).ok_or("--fixed needs a value")?; + fixed = Some(n.parse().map_err(|_| format!("bad --fixed value '{n}'"))?); + } + other => positional.push(other.to_string()), + } + i += 1; + } + if positional.len() < need { + return Err(format!( + "expected {need} positional argument(s), got {}", + positional.len() + )); + } + Ok(Opts { + positional, + fixed, + trailer, + }) + } +} diff --git a/reference/PCF-DCP-v1.0/src/consts.rs b/reference/PCF-DCP-v1.0/src/consts.rs new file mode 100644 index 0000000..4173650 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/consts.rs @@ -0,0 +1,59 @@ +//! On-disk constants defined by PCF-DCP v1.0. +//! +//! Every value here is normative and corresponds directly to a figure in the +//! specification (`specs/PCF-DCP-spec-v1.0.txt`, Appendix A and B). + +/// PCF partition type carrying one DCP arena (spec Appendix B). A generic PCF +/// reader sees this as one opaque, typed partition. +pub const DCP_CONTAINER_TYPE: u32 = 0xAAAC_0001; + +/// First value of the block reserved by this profile for future partition +/// types (spec Appendix B). +pub const DCP_TYPE_RESERVED_LO: u32 = 0xAAAC_0000; + +/// Last value of the block reserved by this profile (spec Appendix B). +pub const DCP_TYPE_RESERVED_HI: u32 = 0xAAAC_00FF; + +/// 4-byte magic at the start of a DCP arena (spec Section 6): `"PDCP"`. +pub const DCP_MAGIC: [u8; 4] = [0x50, 0x44, 0x43, 0x50]; + +/// PCF-DCP profile version implemented by this crate (major, spec Section 14). +pub const PROFILE_VERSION_MAJOR: u8 = 1; + +/// PCF-DCP profile version implemented by this crate (minor, spec Section 14). 
+pub const PROFILE_VERSION_MINOR: u8 = 0; + +/// Fixed size of the DCP Header, in bytes (spec Section 6). +pub const DCP_HEADER_SIZE: u64 = 24; + +/// Fixed size of a Fragment Table block header, in bytes (spec Section 8.1). +pub const FRAGTABLE_HEADER_SIZE: u64 = 9; + +/// Fixed size of one Fragment Entry, in bytes (spec Section 8.2). +pub const FRAGMENT_ENTRY_SIZE: u64 = 18; + +/// Fragment Entry kind: RESERVED / INVALID guard (spec Section 8.2). MUST NOT +/// appear in a live entry. +pub const KIND_INVALID: u8 = 0; +/// Fragment Entry kind: DATA — literal content bytes (the only kind defined in +/// v1.0). +pub const KIND_DATA: u8 = 1; +/// Fragment Entry kind: HOLE (RESERVED for sparse content; MUST NOT be written +/// in v1.0). +pub const KIND_HOLE: u8 = 2; +/// Fragment Entry kind: REF (RESERVED for cross-container references; MUST NOT +/// be written in v1.0). +pub const KIND_REF: u8 = 3; + +/// Fragment Entry `flags` bit 0: SHARED — the extent's bytes MUST NOT be +/// overwritten in place; edits must be copy-on-write (spec Section 8.4). +pub const FLAG_SHARED: u8 = 0x01; + +/// The arena-relative offset value reserved as "none" / chain terminator +/// (spec Appendix B). +pub const ARENA_NONE: u64 = 0; + +/// Maximum number of entries a single (inner) Table Block can hold, and the +/// maximum number of Fragment Entries a single Fragment Table block can hold +/// (both counts are a `u8`). +pub const MAX_ENTRIES_PER_BLOCK: usize = 255; diff --git a/reference/PCF-DCP-v1.0/src/error.rs b/reference/PCF-DCP-v1.0/src/error.rs new file mode 100644 index 0000000..499991e --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/error.rs @@ -0,0 +1,101 @@ +//! Error type shared across the crate. + +use std::fmt; + +/// All ways a PCF-DCP operation can fail. +#[derive(Debug)] +pub enum Error { + /// Underlying PCF container error. + Pcf(pcf::Error), + /// Underlying I/O failure. 
+ Io(std::io::Error), + + // ----- Malformed arena (spec Sections 6, 8, 13) ------------------------ + /// The arena did not begin with the `"PDCP"` magic (spec Section 6). + BadDcpMagic, + /// The arena's `profile_version_major` is not implemented by this crate. + UnsupportedProfileMajor(u8), + /// A Fragment Entry carried a `kind` this version does not implement + /// (HOLE/REF/unknown), rendering the inner partition unreadable. + BadFragmentKind(u8), + /// An extent's `[offset, offset+length)` range escapes `[0, arena_used)`. + OffsetOutOfRange, + /// Reconstructed logical content length did not match the inner entry's + /// `used_bytes` (spec Section 8.3), or a stored data hash did not verify. + LengthMismatch { + /// The `used_bytes` the inner entry declared. + expected: u64, + /// The length actually reconstructed from the Fragment Table. + got: u64, + }, + /// A stored hash (inner `table_hash` or inner `data_hash`) did not verify. + HashMismatch, + + // ----- Logical-model violations (spec Sections 2.1, 7.2, 13) ----------- + /// No inner partition (or top-level partition) with the requested uid. + NotFound, + /// A uid is used by more than one partition file-wide (spec Section 2.1). + DuplicateUid, + /// An inner partition is itself a DCP container; nesting is forbidden in + /// v1.0 (spec Appendix B, "Nesting"). + NestedContainer, + /// A partition uid is the PCF NIL uid. + NilUid, + /// A partition type is the PCF reserved type `0x00000000`. + ReservedType, + /// A top-level partition expected to be a DCP container is not one. + NotADcpContainer, + /// A logical edit addressed a position beyond the partition's content. 
+ PositionOutOfRange, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Error::Pcf(e) => write!(f, "pcf error: {e}"), + Error::Io(e) => write!(f, "i/o error: {e}"), + Error::BadDcpMagic => write!(f, "arena does not begin with \"PDCP\" magic"), + Error::UnsupportedProfileMajor(v) => { + write!(f, "unsupported PCF-DCP profile major version {v}") + } + Error::BadFragmentKind(k) => write!(f, "unsupported fragment kind {k}"), + Error::OffsetOutOfRange => write!(f, "extent range escapes the arena"), + Error::LengthMismatch { expected, got } => { + write!(f, "logical length mismatch: expected {expected}, got {got}") + } + Error::HashMismatch => write!(f, "stored hash does not verify"), + Error::NotFound => write!(f, "no partition with that uid"), + Error::DuplicateUid => write!(f, "uid is not unique file-wide"), + Error::NestedContainer => write!(f, "an inner partition may not be a DCP container"), + Error::NilUid => write!(f, "uid is the NIL uid"), + Error::ReservedType => write!(f, "partition type is the reserved type 0x00000000"), + Error::NotADcpContainer => write!(f, "partition is not a DCP container"), + Error::PositionOutOfRange => write!(f, "logical position is past end of content"), + } + } +} + +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { + match self { + Error::Pcf(e) => Some(e), + Error::Io(e) => Some(e), + _ => None, + } + } +} + +impl From for Error { + fn from(e: pcf::Error) -> Self { + Error::Pcf(e) + } +} + +impl From for Error { + fn from(e: std::io::Error) -> Self { + Error::Io(e) + } +} + +/// Convenience alias. +pub type Result = std::result::Result; diff --git a/reference/PCF-DCP-v1.0/src/fragment.rs b/reference/PCF-DCP-v1.0/src/fragment.rs new file mode 100644 index 0000000..24ae32c --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/fragment.rs @@ -0,0 +1,166 @@ +//! 
The Fragment Table: its 9-byte block header and 18-byte entries +//! (spec Section 8). + +use crate::consts::{ + ARENA_NONE, FLAG_SHARED, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, KIND_DATA, +}; +use crate::error::{Error, Result}; + +/// One Fragment Entry: a single extent of an inner partition (spec Section 8.2). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FragmentEntry { + /// Arena-relative start of the extent's bytes. + pub extent_offset: u64, + /// Length of the extent in bytes. + pub extent_length: u64, + /// Extent kind (`1` = DATA; `0` invalid; `2`/`3` reserved). + pub kind: u8, + /// `flags` byte (bit 0 = SHARED; others reserved 0). + pub flags: u8, +} + +impl FragmentEntry { + /// Serialise to the on-disk 18-byte layout. + pub fn to_bytes(&self) -> [u8; 18] { + let mut b = [0u8; 18]; + b[0..8].copy_from_slice(&self.extent_offset.to_le_bytes()); + b[8..16].copy_from_slice(&self.extent_length.to_le_bytes()); + b[16] = self.kind; + b[17] = self.flags; + b + } + + /// Parse from the on-disk 18-byte layout. + pub fn from_bytes(b: &[u8; 18]) -> Self { + FragmentEntry { + extent_offset: u64::from_le_bytes(b[0..8].try_into().unwrap()), + extent_length: u64::from_le_bytes(b[8..16].try_into().unwrap()), + kind: b[16], + flags: b[17], + } + } + + /// Whether this entry's `kind` is DATA (the only v1.0 content kind). + pub fn is_data(&self) -> bool { + self.kind == KIND_DATA + } + + /// Whether the SHARED flag (bit 0) is set. + pub fn is_shared(&self) -> bool { + self.flags & FLAG_SHARED != 0 + } +} + +/// The 9-byte header that begins each Fragment Table block (spec Section 8.1). +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct FragTableHeader { + /// Arena-relative offset of the next Fragment Table block of this + /// partition, or 0 if this is the last block. + pub next_fragtable_offset: u64, + /// Number of Fragment Entries packed immediately after this header. 
+ pub fragment_count: u8, +} + +impl FragTableHeader { + /// Serialise to the on-disk 9-byte layout. + pub fn to_bytes(&self) -> [u8; 9] { + let mut b = [0u8; 9]; + b[0..8].copy_from_slice(&self.next_fragtable_offset.to_le_bytes()); + b[8] = self.fragment_count; + b + } + + /// Parse from the on-disk 9-byte layout. + pub fn from_bytes(b: &[u8; 9]) -> Self { + FragTableHeader { + next_fragtable_offset: u64::from_le_bytes(b[0..8].try_into().unwrap()), + fragment_count: b[8], + } + } +} + +/// Walk an inner partition's Fragment Table chain starting at arena-relative +/// `first_off`, returning its Fragment Entries in logical order across the +/// whole chain (spec Section 8.3). `first_off == 0` yields an empty list. +pub fn walk_fragment_table(arena: &[u8], first_off: u64) -> Result> { + let mut out = Vec::new(); + let mut off = first_off; + // A simple cycle guard: a well-formed chain only ever moves forward, but a + // corrupt file could loop. Bound the walk by the arena length. + let mut budget = arena.len() / FRAGTABLE_HEADER_SIZE as usize + 1; + while off != ARENA_NONE { + if budget == 0 { + return Err(Error::OffsetOutOfRange); + } + budget -= 1; + let base = off as usize; + let hb: [u8; 9] = arena + .get(base..base + FRAGTABLE_HEADER_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + let h = FragTableHeader::from_bytes(&hb); + let mut eo = base + FRAGTABLE_HEADER_SIZE as usize; + for _ in 0..h.fragment_count { + let eb: [u8; 18] = arena + .get(eo..eo + FRAGMENT_ENTRY_SIZE as usize) + .ok_or(Error::OffsetOutOfRange)? + .try_into() + .unwrap(); + out.push(FragmentEntry::from_bytes(&eb)); + eo += FRAGMENT_ENTRY_SIZE as usize; + } + off = h.next_fragtable_offset; + } + Ok(out) +} + +/// Reconstruct the logical content of a partition from its Fragment Entries +/// (spec Section 8.3): concatenate the bytes of its DATA extents in order. 
+/// +/// `arena_used` bounds every extent range; a reserved (non-DATA) kind makes the +/// partition unreadable to a v1.0 reader (spec Section 8.2). +pub fn reconstruct(arena: &[u8], frags: &[FragmentEntry], arena_used: u64) -> Result> { + let mut out = Vec::new(); + for f in frags { + if !f.is_data() { + return Err(Error::BadFragmentKind(f.kind)); + } + let end = f + .extent_offset + .checked_add(f.extent_length) + .ok_or(Error::OffsetOutOfRange)?; + if end > arena_used || end > arena.len() as u64 { + return Err(Error::OffsetOutOfRange); + } + out.extend_from_slice(&arena[f.extent_offset as usize..end as usize]); + } + Ok(out) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn entry_roundtrip() { + let e = FragmentEntry { + extent_offset: 31, + extent_length: 6, + kind: KIND_DATA, + flags: FLAG_SHARED, + }; + assert_eq!(FragmentEntry::from_bytes(&e.to_bytes()), e); + assert!(e.is_data()); + assert!(e.is_shared()); + } + + #[test] + fn header_roundtrip() { + let h = FragTableHeader { + next_fragtable_offset: 0, + fragment_count: 2, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); + } +} diff --git a/reference/PCF-DCP-v1.0/src/header.rs b/reference/PCF-DCP-v1.0/src/header.rs new file mode 100644 index 0000000..07c4403 --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/header.rs @@ -0,0 +1,83 @@ +//! The fixed 24-byte DCP Header at arena offset 0 (spec Section 6). + +use crate::consts::{DCP_HEADER_SIZE, DCP_MAGIC}; +use crate::error::{Error, Result}; + +/// Parsed DCP Header. All offsets it carries are arena-relative. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct DcpHeader { + /// PCF-DCP profile major version (MUST be implemented by the reader). + pub profile_version_major: u8, + /// PCF-DCP profile minor version (a reader SHOULD accept a higher value). + pub profile_version_minor: u8, + /// Reserved; MUST be 0 in v1.0. 
+ pub flags: u16, + /// Arena-relative offset of the first Inner Table Block (0 = no inner + /// partitions). + pub inner_table_offset: u64, + /// Bump pointer: arena-relative offset of the first free byte. Every stored + /// structure and extent lies within `[0, arena_used)`. + pub arena_used: u64, +} + +impl DcpHeader { + /// Serialise to the on-disk 24-byte layout. + pub fn to_bytes(&self) -> [u8; 24] { + let mut b = [0u8; 24]; + b[0..4].copy_from_slice(&DCP_MAGIC); + b[4] = self.profile_version_major; + b[5] = self.profile_version_minor; + b[6..8].copy_from_slice(&self.flags.to_le_bytes()); + b[8..16].copy_from_slice(&self.inner_table_offset.to_le_bytes()); + b[16..24].copy_from_slice(&self.arena_used.to_le_bytes()); + b + } + + /// Parse from the on-disk 24-byte layout, validating the magic. + pub fn from_bytes(b: &[u8; 24]) -> Result { + if b[0..4] != DCP_MAGIC { + return Err(Error::BadDcpMagic); + } + Ok(DcpHeader { + profile_version_major: b[4], + profile_version_minor: b[5], + flags: u16::from_le_bytes([b[6], b[7]]), + inner_table_offset: u64::from_le_bytes(b[8..16].try_into().unwrap()), + arena_used: u64::from_le_bytes(b[16..24].try_into().unwrap()), + }) + } +} + +/// Read a DCP Header from the start of an arena byte slice. +pub(crate) fn read_header(arena: &[u8]) -> Result { + let fixed: [u8; 24] = arena + .get(0..DCP_HEADER_SIZE as usize) + .ok_or(Error::BadDcpMagic)? 
+ .try_into() + .unwrap(); + DcpHeader::from_bytes(&fixed) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn header_roundtrip() { + let h = DcpHeader { + profile_version_major: 1, + profile_version_minor: 0, + flags: 0, + inner_table_offset: 109, + arena_used: 465, + }; + assert_eq!(DcpHeader::from_bytes(&h.to_bytes()).unwrap(), h); + } + + #[test] + fn rejects_bad_magic() { + let mut b = [0u8; 24]; + b[0..4].copy_from_slice(b"XXXX"); + assert!(matches!(DcpHeader::from_bytes(&b), Err(Error::BadDcpMagic))); + } +} diff --git a/reference/PCF-DCP-v1.0/src/lib.rs b/reference/PCF-DCP-v1.0/src/lib.rs new file mode 100644 index 0000000..3710f4a --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/lib.rs @@ -0,0 +1,69 @@ +//! # `pcf-dcp` — PCF Dynamic Container Partition (reference implementation) +//! +//! This crate is the reference reader/writer for **PCF-DCP v1.0**, an +//! application-level profile that adds *dynamic*, fragmentable, dedup-friendly +//! sub-partitions to [PCF v1.0](../pcf/index.html) without changing the PCF +//! byte container. It mirrors the written specification +//! (`specs/PCF-DCP-spec-v1.0.txt`) field-for-field and favours auditability +//! over performance. +//! +//! ## Layout at a glance +//! +//! One new PCF partition type is defined: +//! +//! * **`DCP_CONTAINER`** (type `0xAAAC0001`) — a partition whose bytes are an +//! *arena*: a [`DcpHeader`], a chain of reused PCF Table Blocks listing +//! *inner* partitions, a [`FragmentEntry`] table per inner partition, and the +//! data extents those fragments name. +//! +//! Each inner partition's logical content is the concatenation of its DATA +//! extents (spec Section 8.3); its `data_hash` covers that logical content, so +//! fragmentation, deduplication, compaction, and promotion all leave the hash +//! (and any PCF-SIG signature over it) unchanged. +//! +//! A generic PCF reader sees a DCP file as one opaque, typed partition; only a +//! DCP-aware reader looks inside. 
A DCP file is always a conforming PCF v1.0 +//! file. +//! +//! ## Example +//! +//! ``` +//! use std::io::Cursor; +//! use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, HashAlgo}; +//! +//! // Build a container with two inner partitions that share an extent. +//! let mut arena = Arena::new(); +//! arena.add_inner(0x10, [0xA1; 16], "A", b"Hello, World!", HashAlgo::Sha256, Chunker::Fixed(7))?; +//! arena.add_inner(0x10, [0xB2; 16], "B", b"World!", HashAlgo::Sha256, Chunker::Whole)?; +//! +//! let mut w = DcpWriter::new(); +//! w.add_container([0xDC; 16], "dcp", arena)?; +//! let image = w.to_image()?; +//! +//! // Read it back: a valid PCF file whose inner content reconstructs exactly. +//! let mut r = DcpReader::open(Cursor::new(image))?; +//! r.verify()?; +//! assert_eq!(r.read_inner(&[0xB2; 16])?, b"World!"); +//! # Ok::<(), pcf_dcp::Error>(()) +//! ``` + +mod arena; +pub mod consts; +mod error; +mod fragment; +mod header; +mod reader; +mod vector; +mod writer; + +pub use arena::{Arena, Chunker, ExtentInfo, InnerInfo}; +pub use consts::*; +pub use error::{Error, Result}; +pub use fragment::{reconstruct, walk_fragment_table, FragTableHeader, FragmentEntry}; +pub use header::DcpHeader; +pub use reader::{DcpReader, InnerLocation, Resolved}; +pub use vector::build_reference_vector; +pub use writer::DcpWriter; + +// Re-export underlying PCF primitives used across the DCP API surface. +pub use pcf::{HashAlgo, PartitionEntry, UID_SIZE}; diff --git a/reference/PCF-DCP-v1.0/src/reader.rs b/reference/PCF-DCP-v1.0/src/reader.rs new file mode 100644 index 0000000..0f7296b --- /dev/null +++ b/reference/PCF-DCP-v1.0/src/reader.rs @@ -0,0 +1,222 @@ +//! [`DcpReader`]: reading DCP containers from a PCF file. +//! +//! The reader works entirely through the high-level [`pcf::Container`] API +//! (`open`, `entries`, `read_partition_data`, `verify`). Because +//! `Container::open` resolves a file [`pcf::Trailer`] and exposes the table +//! 
head itself, a DCP file written in trailer mode (append-only host) reads +//! back transparently here — this code never assumes the header's +//! `partition_table_offset` is a real offset (spec Section 2, "Compatibility +//! with the PCF File Trailer"). + +use std::collections::HashSet; +use std::io::{Read, Seek, Write}; + +use pcf::{Container, PartitionEntry, UID_SIZE}; + +use crate::arena::{Arena, InnerInfo}; +use crate::consts::DCP_CONTAINER_TYPE; +use crate::error::{Error, Result}; + +/// An inner partition together with the container that holds it. +#[derive(Debug, Clone)] +pub struct InnerLocation { + /// uid of the enclosing DCP container partition. + pub container_uid: [u8; UID_SIZE], + /// The inner partition's metadata and extents. + pub info: InnerInfo, +} + +/// The result of resolving a uid against the flattened partition set +/// (top-level ∪ inner), per the Opt-B scope of spec Section 2.1. +#[derive(Debug, Clone)] +pub enum Resolved { + /// A top-level PCF partition. + TopLevel(PartitionEntry), + /// An inner partition inside a DCP container. + Inner(InnerLocation), +} + +/// A reader for DCP containers layered over a PCF file. +pub struct DcpReader { + container: Container, +} + +impl DcpReader { + /// Open a PCF file for DCP-aware reading. + pub fn open(storage: S) -> Result { + Ok(Self { + container: Container::open(storage)?, + }) + } + + /// Borrow the underlying PCF container (e.g. to inspect non-DCP + /// partitions). + pub fn container(&mut self) -> &mut Container { + &mut self.container + } + + /// All top-level entries, in chain order. + pub fn entries(&mut self) -> Result> { + Ok(self.container.entries()?) + } + + /// The top-level DCP container entries (`partition_type == + /// DCP_CONTAINER_TYPE`). + pub fn containers(&mut self) -> Result> { + Ok(self + .container + .entries()? + .into_iter() + .filter(|e| e.partition_type == DCP_CONTAINER_TYPE) + .collect()) + } + + /// Parse the arena of a DCP container entry. 
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::NotADcpContainer`] if `entry.partition_type` is not
+    /// `DCP_CONTAINER_TYPE`; otherwise propagates any failure from reading the
+    /// partition data or from `Arena::parse`.
+    pub fn open_arena(&mut self, entry: &PartitionEntry) -> Result<Arena> {
+        if entry.partition_type != DCP_CONTAINER_TYPE {
+            return Err(Error::NotADcpContainer);
+        }
+        let data = self.container.read_partition_data(entry)?;
+        Arena::parse(&data)
+    }
+
+    /// Every inner partition across every DCP container, in file order.
+    ///
+    /// Containers are visited in the order `containers()` yields them and each
+    /// arena is read and parsed afresh on every call; nothing is cached.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the first failure from listing the containers or from
+    /// opening any one arena.
+    pub fn inner_partitions(&mut self) -> Result<Vec<InnerLocation>> {
+        let mut out = Vec::new();
+        for c in self.containers()? {
+            let arena = self.open_arena(&c)?;
+            for info in arena.inners() {
+                out.push(InnerLocation {
+                    container_uid: c.uid,
+                    info,
+                });
+            }
+        }
+        Ok(out)
+    }
+
+    /// Resolve a uid against the flattened set top-level ∪ inner (spec Section
+    /// 2.1). Top-level entries are checked first.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::NotFound`] when neither a top-level entry nor an inner
+    /// partition carries `uid`; propagates read/parse failures.
+    pub fn resolve_uid(&mut self, uid: &[u8; UID_SIZE]) -> Result<Resolved> {
+        if let Some(e) = self
+            .container
+            .entries()?
+            .into_iter()
+            .find(|e| &e.uid == uid)
+        {
+            return Ok(Resolved::TopLevel(e));
+        }
+        // Only fall back to the (more expensive) arena walk when no top-level
+        // entry matched.
+        for loc in self.inner_partitions()? {
+            if &loc.info.uid == uid {
+                return Ok(Resolved::Inner(loc));
+            }
+        }
+        Err(Error::NotFound)
+    }
+
+    /// Reconstruct an inner partition's logical content by uid, searching every
+    /// DCP container.
+    ///
+    /// Only inner partitions are searched: a uid that belongs to a top-level
+    /// partition yields [`Error::NotFound`]. The first container whose arena
+    /// lists `uid` supplies the content.
+    ///
+    /// # Errors
+    ///
+    /// Returns [`Error::NotFound`] if no container lists `uid`; propagates
+    /// read, parse and reconstruction failures.
+    pub fn read_inner(&mut self, uid: &[u8; UID_SIZE]) -> Result<Vec<u8>> {
+        for c in self.containers()? {
+            let arena = self.open_arena(&c)?;
+            if arena.uids().iter().any(|u| u == uid) {
+                return arena.content(uid);
+            }
+        }
+        Err(Error::NotFound)
+    }
+
+    /// Full DCP-aware verification:
+    ///
+    /// 1. PCF integrity (`Container::verify`): every table block and partition
+    ///    data hash, and per-entry conformance.
+    /// 2. Per container: valid `"PDCP"` magic and supported profile major (via
+    ///    `Arena::parse`), each inner Table Block's `table_hash` (checked while
+    ///    parsing through PCF), reconstruction length and (when algorithmic)
+    ///    `data_hash`, no nested container, and file-wide uid uniqueness.
+    pub fn verify(&mut self) -> Result<()> {
+        self.container.verify()?;
+
+        let mut seen: HashSet<[u8; UID_SIZE]> = HashSet::new();
+        // Top-level uids participate in the file-wide namespace too.
+        for e in self.container.entries()? {
+            if !seen.insert(e.uid) {
+                return Err(Error::DuplicateUid);
+            }
+        }
+
+        for c in self.containers()? {
+            // Verify the inner Table Block hashes the same way PCF does.
+            let data = self.container.read_partition_data(&c)?;
+            verify_inner_table_hashes(&data)?;
+
+            let arena = Arena::parse(&data)?;
+            for info in arena.inners() {
+                if info.partition_type == DCP_CONTAINER_TYPE {
+                    return Err(Error::NestedContainer);
+                }
+                if !seen.insert(info.uid) {
+                    return Err(Error::DuplicateUid);
+                }
+                // Reconstruct and check length + data hash.
+                let content = arena.content(&info.uid)?;
+                if content.len() as u64 != info.used_bytes {
+                    return Err(Error::LengthMismatch {
+                        expected: info.used_bytes,
+                        got: content.len() as u64,
+                    });
+                }
+                if !info.data_hash_algo.verify(&content, &info.data_hash) {
+                    return Err(Error::HashMismatch);
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+/// Walk the inner Table Block chain in an arena and recompute each block's
+/// `table_hash`, exactly as PCF does for the top-level table (spec Section
+/// 9.2). The inner table is the primary integrity anchor for the inner entries
+/// because the container's own PCF `data_hash_algo` is normally 0.
+///
+/// The chain offsets come straight from the (untrusted) arena bytes, so every
+/// offset computation is checked: a value that does not fit in `usize`, or
+/// whose end would overflow, is reported as [`Error::OffsetOutOfRange`] rather
+/// than wrapping or panicking.
+///
+/// # Errors
+///
+/// * [`Error::OffsetOutOfRange`] — a block header or entry lies outside the
+///   arena, an offset overflows, or the chain is longer than the arena could
+///   possibly hold (which also catches cycles).
+/// * [`Error::HashMismatch`] — a recomputed `table_hash` differs from the
+///   stored one.
+/// * Any error from `read_header`, `TableBlockHeader::from_bytes` or
+///   `PartitionEntry::from_bytes`.
+fn verify_inner_table_hashes(arena: &[u8]) -> Result<()> {
+    use pcf::{
+        compute_table_hash, PartitionEntry, TableBlockHeader, ENTRY_SIZE, TABLE_HEADER_SIZE,
+    };
+
+    let header = crate::header::read_header(arena)?;
+    let mut off = header.inner_table_offset;
+    // No valid chain can contain more blocks than headers fit in the arena, so
+    // this budget bounds the walk even if `next_table_offset` forms a cycle.
+    let mut budget = arena.len() / TABLE_HEADER_SIZE as usize + 1;
+    while off != 0 {
+        if budget == 0 {
+            return Err(Error::OffsetOutOfRange);
+        }
+        budget -= 1;
+        // `off` is file data: convert and add with overflow checks instead of
+        // `as usize` / `+`, which would truncate on 32-bit targets and panic
+        // (debug) or wrap (release) near `usize::MAX`.
+        let base = usize::try_from(off).map_err(|_| Error::OffsetOutOfRange)?;
+        let entries_base = base
+            .checked_add(TABLE_HEADER_SIZE as usize)
+            .ok_or(Error::OffsetOutOfRange)?;
+        let hb: [u8; 74] = arena
+            .get(base..entries_base)
+            .ok_or(Error::OffsetOutOfRange)?
+            .try_into()
+            .unwrap();
+        let h = TableBlockHeader::from_bytes(&hb)?;
+        let mut entries = Vec::with_capacity(h.partition_count as usize);
+        for i in 0..h.partition_count as u64 {
+            let eo = entries_base
+                .checked_add((i * ENTRY_SIZE) as usize)
+                .ok_or(Error::OffsetOutOfRange)?;
+            let ee = eo
+                .checked_add(ENTRY_SIZE as usize)
+                .ok_or(Error::OffsetOutOfRange)?;
+            let eb: [u8; 141] = arena
+                .get(eo..ee)
+                .ok_or(Error::OffsetOutOfRange)?
+                .try_into()
+                .unwrap();
+            entries.push(PartitionEntry::from_bytes(&eb)?);
+        }
+        // Blocks whose algorithm does not verify are walked but not hash-checked.
+        if h.table_hash_algo.verifies() {
+            let computed = compute_table_hash(h.table_hash_algo, h.next_table_offset, &entries);
+            let n = h.table_hash_algo.digest_len();
+            if computed[..n] != h.table_hash[..n] {
+                return Err(Error::HashMismatch);
+            }
+        }
+        off = h.next_table_offset;
+    }
+    Ok(())
+}
diff --git a/reference/PCF-DCP-v1.0/src/vector.rs b/reference/PCF-DCP-v1.0/src/vector.rs
new file mode 100644
index 0000000..599db6f
--- /dev/null
+++ b/reference/PCF-DCP-v1.0/src/vector.rs
@@ -0,0 +1,43 @@
+//! The canonical PCF-DCP v1.0 test vector (spec Section 17).
+
+use pcf::HashAlgo;
+
+use crate::arena::{Arena, Chunker};
+use crate::error::Result;
+use crate::writer::DcpWriter;
+
+/// Build the byte-exact 700-byte reference file from spec Section 17.
+///
+/// The file is one DCP container ("dcp", uid 16×0xDC, unsealed) holding two
+/// inner partitions:
+///
+/// * **A** ("Hello, World!", 13 B) stored as two extents — `"Hello, "` (7 B,
+///   private) and `"World!"` (6 B, shared) — via fixed-7 chunking.
+/// * **B** ("World!", 6 B) stored as one extent that *deduplicates* onto A's
+///   second extent; both references carry SHARED = 1.
+///
+/// Building the same logical container and emitting the canonical layout MUST
+/// reproduce these exact bytes.
+pub fn build_reference_vector() -> Result<Vec<u8>> {
+    let mut arena = Arena::new();
+    // A is added first so that its second fixed-7 chunk ("World!") already
+    // exists in the arena when B is added.
+    arena.add_inner(
+        0x0000_0010,
+        [0xA1u8; 16],
+        "A",
+        b"Hello, World!",
+        HashAlgo::Sha256,
+        Chunker::Fixed(7),
+    )?;
+    arena.add_inner(
+        0x0000_0010,
+        [0xB2u8; 16],
+        "B",
+        b"World!",
+        HashAlgo::Sha256,
+        Chunker::Whole,
+    )?;
+
+    let mut w = DcpWriter::new();
+    w.add_container([0xDCu8; 16], "dcp", arena)?;
+    w.to_image()
+}
diff --git a/reference/PCF-DCP-v1.0/src/writer.rs b/reference/PCF-DCP-v1.0/src/writer.rs
new file mode 100644
index 0000000..5a85eba
--- /dev/null
+++ b/reference/PCF-DCP-v1.0/src/writer.rs
@@ -0,0 +1,255 @@
+//! [`DcpWriter`]: building and rewriting PCF files that carry DCP containers.
+//!
+//! The writer keeps the whole file as an in-memory list of top-level partitions
+//! (plain partitions and DCP containers) and emits a fresh, canonical PCF image
+//! on demand. Every mutating operation — adding a container, promotion,
+//! demotion, dedup, defrag — is a logical edit of that list followed by a
+//! rebuild. This is deliberately simple and always correct for a reference
+//! implementation; the resulting file is a fully conforming PCF v1.0 file.
+
+use std::io::{Cursor, Read, Seek, Write};
+
+use pcf::{decode_label, Container, HashAlgo, UID_SIZE};
+
+use crate::arena::{Arena, Chunker};
+use crate::consts::DCP_CONTAINER_TYPE;
+use crate::error::{Error, Result};
+
+/// The body of a top-level partition.
+enum Body {
+    /// An ordinary partition's raw bytes.
+    Plain(Vec<u8>),
+    /// A DCP container's arena.
+    Container(Arena),
+}
+
+/// One top-level partition.
+struct TopPart {
+    partition_type: u32,
+    uid: [u8; UID_SIZE],
+    label: String,
+    data_hash_algo: HashAlgo,
+    body: Body,
+}
+
+/// A writer that assembles a PCF file containing DCP containers.
+pub struct DcpWriter {
+    // Top-level partitions, in the order `to_image` emits them.
+    parts: Vec,
+    // Hash algorithm handed to `Container::create_with` for the top-level table.
+    table_hash_algo: HashAlgo,
+    // When true, `to_image` finishes the image with `finalize_with_trailer`.
+    trailer: bool,
+}
+
+impl Default for DcpWriter {
+    /// Same as [`DcpWriter::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl DcpWriter {
+    /// A new, empty writer (top-level table hashed with SHA-256).
+    pub fn new() -> Self {
+        DcpWriter {
+            parts: Vec::new(),
+            table_hash_algo: HashAlgo::Sha256,
+            trailer: false,
+        }
+    }
+
+    /// Load an existing PCF file into the writer's model, classifying each
+    /// top-level partition as a plain partition or a DCP container.
+    ///
+    /// The returned writer always uses SHA-256 for the top-level table and has
+    /// trailer mode off, regardless of how the source file was written.
+    ///
+    /// # Errors
+    ///
+    /// Propagates failures from opening the container, listing its entries,
+    /// reading partition data, and parsing a DCP arena.
+    // NOTE(review): `S` is used here without a visible declaration — the
+    // generic parameter list (and its bounds) is not present in this copy;
+    // confirm against the repository.
+    pub fn open(storage: S) -> Result {
+        let mut c = Container::open(storage)?;
+        let mut parts = Vec::new();
+        for e in c.entries()? {
+            let data = c.read_partition_data(&e)?;
+            // A label that cannot be decoded is silently replaced by the empty
+            // string rather than failing the load.
+            let label = decode_label(&e.label).unwrap_or_default();
+            let body = if e.partition_type == DCP_CONTAINER_TYPE {
+                Body::Container(Arena::parse(&data)?)
+            } else {
+                Body::Plain(data)
+            };
+            parts.push(TopPart {
+                partition_type: e.partition_type,
+                uid: e.uid,
+                label,
+                data_hash_algo: e.data_hash_algo,
+                body,
+            });
+        }
+        Ok(DcpWriter {
+            parts,
+            table_hash_algo: HashAlgo::Sha256,
+            trailer: false,
+        })
+    }
+
+    /// Finalise emitted images in trailer mode (append-only host). Off by
+    /// default; passes through to [`pcf::Container::finalize_with_trailer`].
+    pub fn set_trailer(&mut self, on: bool) {
+        self.trailer = on;
+    }
+
+    // ---- top-level construction -------------------------------------------
+
+    /// Add a DCP container partition holding `arena` (data hash algo 0,
+    /// unsealed; spec Section 9).
+    ///
+    /// # Errors
+    ///
+    /// Returns whatever `ensure_unique` reports when `uid` is already used by
+    /// another top-level partition. Uids of inner partitions inside `arena`
+    /// are not consulted here.
+    pub fn add_container(&mut self, uid: [u8; UID_SIZE], label: &str, arena: Arena) -> Result<()> {
+        self.ensure_unique(&uid)?;
+        self.parts.push(TopPart {
+            partition_type: DCP_CONTAINER_TYPE,
+            uid,
+            label: label.to_string(),
+            data_hash_algo: HashAlgo::None,
+            body: Body::Container(arena),
+        });
+        Ok(())
+    }
+
+    /// Add an ordinary top-level partition.
+ pub fn add_plain( + &mut self, + partition_type: u32, + uid: [u8; UID_SIZE], + label: &str, + data: Vec, + data_hash_algo: HashAlgo, + ) -> Result<()> { + self.ensure_unique(&uid)?; + self.parts.push(TopPart { + partition_type, + uid, + label: label.to_string(), + data_hash_algo, + body: Body::Plain(data), + }); + Ok(()) + } + + fn ensure_unique(&self, uid: &[u8; UID_SIZE]) -> Result<()> { + if self.parts.iter().any(|p| &p.uid == uid) { + return Err(Error::DuplicateUid); + } + Ok(()) + } + + fn container_mut(&mut self, uid: &[u8; UID_SIZE]) -> Result<&mut Arena> { + for p in &mut self.parts { + if &p.uid == uid { + return match &mut p.body { + Body::Container(a) => Ok(a), + Body::Plain(_) => Err(Error::NotADcpContainer), + }; + } + } + Err(Error::NotFound) + } + + /// Borrow a container's arena for inspection or in-place editing. + pub fn arena_mut(&mut self, container_uid: &[u8; UID_SIZE]) -> Result<&mut Arena> { + self.container_mut(container_uid) + } + + // ---- migration: promotion / demotion ---------------------------------- + + /// Promote an inner partition out of its DCP container to a top-level PCF + /// partition (dynamic → fixed), preserving uid, type, label, hash algorithm + /// and `data_hash` (the promotion invariant, spec Section 10.4). The inner + /// partition is removed from the arena (a MOVE, keeping uids unique). + pub fn promote( + &mut self, + container_uid: &[u8; UID_SIZE], + inner_uid: &[u8; UID_SIZE], + ) -> Result<()> { + let (ptype, label, algo, content) = { + let arena = self.container_mut(container_uid)?; + arena.remove_inner(inner_uid)? + }; + // The inner uid is now free file-wide; add it as a top-level partition. 
+ self.parts.push(TopPart { + partition_type: ptype, + uid: *inner_uid, + label, + data_hash_algo: algo, + body: Body::Plain(content), + }); + Ok(()) + } + + /// Demote a top-level partition into a DCP container as an inner partition + /// (fixed → dynamic), preserving uid, type, label, hash algorithm and + /// `data_hash`. The content becomes a single DATA extent. + pub fn demote( + &mut self, + part_uid: &[u8; UID_SIZE], + container_uid: &[u8; UID_SIZE], + ) -> Result<()> { + let pos = self + .parts + .iter() + .position(|p| &p.uid == part_uid) + .ok_or(Error::NotFound)?; + if self.parts[pos].partition_type == DCP_CONTAINER_TYPE { + return Err(Error::NestedContainer); + } + let (ptype, label, algo, content) = { + let p = &self.parts[pos]; + let content = match &p.body { + Body::Plain(b) => b.clone(), + Body::Container(_) => return Err(Error::NestedContainer), + }; + (p.partition_type, p.label.clone(), p.data_hash_algo, content) + }; + let arena = self.container_mut(container_uid)?; + arena.add_inner(ptype, *part_uid, &label, &content, algo, Chunker::Whole)?; + self.parts.remove(pos); + Ok(()) + } + + // ---- container-level maintenance -------------------------------------- + + /// Re-chunk and deduplicate a container's inner partitions (spec Section + /// 10.2). Returns estimated bytes saved. + pub fn dedup(&mut self, container_uid: &[u8; UID_SIZE], chunker: Chunker) -> Result { + Ok(self.container_mut(container_uid)?.dedup(chunker)) + } + + /// Compact / defragment a container's arena, reclaiming dead bytes and + /// normalising the SHARED flag (spec Section 10.3). Returns bytes reclaimed. + pub fn defrag(&mut self, container_uid: &[u8; UID_SIZE]) -> Result { + Ok(self.container_mut(container_uid)?.compact()) + } + + // ---- serialisation ---------------------------------------------------- + + /// Build a fresh, canonical PCF image of the whole file. 
The first table + /// block is sized to hold every partition (a single block, no overflow), + /// matching the spec's canonical test-vector layout. + pub fn to_image(&self) -> Result> { + let cap = self.parts.len().max(1) as u32; + let mut c = Container::create_with(Cursor::new(Vec::new()), cap, self.table_hash_algo)?; + for p in &self.parts { + let data = match &p.body { + Body::Plain(b) => b.clone(), + Body::Container(a) => a.to_bytes(), + }; + c.add_partition( + p.partition_type, + p.uid, + &p.label, + &data, + 0, + p.data_hash_algo, + )?; + } + if self.trailer { + c.finalize_with_trailer()?; + } + Ok(c.into_storage().into_inner()) + } + + /// Write the image to any [`Write`] sink. + pub fn write_to(&self, mut out: W) -> Result<()> { + out.write_all(&self.to_image()?)?; + Ok(()) + } +} diff --git a/reference/PCF-DCP-v1.0/testdata/canonical.bin b/reference/PCF-DCP-v1.0/testdata/canonical.bin new file mode 100644 index 0000000000000000000000000000000000000000..834aea47b77bea3c4c6aebe373cca2a909cea9cd GIT binary patch literal 700 zcmeD54hRb2<&t7#U|YnKA^H0(>;D6B`kQRo zjN48I!xZA885!2Bx`PH%k_&Jt2aCUk%3g#z14iT3M2uvBi*o?T%v>}JJW_LV@^uu# z^NVs)6k!H2NkCcbP@0iJ9?D{aiNM2&5mSl@s)#{gpX6?lnhc4+Q`;Gj?4O&H)O6r< zus_?b>FlfDGp==4l7%V6MGF8W7NP-1T#BLMs!*C2jTX57H$ic?_SFsR(~~AN%saf) zaN^y@k| ZW|#HC;-m7d$@wqT|1Jwr&mmwg0|0y;Z>#_S literal 0 HcmV?d00001 diff --git a/reference/PCF-DCP-v1.0/tests/coverage.rs b/reference/PCF-DCP-v1.0/tests/coverage.rs new file mode 100644 index 0000000..a6236ae --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/coverage.rs @@ -0,0 +1,228 @@ +//! Error paths and edge cases (spec Sections 8, 13). + +use std::io::Cursor; + +use pcf::HashAlgo; +use pcf_dcp::{ + build_reference_vector, Arena, Chunker, DcpReader, Error, FragTableHeader, FragmentEntry, +}; + +#[test] +fn bad_magic_is_rejected() { + let mut bytes = build_reference_vector().unwrap(); + // Corrupt the arena magic (file offset 0x00EB). 
+ bytes[0xEB] = b'X'; + // The PCF layer is still valid; the DCP arena parse must fail. + let mut c = pcf::Container::open(Cursor::new(bytes)).unwrap(); + let e = c.entries().unwrap().into_iter().next().unwrap(); + let data = c.read_partition_data(&e).unwrap(); + assert!(matches!(Arena::parse(&data), Err(Error::BadDcpMagic))); +} + +#[test] +fn unsupported_profile_major_is_rejected() { + let mut a = Arena::new(); + a.add_inner(0x10, [1; 16], "x", b"hi", HashAlgo::Sha256, Chunker::Whole) + .unwrap(); + let mut bytes = a.to_bytes(); + bytes[4] = 2; // profile_version_major + assert!(matches!( + Arena::parse(&bytes), + Err(Error::UnsupportedProfileMajor(2)) + )); +} + +#[test] +fn reserved_and_nil_and_nested_are_rejected() { + let mut a = Arena::new(); + assert!(matches!( + a.add_inner(0, [1; 16], "x", b"", HashAlgo::None, Chunker::Whole), + Err(Error::ReservedType) + )); + assert!(matches!( + a.add_inner( + 0xAAAC_0001, + [1; 16], + "x", + b"", + HashAlgo::None, + Chunker::Whole + ), + Err(Error::NestedContainer) + )); + assert!(matches!( + a.add_inner(0x10, [0; 16], "x", b"", HashAlgo::None, Chunker::Whole), + Err(Error::NilUid) + )); +} + +#[test] +fn duplicate_uid_within_arena_is_rejected() { + let mut a = Arena::new(); + a.add_inner(0x10, [1; 16], "x", b"a", HashAlgo::None, Chunker::Whole) + .unwrap(); + assert!(matches!( + a.add_inner(0x10, [1; 16], "y", b"b", HashAlgo::None, Chunker::Whole), + Err(Error::DuplicateUid) + )); +} + +#[test] +fn bad_fragment_kind_renders_partition_unreadable() { + // Hand-build a fragment entry with a reserved kind and walk it. 
+ let fe = FragmentEntry { + extent_offset: 24, + extent_length: 1, + kind: 2, // HOLE (reserved) + flags: 0, + }; + assert!(!fe.is_data()); + let frags = vec![fe]; + let arena = vec![0u8; 64]; + assert!(matches!( + pcf_dcp::reconstruct(&arena, &frags, 64), + Err(Error::BadFragmentKind(2)) + )); +} + +#[test] +fn offset_out_of_range_is_rejected() { + let fe = FragmentEntry { + extent_offset: 60, + extent_length: 100, // runs past arena_used + kind: 1, + flags: 0, + }; + assert!(matches!( + pcf_dcp::reconstruct(&[0u8; 64], &[fe], 64), + Err(Error::OffsetOutOfRange) + )); +} + +#[test] +fn empty_inner_is_allowed() { + let mut a = Arena::new(); + a.add_inner( + 0x10, + [1; 16], + "empty", + b"", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let info = a.inner_info(&[1; 16]).unwrap(); + assert_eq!(info.used_bytes, 0); + assert_eq!(info.extents.len(), 0); + assert_eq!(a.content(&[1; 16]).unwrap(), b""); + // Round-trips through serialise/parse. + let bytes = a.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.content(&[1; 16]).unwrap(), b""); +} + +#[test] +fn many_inners_chain_the_inner_table() { + // More than 255 inner partitions force a multi-block inner table. + let mut a = Arena::new(); + for i in 0..300u32 { + let mut uid = [0u8; 16]; + uid[0..4].copy_from_slice(&i.to_le_bytes()); + uid[15] = 1; // keep non-NIL even when i == 0 + a.add_inner( + 0x10, + uid, + "n", + &i.to_le_bytes(), + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + } + assert_eq!(a.len(), 300); + let bytes = a.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.len(), 300); + // Spot-check a late partition. + let mut uid = [0u8; 16]; + uid[0..4].copy_from_slice(&299u32.to_le_bytes()); + uid[15] = 1; + assert_eq!(parsed.content(&uid).unwrap(), 299u32.to_le_bytes()); + + // The whole thing is a valid PCF + DCP file. 
+ let mut w = pcf_dcp::DcpWriter::new(); + w.add_container([0xDC; 16], "big", a).unwrap(); + let image = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); +} + +#[test] +fn many_extents_chain_the_fragment_table() { + // More than 255 extents in one partition force a multi-block fragment table. + let mut a = Arena::new(); + let content = vec![0xAB; 300]; + a.add_inner( + 0x10, + [1; 16], + "frag", + &content, + HashAlgo::Sha256, + Chunker::Fixed(1), + ) + .unwrap(); + let info = a.inner_info(&[1; 16]).unwrap(); + // Fixed(1) with identical bytes deduplicates to a single shared extent, so + // assert the *logical* length instead, then force distinct extents. + assert_eq!(info.used_bytes, 300); + + let mut b = Arena::new(); + let distinct: Vec = (0..300u32).map(|i| i as u8).collect(); + // 300 distinct-ish single-byte chunks; some repeat (values wrap mod 256), + // but the fragment list still has 300 entries. + b.add_inner( + 0x10, + [2; 16], + "frag2", + &distinct, + HashAlgo::Sha256, + Chunker::Fixed(1), + ) + .unwrap(); + let bytes = b.to_bytes(); + let parsed = Arena::parse(&bytes).unwrap(); + assert_eq!(parsed.content(&[2; 16]).unwrap(), distinct); +} + +#[test] +fn fragtable_header_count_bounds() { + let h = FragTableHeader { + next_fragtable_offset: 7, + fragment_count: 255, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); +} + +#[test] +fn verify_detects_global_uid_collision() { + // A top-level partition sharing a uid with an inner partition is a file-wide + // collision (spec Section 2.1). + let mut a = Arena::new(); + a.add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = pcf_dcp::DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", a).unwrap(); + // Add a top-level plain partition with the SAME uid as the inner one. 
+ w.add_plain(0x10, [0xB2; 16], "dup", b"x".to_vec(), HashAlgo::Sha256) + .unwrap(); + let image = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + assert!(matches!(r.verify(), Err(Error::DuplicateUid))); +} diff --git a/reference/PCF-DCP-v1.0/tests/roundtrip.rs b/reference/PCF-DCP-v1.0/tests/roundtrip.rs new file mode 100644 index 0000000..8b559de --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/roundtrip.rs @@ -0,0 +1,258 @@ +//! End-to-end round-trips: build, edit, dedup/defrag, promote/demote. + +use std::io::Cursor; + +use pcf::HashAlgo; +use pcf_dcp::{Arena, Chunker, DcpReader, DcpWriter, Resolved}; + +fn build_two_inner_file() -> Vec { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", arena).unwrap(); + w.to_image().unwrap() +} + +#[test] +fn edits_reconstruct_correctly() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "f", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + + arena.append(&[1; 16], b"!!").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!!!"); + + arena.insert(&[1; 16], 5, b"XYZ").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HelloXYZ, World!!!"); + + arena.delete(&[1; 16], 5, 3).unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!!!"); + + arena.overwrite(&[1; 16], 0, 5, b"HOWDY").unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HOWDY, World!!!"); + + arena.truncate(&[1; 16], 5).unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"HOWDY"); +} + +#[test] +fn cow_does_not_disturb_shared_bytes() { + // A and B share "World!"; overwriting A's copy must not change B. 
+ let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + // Overwrite the "World!" part of A (logical [7,13)). + arena.overwrite(&[0xA1; 16], 7, 6, b"PLANET").unwrap(); + assert_eq!(arena.content(&[0xA1; 16]).unwrap(), b"Hello, PLANET"); + assert_eq!(arena.content(&[0xB2; 16]).unwrap(), b"World!"); +} + +#[test] +fn dedup_then_defrag_preserve_content() { + // Two inners with no initial sharing; dedup should fold the identical chunk. + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "A", + b"abcabc", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + arena + .add_inner( + 0x10, + [2; 16], + "B", + b"abcabc", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let h1 = arena.inner_info(&[1; 16]).unwrap().data_hash; + + let saved = arena.dedup(Chunker::Fixed(3)); + assert!(saved > 0, "identical chunks should dedup"); + // Content and hash unchanged. + assert_eq!(arena.content(&[1; 16]).unwrap(), b"abcabc"); + assert_eq!(arena.content(&[2; 16]).unwrap(), b"abcabc"); + assert_eq!(arena.inner_info(&[1; 16]).unwrap().data_hash, h1); + + arena.compact(); + assert_eq!(arena.content(&[2; 16]).unwrap(), b"abcabc"); +} + +#[test] +fn defrag_clears_shared_when_no_longer_aliased() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + // Remove B, so "World!" is referenced only by A now. 
+ arena.remove_inner(&[0xB2; 16]).unwrap(); + arena.compact(); + let a = arena.inner_info(&[0xA1; 16]).unwrap(); + assert!( + a.extents.iter().all(|e| !e.shared), + "F2: shared cleared at compaction" + ); + assert_eq!(arena.content(&[0xA1; 16]).unwrap(), b"Hello, World!"); +} + +#[test] +fn promote_preserves_uid_and_data_hash() { + let image = build_two_inner_file(); + let mut w = DcpWriter::open(Cursor::new(image)).unwrap(); + + // data_hash of inner B before promotion. + let before = { + let bytes = w.to_image().unwrap(); + let mut r = DcpReader::open(Cursor::new(bytes)).unwrap(); + let inner = r + .inner_partitions() + .unwrap() + .into_iter() + .find(|l| l.info.uid == [0xB2; 16]) + .unwrap(); + inner.info.data_hash + }; + + w.promote(&[0xDC; 16], &[0xB2; 16]).unwrap(); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + match r.resolve_uid(&[0xB2; 16]).unwrap() { + Resolved::TopLevel(e) => { + assert_eq!(e.uid, [0xB2; 16]); + assert_eq!( + e.data_hash, before, + "promotion invariant: data_hash unchanged" + ); + assert_eq!(e.used_bytes, 6); + } + _ => panic!("B should now be top-level"), + } + // The promoted partition reads back as "World!". + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); +} + +#[test] +fn demote_then_promote_is_identity_for_content() { + let image = build_two_inner_file(); + let mut w = DcpWriter::open(Cursor::new(image)).unwrap(); + w.promote(&[0xDC; 16], &[0xB2; 16]).unwrap(); + // Now B is top-level; demote it back into the container. + w.demote(&[0xB2; 16], &[0xDC; 16]).unwrap(); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); + // And it is an inner partition again. 
+ assert!(matches!( + r.resolve_uid(&[0xB2; 16]).unwrap(), + Resolved::Inner(_) + )); +} + +#[test] +fn trailer_mode_reads_back_identically() { + // Build the same file in trailer mode (append-only host); the reader must + // resolve the table head from the trailer and expose every inner partition. + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + let mut w = DcpWriter::new(); + w.add_container([0xDC; 16], "dcp", arena).unwrap(); + w.set_trailer(true); + let image = w.to_image().unwrap(); + + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); + assert_eq!(r.inner_partitions().unwrap().len(), 2); +} diff --git a/reference/PCF-DCP-v1.0/tests/spec_compliance.rs b/reference/PCF-DCP-v1.0/tests/spec_compliance.rs new file mode 100644 index 0000000..bbcd0be --- /dev/null +++ b/reference/PCF-DCP-v1.0/tests/spec_compliance.rs @@ -0,0 +1,190 @@ +//! Conformance tests tying the implementation to specific sections of +//! `specs/PCF-DCP-spec-v1.0.txt`, culminating in the byte-exact Section 17 +//! test vector. + +use std::io::Cursor; + +use pcf::{Container, HashAlgo}; +use pcf_dcp::{ + build_reference_vector, Arena, Chunker, DcpHeader, DcpReader, FragTableHeader, FragmentEntry, + DCP_CONTAINER_TYPE, DCP_HEADER_SIZE, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, +}; + +/// The canonical 700-byte file, byte-for-byte equal to the spec's Section 17 +/// hex dump (verified during development). 
+const CANONICAL: &[u8] = include_bytes!("../testdata/canonical.bin"); + +#[test] +fn structure_sizes_match_appendix_a() { + assert_eq!(DCP_HEADER_SIZE, 24); + assert_eq!(FRAGTABLE_HEADER_SIZE, 9); + assert_eq!(FRAGMENT_ENTRY_SIZE, 18); + assert_eq!(DCP_CONTAINER_TYPE, 0xAAAC_0001); +} + +#[test] +fn header_roundtrip_and_magic() { + let h = DcpHeader { + profile_version_major: 1, + profile_version_minor: 0, + flags: 0, + inner_table_offset: 109, + arena_used: 465, + }; + let b = h.to_bytes(); + assert_eq!(&b[0..4], b"PDCP"); + assert_eq!(DcpHeader::from_bytes(&b).unwrap(), h); +} + +#[test] +fn fragment_records_roundtrip() { + let e = FragmentEntry { + extent_offset: 31, + extent_length: 6, + kind: 1, + flags: 1, + }; + assert_eq!(FragmentEntry::from_bytes(&e.to_bytes()), e); + let h = FragTableHeader { + next_fragtable_offset: 0, + fragment_count: 2, + }; + assert_eq!(FragTableHeader::from_bytes(&h.to_bytes()), h); +} + +#[test] +fn reconstruction_equals_logical_content() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [1; 16], + "x", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + assert_eq!(arena.content(&[1; 16]).unwrap(), b"Hello, World!"); + // Two extents, total used_bytes 13. + let info = arena.inner_info(&[1; 16]).unwrap(); + assert_eq!(info.used_bytes, 13); + assert_eq!(info.extents.len(), 2); +} + +#[test] +fn data_hash_is_invariant_under_fragmentation() { + // The same content chunked differently yields the same data_hash (it covers + // logical content only — spec Section 8.3 / 9.1). 
+ let mk = |c: Chunker| { + let mut a = Arena::new(); + a.add_inner(0x10, [7; 16], "x", b"abcdefghij", HashAlgo::Sha256, c) + .unwrap(); + a.inner_info(&[7; 16]).unwrap().data_hash + }; + assert_eq!(mk(Chunker::Whole), mk(Chunker::Fixed(3))); + assert_eq!(mk(Chunker::Whole), HashAlgo::Sha256.compute(b"abcdefghij")); +} + +#[test] +fn dedup_sets_shared_on_all_aliases_rule_f1() { + let mut arena = Arena::new(); + arena + .add_inner( + 0x10, + [0xA1; 16], + "A", + b"Hello, World!", + HashAlgo::Sha256, + Chunker::Fixed(7), + ) + .unwrap(); + arena + .add_inner( + 0x10, + [0xB2; 16], + "B", + b"World!", + HashAlgo::Sha256, + Chunker::Whole, + ) + .unwrap(); + + let a = arena.inner_info(&[0xA1; 16]).unwrap(); + let b = arena.inner_info(&[0xB2; 16]).unwrap(); + // A: "Hello, " private, "World!" shared. + assert!(!a.extents[0].shared); + assert!(a.extents[1].shared); + // B: single extent, shared, deduplicated onto A's second extent. + assert_eq!(b.extents.len(), 1); + assert!(b.extents[0].shared); + // B's data_hash equals a standalone SHA-256("World!") — promotion invariant. + assert_eq!(b.data_hash, HashAlgo::Sha256.compute(b"World!")); +} + +#[test] +fn canonical_vector_is_byte_exact_700() { + let image = build_reference_vector().unwrap(); + assert_eq!(image.len(), 700, "spec Section 17 total file size"); + assert_eq!( + image, CANONICAL, + "must reproduce the Section 17 bytes exactly" + ); +} + +#[test] +fn canonical_vector_key_offsets() { + let image = build_reference_vector().unwrap(); + // Top-level: file header partition_table_offset = 20, one entry of type DCP. + assert_eq!(&image[0..8], &pcf::MAGIC); + // Arena begins at file offset 0x00EB (235). + assert_eq!(&image[0xEB..0xEF], b"PDCP"); + assert_eq!(image[0xEF], 1); // profile_version_major + assert_eq!(image[0xF0], 0); // profile_version_minor (the spec dump's 01 was a typo) + // inner_table_offset = 109 (arena-rel), arena_used = 465. 
+ assert_eq!( + u64::from_le_bytes(image[0xF3..0xFB].try_into().unwrap()), + 109 + ); + assert_eq!( + u64::from_le_bytes(image[0xFB..0x103].try_into().unwrap()), + 465 + ); + // Shared flags: A[1] at 0x013C and B[0] at 0x0157 are 1; A[0] at 0x012A is 0. + assert_eq!(image[0x012A], 0); + assert_eq!(image[0x013C], 1); + assert_eq!(image[0x0157], 1); +} + +#[test] +fn canonical_vector_is_valid_pcf() { + // A generic PCF reader sees one valid partition and the table hash verifies. + let image = build_reference_vector().unwrap(); + let mut c = Container::open(Cursor::new(image)).unwrap(); + c.verify().unwrap(); + let entries = c.entries().unwrap(); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].partition_type, DCP_CONTAINER_TYPE); + assert_eq!(entries[0].used_bytes, 465); + assert_eq!(entries[0].data_hash_algo, HashAlgo::None); +} + +#[test] +fn canonical_vector_is_valid_dcp() { + let image = build_reference_vector().unwrap(); + let mut r = DcpReader::open(Cursor::new(image)).unwrap(); + r.verify().unwrap(); + assert_eq!(r.read_inner(&[0xA1; 16]).unwrap(), b"Hello, World!"); + assert_eq!(r.read_inner(&[0xB2; 16]).unwrap(), b"World!"); +} + +#[test] +fn parse_roundtrips_canonical_arena_byte_exact() { + // Parsing the canonical arena and re-serialising reproduces it exactly, + // because the test vector is already in canonical layout. + let mut c = Container::open(Cursor::new(CANONICAL.to_vec())).unwrap(); + let entry = c.entries().unwrap().into_iter().next().unwrap(); + let data = c.read_partition_data(&entry).unwrap(); + let arena = Arena::parse(&data).unwrap(); + assert_eq!(arena.to_bytes(), data); +} diff --git a/specs/PCF-DCP-spec-v1.0.txt b/specs/PCF-DCP-spec-v1.0.txt index be54c18..5c51234 100644 --- a/specs/PCF-DCP-spec-v1.0.txt +++ b/specs/PCF-DCP-spec-v1.0.txt @@ -29,6 +29,7 @@ Table of Contents 1. Introduction 2. Relationship to PCF 2.1 Relationship to PCF-SIG + 2.2 Compatibility with the PCF File Trailer 3. 
Conventions and Terminology 3.1 Requirement Keywords 3.2 Terminology @@ -195,6 +196,38 @@ Table of Contents top-level entry is added -- never a copy that would leave two live entries sharing a uid. +2.2 Compatibility with the PCF File Trailer + + PCF v1.0 defines an OPTIONAL File Trailer: when the PCF File Header's + partition_table_offset holds the all-ones sentinel, the top-level + partition-table head is recorded in a fixed trailer at the end of the file + (and the table chain MAY be backward-linked). This lets append-only writers + commit without rewriting the header. PCF-DCP is fully compatible with both + header-pointer and trailer-mode host files: + + - A DCP-aware Reader MUST locate the top-level table head through the PCF + layer (which resolves the trailer when present); it MUST NOT assume the + File Header's partition_table_offset is a real offset. In trailer mode + that field is the sentinel, not the table position. Once the top-level + partitions are enumerated, locating and reading a DCP container is + unchanged: the container is one ordinary PCF partition. + + - Whether the host file uses a header pointer or a trailer is invisible to + the arena: the DCP Header, Inner Table Block chain, and Fragment Tables + are addressed by ARENA-RELATIVE offsets within the container's data + (Section 3.3) and never reference the enclosing file's layout. + + - The arena itself does NOT contain a PCF trailer. The inner table is + always located by inner_table_offset in the DCP Header (Section 6); the + inner Table Block chain is forward-linked (next_table_offset), because a + DCP Writer rewrites the arena as a whole (Section 4.3, 10.3) and so has + no append-only motive to invert it. + + A Writer MAY publish a finished DCP file in trailer mode (e.g. when the DCP + file is itself appended into a larger append-only host); doing so changes no + arena byte and leaves every inner partition, data_hash, and signature intact. 
+ The test vector in Section 17 is given in classic header-pointer form. + ------------------------------------------------------------------------------- 3. Conventions and Terminology @@ -1090,7 +1123,7 @@ Table of Contents ---- DCP Header (arena 0x000 / file 0x00EB, 24 bytes) ------------ 00EB 50 44 43 50 dcp_magic = "PDCP" 00EF 01 profile_version_major = 1 - 00F0 01 profile_version_minor = 0 + 00F0 00 profile_version_minor = 0 00F1 00 00 flags = 0 00F3 6D 00 00 00 00 00 00 00 inner_table_offset = 109 (arena-rel) 00FB D1 01 00 00 00 00 00 00 arena_used = 465 (arena-rel) @@ -1175,7 +1208,7 @@ Table of Contents 00C0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00D0 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00E0 00 00 00 00 00 00 00 00 00 00 00 50 44 43 50 01 ...........PDCP. - 00F0 01 00 00 6D 00 00 00 00 00 00 00 D1 01 00 00 00 ...m............ + 00F0 00 00 00 6D 00 00 00 00 00 00 00 D1 01 00 00 00 ...m............ 0100 00 00 00 48 65 6C 6C 6F 2C 20 57 6F 72 6C 64 21 ...Hello, World! 0110 00 00 00 00 00 00 00 00 02 18 00 00 00 00 00 00 ................ 0120 00 07 00 00 00 00 00 00 00 01 00 1F 00 00 00 00 ................ diff --git a/tools/pcf-debug/Cargo.toml b/tools/pcf-debug/Cargo.toml index dd5eab9..9c8ab42 100644 --- a/tools/pcf-debug/Cargo.toml +++ b/tools/pcf-debug/Cargo.toml @@ -18,3 +18,4 @@ path = "src/main.rs" [dependencies] pcf = { path = "../../reference/PCF-v1.0", version = "0.0.8" } pcf-sig = { path = "../../reference/PCF-SIG-v1.0", version = "0.0.8" } +pcf-dcp = { path = "../../reference/PCF-DCP-v1.0", version = "0.0.8" } diff --git a/tools/pcf-debug/src/plugin/dcp.rs b/tools/pcf-debug/src/plugin/dcp.rs new file mode 100644 index 0000000..888770c --- /dev/null +++ b/tools/pcf-debug/src/plugin/dcp.rs @@ -0,0 +1,397 @@ +//! Decoder for PCF-DCP containers (see `specs/PCF-DCP-spec-v1.0.txt`): +//! `DCP_CONTAINER` (partition type `0xAAAC0001`, arena magic `"PDCP"`). +//! +//! 
The decoder mirrors the spec's byte tables field-for-field — DCP Header, +//! Inner Table Block chain, and a Fragment Table per inner partition — and +//! reports spec violations as warnings rather than failing. Parsing is inline +//! (it does not depend on the `pcf-dcp` reader), but it borrows the profile's +//! constants from the `pcf-dcp` crate so the two never drift. + +use pcf::{HashAlgo, ENTRY_SIZE, TABLE_HEADER_SIZE}; +use pcf_dcp::{ + DCP_CONTAINER_TYPE, DCP_HEADER_SIZE, DCP_MAGIC, FRAGMENT_ENTRY_SIZE, FRAGTABLE_HEADER_SIZE, + KIND_DATA, +}; + +use super::{ + le_u16, le_u32, le_u64, uid_at, Decoded, FieldNode, FieldValue, PartitionDecoder, PartitionMeta, +}; + +fn kind_name(kind: u8) -> &'static str { + match kind { + 0 => "INVALID (reserved)", + 1 => "DATA", + 2 => "HOLE (reserved)", + 3 => "REF (reserved)", + _ => "unknown", + } +} + +fn hash_algo_name(id: u8) -> &'static str { + match HashAlgo::from_id(id) { + Ok(a) => crate::model::algo_name(a), + Err(_) => "unknown", + } +} + +/// Render an `<algo_id><64-byte hash>` pair, truncated to the digest length.
+fn hash_field(data: &[u8], algo_off: usize, hash_off: usize) -> FieldNode { + let id = data.get(algo_off).copied().unwrap_or(0); + let len = HashAlgo::from_id(id).map(|a| a.digest_len()).unwrap_or(0); + let bytes = data + .get(hash_off..hash_off + 64) + .map(|b| b[..len.min(64)].to_vec()) + .unwrap_or_default(); + FieldNode::group("data_hash") + .child(FieldNode::leaf( + "algo_id", + FieldValue::Enum { + raw: id as u64, + name: hash_algo_name(id).into(), + }, + (algo_off as u64, algo_off as u64 + 1), + )) + .child(FieldNode::leaf( + "hash", + FieldValue::Bytes(bytes), + (hash_off as u64, hash_off as u64 + 64), + )) +} + +pub struct DcpContainerDecoder; + +impl PartitionDecoder for DcpContainerDecoder { + fn name(&self) -> &'static str { + "dcp-container" + } + + fn matches(&self, meta: &PartitionMeta, data: &[u8]) -> bool { + meta.partition_type == DCP_CONTAINER_TYPE || data.get(0..4) == Some(&DCP_MAGIC) + } + + fn decode(&self, _meta: &PartitionMeta, data: &[u8]) -> Decoded { + let mut warnings = Vec::new(); + let mut fields = Vec::new(); + + if (data.len() as u64) < DCP_HEADER_SIZE { + warnings.push(format!( + "arena is {} bytes; DCP Header needs at least {DCP_HEADER_SIZE}", + data.len() + )); + } + + // ---- DCP Header --------------------------------------------------- + let magic_ok = data.get(0..4) == Some(&DCP_MAGIC); + if !magic_ok { + warnings.push("dcp_magic is not \"PDCP\"".into()); + } + let mut header = FieldNode::group("DCP Header"); + header.push( + FieldNode::leaf( + "dcp_magic", + FieldValue::Text(ascii4(data.get(0..4).unwrap_or(&[]))), + (0, 4), + ) + .with_note(if magic_ok { + "magic OK" + } else { + "expected \"PDCP\"" + }), + ); + let major = data.get(4).copied().unwrap_or(0); + if major != 1 { + warnings.push(format!( + "profile_version_major is {major} (v1.0 reader expects 1)" + )); + } + header.push(FieldNode::leaf( + "profile_version_major", + FieldValue::U64(major as u64), + (4, 5), + )); + header.push(FieldNode::leaf( +
"profile_version_minor", + FieldValue::U64(data.get(5).copied().unwrap_or(0) as u64), + (5, 6), + )); + let flags = le_u16(data, 6).unwrap_or(0); + if flags != 0 { + warnings.push(format!("flags is {flags:#06x}; v1.0 requires 0")); + } + header.push(FieldNode::leaf( + "flags", + FieldValue::U64(flags as u64), + (6, 8), + )); + let inner_table_offset = le_u64(data, 8).unwrap_or(0); + header.push(FieldNode::leaf( + "inner_table_offset", + FieldValue::U64(inner_table_offset), + (8, 16), + )); + let arena_used = le_u64(data, 16).unwrap_or(0); + header.push(FieldNode::leaf( + "arena_used", + FieldValue::U64(arena_used), + (16, 24), + )); + fields.push(header); + + // ---- Inner Table Block chain -------------------------------------- + let mut inner_group = FieldNode::group("Inner Table Block(s)"); + let mut frag_offsets: Vec<(String, u64)> = Vec::new(); // (label, start_offset) + let mut off = inner_table_offset; + let mut block_idx = 0usize; + let mut budget = data.len() / TABLE_HEADER_SIZE as usize + 1; + while off != 0 { + if budget == 0 { + warnings.push("inner table chain does not terminate".into()); + break; + } + budget -= 1; + let base = off.min(usize::MAX as u64) as usize; // `off` is untrusted input + if base.saturating_add(TABLE_HEADER_SIZE as usize) > data.len() { + warnings.push(format!("inner Table Block at {off} runs past end of arena")); + break; + } + let count = data[base]; + let next = le_u64(data, base + 1).unwrap_or(0); + let th_algo = data.get(base + 9).copied().unwrap_or(0); + let mut block = FieldNode::group(format!("block[{block_idx}] @ {off}")); + block.push(FieldNode::leaf( + "partition_count", + FieldValue::U64(count as u64), + (base as u64, base as u64 + 1), + )); + block.push(FieldNode::leaf( + "next_table_offset", + FieldValue::U64(next), + (base as u64 + 1, base as u64 + 9), + )); + block.push( + hash_field(data, base + 9, base + 10) + .with_note(format!("table_hash ({})", hash_algo_name(th_algo))), + ); + + for i in 0..count as usize { + let eo = base + 
TABLE_HEADER_SIZE as usize + i * ENTRY_SIZE as
usize; + if eo + ENTRY_SIZE as usize > data.len() { + warnings.push(format!("inner entry {i} runs past end of arena")); + break; + } + let ptype = le_u32(data, eo).unwrap_or(0); + let uid = uid_at(data, eo + 4).unwrap_or([0; 16]); + let label = label32(data, eo + 20); + let start_offset = le_u64(data, eo + 52).unwrap_or(0); + let max_length = le_u64(data, eo + 60).unwrap_or(0); + let used_bytes = le_u64(data, eo + 68).unwrap_or(0); + + if ptype == DCP_CONTAINER_TYPE { + warnings.push(format!( + "inner entry \"{label}\" is itself a DCP container (nesting forbidden)" + )); + } + if max_length != used_bytes { + warnings.push(format!( + "inner entry \"{label}\": max_length ({max_length}) != used_bytes ({used_bytes}) (spec 7.2)" + )); + } + frag_offsets.push((label.clone(), start_offset)); + + let mut entry = FieldNode::group(format!("inner[{label}]")); + entry.push(FieldNode::leaf( + "type", + FieldValue::U64(ptype as u64), + (eo as u64, eo as u64 + 4), + )); + entry.push(FieldNode::leaf( + "uid", + FieldValue::Uid(uid), + (eo as u64 + 4, eo as u64 + 20), + )); + entry.push(FieldNode::leaf( + "label", + FieldValue::Text(label), + (eo as u64 + 20, eo as u64 + 52), + )); + entry.push( + FieldNode::leaf( + "start_offset", + FieldValue::U64(start_offset), + (eo as u64 + 52, eo as u64 + 60), + ) + .with_note("reinterpreted -> Fragment Table"), + ); + entry.push( + FieldNode::leaf( + "max_length", + FieldValue::U64(max_length), + (eo as u64 + 60, eo as u64 + 68), + ) + .with_note("reinterpreted = used_bytes"), + ); + entry.push(FieldNode::leaf( + "used_bytes", + FieldValue::U64(used_bytes), + (eo as u64 + 68, eo as u64 + 76), + )); + entry.push(hash_field(data, eo + 76, eo + 77)); + block.push(entry); + } + inner_group.push(block); + off = next; + block_idx += 1; + } + fields.push(inner_group); + + // ---- Fragment Tables, one chain per inner partition --------------- + let mut frag_group = FieldNode::group("Fragment Tables"); + let mut total_extents = 0usize; + let mut
shared_extents = 0usize; + for (label, start) in &frag_offsets { + let mut inner = FieldNode::group(format!("frags[{label}] @ {start}")); + let mut foff = *start; + let mut fbudget = data.len() / FRAGTABLE_HEADER_SIZE as usize + 1; + let mut chain_idx = 0usize; + while foff != 0 { + if fbudget == 0 { + warnings.push(format!("fragment table for \"{label}\" does not terminate")); + break; + } + fbudget -= 1; + let base = foff.min(usize::MAX as u64) as usize; // `foff` is untrusted input + if base.saturating_add(FRAGTABLE_HEADER_SIZE as usize) > data.len() { + warnings.push(format!( + "fragment table for \"{label}\" runs past end of arena" + )); + break; + } + let next = le_u64(data, base).unwrap_or(0); + let fcount = data[base + 8]; + let mut blk = FieldNode::group(format!("block[{chain_idx}] @ {foff}")); + blk.push(FieldNode::leaf( + "next_fragtable_offset", + FieldValue::U64(next), + (base as u64, base as u64 + 8), + )); + blk.push(FieldNode::leaf( + "fragment_count", + FieldValue::U64(fcount as u64), + (base as u64 + 8, base as u64 + 9), + )); + for i in 0..fcount as usize { + let xo = + base + FRAGTABLE_HEADER_SIZE as usize + i * FRAGMENT_ENTRY_SIZE as usize; + if xo + FRAGMENT_ENTRY_SIZE as usize > data.len() { + warnings.push(format!( + "fragment {i} of \"{label}\" runs past end of arena" + )); + break; + } + let ext_off = le_u64(data, xo).unwrap_or(0); + let ext_len = le_u64(data, xo + 8).unwrap_or(0); + let kind = data.get(xo + 16).copied().unwrap_or(0); + let eflags = data.get(xo + 17).copied().unwrap_or(0); + let shared = eflags & 1 != 0; + total_extents += 1; + if shared { + shared_extents += 1; + } + if kind != KIND_DATA { + warnings.push(format!( + "fragment {i} of \"{label}\" has kind {kind} ({}) — unreadable in v1.0", + kind_name(kind) + )); + } + if eflags & !1 != 0 { + warnings.push(format!( + "fragment {i} of \"{label}\" has reserved flag bits set" + )); + } + let mut frag = FieldNode::group(format!("extent[{i}]")); + frag.push(FieldNode::leaf( + "extent_offset", 
+ FieldValue::U64(ext_off), + (xo as u64, xo as u64
+ 8), + )); + frag.push(FieldNode::leaf( + "extent_length", + FieldValue::U64(ext_len), + (xo as u64 + 8, xo as u64 + 16), + )); + frag.push(FieldNode::leaf( + "kind", + FieldValue::Enum { + raw: kind as u64, + name: kind_name(kind).into(), + }, + (xo as u64 + 16, xo as u64 + 17), + )); + frag.push(FieldNode::leaf( + "flags", + FieldValue::Flags { + raw: eflags as u64, + set: if shared { + vec!["SHARED".into()] + } else { + Vec::new() + }, + }, + (xo as u64 + 17, xo as u64 + 18), + )); + blk.push(frag); + } + inner.push(blk); + foff = next; + chain_idx += 1; + } + frag_group.push(inner); + } + fields.push(frag_group); + + // ---- Summary ------------------------------------------------------ + let mut summary = FieldNode::group("summary"); + summary.push(FieldNode::leaf( + "inner_partitions", + FieldValue::U64(frag_offsets.len() as u64), + (0, 0), + )); + summary.push(FieldNode::leaf( + "extents", + FieldValue::U64(total_extents as u64), + (0, 0), + )); + summary.push(FieldNode::leaf( + "shared_extents", + FieldValue::U64(shared_extents as u64), + (0, 0), + )); + fields.push(summary); + + Decoded { + format_name: "DCP_CONTAINER".into(), + fields, + warnings, + } + } +} + +/// Render a 4-byte magic as ASCII (non-printable bytes shown as `\xNN`). +fn ascii4(b: &[u8]) -> String { + b.iter() + .map(|&c| { + if (0x20..0x7f).contains(&c) { + (c as char).to_string() + } else { + format!("\\x{c:02x}") + } + }) + .collect() +} + +/// Decode a 32-byte label field (read until the first NUL). 
+fn label32(data: &[u8], off: usize) -> String { + let bytes = data.get(off..off + 32).unwrap_or(&[]); + let end = bytes.iter().position(|&b| b == 0).unwrap_or(bytes.len()); + String::from_utf8_lossy(&bytes[..end]).into_owned() +} diff --git a/tools/pcf-debug/src/plugin/mod.rs b/tools/pcf-debug/src/plugin/mod.rs index 6aad704..35627b1 100644 --- a/tools/pcf-debug/src/plugin/mod.rs +++ b/tools/pcf-debug/src/plugin/mod.rs @@ -11,10 +11,12 @@ //!
(shared-library) backend could be added behind a feature without reworking //! any decoder. +mod dcp; mod pcfsig; mod pfs; mod raw; +pub use dcp::DcpContainerDecoder; pub use pcfsig::{PcfSigKeyDecoder, PcfSigSignatureDecoder}; pub use pfs::{PfsNodeDecoder, PfsSessionDecoder}; pub use raw::RawDecoder; @@ -141,6 +143,7 @@ impl DecoderRegistry { Box::new(PfsSessionDecoder), Box::new(PcfSigKeyDecoder), Box::new(PcfSigSignatureDecoder), + Box::new(DcpContainerDecoder), Box::new(RawDecoder), ], } diff --git a/tools/pcf-debug/tests/decode_dcp.rs b/tools/pcf-debug/tests/decode_dcp.rs new file mode 100644 index 0000000..1fb449c --- /dev/null +++ b/tools/pcf-debug/tests/decode_dcp.rs @@ -0,0 +1,125 @@ +//! Tests for the PCF-DCP container decoder, both directly (with synthesised +//! bytes) and through the full walk → registry → decode pipeline using the +//! canonical 700-byte test vector from `reference/PCF-DCP-v1.0/testdata/`. + +use pcf_debug::build_report; +use pcf_debug::plugin::{ + DcpContainerDecoder, Decoded, DecoderRegistry, FieldNode, FieldValue, PartitionDecoder, + PartitionMeta, +}; + +const CANONICAL: &[u8] = include_bytes!("../../../reference/PCF-DCP-v1.0/testdata/canonical.bin"); + +const DCP_CONTAINER_TYPE: u32 = 0xAAAC_0001; + +/// Find a (possibly nested) field by name. 
+fn find<'a>(fields: &'a [FieldNode], name: &str) -> Option<&'a FieldNode> { + for f in fields { + if f.name == name { + return Some(f); + } + if let Some(hit) = find(&f.children, name) { + return Some(hit); + } + } + None +} + +fn find_decoded<'a>( + report: &'a pcf_debug::render::Report, + format_name: &str, +) -> Option<&'a Decoded> { + report + .decoded + .iter() + .find(|(_, d)| d.format_name == format_name) + .map(|(_, d)| d) +} + +#[test] +fn registry_routes_dcp_type_to_dedicated_decoder() { + let r = DecoderRegistry::with_builtins(); + assert!(r.names().contains(&"dcp-container")); +} + +#[test] +fn dcp_decoder_on_canonical_vector() { + let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").expect("canonical vector has a DCP_CONTAINER"); + + assert!( + dcp.warnings.is_empty(), + "clean container has no warnings: {:?}", + dcp.warnings + ); + + // DCP Header. + let magic = find(&dcp.fields, "dcp_magic").unwrap(); + assert_eq!(magic.value, FieldValue::Text("PDCP".into())); + assert_eq!(magic.note.as_deref(), Some("magic OK")); + + let ito = find(&dcp.fields, "inner_table_offset").unwrap(); + assert_eq!(ito.value, FieldValue::U64(109)); + let used = find(&dcp.fields, "arena_used").unwrap(); + assert_eq!(used.value, FieldValue::U64(465)); + + // Inner partition A: two extents, reinterpreted start_offset. + let inner_a = find(&dcp.fields, "inner[A]").unwrap(); + let start = find(&inner_a.children, "start_offset").unwrap(); + assert_eq!(start.value, FieldValue::U64(37)); + assert_eq!( + start.note.as_deref(), + Some("reinterpreted -> Fragment Table") + ); + + // Summary: 2 inner partitions, 3 extent references, 2 of them shared. 
+ let inner_count = find(&dcp.fields, "inner_partitions").unwrap(); + assert_eq!(inner_count.value, FieldValue::U64(2)); + let extents = find(&dcp.fields, "extents").unwrap(); + assert_eq!(extents.value, FieldValue::U64(3)); + let shared = find(&dcp.fields, "shared_extents").unwrap(); + assert_eq!(shared.value, FieldValue::U64(2)); +} + +#[test] +fn dcp_decoder_flags_shared_extent() { + let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins()); + let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap(); + // Fragment B's only extent is SHARED. + let frags_b = find(&dcp.fields, "frags[B] @ 82").unwrap(); + let flags = find(&frags_b.children, "flags").unwrap(); + match &flags.value { + FieldValue::Flags { raw, set } => { + assert_eq!(*raw, 1); + assert_eq!(set, &vec!["SHARED".to_string()]); + } + other => panic!("flags has wrong shape: {other:?}"), + } +} + +#[test] +fn dcp_decoder_warns_on_bad_magic() { + let mut bytes = vec![0u8; 24]; + bytes[..4].copy_from_slice(b"XDCP"); + let uid = [0u8; 16]; + let meta = PartitionMeta { + partition_type: DCP_CONTAINER_TYPE, + uid: &uid, + label: "dcp", + }; + let d: Decoded = DcpContainerDecoder.decode(&meta, &bytes); + assert!(d.warnings.iter().any(|w| w.contains("magic"))); +} + +#[test] +fn dcp_decoder_matches_by_magic_without_type() { + let mut bytes = vec![0u8; 24]; + bytes[..4].copy_from_slice(b"PDCP"); + let uid = [0u8; 16]; + let meta = PartitionMeta { + partition_type: 0xFFFF_FFFF, + uid: &uid, + label: "raw", + }; + assert!(DcpContainerDecoder.matches(&meta, &bytes)); +} From 484866d2ee48041571c0dd8c5077a6d94dcab05b Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 7 Jun 2026 23:54:55 +0000 Subject: [PATCH 3/3] pcf-debug: recursively decode DCP inner partition content MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The DcpContainerDecoder showed a DCP container's structure and the metadata of its inner partitions, but never interpreted their content. 
Add a generic recursion mechanism so container decoders can have what they hold decoded: - New optional `PartitionDecoder::children(meta, data) -> Vec<DecodedChild>` (default empty) plus `DecoderRegistry::children` (first-match, mirrors `decode`). `DecodedChild` carries a sub-partition's type/uid/label and its reconstructed logical content. - `decode_recursive` / `attach_inner_decodes` in lib.rs decode a partition, then decode each child recursively and nest the results under a "decoded inner partitions" group (child titled `content[label] -> FORMAT`, with the child's own warnings preserved as a sub-group). Guarded by MAX_DECODE_DEPTH. build_report and the `decode` subcommand (filter_decode) use it, so nesting applies to text, HTML, and forced-decoder output alike. - DcpContainerDecoder implements `children` via pcf_dcp::Arena (parse + per inner content reconstruction); malformed arenas or non-reconstructable inners are skipped defensively. The mechanism is profile-agnostic: lib.rs gains no pcf-dcp dependency, and any future container-like decoder gets recursion for free. Renderers and the `Report.decoded` key are unchanged — the nested group flows through the existing field-tree renderer. Tests: nested decode of the canonical vector (content[A]/content[B] -> RAW), routing of a recognizable inner format (PFS_NODE) through the registry, and that leaf partitions report no children. Existing decode_dcp assertions still hold (the new group uses a distinct name prefix; inner warnings nest under the child, not the container).
https://claude.ai/code/session_01XzcjWWbNiuNX9ZywevfbQu --- tools/pcf-debug/README.md | 9 +++ tools/pcf-debug/src/lib.rs | 82 ++++++++++++++++++++++++++- tools/pcf-debug/src/main.rs | 12 ++-- tools/pcf-debug/src/plugin/dcp.rs | 28 +++++++++- tools/pcf-debug/src/plugin/mod.rs | 35 ++++++++++++ tools/pcf-debug/tests/decode_dcp.rs | 86 +++++++++++++++++++++++++++++ 6 files changed, 245 insertions(+), 7 deletions(-) diff --git a/tools/pcf-debug/README.md b/tools/pcf-debug/README.md index b4eda26..fb3a318 100644 --- a/tools/pcf-debug/README.md +++ b/tools/pcf-debug/README.md @@ -82,6 +82,10 @@ pcf-debug fs.pcf decode - **Diagnostics** — gaps, overlaps, truncated regions, chain cycles, and hash mismatches, by severity. - **Decoded partitions** — field trees produced by the plugin decoders. + *Container* decoders also decode what they contain: a `DCP_CONTAINER` + (`0xAAAC0001`) reconstructs each inner partition's logical content and routes + it back through the registry, nesting the result under a *decoded inner + partitions* group (e.g. an inner `PFS_NODE` is shown as a full PFS field tree). ## Writing a decoder plugin @@ -126,6 +130,11 @@ The first decoder whose `matches` returns true wins; `raw` is always last and matches everything. `decode` must be infallible — on malformed input, return the fields you could read plus `warnings`. +A *container* decoder may also override the optional `children` method to return +the sub-partitions it holds (each as a `DecodedChild` carrying a reconstructed +content blob). The pipeline decodes those recursively and nests them under the +parent — see `dcp-container` (`src/plugin/dcp.rs`). + The built-in `pfs-node` and `pfs-session` decoders (`src/plugin/pfs.rs`) are a complete worked example covering the PFS-MS record formats. 
diff --git a/tools/pcf-debug/src/lib.rs b/tools/pcf-debug/src/lib.rs index 68a35dc..0a264de 100644 --- a/tools/pcf-debug/src/lib.rs +++ b/tools/pcf-debug/src/lib.rs @@ -18,9 +18,14 @@ pub mod model; pub mod plugin; pub mod render; -use plugin::{DecoderRegistry, PartitionMeta}; +use plugin::{Decoded, DecodedChild, DecoderRegistry, FieldNode, FieldValue, PartitionMeta}; use render::Report; +/// Maximum container nesting depth followed by [`decode_recursive`]. DCP forbids +/// nesting, so the real depth is at most 2; this is a guard against pathological +/// or hostile inputs. +pub const MAX_DECODE_DEPTH: usize = 8; + /// Read a partition's used bytes from the file image, or an empty slice when the /// region is out of bounds or empty. fn partition_bytes(data: &[u8], entry: &pcf::PartitionEntry, in_bounds: bool) -> Vec<u8> { @@ -49,8 +54,81 @@ pub fn build_report(data: &[u8], verify: bool, registry: &DecoderRegistry) -> Re uid: &e.uid, label: &label, }; - decoded.push((e.uid, registry.decode(&meta, &bytes))); + decoded.push((e.uid, decode_recursive(registry, &meta, &bytes))); } } Report { layout, decoded } } + +/// Decode `data`, then recursively decode and nest any sub-partitions a +/// container decoder surfaces (e.g. the inner partitions of a DCP container). +/// The nested decodes appear as a `"decoded inner partitions"` group at the end +/// of the field tree. +pub fn decode_recursive(registry: &DecoderRegistry, meta: &PartitionMeta, data: &[u8]) -> Decoded { + let mut dec = registry.decode(meta, data); + attach_inner_decodes(registry, meta, data, &mut dec); + dec +} + +/// Append a `"decoded inner partitions"` group to `dec` for every sub-partition +/// the matching container decoder reports, decoding each recursively. A no-op +/// for non-container partitions. Useful when `dec` was produced by a forced +/// decoder (`--decoder`) and should still gain its nested children.
+pub fn attach_inner_decodes( + registry: &DecoderRegistry, + meta: &PartitionMeta, + data: &[u8], + dec: &mut Decoded, +) { + attach_at_depth(registry, meta, data, dec, 0); +} + +fn attach_at_depth( + registry: &DecoderRegistry, + meta: &PartitionMeta, + data: &[u8], + dec: &mut Decoded, + depth: usize, +) { + if depth >= MAX_DECODE_DEPTH { + return; + } + let kids = registry.children(meta, data); + if kids.is_empty() { + return; + } + let mut group = FieldNode::group("decoded inner partitions"); + for ch in kids { + let cmeta = PartitionMeta { + partition_type: ch.partition_type, + uid: &ch.uid, + label: &ch.label, + }; + let mut cdec = registry.decode(&cmeta, &ch.data); + attach_at_depth(registry, &cmeta, &ch.data, &mut cdec, depth + 1); + group.push(child_to_field(&ch, cdec)); + } + dec.fields.push(group); +} + +/// Wrap one child's decoded field tree as a single named group, carrying its +/// uid/type as a note and preserving any decoder warnings as a sub-group. +fn child_to_field(child: &DecodedChild, dec: Decoded) -> FieldNode { + let mut node = FieldNode::group(format!("content[{}] -> {}", child.label, dec.format_name)) + .with_note(format!( + "uid {} type 0x{:08X}", + render::uid_hex(&child.uid), + child.partition_type + )); + for f in dec.fields { + node.push(f); + } + if !dec.warnings.is_empty() { + let mut warns = FieldNode::group("warnings"); + for msg in dec.warnings { + warns.push(FieldNode::leaf("warning", FieldValue::Text(msg), (0, 0))); + } + node.push(warns); + } + node +} diff --git a/tools/pcf-debug/src/main.rs b/tools/pcf-debug/src/main.rs index c412c2b..21af8b9 100644 --- a/tools/pcf-debug/src/main.rs +++ b/tools/pcf-debug/src/main.rs @@ -160,10 +160,14 @@ fn filter_decode( label: &label, }; let dec = match &opts.decoder { - Some(name) => registry - .decode_with(name, &meta, &bytes) - .unwrap_or_else(|| registry.decode(&meta, &bytes)), - None => registry.decode(&meta, &bytes), + Some(name) => { + let mut d = registry + .decode_with(name,
&meta, &bytes) + .unwrap_or_else(|| registry.decode(&meta, &bytes)); + pcf_debug::attach_inner_decodes(registry, &meta, &bytes, &mut d); + d + } + None => pcf_debug::decode_recursive(registry, &meta, &bytes), }; decoded.push((e.uid, dec)); } diff --git a/tools/pcf-debug/src/plugin/dcp.rs b/tools/pcf-debug/src/plugin/dcp.rs index 888770c..8796840 100644 --- a/tools/pcf-debug/src/plugin/dcp.rs +++ b/tools/pcf-debug/src/plugin/dcp.rs @@ -14,7 +14,8 @@ use pcf_dcp::{ }; use super::{ - le_u16, le_u32, le_u64, uid_at, Decoded, FieldNode, FieldValue, PartitionDecoder, PartitionMeta, + le_u16, le_u32, le_u64, uid_at, Decoded, DecodedChild, FieldNode, FieldValue, PartitionDecoder, + PartitionMeta, }; fn kind_name(kind: u8) -> &'static str { @@ -374,6 +375,31 @@ impl PartitionDecoder for DcpContainerDecoder { warnings, } } + + /// The inner partitions of the DCP container, each with its reconstructed + /// logical content, so the pipeline can decode them recursively (spec + /// Sections 7–8). Defensive: a malformed arena or an inner partition whose + /// content cannot be reconstructed (reserved fragment kind, length + /// mismatch) is simply omitted — `decode` already surfaces the structural + /// detail and any warnings. + fn children(&self, _meta: &PartitionMeta, data: &[u8]) -> Vec<DecodedChild> { + let arena = match pcf_dcp::Arena::parse(data) { + Ok(a) => a, + Err(_) => return Vec::new(), + }; + arena + .inners() + .into_iter() + .filter_map(|info| { + arena.content(&info.uid).ok().map(|content| DecodedChild { + partition_type: info.partition_type, + uid: info.uid, + label: info.label, + data: content, + }) + }) + .collect() + } } /// Render a 4-byte magic as ASCII (non-printable bytes shown as `\xNN`).
diff --git a/tools/pcf-debug/src/plugin/mod.rs b/tools/pcf-debug/src/plugin/mod.rs
index 35627b1..145ea3c 100644
--- a/tools/pcf-debug/src/plugin/mod.rs
+++ b/tools/pcf-debug/src/plugin/mod.rs
@@ -113,6 +113,22 @@ pub struct Decoded {
     pub warnings: Vec<String>,
 }
 
+/// A sub-partition surfaced by a *container* decoder (e.g. the inner partitions
+/// of a DCP container) whose reconstructed logical content should itself be
+/// decoded. Returned by [`PartitionDecoder::children`] and decoded recursively
+/// by [`crate::decode_recursive`].
+#[derive(Debug, Clone)]
+pub struct DecodedChild {
+    /// The sub-partition's application type.
+    pub partition_type: u32,
+    /// The sub-partition's 16-byte uid.
+    pub uid: [u8; 16],
+    /// The sub-partition's decoded label.
+    pub label: String,
+    /// The sub-partition's reconstructed logical content.
+    pub data: Vec<u8>,
+}
+
 /// A plugin that turns partition bytes into a field tree.
 pub trait PartitionDecoder {
     /// Stable identifier, used for `--decoder` selection and HTML anchors.
@@ -125,6 +141,14 @@ pub trait PartitionDecoder {
     /// Full decode. Must never panic: on malformed input it returns whatever
     /// fields it could read plus `warnings`.
     fn decode(&self, meta: &PartitionMeta, data: &[u8]) -> Decoded;
+
+    /// Sub-partitions contained within this partition whose reconstructed
+    /// content should itself be decoded (e.g. the inner partitions of a DCP
+    /// container). The default is none; only container-like decoders override
+    /// it. Must never panic: on malformed input it returns an empty list.
+    fn children(&self, _meta: &PartitionMeta, _data: &[u8]) -> Vec<DecodedChild> {
+        Vec::new()
+    }
 }
 
 /// An ordered set of decoders. The first decoder whose `matches` returns true
@@ -171,6 +195,17 @@ impl DecoderRegistry {
         RawDecoder.decode(meta, data)
     }
 
+    /// The sub-partitions of `data`, as reported by the first matching decoder
+    /// (mirrors [`Self::decode`]). Empty for non-container partitions.
+    pub fn children(&self, meta: &PartitionMeta, data: &[u8]) -> Vec<DecodedChild> {
+        for d in &self.decoders {
+            if d.matches(meta, data) {
+                return d.children(meta, data);
+            }
+        }
+        Vec::new()
+    }
+
     /// Decode with a specific decoder by name, if present.
     pub fn decode_with(&self, name: &str, meta: &PartitionMeta, data: &[u8]) -> Option<Decoded> {
         self.decoders
diff --git a/tools/pcf-debug/tests/decode_dcp.rs b/tools/pcf-debug/tests/decode_dcp.rs
index 1fb449c..0c43c05 100644
--- a/tools/pcf-debug/tests/decode_dcp.rs
+++ b/tools/pcf-debug/tests/decode_dcp.rs
@@ -123,3 +123,89 @@ fn dcp_decoder_matches_by_magic_without_type() {
     };
     assert!(DcpContainerDecoder.matches(&meta, &bytes));
 }
+
+/// Find the first (possibly nested) field whose name contains `needle`.
+fn find_contains<'a>(fields: &'a [FieldNode], needle: &str) -> Option<&'a FieldNode> {
+    for f in fields {
+        if f.name.contains(needle) {
+            return Some(f);
+        }
+        if let Some(hit) = find_contains(&f.children, needle) {
+            return Some(hit);
+        }
+    }
+    None
+}
+
+#[test]
+fn recursively_decodes_inner_partition_content() {
+    // The pipeline reconstructs each inner partition's logical content and
+    // decodes it, nesting the result under the DCP container.
+    let report = build_report(CANONICAL, true, &DecoderRegistry::with_builtins());
+    let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap();
+
+    let group = find(&dcp.fields, "decoded inner partitions")
+        .expect("container has a decoded-inner-partitions group");
+    // A's content "Hello, World!" (13 B) and B's "World!" (6 B) are raw text.
+    let a = find_contains(&group.children, "content[A]").unwrap();
+    assert!(a.name.contains("-> RAW"), "A decodes as RAW: {}", a.name);
+    let a_size = find(&a.children, "size").unwrap();
+    assert_eq!(a_size.value, FieldValue::U64(13));
+    let b = find_contains(&group.children, "content[B]").unwrap();
+    let b_size = find(&b.children, "size").unwrap();
+    assert_eq!(b_size.value, FieldValue::U64(6));
+}
+
+#[test]
+fn recursive_decode_routes_inner_to_matching_decoder() {
+    use pcf_dcp::{Arena, Chunker, DcpWriter, HashAlgo};
+
+    // An inner partition typed as PFS_NODE (0xAAAA0001) must route, after
+    // reconstruction, to the PFS node decoder — not the raw fallback.
+    let mut node = b"PFSN".to_vec();
+    node.extend_from_slice(&[0u8; 60]); // pad past the fixed prefix
+    let mut arena = Arena::new();
+    arena
+        .add_inner(
+            0xAAAA_0001,
+            [0x0A; 16],
+            "node",
+            &node,
+            HashAlgo::Sha256,
+            Chunker::Whole,
+        )
+        .unwrap();
+    let mut w = DcpWriter::new();
+    w.add_container([0xDC; 16], "dcp", arena).unwrap();
+    let image = w.to_image().unwrap();
+
+    let report = build_report(&image, true, &DecoderRegistry::with_builtins());
+    let dcp = find_decoded(&report, "DCP_CONTAINER").unwrap();
+    let group = find(&dcp.fields, "decoded inner partitions").unwrap();
+    let child = find_contains(&group.children, "content[node]").unwrap();
+    assert!(
+        child.name.contains("-> PFS_NODE"),
+        "inner routed to PFS decoder, got: {}",
+        child.name
+    );
+    // The container's own warnings are unaffected by the inner's warnings,
+    // which are nested under the child instead.
+    assert!(
+        dcp.warnings.is_empty(),
+        "container warnings: {:?}",
+        dcp.warnings
+    );
+}
+
+#[test]
+fn registry_reports_no_children_for_leaf_partitions() {
+    // A non-container partition yields no children (default trait impl).
+    let uid = [0u8; 16];
+    let meta = PartitionMeta {
+        partition_type: 0xFFFF_FFFF,
+        uid: &uid,
+        label: "raw",
+    };
+    let registry = DecoderRegistry::with_builtins();
+    assert!(registry.children(&meta, b"plain bytes").is_empty());
+}