diff --git a/package-lock.json b/package-lock.json index bdd27102..86333d1e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -8,7 +8,7 @@ "name": "ipfs-specs-website", "version": "1.0.0", "dependencies": { - "spec-generator": "^1.6.1" + "spec-generator": "^1.7.0" } }, "node_modules/@11ty/dependency-tree": { @@ -5201,6 +5201,16 @@ "markdown-it": "bin/markdown-it.js" } }, + "node_modules/markdown-table": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", + "integrity": "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw==", + "license": "MIT", + "funding": { + "type": "github", + "url": "https://github.com/sponsors/wooorm" + } + }, "node_modules/marked": { "version": "12.0.2", "resolved": "https://registry.npmjs.org/marked/-/marked-12.0.2.tgz", @@ -5354,6 +5364,107 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/mdast-util-gfm": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm/-/mdast-util-gfm-3.1.0.tgz", + "integrity": "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ==", + "license": "MIT", + "dependencies": { + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-gfm-autolink-literal": "^2.0.0", + "mdast-util-gfm-footnote": "^2.0.0", + "mdast-util-gfm-strikethrough": "^2.0.0", + "mdast-util-gfm-table": "^2.0.0", + "mdast-util-gfm-task-list-item": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-autolink-literal": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-autolink-literal/-/mdast-util-gfm-autolink-literal-2.0.1.tgz", + "integrity": "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": 
"^4.0.0", + "ccount": "^2.0.0", + "devlop": "^1.0.0", + "mdast-util-find-and-replace": "^3.0.0", + "micromark-util-character": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-footnote/-/mdast-util-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.1.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-strikethrough": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-strikethrough/-/mdast-util-gfm-strikethrough-2.0.0.tgz", + "integrity": "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/mdast-util-gfm-table": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-table/-/mdast-util-gfm-table-2.0.0.tgz", + "integrity": "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "markdown-table": "^3.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + 
"node_modules/mdast-util-gfm-task-list-item": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/mdast-util-gfm-task-list-item/-/mdast-util-gfm-task-list-item-2.0.0.tgz", + "integrity": "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "devlop": "^1.0.0", + "mdast-util-from-markdown": "^2.0.0", + "mdast-util-to-markdown": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/mdast-util-phrasing": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/mdast-util-phrasing/-/mdast-util-phrasing-4.1.0.tgz", @@ -5565,6 +5676,127 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/micromark-extension-gfm": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm/-/micromark-extension-gfm-3.0.0.tgz", + "integrity": "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w==", + "license": "MIT", + "dependencies": { + "micromark-extension-gfm-autolink-literal": "^2.0.0", + "micromark-extension-gfm-footnote": "^2.0.0", + "micromark-extension-gfm-strikethrough": "^2.0.0", + "micromark-extension-gfm-table": "^2.0.0", + "micromark-extension-gfm-tagfilter": "^2.0.0", + "micromark-extension-gfm-task-list-item": "^2.0.0", + "micromark-util-combine-extensions": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-autolink-literal": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-autolink-literal/-/micromark-extension-gfm-autolink-literal-2.1.0.tgz", + "integrity": "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw==", + "license": "MIT", + "dependencies": { + 
"micromark-util-character": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-footnote": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-footnote/-/micromark-extension-gfm-footnote-2.1.0.tgz", + "integrity": "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-core-commonmark": "^2.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-normalize-identifier": "^2.0.0", + "micromark-util-sanitize-uri": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-strikethrough": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-strikethrough/-/micromark-extension-gfm-strikethrough-2.1.0.tgz", + "integrity": "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-util-chunked": "^2.0.0", + "micromark-util-classify-character": "^2.0.0", + "micromark-util-resolve-all": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-table": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-table/-/micromark-extension-gfm-table-2.1.1.tgz", + "integrity": 
"sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-tagfilter": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-tagfilter/-/micromark-extension-gfm-tagfilter-2.0.0.tgz", + "integrity": "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg==", + "license": "MIT", + "dependencies": { + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, + "node_modules/micromark-extension-gfm-task-list-item": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/micromark-extension-gfm-task-list-item/-/micromark-extension-gfm-task-list-item-2.1.0.tgz", + "integrity": "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw==", + "license": "MIT", + "dependencies": { + "devlop": "^1.0.0", + "micromark-factory-space": "^2.0.0", + "micromark-util-character": "^2.0.0", + "micromark-util-symbol": "^2.0.0", + "micromark-util-types": "^2.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/micromark-factory-destination": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/micromark-factory-destination/-/micromark-factory-destination-2.0.1.tgz", @@ -8658,6 +8890,24 @@ "url": "https://opencollective.com/unified" } }, + "node_modules/remark-gfm": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/remark-gfm/-/remark-gfm-4.0.1.tgz", + "integrity": 
"sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg==", + "license": "MIT", + "dependencies": { + "@types/mdast": "^4.0.0", + "mdast-util-gfm": "^3.0.0", + "micromark-extension-gfm": "^3.0.0", + "remark-parse": "^11.0.0", + "remark-stringify": "^11.0.0", + "unified": "^11.0.0" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/unified" + } + }, "node_modules/remark-heading-id": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/remark-heading-id/-/remark-heading-id-1.0.1.tgz", @@ -9320,9 +9570,9 @@ "integrity": "sha512-zC8zGoGkmc8J9ndvml8Xksr1Amk9qBujgbF0JAIWO7kXr43w0h/0GJNM/Vustixu+YE8N/MTrQ7N31FvHUACxQ==" }, "node_modules/spec-generator": { - "version": "1.6.1", - "resolved": "https://registry.npmjs.org/spec-generator/-/spec-generator-1.6.1.tgz", - "integrity": "sha512-yDzubb+cWKPlg82SQSaFeHjHVbKu58tlcvbnAy8yFtxnikUL2c06GViBw7yAOZPYjTS/meZ7vQp61IJ0myG0XQ==", + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/spec-generator/-/spec-generator-1.7.0.tgz", + "integrity": "sha512-U5itp3X8mU84chN0xmwgEtAaq/VL4gbC4AK5EdqJxGydhUhz+OnNUcRTKP9v3k9wYVaXbNIeoOzVBnzrNDV1XQ==", "license": "MIT", "dependencies": { "@11ty/eleventy": "^2.0.1", @@ -9342,6 +9592,7 @@ "pluralize": "^8.0.0", "remark": "^15.0.1", "remark-directive": "^3.0.0", + "remark-gfm": "^4.0.1", "remark-heading-id": "^1.0.1", "remark-html": "^16.0.1", "remark-squeeze-paragraphs": "^6.0.0", diff --git a/package.json b/package.json index 53ed9595..2a9f851c 100644 --- a/package.json +++ b/package.json @@ -10,6 +10,6 @@ "license": "", "private": true, "dependencies": { - "spec-generator": "^1.6.1" + "spec-generator": "^1.7.0" } } diff --git a/src/ipips/ipip-0499.md b/src/ipips/ipip-0499.md new file mode 100644 index 00000000..77a0c621 --- /dev/null +++ b/src/ipips/ipip-0499.md @@ -0,0 +1,251 @@ +--- +title: 'IPIP-0499: UnixFS CID Profiles' +date: 2026-01-14 +ipip: proposal +editors: + - name: Michelle Lee + github: 
mishmosh + affiliation: + name: IPFS Foundation + url: https://ipfsfoundation.org + - name: Daniel Norman + github: 2color + affiliation: + name: Independent + url: https://norman.life + - name: Marcin Rataj + github: lidel + affiliation: + name: Shipyard + url: https://ipshipyard.com/ +relatedIssues: + - https://discuss.ipfs.tech/t/should-we-profile-cids/18507 +thanks: + - name: Alex Potsides + github: achingbrain + affiliation: + name: Shipyard + url: https://ipshipyard.com/ + - name: Juan Caballero + github: bumblefudge + affiliation: + name: IPFS Foundation + url: https://ipfsfoundation.org + - name: Hector Sanjuan + github: hsanjuan + affiliation: + name: Shipyard + url: https://ipshipyard.com/ + - name: Steven Vandevelde + github: icidasset + - name: Christian Paul + github: jaller94 + - name: Rod Vagg + github: rvagg + - name: Seth Docherty + github: SethDocherty +order: 0499 +tags: ['ipips'] +--- + +## Summary + +This proposal introduces **configuration profiles** for CIDs that represent files and directories using [UnixFS](https://specs.ipfs.tech/unixfs/). The legacy profiles table also documents non-UnixFS implementations for reference. + +## Motivation + +While CIDs and UnixFS DAGs are cryptographically verifiable, the same file or directory can produce different CIDs across UnixFS implementations, because DAG construction parameters like chunk size, DAG width, and layout vary between tools. Often, these parameters are not even configurable by users. + +This creates two problems: + +- **Broken hash semantics:** Unlike standard hash functions where identical input produces identical output, UnixFS CIDs depend on DAG construction parameters. Simple CID comparison leads to false-negatives. +- **Verification overhead:** Without knowing the original parameters, users must retrieve and compare entire DAGs to verify content, adding storage, bandwidth, and complexity. 
+ +A potential solution is to define configuration profiles: well-known parameter presets that implementations can adopt when common conventions for DAG creation are desired. + +See related discussion at [discuss.ipfs.tech/t/should-we-profile-cids/18507](https://discuss.ipfs.tech/t/should-we-profile-cids/18507). + +### UnixFS parameters + +The following [UnixFS](https://specs.ipfs.tech/unixfs/) parameters were identified as factors that affect the resulting CID: + +1. CID version, e.g. CIDv0 or CIDv1 +1. Multibase encoding for the CID, e.g. `base32` +1. Hash function used for all nodes in the DAG, e.g. `sha2-256` +1. UnixFS file chunking algorithm and chunk size (e.g., fixed-size chunks of 256KiB) +1. UnixFS DAG layout: + - `balanced`: builds a balanced tree where all leaf nodes are at the same depth. Optimized for random access, seeking, and range requests within files (e.g., video). + - `balanced-packed`: variant of `balanced` that may produce a different tree structure for large files. See [Balanced DAG layout variants](#balanced-dag-layout-variants) below. + - `trickle`: builds a tree optimized for on-the-fly one-time streaming, where data can be consumed before the entire file is available. Useful for logs and other append-only data structures where random access is not important. +1. UnixFS DAG width (max number of links per `File` node) +1. [HAMTDirectory](https://specs.ipfs.tech/unixfs/#dag-pb-hamtdirectory) fanout: the branching factor at each level of the HAMT tree (e.g., 256 leaves). +1. [HAMTDirectory threshold](https://specs.ipfs.tech/unixfs/#when-to-use-hamt-sharding): max `Directory` size before converting to `HAMTDirectory`, based on `PBNode.Links` count or estimated serialized [dag-pb](https://ipld.io/specs/codecs/dag-pb/spec/) size: + - `links-count`: `PBNode.Links` length (child count). Simple but ignores varying entry sizes. + - `links-bytes`: sum of `PBNode.Links[].Name` and `PBNode.Links[].Hash` byte lengths. Underestimates actual size by ignoring UnixFS Data, Tsize, and protobuf overhead. + - `block-bytes`: full serialized dag-pb node size.
Most accurate, accounts for varint `Tsize` and optional metadata such as `mode` or `mtime`. +1. Leaves: either [dag-pb wrapped](https://specs.ipfs.tech/unixfs/#dag-pb-node) or [raw](https://specs.ipfs.tech/unixfs/#raw-node) +1. Whether empty directories are included in the DAG. Some implementations may apply filtering. +1. Whether hidden entities (including dot files) are included in the DAG. Some implementations may apply filtering. +1. Directory wrapping for single files: in order to retain the name of a single file, some implementations have the option to wrap the file in a `Directory` with a link to the file. +1. Presence and accurate setting of `Tsize` (correct UnixFS has `Tsize` of child sub-DAGs). +1. [Symlink](https://specs.ipfs.tech/unixfs/#dag-pb-symlink) handling: preserved as UnixFS Type=4 nodes, or followed (dereferenced to target). +1. [Mode](https://specs.ipfs.tech/unixfs/#mode-field): optional POSIX file permissions. +1. [Mtime](https://specs.ipfs.tech/unixfs/#mtime-field): optional modification timestamp. + +#### Balanced DAG layout variants + +The `balanced` DAG layout has implementation variants that affect CID determinism for large files. CID mismatches have been [observed](https://discuss.ipfs.tech/t/should-we-profile-cids/18507/41) and [investigated](https://discuss.ipfs.tech/t/should-we-profile-cids/18507/44) when comparing [kubo][] and [Singularity][singularity] outputs for files exceeding 1 GiB. This IPIP introduces the name `balanced-packed` to distinguish Singularity's variant from the original `balanced` layout. + +Implementations adopting a profile SHOULD specify which balanced variant they use. The `unixfs-v1-2025` profile uses `balanced` for maximum compatibility with existing implementations. + +##### `balanced` + +The original balanced layout used by [kubo][]/[boxo][], [helia][], and others in the ecosystem.
Builds the tree incrementally as chunks stream in: +- Starts with first chunk as root, grows tree upward as needed +- Uses explicit depth tracking to fill nodes recursively +- All leaf nodes end up at the **same depth** from the root +- Reference: [`boxo/ipld/unixfs/importer/balanced/builder.go`](https://github.com/ipfs/boxo/blob/v0.35.2/ipld/unixfs/importer/balanced/builder.go) + +##### `balanced-packed` + +Name introduced by this IPIP for [Singularity][singularity]'s variant. Groups pre-computed links in batch: +- Takes all chunk links as input, then packs them into parent nodes (up to max width) +- Repeats packing level-by-level until single root remains +- Trailing nodes may have fewer children, causing leaf depth to vary +- Optimized for batch processing of pre-chunked data in CAR files +- Reference: [`singularity/pack/packutil/util.go`](https://github.com/data-preservation-programs/singularity/blob/v0.6.0-RC4/pack/packutil/util.go) `AssembleFileFromLinks()` + +##### Observed differences + +According to [Singularity issue #525](https://github.com/data-preservation-programs/singularity/issues/525): +> "In Singularity's DAG, the last leaf node is not at the same distance from the root as the others." + +This structural difference causes CID mismatches for files larger than `chunk_size * dag_width` (e.g., >1 GiB with 1 MiB chunks and 1024 links per node), even when all other parameters match. + +### Divergence in current implementations + +We analyzed the default settings across the most popular UnixFS implementations in the ecosystem. 
The table below documents the divergence that prevents deterministic CID generation today: + +| Parameter | [kubo][] (CIDv0) | [helia][] | [storacha][] | [kubo][] (CIDv1) | [singularity][] | [dasl][] | +| ----------------------------- | ------------------------ | -------------------- | ------------------ | ----------------------------- | ----------------------------------- | ------------ | +| Based on | v0.39 (`unixfs-v0-2015`) | @helia/unixfs 6.0.4 | w3cli 7.12.0 | v0.39 (`test-cid-v1` profile) | v0.6.0-RC4 (454b630) | spec 2025-12 | +| CID version | CIDv0 | CIDv1 | CIDv1 | CIDv1 | CIDv1 | CIDv1 | +| Hash function | sha2-256 | sha2-256 | sha2-256 | sha2-256 | sha2-256 | sha2-256 | +| Chunking algorithm | fixed-size | fixed-size | fixed-size | fixed-size | fixed-size | N/A | +| Max chunk size | 256KiB | 1MiB | 1MiB | 1MiB | 1MiB | N/A | +| DAG layout | balanced | balanced | balanced | balanced | [balanced-packed](#balanced-packed) | N/A | +| DAG width (children per node) | 174 | 1024 | 1024 | 174 | 1024 | N/A | +| HAMTDirectory fanout | 256 blocks | 256 blocks | 256 blocks | 256 blocks | 256 blocks (boxo) | N/A | +| HAMTDirectory threshold | 256KiB (links-bytes) | 256KiB (links-bytes) | 1000 (links-count) | 256KiB (links-bytes) | 256KiB (links-bytes) (boxo) | N/A | +| Leaves | dag-pb | raw | raw | raw | raw | N/A | +| Empty directories | included | included | excluded | included | included | N/A | +| Hidden entities | excluded (opt-in) | excluded (opt-in) | excluded (opt-in) | excluded (opt-in) | included (rclone) | N/A | +| Symlinks | preserved | followed | followed | preserved | skipped (rclone) | N/A | +| Mode (permissions) | excluded (opt-in) | excluded (opt-in) | not supported | excluded (opt-in) | not supported | N/A | +| Mtime (modification time) | excluded (opt-in) | excluded (opt-in) | not supported | excluded (opt-in) | not supported | N/A | + +**Terminology:** + +- `included`: Always included in the DAG (no option to exclude) +- `excluded`: Always 
excluded from the DAG (no option to include) +- `opt-in`: Excluded by default; implementations provide a flag to include (e.g., `--hidden` in Kubo/Storacha, `hidden: true` in Helia) +- `opt-out`: Included by default; implementations provide a flag to exclude +- `preserved`: Symlinks stored as UnixFS Type=4 nodes with target path (per [UnixFS spec](https://specs.ipfs.tech/unixfs/)). Note: Kubo (v0.39) `--dereference-args` only follows symlinks passed as CLI arguments; symlinks found during recursive traversal are always preserved. +- `followed`: Symlinks dereferenced and treated as target files/directories +- `skipped`: Symlinks ignored during traversal (not included in DAG) +- `(rclone)`: Singularity delegates file traversal to [rclone](https://rclone.org/); values shown reflect rclone defaults +- `(boxo)`: Singularity overrides some [boxo][] defaults but relies on implicit boxo defaults for these values + +## Detailed design + +We introduce a set of **named configuration profiles**, each specifying the complete set of parameters for generating UnixFS CIDs. When implementations use these profiles, they guarantee that the same input, processed with the same profile, will yield the same CID across different tools and implementations. + +### The `unixfs-v1-2025` modern profile + +Based on the research above, we define **`unixfs-v1-2025`** as an opinionated profile for implementations that want to adopt deterministic CID generation for UnixFS DAGs with CIDv1. 
+ +| Parameter | `unixfs-v1-2025` | +| ----------------------------- | -------------------- | +| CID version | CIDv1 | +| Hash function | sha2-256 | +| Chunking algorithm | fixed-size | +| Max chunk size | 1MiB | +| DAG layout | balanced | +| DAG width (children per node) | 1024 | +| HAMTDirectory fanout | 256 blocks | +| HAMTDirectory threshold | 256KiB (block-bytes) | +| Leaves | raw | +| Empty directories | included (opt-out) | +| Hidden entities | excluded (opt-in) | +| Symlinks | preserved | +| Mode (permissions) | excluded (opt-in) | +| Mtime (modification time) | excluded (opt-in) | + +### The `unixfs-v0-2015` legacy profile + +This profile documents the default UnixFS DAG construction parameters used by Kubo through version 0.39 when producing CIDv0. It is provided for users who depend on CIDv0 identifiers generated by Kubo and need to reproduce them with other implementations, or verify content against existing CIDv0 references. The year 2015 in the name indicates that the majority of these parameters were picked a decade ago, when the initial go-ipfs alpha software was implemented, and these defaults have not been contested since. + +| Parameter | `unixfs-v0-2015` | +| ----------------------------- | -------------------- | +| CID version | CIDv0 | +| Hash function | sha2-256 | +| Chunking algorithm | fixed-size | +| Max chunk size | 256KiB | +| DAG layout | balanced | +| DAG width (children per node) | 174 | +| HAMTDirectory fanout | 256 blocks | +| HAMTDirectory threshold | 256KiB (links-bytes) | +| Leaves | dag-pb | +| Empty directories | included | +| Hidden entities | excluded (opt-in) | +| Symlinks | preserved | +| Mode (permissions) | excluded (opt-in) | +| Mtime (modification time) | excluded (opt-in) | + +### User benefit + +Profiles provide key advantages for working with content-addressed data: + +1.
**Predictable, deterministic behavior:** Profiles restore intuitive hash-like behavior: identical input data always produces identical CIDs, regardless of which implementation generates them. + +2. **Lightweight verification:** Users can verify content without needing to rely on additional merkle proofs or CAR files. + +3. **Simplified workflow:** Users can select a profile and automatically get consistent CIDs across all implementations, without needing to configure or understand the underlying parameters. + +4. **Improved efficiency:** The `unixfs-v1-2025` profile uses 1 MiB chunks with 1024 links per node, compared to the legacy 256 KiB chunks with 174 links. This results in: + - Shallower DAG trees (3 levels for a 1 TiB file vs 4 levels with legacy parameters) + - Approximately 4x fewer total nodes for the same content + - Faster random access and seeking in large files (fewer round-trips to traverse the tree) + - Fewer CIDs to announce, reducing stress on public good routing infrastructure such as the Amino DHT + +### Compatibility + +UnixFS data encoded with the CID profiles defined in this IPIP remains fully compatible with existing implementations, since it conforms to the [UnixFS specification](https://specs.ipfs.tech/unixfs/). + +To generate CIDs in compliance with this IPIP, implementations MUST support the `unixfs-v1-2025` profile. The `unixfs-v0-2015` profile is provided for backward compatibility and MAY be supported by implementations that need to produce CIDs matching historical Kubo output. + +Implementations SHOULD allow users to inspect default values and adjust configuration options related to CID generation. + +### Alternatives + +As an alternative to profiles, users can store and transfer CAR files of UnixFS content, which include the merkle DAG nodes needed to verify the CID. + +## Test fixtures + +TODO + +List relevant CIDs. Describe how implementations can use them to determine +specification compliance. 
This section can be skipped if IPIP does not deal +with the way IPFS handles content-addressed data, or the modified specification +file already includes this information. + +[kubo]: https://github.com/ipfs/kubo +[boxo]: https://github.com/ipfs/boxo +[helia]: https://github.com/ipfs/helia +[storacha]: https://github.com/storacha/w3cli +[singularity]: https://github.com/data-preservation-programs/singularity +[dasl]: https://dasl.ing + +### Copyright + +Copyright and related rights waived via [CC0](https://creativecommons.org/publicdomain/zero/1.0/). diff --git a/src/unixfs.md b/src/unixfs.md index 934414b3..85664067 100644 --- a/src/unixfs.md +++ b/src/unixfs.md @@ -501,6 +501,10 @@ node exceeds a size threshold between 256 KiB and 1 MiB. This threshold: See [Block Size Considerations](#block-size-considerations) for details on block size limits and conventions. +:::note +For standardized threshold estimation methods that enable deterministic CID generation, see [IPIP-0499: UnixFS CID Profiles](https://specs.ipfs.tech/ipips/ipip-0499/). +::: + ### `dag-pb` `Symlink` A :dfn[Symlink] represents a POSIX [symbolic link](https://pubs.opengroup.org/onlinepubs/9699919799/functions/symlink.html).