diff --git a/packages/backfill/README.md b/packages/backfill/README.md index 70576fd39..cd1e5f33b 100644 --- a/packages/backfill/README.md +++ b/packages/backfill/README.md @@ -13,7 +13,9 @@ This package provides ActivityPub conversation backfill support for the [Fedify] ecosystem. It can retrieve post-like objects from a seed object's context collection, following the direct FEP-f228-style path where the context dereferences to a `Collection`, `OrderedCollection`, `CollectionPage`, -or `OrderedCollectionPage`. +or `OrderedCollectionPage`. It can also use an opt-in reply-tree strategy to +walk `inReplyTo` ancestors and `replies` descendants when context collections +are unavailable or incomplete. [JSR badge]: https://jsr.io/badges/@fedify/backfill [JSR]: https://jsr.io/@fedify/backfill @@ -62,6 +64,10 @@ for await ( The seed object itself is not yielded. If it appears in the discovered collection, it is skipped by ID. +Configured strategies run in order. They share `maxItems`, `maxRequests`, +abort state, and object ID deduplication; if two strategies discover the same +object, the first strategy keeps its `BackfillItem` metadata. + By default, `backfill()` uses the `context-auto` strategy. In this mode, collection items are treated as backfillable objects by default. If an item is recognized as a supported `Create` activity, `backfill()` extracts the @@ -82,3 +88,24 @@ for await ( The `context-activities` strategy currently supports `Create` activities and yields the activity's object, not the activity itself. + +To combine the FEP-f228 context collection path with traditional reply-tree +crawling, add the `reply-tree` strategy after `context-auto`: + +~~~~ typescript +for await ( + const item of backfill({ documentLoader }, note, { + strategies: ["context-auto", "reply-tree"], + maxDepth: 4, + }) +) { + console.log(item.origin, item.depth, item.object); +} +~~~~ + +The `reply-tree` strategy walks `inReplyTo` ancestors and `replies` +descendants. It yields discovered post-like objects only; it does not extract +objects from Activity wrappers. Immediate parents and direct replies have +depth 1, their next-level parents or replies have depth 2, and so on. +Reply-tree traversal defaults to a maximum depth of 10; set `maxDepth` to use a +different limit. diff --git a/packages/backfill/src/backfill.test.ts b/packages/backfill/src/backfill.test.ts index 12dc0928d..9a38c6278 100644 --- a/packages/backfill/src/backfill.test.ts +++ b/packages/backfill/src/backfill.test.ts @@ -201,7 +201,664 @@ describe("backfill", () => { deepStrictEqual(await collect(context, note, { strategies: [] }), []); }); - test("context auto overrides overlapping strategies", async () => { + test("reply tree strategy does not require context collection", async () => { + const note = new Note({ + id: new URL("https://example.com/notes/1"), + contexts: [new URL("https://example.com/contexts/1")], + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + deepStrictEqual( + await collect(context, note, { strategies: ["reply-tree"] }), + [], + ); + }); + + test("reply tree yields embedded ancestor", async () => { + const parent = new Note({ + id: new URL("https://example.com/notes/1"), + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + replyTarget: parent, + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, parent); + deepStrictEqual(items[0].id, parent.id); + strictEqual(items[0].strategy, "reply-tree"); + strictEqual(items[0].origin, "in-reply-to"); + strictEqual(items[0].depth, 1); + }); + + test("reply tree dereferences ancestor URL", async () => { + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: (iri) => + Promise.resolve(iri.href === parentId.href ? parent : null), + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + deepStrictEqual(items[0].object.id, parent.id); + strictEqual(items[0].origin, "in-reply-to"); + strictEqual(items[0].depth, 1); + }); + + test("reply tree maxDepth limits ancestors", async () => { + const rootId = new URL("https://example.com/notes/1"); + const parentId = new URL("https://example.com/notes/2"); + const root = new Note({ + id: rootId, + content: "root", + }); + const parent = new Note({ + id: parentId, + content: "parent", + replyTarget: rootId, + }); + const note = new Note({ + id: new URL("https://example.com/notes/3"), + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === parentId.href) return Promise.resolve(parent); + if (iri.href === rootId.href) return Promise.resolve(root); + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + maxDepth: 1, + }); + + strictEqual(items.length, 1); + deepStrictEqual(items[0].object.id, parent.id); + strictEqual(items[0].depth, 1); + }); + + test("reply tree defaults maxDepth to 10 for ancestors", async () => { + let note = new Note({ + id: new URL("https://example.com/notes/0"), + }); + for (let i = 1; i <= 12; i++) { + note = new Note({ + id: new URL(`https://example.com/notes/${i}`), + replyTarget: note, + }); + } + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 10); + strictEqual(items.at(-1)?.depth, 10); + }); + + test("maxRequests limits reply tree ancestor dereferencing", async () => { + const parentId = new URL("https://example.com/notes/1"); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + deepStrictEqual( + await collect(context, note, { + strategies: ["reply-tree"], + maxRequests: 0, + }), + [], + ); + }); + + test("reply tree avoids ancestor cycles", async () => { + const seedId = new URL("https://example.com/notes/1"); + const parentId = new URL("https://example.com/notes/2"); + const note = new Note({ + id: seedId, + replyTarget: parentId, + }); + const parent = new Note({ + id: parentId, + replyTarget: seedId, + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === seedId.href) return Promise.resolve(note); + if (iri.href === parentId.href) return Promise.resolve(parent); + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + deepStrictEqual(items[0].object.id, parent.id); + }); + + test("reply tree deduplicates ancestors from context collection", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + contexts: [contextId], + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [parent], + }), + ); + } + if (iri.href === parentId.href) return Promise.resolve(parent); + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-auto", "reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, parent); + strictEqual(items[0].strategy, "context-auto"); + }); + + test("document cache avoids duplicate dereferences across strategies", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + contexts: [contextId], + replyTarget: parentId, + }); + const requests: URL[] = []; + const context: BackfillContext = { + documentLoader: (iri) => { + requests.push(iri); + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [parentId], + }), + ); + } + if (iri.href === parentId.href) return Promise.resolve(parent); + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-auto", "reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, parentId.href); + deepStrictEqual(requests.map((url) => url.href), [ + contextId.href, + parentId.href, + ]); + }); + + test("document cache does not keep failed dereferences", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + contexts: [contextId], + replyTarget: parentId, + }); + const requests: URL[] = []; + let parentRequests = 0; + const context: BackfillContext = { + documentLoader: (iri) => { + requests.push(iri); + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [parentId], + }), + ); + } + if (iri.href === parentId.href) { + parentRequests++; + if (parentRequests === 1) throw new Error("temporary failure"); + return Promise.resolve(parent); + } + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-auto", "reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, parentId.href); + deepStrictEqual(requests.map((url) => url.href), [ + contextId.href, + parentId.href, + parentId.href, + ]); + }); + + test("strategy order controls deduplicated item metadata", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + contexts: [contextId], + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === parentId.href) return Promise.resolve(parent); + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [parent], + }), + ); + } + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree", "context-auto"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, parentId.href); + strictEqual(items[0].strategy, "reply-tree"); + strictEqual(items[0].origin, "in-reply-to"); + }); + + test("context auto preserves strategy order across reply tree", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/1"); + const parent = new Note({ + id: parentId, + content: "parent", + }); + const note = new Note({ + id: new URL("https://example.com/notes/2"), + contexts: [contextId], + replyTarget: parentId, + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [parent], + }), + ); + } + if (iri.href === parentId.href) return Promise.resolve(parent); + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-objects", "reply-tree", "context-auto"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, parentId.href); + strictEqual(items[0].strategy, "context-objects"); + strictEqual(items[0].origin, "collection"); + }); + + test("reply tree yields embedded descendants", async () => { + const reply = new Note({ + id: new URL("https://example.com/notes/2"), + content: "reply", + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [reply], + }), + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, reply); + deepStrictEqual(items[0].id, reply.id); + strictEqual(items[0].strategy, "reply-tree"); + strictEqual(items[0].origin, "replies"); + strictEqual(items[0].depth, 1); + }); + + test("reply tree walks sibling descendants from discovered ancestor", async () => { + const seedId = new URL("https://example.com/notes/2"); + const sibling = new Note({ + id: new URL("https://example.com/notes/3"), + content: "sibling", + }); + const parent = new Note({ + id: new URL("https://example.com/notes/1"), + content: "parent", + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [seedId, sibling], + }), + }); + const note = new Note({ + id: seedId, + replyTarget: parent, + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 2); + strictEqual(items[0].object, parent); + strictEqual(items[0].origin, "in-reply-to"); + strictEqual(items[1].object, sibling); + strictEqual(items[1].origin, "replies"); + }); + + test("reply tree dereferences replies collection URL", async () => { + const repliesId = new URL("https://example.com/notes/1/replies"); + const reply = new Note({ + id: new URL("https://example.com/notes/2"), + content: "reply", + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + replies: repliesId, + }); + const context: BackfillContext = { + documentLoader: (iri) => + Promise.resolve( + iri.href === repliesId.href + ? new Collection({ + id: repliesId, + items: [reply], + }) + : null, + ), + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + deepStrictEqual(items[0].object.id, reply.id); + strictEqual(items[0].origin, "replies"); + strictEqual(items[0].depth, 1); + }); + + test("reply tree maxDepth limits descendants", async () => { + const grandchild = new Note({ + id: new URL("https://example.com/notes/3"), + content: "grandchild", + }); + const reply = new Note({ + id: new URL("https://example.com/notes/2"), + content: "reply", + replies: new Collection({ + id: new URL("https://example.com/notes/2/replies"), + items: [grandchild], + }), + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [reply], + }), + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + maxDepth: 1, + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, reply); + strictEqual(items[0].depth, 1); + }); + + test("reply tree defaults maxDepth to 10 for descendants", async () => { + let note = new Note({ + id: new URL("https://example.com/notes/12"), + }); + for (let i = 11; i >= 0; i--) { + note = new Note({ + id: new URL(`https://example.com/notes/${i}`), + replies: new Collection({ + id: new URL(`https://example.com/notes/${i}/replies`), + items: [note], + }), + }); + } + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 10); + strictEqual(items.at(-1)?.depth, 10); + }); + + test("maxRequests limits reply tree replies dereferencing", async () => { + const repliesId = new URL("https://example.com/notes/1/replies"); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + replies: repliesId, + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + deepStrictEqual( + await collect(context, note, { + strategies: ["reply-tree"], + maxRequests: 0, + }), + [], + ); + }); + + test("reply tree does not reload visited replies collection URL", async () => { + const repliesId = new URL("https://example.com/notes/1/replies"); + const reply = new Note({ + id: new URL("https://example.com/notes/2"), + content: "reply", + replies: repliesId, + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + replies: repliesId, + }); + let requests = 0; + const context: BackfillContext = { + documentLoader: (iri) => { + requests++; + strictEqual(iri.href, repliesId.href); + return Promise.resolve( + new Collection({ + id: repliesId, + items: [reply], + }), + ); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + }); + + strictEqual(requests, 1); + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, reply.id?.href); + }); + + test("reply tree skips visited reply IRIs before dereferencing", async () => { + const seedId = new URL("https://example.com/notes/1"); + const siblingId = new URL("https://example.com/notes/2"); + const sibling = new Note({ + id: siblingId, + content: "sibling", + }); + const note = new Note({ + id: seedId, + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [seedId, siblingId], + }), + }); + const requests: string[] = []; + const context: BackfillContext = { + documentLoader: (iri) => { + requests.push(iri.href); + if (iri.href === siblingId.href) return Promise.resolve(sibling); + if (iri.href === seedId.href) { + throw new Error("seed should have been skipped"); + } + return Promise.resolve(null); + }, + }; + + const items = await collect(context, note, { + strategies: ["reply-tree"], + maxRequests: 1, + }); + + deepStrictEqual(requests, [siblingId.href]); + strictEqual(items.length, 1); + strictEqual(items[0].object.id?.href, siblingId.href); + }); + + test("reply tree avoids descendant cycles", async () => { + const seedId = new URL("https://example.com/notes/1"); + const replyId = new URL("https://example.com/notes/2"); + const note = new Note({ + id: seedId, + }); + const reply = new Note({ + id: replyId, + replies: new Collection({ + id: new URL("https://example.com/notes/2/replies"), + items: [note], + }), + }); + const seed = note.clone({ + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [reply], + }), + }); + const context: BackfillContext = { + documentLoader: () => { + throw new Error("documentLoader should not be called"); + }, + }; + + const items = await collect(context, seed, { + strategies: ["reply-tree"], + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, reply); + }); + + test("context auto overrides overlapping context strategies", async () => { const contextId = new URL("https://example.com/contexts/1"); const item = new Note({ content: "anonymous" }); const note = new Note({ @@ -219,7 +876,7 @@ describe("backfill", () => { }; const items = await collect(context, note, { - strategies: ["context-auto", "context-objects"], + strategies: ["context-objects", "context-auto", "reply-tree"], }); strictEqual(items.length, 1); @@ -327,6 +984,48 @@ describe("backfill", () => { strictEqual(items[1].strategy, "context-activities"); }); + test("combined context strategies share context collection loading", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const post = new Note({ + id: new URL("https://example.com/notes/2"), + content: "hello", + }); + const activityObject = new Note({ + id: new URL("https://example.com/notes/3"), + content: "activity object", + }); + const activity = new Create({ + id: new URL("https://example.com/activities/1"), + object: activityObject, + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + contexts: [contextId], + }); + let requests = 0; + const context: BackfillContext = { + documentLoader: (iri) => { + requests++; + strictEqual(iri.href, contextId.href); + return Promise.resolve( + new Collection({ + id: contextId, + items: [post, activity], + }), + ); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-objects", "context-activities"], + }); + + strictEqual(requests, 1); + strictEqual(items.length, 2); + strictEqual(items[0].object, post); + strictEqual(items[1].object, activityObject); + }); + test("context activity collection dereferences activity object URL", async () => { const contextId = new URL("https://example.com/contexts/1"); const itemId = new URL("https://example.com/notes/2"); @@ -614,6 +1313,45 @@ describe("backfill", () => { ]); }); + test("seen context collection URL items are not loaded", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const seedId = new URL("https://example.com/notes/1"); + const itemId = new URL("https://example.com/notes/2"); + const item = new Note({ + id: itemId, + content: "hello", + }); + const note = new Note({ + id: seedId, + contexts: [contextId], + }); + const requests: URL[] = []; + const context: BackfillContext = { + documentLoader: (iri) => { + requests.push(iri); + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [seedId, itemId], + }), + ); + } + if (iri.href === itemId.href) return Promise.resolve(item); + throw new Error("seen collection item should not be loaded"); + }, + }; + + const items = await collect(context, note); + + strictEqual(items.length, 1); + strictEqual(items[0].id?.href, itemId.href); + deepStrictEqual(requests.map((url) => url.href), [ + contextId.href, + itemId.href, + ]); + }); + test("failed URL collection items are skipped", async () => { const contextId = new URL("https://example.com/contexts/1"); const missingItemId = new URL("https://example.com/notes/missing"); @@ -727,6 +1465,44 @@ describe("backfill", () => { strictEqual(items[0].id?.href, "https://example.com/notes/2"); }); + test("maxItems is shared across context and reply tree", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const reply = new Note({ + id: new URL("https://example.com/notes/3"), + content: "reply", + }); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + contexts: [contextId], + replies: new Collection({ + id: new URL("https://example.com/notes/1/replies"), + items: [reply], + }), + }); + const contextItem = new Note({ + id: new URL("https://example.com/notes/2"), + content: "context item", + }); + const context: BackfillContext = { + documentLoader: () => + Promise.resolve( + new Collection({ + id: contextId, + items: [contextItem], + }), + ), + }; + + const items = await collect(context, note, { + strategies: ["context-auto", "reply-tree"], + maxItems: 1, + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, contextItem); + strictEqual(items[0].strategy, "context-auto"); + }); + test("maxRequests limits dereferencing", async () => { const contextId = new URL("https://example.com/contexts/1"); const itemId = new URL("https://example.com/notes/2"); @@ -751,6 +1527,41 @@ describe("backfill", () => { deepStrictEqual(await collect(context, note, { maxRequests: 1 }), []); }); + test("maxRequests is shared across context and reply tree", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/0"); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + contexts: [contextId], + replyTarget: parentId, + }); + const contextItem = new Note({ + id: new URL("https://example.com/notes/2"), + content: "context item", + }); + const context: BackfillContext = { + documentLoader: (iri) => { + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [contextItem], + }), + ); + } + throw new Error("reply-tree request should be budgeted out"); + }, + }; + + const items = await collect(context, note, { + strategies: ["context-auto", "reply-tree"], + maxRequests: 1, + }); + + strictEqual(items.length, 1); + strictEqual(items[0].object, contextItem); + }); + test("AbortSignal stops traversal", async () => { const contextId = new URL("https://example.com/contexts/1"); const note = new Note({ @@ -775,6 +1586,56 @@ describe("backfill", () => { ); }); + test("AbortSignal stops traversal across strategies", async () => { + const contextId = new URL("https://example.com/contexts/1"); + const parentId = new URL("https://example.com/notes/0"); + const controller = new AbortController(); + const note = new Note({ + id: new URL("https://example.com/notes/1"), + contexts: [contextId], + replyTarget: parentId, + }); + const contextItem = new Note({ + id: new URL("https://example.com/notes/2"), + content: "context item", + }); + let requests = 0; + const context: BackfillContext = { + documentLoader: (iri) => { + requests++; + if (iri.href === contextId.href) { + return Promise.resolve( + new Collection({ + id: contextId, + items: [contextItem], + }), + ); + } + throw new Error("reply-tree request should not be started"); + }, + }; + + const items: Awaited> = []; + await rejects( + async () => { + for await ( + const item of backfill(context, note, { + strategies: ["context-auto", "reply-tree"], + signal: controller.signal, + }) + ) { + items.push(item); + controller.abort(); + } + }, + { name: "AbortError" }, + ); + + strictEqual(requests, 1); + strictEqual(items.length, 1); + strictEqual(items[0].object, contextItem); + }); + test("documentLoader receives AbortSignal", async () => { const contextId = new URL("https://example.com/contexts/1"); const note = new Note({ diff --git a/packages/backfill/src/backfill.ts b/packages/backfill/src/backfill.ts index 3a0b4361d..db56361f2 100644 --- a/packages/backfill/src/backfill.ts +++ b/packages/backfill/src/backfill.ts @@ -13,6 +13,7 @@ import type { BackfillContext, BackfillItem, BackfillOptions, + BackfillOrigin, BackfillStrategy, } from "./types.ts"; @@ -20,6 +21,8 @@ const defaultStrategies = [ "context-auto", ] as const satisfies readonly BackfillStrategy[]; +const DEFAULT_MAX_DEPTH = 10; + /** * Thrown when backfill traversal exceeds the configured request budget. * @@ -30,8 +33,24 @@ export class MaxRequestsExceeded extends Error {} interface RequestBudget { readonly signal?: AbortSignal; requestCount: number; + readonly documents: Map>; } +type StrategyItem = { + readonly object: APObject; + readonly strategy: BackfillStrategy; + readonly origin: BackfillOrigin; + readonly depth: number; +}; + +type ReplyTreeTraversal = { + readonly depth: number; + readonly visitedObjectIds: Set; + readonly visitedObjects: WeakSet; + readonly visitedCollectionIds: Set; + readonly visitedCollections: WeakSet; +}; + /** * Backfills post-like objects related to a seed object. * @@ -51,33 +70,49 @@ export async function* backfill< const strategies = normalizeStrategies(options.strategies); if (strategies.length < 1) return; - const contextId = note.contextIds[0]; - if (contextId == null) return; - const budget: RequestBudget = { signal: options.signal, requestCount: 0, + documents: new Map(), }; const seenIds = new Set(); if (note.id != null) seenIds.add(note.id.href); - const collection = await loadObject(context, contextId, options, budget); - if (!isCollection(collection)) return; - let yielded = 0; try { - for await ( - const object of getCollectionItems(context, collection, options, budget) - ) { - for await ( - const item of getBackfillItems( + for (let i = 0; i < strategies.length; i++) { + const strategy = strategies[i]; + let items: AsyncIterable; + if (isContextStrategy(strategy)) { + const contextStrategies: Exclude[] = [ + strategy, + ]; + while (true) { + const nextStrategy = strategies[i + 1]; + if (nextStrategy == null || !isContextStrategy(nextStrategy)) break; + contextStrategies.push(nextStrategy); + i++; + } + items = getContextStrategyItems( context, - object, - strategies, + note, + contextStrategies, options, budget, - ) - ) { + seenIds, + ); + } else { + items = getStrategyItems( + context, + note, + strategy, + options, + budget, + seenIds, + ); + } + + for await (const item of items) { const id = item.object.id ?? undefined; if (id != null) { if (seenIds.has(id.href)) continue; @@ -89,8 +124,8 @@ export async function* backfill< object: item.object as TObject, id, strategy: item.strategy, - origin: "collection", - depth: 0, + origin: item.origin, + depth: item.depth, }; yielded++; @@ -106,24 +141,358 @@ export async function* backfill< function normalizeStrategies( strategies: readonly BackfillStrategy[] = defaultStrategies, ): readonly BackfillStrategy[] { - if (strategies.includes("context-auto")) return ["context-auto"]; - return Array.from(new Set(strategies)); + const normalized: BackfillStrategy[] = []; + for (const strategy of strategies) { + if (strategy === "context-auto") { + for ( + let i = normalized.length - 1; + i >= 0 && isContextStrategy(normalized[i]); + i-- + ) { + normalized.splice(i, 1); + } + if (!normalized.includes(strategy)) normalized.push(strategy); + } else if (isContextStrategy(strategy)) { + if ( + !currentContextGroupHasAuto(normalized) && + !normalized.includes(strategy) + ) { + normalized.push(strategy); + } + } else if (!normalized.includes(strategy)) { + normalized.push(strategy); + } + } + return normalized; } -async function* getBackfillItems( - context: BackfillContext, - object: APObject | Link, +function isContextStrategy( + strategy: BackfillStrategy, +): strategy is Exclude { + return strategy === "context-objects" || + strategy === "context-activities" || + strategy === "context-auto"; +} + +function currentContextGroupHasAuto( strategies: readonly BackfillStrategy[], +): boolean { + for (let i = strategies.length - 1; i >= 0; i--) { + const strategy = strategies[i]; + if (!isContextStrategy(strategy)) return false; + if (strategy === "context-auto") return true; + } + return false; +} + +async function* getContextStrategyItems( + context: BackfillContext, + note: APObject, + strategies: readonly Exclude[], options: BackfillOptions, budget: RequestBudget, + seenIds: ReadonlySet, +): AsyncIterable<{ + readonly object: APObject; + readonly strategy: Exclude; + readonly origin: "collection"; + readonly depth: 0; +}> { + const contextId = note.contextIds[0]; + if (contextId == null) return; + const collection = await loadObject(context, contextId, options, budget); + if (!isCollection(collection)) return; + for await ( + const object of getCollectionItems( + context, + collection, + options, + budget, + seenIds, + ) + ) { + for (const strategy of strategies) { + for await ( + const item of getContextBackfillItems( + context, + object, + strategy, + options, + budget, + ) + ) { + yield { + object: item.object, + strategy: item.strategy, + origin: "collection", + depth: 0, + }; + } + } + } +} + +async function* getStrategyItems( + context: BackfillContext, + note: APObject, + strategy: BackfillStrategy, + options: BackfillOptions, + budget: RequestBudget, + seenIds: ReadonlySet, ): AsyncIterable<{ readonly object: APObject; readonly strategy: BackfillStrategy; + readonly origin: BackfillOrigin; + readonly depth: number; }> { - for (const strategy of strategies) { - if (strategy === "context-objects" && isContextPostObject(object)) { - yield { object, strategy }; - } else if (strategy === "context-activities") { + if (isContextStrategy(strategy)) { + yield* getContextStrategyItems( + context, + note, + [strategy], + options, + budget, + seenIds, + ); + } else if (strategy === "reply-tree") { + yield* getReplyTreeItems(context, note, options, budget); + } +} + +async function* getReplyTreeItems( + context: BackfillContext, + note: APObject, + options: BackfillOptions, + budget: RequestBudget, +): AsyncIterable<{ + readonly object: APObject; + readonly strategy: "reply-tree"; + readonly origin: "in-reply-to" | "replies"; + readonly depth: number; +}> { + const visitedObjectIds = new Set(); + const visitedObjects = new WeakSet(); + const visitedCollectionIds = new Set(); + const visitedCollections = new WeakSet(); + if (note.id != null) visitedObjectIds.add(note.id.href); + visitedObjects.add(note); + const ancestors: APObject[] = []; + for await ( + const item of getReplyAncestors(context, note, options, budget, { + depth: 1, + visitedObjectIds, + visitedObjects, + visitedCollectionIds, + visitedCollections, + }) + ) { + ancestors.push(item.object); + yield item; + } + for (const object of ancestors.toReversed()) { + yield* getReplyDescendants(context, object, options, budget, { + depth: 1, + visitedObjectIds, + visitedObjects, + visitedCollectionIds, + visitedCollections, + }); + } + yield* getReplyDescendants(context, note, options, budget, { + depth: 1, + visitedObjectIds, + visitedObjects, + visitedCollectionIds, + visitedCollections, + }); +} + +async function* getReplyAncestors( + context: BackfillContext, + object: APObject, + options: BackfillOptions, + budget: RequestBudget, + traversal: ReplyTreeTraversal, +): AsyncIterable<{ + readonly object: APObject; + readonly strategy: "reply-tree"; + readonly origin: "in-reply-to"; + readonly depth: number; +}> { + if (traversal.depth > (options.maxDepth ?? DEFAULT_MAX_DEPTH)) return; + for await ( + const target of getReplyTargets(context, object, options, budget) + ) { + if (!isContextPostObject(target)) continue; + if (!visitReplyTreeObject(target, traversal)) continue; + yield { + object: target, + strategy: "reply-tree", + origin: "in-reply-to", + depth: traversal.depth, + }; + yield* getReplyAncestors(context, target, options, budget, { + depth: traversal.depth + 1, + visitedObjectIds: traversal.visitedObjectIds, + visitedObjects: traversal.visitedObjects, + visitedCollectionIds: traversal.visitedCollectionIds, + visitedCollections: traversal.visitedCollections, + }); + } +} + +async function* getReplyDescendants( + context: BackfillContext, + object: APObject, + options: BackfillOptions, + budget: RequestBudget, + traversal: ReplyTreeTraversal, +): AsyncIterable<{ + readonly object: APObject; + readonly strategy: "reply-tree"; + readonly origin: "replies"; + readonly depth: number; +}> { + if (traversal.depth > (options.maxDepth ?? DEFAULT_MAX_DEPTH)) return; + const repliesId = object.repliesId; + let repliesIdVisited = false; + if (repliesId != null && !visitReplyTreeCollectionId(repliesId, traversal)) { + return; + } + repliesIdVisited = repliesId != null; + const replies = await getRepliesCollection(context, object, options, budget); + if (replies == null) return; + if (repliesIdVisited) { + traversal.visitedCollections.add(replies); + } else if (!visitReplyTreeCollection(replies, traversal)) { + return; + } + for await ( + const reply of getCollectionItems( + context, + replies, + options, + budget, + traversal.visitedObjectIds, + ) + ) { + if (!isContextPostObject(reply)) continue; + if (!visitReplyTreeObject(reply, traversal)) continue; + yield { + object: reply, + strategy: "reply-tree", + origin: "replies", + depth: traversal.depth, + }; + yield* getReplyDescendants(context, reply, options, budget, { + depth: traversal.depth + 1, + visitedObjectIds: traversal.visitedObjectIds, + visitedObjects: traversal.visitedObjects, + visitedCollectionIds: traversal.visitedCollectionIds, + visitedCollections: traversal.visitedCollections, + }); + } +} + +async function* getReplyTargets( + context: BackfillContext, + object: APObject, + options: BackfillOptions, + budget: RequestBudget, +): AsyncIterable { + try { + yield* object.getReplyTargets({ + documentLoader: async (url) => { + return await loadCollectionItemDocument(context, url, options, budget); + }, + crossOrigin: "trust", + }); + } catch (error) { + if (error instanceof MaxRequestsExceeded) throw error; + budget.signal?.throwIfAborted(); + } +} + +async function getRepliesCollection( + context: BackfillContext, + object: APObject, + options: BackfillOptions, + budget: RequestBudget, +): Promise { + try { + return await object.getReplies({ + documentLoader: async (url) => { + return await loadCollectionItemDocument(context, url, options, budget); + }, + crossOrigin: "trust", + }); + } catch (error) { + if (error instanceof MaxRequestsExceeded) throw error; + budget.signal?.throwIfAborted(); + return null; + } +} + +function visitReplyTreeObject( + object: APObject, + traversal: ReplyTreeTraversal, +): boolean { + if (object.id != null) { + if (traversal.visitedObjectIds.has(object.id.href)) return false; + traversal.visitedObjectIds.add(object.id.href); + } else { + if (traversal.visitedObjects.has(object)) return false; + } + traversal.visitedObjects.add(object); + return true; +} + +function visitReplyTreeCollection( + collection: BackfillCollection, + traversal: ReplyTreeTraversal, +): boolean { + if (collection.id != null) { + return visitReplyTreeCollectionId(collection.id, traversal); + } else { + if (traversal.visitedCollections.has(collection)) return false; + } + traversal.visitedCollections.add(collection); + return true; +} + +function visitReplyTreeCollectionId( + id: URL, + traversal: ReplyTreeTraversal, +): boolean { + if (traversal.visitedCollectionIds.has(id.href)) return false; + traversal.visitedCollectionIds.add(id.href); + return true; +} + +async function* getContextBackfillItems( + context: BackfillContext, + object: APObject | Link, + strategy: Exclude, + options: BackfillOptions, + budget: RequestBudget, +): AsyncIterable<{ + readonly object: APObject; + readonly strategy: Exclude; +}> { + if (strategy === "context-objects" && isContextPostObject(object)) { + yield { object, strategy }; + } else if (strategy === "context-activities") { + const activityObject = await getCreateActivityObject( + context, + object, + options, + budget, + ); + if (activityObject != null && isContextPostObject(activityObject)) { + yield { object: activityObject, strategy }; + } + } else if (strategy === "context-auto") { + if (object instanceof Activity) { const activityObject = await getCreateActivityObject( context, object, @@ -133,20 +502,8 @@ async function* getBackfillItems( if (activityObject != null && isContextPostObject(activityObject)) { yield { object: activityObject, strategy }; } - } else if (strategy === "context-auto") { - if (object instanceof Activity) { - const activityObject = await getCreateActivityObject( - context, - object, - options, - budget, - ); - if (activityObject != null && isContextPostObject(activityObject)) { - yield { object: activityObject, strategy }; - } - } else if (isContextPostObject(object)) { - yield { object, strategy }; - } + } else if (isContextPostObject(object)) { + yield { object, strategy }; } } } @@ -156,10 +513,17 @@ async function* getCollectionItems( collection: BackfillCollection, options: BackfillOptions, budget: RequestBudget, + skipIds?: ReadonlySet, ): AsyncIterable { yield* collection.getItems({ documentLoader: async (url) => { - return await loadCollectionItemDocument(context, url, options, budget); + return await loadCollectionItemDocument( + context, + url, + options, + budget, + skipIds, + ); }, crossOrigin: "trust", }); @@ -191,12 +555,15 @@ async function loadCollectionItemDocument( url: string, options: BackfillOptions, budget: RequestBudget, + skipIds?: ReadonlySet, ) { let object: APObject | null; try { + const iri = new URL(url); + if (skipIds?.has(iri.href)) return skippedCollectionItemDocument(url); object = await loadObject( context, - new URL(url), + iri, options, budget, true, @@ -233,6 +600,10 @@ async function loadObject( throwOnBudgetExceeded = false, ): Promise { budget.signal?.throwIfAborted(); + const cacheKey = iri.href; + const cached = budget.documents.get(cacheKey); + if (cached != null) return await cached; + if ( options.maxRequests != null && budget.requestCount >= options.maxRequests @@ -245,7 +616,16 @@ async function loadObject( budget.signal?.throwIfAborted(); budget.requestCount++; - return await context.documentLoader(iri, { signal: budget.signal }); + const document = context.documentLoader(iri, { signal: budget.signal }); + budget.documents.set(cacheKey, document); + try { + return await document; + } catch (error) { + if (budget.documents.get(cacheKey) === document) { + budget.documents.delete(cacheKey); + } + throw error; + } } async function waitForInterval( diff --git a/packages/backfill/src/types.ts b/packages/backfill/src/types.ts index ae6d264ba..c7d15f80d 100644 --- a/packages/backfill/src/types.ts +++ b/packages/backfill/src/types.ts @@ -9,21 +9,28 @@ import type { Object as APObject } from "@fedify/vocab"; * activities in the context collection. * - `"context-auto"` classifies context collection items automatically, * handling direct post-like objects and supported `Create` activities. - * If included, it absorbs all other strategies. + * If included, it absorbs other context collection strategies. + * - `"reply-tree"` walks the reply graph through `inReplyTo` ancestors and + * `replies` descendants, yielding discovered post-like objects. * * @since 2.x.0 */ export type BackfillStrategy = | "context-objects" | "context-activities" - | "context-auto"; + | "context-auto" + | "reply-tree"; /** * Source relation that produced a backfilled object. * * @since 2.x.0 */ -export type BackfillOrigin = "context" | "collection"; +export type BackfillOrigin = + | "context" + | "collection" + | "in-reply-to" + | "replies"; /** * Options passed to {@link BackfillDocumentLoader}. @@ -54,13 +61,14 @@ export type BackfillDocumentLoader = ( */ export interface BackfillContext { /** - * Dereferences context collections and collection item IRIs. + * Dereferences context collections, collection item IRIs, reply targets, + * and replies collections. */ readonly documentLoader: BackfillDocumentLoader; } /** - * Controls direct context collection backfill traversal. + * Controls backfill traversal. * * @since 2.x.0 */ @@ -70,8 +78,13 @@ export interface BackfillOptions< /** * Backfill strategies to run. * + * Strategies run in order and share request, item, abort, and deduplication + * state. If multiple strategies discover the same object ID, the first + * strategy keeps its {@link BackfillItem} metadata. + * * Defaults to `["context-auto"]`. - * If `"context-auto"` is included, it absorbs all other strategies. + * If `"context-auto"` is included, it absorbs other context collection + * strategies. * * @since 2.x.0 */ @@ -83,15 +96,22 @@ export interface BackfillOptions< readonly maxItems?: number; /** - * Maximum traversal depth. This is reserved for future reply-tree traversal; + * Maximum reply-tree traversal depth. + * + * Immediate `inReplyTo` targets and direct `replies` collection items have + * depth 1. Their parents or replies have depth 2, and so on. Context + * collection items are depth 0 and are not limited by this option. + * + * Defaults to 10. */ readonly maxDepth?: number; /** * Maximum number of calls to {@link BackfillContext.documentLoader}. * - * Dereferencing the note context, collection item IRIs, and future page IRIs - * all count as requests. Embedded collection items do not count. + * Dereferencing the note context, collection item IRIs, reply target IRIs, + * replies collection IRIs, and future page IRIs all count as requests across + * all strategies. Embedded objects and collections do not count. */ readonly maxRequests?: number; @@ -140,8 +160,11 @@ export interface BackfillItem< readonly origin: BackfillOrigin; /** - * Traversal depth. Direct context collection items are depth 0; deeper - * values are reserved for future reply-tree traversal. + * Traversal depth. + * + * Direct context collection items are depth 0. Reply-tree items use depth + * 1 for immediate `inReplyTo` targets and direct `replies` collection items, + * depth 2 for the next level, and so on. */ readonly depth?: number; }