From 48c099ba0337411e2345b125698d8c07e3d6f474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Sun, 7 Jun 2026 00:25:39 +0000 Subject: [PATCH] perf: implement lazy loading in FsStore FsStore previously CBOR-decoded all .bin nodes into memory at startup, making cold-open O(n) in time and memory. Now it scans only filenames into a Set at init and reads/decodes nodes from disk on first get(). has() and listAll() use the filename set; put() write-throughs to cache; delete() clears cache and disk. Index/meta migration still performs a one-time scan when _index/ is missing. Adds 12 new tests (L1-L12) covering startup-no-decode, lazy get, filename-based has/listAll, write-through put, delete cleanup, list operations, and migration/bootstrap regression. All existing tests pass unchanged. Fixes #85 --- .changeset/fsstore-lazy-loading.md | 5 + packages/fs/src/store.test.ts | 273 +++++++++++++++++++++++++++++ packages/fs/src/store.ts | 174 +++++++++++------- 3 files changed, 387 insertions(+), 65 deletions(-) create mode 100644 .changeset/fsstore-lazy-loading.md diff --git a/.changeset/fsstore-lazy-loading.md b/.changeset/fsstore-lazy-loading.md new file mode 100644 index 0000000..595afc4 --- /dev/null +++ b/.changeset/fsstore-lazy-loading.md @@ -0,0 +1,5 @@ +--- +"@ocas/fs": patch +--- + +`FsStore` now uses lazy loading: at startup it scans only filenames in the `nodes/` subdirectory (no CBOR decoding) and reads each node from disk on first `get()`. This makes startup O(filenames) instead of O(decoded-bytes), keeps memory usage bounded by what's actually accessed, and avoids paying the full-load cost for stores with many nodes. Behaviour is unchanged: `has()`, `listAll()`, `listByType()`, `listMeta()`, and `listSchemas()` return the same results as before. Index/meta migration paths still work — they perform a one-time scan + decode when `_index/` is missing. diff --git a/packages/fs/src/store.test.ts b/packages/fs/src/store.test.ts index 309384a..5d00bad 100644 --- a/packages/fs/src/store.test.ts +++ b/packages/fs/src/store.test.ts @@ -852,3 +852,276 @@ describe("createFsStore – nodes/ subdirectory layout", () => { expect(existsSync(join(dir, "nodes", `${h2}.bin`))).toBe(true); }); }); + +// ────────────────────────────────────────────────────────────────────────────── +// Lazy loading (#85) +// ────────────────────────────────────────────────────────────────────────────── +describe("createFsStore – lazy loading (#85)", () => { + let dir: string; + beforeEach(() => { + dir = makeTmpDir(); + }); + afterEach(() => { + rmSync(dir, { recursive: true, force: true }); + }); + + test("L1. createFsStore does NOT CBOR-decode nodes at startup", async () => { + const typeHash = await computeSelfHash({ name: "L1" }); + + const store1 = createFsStore(dir); + const h1 = await store1.put(typeHash, { i: 1 }); + const h2 = await store1.put(typeHash, { i: 2 }); + const h3 = await store1.put(typeHash, { i: 3 }); + + // Corrupt h2 by overwriting its .bin file with garbage CBOR + const corruptedPath = join(dir, "nodes", `${h2}.bin`); + writeFileSync(corruptedPath, Buffer.from([0xff, 0xfe, 0xfd, 0xfc])); + + // Opening the store should NOT throw, even though h2 is corrupted — + // because nothing is decoded at startup. + const store2 = createFsStore(dir); + + // has() should return true for all three (filename-based) + expect(store2.has(h1)).toBe(true); + expect(store2.has(h2)).toBe(true); + expect(store2.has(h3)).toBe(true); + + // listAll() reads filenames, so all three appear + const all = store2.listAll(); + expect(all).toContain(h1); + expect(all).toContain(h2); + expect(all).toContain(h3); + + // Non-corrupted nodes load fine + expect(store2.get(h1)).not.toBeNull(); + expect(store2.get(h3)).not.toBeNull(); + + // Corrupted node fails to load (returns null) + expect(store2.get(h2)).toBeNull(); + }); + + test("L2. get() loads node from disk on demand (cache miss)", async () => { + const typeHash = await computeSelfHash({ name: "L2" }); + + const store1 = createFsStore(dir); + const hash = await store1.put(typeHash, { value: 42, label: "answer" }); + const original = store1.get(hash) as CasNode; + + // Lazy-load instance + const store2 = createFsStore(dir); + const loaded1 = store2.get(hash) as CasNode; + expect(loaded1.type).toBe(typeHash); + expect(loaded1.payload).toEqual({ value: 42, label: "answer" }); + expect(loaded1.timestamp).toBe(original.timestamp); + + // Second get should return the same data (from cache) + const loaded2 = store2.get(hash) as CasNode; + expect(loaded2).toEqual(loaded1); + }); + + test("L3. has() works without loading node data", async () => { + const typeHash = await computeSelfHash({ name: "L3" }); + + const store1 = createFsStore(dir); + const hashes: string[] = []; + for (let i = 0; i < 5; i++) { + hashes.push(await store1.put(typeHash, { i })); + } + + const store2 = createFsStore(dir); + for (const hash of hashes) { + expect(store2.has(hash)).toBe(true); + } + + // Non-existent hash returns false + expect(store2.has("0000000000000")).toBe(false); + }); + + test("L4. listAll() returns hashes from filenames without decoding", async () => { + const typeHash = await computeSelfHash({ name: "L4" }); + + const store1 = createFsStore(dir); + const realHashes: string[] = []; + for (let i = 0; i < 3; i++) { + realHashes.push(await store1.put(typeHash, { i })); + } + + // Add a corrupted .bin file with valid filename but garbage content + const corruptedHash = "ABCDEFGHJKMNP"; + writeFileSync( + join(dir, "nodes", `${corruptedHash}.bin`), + Buffer.from([0xff, 0xee, 0xdd]), + ); + + const store2 = createFsStore(dir); + const all = store2.listAll(); + expect(all).toHaveLength(realHashes.length + 1); + for (const h of realHashes) { + expect(all).toContain(h); + } + expect(all).toContain(corruptedHash); + + // Real nodes still readable + for (const h of realHashes) { + expect(store2.get(h)).not.toBeNull(); + } + // Corrupted one returns null + expect(store2.get(corruptedHash)).toBeNull(); + }); + + test("L5. put() makes node immediately available without re-reading disk", async () => { + const store = createFsStore(dir); + const typeHash = await computeSelfHash({ name: "L5" }); + + const hash = await store.put(typeHash, { written: true }); + + // Immediately available via get(), has(), and listAll() + const node = store.get(hash) as CasNode; + expect(node.type).toBe(typeHash); + expect(node.payload).toEqual({ written: true }); + expect(store.has(hash)).toBe(true); + expect(store.listAll()).toContain(hash); + }); + + test("L6. delete() removes node from cache and disk", async () => { + const store = createFsStore(dir); + const typeHash = await computeSelfHash({ name: "L6" }); + + const hash = await store.put(typeHash, { temporary: true }); + // populate cache by getting once + expect(store.get(hash)).not.toBeNull(); + + expect(store.delete(hash)).toBe(true); + + expect(store.get(hash)).toBeNull(); + expect(store.has(hash)).toBe(false); + expect(store.listAll()).not.toContain(hash); + expect(existsSync(join(dir, "nodes", `${hash}.bin`))).toBe(false); + }); + + test("L7. listByType works with lazy loading (loads timestamps on demand)", async () => { + const typeA = await computeSelfHash({ name: "typeA-L7" }); + const typeB = await computeSelfHash({ name: "typeB-L7" }); + + const store1 = createFsStore(dir); + const aHashes: string[] = []; + for (let i = 0; i < 3; i++) { + aHashes.push(await store1.put(typeA, { i })); + } + const bHashes: string[] = []; + for (let i = 0; i < 2; i++) { + bHashes.push(await store1.put(typeB, { i })); + } + + const store2 = createFsStore(dir); + const aList = store2.listByType(typeA); + expect(aList).toHaveLength(3); + for (const e of aList) { + expect(aHashes).toContain(e.hash); + expect(typeof e.created).toBe("number"); + expect(e.created).toBeGreaterThan(0); + } + + const bList = store2.listByType(typeB); + expect(bList).toHaveLength(2); + + expect(store2.listByType("0000000000000")).toEqual([]); + }); + + test("L8. listMeta works with lazy loading", async () => { + const store1 = createFsStore(dir); + const m1 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L8a" }); + const m2 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L8b" }); + + const store2 = createFsStore(dir); + const meta = store2.listMeta(); + const metaHashes = meta.map((e) => e.hash); + expect(metaHashes).toHaveLength(2); + expect(metaHashes).toContain(m1); + expect(metaHashes).toContain(m2); + for (const e of meta) { + expect(typeof e.created).toBe("number"); + expect(e.created).toBeGreaterThan(0); + } + }); + + test("L9. listSchemas works with lazy loading", async () => { + const store1 = createFsStore(dir); + const m = await store1[BOOTSTRAP_STORE]({ type: "object" }); + const s1 = await store1.put(m, { type: "string" }); + const s2 = await store1.put(m, { type: "number" }); + + const store2 = createFsStore(dir); + const schemas = store2.listSchemas().map((e) => e.hash); + expect(schemas).toHaveLength(3); + expect(schemas).toContain(m); + expect(schemas).toContain(s1); + expect(schemas).toContain(s2); + }); + + test("L10. index migration still works with lazy loading", async () => { + const typeHash = await computeSelfHash({ name: "L10" }); + + const store1 = createFsStore(dir); + const h1 = await store1.put(typeHash, { i: 1 }); + const h2 = await store1.put(typeHash, { i: 2 }); + + rmSync(join(dir, "_index"), { recursive: true, force: true }); + + // Re-open: should rebuild type index by scanning + decoding nodes on disk + const store2 = createFsStore(dir); + const list = store2.listByType(typeHash).map((e) => e.hash); + expect(list).toHaveLength(2); + expect(list).toContain(h1); + expect(list).toContain(h2); + expect(existsSync(join(dir, "_index", typeHash))).toBe(true); + + // Re-open again: index already on disk, no re-scan needed + const store3 = createFsStore(dir); + const list3 = store3.listByType(typeHash).map((e) => e.hash); + expect(list3).toHaveLength(2); + }); + + test("L11. meta migration still works with lazy loading", async () => { + const store1 = createFsStore(dir); + const h1 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L11a" }); + const h2 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L11b" }); + + const metaPath = join(dir, "_index", "_meta"); + rmSync(metaPath, { force: true }); + expect(existsSync(metaPath)).toBe(false); + + const store2 = createFsStore(dir); + const meta = store2.listMeta().map((e) => e.hash); + expect(meta).toHaveLength(2); + expect(meta).toContain(h1); + expect(meta).toContain(h2); + expect(existsSync(metaPath)).toBe(true); + }); + + test("L12. bootstrap round-trip works with lazy store", async () => { + const store1 = await openStore(dir); + const schemas1 = bootstrap(store1); + const typeHash = await computeSelfHash({ name: "L12-user" }); + const userHash = store1.cas.put(typeHash, { user: "data" }); + + const store2 = await openStore(dir); + // All bootstrap schemas accessible + for (const name of [ + "@ocas/schema", + "@ocas/string", + "@ocas/number", + "@ocas/object", + "@ocas/array", + "@ocas/bool", + ]) { + const h = schemas1[name] as string; + expect(store2.cas.has(h)).toBe(true); + expect(store2.cas.get(h)).not.toBeNull(); + } + // User data still accessible + expect(store2.cas.has(userHash)).toBe(true); + const userNode = store2.cas.get(userHash) as CasNode; + expect(userNode.payload).toEqual({ user: "data" }); + }); +}); diff --git a/packages/fs/src/store.ts b/packages/fs/src/store.ts index 9dbaee6..bcefed1 100644 --- a/packages/fs/src/store.ts +++ b/packages/fs/src/store.ts @@ -59,23 +59,36 @@ function migrateFlatLayoutToNodes(dir: string): void { } } -function loadDir(dir: string, data: Map): void { +/** + * Scan `nodes/` directory for `.bin` filenames and return the set of hashes + * present on disk. Does NOT read or decode any node content — this is the + * cheap O(n) startup operation that replaces the legacy full-load. + */ +function loadHashSet(dir: string): Set { + const hashes = new Set(); let entries: string[]; try { entries = readdirSync(dir); } catch { - return; + return hashes; } for (const name of entries) { if (!name.endsWith(".bin")) continue; - const hash = name.slice(0, -4) as Hash; - try { - const buf = readFileSync(join(dir, name)); - const node = decode(new Uint8Array(buf)) as CasNode; - data.set(hash, node); - } catch { - // skip corrupted files - } + hashes.add(name.slice(0, -4) as Hash); + } + return hashes; +} + +/** + * Read and CBOR-decode a single node from disk. Returns `null` if the file + * is missing or its content is corrupted. + */ +function readNodeFromDisk(nodesDir: string, hash: Hash): CasNode | null { + try { + const buf = readFileSync(join(nodesDir, `${hash}.bin`)); + return decode(new Uint8Array(buf)) as CasNode; + } catch { + return null; } } @@ -103,9 +116,19 @@ function loadTypeIndex(indexDir: string): Map { return typeIndex; } -function buildTypeIndexFromNodes(data: Map): Map { +/** + * Migration helper: scan all `.bin` files on disk, decoding each one to read + * its `type` field, and rebuild the type index. Used only when `_index/` is + * missing — a one-time cost. + */ +function buildTypeIndexFromDisk( + nodesDir: string, + hashSet: Set, +): Map { const typeIndex = new Map(); - for (const [hash, node] of data) { + for (const hash of hashSet) { + const node = readNodeFromDisk(nodesDir, hash); + if (!node) continue; const list = typeIndex.get(node.type) ?? []; list.push(hash); typeIndex.set(node.type, list); @@ -123,11 +146,12 @@ function writeTypeIndex(indexDir: string, typeIndex: Map): void { function loadOrMigrateTypeIndex( dir: string, - data: Map, + nodesDir: string, + hashSet: Set, ): Map { const indexDir = join(dir, INDEX_DIR); if (!existsSync(indexDir)) { - const typeIndex = buildTypeIndexFromNodes(data); + const typeIndex = buildTypeIndexFromDisk(nodesDir, hashSet); if (typeIndex.size > 0) { writeTypeIndex(indexDir, typeIndex); } @@ -138,7 +162,8 @@ function loadOrMigrateTypeIndex( function loadOrMigrateMetaSet( dir: string, - data: Map, + nodesDir: string, + hashSet: Set, ): Set { const indexDir = join(dir, INDEX_DIR); const metaPath = join(indexDir, META_FILE); @@ -150,10 +175,11 @@ function loadOrMigrateMetaSet( return new Set(); } } - // Migration: scan loaded nodes for self-referencing nodes (type === hash) + // Migration: scan nodes on disk for self-referencing nodes (type === hash) const metaSet = new Set(); - for (const [hash, node] of data) { - if (node.type === hash) { + for (const hash of hashSet) { + const node = readNodeFromDisk(nodesDir, hash); + if (node && node.type === hash) { metaSet.add(hash); } } @@ -204,18 +230,6 @@ function appendToTypeIndex( typeIndex.set(type, list); } -function hashesToEntries( - data: Map, - hashes: Iterable, -): ListEntry[] { - const result: ListEntry[] = []; - for (const h of hashes) { - const node = data.get(h); - if (node) result.push(casListEntry(h, node.timestamp)); - } - return result; -} - /** * The CAS sub-store of an FS-backed `Store` — also satisfies the legacy * `BootstrapCapableStore` interface so `bootstrap()` can run against it. @@ -230,17 +244,42 @@ export function createFsStore(dir: string): FsCasStore { migrateFlatLayoutToNodes(dir); const nodesDir = join(dir, NODES_DIR); - const data = new Map(); - loadDir(nodesDir, data); + // Lazy loading (#85): only scan filenames at startup — do NOT decode. + const hashSet = loadHashSet(nodesDir); + // In-memory cache of decoded nodes. Populated on first get() of each hash. + const cache = new Map(); const indexDir = join(dir, INDEX_DIR); - const typeIndex = loadOrMigrateTypeIndex(dir, data); - const metaSet = loadOrMigrateMetaSet(dir, data); + const typeIndex = loadOrMigrateTypeIndex(dir, nodesDir, hashSet); + const metaSet = loadOrMigrateMetaSet(dir, nodesDir, hashSet); + + /** + * Look up a node by hash, loading from disk on cache miss. Returns `null` + * if the hash is unknown or the file is corrupted. + */ + function loadNode(hash: Hash): CasNode | null { + const cached = cache.get(hash); + if (cached) return cached; + if (!hashSet.has(hash)) return null; + const node = readNodeFromDisk(nodesDir, hash); + if (node) cache.set(hash, node); + return node; + } + + function hashesToEntries(hashes: Iterable): ListEntry[] { + const result: ListEntry[] = []; + for (const h of hashes) { + const node = loadNode(h); + if (node) result.push(casListEntry(h, node.timestamp)); + } + return result; + } function putSelfReferencing(payload: unknown): Hash { const hash = computeSelfHashSync(payload); - if (!data.has(hash)) { + if (!hashSet.has(hash)) { const node: CasNode = { type: hash, payload, timestamp: Date.now() }; - data.set(hash, node); + hashSet.add(hash); + cache.set(hash, node); mkdirSync(nodesDir, { recursive: true }); const tmp = join(nodesDir, `${hash}.tmp`); @@ -261,13 +300,14 @@ export function createFsStore(dir: string): FsCasStore { put(typeHash: Hash, payload: unknown): Hash { const hash = computeHashSync(typeHash, payload); - if (!data.has(hash)) { + if (!hashSet.has(hash)) { const node: CasNode = { type: typeHash, payload, timestamp: Date.now(), }; - data.set(hash, node); + hashSet.add(hash); + cache.set(hash, node); mkdirSync(nodesDir, { recursive: true }); const tmp = join(nodesDir, `${hash}.tmp`); @@ -285,25 +325,25 @@ export function createFsStore(dir: string): FsCasStore { }, get(hash: Hash): CasNode | null { - return data.get(hash) ?? null; + return loadNode(hash); }, has(hash: Hash): boolean { - return data.has(hash); + return hashSet.has(hash); }, listByType(typeHash: Hash, options?: ListOptions): ListEntry[] { const list = typeIndex.get(typeHash); if (!list) return []; - return applyListOptions(hashesToEntries(data, list), options); + return applyListOptions(hashesToEntries(list), options); }, listAll(): Hash[] { - return Array.from(data.keys()); + return Array.from(hashSet); }, listMeta(options?: ListOptions): ListEntry[] { - return applyListOptions(hashesToEntries(data, metaSet), options); + return applyListOptions(hashesToEntries(metaSet), options); }, listSchemas(options?: ListOptions): ListEntry[] { @@ -315,38 +355,42 @@ export function createFsStore(dir: string): FsCasStore { for (const h of list) result.add(h); } } - return applyListOptions(hashesToEntries(data, result), options); + return applyListOptions(hashesToEntries(result), options); }, delete(hash: Hash): boolean { - const node = data.get(hash); - if (!node) return false; - data.delete(hash); + if (!hashSet.has(hash)) return false; + // Need the node's type to clean up the type index. Lazy-load if needed. + const node = loadNode(hash); + hashSet.delete(hash); + cache.delete(hash); // Delete file try { unlinkSync(join(nodesDir, `${hash}.bin`)); } catch { // ignore if file doesn't exist } - // Remove from type index - const list = typeIndex.get(node.type); - if (list) { - const idx = list.indexOf(hash); - if (idx !== -1) { - list.splice(idx, 1); - } - if (list.length === 0) { - typeIndex.delete(node.type); - // Delete empty index file - try { - unlinkSync(join(indexDir, node.type)); - } catch { - // ignore + // Remove from type index (only if we could decode the node) + if (node) { + const list = typeIndex.get(node.type); + if (list) { + const idx = list.indexOf(hash); + if (idx !== -1) { + list.splice(idx, 1); + } + if (list.length === 0) { + typeIndex.delete(node.type); + // Delete empty index file + try { + unlinkSync(join(indexDir, node.type)); + } catch { + // ignore + } + } else { + // Rewrite index file + const body = `${list.join("\n")}\n`; + writeFileSync(join(indexDir, node.type), body, "utf8"); } - } else { - // Rewrite index file - const body = `${list.join("\n")}\n`; - writeFileSync(join(indexDir, node.type), body, "utf8"); } } // Remove from meta set if applicable