perf: implement lazy loading in FsStore (#85) #89

Merged
xiaomo merged 1 commits from fix/85-fsstore-lazy-loading into main 2026-06-07 00:31:31 +00:00
3 changed files with 387 additions and 65 deletions
Showing only changes of commit 48c099ba03 - Show all commits
+5
View File
@@ -0,0 +1,5 @@
---
"@ocas/fs": patch
---
`FsStore` now uses lazy loading: at startup it scans only filenames in the `nodes/` subdirectory (no CBOR decoding) and reads each node from disk on first `get()`. This makes startup O(filenames) instead of O(decoded-bytes), keeps memory usage bounded by what's actually accessed, and avoids paying the full-load cost for stores with many nodes. Behaviour is unchanged: `has()`, `listAll()`, `listByType()`, `listMeta()`, and `listSchemas()` return the same results as before. Index/meta migration paths still work — they perform a one-time scan + decode when `_index/` is missing.
+273
View File
@@ -852,3 +852,276 @@ describe("createFsStore – nodes/ subdirectory layout", () => {
expect(existsSync(join(dir, "nodes", `${h2}.bin`))).toBe(true);
});
});
// ──────────────────────────────────────────────────────────────────────────────
// Lazy loading (#85)
// ──────────────────────────────────────────────────────────────────────────────
describe("createFsStore – lazy loading (#85)", () => {
let dir: string;
beforeEach(() => {
dir = makeTmpDir();
});
afterEach(() => {
rmSync(dir, { recursive: true, force: true });
});
test("L1. createFsStore does NOT CBOR-decode nodes at startup", async () => {
const typeHash = await computeSelfHash({ name: "L1" });
const store1 = createFsStore(dir);
const h1 = await store1.put(typeHash, { i: 1 });
const h2 = await store1.put(typeHash, { i: 2 });
const h3 = await store1.put(typeHash, { i: 3 });
// Corrupt h2 by overwriting its .bin file with garbage CBOR
const corruptedPath = join(dir, "nodes", `${h2}.bin`);
writeFileSync(corruptedPath, Buffer.from([0xff, 0xfe, 0xfd, 0xfc]));
// Opening the store should NOT throw, even though h2 is corrupted —
// because nothing is decoded at startup.
const store2 = createFsStore(dir);
// has() should return true for all three (filename-based)
expect(store2.has(h1)).toBe(true);
expect(store2.has(h2)).toBe(true);
expect(store2.has(h3)).toBe(true);
// listAll() reads filenames, so all three appear
const all = store2.listAll();
expect(all).toContain(h1);
expect(all).toContain(h2);
expect(all).toContain(h3);
// Non-corrupted nodes load fine
expect(store2.get(h1)).not.toBeNull();
expect(store2.get(h3)).not.toBeNull();
// Corrupted node fails to load (returns null)
expect(store2.get(h2)).toBeNull();
});
test("L2. get() loads node from disk on demand (cache miss)", async () => {
const typeHash = await computeSelfHash({ name: "L2" });
const store1 = createFsStore(dir);
const hash = await store1.put(typeHash, { value: 42, label: "answer" });
const original = store1.get(hash) as CasNode;
// Lazy-load instance
const store2 = createFsStore(dir);
const loaded1 = store2.get(hash) as CasNode;
expect(loaded1.type).toBe(typeHash);
expect(loaded1.payload).toEqual({ value: 42, label: "answer" });
expect(loaded1.timestamp).toBe(original.timestamp);
// Second get should return the same data (from cache)
const loaded2 = store2.get(hash) as CasNode;
expect(loaded2).toEqual(loaded1);
});
test("L3. has() works without loading node data", async () => {
const typeHash = await computeSelfHash({ name: "L3" });
const store1 = createFsStore(dir);
const hashes: string[] = [];
for (let i = 0; i < 5; i++) {
hashes.push(await store1.put(typeHash, { i }));
}
const store2 = createFsStore(dir);
for (const hash of hashes) {
expect(store2.has(hash)).toBe(true);
}
// Non-existent hash returns false
expect(store2.has("0000000000000")).toBe(false);
});
test("L4. listAll() returns hashes from filenames without decoding", async () => {
const typeHash = await computeSelfHash({ name: "L4" });
const store1 = createFsStore(dir);
const realHashes: string[] = [];
for (let i = 0; i < 3; i++) {
realHashes.push(await store1.put(typeHash, { i }));
}
// Add a corrupted .bin file with valid filename but garbage content
const corruptedHash = "ABCDEFGHJKMNP";
writeFileSync(
join(dir, "nodes", `${corruptedHash}.bin`),
Buffer.from([0xff, 0xee, 0xdd]),
);
const store2 = createFsStore(dir);
const all = store2.listAll();
expect(all).toHaveLength(realHashes.length + 1);
for (const h of realHashes) {
expect(all).toContain(h);
}
expect(all).toContain(corruptedHash);
// Real nodes still readable
for (const h of realHashes) {
expect(store2.get(h)).not.toBeNull();
}
// Corrupted one returns null
expect(store2.get(corruptedHash)).toBeNull();
});
test("L5. put() makes node immediately available without re-reading disk", async () => {
const store = createFsStore(dir);
const typeHash = await computeSelfHash({ name: "L5" });
const hash = await store.put(typeHash, { written: true });
// Immediately available via get(), has(), and listAll()
const node = store.get(hash) as CasNode;
expect(node.type).toBe(typeHash);
expect(node.payload).toEqual({ written: true });
expect(store.has(hash)).toBe(true);
expect(store.listAll()).toContain(hash);
});
test("L6. delete() removes node from cache and disk", async () => {
const store = createFsStore(dir);
const typeHash = await computeSelfHash({ name: "L6" });
const hash = await store.put(typeHash, { temporary: true });
// populate cache by getting once
expect(store.get(hash)).not.toBeNull();
expect(store.delete(hash)).toBe(true);
expect(store.get(hash)).toBeNull();
expect(store.has(hash)).toBe(false);
expect(store.listAll()).not.toContain(hash);
expect(existsSync(join(dir, "nodes", `${hash}.bin`))).toBe(false);
});
test("L7. listByType works with lazy loading (loads timestamps on demand)", async () => {
const typeA = await computeSelfHash({ name: "typeA-L7" });
const typeB = await computeSelfHash({ name: "typeB-L7" });
const store1 = createFsStore(dir);
const aHashes: string[] = [];
for (let i = 0; i < 3; i++) {
aHashes.push(await store1.put(typeA, { i }));
}
const bHashes: string[] = [];
for (let i = 0; i < 2; i++) {
bHashes.push(await store1.put(typeB, { i }));
}
const store2 = createFsStore(dir);
const aList = store2.listByType(typeA);
expect(aList).toHaveLength(3);
for (const e of aList) {
expect(aHashes).toContain(e.hash);
expect(typeof e.created).toBe("number");
expect(e.created).toBeGreaterThan(0);
}
const bList = store2.listByType(typeB);
expect(bList).toHaveLength(2);
expect(store2.listByType("0000000000000")).toEqual([]);
});
test("L8. listMeta works with lazy loading", async () => {
const store1 = createFsStore(dir);
const m1 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L8a" });
const m2 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L8b" });
const store2 = createFsStore(dir);
const meta = store2.listMeta();
const metaHashes = meta.map((e) => e.hash);
expect(metaHashes).toHaveLength(2);
expect(metaHashes).toContain(m1);
expect(metaHashes).toContain(m2);
for (const e of meta) {
expect(typeof e.created).toBe("number");
expect(e.created).toBeGreaterThan(0);
}
});
test("L9. listSchemas works with lazy loading", async () => {
const store1 = createFsStore(dir);
const m = await store1[BOOTSTRAP_STORE]({ type: "object" });
const s1 = await store1.put(m, { type: "string" });
const s2 = await store1.put(m, { type: "number" });
const store2 = createFsStore(dir);
const schemas = store2.listSchemas().map((e) => e.hash);
expect(schemas).toHaveLength(3);
expect(schemas).toContain(m);
expect(schemas).toContain(s1);
expect(schemas).toContain(s2);
});
test("L10. index migration still works with lazy loading", async () => {
const typeHash = await computeSelfHash({ name: "L10" });
const store1 = createFsStore(dir);
const h1 = await store1.put(typeHash, { i: 1 });
const h2 = await store1.put(typeHash, { i: 2 });
rmSync(join(dir, "_index"), { recursive: true, force: true });
// Re-open: should rebuild type index by scanning + decoding nodes on disk
const store2 = createFsStore(dir);
const list = store2.listByType(typeHash).map((e) => e.hash);
expect(list).toHaveLength(2);
expect(list).toContain(h1);
expect(list).toContain(h2);
expect(existsSync(join(dir, "_index", typeHash))).toBe(true);
// Re-open again: index already on disk, no re-scan needed
const store3 = createFsStore(dir);
const list3 = store3.listByType(typeHash).map((e) => e.hash);
expect(list3).toHaveLength(2);
});
test("L11. meta migration still works with lazy loading", async () => {
const store1 = createFsStore(dir);
const h1 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L11a" });
const h2 = await store1[BOOTSTRAP_STORE]({ type: "object", v: "L11b" });
const metaPath = join(dir, "_index", "_meta");
rmSync(metaPath, { force: true });
expect(existsSync(metaPath)).toBe(false);
const store2 = createFsStore(dir);
const meta = store2.listMeta().map((e) => e.hash);
expect(meta).toHaveLength(2);
expect(meta).toContain(h1);
expect(meta).toContain(h2);
expect(existsSync(metaPath)).toBe(true);
});
test("L12. bootstrap round-trip works with lazy store", async () => {
const store1 = await openStore(dir);
const schemas1 = bootstrap(store1);
const typeHash = await computeSelfHash({ name: "L12-user" });
const userHash = store1.cas.put(typeHash, { user: "data" });
const store2 = await openStore(dir);
// All bootstrap schemas accessible
for (const name of [
"@ocas/schema",
"@ocas/string",
"@ocas/number",
"@ocas/object",
"@ocas/array",
"@ocas/bool",
]) {
const h = schemas1[name] as string;
expect(store2.cas.has(h)).toBe(true);
expect(store2.cas.get(h)).not.toBeNull();
}
// User data still accessible
expect(store2.cas.has(userHash)).toBe(true);
const userNode = store2.cas.get(userHash) as CasNode;
expect(userNode.payload).toEqual({ user: "data" });
});
});
+109 -65
View File
@@ -59,23 +59,36 @@ function migrateFlatLayoutToNodes(dir: string): void {
}
}
function loadDir(dir: string, data: Map<Hash, CasNode>): void {
/**
* Scan `nodes/` directory for `.bin` filenames and return the set of hashes
* present on disk. Does NOT read or decode any node content — this is the
* cheap O(n) startup operation that replaces the legacy full-load.
*/
function loadHashSet(dir: string): Set<Hash> {
const hashes = new Set<Hash>();
let entries: string[];
try {
entries = readdirSync(dir);
} catch {
return;
return hashes;
}
for (const name of entries) {
if (!name.endsWith(".bin")) continue;
const hash = name.slice(0, -4) as Hash;
try {
const buf = readFileSync(join(dir, name));
const node = decode(new Uint8Array(buf)) as CasNode;
data.set(hash, node);
} catch {
// skip corrupted files
}
hashes.add(name.slice(0, -4) as Hash);
}
return hashes;
}
/**
* Read and CBOR-decode a single node from disk. Returns `null` if the file
* is missing or its content is corrupted.
*/
function readNodeFromDisk(nodesDir: string, hash: Hash): CasNode | null {
try {
const buf = readFileSync(join(nodesDir, `${hash}.bin`));
return decode(new Uint8Array(buf)) as CasNode;
} catch {
return null;
}
}
@@ -103,9 +116,19 @@ function loadTypeIndex(indexDir: string): Map<Hash, Hash[]> {
return typeIndex;
}
function buildTypeIndexFromNodes(data: Map<Hash, CasNode>): Map<Hash, Hash[]> {
/**
* Migration helper: scan all `.bin` files on disk, decoding each one to read
* its `type` field, and rebuild the type index. Used only when `_index/` is
* missing — a one-time cost.
*/
function buildTypeIndexFromDisk(
nodesDir: string,
hashSet: Set<Hash>,
): Map<Hash, Hash[]> {
const typeIndex = new Map<Hash, Hash[]>();
for (const [hash, node] of data) {
for (const hash of hashSet) {
const node = readNodeFromDisk(nodesDir, hash);
if (!node) continue;
const list = typeIndex.get(node.type) ?? [];
list.push(hash);
typeIndex.set(node.type, list);
@@ -123,11 +146,12 @@ function writeTypeIndex(indexDir: string, typeIndex: Map<Hash, Hash[]>): void {
function loadOrMigrateTypeIndex(
dir: string,
data: Map<Hash, CasNode>,
nodesDir: string,
hashSet: Set<Hash>,
): Map<Hash, Hash[]> {
const indexDir = join(dir, INDEX_DIR);
if (!existsSync(indexDir)) {
const typeIndex = buildTypeIndexFromNodes(data);
const typeIndex = buildTypeIndexFromDisk(nodesDir, hashSet);
if (typeIndex.size > 0) {
writeTypeIndex(indexDir, typeIndex);
}
@@ -138,7 +162,8 @@ function loadOrMigrateTypeIndex(
function loadOrMigrateMetaSet(
dir: string,
data: Map<Hash, CasNode>,
nodesDir: string,
hashSet: Set<Hash>,
): Set<Hash> {
const indexDir = join(dir, INDEX_DIR);
const metaPath = join(indexDir, META_FILE);
@@ -150,10 +175,11 @@ function loadOrMigrateMetaSet(
return new Set();
}
}
// Migration: scan loaded nodes for self-referencing nodes (type === hash)
// Migration: scan nodes on disk for self-referencing nodes (type === hash)
const metaSet = new Set<Hash>();
for (const [hash, node] of data) {
if (node.type === hash) {
for (const hash of hashSet) {
const node = readNodeFromDisk(nodesDir, hash);
if (node && node.type === hash) {
metaSet.add(hash);
}
}
@@ -204,18 +230,6 @@ function appendToTypeIndex(
typeIndex.set(type, list);
}
function hashesToEntries(
data: Map<Hash, CasNode>,
hashes: Iterable<Hash>,
): ListEntry[] {
const result: ListEntry[] = [];
for (const h of hashes) {
const node = data.get(h);
if (node) result.push(casListEntry(h, node.timestamp));
}
return result;
}
/**
* The CAS sub-store of an FS-backed `Store` — also satisfies the legacy
* `BootstrapCapableStore` interface so `bootstrap()` can run against it.
@@ -230,17 +244,42 @@ export function createFsStore(dir: string): FsCasStore {
migrateFlatLayoutToNodes(dir);
const nodesDir = join(dir, NODES_DIR);
const data = new Map<Hash, CasNode>();
loadDir(nodesDir, data);
// Lazy loading (#85): only scan filenames at startup — do NOT decode.
const hashSet = loadHashSet(nodesDir);
// In-memory cache of decoded nodes. Populated on first get() of each hash.
const cache = new Map<Hash, CasNode>();
const indexDir = join(dir, INDEX_DIR);
const typeIndex = loadOrMigrateTypeIndex(dir, data);
const metaSet = loadOrMigrateMetaSet(dir, data);
const typeIndex = loadOrMigrateTypeIndex(dir, nodesDir, hashSet);
const metaSet = loadOrMigrateMetaSet(dir, nodesDir, hashSet);
/**
* Look up a node by hash, loading from disk on cache miss. Returns `null`
* if the hash is unknown or the file is corrupted.
*/
function loadNode(hash: Hash): CasNode | null {
const cached = cache.get(hash);
if (cached) return cached;
if (!hashSet.has(hash)) return null;
const node = readNodeFromDisk(nodesDir, hash);
if (node) cache.set(hash, node);
return node;
}
function hashesToEntries(hashes: Iterable<Hash>): ListEntry[] {
const result: ListEntry[] = [];
for (const h of hashes) {
const node = loadNode(h);
if (node) result.push(casListEntry(h, node.timestamp));
}
return result;
}
function putSelfReferencing(payload: unknown): Hash {
const hash = computeSelfHashSync(payload);
if (!data.has(hash)) {
if (!hashSet.has(hash)) {
const node: CasNode = { type: hash, payload, timestamp: Date.now() };
data.set(hash, node);
hashSet.add(hash);
cache.set(hash, node);
mkdirSync(nodesDir, { recursive: true });
const tmp = join(nodesDir, `${hash}.tmp`);
@@ -261,13 +300,14 @@ export function createFsStore(dir: string): FsCasStore {
put(typeHash: Hash, payload: unknown): Hash {
const hash = computeHashSync(typeHash, payload);
if (!data.has(hash)) {
if (!hashSet.has(hash)) {
const node: CasNode = {
type: typeHash,
payload,
timestamp: Date.now(),
};
data.set(hash, node);
hashSet.add(hash);
cache.set(hash, node);
mkdirSync(nodesDir, { recursive: true });
const tmp = join(nodesDir, `${hash}.tmp`);
@@ -285,25 +325,25 @@ export function createFsStore(dir: string): FsCasStore {
},
get(hash: Hash): CasNode | null {
return data.get(hash) ?? null;
return loadNode(hash);
},
has(hash: Hash): boolean {
return data.has(hash);
return hashSet.has(hash);
},
listByType(typeHash: Hash, options?: ListOptions): ListEntry[] {
const list = typeIndex.get(typeHash);
if (!list) return [];
return applyListOptions(hashesToEntries(data, list), options);
return applyListOptions(hashesToEntries(list), options);
},
listAll(): Hash[] {
return Array.from(data.keys());
return Array.from(hashSet);
},
listMeta(options?: ListOptions): ListEntry[] {
return applyListOptions(hashesToEntries(data, metaSet), options);
return applyListOptions(hashesToEntries(metaSet), options);
},
listSchemas(options?: ListOptions): ListEntry[] {
@@ -315,38 +355,42 @@ export function createFsStore(dir: string): FsCasStore {
for (const h of list) result.add(h);
}
}
return applyListOptions(hashesToEntries(data, result), options);
return applyListOptions(hashesToEntries(result), options);
},
delete(hash: Hash): boolean {
const node = data.get(hash);
if (!node) return false;
data.delete(hash);
if (!hashSet.has(hash)) return false;
// Need the node's type to clean up the type index. Lazy-load if needed.
const node = loadNode(hash);
hashSet.delete(hash);
cache.delete(hash);
// Delete file
try {
unlinkSync(join(nodesDir, `${hash}.bin`));
} catch {
// ignore if file doesn't exist
}
// Remove from type index
const list = typeIndex.get(node.type);
if (list) {
const idx = list.indexOf(hash);
if (idx !== -1) {
list.splice(idx, 1);
}
if (list.length === 0) {
typeIndex.delete(node.type);
// Delete empty index file
try {
unlinkSync(join(indexDir, node.type));
} catch {
// ignore
// Remove from type index (only if we could decode the node)
if (node) {
const list = typeIndex.get(node.type);
if (list) {
const idx = list.indexOf(hash);
if (idx !== -1) {
list.splice(idx, 1);
}
if (list.length === 0) {
typeIndex.delete(node.type);
// Delete empty index file
try {
unlinkSync(join(indexDir, node.type));
} catch {
// ignore
}
} else {
// Rewrite index file
const body = `${list.join("\n")}\n`;
writeFileSync(join(indexDir, node.type), body, "utf8");
}
} else {
// Rewrite index file
const body = `${list.join("\n")}\n`;
writeFileSync(join(indexDir, node.type), body, "utf8");
}
}
// Remove from meta set if applicable