Compare commits

..

17 Commits

Author SHA1 Message Date
xiaoju 9260d81084 chore: version bump for --version fix
CI / check (push) Successful in 3m2s
agent-hermes@0.1.2 agent-claude-code@0.1.1 agent-builtin@0.1.1
agent-mock@0.1.1 eval@0.1.3 util@0.1.1

小橘 🍊(NEKO Team)
2026-06-05 08:12:50 +00:00
xiaomo c8d884072a Merge pull request 'fix: acp-client reports agent-hermes own version in MCP clientInfo' (#98) from fix/acp-client-own-version into main
CI / check (push) Successful in 2m27s
2026-06-05 08:10:57 +00:00
xiaoju abeb465f46 fix: acp-client reports own package version, not util VERSION
CI / check (pull_request) Successful in 2m36s
Address review nit from PR #97: clientInfo.version should be
agent-hermes's own version for correct identification under
independent versioning.

小橘 🍊(NEKO Team)
2026-06-05 07:50:03 +00:00
xiaomo 28427a973f Merge pull request 'fix: add --version to adapter CLIs, read VERSION from package.json' (#97) from fix/adapter-version into main
CI / check (push) Successful in 3m3s
2026-06-05 07:36:15 +00:00
xiaoju 794f9db568 fix: add --version to adapter CLIs, read VERSION from package.json
CI / check (pull_request) Successful in 3m29s
- All uwf-* adapter CLIs now support --version / -V
- util VERSION constant reads from package.json at runtime
- agent-hermes ACP clientInfo uses dynamic VERSION

小橘 🍊(NEKO Team)
2026-06-05 07:29:54 +00:00
xiaoju cd585a26f1 Merge pull request 'fix: read eval CLI version from package.json' (#96) from fix/95-eval-version into main
CI / check (push) Successful in 3m28s
2026-06-05 06:46:32 +00:00
xiaoju 1cf8f350d0 fix: read eval CLI version from package.json
CI / check (pull_request) Successful in 3m30s
Fixes #95

小橘 🍊(NEKO Team)
2026-06-05 06:43:27 +00:00
xiaoju 427568a21d chore: version bump agent-hermes@0.1.1 cli@0.1.1 eval@0.1.2
CI / check (push) Successful in 2m37s
小橘 🍊(NEKO Team)
2026-06-05 06:29:25 +00:00
xiaomo d3a2353acf Merge pull request 'fix: read token usage from ACP response instead of DB' (#94) from fix/usage-tokens-from-acp into main
CI / check (push) Successful in 3m25s
2026-06-05 06:18:05 +00:00
xiaoju 8085d1d6e0 fix: read token usage from ACP response instead of DB
CI / check (pull_request) Successful in 3m10s
Tokens (inputTokens, outputTokens) now come from ACP PromptResponse.usage
which is populated synchronously from run_conversation() — no WAL race.
Turns still come from DB before/after snapshot.

Previously both were read from hermes state.db after ACP prompt returned,
but WAL write lag caused incomplete token data (e.g. 235 vs actual 26,080).

Refs #91
2026-06-05 06:08:11 +00:00
xiaomo 8764d7bda3 Merge pull request 'chore: add changeset for #92 agent override alias fix' (#93) from chore/changeset-agent-override into main
CI / check (push) Successful in 3m33s
2026-06-05 05:17:36 +00:00
xiaoju 850a3b2f25 chore: add changeset for #92 agent override alias fix
CI / check (pull_request) Successful in 3m8s
2026-06-05 04:36:41 +00:00
xiaomo 3d6a517e83 Merge pull request 'fix: resolve --agent override via config alias before raw command' (#92) from fix/agent-override-alias into main
CI / check (push) Successful in 3m30s
2026-06-05 04:31:50 +00:00
xiaoju 825f0c641a fix: resolve --agent override via config alias before raw command
CI / check (pull_request) Successful in 3m37s
When --agent is passed to uwf thread exec, try config.agents[alias]
first (e.g. 'hermes' → config.agents.hermes = {command: 'uwf-hermes'}),
then fall back to parseAgentOverride for raw command names.

Also change eval CLI default --agent from 'hermes' to 'uwf-hermes'
so it works without config alias lookup.

Refs #91
2026-06-05 04:20:09 +00:00
xiaoju 81bbe1178f chore: release @united-workforce/eval@0.1.1
CI / check (push) Successful in 2m45s
2026-06-05 03:02:05 +00:00
xiaoju a0e139935e Merge pull request 'fix: frontmatter judge handles parsed object output' (#90) from fix/frontmatter-judge-object-output into main
CI / check (push) Successful in 2m12s
2026-06-05 03:01:30 +00:00
xiaoju a08775896f fix: frontmatter judge handles parsed object output
CI / check (pull_request) Successful in 2m38s
The extract pipeline stores step output as a JSON object in CAS,
but the frontmatter judge only checked for raw markdown strings.
Now accepts both formats: parsed objects check $status directly,
raw strings go through YAML frontmatter extraction.

Fixes eval frontmatter-compliance scoring 0 on valid outputs.
2026-06-05 02:55:58 +00:00
24 changed files with 244 additions and 82 deletions
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-builtin",
"version": "0.1.0",
"version": "0.1.1",
"files": [
"src",
"dist",
+7
View File
@@ -1,5 +1,12 @@
#!/usr/bin/env node
// Version flag handling: print this package's version and exit with status 0.
// Note: static `import` declarations further down are hoisted by the module
// system, so they are evaluated before this code runs.
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
const versionRequested = ["--version", "-V"].some((flag) => process.argv.includes(flag));
if (versionRequested) {
  process.stdout.write(`${pkg.default.version}\n`);
  process.exit(0);
}
import { createBuiltinAgent } from "./agent.js";
const main = createBuiltinAgent();
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-claude-code",
"version": "0.1.0",
"version": "0.1.1",
"files": [
"src",
"dist",
+7
View File
@@ -1,5 +1,12 @@
#!/usr/bin/env node
// Version flag handling: print this package's version and exit with status 0.
// Note: static `import` declarations further down are hoisted by the module
// system, so they are evaluated before this code runs.
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
const versionRequested = ["--version", "-V"].some((flag) => process.argv.includes(flag));
if (versionRequested) {
  process.stdout.write(`${pkg.default.version}\n`);
  process.exit(0);
}
import { createClaudeCodeAgent } from "./claude-code.js";
const model = process.env.CLAUDE_MODEL ?? null;
+18
View File
@@ -0,0 +1,18 @@
# @united-workforce/agent-hermes
## 0.1.1
### Patch Changes
- 8085d1d: fix: read token usage from ACP PromptResponse instead of DB
Token counts (inputTokens, outputTokens) now come from the ACP
`PromptResponse.usage` field, which is populated synchronously from
`run_conversation()` return data — no WAL race condition.
Turns (assistant message count) still come from the DB via
`snapshotTurns()` before/after delta.
Previously both tokens and turns were read from the Hermes state DB
after the ACP prompt returned, but due to WAL write lag the DB often
had incomplete token data at read time (e.g. 235 vs actual 26,080).
@@ -1,5 +1,6 @@
import { describe, expect, test } from "vitest";
import { computeUsageDelta, snapshotUsage } from "../src/hermes.js";
import type { AcpUsage } from "../src/acp-client.js";
import { buildUsage, snapshotTurns } from "../src/hermes.js";
import type { HermesSessionJson } from "../src/types.js";
function makeSession(overrides: Partial<HermesSessionJson> = {}): HermesSessionJson {
@@ -14,19 +15,19 @@ function makeSession(overrides: Partial<HermesSessionJson> = {}): HermesSessionJ
};
}
describe("snapshotUsage", () => {
test("returns zero snapshot for null session", () => {
const result = snapshotUsage(null);
expect(result).toEqual({ turns: 0, inputTokens: 0, outputTokens: 0 });
describe("snapshotTurns", () => {
test("returns zero for null session", () => {
const result = snapshotTurns(null);
expect(result).toEqual({ turns: 0 });
});
test("returns zero snapshot for empty session", () => {
const result = snapshotUsage(makeSession());
expect(result).toEqual({ turns: 0, inputTokens: 0, outputTokens: 0 });
test("returns zero for empty session", () => {
const result = snapshotTurns(makeSession());
expect(result).toEqual({ turns: 0 });
});
test("counts assistant messages as turns", () => {
const result = snapshotUsage(
const result = snapshotTurns(
makeSession({
messages: [
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
@@ -39,11 +40,11 @@ describe("snapshotUsage", () => {
outputTokens: 500,
}),
);
expect(result).toEqual({ turns: 2, inputTokens: 1000, outputTokens: 500 });
expect(result).toEqual({ turns: 2 });
});
test("ignores non-assistant messages for turn count", () => {
const result = snapshotUsage(
const result = snapshotTurns(
makeSession({
messages: [
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
@@ -55,11 +56,13 @@ describe("snapshotUsage", () => {
});
});
describe("computeUsageDelta", () => {
test("first visit: before is zero, after has all values", () => {
const before = { turns: 0, inputTokens: 0, outputTokens: 0 };
const after = { turns: 3, inputTokens: 5000, outputTokens: 2000 };
const result = computeUsageDelta(before, after, 12.5);
describe("buildUsage", () => {
const acpUsage: AcpUsage = { inputTokens: 5000, outputTokens: 2000, totalTokens: 7000 };
test("first visit: tokens from ACP, turns from DB delta", () => {
const beforeTurns = { turns: 0 };
const afterTurns = { turns: 3 };
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 12.5);
expect(result).toEqual({
turns: 3,
inputTokens: 5000,
@@ -68,43 +71,52 @@ describe("computeUsageDelta", () => {
});
});
test("re-entry: computes delta correctly", () => {
const before = { turns: 2, inputTokens: 3000, outputTokens: 1000 };
const after = { turns: 4, inputTokens: 8000, outputTokens: 3500 };
const result = computeUsageDelta(before, after, 7.3);
test("re-entry: turn delta computed correctly, tokens from ACP", () => {
const beforeTurns = { turns: 2 };
const afterTurns = { turns: 4 };
const acpDelta: AcpUsage = { inputTokens: 8000, outputTokens: 3500, totalTokens: 11500 };
const result = buildUsage(acpDelta, beforeTurns, afterTurns, 7.3);
expect(result).toEqual({
turns: 2,
inputTokens: 5000,
outputTokens: 2500,
inputTokens: 8000,
outputTokens: 3500,
duration: 7,
});
});
test("floors negative deltas at 0 (defensive)", () => {
const before = { turns: 5, inputTokens: 10000, outputTokens: 5000 };
const after = { turns: 3, inputTokens: 8000, outputTokens: 4000 };
const result = computeUsageDelta(before, after, 1.0);
test("floors negative turn deltas at 0, then defaults to 1", () => {
const beforeTurns = { turns: 5 };
const afterTurns = { turns: 3 };
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 1.0);
// turns would be negative (-2), floored to 0, then || 1 gives 1
expect(result.turns).toBe(1);
expect(result.inputTokens).toBe(0);
expect(result.outputTokens).toBe(0);
});
test("zero turns delta defaults to 1 (at least one turn happened)", () => {
const before = { turns: 3, inputTokens: 1000, outputTokens: 500 };
const after = { turns: 3, inputTokens: 2000, outputTokens: 1000 };
const result = computeUsageDelta(before, after, 5.0);
const beforeTurns = { turns: 3 };
const afterTurns = { turns: 3 };
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 5.0);
// turns delta is 0, || 1 gives 1
expect(result.turns).toBe(1);
expect(result.inputTokens).toBe(1000);
expect(result.outputTokens).toBe(500);
});
test("null ACP usage yields zero tokens", () => {
const beforeTurns = { turns: 0 };
const afterTurns = { turns: 2 };
const result = buildUsage(null, beforeTurns, afterTurns, 10.0);
expect(result).toEqual({
turns: 2,
inputTokens: 0,
outputTokens: 0,
duration: 10,
});
});
test("duration is rounded", () => {
const before = { turns: 0, inputTokens: 0, outputTokens: 0 };
const after = { turns: 1, inputTokens: 100, outputTokens: 50 };
expect(computeUsageDelta(before, after, 3.7).duration).toBe(4);
expect(computeUsageDelta(before, after, 3.2).duration).toBe(3);
expect(computeUsageDelta(before, after, 0.0).duration).toBe(0);
const beforeTurns = { turns: 0 };
const afterTurns = { turns: 1 };
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.7).duration).toBe(4);
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.2).duration).toBe(3);
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 0.0).duration).toBe(0);
});
});
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-hermes",
"version": "0.1.0",
"version": "0.1.2",
"files": [
"src",
"dist",
+35 -1
View File
@@ -1,6 +1,16 @@
import type { ChildProcess } from "node:child_process";
import { spawn } from "node:child_process";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { fileURLToPath } from "node:url";
// Directory that contains this module; package.json is looked up one level above it.
const __dirname = dirname(fileURLToPath(import.meta.url));
// This package's own version string, read from its package.json once at module load.
const ownManifestText = readFileSync(join(__dirname, "..", "package.json"), "utf-8");
const { version: OWN_VERSION } = JSON.parse(ownManifestText) as { version: string };
const HERMES_COMMAND = "hermes";
const PROTOCOL_VERSION = 1;
@@ -17,9 +27,17 @@ type PendingRequest = {
reject: (reason: Error) => void;
};
/**
 * Token usage returned by ACP PromptResponse.
 *
 * Field names are camelCase, matching the `usage` object read from the
 * response's `result`. All three counters must be numbers for a usage
 * object to be accepted by the client.
 */
export type AcpUsage = {
// Input token count as reported in the response's usage object.
inputTokens: number;
// Output token count as reported in the response's usage object.
outputTokens: number;
// Total token count as reported; presumably input + output — TODO confirm
// against the ACP server, the client does not compute or check it.
totalTokens: number;
};
/** Value produced by a prompt call on the ACP client. */
export type AcpPromptResult = {
// Concatenation of the message chunks the client collected.
text: string;
// Session id held by the client when the prompt returns.
sessionId: string;
// Token usage parsed from the response; null when the response carries no
// usage object with numeric inputTokens / outputTokens / totalTokens.
usage: AcpUsage | null;
};
export class HermesAcpClient {
@@ -96,9 +114,25 @@ export class HermesAcpClient {
);
}
// Extract token usage from ACP PromptResponse.result.usage (camelCase wire format).
// The response is untrusted JSON: `usage` may be absent, `null`, or not an object.
// Narrow to a non-null object before reading fields — checking only
// `!== undefined` would let a JSON `null` through and throw a TypeError on
// property access instead of yielding `usage: null`.
const result = (response as { result?: Record<string, unknown> }).result;
const rawUsage: unknown = result?.usage;
const usageFields =
  typeof rawUsage === "object" && rawUsage !== null
    ? (rawUsage as Record<string, unknown>)
    : null;
// All three counters must be numbers; otherwise usage is reported as unavailable.
const usage: AcpUsage | null =
  usageFields !== null &&
  typeof usageFields.inputTokens === "number" &&
  typeof usageFields.outputTokens === "number" &&
  typeof usageFields.totalTokens === "number"
    ? {
        inputTokens: usageFields.inputTokens,
        outputTokens: usageFields.outputTokens,
        totalTokens: usageFields.totalTokens,
      }
    : null;
return {
text: this.messageChunks.join(""),
sessionId: this.sessionId,
usage,
};
}
@@ -275,7 +309,7 @@ export class HermesAcpClient {
private async initialize(): Promise<void> {
const initResponse = await this.sendRequest("initialize", {
protocolVersion: PROTOCOL_VERSION,
clientInfo: { name: "uwf", version: "0.1.0" },
clientInfo: { name: "uwf-hermes", version: OWN_VERSION },
capabilities: {},
});
+7
View File
@@ -1,5 +1,12 @@
#!/usr/bin/env node
// Version flag handling: print this package's version and exit with status 0.
// Note: static `import` declarations further down are hoisted by the module
// system, so they are evaluated before this code runs.
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
const versionRequested = ["--version", "-V"].some((flag) => process.argv.includes(flag));
if (versionRequested) {
  process.stdout.write(`${pkg.default.version}\n`);
  process.exit(0);
}
import { createHermesAgent } from "./hermes.js";
import { isResumeDisabled } from "./session-cache.js";
+32 -30
View File
@@ -8,7 +8,7 @@ import {
buildRolePrompt,
createAgent,
} from "@united-workforce/util-agent";
import type { AcpUsage } from "./acp-client.js";
import { HermesAcpClient } from "./acp-client.js";
import { getCachedSessionId, setCachedSessionId } from "./session-cache.js";
import { loadHermesSession, storeHermesSessionDetail } from "./session-detail.js";
@@ -17,36 +17,37 @@ import type { HermesSessionJson } from "./types.js";
const log = createLogger({ sink: { kind: "stderr" } });
/** Snapshot of session metrics taken before and after a prompt call. */
type UsageSnapshot = {
type TurnsSnapshot = {
turns: number;
inputTokens: number;
outputTokens: number;
};
const ZERO_SNAPSHOT: UsageSnapshot = { turns: 0, inputTokens: 0, outputTokens: 0 };
const ZERO_TURNS: TurnsSnapshot = { turns: 0 };
/** Extract usage metrics from a session. Returns zeros for null sessions. */
export function snapshotUsage(session: HermesSessionJson | null): UsageSnapshot {
/** Extract assistant turn count from a session. Returns zero for null sessions. */
export function snapshotTurns(session: HermesSessionJson | null): TurnsSnapshot {
if (session === null) {
return ZERO_SNAPSHOT;
return ZERO_TURNS;
}
return {
turns: session.messages.filter((m) => m.role === "assistant").length,
inputTokens: session.inputTokens,
outputTokens: session.outputTokens,
};
}
/** Compute the delta between two snapshots (after minus before). Floors at 0. */
export function computeUsageDelta(
before: UsageSnapshot,
after: UsageSnapshot,
/**
* Build Usage from ACP token data + DB turn delta.
* Tokens come from ACP PromptResponse (synchronous, accurate).
* Turns come from DB before/after snapshots (may have WAL lag, but acceptable).
*/
export function buildUsage(
acpUsage: AcpUsage | null,
beforeTurns: TurnsSnapshot,
afterTurns: TurnsSnapshot,
durationSec: number,
): Usage {
return {
turns: Math.max(0, after.turns - before.turns) || 1,
inputTokens: Math.max(0, after.inputTokens - before.inputTokens),
outputTokens: Math.max(0, after.outputTokens - before.outputTokens),
turns: Math.max(0, afterTurns.turns - beforeTurns.turns) || 1,
inputTokens: acpUsage?.inputTokens ?? 0,
outputTokens: acpUsage?.outputTokens ?? 0,
duration: Math.round(durationSec),
};
}
@@ -148,12 +149,12 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
async function runPrompt(
ctx: AgentContext,
useContinuation: boolean,
beforeSnapshot: UsageSnapshot,
beforeTurns: TurnsSnapshot,
): Promise<AgentRunResult> {
const effectiveCtx = useContinuation ? ctx : { ...ctx, isFirstVisit: true };
const fullPrompt = buildHermesPrompt(effectiveCtx);
const startMs = Date.now();
const { text, sessionId } = await client.prompt(fullPrompt);
const { text, sessionId, usage: acpUsage } = await client.prompt(fullPrompt);
const durationSec = (Date.now() - startMs) / 1000;
const { detailHash } = await storePromptResult(ctx.store, sessionId);
@@ -161,9 +162,10 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
}
// Turns from DB (may lag slightly due to WAL, but acceptable)
const afterSession = await loadHermesSession(sessionId);
const afterSnapshot = snapshotUsage(afterSession);
const usage = computeUsageDelta(beforeSnapshot, afterSnapshot, durationSec);
const afterTurns = snapshotTurns(afterSession);
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage };
}
@@ -173,16 +175,16 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
const attempt = await prepareSession(client, ctx, cwd, resumeDisabled);
// Snapshot before prompt: for resumed sessions, captures cumulative state
// so we can compute the delta. For new sessions, this is ZERO_SNAPSHOT.
// so we can compute the turn delta. For new sessions, this is ZERO_TURNS.
const currentSessionId = client.getSessionId();
const beforeSession =
attempt.resumed && currentSessionId !== null
? await loadHermesSession(currentSessionId)
: null;
const beforeSnapshot = snapshotUsage(beforeSession);
const beforeTurns = snapshotTurns(beforeSession);
try {
return await runPrompt(ctx, attempt.useContinuation, beforeSnapshot);
return await runPrompt(ctx, attempt.useContinuation, beforeTurns);
} catch (error) {
if (!attempt.resumed) {
throw error;
@@ -193,7 +195,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
await client.close();
await client.connect(cwd);
// Fresh session after retry — reset snapshot to zero
return runPrompt(ctx, false, ZERO_SNAPSHOT);
return runPrompt(ctx, false, ZERO_TURNS);
}
}
@@ -204,20 +206,20 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
): Promise<AgentRunResult> {
// Client is already connected from runHermes — same ACP session,
// so the agent sees the full conversation history (crucial for retries).
// Snapshot before the continuation prompt for delta computation.
// Snapshot turns before the continuation prompt for delta computation.
const currentSessionId = client.getSessionId();
const beforeSession =
currentSessionId !== null ? await loadHermesSession(currentSessionId) : null;
const beforeSnapshot = snapshotUsage(beforeSession);
const beforeTurns = snapshotTurns(beforeSession);
const startMs = Date.now();
const { text, sessionId } = await client.prompt(message);
const { text, sessionId, usage: acpUsage } = await client.prompt(message);
const durationSec = (Date.now() - startMs) / 1000;
const { detailHash } = await storePromptResult(store, sessionId);
const afterSession = await loadHermesSession(sessionId);
const afterSnapshot = snapshotUsage(afterSession);
const usage = computeUsageDelta(beforeSnapshot, afterSnapshot, durationSec);
const afterTurns = snapshotTurns(afterSession);
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
return { output: text, detailHash, sessionId, assembledPrompt: "", usage };
}
+3 -2
View File
@@ -1,7 +1,8 @@
export type { AcpUsage } from "./acp-client.js";
export { HermesAcpClient } from "./acp-client.js";
export {
buildHermesPrompt,
computeUsageDelta,
buildUsage,
createHermesAgent,
snapshotUsage,
snapshotTurns,
} from "./hermes.js";
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-mock",
"version": "0.1.0",
"version": "0.1.1",
"files": [
"src",
"dist",
+7
View File
@@ -1,5 +1,12 @@
#!/usr/bin/env node
// Version flag handling: print this package's version and exit with status 0.
// Note: static `import` declarations further down are hoisted by the module
// system, so they are evaluated before this code runs.
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
const versionRequested = ["--version", "-V"].some((flag) => process.argv.includes(flag));
if (versionRequested) {
  process.stdout.write(`${pkg.default.version}\n`);
  process.exit(0);
}
import { createMockAgent } from "./mock-agent.js";
const USAGE = "usage: uwf-mock --mock-data <path> --thread <id> --role <role> --prompt <text>";
+9
View File
@@ -0,0 +1,9 @@
# @united-workforce/cli
## 0.1.1
### Patch Changes
- 850a3b2: fix: resolve --agent override via config alias before raw command
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/cli",
"version": "0.1.0",
"version": "0.1.1",
"files": [
"src",
"dist",
+6
View File
@@ -961,6 +961,12 @@ function resolveAgentConfig(
agentOverride: string | null,
): AgentConfig {
if (agentOverride !== null) {
  // Try config alias first (e.g. "hermes" → config.agents.hermes),
  // then fall back to raw command name (e.g. "uwf-hermes" or "/usr/bin/agent").
  // Only own properties count as aliases: a bare index lookup with a
  // user-supplied key would also resolve inherited Object.prototype members
  // (e.g. "constructor", "toString") and return them as an agent config.
  if (Object.prototype.hasOwnProperty.call(config.agents, agentOverride)) {
    const fromAlias = config.agents[agentOverride as AgentAlias];
    if (fromAlias !== undefined) {
      return fromAlias;
    }
  }
  return parseAgentOverride(agentOverride);
}
+9
View File
@@ -0,0 +1,9 @@
# @united-workforce/eval
## 0.1.2
### Patch Changes
- 850a3b2: fix: resolve --agent override via config alias before raw command
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
@@ -91,6 +91,29 @@ describe("frontmatter-compliance judge", () => {
const result = await runFrontmatterJudge("T4");
expect(result.score).toBe(0);
});
test("parsed object output with $status → score 1.0", async () => {
mockedReadSteps.mockReturnValue([
makeStep({ role: "a", output: { $status: "done", summary: "fixed" } as unknown as string }),
makeStep({ role: "b", output: { $status: "reviewed" } as unknown as string }),
]);
const result = await runFrontmatterJudge("T5");
const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
expect(result.score).toBe(1.0);
expect(data.stepsTotal).toBe(2);
expect(data.stepsValid).toBe(2);
});
test("parsed object output missing $status → score 0", async () => {
mockedReadSteps.mockReturnValue([
makeStep({ role: "a", output: { summary: "no status field" } as unknown as string }),
]);
const result = await runFrontmatterJudge("T6");
expect(result.score).toBe(0);
});
});
describe("token-stats judge", () => {
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/eval",
"version": "0.1.0",
"version": "0.1.3",
"private": false,
"files": [
"src",
+4 -1
View File
@@ -7,12 +7,15 @@ import {
registerRunCommand,
} from "./commands/index.js";
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
const program = new Command();
program
.name("uwf-eval")
.description("Evaluate uwf workflow quality with real agents")
.version("0.1.0");
.version(pkg.default.version, "-V, --version");
registerRunCommand(program);
registerReportCommand(program);
+1 -1
View File
@@ -52,7 +52,7 @@ export function registerRunCommand(program: Command): void {
program
.command("run <task>")
.description("Run eval on a task directory or tarball")
.option("--agent <name>", "agent adapter to use", "hermes")
.option("--agent <name>", "agent adapter to use", "uwf-hermes")
.option("--model <model>", "model override")
.option("--count <n>", "number of eval runs", "1")
.action(async (task: string, opts: RunCliOptions) => {
@@ -39,6 +39,16 @@ function extractFrontmatterYaml(output: unknown): string | null {
/** Validate a single step's frontmatter, returning a list of errors (empty = valid). */
function validateStepFrontmatter(output: unknown): string[] {
// CAS stores the extracted output as a JSON object after the extract pipeline.
// Accept both: parsed object (from step.output) or raw markdown string.
if (typeof output === "object" && output !== null && !Array.isArray(output)) {
const status = (output as Record<string, unknown>).$status;
if (typeof status !== "string" || status.trim() === "") {
return ["$status field is missing or not a non-empty string"];
}
return [];
}
const yaml = extractFrontmatterYaml(output);
if (yaml === null) {
return ["output does not begin with a valid '---' frontmatter block"];
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/util",
"version": "0.1.0",
"version": "0.1.1",
"files": [
"src",
"dist",
+9 -2
View File
@@ -1,2 +1,9 @@
// This version is kept in sync with package.json during releases.
export const VERSION = "0.1.0";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
// Directory that contains this module; package.json is looked up one level above it.
const __dirname = dirname(fileURLToPath(import.meta.url));
const manifestPath = join(__dirname, "..", "package.json");
const manifest = JSON.parse(readFileSync(manifestPath, "utf-8")) as { version: string };
/** Version string read from this package's package.json when the module is loaded. */
export const VERSION = manifest.version;