/**
 * Regression tests for the round 2 review feedback on PR #481:
 * - Daemon-computed composite is authoritative; the agent-supplied ROUND_END
 *   and SHIP composites are advisory.
 * - When SHIP refers to a round whose daemon composite is below the
 *   configured threshold, the run finalizes as below_threshold even when
 *   the agent claimed status="shipped".
 * - A composite divergence beyond COMPOSITE_TOLERANCE emits a
 *   composite_mismatch parser_warning event.
 */
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { mkdtempSync } from 'node:fs';
import { rm } from 'node:fs/promises';
import { tmpdir } from 'node:os';
import { join } from 'node:path';
import Database from 'better-sqlite3';
import { migrateCritique, getCritiqueRun } from '../src/critique/persistence.js';
import { runOrchestrator, type CritiqueSseBus } from '../src/critique/orchestrator.js';
import type { CritiqueSseEvent } from '@open-design/contracts/critique';
import { defaultCritiqueConfig } from '@open-design/contracts/critique';

/**
 * Creates an in-memory SQLite database holding the minimal `projects` and
 * `conversations` tables, seeds one row in each ('p1' and 'c1'), and then
 * applies the critique migration on top.
 *
 * @returns The opened database handle; the caller is responsible for closing it.
 */
function freshDb(): Database.Database {
  const db = new Database(':memory:');
  db.pragma('journal_mode = WAL');
  db.pragma('foreign_keys = ON');
  db.exec(` CREATE TABLE projects ( id TEXT PRIMARY KEY, name TEXT NOT NULL, created_at INTEGER NOT NULL, updated_at INTEGER NOT NULL ); CREATE TABLE conversations ( id TEXT PRIMARY KEY, project_id TEXT NOT NULL, created_at INTEGER NOT NULL, updated_at INTEGER NOT NULL, FOREIGN KEY(project_id) REFERENCES projects(id) ON DELETE CASCADE ); INSERT INTO projects (id, name, created_at, updated_at) VALUES ('p1', 'p1', 0, 0); INSERT INTO conversations (id, project_id, created_at, updated_at) VALUES ('c1', 'p1', 0, 0); `);
  migrateCritique(db);
  return db;
}

/**
 * Builds a recording SSE bus: every emitted event is appended to `events`
 * so a test can inspect exactly what the orchestrator published.
 *
 * @returns The bus to hand to the orchestrator and the array it records into.
 */
function makeBus(): { bus: CritiqueSseBus; events: CritiqueSseEvent[] } {
  const events: CritiqueSseEvent[] = [];
  const bus: CritiqueSseBus = {
    emit: (e) => {
      events.push(e);
    },
  };
  return { bus, events };
}

/**
 * Replays `text` as an async stream of fixed-size string chunks, standing in
 * for the agent's stdout.
 *
 * @param text - Full payload to stream.
 * @param chunkSize - Maximum number of characters per yielded chunk.
 * @returns An async iterable yielding consecutive slices of `text`.
 */
async function* streamOf(text: string, chunkSize = 64): AsyncIterable<string> {
  for (let i = 0; i < text.length; i += chunkSize) {
    yield text.slice(i, i + chunkSize);
  }
}

let tmpDir: string;
let db: Database.Database;

// Each test gets its own temp directory and a freshly migrated database.
beforeEach(() => {
  tmpDir = mkdtempSync(join(tmpdir(), 'od-authority-test-'));
  db = freshDb();
});

// Close the database before removing the temp directory.
afterEach(async () => {
  db.close();
  await rm(tmpDir, { recursive: true, force: true });
});

/**
 * One round, all panelists score ~6.0 so the daemon-computed composite is
 * well below the default threshold of 8.0. The agent lies in both
 * ROUND_END and SHIP to try to force a ship despite low panelist scores.
 *
 * NOTE(review): the fixture text below carries no visible protocol markup;
 * it looks as though the tags were stripped from this copy of the file.
 * Confirm against the upstream fixture before relying on it.
 */
function lyingShipStream(): string {
  return ` v1 ]]> ok ok ok ok liar fake]]> Pretending we shipped. `;
}

/**
 * Agent claims a slightly different composite than the daemon will compute,
 * but well within threshold. Used to exercise the composite_mismatch warning
 * without flipping the ship decision.
 *
 * NOTE(review): the fixture text below carries no visible protocol markup;
 * it looks as though the tags were stripped from this copy of the file.
 * Confirm against the upstream fixture before relying on it.
 */
function nearMissCompositeStream(): string {
  return ` v1 ]]> good good good good arithmetic skipped x]]> Wrong composite reported but real scores are high. `;
}

describe('orchestrator daemon-authoritative scoring (PR #481 round 2 review)', () => {
  it('SHIP claiming shipped is downgraded to below_threshold when daemon composite is below threshold', async () => {
    const { bus, events } = makeBus();
    const artifactDir = join(tmpDir, 'authority-1');
    const result = await runOrchestrator({
      runId: 'r-lying',
      projectId: 'p1',
      conversationId: null,
      artifactId: 'a1',
      artifactDir,
      adapter: 'claude',
      cfg: defaultCritiqueConfig(),
      db,
      bus,
      stdout: streamOf(lyingShipStream()),
    });

    expect(result.status).toBe('below_threshold');
    expect(result.composite).not.toBeNull();
    expect(result.composite!).toBeLessThan(8.0);

    // The persisted row must agree with the returned result.
    const row = getCritiqueRun(db, 'r-lying');
    expect(row?.status).toBe('below_threshold');
    expect(row?.score).toBeLessThan(8.0);

    const shipEvents = events.filter((e) => e.event === 'critique.ship');
    expect(shipEvents).toHaveLength(1);
    // Round 4 review: the SSE bus must only see the daemon-authoritative
    // ship payload, never the agent's raw claim.
    const shipPayload = shipEvents[0]?.data as { status: string; composite: number } | undefined;
    expect(shipPayload?.status).toBe('below_threshold');
    expect(shipPayload?.composite).toBeLessThan(8.0);
  });

  it('SHIP referencing an unclosed round is dropped, parser_warning emitted, fallback selected', async () => {
    const { bus, events } = makeBus();
    const artifactDir = join(tmpDir, 'authority-4');
    // Round 1 closes with low scores. Round 2 is opened but never closed.
    // The agent then ships round 2 with a high composite. The daemon must
    // refuse to score against an unclosed round and fall back to round 1.
    // NOTE(review): fixture markup appears stripped in this copy; confirm upstream.
    const stream = ` v1 ]]> ok ok ok ok continue ]]> Forged ship for an unclosed round. `;
    const result = await runOrchestrator({
      runId: 'r-unclosed',
      projectId: 'p1',
      conversationId: null,
      artifactId: 'a1',
      artifactDir,
      adapter: 'claude',
      cfg: defaultCritiqueConfig(),
      db,
      bus,
      stdout: streamOf(stream),
    });

    expect(result.status).toBe('below_threshold');
    expect(result.composite).not.toBeNull();
    expect(result.composite!).toBeLessThan(8.0);

    // Exactly one synthetic ship from the fallback path. The agent's forged
    // ship for the unclosed round must NOT appear on the SSE bus.
    const shipEvents = events.filter((e) => e.event === 'critique.ship');
    expect(shipEvents).toHaveLength(1);
    const shipPayload = shipEvents[0]?.data as { round: number; status: string } | undefined;
    expect(shipPayload?.round).toBe(1);
    expect(shipPayload?.status).toBe('below_threshold');

    // A parser_warning must have been emitted to flag the rejected SHIP.
    const warnings = events.filter((e) => e.event === 'critique.parser_warning');
    expect(warnings.length).toBeGreaterThanOrEqual(1);
  });

  it('emits composite_mismatch parser_warning when ROUND_END/SHIP composite diverges beyond tolerance', async () => {
    const { bus, events } = makeBus();
    const artifactDir = join(tmpDir, 'authority-2');
    await runOrchestrator({
      runId: 'r-mismatch',
      projectId: 'p1',
      conversationId: null,
      artifactId: 'a1',
      artifactDir,
      adapter: 'claude',
      cfg: defaultCritiqueConfig(),
      db,
      bus,
      stdout: streamOf(nearMissCompositeStream()),
    });

    const warnings = events.filter((e) => e.event === 'critique.parser_warning');
    expect(warnings.length).toBeGreaterThanOrEqual(1);
    const mismatch = warnings.find((e) => 'kind' in e.data && e.data.kind === 'composite_mismatch');
    expect(mismatch).toBeDefined();
  });

  it('does not emit composite_mismatch when agent and daemon agree within tolerance', async () => {
    const { bus, events } = makeBus();
    const artifactDir = join(tmpDir, 'authority-3');
    // Build a stream where ROUND_END composite matches the weighted sum exactly.
    // Default weights: critic=0.4, brand=0.2, a11y=0.2, copy=0.2; all 9.0 -> 9.0.
    // NOTE(review): fixture markup appears stripped in this copy; confirm upstream.
    const aligned = ` v1 ]]> ok ok ok ok ok ]]> aligned `;
    await runOrchestrator({
      runId: 'r-aligned',
      projectId: 'p1',
      conversationId: null,
      artifactId: 'a1',
      artifactDir,
      adapter: 'claude',
      cfg: defaultCritiqueConfig(),
      db,
      bus,
      stdout: streamOf(aligned),
    });

    const compositeWarnings = events.filter(
      (e) =>
        e.event === 'critique.parser_warning' &&
        'kind' in e.data &&
        e.data.kind === 'composite_mismatch',
    );
    expect(compositeWarnings).toHaveLength(0);
  });
});