From d2b19a5439fe4800b9785b84bde3b9e68da3e9f3 Mon Sep 17 00:00:00 2001 From: Marco Sadjadi Date: Thu, 28 May 2026 19:34:40 +0200 Subject: [PATCH] =?UTF-8?q?fix(preview):=20max=5Ftokens=204096=E2=86=92819?= =?UTF-8?q?2=20+=20detect=20truncation=20explicitly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root cause of repeat 422s: 4096 was too tight for ambitious prompts (Marco's research-assistant prompt produces ~12kB of JSON before the model gets cut off mid-string). The error then surfaced as an opaque "Unterminated string in JSON" parse failure instead of pointing the user at the real problem. Two fixes: - maxTokens back to 8192 (the original) for all Claude tiers; GLM goes from 3500 to 4096. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s edge cap. - Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE parsing and throw the new SpecTruncatedError. /preview catches it and returns 422 spec_too_large with a clear "split the prompt" message instead of leaking the raw JSON parse failure. --- apps/api/src/routes/servers.ts | 16 +++++++++++ packages/llm/src/index.ts | 51 ++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/apps/api/src/routes/servers.ts b/apps/api/src/routes/servers.ts index bc0ae41..a3a3b4a 100644 --- a/apps/api/src/routes/servers.ts +++ b/apps/api/src/routes/servers.ts @@ -14,6 +14,7 @@ import { import { BannedPatternError, SpecTimeoutError, + SpecTruncatedError, SpecValidationError, generateSpec, pickPreviewModel, @@ -153,6 +154,21 @@ export async function serverRoutes(app: FastifyInstance): Promise { detail: 'Spec generation took too long. 
Try a shorter, more specific prompt.', }); } + if (err instanceof SpecTruncatedError) { + app.log.warn( + { + reason: err.message, + prompt: parsed.data.prompt.slice(0, 200), + model: choice.displayName, + }, + 'preview_spec_truncated', + ); + return reply.code(422).send({ + error: 'spec_too_large', + detail: + 'The spec for this prompt exceeded the maximum response size. Split it into fewer tools or describe one capability per prompt.', + }); + } app.log.error(err); return reply.code(500).send({ error: 'preview_failed', detail: (err as Error).message }); } diff --git a/packages/llm/src/index.ts b/packages/llm/src/index.ts index 0711440..9872e3d 100644 --- a/packages/llm/src/index.ts +++ b/packages/llm/src/index.ts @@ -88,43 +88,50 @@ export interface ModelChoice { * visible quality/speed jump *is* the upgrade pitch. * * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s · - * Claude Sonnet 4.6 ~80 tok/s. A spec is small (<= ~10 tools with short - * descriptions, ~1.5–2.5k output tokens in practice) so we cap maxTokens at - * 4096 — well under the model's hard ceiling and tight enough that even - * Sonnet finishes inside 60s in the worst case (4096 / 80 ≈ 51s). The - * timeouts above 90s buy headroom for cold starts / slow API responses - * while staying clear of Cloudflare's 100s edge cap. + * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s + * number was from the pre-4.6 generation). + * + * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious + * prompts ("research assistant with web search, papers, wikipedia, …") + * routinely produce 6–8k tokens of deeply-nested tool schemas. We cap at + * 8192 — the model's effective ceiling for these prompts — and detect the + * `stop_reason === 'max_tokens'` case to surface a "spec too large" message + * instead of letting the truncated JSON blow up at the zod boundary. + * + * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. 
Sonnet at + * 130 tok/s finishes 8192 tokens in ~63s, giving ~30s headroom for cold + * starts and TCP/TLS setup. */ const PREVIEW_MODELS: Record = { hobby: { provider: 'glm', model: 'glm-4-plus', - maxTokens: 3500, - timeoutMs: 90_000, + maxTokens: 4096, + timeoutMs: 95_000, displayName: 'Open-tier AI', displayBadge: 'open-tier', }, pro: { provider: 'anthropic', model: 'claude-haiku-4-5-20251001', - maxTokens: 4096, - timeoutMs: 90_000, + maxTokens: 8192, + timeoutMs: 95_000, displayName: 'Claude Haiku 4.5', displayBadge: 'claude-haiku', }, team: { provider: 'anthropic', model: 'claude-sonnet-4-6', - maxTokens: 4096, - timeoutMs: 90_000, + maxTokens: 8192, + timeoutMs: 95_000, displayName: 'Claude Sonnet 4.6', displayBadge: 'claude-sonnet', }, enterprise: { provider: 'anthropic', model: 'claude-sonnet-4-6', - maxTokens: 4096, - timeoutMs: 90_000, + maxTokens: 8192, + timeoutMs: 95_000, displayName: 'Claude Sonnet 4.6', displayBadge: 'claude-sonnet', }, @@ -266,6 +273,18 @@ async function generateWithAnthropic( .filter((b): b is { type: 'text'; text: string } => b.type === 'text') .map((b) => b.text) .join(''); + // Detect token-limit truncation BEFORE attempting to parse. The model + // chops mid-token when it hits max_tokens, so the closing `}` of a deeply + // nested tool schema never gets emitted and JSON.parse blows up with an + // unterminated-string error that's indistinguishable from a refusal at + // the catch site. With stop_reason in hand we can surface a precise + // "spec too large" message and tell the user to split / simplify the + // prompt instead of letting them keep retrying the same one. 
+ if (response.stop_reason === 'max_tokens') { + throw new SpecTruncatedError( + `model hit max_tokens (${opts.maxTokens}) before finishing the spec`, + ); + } const json = extractJson(text); const parsed = GeneratorSpec.safeParse(json); if (!parsed.success) { @@ -341,6 +360,10 @@ export class SpecTimeoutError extends Error { override readonly name = 'SpecTimeoutError'; } +export class SpecTruncatedError extends Error { + override readonly name = 'SpecTruncatedError'; +} + function extractJson(text: string): unknown { const trimmed = text.trim(); const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);