fix(preview): max_tokens 4096→8192 + detect truncation explicitly
All checks were successful
Deploy to Production / deploy (push) Successful in 1m24s

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.
This commit is contained in:
Marco Sadjadi 2026-05-28 19:34:40 +02:00
parent 979d1abfca
commit d2b19a5439
2 changed files with 53 additions and 14 deletions

View File

@ -14,6 +14,7 @@ import {
import {
BannedPatternError,
SpecTimeoutError,
SpecTruncatedError,
SpecValidationError,
generateSpec,
pickPreviewModel,
@ -153,6 +154,21 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
detail: 'Spec generation took too long. Try a shorter, more specific prompt.',
});
}
if (err instanceof SpecTruncatedError) {
app.log.warn(
{
reason: err.message,
prompt: parsed.data.prompt.slice(0, 200),
model: choice.displayName,
},
'preview_spec_truncated',
);
return reply.code(422).send({
error: 'spec_too_large',
detail:
'The spec for this prompt exceeded the maximum response size. Split it into fewer tools or describe one capability per prompt.',
});
}
app.log.error(err);
return reply.code(500).send({ error: 'preview_failed', detail: (err as Error).message });
}

View File

@ -88,43 +88,50 @@ export interface ModelChoice {
* visible quality/speed jump *is* the upgrade pitch.
*
* Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
* Claude Sonnet 4.6 ~80 tok/s. A spec is small (<= ~10 tools with short
* descriptions, ~1.5–2.5k output tokens in practice) so we cap maxTokens at
* 4096 — well under the model's hard ceiling and tight enough that even
* Sonnet finishes inside 60s in the worst case (4096 / 80 ≈ 51s). The
* timeouts above 90s buy headroom for cold starts / slow API responses
* while staying clear of Cloudflare's 100s edge cap.
* Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
* number was from the pre-4.6 generation).
*
* Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
* prompts ("research assistant with web search, papers, wikipedia, …")
* routinely produce 6–8k tokens of deeply-nested tool schemas. We cap at
* 8192 — the model's effective ceiling for these prompts — and detect the
* `stop_reason === 'max_tokens'` case to surface a "spec too large" message
* instead of letting the truncated JSON blow up at the zod boundary.
*
* Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Sonnet at
* 130 tok/s finishes 8192 tokens in ~63s, giving ~30s headroom for cold
* starts and TCP/TLS setup.
*/
const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
hobby: {
provider: 'glm',
model: 'glm-4-plus',
maxTokens: 3500,
timeoutMs: 90_000,
maxTokens: 4096,
timeoutMs: 95_000,
displayName: 'Open-tier AI',
displayBadge: 'open-tier',
},
pro: {
provider: 'anthropic',
model: 'claude-haiku-4-5-20251001',
maxTokens: 4096,
timeoutMs: 90_000,
maxTokens: 8192,
timeoutMs: 95_000,
displayName: 'Claude Haiku 4.5',
displayBadge: 'claude-haiku',
},
team: {
provider: 'anthropic',
model: 'claude-sonnet-4-6',
maxTokens: 4096,
timeoutMs: 90_000,
maxTokens: 8192,
timeoutMs: 95_000,
displayName: 'Claude Sonnet 4.6',
displayBadge: 'claude-sonnet',
},
enterprise: {
provider: 'anthropic',
model: 'claude-sonnet-4-6',
maxTokens: 4096,
timeoutMs: 90_000,
maxTokens: 8192,
timeoutMs: 95_000,
displayName: 'Claude Sonnet 4.6',
displayBadge: 'claude-sonnet',
},
@ -266,6 +273,18 @@ async function generateWithAnthropic(
.filter((b): b is { type: 'text'; text: string } => b.type === 'text')
.map((b) => b.text)
.join('');
// Detect token-limit truncation BEFORE attempting to parse. The model
// chops mid-token when it hits max_tokens, so the closing `}` of a deeply
// nested tool schema never gets emitted and JSON.parse blows up with an
// unterminated-string error that's indistinguishable from a refusal at
// the catch site. With stop_reason in hand we can surface a precise
// "spec too large" message and tell the user to split / simplify the
// prompt instead of letting them keep retrying the same one.
if (response.stop_reason === 'max_tokens') {
throw new SpecTruncatedError(
`model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
);
}
const json = extractJson(text);
const parsed = GeneratorSpec.safeParse(json);
if (!parsed.success) {
@ -341,6 +360,10 @@ export class SpecTimeoutError extends Error {
override readonly name = 'SpecTimeoutError';
}
/**
 * Error raised when the model's response was cut off by the output-token
 * limit before the spec was complete.
 *
 * The Anthropic generation path throws this when the response reports
 * `stop_reason === 'max_tokens'`, before any JSON extraction or schema
 * validation is attempted, so truncation is reported as its own failure
 * rather than as a JSON/schema parse error. The preview route handler
 * recognises this class via `instanceof` and answers with HTTP 422
 * `spec_too_large`.
 */
export class SpecTruncatedError extends Error {
// Fixed name so logs and serialized errors identify this failure mode.
override readonly name = 'SpecTruncatedError';
}
function extractJson(text: string): unknown {
const trimmed = text.trim();
const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);