fix(preview): max_tokens 4096→8192 + detect truncation explicitly
All checks were successful
Deploy to Production / deploy (push) Successful in 1m24s
All checks were successful
Deploy to Production / deploy (push) Successful in 1m24s
Root cause of the repeated 422s: the 4096-token cap was too tight for ambitious prompts (Marco's research-assistant prompt produces ~12 kB of JSON before the model gets cut off mid-string). The error then surfaced as an opaque "Unterminated string in JSON" zod failure instead of pointing the user at the real problem. Two fixes: (1) maxTokens goes back to 8192 (the original value) for all Claude tiers, and the GLM tier is raised from 3500 to 4096. Timeouts are bumped from 90s to 95s — Sonnet 4.6 at ~130 tok/s emits 8192 tokens in ~63s, which leaves ~30s of headroom for cold starts while staying under Cloudflare's 100s edge cap. (2) Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE parsing and throw the new SpecTruncatedError. /preview catches it and returns 422 spec_too_large with a clear "split the prompt" message instead of leaking the zod parse failure.
This commit is contained in:
parent
979d1abfca
commit
d2b19a5439
@ -14,6 +14,7 @@ import {
|
||||
import {
|
||||
BannedPatternError,
|
||||
SpecTimeoutError,
|
||||
SpecTruncatedError,
|
||||
SpecValidationError,
|
||||
generateSpec,
|
||||
pickPreviewModel,
|
||||
@ -153,6 +154,21 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
|
||||
detail: 'Spec generation took too long. Try a shorter, more specific prompt.',
|
||||
});
|
||||
}
|
||||
if (err instanceof SpecTruncatedError) {
|
||||
app.log.warn(
|
||||
{
|
||||
reason: err.message,
|
||||
prompt: parsed.data.prompt.slice(0, 200),
|
||||
model: choice.displayName,
|
||||
},
|
||||
'preview_spec_truncated',
|
||||
);
|
||||
return reply.code(422).send({
|
||||
error: 'spec_too_large',
|
||||
detail:
|
||||
'The spec for this prompt exceeded the maximum response size. Split it into fewer tools or describe one capability per prompt.',
|
||||
});
|
||||
}
|
||||
app.log.error(err);
|
||||
return reply.code(500).send({ error: 'preview_failed', detail: (err as Error).message });
|
||||
}
|
||||
|
||||
@ -88,43 +88,50 @@ export interface ModelChoice {
|
||||
* visible quality/speed jump *is* the upgrade pitch.
|
||||
*
|
||||
* Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
|
||||
* Claude Sonnet 4.6 ~80 tok/s. A spec is small (<= ~10 tools with short
|
||||
* descriptions, ~1.5–2.5k output tokens in practice) so we cap maxTokens at
|
||||
* 4096 — well under the model's hard ceiling and tight enough that even
|
||||
* Sonnet finishes inside 60s in the worst case (4096 / 80 ≈ 51s). The
|
||||
* timeouts above 90s buy headroom for cold starts / slow API responses
|
||||
* while staying clear of Cloudflare's 100s edge cap.
|
||||
* Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
|
||||
* number was from the pre-4.6 generation).
|
||||
*
|
||||
* Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
|
||||
* prompts ("research assistant with web search, papers, wikipedia, …")
|
||||
* routinely produce 6–8k tokens of deeply-nested tool schemas. We cap at
|
||||
* 8192 — the model's effective ceiling for these prompts — and detect the
|
||||
* `stop_reason === 'max_tokens'` case to surface a "spec too large" message
|
||||
* instead of letting the truncated JSON blow up at the zod boundary.
|
||||
*
|
||||
* Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Sonnet at
|
||||
* 130 tok/s finishes 8192 tokens in ~63s, giving ~30s headroom for cold
|
||||
* starts and TCP/TLS setup.
|
||||
*/
|
||||
const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
|
||||
hobby: {
|
||||
provider: 'glm',
|
||||
model: 'glm-4-plus',
|
||||
maxTokens: 3500,
|
||||
timeoutMs: 90_000,
|
||||
maxTokens: 4096,
|
||||
timeoutMs: 95_000,
|
||||
displayName: 'Open-tier AI',
|
||||
displayBadge: 'open-tier',
|
||||
},
|
||||
pro: {
|
||||
provider: 'anthropic',
|
||||
model: 'claude-haiku-4-5-20251001',
|
||||
maxTokens: 4096,
|
||||
timeoutMs: 90_000,
|
||||
maxTokens: 8192,
|
||||
timeoutMs: 95_000,
|
||||
displayName: 'Claude Haiku 4.5',
|
||||
displayBadge: 'claude-haiku',
|
||||
},
|
||||
team: {
|
||||
provider: 'anthropic',
|
||||
model: 'claude-sonnet-4-6',
|
||||
maxTokens: 4096,
|
||||
timeoutMs: 90_000,
|
||||
maxTokens: 8192,
|
||||
timeoutMs: 95_000,
|
||||
displayName: 'Claude Sonnet 4.6',
|
||||
displayBadge: 'claude-sonnet',
|
||||
},
|
||||
enterprise: {
|
||||
provider: 'anthropic',
|
||||
model: 'claude-sonnet-4-6',
|
||||
maxTokens: 4096,
|
||||
timeoutMs: 90_000,
|
||||
maxTokens: 8192,
|
||||
timeoutMs: 95_000,
|
||||
displayName: 'Claude Sonnet 4.6',
|
||||
displayBadge: 'claude-sonnet',
|
||||
},
|
||||
@ -266,6 +273,18 @@ async function generateWithAnthropic(
|
||||
.filter((b): b is { type: 'text'; text: string } => b.type === 'text')
|
||||
.map((b) => b.text)
|
||||
.join('');
|
||||
// Detect token-limit truncation BEFORE attempting to parse. The model
|
||||
// chops mid-token when it hits max_tokens, so the closing `}` of a deeply
|
||||
// nested tool schema never gets emitted and JSON.parse blows up with an
|
||||
// unterminated-string error that's indistinguishable from a refusal at
|
||||
// the catch site. With stop_reason in hand we can surface a precise
|
||||
// "spec too large" message and tell the user to split / simplify the
|
||||
// prompt instead of letting them keep retrying the same one.
|
||||
if (response.stop_reason === 'max_tokens') {
|
||||
throw new SpecTruncatedError(
|
||||
`model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
|
||||
);
|
||||
}
|
||||
const json = extractJson(text);
|
||||
const parsed = GeneratorSpec.safeParse(json);
|
||||
if (!parsed.success) {
|
||||
@ -341,6 +360,10 @@ export class SpecTimeoutError extends Error {
|
||||
override readonly name = 'SpecTimeoutError';
|
||||
}
|
||||
|
||||
/**
 * Raised when spec generation is cut short by the output-token budget.
 *
 * generateWithAnthropic throws this when the response's `stop_reason` is
 * `'max_tokens'`, before any JSON extraction or schema validation is
 * attempted. The preview route handler catches it, logs
 * `preview_spec_truncated`, and responds with HTTP 422 `spec_too_large`.
 */
export class SpecTruncatedError extends Error {
|
||||
override readonly name = 'SpecTruncatedError';
|
||||
}
|
||||
|
||||
function extractJson(text: string): unknown {
|
||||
const trimmed = text.trim();
|
||||
const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user