fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of the repeated 422s: the 4096-token cap was too tight for ambitious prompts (Marco's research-assistant prompt produces ~12 kB of JSON before the model gets cut off mid-string). The error then surfaced as an opaque "Unterminated string in JSON" zod failure instead of pointing the user at the real problem.

Two fixes:

- maxTokens goes back to 8192 (the original value) for all Claude tiers; GLM goes from 3500 to 4096. Timeouts are bumped from 90s to 95s — Sonnet 4.6 at ~130 tok/s does 8192 tokens in ~63s, leaving ~30s headroom for cold starts while still staying under Cloudflare's 100s edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE parsing and throw the new SpecTruncatedError. /preview catches it and returns 422 spec_too_large with a clear "split the prompt" message instead of leaking the zod parse failure.
2026-05-28 19:34:40 +02:00 · 2026-05-28 19:34:40 +02:00 · d2b19a5439
commit d2b19a5439
parent 979d1abfca
2 changed files with 53 additions and 14 deletions
--- a/apps/api/src/routes/servers.ts
+++ b/apps/api/src/routes/servers.ts
@ -14,6 +14,7 @@ import {
 import {
  BannedPatternError,
  SpecTimeoutError,
+  SpecTruncatedError,
  SpecValidationError,
  generateSpec,
  pickPreviewModel,
@ -153,6 +154,21 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
          detail: 'Spec generation took too long. Try a shorter, more specific prompt.',
        });
      }
+      if (err instanceof SpecTruncatedError) {
+        app.log.warn(
+          {
+            reason: err.message,
+            prompt: parsed.data.prompt.slice(0, 200),
+            model: choice.displayName,
+          },
+          'preview_spec_truncated',
+        );
+        return reply.code(422).send({
+          error: 'spec_too_large',
+          detail:
+            'The spec for this prompt exceeded the maximum response size. Split it into fewer tools or describe one capability per prompt.',
+        });
+      }
      app.log.error(err);
      return reply.code(500).send({ error: 'preview_failed', detail: (err as Error).message });
    }
--- a/packages/llm/src/index.ts
+++ b/packages/llm/src/index.ts
@ -88,43 +88,50 @@ export interface ModelChoice {
 * visible quality/speed jump *is* the upgrade pitch.
 *
 * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
- * Claude Sonnet 4.6 ~80 tok/s. A spec is small (<= ~10 tools with short
- * descriptions, ~1.5–2.5k output tokens in practice) so we cap maxTokens at
- * 4096 — well under the model's hard ceiling and tight enough that even
- * Sonnet finishes inside 60s in the worst case (4096 / 80 ≈ 51s). The
- * timeouts above 90s buy headroom for cold starts / slow API responses
- * while staying clear of Cloudflare's 100s edge cap.
+ * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
+ * number was from the pre-4.6 generation).
+ *
+ * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
+ * prompts ("research assistant with web search, papers, wikipedia, …")
+ * routinely produce 6–8k tokens of deeply-nested tool schemas. Claude tiers
+ * cap at 8192 — the effective ceiling for these prompts — GLM at 4096. On
+ * Anthropic responses we detect `stop_reason === 'max_tokens'` and surface a
+ * "spec too large" message instead of a truncated-JSON failure at the zod boundary.
+ *
+ * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Sonnet at
+ * 130 tok/s finishes 8192 tokens in ~63s (~30s headroom); the slowest case,
+ * GLM at 58 tok/s × 4096, takes ~71s, leaving ~24s for cold starts and TCP/TLS setup.
 */
 const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
  hobby: {
    provider: 'glm',
    model: 'glm-4-plus',
-    maxTokens: 3500,
-    timeoutMs: 90_000,
+    maxTokens: 4096,
+    timeoutMs: 95_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  pro: {
    provider: 'anthropic',
    model: 'claude-haiku-4-5-20251001',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
    displayName: 'Claude Haiku 4.5',
    displayBadge: 'claude-haiku',
  },
  team: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
  enterprise: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
@ -266,6 +273,18 @@ async function generateWithAnthropic(
    .filter((b): b is { type: 'text'; text: string } => b.type === 'text')
    .map((b) => b.text)
    .join('');
+  // Detect token-limit truncation BEFORE attempting to parse. The model
+  // chops mid-token when it hits max_tokens, so the closing `}` of a deeply
+  // nested tool schema never gets emitted and JSON.parse blows up with an
+  // unterminated-string error that's indistinguishable from a refusal at
+  // the catch site. With stop_reason in hand we can surface a precise
+  // "spec too large" message and tell the user to split / simplify the
+  // prompt instead of letting them keep retrying the same one.
+  if (response.stop_reason === 'max_tokens') {
+    throw new SpecTruncatedError(
+      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
+    );
+  }
  const json = extractJson(text);
  const parsed = GeneratorSpec.safeParse(json);
  if (!parsed.success) {
@ -341,6 +360,10 @@ export class SpecTimeoutError extends Error {
  override readonly name = 'SpecTimeoutError';
 }

+export class SpecTruncatedError extends Error {
+  override readonly name = 'SpecTruncatedError';
+}
+
 function extractJson(text: string): unknown {
  const trimmed = text.trim();
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);