From d2b19a5439fe4800b9785b84bde3b9e68da3e9f3 Mon Sep 17 00:00:00 2001
From: Marco Sadjadi <marco@buildmymcpserver.local>
Date: Thu, 28 May 2026 19:34:40 +0200
Subject: [PATCH] =?UTF-8?q?fix(preview):=20max=5Ftokens=204096=E2=86=92819?=
 =?UTF-8?q?2=20+=20detect=20truncation=20explicitly?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause of the repeated 422s: a 4096-token cap was too tight for
ambitious prompts (Marco's research-assistant prompt produces ~12kB of
JSON before the model gets cut off mid-string). The error then surfaced
as an opaque "Unterminated string in JSON" parse failure instead of
pointing the user at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers; GLM goes
  from 3500 to 4096. Timeouts bumped from 90s to 95s — Sonnet 4.6 at
  ~130 tok/s does 8192 in ~63s, leaving ~30s headroom for cold starts
  while staying under Cloudflare's 100s edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.
---
 apps/api/src/routes/servers.ts | 16 +++++++++++
 packages/llm/src/index.ts      | 51 ++++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 14 deletions(-)
diff --git a/apps/api/src/routes/servers.ts b/apps/api/src/routes/servers.ts
index bc0ae41..a3a3b4a 100644
--- a/apps/api/src/routes/servers.ts
+++ b/apps/api/src/routes/servers.ts
@@ -14,6 +14,7 @@ import {
 import {
   BannedPatternError,
   SpecTimeoutError,
+  SpecTruncatedError,
   SpecValidationError,
   generateSpec,
   pickPreviewModel,
@@ -153,6 +154,21 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
           detail: 'Spec generation took too long. Try a shorter, more specific prompt.',
         });
       }
+      if (err instanceof SpecTruncatedError) {
+        app.log.warn(
+          {
+            reason: err.message,
+            prompt: parsed.data.prompt.slice(0, 200),
+            model: choice.displayName,
+          },
+          'preview_spec_truncated',
+        );
+        return reply.code(422).send({
+          error: 'spec_too_large',
+          detail:
+            'The spec for this prompt exceeded the maximum response size. Split it into fewer tools or describe one capability per prompt.',
+        });
+      }
       app.log.error(err);
       return reply.code(500).send({ error: 'preview_failed', detail: (err as Error).message });
     }
diff --git a/packages/llm/src/index.ts b/packages/llm/src/index.ts
index 0711440..9872e3d 100644
--- a/packages/llm/src/index.ts
+++ b/packages/llm/src/index.ts
@@ -88,43 +88,50 @@ export interface ModelChoice {
  * visible quality/speed jump *is* the upgrade pitch.
  *
  * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
- * Claude Sonnet 4.6 ~80 tok/s. A spec is small (<= ~10 tools with short
- * descriptions, ~1.5–2.5k output tokens in practice) so we cap maxTokens at
- * 4096 — well under the model's hard ceiling and tight enough that even
- * Sonnet finishes inside 60s in the worst case (4096 / 80 ≈ 51s). The
- * timeouts above 90s buy headroom for cold starts / slow API responses
- * while staying clear of Cloudflare's 100s edge cap.
+ * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
+ * number was from the pre-4.6 generation).
+ *
+ * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
+ * prompts ("research assistant with web search, papers, wikipedia, …")
+ * routinely produce 6–8k tokens of deeply-nested tool schemas. Claude tiers
+ * cap at 8192 — the model's effective ceiling for these prompts — and GLM at
+ * 4096. On Anthropic responses we detect `stop_reason === 'max_tokens'` and
+ * surface "spec too large" instead of a truncated-JSON parse failure.
+ *
+ * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Worst cases:
+ * Sonnet 8192 / 130 ≈ 63s and glm-4-plus 4096 / 58 ≈ 71s, leaving ~24–32s
+ * of headroom for cold starts and TCP/TLS setup.
  */
 const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
   hobby: {
     provider: 'glm',
     model: 'glm-4-plus',
-    maxTokens: 3500,
-    timeoutMs: 90_000,
+    maxTokens: 4096,
+    timeoutMs: 95_000,
     displayName: 'Open-tier AI',
     displayBadge: 'open-tier',
   },
   pro: {
     provider: 'anthropic',
     model: 'claude-haiku-4-5-20251001',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
     displayName: 'Claude Haiku 4.5',
     displayBadge: 'claude-haiku',
   },
   team: {
     provider: 'anthropic',
     model: 'claude-sonnet-4-6',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
     displayName: 'Claude Sonnet 4.6',
     displayBadge: 'claude-sonnet',
   },
   enterprise: {
     provider: 'anthropic',
     model: 'claude-sonnet-4-6',
-    maxTokens: 4096,
-    timeoutMs: 90_000,
+    maxTokens: 8192,
+    timeoutMs: 95_000,
     displayName: 'Claude Sonnet 4.6',
     displayBadge: 'claude-sonnet',
   },
@@ -266,6 +273,18 @@ async function generateWithAnthropic(
     .filter((b): b is { type: 'text'; text: string } => b.type === 'text')
     .map((b) => b.text)
     .join('');
+  // Detect token-limit truncation BEFORE attempting to parse. Generation
+  // stops mid-output when it hits max_tokens, so the closing `}` of a deeply
+  // nested tool schema never gets emitted and JSON.parse blows up with an
+  // unterminated-string error that's indistinguishable from a refusal at
+  // the catch site. With stop_reason in hand we can surface a precise
+  // "spec too large" message and tell the user to split / simplify the
+  // prompt instead of letting them keep retrying the same one.
+  if (response.stop_reason === 'max_tokens') {
+    throw new SpecTruncatedError(
+      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
+    );
+  }
   const json = extractJson(text);
   const parsed = GeneratorSpec.safeParse(json);
   if (!parsed.success) {
@@ -341,6 +360,10 @@ export class SpecTimeoutError extends Error {
   override readonly name = 'SpecTimeoutError';
 }
 
+export class SpecTruncatedError extends Error {
+  override readonly name = 'SpecTruncatedError';
+}
+
 function extractJson(text: string): unknown {
   const trimmed = text.trim();
   const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);