buildmymcpserver/packages/llm/src/index.ts
Marco Sadjadi 092290bb38
All checks were successful
Deploy to Production / deploy (push) Successful in 1m21s
fix(preview/stream): await onSpec/onError handlers
The llm package called the user-supplied onSpec/onError handlers
without awaiting them. In the /preview/stream route onSpec is async
(it does `await cacheSpec(...)` then writes the SSE `spec` event), so
the api handler's `await streamSpecFromAnthropic(...)` returned BEFORE
the terminal event had been written. The route's finally block then
ran `reply.raw.end()`, the queued `send('spec', ...)` hit a closed
stream and silently no-op'd, and the browser saw zero terminal
events — frontend ran into the "Spec generation failed." fallback
even though Anthropic had delivered a perfectly valid spec.

Verified against prod log: req-8 ran 66s with 200 and produced no
preview_spec_* log line, which is exactly the success-but-event-lost
signature.

Fix:
- StreamHandlers.onSpec / onError typed as Promise<void> | void
- Both call sites in streamSpecFromAnthropic now `await` them
- /preview/stream sets `resolved = true` at the END of each handler
  (after the SSE write completes) so the post-stream "unresolved"
  fallback only fires on a genuine programming bug
- Added preview_spec_ready info log on the happy path so future
  diagnosis doesn't have to infer success from the absence of error
  logs
2026-05-28 22:00:03 +02:00

519 lines
19 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import Anthropic from '@anthropic-ai/sdk';
import { GeneratorSpec, type GeneratorSpec as GeneratorSpecT } from '@bmm/types';
/**
 * System prompt shared by every generation path in this file: it is sent as
 * the Anthropic `system` field (sync and streaming) and as the GLM `system`
 * message. It instructs the model to answer with one bare JSON object in the
 * shape shown below and lists the hard size limits.
 *
 * The text is runtime behaviour, not documentation: any edit changes what the
 * models are asked to produce.
 */
export const SYSTEM_PROMPT = `You generate production-grade MCP server specifications as STRICT JSON.
Output ONE JSON object (no markdown, no prose, no code fences) with this exact shape:
{
"name": "human-readable server name (max 80 chars)",
"description": "one sentence",
"tools": [
{
"name": "snake_case_tool_name",
"description": "single sentence, max 100 chars",
"inputSchema": {
"param_name": { "type": "string|number|boolean|array|object", "description": "short", "required": true }
},
"implementation": "async TS body. The tool's validated arguments arrive in the variable named EXACTLY 'args' (e.g. args.location, args.query). Return { content: [{ type:'text', text:'...' }] }. Secrets via process.env. HTTP via globalThis.fetch with AbortSignal.timeout(10000). Wrap external calls in try/catch and return { content:[{type:'text',text:'Error: ...'}], isError:true } on failure. No eval/Function/child_process. No import statements."
}
],
"resources": [],
"prompts": [],
"requiredSecrets": ["UPPER_SNAKE_CASE"],
"scopes": ["mcp:read"],
"dependencies": {}
}
Hard limits (the output gets truncated past these — write tight):
- At most 6 tools. Combine related capabilities into one tool with a "mode" param rather than splitting.
- Each implementation body: at most 40 lines of code, no defensive overengineering, no comments.
- Each description / inputSchema description: one short clause, no examples.
- Parameterised SQL only (pg with $1 placeholders). No prose, no JSON examples in code.
Return JSON only. No preamble, no closing remark.`;
// Regex blacklist — explicitly NOT a security boundary, just an early-warning
// for obviously-dangerous LLM output. The real defence is the Docker
// hardening in apps/generator/src/lib/deploy.ts (--cap-drop=ALL etc.). A
// determined attacker can bypass any of these with string concatenation
// (`'chi'+'ld_process'`) or alternate APIs — that's why container isolation
// has to hold even when this fails.
//
// Exported so the publish-time template scan in apps/api/src/routes/templates
// can reuse it instead of maintaining a parallel list that drifts. (Zc-001.)
//
// None of the patterns carries the `g` or `y` flag, so `RegExp.prototype.test`
// keeps no `lastIndex` state between calls and the shared instances are safe
// to reuse across scans.
export const SHARED_BANNED_PATTERNS: readonly RegExp[] = [
/\beval\s*\(/, // direct eval( call
/\bnew\s+Function\s*\(/, // new Function( constructor call
/\bFunction\s*\(\s*['"`]/, // Function('...') without `new`
/\brequire\s*\(\s*['"]child_process['"]/, // require('child_process')
/\bchild_process\b/, // any other mention of the child_process module name
/\bprocess\.binding\b/, // process.binding access
/\bprocess\.dlopen\b/, // process.dlopen access
/\.constructor\s*\.\s*constructor\b/, // [].constructor.constructor('...')
/\b_load\s*\(/, // call to anything named _load — presumably aimed at Module._load; confirm
/\bvm\.runIn(This|New)Context\b/, // vm.runInThisContext / vm.runInNewContext
/globalThis\s*\[\s*['"`]/, // globalThis['Fun'+'ction']
// The remaining entries match prompt-injection phrasing, case-insensitively.
/ignore\s+previous\s+instructions/i,
/disregard\s+(the\s+)?(above|previous)/i,
/system\s+prompt\s+override/i,
];
// ──────────────────────────────────────────────────────────────────────────
// Plan-aware model selection
// ──────────────────────────────────────────────────────────────────────────
/** Subscription tier. Used as the key of the per-plan model tables in this file. */
export type Plan = 'hobby' | 'pro' | 'team' | 'enterprise';
/**
 * What a model is being selected for.
 * NOTE(review): not referenced anywhere else in the visible part of this file —
 * presumably consumed by callers; confirm before removing.
 */
export type Purpose = 'preview' | 'build';
/** LLM backend a request is routed to. */
export type Provider = 'anthropic' | 'glm';
/** Machine-readable badge identifier paired with `ModelChoice.displayName`. */
export type DisplayBadge = 'open-tier' | 'claude-haiku' | 'claude-sonnet' | 'claude-opus';
/**
 * One fully-resolved model configuration for a plan: which backend to call,
 * which model, and the token/time budget to give it.
 */
export interface ModelChoice {
/** Backend that serves the request. */
provider: Provider;
/** Provider-specific model identifier (e.g. 'glm-4-plus', 'claude-sonnet-4-6'). */
model: string;
/** Output-token ceiling for one generation. */
maxTokens: number;
/** Request time budget in milliseconds. */
timeoutMs: number;
/** User-facing model name shown in the wizard + previews. */
displayName: string;
/** Badge identifier for the model — presumably used by the UI for styling; confirm. */
displayBadge: DisplayBadge;
}
/** Shared preview time budget: 95s, just under Cloudflare's ~100s edge cap. */
const PREVIEW_TIMEOUT_MS = 95_000;

/** Team and Enterprise previews use the same Sonnet configuration. */
const SONNET_PREVIEW: ModelChoice = {
  provider: 'anthropic',
  model: 'claude-sonnet-4-6',
  maxTokens: 12288,
  timeoutMs: PREVIEW_TIMEOUT_MS,
  displayName: 'Claude Sonnet 4.6',
  displayBadge: 'claude-sonnet',
};

/**
 * Per-plan model used for previews.
 *
 * Preview runs synchronously inside an HTTP request behind Cloudflare's
 * ~100s edge cap, so each tier's (model + max_tokens + timeout) has to fit
 * inside that window. Hobby uses GLM as the cost lever; paid tiers escalate
 * to Claude — the visible quality/speed jump *is* the upgrade pitch.
 *
 * Measured token rates: glm-4-plus ~58 tok/s, Claude Haiku 4.5 ~200 tok/s,
 * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
 * number was from the pre-4.6 generation).
 *
 * Token budget: a *small* spec is ~1.5-2.5k output tokens, but ambitious
 * prompts ("research assistant with web search, papers, wikipedia, ...")
 * routinely produce 6-8k tokens of deeply-nested tool schemas. The
 * generation code checks `stop_reason === 'max_tokens'` so that hitting the
 * ceiling surfaces as a "spec too large" error instead of truncated JSON
 * blowing up at the zod boundary.
 *
 * Configured ceilings: hobby 4096, pro 8192, team/enterprise 12288.
 *
 * NOTE(review): the original rationale was written for an 8192 cap (Sonnet
 * at ~130 tok/s finishes 8192 tokens in ~63s, leaving ~30s headroom for cold
 * starts and TCP/TLS setup). At the 12288 now configured for team/enterprise
 * the same rate gives ~95s, i.e. no headroom under the 95s timeout — confirm
 * this is intended.
 */
const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
  hobby: {
    provider: 'glm',
    model: 'glm-4-plus',
    maxTokens: 4096,
    timeoutMs: PREVIEW_TIMEOUT_MS,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  pro: {
    provider: 'anthropic',
    model: 'claude-haiku-4-5-20251001',
    maxTokens: 8192,
    timeoutMs: PREVIEW_TIMEOUT_MS,
    displayName: 'Claude Haiku 4.5',
    displayBadge: 'claude-haiku',
  },
  // Spread so each plan keeps its own object, exactly like separate literals.
  team: { ...SONNET_PREVIEW },
  enterprise: { ...SONNET_PREVIEW },
};
/** GLM configuration shared by every non-Enterprise build. */
const OPEN_TIER_BUILD: ModelChoice = {
  provider: 'glm',
  model: 'glm-4.5',
  maxTokens: 8192,
  timeoutMs: 180_000,
  displayName: 'Open-tier AI',
  displayBadge: 'open-tier',
};

/**
 * Per-plan model used by the build worker.
 *
 * The build worker runs async via BullMQ, so no proxy timeout applies. With
 * the 24h preview cache TTL, cache misses are rare; GLM as the default keeps
 * that rare path cheap, while Enterprise gets Opus as a premium-quality
 * promise (with a 10-minute budget instead of 3).
 */
const BUILD_MODELS: Record<Plan, ModelChoice> = {
  // Spread so each plan keeps its own object, exactly like separate literals.
  hobby: { ...OPEN_TIER_BUILD },
  pro: { ...OPEN_TIER_BUILD },
  team: { ...OPEN_TIER_BUILD },
  enterprise: {
    provider: 'anthropic',
    model: 'claude-opus-4-7',
    maxTokens: 8192,
    timeoutMs: 600_000,
    displayName: 'Claude Opus 4.7',
    displayBadge: 'claude-opus',
  },
};
/**
 * Look up the preview model configuration for a plan.
 *
 * @param plan - Subscription tier of the requesting account.
 * @returns The entry stored for that plan in the preview model table (the
 *   table's own object, not a copy).
 */
export function pickPreviewModel(plan: Plan): ModelChoice {
  const choice = PREVIEW_MODELS[plan];
  return choice;
}
/**
 * Look up the build-worker model configuration for a plan.
 *
 * @param plan - Subscription tier of the account that owns the build.
 * @returns The entry stored for that plan in the build model table (the
 *   table's own object, not a copy).
 */
export function pickBuildModel(plan: Plan): ModelChoice {
  const choice = BUILD_MODELS[plan];
  return choice;
}
// ──────────────────────────────────────────────────────────────────────────
// Generation API
// ──────────────────────────────────────────────────────────────────────────
/** Outcome of one successful spec generation. */
export interface GenerationResult {
/** The generated spec, already schema-validated (or the built-in mock spec). */
spec: GeneratorSpecT;
/** Which backend produced `spec`; 'mock' means no API key was supplied and no model was called. */
source: 'claude' | 'glm' | 'mock';
}
/**
 * Options for `generateSpec`. Every field is optional; when the key for the
 * selected provider is missing, `generateSpec` returns the mock spec instead
 * of calling a model.
 */
export interface GenerateOptions {
/** 'anthropic' (default) or 'glm'. */
provider?: Provider;
/** Anthropic API key — required if provider === 'anthropic'. */
apiKey?: string;
/** Zhipu (GLM) API key — required if provider === 'glm'. */
glmApiKey?: string;
/** Model identifier. Defaults to 'claude-opus-4-7' for Anthropic and 'glm-4-plus' for GLM. */
model?: string;
/** Output-token ceiling. Defaults to 8192 for Anthropic and 4096 for GLM. */
maxTokens?: number;
/** Per-attempt request timeout in ms. */
timeoutMs?: number;
/** SDK retry count. Anthropic only. */
maxRetries?: number;
}
/**
 * Generate an MCP server spec for `prompt` with the selected provider.
 *
 * When the API key for the selected provider is missing, no model is called
 * and the built-in mock spec is returned with `source: 'mock'`.
 *
 * @param prompt - The user's description of the server to generate.
 * @param opts - Provider, credentials and budget overrides; see `GenerateOptions`.
 * @returns The validated spec plus the backend that produced it.
 * @throws Whatever the provider-specific generator throws (timeout,
 *   truncation, validation and banned-pattern errors, or transport errors).
 */
export async function generateSpec(
  prompt: string,
  opts: GenerateOptions = {},
): Promise<GenerationResult> {
  const useGlm = (opts.provider ?? 'anthropic') === 'glm';
  const key = useGlm ? opts.glmApiKey : opts.apiKey;

  // No credentials for the chosen backend: fall back to the mock spec.
  if (!key) {
    return { spec: mockSpec(prompt), source: 'mock' };
  }

  if (useGlm) {
    return generateWithGlm(prompt, {
      apiKey: key,
      model: opts.model ?? 'glm-4-plus',
      maxTokens: opts.maxTokens ?? 4096,
      timeoutMs: opts.timeoutMs,
    });
  }

  return generateWithAnthropic(prompt, {
    apiKey: key,
    model: opts.model ?? 'claude-opus-4-7',
    maxTokens: opts.maxTokens ?? 8192,
    timeoutMs: opts.timeoutMs,
    maxRetries: opts.maxRetries,
  });
}
/**
 * Generate a spec with a single (non-streaming) Anthropic Messages call.
 *
 * @param prompt - The user's description, sent as the only user message.
 * @param opts - API key, model, output-token ceiling and optional per-request
 *   timeout / retry overrides (only forwarded to the SDK when defined).
 * @returns The validated spec with `source: 'claude'`.
 * @throws SpecTimeoutError when the SDK reports a connection timeout.
 * @throws SpecTruncatedError when the model stopped because it hit `max_tokens`.
 * @throws SpecValidationError when the output is empty, not JSON, or fails the schema.
 * @throws BannedPatternError when the spec matches a banned pattern.
 */
async function generateWithAnthropic(
  prompt: string,
  opts: {
    apiKey: string;
    model: string;
    maxTokens: number;
    timeoutMs?: number;
    maxRetries?: number;
  },
): Promise<GenerationResult> {
  const { apiKey, model, maxTokens, timeoutMs, maxRetries } = opts;
  const anthropic = new Anthropic({ apiKey });

  // Only set the overrides that were supplied so the SDK defaults stay in
  // effect for the rest.
  const perRequest: { timeout?: number; maxRetries?: number } = {};
  if (timeoutMs !== undefined) perRequest.timeout = timeoutMs;
  if (maxRetries !== undefined) perRequest.maxRetries = maxRetries;

  const requestMessage = async () => {
    try {
      return await anthropic.messages.create(
        {
          model,
          max_tokens: maxTokens,
          system: SYSTEM_PROMPT,
          messages: [{ role: 'user', content: prompt }],
        },
        perRequest,
      );
    } catch (err: unknown) {
      // Translate the SDK's timeout into our typed error; pass the rest through.
      if (err instanceof Anthropic.APIConnectionTimeoutError) {
        throw new SpecTimeoutError('spec generation exceeded the time budget');
      }
      throw err;
    }
  };
  const message = await requestMessage();

  // Concatenate every text block of the reply, ignoring non-text blocks.
  let rawText = '';
  for (const block of message.content) {
    if (block.type === 'text') rawText += block.text;
  }

  // Check for token-limit truncation BEFORE parsing. A reply cut off at
  // max_tokens is missing its closing braces, so JSON.parse would fail with
  // an error that cannot be told apart from a refusal; stop_reason lets us
  // report a precise "spec too large" condition instead.
  if (message.stop_reason === 'max_tokens') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${maxTokens}) before finishing the spec`,
    );
  }

  const candidate = extractJson(rawText);
  const validation = GeneratorSpec.safeParse(candidate);
  if (!validation.success) {
    // Attach a short, whitespace-collapsed excerpt of the raw output so logs
    // show whether this was non-JSON, a refusal, or a near-miss schema.
    const preview = rawText.slice(0, 400).replace(/\s+/g, ' ');
    throw new SpecValidationError(`${validation.error.message} :: raw="${preview}"`);
  }

  scanForInjection(validation.data);
  return { spec: validation.data, source: 'claude' };
}
// ──────────────────────────────────────────────────────────────────────────
// Streaming generation (Anthropic only)
// ──────────────────────────────────────────────────────────────────────────
/**
 * Callbacks for `streamSpecFromAnthropic`. `onText` is invoked once per text
 * delta; `onSpec` / `onError` are the terminal callbacks and are awaited by
 * the streaming function before it returns.
 */
export interface StreamHandlers {
/**
 * Called for each text delta emitted by the model. Sync — must not throw.
 * It is invoked from the stream's 'text' listener and its return value is
 * not awaited.
 */
onText: (text: string) => void;
/**
 * Called once when the stream completes successfully with the final spec.
 * MAY return a Promise — the caller awaits it before considering the
 * stream finished. This is critical for SSE callers that need to write
 * a final event and end the response: if the handler's asynchronous work
 * were not awaited, the response could be ended before the event is
 * actually written, leaving the client with no terminal frame.
 */
onSpec: (result: GenerationResult) => Promise<void> | void;
/**
 * Called once on any terminal error (timeout, truncation, validation).
 * Same async contract as onSpec.
 */
onError: (err: Error) => Promise<void> | void;
}
/**
 * Stream a spec from Anthropic, piping text deltas to a handler and finally
 * surfacing the parsed/validated spec or the relevant typed error.
 *
 * Why streaming for /preview: Cloudflare's edge timeout is ~100s on the Free
 * tier and our previous sync call could approach that for ambitious prompts.
 * With streaming the TCP connection writes bytes from the first model token,
 * which keeps CF (and nginx) from cutting us off — runtime is bounded only by
 * the model itself and our own AbortController, not by CF.
 *
 * Outcomes are reported through `handlers`, not through the returned promise:
 * every error raised while streaming, parsing or validating is passed to
 * `handlers.onError`. Because `onSpec` runs inside the same `try`, a
 * rejection from `onSpec` is forwarded to `onError` as well. The returned
 * promise rejects only if `onError` itself throws or rejects.
 *
 * @param prompt - The user's description, sent as the only user message.
 * @param opts - API key, model and output-token ceiling. `signal`, when
 *   given, aborts the stream; a signal that is already aborted on entry
 *   aborts it immediately.
 * @param handlers - Delta and terminal callbacks; see `StreamHandlers`.
 */
export async function streamSpecFromAnthropic(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; signal?: AbortSignal },
  handlers: StreamHandlers,
): Promise<void> {
  const client = new Anthropic({ apiKey: opts.apiKey });
  const { signal } = opts;
  let accumulated = '';
  let detachAbortListener: (() => void) | undefined;
  try {
    const stream = client.messages.stream({
      model: opts.model,
      max_tokens: opts.maxTokens,
      system: SYSTEM_PROMPT,
      messages: [{ role: 'user', content: prompt }],
    });
    if (signal) {
      const abortStream = (): void => stream.abort();
      if (signal.aborted) {
        // An already-aborted signal never dispatches 'abort' again, so a
        // listener alone would let the stream run to completion.
        abortStream();
      } else {
        signal.addEventListener('abort', abortStream, { once: true });
        detachAbortListener = () => signal.removeEventListener('abort', abortStream);
      }
    }
    stream.on('text', (delta) => {
      accumulated += delta;
      handlers.onText(delta);
    });
    const final = await stream.finalMessage();
    // Same truncation-before-parse rule as the non-streaming path: a reply
    // cut off at max_tokens is reported as "too large", not as bad JSON.
    if (final.stop_reason === 'max_tokens') {
      throw new SpecTruncatedError(
        `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
      );
    }
    const json = extractJson(accumulated);
    const parsed = GeneratorSpec.safeParse(json);
    if (!parsed.success) {
      const preview = accumulated.slice(0, 400).replace(/\s+/g, ' ');
      throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
    }
    scanForInjection(parsed.data);
    // AWAITED on purpose — the SSE caller writes the terminal 'spec' event
    // inside this handler and we must not return (and thereby allow the
    // caller to .end() the response) until that write has completed.
    await handlers.onSpec({ spec: parsed.data, source: 'claude' });
  } catch (err) {
    const terminal =
      err instanceof Anthropic.APIConnectionTimeoutError
        ? new SpecTimeoutError('spec generation exceeded the time budget')
        : err instanceof Error
          ? err
          : new Error(String(err));
    // Awaited for the same reason as onSpec: the terminal event must be
    // written before the caller regains control.
    await handlers.onError(terminal);
  } finally {
    // `once: true` only removes the listener when it fires; drop it here so a
    // long-lived signal does not keep a reference to a finished stream.
    detachAbortListener?.();
  }
}
/** Zhipu (GLM) chat-completions endpoint used by `generateWithGlm`. */
const GLM_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';

/**
 * Generate a spec with a single GLM chat-completions request.
 *
 * @param prompt - The user's description, sent as the user message after the
 *   shared system prompt.
 * @param opts - API key, model, output-token ceiling and an optional time
 *   budget in ms. The budget covers the whole exchange, including reading the
 *   response body; a falsy `timeoutMs` disables it.
 * @returns The validated spec with `source: 'glm'`.
 * @throws SpecTimeoutError when the time budget expires.
 * @throws SpecTruncatedError when the model stopped with `finish_reason: 'length'`.
 * @throws SpecValidationError when the reply is empty, not JSON, or fails the schema.
 * @throws BannedPatternError when the spec matches a banned pattern.
 * @throws Error `glm_api_<status>: ...` for non-2xx responses.
 */
async function generateWithGlm(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; timeoutMs?: number },
): Promise<GenerationResult> {
  type GlmCompletion = {
    choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
  };

  const controller = new AbortController();
  const timer = opts.timeoutMs ? setTimeout(() => controller.abort(), opts.timeoutMs) : null;
  let data: GlmCompletion;
  try {
    const res = await fetch(GLM_ENDPOINT, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${opts.apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: opts.model,
        max_tokens: opts.maxTokens,
        messages: [
          { role: 'system', content: SYSTEM_PROMPT },
          { role: 'user', content: prompt },
        ],
      }),
      signal: controller.signal,
    });
    if (!res.ok) {
      const body = await res.text().catch(() => '');
      throw new Error(`glm_api_${res.status}: ${body.slice(0, 200)}`);
    }
    // Read the body while the abort timer is still armed: fetch() resolves as
    // soon as headers arrive, so clearing the timer before this point would
    // leave the body read without any time limit.
    data = (await res.json()) as GlmCompletion;
  } catch (err) {
    if ((err as { name?: string } | null)?.name === 'AbortError') {
      throw new SpecTimeoutError('glm spec generation exceeded the time budget');
    }
    throw err;
  } finally {
    if (timer) clearTimeout(timer);
  }

  const choice = data.choices?.[0];
  // Mirror the Anthropic path: report token-limit truncation explicitly
  // instead of letting the cut-off JSON fail with an opaque parse error.
  if (choice?.finish_reason === 'length') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
    );
  }
  const content = choice?.message?.content;
  if (!content) throw new SpecValidationError('glm_empty_response');

  const json = extractJson(content);
  const parsed = GeneratorSpec.safeParse(json);
  if (!parsed.success) {
    // Same raw-output excerpt as the Anthropic paths, so logs show what the
    // model actually returned.
    const preview = content.slice(0, 400).replace(/\s+/g, ' ');
    throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
  }
  scanForInjection(parsed.data);
  return { spec: parsed.data, source: 'glm' };
}
/**
 * Thrown when model output cannot be turned into a valid spec: the output is
 * empty, is not parseable JSON, or fails the `GeneratorSpec` schema.
 */
export class SpecValidationError extends Error {
override readonly name = 'SpecValidationError';
}
/**
 * Thrown by `scanForInjection` when a scanned string matches one of the
 * banned patterns; the message carries the source of the matching regex.
 */
export class BannedPatternError extends Error {
override readonly name = 'BannedPatternError';
}
/**
 * Thrown when a generation request runs out of time: the Anthropic SDK
 * reported a connection timeout, or the GLM request was aborted by its timer.
 */
export class SpecTimeoutError extends Error {
override readonly name = 'SpecTimeoutError';
}
/**
 * Thrown when the model stopped because it reached its output-token ceiling
 * (`stop_reason === 'max_tokens'`), i.e. the spec is too large to finish.
 */
export class SpecTruncatedError extends Error {
override readonly name = 'SpecTruncatedError';
}
/**
 * Parse the JSON value contained in raw model output.
 *
 * The whole (trimmed) output is parsed first. Only if that fails is a
 * Markdown code fence looked for, in which case the content of the first
 * fence is parsed instead. Parsing the full text first matters because a
 * perfectly valid JSON reply may contain ``` inside its string values (for
 * example a tool implementation that emits Markdown); searching for a fence
 * up front would cut a fragment out of the middle of that JSON and reject it.
 *
 * @param text - Raw text returned by the model.
 * @returns The parsed JSON value, not yet schema-validated.
 * @throws SpecValidationError `empty_generation_output` when there is nothing
 *   to parse, or `generation_not_json: <parser message>` when parsing fails.
 */
function extractJson(text: string): unknown {
  const trimmed = text.trim();
  if (!trimmed) throw new SpecValidationError('empty_generation_output');

  let directFailure: unknown;
  try {
    return JSON.parse(trimmed);
  } catch (e) {
    directFailure = e;
  }

  // Fallback: the model wrapped its answer in a code fence despite the prompt.
  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (!fenced) {
    throw new SpecValidationError(`generation_not_json: ${(directFailure as Error).message}`);
  }
  const body = fenced[1];
  if (!body) throw new SpecValidationError('empty_generation_output');
  try {
    return JSON.parse(body);
  } catch (e) {
    throw new SpecValidationError(`generation_not_json: ${(e as Error).message}`);
  }
}
/**
 * Reject a spec whose tool text matches any shared banned pattern.
 *
 * Exported so other layers (the spec-edit merge in apps/api) can re-scan a
 * user-edited spec against the same pattern list instead of keeping their
 * own copy.
 *
 * For every tool the scan covers its name, description, implementation and
 * each string-valued `description` found on an inputSchema entry. Downstream
 * AI clients (Claude Desktop, Cursor) read tool names and parameter
 * descriptions verbatim, so an injection there can hijack the user's AI
 * session, not just the runtime code.
 *
 * @param spec - The spec to scan; it is not modified.
 * @throws BannedPatternError on the first match, naming the pattern source.
 */
export function scanForInjection(spec: GeneratorSpecT): void {
  for (const tool of spec.tools) {
    const candidates: string[] = [tool.name, tool.description, tool.implementation];

    for (const param of Object.values(tool.inputSchema)) {
      if (!param || typeof param !== 'object' || !('description' in param)) continue;
      const paramDescription = (param as { description?: unknown }).description;
      if (typeof paramDescription === 'string') candidates.push(paramDescription);
    }

    for (const candidate of candidates) {
      const offending = SHARED_BANNED_PATTERNS.find((pattern) => pattern.test(candidate));
      if (offending !== undefined) {
        throw new BannedPatternError(`banned_pattern_detected: ${offending.source}`);
      }
    }
  }
}
/**
 * Build the fixed placeholder spec returned when no LLM API key is configured.
 *
 * The spec describes an "Echo MCP" server with two tools (`echo` and `now`);
 * the first 200 characters of the prompt are embedded in its description.
 *
 * @param prompt - The user's prompt; only its first 200 characters are used.
 * @returns A freshly built spec object on every call.
 */
export function mockSpec(prompt: string): GeneratorSpecT {
  const promptExcerpt = prompt.slice(0, 200);

  const echoTool: GeneratorSpecT['tools'][number] = {
    name: 'echo',
    description: 'Echoes the input string back to the caller.',
    inputSchema: {
      message: { type: 'string', description: 'Message to echo back', required: true },
    },
    implementation: [
      "const msg = String(args.message ?? '');",
      "return { content: [{ type: 'text', text: `echo: ${msg}` }] };",
    ].join('\n'),
  };

  const nowTool: GeneratorSpecT['tools'][number] = {
    name: 'now',
    description: 'Returns the current server UTC timestamp.',
    inputSchema: {},
    implementation: "return { content: [{ type: 'text', text: new Date().toISOString() }] };",
  };

  return {
    name: 'Echo MCP',
    description: 'Mock server (no LLM key). Prompt was: ' + promptExcerpt,
    tools: [echoTool, nowTool],
    resources: [],
    prompts: [],
    requiredSecrets: [],
    scopes: ['mcp:read'],
    dependencies: {},
  };
}