import Anthropic from '@anthropic-ai/sdk';
import { GeneratorSpec, type GeneratorSpec as GeneratorSpecT } from '@bmm/types';

/**
 * System prompt sent verbatim to every provider (Anthropic `system` field,
 * GLM `system` message).
 *
 * The literal is split across source lines with template-literal line
 * continuations (a trailing backslash contributes no characters), so the
 * runtime value is one single-line string with single spaces between clauses.
 *
 * NOTE(review): the single-line shape may be an artefact of whitespace
 * collapsing; restoring real newlines would change the model input, so the
 * value is deliberately left untouched here — confirm against the deployed
 * prompt before reflowing it.
 */
export const SYSTEM_PROMPT = `You generate production-grade MCP server specifications as STRICT JSON. \
Output ONE JSON object (no markdown, no prose, no code fences) with this exact shape: \
{ "name": "human-readable server name (max 80 chars)", "description": "one sentence", \
"tools": [ { "name": "snake_case_tool_name", "description": "single sentence, max 100 chars", \
"inputSchema": { "param_name": { "type": "string|number|boolean|array|object", "description": "short", "required": true } }, \
"implementation": "async TS body. The tool's validated arguments arrive in the variable named EXACTLY 'args' (e.g. args.location, args.query). \
Return { content: [{ type:'text', text:'...' }] }. Secrets via process.env. \
HTTP via globalThis.fetch with AbortSignal.timeout(10000). \
Wrap external calls in try/catch and return { content:[{type:'text',text:'Error: ...'}], isError:true } on failure. \
No eval/Function/child_process. No import statements." } ], \
"resources": [], "prompts": [], "requiredSecrets": ["UPPER_SNAKE_CASE"], "scopes": ["mcp:read"], "dependencies": {} } \
Hard limits (the output gets truncated past these — write tight): \
- At most 6 tools. Combine related capabilities into one tool with a "mode" param rather than splitting. \
- Each implementation body: at most 40 lines of code, no defensive overengineering, no comments. \
- Each description / inputSchema description: one short clause, no examples. \
- Parameterised SQL only (pg with $1 placeholders). No prose, no JSON examples in code. \
Return JSON only. No preamble, no closing remark.`;

// Regex blacklist — explicitly NOT a security boundary, just an early-warning
// for obviously-dangerous LLM output. The real defence is the Docker
// hardening in apps/generator/src/lib/deploy.ts (--cap-drop=ALL etc.). A
// determined attacker can bypass any of these with string concatenation
// (`'chi'+'ld_process'`) or alternate APIs — that's why container isolation
// has to hold even when this fails.
//
// Exported so the publish-time template scan in apps/api/src/routes/templates
// can reuse it instead of maintaining a parallel list that drifts. (Zc-001.)
//
// None of the patterns carries the `g`/`y` flag, so `RegExp#test` is stateless
// and the shared instances are safe to reuse across calls.
export const SHARED_BANNED_PATTERNS: readonly RegExp[] = [
  /\beval\s*\(/,
  /\bnew\s+Function\s*\(/,
  /\bFunction\s*\(\s*['"`]/, // Function('...') without `new`
  /\brequire\s*\(\s*['"]child_process['"]/,
  /\bchild_process\b/,
  /\bprocess\.binding\b/,
  /\bprocess\.dlopen\b/,
  /\.constructor\s*\.\s*constructor\b/, // [].constructor.constructor('...')
  /\b_load\s*\(/,
  /\bvm\.runIn(This|New)Context\b/,
  /globalThis\s*\[\s*['"`]/, // globalThis['Fun'+'ction']
  /ignore\s+previous\s+instructions/i,
  /disregard\s+(the\s+)?(above|previous)/i,
  /system\s+prompt\s+override/i,
];

// ──────────────────────────────────────────────────────────────────────────
// Plan-aware model selection
// ──────────────────────────────────────────────────────────────────────────

export type Plan = 'hobby' | 'pro' | 'team' | 'enterprise';
export type Purpose = 'preview' | 'build';
export type Provider = 'anthropic' | 'glm';
export type DisplayBadge = 'open-tier' | 'claude-haiku' | 'claude-sonnet' | 'claude-opus';

/** One fully-resolved model configuration for a given plan and purpose. */
export interface ModelChoice {
  provider: Provider;
  model: string;
  maxTokens: number;
  timeoutMs: number;
  /** User-facing model name shown in the wizard + previews. */
  displayName: string;
  displayBadge: DisplayBadge;
}

/**
 * Preview runs synchronously inside an HTTP request behind Cloudflare's
 * ~100s edge cap. Each tier's (model + max_tokens + timeout) is bounded to
 * fit. Hobby uses GLM as the cost lever; paid tiers escalate to Claude — the
 * visible quality/speed jump *is* the upgrade pitch.
 *
 * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
 * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
 * number was from the pre-4.6 generation).
 *
 * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
 * prompts ("research assistant with web search, papers, wikipedia, …")
 * routinely produce 6–8k tokens of deeply-nested tool schemas. We cap at
 * 8192 — the model's effective ceiling for these prompts — and detect the
 * `stop_reason === 'max_tokens'` case to surface a "spec too large" message
 * instead of letting the truncated JSON blow up at the zod boundary.
 *
 * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Sonnet at
 * 130 tok/s finishes 8192 tokens in ~63s, giving ~30s headroom for cold
 * starts and TCP/TLS setup.
 *
 * NOTE(review): the `team` and `enterprise` entries below use
 * `maxTokens: 12288`, which contradicts the "cap at 8192" statement above;
 * at ~130 tok/s that is ~94s, i.e. essentially the whole 95s timeout.
 * Either the values or this comment are stale — confirm which is intended.
 */
const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
  hobby: {
    provider: 'glm',
    model: 'glm-4-plus',
    maxTokens: 4096,
    timeoutMs: 95_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  pro: {
    provider: 'anthropic',
    model: 'claude-haiku-4-5-20251001',
    maxTokens: 8192,
    timeoutMs: 95_000,
    displayName: 'Claude Haiku 4.5',
    displayBadge: 'claude-haiku',
  },
  team: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
    maxTokens: 12288,
    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
  enterprise: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
    maxTokens: 12288,
    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
};

/**
 * Build worker runs async via BullMQ — no proxy timeout. With the 24h preview
 * cache TTL cache-misses are rare, so GLM as the default keeps that rare path
 * cheap; Enterprise gets Opus as a premium-quality promise.
 */
const BUILD_MODELS: Record<Plan, ModelChoice> = {
  hobby: {
    provider: 'glm',
    model: 'glm-4.5',
    maxTokens: 8192,
    timeoutMs: 180_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  pro: {
    provider: 'glm',
    model: 'glm-4.5',
    maxTokens: 8192,
    timeoutMs: 180_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  team: {
    provider: 'glm',
    model: 'glm-4.5',
    maxTokens: 8192,
    timeoutMs: 180_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  enterprise: {
    provider: 'anthropic',
    model: 'claude-opus-4-7',
    maxTokens: 8192,
    timeoutMs: 600_000,
    displayName: 'Claude Opus 4.7',
    displayBadge: 'claude-opus',
  },
};

/** Returns the model configuration used for synchronous previews on `plan`. */
export function pickPreviewModel(plan: Plan): ModelChoice {
  return PREVIEW_MODELS[plan];
}

/** Returns the model configuration used by the async build worker on `plan`. */
export function pickBuildModel(plan: Plan): ModelChoice {
  return BUILD_MODELS[plan];
}

// ──────────────────────────────────────────────────────────────────────────
// Generation API
// ──────────────────────────────────────────────────────────────────────────

/** A validated spec plus the backend that produced it. */
export interface GenerationResult {
  spec: GeneratorSpecT;
  source: 'claude' | 'glm' | 'mock';
}

export interface GenerateOptions {
  /** 'anthropic' (default) or 'glm'. */
  provider?: Provider;
  /** Anthropic API key — required if provider === 'anthropic'. */
  apiKey?: string;
  /** Zhipu (GLM) API key — required if provider === 'glm'. */
  glmApiKey?: string;
  model?: string;
  maxTokens?: number;
  /** Per-attempt request timeout in ms. */
  timeoutMs?: number;
  /** SDK retry count. Anthropic only. */
  maxRetries?: number;
}

/**
 * Generate a spec for `prompt` with the selected provider.
 *
 * When the API key for the selected provider is missing, the deterministic
 * {@link mockSpec} is returned with `source: 'mock'` instead of failing.
 *
 * @throws SpecTimeoutError when the provider call exceeds the time budget.
 * @throws SpecTruncatedError when the model stops at the token limit.
 * @throws SpecValidationError when the output is not JSON or fails the schema.
 * @throws BannedPatternError when the spec matches a banned pattern.
 */
export async function generateSpec(
  prompt: string,
  opts: GenerateOptions = {},
): Promise<GenerationResult> {
  const provider = opts.provider ?? 'anthropic';

  if (provider === 'glm') {
    if (!opts.glmApiKey) return { spec: mockSpec(prompt), source: 'mock' };
    return generateWithGlm(prompt, {
      apiKey: opts.glmApiKey,
      model: opts.model ?? 'glm-4-plus',
      maxTokens: opts.maxTokens ?? 4096,
      timeoutMs: opts.timeoutMs,
    });
  }

  if (!opts.apiKey) {
    return { spec: mockSpec(prompt), source: 'mock' };
  }
  return generateWithAnthropic(prompt, {
    apiKey: opts.apiKey,
    model: opts.model ?? 'claude-opus-4-7',
    maxTokens: opts.maxTokens ?? 8192,
    timeoutMs: opts.timeoutMs,
    maxRetries: opts.maxRetries,
  });
}

/**
 * Shared tail of every provider path: extract JSON from the raw model text,
 * validate it against the `GeneratorSpec` schema and run the banned-pattern
 * scan.
 *
 * On schema failure the error message includes a truncated raw preview so the
 * caller (api log) can see whether the model returned non-JSON / a refusal /
 * a near-miss schema, instead of just the opaque zod error.
 *
 * @throws SpecValidationError when the text is not JSON or fails the schema.
 * @throws BannedPatternError when the validated spec matches a banned pattern.
 */
function parseAndValidateSpec(raw: string): GeneratorSpecT {
  const parsed = GeneratorSpec.safeParse(extractJson(raw));
  if (!parsed.success) {
    const preview = raw.slice(0, 400).replace(/\s+/g, ' ');
    throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
  }
  scanForInjection(parsed.data);
  return parsed.data;
}

/**
 * Non-streaming Anthropic call. `timeoutMs` / `maxRetries` are forwarded as
 * per-request SDK options only when explicitly provided, so SDK defaults
 * apply otherwise.
 */
async function generateWithAnthropic(
  prompt: string,
  opts: {
    apiKey: string;
    model: string;
    maxTokens: number;
    timeoutMs?: number;
    maxRetries?: number;
  },
): Promise<GenerationResult> {
  const client = new Anthropic({ apiKey: opts.apiKey });

  const requestOptions: { timeout?: number; maxRetries?: number } = {};
  if (opts.timeoutMs !== undefined) requestOptions.timeout = opts.timeoutMs;
  if (opts.maxRetries !== undefined) requestOptions.maxRetries = opts.maxRetries;

  const response = await client.messages
    .create(
      {
        model: opts.model,
        max_tokens: opts.maxTokens,
        system: SYSTEM_PROMPT,
        messages: [{ role: 'user', content: prompt }],
      },
      requestOptions,
    )
    .catch((err: unknown) => {
      // Translate the SDK timeout into the module's typed error; everything
      // else propagates unchanged.
      if (err instanceof Anthropic.APIConnectionTimeoutError) {
        throw new SpecTimeoutError('spec generation exceeded the time budget');
      }
      throw err;
    });

  const text = response.content
    .filter((b): b is { type: 'text'; text: string } => b.type === 'text')
    .map((b) => b.text)
    .join('');

  // Detect token-limit truncation BEFORE attempting to parse. The model
  // chops mid-token when it hits max_tokens, so the closing `}` of a deeply
  // nested tool schema never gets emitted and JSON.parse blows up with an
  // unterminated-string error that's indistinguishable from a refusal at
  // the catch site. With stop_reason in hand we can surface a precise
  // "spec too large" message and tell the user to split / simplify the
  // prompt instead of letting them keep retrying the same one.
  if (response.stop_reason === 'max_tokens') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
    );
  }

  return { spec: parseAndValidateSpec(text), source: 'claude' };
}

// ──────────────────────────────────────────────────────────────────────────
// Streaming generation (Anthropic only)
// ──────────────────────────────────────────────────────────────────────────

export interface StreamHandlers {
  /** Called for each text delta emitted by the model. Sync — must not throw. */
  onText: (text: string) => void;
  /**
   * Called once when the stream completes successfully with the final spec.
   * MAY return a Promise — the caller awaits it before considering the
   * stream finished. This is critical for SSE callers that need to write
   * a final event and end the response: returning a void instead of
   * Promise<void> would leak the response.end() call before the event
   * is actually written, leaving the client with no terminal frame.
   */
  onSpec: (result: GenerationResult) => Promise<void> | void;
  /**
   * Called once on any terminal error (timeout, truncation, validation).
   * Same async contract as onSpec.
   */
  onError: (err: Error) => Promise<void> | void;
}

/**
 * Stream a spec from Anthropic, piping text deltas to a handler and finally
 * surfacing the parsed/validated spec or the relevant typed error.
 *
 * Why streaming for /preview: Cloudflare's edge timeout is ~100s on the Free
 * tier and our previous sync call could approach that for ambitious prompts.
 * With streaming the TCP connection writes bytes from the first model token,
 * which keeps CF (and nginx) from cutting us off — runtime is bounded only by
 * the model itself and our own AbortController, not by CF.
 *
 * Errors are never thrown to the caller: every failure inside the `try`
 * (including one thrown by `handlers.onSpec`) is routed to `handlers.onError`.
 *
 * @param opts.signal Optional abort signal. If it is already aborted when
 *   this function is called the stream is aborted immediately; otherwise the
 *   stream is aborted when the signal fires. The listener is detached once
 *   the call settles so long-lived signals do not accumulate listeners.
 */
export async function streamSpecFromAnthropic(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; signal?: AbortSignal },
  handlers: StreamHandlers,
): Promise<void> {
  const client = new Anthropic({ apiKey: opts.apiKey });
  let accumulated = '';
  let detachAbortListener: (() => void) | undefined;

  try {
    const stream = client.messages.stream({
      model: opts.model,
      max_tokens: opts.maxTokens,
      system: SYSTEM_PROMPT,
      messages: [{ role: 'user', content: prompt }],
    });

    const signal = opts.signal;
    if (signal) {
      const abortStream = (): void => {
        stream.abort();
      };
      if (signal.aborted) {
        // An already-aborted signal never dispatches 'abort' again, so a
        // listener alone would let the stream run to completion.
        abortStream();
      } else {
        signal.addEventListener('abort', abortStream, { once: true });
        detachAbortListener = () => signal.removeEventListener('abort', abortStream);
      }
    }

    stream.on('text', (delta) => {
      accumulated += delta;
      handlers.onText(delta);
    });

    const final = await stream.finalMessage();

    // Same truncation check as the non-streaming path: report "too large"
    // rather than an opaque JSON parse failure.
    if (final.stop_reason === 'max_tokens') {
      throw new SpecTruncatedError(
        `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
      );
    }

    const spec = parseAndValidateSpec(accumulated);

    // AWAITED on purpose — the SSE caller writes the terminal 'spec' event
    // inside this handler and we must not return (and thereby allow the
    // caller to .end() the response) until that write has completed.
    await handlers.onSpec({ spec, source: 'claude' });
  } catch (err) {
    const terminal =
      err instanceof Anthropic.APIConnectionTimeoutError
        ? new SpecTimeoutError('spec generation exceeded the time budget')
        : err instanceof Error
          ? err
          : new Error(String(err));
    await handlers.onError(terminal);
  } finally {
    detachAbortListener?.();
  }
}

const GLM_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';

/** The subset of the GLM chat-completions response body this module reads. */
interface GlmChatResponse {
  choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
}

/**
 * Non-streaming GLM call over plain `fetch`.
 *
 * The timeout (when `timeoutMs` is set) covers the whole exchange — request,
 * response headers AND body read — and is always cleared in `finally`.
 *
 * @throws SpecTimeoutError when the abort timer fires.
 * @throws SpecTruncatedError when GLM reports `finish_reason === 'length'`.
 * @throws SpecValidationError on an empty, non-JSON or schema-invalid reply.
 * @throws Error `glm_api_<status>: …` on a non-2xx response.
 */
async function generateWithGlm(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; timeoutMs?: number },
): Promise<GenerationResult> {
  const controller = new AbortController();
  const timer = opts.timeoutMs ? setTimeout(() => controller.abort(), opts.timeoutMs) : null;

  let data: GlmChatResponse;
  try {
    const res = await fetch(GLM_ENDPOINT, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${opts.apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: opts.model,
        max_tokens: opts.maxTokens,
        messages: [
          { role: 'system', content: SYSTEM_PROMPT },
          { role: 'user', content: prompt },
        ],
      }),
      signal: controller.signal,
    });

    if (!res.ok) {
      // Best-effort body read: the status code is the important part.
      const body = await res.text().catch(() => '');
      throw new Error(`glm_api_${res.status}: ${body.slice(0, 200)}`);
    }

    // Read the body while the abort timer is still armed so a stalled
    // download is bounded by the same budget as the request itself.
    data = (await res.json()) as GlmChatResponse;
  } catch (err) {
    if ((err as { name?: string } | null | undefined)?.name === 'AbortError') {
      throw new SpecTimeoutError('glm spec generation exceeded the time budget');
    }
    throw err;
  } finally {
    if (timer) clearTimeout(timer);
  }

  const choice = data.choices?.[0];

  // Mirror the Anthropic paths: report token-limit truncation explicitly
  // instead of letting the cut-off JSON fail at parse time.
  if (choice?.finish_reason === 'length') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
    );
  }

  const content = choice?.message?.content;
  if (!content) throw new SpecValidationError('glm_empty_response');

  return { spec: parseAndValidateSpec(content), source: 'glm' };
}

/** The model output was not JSON or did not match the `GeneratorSpec` schema. */
export class SpecValidationError extends Error {
  override readonly name = 'SpecValidationError';
}

/** A validated spec contained text matching {@link SHARED_BANNED_PATTERNS}. */
export class BannedPatternError extends Error {
  override readonly name = 'BannedPatternError';
}

/** The provider call exceeded its time budget. */
export class SpecTimeoutError extends Error {
  override readonly name = 'SpecTimeoutError';
}

/** The model stopped at the token limit before finishing the spec. */
export class SpecTruncatedError extends Error {
  override readonly name = 'SpecTruncatedError';
}

/**
 * Parse the model's raw text into a JSON value.
 *
 * The whole (trimmed) text is tried first. Only if that fails is the first
 * markdown code fence extracted and parsed. Trying the fence first would
 * mis-slice a perfectly valid JSON spec whose string values (e.g. a tool
 * implementation) happen to contain ``` themselves.
 *
 * @throws SpecValidationError `empty_generation_output` when there is nothing
 *   to parse, or `generation_not_json: …` when parsing fails.
 */
function extractJson(text: string): unknown {
  const trimmed = text.trim();
  if (!trimmed) throw new SpecValidationError('empty_generation_output');

  let directError: unknown;
  try {
    return JSON.parse(trimmed);
  } catch (e) {
    directError = e;
  }

  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (!fenced) {
    throw new SpecValidationError(`generation_not_json: ${(directError as Error).message}`);
  }

  const body = fenced[1];
  if (!body) throw new SpecValidationError('empty_generation_output');
  try {
    return JSON.parse(body);
  } catch (e) {
    throw new SpecValidationError(`generation_not_json: ${(e as Error).message}`);
  }
}

/**
 * Public so other layers (the spec-edit merge in apps/api) can re-scan a
 * user-edited spec without duplicating the pattern list — single source of
 * truth for what counts as obviously-dangerous LLM output.
 *
 * Scans, per tool: the name, the description, the implementation body and
 * every string `description` found on an inputSchema parameter.
 *
 * @throws BannedPatternError on the first pattern that matches; the message
 *   carries the pattern source.
 */
export function scanForInjection(spec: GeneratorSpecT): void {
  for (const tool of spec.tools) {
    // Collect every string the LLM could have planted a payload in. Downstream
    // AI clients (Claude Desktop, Cursor) read tool.name + every inputSchema
    // description verbatim, so an injection there can pivot the user's AI
    // session — not only the runtime code.
    const surfaces: string[] = [tool.name, tool.description, tool.implementation];
    for (const param of Object.values(tool.inputSchema)) {
      if (param && typeof param === 'object' && 'description' in param) {
        const d = (param as { description?: unknown }).description;
        if (typeof d === 'string') surfaces.push(d);
      }
    }

    for (const text of surfaces) {
      for (const pattern of SHARED_BANNED_PATTERNS) {
        if (pattern.test(text)) {
          throw new BannedPatternError(`banned_pattern_detected: ${pattern.source}`);
        }
      }
    }
  }
}

/**
 * Deterministic two-tool spec (`echo`, `now`) returned when no LLM key is
 * configured. The first 200 characters of `prompt` are embedded in the
 * description.
 */
export function mockSpec(prompt: string): GeneratorSpecT {
  return {
    name: 'Echo MCP',
    description: `Mock server (no LLM key). Prompt was: ${prompt.slice(0, 200)}`,
    tools: [
      {
        name: 'echo',
        description: 'Echoes the input string back to the caller.',
        inputSchema: {
          message: { type: 'string', description: 'Message to echo back', required: true },
        },
        implementation: `const msg = String(args.message ?? '');\nreturn { content: [{ type: 'text', text: \`echo: \${msg}\` }] };`,
      },
      {
        name: 'now',
        description: 'Returns the current server UTC timestamp.',
        inputSchema: {},
        implementation: `return { content: [{ type: 'text', text: new Date().toISOString() }] };`,
      },
    ],
    resources: [],
    prompts: [],
    requiredSecrets: [],
    scopes: ['mcp:read'],
    dependencies: {},
  };
}