buildmymcpserver/packages/llm/src/index.ts

import Anthropic from '@anthropic-ai/sdk';
import { GeneratorSpec, type GeneratorSpec as GeneratorSpecT } from '@bmm/types';

/**
 * System prompt shared by every generation path in this module: it is sent as
 * the `system` parameter on the Anthropic calls and as the `system`-role
 * message on the GLM call.
 *
 * It instructs the model to answer with a single JSON object; callers in this
 * file pass that output through `extractJson` and then validate it with
 * `GeneratorSpec.safeParse`.
 *
 * This is a template literal, so any backtick or `${` added to the text must
 * be escaped or the module will stop compiling.
 */
export const SYSTEM_PROMPT = `You generate production-grade MCP server specifications as STRICT JSON.

Output ONE JSON object (no markdown, no prose, no code fences) with this exact shape:

{
  "name": "human-readable server name (max 80 chars)",
  "description": "one sentence",
  "tools": [
    {
      "name": "snake_case_tool_name",
      "description": "single sentence, max 100 chars",
      "inputSchema": {
        "param_name": { "type": "string|number|boolean|array|object", "description": "short", "required": true }
      },
      "implementation": "async TS body. The tool's validated arguments arrive in the variable named EXACTLY 'args' (e.g. args.location, args.query). Return { content: [{ type:'text', text:'...' }] }. Secrets via process.env. HTTP via globalThis.fetch with AbortSignal.timeout(10000). Wrap external calls in try/catch and return { content:[{type:'text',text:'Error: ...'}], isError:true } on failure. No eval/Function/child_process. No import statements."
    }
  ],
  "resources": [],
  "prompts": [],
  "requiredSecrets": ["UPPER_SNAKE_CASE"],
  "scopes": ["mcp:read"],
  "dependencies": {}
}

Hard limits (the output gets truncated past these — write tight):
- At most 6 tools. Combine related capabilities into one tool with a "mode" param rather than splitting.
- Each implementation body: at most 40 lines of code, no defensive overengineering, no comments.
- Each description / inputSchema description: one short clause, no examples.
- Parameterised SQL only (pg with $1 placeholders). No prose, no JSON examples in code.

Return JSON only. No preamble, no closing remark.`;

// Regex blacklist — explicitly NOT a security boundary, just an early-warning
// for obviously-dangerous LLM output. The real defence is the Docker
// hardening in apps/generator/src/lib/deploy.ts (--cap-drop=ALL etc.). A
// determined attacker can bypass any of these with string concatenation
// (`'chi'+'ld_process'`) or alternate APIs — that's why container isolation
// has to hold even when this fails.
//
// Exported so the publish-time template scan in apps/api/src/routes/templates
// can reuse it instead of maintaining a parallel list that drifts. (Zc-001.)
//
// None of the patterns carries the `g` or `y` flag, so `RegExp.prototype.test`
// keeps no `lastIndex` state between calls and the same instances can be
// reused safely across strings. Keep it that way when adding entries.
export const SHARED_BANNED_PATTERNS: readonly RegExp[] = [
  // Dynamic code evaluation.
  /\beval\s*\(/,
  /\bnew\s+Function\s*\(/,
  /\bFunction\s*\(\s*['"`]/, // Function('...') without `new`
  // Process spawning. The bare-word pattern below also matches everything the
  // require(...) form matches; the explicit form is kept for readability.
  /\brequire\s*\(\s*['"]child_process['"]/,
  /\bchild_process\b/,
  // Low-level Node process hooks.
  /\bprocess\.binding\b/,
  /\bprocess\.dlopen\b/,
  /\.constructor\s*\.\s*constructor\b/, // [].constructor.constructor('...')
  // Any call to a function named `_load` (presumably aimed at the internal
  // module loader — confirm if this produces false positives).
  /\b_load\s*\(/,
  // vm.runInThisContext / vm.runInNewContext.
  /\bvm\.runIn(This|New)Context\b/,
  /globalThis\s*\[\s*['"`]/, // globalThis['Fun'+'ction']
  // Prompt-injection phrases (case-insensitive) aimed at whatever AI client
  // later reads the tool metadata.
  /ignore\s+previous\s+instructions/i,
  /disregard\s+(the\s+)?(above|previous)/i,
  /system\s+prompt\s+override/i,
];

// ──────────────────────────────────────────────────────────────────────────
// Plan-aware model selection
// ──────────────────────────────────────────────────────────────────────────

/** Subscription tier. Both model tables in this file are keyed by it. */
export type Plan = 'hobby' | 'pro' | 'team' | 'enterprise';
/**
 * Which generation phase a model is being picked for.
 * NOTE(review): not referenced elsewhere in the visible part of this file;
 * presumably mirrors the preview/build split of the model tables — confirm
 * against callers.
 */
export type Purpose = 'preview' | 'build';
/** LLM backend: Anthropic (Claude) or Zhipu (GLM). */
export type Provider = 'anthropic' | 'glm';
/** Machine-readable badge id that accompanies `ModelChoice.displayName`. */
export type DisplayBadge = 'open-tier' | 'claude-haiku' | 'claude-sonnet' | 'claude-opus';

/**
 * One row of a plan → model table: which backend and model to call, and the
 * output/time budget to give it.
 */
export interface ModelChoice {
  /** Backend that serves `model`. */
  provider: Provider;
  /** Provider-specific model identifier (e.g. 'glm-4-plus', 'claude-sonnet-4-6'). */
  model: string;
  /** Output-token cap for a single generation. */
  maxTokens: number;
  /** Time budget for the generation request, in milliseconds. */
  timeoutMs: number;
  /** User-facing model name shown in the wizard + previews. */
  displayName: string;
  /** Badge id paired with `displayName`. */
  displayBadge: DisplayBadge;
}

/**
 * Plan → model table for preview generation.
 *
 * Preview runs synchronously inside an HTTP request behind Cloudflare's
 * ~100s edge cap. Each tier's (model + max_tokens + timeout) is bounded to
 * fit. Hobby uses GLM as the cost lever; paid tiers escalate to Claude — the
 * visible quality/speed jump *is* the upgrade pitch.
 *
 * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
 * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
 * number was from the pre-4.6 generation).
 *
 * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
 * prompts ("research assistant with web search, papers, wikipedia, …")
 * routinely produce 6–8k tokens of deeply-nested tool schemas. Caps per
 * tier: hobby 4096, pro 8192, team/enterprise 12288. The Anthropic paths
 * detect the `stop_reason === 'max_tokens'` case to surface a "spec too
 * large" message instead of letting the truncated JSON blow up at the zod
 * boundary.
 *
 * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Worst-case
 * time to emit a full budget at the measured rates above:
 *   - hobby:           4096 tokens @ ~58 tok/s  ≈ 71s
 *   - pro:             8192 tokens @ ~200 tok/s ≈ 41s
 *   - team/enterprise: 12288 tokens @ ~130 tok/s ≈ 94s
 *
 * NOTE(review): the Sonnet tiers therefore have essentially no headroom for
 * cold starts or TCP/TLS setup when a response uses the full 12288 tokens;
 * such a request may end in a timeout rather than a truncation error.
 */
const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
  hobby: {
    provider: 'glm',
    model: 'glm-4-plus',
    maxTokens: 4096,
    timeoutMs: 95_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  },
  pro: {
    provider: 'anthropic',
    model: 'claude-haiku-4-5-20251001',
    maxTokens: 8192,
    timeoutMs: 95_000,
    displayName: 'Claude Haiku 4.5',
    displayBadge: 'claude-haiku',
  },
  team: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
    maxTokens: 12288,
    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
  enterprise: {
    provider: 'anthropic',
    model: 'claude-sonnet-4-6',
    maxTokens: 12288,
    timeoutMs: 95_000,
    displayName: 'Claude Sonnet 4.6',
    displayBadge: 'claude-sonnet',
  },
};

/**
 * Plan → model table for the build worker.
 *
 * Builds run asynchronously via BullMQ, so no proxy timeout applies here.
 * Because previews are cached for 24h, a build rarely needs a fresh
 * generation; GLM is the default to keep that rare path cheap. Enterprise is
 * the exception and gets Opus as its premium-quality promise.
 */
const BUILD_MODELS: Record<Plan, ModelChoice> = (() => {
  // Hobby, pro and team share identical settings; a factory gives every tier
  // its own object rather than three references to a single one.
  const openTier = (): ModelChoice => ({
    provider: 'glm',
    model: 'glm-4.5',
    maxTokens: 8192,
    timeoutMs: 180_000,
    displayName: 'Open-tier AI',
    displayBadge: 'open-tier',
  });

  return {
    hobby: openTier(),
    pro: openTier(),
    team: openTier(),
    enterprise: {
      provider: 'anthropic',
      model: 'claude-opus-4-7',
      maxTokens: 8192,
      timeoutMs: 600_000,
      displayName: 'Claude Opus 4.7',
      displayBadge: 'claude-opus',
    },
  };
})();

/**
 * Looks up the preview-phase model settings for a plan.
 *
 * @param plan - Subscription tier to look up.
 * @returns The table entry itself (not a copy) from `PREVIEW_MODELS`.
 */
export function pickPreviewModel(plan: Plan): ModelChoice {
  const { [plan]: choice } = PREVIEW_MODELS;
  return choice;
}

/**
 * Looks up the build-phase model settings for a plan.
 *
 * @param plan - Subscription tier to look up.
 * @returns The table entry itself (not a copy) from `BUILD_MODELS`.
 */
export function pickBuildModel(plan: Plan): ModelChoice {
  const { [plan]: choice } = BUILD_MODELS;
  return choice;
}

// ──────────────────────────────────────────────────────────────────────────
// Generation API
// ──────────────────────────────────────────────────────────────────────────

/** Outcome of a successful spec generation. */
export interface GenerationResult {
  /** The generated spec (or the canned mock spec when `source` is 'mock'). */
  spec: GeneratorSpecT;
  /**
   * Where the spec came from: 'claude' for the Anthropic paths, 'glm' for the
   * GLM path, 'mock' when no API key was supplied for the chosen provider and
   * `mockSpec` was returned instead.
   */
  source: 'claude' | 'glm' | 'mock';
}

/** Options accepted by `generateSpec`. Every field is optional. */
export interface GenerateOptions {
  /** 'anthropic' (default) or 'glm'. */
  provider?: Provider;
  /** Anthropic API key — required if provider === 'anthropic'. */
  apiKey?: string;
  /** Zhipu (GLM) API key — required if provider === 'glm'. */
  glmApiKey?: string;
  /** Model id. Defaults to 'glm-4-plus' for GLM and 'claude-opus-4-7' for Anthropic. */
  model?: string;
  /** Output-token cap. Defaults to 4096 for GLM and 8192 for Anthropic. */
  maxTokens?: number;
  /** Per-attempt request timeout in ms. */
  timeoutMs?: number;
  /** SDK retry count. Anthropic only. */
  maxRetries?: number;
}

/**
 * Generates an MCP server spec from a natural-language prompt.
 *
 * Routes to GLM or Anthropic according to `opts.provider` (Anthropic when
 * unset). If the API key for the selected provider is missing, no network
 * call is made and the canned `mockSpec` is returned with `source: 'mock'`.
 *
 * @param prompt - The user's description of the server to generate.
 * @param opts - Provider, credentials and budget overrides.
 * @returns The validated spec together with its source.
 */
export async function generateSpec(
  prompt: string,
  opts: GenerateOptions = {},
): Promise<GenerationResult> {
  const useGlm = (opts.provider ?? 'anthropic') === 'glm';
  const key = useGlm ? opts.glmApiKey : opts.apiKey;

  // No key for the selected provider: fall back to the mock spec.
  if (!key) {
    return { spec: mockSpec(prompt), source: 'mock' };
  }

  if (useGlm) {
    return generateWithGlm(prompt, {
      apiKey: key,
      model: opts.model ?? 'glm-4-plus',
      maxTokens: opts.maxTokens ?? 4096,
      timeoutMs: opts.timeoutMs,
    });
  }

  return generateWithAnthropic(prompt, {
    apiKey: key,
    model: opts.model ?? 'claude-opus-4-7',
    maxTokens: opts.maxTokens ?? 8192,
    timeoutMs: opts.timeoutMs,
    maxRetries: opts.maxRetries,
  });
}

/**
 * Runs one non-streaming Anthropic generation and returns the validated spec.
 *
 * @param prompt - User prompt, sent as the single user message.
 * @param opts - API key, model, token cap, and optional per-request timeout
 *   and retry count (forwarded to the SDK only when defined).
 * @returns The validated spec with `source: 'claude'`.
 * @throws SpecTimeoutError when the SDK reports a connection timeout.
 * @throws SpecTruncatedError when the model stopped because of `max_tokens`.
 * @throws SpecValidationError when the output is not JSON or fails the schema.
 * @throws BannedPatternError when the spec trips `scanForInjection`.
 */
async function generateWithAnthropic(
  prompt: string,
  opts: {
    apiKey: string;
    model: string;
    maxTokens: number;
    timeoutMs?: number;
    maxRetries?: number;
  },
): Promise<GenerationResult> {
  const client = new Anthropic({ apiKey: opts.apiKey });

  // Only forward the overrides that were actually supplied so the SDK keeps
  // its own defaults for the rest.
  const requestOptions: { timeout?: number; maxRetries?: number } = {};
  if (opts.timeoutMs !== undefined) requestOptions.timeout = opts.timeoutMs;
  if (opts.maxRetries !== undefined) requestOptions.maxRetries = opts.maxRetries;

  let response: Anthropic.Message;
  try {
    response = await client.messages.create(
      {
        model: opts.model,
        max_tokens: opts.maxTokens,
        system: SYSTEM_PROMPT,
        messages: [{ role: 'user', content: prompt }],
      },
      requestOptions,
    );
  } catch (err: unknown) {
    // Translate the SDK timeout into our typed error; everything else passes
    // through untouched.
    if (err instanceof Anthropic.APIConnectionTimeoutError) {
      throw new SpecTimeoutError('spec generation exceeded the time budget');
    }
    throw err;
  }

  // Concatenate the text blocks; any other block type is ignored.
  let text = '';
  for (const block of response.content) {
    if (block.type === 'text') text += block.text;
  }

  // Check for truncation before parsing. Output cut off at max_tokens is
  // missing its closing braces, so JSON.parse would fail with an error that
  // looks the same as a refusal. stop_reason lets us report "spec too large"
  // precisely, so the user knows to split or simplify the prompt rather than
  // retry it unchanged.
  if (response.stop_reason === 'max_tokens') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
    );
  }

  const validation = GeneratorSpec.safeParse(extractJson(text));
  if (!validation.success) {
    // Attach a short, whitespace-collapsed excerpt of the raw output so logs
    // show whether this was non-JSON, a refusal or a near-miss schema.
    const preview = text.slice(0, 400).replace(/\s+/g, ' ');
    throw new SpecValidationError(`${validation.error.message} :: raw="${preview}"`);
  }

  scanForInjection(validation.data);
  return { spec: validation.data, source: 'claude' };
}

// ──────────────────────────────────────────────────────────────────────────
// Streaming generation (Anthropic only)
// ──────────────────────────────────────────────────────────────────────────

/**
 * Callbacks used by `streamSpecFromAnthropic` to report progress and the
 * terminal outcome of a streamed generation.
 *
 * NOTE(review): `onSpec` is invoked inside the function's try block, so an
 * exception thrown by `onSpec` itself is also reported through `onError`.
 */
export interface StreamHandlers {
  /** Called for each text delta emitted by the model. */
  onText: (text: string) => void;
  /** Called once when the stream completes successfully with the final spec. */
  onSpec: (result: GenerationResult) => void;
  /** Called once on any terminal error (timeout, truncation, validation). */
  onError: (err: Error) => void;
}

/**
 * Stream a spec from Anthropic, piping text deltas to a handler and finally
 * surfacing the parsed/validated spec or the relevant typed error.
 *
 * Why streaming for /preview: Cloudflare's edge timeout is ~100s on the Free
 * tier and our previous sync call could approach that for ambitious prompts.
 * With streaming the TCP connection writes bytes from the first model token,
 * which keeps CF (and nginx) from cutting us off — runtime is bounded only by
 * the model itself and our own AbortController, not by CF.
 *
 * The returned promise never rejects for generation failures; they are all
 * delivered through `handlers.onError`.
 *
 * @param prompt - User prompt, sent as the single user message.
 * @param opts - API key, model, token cap, and an optional `signal` that
 *   cancels the stream. A signal that is already aborted on entry cancels the
 *   stream immediately.
 * @param handlers - Callbacks for text deltas, the final spec, and errors.
 */
export async function streamSpecFromAnthropic(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; signal?: AbortSignal },
  handlers: StreamHandlers,
): Promise<void> {
  const client = new Anthropic({ apiKey: opts.apiKey });
  let accumulated = '';
  // Set when an abort listener was registered, so it can be removed again.
  let detachAbort: (() => void) | undefined;

  try {
    const stream = client.messages.stream({
      model: opts.model,
      max_tokens: opts.maxTokens,
      system: SYSTEM_PROMPT,
      messages: [{ role: 'user', content: prompt }],
    });

    const { signal } = opts;
    if (signal) {
      const onAbort = (): void => {
        stream.abort();
      };
      if (signal.aborted) {
        // An already-aborted signal never fires 'abort' again, so a listener
        // alone would let the stream run to completion despite cancellation.
        onAbort();
      } else {
        signal.addEventListener('abort', onAbort, { once: true });
        detachAbort = () => signal.removeEventListener('abort', onAbort);
      }
    }

    stream.on('text', (delta) => {
      accumulated += delta;
      handlers.onText(delta);
    });

    const final = await stream.finalMessage();

    // Report truncation before attempting to parse the cut-off JSON.
    if (final.stop_reason === 'max_tokens') {
      throw new SpecTruncatedError(
        `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
      );
    }

    const json = extractJson(accumulated);
    const parsed = GeneratorSpec.safeParse(json);
    if (!parsed.success) {
      // Short, whitespace-collapsed excerpt of the raw output for the logs.
      const preview = accumulated.slice(0, 400).replace(/\s+/g, ' ');
      throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
    }
    scanForInjection(parsed.data);
    handlers.onSpec({ spec: parsed.data, source: 'claude' });
  } catch (err) {
    if (err instanceof Anthropic.APIConnectionTimeoutError) {
      handlers.onError(new SpecTimeoutError('spec generation exceeded the time budget'));
      return;
    }
    handlers.onError(err instanceof Error ? err : new Error(String(err)));
  } finally {
    // Do not leave a listener holding this stream on a signal that outlives
    // the call.
    detachAbort?.();
  }
}

/** Zhipu (GLM) chat-completions URL that `generateWithGlm` POSTs to. */
const GLM_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';

/**
 * Runs one GLM chat-completion request and returns the validated spec.
 *
 * The optional timeout covers the whole exchange — connecting, waiting for
 * headers and reading the response body — not just the time until headers
 * arrive.
 *
 * @param prompt - User prompt, sent after the system prompt.
 * @param opts - API key, model, token cap and optional timeout in ms
 *   (no timeout is applied when `timeoutMs` is unset or 0).
 * @returns The validated spec with `source: 'glm'`.
 * @throws SpecTimeoutError when the timeout aborts the request.
 * @throws SpecTruncatedError when the model stopped at the token limit
 *   (`finish_reason === 'length'`).
 * @throws SpecValidationError when the response is empty, not JSON, or fails
 *   the schema.
 * @throws BannedPatternError when the spec trips `scanForInjection`.
 * @throws Error with a `glm_api_<status>` message on a non-2xx response.
 */
async function generateWithGlm(
  prompt: string,
  opts: { apiKey: string; model: string; maxTokens: number; timeoutMs?: number },
): Promise<GenerationResult> {
  const controller = new AbortController();
  const timer = opts.timeoutMs ? setTimeout(() => controller.abort(), opts.timeoutMs) : null;
  let data: {
    choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
  };
  try {
    const res = await fetch(GLM_ENDPOINT, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${opts.apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model: opts.model,
        max_tokens: opts.maxTokens,
        messages: [
          { role: 'system', content: SYSTEM_PROMPT },
          { role: 'user', content: prompt },
        ],
      }),
      signal: controller.signal,
    });
    if (!res.ok) {
      // Best effort: the error body is only used to enrich the message.
      const body = await res.text().catch(() => '');
      throw new Error(`glm_api_${res.status}: ${body.slice(0, 200)}`);
    }
    // Read the body while the timer is still armed, so a stalled body
    // download is cut off by the same budget as the request itself.
    data = (await res.json()) as typeof data;
  } catch (err) {
    if ((err as { name?: string } | null)?.name === 'AbortError') {
      throw new SpecTimeoutError('glm spec generation exceeded the time budget');
    }
    throw err;
  } finally {
    if (timer) clearTimeout(timer);
  }

  const choice = data.choices?.[0];
  // Report truncation before parsing: output cut off at the token limit is
  // incomplete JSON and would otherwise surface as an opaque not-JSON error.
  if (choice?.finish_reason === 'length') {
    throw new SpecTruncatedError(
      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
    );
  }
  const content = choice?.message?.content;
  if (!content) throw new SpecValidationError('glm_empty_response');
  const json = extractJson(content);
  const parsed = GeneratorSpec.safeParse(json);
  if (!parsed.success) {
    // Short, whitespace-collapsed excerpt of the raw output so logs show what
    // the model actually returned, in the same format as the Anthropic paths.
    const preview = content.slice(0, 400).replace(/\s+/g, ' ');
    throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
  }
  scanForInjection(parsed.data);
  return { spec: parsed.data, source: 'glm' };
}

/**
 * Thrown when model output is empty, is not parseable JSON, or does not
 * satisfy the `GeneratorSpec` schema.
 */
export class SpecValidationError extends Error {
  override readonly name = 'SpecValidationError';
}

/**
 * Thrown by `scanForInjection` when a scanned string matches one of
 * `SHARED_BANNED_PATTERNS`; the message carries the matching pattern source.
 */
export class BannedPatternError extends Error {
  override readonly name = 'BannedPatternError';
}

/**
 * Signals that a generation request ran past its time budget: raised for an
 * Anthropic SDK connection timeout and for an aborted GLM fetch.
 */
export class SpecTimeoutError extends Error {
  override readonly name = 'SpecTimeoutError';
}

/**
 * Thrown when an Anthropic response reports `stop_reason === 'max_tokens'`,
 * i.e. the spec was cut off at the output-token cap before it was complete.
 */
export class SpecTruncatedError extends Error {
  override readonly name = 'SpecTruncatedError';
}

/**
 * Parses the JSON payload out of raw model output.
 *
 * The whole (trimmed) output is tried first. Only if that fails is the first
 * Markdown code fence extracted and parsed. Trying the whole output first
 * matters because a valid bare JSON spec may contain ``` fences inside its
 * string values (e.g. in an `implementation` body); looking for a fence first
 * would slice into the middle of that JSON and reject valid output.
 *
 * @param text - Raw model output.
 * @returns The parsed JSON value (not yet schema-validated).
 * @throws SpecValidationError `empty_generation_output` when the output, or
 *   the fenced block used as fallback, is empty; `generation_not_json: …`
 *   when nothing parses.
 */
function extractJson(text: string): unknown {
  const trimmed = text.trim();
  if (!trimmed) throw new SpecValidationError('empty_generation_output');

  let wholeError: unknown;
  try {
    return JSON.parse(trimmed);
  } catch (e) {
    // Not bare JSON; fall through to the fenced-block fallback.
    wholeError = e;
  }

  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
  if (!fenced) {
    throw new SpecValidationError(`generation_not_json: ${(wholeError as Error).message}`);
  }

  const body = fenced[1];
  if (!body) throw new SpecValidationError('empty_generation_output');
  try {
    return JSON.parse(body);
  } catch (e) {
    throw new SpecValidationError(`generation_not_json: ${(e as Error).message}`);
  }
}

/**
 * Rejects a spec whose tool metadata or code matches a banned pattern.
 *
 * Exported so other layers (the spec-edit merge in apps/api) can re-check a
 * user-edited spec against the same list instead of keeping their own copy.
 *
 * For every tool, the name, description, implementation and each string
 * `description` found on an inputSchema entry are tested, in that order,
 * against `SHARED_BANNED_PATTERNS`. Tool names and parameter descriptions are
 * included because downstream AI clients (Claude Desktop, Cursor) read them
 * verbatim, so an injection there can hijack the user's AI session and not
 * just the runtime code.
 *
 * NOTE(review): only per-tool strings are scanned here; the spec-level
 * name/description and the resources/prompts arrays are not.
 *
 * @param spec - Spec to check.
 * @throws BannedPatternError on the first match, naming the pattern source.
 */
export function scanForInjection(spec: GeneratorSpecT): void {
  for (const tool of spec.tools) {
    const paramDescriptions: string[] = [];
    Object.values(tool.inputSchema).forEach((param) => {
      if (!param || typeof param !== 'object' || !('description' in param)) return;
      const description = (param as { description?: unknown }).description;
      if (typeof description === 'string') paramDescriptions.push(description);
    });

    const candidates = [tool.name, tool.description, tool.implementation, ...paramDescriptions];
    for (const candidate of candidates) {
      const hit = SHARED_BANNED_PATTERNS.find((pattern) => pattern.test(candidate));
      if (hit) {
        throw new BannedPatternError(`banned_pattern_detected: ${hit.source}`);
      }
    }
  }
}

/**
 * Builds the canned "Echo MCP" spec returned when no LLM key is configured.
 *
 * It exposes two tools, `echo` and `now`, and embeds the first 200 characters
 * of the prompt in the description so the caller can see that the request
 * was received.
 *
 * @param prompt - The user's prompt; only its first 200 characters are used.
 * @returns A fixed spec with no secrets, resources, prompts or dependencies.
 */
export function mockSpec(prompt: string): GeneratorSpecT {
  const promptExcerpt = prompt.slice(0, 200);

  const echoTool: GeneratorSpecT['tools'][number] = {
    name: 'echo',
    description: 'Echoes the input string back to the caller.',
    inputSchema: {
      message: { type: 'string', description: 'Message to echo back', required: true },
    },
    implementation: `const msg = String(args.message ?? '');\nreturn { content: [{ type: 'text', text: \`echo: \${msg}\` }] };`,
  };

  const nowTool: GeneratorSpecT['tools'][number] = {
    name: 'now',
    description: 'Returns the current server UTC timestamp.',
    inputSchema: {},
    implementation: `return { content: [{ type: 'text', text: new Date().toISOString() }] };`,
  };

  return {
    name: 'Echo MCP',
    description: `Mock server (no LLM key). Prompt was: ${promptExcerpt}`,
    tools: [echoTool, nowTool],
    resources: [],
    prompts: [],
    requiredSecrets: [],
    scopes: ['mcp:read'],
    dependencies: {},
  };
}
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								import Anthropic from '@anthropic-ai/sdk';
 								import { GeneratorSpec, type GeneratorSpec as GeneratorSpecT } from '@bmm/types';
 								export const SYSTEM_PROMPT = `You generate production-grade MCP server specifications as STRICT JSON.
 								Output ONE JSON object (no markdown, no prose, no code fences) with this exact shape:
 								{
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								  "name": "human-readable server name (max 80 chars)",
 								  "description": "one sentence",
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  "tools": [
 								    {
 								      "name": "snake_case_tool_name",
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								      "description": "single sentence, max 100 chars",
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								      "inputSchema": {
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								        "param_name": { "type": "string|number|boolean|array|object", "description": "short", "required": true }
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								      },
-												fix(llm): escape backticks in SYSTEM_PROMPT (broke typecheck)

											
										
										
											2026-05-28 21:39:34 +02:00
+								      "implementation": "async TS body. The tool's validated arguments arrive in the variable named EXACTLY 'args' (e.g. args.location, args.query). Return { content: [{ type:'text', text:'...' }] }. Secrets via process.env. HTTP via globalThis.fetch with AbortSignal.timeout(10000). Wrap external calls in try/catch and return { content:[{type:'text',text:'Error: ...'}], isError:true } on failure. No eval/Function/child_process. No import statements."
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								    }
 								  ],
 								  "resources": [],
 								  "prompts": [],
 								  "requiredSecrets": ["UPPER_SNAKE_CASE"],
 								  "scopes": ["mcp:read"],
 								  "dependencies": {}
 								}
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								Hard limits (the output gets truncated past these — write tight):
 								- At most 6 tools. Combine related capabilities into one tool with a "mode" param rather than splitting.
 								- Each implementation body: at most 40 lines of code, no defensive overengineering, no comments.
 								- Each description / inputSchema description: one short clause, no examples.
 								- Parameterised SQL only (pg with $1 placeholders). No prose, no JSON examples in code.
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								Return JSON only. No preamble, no closing remark.`;
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								// Regex blacklist — explicitly NOT a security boundary, just an early-warning
 								// for obviously-dangerous LLM output. The real defence is the Docker
 								// hardening in apps/generator/src/lib/deploy.ts (--cap-drop=ALL etc.). A
 								// determined attacker can bypass any of these with string concatenation
 								// (`'chi'+'ld_process'`) or alternate APIs — that's why container isolation
 								// has to hold even when this fails.
-												security: sovereign-audit Pass-2 fixes — auth-lib, oauth, templates

Six confirmed findings closed (3 MEDIUM, 3 LOW). Tier-1 surfaces from
Pass-1 re-verified non-regressed; this pass deepened the audit on the
auth library, OAuth issuer, and template marketplace.

Za-002 MEDIUM (scrypt cost) — bump SCRYPT_N from 2^14 → 2^17 (131072)
  matching current OWASP guidance for password hashing in 2026. Hash
  format embeds N (`scrypt$N$salt$hash`), so the existing admin
  password at the old cost still verifies — backward-compatible. Also
  added explicit maxmem ceilings since Node's default (~32MiB) is
  insufficient for the new N.

Za-003 MEDIUM (single-use race) — consumeMagicLink was SELECT-then-
  UPDATE; two parallel redemptions could both win and mint two
  sessions from the same token. Now uses the same atomic
  `UPDATE … WHERE id = ? AND consumedAt IS NULL RETURNING id` pattern
  /oauth/token already had — loser of the race gets
  invalid_or_expired_token.

Za-004 LOW (membership ordering) — `.orderBy(memberships.createdAt)`
  added so when org-invites eventually let a user belong to multiple
  orgs, the same one wins every login instead of insertion-order
  roulette. Latent-bug pre-empt.

Zb-002 LOW (OAuth register spam) — /oauth/register now per-IP daily
  rate-limited at 20/day (well above any legitimate MCP-client
  bootstrap pattern). Prevents DB-row spam.

Zc-001 MEDIUM (banned-pattern drift) — three separate copies of
  BANNED_PATTERNS had drifted apart. The publish-time scanner in
  templates.ts was MISSING the 7 new patterns added in Pass-1
  (process.binding, dlopen, .constructor.constructor, vm.runIn*,
  globalThis['..']). Single source of truth in @bmm/llm now exports
  SHARED_BANNED_PATTERNS; templates.ts composes PUBLISH_BANNED_PATTERNS
  = SHARED ∪ code-only-extras (dynamic import, fs.rm, setTimeout-with-
  string, process.kill, jailbreak markers).

Zc-002 LOW (N+1) — /v1/templates list was issuing one COUNT(*) per
  template (101 queries for a 100-row page). Now one grouped query
  with templateId GROUP BY, merged in JS. p95 doesn't degrade with
  marketplace growth.

DEFERRED (documented, scoped for next sprint):
  Za-001 HIGH — Account takeover via cross-provider email lookup.
    Requires schema change (users.primaryProvider). Mitigation in
    /settings/account banner planned.
  Zb-001 MEDIUM — /oauth/token refresh_token grant: advertised in
    AS metadata but unsupported_grant_type. Either implement (~40
    LOC) or strip from metadata.
  Zc-003 LOW — Admin takedown partial-failure consistency.
  Zd-001 IMPROVE — DEK cache invalidation across replicas (single-
    instance today).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:15:54 +02:00
+								//
 								// Exported so the publish-time template scan in apps/api/src/routes/templates
 								// can reuse it instead of maintaining a parallel list that drifts. (Zc-001.)
 								export const SHARED_BANNED_PATTERNS: readonly RegExp[] = [
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  /\beval\s*\(/,
 								  /\bnew\s+Function\s*\(/,
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								  /\bFunction\s*\(\s*['"`]/, // Function('...') without `new`
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  /\brequire\s*\(\s*['"]child_process['"]/,
 								  /\bchild_process\b/,
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								  /\bprocess\.binding\b/,
 								  /\bprocess\.dlopen\b/,
 								  /\.constructor\s*\.\s*constructor\b/, // [].constructor.constructor('...')
 								  /\b_load\s*\(/,
 								  /\bvm\.runIn(This|New)Context\b/,
 								  /globalThis\s*\[\s*['"`]/, // globalThis['Fun'+'ction']
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  /ignore\s+previous\s+instructions/i,
 								  /disregard\s+(the\s+)?(above|previous)/i,
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								  /system\s+prompt\s+override/i,
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								];
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								// ──────────────────────────────────────────────────────────────────────────
 								// Plan-aware model selection
 								// ──────────────────────────────────────────────────────────────────────────
 								/** Subscription plan; the key of the per-plan model tables in this module. */
 								export type Plan =
 								  | 'hobby'
 								  | 'pro'
 								  | 'team'
 								  | 'enterprise';
 								/** What a generation request is for: a synchronous preview or an async build. */
 								export type Purpose =
 								  | 'preview'
 								  | 'build';
 								/** LLM backend that serves a request. */
 								export type Provider =
 								  | 'anthropic'
 								  | 'glm';
 								/** Badge identifier paired with a model's user-facing display name. */
 								export type DisplayBadge =
 								  | 'open-tier'
 								  | 'claude-haiku'
 								  | 'claude-sonnet'
 								  | 'claude-opus';
 								/**
 								 * One fully-resolved model configuration: which provider and model to call,
 								 * the budget it runs under, and how it is presented to the user. The
 								 * per-plan tables in this module map each `Plan` to one of these.
 								 */
 								export interface ModelChoice {
 								  /** Backend that serves the request ('anthropic' or 'glm'). */
 								  provider: Provider;
 								  /** Provider-specific model identifier, e.g. 'glm-4.5'. */
 								  model: string;
 								  /** Output-token cap for a single generation. */
 								  maxTokens: number;
 								  /** Request timeout in milliseconds. */
 								  timeoutMs: number;
 								  /** User-facing model name shown in the wizard + previews. */
 								  displayName: string;
 								  /** Badge identifier that accompanies `displayName`. */
 								  displayBadge: DisplayBadge;
 								}
 								/**
 								 * Preview runs synchronously inside an HTTP request behind Cloudflare's
 								 * ~100s edge cap. Each tier's (model + max_tokens + timeout) is bounded to
 								 * fit. Hobby uses GLM as the cost lever; paid tiers escalate to Claude — the
 								 * visible quality/speed jump *is* the upgrade pitch.
 								 *
-												fix(llm): preview timeout 60s→90s + maxTokens 8192→4096

Enterprise plan was hitting SpecTimeoutError exactly at 60s because the
Sonnet 4.6 preview was budgeted for 8192 tokens at ~80 tok/s (≈102s
worst case) inside a 60s window. The frontend then rolled back to step
1 with no spec.

A real spec is small (<= ~10 tools, ~1.5–2.5k output tokens in practice)
so 4096 is plenty and lets even Sonnet finish in ~51s worst case. The
90s timeout buys headroom for cold starts while staying under
Cloudflare's 100s edge cap. Hobby/GLM bumped to 90s too — same
headroom argument.

											
										
										
											2026-05-28 18:51:51 +02:00
+								 * Measured token rates: glm-4-plus ~58 tok/s · Claude Haiku 4.5 ~200 tok/s ·
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								 * Claude Sonnet 4.6 ~130 tok/s (current measurement; the older ~80 tok/s
 								 * number was from the pre-4.6 generation).
 								 *
 								 * Token budget: a *small* spec is ~1.5–2.5k output tokens, but ambitious
 								 * prompts ("research assistant with web search, papers, wikipedia, …")
 								 * routinely produce 6–8k tokens of deeply-nested tool schemas. Caps are
 								 * per tier: 4096 for hobby (GLM), 8192 for pro (Haiku) and 12288 for
 								 * team/enterprise (Sonnet). We detect the `stop_reason === 'max_tokens'`
 								 * case to surface a "spec too large" message instead of letting the
 								 * truncated JSON blow up at the zod boundary.
 								 *
 								 * Timeouts sit at 95s, just under Cloudflare's 100s edge cap. Sonnet at
 								 * 130 tok/s finishes 8192 tokens in ~63s (~30s headroom for cold starts
 								 * and TCP/TLS setup) but needs ~94s for the full 12288, so a response
 								 * that exhausts the team/enterprise budget has almost no headroom left.
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								 */
 								const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
 								  hobby: {
 								    provider: 'glm',
 								    model: 'glm-4-plus',
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								    maxTokens: 4096,
 								    timeoutMs: 95_000,
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								    displayName: 'Open-tier AI',
 								    displayBadge: 'open-tier',
 								  },
 								  pro: {
 								    provider: 'anthropic',
 								    model: 'claude-haiku-4-5-20251001',
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								    maxTokens: 8192,
 								    timeoutMs: 95_000,
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								    displayName: 'Claude Haiku 4.5',
 								    displayBadge: 'claude-haiku',
 								  },
 								  team: {
 								    provider: 'anthropic',
 								    model: 'claude-sonnet-4-6',
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								    maxTokens: 12288,
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								    timeoutMs: 95_000,
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								    displayName: 'Claude Sonnet 4.6',
 								    displayBadge: 'claude-sonnet',
 								  },
 								  enterprise: {
 								    provider: 'anthropic',
 								    model: 'claude-sonnet-4-6',
-												fix(llm): tighter system prompt + 12288 max_tokens for paid tiers

Sonnet 4.6 was still hitting max_tokens on ambitious prompts like
"WorldWeather MCP for any location" because the implementation bodies
ballooned with defensive scaffolding. Two changes:

1. SYSTEM_PROMPT now imposes hard limits the model can self-enforce:
   - at most 6 tools (combine related capabilities with a mode param)
   - implementation body <= 40 lines, no comments, no overengineering
   - descriptions <= 100 chars
   These keep a typical preview under ~7k output tokens.

2. team/enterprise maxTokens 8192 -> 12288. At ~130 tok/s that fits in
   ~94s, still under Cloudflare's 100s edge cap. Hobby (GLM) and pro
   (Haiku) keep their existing limits — they were not hitting the
   ceiling.

SpecTruncatedError still fires + surfaces 422 spec_too_large when even
12288 isn't enough, so the user gets actionable feedback instead of an
opaque zod error.

											
										
										
											2026-05-28 21:01:50 +02:00
+								    maxTokens: 12288,
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								    timeoutMs: 95_000,
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								    displayName: 'Claude Sonnet 4.6',
 								    displayBadge: 'claude-sonnet',
 								  },
 								};
 								/**
 								 * Creates the GLM configuration used by every non-enterprise build tier.
 								 * A fresh object is returned on each call so the plan entries never share
 								 * a reference.
 								 */
 								const openTierBuildChoice = (): ModelChoice => ({
 								  provider: 'glm',
 								  model: 'glm-4.5',
 								  maxTokens: 8192,
 								  timeoutMs: 180_000,
 								  displayName: 'Open-tier AI',
 								  displayBadge: 'open-tier',
 								});
 								/**
 								 * Per-plan model table for the build worker. Builds run asynchronously via
 								 * BullMQ, so no proxy timeout applies here. Because the preview cache TTL is
 								 * 24h, cache-misses are rare; defaulting to GLM keeps that rare path cheap,
 								 * while Enterprise gets Opus as a premium-quality promise.
 								 */
 								const BUILD_MODELS: Record<Plan, ModelChoice> = {
 								  hobby: openTierBuildChoice(),
 								  pro: openTierBuildChoice(),
 								  team: openTierBuildChoice(),
 								  enterprise: {
 								    provider: 'anthropic',
 								    model: 'claude-opus-4-7',
 								    maxTokens: 8192,
 								    timeoutMs: 600_000,
 								    displayName: 'Claude Opus 4.7',
 								    displayBadge: 'claude-opus',
 								  },
 								};
 								/**
 								 * Looks up the preview-time model configuration for a plan.
 								 *
 								 * @param plan - Plan whose preview configuration is wanted.
 								 * @returns The `PREVIEW_MODELS` entry stored under `plan`.
 								 */
 								export function pickPreviewModel(plan: Plan): ModelChoice {
 								  const previewChoice = PREVIEW_MODELS[plan];
 								  return previewChoice;
 								}
 								/**
 								 * Looks up the build-time model configuration for a plan.
 								 *
 								 * @param plan - Plan whose build configuration is wanted.
 								 * @returns The `BUILD_MODELS` entry stored under `plan`.
 								 */
 								export function pickBuildModel(plan: Plan): ModelChoice {
 								  const buildChoice = BUILD_MODELS[plan];
 								  return buildChoice;
 								}
 								// ──────────────────────────────────────────────────────────────────────────
 								// Generation API
 								// ──────────────────────────────────────────────────────────────────────────
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								/**
 								 * Outcome of a spec generation call: the spec plus the backend it came from.
 								 *
 								 * `source` is 'claude' when the Anthropic path produced the spec and 'mock'
 								 * when generateSpec fell back to mockSpec because no API key was supplied.
 								 * 'glm' is presumably set by the GLM path, which is not visible here — confirm.
 								 */
 								export interface GenerationResult {
 								  spec: GeneratorSpecT;
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  source: 'claude' | 'glm' | 'mock';
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								}
 								/**
 								 * Options accepted by generateSpec. Every field is optional.
 								 *
 								 * Defaults applied by generateSpec when a field is left unset:
 								 * - provider: 'anthropic'.
 								 * - model: 'claude-opus-4-7' for Anthropic, 'glm-4-plus' for GLM.
 								 * - maxTokens: 8192 for Anthropic, 4096 for GLM.
 								 * - timeoutMs / maxRetries: forwarded as-is (possibly undefined); maxRetries
 								 *   is only forwarded on the Anthropic path.
 								 *
 								 * If the API key for the selected provider is missing, generateSpec does not
 								 * throw; it returns a mock spec instead.
 								 */
 								export interface GenerateOptions {
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  /** 'anthropic' (default) or 'glm'. */
 								  provider?: Provider;
 								  /** Anthropic API key — required if provider === 'anthropic'. */
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  apiKey?: string;
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  /** Zhipu (GLM) API key — required if provider === 'glm'. */
 								  glmApiKey?: string;
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  model?: string;
 								  maxTokens?: number;
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  /** Per-attempt request timeout in ms. */
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
+								  timeoutMs?: number;
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  /** SDK retry count. Anthropic only. */
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
+								  maxRetries?: number;
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								}
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
+								/**
 								 * Generate a spec for a prompt using the provider selected in `opts`.
 								 *
 								 * Routing:
 								 * - The provider defaults to 'anthropic' when `opts.provider` is unset.
 								 * - 'glm': returns a mock spec (source 'mock') when `glmApiKey` is missing;
 								 *   otherwise delegates to generateWithGlm with model defaulting to
 								 *   'glm-4-plus' and maxTokens to 4096. `maxRetries` is not forwarded.
 								 * - Any other provider value: returns a mock spec when `apiKey` is missing;
 								 *   otherwise delegates to generateWithAnthropic with model defaulting to
 								 *   'claude-opus-4-7' and maxTokens to 8192.
 								 *
 								 * @param prompt - User prompt handed to the provider helper or to mockSpec.
 								 * @param opts - Provider selection, credentials and request tuning.
 								 * @returns The spec together with the source that produced it.
 								 * @throws Whatever the selected provider helper throws; nothing is caught here.
 								 */
 								export async function generateSpec(
 								  prompt: string,
 								  opts: GenerateOptions = {},
 								): Promise<GenerationResult> {
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  const provider = opts.provider ?? 'anthropic';
 								  if (provider === 'glm') {
 								    if (!opts.glmApiKey) return { spec: mockSpec(prompt), source: 'mock' };
 								    return generateWithGlm(prompt, {
 								      apiKey: opts.glmApiKey,
 								      model: opts.model ?? 'glm-4-plus',
 								      maxTokens: opts.maxTokens ?? 4096,
 								      timeoutMs: opts.timeoutMs,
 								    });
 								  }
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  if (!opts.apiKey) {
 								    return { spec: mockSpec(prompt), source: 'mock' };
 								  }
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								  return generateWithAnthropic(prompt, {
 								    apiKey: opts.apiKey,
 								    model: opts.model ?? 'claude-opus-4-7',
 								    maxTokens: opts.maxTokens ?? 8192,
 								    timeoutMs: opts.timeoutMs,
 								    maxRetries: opts.maxRetries,
 								  });
 								}
 								/**
 								 * Request a spec from Anthropic in a single non-streaming messages.create
 								 * call and validate the reply.
 								 *
 								 * Steps:
 								 * 1. Build a client from `opts.apiKey`; `timeout` and `maxRetries` are added
 								 *    to the per-request options only when the caller supplied them.
 								 * 2. Send SYSTEM_PROMPT as the system message and `prompt` as the sole user
 								 *    message. An APIConnectionTimeoutError is rethrown as SpecTimeoutError;
 								 *    every other error propagates unchanged.
 								 * 3. Concatenate the text of all 'text' content blocks.
 								 * 4. Throw SpecTruncatedError when stop_reason is 'max_tokens'. This runs
 								 *    before parsing so truncation is not reported as a JSON/validation error.
 								 * 5. extractJson + GeneratorSpec.safeParse. On failure throw
 								 *    SpecValidationError carrying the zod message plus the first 400
 								 *    characters of raw output with whitespace runs collapsed to one space.
 								 * 6. Pass the parsed spec to scanForInjection (defined elsewhere; presumably
 								 *    throws on suspicious content — confirm) and return it with source 'claude'.
 								 *
 								 * @param prompt - User prompt sent as the single user message.
 								 * @param opts - API key, model id, token cap and optional timeout/retry settings.
 								 * @returns The validated spec with source 'claude'.
 								 * @throws SpecTimeoutError, SpecTruncatedError or SpecValidationError as
 								 *   described above; other SDK errors are rethrown unchanged.
 								 */
 								async function generateWithAnthropic(
 								  prompt: string,
 								  opts: {
 								    apiKey: string;
 								    model: string;
 								    maxTokens: number;
 								    timeoutMs?: number;
 								    maxRetries?: number;
 								  },
 								): Promise<GenerationResult> {
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  const client = new Anthropic({ apiKey: opts.apiKey });
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
+								  const requestOptions: { timeout?: number; maxRetries?: number } = {};
 								  if (opts.timeoutMs !== undefined) requestOptions.timeout = opts.timeoutMs;
 								  if (opts.maxRetries !== undefined) requestOptions.maxRetries = opts.maxRetries;
 								  const response = await client.messages
 								    .create(
 								      {
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								        model: opts.model,
 								        max_tokens: opts.maxTokens,
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
+								        system: SYSTEM_PROMPT,
 								        messages: [{ role: 'user', content: prompt }],
 								      },
 								      requestOptions,
 								    )
 								    .catch((err: unknown) => {
 								      if (err instanceof Anthropic.APIConnectionTimeoutError) {
 								        throw new SpecTimeoutError('spec generation exceeded the time budget');
 								      }
 								      throw err;
 								    });
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  const text = response.content
 								    .filter((b): b is { type: 'text'; text: string } => b.type === 'text')
 								    .map((b) => b.text)
 								    .join('');
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
+								  // Detect token-limit truncation BEFORE attempting to parse. The model
 								  // chops mid-token when it hits max_tokens, so the closing `}` of a deeply
 								  // nested tool schema never gets emitted and JSON.parse blows up with an
 								  // unterminated-string error that's indistinguishable from a refusal at
 								  // the catch site. With stop_reason in hand we can surface a precise
 								  // "spec too large" message and tell the user to split / simplify the
 								  // prompt instead of letting them keep retrying the same one.
 								  if (response.stop_reason === 'max_tokens') {
 								    throw new SpecTruncatedError(
 								      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
 								    );
 								  }
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  const json = extractJson(text);
 								  const parsed = GeneratorSpec.safeParse(json);
-												feat(preview): log spec validation failures with raw output

422s from /preview hid the actual reason: zod_message tells which field
was wrong and a 400-char preview of the model output reveals refusals
or non-JSON returns. Both stay in the api log only — never surfaced
to the client unchanged.

											
										
										
											2026-05-28 19:19:57 +02:00
+								  if (!parsed.success) {
 								    // Include a truncated raw preview so the caller (api log) can see whether
 								    // the model returned non-JSON / a refusal / a near-miss schema, instead
 								    // of just the opaque zod error.
 								    const preview = text.slice(0, 400).replace(/\s+/g, ' ');
 								    throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
 								  }
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  scanForInjection(parsed.data);
 								  return { spec: parsed.data, source: 'claude' };
 								}
-												feat(preview): SSE-streamed generation, no CF 100s edge cap

Architectural fix for "spec_too_large" / preview_timeout — the sync
endpoint had to fit the whole model run into Cloudflare's ~100s edge
window, which made the system fragile against any prompt that produced
a verbose spec. The new streaming path pipes Anthropic's token deltas
as Server-Sent Events; every chunk resets CF's idle timer and a 15s
keepalive comment guarantees activity even during slow first-token
windows.

@bmm/llm: new streamSpecFromAnthropic() exposes the SDK's .stream()
flow with the same typed-error contract as generateSpec — same
SpecTruncatedError / SpecValidationError / SpecTimeoutError raised from
the relevant moment.

API: POST /v1/servers/preview/stream returns text/event-stream with
events 'text' (deltas), 'spec' (final success payload, same shape as
the sync endpoint), 'error' (typed). Anthropic-only — GLM/hobby falls
back to the sync route via 409 streaming_unavailable.

Frontend: apiSseStream() handles the POST + ReadableStream + SSE
parser. The wizard's analyze() prefers the stream and only uses the
sync endpoint on the explicit 409 fallback.

nginx (api.buildmymcpserver.com): the /v1/builds/ location block (which
already had proxy_buffering off + 600s read timeout for the WS build
stream) now also matches /v1/servers/preview/stream so the SSE
response isn't buffered.

											
										
										
											2026-05-28 21:11:05 +02:00
+								// ──────────────────────────────────────────────────────────────────────────
 								// Streaming generation (Anthropic only)
 								// ──────────────────────────────────────────────────────────────────────────
 								/**
 								 * Callback bundle passed to `streamSpecFromAnthropic`. The function reports
 								 * progress and its terminal outcome through these callbacks.
 								 */
 								export interface StreamHandlers {
 								  /**
 								   * Called for each text delta emitted by the model. The same delta is also
 								   * appended to the buffer that is parsed once the stream finishes.
 								   */
 								  onText: (text: string) => void;
 								  /**
 								   * Called once when the stream completes successfully with the final spec,
 								   * i.e. after the accumulated text has been parsed, schema-validated and
 								   * scanned for banned patterns.
 								   */
 								  onSpec: (result: GenerationResult) => void;
 								  /**
 								   * Called once on any terminal error (timeout, truncation, validation).
 								   * NOTE: `onSpec` is invoked inside the same try block, so an exception
 								   * thrown by `onSpec` itself is also caught and forwarded here.
 								   */
 								  onError: (err: Error) => void;
 								}
 								/**
 								 * Stream a spec from Anthropic, piping text deltas to a handler and finally
 								 * surfacing the parsed/validated spec or the relevant typed error.
 								 *
 								 * Why streaming for /preview: Cloudflare's edge timeout is ~100s on the Free
 								 * tier and our previous sync call could approach that for ambitious prompts.
 								 * With streaming the TCP connection writes bytes from the first model token,
 								 * which keeps CF (and nginx) from cutting us off — runtime is bounded only by
 								 * the model itself and our own AbortController, not by CF.
 								 *
 								 * Outcomes are reported through `handlers` rather than by rejecting: every
 								 * error raised inside the generation flow is caught and passed to
 								 * `handlers.onError`.
 								 *
 								 * @param prompt   User prompt sent as the single user message.
 								 * @param opts     API key, model id, token cap and an optional caller-owned
 								 *                 `signal` that cancels the in-flight stream when aborted.
 								 * @param handlers Callbacks for text deltas, the final spec, and errors.
 								 * @returns A promise that settles once a terminal handler has been invoked.
 								 */
 								export async function streamSpecFromAnthropic(
 								  prompt: string,
 								  opts: { apiKey: string; model: string; maxTokens: number; signal?: AbortSignal },
 								  handlers: StreamHandlers,
 								): Promise<void> {
 								  const client = new Anthropic({ apiKey: opts.apiKey });
 								  let accumulated = '';
 								  // Assigned only when an abort listener was attached, so `finally` can
 								  // remove it and not leave a closure pinned to a long-lived signal.
 								  let detachAbort: (() => void) | undefined;
 								  try {
 								    const stream = client.messages.stream({
 								      model: opts.model,
 								      max_tokens: opts.maxTokens,
 								      system: SYSTEM_PROMPT,
 								      messages: [{ role: 'user', content: prompt }],
 								    });
 								    const { signal } = opts;
 								    if (signal) {
 								      if (signal.aborted) {
 								        // An 'abort' listener never fires on a signal that is already
 								        // aborted, so cancel right away instead of running a full generation.
 								        stream.abort();
 								      } else {
 								        const onAbort = (): void => stream.abort();
 								        signal.addEventListener('abort', onAbort, { once: true });
 								        detachAbort = () => signal.removeEventListener('abort', onAbort);
 								      }
 								    }
 								    stream.on('text', (delta) => {
 								      accumulated += delta;
 								      handlers.onText(delta);
 								    });
 								    const final = await stream.finalMessage();
 								    // Check truncation BEFORE parsing: a cut-off body would otherwise show
 								    // up as an opaque JSON/zod failure instead of the real cause.
 								    if (final.stop_reason === 'max_tokens') {
 								      throw new SpecTruncatedError(
 								        `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
 								      );
 								    }
 								    const json = extractJson(accumulated);
 								    const parsed = GeneratorSpec.safeParse(json);
 								    if (!parsed.success) {
 								      // Truncated, whitespace-collapsed preview of the raw output so the
 								      // caller's log shows what the model actually returned.
 								      const preview = accumulated.slice(0, 400).replace(/\s+/g, ' ');
 								      throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
 								    }
 								    scanForInjection(parsed.data);
 								    handlers.onSpec({ spec: parsed.data, source: 'claude' });
 								  } catch (err) {
 								    if (err instanceof Anthropic.APIConnectionTimeoutError) {
 								      handlers.onError(new SpecTimeoutError('spec generation exceeded the time budget'));
 								      return;
 								    }
 								    handlers.onError(err instanceof Error ? err : new Error(String(err)));
 								  } finally {
 								    detachAbort?.();
 								  }
 								}
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								const GLM_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';
 								/**
 								 * Generate a spec through the GLM chat-completions endpoint (`GLM_ENDPOINT`)
 								 * with a single non-streaming POST.
 								 *
 								 * @param prompt User prompt; sent after `SYSTEM_PROMPT` as the user message.
 								 * @param opts   API key, model id, token cap and an optional `timeoutMs`
 								 *               budget. When `timeoutMs` is unset or 0 no timer is armed.
 								 * @returns The validated spec tagged with `source: 'glm'`.
 								 * @throws SpecTimeoutError    when the time budget elapses while waiting for
 								 *                             the response or while reading its body.
 								 * @throws SpecTruncatedError  when the model stopped because it hit the
 								 *                             token cap (`finish_reason === 'length'`).
 								 * @throws SpecValidationError when the reply is empty, not JSON, or does not
 								 *                             match the `GeneratorSpec` schema.
 								 * @throws BannedPatternError  propagated from `scanForInjection`.
 								 * @throws Error               `glm_api_<status>: …` for non-2xx responses.
 								 */
 								async function generateWithGlm(
 								  prompt: string,
 								  opts: { apiKey: string; model: string; maxTokens: number; timeoutMs?: number },
 								): Promise<GenerationResult> {
 								  const controller = new AbortController();
 								  const timer = opts.timeoutMs ? setTimeout(() => controller.abort(), opts.timeoutMs) : null;
 								  let data: {
 								    choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
 								  };
 								  try {
 								    const res = await fetch(GLM_ENDPOINT, {
 								      method: 'POST',
 								      headers: {
 								        Authorization: `Bearer ${opts.apiKey}`,
 								        'Content-Type': 'application/json',
 								      },
 								      body: JSON.stringify({
 								        model: opts.model,
 								        max_tokens: opts.maxTokens,
 								        messages: [
 								          { role: 'system', content: SYSTEM_PROMPT },
 								          { role: 'user', content: prompt },
 								        ],
 								      }),
 								      signal: controller.signal,
 								    });
 								    if (!res.ok) {
 								      const body = await res.text().catch(() => '');
 								      throw new Error(`glm_api_${res.status}: ${body.slice(0, 200)}`);
 								    }
 								    // The body is read while the timer is still armed, so a response that
 								    // stalls after the headers arrive is bounded by the same budget.
 								    data = (await res.json()) as typeof data;
 								  } catch (err) {
 								    if ((err as { name?: string } | null | undefined)?.name === 'AbortError') {
 								      throw new SpecTimeoutError('glm spec generation exceeded the time budget');
 								    }
 								    throw err;
 								  } finally {
 								    if (timer) clearTimeout(timer);
 								  }
 								  const choice = data.choices?.[0];
 								  // Detect truncation BEFORE parsing, mirroring the Anthropic paths: a
 								  // cut-off body would otherwise surface as an opaque JSON parse failure.
 								  if (choice?.finish_reason === 'length') {
 								    throw new SpecTruncatedError(
 								      `model hit max_tokens (${opts.maxTokens}) before finishing the spec`,
 								    );
 								  }
 								  const content = choice?.message?.content;
 								  if (!content) throw new SpecValidationError('glm_empty_response');
 								  const json = extractJson(content);
 								  const parsed = GeneratorSpec.safeParse(json);
 								  if (!parsed.success) {
 								    // Same truncated raw preview the Anthropic paths attach, so the log
 								    // shows whether the model returned a refusal or a near-miss schema.
 								    const preview = content.slice(0, 400).replace(/\s+/g, ' ');
 								    throw new SpecValidationError(`${parsed.error.message} :: raw="${preview}"`);
 								  }
 								  scanForInjection(parsed.data);
 								  return { spec: parsed.data, source: 'glm' };
 								}
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
 								/**
 								 * Raised when model output cannot be turned into a valid spec: the output is
 								 * empty, is not parseable JSON, or fails the `GeneratorSpec` schema check.
 								 */
+								export class SpecValidationError extends Error {
 								  override readonly name = 'SpecValidationError';
 								}
 								/**
 								 * Raised by `scanForInjection` when one of a tool's scanned strings matches
 								 * an entry of `SHARED_BANNED_PATTERNS`; the message carries the pattern source.
 								 */
 								export class BannedPatternError extends Error {
 								  override readonly name = 'BannedPatternError';
 								}
-												fix(preview): stop spec generation timing out behind the edge proxy

The /v1/servers/preview route ran claude-opus-4-7 synchronously; full spec
generation routinely exceeded Cloudflare's ~100s proxy cap, so the browser
received a headerless 524 and reported it as a CORS failure.

- preview now uses claude-sonnet-4-6 with a 45s per-attempt timeout and one
  retry — comfortably inside the proxy budget
- generateSpec maps an exhausted timeout to SpecTimeoutError; the route
  returns a clean 504 (with CORS headers) instead of a stalled connection
- analyze step: live elapsed-seconds counter as freeze-proof, plus a
  reduced-motion exception so the loading spinner keeps spinning (a status
  indicator, which WCAG exempts from reduced-motion)
- textarea resize grip restyled to dark theme (light hatch on dark square)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-21 23:52:48 +02:00
 								/**
 								 * Raised when spec generation exceeds its time budget: an Anthropic
 								 * connection timeout, or an aborted GLM request.
 								 */
+								export class SpecTimeoutError extends Error {
 								  override readonly name = 'SpecTimeoutError';
 								}
-												fix(preview): max_tokens 4096→8192 + detect truncation explicitly

Root cause of repeat 422s: 4096 was too tight for ambitious prompts
(Marco's research-assistant prompt produces ~12kB of JSON before the
model gets cut off mid-string). The error then surfaced as an opaque
"Unterminated string in JSON" zod failure instead of pointing the user
at the real problem.

Two fixes:
- maxTokens back to 8192 (the original) for all Claude tiers, 4096 for
  GLM. Timeouts bumped to 95s — Sonnet 4.6 at ~130 tok/s does 8192 in
  ~63s, ~30s headroom for cold starts, still under Cloudflare's 100s
  edge cap.
- Detect stop_reason === 'max_tokens' on the Anthropic response BEFORE
  parsing and throw the new SpecTruncatedError. /preview catches it
  and returns 422 spec_too_large with a clear "split the prompt"
  message instead of leaking the zod parse failure.

											
										
										
											2026-05-28 19:34:40 +02:00
 								/**
 								 * Raised when the Anthropic response reports `stop_reason === 'max_tokens'`,
 								 * i.e. the model was cut off before finishing the spec. Thrown before any
 								 * JSON parsing is attempted.
 								 */
+								export class SpecTruncatedError extends Error {
 								  override readonly name = 'SpecTruncatedError';
 								}
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								function extractJson(text: string): unknown {
 								  const trimmed = text.trim();
 								  const fenced = trimmed.match(/```(?:json)?\s*([\s\S]*?)```/);
 								  const body = fenced ? fenced[1] : trimmed;
 								  if (!body) throw new SpecValidationError('empty_generation_output');
 								  try {
 								    return JSON.parse(body);
 								  } catch (e) {
 								    throw new SpecValidationError(`generation_not_json: ${(e as Error).message}`);
 								  }
 								}
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								/**
 								 * Public so other layers (the spec-edit merge in apps/api) can re-scan a
 								 * user-edited spec without duplicating the pattern list — single source of
 								 * truth for what counts as obviously-dangerous LLM output.
 								 */
 								export function scanForInjection(spec: GeneratorSpecT): void {
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								  for (const tool of spec.tools) {
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								    // Collect every string the LLM could have planted a payload in. Downstream
 								    // AI clients (Claude Desktop, Cursor) read tool.name + every inputSchema
 								    // description verbatim, so an injection there can pivot the user's AI
 								    // session — not only the runtime code.
 								    const surfaces: string[] = [tool.name, tool.description, tool.implementation];
 								    for (const param of Object.values(tool.inputSchema)) {
 								      if (param && typeof param === 'object' && 'description' in param) {
 								        const d = (param as { description?: unknown }).description;
 								        if (typeof d === 'string') surfaces.push(d);
 								      }
 								    }
 								    for (const text of surfaces) {
-												security: sovereign-audit Pass-2 fixes — auth-lib, oauth, templates

Six confirmed findings closed (3 MEDIUM, 3 LOW). Tier-1 surfaces from
Pass-1 re-verified non-regressed; this pass deepened the audit on the
auth library, OAuth issuer, and template marketplace.

Za-002 MEDIUM (scrypt cost) — bump SCRYPT_N from 2^14 → 2^17 (131072)
  matching current OWASP guidance for password hashing in 2026. Hash
  format embeds N (`scrypt$N$salt$hash`), so the existing admin
  password at the old cost still verifies — backward-compatible. Also
  added explicit maxmem ceilings since Node's default (~32MiB) is
  insufficient for the new N.

Za-003 MEDIUM (single-use race) — consumeMagicLink was SELECT-then-
  UPDATE; two parallel redemptions could both win and mint two
  sessions from the same token. Now uses the same atomic
  `UPDATE … WHERE id = ? AND consumedAt IS NULL RETURNING id` pattern
  /oauth/token already had — loser of the race gets
  invalid_or_expired_token.

Za-004 LOW (membership ordering) — `.orderBy(memberships.createdAt)`
  added so when org-invites eventually let a user belong to multiple
  orgs, the same one wins every login instead of insertion-order
  roulette. Latent-bug pre-empt.

Zb-002 LOW (OAuth register spam) — /oauth/register now per-IP daily
  rate-limited at 20/day (well above any legitimate MCP-client
  bootstrap pattern). Prevents DB-row spam.

Zc-001 MEDIUM (banned-pattern drift) — three separate copies of
  BANNED_PATTERNS had drifted apart. The publish-time scanner in
  templates.ts was MISSING the 7 new patterns added in Pass-1
  (process.binding, dlopen, .constructor.constructor, vm.runIn*,
  globalThis['..']). Single source of truth in @bmm/llm now exports
  SHARED_BANNED_PATTERNS; templates.ts composes PUBLISH_BANNED_PATTERNS
  = SHARED ∪ code-only-extras (dynamic import, fs.rm, setTimeout-with-
  string, process.kill, jailbreak markers).

Zc-002 LOW (N+1) — /v1/templates list was issuing one COUNT(*) per
  template (101 queries for a 100-row page). Now one grouped query
  with templateId GROUP BY, merged in JS. p95 doesn't degrade with
  marketplace growth.

DEFERRED (documented, scoped for next sprint):
  Za-001 HIGH — Account takeover via cross-provider email lookup.
    Requires schema change (users.primaryProvider). Mitigation in
    /settings/account banner planned.
  Zb-001 MEDIUM — /oauth/token refresh_token grant: advertised in
    AS metadata but unsupported_grant_type. Either implement (~40
    LOC) or strip from metadata.
  Zc-003 LOW — Admin takedown partial-failure consistency.
  Zd-001 IMPROVE — DEK cache invalidation across replicas (single-
    instance today).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:15:54 +02:00
+								      for (const pattern of SHARED_BANNED_PATTERNS) {
-												security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul

Five confirmed findings from the sovereign-audit pass, ordered by severity:

Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the
real visitor IP via X-Forwarded-For instead of always being the nginx /
docker-bridge peer. Every per-IP rate-limit in the codebase was silently
collapsed into one global counter; this restores them.

Z1-001 CRITICAL — runner container hardening flags (--read-only,
--cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100,
--memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a
TODO despite /security promising them. Now applied unconditionally on
production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev.

Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened
(Function(...) without `new`, process.binding, process.dlopen,
.constructor.constructor, _load, vm.runIn*Context, globalThis['..'],
"system prompt override"). scanForInjection now also walks tool.name and
every inputSchema property description, not only implementation +
description — closes the prompt-injection-into-AI-client surface that
downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate
BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of
the single shared scanForInjection export from @bmm/llm.

Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit
the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the
trustProxy fix above these are now real per-visitor limits.

Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in
production. In dev it still prints (so devs can click the link); in
production we log only "issued, URL withheld" and a loud error if no
email sender is wired (Resend integration is the actual launch
blocker — left as a TODO).

Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin
upgrades. SameSite=Lax already mitigates in modern browsers; this is the
defense-in-depth against browser bugs and non-browser clients.

FALSE POSITIVES dismissed: slug path-traversal (schema regex
^[a-z][a-z0-9-]*$ in @bmm/types catches it); session-after-promote
(getSession re-fetches isAdmin from DB on every request).

DEFERRED (not blockers, tracked):
- Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS
- Z1-003 docker image cleanup cron
- Z2-001 v2 — real sandbox runtime (multi-week refactor)
- Z3-002 rawBody-per-request memory — branch on webhook path only
- Z5-001 multi-user org RBAC for billing — gated on Team feature
- Email sender integration (Resend) — launch blocker

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-25 18:02:59 +02:00
+								        if (pattern.test(text)) {
 								          throw new BannedPatternError(`banned_pattern_detected: ${pattern.source}`);
 								        }
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								      }
 								    }
 								  }
 								}
 								export function mockSpec(prompt: string): GeneratorSpecT {
 								  return {
 								    name: 'Echo MCP',
-												feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate
limit on /preview, Opus default in the build worker, 5-min cache TTL that
made cache-miss the common case). This switches free users to GLM, paid
users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel
  + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively,
  upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus),
  Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) /
  Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

											
										
										
											2026-05-23 23:50:00 +02:00
+								    description: `Mock server (no LLM key). Prompt was: ${prompt.slice(0, 200)}`,
-												feat(llm): extract Claude SYSTEM_PROMPT + generateSpec into shared @bmm/llm package

											
										
										
											2026-05-19 18:05:31 +02:00
+								    tools: [
 								      {
 								        name: 'echo',
 								        description: 'Echoes the input string back to the caller.',
 								        inputSchema: {
 								          message: { type: 'string', description: 'Message to echo back', required: true },
 								        },
 								        implementation: `const msg = String(args.message ?? '');\nreturn { content: [{ type: 'text', text: \`echo: \${msg}\` }] };`,
 								      },
 								      {
 								        name: 'now',
 								        description: 'Returns the current server UTC timestamp.',
 								        inputSchema: {},
 								        implementation: `return { content: [{ type: 'text', text: new Date().toISOString() }] };`,
 								      },
 								    ],
 								    resources: [],
 								    prompts: [],
 								    requiredSecrets: [],
 								    scopes: ['mcp:read'],
 								    dependencies: {},
 								  };
 								}