feat: tiered LLM (GLM free / Claude paid) + rate limits + quota enforcement

The free tier was hemorrhaging Anthropic cost with no abuse cap (no rate limit on /preview, Opus default in the build worker, 5-min cache TTL that made cache-miss the common case). This switches free users to GLM, paid users to Claude tiers, and tightens every leak found in the audit.

Backend:
- @bmm/llm: GLM provider via Zhipu's OpenAI-compatible endpoint, pickPreviewModel + pickBuildModel helpers, plan-aware ModelChoice
- preview-cache TTL 5min -> 24h (kills the cache-miss path)
- /v1/servers/preview: picks model from caller's plan, returns model name to UI
- /v1/servers POST: enforces SERVER_LIMITS per plan (402), rate-limits builds
- daily rate-limit on preview (5/40/150/1000) and build (3/20/100/500)
- /v1/auth/me returns plan so the wizard can show the right model name
- generator worker: GLM default, Anthropic Sonnet fallback if GLM errors

Frontend:
- Wizard fetches plan, shows "<model> is drafting the tool spec" pre-emptively, upgrade hint for hobby users, friendly errors for 402 / 429
- Pricing page: AI-model line per tier (Open-tier / Haiku / Sonnet / Opus), Team €149 -> €199, Enterprise €499 -> €999, daily-preview limit per tier
- Privacy + Security: explicit subprocessor disclosure for Anthropic (US) / Zhipu (CN) and which tier uses which

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 23:50:00 +02:00 · 2026-05-23 23:50:00 +02:00 · bc174c1302
commit bc174c1302
parent 66128c73d8
14 changed files with 537 additions and 58 deletions
--- a/apps/api/src/config.ts
+++ b/apps/api/src/config.ts
@ -8,6 +8,7 @@ const Env = z.object({
  NEXT_PUBLIC_APP_URL: z.string().default('http://localhost:3001'),
  OAUTH_KEY_DIR: z.string().default('./keys'),
  ANTHROPIC_API_KEY: z.string().optional(),
+  GLM_API_KEY: z.string().optional(),
  SECRETS_ENCRYPTION_KEY: z
    .string()
    .min(64, '32 bytes hex required')
@ -33,6 +34,7 @@ export const config = Env.parse({
  NEXT_PUBLIC_APP_URL: process.env.NEXT_PUBLIC_APP_URL,
  OAUTH_KEY_DIR: process.env.OAUTH_KEY_DIR,
  ANTHROPIC_API_KEY: process.env.ANTHROPIC_API_KEY,
+  GLM_API_KEY: process.env.GLM_API_KEY,
  SECRETS_ENCRYPTION_KEY: process.env.SECRETS_ENCRYPTION_KEY,
  CONTROL_PLANE_PUBLIC_URL: process.env.CONTROL_PLANE_PUBLIC_URL,
  ADMIN_EMAIL: process.env.ADMIN_EMAIL,
--- a/apps/api/src/lib/plan.ts
+++ b/apps/api/src/lib/plan.ts
@ -0,0 +1,23 @@
+import { createDb, eq, organizations } from '@bmm/db';
+import type { Plan } from '@bmm/llm';
+
+const db = createDb();
+
+/**
+ * Runtime guard for plan names. Keyed off SERVER_LIMITS, which is typed as
+ * Record<Plan, number> and therefore has exactly one own key per plan.
+ */
+function isPlan(value: unknown): value is Plan {
+  return typeof value === 'string' && Object.prototype.hasOwnProperty.call(SERVER_LIMITS, value);
+}
+
+/**
+ * Look up an org's current plan.
+ *
+ * Falls back to 'hobby' — the least expensive tier — when the org row is
+ * missing OR when the stored value is not a known plan name. The result is
+ * used to index the per-plan limit tables; a plain cast would let an
+ * unexpected value through, the table lookup would yield `undefined`, and a
+ * comparison such as `count >= undefined` is always false, i.e. the quota
+ * would silently not apply.
+ *
+ * @param orgId Organization id to look up.
+ * @returns The org's plan, or 'hobby' if it cannot be determined.
+ */
+export async function getOrgPlan(orgId: string): Promise<Plan> {
+  const [row] = await db
+    .select({ plan: organizations.plan })
+    .from(organizations)
+    .where(eq(organizations.id, orgId))
+    .limit(1);
+  const storedPlan: unknown = row?.plan;
+  return isPlan(storedPlan) ? storedPlan : 'hobby';
+}
+
+/**
+ * Max MCP servers per org by plan. Enforced at POST /v1/servers.
+ *
+ * `enterprise` has no practical cap; Number.MAX_SAFE_INTEGER is used as the
+ * "unlimited" value so the entry stays a finite number that can be compared
+ * against a server count like every other tier.
+ */
+export const SERVER_LIMITS: Record<Plan, number> = {
+  hobby: 1,
+  pro: 5,
+  team: 25,
+  enterprise: Number.MAX_SAFE_INTEGER,
+};
--- a/apps/api/src/lib/preview-cache.ts
+++ b/apps/api/src/lib/preview-cache.ts
@ -1,8 +1,11 @@
 import crypto from 'node:crypto';
-import { getRedis } from './redis.js';
 import type { GeneratorSpec } from '@bmm/types';
+import { getRedis } from './redis.js';

-const TTL_SECONDS = 5 * 60;
+// 24h: previews are LLM-priced, and a preview that has expired by the time the
+// build worker runs is a cache miss (each miss = another LLM call). A long TTL
+// makes that path rare. Specs are tiny JSON (~5KB), so the Redis-memory impact
+// is negligible.
+const TTL_SECONDS = 24 * 60 * 60;

 function key(previewId: string): string {
  return `preview:${previewId}`;
--- a/apps/api/src/lib/rate-limit.ts
+++ b/apps/api/src/lib/rate-limit.ts
@ -0,0 +1,51 @@
+import type { Plan } from '@bmm/llm';
+import { getRedis } from './redis.js';
+
+const DAY_SEC = 24 * 60 * 60;
+
+/** Current UTC calendar date as `YYYY-MM-DD`; the per-day suffix of rate-limit keys. */
+function todayKey(): string {
+  // toISOString() is always UTC, e.g. "2026-05-23T21:50:00.000Z".
+  const isoTimestamp = new Date().toISOString();
+  const dateLength = 'YYYY-MM-DD'.length;
+  return isoTimestamp.substring(0, dateLength);
+}
+
+/** Outcome of one rate-limit check. */
+export interface RateLimitResult {
+  /** True while the caller is within the allowance, the current request included. */
+  ok: boolean;
+  /** Requests left after the current one; never negative. */
+  remaining: number;
+  /** Seconds until the limit is reported to reset. */
+  resetIn: number;
+}
+
+/**
+ * Count one request against a per-user daily allowance and report whether it
+ * is still within `max`.
+ *
+ * The counter lives in a Redis key that embeds the UTC date, so a new UTC day
+ * starts a fresh counter. INCR is atomic — there is no race window between
+ * read and write. The key's TTL only garbage-collects old counters: it is
+ * measured from the first request of the day, not from midnight, so it is NOT
+ * a valid "time until reset". `resetIn` is therefore derived from the clock.
+ *
+ * Every call increments the counter, including calls that return `ok: false`.
+ *
+ * @param scope  Action being limited (e.g. 'preview', 'build'); part of the key.
+ * @param userId Caller the allowance belongs to; part of the key.
+ * @param max    Allowed requests per UTC day.
+ * @returns Whether this request is allowed, how many requests remain, and the
+ *          number of seconds until the next 00:00 UTC.
+ */
+export async function checkDailyLimit(
+  scope: string,
+  userId: string,
+  max: number,
+): Promise<RateLimitResult> {
+  const key = `ratelimit:${scope}:${userId}:${todayKey()}`;
+  const redis = getRedis();
+  const count = await redis.incr(key);
+  // count === 1 means this call created the key; attach a TTL so the counter
+  // is cleaned up after its day has passed.
+  if (count === 1) await redis.expire(key, DAY_SEC);
+
+  // Epoch time is UTC-based with fixed 86 400 s days, so the rest of the
+  // current UTC day is the distance to the next multiple of DAY_SEC (1..86400).
+  const nowSec = Math.floor(Date.now() / 1000);
+  const resetIn = DAY_SEC - (nowSec % DAY_SEC);
+
+  return {
+    ok: count <= max,
+    remaining: Math.max(0, max - count),
+    resetIn,
+  };
+}
+
+// Per-tier daily limits on the two LLM-priced actions.
+// Preview = ~€0.002-0.015/call · Build = ~€0.005-0.22/call.
+
+/**
+ * Max preview requests per user per day, by plan. Passed as `max` to
+ * checkDailyLimit('preview', …) in POST /v1/servers/preview.
+ * NOTE(review): the hobby/pro/team numbers are also quoted on the pricing
+ * page — keep both in sync when changing them.
+ */
+export const PREVIEW_DAILY_LIMIT: Record<Plan, number> = {
+  hobby: 5,
+  pro: 40,
+  team: 150,
+  enterprise: 1000,
+};
+
+/**
+ * Max build requests per user per day, by plan. Passed as `max` to
+ * checkDailyLimit('build', …) in POST /v1/servers.
+ */
+export const BUILD_DAILY_LIMIT: Record<Plan, number> = {
+  hobby: 3,
+  pro: 20,
+  team: 100,
+  enterprise: 500,
+};
--- a/apps/api/src/routes/auth.ts
+++ b/apps/api/src/routes/auth.ts
@ -13,6 +13,7 @@ import type { FastifyInstance } from 'fastify';
 import { z } from 'zod';
 import { config } from '../config.js';
 import { audit } from '../lib/audit.js';
+import { getOrgPlan } from '../lib/plan.js';
 import { sendSms, smsConfigured } from '../lib/sms.js';

 const SESSION_COOKIE = 'bmm_session';
@ -128,7 +129,10 @@ export async function authRoutes(app: FastifyInstance): Promise<void> {
    const token = req.cookies[SESSION_COOKIE];
    const session = await getSession(token);
    if (!session) return reply.code(401).send({ error: 'unauthorized' });
-    return reply.send({ user: session });
+    // Plan is on the org, not the session — look it up fresh so a Stripe
+    // upgrade is reflected without forcing a re-login.
+    const plan = await getOrgPlan(session.orgId);
+    return reply.send({ user: { ...session, plan } });
  });

  app.post('/v1/auth/admin/login', async (req, reply) => {
--- a/apps/api/src/routes/servers.ts
+++ b/apps/api/src/routes/servers.ts
@ -11,7 +11,13 @@ import {
  sql,
  templates,
 } from '@bmm/db';
-import { BannedPatternError, SpecTimeoutError, SpecValidationError, generateSpec } from '@bmm/llm';
+import {
+  BannedPatternError,
+  SpecTimeoutError,
+  SpecValidationError,
+  generateSpec,
+  pickPreviewModel,
+} from '@bmm/llm';
 import {
  BuildEvent,
  CreateServerInput,
@ -26,8 +32,10 @@ import { config } from '../config.js';
 import { audit } from '../lib/audit.js';
 import { encryptSecret } from '../lib/crypto.js';
 import { stopContainer } from '../lib/docker.js';
+import { SERVER_LIMITS, getOrgPlan } from '../lib/plan.js';
 import { cacheSpec, loadSpec, overwriteSpec } from '../lib/preview-cache.js';
 import { getBuildQueue } from '../lib/queue.js';
+import { BUILD_DAILY_LIMIT, PREVIEW_DAILY_LIMIT, checkDailyLimit } from '../lib/rate-limit.js';
 import { buildChannel, getSubscriber } from '../lib/redis.js';
 import { requireAuth } from '../plugins/session.js';
 import { getForkRefTemplate } from './templates.js';
@ -46,26 +54,47 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
  });

  app.post('/v1/servers/preview', { preHandler: requireAuth }, async (req, reply) => {
+    const user = req.user!;
    const parsed = PreviewInput.safeParse(req.body);
    if (!parsed.success) {
      return reply.code(400).send({ error: 'invalid_input', issues: parsed.error.flatten() });
    }
+
+    const plan = await getOrgPlan(user.orgId);
+
+    // Daily preview rate-limit per user. Free is tight (5/day) because every
+    // preview is a paid LLM call; paid tiers have headroom for real iteration.
+    const rl = await checkDailyLimit('preview', user.userId, PREVIEW_DAILY_LIMIT[plan]);
+    if (!rl.ok) {
+      return reply.code(429).send({
+        error: 'rate_limited',
+        detail: `Daily preview limit reached for plan "${plan}" (${PREVIEW_DAILY_LIMIT[plan]}/day). Resets in ${Math.ceil(rl.resetIn / 3600)}h.`,
+        plan,
+        limit: PREVIEW_DAILY_LIMIT[plan],
+        resetIn: rl.resetIn,
+      });
+    }
+
+    const choice = pickPreviewModel(plan);
+
    try {
      const { spec, source } = await generateSpec(parsed.data.prompt, {
+        provider: choice.provider,
        apiKey: config.ANTHROPIC_API_KEY,
-        // Preview generates the spec synchronously inside an HTTP request that
-        // sits behind Cloudflare's edge timeout. Haiku 4.5 (~200 tok/s — a full
-        // 8k-token spec in ~40s) is the only model fast enough; Sonnet and Opus
-        // overran the proxy cap, which reached the browser as a CORS error. The
-        // hard 60s timeout guarantees a clean 504 before the proxy gives up.
-        model: 'claude-haiku-4-5-20251001',
-        timeoutMs: 60_000,
+        glmApiKey: config.GLM_API_KEY,
+        model: choice.model,
+        maxTokens: choice.maxTokens,
+        timeoutMs: choice.timeoutMs,
        maxRetries: 0,
      });
      const previewId = await cacheSpec(spec);
      return reply.send({
        previewId,
        source,
+        plan,
+        modelDisplayName: choice.displayName,
+        modelBadge: choice.displayBadge,
+        upgradeHint: plan === 'hobby',
        spec: {
          name: spec.name,
          description: spec.description,
@ -112,6 +141,37 @@ export async function serverRoutes(app: FastifyInstance): Promise<void> {
      templateId,
    } = parsed.data;

+    // ---- Plan enforcement (must happen before any DB write) ----
+    const plan = await getOrgPlan(user.orgId);
+
+    // Daily build rate-limit.
+    const rl = await checkDailyLimit('build', user.userId, BUILD_DAILY_LIMIT[plan]);
+    if (!rl.ok) {
+      return reply.code(429).send({
+        error: 'rate_limited',
+        detail: `Daily build limit reached for plan "${plan}" (${BUILD_DAILY_LIMIT[plan]}/day). Resets in ${Math.ceil(rl.resetIn / 3600)}h.`,
+        plan,
+        limit: BUILD_DAILY_LIMIT[plan],
+        resetIn: rl.resetIn,
+      });
+    }
+
+    // Server-count quota. Counted via SQL (not cached) so race risk is tiny.
+    const [serverCountRow] = await db
+      .select({ count: sql<number>`count(*)::int` })
+      .from(mcpServers)
+      .where(eq(mcpServers.orgId, user.orgId));
+    const existingCount = serverCountRow?.count ?? 0;
+    if (existingCount >= SERVER_LIMITS[plan]) {
+      return reply.code(402).send({
+        error: 'plan_limit_reached',
+        detail: `Plan "${plan}" allows ${SERVER_LIMITS[plan]} server(s); you have ${existingCount}. Upgrade to add more.`,
+        plan,
+        limit: SERVER_LIMITS[plan],
+        current: existingCount,
+      });
+    }
+
    // ---- Template-fork validation ----
    // templateId is user-controlled. To prevent fork_count manipulation + garbage
    // template_id rows, the user MUST have hit POST /v1/templates/:slug/fork,
--- a/apps/generator/src/config.ts
+++ b/apps/generator/src/config.ts
@ -4,13 +4,14 @@ const Env = z.object({
  DATABASE_URL: z.string(),
  REDIS_URL: z.string().default('redis://localhost:6379'),
  ANTHROPIC_API_KEY: z.string().optional(),
+  GLM_API_KEY: z.string().optional(),
  RUNNER_HOST: z.string().default('localhost'),
  RUNNER_PORT_RANGE_START: z.coerce.number().default(4100),
  RUNNER_PORT_RANGE_END: z.coerce.number().default(4999),
  CONTROL_PLANE_URL: z.string().default('http://host.docker.internal:4000'),
  CONTROL_PLANE_PUBLIC_URL: z.string().default('http://localhost:4000'),
  OAUTH_ISSUER: z.string().optional(),
-  MODEL_GENERATE: z.string().default('claude-opus-4-7'),
+  MODEL_GENERATE: z.string().default('glm-4.5'),
  MODEL_FIX: z.string().default('claude-haiku-4-5-20251001'),
 });

--- a/apps/generator/src/lib/claude.ts
+++ b/apps/generator/src/lib/claude.ts
@ -1,12 +1,40 @@
-import { generateSpec as sharedGenerate, type GenerationResult } from '@bmm/llm';
+import { type GenerationResult, generateSpec as sharedGenerate } from '@bmm/llm';
 import { config } from '../config.js';

 export type { GenerationResult };

+/**
+ * Build-worker spec generation (cache-miss path). Runs async in a BullMQ
+ * worker — no proxy timeout.
+ *
+ * Provider order:
+ *  1. GLM (model from `MODEL_GENERATE`) when `GLM_API_KEY` is set — keeps this
+ *     rare path cheap.
+ *  2. Anthropic Sonnet when `ANTHROPIC_API_KEY` is set — also the fallback if
+ *     the GLM call throws, so a temporary outage at one provider doesn't
+ *     break builds.
+ *  3. The mock spec from @bmm/llm, only when NO provider key is configured
+ *     (keeps builds working in dev).
+ *
+ * @param prompt User prompt to turn into a generator spec.
+ * @returns The generated spec and the source that produced it.
+ * @throws The GLM error when GLM fails and no Anthropic key is available to
+ *   fall back to; any error raised by the Anthropic call.
+ *
+ * NOTE(review): this path is not plan-aware — every org's cache-miss build is
+ * sent to GLM first, while the privacy copy added in this change says paid
+ * tiers are processed by Anthropic only. Reconcile one or the other.
+ * NOTE(review): `MODEL_GENERATE` is now passed to GLM only; an environment
+ * that still overrides it with a Claude model name would send that name to
+ * GLM — confirm deployed env files.
+ */
 export async function generateSpec(prompt: string): Promise<GenerationResult> {
+  if (config.GLM_API_KEY) {
+    try {
+      return await sharedGenerate(prompt, {
+        provider: 'glm',
+        glmApiKey: config.GLM_API_KEY,
+        model: config.MODEL_GENERATE,
+        maxTokens: 8192,
+        timeoutMs: 180_000,
+      });
+    } catch (err) {
+      // Nothing to fall back to: surface the real failure instead of dropping
+      // through to the keyless branch below, which would hand the worker a
+      // mock spec as if generation had succeeded.
+      if (!config.ANTHROPIC_API_KEY) throw err;
+      console.warn(
+        '[generator] GLM failed, falling back to Anthropic Sonnet:',
+        err instanceof Error ? err.message : String(err),
+      );
+    }
+  }
+  if (!config.ANTHROPIC_API_KEY) {
+    // Reached only when no provider key is configured at all → @bmm/llm
+    // returns mockSpec, which keeps builds working in dev.
+    return sharedGenerate(prompt, { provider: 'anthropic' });
+  }
   return sharedGenerate(prompt, {
+    provider: 'anthropic',
     apiKey: config.ANTHROPIC_API_KEY,
-    model: config.MODEL_GENERATE,
+    model: 'claude-sonnet-4-6',
     maxTokens: 8192,
   });
 }
--- a/apps/generator/src/worker.ts
+++ b/apps/generator/src/worker.ts
@ -1,13 +1,13 @@
+import { builds, createDb, eq, mcpServers } from '@bmm/db';
+import { GeneratorSpec } from '@bmm/types';
 import { Worker } from 'bullmq';
 import { Redis } from 'ioredis';
-import { GeneratorSpec } from '@bmm/types';
-import { builds, createDb, eq, mcpServers } from '@bmm/db';
 import { config } from './config.js';
-import { generateSpec } from './lib/claude.js';
-import { renderServerCode } from './lib/render.js';
 import { dockerBuild, prepareBuildContext, staticCheck } from './lib/build.js';
+import { generateSpec } from './lib/claude.js';
 import { allocatePort, deployContainer, dockerAvailable, stopContainer } from './lib/deploy.js';
 import { emitDone, emitError, emitLog, emitStatus } from './lib/emit.js';
+import { renderServerCode } from './lib/render.js';

 const db = createDb();
 const connection = new Redis(config.REDIS_URL, { maxRetriesPerRequest: null });
@ -57,12 +57,18 @@ export const worker = new Worker<JobData>(
    const oldContainerId = priorState?.containerId ?? null;

    try {
-      await db.update(builds).set({ status: 'generating', startedAt: new Date() }).where(eq(builds.id, buildId));
-      await db.update(mcpServers).set({ status: 'generating', updatedAt: new Date() }).where(eq(mcpServers.id, serverId));
+      await db
+        .update(builds)
+        .set({ status: 'generating', startedAt: new Date() })
+        .where(eq(builds.id, buildId));
+      await db
+        .update(mcpServers)
+        .set({ status: 'generating', updatedAt: new Date() })
+        .where(eq(mcpServers.id, serverId));
      await emitStatus(buildId, 'generating');

      let spec: GeneratorSpec | null = null;
-      let source: 'claude' | 'mock' | 'cached' = 'mock';
+      let source: 'claude' | 'glm' | 'mock' | 'cached' = 'mock';

      if (previewId) {
        spec = await loadCachedSpec(previewId);
@ -87,7 +93,10 @@ export const worker = new Worker<JobData>(
      let generatedCode: string;
      const prebuilt = previewId ? await loadPrebuiltCode(previewId) : null;
      if (prebuilt) {
-        await log('info', `Using pre-rendered template code (${prebuilt.length} chars) — skipping render`);
+        await log(
+          'info',
+          `Using pre-rendered template code (${prebuilt.length} chars) — skipping render`,
+        );
        generatedCode = prebuilt;
      } else {
        generatedCode = renderServerCode(spec);
@ -98,11 +107,20 @@ export const worker = new Worker<JobData>(
        .where(eq(builds.id, buildId));

      await db.update(builds).set({ status: 'building' }).where(eq(builds.id, buildId));
-      await db.update(mcpServers).set({ status: 'building', toolsSchema: spec.tools, updatedAt: new Date() }).where(eq(mcpServers.id, serverId));
+      await db
+        .update(mcpServers)
+        .set({ status: 'building', toolsSchema: spec.tools, updatedAt: new Date() })
+        .where(eq(mcpServers.id, serverId));
      await emitStatus(buildId, 'building');
      await log('info', 'Preparing build context...');

-      const { contextDir, imageTag } = await prepareBuildContext(serverId, version, slug, generatedCode, spec);
+      const { contextDir, imageTag } = await prepareBuildContext(
+        serverId,
+        version,
+        slug,
+        generatedCode,
+        spec,
+      );
      await log('info', `Build context at ${contextDir}`);

      await log('info', 'Running static checks...');
@ -112,8 +130,14 @@ export const worker = new Worker<JobData>(
      const hasDocker = await dockerAvailable();
      if (!hasDocker) {
        await log('warn', 'Docker not available — skipping build/deploy. Server marked draft.');
-        await db.update(builds).set({ status: 'failed', errorMessage: 'docker_unavailable', finishedAt: new Date() }).where(eq(builds.id, buildId));
-        await db.update(mcpServers).set({ status: 'failed', updatedAt: new Date() }).where(eq(mcpServers.id, serverId));
+        await db
+          .update(builds)
+          .set({ status: 'failed', errorMessage: 'docker_unavailable', finishedAt: new Date() })
+          .where(eq(builds.id, buildId));
+        await db
+          .update(mcpServers)
+          .set({ status: 'failed', updatedAt: new Date() })
+          .where(eq(mcpServers.id, serverId));
        await emitDone(buildId, 'failed', serverId, null);
        return;
      }
@ -125,7 +149,10 @@ export const worker = new Worker<JobData>(
      await log('info', 'Image built.');

      await db.update(builds).set({ status: 'deploying' }).where(eq(builds.id, buildId));
-      await db.update(mcpServers).set({ status: 'deploying', updatedAt: new Date() }).where(eq(mcpServers.id, serverId));
+      await db
+        .update(mcpServers)
+        .set({ status: 'deploying', updatedAt: new Date() })
+        .where(eq(mcpServers.id, serverId));
      await emitStatus(buildId, 'deploying');

      const port = await allocatePort();
@ -140,7 +167,10 @@ export const worker = new Worker<JobData>(
      };

      const handle = await deployContainer({ serverId, slug, hostPort: port, imageTag, envVars });
-      await log('info', `Container ${handle.containerId.slice(0, 12)} running at ${handle.publicUrl}`);
+      await log(
+        'info',
+        `Container ${handle.containerId.slice(0, 12)} running at ${handle.publicUrl}`,
+      );

      await db
        .update(builds)
@ -148,7 +178,12 @@ export const worker = new Worker<JobData>(
        .where(eq(builds.id, buildId));
      await db
        .update(mcpServers)
-        .set({ status: 'live', currentVersion: version, publicUrl: handle.publicUrl, updatedAt: new Date() })
+        .set({
+          status: 'live',
+          currentVersion: version,
+          publicUrl: handle.publicUrl,
+          updatedAt: new Date(),
+        })
        .where(eq(mcpServers.id, serverId));

      // Rolling deploy: the new container is live — now retire the previous one.
--- a/apps/web/app/(dashboard)/servers/new/page.tsx
+++ b/apps/web/app/(dashboard)/servers/new/page.tsx
@ -7,6 +7,7 @@ import { StreamingLogs } from '@/components/streaming-logs';
 import { Button } from '@/components/ui/button';
 import { apiFetch } from '@/lib/api';
 import { Loader2, RotateCcw, X } from 'lucide-react';
+import Link from 'next/link';
 import { useRouter, useSearchParams } from 'next/navigation';
 import { Suspense, useEffect, useState } from 'react';

@ -41,9 +42,15 @@ interface PreviewTool {
  inputSchema: Record<string, unknown>;
 }

+/** Billing plan names as returned by /v1/auth/me and the preview endpoint. */
+type Plan = 'hobby' | 'pro' | 'team' | 'enterprise';
+
 interface PreviewResponse {
  previewId: string;
-  source: 'claude' | 'mock';
+  source: 'claude' | 'glm' | 'mock';
+  plan?: Plan;
+  modelDisplayName?: string;
+  modelBadge?: 'open-tier' | 'claude-haiku' | 'claude-sonnet' | 'claude-opus';
+  upgradeHint?: boolean;
  spec: {
    name: string;
    description?: string;
@ -53,6 +60,13 @@ interface PreviewResponse {
  };
 }

+/**
+ * Model name and rough duration shown on the "Analyzing your prompt…" screen,
+ * i.e. BEFORE the preview response (which carries the authoritative
+ * `modelDisplayName`) has arrived.
+ * NOTE(review): the names repeat the `displayName` values of the preview
+ * model table in @bmm/llm — keep the two in sync.
+ */
+const PREVIEW_MODEL_BY_PLAN: Record<Plan, { name: string; estimate: string }> = {
+  hobby: { name: 'Open-tier AI', estimate: '30–60 seconds' },
+  pro: { name: 'Claude Haiku 4.5', estimate: '10–20 seconds' },
+  team: { name: 'Claude Sonnet 4.6', estimate: '15–40 seconds' },
+  enterprise: { name: 'Claude Sonnet 4.6', estimate: '15–40 seconds' },
+};
+
 interface EditableTool {
  name: string;
  description: string;
@ -86,6 +100,7 @@ function NewServerPageInner() {
  const router = useRouter();
  const [step, setStep] = useState<Step>('prompt');
  const [elapsedSec, setElapsedSec] = useState(0);
+  const [userPlan, setUserPlan] = useState<Plan | null>(null);

  const [prompt, setPrompt] = useState('');
  const [name, setName] = useState('');
@ -207,6 +222,14 @@ function NewServerPageInner() {
    return () => clearInterval(id);
  }, [step]);

+  // Plan determines which model the preview will use — we display its name
+  // *before* the request so the user knows what they're waiting for.
+  useEffect(() => {
+    apiFetch<{ user: { plan?: Plan } }>('/v1/auth/me')
+      .then((r) => setUserPlan(r.user.plan ?? 'hobby'))
+      .catch(() => setUserPlan('hobby'));
+  }, []);
+
  async function analyze() {
    setError(null);
    if (prompt.trim().length < 10) {
@ -358,13 +381,23 @@ function NewServerPageInner() {
      setServerId(res.server.id);
      setStep('building');
    } catch (e) {
-      const detail = (e as { detail?: { error?: string; detail?: unknown } }).detail;
+      const detail = (e as { detail?: { error?: string; detail?: string } }).detail;
      const code = detail?.error;
-      setError(
-        code === 'slug_taken'
-          ? `The slug "${slug}" is already used by one of your servers — change the Slug field above.`
-          : (code ?? (e as Error).message),
-      );
+      if (code === 'slug_taken') {
+        setError(
+          `The slug "${slug}" is already used by one of your servers — change the Slug field above.`,
+        );
+        return;
+      }
+      if (code === 'plan_limit_reached') {
+        setError(`${detail?.detail ?? 'Plan limit reached.'} See /pricing to upgrade.`);
+        return;
+      }
+      if (code === 'rate_limited') {
+        setError(detail?.detail ?? 'Daily build limit reached — try again tomorrow or upgrade.');
+        return;
+      }
+      setError(detail?.detail ?? code ?? (e as Error).message);
    }
  }

@ -457,8 +490,18 @@ function NewServerPageInner() {
          <Loader2 className="mx-auto animate-spin text-[--color-accent]" size={22} />
          <p className="mt-4 text-[13px]">Analyzing your prompt…</p>
          <p className="mt-1 text-[12px] text-[--color-fg-subtle]">
-            Claude is drafting the tool spec. Usually 15–40 seconds.
+            {(userPlan ? PREVIEW_MODEL_BY_PLAN[userPlan] : PREVIEW_MODEL_BY_PLAN.hobby).name} is
+            drafting the tool spec. Usually{' '}
+            {(userPlan ? PREVIEW_MODEL_BY_PLAN[userPlan] : PREVIEW_MODEL_BY_PLAN.hobby).estimate}.
          </p>
+          {userPlan === 'hobby' && (
+            <p className="mt-2 text-[11px] text-[--color-fg-muted]">
+              <Link href="/pricing" className="text-[--color-accent] hover:underline">
+                Upgrade to Pro
+              </Link>{' '}
+              for ~3× faster analysis with Claude Haiku.
+            </p>
+          )}
          <p className="mono mt-3 text-[11px] tabular-nums text-[--color-fg-muted]">
            {elapsedSec}s elapsed
          </p>
@ -524,7 +567,7 @@ function NewServerPageInner() {
                  </button>
                )}
                <span className="mono text-[10.5px] text-[--color-fg-subtle]">
-                  spec via {preview.source}
+                  drafted with {preview.modelDisplayName ?? preview.source}
                </span>
              </div>
            </div>
--- a/apps/web/app/(marketing)/pricing/page.tsx
+++ b/apps/web/app/(marketing)/pricing/page.tsx
@ -14,9 +14,12 @@ const TIERS = [
    price: '€0',
    tag: 'Forever free',
    description: 'For trying things out and shipping single-user tools.',
+    model: 'Open-tier AI',
+    modelDetail: 'Free-tier model · ~30-60s analyze',
    features: [
      '1 MCP server',
      '100,000 tool calls / month',
+      '5 prompt analyses / day',
      'BuildMyMCP subdomain',
      'Community support',
    ],
@ -28,9 +31,12 @@ const TIERS = [
    price: '€49',
    tag: '/ month',
    description: 'For solo founders and small teams shipping production tools.',
+    model: 'Claude Haiku 4.5',
+    modelDetail: 'Anthropic · ~10-20s analyze',
    features: [
      '5 MCP servers',
      '1M tool calls / month',
+      '40 prompt analyses / day',
      'Custom domain',
      'Priority build queue',
      'Email support, 1 business-day SLA',
@ -41,12 +47,15 @@ const TIERS = [
  },
  {
    name: 'Team',
-    price: '€149',
+    price: '€199',
    tag: '/ month',
    description: 'For teams with RBAC, audit, and 99.9% SLA needs.',
+    model: 'Claude Sonnet 4.6',
+    modelDetail: "Anthropic's flagship",
    features: [
      '25 MCP servers',
      '10M tool calls / month',
+      '150 prompt analyses / day',
      'RBAC + extended audit log',
      '99.9% uptime SLA',
      'Shared Slack channel support',
@ -56,9 +65,11 @@ const TIERS = [
  },
  {
    name: 'Enterprise',
-    price: '€499+',
+    price: '€999+',
    tag: '/ month',
    description: 'For organizations bringing their own cloud, SSO and dedicated infra.',
+    model: 'Sonnet + Opus on build',
+    modelDetail: 'EU data-residency option',
    features: [
      'Unlimited servers',
      'BYOC (AWS, GCP, Azure, Hetzner)',
@ -122,6 +133,13 @@ export default function Pricing() {
            <p className="mt-2 text-[12px] leading-relaxed text-[--color-fg-muted]">
              {t.description}
            </p>
+            <div className="mt-3 rounded-md border border-[--color-border] bg-[--color-bg-subtle] px-2.5 py-1.5">
+              <div className="text-[10.5px] uppercase tracking-wider text-[--color-fg-subtle]">
+                AI model
+              </div>
+              <div className="mt-0.5 text-[12.5px] font-medium text-[--color-fg]">{t.model}</div>
+              <div className="text-[10.5px] text-[--color-fg-subtle]">{t.modelDetail}</div>
+            </div>
            <ul className="mt-4 space-y-1.5 text-[12.5px] text-[--color-fg-muted]">
              {t.features.map((f) => (
                <li key={f}>— {f}</li>
--- a/apps/web/app/(marketing)/privacy/page.tsx
+++ b/apps/web/app/(marketing)/privacy/page.tsx
@ -36,11 +36,21 @@ const SECTIONS = [
  {
    h: 'Subprocessors',
    p: [
-      "Anthropic (generation) — only the prompt text you send. Anthropic's data-retention policy applies.",
-      'Hetzner (compute).',
-      'Backblaze (encrypted backups).',
-      'Stripe (billing).',
-      'Cloudflare (DNS + DDoS).',
+      "Anthropic, USA (Claude AI — used for prompt analysis and code generation on Pro / Team / Enterprise tiers). Only the prompt text and resulting spec are sent. Anthropic's data-retention policy applies.",
+      'Zhipu AI, China (GLM model — used for prompt analysis on the free Hobby tier only). Only the prompt text and resulting spec are sent. Upgrade to a paid tier to keep all AI processing within Anthropic (US).',
+      'Hetzner, Germany (compute).',
+      'Backblaze, EU (encrypted backups).',
+      'Stripe, Ireland (billing).',
+      'Cloudflare (DNS + DDoS protection).',
+    ],
+  },
+  {
+    h: 'AI processing per tier',
+    p: [
+      'Hobby (free): prompts are sent to Zhipu AI (GLM, China) for analysis. Choose a paid tier if your prompts contain data that must not leave the EU/US.',
+      'Pro: prompts are sent to Anthropic (Claude Haiku 4.5, USA).',
+      'Team: prompts are sent to Anthropic (Claude Sonnet 4.6, USA).',
+      'Enterprise: Anthropic (Claude Sonnet + Opus, USA) with EU-data-residency opt-in available on request.',
    ],
  },
  {
--- a/apps/web/app/(marketing)/security/page.tsx
+++ b/apps/web/app/(marketing)/security/page.tsx
@ -40,7 +40,11 @@ const PILLARS = [
  },
  {
    title: 'Rate limiting',
-    body: 'Default 100 requests/min/IP per tool, enforced at the Traefik layer before traffic ever reaches your container.',
+    body: 'Default 100 requests/min/IP per tool, enforced at the Traefik layer before traffic ever reaches your container. Daily preview + build caps per tier protect against runaway LLM spend.',
+  },
+  {
+    title: 'AI provider by tier — transparent',
+    body: "Hobby (free) tier uses Zhipu's GLM model (servers in China) for prompt analysis — chosen for cost so we can offer a real free tier. Pro, Team and Enterprise use Anthropic Claude (US). Enterprise can request EU-only data residency. The provider is shown live in the wizard so you always know where your prompt is going.",
  },
 ];

--- a/packages/llm/src/index.ts
+++ b/packages/llm/src/index.ts
@ -45,18 +45,138 @@ const BANNED_PATTERNS = [
  /disregard\s+(the\s+)?(above|previous)/i,
 ];

+// ──────────────────────────────────────────────────────────────────────────
+// Plan-aware model selection
+// ──────────────────────────────────────────────────────────────────────────
+
+/** Billing plan of an organization. */
+export type Plan = 'hobby' | 'pro' | 'team' | 'enterprise';
+/** LLM-priced action a model can be selected for. */
+export type Purpose = 'preview' | 'build';
+/** Upstream LLM vendor: Anthropic (Claude) or Zhipu (GLM). */
+export type Provider = 'anthropic' | 'glm';
+/** Model-family tag exposed to the UI next to the display name. */
+export type DisplayBadge = 'open-tier' | 'claude-haiku' | 'claude-sonnet' | 'claude-opus';
+
+/** Everything needed to run one generation call for a given plan, plus UI labels. */
+export interface ModelChoice {
+  /** Vendor the request is sent to. */
+  provider: Provider;
+  /** Vendor-specific model identifier. */
+  model: string;
+  /** Upper bound on generated tokens for the call. */
+  maxTokens: number;
+  /** Request timeout in milliseconds. */
+  timeoutMs: number;
+  /** User-facing model name shown in the wizard + previews. */
+  displayName: string;
+  /** Model-family tag for the UI. */
+  displayBadge: DisplayBadge;
+}
+
+/**
+ * Preview runs synchronously inside an HTTP request behind Cloudflare's
+ * ~100s edge cap. Each tier's (model + max_tokens + timeout) is bounded to
+ * fit. Hobby uses GLM as the cost lever; paid tiers escalate to Claude — the
+ * visible quality/speed jump *is* the upgrade pitch.
+ *
+ * Measured token rates: glm-4-plus ~58 tok/s (3500 tok ≈ 60s) ·
+ * Claude Haiku 4.5 ~200 tok/s (8192 tok ≈ 41s) · Claude Sonnet 4.6 ~80 tok/s.
+ *
+ * NOTE(review): at the ~80 tok/s quoted above, Sonnet needs roughly 100s to
+ * emit a full 8192-token response, but team/enterprise cap the request at
+ * 60s (≈ 4800 tokens). Long specs on those tiers would hit the timeout —
+ * confirm whether maxTokens or timeoutMs should change for Sonnet.
+ */
+const PREVIEW_MODELS: Record<Plan, ModelChoice> = {
+  hobby: {
+    provider: 'glm',
+    model: 'glm-4-plus',
+    maxTokens: 3500,
+    timeoutMs: 65_000,
+    displayName: 'Open-tier AI',
+    displayBadge: 'open-tier',
+  },
+  pro: {
+    provider: 'anthropic',
+    model: 'claude-haiku-4-5-20251001',
+    maxTokens: 8192,
+    timeoutMs: 60_000,
+    displayName: 'Claude Haiku 4.5',
+    displayBadge: 'claude-haiku',
+  },
+  team: {
+    provider: 'anthropic',
+    model: 'claude-sonnet-4-6',
+    maxTokens: 8192,
+    timeoutMs: 60_000,
+    displayName: 'Claude Sonnet 4.6',
+    displayBadge: 'claude-sonnet',
+  },
+  enterprise: {
+    provider: 'anthropic',
+    model: 'claude-sonnet-4-6',
+    maxTokens: 8192,
+    timeoutMs: 60_000,
+    displayName: 'Claude Sonnet 4.6',
+    displayBadge: 'claude-sonnet',
+  },
+};
+
+/**
+ * Build worker runs async via BullMQ — no proxy timeout. With the 24h preview
+ * cache TTL cache-misses are rare, so GLM as the default keeps that rare path
+ * cheap; Enterprise gets Opus as a premium-quality promise.
+ *
+ * NOTE(review): pro and team map to GLM here, while the privacy copy added in
+ * the same change states that paid tiers are processed by Anthropic only —
+ * reconcile the table or the copy.
+ * NOTE(review): in the hunks visible in this change the generator worker
+ * hardcodes its own GLM → Sonnet order and does not call pickBuildModel —
+ * confirm whether this table is wired in anywhere.
+ */
+const BUILD_MODELS: Record<Plan, ModelChoice> = {
+  hobby: {
+    provider: 'glm',
+    model: 'glm-4.5',
+    maxTokens: 8192,
+    timeoutMs: 180_000,
+    displayName: 'Open-tier AI',
+    displayBadge: 'open-tier',
+  },
+  pro: {
+    provider: 'glm',
+    model: 'glm-4.5',
+    maxTokens: 8192,
+    timeoutMs: 180_000,
+    displayName: 'Open-tier AI',
+    displayBadge: 'open-tier',
+  },
+  team: {
+    provider: 'glm',
+    model: 'glm-4.5',
+    maxTokens: 8192,
+    timeoutMs: 180_000,
+    displayName: 'Open-tier AI',
+    displayBadge: 'open-tier',
+  },
+  enterprise: {
+    provider: 'anthropic',
+    model: 'claude-opus-4-7',
+    maxTokens: 8192,
+    timeoutMs: 600_000,
+    displayName: 'Claude Opus 4.7',
+    displayBadge: 'claude-opus',
+  },
+};
+
+/**
+ * Resolve the model settings used for a synchronous preview on `plan`.
+ * Returns the shared table entry itself, so callers must treat it as read-only.
+ */
+export function pickPreviewModel(plan: Plan): ModelChoice {
+  const { [plan]: previewChoice } = PREVIEW_MODELS;
+  return previewChoice;
+}
+
+/**
+ * Resolve the model settings used for an async build on `plan`.
+ * Returns the shared table entry itself, so callers must treat it as read-only.
+ */
+export function pickBuildModel(plan: Plan): ModelChoice {
+  const { [plan]: buildChoice } = BUILD_MODELS;
+  return buildChoice;
+}
+
+// ──────────────────────────────────────────────────────────────────────────
+// Generation API
+// ──────────────────────────────────────────────────────────────────────────
+
+/** Result of one spec-generation call. */
 export interface GenerationResult {
+  /** The generated (or mock) generator spec. */
   spec: GeneratorSpecT;
-  source: 'claude' | 'mock';
+  /**
+   * What produced `spec`. 'mock' is returned when the selected provider has
+   * no API key; 'claude' / 'glm' presumably identify the provider that
+   * answered — confirm against the provider implementations.
+   */
+  source: 'claude' | 'glm' | 'mock';
 }

+/** Options for generateSpec. Every field is optional. */
 export interface GenerateOptions {
+  /** 'anthropic' (default) or 'glm'. */
+  provider?: Provider;
+  /**
+   * Anthropic API key, used when provider is 'anthropic'. When omitted,
+   * generateSpec returns the mock spec instead of calling the API.
+   */
   apiKey?: string;
+  /**
+   * Zhipu (GLM) API key, used when provider is 'glm'. When omitted,
+   * generateSpec returns the mock spec instead of calling the API.
+   */
+  glmApiKey?: string;
+  /** Model id. Defaults to 'claude-opus-4-7' (anthropic) or 'glm-4-plus' (glm). */
   model?: string;
+  /** Output token cap. Defaults to 8192 (anthropic) or 4096 (glm). */
   maxTokens?: number;
-  /** Per-attempt request timeout in ms. Omit to use the SDK default. */
+  /** Per-attempt request timeout in ms. For Anthropic, omit to use the SDK default. */
   timeoutMs?: number;
-  /** SDK retry count. Omit to use the SDK default. */
+  /** SDK retry count. Anthropic only. */
   maxRetries?: number;
 }

@ -64,9 +184,40 @@ export async function generateSpec(
  prompt: string,
  opts: GenerateOptions = {},
 ): Promise<GenerationResult> {
+  // Anthropic is the provider whenever the caller does not choose one.
+  const provider = opts.provider ?? 'anthropic';
+
+  if (provider === 'glm') {
+    // No GLM key: hand back the mock spec rather than failing.
+    if (!opts.glmApiKey) return { spec: mockSpec(prompt), source: 'mock' };
+    // maxRetries is not forwarded on this path.
+    return generateWithGlm(prompt, {
+      apiKey: opts.glmApiKey,
+      model: opts.model ?? 'glm-4-plus',
+      maxTokens: opts.maxTokens ?? 4096,
+      timeoutMs: opts.timeoutMs,
+    });
+  }
+
+  // Anthropic path: a missing Anthropic key gets the same mock fallback.
  if (!opts.apiKey) {
    return { spec: mockSpec(prompt), source: 'mock' };
  }
+  // Model and token-cap defaults are resolved here so the helper always receives concrete values.
+  return generateWithAnthropic(prompt, {
+    apiKey: opts.apiKey,
+    model: opts.model ?? 'claude-opus-4-7',
+    maxTokens: opts.maxTokens ?? 8192,
+    timeoutMs: opts.timeoutMs,
+    maxRetries: opts.maxRetries,
+  });
+}
+
+/**
+ * Generates a spec through the Anthropic Messages API.
+ *
+ * Sends `SYSTEM_PROMPT` plus the user prompt, joins the text blocks of the
+ * reply, extracts JSON from them and validates it against `GeneratorSpec`.
+ *
+ * @returns The validated spec tagged with `source: 'claude'`.
+ * @throws SpecTimeoutError when the SDK reports `APIConnectionTimeoutError`.
+ * @throws SpecValidationError when the extracted JSON fails validation.
+ */
+async function generateWithAnthropic(
+  prompt: string,
+  opts: {
+    apiKey: string;
+    model: string;
+    maxTokens: number;
+    timeoutMs?: number;
+    maxRetries?: number;
+  },
+): Promise<GenerationResult> {
  const client = new Anthropic({ apiKey: opts.apiKey });
  const requestOptions: { timeout?: number; maxRetries?: number } = {};
+  // The timeout is forwarded only when the caller actually set one.
  if (opts.timeoutMs !== undefined) requestOptions.timeout = opts.timeoutMs;
@ -75,35 +226,81 @@ export async function generateSpec(
  const response = await client.messages
    .create(
      {
-        model: opts.model ?? 'claude-opus-4-7',
-        max_tokens: opts.maxTokens ?? 8192,
+        model: opts.model,
+        max_tokens: opts.maxTokens,
        system: SYSTEM_PROMPT,
        messages: [{ role: 'user', content: prompt }],
      },
      requestOptions,
    )
    .catch((err: unknown) => {
-      // A per-attempt timeout surfaces as APIConnectionTimeoutError once the
-      // SDK exhausts retries. Map it to a typed error so the API layer returns
-      // a clean 504 instead of letting the edge proxy time out headerless.
+      // Translate the SDK's timeout error into SpecTimeoutError; rethrow anything else untouched.
      if (err instanceof Anthropic.APIConnectionTimeoutError) {
        throw new SpecTimeoutError('spec generation exceeded the time budget');
      }
      throw err;
    });
+
+  // Keep only the text blocks of the reply and concatenate them before JSON extraction.
  const text = response.content
    .filter((b): b is { type: 'text'; text: string } => b.type === 'text')
    .map((b) => b.text)
    .join('');
  const json = extractJson(text);
  const parsed = GeneratorSpec.safeParse(json);
-  if (!parsed.success) {
-    throw new SpecValidationError(parsed.error.message);
-  }
+  if (!parsed.success) throw new SpecValidationError(parsed.error.message);
  scanForInjection(parsed.data);
  return { spec: parsed.data, source: 'claude' };
 }

+const GLM_ENDPOINT = 'https://open.bigmodel.cn/api/paas/v4/chat/completions';
+
+/**
+ * Generates a spec by POSTing the system prompt and the user prompt to the GLM
+ * chat-completions endpoint, then validating the first choice's content.
+ *
+ * The abort timer stays armed until the response body has been read, so
+ * `timeoutMs` bounds the whole exchange rather than only the wait for headers.
+ *
+ * @param prompt - User prompt, sent as the `user` message.
+ * @param opts - API key, model id, completion token cap and optional timeout
+ *   in ms (omitted or non-positive means no timeout).
+ * @returns The validated spec tagged with `source: 'glm'`.
+ * @throws SpecTimeoutError when the request or body read is aborted by the timeout.
+ * @throws SpecValidationError when the response carries no content or the
+ *   parsed spec fails `GeneratorSpec` validation.
+ * @throws Error (`glm_api_<status>: …`) when GLM answers with a non-2xx status.
+ */
+async function generateWithGlm(
+  prompt: string,
+  opts: { apiKey: string; model: string; maxTokens: number; timeoutMs?: number },
+): Promise<GenerationResult> {
+  const controller = new AbortController();
+  const timeoutMs = opts.timeoutMs;
+  const timer =
+    timeoutMs !== undefined && timeoutMs > 0
+      ? setTimeout(() => controller.abort(), timeoutMs)
+      : null;
+
+  let data: {
+    choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
+  };
+  try {
+    const res = await fetch(GLM_ENDPOINT, {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${opts.apiKey}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: opts.model,
+        max_tokens: opts.maxTokens,
+        messages: [
+          { role: 'system', content: SYSTEM_PROMPT },
+          { role: 'user', content: prompt },
+        ],
+      }),
+      signal: controller.signal,
+    });
+    if (!res.ok) {
+      // Best-effort read of the error body; an unreadable body must not mask the status.
+      const body = await res.text().catch(() => '');
+      throw new Error(`glm_api_${res.status}: ${body.slice(0, 200)}`);
+    }
+    // Read the body while the timer is still armed: fetch resolves on headers,
+    // so a stalled body would otherwise never time out.
+    data = (await res.json()) as {
+      choices?: Array<{ message?: { content?: string }; finish_reason?: string }>;
+    };
+  } catch (err: unknown) {
+    // Null-safe check: a rejection value is not guaranteed to be an object.
+    const aborted =
+      typeof err === 'object' &&
+      err !== null &&
+      (err as { name?: unknown }).name === 'AbortError';
+    if (aborted) {
+      throw new SpecTimeoutError('glm spec generation exceeded the time budget');
+    }
+    throw err;
+  } finally {
+    if (timer !== null) clearTimeout(timer);
+  }
+
+  const content = data.choices?.[0]?.message?.content;
+  if (!content) throw new SpecValidationError('glm_empty_response');
+  const json = extractJson(content);
+  const parsed = GeneratorSpec.safeParse(json);
+  if (!parsed.success) throw new SpecValidationError(parsed.error.message);
+  scanForInjection(parsed.data);
+  return { spec: parsed.data, source: 'glm' };
+}
+
+/**
+ * Error the generation helpers in this file throw when a model response is
+ * empty or its parsed spec fails `GeneratorSpec` validation.
+ */
 export class SpecValidationError extends Error {
+  // Pins `name` to the class name.
  override readonly name = 'SpecValidationError';
 }
@ -141,7 +338,7 @@ function scanForInjection(spec: GeneratorSpecT): void {
 export function mockSpec(prompt: string): GeneratorSpecT {
  return {
    name: 'Echo MCP',
-    description: `Mock server (no ANTHROPIC_API_KEY). Prompt was: ${prompt.slice(0, 200)}`,
+    description: `Mock server (no LLM key). Prompt was: ${prompt.slice(0, 200)}`,
    tools: [
      {
        name: 'echo',