buildmymcpserver/apps/generator/src/lib/deploy.ts

import net from 'node:net';
import { createDb, eq, isNotNull, mcpServers } from '@bmm/db';
import { config } from '../config.js';

/**
 * `docker run` flags that lock down a tenant's runner container.
 *
 * `deployContainer` appends the whole list whenever `shouldHarden()` says so.
 * The motivation, per the security audit (Z2-001): the LLM static check is
 * only a regex blacklist, so generated code must be assumed hostile. An
 * unhardened tenant container would run as root with full capabilities on
 * the shared host; with these flags the damage is meant to stay inside the
 * container, which only holds that tenant's own decrypted secrets.
 *
 * Hardening is skipped on dev / Windows Docker Desktop, which does not fully
 * honour `--read-only` on bind mounts.
 */
const HARDENING_FLAGS: readonly string[] = [
  // Root filesystem is mounted read-only.
  '--read-only',
  // Drop every Linux capability.
  '--cap-drop=ALL',
  // Processes cannot gain privileges (e.g. through setuid binaries).
  '--security-opt=no-new-privileges:true',
  // Cap on the number of processes/threads in the container.
  '--pids-limit=100',
  // Memory cap; memory-swap is set to the same value so no extra swap is granted.
  '--memory=512m',
  '--memory-swap=512m',
  // Half a CPU.
  '--cpus=0.5',
  // The runner-template needs a writable /tmp for build/cache output, so give
  // it a small in-memory one since the root filesystem is read-only.
  '--tmpfs=/tmp:rw,nosuid,nodev,size=64m',
];

/**
 * Decides whether the container hardening flags are added to a deployment.
 *
 * @returns `true` only when `NODE_ENV` is `'production'` or `'staging'` AND
 *   `RUNNER_DISABLE_HARDENING` is not exactly `'1'`; `false` otherwise
 *   (including when `NODE_ENV` is unset).
 *
 * The opt-out exists for local development on Windows, where `--read-only`
 * conflicts with how Docker Desktop binds volumes.
 *
 * NOTE(review): the opt-out wins over `NODE_ENV`, so setting
 * `RUNNER_DISABLE_HARDENING=1` on a production host turns hardening off.
 * Confirm that is acceptable given production is expected to always harden.
 */
function shouldHarden(): boolean {
  const { NODE_ENV: mode, RUNNER_DISABLE_HARDENING: optOut } = process.env;
  const hardenedMode = mode === 'production' || mode === 'staging';
  return optOut !== '1' && hardenedMode;
}

// Module-level database handle, created once when this module is imported and
// shared by allocatePort (reads hostPort) and deployContainer (writes the
// deployment result).
const db = createDb();

/**
 * Probes whether a TCP port can currently be bound on the given host.
 *
 * A throwaway server is asked to listen on `host:port`. If it reaches the
 * `listening` state it is closed again and the result is `true`; if it emits
 * `error` first (for example because the address is in use) the result is
 * `false`.
 *
 * @param port TCP port to probe.
 * @param host Interface to bind on; defaults to loopback.
 * @returns Whether the bind succeeded at the time of the probe.
 */
async function portFree(port: number, host = '127.0.0.1'): Promise<boolean> {
  return new Promise<boolean>((settle) => {
    const probe = net.createServer();

    probe.once('error', () => {
      settle(false);
    });

    probe.once('listening', () => {
      // Release the port before reporting it as free.
      probe.close(() => {
        settle(true);
      });
    });

    probe.listen(port, host);
  });
}

/**
 * Picks a host port for a new runner container.
 *
 * Walks `config.RUNNER_PORT_RANGE_START..RUNNER_PORT_RANGE_END` (inclusive)
 * in ascending order and returns the first port that is neither stored as a
 * `hostPort` on any `mcpServers` row nor currently bound on loopback.
 *
 * NOTE(review): nothing is written here, so the returned port is not
 * reserved; two concurrent callers could presumably receive the same port.
 *
 * @returns The first usable port in the configured range.
 * @throws Error with message `no_free_port` when the range is exhausted.
 */
export async function allocatePort(): Promise<number> {
  const rows = await db
    .select({ port: mcpServers.hostPort })
    .from(mcpServers)
    .where(isNotNull(mcpServers.hostPort));

  // Ports already claimed by a server row, whether or not its container is up.
  const claimed = new Set<number>();
  for (const row of rows) {
    if (typeof row.port === 'number') {
      claimed.add(row.port);
    }
  }

  let candidate = config.RUNNER_PORT_RANGE_START;
  while (candidate <= config.RUNNER_PORT_RANGE_END) {
    // Only probe the OS for ports the database does not already know about.
    if (!claimed.has(candidate) && (await portFree(candidate))) {
      return candidate;
    }
    candidate++;
  }

  throw new Error('no_free_port');
}

/** Result of a successful `deployContainer` call. */
export interface DeployHandle {
  /** Container id as printed by `docker run -d` (trimmed, at most 64 characters). */
  containerId: string;
  /** `http://<config.RUNNER_HOST>:<hostPort>` — the URL stored on the server row. */
  publicUrl: string;
  /** Host port that is published to the container's port 3000. */
  hostPort: number;
}

/** Everything `deployContainer` needs to start one runner container. */
export interface DeployInput {
  /** Id of the `mcpServers` row that is updated once the container is running. */
  serverId: string;
  /** Used to build the container name (`bmm-mcp-<slug>-<timestamp>`). */
  slug: string;
  /** Host port to publish; mapped to port 3000 inside the container. */
  hostPort: number;
  /** Image reference passed to `docker run`. */
  imageTag: string;
  /** Environment variables injected into the container, one `-e KEY=VALUE` each. */
  envVars: Record<string, string>;
}

/**
 * Starts a runner container with `docker run -d` and records it on the
 * server's database row.
 *
 * Steps:
 *  1. Build the `docker run` argument list: container name, port mapping
 *     `hostPort:3000`, hardening flags (when `shouldHarden()` is true), one
 *     `-e KEY=VALUE` per entry in `input.envVars`, restart policy, image.
 *  2. Run the Docker CLI and read the container id from its stdout.
 *  3. Update the `mcpServers` row (`containerId`, `hostPort`, `publicUrl`,
 *     `status: 'live'`, `updatedAt`).
 *
 * If step 3 fails, the freshly started container is removed on a best-effort
 * basis and the database error is rethrown, so the caller sees the failure
 * and no untracked container keeps holding the port.
 *
 * NOTE(review): env values travel on the Docker CLI's command line, so they
 * are presumably visible in the host's process listing while `docker run`
 * executes — consider `--env-file` if that matters.
 *
 * @param input Deployment parameters; see `DeployInput`.
 * @returns Handle describing the running container.
 * @throws Error `docker_run_failed (exit N): …` when the CLI exits non-zero,
 *   the spawn error when the CLI cannot be started, or the database error
 *   when recording the deployment fails.
 */
export async function deployContainer(input: DeployInput): Promise<DeployHandle> {
  // Docker CLI is portable across linux/mac/win — sufficient for now; future
  // iteration will switch to the engine API via UNIX socket.
  const { spawn } = await import('node:child_process');
  const containerName = `bmm-mcp-${input.slug}-${Date.now().toString(36)}`;
  const args = [
    'run',
    '-d',
    '--name',
    containerName,
    '-p',
    `${input.hostPort}:3000`,
  ];
  if (shouldHarden()) {
    args.push(...HARDENING_FLAGS);
  }
  for (const [k, v] of Object.entries(input.envVars)) {
    args.push('-e', `${k}=${v}`);
  }
  args.push('--restart=unless-stopped', input.imageTag);

  // Only the CLI invocation lives inside the Promise wrapper. Awaiting the
  // database inside an event listener would turn a failed write into an
  // unhandled rejection and leave the caller waiting forever.
  const containerId = await new Promise<string>((resolve, reject) => {
    const child = spawn('docker', args, { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    child.stdout.on('data', (d: Buffer) => {
      out += d.toString();
    });
    child.stderr.on('data', (d: Buffer) => {
      err += d.toString();
    });
    child.on('error', (e) => reject(e));
    child.on('close', (code) => {
      if (code !== 0) {
        reject(new Error(`docker_run_failed (exit ${code}): ${err.trim() || out.trim()}`));
        return;
      }
      // `docker run -d` prints the container id on stdout.
      resolve(out.trim().slice(0, 64));
    });
  });

  const publicUrl = `http://${config.RUNNER_HOST}:${input.hostPort}`;
  try {
    await db
      .update(mcpServers)
      .set({
        containerId,
        hostPort: input.hostPort,
        publicUrl,
        status: 'live',
        updatedAt: new Date(),
      })
      .where(eq(mcpServers.id, input.serverId));
  } catch (dbError: unknown) {
    // The container is running but nothing references it. Remove it so it does
    // not keep the port (and the injected env) alive; the original error is
    // what the caller needs to see, so cleanup problems are ignored.
    await stopContainer(containerId).catch(() => undefined);
    throw dbError;
  }

  return { containerId, publicUrl, hostPort: input.hostPort };
}

/**
 * Force-removes a container via `docker rm -f <containerId>`.
 *
 * The outcome is reported in the resolved value rather than by throwing, so
 * callers can log it:
 *  - `{ ok: true, detail: '' }` when the CLI exits with code 0;
 *  - `{ ok: false, detail: 'invalid_container_id' }` when the id is empty or
 *    shorter than 4 characters (Docker is not invoked);
 *  - `{ ok: false, detail: 'spawn_failed' }` when the CLI cannot be started;
 *  - `{ ok: false, detail: <stderr or "exit N"> }` for any other exit code.
 *
 * @param containerId Id (or id prefix) of the container to remove.
 */
export async function stopContainer(
  containerId: string,
): Promise<{ ok: boolean; detail: string }> {
  const usableId = containerId && containerId.length >= 4;
  if (!usableId) {
    return { ok: false, detail: 'invalid_container_id' };
  }

  const childProcess = await import('node:child_process');
  return await new Promise<{ ok: boolean; detail: string }>((done) => {
    const proc = childProcess.spawn('docker', ['rm', '-f', containerId], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });

    let stderrText = '';
    proc.stderr?.on('data', (chunk: Buffer) => {
      stderrText += chunk.toString();
    });

    proc.on('error', () => {
      done({ ok: false, detail: 'spawn_failed' });
    });

    proc.on('close', (exitCode) => {
      if (exitCode === 0) {
        done({ ok: true, detail: '' });
        return;
      }
      // Prefer Docker's own message; fall back to the bare exit code.
      done({ ok: false, detail: stderrText.trim() || `exit ${exitCode}` });
    });
  });
}

/**
 * Checks whether the Docker CLI can be used from this process by running
 * `docker version` with all stdio discarded.
 *
 * @returns `true` when the command exits with code 0; `false` when it exits
 *   with any other code or cannot be spawned at all.
 */
export async function dockerAvailable(): Promise<boolean> {
  const childProcess = await import('node:child_process');
  return await new Promise<boolean>((answer) => {
    const probe = childProcess.spawn('docker', ['version'], { stdio: 'ignore' });

    probe.on('error', () => {
      answer(false);
    });

    probe.on('close', (exitCode) => {
      answer(exitCode === 0);
    });
  });
}
feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`import net from 'node:net';`
			`import { createDb, eq, isNotNull, mcpServers } from '@bmm/db';`
			`import { config } from '../config.js';`

security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul Five confirmed findings from the sovereign-audit pass, ordered by severity: Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the real visitor IP via X-Forwarded-For instead of always being the nginx / docker-bridge peer. Every per-IP rate-limit in the codebase was silently collapsed into one global counter; this restores them. Z1-001 CRITICAL — runner container hardening flags (--read-only, --cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100, --memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a TODO despite /security promising them. Now applied unconditionally on production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev. Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened (Function(...) without `new`, process.binding, process.dlopen, .constructor.constructor, _load, vm.runInContext, globalThis['..'], "system prompt override"). scanForInjection now also walks tool.name and every inputSchema property description, not only implementation + description — closes the prompt-injection-into-AI-client surface that downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of the single shared scanForInjection export from @bmm/llm. Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the trustProxy fix above these are now real per-visitor limits. Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in production. In dev it still prints (so devs can click the link); in production we log only "issued, URL withheld" and a loud error if no email sender is wired (Resend integration is the actual launch blocker — left as a TODO). Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin upgrades. 
SameSite=Lax already mitigates in modern browsers; this is the defense-in-depth against browser bugs and non-browser clients. FALSE POSITIVES dismissed: slug path-traversal (schema regex ^[a-z][a-z0-9-]$ in @bmm/types catches it); session-after-promote (getSession re-fetches isAdmin from DB on every request). DEFERRED (not blockers, tracked): - Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS - Z1-003 docker image cleanup cron - Z2-001 v2 — real sandbox runtime (multi-week refactor) - Z3-002 rawBody-per-request memory — branch on webhook path only - Z5-001 multi-user org RBAC for billing — gated on Team feature - Email sender integration (Resend) — launch blocker Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-25 18:02:59 +02:00			`/**`
			`* Container hardening flags applied on every runner deployment on Linux`
			`* production hosts. Skipped only when explicitly disabled (dev/Windows`
			`* Docker Desktop, which doesn't fully honour --read-only on bind mounts).`
			`*`
			`* Without these, a tenant container runs as root with full capabilities on`
			`* the shared host — combined with the LLM static-check being a regex`
			`* blacklist (Z2-001), this would let a malicious tenant execute arbitrary`
			`* code on the host. With them, the blast radius collapses to "within the`
			`* container", which holds only that tenant's own decrypted secrets.`
			`*/`
			`const HARDENING_FLAGS = [`
			`'--read-only',`
			`'--cap-drop=ALL',`
			`'--security-opt=no-new-privileges:true',`
			`'--pids-limit=100',`
			`'--memory=512m',`
			`'--memory-swap=512m',`
			`'--cpus=0.5',`
			`// /tmp needs writable space — runner-template uses it for build/cache.`
			`'--tmpfs=/tmp:rw,nosuid,nodev,size=64m',`
			`];`

			`function shouldHarden(): boolean {`
			`// Explicit opt-out for local dev on Windows where --read-only conflicts`
			`// with how Docker Desktop binds volumes. Production must always harden.`
			`if (process.env.RUNNER_DISABLE_HARDENING === '1') return false;`
			`const env = process.env.NODE_ENV;`
			`return env === 'production' \|\| env === 'staging';`
			`}`

feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`const db = createDb();`

			`async function portFree(port: number, host = '127.0.0.1'): Promise<boolean> {`
			`return new Promise((resolve) => {`
			`const tester = net`
			`.createServer()`
			`.once('error', () => resolve(false))`
			`.once('listening', () => tester.close(() => resolve(true)))`
			`.listen(port, host);`
			`});`
			`}`

			`export async function allocatePort(): Promise<number> {`
			`const used = new Set(`
			`(`
			`await db`
			`.select({ port: mcpServers.hostPort })`
			`.from(mcpServers)`
			`.where(isNotNull(mcpServers.hostPort))`
			`)`
			`.map((r) => r.port)`
			`.filter((p): p is number => typeof p === 'number'),`
			`);`
			`for (let port = config.RUNNER_PORT_RANGE_START; port <= config.RUNNER_PORT_RANGE_END; port++) {`
			`if (used.has(port)) continue;`
			`if (await portFree(port)) return port;`
			`}`
			`throw new Error('no_free_port');`
			`}`

			`export interface DeployHandle {`
			`containerId: string;`
			`publicUrl: string;`
			`hostPort: number;`
			`}`

			`export interface DeployInput {`
			`serverId: string;`
			`slug: string;`
			`hostPort: number;`
			`imageTag: string;`
			`envVars: Record<string, string>;`
			`}`

			`export async function deployContainer(input: DeployInput): Promise<DeployHandle> {`
security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul Five confirmed findings from the sovereign-audit pass, ordered by severity: Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the real visitor IP via X-Forwarded-For instead of always being the nginx / docker-bridge peer. Every per-IP rate-limit in the codebase was silently collapsed into one global counter; this restores them. Z1-001 CRITICAL — runner container hardening flags (--read-only, --cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100, --memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a TODO despite /security promising them. Now applied unconditionally on production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev. Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened (Function(...) without `new`, process.binding, process.dlopen, .constructor.constructor, _load, vm.runInContext, globalThis['..'], "system prompt override"). scanForInjection now also walks tool.name and every inputSchema property description, not only implementation + description — closes the prompt-injection-into-AI-client surface that downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of the single shared scanForInjection export from @bmm/llm. Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the trustProxy fix above these are now real per-visitor limits. Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in production. In dev it still prints (so devs can click the link); in production we log only "issued, URL withheld" and a loud error if no email sender is wired (Resend integration is the actual launch blocker — left as a TODO). Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin upgrades. 
SameSite=Lax already mitigates in modern browsers; this is the defense-in-depth against browser bugs and non-browser clients. FALSE POSITIVES dismissed: slug path-traversal (schema regex ^[a-z][a-z0-9-]$ in @bmm/types catches it); session-after-promote (getSession re-fetches isAdmin from DB on every request). DEFERRED (not blockers, tracked): - Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS - Z1-003 docker image cleanup cron - Z2-001 v2 — real sandbox runtime (multi-week refactor) - Z3-002 rawBody-per-request memory — branch on webhook path only - Z5-001 multi-user org RBAC for billing — gated on Team feature - Email sender integration (Resend) — launch blocker Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-25 18:02:59 +02:00			`// Docker CLI is portable across linux/mac/win — sufficient for now; future`
			`// iteration will switch to the engine API via UNIX socket.`
feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`const { spawn } = await import('node:child_process');`
			const containerName = `bmm-mcp-${input.slug}-${Date.now().toString(36)}`;
			`const args = [`
			`'run',`
			`'-d',`
			`'--name',`
			`containerName,`
			`'-p',`
			`${input.hostPort}:3000`,
			`];`
security: sovereign-audit Phase 2 fixes — trustProxy, Docker hardening, banned-pattern overhaul Five confirmed findings from the sovereign-audit pass, ordered by severity: Z3-001 CRITICAL — Fastify now trustProxy:true so req.ip resolves to the real visitor IP via X-Forwarded-For instead of always being the nginx / docker-bridge peer. Every per-IP rate-limit in the codebase was silently collapsed into one global counter; this restores them. Z1-001 CRITICAL — runner container hardening flags (--read-only, --cap-drop=ALL, --security-opt=no-new-privileges:true, --pids-limit=100, --memory=512m, --cpus=0.5, tmpfs /tmp) were sitting commented-out as a TODO despite /security promising them. Now applied unconditionally on production/staging; opt-out flag RUNNER_DISABLE_HARDENING=1 for Win-dev. Z2-001 + Z2-002 CRITICAL / MEDIUM — banned-pattern blacklist tightened (Function(...) without `new`, process.binding, process.dlopen, .constructor.constructor, _load, vm.runInContext, globalThis['..'], "system prompt override"). scanForInjection now also walks tool.name and every inputSchema property description, not only implementation + description — closes the prompt-injection-into-AI-client surface that downstream clients (Claude Desktop, Cursor) read verbatim. The duplicate BANNED_PATTERNS in apps/api/src/routes/servers.ts deleted in favour of the single shared scanForInjection export from @bmm/llm. Z4-001 HIGH — /v1/auth/magic-link gained the two-axis daily rate-limit the SMS endpoint already had: 10/IP/day + 5/email/day. Combined with the trustProxy fix above these are now real per-visitor limits. Z4-002 MEDIUM — magic-link callback URL no longer printed to stdout in production. In dev it still prints (so devs can click the link); in production we log only "issued, URL withheld" and a loud error if no email sender is wired (Resend integration is the actual launch blocker — left as a TODO). Z6-001 MEDIUM — /v1/builds/:id/stream WebSocket now refuses cross-origin upgrades. 
SameSite=Lax already mitigates in modern browsers; this is the defense-in-depth against browser bugs and non-browser clients. FALSE POSITIVES dismissed: slug path-traversal (schema regex ^[a-z][a-z0-9-]$ in @bmm/types catches it); session-after-promote (getSession re-fetches isAdmin from DB on every request). DEFERRED (not blockers, tracked): - Z1-002 generated-server HTTPS — needs nginx wildcard subdomain TLS - Z1-003 docker image cleanup cron - Z2-001 v2 — real sandbox runtime (multi-week refactor) - Z3-002 rawBody-per-request memory — branch on webhook path only - Z5-001 multi-user org RBAC for billing — gated on Team feature - Email sender integration (Resend) — launch blocker Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> 2026-05-25 18:02:59 +02:00			`if (shouldHarden()) {`
			`args.push(...HARDENING_FLAGS);`
			`}`
feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`for (const [k, v] of Object.entries(input.envVars)) {`
			args.push('-e', `${k}=${v}`);
			`}`
			`args.push('--restart=unless-stopped', input.imageTag);`

			`return await new Promise<DeployHandle>((resolve, reject) => {`
			`const child = spawn('docker', args, { stdio: ['ignore', 'pipe', 'pipe'] });`
			`let out = '';`
			`let err = '';`
			`child.stdout.on('data', (d) => {`
			`out += d.toString();`
			`});`
			`child.stderr.on('data', (d) => {`
			`err += d.toString();`
			`});`
			`child.on('error', (e) => reject(e));`
			`child.on('close', async (code) => {`
			`if (code !== 0) {`
			reject(new Error(`docker_run_failed (exit ${code}): ${err.trim() \|\| out.trim()}`));
			`return;`
			`}`
			`const containerId = out.trim().slice(0, 64);`
			const publicUrl = `http://${config.RUNNER_HOST}:${input.hostPort}`;
			`await db`
			`.update(mcpServers)`
			`.set({`
			`containerId,`
			`hostPort: input.hostPort,`
			`publicUrl,`
			`status: 'live',`
			`updatedAt: new Date(),`
			`})`
			`.where(eq(mcpServers.id, input.serverId));`
			`resolve({ containerId, publicUrl, hostPort: input.hostPort });`
			`});`
			`});`
			`}`

fix(generator): iterate orphaned the previous container — rolling deploy Sovereign-audit follow-up. The audit's finding pass missed this: every Iterate (version > 1) ran allocatePort -> a NEW port and deployContainer -> a NEW container, then pointed the DB row at it — and never stopped the old container. The previous version kept running forever, holding a host port, with the old secrets baked into its env, untracked (its containerId was overwritten in the DB by deployContainer). Same bug class as API-SERVERS-001 but on the iterate path. Fix: the worker captures the server's current containerId before the build mutates the row, and after the new container is confirmed live + the DB updated, it stops the old one. This also makes the 'rolling deploy' the UI promises actually true — the old version stays up until the new one is live, then is retired. deploy.ts stopContainer now returns { ok, detail } (was void) so the worker can log the outcome. Verified: generator typecheck clean. 2026-05-20 20:58:30 +02:00			`export async function stopContainer(`
			`containerId: string,`
			`): Promise<{ ok: boolean; detail: string }> {`
			`if (!containerId \|\| containerId.length < 4) {`
			`return { ok: false, detail: 'invalid_container_id' };`
			`}`
feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`const { spawn } = await import('node:child_process');`
fix(generator): iterate orphaned the previous container — rolling deploy Sovereign-audit follow-up. The audit's finding pass missed this: every Iterate (version > 1) ran allocatePort -> a NEW port and deployContainer -> a NEW container, then pointed the DB row at it — and never stopped the old container. The previous version kept running forever, holding a host port, with the old secrets baked into its env, untracked (its containerId was overwritten in the DB by deployContainer). Same bug class as API-SERVERS-001 but on the iterate path. Fix: the worker captures the server's current containerId before the build mutates the row, and after the new container is confirmed live + the DB updated, it stops the old one. This also makes the 'rolling deploy' the UI promises actually true — the old version stays up until the new one is live, then is retired. deploy.ts stopContainer now returns { ok, detail } (was void) so the worker can log the outcome. Verified: generator typecheck clean. 2026-05-20 20:58:30 +02:00			`return await new Promise<{ ok: boolean; detail: string }>((resolve) => {`
			`const child = spawn('docker', ['rm', '-f', containerId], {`
			`stdio: ['ignore', 'pipe', 'pipe'],`
			`});`
			`let err = '';`
			`child.stderr?.on('data', (d: Buffer) => {`
			`err += d.toString();`
			`});`
			`child.on('error', () => resolve({ ok: false, detail: 'spawn_failed' }));`
			`child.on('close', (code) =>`
			resolve(code === 0 ? { ok: true, detail: '' } : { ok: false, detail: err.trim() \|\| `exit ${code}` }),
			`);`
feat(generator): BullMQ worker (Claude API + spec render + docker build + local deploy) 2026-05-19 00:26:53 +02:00			`});`
			`}`

			`export async function dockerAvailable(): Promise<boolean> {`
			`const { spawn } = await import('node:child_process');`
			`return await new Promise<boolean>((resolve) => {`
			`const child = spawn('docker', ['version'], { stdio: 'ignore' });`
			`child.on('error', () => resolve(false));`
			`child.on('close', (code) => resolve(code === 0));`
			`});`
			`}`