buildmymcpserver/apps/generator/src/lib/deploy.ts
Marco Sadjadi 9d5386ccba @
fix(security): sovereign-audit hardening pass — RCE, multi-tenant, reliability

Reasoning-based audit fixes (all verified by typecheck, attack paths re-traced):

- build-time RCE: validate spec.dependencies to npm-registry semver only
  (no git/url/file specifiers) + --ignore-scripts in runner Dockerfile.
- container hardening fail-CLOSED: harden unless RUNNER_DISABLE_HARDENING=1,
  no longer gated on a fragile NODE_ENV string compare.
- secret env keys validated (UPPER_SNAKE, reject NODE_*/PATH/LD_*).
- cross-org image-tag collision: qualify tag with serverId.
- /iterate now enforces suspension + daily-build limits like /servers.
- preview SSE: clear keepalive in finally + on client close (timer/FD leak).
- SMS OTP: atomic attempt counter (lt(attempts,MAX) in UPDATE) — brute-force race.
- getSession orders membership by createdAt (deterministic primary org).
- template scopes aggregated from real tool scopes (was hardcoded mcp:read).
- template category filter pushed into WHERE (was applied after LIMIT).
- support admin reply/status: 404 on unknown ticket; status change now audited.
- build worker: queue defaultJobOptions, docker build/run/stop timeouts,
  old-container teardown in finally (no orphan on post-deploy DB failure).
- nginx: HSTS, X-Frame-Options DENY, nosniff, Referrer-Policy.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
@
2026-05-29 20:56:30 +02:00

259 lines
9.1 KiB
TypeScript

import fs from 'node:fs/promises';
import net from 'node:net';
import path from 'node:path';
import { createDb, eq, isNotNull, mcpServers } from '@bmm/db';
import { config } from '../config.js';
/**
* Per-runner TLS via path-routing on mcp.buildmymcpserver.com. When
* MCP_DOMAIN is set, the generator publishes each container at
* https://<MCP_DOMAIN>/<slug>
* and writes a one-line nginx snippet per server into RUNNER_MAP_DIR.
* A host-side systemd inotify watcher combines the snippets into a single
* file that the nginx vhost includes inside its location block, mapping
* the captured slug to its local runner port.
*
* Path-routing (instead of per-subdomain) is the bootstrap-friendly choice:
* mcp.buildmymcpserver.com is covered by Cloudflare's free Universal SSL,
* whereas *.mcp.buildmymcpserver.com would need CF Advanced Cert Manager
* ($10/mo) or a custom Let's-Encrypt wildcard via DNS-01 (free but more
* ops). See scripts/setup-runner-tls.sh for the one-time host setup.
*
* If MCP_DOMAIN is unset, both the URL formatter and the map writer no-op
* and we fall back to the legacy http://host:port URL — zero behaviour
* change without the host-side infra in place.
*/
/**
 * Resolves the nginx map fragment file for a runner slug:
 * `<RUNNER_MAP_DIR>/<slug>.conf`.
 *
 * The slug ends up in a filesystem path that is later written and deleted,
 * so it must not be able to escape RUNNER_MAP_DIR (e.g. `../../etc/x` or an
 * embedded path separator). The resolved file must sit directly inside the
 * map directory; anything else is rejected.
 *
 * @param slug - Runner slug used as the fragment's base file name.
 * @returns The fragment path, built with `path.join` exactly as before for
 *   well-formed slugs.
 * @throws Error `invalid_runner_slug` when the slug would resolve outside
 *   RUNNER_MAP_DIR. Both callers in this file invoke this helper inside a
 *   try/catch, so a rejected slug degrades to "no map entry touched".
 */
function runnerMapPath(slug: string): string {
  const fileName = `${slug}.conf`;
  const mapDir = path.resolve(config.RUNNER_MAP_DIR);
  const resolved = path.resolve(mapDir, fileName);
  // Containment check: the fragment must be a direct child of the map dir.
  if (path.dirname(resolved) !== mapDir) {
    throw new Error(`invalid_runner_slug: ${JSON.stringify(slug)}`);
  }
  return path.join(config.RUNNER_MAP_DIR, fileName);
}
/**
 * Slugs that are safe to embed inside a double-quoted nginx string and to use
 * as a file name: must start with an alphanumeric and contain only
 * alphanumerics, `.`, `_` and `-`. This excludes `"`, `\`, `$`, `;`, `{`, `}`,
 * whitespace/newlines and path separators.
 */
const RUNNER_MAP_SLUG_PATTERN = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

/**
 * Publishes the slug -> port mapping for a runner as a one-line nginx snippet
 * in RUNNER_MAP_DIR. No-op when MCP_DOMAIN is unset.
 *
 * Best-effort by design: this never throws. A slug that is unsafe to embed in
 * nginx config, or a filesystem failure, is logged and skipped so the deploy
 * itself is not failed by the TLS routing layer.
 *
 * @param slug - Runner slug matched against `$bmm_slug` by nginx.
 * @param port - Local host port the runner container is published on.
 */
async function writeRunnerMapEntry(slug: string, port: number): Promise<void> {
  if (!config.MCP_DOMAIN) return;
  // The slug is interpolated verbatim into nginx configuration below. Refuse
  // anything that could close the quoted string or add directives.
  if (!RUNNER_MAP_SLUG_PATTERN.test(slug)) {
    console.warn(
      `[runner-tls] refusing to write map entry for unsafe slug ${JSON.stringify(slug)}`,
    );
    return;
  }
  // nginx snippet — included inside a `location ~` block that captures
  // $bmm_slug. Each runner contributes one line; the systemd watcher
  // concatenates them into /opt/buildmymcpserver/runner-map.combined.
  const line = `if ($bmm_slug = "${slug}") { set $bmm_port ${port}; }\n`;
  try {
    await fs.mkdir(config.RUNNER_MAP_DIR, { recursive: true });
    await fs.writeFile(runnerMapPath(slug), line, 'utf8');
  } catch (err) {
    // Don't fail the deploy if the map dir isn't mounted yet — runner still
    // serves on http://host:port and the user can manually proxy.
    console.warn(`[runner-tls] could not write map entry for ${slug}:`, err);
  }
}
/**
 * Deletes the nginx map fragment for a slug. No-op when MCP_DOMAIN is unset.
 *
 * Idempotent: `force: true` already makes a missing file a success, so any
 * error that reaches the catch is a real failure (permissions, read-only
 * mount, rejected slug). Such a failure leaves a stale slug -> port mapping
 * behind, so it is logged rather than silently dropped. It still never
 * throws, so teardown continues.
 *
 * @param slug - Runner slug whose fragment should be removed.
 */
async function removeRunnerMapEntry(slug: string): Promise<void> {
  if (!config.MCP_DOMAIN) return;
  try {
    await fs.rm(runnerMapPath(slug), { force: true });
  } catch (err) {
    console.warn(`[runner-tls] could not remove map entry for ${slug}:`, err);
  }
}
/**
 * Builds the user-facing URL for a runner.
 *
 * @param slug - Runner slug; used as the path segment in path-routed mode.
 * @param port - Host port; used only by the legacy host:port form.
 * @returns `https://<MCP_DOMAIN>/<slug>` when MCP_DOMAIN is configured,
 *   otherwise `http://<RUNNER_HOST>:<port>`.
 */
export function computePublicUrl(slug: string, port: number): string {
  const domain = config.MCP_DOMAIN;
  return domain ? `https://${domain}/${slug}` : `http://${config.RUNNER_HOST}:${port}`;
}
/**
 * `docker run` flags that confine a tenant container. They are appended to
 * every deployment unless hardening is explicitly switched off (dev/Windows
 * Docker Desktop, which doesn't fully honour --read-only on bind mounts).
 *
 * Rationale: without them a tenant container runs as root with full
 * capabilities on the shared host. Since the LLM static-check is only a regex
 * blacklist (Z2-001), a malicious tenant could then execute arbitrary code on
 * the host. With the flags in place the blast radius is limited to the
 * container itself, which only holds that tenant's own decrypted secrets.
 */
const HARDENING_FLAGS: readonly string[] = [
  // Filesystem and privilege restrictions.
  '--read-only',
  '--cap-drop=ALL',
  '--security-opt=no-new-privileges:true',
  // Resource ceilings.
  '--pids-limit=100',
  '--memory=512m',
  '--memory-swap=512m',
  '--cpus=0.5',
  // The root filesystem is read-only, but runner-template still needs a
  // writable /tmp for build/cache.
  '--tmpfs=/tmp:rw,nosuid,nodev,size=64m',
];
/**
 * Decides whether the container hardening flags are applied.
 *
 * Fail-CLOSED: the answer is `true` unless the environment variable
 * RUNNER_DISABLE_HARDENING is exactly `'1'` (intended for local Windows Docker
 * Desktop, where --read-only conflicts with how volumes bind). The earlier
 * NODE_ENV-based gate was fail-OPEN: a missing or misspelled NODE_ENV silently
 * ran tenant containers as root with full caps on the shared host. (GEN-002)
 *
 * @returns `false` (after logging a warning) only when the opt-out is set.
 */
function shouldHarden(): boolean {
  const optedOut = process.env.RUNNER_DISABLE_HARDENING === '1';
  if (!optedOut) return true;

  console.warn(
    '[deploy] container hardening DISABLED via RUNNER_DISABLE_HARDENING=1 — never set this in production',
  );
  return false;
}
// Upper bounds (in milliseconds) for the docker CLI calls below. `docker run`
// and `docker rm` normally finish within seconds; the caps stop a wedged
// daemon from holding a worker slot indefinitely. (GEN-008)
const DOCKER_RUN_TIMEOUT_MS = 60_000;
const DOCKER_STOP_TIMEOUT_MS = 60_000;
// Module-level database handle, created once when this module is loaded and
// shared by allocatePort() and deployContainer().
const db = createDb();
/**
 * Checks whether a TCP port can currently be bound on the given host by
 * briefly listening on it with a throwaway server.
 *
 * @param port - Port number to test.
 * @param host - Interface to bind on; defaults to loopback.
 * @returns `true` once the probe server has listened and closed again,
 *   `false` if the probe emitted an error (for example, address in use).
 */
async function portFree(port: number, host = '127.0.0.1'): Promise<boolean> {
  return new Promise<boolean>((settle) => {
    const probe = net.createServer();
    probe.once('error', () => {
      settle(false);
    });
    probe.once('listening', () => {
      // Release the port before reporting it as free.
      probe.close(() => settle(true));
    });
    probe.listen(port, host);
  });
}
/**
 * Picks a host port for a new runner container.
 *
 * Walks the configured range [RUNNER_PORT_RANGE_START, RUNNER_PORT_RANGE_END]
 * in ascending order and returns the first port that is neither recorded as a
 * `hostPort` in the mcpServers table nor currently bound on loopback.
 *
 * @returns The first available port in the range.
 * @throws Error `no_free_port` when every port in the range is taken.
 */
export async function allocatePort(): Promise<number> {
  const rows = await db
    .select({ port: mcpServers.hostPort })
    .from(mcpServers)
    .where(isNotNull(mcpServers.hostPort));

  // Ports already assigned to a server row, whether or not they are bound.
  const reserved = new Set<number>();
  for (const row of rows) {
    if (typeof row.port === 'number') reserved.add(row.port);
  }

  let candidate = config.RUNNER_PORT_RANGE_START;
  while (candidate <= config.RUNNER_PORT_RANGE_END) {
    if (!reserved.has(candidate) && (await portFree(candidate))) {
      return candidate;
    }
    candidate += 1;
  }
  throw new Error('no_free_port');
}
/** Result of a successful deployContainer() call. */
export interface DeployHandle {
/** Container id as printed by `docker run -d` (trimmed, at most 64 characters). */
containerId: string;
/** URL produced by computePublicUrl() for this runner. */
publicUrl: string;
/** Host port that is published to the container's port 3000. */
hostPort: number;
}
/** Everything deployContainer() needs to start a runner and record it. */
export interface DeployInput {
/** Id of the mcpServers row that is updated once the container is running. */
serverId: string;
/** Runner slug; used in the container name, the public URL and the nginx map entry. */
slug: string;
/** Host port to publish to the container's port 3000. */
hostPort: number;
/** Image reference passed as the final argument to `docker run`. */
imageTag: string;
/** Environment variables passed to the container, one `-e KEY=value` pair each. */
envVars: Record<string, string>;
}
/**
 * Starts a detached runner container via the Docker CLI, publishes its nginx
 * map entry and marks the server row as `live`.
 *
 * Steps:
 *  1. `docker run -d` with the port mapping, the hardening flags (unless
 *     disabled), the env vars and `--restart=unless-stopped`.
 *  2. Write the nginx map fragment for the slug (best-effort).
 *  3. Persist containerId / hostPort / publicUrl / status on the server row.
 *
 * Only the callback-style child process is wrapped in a Promise. Steps 2 and
 * 3 run in the async function body, so a failure there (e.g. the DB update
 * throwing) rejects the returned promise instead of becoming an unhandled
 * rejection inside an event handler that leaves the caller waiting forever.
 *
 * @param input - Server id, slug, host port, image tag and env vars.
 * @returns The container id, public URL and host port of the new runner.
 * @throws Error `docker_run_timeout` if the CLI does not exit within
 *   DOCKER_RUN_TIMEOUT_MS (the CLI process is killed).
 * @throws Error `docker_run_failed (exit N): ...` on a non-zero exit code.
 * @throws The spawn error if the docker binary cannot be started, or the
 *   database error if the status update fails. In the latter case the
 *   container is already running; cleanup is left to the caller.
 */
export async function deployContainer(input: DeployInput): Promise<DeployHandle> {
  // Docker CLI is portable across linux/mac/win — sufficient for now; future
  // iteration will switch to the engine API via UNIX socket.
  const { spawn } = await import('node:child_process');
  const containerName = `bmm-mcp-${input.slug}-${Date.now().toString(36)}`;
  const args = ['run', '-d', '--name', containerName, '-p', `${input.hostPort}:3000`];
  if (shouldHarden()) {
    args.push(...HARDENING_FLAGS);
  }
  for (const [k, v] of Object.entries(input.envVars)) {
    args.push('-e', `${k}=${v}`);
  }
  args.push('--restart=unless-stopped', input.imageTag);

  // Step 1: run the CLI and capture the container id from stdout.
  const containerId = await new Promise<string>((resolve, reject) => {
    const child = spawn('docker', args, { stdio: ['ignore', 'pipe', 'pipe'] });
    let out = '';
    let err = '';
    // `docker run -d` returns promptly; if it hangs (wedged daemon) don't pin a
    // worker slot forever. (GEN-008) The 'close' event that follows the kill
    // only triggers a no-op settle because the promise is already rejected.
    const timer = setTimeout(() => {
      child.kill('SIGKILL');
      reject(new Error('docker_run_timeout'));
    }, DOCKER_RUN_TIMEOUT_MS);
    child.stdout.on('data', (d) => {
      out += d.toString();
    });
    child.stderr.on('data', (d) => {
      err += d.toString();
    });
    child.on('error', (e) => {
      clearTimeout(timer);
      reject(e);
    });
    child.on('close', (code) => {
      clearTimeout(timer);
      if (code !== 0) {
        reject(new Error(`docker_run_failed (exit ${code}): ${err.trim() || out.trim()}`));
        return;
      }
      resolve(out.trim().slice(0, 64));
    });
  });

  const publicUrl = computePublicUrl(input.slug, input.hostPort);
  // Step 2: drop the nginx map fragment BEFORE persisting publicUrl so the
  // user-visible URL is reachable by the time the wizard polls "live".
  await writeRunnerMapEntry(input.slug, input.hostPort);
  // Step 3: record the running container. Errors propagate to the caller.
  await db
    .update(mcpServers)
    .set({
      containerId,
      hostPort: input.hostPort,
      publicUrl,
      status: 'live',
      updatedAt: new Date(),
    })
    .where(eq(mcpServers.id, input.serverId));
  return { containerId, publicUrl, hostPort: input.hostPort };
}
/**
 * Force-removes a runner container (`docker rm -f`) and, when a slug is
 * given, removes its nginx map fragment first.
 *
 * Never rejects: every outcome is reported through the returned object.
 *
 * @param containerId - Docker container id; values that are empty or shorter
 *   than 4 characters are rejected without calling docker.
 * @param slug - Optional runner slug whose map entry should be removed.
 * @returns `{ ok: true, detail: '' }` on exit code 0. Otherwise `ok: false`
 *   with `detail` set to `invalid_container_id`, `stop_timeout`,
 *   `spawn_failed`, the trimmed stderr output, or `exit <code>`.
 */
export async function stopContainer(
  containerId: string,
  slug?: string,
): Promise<{ ok: boolean; detail: string }> {
  const idLooksValid = Boolean(containerId) && containerId.length >= 4;
  if (!idLooksValid) {
    return { ok: false, detail: 'invalid_container_id' };
  }

  // Take the slug out of the nginx map before the container disappears, so
  // the proxy does not answer 502 for it. Safe to repeat for the same slug.
  if (slug) {
    await removeRunnerMapEntry(slug);
  }

  const { spawn } = await import('node:child_process');
  return await new Promise<{ ok: boolean; detail: string }>((resolve) => {
    const proc = spawn('docker', ['rm', '-f', containerId], {
      stdio: ['ignore', 'pipe', 'pipe'],
    });
    let stderrText = '';

    // Give up on a CLI call that does not return in time.
    const watchdog = setTimeout(() => {
      proc.kill('SIGKILL');
      resolve({ ok: false, detail: 'stop_timeout' });
    }, DOCKER_STOP_TIMEOUT_MS);

    proc.stderr?.on('data', (d: Buffer) => {
      stderrText += d.toString();
    });
    proc.on('error', () => {
      clearTimeout(watchdog);
      resolve({ ok: false, detail: 'spawn_failed' });
    });
    proc.on('close', (code) => {
      clearTimeout(watchdog);
      if (code === 0) {
        resolve({ ok: true, detail: '' });
        return;
      }
      resolve({ ok: false, detail: stderrText.trim() || `exit ${code}` });
    });
  });
}
/**
 * Upper bound for the `docker version` probe. The command talks to the
 * daemon, so without a cap a wedged daemon would block the probe forever.
 */
const DOCKER_PROBE_TIMEOUT_MS = 10 * 1000;

/**
 * Reports whether the Docker CLI is installed and `docker version` succeeds.
 *
 * Never rejects. Resolves `false` when the binary cannot be spawned, when the
 * command exits non-zero, or when it does not finish within
 * DOCKER_PROBE_TIMEOUT_MS (the probe process is then killed).
 *
 * @returns `true` only if `docker version` exits with code 0 in time.
 */
export async function dockerAvailable(): Promise<boolean> {
  const { spawn } = await import('node:child_process');
  return await new Promise<boolean>((resolve) => {
    const child = spawn('docker', ['version'], { stdio: 'ignore' });
    const timer = setTimeout(() => {
      child.kill('SIGKILL');
      resolve(false);
    }, DOCKER_PROBE_TIMEOUT_MS);
    child.on('error', () => {
      clearTimeout(timer);
      resolve(false);
    });
    child.on('close', (code) => {
      clearTimeout(timer);
      // After a timeout this is a no-op: the promise is already resolved.
      resolve(code === 0);
    });
  });
}