open-design/apps/web/src/media/models.ts

501 lines
17 KiB
TypeScript
Raw Normal View History

/**
* Single source of truth for the media-generation model registry.
*
* Both the frontend (NewProjectPanel model pickers, Settings dialog
* provider list) and the daemon (od media generate dispatcher) consume
* this registry. When you add a model entry here, the picker shows it,
* the daemon can dispatch to it, and the Settings dialog knows which
* API keys are needed.
*
* The model catalogue mirrors the breadth of lobehub's model-bank:
* every image / video model that lobehub natively supports is listed
* here so the user can pick from the same surface area without us
* re-implementing every provider's transport. For provider integrations
* we only ship the two flagship paths today — OpenAI (gpt-image-*) and
* Volcengine Ark (Seedance 2.0) — the rest fall back to a placeholder
* with a clear "no provider integration yet" note. The contract the
* code agent follows is identical regardless.
*
* The daemon imports the JS mirror of this file at
* daemon/media-models.js (kept in sync by review).
*/
import type { AudioKind, MediaAspect } from '../types';
/**
 * Identifier of a media provider.
 *
 * It plays two roles: it is the grouping key for sections in the picker,
 * and it is the key under which API credentials are looked up in
 * `AppConfig.mediaProviders`. Any member added here also needs an entry
 * in {@link MEDIA_PROVIDERS}.
 */
export type MediaProviderId =
  | 'openai'
  | 'volcengine'
  | 'grok'
  | 'hyperframes'
  | 'nanobanana'
  | 'bfl'
  | 'fal'
  | 'replicate'
  | 'google'
  | 'midjourney'
  | 'kling'
  | 'minimax'
  | 'suno'
  | 'udio'
  | 'elevenlabs'
  | 'fishaudio'
  | 'stub';
/**
 * Descriptor for a single media provider as listed in the provider
 * catalogue. Only `id`, `label`, `hint` and `integrated` are mandatory;
 * the remaining fields are optional per-provider settings.
 */
export interface MediaProvider {
  /** Provider key (a {@link MediaProviderId} member). */
  id: MediaProviderId;

  /** Human-readable name used in Settings and in ModelPicker headers. */
  label: string;

  /** Brief, marketing-flavoured secondary label. */
  hint: string;

  /** True when the daemon ships a real integration for this provider. */
  integrated: boolean;

  /** Whether the user has to supply credentials for this provider. */
  credentialsRequired?: boolean;

  /** Whether this provider should be listed under Settings -> Media. */
  settingsVisible?: boolean;

  /** Base URL the daemon uses when no override has been configured. */
  defaultBaseUrl?: string;

  /** Link to documentation explaining how to obtain an API key. */
  docsUrl?: string;

  /** Whether Settings should offer a custom model override field. */
  supportsCustomModel?: boolean;
}
/**
 * Catalogue of every known media provider.
 *
 * The Settings dialog builds its provider sections from this list, and
 * the new-project model picker reads {@link MediaProvider.integrated} to
 * flag cards that will silently fall back to a stub when the user has
 * not configured a key.
 */
export const MEDIA_PROVIDERS: MediaProvider[] = [
  {
    id: 'openai',
    label: 'OpenAI',
    hint: 'gpt-image-2 / dall-e-3',
    integrated: true,
    defaultBaseUrl: 'https://api.openai.com/v1',
    docsUrl: 'https://platform.openai.com/api-keys',
  },
  {
    id: 'volcengine',
    label: 'Volcengine Ark (Doubao)',
    hint: 'Seedance 2.0 / Seedream',
    integrated: true,
    defaultBaseUrl: 'https://ark.cn-beijing.volces.com/api/v3',
    docsUrl: 'https://console.volcengine.com/ark',
  },
  {
    id: 'grok',
    label: 'xAI Grok Imagine',
    hint: 'grok-imagine — image + video with native audio',
    integrated: true,
    defaultBaseUrl: 'https://api.x.ai/v1',
    docsUrl: 'https://docs.x.ai/developers/model-capabilities/video/generation',
  },
  {
    // Flagged as needing no credentials and hidden from Settings.
    id: 'hyperframes',
    label: 'HyperFrames',
    hint: 'Local HTML -> MP4 renderer',
    integrated: true,
    credentialsRequired: false,
    settingsVisible: false,
    docsUrl: 'https://hyperframes.heygen.com',
  },
  {
    // The only entry that opts in to a custom model override field.
    id: 'nanobanana',
    label: 'Nano Banana',
    hint: 'Google official by default; custom gateway configurable',
    integrated: true,
    defaultBaseUrl: 'https://generativelanguage.googleapis.com',
    docsUrl: 'https://ai.google.dev/gemini-api/docs/api-key',
    supportsCustomModel: true,
  },
  {
    id: 'bfl',
    label: 'Black Forest Labs',
    hint: 'FLUX 1.1 Pro / FLUX Pro / Dev',
    integrated: false,
    defaultBaseUrl: 'https://api.bfl.ai',
    docsUrl: 'https://docs.bfl.ai/quick_start/create_account',
  },
  {
    id: 'fal',
    label: 'Fal.ai',
    hint: 'Sora / Seedance / Veo / FLUX',
    integrated: false,
    defaultBaseUrl: 'https://fal.run',
    docsUrl: 'https://fal.ai/dashboard/keys',
  },
  {
    id: 'replicate',
    label: 'Replicate',
    hint: 'FLUX / SDXL / Ideogram',
    integrated: false,
    defaultBaseUrl: 'https://api.replicate.com/v1',
    docsUrl: 'https://replicate.com/account/api-tokens',
  },
  {
    id: 'google',
    label: 'Google AI / Vertex',
    hint: 'Imagen 4 / Veo 3 / Lyria',
    integrated: false,
    docsUrl: 'https://ai.google.dev/gemini-api/docs/api-key',
  },
  {
    id: 'kling',
    label: 'Kuaishou Kling',
    hint: 'Kling 1.6 / 2.0 video',
    integrated: false,
    docsUrl: 'https://klingai.com/dev-center',
  },
  {
    id: 'midjourney',
    label: 'Midjourney (proxy)',
    hint: 'midjourney-v7',
    integrated: false,
  },
  {
    id: 'minimax',
    label: 'MiniMax',
    hint: 'TTS / video-01',
    integrated: true,
    defaultBaseUrl: 'https://api.minimaxi.chat/v1',
    docsUrl: 'https://platform.minimaxi.com',
  },
  {
    id: 'suno',
    label: 'Suno',
    hint: 'Music generation',
    integrated: false,
  },
  {
    id: 'udio',
    label: 'Udio',
    hint: 'Music generation',
    integrated: false,
  },
  {
    id: 'elevenlabs',
    label: 'ElevenLabs',
    hint: 'Voice / SFX',
    integrated: false,
    docsUrl: 'https://elevenlabs.io/app/settings/api-keys',
  },
  {
    id: 'fishaudio',
    label: 'FishAudio',
    hint: 'Speech / voice clone',
    integrated: true,
    defaultBaseUrl: 'https://api.fish.audio',
    docsUrl: 'https://fish.audio',
  },
  {
    id: 'stub',
    label: 'Stub (placeholder)',
    hint: 'Deterministic local placeholder bytes',
    integrated: true,
  },
];
/**
 * One selectable model in the registry, tied to the provider it is
 * dispatched through.
 */
export interface MediaModel {
  /** Stable identifier stored in metadata.imageModel / videoModel / audioModel. */
  id: string;

  /** Compact name displayed in pickers. */
  label: string;

  /** Vendor / context note rendered beneath the label. */
  hint: string;

  /** Provider through which this model is dispatched. */
  provider: MediaProviderId;

  /**
   * Capability tags the agent may rely on while planning; the dispatcher
   * uses them downstream to decide which provider call to make.
   */
  caps?: string[];

  /** Flags the card that is checked by default for its picker surface. */
  default?: boolean;
}
/**
 * Registry of image generation models, grouped by provider in catalogue
 * order. The breadth follows `packages/model-bank/src/aiModels/openai.ts`
 * and its sibling files in lobehub. Exactly one entry carries
 * `default: true`.
 */
export const IMAGE_MODELS: MediaModel[] = [
  // --- OpenAI (fully integrated path) ---
  {
    id: 'gpt-image-2',
    label: 'gpt-image-2',
    hint: 'OpenAI · 4K, native multimodal',
    provider: 'openai',
    caps: ['t2i', 'i2i', 'inpaint'],
    default: true,
  },
  {
    id: 'gpt-image-1.5',
    label: 'gpt-image-1.5',
    hint: 'OpenAI · 4× faster than gpt-image-1',
    provider: 'openai',
    caps: ['t2i', 'i2i', 'inpaint'],
  },
  {
    id: 'gpt-image-1',
    label: 'gpt-image-1',
    hint: 'OpenAI · ChatGPT native',
    provider: 'openai',
    caps: ['t2i', 'i2i', 'inpaint'],
  },
  {
    id: 'gpt-image-1-mini',
    label: 'gpt-image-1-mini',
    hint: 'OpenAI · low-cost variant',
    provider: 'openai',
    caps: ['t2i', 'i2i'],
  },
  {
    id: 'dall-e-3',
    label: 'dall-e-3',
    hint: 'OpenAI · classic',
    provider: 'openai',
    caps: ['t2i'],
  },
  {
    id: 'dall-e-2',
    label: 'dall-e-2',
    hint: 'OpenAI · legacy',
    provider: 'openai',
    caps: ['t2i'],
  },

  // --- Volcengine (Doubao Seedream image generation) ---
  {
    id: 'doubao-seedream-3-0-t2i-250415',
    label: 'seedream-3.0',
    hint: 'ByteDance · Doubao image',
    provider: 'volcengine',
    caps: ['t2i'],
  },
  {
    id: 'doubao-seededit-3-0-i2i-250628',
    label: 'seededit-3.0',
    hint: 'ByteDance · image edit',
    provider: 'volcengine',
    caps: ['i2i'],
  },

  // --- xAI Grok Imagine (text-to-image; 1k/2k, 11+ aspect ratios) ---
  {
    id: 'grok-imagine-image',
    label: 'grok-imagine-image',
    hint: 'xAI · 2K text-to-image',
    provider: 'grok',
    caps: ['t2i'],
  },

  // --- Nano Banana (Google-compatible generateContent image path) ---
  {
    id: 'gemini-3.1-flash-image-preview',
    label: 'nano-banana-2',
    hint: 'Nano Banana · text-to-image',
    provider: 'nanobanana',
    caps: ['t2i'],
  },

  // --- Black Forest Labs FLUX family ---
  {
    id: 'flux-1.1-pro',
    label: 'flux-1.1-pro',
    hint: 'BFL · flagship',
    provider: 'bfl',
    caps: ['t2i', 'i2i'],
  },
  {
    id: 'flux-pro',
    label: 'flux-pro',
    hint: 'BFL',
    provider: 'bfl',
    caps: ['t2i'],
  },
  {
    id: 'flux-dev',
    label: 'flux-dev',
    hint: 'BFL · open weights',
    provider: 'bfl',
    caps: ['t2i'],
  },
  {
    id: 'flux-schnell',
    label: 'flux-schnell',
    hint: 'BFL · fast',
    provider: 'bfl',
    caps: ['t2i'],
  },
  {
    id: 'flux-kontext-pro',
    label: 'flux-kontext-pro',
    hint: 'BFL · in-context edits',
    provider: 'bfl',
    caps: ['t2i', 'i2i'],
  },

  // --- Google ---
  {
    id: 'imagen-4',
    label: 'imagen-4',
    hint: 'Google · latest',
    provider: 'google',
    caps: ['t2i'],
  },
  {
    id: 'imagen-3',
    label: 'imagen-3',
    hint: 'Google',
    provider: 'google',
    caps: ['t2i'],
  },
  {
    id: 'gemini-3-pro-image-preview',
    label: 'gemini-3-pro-image',
    hint: 'Google · Nano Banana Pro',
    provider: 'google',
    caps: ['t2i', 'i2i'],
  },

  // --- Image models hosted on Replicate / Fal ---
  {
    id: 'ideogram-v2',
    label: 'ideogram-v2',
    hint: 'Replicate · typography',
    provider: 'replicate',
    caps: ['t2i'],
  },
  {
    id: 'sdxl',
    label: 'stable-diffusion-xl',
    hint: 'Replicate · SDXL',
    provider: 'replicate',
    caps: ['t2i'],
  },
  {
    id: 'sd-3.5',
    label: 'stable-diffusion-3.5',
    hint: 'Fal · SD 3.5',
    provider: 'fal',
    caps: ['t2i'],
  },

  // --- Midjourney (via community proxies) ---
  {
    id: 'midjourney-v7',
    label: 'midjourney-v7',
    hint: 'Midjourney · via proxy',
    provider: 'midjourney',
    caps: ['t2i'],
  },
];
/**
 * Registry of video generation models, grouped by provider in catalogue
 * order. Follows lobehub's volcengine.ts (Seedance, Seedance Lite),
 * kling.ts and related files. Exactly one entry carries `default: true`.
 */
export const VIDEO_MODELS: MediaModel[] = [
  // --- Volcengine Seedance (2.0 is the integrated path) ---
  {
    id: 'doubao-seedance-2-0-260128',
    label: 'seedance-2.0',
    hint: 'ByteDance · t2v + i2v + audio',
    provider: 'volcengine',
    caps: ['t2v', 'i2v', 'audio'],
    default: true,
  },
  {
    id: 'doubao-seedance-2-0-fast-260128',
    label: 'seedance-2.0-fast',
    hint: 'ByteDance · faster, cheaper',
    provider: 'volcengine',
    caps: ['t2v', 'i2v', 'audio'],
  },
  {
    id: 'doubao-seedance-1-0-pro-250528',
    label: 'seedance-1.0-pro',
    hint: 'ByteDance · 1.0',
    provider: 'volcengine',
    caps: ['t2v', 'i2v'],
  },
  {
    id: 'doubao-seedance-1-0-lite-i2v-250428',
    label: 'seedance-1.0-lite-i2v',
    hint: 'ByteDance · image-to-video',
    provider: 'volcengine',
    caps: ['i2v'],
  },
  {
    id: 'doubao-seedance-1-0-lite-t2v-250428',
    label: 'seedance-1.0-lite-t2v',
    hint: 'ByteDance · text-to-video',
    provider: 'volcengine',
    caps: ['t2v'],
  },

  // --- xAI Grok Imagine (720p t2v + i2v with natively generated audio) ---
  {
    id: 'grok-imagine-video',
    label: 'grok-imagine-video',
    hint: 'xAI · 720p t2v + i2v + native audio',
    provider: 'grok',
    caps: ['t2v', 'i2v', 'audio'],
  },

  // --- Kuaishou Kling ---
  {
    id: 'kling-2.0',
    label: 'kling-2.0',
    hint: 'Kuaishou · latest',
    provider: 'kling',
    caps: ['t2v', 'i2v'],
  },
  {
    id: 'kling-1.6',
    label: 'kling-1.6',
    hint: 'Kuaishou',
    provider: 'kling',
    caps: ['t2v', 'i2v'],
  },
  {
    id: 'kling-1.5',
    label: 'kling-1.5',
    hint: 'Kuaishou',
    provider: 'kling',
    caps: ['t2v', 'i2v'],
  },

  // --- Google Veo ---
  {
    id: 'veo-3',
    label: 'veo-3',
    hint: 'Google · sound-on',
    provider: 'google',
    caps: ['t2v', 'audio'],
  },
  {
    id: 'veo-2',
    label: 'veo-2',
    hint: 'Google',
    provider: 'google',
    caps: ['t2v'],
  },

  // --- OpenAI Sora (hosted via Fal today) ---
  {
    id: 'sora-2',
    label: 'sora-2',
    hint: 'OpenAI · via Fal',
    provider: 'fal',
    caps: ['t2v'],
  },
  {
    id: 'sora-2-pro',
    label: 'sora-2-pro',
    hint: 'OpenAI · via Fal',
    provider: 'fal',
    caps: ['t2v'],
  },

  // --- MiniMax video ---
  {
    id: 'minimax-video-01',
    label: 'video-01',
    hint: 'MiniMax · Hailuo',
    provider: 'minimax',
    caps: ['t2v', 'i2v'],
  },

  // --- HyperFrames ---
  {
    id: 'hyperframes-html',
    label: 'hyperframes-html',
    hint: 'HyperFrames · local HTML renderer',
    provider: 'hyperframes',
    caps: ['t2v'],
  },
];
/**
 * Audio generation models keyed by audio kind (`music`, `speech`, `sfx`).
 * Each kind's list flags exactly one entry with `default: true`.
 */
export const AUDIO_MODELS_BY_KIND: Record<AudioKind, MediaModel[]> = {
  music: [
    {
      id: 'suno-v5',
      label: 'suno-v5',
      hint: 'Suno · default',
      provider: 'suno',
      caps: ['music'],
      default: true,
    },
    {
      id: 'suno-v4-5',
      label: 'suno-v4.5',
      hint: 'Suno',
      provider: 'suno',
      caps: ['music'],
    },
    {
      id: 'udio-v2',
      label: 'udio-v2',
      hint: 'Udio',
      provider: 'udio',
      caps: ['music'],
    },
    {
      id: 'lyria-2',
      label: 'lyria-2',
      hint: 'Google',
      provider: 'google',
      caps: ['music'],
    },
  ],

  speech: [
    {
      id: 'gpt-4o-mini-tts',
      label: 'gpt-4o-mini-tts',
      hint: 'OpenAI · expressive TTS',
      provider: 'openai',
      caps: ['tts'],
    },
    {
      // The speech default is the second entry, not the first.
      id: 'minimax-tts',
      label: 'minimax-tts',
      hint: 'MiniMax · default',
      provider: 'minimax',
      caps: ['tts'],
      default: true,
    },
    {
      id: 'fish-speech-2',
      label: 'fish-speech-2',
      hint: 'FishAudio',
      provider: 'fishaudio',
      caps: ['tts', 'voice-clone'],
    },
    {
      id: 'elevenlabs-v3',
      label: 'elevenlabs-v3',
      hint: 'ElevenLabs',
      provider: 'elevenlabs',
      caps: ['tts', 'voice-clone'],
    },
    {
      id: 'doubao-tts',
      label: 'doubao-tts',
      hint: 'Volcengine · TTS',
      provider: 'volcengine',
      caps: ['tts'],
    },
  ],

  sfx: [
    {
      id: 'elevenlabs-sfx',
      label: 'elevenlabs-sfx',
      hint: 'ElevenLabs SFX',
      provider: 'elevenlabs',
      caps: ['sfx'],
      default: true,
    },
    {
      id: 'audiocraft',
      label: 'audiocraft',
      hint: 'Meta · open',
      provider: 'replicate',
      caps: ['sfx', 'music'],
    },
  ],
};
/** Aspect-ratio choices exposed by the registry. */
export const MEDIA_ASPECTS: MediaAspect[] = [
  '1:1',
  '16:9',
  '9:16',
  '4:3',
  '3:4',
];

/** Video length choices, in seconds. */
export const VIDEO_LENGTHS_SEC: number[] = [
  3, 5, 8, 10, 15, 30,
];

/** Audio duration choices, in seconds. */
export const AUDIO_DURATIONS_SEC: number[] = [
  5, 10, 15, 30, 60, 120,
];
/**
 * ID of the first entry flagged `default` in `models`, or the ID of the
 * first entry when none is flagged. The list must be non-empty.
 */
function pickDefaultModelId(models: MediaModel[]): string {
  return models.find((m) => m.default)?.id ?? models[0]!.id;
}

/** Default image model ID, resolved once at module load. */
export const DEFAULT_IMAGE_MODEL = pickDefaultModelId(IMAGE_MODELS);

/** Default video model ID, resolved once at module load. */
export const DEFAULT_VIDEO_MODEL = pickDefaultModelId(VIDEO_MODELS);

/** Default audio model ID for each audio kind, resolved once at module load. */
export const DEFAULT_AUDIO_MODEL: Record<AudioKind, string> = {
  music: pickDefaultModelId(AUDIO_MODELS_BY_KIND.music),
  speech: pickDefaultModelId(AUDIO_MODELS_BY_KIND.speech),
  sfx: pickDefaultModelId(AUDIO_MODELS_BY_KIND.sfx),
};
/**
 * Look up a model record by ID across every surface: image, video, and
 * all audio kinds present in `AUDIO_MODELS_BY_KIND`.
 *
 * Catalogues are searched in the order image, video, then the audio
 * kinds in the key order of `AUDIO_MODELS_BY_KIND`; the first match wins.
 * Audio kinds are enumerated rather than named one by one, so a kind
 * added to the registry later is searched without touching this function.
 *
 * @param id Model ID to look for.
 * @returns The matching record, or `null` when the ID is unknown. Per the
 *   registry's design note, the dispatcher is expected to reject a `null`
 *   result with a clear error so the agent re-plans instead of silently
 *   falling back.
 */
export function findMediaModel(id: string): MediaModel | null {
  const catalogues: MediaModel[][] = [
    IMAGE_MODELS,
    VIDEO_MODELS,
    ...Object.values(AUDIO_MODELS_BY_KIND),
  ];
  // Search list by list rather than concatenating every model into one
  // throwaway array on each call.
  for (const catalogue of catalogues) {
    const match = catalogue.find((m) => m.id === id);
    if (match) return match;
  }
  return null;
}
/**
 * Resolve a provider descriptor from its identifier.
 *
 * @param id Provider identifier to look up in `MEDIA_PROVIDERS`.
 * @returns The first catalogue entry with that `id`, or `null` when the
 *   catalogue has no such entry.
 */
export function findProvider(id: MediaProviderId): MediaProvider | null {
  for (const candidate of MEDIA_PROVIDERS) {
    if (candidate.id === id) {
      return candidate;
    }
  }
  return null;
}
/**
 * Collect the IDs of every registered model, grouped by surface (image,
 * video, and audio split by kind). Used for prompt-side disclosure.
 *
 * @returns Fresh arrays of IDs in catalogue order.
 */
export function modelIdsBySurface(): {
  image: string[];
  video: string[];
  audio: { music: string[]; speech: string[]; sfx: string[] };
} {
  // Project a catalogue down to its model IDs.
  const idsOf = (catalogue: MediaModel[]): string[] => catalogue.map((entry) => entry.id);
  const { music, speech, sfx } = AUDIO_MODELS_BY_KIND;

  const image = idsOf(IMAGE_MODELS);
  const video = idsOf(VIDEO_MODELS);
  const audio = {
    music: idsOf(music),
    speech: idsOf(speech),
    sfx: idsOf(sfx),
  };
  return { image, video, audio };
}
/**
 * Bucket a flat list of {@link MediaModel} by provider, keeping both the
 * providers and the models within each bucket in first-seen catalogue
 * order. The picker uses the result to render section headers.
 *
 * @param models Flat model list to group.
 * @returns One `{ provider, models }` entry per distinct provider. A
 *   provider that `findProvider` cannot resolve is left out, together
 *   with its models.
 */
export function groupByProvider(models: MediaModel[]): Array<{
  provider: MediaProvider;
  models: MediaModel[];
}> {
  // A Map iterates in insertion order, so it records the first-seen
  // provider order on its own.
  const buckets = new Map<MediaProviderId, MediaModel[]>();
  for (const model of models) {
    const bucket = buckets.get(model.provider);
    if (bucket) {
      bucket.push(model);
    } else {
      buckets.set(model.provider, [model]);
    }
  }

  const groups: Array<{ provider: MediaProvider; models: MediaModel[] }> = [];
  for (const [providerId, members] of buckets) {
    const provider = findProvider(providerId);
    if (provider) {
      groups.push({ provider, models: members });
    }
  }
  return groups;
}