mirror of
https://github.com/nexu-io/open-design.git
synced 2026-05-31 19:04:39 +07:00
* feat(daemon): implement fal.ai renderer for image + video generation Adds renderFalImage and renderFalVideo backed by the fal queue API (queue.fal.run). Any fal-ai/* model path can be used directly without a catalog entry, enabling the full fal model library without code changes. Catalogued shortcuts are mapped via FAL_ENDPOINTS to their fal-ai/* paths; OD_FAL_MAX_POLL_MS controls the poll ceiling. Expands the fal model catalog with flux-pro-ultra, flux-dev-fal, flux-schnell-fal, ideogram-v3-fal, recraft-v3-fal (images) and veo-3-fal, veo-2-fal, wan-2.1-t2v, wan-2.1-i2v, seedance-1-pro-fal, kling-2.1-t2v-fal (video). Marks fal provider as integrated: true in both daemon and web model registries. * fix(daemon): address fal renderer review comments - Correct Wan 2.1 endpoints: wan-video/v2.1/* → fal-ai/wan-t2v / fal-ai/wan-i2v - Correct Kling 2.1 t2v endpoint: .../pro/... → .../master/text-to-video - Add FAL_IMAGE_USES_ASPECT_RATIO: flux-pro-ultra sends aspect_ratio not image_size - Add FAL_VIDEO_NO_DURATION: Wan models reject the duration field - Add FAL_VIDEO_STRING_DURATION: Veo expects duration as "5s" not 5 - Fix falQueueBase() to use anchored regex replace, avoiding mangled custom base URLs - Do not wrap payload under input — raw fal queue HTTP API expects flat body; the input wrapper is an SDK abstraction only (confirmed by 422 validation error from fal showing prompt missing at body.prompt) * fix(daemon): correct fal queue protocol comment (flat body, no SDK input wrapper) * fix(daemon): clamp Veo duration to valid fal buckets (4s/6s/8s) * fix(daemon): report effective fal Veo duration in providerNote (with snap warning) * fix(daemon): reduce image generation latency from 4m37s to ~73s Five layered fixes targeting the overhead that padded a ~10s fal API call into a 4m37s user-facing wait: 1. Skip DISCOVERY_AND_PHILOSOPHY for media surfaces (image/video/audio). The ~3000-token HTML-artifact discovery layer is irrelevant for media generation and forced the agent to parse and override all its rules before dispatching. Removes it from the system prompt entirely for these surfaces; MEDIA_GENERATION_CONTRACT is the sole authority. 2. Broaden the wait-loop contract to cover ALL slow models, not just "Volcengine i2v / hyperframes-html". Any model whose generation exceeds 25s — including fal flux-pro-ultra, Veo, Sora — returns exit 2 from od media generate. The contract now makes this universal and provides a python3-based bash pattern (jq is not guaranteed to be installed on all agent runtimes). 3. Increase od media wait polling budget from 25s to 120s. od media generate keeps its 25s budget for fast feedback; od media wait is purpose-built to sit and poll, so it can safely use the full 2-minute bash-tool window. Reduces re-entries for a 3-minute generation from ~7 to ~2. 4. First fal poll is now immediate instead of always sleeping 3s before the first status check. Saves 3s for all fal jobs. 5. Project metadata no longer emits "(unknown — ask)" for imageModel and aspectRatio when unset. Emits the actual defaults (gpt-image-2, aspect-ratio scene heuristic) so the agent can dispatch without extended reasoning about model selection. Also adds dispatch-immediately defaults and a brief-reply rule (2–3 sentences max after generation). Measured end-to-end on the exact problem prompt before/after: Before: 4m37s (discovery form + 7x LLM re-entries + jq failure) After: ~73s (single bash loop, no question turn, image delivered) * feat(daemon): inject media dispatch hint for non-media project surfaces Agents running inside prototype, deck, and other non-image/video/audio projects previously had no knowledge of `od media generate`, so when asked to create an image with fal they would try to call provider REST APIs directly and ask the user for API keys — even though the daemon already holds credentials in .od/media-config.json. Add MEDIA_DISPATCH_HINT to composeSystemPrompt for all non-media surfaces. The hint tells the agent to always route media generation through the daemon dispatcher, and explicitly forbids prompting for API keys. Verified end-to-end: a prototype project generates a 952 KB image via flux-pro-ultra in ~52s with no key errors. * fix(daemon): prevent agent from converting bash env vars to PowerShell syntax MEDIA_DISPATCH_HINT now explicitly labels the shell as POSIX bash and shows the correct $VAR form side-by-side with a warning NOT to use PowerShell $env:VAR. Without this, claude-sonnet running on a Windows host converts the example to PowerShell syntax (`& $env:OD_NODE_BIN`) which then fails at the bash executor with 'syntax error near unexpected token &'. * fix(daemon): add generate→wait loop to MEDIA_DISPATCH_HINT for slow models MEDIA_DISPATCH_HINT previously showed only a bare call. flux-pro-ultra and other slow models always exit 2 after ~25s — without the wait loop the agent would treat exit 2 as a failure and report an error to the user. Replace the single-command example with the canonical generate→wait loop (matching media-contract.ts), add an explicit note that exit 2 means 'keep polling', and reinforce the POSIX bash / no-PowerShell rule directly inside the code block. * fix(daemon): allow fal-ai/* passthrough in media-agent contract The media-agent prompt instructed the agent to warn and substitute the default model for any ID not in the catalogue. This blocked the custom fal-ai/* passthrough path the daemon already supports, so users could not reach uncatalogued fal models from the normal chat flow. Carve out the fal-ai/* exception so the agent passes those IDs through directly instead of warning or substituting. * fix(daemon): align MEDIA_DISPATCH_HINT with exit-0 generate contract media generate now always exits 0 (handoff included). The non-media agent hint still checked ec==2 to decide whether to keep polling, so slow fal models (flux-pro-ultra, veo-3-fal) would stop after printing the handoff JSON instead of entering the wait loop. - generate error check: drop the ec!=2 exception (exits 0 always) - while loop: drive on taskId presence, not ec==2; stop on ec==0/5 - footer: remove --surface inference claim; CLI requires it explicitly * fix(guard): add test-fal-webui.ts to e2e scripts allowlist CI failed: guard flagged e2e/scripts/test-fal-webui.ts as an unapproved package-owned entrypoint. Add it to allowedE2eScripts. * fix(daemon): update prompt test expectations to match exit-0 handoff wording The two stale assertions checked for the old generate-exits-2 copy which no longer exists in the contract. Update them to match the current always-exits-0 wording. * fix(daemon): move skipDiscoveryBrief override before discovery block * chore(e2e): remove ad-hoc fal webui test script The script was a one-time developer helper used to manually validate fal image generation through the live UI. It relied on a real fal API key and hardcoded local port, so it cannot participate in the e2e package's fixture/reporting/CI conventions. Removing it per reviewer feedback. - Delete e2e/scripts/test-fal-webui.ts - Remove its guard.ts allowlist entry - Gitignore the file and its screenshots to prevent accidental re-addition * chore: remove accidental local scratch files from branch Remove bash.exe.stackdump (MSYS crash dump) and fix_loop.py (one-off local rewrite helper) — neither is a repo-owned source artifact. * fix(prompts): document fal-ai/* passthrough in non-media dispatch hint Prototype/deck agents now know arbitrary fal-ai/* model ids are valid --model values and should be forwarded as-is, mirroring the exception already present in media-contract.ts. Adds a prompt regression test. * fix(daemon): use renderMediaGenerationContract(mediaExecution) for media surfaces --------- Co-authored-by: mrcfps <mrc@powerformer.com>
667 lines
22 KiB
TypeScript
667 lines
22 KiB
TypeScript
/**
|
||
* Single source of truth for the media-generation model registry.
|
||
*
|
||
* Both the frontend (NewProjectPanel model pickers, Settings dialog
|
||
* provider list) and the daemon (od media generate dispatcher) consume
|
||
* this registry. When you add a model entry here, the picker shows it,
|
||
* the daemon can dispatch to it, and the Settings dialog knows which
|
||
* API keys are needed.
|
||
*
|
||
* The model catalogue mirrors the breadth of lobehub's model-bank:
|
||
* every image / video model that lobehub natively supports is listed
|
||
* here so the user can pick from the same surface area without us
|
||
* re-implementing every provider's transport. For provider integrations
|
||
* we only ship the two flagship paths today — OpenAI (gpt-image-*) and
|
||
* Volcengine Ark (Seedance 2.0) — the rest fall back to a placeholder
|
||
* with a clear "no provider integration yet" note. The contract the
|
||
* code agent follows is identical regardless.
|
||
*
|
||
* The daemon imports the JS mirror of this file at
|
||
* daemon/media-models.js (kept in sync by review).
|
||
*/
|
||
|
||
import type { AudioKind, MediaAspect } from '../types';
|
||
|
||
/**
|
||
* Provider identifier — used both as a grouping key in the picker and as
|
||
* the lookup key for API-credentials in `AppConfig.mediaProviders`. New
|
||
* providers must be added to {@link MEDIA_PROVIDERS} below.
|
||
*/
|
||
export type MediaProviderId =
|
||
| 'openai'
|
||
| 'volcengine'
|
||
| 'grok'
|
||
| 'hyperframes'
|
||
| 'nanobanana'
|
||
| 'imagerouter'
|
||
| 'custom-image'
|
||
| 'comfyui'
|
||
| 'bfl'
|
||
| 'fal'
|
||
| 'replicate'
|
||
| 'google'
|
||
| 'midjourney'
|
||
| 'kling'
|
||
| 'minimax'
|
||
| 'suno'
|
||
| 'udio'
|
||
| 'elevenlabs'
|
||
| 'fishaudio'
|
||
| 'senseaudio'
|
||
| 'tavily'
|
||
| 'leonardo'
|
||
| 'stub';
|
||
|
||
export interface MediaProvider {
|
||
id: MediaProviderId;
|
||
/** Display name shown in Settings + ModelPicker headers. */
|
||
label: string;
|
||
/** Short marketing-style sub-label. */
|
||
hint: string;
|
||
/** Whether the daemon ships a real integration for this provider. */
|
||
integrated: boolean;
|
||
/** Whether the provider needs user-supplied credentials. */
|
||
credentialsRequired?: boolean;
|
||
/** Whether the provider should appear in Settings -> Media. */
|
||
settingsVisible?: boolean;
|
||
/** Default base URL the daemon hits when no override is configured. */
|
||
defaultBaseUrl?: string;
|
||
/** Documentation URL for getting an API key. */
|
||
docsUrl?: string;
|
||
/** Whether Settings should expose a custom model override field. */
|
||
supportsCustomModel?: boolean;
|
||
/** Placeholder text for custom model override fields in Settings. */
|
||
customModelPlaceholder?: string;
|
||
}
|
||
|
||
/**
|
||
* Catalogue of providers. The Settings dialog renders one section per
|
||
* entry; the new-project model picker uses {@link integrated} to flag
|
||
* cards that will silently fall back to a stub if the user hasn't
|
||
* configured a key.
|
||
*/
|
||
export const MEDIA_PROVIDERS: MediaProvider[] = [
|
||
{
|
||
id: 'openai',
|
||
label: 'OpenAI',
|
||
hint: 'gpt-image-2 / dall-e-3',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.openai.com/v1',
|
||
docsUrl: 'https://platform.openai.com/api-keys',
|
||
},
|
||
{
|
||
id: 'volcengine',
|
||
label: 'Volcengine Ark (Doubao)',
|
||
hint: 'Seedance 2.0 / Seedream',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://ark.cn-beijing.volces.com/api/v3',
|
||
docsUrl: 'https://console.volcengine.com/ark',
|
||
},
|
||
{
|
||
id: 'grok',
|
||
label: 'xAI Grok Imagine',
|
||
hint: 'grok-imagine — image + video with native audio',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.x.ai/v1',
|
||
docsUrl: 'https://docs.x.ai/developers/model-capabilities/video/generation',
|
||
},
|
||
{
|
||
id: 'hyperframes',
|
||
label: 'HyperFrames',
|
||
hint: 'Local HTML -> MP4 renderer',
|
||
integrated: true,
|
||
credentialsRequired: false,
|
||
settingsVisible: false,
|
||
docsUrl: 'https://hyperframes.heygen.com',
|
||
},
|
||
{
|
||
id: 'nanobanana',
|
||
label: 'Nano Banana',
|
||
hint: 'Google official by default; custom gateway configurable',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://generativelanguage.googleapis.com',
|
||
docsUrl: 'https://ai.google.dev/gemini-api/docs/api-key',
|
||
supportsCustomModel: true,
|
||
},
|
||
{
|
||
id: 'imagerouter',
|
||
label: 'ImageRouter',
|
||
hint: 'OpenAI-compatible image + video routing',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.imagerouter.io/v1/openai',
|
||
docsUrl: 'https://docs.imagerouter.io/api-reference/image-generation/',
|
||
supportsCustomModel: true,
|
||
customModelPlaceholder: 'openai/gpt-image-2 or xAI/grok-imagine-video',
|
||
},
|
||
{
|
||
id: 'custom-image',
|
||
label: 'Custom Image API',
|
||
hint: 'OpenAI-compatible images/generations + images/edits (local or cloud)',
|
||
integrated: true,
|
||
docsUrl: 'https://platform.openai.com/docs/api-reference/images',
|
||
supportsCustomModel: true,
|
||
customModelPlaceholder: 'my-image-model',
|
||
},
|
||
{
|
||
id: 'comfyui',
|
||
label: 'ComfyUI',
|
||
hint: 'Local JSON workflow server (planned adapter)',
|
||
integrated: false,
|
||
defaultBaseUrl: 'http://127.0.0.1:8188',
|
||
docsUrl: 'https://docs.comfy.org/development/core-concepts/workflow',
|
||
},
|
||
{
|
||
id: 'bfl',
|
||
label: 'Black Forest Labs',
|
||
hint: 'FLUX 1.1 Pro / FLUX Pro / Dev',
|
||
integrated: false,
|
||
defaultBaseUrl: 'https://api.bfl.ai',
|
||
docsUrl: 'https://docs.bfl.ai/quick_start/create_account',
|
||
},
|
||
{
|
||
id: 'fal',
|
||
label: 'Fal.ai',
|
||
hint: 'FLUX / Sora / Veo / Wan / Ideogram / Recraft and any fal-ai/* model',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://fal.run',
|
||
docsUrl: 'https://fal.ai/dashboard/keys',
|
||
supportsCustomModel: true,
|
||
},
|
||
{
|
||
id: 'leonardo',
|
||
label: 'Leonardo.ai',
|
||
hint: 'Phoenix / Kino XL / FLUX',
|
||
integrated: true,
|
||
credentialsRequired: true,
|
||
settingsVisible: true,
|
||
defaultBaseUrl: 'https://cloud.leonardo.ai/api/rest/v1',
|
||
docsUrl: 'https://docs.leonardo.ai/docs/create-an-api-key',
|
||
},
|
||
{
|
||
id: 'replicate',
|
||
label: 'Replicate',
|
||
hint: 'FLUX / SDXL / Ideogram',
|
||
integrated: false,
|
||
defaultBaseUrl: 'https://api.replicate.com/v1',
|
||
docsUrl: 'https://replicate.com/account/api-tokens',
|
||
},
|
||
{
|
||
id: 'google',
|
||
label: 'Google AI / Vertex',
|
||
hint: 'Imagen 4 / Veo 3 / Lyria',
|
||
integrated: false,
|
||
docsUrl: 'https://ai.google.dev/gemini-api/docs/api-key',
|
||
},
|
||
{
|
||
id: 'kling',
|
||
label: 'Kuaishou Kling',
|
||
hint: 'Kling 1.6 / 2.0 video',
|
||
integrated: false,
|
||
docsUrl: 'https://klingai.com/dev-center',
|
||
},
|
||
{
|
||
id: 'midjourney',
|
||
label: 'Midjourney (proxy)',
|
||
hint: 'midjourney-v7',
|
||
integrated: false,
|
||
},
|
||
{
|
||
id: 'minimax',
|
||
label: 'MiniMax',
|
||
hint: 'TTS / video-01',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.minimaxi.chat/v1',
|
||
docsUrl: 'https://platform.minimaxi.com',
|
||
},
|
||
{
|
||
id: 'suno',
|
||
label: 'Suno',
|
||
hint: 'Music generation',
|
||
integrated: false,
|
||
},
|
||
{
|
||
id: 'udio',
|
||
label: 'Udio',
|
||
hint: 'Music generation',
|
||
integrated: false,
|
||
},
|
||
{
|
||
id: 'elevenlabs',
|
||
label: 'ElevenLabs',
|
||
hint: 'Voice / SFX',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.elevenlabs.io',
|
||
docsUrl: 'https://elevenlabs.io/app/settings/api-keys',
|
||
},
|
||
{
|
||
id: 'fishaudio',
|
||
label: 'FishAudio',
|
||
hint: 'Speech / voice clone',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.fish.audio',
|
||
docsUrl: 'https://fish.audio',
|
||
},
|
||
{
|
||
id: 'senseaudio',
|
||
label: 'SenseAudio',
|
||
hint: '',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.senseaudio.cn',
|
||
docsUrl: 'https://docs.senseaudio.cn',
|
||
},
|
||
{
|
||
id: 'tavily',
|
||
label: 'Tavily Search',
|
||
hint: 'Agent-callable web research',
|
||
integrated: true,
|
||
defaultBaseUrl: 'https://api.tavily.com',
|
||
docsUrl: 'https://app.tavily.com/home',
|
||
},
|
||
{
|
||
id: 'stub',
|
||
label: 'Stub (placeholder)',
|
||
hint: 'Deterministic local placeholder bytes',
|
||
integrated: true,
|
||
// Internal fixture provider used by the daemon for deterministic
|
||
// tests / offline demos. Hidden from Settings the same way
|
||
// HyperFrames is — end users have nothing to configure here, and
|
||
// exposing it pollutes the provider list.
|
||
settingsVisible: false,
|
||
},
|
||
];
|
||
|
||
export interface MediaModel {
|
||
/** Stable ID used in metadata.imageModel / videoModel / audioModel. */
|
||
id: string;
|
||
/** Short label shown in pickers. */
|
||
label: string;
|
||
/** Vendor / context hint shown under the label. */
|
||
hint: string;
|
||
/** Provider this model is dispatched through. */
|
||
provider: MediaProviderId;
|
||
/**
|
||
* Capabilities the agent may rely on when planning. Used downstream by
|
||
* the dispatcher to decide which provider call to make.
|
||
*/
|
||
caps?: string[];
|
||
/** Marks the default-checked card per surface in the picker. */
|
||
default?: boolean;
|
||
}
|
||
|
||
/**
|
||
* Image generation models. Mirrors the breadth of
|
||
* `packages/model-bank/src/aiModels/openai.ts` and friends in lobehub.
|
||
*/
|
||
export const IMAGE_MODELS: MediaModel[] = [
|
||
// OpenAI — fully integrated path.
|
||
{
|
||
id: 'gpt-image-2',
|
||
label: 'gpt-image-2',
|
||
hint: 'OpenAI · 4K, native multimodal',
|
||
provider: 'openai',
|
||
caps: ['t2i', 'i2i', 'inpaint'],
|
||
default: true,
|
||
},
|
||
{
|
||
id: 'gpt-image-1.5',
|
||
label: 'gpt-image-1.5',
|
||
hint: 'OpenAI · 4× faster than gpt-image-1',
|
||
provider: 'openai',
|
||
caps: ['t2i', 'i2i', 'inpaint'],
|
||
},
|
||
{
|
||
id: 'gpt-image-1',
|
||
label: 'gpt-image-1',
|
||
hint: 'OpenAI · ChatGPT native',
|
||
provider: 'openai',
|
||
caps: ['t2i', 'i2i', 'inpaint'],
|
||
},
|
||
{
|
||
id: 'gpt-image-1-mini',
|
||
label: 'gpt-image-1-mini',
|
||
hint: 'OpenAI · low-cost variant',
|
||
provider: 'openai',
|
||
caps: ['t2i', 'i2i'],
|
||
},
|
||
{
|
||
id: 'dall-e-3',
|
||
label: 'dall-e-3',
|
||
hint: 'OpenAI · classic',
|
||
provider: 'openai',
|
||
caps: ['t2i'],
|
||
},
|
||
{
|
||
id: 'dall-e-2',
|
||
label: 'dall-e-2',
|
||
hint: 'OpenAI · legacy',
|
||
provider: 'openai',
|
||
caps: ['t2i'],
|
||
},
|
||
|
||
// Volcengine — Doubao Seedream image generation.
|
||
{
|
||
id: 'doubao-seedream-3-0-t2i-250415',
|
||
label: 'seedream-3.0',
|
||
hint: 'ByteDance · Doubao image',
|
||
provider: 'volcengine',
|
||
caps: ['t2i'],
|
||
},
|
||
{
|
||
id: 'doubao-seededit-3-0-i2i-250628',
|
||
label: 'seededit-3.0',
|
||
hint: 'ByteDance · image edit',
|
||
provider: 'volcengine',
|
||
caps: ['i2i'],
|
||
},
|
||
|
||
// SenseAudio — synchronous /v1/image/sync, Bearer auth, reference URL or data URI.
|
||
{
|
||
id: 'senseaudio-image-2.0-260319',
|
||
label: 'senseaudio-image-2.0',
|
||
hint: 'SenseAudio · multi-aspect, latest',
|
||
provider: 'senseaudio',
|
||
caps: ['t2i', 'i2i'],
|
||
},
|
||
{
|
||
id: 'senseaudio-image-1.0-260319',
|
||
label: 'senseaudio-image-1.0',
|
||
hint: 'SenseAudio · standard',
|
||
provider: 'senseaudio',
|
||
caps: ['t2i', 'i2i'],
|
||
},
|
||
{
|
||
id: 'doubao-seedream-5-0-260128',
|
||
label: 'seedream-5.0',
|
||
hint: 'SenseAudio · ByteDance Seedream 5.0 hi-res',
|
||
provider: 'senseaudio',
|
||
caps: ['t2i', 'i2i'],
|
||
},
|
||
|
||
// xAI Grok Imagine — text-to-image (1k/2k, 11+ aspect ratios).
|
||
{
|
||
id: 'grok-imagine-image',
|
||
label: 'grok-imagine-image',
|
||
hint: 'xAI · 2K text-to-image',
|
||
provider: 'grok',
|
||
caps: ['t2i'],
|
||
},
|
||
|
||
// Nano Banana — Google-compatible generateContent image path.
|
||
{
|
||
id: 'gemini-3.1-flash-image-preview',
|
||
label: 'nano-banana-2',
|
||
hint: 'Nano Banana · text-to-image',
|
||
provider: 'nanobanana',
|
||
caps: ['t2i'],
|
||
},
|
||
|
||
// ImageRouter — OpenAI-compatible routed image models.
|
||
{
|
||
id: 'openai/gpt-image-2',
|
||
label: 'openai/gpt-image-2',
|
||
hint: 'ImageRouter · routed GPT Image',
|
||
provider: 'imagerouter',
|
||
caps: ['t2i'],
|
||
},
|
||
{
|
||
id: 'openai/gpt-image-1.5',
|
||
label: 'openai/gpt-image-1.5',
|
||
hint: 'ImageRouter · routed GPT Image',
|
||
provider: 'imagerouter',
|
||
caps: ['t2i'],
|
||
},
|
||
{
|
||
id: 'black-forest-labs/FLUX-1.1-pro',
|
||
label: 'FLUX-1.1-pro',
|
||
hint: 'ImageRouter · Black Forest Labs',
|
||
provider: 'imagerouter',
|
||
caps: ['t2i'],
|
||
},
|
||
|
||
// Custom OpenAI-compatible image generation + edit endpoints.
|
||
{
|
||
id: 'custom-image',
|
||
label: 'custom-image',
|
||
hint: 'Custom · OpenAI-compatible endpoint',
|
||
provider: 'custom-image',
|
||
caps: ['t2i', 'i2i'],
|
||
},
|
||
|
||
// Black Forest Labs FLUX family.
|
||
{ id: 'flux-1.1-pro', label: 'flux-1.1-pro', hint: 'BFL · flagship', provider: 'bfl', caps: ['t2i', 'i2i'] },
|
||
{ id: 'flux-pro', label: 'flux-pro', hint: 'BFL', provider: 'bfl', caps: ['t2i'] },
|
||
{ id: 'flux-dev', label: 'flux-dev', hint: 'BFL · open weights', provider: 'bfl', caps: ['t2i'] },
|
||
{ id: 'flux-schnell', label: 'flux-schnell', hint: 'BFL · fast', provider: 'bfl', caps: ['t2i'] },
|
||
{ id: 'flux-kontext-pro', label: 'flux-kontext-pro', hint: 'BFL · in-context edits', provider: 'bfl', caps: ['t2i', 'i2i'] },
|
||
|
||
// Google.
|
||
{ id: 'imagen-4', label: 'imagen-4', hint: 'Google · latest', provider: 'google', caps: ['t2i'] },
|
||
{ id: 'imagen-3', label: 'imagen-3', hint: 'Google', provider: 'google', caps: ['t2i'] },
|
||
{ id: 'gemini-3-pro-image-preview', label: 'gemini-3-pro-image', hint: 'Google · Nano Banana Pro', provider: 'google', caps: ['t2i', 'i2i'] },
|
||
|
||
// Replicate hosted image models.
|
||
{ id: 'ideogram-v2', label: 'ideogram-v2', hint: 'Replicate · typography', provider: 'replicate', caps: ['t2i'] },
|
||
{ id: 'sdxl', label: 'stable-diffusion-xl', hint: 'Replicate · SDXL', provider: 'replicate', caps: ['t2i'] },
|
||
|
||
// Fal.ai image models — pass any fal-ai/* path as model for custom models.
|
||
{ id: 'flux-pro-ultra', label: 'flux-pro-ultra', hint: 'Fal · FLUX 1.1 Pro Ultra · highest quality', provider: 'fal', caps: ['t2i'] },
|
||
{ id: 'flux-dev-fal', label: 'flux-dev (fal)', hint: 'Fal · FLUX Dev · open weights', provider: 'fal', caps: ['t2i'] },
|
||
{ id: 'flux-schnell-fal', label: 'flux-schnell (fal)', hint: 'Fal · FLUX Schnell · fastest / cheapest', provider: 'fal', caps: ['t2i'] },
|
||
{ id: 'ideogram-v3-fal', label: 'ideogram-v3', hint: 'Fal · Ideogram v3 · typography + design', provider: 'fal', caps: ['t2i'] },
|
||
{ id: 'recraft-v3-fal', label: 'recraft-v3', hint: 'Fal · Recraft v3 · vector + illustration', provider: 'fal', caps: ['t2i'] },
|
||
{ id: 'sd-3.5', label: 'stable-diffusion-3.5', hint: 'Fal · SD 3.5', provider: 'fal', caps: ['t2i'] },
|
||
|
||
// Leonardo.ai models
|
||
{ id: 'leonardo-phoenix', label: 'Phoenix', hint: 'Leonardo · versatile', provider: 'leonardo', caps: ['t2i'] },
|
||
{ id: 'leonardo-kino-xl', label: 'Kino XL', hint: 'Leonardo · cinematic', provider: 'leonardo', caps: ['t2i'] },
|
||
{ id: 'leonardo-flux-dev', label: 'FLUX Dev', hint: 'Leonardo · FLUX', provider: 'leonardo', caps: ['t2i'] },
|
||
{ id: 'leonardo-flux-schnell', label: 'FLUX Schnell', hint: 'Leonardo · fast', provider: 'leonardo', caps: ['t2i'] },
|
||
{ id: 'leonardo-anime-pastel', label: 'Anime Pastel Dream', hint: 'Leonardo · anime', provider: 'leonardo', caps: ['t2i'] },
|
||
|
||
// Midjourney via community proxies.
|
||
{ id: 'midjourney-v7', label: 'midjourney-v7', hint: 'Midjourney · via proxy', provider: 'midjourney', caps: ['t2i'] },
|
||
];
|
||
|
||
/**
|
||
* Video generation models. Mirrors lobehub's volcengine.ts (Seedance,
|
||
* Seedance Lite), kling.ts and friends.
|
||
*/
|
||
export const VIDEO_MODELS: MediaModel[] = [
|
||
// Volcengine — Seedance 2.0 (integrated).
|
||
{
|
||
id: 'doubao-seedance-2-0-260128',
|
||
label: 'seedance-2.0',
|
||
hint: 'ByteDance · t2v + i2v + audio',
|
||
provider: 'volcengine',
|
||
caps: ['t2v', 'i2v', 'audio'],
|
||
default: true,
|
||
},
|
||
{
|
||
id: 'doubao-seedance-2-0-fast-260128',
|
||
label: 'seedance-2.0-fast',
|
||
hint: 'ByteDance · faster, cheaper',
|
||
provider: 'volcengine',
|
||
caps: ['t2v', 'i2v', 'audio'],
|
||
},
|
||
{
|
||
id: 'doubao-seedance-1-0-pro-250528',
|
||
label: 'seedance-1.0-pro',
|
||
hint: 'ByteDance · 1.0',
|
||
provider: 'volcengine',
|
||
caps: ['t2v', 'i2v'],
|
||
},
|
||
{
|
||
id: 'doubao-seedance-1-0-lite-i2v-250428',
|
||
label: 'seedance-1.0-lite-i2v',
|
||
hint: 'ByteDance · image-to-video',
|
||
provider: 'volcengine',
|
||
caps: ['i2v'],
|
||
},
|
||
{
|
||
id: 'doubao-seedance-1-0-lite-t2v-250428',
|
||
label: 'seedance-1.0-lite-t2v',
|
||
hint: 'ByteDance · text-to-video',
|
||
provider: 'volcengine',
|
||
caps: ['t2v'],
|
||
},
|
||
|
||
// xAI Grok Imagine — 720p t2v + i2v with natively generated audio.
|
||
{
|
||
id: 'grok-imagine-video',
|
||
label: 'grok-imagine-video',
|
||
hint: 'xAI · 720p t2v + i2v + native audio',
|
||
provider: 'grok',
|
||
caps: ['t2v', 'i2v', 'audio'],
|
||
},
|
||
|
||
// ImageRouter — routed video models.
|
||
{
|
||
id: 'xAI/grok-imagine-video',
|
||
label: 'xAI/grok-imagine-video',
|
||
hint: 'ImageRouter · routed video',
|
||
provider: 'imagerouter',
|
||
caps: ['t2v', 'audio'],
|
||
},
|
||
{
|
||
id: 'bytedance/seedance-1.5-pro',
|
||
label: 'seedance-1.5-pro',
|
||
hint: 'ImageRouter · Bytedance',
|
||
provider: 'imagerouter',
|
||
caps: ['t2v'],
|
||
},
|
||
{
|
||
id: 'google/veo-3.1-lite',
|
||
label: 'veo-3.1-lite',
|
||
hint: 'ImageRouter · Google',
|
||
provider: 'imagerouter',
|
||
caps: ['t2v'],
|
||
},
|
||
|
||
// Kuaishou Kling.
|
||
{ id: 'kling-2.0', label: 'kling-2.0', hint: 'Kuaishou · latest', provider: 'kling', caps: ['t2v', 'i2v'] },
|
||
{ id: 'kling-1.6', label: 'kling-1.6', hint: 'Kuaishou', provider: 'kling', caps: ['t2v', 'i2v'] },
|
||
{ id: 'kling-1.5', label: 'kling-1.5', hint: 'Kuaishou', provider: 'kling', caps: ['t2v', 'i2v'] },
|
||
|
||
// Google Veo.
|
||
{ id: 'veo-3', label: 'veo-3', hint: 'Google · sound-on', provider: 'google', caps: ['t2v', 'audio'] },
|
||
{ id: 'veo-2', label: 'veo-2', hint: 'Google', provider: 'google', caps: ['t2v'] },
|
||
|
||
// Fal.ai video models — pass any fal-ai/* path as model for custom models.
|
||
{ id: 'veo-3-fal', label: 'veo-3 (fal)', hint: 'Fal · Google Veo 3 · sound-on', provider: 'fal', caps: ['t2v', 'audio'] },
|
||
{ id: 'veo-2-fal', label: 'veo-2 (fal)', hint: 'Fal · Google Veo 2', provider: 'fal', caps: ['t2v'] },
|
||
{ id: 'wan-2.1-t2v', label: 'wan-2.1-t2v', hint: 'Fal · Wan 2.1 text-to-video', provider: 'fal', caps: ['t2v'] },
|
||
{ id: 'wan-2.1-i2v', label: 'wan-2.1-i2v', hint: 'Fal · Wan 2.1 image-to-video', provider: 'fal', caps: ['i2v'] },
|
||
{ id: 'seedance-1-pro-fal', label: 'seedance-1-pro (fal)', hint: 'Fal · Seedance 1 Pro', provider: 'fal', caps: ['t2v', 'i2v'] },
|
||
{ id: 'kling-2.1-t2v-fal', label: 'kling-2.1 (fal)', hint: 'Fal · Kling 2.1 Pro text-to-video', provider: 'fal', caps: ['t2v'] },
|
||
{ id: 'sora-2', label: 'sora-2', hint: 'Fal · OpenAI Sora 2', provider: 'fal', caps: ['t2v'] },
|
||
{ id: 'sora-2-pro', label: 'sora-2-pro', hint: 'Fal · OpenAI Sora 2 Pro', provider: 'fal', caps: ['t2v'] },
|
||
|
||
// MiniMax video.
|
||
{ id: 'minimax-video-01', label: 'video-01', hint: 'MiniMax · Hailuo', provider: 'minimax', caps: ['t2v', 'i2v'] },
|
||
{ id: 'hyperframes-html', label: 'hyperframes-html', hint: 'HyperFrames · local HTML renderer', provider: 'hyperframes', caps: ['t2v'] },
|
||
];
|
||
|
||
export const AUDIO_MODELS_BY_KIND: Record<AudioKind, MediaModel[]> = {
|
||
music: [
|
||
{ id: 'suno-v5', label: 'suno-v5', hint: 'Suno · default', provider: 'suno', caps: ['music'], default: true },
|
||
{ id: 'suno-v4-5', label: 'suno-v4.5', hint: 'Suno', provider: 'suno', caps: ['music'] },
|
||
{ id: 'udio-v2', label: 'udio-v2', hint: 'Udio', provider: 'udio', caps: ['music'] },
|
||
{ id: 'lyria-2', label: 'lyria-2', hint: 'Google', provider: 'google', caps: ['music'] },
|
||
],
|
||
speech: [
|
||
{ id: 'minimax-tts', label: 'minimax-tts', hint: 'MiniMax', provider: 'minimax', caps: ['tts'], default: true },
|
||
{ id: 'fish-speech-2', label: 'fish-speech-2', hint: 'FishAudio', provider: 'fishaudio', caps: ['tts', 'voice-clone'] },
|
||
{ id: 'elevenlabs-v3', label: 'elevenlabs-v3', hint: 'ElevenLabs', provider: 'elevenlabs', caps: ['tts', 'voice-clone'] },
|
||
{ id: 'senseaudio-tts', label: 'senseaudio-tts', hint: 'SenseAudio', provider: 'senseaudio', caps: ['tts', 'voice-clone'] },
|
||
{ id: 'doubao-tts', label: 'doubao-tts', hint: 'Volcengine', provider: 'volcengine', caps: ['tts'] },
|
||
{ id: 'gpt-4o-mini-tts', label: 'gpt-4o-mini-tts', hint: 'OpenAI', provider: 'openai', caps: ['tts'] },
|
||
],
|
||
sfx: [
|
||
{ id: 'elevenlabs-sfx', label: 'elevenlabs-sfx', hint: 'ElevenLabs SFX', provider: 'elevenlabs', caps: ['sfx'], default: true },
|
||
{ id: 'audiocraft', label: 'audiocraft', hint: 'Meta · open', provider: 'replicate', caps: ['sfx', 'music'] },
|
||
],
|
||
};
|
||
|
||
export const MEDIA_ASPECTS: MediaAspect[] = ['1:1', '16:9', '9:16', '4:3', '3:4'];
|
||
|
||
export const VIDEO_LENGTHS_SEC: number[] = [3, 5, 8, 10, 15, 30];
|
||
export const AUDIO_DURATIONS_SEC: number[] = [5, 10, 15, 30, 60, 120];
|
||
|
||
export const DEFAULT_IMAGE_MODEL =
|
||
IMAGE_MODELS.find((m) => m.default)?.id ?? IMAGE_MODELS[0]!.id;
|
||
export const DEFAULT_VIDEO_MODEL =
|
||
VIDEO_MODELS.find((m) => m.default)?.id ?? VIDEO_MODELS[0]!.id;
|
||
export const DEFAULT_AUDIO_MODEL: Record<AudioKind, string> = {
|
||
music:
|
||
AUDIO_MODELS_BY_KIND.music.find((m) => m.default)?.id
|
||
?? AUDIO_MODELS_BY_KIND.music[0]!.id,
|
||
speech:
|
||
AUDIO_MODELS_BY_KIND.speech.find((m) => m.default)?.id
|
||
?? AUDIO_MODELS_BY_KIND.speech[0]!.id,
|
||
sfx:
|
||
AUDIO_MODELS_BY_KIND.sfx.find((m) => m.default)?.id
|
||
?? AUDIO_MODELS_BY_KIND.sfx[0]!.id,
|
||
};
|
||
|
||
/**
|
||
* Look up a model record across all surfaces by ID. Returns null if the
|
||
* agent passes an unknown model — the dispatcher rejects with a clear
|
||
* error so the agent re-plans instead of silently falling back.
|
||
*/
|
||
export function findMediaModel(id: string): MediaModel | null {
|
||
const all: MediaModel[] = [
|
||
...IMAGE_MODELS,
|
||
...VIDEO_MODELS,
|
||
...AUDIO_MODELS_BY_KIND.music,
|
||
...AUDIO_MODELS_BY_KIND.speech,
|
||
...AUDIO_MODELS_BY_KIND.sfx,
|
||
];
|
||
return all.find((m) => m.id === id) ?? null;
|
||
}
|
||
|
||
export function findProvider(id: MediaProviderId): MediaProvider | null {
|
||
return MEDIA_PROVIDERS.find((p) => p.id === id) ?? null;
|
||
}
|
||
|
||
/** All model IDs grouped by surface, used for prompt-side disclosure. */
|
||
export function modelIdsBySurface(): {
|
||
image: string[];
|
||
video: string[];
|
||
audio: { music: string[]; speech: string[]; sfx: string[] };
|
||
} {
|
||
return {
|
||
image: IMAGE_MODELS.map((m) => m.id),
|
||
video: VIDEO_MODELS.map((m) => m.id),
|
||
audio: {
|
||
music: AUDIO_MODELS_BY_KIND.music.map((m) => m.id),
|
||
speech: AUDIO_MODELS_BY_KIND.speech.map((m) => m.id),
|
||
sfx: AUDIO_MODELS_BY_KIND.sfx.map((m) => m.id),
|
||
},
|
||
};
|
||
}
|
||
|
||
/**
|
||
* Group a flat list of {@link MediaModel} by provider while preserving
|
||
* the catalogue order. Used by the picker to render section headers.
|
||
*/
|
||
export function groupByProvider(models: MediaModel[]): Array<{
|
||
provider: MediaProvider;
|
||
models: MediaModel[];
|
||
}> {
|
||
const order: MediaProviderId[] = [];
|
||
const map = new Map<MediaProviderId, MediaModel[]>();
|
||
for (const m of models) {
|
||
if (!map.has(m.provider)) {
|
||
order.push(m.provider);
|
||
map.set(m.provider, []);
|
||
}
|
||
map.get(m.provider)!.push(m);
|
||
}
|
||
return order
|
||
.map((id) => {
|
||
const provider = findProvider(id);
|
||
const list = map.get(id) ?? [];
|
||
return provider ? { provider, models: list } : null;
|
||
})
|
||
.filter((entry): entry is { provider: MediaProvider; models: MediaModel[] } => entry != null);
|
||
}
|