mirror of
https://github.com/nexu-io/open-design.git
synced 2026-06-01 03:14:35 +07:00
feat(daemon): add language boost support for Minimax TTS (#773)
* feat(daemon): add language boost support for Minimax TTS

  Add --language CLI flag to support the language boost parameter for Minimax TTS. This enables better pronunciation for specific languages like Cantonese (Yue).

* docs(media): add --language flag to media generation contract

  Document the language boost parameter for Minimax TTS, enabling better pronunciation for specific languages like Cantonese (Yue).

* fix(media): correct Cantonese language_boost value and add input validation

  - Use correct MiniMax value 'Chinese,Yue' for Cantonese (no space)
  - Add type guard in server.ts to reject non-string language values
  - Trim language string before sending to MiniMax API

---------

Co-authored-by: root <root@DELLN40.asiacredit.org>
This commit is contained in:
parent
9674f48f2f
commit
e52720aa12
5 changed files with 22 additions and 8 deletions
|
|
@ -44,6 +44,7 @@ const MEDIA_GENERATE_STRING_FLAGS = new Set([
|
|||
'composition-dir',
|
||||
'image',
|
||||
'daemon-url',
|
||||
'language',
|
||||
]);
|
||||
const MEDIA_GENERATE_BOOLEAN_FLAGS = new Set([
|
||||
'help',
|
||||
|
|
@ -333,6 +334,7 @@ async function runMediaGenerate(rawArgs) {
|
|||
audioKind: flags['audio-kind'],
|
||||
compositionDir: flags['composition-dir'],
|
||||
image: flags.image,
|
||||
language: flags.language,
|
||||
};
|
||||
if (flags.length != null) body.length = Number(flags.length);
|
||||
if (flags.duration != null) body.duration = Number(flags.duration);
|
||||
|
|
@ -567,6 +569,7 @@ Common options:
|
|||
--length <seconds> Video length.
|
||||
--duration <seconds> Audio duration.
|
||||
--voice <voice-id> Speech / TTS voice.
|
||||
--language <lang> Language boost for TTS (e.g. Chinese,Yue for Cantonese).
|
||||
--audio-kind music|speech|sfx
|
||||
--composition-dir <path> hyperframes-html only — project-relative path
|
||||
to the dir containing hyperframes.json /
|
||||
|
|
|
|||
|
|
@ -211,6 +211,7 @@ function clampWithWarning(value, allowed, flagName) {
|
|||
* @param {number} [args.duration]
|
||||
* @param {string} [args.voice]
|
||||
* @param {string} [args.audioKind]
|
||||
* @param {string} [args.language]
|
||||
* @returns {Promise<{ name: string, size: number, mtime: number, kind: string, mime: string, model: string, surface: string, providerNote: string, providerId: string }>}
|
||||
*/
|
||||
export async function generateMedia(args) {
|
||||
|
|
@ -227,6 +228,7 @@ export async function generateMedia(args) {
|
|||
duration,
|
||||
voice,
|
||||
audioKind,
|
||||
language,
|
||||
compositionDir,
|
||||
image,
|
||||
} = args;
|
||||
|
|
@ -310,6 +312,7 @@ export async function generateMedia(args) {
|
|||
duration: clampedDuration,
|
||||
voice: voice || '',
|
||||
audioKind: resolvedAudioKind,
|
||||
language: language || '',
|
||||
// Project-relative path to the directory the agent scaffolded with
|
||||
// hyperframes.json / meta.json / index.html. Only consumed by the
|
||||
// hyperframes renderer; null/empty for every other provider.
|
||||
|
|
@ -1361,10 +1364,13 @@ async function renderMinimaxTTS(ctx, credentials) {
|
|||
// platform.minimaxi.com under voice management.
|
||||
const voiceId = (ctx.voice && ctx.voice.trim()) || 'male-qn-qingse';
|
||||
|
||||
const languageBoost = typeof ctx.language === 'string' ? ctx.language.trim() : '';
|
||||
|
||||
const body = {
|
||||
model: wireModel,
|
||||
text,
|
||||
stream: false,
|
||||
...(languageBoost ? { language_boost: languageBoost } : {}),
|
||||
voice_setting: {
|
||||
voice_id: voiceId,
|
||||
speed: 1.0,
|
||||
|
|
|
|||
|
|
@ -87,6 +87,7 @@ Run via your shell tool (Bash on Claude Code, exec on Codex/Gemini, etc.):
|
|||
[--duration <seconds>] # audio only
|
||||
[--audio-kind music|speech|sfx] # audio only
|
||||
[--voice <provider-voice-id>] # audio:speech only; omit to use provider default
|
||||
[--language <lang>] # audio:speech only; language boost (e.g. Chinese,Yue for Cantonese)
|
||||
\`\`\`
|
||||
|
||||
Always quote the prompt value. Use \`--prompt "<full prompt>"\` (or the
|
||||
|
|
@ -255,13 +256,15 @@ substitution. Do not silently fall back.
|
|||
### Workflow rules
|
||||
|
||||
1. **Read project metadata first.** The "Project metadata" block above
|
||||
tells you the user's pre-selected model, aspect, length, voice, audio
|
||||
kind, etc. Treat those as authoritative defaults — only override if
|
||||
the user's chat message explicitly contradicts them.
|
||||
For \`minimax-tts\`, \`voice\` must be a valid MiniMax \`voice_id\`
|
||||
(example: \`male-qn-qingse\`). Do not pass natural-language voice
|
||||
descriptions like "warm Mandarin narrator" as \`--voice\`; omit the
|
||||
flag instead unless you have a real id.
|
||||
tells you the user's pre-selected model, aspect, length, voice, audio
|
||||
kind, etc. Treat those as authoritative defaults — only override if
|
||||
the user's chat message explicitly contradicts them.
|
||||
For \`minimax-tts\`, \`voice\` must be a valid MiniMax \`voice_id\`
|
||||
(example: \`male-qn-qingse\`). Do not pass natural-language voice
|
||||
descriptions like "warm Mandarin narrator" as \`--voice\`; omit the
|
||||
flag instead unless you have a real id.
|
||||
\`language\` enables pronunciation boost for specific languages
|
||||
(e.g. \`Chinese,Yue\` for Cantonese, \`Chinese\` for Mandarin).
|
||||
2. **One discovery turn before generating.** Even with metadata defaults
|
||||
present, restate what you're about to make and ask one targeted
|
||||
question if anything is ambiguous (subject, mood, brand, voice). The
|
||||
|
|
|
|||
|
|
@ -3825,6 +3825,7 @@ export async function startServer({ port = 7456, host = process.env.OD_BIND_HOST
|
|||
: undefined,
|
||||
voice: req.body?.voice,
|
||||
audioKind: req.body?.audioKind,
|
||||
language: typeof req.body?.language === 'string' ? req.body.language : undefined,
|
||||
compositionDir: req.body?.compositionDir,
|
||||
image: req.body?.image,
|
||||
onProgress: (line) => appendTaskProgress(task, line),
|
||||
|
|
|
|||
|
|
@ -30,7 +30,8 @@ Run media generation through the dispatcher:
|
|||
[--length <seconds>] \\
|
||||
[--duration <seconds>] \\
|
||||
[--audio-kind music|speech|sfx] \\
|
||||
[--voice <provider-voice-id>]
|
||||
[--voice <provider-voice-id>] \\
|
||||
[--language <lang>]
|
||||
\`\`\`
|
||||
|
||||
Always quote the prompt value. Never splice unquoted user text into the
|
||||
|
|
|
|||
Loading…
Reference in a new issue