fix: enable SenseAudio BYOK TTS (#2570)

* fix: enable SenseAudio BYOK TTS

* fix: handle BYOK SenseAudio TTS failures

* fix: harden SenseAudio BYOK TTS responses

* fix: preserve BYOK speech tool failure kind

* fix: handle versioned SenseAudio TTS base URLs
This commit is contained in:
Fl0rencess 2026-05-22 14:04:29 +08:00 committed by GitHub
parent e8b5dd8aaf
commit 5b53c44e13
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 415 additions and 11 deletions

View file

@ -9,10 +9,8 @@
// back as a `role: 'tool'` message → re-issue the completion. The chat surface
// stays the same; the tool dispatch happens entirely daemon-side.
//
// Today we ship one tool — `generate_image` — backed by SenseAudio's
// /v1/image/sync endpoint, since the BYOK chat session already authenticates
// against SenseAudio with the same API key. Additional tools (TTS, video,
// research) can be added here as the BYOK surface expands.
// Today we ship image, video, and speech tools backed by SenseAudio endpoints,
// since the BYOK chat session already authenticates with the same API key.
import path from 'node:path';
import { writeFile } from 'node:fs/promises';
@ -43,6 +41,18 @@ export function isSenseAudioImageModel(value: unknown): value is string {
const SENSEAUDIO_DEFAULT_BASE_URL = 'https://api.senseaudio.cn';
const PROMPT_MAX_LENGTH = 2000;
const SENSEAUDIO_TTS_MODEL = 'senseaudio-tts-1.5-260319';
const SENSEAUDIO_DEFAULT_VOICE_ID = 'female_0033_b';
const HEX_AUDIO_PATTERN = /^[0-9a-fA-F]+$/;
/**
 * Build the absolute URL of a SenseAudio API endpoint from a BYOK base URL.
 *
 * Trailing slashes on the base URL's path are dropped. If the remaining path
 * already contains a version segment (`/v1`, `/v2/...`), the endpoint is
 * appended directly; otherwise `/v1` is inserted in front of it. Query string
 * and fragment of the base URL are left untouched.
 *
 * @param baseUrl Absolute base URL. `new URL` throws a `TypeError` when it
 *   cannot be parsed.
 * @param path Endpoint path such as `/t2a_v2`. A missing leading slash is
 *   tolerated so the endpoint is never glued onto the previous segment.
 * @returns The full endpoint URL as a string.
 */
function appendSenseAudioApiPath(baseUrl: string, path: string): string {
  const url = new URL(baseUrl);
  const basePath = url.pathname.replace(/\/+$/, '');
  // Without this, 't2a_v2' would produce '/v1t2a_v2'. An empty path is kept
  // as-is so the result stays the bare (versioned) base path.
  const endpoint = path === '' || path.startsWith('/') ? path : `/${path}`;
  const alreadyVersioned = /\/v\d+(\/|$)/.test(basePath);
  url.pathname = alreadyVersioned
    ? `${basePath}${endpoint}`
    : `${basePath}/v1${endpoint}`;
  return url.toString();
}
// SenseAudio video — the API only documents one model today, so the
// wire id is a const. The chat tool's `generate_video` param surface
@ -122,6 +132,30 @@ export const BYOK_SENSEAUDIO_TOOLS = [
},
},
},
{
type: 'function' as const,
function: {
name: 'generate_speech',
description:
'Generate a text-to-speech voiceover using SenseAudio TTS. Returns a URL pointing to the rendered MP3. Use this whenever the user asks for narration, voiceover, speech, TTS, or spoken audio. After this tool succeeds, reply with a clickable markdown link to the MP3.',
parameters: {
type: 'object',
properties: {
text: {
type: 'string',
description:
'Exact script to speak. Include only the words that should be spoken, not production notes.',
},
voice_id: {
type: 'string',
description:
`Optional SenseAudio voice id. Defaults to ${SENSEAUDIO_DEFAULT_VOICE_ID}.`,
},
},
required: ['text'],
},
},
},
{
type: 'function' as const,
function: {
@ -217,6 +251,102 @@ export interface ImageToolResult {
error?: string;
}
/**
 * Execute the `generate_speech` BYOK tool: synthesize `args.text` through the
 * SenseAudio `t2a_v2` endpoint and store the resulting MP3 in the project
 * directory.
 *
 * Every failure is reported as `{ ok: false, error }` rather than thrown, so
 * the caller can always feed a tool result back to the model.
 *
 * @param args Tool-call arguments. `text` must be a non-blank string;
 *   `voice_id` is optional and falls back to `SENSEAUDIO_DEFAULT_VOICE_ID`
 *   when missing, blank, or not a string.
 * @param ctx Tool context supplying the project location (`projectsRoot`,
 *   `projectId`) and the upstream credentials (`upstreamApiKey`,
 *   `upstreamBaseUrl`).
 * @returns `{ ok: true, url }` with the daemon file URL of the saved MP3, or
 *   `{ ok: false, error }` describing what went wrong.
 */
export async function executeGenerateSpeech(
  args: { text?: unknown; voice_id?: unknown },
  ctx: BYOKToolContext,
): Promise<ImageToolResult> {
  const text = typeof args.text === 'string' ? args.text.trim() : '';
  if (!text) return { ok: false, error: 'text is required' };

  // Resolve the storage directory first so a bad project id fails before any
  // upstream request is made.
  let dir: string;
  try {
    dir = await ensureProject(ctx.projectsRoot, ctx.projectId);
  } catch (err) {
    return {
      ok: false,
      error: `invalid projectId for speech storage: ${err instanceof Error ? err.message : String(err)}`,
    };
  }

  const apiKey = ctx.upstreamApiKey;
  if (!apiKey) return { ok: false, error: 'no SenseAudio API key available' };

  const voiceId =
    typeof args.voice_id === 'string' && args.voice_id.trim()
      ? args.voice_id.trim()
      : SENSEAUDIO_DEFAULT_VOICE_ID;
  // `||` (not `??`) so an empty-string base URL also falls back to the default.
  const baseUrl = ctx.upstreamBaseUrl || SENSEAUDIO_DEFAULT_BASE_URL;

  let data: {
    data?: { audio?: string };
    base_resp?: { status_code?: number; status_msg?: string };
  };
  try {
    // URL construction stays inside the try so an unparseable base URL is
    // reported as a tool failure too.
    const resp = await fetch(appendSenseAudioApiPath(baseUrl, '/t2a_v2'), {
      method: 'POST',
      // Redirects are rejected outright — presumably so the bearer token is
      // never replayed against a redirect target (NOTE(review): confirm intent).
      redirect: 'error',
      headers: {
        authorization: `Bearer ${apiKey}`,
        'content-type': 'application/json',
      },
      body: JSON.stringify({
        model: SENSEAUDIO_TTS_MODEL,
        text,
        stream: false,
        voice_setting: {
          voice_id: voiceId,
          speed: 1,
          vol: 1,
          pitch: 0,
        },
        audio_setting: {
          format: 'mp3',
          sample_rate: 32000,
          bitrate: 128000,
          channel: 2,
        },
      }),
    });
    // Read as text first so error bodies and non-JSON payloads can be quoted
    // (truncated) in the failure message.
    const respText = await resp.text();
    if (!resp.ok) {
      return { ok: false, error: `senseaudio speech ${resp.status}: ${respText.slice(0, 240)}` };
    }
    try {
      data = JSON.parse(respText) as typeof data;
    } catch {
      return { ok: false, error: `senseaudio speech non-JSON: ${respText.slice(0, 200)}` };
    }
  } catch (err) {
    return {
      ok: false,
      error: err instanceof Error ? err.message : String(err),
    };
  }

  // A 200 response can still carry an application-level error in base_resp.
  if (data?.base_resp && data.base_resp.status_code !== 0) {
    return {
      ok: false,
      error: `senseaudio speech api error ${data.base_resp.status_code}: ${data.base_resp.status_msg || 'unknown'}`,
    };
  }

  const hex = data?.data?.audio;
  if (typeof hex !== 'string' || !hex) {
    return { ok: false, error: 'senseaudio speech response missing data.audio' };
  }
  // Validate before decoding: Buffer.from(..., 'hex') silently truncates at
  // the first invalid character instead of failing.
  if (hex.length % 2 !== 0 || !HEX_AUDIO_PATTERN.test(hex)) {
    return { ok: false, error: 'senseaudio speech response contained invalid hex audio' };
  }
  const bytes = Buffer.from(hex, 'hex');
  if (bytes.length === 0) return { ok: false, error: 'senseaudio speech decoded zero bytes' };

  const id = `${Date.now().toString(36)}-${randomBytes(4).toString('hex')}`;
  const filename = `byok-speech-${id}.mp3`;
  // Disk failures (permissions, full disk, removed directory) are reported
  // like every other failure instead of escaping as a rejection.
  try {
    await writeFile(path.join(dir, filename), bytes);
  } catch (err) {
    return {
      ok: false,
      error: `failed to save speech audio: ${err instanceof Error ? err.message : String(err)}`,
    };
  }

  return {
    ok: true,
    url: `/api/projects/${encodeURIComponent(ctx.projectId)}/files/${filename}`,
  };
}
function sanitizeAspectRatio(raw: unknown): string {
if (typeof raw !== 'string') return '1:1';
return ASPECT_TO_SIZE[raw] ? raw : '1:1';
@ -595,4 +725,3 @@ export async function executeGenerateVideo(
url: `/api/projects/${encodeURIComponent(ctx.projectId)}/files/${filename}`,
};
}

View file

@ -4,6 +4,7 @@ import { seedProviderIfMissing } from './media-config.js';
import {
BYOK_SENSEAUDIO_TOOLS,
executeGenerateImage,
executeGenerateSpeech,
executeGenerateVideo,
isSenseAudioImageModel,
type BYOKToolContext,
@ -1255,24 +1256,29 @@ export function registerChatRoutes(app: Express, ctx: RegisterChatRoutesDeps) {
const executeOneTool = async (call: {
id: string;
function: { name: string; arguments: string };
}): Promise<{ ok: boolean; url?: string; error?: string; kind?: 'image' | 'video' }> => {
}): Promise<{ ok: boolean; url?: string; error?: string; kind?: 'image' | 'video' | 'speech' }> => {
const fnName = call?.function?.name ?? '';
if (fnName !== 'generate_image' && fnName !== 'generate_video') {
if (fnName !== 'generate_image' && fnName !== 'generate_video' && fnName !== 'generate_speech') {
return {
ok: false,
error: `unknown tool: ${fnName || 'unnamed'}`,
};
}
const toolKind = fnName === 'generate_image' ? 'image' : fnName === 'generate_video' ? 'video' : 'speech';
let args: any = {};
try {
args = JSON.parse(call.function.arguments || '{}');
} catch {
return { ok: false, error: 'tool arguments were not valid JSON' };
return { ok: false, error: 'tool arguments were not valid JSON', kind: toolKind };
}
if (fnName === 'generate_image') {
const result = await executeGenerateImage(args, toolCtx);
return { ...result, kind: 'image' };
}
if (fnName === 'generate_speech') {
const result = await executeGenerateSpeech(args, toolCtx);
return { ...result, kind: 'speech' };
}
// generate_video — longer (up to 5 min), async-with-polling.
const result = await executeGenerateVideo(args, toolCtx);
return { ...result, kind: 'video' };
@ -1339,9 +1345,13 @@ export function registerChatRoutes(app: Express, ctx: RegisterChatRoutesDeps) {
const content = result.ok
? result.kind === 'video'
? `Video generated successfully. URL: ${result.url}. Reply to the user with a clickable markdown link, e.g. [▶ Play video](${result.url}). Do NOT use markdown image syntax — the chat renderer does not embed <video> tags.`
: result.kind === 'speech'
? `Speech generated successfully. URL: ${result.url}. Reply to the user with a clickable markdown link to the MP3, e.g. [▶ Play voiceover](${result.url}).`
: `Image generated successfully. URL: ${result.url}. Reply to the user with: ![generated image](${result.url})`
: result.kind === 'video'
? `Video generation failed: ${result.error}. Apologize briefly and suggest a retry with a more specific prompt or a shorter duration.`
: result.kind === 'speech'
? `Speech generation failed: ${result.error}. Apologize briefly and suggest a retry with a shorter script or a valid voice id.`
: `Image generation failed: ${result.error}. Apologize briefly and suggest a retry with a more specific prompt.`;
workingMessages.push({
role: 'tool',

View file

@ -6,6 +6,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import {
BYOK_SENSEAUDIO_TOOLS,
executeGenerateImage,
executeGenerateSpeech,
executeGenerateVideo,
} from '../src/byok-tools.js';
@ -17,7 +18,8 @@ describe('BYOK_SENSEAUDIO_TOOLS', () => {
expect(tool).toBeDefined();
expect(tool!.type).toBe('function');
expect(tool!.function.parameters.required).toEqual(['prompt']);
expect(tool!.function.parameters.properties.aspect_ratio.enum).toEqual([
const properties = tool!.function.parameters.properties as Record<string, any>;
expect(properties.aspect_ratio.enum).toEqual([
'1:1',
'16:9',
'9:16',
@ -26,9 +28,9 @@ describe('BYOK_SENSEAUDIO_TOOLS', () => {
]);
});
it('exposes both generate_image and generate_video tools', () => {
it('exposes image, speech, and video tools', () => {
const names = BYOK_SENSEAUDIO_TOOLS.map((t) => t.function.name).sort();
expect(names).toEqual(['generate_image', 'generate_video']);
expect(names).toEqual(['generate_image', 'generate_speech', 'generate_video']);
});
});
@ -381,6 +383,212 @@ describe('BYOK_SENSEAUDIO_TOOLS — video', () => {
});
});
// Exercises executeGenerateSpeech against a stubbed global fetch and a
// throwaway projects directory: happy path, versioned gateway URLs, and the
// failure modes that must come back as { ok: false }.
describe('executeGenerateSpeech', () => {
  const PROJECT_ID = 'test-project';
  const realFetch = globalThis.fetch;
  let root: string;
  let projectsRoot: string;

  // Tool context shared by every case; only the upstream base URL varies.
  const speechCtx = (
    upstreamBaseUrl = 'https://api.senseaudio.cn',
  ): Parameters<typeof executeGenerateSpeech>[1] => ({
    projectRoot: root,
    projectsRoot,
    projectId: PROJECT_ID,
    upstreamApiKey: 'sa-byok-key',
    upstreamBaseUrl,
  });

  // A 200 JSON response shaped like a successful SenseAudio TTS reply.
  const audioResponse = (audio: string): Response =>
    new Response(
      JSON.stringify({
        data: { audio },
        base_resp: { status_code: 0, status_msg: 'success' },
      }),
      { status: 200, headers: { 'content-type': 'application/json' } },
    );

  beforeEach(async () => {
    root = await mkdtemp(path.join(tmpdir(), 'od-byok-speech-'));
    projectsRoot = path.join(root, 'projects');
  });

  afterEach(async () => {
    globalThis.fetch = realFetch;
    vi.unstubAllGlobals();
    await rm(root, { recursive: true, force: true });
  });

  it('calls /v1/t2a_v2, persists mp3 bytes, and returns a daemon URL', async () => {
    const script = 'Meet saddle2 — the way work was supposed to feel.';
    const audioBytes = Buffer.from([0x49, 0x44, 0x33, 0x04]);
    const fetchMock = vi.fn(async (input: unknown, init?: RequestInit) => {
      expect(String(input)).toBe('https://api.senseaudio.cn/v1/t2a_v2');
      expect(init?.method).toBe('POST');
      expect(init?.redirect).toBe('error');
      expect(init?.headers).toMatchObject({
        authorization: 'Bearer sa-byok-key',
        'content-type': 'application/json',
      });
      const sentBody = JSON.parse(String(init?.body));
      expect(sentBody).toEqual({
        model: 'senseaudio-tts-1.5-260319',
        text: script,
        stream: false,
        voice_setting: {
          voice_id: 'female_0033_b',
          speed: 1,
          vol: 1,
          pitch: 0,
        },
        audio_setting: {
          format: 'mp3',
          sample_rate: 32000,
          bitrate: 128000,
          channel: 2,
        },
      });
      return audioResponse(audioBytes.toString('hex'));
    });
    vi.stubGlobal('fetch', fetchMock);

    const result = await executeGenerateSpeech({ text: script }, speechCtx());

    expect(result.ok).toBe(true);
    expect(result.url).toMatch(
      new RegExp(`^/api/projects/${PROJECT_ID}/files/byok-speech-[a-z0-9-]+\\.mp3$`),
    );
    // The file named in the URL must exist on disk with the decoded bytes.
    const filename = result.url!.split('/').pop()!;
    const onDisk = await readFile(path.join(projectsRoot, PROJECT_ID, filename));
    expect(onDisk.equals(audioBytes)).toBe(true);
  });

  it('does not duplicate /v1 when the BYOK gateway base URL is already versioned', async () => {
    const audioBytes = Buffer.from([0x49, 0x44, 0x33, 0x04]);
    const fetchMock = vi.fn(async (input: unknown) => {
      expect(String(input)).toBe('https://gateway.example.com/api/v1/openai/t2a_v2');
      return audioResponse(audioBytes.toString('hex'));
    });
    vi.stubGlobal('fetch', fetchMock);

    const result = await executeGenerateSpeech(
      { text: 'hello' },
      speechCtx('https://gateway.example.com/api/v1/openai'),
    );

    expect(result.ok).toBe(true);
    expect(fetchMock).toHaveBeenCalledTimes(1);
  });

  it('returns { ok: false } when SenseAudio returns malformed JSON', async () => {
    const plainTextReply = async () =>
      new Response('not json', {
        status: 200,
        headers: { 'content-type': 'text/plain' },
      });
    vi.stubGlobal('fetch', vi.fn(plainTextReply));

    const result = await executeGenerateSpeech({ text: 'hello' }, speechCtx());

    expect(result.ok).toBe(false);
    expect(result.error).toMatch(/senseaudio speech non-JSON/);
  });

  it('returns { ok: false } when the SenseAudio request fails', async () => {
    const failingFetch = vi.fn(async () => {
      throw new Error('network down');
    });
    vi.stubGlobal('fetch', failingFetch);

    const result = await executeGenerateSpeech({ text: 'hello' }, speechCtx());

    expect(result).toEqual({ ok: false, error: 'network down' });
  });

  it('asks fetch to reject redirected SenseAudio TTS upstreams', async () => {
    const fetchMock = vi.fn(async (_input: unknown, init?: RequestInit) => {
      expect(init?.redirect).toBe('error');
      throw new TypeError('redirect mode is set to error');
    });
    vi.stubGlobal('fetch', fetchMock);

    const result = await executeGenerateSpeech({ text: 'hello' }, speechCtx());

    expect(result).toEqual({ ok: false, error: 'redirect mode is set to error' });
  });

  // 'aaZZ' has non-hex characters; 'abc' has an odd number of digits.
  it.each(['aaZZ', 'abc'])(
    'returns { ok: false } when SenseAudio returns malformed hex audio: %s',
    async (audio) => {
      vi.stubGlobal('fetch', vi.fn(async () => audioResponse(audio)));

      const result = await executeGenerateSpeech({ text: 'hello' }, speechCtx());

      expect(result.ok).toBe(false);
      expect(result.error).toMatch(/invalid hex audio/);
    },
  );
});
describe('executeGenerateVideo', () => {
let root: string;
let projectsRoot: string;

View file

@ -849,6 +849,63 @@ describe('API proxy routes', () => {
expect(toolMsg.content).toMatch(/sensitive_content_blocked/);
});
// A generate_speech tool call whose arguments are truncated JSON must be fed
// back to the model as a speech-specific failure, not the image fallback copy.
it('feeds speech-specific tool error copy when generate_speech arguments are malformed', async () => {
  const chatUrl = 'https://api.senseaudio.cn/v1/chat/completions';
  // Each SSE frame is followed by an empty line, matching the wire format.
  const sseFrames = (frames: string[]) =>
    sseResponse(frames.flatMap((frame) => [frame, '']).join('\n'));

  const upstreamChatBodies: any[] = [];
  const fetchMock = vi.fn(async (input: FetchInput, init?: FetchInit) => {
    const target = String(input);
    // Requests to the daemon under test go through untouched.
    if (target.startsWith(baseUrl)) return realFetch(input, init);
    if (target !== chatUrl) throw new Error(`unexpected fetch: ${target}`);

    upstreamChatBodies.push(JSON.parse(String(init?.body || '{}')));
    const isFirstTurn = upstreamChatBodies.length === 1;
    if (isFirstTurn) {
      // First completion: the model emits a tool call with broken arguments.
      return sseFrames([
        'data: {"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"id":"call_speech_bad_args","type":"function","function":{"name":"generate_speech","arguments":"{\\"text\\":"}}]},"finish_reason":null}]}',
        'data: {"choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}]}',
        'data: [DONE]',
      ]);
    }
    // Any later completion: plain assistant text that ends the loop.
    return sseFrames([
      'data: {"choices":[{"index":0,"delta":{"content":"I need a valid script before generating speech."}}]}',
      'data: {"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}',
      'data: [DONE]',
    ]);
  });
  vi.stubGlobal('fetch', fetchMock);

  const requestBody = JSON.stringify({
    baseUrl: 'https://api.senseaudio.cn',
    apiKey: 'sa-test',
    projectId: 'test-project',
    model: 'senseaudio-s2',
    messages: [{ role: 'user', content: 'make a voiceover' }],
  });
  const res = await realFetch(`${baseUrl}/api/proxy/senseaudio/stream`, {
    method: 'POST',
    headers: { 'content-type': 'application/json' },
    body: requestBody,
  });

  expect(res.status).toBe(200);
  const streamed = await res.text();
  expect(streamed).toContain('I need a valid script before generating speech.');
  expect(upstreamChatBodies).toHaveLength(2);

  // Second upstream request: user message, assistant tool call, tool result.
  const toolMsg = upstreamChatBodies[1].messages[2];
  expect(toolMsg.role).toBe('tool');
  expect(toolMsg.tool_call_id).toBe('call_speech_bad_args');
  expect(toolMsg.content).toMatch(/Speech generation failed/);
  expect(toolMsg.content).toMatch(/tool arguments were not valid JSON/);
  expect(toolMsg.content).not.toMatch(/Image generation failed/);
});
it('bounds the BYOK tool loop at MAX_BYOK_TOOL_LOOPS=3', async () => {
let chatCallIndex = 0;
const fetchMock = vi.fn(async (input: FetchInput, init?: FetchInit) => {