fix: enable SenseAudio BYOK TTS (#2570)

* fix: enable SenseAudio BYOK TTS
* fix: handle BYOK SenseAudio TTS failures
* fix: harden SenseAudio BYOK TTS responses
* fix: preserve BYOK speech tool failure kind
* fix: handle versioned SenseAudio TTS base URLs
2026-06-01 03:14:35 +07:00 · 2026-05-22 14:04:29 +08:00 · 2026-05-22 14:04:29 +08:00 · 5b53c44e13
commit 5b53c44e13
parent e8b5dd8aaf
4 changed files with 415 additions and 11 deletions
--- a/apps/daemon/src/byok-tools.ts
+++ b/apps/daemon/src/byok-tools.ts
@ -9,10 +9,8 @@
 // back as a `role: 'tool'` message → re-issue the completion. The chat surface
 // stays the same; the tool dispatch happens entirely daemon-side.
 //
-// Today we ship one tool — `generate_image` — backed by SenseAudio's
-// /v1/image/sync endpoint, since the BYOK chat session already authenticates
-// against SenseAudio with the same API key. Additional tools (TTS, video,
-// research) can be added here as the BYOK surface expands.
+// Today we ship image, video, and speech tools backed by SenseAudio endpoints,
+// since the BYOK chat session already authenticates with the same API key.

 import path from 'node:path';
 import { writeFile } from 'node:fs/promises';
@ -43,6 +41,18 @@ export function isSenseAudioImageModel(value: unknown): value is string {

 const SENSEAUDIO_DEFAULT_BASE_URL = 'https://api.senseaudio.cn';
 const PROMPT_MAX_LENGTH = 2000;
+const SENSEAUDIO_TTS_MODEL = 'senseaudio-tts-1.5-260319';
+const SENSEAUDIO_DEFAULT_VOICE_ID = 'female_0033_b';
+const HEX_AUDIO_PATTERN = /^[0-9a-fA-F]+$/;
+
+function appendSenseAudioApiPath(baseUrl: string, path: string): string {
+  const url = new URL(baseUrl);
+  const trimmed = url.pathname.replace(/\/+$/, '');
+  url.pathname = /\/v\d+(\/|$)/.test(trimmed)
+    ? `${trimmed}${path}`
+    : `${trimmed}/v1${path}`;
+  return url.toString();
+}

 // SenseAudio video — the API only documents one model today, so the
 // wire id is a const. The chat tool's `generate_video` param surface
@ -122,6 +132,30 @@ export const BYOK_SENSEAUDIO_TOOLS = [
      },
    },
  },
+  {
+    type: 'function' as const,
+    function: {
+      name: 'generate_speech',
+      description:
+        'Generate a text-to-speech voiceover using SenseAudio TTS. Returns a URL pointing to the rendered MP3. Use this whenever the user asks for narration, voiceover, speech, TTS, or spoken audio. After this tool succeeds, reply with a clickable markdown link to the MP3.',
+      parameters: {
+        type: 'object',
+        properties: {
+          text: {
+            type: 'string',
+            description:
+              'Exact script to speak. Include only the words that should be spoken, not production notes.',
+          },
+          voice_id: {
+            type: 'string',
+            description:
+              `Optional SenseAudio voice id. Defaults to ${SENSEAUDIO_DEFAULT_VOICE_ID}.`,
+          },
+        },
+        required: ['text'],
+      },
+    },
+  },
  {
    type: 'function' as const,
    function: {
@ -217,6 +251,102 @@ export interface ImageToolResult {
  error?: string;
 }

+export async function executeGenerateSpeech(
+  args: { text?: unknown; voice_id?: unknown },
+  ctx: BYOKToolContext,
+): Promise<ImageToolResult> {
+  const text = typeof args.text === 'string' ? args.text.trim() : '';
+  if (!text) return { ok: false, error: 'text is required' };
+
+  let dir: string;
+  try {
+    dir = await ensureProject(ctx.projectsRoot, ctx.projectId);
+  } catch (err) {
+    return {
+      ok: false,
+      error: `invalid projectId for speech storage: ${err instanceof Error ? err.message : String(err)}`,
+    };
+  }
+
+  const apiKey = ctx.upstreamApiKey;
+  if (!apiKey) return { ok: false, error: 'no SenseAudio API key available' };
+
+  const voiceId =
+    typeof args.voice_id === 'string' && args.voice_id.trim()
+      ? args.voice_id.trim()
+      : SENSEAUDIO_DEFAULT_VOICE_ID;
+  const baseUrl = ctx.upstreamBaseUrl || SENSEAUDIO_DEFAULT_BASE_URL;
+  let data: {
+    data?: { audio?: string };
+    base_resp?: { status_code?: number; status_msg?: string };
+  };
+  try {
+    const resp = await fetch(appendSenseAudioApiPath(baseUrl, '/t2a_v2'), {
+      method: 'POST',
+      redirect: 'error',
+      headers: {
+        authorization: `Bearer ${apiKey}`,
+        'content-type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: SENSEAUDIO_TTS_MODEL,
+        text,
+        stream: false,
+        voice_setting: {
+          voice_id: voiceId,
+          speed: 1,
+          vol: 1,
+          pitch: 0,
+        },
+        audio_setting: {
+          format: 'mp3',
+          sample_rate: 32000,
+          bitrate: 128000,
+          channel: 2,
+        },
+      }),
+    });
+    const respText = await resp.text();
+    if (!resp.ok) {
+      return { ok: false, error: `senseaudio speech ${resp.status}: ${respText.slice(0, 240)}` };
+    }
+    try {
+      data = JSON.parse(respText) as typeof data;
+    } catch {
+      return { ok: false, error: `senseaudio speech non-JSON: ${respText.slice(0, 200)}` };
+    }
+  } catch (err) {
+    return {
+      ok: false,
+      error: err instanceof Error ? err.message : String(err),
+    };
+  }
+  if (data?.base_resp && data.base_resp.status_code !== 0) {
+    return {
+      ok: false,
+      error: `senseaudio speech api error ${data.base_resp.status_code}: ${data.base_resp.status_msg || 'unknown'}`,
+    };
+  }
+  const hex = data?.data?.audio;
+  if (typeof hex !== 'string' || !hex) {
+    return { ok: false, error: 'senseaudio speech response missing data.audio' };
+  }
+  if (hex.length % 2 !== 0 || !HEX_AUDIO_PATTERN.test(hex)) {
+    return { ok: false, error: 'senseaudio speech response contained invalid hex audio' };
+  }
+  const bytes = Buffer.from(hex, 'hex');
+  if (bytes.length === 0) return { ok: false, error: 'senseaudio speech decoded zero bytes' };
+
+  const id = `${Date.now().toString(36)}-${randomBytes(4).toString('hex')}`;
+  const filename = `byok-speech-${id}.mp3`;
+  await writeFile(path.join(dir, filename), bytes);
+
+  return {
+    ok: true,
+    url: `/api/projects/${encodeURIComponent(ctx.projectId)}/files/${filename}`,
+  };
+}
+
 function sanitizeAspectRatio(raw: unknown): string {
  if (typeof raw !== 'string') return '1:1';
  return ASPECT_TO_SIZE[raw] ? raw : '1:1';
@ -595,4 +725,3 @@ export async function executeGenerateVideo(
    url: `/api/projects/${encodeURIComponent(ctx.projectId)}/files/${filename}`,
  };
 }
-
--- a/apps/daemon/src/chat-routes.ts
+++ b/apps/daemon/src/chat-routes.ts
@ -4,6 +4,7 @@ import { seedProviderIfMissing } from './media-config.js';
 import {
  BYOK_SENSEAUDIO_TOOLS,
  executeGenerateImage,
+  executeGenerateSpeech,
  executeGenerateVideo,
  isSenseAudioImageModel,
  type BYOKToolContext,
@ -1255,24 +1256,29 @@ export function registerChatRoutes(app: Express, ctx: RegisterChatRoutesDeps) {
    const executeOneTool = async (call: {
      id: string;
      function: { name: string; arguments: string };
-    }): Promise<{ ok: boolean; url?: string; error?: string; kind?: 'image' | 'video' }> => {
+    }): Promise<{ ok: boolean; url?: string; error?: string; kind?: 'image' | 'video' | 'speech' }> => {
      const fnName = call?.function?.name ?? '';
-      if (fnName !== 'generate_image' && fnName !== 'generate_video') {
+      if (fnName !== 'generate_image' && fnName !== 'generate_video' && fnName !== 'generate_speech') {
        return {
          ok: false,
          error: `unknown tool: ${fnName || 'unnamed'}`,
        };
      }
+      const toolKind = fnName === 'generate_image' ? 'image' : fnName === 'generate_video' ? 'video' : 'speech';
      let args: any = {};
      try {
        args = JSON.parse(call.function.arguments || '{}');
      } catch {
-        return { ok: false, error: 'tool arguments were not valid JSON' };
+        return { ok: false, error: 'tool arguments were not valid JSON', kind: toolKind };
      }
      if (fnName === 'generate_image') {
        const result = await executeGenerateImage(args, toolCtx);
        return { ...result, kind: 'image' };
      }
+      if (fnName === 'generate_speech') {
+        const result = await executeGenerateSpeech(args, toolCtx);
+        return { ...result, kind: 'speech' };
+      }
      // generate_video — longer (up to 5 min), async-with-polling.
      const result = await executeGenerateVideo(args, toolCtx);
      return { ...result, kind: 'video' };
@ -1339,9 +1345,13 @@ export function registerChatRoutes(app: Express, ctx: RegisterChatRoutesDeps) {
          const content = result.ok
            ? result.kind === 'video'
              ? `Video generated successfully. URL: ${result.url}. Reply to the user with a clickable markdown link, e.g. [▶ Play video](${result.url}). Do NOT use markdown image syntax — the chat renderer does not embed <video> tags.`
+              : result.kind === 'speech'
+                ? `Speech generated successfully. URL: ${result.url}. Reply to the user with a clickable markdown link to the MP3, e.g. [▶ Play voiceover](${result.url}).`
              : `Image generated successfully. URL: ${result.url}. Reply to the user with: ![generated image](${result.url})`
            : result.kind === 'video'
              ? `Video generation failed: ${result.error}. Apologize briefly and suggest a retry with a more specific prompt or a shorter duration.`
+              : result.kind === 'speech'
+                ? `Speech generation failed: ${result.error}. Apologize briefly and suggest a retry with a shorter script or a valid voice id.`
              : `Image generation failed: ${result.error}. Apologize briefly and suggest a retry with a more specific prompt.`;
          workingMessages.push({
            role: 'tool',
--- a/apps/daemon/tests/byok-tools.test.ts
+++ b/apps/daemon/tests/byok-tools.test.ts
@ -6,6 +6,7 @@ import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import {
  BYOK_SENSEAUDIO_TOOLS,
  executeGenerateImage,
+  executeGenerateSpeech,
  executeGenerateVideo,
 } from '../src/byok-tools.js';

@ -17,7 +18,8 @@ describe('BYOK_SENSEAUDIO_TOOLS', () => {
    expect(tool).toBeDefined();
    expect(tool!.type).toBe('function');
    expect(tool!.function.parameters.required).toEqual(['prompt']);
-    expect(tool!.function.parameters.properties.aspect_ratio.enum).toEqual([
+    const properties = tool!.function.parameters.properties as Record<string, any>;
+    expect(properties.aspect_ratio.enum).toEqual([
      '1:1',
      '16:9',
      '9:16',
@ -26,9 +28,9 @@ describe('BYOK_SENSEAUDIO_TOOLS', () => {
    ]);
  });

-  it('exposes both generate_image and generate_video tools', () => {
+  it('exposes image, speech, and video tools', () => {
    const names = BYOK_SENSEAUDIO_TOOLS.map((t) => t.function.name).sort();
-    expect(names).toEqual(['generate_image', 'generate_video']);
+    expect(names).toEqual(['generate_image', 'generate_speech', 'generate_video']);
  });
 });

@ -381,6 +383,212 @@ describe('BYOK_SENSEAUDIO_TOOLS — video', () => {
  });
 });

+describe('executeGenerateSpeech', () => {
+  let root: string;
+  let projectsRoot: string;
+  const PROJECT_ID = 'test-project';
+  const realFetch = globalThis.fetch;
+
+  beforeEach(async () => {
+    root = await mkdtemp(path.join(tmpdir(), 'od-byok-speech-'));
+    projectsRoot = path.join(root, 'projects');
+  });
+
+  afterEach(async () => {
+    globalThis.fetch = realFetch;
+    vi.unstubAllGlobals();
+    await rm(root, { recursive: true, force: true });
+  });
+
+  it('calls /v1/t2a_v2, persists mp3 bytes, and returns a daemon URL', async () => {
+    const audioBytes = Buffer.from([0x49, 0x44, 0x33, 0x04]);
+    const fetchMock = vi.fn(async (input: unknown, init?: RequestInit) => {
+      expect(String(input)).toBe('https://api.senseaudio.cn/v1/t2a_v2');
+      expect(init?.method).toBe('POST');
+      expect(init?.redirect).toBe('error');
+      expect(init?.headers).toMatchObject({
+        authorization: 'Bearer sa-byok-key',
+        'content-type': 'application/json',
+      });
+      expect(JSON.parse(String(init?.body))).toEqual({
+        model: 'senseaudio-tts-1.5-260319',
+        text: 'Meet saddle2 — the way work was supposed to feel.',
+        stream: false,
+        voice_setting: {
+          voice_id: 'female_0033_b',
+          speed: 1,
+          vol: 1,
+          pitch: 0,
+        },
+        audio_setting: {
+          format: 'mp3',
+          sample_rate: 32000,
+          bitrate: 128000,
+          channel: 2,
+        },
+      });
+      return new Response(
+        JSON.stringify({
+          data: { audio: audioBytes.toString('hex') },
+          base_resp: { status_code: 0, status_msg: 'success' },
+        }),
+        { status: 200, headers: { 'content-type': 'application/json' } },
+      );
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await executeGenerateSpeech(
+      { text: 'Meet saddle2 — the way work was supposed to feel.' },
+      {
+        projectRoot: root,
+        projectsRoot,
+        projectId: PROJECT_ID,
+        upstreamApiKey: 'sa-byok-key',
+        upstreamBaseUrl: 'https://api.senseaudio.cn',
+      },
+    );
+
+    expect(result.ok).toBe(true);
+    expect(result.url).toMatch(
+      new RegExp(`^/api/projects/${PROJECT_ID}/files/byok-speech-[a-z0-9-]+\\.mp3$`),
+    );
+
+    const filename = result.url!.split('/').pop()!;
+    const onDisk = await readFile(path.join(projectsRoot, PROJECT_ID, filename));
+    expect(onDisk.equals(audioBytes)).toBe(true);
+  });
+
+  it('does not duplicate /v1 when the BYOK gateway base URL is already versioned', async () => {
+    const audioBytes = Buffer.from([0x49, 0x44, 0x33, 0x04]);
+    const fetchMock = vi.fn(async (input: unknown) => {
+      expect(String(input)).toBe('https://gateway.example.com/api/v1/openai/t2a_v2');
+      return new Response(
+        JSON.stringify({
+          data: { audio: audioBytes.toString('hex') },
+          base_resp: { status_code: 0, status_msg: 'success' },
+        }),
+        { status: 200, headers: { 'content-type': 'application/json' } },
+      );
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await executeGenerateSpeech(
+      { text: 'hello' },
+      {
+        projectRoot: root,
+        projectsRoot,
+        projectId: PROJECT_ID,
+        upstreamApiKey: 'sa-byok-key',
+        upstreamBaseUrl: 'https://gateway.example.com/api/v1/openai',
+      },
+    );
+
+    expect(result.ok).toBe(true);
+    expect(fetchMock).toHaveBeenCalledTimes(1);
+  });
+
+  it('returns { ok: false } when SenseAudio returns malformed JSON', async () => {
+    vi.stubGlobal(
+      'fetch',
+      vi.fn(async () =>
+        new Response('not json', {
+          status: 200,
+          headers: { 'content-type': 'text/plain' },
+        }),
+      ),
+    );
+
+    const result = await executeGenerateSpeech(
+      { text: 'hello' },
+      {
+        projectRoot: root,
+        projectsRoot,
+        projectId: PROJECT_ID,
+        upstreamApiKey: 'sa-byok-key',
+        upstreamBaseUrl: 'https://api.senseaudio.cn',
+      },
+    );
+
+    expect(result.ok).toBe(false);
+    expect(result.error).toMatch(/senseaudio speech non-JSON/);
+  });
+
+  it('returns { ok: false } when the SenseAudio request fails', async () => {
+    vi.stubGlobal(
+      'fetch',
+      vi.fn(async () => {
+        throw new Error('network down');
+      }),
+    );
+
+    const result = await executeGenerateSpeech(
+      { text: 'hello' },
+      {
+        projectRoot: root,
+        projectsRoot,
+        projectId: PROJECT_ID,
+        upstreamApiKey: 'sa-byok-key',
+        upstreamBaseUrl: 'https://api.senseaudio.cn',
+      },
+    );
+
+    expect(result).toEqual({ ok: false, error: 'network down' });
+  });
+
+  it('asks fetch to reject redirected SenseAudio TTS upstreams', async () => {
+    const fetchMock = vi.fn(async (_input: unknown, init?: RequestInit) => {
+      expect(init?.redirect).toBe('error');
+      throw new TypeError('redirect mode is set to error');
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    const result = await executeGenerateSpeech(
+      { text: 'hello' },
+      {
+        projectRoot: root,
+        projectsRoot,
+        projectId: PROJECT_ID,
+        upstreamApiKey: 'sa-byok-key',
+        upstreamBaseUrl: 'https://api.senseaudio.cn',
+      },
+    );
+
+    expect(result).toEqual({ ok: false, error: 'redirect mode is set to error' });
+  });
+
+  it.each(['aaZZ', 'abc'])(
+    'returns { ok: false } when SenseAudio returns malformed hex audio: %s',
+    async (audio) => {
+      vi.stubGlobal(
+        'fetch',
+        vi.fn(async () =>
+          new Response(
+            JSON.stringify({
+              data: { audio },
+              base_resp: { status_code: 0, status_msg: 'success' },
+            }),
+            { status: 200, headers: { 'content-type': 'application/json' } },
+          ),
+        ),
+      );
+
+      const result = await executeGenerateSpeech(
+        { text: 'hello' },
+        {
+          projectRoot: root,
+          projectsRoot,
+          projectId: PROJECT_ID,
+          upstreamApiKey: 'sa-byok-key',
+          upstreamBaseUrl: 'https://api.senseaudio.cn',
+        },
+      );
+
+      expect(result.ok).toBe(false);
+      expect(result.error).toMatch(/invalid hex audio/);
+    },
+  );
+});
+
 describe('executeGenerateVideo', () => {
  let root: string;
  let projectsRoot: string;
--- a/apps/daemon/tests/proxy-routes.test.ts
+++ b/apps/daemon/tests/proxy-routes.test.ts
@ -849,6 +849,63 @@ describe('API proxy routes', () => {
    expect(toolMsg.content).toMatch(/sensitive_content_blocked/);
  });

+  it('feeds speech-specific tool error copy when generate_speech arguments are malformed', async () => {
+    const upstreamChatBodies: any[] = [];
+    let chatCallIndex = 0;
+    const fetchMock = vi.fn(async (input: FetchInput, init?: FetchInit) => {
+      const url = String(input);
+      if (url.startsWith(baseUrl)) return realFetch(input, init);
+      if (url === 'https://api.senseaudio.cn/v1/chat/completions') {
+        upstreamChatBodies.push(JSON.parse(String(init?.body || '{}')));
+        chatCallIndex++;
+        if (chatCallIndex === 1) {
+          return sseResponse([
+            'data: {"choices":[{"index":0,"delta":{"tool_calls":[{"index":0,"id":"call_speech_bad_args","type":"function","function":{"name":"generate_speech","arguments":"{\\"text\\":"}}]},"finish_reason":null}]}',
+            '',
+            'data: {"choices":[{"index":0,"delta":{},"finish_reason":"tool_calls"}]}',
+            '',
+            'data: [DONE]',
+            '',
+          ].join('\n'));
+        }
+        return sseResponse([
+          'data: {"choices":[{"index":0,"delta":{"content":"I need a valid script before generating speech."}}]}',
+          '',
+          'data: {"choices":[{"index":0,"delta":{},"finish_reason":"stop"}]}',
+          '',
+          'data: [DONE]',
+          '',
+        ].join('\n'));
+      }
+      throw new Error(`unexpected fetch: ${url}`);
+    });
+    vi.stubGlobal('fetch', fetchMock);
+
+    const res = await realFetch(`${baseUrl}/api/proxy/senseaudio/stream`, {
+      method: 'POST',
+      headers: { 'content-type': 'application/json' },
+      body: JSON.stringify({
+        baseUrl: 'https://api.senseaudio.cn',
+        apiKey: 'sa-test',
+        projectId: 'test-project',
+        model: 'senseaudio-s2',
+        messages: [{ role: 'user', content: 'make a voiceover' }],
+      }),
+    });
+
+    expect(res.status).toBe(200);
+    const body = await res.text();
+    expect(body).toContain('I need a valid script before generating speech.');
+
+    expect(upstreamChatBodies).toHaveLength(2);
+    const toolMsg = upstreamChatBodies[1].messages[2];
+    expect(toolMsg.role).toBe('tool');
+    expect(toolMsg.tool_call_id).toBe('call_speech_bad_args');
+    expect(toolMsg.content).toMatch(/Speech generation failed/);
+    expect(toolMsg.content).toMatch(/tool arguments were not valid JSON/);
+    expect(toolMsg.content).not.toMatch(/Image generation failed/);
+  });
+
  it('bounds the BYOK tool loop at MAX_BYOK_TOOL_LOOPS=3', async () => {
    let chatCallIndex = 0;
    const fetchMock = vi.fn(async (input: FetchInput, init?: FetchInit) => {