add hermes tts playback (#541)

2026-05-08 15:34:11 +08:00
parent 10d2f892ff
commit d54f9479b9
8 changed files with 218 additions and 144 deletions
@@ -0,0 +1,28 @@
+import type { Context } from 'koa'
+import { textToSpeech } from '../../services/hermes/tts'
+
+/**
+ * POST handler: synthesizes speech for `body.text` and replies with `audio/mpeg` bytes.
+ * Responds 400 when `text` is missing, not a string, blank, or longer than 5000 characters.
+ * `lang` is forwarded unvalidated; errors thrown by `textToSpeech` are not caught here.
+ */
+export async function generate(ctx: Context) {
+  // Fall back to {} so an unset body yields the 400 below, not a destructuring TypeError.
+  const { text, lang } = (ctx.request.body ?? {}) as { text?: string; lang?: string }
+
+  // Whitespace-only input has nothing to speak, so it is treated like a missing value.
+  if (typeof text !== 'string' || text.trim() === '') {
+    ctx.status = 400
+    ctx.body = { error: 'text is required' }
+    return
+  }
+  if (text.length > 5000) {
+    ctx.status = 400
+    ctx.body = { error: 'text is too long (max 5000 characters)' }
+    return
+  }
+
+  const { audio, engine } = await textToSpeech({ text, lang })
+  ctx.set({ 'Content-Type': 'audio/mpeg', 'Content-Length': String(audio.length), 'X-TTS-Engine': engine })
+  ctx.body = audio
+}
@@ -0,0 +1,6 @@
+import Router from '@koa/router'
+import * as ctrl from '../../controllers/hermes/tts'
+
+/** Router exposing the Hermes text-to-speech endpoint (`POST /api/hermes/tts`). */
+export const ttsRoutes = new Router()
+  .post('/api/hermes/tts', ctrl.generate)
@@ -26,6 +26,7 @@ import { downloadRoutes } from './hermes/download'
 import { jobRoutes } from './hermes/jobs'
 import { cronHistoryRoutes } from './hermes/cron-history'
 import { kanbanRoutes } from './hermes/kanban'
+import { ttsRoutes } from './hermes/tts'
 import { proxyRoutes, proxyMiddleware } from './hermes/proxy'
 import { groupChatRoutes, setGroupChatServer } from './hermes/group-chat'

@@ -66,6 +67,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
  app.use(jobRoutes.routes())               // Must be before proxy
  app.use(cronHistoryRoutes.routes())        // Must be before proxy
  app.use(kanbanRoutes.routes())             // Must be before proxy
+  app.use(ttsRoutes.routes())                // Must be before proxy
  app.use(proxyRoutes.routes())

  // Proxy catch-all middleware (must be last)
@@ -0,0 +1,41 @@
+import { EdgeTTS } from 'node-edge-tts'
+import { tmpdir } from 'os'
+import { join } from 'path'
+import { readFile, unlink } from 'fs/promises'
+import { randomUUID } from 'crypto'
+import { logger } from '../logger'
+
+const FIXED_VOICE = 'zh-CN-XiaoxiaoNeural' // `voice` option given to EdgeTTS for every request
+const FIXED_RATE = '+4%' // `rate` option given to EdgeTTS
+const FIXED_PITCH = '+12Hz' // `pitch` option given to EdgeTTS
+
+export interface TtsOptions {
+  text: string // text handed to EdgeTTS `ttsPromise` for synthesis
+  lang?: string // accepted, but not read by `edgeTts` in this file (the voice is fixed)
+}
+
+/**
+ * Synthesizes `opts.text` with EdgeTTS using the fixed voice, rate and pitch.
+ * `opts.lang` is not read by this function.
+ * @returns the contents of the `.mp3` temp file written by `ttsPromise`
+ * @throws errors from EdgeTTS or `readFile` propagate to the caller
+ */
+export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
+  // The audio goes through a uniquely named file in the OS temp directory.
+  const outputPath = join(tmpdir(), `tts-${randomUUID()}.mp3`)
+
+  try {
+    const settings = { voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH, timeout: 15000 }
+    await new EdgeTTS(settings).ttsPromise(opts.text, outputPath)
+    return await readFile(outputPath)
+  } finally {
+    // Best-effort cleanup: the removal is not awaited and its failure is ignored.
+    unlink(outputPath).catch(() => {})
+  }
+}
+
+export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> {
+  const result = { audio: await edgeTts(opts), engine: 'edge' } // Edge is the only engine invoked here
+  logger.debug({ engine: 'edge', voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH }, 'TTS generated via Edge')
+  return result // reached only after synthesis succeeded; a rejection from edgeTts propagates
+}