add hermes tts playback (#541)

This commit is contained in:
ekko
2026-05-08 15:34:11 +08:00
committed by GitHub
parent 10d2f892ff
commit d54f9479b9
8 changed files with 218 additions and 144 deletions
@@ -0,0 +1,28 @@
import type { Context } from 'koa'
import { textToSpeech } from '../../services/hermes/tts'
/**
 * Koa handler that turns the posted text into speech and replies with the audio.
 *
 * Reads `text` (required) and `lang` (optional) from the parsed request body.
 * Replies with status 400 and a JSON `{ error }` payload when `text` is missing,
 * is not a string, contains only whitespace, or is longer than the limit.
 * On success the response body is the audio buffer returned by `textToSpeech`,
 * sent with `Content-Type: audio/mpeg`, an explicit `Content-Length`, and an
 * `X-TTS-Engine` header carrying the engine name reported by the service.
 *
 * Errors thrown by `textToSpeech` are not caught here and propagate to the
 * surrounding middleware.
 *
 * @param ctx Koa request/response context.
 */
export async function generate(ctx: Context) {
  const MAX_TEXT_LENGTH = 5000

  // The parsed body can be undefined/null (no payload, or no parser ran for this
  // content type). Fall back to an empty object so validation answers with a 400
  // instead of the destructuring throwing a TypeError.
  const payload = (ctx.request.body ?? {}) as { text?: unknown; lang?: unknown }
  const { text } = payload
  // `lang` is optional: anything that is not a string is treated as "not provided"
  // rather than being forwarded to the service behind a type assertion.
  const lang = typeof payload.lang === 'string' ? payload.lang : undefined

  // Whitespace-only text has nothing to synthesize, so it counts as missing.
  if (typeof text !== 'string' || text.trim().length === 0) {
    ctx.status = 400
    ctx.body = { error: 'text is required' }
    return
  }

  if (text.length > MAX_TEXT_LENGTH) {
    ctx.status = 400
    ctx.body = { error: `text is too long (max ${MAX_TEXT_LENGTH} characters)` }
    return
  }

  const { audio, engine } = await textToSpeech({ text, lang })

  ctx.set('Content-Type', 'audio/mpeg')
  ctx.set('Content-Length', String(audio.length))
  ctx.set('X-TTS-Engine', engine)
  ctx.body = audio
}
+6
View File
@@ -0,0 +1,6 @@
import Router from '@koa/router'
import * as ctrl from '../../controllers/hermes/tts'
// Router exposing the Hermes text-to-speech endpoint.
export const ttsRoutes = new Router()
// POST /api/hermes/tts is handled by the `generate` controller.
ttsRoutes.post('/api/hermes/tts', ctrl.generate)
+2
View File
@@ -26,6 +26,7 @@ import { downloadRoutes } from './hermes/download'
import { jobRoutes } from './hermes/jobs'
import { cronHistoryRoutes } from './hermes/cron-history'
import { kanbanRoutes } from './hermes/kanban'
import { ttsRoutes } from './hermes/tts'
import { proxyRoutes, proxyMiddleware } from './hermes/proxy'
import { groupChatRoutes, setGroupChatServer } from './hermes/group-chat'
@@ -66,6 +67,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
app.use(jobRoutes.routes()) // Must be before proxy
app.use(cronHistoryRoutes.routes()) // Must be before proxy
app.use(kanbanRoutes.routes()) // Must be before proxy
app.use(ttsRoutes.routes()) // Must be before proxy
app.use(proxyRoutes.routes())
// Proxy catch-all middleware (must be last)
@@ -0,0 +1,41 @@
import { EdgeTTS } from 'node-edge-tts'
import { tmpdir } from 'os'
import { join } from 'path'
import { readFile, unlink } from 'fs/promises'
import { randomUUID } from 'crypto'
import { logger } from '../logger'
// Voice name handed to EdgeTTS for every synthesis call in this module.
const FIXED_VOICE = 'zh-CN-XiaoxiaoNeural'
// Speaking-rate setting handed to EdgeTTS for every synthesis call.
const FIXED_RATE = '+4%'
// Pitch setting handed to EdgeTTS for every synthesis call.
const FIXED_PITCH = '+12Hz'
/** Input accepted by the text-to-speech functions in this module. */
export interface TtsOptions {
/** Text to synthesize. */
text: string
/**
 * Optional language supplied by the caller.
 * NOTE(review): `edgeTts` in this module does not read this field; the voice is fixed.
 */
lang?: string
}
/**
 * Synthesizes `opts.text` with EdgeTTS and returns the generated audio bytes.
 *
 * The engine writes its output to a uniquely named `tts-<uuid>.mp3` file in the
 * OS temp directory; the file is read back into memory and then removed,
 * whether synthesis succeeded or failed. The voice, rate and pitch are the
 * module-level fixed values; `opts.lang` is not consulted.
 *
 * @param opts Synthesis options; only `text` is used.
 * @returns The contents of the audio file written by the engine.
 * @throws Whatever `ttsPromise` or `readFile` rejects with. Cleanup problems
 *   are logged and never replace the original error.
 */
export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
  const tmpFile = join(tmpdir(), `tts-${randomUUID()}.mp3`)

  try {
    const tts = new EdgeTTS({
      voice: FIXED_VOICE,
      rate: FIXED_RATE,
      pitch: FIXED_PITCH,
      timeout: 15000,
    })
    await tts.ttsPromise(opts.text, tmpFile)
    return await readFile(tmpFile)
  } finally {
    // Wait for the removal so the temp file is gone before the caller continues,
    // and keep the cleanup from throwing: an exception raised inside `finally`
    // would mask the synthesis error (or discard the successful result).
    try {
      await unlink(tmpFile)
    } catch (err: unknown) {
      const code = (err as { code?: unknown } | null)?.code
      // ENOENT just means synthesis failed before the file was created.
      if (code !== 'ENOENT') {
        logger.warn({ err, tmpFile }, 'Failed to remove temporary TTS file')
      }
    }
  }
}
/**
 * Produces speech audio for the given options and reports which engine made it.
 *
 * Delegates synthesis to `edgeTts`, emits a debug log entry describing the
 * fixed voice settings that were used, and returns the audio together with
 * the engine name `'edge'`.
 *
 * @param opts Synthesis options forwarded unchanged to `edgeTts`.
 * @returns The synthesized audio buffer and the engine identifier.
 */
export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> {
  const engine = 'edge'
  const audio = await edgeTts(opts)

  const settings = { engine, voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH }
  logger.debug(settings, 'TTS generated via Edge')

  return { audio, engine }
}