feat: add voice playback settings with 4-provider support (#608)

Add WebSpeech, OpenAI TTS, Custom endpoint, and Edge TTS providers.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
2026-05-10 20:08:38 +08:00
parent 838791a740
commit 15195f0795
18 changed files with 1237 additions and 20 deletions
@@ -1,5 +1,5 @@
 import type { Context } from 'koa'
-import { textToSpeech } from '../../services/hermes/tts'
+import { textToSpeech, openaiCompatibleTts, speedToEdgeRate } from '../../services/hermes/tts'

 export async function generate(ctx: Context) {
  const { text, lang } = ctx.request.body as {
@@ -26,3 +26,41 @@ export async function generate(ctx: Context) {
  ctx.set('X-TTS-Engine', engine)
  ctx.body = audio
 }
+
+/**
+ * OpenAI-compatible TTS endpoint.
+ * Accepts: { model, input, voice, speed }
+ * Returns audio/mpeg stream.
+ *
+ * Responds 400 with `{ error }` when `input` is missing, not a string, or longer
+ * than 5000 characters, when `voice` is present but not a string, or when `speed`
+ * is present but not a finite positive number.
+ */
+export async function openaiProxy(ctx: Context) {
+  // The payload is untrusted client JSON: it may be absent, and its fields may
+  // not have the advertised types, so everything is narrowed before use.
+  const body = (ctx.request.body ?? {}) as {
+    input?: unknown
+    voice?: unknown
+    speed?: unknown
+    model?: unknown
+  }
+  const { input, voice, speed, model } = body
+
+  const reject = (message: string): void => {
+    ctx.status = 400
+    ctx.body = { error: message }
+  }
+
+  if (typeof input !== 'string' || input.length === 0) {
+    reject('input is required')
+    return
+  }
+
+  if (input.length > 5000) {
+    reject('input is too long (max 5000 characters)')
+    return
+  }
+
+  if (voice != null && typeof voice !== 'string') {
+    reject('voice must be a string')
+    return
+  }
+
+  // A non-numeric, infinite or non-positive speed would otherwise be turned into
+  // a malformed rate string (for example "NaN%") further down the pipeline.
+  if (speed != null && (typeof speed !== 'number' || !Number.isFinite(speed) || speed <= 0)) {
+    reject('speed must be a positive number')
+    return
+  }
+
+  const { audio, engine } = await openaiCompatibleTts({
+    input,
+    voice: typeof voice === 'string' ? voice : undefined,
+    speed: typeof speed === 'number' ? speed : undefined,
+    model: typeof model === 'string' ? model : undefined,
+  })
+
+  ctx.set('Content-Type', 'audio/mpeg')
+  ctx.set('Content-Length', String(audio.length))
+  ctx.set('X-TTS-Engine', engine)
+  ctx.body = audio
+}
@@ -4,3 +4,4 @@ import * as ctrl from '../../controllers/hermes/tts'
 export const ttsRoutes = new Router()

+// Original TTS endpoint; the handler reads { text, lang } from the request body.
 ttsRoutes.post('/api/hermes/tts', ctrl.generate)
+// OpenAI-style speech endpoint; the handler reads { model, input, voice, speed }.
+// NOTE(review): registerRoutes mounts ttsRoutes before requireAuth, so both routes
+// appear to be reachable without authentication — confirm this is intended.
+ttsRoutes.post('/api/tts/proxy/audio/speech', ctrl.openaiProxy)
@@ -41,6 +41,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
  app.use(healthRoutes.routes())
  app.use(webhookRoutes.routes())
  app.use(authPublicRoutes.routes())
+  app.use(ttsRoutes.routes())              // TTS proxy/generation — must be before auth

  // --- Auth middleware: all routes below require authentication ---
  app.use(requireAuth)
@@ -69,7 +70,6 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
  app.use(jobRoutes.routes())               // Must be before proxy
  app.use(cronHistoryRoutes.routes())        // Must be before proxy
  app.use(kanbanRoutes.routes())             // Must be before proxy
-  app.use(ttsRoutes.routes())                // Must be before proxy
  app.use(proxyRoutes.routes())

  // Proxy catch-all middleware (must be last)
@@ -12,6 +12,9 @@ const FIXED_PITCH = '+12Hz'
+/** Options accepted by the Edge TTS helpers (edgeTts / textToSpeech). */
 export interface TtsOptions {
+  // Text to synthesize (presumably; the synthesis call itself is outside this hunk).
  text: string
+  // NOTE(review): how `lang` is consumed is not visible in this hunk — confirm.
  lang?: string
+  // Optional overrides. edgeTts falls back to FIXED_VOICE / FIXED_RATE / FIXED_PITCH
+  // when a value is omitted or empty. `rate` uses the Edge "+/-NN%" format.
+  voice?: string
+  rate?: string
+  pitch?: string
 }

 export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
@@ -20,9 +23,9 @@ export async function edgeTts(opts: TtsOptions): Promise<Buffer> {

  try {
    const tts = new EdgeTTS({
-      voice: FIXED_VOICE,
-      rate: FIXED_RATE,
-      pitch: FIXED_PITCH,
+      voice: opts.voice || FIXED_VOICE,
+      rate: opts.rate || FIXED_RATE,
+      pitch: opts.pitch || FIXED_PITCH,
      timeout: 15000,
    })

@@ -35,7 +38,41 @@ export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
 }

+/**
+ * Generate speech through edgeTts and tag the result with the engine name.
+ *
+ * @param opts Synthesis options, forwarded unchanged to edgeTts.
+ * @returns The audio buffer together with `engine: 'edge'`.
+ */
 export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> {
+  // Resolved here only for the debug log; edgeTts applies the same fallbacks itself.
+  const voice = opts.voice || FIXED_VOICE
+  const rate = opts.rate || FIXED_RATE
+  const pitch = opts.pitch || FIXED_PITCH
  const audio = await edgeTts(opts)
-  logger.debug({ engine: 'edge', voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH }, 'TTS generated via Edge')
+  logger.debug({ engine: 'edge', voice, rate, pitch }, 'TTS generated via Edge')
  return { audio, engine: 'edge' }
 }
+
+/**
+ * Convert a playback speed multiplier to an Edge TTS rate string.
+ *
+ * The rate is the signed percentage offset from normal speed ("+/-NN%"):
+ * 1.0 -> "+0%", 1.5 -> "+50%", 0.5 -> "-50%". Callers are expected to pass
+ * roughly 0.5-2.0; the value is not clamped here.
+ *
+ * @param speed Speed multiplier where 1.0 means unchanged.
+ * @returns The rate string; "+0%" when `speed` is NaN or infinite.
+ */
+export function speedToEdgeRate(speed: number): string {
+  // NaN/Infinity would otherwise render as "NaN%" / "+Infinity%".
+  if (!Number.isFinite(speed)) return '+0%'
+  const percent = Math.round((speed - 1) * 100)
+  return percent >= 0 ? `+${percent}%` : `${percent}%`
+}
+
+/**
+ * Request payload in the OpenAI text-to-speech shape: { model, input, voice, speed }.
+ * Consumed by openaiCompatibleTts, which maps it onto TtsOptions.
+ */
+export interface OpenaiTtsRequest {
+  /** Not read by openaiCompatibleTts. */
+  model?: string
+  /** Forwarded as the `text` option. */
+  input: string
+  /** Forwarded as the voice; FIXED_VOICE is used when omitted or empty. */
+  voice?: string
+  /** Speed multiplier converted via speedToEdgeRate; FIXED_RATE is used when omitted or 0. */
+  speed?: number
+}
+
+/**
+ * Synthesize speech for an OpenAI-style request by mapping it onto TtsOptions.
+ *
+ * `input` becomes the text, `voice` falls back to FIXED_VOICE, and `speed` is
+ * converted to an Edge rate string. `model` is not consulted and the pitch is
+ * always FIXED_PITCH.
+ *
+ * @param body OpenAI-style request payload.
+ * @returns The audio buffer and engine name reported by textToSpeech.
+ */
+export async function openaiCompatibleTts(
+  body: OpenaiTtsRequest,
+): Promise<{ audio: Buffer; engine: string }> {
+  const { speed } = body
+  // Only a finite, positive multiplier maps to a meaningful rate. Anything else
+  // (missing, 0, NaN, Infinity, negative) uses the default instead of producing
+  // a malformed string such as "+Infinity%".
+  const rate =
+    typeof speed === 'number' && Number.isFinite(speed) && speed > 0
+      ? speedToEdgeRate(speed)
+      : FIXED_RATE
+  return textToSpeech({
+    text: body.input,
+    voice: body.voice || FIXED_VOICE,
+    rate,
+    pitch: FIXED_PITCH,
+  })
+}