feat: add voice playback settings with 4-provider support (#608)
Add WebSpeech, OpenAI TTS, Custom endpoint, and Edge TTS providers.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import type { Context } from 'koa'
|
||||
import { textToSpeech } from '../../services/hermes/tts'
|
||||
import { textToSpeech, openaiCompatibleTts, speedToEdgeRate } from '../../services/hermes/tts'
|
||||
|
||||
export async function generate(ctx: Context) {
|
||||
const { text, lang } = ctx.request.body as {
|
||||
@@ -26,3 +26,41 @@ export async function generate(ctx: Context) {
|
||||
ctx.set('X-TTS-Engine', engine)
|
||||
ctx.body = audio
|
||||
}
|
||||
|
||||
/**
 * OpenAI-compatible TTS endpoint.
 *
 * Accepts a JSON body shaped like `{ model, input, voice, speed }`; only
 * `input` is required. On success the response body is the synthesized audio,
 * sent as `audio/mpeg`, with the engine name in the `X-TTS-Engine` header.
 *
 * Responds with HTTP 400 when:
 * - `input` is missing, empty, or not a string;
 * - `input` is longer than 5000 characters;
 * - `voice` is present but not a string;
 * - `speed` is present but not a finite number.
 */
export async function openaiProxy(ctx: Context) {
  // The parsed body may be undefined/null (no body sent, or nothing parsed it),
  // so fall back to an empty object rather than dereferencing it directly.
  const body = (ctx.request.body ?? {}) as {
    input?: unknown
    voice?: unknown
    speed?: unknown
    model?: unknown
  }

  const reject = (message: string): void => {
    ctx.status = 400
    ctx.body = { error: message }
  }

  const input = body.input
  if (typeof input !== 'string' || !input) {
    reject('input is required')
    return
  }

  if (input.length > 5000) {
    reject('input is too long (max 5000 characters)')
    return
  }

  // The declared field types are compile-time only; the JSON payload is
  // untrusted, so check the optional fields at runtime before forwarding them.
  let voice: string | undefined
  if (body.voice != null) {
    if (typeof body.voice !== 'string') {
      reject('voice must be a string')
      return
    }
    voice = body.voice
  }

  let speed: number | undefined
  if (body.speed != null) {
    if (typeof body.speed !== 'number' || !Number.isFinite(body.speed)) {
      reject('speed must be a finite number')
      return
    }
    speed = body.speed
  }

  const { audio, engine } = await openaiCompatibleTts({
    input,
    voice,
    speed,
    model: typeof body.model === 'string' ? body.model : undefined,
  })

  ctx.set('Content-Type', 'audio/mpeg')
  ctx.set('Content-Length', String(audio.length))
  ctx.set('X-TTS-Engine', engine)
  ctx.body = audio
}
|
||||
|
||||
@@ -4,3 +4,4 @@ import * as ctrl from '../../controllers/hermes/tts'
|
||||
// Router carrying every text-to-speech HTTP endpoint.
export const ttsRoutes = new Router()

// POST endpoints served by this router, as [path, handler] pairs.
const ttsPostEndpoints = [
  ['/api/hermes/tts', ctrl.generate],
  ['/api/tts/proxy/audio/speech', ctrl.openaiProxy],
] as const

for (const [path, handler] of ttsPostEndpoints) {
  ttsRoutes.post(path, handler)
}
|
||||
|
||||
@@ -41,6 +41,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
|
||||
app.use(healthRoutes.routes())
|
||||
app.use(webhookRoutes.routes())
|
||||
app.use(authPublicRoutes.routes())
|
||||
app.use(ttsRoutes.routes()) // TTS proxy/generation — must be before auth
|
||||
|
||||
// --- Auth middleware: all routes below require authentication ---
|
||||
app.use(requireAuth)
|
||||
@@ -69,7 +70,6 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
|
||||
app.use(jobRoutes.routes()) // Must be before proxy
|
||||
app.use(cronHistoryRoutes.routes()) // Must be before proxy
|
||||
app.use(kanbanRoutes.routes()) // Must be before proxy
|
||||
app.use(ttsRoutes.routes()) // Must be before proxy
|
||||
app.use(proxyRoutes.routes())
|
||||
|
||||
// Proxy catch-all middleware (must be last)
|
||||
|
||||
@@ -12,6 +12,9 @@ const FIXED_PITCH = '+12Hz'
|
||||
/**
 * Options accepted by the Edge TTS helpers in this module.
 */
export interface TtsOptions {
  /** Text to synthesize. */
  text: string
  /** Voice name; the module's fixed default voice is used when omitted or empty. */
  voice?: string
  /** Speaking rate string; the module's fixed default rate is used when omitted or empty. */
  rate?: string
  /** Pitch string; the module's fixed default pitch is used when omitted or empty. */
  pitch?: string
  /** Optional language hint supplied by the caller. */
  lang?: string
}
|
||||
|
||||
export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
|
||||
@@ -20,9 +23,9 @@ export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
|
||||
|
||||
try {
|
||||
const tts = new EdgeTTS({
|
||||
voice: FIXED_VOICE,
|
||||
rate: FIXED_RATE,
|
||||
pitch: FIXED_PITCH,
|
||||
voice: opts.voice || FIXED_VOICE,
|
||||
rate: opts.rate || FIXED_RATE,
|
||||
pitch: opts.pitch || FIXED_PITCH,
|
||||
timeout: 15000,
|
||||
})
|
||||
|
||||
@@ -35,7 +38,41 @@ export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
|
||||
}
|
||||
|
||||
/**
 * Synthesize speech for the given options through Edge TTS.
 *
 * @param opts - Text plus optional voice/rate/pitch overrides.
 * @returns The generated audio buffer together with the engine label `'edge'`.
 */
export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> {
  // Resolve the effective settings (override or fixed default) so the debug log
  // reports what this request asked for rather than always the constants.
  const voice = opts.voice || FIXED_VOICE
  const rate = opts.rate || FIXED_RATE
  const pitch = opts.pitch || FIXED_PITCH

  const audio = await edgeTts(opts)

  // Single log entry per generation; the earlier constant-only entry was removed.
  logger.debug({ engine: 'edge', voice, rate, pitch }, 'TTS generated via Edge')
  return { audio, engine: 'edge' }
}
|
||||
|
||||
/**
 * Convert a speed multiplier (nominally 0.5-2.0) to an Edge TTS rate string.
 *
 * Edge TTS rate format: "+/-NN%", expressed relative to normal speed, so a
 * multiplier of 1 becomes "+0%", 1.5 becomes "+50%" and 0.5 becomes "-50%".
 *
 * @param speed - Playback speed multiplier, where 1 is normal speed.
 * @returns The signed percentage string. Non-finite input (NaN, +/-Infinity)
 *   yields the neutral "+0%" instead of a malformed value such as "NaN%".
 */
export function speedToEdgeRate(speed: number): string {
  // Guard against NaN/Infinity, which would otherwise be formatted verbatim.
  if (!Number.isFinite(speed)) {
    return '+0%'
  }

  const percent = Math.round((speed - 1) * 100)
  // Negative numbers already carry their sign; only non-negatives need the "+".
  return percent >= 0 ? `+${percent}%` : `${percent}%`
}
|
||||
|
||||
/**
 * Request body of an OpenAI-style speech request.
 * OpenAI format: { model, input, voice, speed }
 */
export interface OpenaiTtsRequest {
  /** Text to synthesize; the only required field. */
  input: string
  /** Voice name; the fixed default voice applies when omitted. */
  voice?: string
  /** Speed multiplier, converted to an Edge TTS rate string when provided. */
  speed?: number
  /** Model name as sent by OpenAI clients. */
  model?: string
}
|
||||
|
||||
/**
 * Convert an OpenAI TTS request to internal TtsOptions and synthesize it.
 *
 * `input` becomes the text, `voice` falls back to the fixed default voice, and
 * `speed` is translated to an Edge TTS rate string. `model` is not consulted.
 *
 * @param body - OpenAI-style request: { model, input, voice, speed }.
 * @returns The audio buffer and engine label produced by `textToSpeech`.
 */
export async function openaiCompatibleTts(
  body: OpenaiTtsRequest,
): Promise<{ audio: Buffer; engine: string }> {
  const { speed } = body

  // Only a finite, positive multiplier maps to a meaningful Edge rate. Anything
  // else (missing, 0, NaN, negative, or a non-number that slipped past the
  // compile-time type) keeps the default rate instead of yielding e.g. "-200%".
  const rate =
    typeof speed === 'number' && Number.isFinite(speed) && speed > 0
      ? speedToEdgeRate(speed)
      : FIXED_RATE

  return textToSpeech({
    text: body.input,
    voice: body.voice || FIXED_VOICE,
    rate,
    pitch: FIXED_PITCH,
  })
}
|
||||
|
||||
Reference in New Issue
Block a user