add hermes tts playback (#541)

This commit is contained in:
ekko
2026-05-08 15:34:11 +08:00
committed by GitHub
parent 10d2f892ff
commit d54f9479b9
8 changed files with 218 additions and 144 deletions
+2 -1
View File
@@ -67,6 +67,7 @@
"dependencies": {
"eventsource": "^4.1.0",
"js-tiktoken": "^1.0.21",
"node-edge-tts": "^1.2.10",
"node-pty": "^1.1.0",
"socket.io": "^4.8.3",
"socket.io-client": "^4.8.3"
@@ -124,4 +125,4 @@
"vue-tsc": "^3.2.6",
"ws": "^8.20.0"
}
}
}
+34
View File
@@ -0,0 +1,34 @@
/** Request payload that `generateSpeech` serializes as the JSON body of the TTS request. */
export interface TtsOptions {
/** Text to synthesize. */
text: string
/** Optional language tag sent along with the text (callers in this codebase pass values such as 'zh-CN'). */
lang?: string
}
/**
 * Asks the Hermes server to synthesize speech for the given options.
 *
 * The server base URL and API key are read from `localStorage`
 * (`hermes_server_url` / `hermes_api_key`); missing values become empty strings.
 *
 * @param opts Text (and optional language) to synthesize; sent as the JSON body.
 * @returns The audio payload as a `Blob`, plus the engine name reported in the
 *          `X-TTS-Engine` response header (`'unknown'` when the header is absent).
 * @throws Error when the server answers with a non-2xx status.
 */
export async function generateSpeech(opts: TtsOptions): Promise<{ audio: Blob; engine: string }> {
  const baseUrl = localStorage.getItem('hermes_server_url') || ''
  const apiKey = localStorage.getItem('hermes_api_key') || ''

  const requestInit: RequestInit = {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify(opts),
  }

  const response = await fetch(`${baseUrl}/api/hermes/tts`, requestInit)
  if (!response.ok) {
    throw new Error(`TTS request failed: ${response.status}`)
  }

  const audio = await response.blob()
  const engine = response.headers.get('X-TTS-Engine') || 'unknown'
  return { audio, engine }
}
/**
 * Starts playing an audio blob and returns the element driving the playback.
 *
 * The temporary object URL is released exactly once, when playback ends, fails,
 * or the element's source is replaced/cleared. Cleanup is registered with
 * `addEventListener` so that callers assigning their own `onended` / `onerror`
 * handlers on the returned element cannot accidentally disable it.
 *
 * If `play()` is rejected for a reason other than an interruption (for example
 * an autoplay restriction), an `error` event is dispatched on the element so the
 * caller's error handling can react instead of waiting forever.
 *
 * @param blob Audio data to play.
 * @returns The `HTMLAudioElement` that is playing the blob.
 */
export function playAudioBlob(blob: Blob): HTMLAudioElement {
  const url = URL.createObjectURL(blob)
  const audio = new Audio(url)

  let released = false
  const release = (): void => {
    if (released) return
    released = true
    URL.revokeObjectURL(url)
  }

  // 'emptied' fires when the source is swapped or cleared after loading started.
  for (const eventName of ['ended', 'error', 'emptied']) {
    audio.addEventListener(eventName, release, { once: true })
  }

  audio.play().catch((err: unknown) => {
    // pause() or a source change interrupts a pending play() with AbortError;
    // that is not a playback failure and the URL may still be needed on resume.
    if (err instanceof Error && err.name === 'AbortError') return
    release()
    audio.dispatchEvent(new Event('error'))
  })

  return audio
}
@@ -370,38 +370,7 @@ function handleSpeechToggle() {
return
}
const content = props.message.content || ''
speech.toggle(props.message.id, content, getSpeechOptions())
}
function getSpeechOptions() {
// 尝试获取男声语音包
const allVoices = speech.getAllVoices()
let maleVoice: SpeechSynthesisVoice | null = null
// 查找可能的男声语音包
for (const voice of allVoices) {
const name = voice.name.toLowerCase()
// 常见男声关键词
if (name.includes('male') || name.includes('david') || name.includes('daniel') ||
name.includes('mark') || name.includes('yaoyao') || name.includes('google')) {
// 优先选择中文男声
if (voice.lang.startsWith('zh')) {
maleVoice = voice
break
}
// 如果没有找到中文男声,记住第一个男声
if (!maleVoice) {
maleVoice = voice
}
}
}
// 快速男声:语速快、音调低
return {
pitch: 0.5, // 低沉
rate: 1.2, // 快速
voice: maleVoice || undefined, // 使用男声,如果没有就用默认
}
speech.toggle(props.message.id, content)
}
// 监听自动播放事件
@@ -411,7 +380,7 @@ onMounted(() => {
autoPlayHandler = (e: Event) => {
const customEvent = e as CustomEvent<{ messageId: string; content: string }>
if (customEvent.detail.messageId === props.message.id && canPlaySpeech.value) {
speech.enqueue(props.message.id, customEvent.detail.content || props.message.content || '', getSpeechOptions())
speech.enqueue(props.message.id, customEvent.detail.content || props.message.content || '')
}
}
window.addEventListener('auto-play-speech', autoPlayHandler)
+103 -110
View File
@@ -1,10 +1,7 @@
import { ref, computed, onUnmounted } from 'vue'
import { generateSpeech, playAudioBlob } from '@/api/hermes/tts'
export interface SpeechOptions {
rate?: number // 语速 0.1-10,默认 1
pitch?: number // 音调 0-2,默认 1
volume?: number // 音量 0-1,默认 1
voice?: SpeechSynthesisVoice | null
lang?: string // 语言 'zh-CN', 'en-US' 等
}
@@ -13,6 +10,7 @@ export interface SpeechState {
isPaused: boolean
currentMessageId: string | null
progress: number // 当前进度(字符数)
engine: 'none' | 'tts' | 'browser' // 当前使用的引擎
}
interface SpeechQueueItem {
@@ -22,7 +20,8 @@ interface SpeechQueueItem {
}
/**
* Web Speech API 语音播放 Composable
* Speech playback composable.
* Prefers the backend TTS (Edge → Google) and falls back to the browser's speechSynthesis on failure.
*/
export function useSpeech() {
const synth = window.speechSynthesis
@@ -32,9 +31,11 @@ export function useSpeech() {
isPaused: false,
currentMessageId: null,
progress: 0,
engine: 'none',
})
let utterance: SpeechSynthesisUtterance | null = null
let currentAudio: HTMLAudioElement | null = null
let playbackToken = 0
const speechQueue: SpeechQueueItem[] = []
@@ -43,9 +44,8 @@ export function useSpeech() {
availableVoices.value = synth.getVoices()
}
// 浏览器会在语音列表变化时触发 voiceschanged 事件
synth.addEventListener('voiceschanged', loadVoices)
loadVoices() // 初始加载
loadVoices()
/**
* 从文本中提取纯文本内容,过滤代码块、thinking 标签等
@@ -66,86 +66,110 @@ export function useSpeech() {
// 移除 HTML 标签
text = text.replace(/<[^>]+>/g, '')
// 只保留:字母、数字、空格、常用标点、中文
// 保留的标点:。!?;,,。!?;:、""''()【】《》
// 移除:*# 等特殊符号、表情符号、emoji 等
text = text.replace(/[^\p{L}\p{N}\s!?;,""''\n-鿿-䶿]/gu, '')
// 移除多余的空白
text = text.replace(/\s+/g, ' ').trim()
return text
}
/**
* 检查浏览器是否支持 Web Speech API
*/
const isSupported = computed(() => {
return 'speechSynthesis' in window && 'SpeechSynthesisUtterance' in window
})
/**
* 获取默认语音(优先选择中文)
*/
function getDefaultVoice(): SpeechSynthesisVoice | null {
  const voices = availableVoices.value
  if (voices.length === 0) return null

  // Preference order: a Chinese voice, then an English one, then whatever is listed first.
  for (const langPrefix of ['zh', 'en']) {
    const candidate = voices.find(voice => voice.lang.startsWith(langPrefix))
    if (candidate) return candidate
  }
  return voices[0]
}
/**
* 获取所有可用语音(用于调试)
*/
function getAllVoices(): SpeechSynthesisVoice[] {
return availableVoices.value
}
/**
* 停止当前播放
*/
function stop(clearQueue = true) {
  // Bump the token first so callbacks from the playback being stopped see a mismatch and bail out.
  playbackToken += 1
  if (clearQueue) speechQueue.length = 0

  // Server-side TTS audio: halt it and drop the element.
  const activeAudio = currentAudio
  if (activeAudio) {
    activeAudio.pause()
    activeAudio.src = ''
    currentAudio = null
  }

  // Browser speech synthesis: cancel anything speaking, pending or paused.
  const browserBusy = synth.speaking || synth.pending || synth.paused
  if (browserBusy) synth.cancel()
  utterance = null

  // Back to the idle state.
  state.value = {
    isPlaying: false,
    isPaused: false,
    currentMessageId: null,
    progress: 0,
    engine: 'none',
  }
}
function speak(messageId: string, text: string, options: SpeechOptions = {}) {
const token = ++playbackToken
// ─── TTS Engine (server-side) ───────────────────────────────
/**
 * Plays `text` through the server-side TTS endpoint and degrades to the Web Speech
 * API when either the request or the audio playback fails.
 *
 * @param messageId Message the speech belongs to; stored as the current message while playing.
 * @param text      Already-extracted readable text to synthesize.
 * @param options   Speech options; only `lang` is used here (defaults to 'zh-CN').
 * @param token     Playback token captured by the caller. Every asynchronous continuation
 *                  compares it with `playbackToken` and does nothing when they differ.
 */
async function speakViaTts(messageId: string, text: string, options: SpeechOptions, token: number) {
  // Set playing state immediately so UI shows breathing animation right away
  state.value.isPlaying = true
  state.value.isPaused = false
  state.value.currentMessageId = messageId
  state.value.progress = 0
  state.value.engine = 'tts'

  // Hand the same request to the browser engine. The failed audio element (if any) is
  // detached first so a repeated error event cannot start a second fallback utterance
  // and no stale element is left behind in `currentAudio`.
  const fallbackToBrowser = () => {
    const failedAudio = currentAudio
    if (failedAudio) {
      failedAudio.onended = null
      failedAudio.onerror = null
      currentAudio = null
    }
    speakViaBrowser(messageId, text, options, token)
  }

  try {
    const lang = options.lang || 'zh-CN'
    const { audio } = await generateSpeech({ text, lang })
    // Playback was stopped or superseded while the request was in flight.
    if (token !== playbackToken) return

    const player = playAudioBlob(audio)
    currentAudio = player

    player.onended = () => {
      if (token !== playbackToken) return
      state.value.isPlaying = false
      state.value.isPaused = false
      state.value.currentMessageId = null
      state.value.progress = text.length
      state.value.engine = 'none'
      currentAudio = null
      // Deferred so the queue is advanced after this handler has returned.
      if (speechQueue.length > 0) {
        setTimeout(playNextQueuedSpeech, 0)
      }
    }

    player.onerror = () => {
      if (token !== playbackToken) return
      // TTS playback failed, fallback to browser
      console.warn('[useSpeech] TTS audio playback error, falling back to browser')
      fallbackToBrowser()
    }
  } catch (err) {
    if (token !== playbackToken) return
    console.warn('[useSpeech] TTS API failed, falling back to browser:', err)
    fallbackToBrowser()
  }
}
// ─── Browser Engine (Web Speech API) ────────────────────────
function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token: number) {
utterance = new SpeechSynthesisUtterance(text)
const activeUtterance = utterance
const activeText = text
// 设置语音参数
utterance.rate = options.rate ?? 1
utterance.pitch = options.pitch ?? 1
utterance.volume = options.volume ?? 1
utterance.voice = options.voice ?? getDefaultVoice()
console.log('[useSpeech] Selected voice:', utterance.voice?.name, utterance.voice?.lang)
utterance.rate = 1
utterance.pitch = 1
utterance.volume = 1
utterance.voice = getDefaultVoice()
if (options.lang) {
utterance.lang = options.lang
@@ -153,15 +177,11 @@ export function useSpeech() {
utterance.lang = utterance.voice.lang
}
// 事件监听
utterance.onstart = () => {
if (token !== playbackToken || utterance !== activeUtterance) return
console.log('[useSpeech] onstart fired')
state.value.isPlaying = true
state.value.isPaused = false
state.value.currentMessageId = messageId
state.value.progress = 0
}
state.value.engine = 'browser'
state.value.isPlaying = true
state.value.isPaused = false
state.value.currentMessageId = messageId
state.value.progress = 0
utterance.onboundary = (event) => {
if (token !== playbackToken || utterance !== activeUtterance) return
@@ -172,66 +192,62 @@ export function useSpeech() {
utterance.onend = () => {
if (token !== playbackToken || utterance !== activeUtterance) return
console.log('[useSpeech] onend fired')
state.value.isPlaying = false
state.value.isPaused = false
state.value.currentMessageId = null
state.value.progress = activeText.length
state.value.progress = text.length
state.value.engine = 'none'
utterance = null
if (speechQueue.length > 0) {
window.setTimeout(playNextQueuedSpeech, 0)
setTimeout(playNextQueuedSpeech, 0)
}
}
utterance.onerror = (event) => {
utterance.onerror = () => {
if (token !== playbackToken || utterance !== activeUtterance) return
console.error('[useSpeech] Speech synthesis error:', event.error)
state.value.isPlaying = false
state.value.isPaused = false
state.value.currentMessageId = null
state.value.engine = 'none'
utterance = null
if (speechQueue.length > 0) {
window.setTimeout(playNextQueuedSpeech, 0)
setTimeout(playNextQueuedSpeech, 0)
}
}
// 开始播放
console.log('[useSpeech] Calling synth.speak()')
synth.speak(utterance)
}
// ─── Unified speak ──────────────────────────────────────────
/**
 * Starts a new playback: claims a fresh playback token and tries the server-side
 * TTS engine, which itself falls back to the browser engine on failure.
 *
 * If even the fallback throws, the rejection is handled here: playback state is
 * reset (the queue is kept) and the next queued item, if any, is scheduled, so the
 * UI cannot stay stuck in the "playing" state.
 */
function speak(messageId: string, text: string, options: SpeechOptions = {}) {
  const token = ++playbackToken
  // Try server-side TTS first, fallback to browser
  speakViaTts(messageId, text, options, token).catch((err: unknown) => {
    // A newer playback owns the state now; leave it alone.
    if (token !== playbackToken) return
    console.error('[useSpeech] Speech playback failed:', err)
    stop(false)
    if (speechQueue.length > 0) {
      setTimeout(playNextQueuedSpeech, 0)
    }
  })
}
function playNextQueuedSpeech() {
if (state.value.isPlaying || state.value.isPaused || synth.speaking || synth.pending) return
if (state.value.isPlaying || state.value.isPaused) return
const next = speechQueue.shift()
if (!next) return
const text = extractReadableText(next.content)
if (!text) {
window.setTimeout(playNextQueuedSpeech, 0)
setTimeout(playNextQueuedSpeech, 0)
return
}
console.log('[useSpeech] Playing queued text:', text.substring(0, 50) + '...')
speak(next.messageId, text, next.options)
}
/**
* 播放文本
*/
function play(messageId: string, content: string, options: SpeechOptions = {}) {
if (!isSupported.value) {
console.warn('[useSpeech] Speech synthesis not supported')
return
}
console.log('[useSpeech] play called:', messageId)
// 如果正在播放其他消息,先停止
// If playing other message, stop first
if (state.value.currentMessageId && state.value.currentMessageId !== messageId) {
stop()
}
// 如果已经在播放这条消息,暂停/恢复
// Toggle play/pause for same message
if (state.value.currentMessageId === messageId) {
if (state.value.isPaused) {
resume()
@@ -241,59 +257,40 @@ export function useSpeech() {
return
}
// 提取可读文本
const text = extractReadableText(content)
if (!text) {
console.warn('[useSpeech] No readable text found')
return
}
if (!text) return
console.log('[useSpeech] Playing text:', text.substring(0, 50) + '...')
// 停止当前播放
stop()
speak(messageId, text, options)
}
/**
* 自动播放入队:不打断当前语音,按完成顺序依次播放。
*/
function enqueue(messageId: string, content: string, options: SpeechOptions = {}) {
if (!isSupported.value) {
console.warn('[useSpeech] Speech synthesis not supported')
return
}
if (!extractReadableText(content)) {
console.warn('[useSpeech] No readable text found')
return
}
if (!extractReadableText(content)) return
speechQueue.push({ messageId, content, options })
playNextQueuedSpeech()
}
/**
* 暂停播放
*/
function pause() {
if (synth.speaking && !state.value.isPaused) {
if (state.value.engine === 'tts' && currentAudio) {
currentAudio.pause()
state.value.isPaused = true
} else if (synth.speaking && !state.value.isPaused) {
synth.pause()
state.value.isPaused = true
}
}
/**
* 恢复播放
*/
function resume() {
if (state.value.isPaused) {
synth.resume()
if (state.value.engine === 'tts' && currentAudio) {
currentAudio.play()
} else {
synth.resume()
}
state.value.isPaused = false
}
}
/**
* 切换播放/暂停
*/
function toggle(messageId: string, content: string, options: SpeechOptions = {}) {
if (state.value.currentMessageId === messageId && state.value.isPlaying) {
if (state.value.isPaused) {
@@ -306,22 +303,20 @@ export function useSpeech() {
}
}
// 清理
onUnmounted(() => {
stop()
synth.removeEventListener('voiceschanged', loadVoices)
})
return {
// 状态
isSupported,
availableVoices,
isPlaying: computed(() => state.value.isPlaying),
isPaused: computed(() => state.value.isPaused),
currentMessageId: computed(() => state.value.currentMessageId),
progress: computed(() => state.value.progress),
engine: computed(() => state.value.engine),
// 方法
play,
pause,
resume,
@@ -329,12 +324,10 @@ export function useSpeech() {
toggle,
enqueue,
getDefaultVoice,
getAllVoices,
extractReadableText,
}
}
// 单例模式,全局共享一个语音实例
let globalSpeech: ReturnType<typeof useSpeech> | null = null
export function useGlobalSpeech() {
@@ -0,0 +1,28 @@
import type { Context } from 'koa'
import { textToSpeech } from '../../services/hermes/tts'
/**
 * POST handler that turns the request's `text` into speech audio.
 *
 * Expects a JSON body `{ text: string, lang?: string }`.
 * - Responds 400 with `{ error }` when `text` is missing, empty, not a string,
 *   or longer than 5000 characters. A missing or non-object body is treated the
 *   same as a missing `text` instead of crashing the handler.
 * - A `lang` that is not a string is ignored rather than forwarded.
 * - On success responds with the audio bytes as `audio/mpeg` and reports the
 *   engine used in the `X-TTS-Engine` header.
 *
 * Errors thrown by `textToSpeech` are not caught here and propagate to the caller.
 */
export async function generate(ctx: Context) {
  const maxTextLength = 5000

  // The body may be undefined (no parser ran / empty request) or a non-object value.
  const rawBody: unknown = ctx.request.body
  const payload = (typeof rawBody === 'object' && rawBody !== null ? rawBody : {}) as {
    text?: unknown
    lang?: unknown
  }

  const text = payload.text
  const lang = typeof payload.lang === 'string' ? payload.lang : undefined

  if (!text || typeof text !== 'string') {
    ctx.status = 400
    ctx.body = { error: 'text is required' }
    return
  }

  if (text.length > maxTextLength) {
    ctx.status = 400
    ctx.body = { error: 'text is too long (max 5000 characters)' }
    return
  }

  const { audio, engine } = await textToSpeech({ text, lang })

  ctx.set('Content-Type', 'audio/mpeg')
  ctx.set('Content-Length', String(audio.length))
  ctx.set('X-TTS-Engine', engine)
  ctx.body = audio
}
+6
View File
@@ -0,0 +1,6 @@
import Router from '@koa/router'
import * as ctrl from '../../controllers/hermes/tts'
// Router for the Hermes text-to-speech endpoint.
export const ttsRoutes = new Router()
// POST /api/hermes/tts is handled by the TTS controller's `generate` function.
ttsRoutes.post('/api/hermes/tts', ctrl.generate)
+2
View File
@@ -26,6 +26,7 @@ import { downloadRoutes } from './hermes/download'
import { jobRoutes } from './hermes/jobs'
import { cronHistoryRoutes } from './hermes/cron-history'
import { kanbanRoutes } from './hermes/kanban'
import { ttsRoutes } from './hermes/tts'
import { proxyRoutes, proxyMiddleware } from './hermes/proxy'
import { groupChatRoutes, setGroupChatServer } from './hermes/group-chat'
@@ -66,6 +67,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next)
app.use(jobRoutes.routes()) // Must be before proxy
app.use(cronHistoryRoutes.routes()) // Must be before proxy
app.use(kanbanRoutes.routes()) // Must be before proxy
app.use(ttsRoutes.routes()) // Must be before proxy
app.use(proxyRoutes.routes())
// Proxy catch-all middleware (must be last)
@@ -0,0 +1,41 @@
import { EdgeTTS } from 'node-edge-tts'
import { tmpdir } from 'os'
import { join } from 'path'
import { readFile, unlink } from 'fs/promises'
import { randomUUID } from 'crypto'
import { logger } from '../logger'
// Synthesis parameters passed unchanged to every EdgeTTS instance created in this module
// (and echoed in the debug log entry written after a successful synthesis).
// Voice name handed to EdgeTTS as `voice`.
const FIXED_VOICE = 'zh-CN-XiaoxiaoNeural'
// Value handed to EdgeTTS as `rate`.
const FIXED_RATE = '+4%'
// Value handed to EdgeTTS as `pitch`.
const FIXED_PITCH = '+12Hz'
/** Input accepted by the TTS service functions in this module. */
export interface TtsOptions {
/** Text to synthesize. */
text: string
/**
 * Optional language tag.
 * NOTE(review): `edgeTts` only reads `text` and always uses the fixed voice, so this
 * value currently has no effect there — confirm whether that is intended.
 */
lang?: string
}
/**
 * Synthesizes `opts.text` with Edge TTS and returns the resulting MP3 bytes.
 *
 * The audio is written to a uniquely named file in the OS temp directory, read back
 * into memory, and the file is then removed on a best-effort basis (removal is not
 * awaited and its failure is ignored), whether or not synthesis succeeded.
 *
 * @param opts Synthesis input; only `text` is read.
 * @returns The generated audio file contents.
 */
export async function edgeTts(opts: TtsOptions): Promise<Buffer> {
  const fileName = `tts-${randomUUID()}.mp3`
  const outputPath = join(tmpdir(), fileName)

  try {
    const synthesizer = new EdgeTTS({
      voice: FIXED_VOICE,
      rate: FIXED_RATE,
      pitch: FIXED_PITCH,
      timeout: 15000,
    })
    await synthesizer.ttsPromise(opts.text, outputPath)
    return await readFile(outputPath)
  } finally {
    // Best-effort cleanup: the file may not exist if synthesis failed early.
    unlink(outputPath).catch(() => {})
  }
}
/**
 * Service entry point for speech synthesis.
 *
 * Delegates to `edgeTts`, writes a debug log entry with the fixed synthesis
 * parameters, and reports which engine produced the audio.
 *
 * @param opts Synthesis input, forwarded as-is to `edgeTts`.
 * @returns The audio bytes together with the engine name (`'edge'`).
 */
export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> {
  const engine = 'edge'
  const audio = await edgeTts(opts)

  const logFields = { engine: 'edge', voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH }
  logger.debug(logFields, 'TTS generated via Edge')

  return { audio, engine }
}