From d54f9479b97bc25bd6cd4ca0c41f9f29576dfb58 Mon Sep 17 00:00:00 2001 From: ekko <152005280+EKKOLearnAI@users.noreply.github.com> Date: Fri, 8 May 2026 15:34:11 +0800 Subject: [PATCH] add hermes tts playback (#541) --- package.json | 3 +- packages/client/src/api/hermes/tts.ts | 34 +++ .../components/hermes/chat/MessageItem.vue | 35 +-- packages/client/src/composables/useSpeech.ts | 213 +++++++++--------- packages/server/src/controllers/hermes/tts.ts | 28 +++ packages/server/src/routes/hermes/tts.ts | 6 + packages/server/src/routes/index.ts | 2 + packages/server/src/services/hermes/tts.ts | 41 ++++ 8 files changed, 218 insertions(+), 144 deletions(-) create mode 100644 packages/client/src/api/hermes/tts.ts create mode 100644 packages/server/src/controllers/hermes/tts.ts create mode 100644 packages/server/src/routes/hermes/tts.ts create mode 100644 packages/server/src/services/hermes/tts.ts diff --git a/package.json b/package.json index 6871717..ec89486 100644 --- a/package.json +++ b/package.json @@ -67,6 +67,7 @@ "dependencies": { "eventsource": "^4.1.0", "js-tiktoken": "^1.0.21", + "node-edge-tts": "^1.2.10", "node-pty": "^1.1.0", "socket.io": "^4.8.3", "socket.io-client": "^4.8.3" @@ -124,4 +125,4 @@ "vue-tsc": "^3.2.6", "ws": "^8.20.0" } -} \ No newline at end of file +} diff --git a/packages/client/src/api/hermes/tts.ts b/packages/client/src/api/hermes/tts.ts new file mode 100644 index 0000000..c22d28d --- /dev/null +++ b/packages/client/src/api/hermes/tts.ts @@ -0,0 +1,34 @@ +export interface TtsOptions { + text: string + lang?: string +} + +export async function generateSpeech(opts: TtsOptions): Promise<{ audio: Blob; engine: string }> { + const res = await fetch( + `${localStorage.getItem('hermes_server_url') || ''}/api/hermes/tts`, + { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + Authorization: `Bearer ${localStorage.getItem('hermes_api_key') || ''}`, + }, + body: JSON.stringify(opts), + }, + ) + + if (!res.ok) { + throw new Error(`TTS request failed: ${res.status}`) + } + + const audio = await res.blob() + const engine = res.headers.get('X-TTS-Engine') || 'unknown' + return { audio, engine } +} + +export function playAudioBlob(blob: Blob): HTMLAudioElement { + const url = URL.createObjectURL(blob) + const audio = new Audio(url) + audio.play() + audio.onended = () => URL.revokeObjectURL(url) + return audio +} diff --git a/packages/client/src/components/hermes/chat/MessageItem.vue b/packages/client/src/components/hermes/chat/MessageItem.vue index 6170abe..b863801 100644 --- a/packages/client/src/components/hermes/chat/MessageItem.vue +++ b/packages/client/src/components/hermes/chat/MessageItem.vue @@ -370,38 +370,7 @@ function handleSpeechToggle() { return } const content = props.message.content || '' - speech.toggle(props.message.id, content, getSpeechOptions()) -} - -function getSpeechOptions() { - // 尝试获取男声语音包 - const allVoices = speech.getAllVoices() - let maleVoice: SpeechSynthesisVoice | null = null - - // 查找可能的男声语音包 - for (const voice of allVoices) { - const name = voice.name.toLowerCase() - // 常见男声关键词 - if (name.includes('male') || name.includes('david') || name.includes('daniel') || - name.includes('mark') || name.includes('yaoyao') || name.includes('google')) { - // 优先选择中文男声 - if (voice.lang.startsWith('zh')) { - maleVoice = voice - break - } - // 如果没有找到中文男声,记住第一个男声 - if (!maleVoice) { - maleVoice = voice - } - } - } - - // 快速男声:语速快、音调低 - return { - pitch: 0.5, // 低沉 - rate: 1.2, // 快速 - voice: maleVoice || undefined, // 使用男声,如果没有就用默认 - } + speech.toggle(props.message.id, content) } // 监听自动播放事件 @@ -411,7 +380,7 @@ onMounted(() => { autoPlayHandler = (e: Event) => { const customEvent = e as CustomEvent<{ messageId: string; content: string }> if (customEvent.detail.messageId === props.message.id && canPlaySpeech.value) { - speech.enqueue(props.message.id, customEvent.detail.content || props.message.content || '', getSpeechOptions()) + speech.enqueue(props.message.id, customEvent.detail.content || props.message.content || '') } } window.addEventListener('auto-play-speech', autoPlayHandler) diff --git a/packages/client/src/composables/useSpeech.ts b/packages/client/src/composables/useSpeech.ts index ea1b754..208a1b0 100644 --- a/packages/client/src/composables/useSpeech.ts +++ b/packages/client/src/composables/useSpeech.ts @@ -1,10 +1,7 @@ import { ref, computed, onUnmounted } from 'vue' +import { generateSpeech, playAudioBlob } from '@/api/hermes/tts' export interface SpeechOptions { - rate?: number // 语速 0.1-10,默认 1 - pitch?: number // 音调 0-2,默认 1 - volume?: number // 音量 0-1,默认 1 - voice?: SpeechSynthesisVoice | null lang?: string // 语言 'zh-CN', 'en-US' 等 } @@ -13,6 +10,7 @@ export interface SpeechState { isPaused: boolean currentMessageId: string | null progress: number // 当前进度(字符数) + engine: 'none' | 'tts' | 'browser' // 当前使用的引擎 } interface SpeechQueueItem { @@ -22,7 +20,8 @@ interface SpeechQueueItem { } /** - * Web Speech API 语音播放 Composable + * 语音播放 Composable + * 优先后端 TTS(Edge → Google),失败降级浏览器 speechSynthesis */ export function useSpeech() { const synth = window.speechSynthesis @@ -32,9 +31,11 @@ export function useSpeech() { isPaused: false, currentMessageId: null, progress: 0, + engine: 'none', }) let utterance: SpeechSynthesisUtterance | null = null + let currentAudio: HTMLAudioElement | null = null let playbackToken = 0 const speechQueue: SpeechQueueItem[] = [] @@ -43,9 +44,8 @@ export function useSpeech() { availableVoices.value = synth.getVoices() } - // 浏览器会在语音列表变化时触发 voiceschanged 事件 synth.addEventListener('voiceschanged', loadVoices) - loadVoices() // 初始加载 + loadVoices() /** * 从文本中提取纯文本内容,过滤代码块、thinking 标签等 @@ -66,86 +66,110 @@ export function useSpeech() { // 移除 HTML 标签 text = text.replace(/<[^>]+>/g, '') - // 只保留:字母、数字、空格、常用标点、中文 - // 保留的标点:。!?;,,。!?;:、""''()【】《》 - // 移除:*# 等特殊符号、表情符号、emoji 等 text = text.replace(/[^\p{L}\p{N}\s。!?;,,。!?;:、""''()【】《》\n一-鿿㐀-䶿]/gu, '') - // 移除多余的空白 text = text.replace(/\s+/g, ' ').trim() return text } - /** - * 检查浏览器是否支持 Web Speech API - */ const isSupported = computed(() => { return 'speechSynthesis' in window && 'SpeechSynthesisUtterance' in window }) - /** - * 获取默认语音(优先选择中文) - */ function getDefaultVoice(): SpeechSynthesisVoice | null { const voices = availableVoices.value if (voices.length === 0) return null - // 优先选择中文语音 const zhVoice = voices.find(v => v.lang.startsWith('zh')) if (zhVoice) return zhVoice - // 其次选择英文语音 const enVoice = voices.find(v => v.lang.startsWith('en')) if (enVoice) return enVoice - // 默认第一个 return voices[0] } - /** - * 获取所有可用语音(用于调试) - */ - function getAllVoices(): SpeechSynthesisVoice[] { - return availableVoices.value - } - - /** - * 停止当前播放 - */ function stop(clearQueue = true) { playbackToken += 1 if (clearQueue) { speechQueue.length = 0 } + // Stop TTS audio + if (currentAudio) { + currentAudio.pause() + currentAudio.src = '' + currentAudio = null + } + // Stop browser speech if (synth.speaking || synth.pending || synth.paused) { synth.cancel() } - if (utterance) { - utterance = null - } + utterance = null state.value = { isPlaying: false, isPaused: false, currentMessageId: null, progress: 0, + engine: 'none', } } - function speak(messageId: string, text: string, options: SpeechOptions = {}) { - const token = ++playbackToken + // ─── TTS Engine (server-side) ─────────────────────────────── + async function speakViaTts(messageId: string, text: string, options: SpeechOptions, token: number) { + // Set playing state immediately so UI shows breathing animation right away + state.value.isPlaying = true + state.value.isPaused = false + state.value.currentMessageId = messageId + state.value.progress = 0 + state.value.engine = 'tts' + + try { + const lang = options.lang || 'zh-CN' + + const { audio } = await generateSpeech({ text, lang }) + + if (token !== playbackToken) return + + currentAudio = playAudioBlob(audio) + + currentAudio.onended = () => { + if (token !== playbackToken) return + state.value.isPlaying = false + state.value.isPaused = false + state.value.currentMessageId = null + state.value.progress = text.length + state.value.engine = 'none' + currentAudio = null + if (speechQueue.length > 0) { + setTimeout(playNextQueuedSpeech, 0) + } + } + + currentAudio.onerror = () => { + if (token !== playbackToken) return + // TTS playback failed, fallback to browser + console.warn('[useSpeech] TTS audio playback error, falling back to browser') + speakViaBrowser(messageId, text, options, token) + } + } catch (err) { + if (token !== playbackToken) return + console.warn('[useSpeech] TTS API failed, falling back to browser:', err) + speakViaBrowser(messageId, text, options, token) + } + } + + // ─── Browser Engine (Web Speech API) ──────────────────────── + + function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token: number) { utterance = new SpeechSynthesisUtterance(text) const activeUtterance = utterance - const activeText = text - // 设置语音参数 - utterance.rate = options.rate ?? 1 - utterance.pitch = options.pitch ?? 1 - utterance.volume = options.volume ?? 1 - utterance.voice = options.voice ?? getDefaultVoice() - - console.log('[useSpeech] Selected voice:', utterance.voice?.name, utterance.voice?.lang) + utterance.rate = 1 + utterance.pitch = 1 + utterance.volume = 1 + utterance.voice = getDefaultVoice() if (options.lang) { utterance.lang = options.lang @@ -153,15 +177,11 @@ export function useSpeech() { utterance.lang = utterance.voice.lang } - // 事件监听 - utterance.onstart = () => { - if (token !== playbackToken || utterance !== activeUtterance) return - console.log('[useSpeech] onstart fired') - state.value.isPlaying = true - state.value.isPaused = false - state.value.currentMessageId = messageId - state.value.progress = 0 - } + state.value.engine = 'browser' + state.value.isPlaying = true + state.value.isPaused = false + state.value.currentMessageId = messageId + state.value.progress = 0 utterance.onboundary = (event) => { if (token !== playbackToken || utterance !== activeUtterance) return @@ -172,66 +192,62 @@ export function useSpeech() { utterance.onend = () => { if (token !== playbackToken || utterance !== activeUtterance) return - console.log('[useSpeech] onend fired') state.value.isPlaying = false state.value.isPaused = false state.value.currentMessageId = null - state.value.progress = activeText.length + state.value.progress = text.length + state.value.engine = 'none' utterance = null if (speechQueue.length > 0) { - window.setTimeout(playNextQueuedSpeech, 0) + setTimeout(playNextQueuedSpeech, 0) } } - utterance.onerror = (event) => { + utterance.onerror = () => { if (token !== playbackToken || utterance !== activeUtterance) return - console.error('[useSpeech] Speech synthesis error:', event.error) state.value.isPlaying = false state.value.isPaused = false state.value.currentMessageId = null + state.value.engine = 'none' utterance = null if (speechQueue.length > 0) { - window.setTimeout(playNextQueuedSpeech, 0) + setTimeout(playNextQueuedSpeech, 0) } } - // 开始播放 - console.log('[useSpeech] Calling synth.speak()') synth.speak(utterance) } + // ─── Unified speak ────────────────────────────────────────── + + function speak(messageId: string, text: string, options: SpeechOptions = {}) { + const token = ++playbackToken + + // Try server-side TTS first, fallback to browser + speakViaTts(messageId, text, options, token) + } + function playNextQueuedSpeech() { - if (state.value.isPlaying || state.value.isPaused || synth.speaking || synth.pending) return + if (state.value.isPlaying || state.value.isPaused) return const next = speechQueue.shift() if (!next) return const text = extractReadableText(next.content) if (!text) { - window.setTimeout(playNextQueuedSpeech, 0) + setTimeout(playNextQueuedSpeech, 0) return } - console.log('[useSpeech] Playing queued text:', text.substring(0, 50) + '...') speak(next.messageId, text, next.options) } - /** - * 播放文本 - */ function play(messageId: string, content: string, options: SpeechOptions = {}) { - if (!isSupported.value) { - console.warn('[useSpeech] Speech synthesis not supported') - return - } - - console.log('[useSpeech] play called:', messageId) - - // 如果正在播放其他消息,先停止 + // If playing other message, stop first if (state.value.currentMessageId && state.value.currentMessageId !== messageId) { stop() } - // 如果已经在播放这条消息,暂停/恢复 + // Toggle play/pause for same message if (state.value.currentMessageId === messageId) { if (state.value.isPaused) { resume() @@ -241,59 +257,40 @@ export function useSpeech() { return } - // 提取可读文本 const text = extractReadableText(content) - if (!text) { - console.warn('[useSpeech] No readable text found') - return - } + if (!text) return - console.log('[useSpeech] Playing text:', text.substring(0, 50) + '...') - - // 停止当前播放 stop() speak(messageId, text, options) } - /** - * 自动播放入队:不打断当前语音,按完成顺序依次播放。 - */ function enqueue(messageId: string, content: string, options: SpeechOptions = {}) { - if (!isSupported.value) { - console.warn('[useSpeech] Speech synthesis not supported') - return - } - if (!extractReadableText(content)) { - console.warn('[useSpeech] No readable text found') - return - } + if (!extractReadableText(content)) return speechQueue.push({ messageId, content, options }) playNextQueuedSpeech() } - /** - * 暂停播放 - */ function pause() { - if (synth.speaking && !state.value.isPaused) { + if (state.value.engine === 'tts' && currentAudio) { + currentAudio.pause() + state.value.isPaused = true + } else if (synth.speaking && !state.value.isPaused) { synth.pause() state.value.isPaused = true } } - /** - * 恢复播放 - */ function resume() { if (state.value.isPaused) { - synth.resume() + if (state.value.engine === 'tts' && currentAudio) { + currentAudio.play() + } else { + synth.resume() + } state.value.isPaused = false } } - /** - * 切换播放/暂停 - */ function toggle(messageId: string, content: string, options: SpeechOptions = {}) { if (state.value.currentMessageId === messageId && state.value.isPlaying) { if (state.value.isPaused) { @@ -306,22 +303,20 @@ export function useSpeech() { } } - // 清理 onUnmounted(() => { stop() synth.removeEventListener('voiceschanged', loadVoices) }) return { - // 状态 isSupported, availableVoices, isPlaying: computed(() => state.value.isPlaying), isPaused: computed(() => state.value.isPaused), currentMessageId: computed(() => state.value.currentMessageId), progress: computed(() => state.value.progress), + engine: computed(() => state.value.engine), - // 方法 play, pause, resume, @@ -329,12 +324,10 @@ export function useSpeech() { toggle, enqueue, getDefaultVoice, - getAllVoices, extractReadableText, } } -// 单例模式,全局共享一个语音实例 let globalSpeech: ReturnType | null = null export function useGlobalSpeech() { diff --git a/packages/server/src/controllers/hermes/tts.ts b/packages/server/src/controllers/hermes/tts.ts new file mode 100644 index 0000000..808e333 --- /dev/null +++ b/packages/server/src/controllers/hermes/tts.ts @@ -0,0 +1,28 @@ +import type { Context } from 'koa' +import { textToSpeech } from '../../services/hermes/tts' + +export async function generate(ctx: Context) { + const { text, lang } = ctx.request.body as { + text?: string + lang?: string + } + + if (!text || typeof text !== 'string') { + ctx.status = 400 + ctx.body = { error: 'text is required' } + return + } + + if (text.length > 5000) { + ctx.status = 400 + ctx.body = { error: 'text is too long (max 5000 characters)' } + return + } + + const { audio, engine } = await textToSpeech({ text, lang }) + + ctx.set('Content-Type', 'audio/mpeg') + ctx.set('Content-Length', String(audio.length)) + ctx.set('X-TTS-Engine', engine) + ctx.body = audio +} diff --git a/packages/server/src/routes/hermes/tts.ts b/packages/server/src/routes/hermes/tts.ts new file mode 100644 index 0000000..ecf4f81 --- /dev/null +++ b/packages/server/src/routes/hermes/tts.ts @@ -0,0 +1,6 @@ +import Router from '@koa/router' +import * as ctrl from '../../controllers/hermes/tts' + +export const ttsRoutes = new Router() + +ttsRoutes.post('/api/hermes/tts', ctrl.generate) diff --git a/packages/server/src/routes/index.ts b/packages/server/src/routes/index.ts index bd86d32..5cc36f0 100644 --- a/packages/server/src/routes/index.ts +++ b/packages/server/src/routes/index.ts @@ -26,6 +26,7 @@ import { downloadRoutes } from './hermes/download' import { jobRoutes } from './hermes/jobs' import { cronHistoryRoutes } from './hermes/cron-history' import { kanbanRoutes } from './hermes/kanban' +import { ttsRoutes } from './hermes/tts' import { proxyRoutes, proxyMiddleware } from './hermes/proxy' import { groupChatRoutes, setGroupChatServer } from './hermes/group-chat' @@ -66,6 +67,7 @@ export function registerRoutes(app: any, requireAuth: (ctx: Context, next: Next) app.use(jobRoutes.routes()) // Must be before proxy app.use(cronHistoryRoutes.routes()) // Must be before proxy app.use(kanbanRoutes.routes()) // Must be before proxy + app.use(ttsRoutes.routes()) // Must be before proxy app.use(proxyRoutes.routes()) // Proxy catch-all middleware (must be last) diff --git a/packages/server/src/services/hermes/tts.ts b/packages/server/src/services/hermes/tts.ts new file mode 100644 index 0000000..ce676f1 --- /dev/null +++ b/packages/server/src/services/hermes/tts.ts @@ -0,0 +1,41 @@ +import { EdgeTTS } from 'node-edge-tts' +import { tmpdir } from 'os' +import { join } from 'path' +import { readFile, unlink } from 'fs/promises' +import { randomUUID } from 'crypto' +import { logger } from '../logger' + +const FIXED_VOICE = 'zh-CN-XiaoxiaoNeural' +const FIXED_RATE = '+4%' +const FIXED_PITCH = '+12Hz' + +export interface TtsOptions { + text: string + lang?: string +} + +export async function edgeTts(opts: TtsOptions): Promise { + const id = randomUUID() + const tmpFile = join(tmpdir(), `tts-${id}.mp3`) + + try { + const tts = new EdgeTTS({ + voice: FIXED_VOICE, + rate: FIXED_RATE, + pitch: FIXED_PITCH, + timeout: 15000, + }) + + await tts.ttsPromise(opts.text, tmpFile) + const buf = await readFile(tmpFile) + return buf + } finally { + unlink(tmpFile).catch(() => {}) + } +} + +export async function textToSpeech(opts: TtsOptions): Promise<{ audio: Buffer; engine: string }> { + const audio = await edgeTts(opts) + logger.debug({ engine: 'edge', voice: FIXED_VOICE, rate: FIXED_RATE, pitch: FIXED_PITCH }, 'TTS generated via Edge') + return { audio, engine: 'edge' } +}