feat: add voice playback settings with 4-provider support (#608)

Add WebSpeech, OpenAI TTS, Custom endpoint, and Edge TTS providers.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
This commit is contained in:
memeflyfly
2026-05-10 20:08:38 +08:00
committed by GitHub
parent 838791a740
commit 15195f0795
18 changed files with 1237 additions and 20 deletions
+147 -2
View File
@@ -3,6 +3,14 @@ import { generateSpeech, playAudioBlob } from '@/api/hermes/tts'
/** Per-call options for speaking a piece of text. */
export interface SpeechOptions {
lang?: string // Language tag such as 'zh-CN' or 'en-US'
voiceName?: string // Name of the WebSpeech voice to use
}
/** Connection and voice parameters for an OpenAI-compatible TTS endpoint. */
export interface OpenaiTtsOptions {
// Base URL of the API; trailing slashes are stripped and '/audio/speech' is appended
baseUrl: string
// When set, sent as a Bearer token in the Authorization header
apiKey?: string
// Model name; 'tts-1' is used when this is missing or empty
model?: string
// Voice name; 'alloy' is used when this is missing or empty
voice?: string
}
export interface SpeechState {
@@ -39,6 +47,11 @@ export function useSpeech() {
let playbackToken = 0
const speechQueue: SpeechQueueItem[] = []
// Playback state for custom TTS providers (OpenAI / Custom / Edge)
const isCustomPlaying = ref(false)
const isCustomPaused = ref(false)
const currentCustomMessageId = ref<string | null>(null)
// 加载可用语音列表
function loadVoices() {
availableVoices.value = synth.getVoices()
@@ -162,14 +175,25 @@ export function useSpeech() {
// ─── Browser Engine (Web Speech API) ────────────────────────
function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token: number) {
function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token?: number) {
token = token || ++playbackToken
utterance = new SpeechSynthesisUtterance(text)
const activeUtterance = utterance
utterance.rate = 1
utterance.pitch = 1
utterance.volume = 1
utterance.voice = getDefaultVoice()
// 使用指定的音色(如果有),否则用默认
if (options.voiceName) {
const voice = availableVoices.value.find(v => v.name === options.voiceName)
if (voice) {
utterance.voice = voice
}
}
if (!utterance.voice) {
utterance.voice = getDefaultVoice()
}
if (options.lang) {
utterance.lang = options.lang
@@ -218,6 +242,115 @@ export function useSpeech() {
synth.speak(utterance)
}
// ─── OpenAI-compatible TTS Engine ────────────────────────────
let customAudio: HTMLAudioElement | null = null
/**
 * Requests speech audio from an OpenAI-compatible `/audio/speech` endpoint and plays it.
 *
 * Ownership of playback is tracked with `playbackToken`: if another playback starts while
 * the request or download is still in flight, this call returns quietly and leaves the
 * shared playback state alone.
 *
 * @param messageId Id stored in `currentCustomMessageId` while this clip is active.
 * @param content Raw message content; run through `extractReadableText` before synthesis.
 * @param opts Endpoint base URL, optional API key, and optional model / voice overrides.
 * @throws Rethrows the request or playback error when this call still owns playback.
 */
async function openaiPlay(
  messageId: string,
  content: string,
  opts: OpenaiTtsOptions,
): Promise<void> {
  const text = extractReadableText(content)
  if (!text) return

  const token = ++playbackToken

  // A clip left over from an earlier call would otherwise keep playing underneath the new
  // one, and its blob URL would never be released.
  if (customAudio) {
    customAudio.pause()
    URL.revokeObjectURL(customAudio.src)
    customAudio = null
  }

  isCustomPlaying.value = true
  isCustomPaused.value = false
  currentCustomMessageId.value = messageId

  const url = `${opts.baseUrl.replace(/\/+$/, '')}/audio/speech`
  const body: Record<string, string> = {
    model: opts.model || 'tts-1',
    input: text,
    voice: opts.voice || 'alloy',
  }
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
  }
  if (opts.apiKey) {
    headers['Authorization'] = `Bearer ${opts.apiKey}`
  }

  let audio: HTMLAudioElement | null = null
  let audioUrl: string | null = null

  // Always frees this call's blob URL; resets the shared state only while this call still
  // owns playback, so a newer playback is never clobbered.
  const finish = () => {
    if (audio) {
      audio.onended = null
      audio.onerror = null
    }
    if (audioUrl) {
      URL.revokeObjectURL(audioUrl)
      audioUrl = null
    }
    if (token !== playbackToken) return
    isCustomPlaying.value = false
    isCustomPaused.value = false
    currentCustomMessageId.value = null
    if (customAudio === audio) customAudio = null
  }

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
    })
    if (token !== playbackToken) return
    if (!res.ok) {
      const errText = await res.text().catch(() => '')
      throw new Error(`OpenAI TTS 返回 ${res.status}: ${errText || res.statusText}`)
    }
    const audioBlob = await res.blob()
    if (token !== playbackToken) return

    audioUrl = URL.createObjectURL(audioBlob)
    audio = new Audio(audioUrl)
    customAudio = audio
    audio.onended = finish
    audio.onerror = () => {
      if (token === playbackToken) {
        console.warn('[useSpeech] Custom TTS audio playback error')
      }
      finish()
    }

    // openaiToggle can flag a pause while the audio is still downloading. Honor it here;
    // the resume path starts playback from the element stored in customAudio.
    if (isCustomPaused.value) return
    await audio.play()
  } catch (err) {
    const superseded = token !== playbackToken
    finish()
    if (superseded) return
    console.error('[useSpeech] OpenAI TTS 请求失败:', err)
    throw err
  }
}
/**
 * Play / pause toggle for OpenAI-compatible TTS.
 *
 * When `messageId` is the clip currently marked as playing, flips between paused and
 * playing. Otherwise stops whatever is active and starts a new `openaiPlay` for the message.
 *
 * @param messageId Id of the message whose audio is being toggled.
 * @param content Raw message content forwarded to `openaiPlay`.
 * @param opts Endpoint options forwarded to `openaiPlay`.
 */
function openaiToggle(messageId: string, content: string, opts: OpenaiTtsOptions) {
  const isActiveMessage = currentCustomMessageId.value === messageId && isCustomPlaying.value

  if (!isActiveMessage) {
    // Stop other speech and start new
    stop(false)
    if (customAudio) {
      customAudio.pause()
      // Release the discarded clip's blob URL; nothing else holds a reference to it.
      URL.revokeObjectURL(customAudio.src)
      customAudio = null
    }
    // openaiPlay logs the failure and resets playback state before rethrowing. This call is
    // fire-and-forget, so swallow the rejection instead of leaving it unhandled.
    void openaiPlay(messageId, content, opts).catch(() => {})
    return
  }

  if (isCustomPaused.value) {
    // Resume
    if (customAudio) {
      // play() returns a promise that rejects when the browser refuses to start playback.
      customAudio.play().catch((err: unknown) => {
        console.warn('[useSpeech] Failed to resume custom TTS audio:', err)
      })
    }
    isCustomPaused.value = false
  } else {
    // Pause
    if (customAudio) {
      customAudio.pause()
    }
    isCustomPaused.value = true
  }
}
// ─── Unified speak ──────────────────────────────────────────
function speak(messageId: string, text: string, options: SpeechOptions = {}) {
@@ -317,6 +450,11 @@ export function useSpeech() {
progress: computed(() => state.value.progress),
engine: computed(() => state.value.engine),
// Custom TTS state
isCustomPlaying,
isCustomPaused,
currentCustomMessageId,
play,
pause,
resume,
@@ -325,6 +463,13 @@ export function useSpeech() {
enqueue,
getDefaultVoice,
extractReadableText,
// OpenAI-compatible TTS
openaiPlay,
openaiToggle,
// Browser WebSpeech (直接调用避免 Rolldown 树摇)
speakViaBrowser,
}
}
@@ -0,0 +1,164 @@
import { ref, watch } from 'vue'
/** Identifiers of the supported text-to-speech providers. */
export type TtsProvider = 'webspeech' | 'openai' | 'custom' | 'edge'
/** Shape of the voice settings object that is serialized to and read back from localStorage. */
export interface VoiceSettingsData {
// Currently selected provider
provider: TtsProvider
// WebSpeech settings
webspeechVoice: string
// OpenAI settings
openaiApiKey: string
openaiBaseUrl: string
openaiModel: string
openaiVoice: string
// Custom endpoint settings (OpenAI-compatible)
customUrl: string
customApiKey: string
// Edge TTS settings; note that sanitize() blanks edgeUrl whenever stored settings are loaded
edgeUrl: string
edgeVoice: string
}
// localStorage key for the current settings format; data under the legacy
// 'hermes-tts-settings' key is migrated into it by migrateOldKeys()
const STORAGE_KEY = 'hermes-tts-settings-v2'
/**
 * One-time migration of settings stored under the legacy 'hermes-tts-settings' key.
 *
 * The legacy object is merged over `DEFAULT`, written under `STORAGE_KEY`, and the legacy
 * key is removed. The old 'gptsovits' provider is renamed to 'custom', carrying its URL over
 * to `customUrl` when no custom URL is set yet. All other providers are copied unchanged.
 * Storage or JSON errors are ignored so a broken legacy entry never blocks module loading.
 */
function migrateOldKeys() {
  const oldKey = 'hermes-tts-settings'
  try {
    const old = localStorage.getItem(oldKey)
    if (!old) return

    const parsed: unknown = JSON.parse(old)
    // Only a plain object can be merged. `null` would throw on property access, and a
    // string or array would spread index keys into the new settings. Drop such entries.
    if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
      localStorage.removeItem(oldKey)
      return
    }

    const legacy = parsed as Record<string, unknown>
    if (legacy.provider === 'gptsovits') {
      legacy.provider = 'custom'
      // old gptsovitsUrl -> customUrl
      if (legacy.gptsovitsUrl && !legacy.customUrl) {
        legacy.customUrl = legacy.gptsovitsUrl
      }
    }

    // Store as new format
    localStorage.setItem(STORAGE_KEY, JSON.stringify({ ...DEFAULT, ...legacy }))
    localStorage.removeItem(oldKey)
  } catch { /* ignore */ }
}
// Default settings: returned by load() when nothing usable is stored, used as the base that
// stored and migrated settings are merged onto, and restored field by field by reset()
const DEFAULT: VoiceSettingsData = {
provider: 'webspeech',
webspeechVoice: '',
openaiApiKey: '',
openaiBaseUrl: '',
openaiModel: 'tts-1',
openaiVoice: 'alloy',
customUrl: '',
customApiKey: '',
edgeUrl: '',
edgeVoice: 'zh-CN-XiaoxiaoNeural',
}
/**
 * Normalizes a loaded settings object in place and returns that same object.
 *
 * Any stored Edge TTS adapter URL is blanked, because Edge TTS now goes through the
 * internal node-edge-tts integration rather than an external adapter.
 */
function sanitize(data: VoiceSettingsData): VoiceSettingsData {
  const hasLegacyEdgeUrl = Boolean(data.edgeUrl)
  if (hasLegacyEdgeUrl) {
    data.edgeUrl = ''
  }
  return data
}
/**
 * Reads the settings stored under `STORAGE_KEY`.
 *
 * Stored values are layered over `DEFAULT` and passed through `sanitize`. When nothing is
 * stored, or reading / parsing throws, a fresh copy of `DEFAULT` is returned instead.
 */
function load(): VoiceSettingsData {
  try {
    const stored = localStorage.getItem(STORAGE_KEY)
    if (stored) {
      const merged = { ...DEFAULT, ...JSON.parse(stored) }
      return sanitize(merged)
    }
  } catch {
    // Unreadable or malformed storage: fall back to the defaults below.
  }
  return { ...DEFAULT }
}
// Run migration once on import, before the settings are read for the first time
migrateOldKeys()

// Read and parse the persisted settings a single time; every ref below is seeded from this
// one snapshot instead of hitting localStorage and JSON.parse once per field.
const initialSettings = load()

// ── Reactive state ──
const provider = ref<TtsProvider>(initialSettings.provider)

// WebSpeech
const webspeechVoice = ref<string>(initialSettings.webspeechVoice)

// OpenAI
const openaiApiKey = ref<string>(initialSettings.openaiApiKey)
const openaiBaseUrl = ref<string>(initialSettings.openaiBaseUrl)
const openaiModel = ref<string>(initialSettings.openaiModel)
const openaiVoice = ref<string>(initialSettings.openaiVoice)

// Custom
const customUrl = ref<string>(initialSettings.customUrl)
const customApiKey = ref<string>(initialSettings.customApiKey)

// Edge TTS
const edgeUrl = ref<string>(initialSettings.edgeUrl)
const edgeVoice = ref<string>(initialSettings.edgeVoice)
// Auto-persist on change: whenever any settings ref changes, the full settings object is
// rewritten under STORAGE_KEY.
// NOTE(review): this stores openaiApiKey / customApiKey in plain text in localStorage.
watch(
  [provider, webspeechVoice, openaiApiKey, openaiBaseUrl, openaiModel, openaiVoice,
    customUrl, customApiKey, edgeUrl, edgeVoice],
  () => {
    const snapshot: VoiceSettingsData = {
      provider: provider.value,
      webspeechVoice: webspeechVoice.value,
      openaiApiKey: openaiApiKey.value,
      openaiBaseUrl: openaiBaseUrl.value,
      openaiModel: openaiModel.value,
      openaiVoice: openaiVoice.value,
      customUrl: customUrl.value,
      customApiKey: customApiKey.value,
      edgeUrl: edgeUrl.value,
      edgeVoice: edgeVoice.value,
    }
    try {
      localStorage.setItem(STORAGE_KEY, JSON.stringify(snapshot))
    } catch (err) {
      // setItem throws when storage is full or unavailable. The in-memory settings stay
      // usable, so report the failure instead of letting it escape the watcher.
      console.warn('[useVoiceSettings] Failed to persist voice settings:', err)
    }
  },
)
/**
 * Composable exposing the voice settings refs, one setter per field, and `reset()`.
 *
 * The refs are declared at module scope, so every caller receives the same instances.
 */
export function useVoiceSettings() {
  // Puts every field back to its DEFAULT value.
  const restoreDefaults = () => {
    const defaults = DEFAULT
    provider.value = defaults.provider
    webspeechVoice.value = defaults.webspeechVoice
    openaiApiKey.value = defaults.openaiApiKey
    openaiBaseUrl.value = defaults.openaiBaseUrl
    openaiModel.value = defaults.openaiModel
    openaiVoice.value = defaults.openaiVoice
    customUrl.value = defaults.customUrl
    customApiKey.value = defaults.customApiKey
    edgeUrl.value = defaults.edgeUrl
    edgeVoice.value = defaults.edgeVoice
  }

  return {
    provider,
    webspeechVoice,
    openaiApiKey,
    openaiBaseUrl,
    openaiModel,
    openaiVoice,
    customUrl,
    customApiKey,
    edgeUrl,
    edgeVoice,

    setProvider: (v: TtsProvider) => { provider.value = v },
    setWebSpeechVoice: (v: string) => { webspeechVoice.value = v },
    setOpenaiApiKey: (v: string) => { openaiApiKey.value = v },
    setOpenaiBaseUrl: (v: string) => { openaiBaseUrl.value = v },
    setOpenaiModel: (v: string) => { openaiModel.value = v },
    setOpenaiVoice: (v: string) => { openaiVoice.value = v },
    setCustomUrl: (v: string) => { customUrl.value = v },
    setCustomApiKey: (v: string) => { customApiKey.value = v },
    setEdgeUrl: (v: string) => { edgeUrl.value = v },
    setEdgeVoice: (v: string) => { edgeVoice.value = v },

    reset: restoreDefaults,
  }
}