feat: add voice playback settings with 4-provider support (#608)

Add WebSpeech, OpenAI TTS, Custom endpoint, and Edge TTS providers.

Co-authored-by: Hermes Agent <noreply@nousresearch.com>
2026-05-10 20:08:38 +08:00
parent 838791a740
commit 15195f0795
18 changed files with 1237 additions and 20 deletions
@@ -3,6 +3,14 @@ import { generateSpeech, playAudioBlob } from '@/api/hermes/tts'

 export interface SpeechOptions {
  lang?: string      // Language tag, e.g. 'zh-CN', 'en-US'
+  voiceName?: string // Name of the WebSpeech voice to use
+}
+
+export interface OpenaiTtsOptions { // Connection settings consumed by openaiPlay() / openaiToggle()
+  baseUrl: string // API root; trailing slashes are stripped and '/audio/speech' is appended
+  apiKey?: string // When set, sent as an 'Authorization: Bearer <key>' header
+  model?: string // Falls back to 'tts-1' when omitted or empty
+  voice?: string // Falls back to 'alloy' when omitted or empty
 }

 export interface SpeechState {
@@ -39,6 +47,11 @@ export function useSpeech() {
  let playbackToken = 0
  const speechQueue: SpeechQueueItem[] = []

+  // Playback state for custom TTS (OpenAI / Custom / Edge)
+  const isCustomPlaying = ref(false) // set to true by openaiPlay() before the request is sent
+  const isCustomPaused = ref(false) // flipped by openaiToggle() on pause / resume
+  const currentCustomMessageId = ref<string | null>(null) // id given to openaiPlay(); reset to null when playback ends or fails
+
  // 加载可用语音列表
  function loadVoices() {
    availableVoices.value = synth.getVoices()
@@ -162,14 +175,25 @@ export function useSpeech() {

  // ─── Browser Engine (Web Speech API) ────────────────────────

-  function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token: number) {
+  function speakViaBrowser(messageId: string, text: string, options: SpeechOptions, token?: number) {
+    token = token || ++playbackToken
    utterance = new SpeechSynthesisUtterance(text)
    const activeUtterance = utterance

    utterance.rate = 1
    utterance.pitch = 1
    utterance.volume = 1
-    utterance.voice = getDefaultVoice()
+
+    // 使用指定的音色（如果有），否则用默认
+    if (options.voiceName) {
+      const voice = availableVoices.value.find(v => v.name === options.voiceName)
+      if (voice) {
+        utterance.voice = voice
+      }
+    }
+    if (!utterance.voice) {
+      utterance.voice = getDefaultVoice()
+    }

    if (options.lang) {
      utterance.lang = options.lang
@@ -218,6 +242,115 @@ export function useSpeech() {
    synth.speak(utterance)
  }

+  // ─── OpenAI-compatible TTS Engine ────────────────────────────
+
+  let customAudio: HTMLAudioElement | null = null // element created by openaiPlay(); cleared when its playback ends or errors
+
+  /**
+   * Synthesize `content` through an OpenAI-compatible `/audio/speech` endpoint
+   * and play the returned audio.
+   *
+   * Bumps `playbackToken`, so any playback started earlier becomes stale. Every
+   * asynchronous step re-checks the token and leaves shared state alone when a
+   * newer playback has started in the meantime.
+   *
+   * @param messageId id stored in `currentCustomMessageId` while this message plays
+   * @param content   raw message content; run through `extractReadableText` first
+   * @param opts      endpoint root, optional API key, model and voice
+   * @throws rethrows request / playback failures, unless this playback was superseded
+   */
+  async function openaiPlay(
+    messageId: string,
+    content: string,
+    opts: OpenaiTtsOptions,
+  ) {
+    const text = extractReadableText(content)
+    if (!text) return
+
+    const token = ++playbackToken
+
+    // A direct call (not routed through openaiToggle) can arrive while an earlier
+    // clip is still playing. Silence and release it so two clips never overlap.
+    if (customAudio) {
+      const previous = customAudio
+      customAudio = null
+      previous.pause()
+      if (previous.src.startsWith('blob:')) {
+        URL.revokeObjectURL(previous.src)
+      }
+    }
+
+    isCustomPlaying.value = true
+    isCustomPaused.value = false
+    currentCustomMessageId.value = messageId
+
+    const url = `${opts.baseUrl.replace(/\/+$/, '')}/audio/speech`
+    const body: Record<string, string> = {
+      model: opts.model || 'tts-1',
+      input: text,
+      voice: opts.voice || 'alloy',
+    }
+
+    const headers: Record<string, string> = {
+      'Content-Type': 'application/json',
+    }
+    if (opts.apiKey) {
+      headers['Authorization'] = `Bearer ${opts.apiKey}`
+    }
+
+    let audioUrl: string | null = null
+    let ownAudio: HTMLAudioElement | null = null
+
+    // Free the blob URL exactly once, whichever path finishes first.
+    const releaseAudioUrl = () => {
+      if (audioUrl) {
+        URL.revokeObjectURL(audioUrl)
+        audioUrl = null
+      }
+    }
+
+    // Return the shared custom-TTS state to idle.
+    const resetCustomState = () => {
+      isCustomPlaying.value = false
+      isCustomPaused.value = false
+      currentCustomMessageId.value = null
+      // Only drop the element if it is still ours.
+      if (customAudio === ownAudio) customAudio = null
+    }
+
+    try {
+      const res = await fetch(url, {
+        method: 'POST',
+        headers,
+        body: JSON.stringify(body),
+      })
+
+      if (token !== playbackToken) return
+
+      if (!res.ok) {
+        const errText = await res.text().catch(() => '')
+        throw new Error(`OpenAI TTS 返回 ${res.status}: ${errText || res.statusText}`)
+      }
+
+      const audioBlob = await res.blob()
+      if (token !== playbackToken) return
+
+      const objectUrl = URL.createObjectURL(audioBlob)
+      audioUrl = objectUrl
+      const audio = new Audio(objectUrl)
+      ownAudio = audio
+      customAudio = audio
+
+      audio.onended = () => {
+        // Release the blob even when superseded; only the current playback may
+        // touch the shared state.
+        releaseAudioUrl()
+        if (token !== playbackToken) return
+        resetCustomState()
+      }
+
+      audio.onerror = () => {
+        releaseAudioUrl()
+        if (token !== playbackToken) return
+        console.warn('[useSpeech] Custom TTS audio playback error')
+        resetCustomState()
+      }
+
+      await audio.play()
+    } catch (err) {
+      // play() can reject (autoplay policy, interrupted by pause); the blob must
+      // not outlive the failed attempt.
+      releaseAudioUrl()
+      if (token !== playbackToken) return
+      console.error('[useSpeech] OpenAI TTS 请求失败:', err)
+      resetCustomState()
+      throw err
+    }
+  }
+
+  /**
+   * Play / pause / resume toggle for custom (OpenAI-compatible) TTS.
+   *
+   * - Same message already active: pause it if playing, resume it if paused.
+   * - Any other case: stop whatever is playing and start `messageId` from scratch.
+   *
+   * Never throws and never leaves a rejected promise unhandled; failures are
+   * logged (by `openaiPlay` for new playback, here for resume).
+   *
+   * @param messageId id of the message whose audio is being toggled
+   * @param content   raw message content forwarded to `openaiPlay`
+   * @param opts      endpoint settings forwarded to `openaiPlay`
+   */
+  function openaiToggle(messageId: string, content: string, opts: OpenaiTtsOptions) {
+    const isActiveMessage = currentCustomMessageId.value === messageId && isCustomPlaying.value
+
+    if (isActiveMessage) {
+      const audio = customAudio
+      // Still fetching: there is nothing to pause or resume yet. Flipping the
+      // paused flag here would leave state disagreeing with actual playback.
+      if (!audio) return
+
+      if (isCustomPaused.value) {
+        // Resume
+        isCustomPaused.value = false
+        audio.play().catch((err: unknown) => {
+          console.warn('[useSpeech] Custom TTS resume failed:', err)
+          // Playback did not restart; keep the UI in the paused state.
+          if (customAudio === audio) isCustomPaused.value = true
+        })
+      } else {
+        // Pause
+        audio.pause()
+        isCustomPaused.value = true
+      }
+      return
+    }
+
+    // Stop other speech and start new
+    stop(false)
+    if (customAudio) {
+      const discarded = customAudio
+      customAudio = null
+      discarded.pause()
+      // A paused element never fires `ended`, so its blob URL must be freed here.
+      if (discarded.src.startsWith('blob:')) {
+        URL.revokeObjectURL(discarded.src)
+      }
+    }
+    // openaiPlay logs and resets state before rethrowing; swallow the rejection
+    // so a failed request does not surface as an unhandled promise rejection.
+    openaiPlay(messageId, content, opts).catch(() => {})
+  }
+
  // ─── Unified speak ──────────────────────────────────────────

  function speak(messageId: string, text: string, options: SpeechOptions = {}) {
@@ -317,6 +450,11 @@ export function useSpeech() {
    progress: computed(() => state.value.progress),
    engine: computed(() => state.value.engine),

+    // Custom TTS state
+    isCustomPlaying,
+    isCustomPaused,
+    currentCustomMessageId,
+
    play,
    pause,
    resume,
@@ -325,6 +463,13 @@ export function useSpeech() {
    enqueue,
    getDefaultVoice,
    extractReadableText,
+
+    // OpenAI-compatible TTS
+    openaiPlay,
+    openaiToggle,
+
+    // Browser WebSpeech (直接调用避免 Rolldown 树摇)
+    speakViaBrowser,
  }
 }

@@ -0,0 +1,164 @@
+import { ref, watch } from 'vue'
+
+export type TtsProvider = 'webspeech' | 'openai' | 'custom' | 'edge' // identifiers of the selectable TTS providers
+
+export interface VoiceSettingsData { // Shape of the settings object persisted to localStorage
+  provider: TtsProvider // Selected provider; DEFAULT uses 'webspeech'
+
+  // WebSpeech
+  webspeechVoice: string // DEFAULT is ''; NOTE(review): presumably a SpeechSynthesisVoice name — confirm against caller
+
+  // OpenAI
+  openaiApiKey: string // DEFAULT is ''
+  openaiBaseUrl: string // DEFAULT is ''
+  openaiModel: string // DEFAULT is 'tts-1'
+  openaiVoice: string // DEFAULT is 'alloy'
+
+  // Custom endpoint (OpenAI-compatible)
+  customUrl: string // Also receives the legacy `gptsovitsUrl` during migration
+  customApiKey: string // DEFAULT is ''
+
+  // Edge TTS
+  edgeUrl: string // sanitize() resets any non-empty value to '' on every load
+  edgeVoice: string // DEFAULT is 'zh-CN-XiaoxiaoNeural'
+}
+
+const STORAGE_KEY = 'hermes-tts-settings-v2' // localStorage key under which the settings JSON is read and written
+
+/**
+ * One-shot migration of settings saved under the legacy `hermes-tts-settings`
+ * key into the current storage key.
+ *
+ * The legacy `gptsovits` provider becomes `custom`, carrying `gptsovitsUrl`
+ * over to `customUrl` when no custom URL was set. The merged result (defaults
+ * overlaid with the legacy values) is written under `STORAGE_KEY` and the
+ * legacy key is removed. Any failure is ignored and leaves storage untouched
+ * from that point on.
+ */
+function migrateOldKeys() {
+  const legacyKey = 'hermes-tts-settings'
+  try {
+    const legacyRaw = localStorage.getItem(legacyKey)
+    if (!legacyRaw) return
+
+    const legacy = JSON.parse(legacyRaw)
+
+    // A legacy 'custom' provider needs no renaming; only 'gptsovits' is remapped.
+    const wasGptSovits = legacy.provider === 'gptsovits'
+    if (wasGptSovits) {
+      legacy.provider = 'custom'
+      const shouldCarryUrl = legacy.gptsovitsUrl && !legacy.customUrl
+      if (shouldCarryUrl) {
+        legacy.customUrl = legacy.gptsovitsUrl
+      }
+    }
+
+    // Persist in the new format, then retire the legacy entry.
+    const migrated = { ...DEFAULT, ...legacy }
+    localStorage.setItem(STORAGE_KEY, JSON.stringify(migrated))
+    localStorage.removeItem(legacyKey)
+  } catch {
+    /* ignore */
+  }
+}
+
+const DEFAULT: VoiceSettingsData = { // Baseline values: merge base in load() / migrateOldKeys() and target of reset()
+  provider: 'webspeech',
+
+  webspeechVoice: '',
+
+  openaiApiKey: '',
+  openaiBaseUrl: '',
+  openaiModel: 'tts-1',
+  openaiVoice: 'alloy',
+
+  customUrl: '',
+  customApiKey: '',
+
+  edgeUrl: '',
+  edgeVoice: 'zh-CN-XiaoxiaoNeural',
+}
+
+/**
+ * Strip a persisted Edge TTS adapter URL from loaded settings; Edge TTS now
+ * goes through the internal node-edge-tts, so a stored URL is obsolete.
+ *
+ * Mutates `data` in place and returns the same object.
+ */
+function sanitize(data: VoiceSettingsData): VoiceSettingsData {
+  const hasStoredEdgeUrl = Boolean(data.edgeUrl)
+  if (hasStoredEdgeUrl) data.edgeUrl = ''
+  return data
+}
+
+/**
+ * Read the persisted settings from localStorage.
+ *
+ * Stored values are overlaid on `DEFAULT` and passed through `sanitize`.
+ * When nothing is stored, or reading / parsing fails, a fresh copy of
+ * `DEFAULT` is returned instead.
+ */
+function load(): VoiceSettingsData {
+  try {
+    const serialized = localStorage.getItem(STORAGE_KEY)
+    if (serialized) {
+      const merged = { ...DEFAULT, ...JSON.parse(serialized) }
+      return sanitize(merged)
+    }
+  } catch {
+    /* ignore */
+  }
+  return { ...DEFAULT }
+}
+
+// Run migration once on import
+migrateOldKeys()
+
+// ── Reactive state ──
+// Read and parse the persisted settings a single time; every ref below is
+// seeded from this one snapshot instead of re-reading localStorage per field.
+const initialSettings = load()
+
+const provider = ref<TtsProvider>(initialSettings.provider)
+
+// WebSpeech
+const webspeechVoice = ref<string>(initialSettings.webspeechVoice)
+
+// OpenAI
+const openaiApiKey = ref<string>(initialSettings.openaiApiKey)
+const openaiBaseUrl = ref<string>(initialSettings.openaiBaseUrl)
+const openaiModel = ref<string>(initialSettings.openaiModel)
+const openaiVoice = ref<string>(initialSettings.openaiVoice)
+
+// Custom
+const customUrl = ref<string>(initialSettings.customUrl)
+const customApiKey = ref<string>(initialSettings.customApiKey)
+
+// Edge TTS
+const edgeUrl = ref<string>(initialSettings.edgeUrl)
+const edgeVoice = ref<string>(initialSettings.edgeVoice)
+
+/**
+ * Write the current settings to localStorage.
+ *
+ * Best-effort, mirroring load(): setItem can throw (quota exceeded, storage
+ * disabled), and that must not escape from the watcher.
+ * NOTE(review): API keys are stored in plain text in localStorage.
+ */
+function persistSettings() {
+  try {
+    localStorage.setItem(STORAGE_KEY, JSON.stringify({
+      provider: provider.value,
+      webspeechVoice: webspeechVoice.value,
+      openaiApiKey: openaiApiKey.value,
+      openaiBaseUrl: openaiBaseUrl.value,
+      openaiModel: openaiModel.value,
+      openaiVoice: openaiVoice.value,
+      customUrl: customUrl.value,
+      customApiKey: customApiKey.value,
+      edgeUrl: edgeUrl.value,
+      edgeVoice: edgeVoice.value,
+    }))
+  } catch (err) {
+    console.warn('[useVoiceSettings] Failed to persist voice settings:', err)
+  }
+}
+
+// Auto-persist on change
+watch(
+  [provider, webspeechVoice, openaiApiKey, openaiBaseUrl, openaiModel, openaiVoice,
+   customUrl, customApiKey, edgeUrl, edgeVoice],
+  () => persistSettings(),
+)
+
+/**
+ * Accessor for the module-level voice settings.
+ *
+ * Every caller receives the same refs, so the state is shared. The setters
+ * assign the matching ref, and `reset` restores each ref to its `DEFAULT` value.
+ */
+export function useVoiceSettings() {
+  // Put every setting back to its default, field by field.
+  const reset = () => {
+    provider.value = DEFAULT.provider
+    webspeechVoice.value = DEFAULT.webspeechVoice
+    openaiApiKey.value = DEFAULT.openaiApiKey
+    openaiBaseUrl.value = DEFAULT.openaiBaseUrl
+    openaiModel.value = DEFAULT.openaiModel
+    openaiVoice.value = DEFAULT.openaiVoice
+    customUrl.value = DEFAULT.customUrl
+    customApiKey.value = DEFAULT.customApiKey
+    edgeUrl.value = DEFAULT.edgeUrl
+    edgeVoice.value = DEFAULT.edgeVoice
+  }
+
+  return {
+    // Reactive state
+    provider,
+    webspeechVoice,
+    openaiApiKey,
+    openaiBaseUrl,
+    openaiModel,
+    openaiVoice,
+    customUrl,
+    customApiKey,
+    edgeUrl,
+    edgeVoice,
+
+    // Setters
+    setProvider: (v: TtsProvider) => { provider.value = v },
+    setWebSpeechVoice: (v: string) => { webspeechVoice.value = v },
+    setOpenaiApiKey: (v: string) => { openaiApiKey.value = v },
+    setOpenaiBaseUrl: (v: string) => { openaiBaseUrl.value = v },
+    setOpenaiModel: (v: string) => { openaiModel.value = v },
+    setOpenaiVoice: (v: string) => { openaiVoice.value = v },
+    setCustomUrl: (v: string) => { customUrl.value = v },
+    setCustomApiKey: (v: string) => { customApiKey.value = v },
+    setEdgeUrl: (v: string) => { edgeUrl.value = v },
+    setEdgeVoice: (v: string) => { edgeVoice.value = v },
+
+    reset,
+  }
+}