feat: add MiMo TTS provider 语音TTS提供接入MiMo (#752)

* feat: add MiMo TTS provider with preset voices, voice design and voice clone

* refactor: remove MiMo voice clone feature
This commit is contained in:
ZhangKai | 张凯
2026-05-16 08:55:23 +08:00
committed by GitHub
parent 3f8461d9eb
commit 87a8e95d66
13 changed files with 609 additions and 11 deletions
+155 -3
View File
@@ -15,6 +15,15 @@ export interface OpenaiTtsOptions {
pitch?: string // Edge TTS pitch format, e.g. "-8Hz"
}
/** Options for one MiMo TTS synthesis request, as consumed by `mimoPlay`. */
export interface MimoTtsOptions {
baseUrl: string // API base URL; trailing slashes are stripped and `/chat/completions` is appended
apiKey: string // sent as the `api-key` request header
model: string // model ID; 'mimo-v2.5-tts-voicedesign' selects voice design mode
voice: string // preset voice ID; not sent when the voice design model is selected
voiceDesignDesc?: string // voice design description text (voice design mode)
stylePrompt?: string // natural language style instruction
}
export interface SpeechState {
isPlaying: boolean
isPaused: boolean
@@ -333,20 +342,17 @@ export function useSpeech() {
function openaiToggle(messageId: string, content: string, opts: OpenaiTtsOptions) {
if (currentCustomMessageId.value === messageId && isCustomPlaying.value) {
if (isCustomPaused.value) {
// Resume
if (customAudio) {
customAudio.play()
}
isCustomPaused.value = false
} else {
// Pause
if (customAudio) {
customAudio.pause()
}
isCustomPaused.value = true
}
} else {
// Stop other speech and start new
stop(false)
if (customAudio) {
customAudio.pause()
@@ -356,6 +362,148 @@ export function useSpeech() {
}
}
// ─── MiMo TTS Engine ──────────────────────────────────────────
/**
 * Synthesize the readable text of `content` through the MiMo TTS
 * chat-completions endpoint and play the returned WAV audio.
 *
 * The request is tagged with a fresh `playbackToken`; after every await the
 * token is re-checked so that a request superseded by a newer playback neither
 * touches shared state nor starts playing.
 *
 * @param messageId ID of the message being read; stored in `currentCustomMessageId`.
 * @param content   Raw message content; reduced to speakable text via `extractReadableText`.
 * @param opts      Endpoint, credentials, model and voice/style settings.
 * @throws Rethrows request, HTTP-status, missing-audio and playback errors while
 *         this request is still the current one (after resetting playback state).
 */
async function mimoPlay(
  messageId: string,
  content: string,
  opts: MimoTtsOptions,
): Promise<void> {
  const text = extractReadableText(content)
  if (!text) return

  const token = ++playbackToken
  isCustomPlaying.value = true
  isCustomPaused.value = false
  currentCustomMessageId.value = messageId

  const isVoiceDesign = opts.model === 'mimo-v2.5-tts-voicedesign'

  // Build messages based on model type
  const messages: Array<{ role: string; content: string }> = []
  if (isVoiceDesign) {
    // Voice design: user message = voice description (+ appended style prompt)
    const desc = opts.voiceDesignDesc || ''
    const userContent = opts.stylePrompt
      ? `${desc}\n风格指令:${opts.stylePrompt}`
      : desc
    messages.push({ role: 'user', content: userContent || '默认音色' })
  } else {
    // Preset voices: user message = style prompt or empty
    messages.push({ role: 'user', content: opts.stylePrompt || '' })
  }
  // assistant message = synthesis text
  messages.push({ role: 'assistant', content: text })

  const audioParams: Record<string, unknown> = { format: 'wav' }
  // Voice design model does not accept audio.voice
  if (!isVoiceDesign) {
    audioParams.voice = opts.voice
  }

  const body = {
    model: opts.model,
    messages,
    audio: audioParams,
  }

  const url = `${opts.baseUrl.replace(/\/+$/, '')}/chat/completions`
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    'api-key': opts.apiKey,
  }

  // Resources owned by this request. They are tracked outside the try block so
  // every exit path (ended, error, rejected play(), superseded) can release them.
  let audioUrl: string | null = null
  let player: HTMLAudioElement | null = null

  const releaseUrl = () => {
    if (audioUrl !== null) {
      URL.revokeObjectURL(audioUrl)
      audioUrl = null
    }
  }
  const resetState = () => {
    isCustomPlaying.value = false
    isCustomPaused.value = false
    currentCustomMessageId.value = null
  }

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
    })
    if (token !== playbackToken) return
    if (!res.ok) {
      const errText = await res.text().catch(() => '')
      throw new Error(`MiMo TTS 返回 ${res.status}: ${errText || res.statusText}`)
    }

    const json = await res.json()
    if (token !== playbackToken) return

    const audioBase64 = json?.choices?.[0]?.message?.audio?.data
    if (!audioBase64) {
      throw new Error('MiMo TTS 响应中未找到音频数据')
    }

    // base64 → binary → Blob
    const binaryStr = atob(audioBase64)
    const bytes = new Uint8Array(binaryStr.length)
    for (let i = 0; i < binaryStr.length; i++) {
      bytes[i] = binaryStr.charCodeAt(i)
    }
    const audioBlob = new Blob([bytes], { type: 'audio/wav' })
    if (token !== playbackToken) return

    audioUrl = URL.createObjectURL(audioBlob)
    player = new Audio(audioUrl)
    customAudio = player

    player.onended = () => {
      // Always free the blob URL, even if a newer playback has taken over;
      // only the shared playback state is guarded by the token.
      releaseUrl()
      if (token !== playbackToken) return
      resetState()
      customAudio = null
    }
    player.onerror = () => {
      releaseUrl()
      if (token !== playbackToken) return
      console.warn('[useSpeech] MiMo TTS audio playback error')
      resetState()
      customAudio = null
    }

    await player.play()
  } catch (err) {
    // A failed request or rejected play() never reaches onended/onerror,
    // so the blob URL has to be released here as well.
    releaseUrl()
    if (token !== playbackToken) return
    console.error('[useSpeech] MiMo TTS 请求失败:', err)
    resetState()
    // Drop the dead element, but never clobber an element owned by someone else.
    if (player !== null && customAudio === player) {
      customAudio = null
    }
    throw err
  }
}
/**
 * Play/pause toggle for MiMo TTS on a given message.
 *
 * - If `messageId` is the message currently playing through the custom audio
 *   element, flips between paused and playing.
 * - Otherwise stops whatever is playing and starts a new `mimoPlay` request.
 *
 * This function is synchronous and fire-and-forget: failures of the underlying
 * promises are handled here so they do not surface as unhandled rejections.
 *
 * @param messageId ID of the message to toggle.
 * @param content   Raw message content passed through to `mimoPlay`.
 * @param opts      MiMo TTS request options passed through to `mimoPlay`.
 */
function mimoToggle(messageId: string, content: string, opts: MimoTtsOptions) {
  const isCurrentMessage =
    currentCustomMessageId.value === messageId && isCustomPlaying.value

  if (!isCurrentMessage) {
    stop(false)
    if (customAudio) {
      customAudio.pause()
      customAudio = null
    }
    // mimoPlay already logs and resets playback state before rethrowing;
    // swallow here only to avoid an unhandled promise rejection.
    void mimoPlay(messageId, content, opts).catch(() => {})
    return
  }

  if (isCustomPaused.value) {
    if (customAudio) {
      // play() returns a promise that rejects if playback cannot resume.
      void customAudio.play().catch((err: unknown) => {
        console.warn('[useSpeech] MiMo TTS resume failed:', err)
        isCustomPaused.value = true
      })
    }
    isCustomPaused.value = false
  } else {
    if (customAudio) {
      customAudio.pause()
    }
    isCustomPaused.value = true
  }
}
// ─── Unified speak ──────────────────────────────────────────
function speak(messageId: string, text: string, options: SpeechOptions = {}) {
@@ -473,6 +621,10 @@ export function useSpeech() {
openaiPlay,
openaiToggle,
// MiMo TTS
mimoPlay,
mimoToggle,
// Browser WebSpeech (直接调用避免 Rolldown 树摇)
speakViaBrowser,
}
@@ -1,6 +1,6 @@
import { ref, watch } from 'vue'
export type TtsProvider = 'webspeech' | 'openai' | 'custom' | 'edge'
export type TtsProvider = 'webspeech' | 'openai' | 'custom' | 'edge' | 'mimo'
export interface VoiceSettingsData {
provider: TtsProvider
@@ -23,6 +23,14 @@ export interface VoiceSettingsData {
edgeVoice: string
edgeRate: number // 语速倍率 0.5~2.01.0 = 正常
edgePitchHz: number // 音调偏移 Hz-20~200 = 正常
// MiMo TTS
mimoApiKey: string
mimoBaseUrl: string
mimoModel: string // 'mimo-v2.5-tts' | 'mimo-v2.5-tts-voicedesign'
mimoVoice: string // 预置音色 ID
mimoVoiceDesignDesc: string // 音色设计描述文本
mimoStylePrompt: string // 风格指令
}
const STORAGE_KEY = 'hermes-tts-settings-v2'
@@ -67,6 +75,13 @@ const DEFAULT: VoiceSettingsData = {
edgeVoice: 'zh-CN-XiaoxiaoNeural',
edgeRate: 1.0,
edgePitchHz: 0,
mimoApiKey: '',
mimoBaseUrl: 'https://api.xiaomimimo.com/v1',
mimoModel: 'mimo-v2.5-tts',
mimoVoice: '冰糖',
mimoVoiceDesignDesc: '',
mimoStylePrompt: '',
}
function sanitize(data: VoiceSettingsData): VoiceSettingsData {
@@ -110,10 +125,19 @@ const edgeVoice = ref<string>(load().edgeVoice)
const edgeRate = ref<number>(load().edgeRate)
const edgePitchHz = ref<number>(load().edgePitchHz)
// MiMo TTS — one ref per setting, each seeded from the matching field of load().
// NOTE(review): load() is defined outside this view and is called once per ref;
// presumably it returns the persisted settings — confirm before merging the calls.
const mimoApiKey = ref<string>(load().mimoApiKey)
const mimoBaseUrl = ref<string>(load().mimoBaseUrl)
const mimoModel = ref<string>(load().mimoModel)
const mimoVoice = ref<string>(load().mimoVoice)
const mimoVoiceDesignDesc = ref<string>(load().mimoVoiceDesignDesc)
const mimoStylePrompt = ref<string>(load().mimoStylePrompt)
// Auto-persist on change
watch(
[provider, webspeechVoice, openaiApiKey, openaiBaseUrl, openaiModel, openaiVoice,
customUrl, customApiKey, edgeUrl, edgeVoice, edgeRate, edgePitchHz],
customUrl, customApiKey, edgeUrl, edgeVoice, edgeRate, edgePitchHz,
mimoApiKey, mimoBaseUrl, mimoModel, mimoVoice, mimoVoiceDesignDesc, mimoStylePrompt],
() => {
localStorage.setItem(STORAGE_KEY, JSON.stringify({
provider: provider.value,
@@ -128,6 +152,12 @@ watch(
edgeVoice: edgeVoice.value,
edgeRate: edgeRate.value,
edgePitchHz: edgePitchHz.value,
mimoApiKey: mimoApiKey.value,
mimoBaseUrl: mimoBaseUrl.value,
mimoModel: mimoModel.value,
mimoVoice: mimoVoice.value,
mimoVoiceDesignDesc: mimoVoiceDesignDesc.value,
mimoStylePrompt: mimoStylePrompt.value,
}))
},
)
@@ -146,6 +176,12 @@ export function useVoiceSettings() {
edgeVoice,
edgeRate,
edgePitchHz,
mimoApiKey,
mimoBaseUrl,
mimoModel,
mimoVoice,
mimoVoiceDesignDesc,
mimoStylePrompt,
setProvider(v: TtsProvider) { provider.value = v },
setWebSpeechVoice(v: string) { webspeechVoice.value = v },
@@ -159,6 +195,12 @@ export function useVoiceSettings() {
setEdgeVoice(v: string) { edgeVoice.value = v },
setEdgeRate(v: number) { edgeRate.value = v },
setEdgePitchHz(v: number) { edgePitchHz.value = v },
setMimoApiKey(v: string) { mimoApiKey.value = v },
setMimoBaseUrl(v: string) { mimoBaseUrl.value = v },
setMimoModel(v: string) { mimoModel.value = v },
setMimoVoice(v: string) { mimoVoice.value = v },
setMimoVoiceDesignDesc(v: string) { mimoVoiceDesignDesc.value = v },
setMimoStylePrompt(v: string) { mimoStylePrompt.value = v },
reset() {
provider.value = DEFAULT.provider
@@ -173,6 +215,12 @@ export function useVoiceSettings() {
edgeVoice.value = DEFAULT.edgeVoice
edgeRate.value = DEFAULT.edgeRate
edgePitchHz.value = DEFAULT.edgePitchHz
mimoApiKey.value = DEFAULT.mimoApiKey
mimoBaseUrl.value = DEFAULT.mimoBaseUrl
mimoModel.value = DEFAULT.mimoModel
mimoVoice.value = DEFAULT.mimoVoice
mimoVoiceDesignDesc.value = DEFAULT.mimoVoiceDesignDesc
mimoStylePrompt.value = DEFAULT.mimoStylePrompt
},
}
}