feat: add MiMo TTS provider 语音TTS提供接入MiMo (#752)
* feat: add MiMo TTS provider with preset voices, voice design and voice clone
* refactor: remove MiMo voice clone feature
This commit is contained in:
@@ -15,6 +15,15 @@ export interface OpenaiTtsOptions {
|
||||
pitch?: string // Edge TTS pitch format, e.g. "-8Hz"
|
||||
}
|
||||
|
||||
/**
 * Per-request options for the MiMo TTS engine.
 *
 * The request is sent to `${baseUrl}/chat/completions` and authenticated with
 * an `api-key` header.
 */
export interface MimoTtsOptions {
  /** API root; trailing slashes are stripped before `/chat/completions` is appended. */
  baseUrl: string
  /** Value sent in the `api-key` request header. */
  apiKey: string
  /** Model ID; `'mimo-v2.5-tts-voicedesign'` switches the request to voice-design mode. */
  model: string
  /** Preset voice ID, sent as `audio.voice`. Not sent when the voice-design model is selected. */
  voice: string
  /** Voice description text; only used in voice-design mode. */
  voiceDesignDesc?: string
  /** Natural-language style instruction (appended to the description in voice-design mode). */
  stylePrompt?: string
}
|
||||
|
||||
export interface SpeechState {
|
||||
isPlaying: boolean
|
||||
isPaused: boolean
|
||||
@@ -333,20 +342,17 @@ export function useSpeech() {
|
||||
function openaiToggle(messageId: string, content: string, opts: OpenaiTtsOptions) {
|
||||
if (currentCustomMessageId.value === messageId && isCustomPlaying.value) {
|
||||
if (isCustomPaused.value) {
|
||||
// Resume
|
||||
if (customAudio) {
|
||||
customAudio.play()
|
||||
}
|
||||
isCustomPaused.value = false
|
||||
} else {
|
||||
// Pause
|
||||
if (customAudio) {
|
||||
customAudio.pause()
|
||||
}
|
||||
isCustomPaused.value = true
|
||||
}
|
||||
} else {
|
||||
// Stop other speech and start new
|
||||
stop(false)
|
||||
if (customAudio) {
|
||||
customAudio.pause()
|
||||
@@ -356,6 +362,148 @@ export function useSpeech() {
|
||||
}
|
||||
}
|
||||
|
||||
// ─── MiMo TTS Engine ──────────────────────────────────────────
|
||||
|
||||
/**
 * Synthesize `content` through the MiMo TTS chat-completions endpoint and play
 * the returned WAV audio.
 *
 * Does nothing when `extractReadableText` yields no text. Otherwise the
 * function claims a new playback token and marks `messageId` as the current
 * custom playback. After every await it checks the token and silently bails
 * out if a newer playback has started in the meantime.
 *
 * @param messageId ID of the message being read aloud.
 * @param content   Raw message content; reduced to readable text before synthesis.
 * @param opts      Endpoint, credentials and voice configuration.
 * @throws Re-throws request, decoding and playback errors (after logging and
 *         resetting the playback state) as long as this playback is still the
 *         current one.
 */
async function mimoPlay(
  messageId: string,
  content: string,
  opts: MimoTtsOptions,
): Promise<void> {
  const text = extractReadableText(content)
  if (!text) return

  // Any playback started later bumps the counter and makes this token stale.
  const token = ++playbackToken

  isCustomPlaying.value = true
  isCustomPaused.value = false
  currentCustomMessageId.value = messageId

  const isVoiceDesign = opts.model === 'mimo-v2.5-tts-voicedesign'

  // Build messages based on model type
  const messages: Array<{ role: string; content: string }> = []
  if (isVoiceDesign) {
    // Voice design: user message = voice description (+ appended style prompt)
    const desc = opts.voiceDesignDesc || ''
    const userContent = opts.stylePrompt
      ? `${desc}\n风格指令:${opts.stylePrompt}`
      : desc
    messages.push({ role: 'user', content: userContent || '默认音色' })
  } else {
    // Preset voices: user message = style prompt or empty
    messages.push({ role: 'user', content: opts.stylePrompt || '' })
  }
  // assistant message = synthesis text
  messages.push({ role: 'assistant', content: text })

  // Voice design model does not accept audio.voice
  const audioParams: { format: string; voice?: string } = { format: 'wav' }
  if (!isVoiceDesign) {
    audioParams.voice = opts.voice
  }

  const body = {
    model: opts.model,
    messages,
    audio: audioParams,
  }

  const url = `${opts.baseUrl.replace(/\/+$/, '')}/chat/completions`
  const headers: Record<string, string> = {
    'Content-Type': 'application/json',
    'api-key': opts.apiKey,
  }

  // Tracked outside the try block so that every exit path can release them.
  let audioUrl: string | null = null
  let player: HTMLAudioElement | null = null

  // Revoke the blob URL exactly once, regardless of which path gets there first.
  const releaseAudioUrl = () => {
    if (audioUrl !== null) {
      URL.revokeObjectURL(audioUrl)
      audioUrl = null
    }
  }

  // Return the shared playback state to idle, but only drop `customAudio`
  // if it still points at the element created by this call.
  const resetPlaybackState = () => {
    isCustomPlaying.value = false
    isCustomPaused.value = false
    currentCustomMessageId.value = null
    if (player !== null && customAudio === player) {
      customAudio = null
    }
  }

  try {
    const res = await fetch(url, {
      method: 'POST',
      headers,
      body: JSON.stringify(body),
    })

    if (token !== playbackToken) return

    if (!res.ok) {
      const errText = await res.text().catch(() => '')
      throw new Error(`MiMo TTS 返回 ${res.status}: ${errText || res.statusText}`)
    }

    const json = await res.json()
    if (token !== playbackToken) return

    const audioBase64 = json?.choices?.[0]?.message?.audio?.data
    if (!audioBase64) {
      throw new Error('MiMo TTS 响应中未找到音频数据')
    }

    // base64 → binary → Blob
    const bytes = Uint8Array.from(atob(audioBase64), (ch) => ch.charCodeAt(0))
    const audioBlob = new Blob([bytes], { type: 'audio/wav' })

    if (token !== playbackToken) return

    audioUrl = URL.createObjectURL(audioBlob)
    const element = new Audio(audioUrl)
    player = element
    customAudio = element

    element.onended = () => {
      // Playback is over either way, so the blob URL can always be released,
      // even if a newer playback has taken over the shared state.
      releaseAudioUrl()
      if (token !== playbackToken) return
      resetPlaybackState()
    }

    element.onerror = () => {
      releaseAudioUrl()
      if (token !== playbackToken) return
      console.warn('[useSpeech] MiMo TTS audio playback error')
      resetPlaybackState()
    }

    await element.play()
  } catch (err) {
    // Nothing will play from this element any more: detach the handlers and
    // free the blob URL before deciding whether the error is still relevant.
    if (player !== null) {
      player.onended = null
      player.onerror = null
    }
    releaseAudioUrl()

    if (token !== playbackToken) return
    console.error('[useSpeech] MiMo TTS 请求失败:', err)
    resetPlaybackState()
    throw err
  }
}
|
||||
|
||||
/**
 * Toggle MiMo TTS playback for a message.
 *
 * - If `messageId` is the message currently playing, pause it or resume it.
 * - Otherwise stop whatever is playing and start a new MiMo synthesis.
 *
 * @param messageId ID of the message whose play button was pressed.
 * @param content   Raw message content to read aloud.
 * @param opts      MiMo endpoint, credentials and voice configuration.
 */
function mimoToggle(messageId: string, content: string, opts: MimoTtsOptions) {
  const isCurrentMessage =
    currentCustomMessageId.value === messageId && isCustomPlaying.value

  if (!isCurrentMessage) {
    // Stop other speech and start new
    stop(false)
    if (customAudio) {
      customAudio.pause()
      customAudio = null
    }
    // mimoPlay logs the failure and resets the playback state before it
    // rethrows; observe the promise here so that a failed request does not
    // surface as an unhandled rejection.
    void mimoPlay(messageId, content, opts).catch(() => {})
    return
  }

  if (isCustomPaused.value) {
    // Resume
    if (customAudio) {
      // play() returns a promise that can reject (e.g. when interrupted).
      void customAudio.play().catch((err: unknown) => {
        console.warn('[useSpeech] MiMo TTS resume failed:', err)
      })
    }
    isCustomPaused.value = false
  } else {
    // Pause
    if (customAudio) {
      customAudio.pause()
    }
    isCustomPaused.value = true
  }
}
|
||||
|
||||
// ─── Unified speak ──────────────────────────────────────────
|
||||
|
||||
function speak(messageId: string, text: string, options: SpeechOptions = {}) {
|
||||
@@ -473,6 +621,10 @@ export function useSpeech() {
|
||||
openaiPlay,
|
||||
openaiToggle,
|
||||
|
||||
// MiMo TTS
|
||||
mimoPlay,
|
||||
mimoToggle,
|
||||
|
||||
// Browser WebSpeech (直接调用避免 Rolldown 树摇)
|
||||
speakViaBrowser,
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { ref, watch } from 'vue'
|
||||
|
||||
/** Identifier of the text-to-speech backend selected in the voice settings. */
export type TtsProvider = 'webspeech' | 'openai' | 'custom' | 'edge' | 'mimo'
|
||||
|
||||
export interface VoiceSettingsData {
|
||||
provider: TtsProvider
|
||||
@@ -23,6 +23,14 @@ export interface VoiceSettingsData {
|
||||
edgeVoice: string
|
||||
edgeRate: number // 语速倍率 0.5~2.0,1.0 = 正常
|
||||
edgePitchHz: number // 音调偏移 Hz,-20~20,0 = 正常
|
||||
|
||||
// MiMo TTS
|
||||
mimoApiKey: string
|
||||
mimoBaseUrl: string
|
||||
mimoModel: string // 'mimo-v2.5-tts' | 'mimo-v2.5-tts-voicedesign'
|
||||
mimoVoice: string // 预置音色 ID
|
||||
mimoVoiceDesignDesc: string // 音色设计描述文本
|
||||
mimoStylePrompt: string // 风格指令
|
||||
}
|
||||
|
||||
const STORAGE_KEY = 'hermes-tts-settings-v2'
|
||||
@@ -67,6 +75,13 @@ const DEFAULT: VoiceSettingsData = {
|
||||
edgeVoice: 'zh-CN-XiaoxiaoNeural',
|
||||
edgeRate: 1.0,
|
||||
edgePitchHz: 0,
|
||||
|
||||
mimoApiKey: '',
|
||||
mimoBaseUrl: 'https://api.xiaomimimo.com/v1',
|
||||
mimoModel: 'mimo-v2.5-tts',
|
||||
mimoVoice: '冰糖',
|
||||
mimoVoiceDesignDesc: '',
|
||||
mimoStylePrompt: '',
|
||||
}
|
||||
|
||||
function sanitize(data: VoiceSettingsData): VoiceSettingsData {
|
||||
@@ -110,10 +125,19 @@ const edgeVoice = ref<string>(load().edgeVoice)
|
||||
const edgeRate = ref<number>(load().edgeRate)
|
||||
const edgePitchHz = ref<number>(load().edgePitchHz)
|
||||
|
||||
// MiMo TTS
|
||||
const mimoApiKey = ref<string>(load().mimoApiKey)
|
||||
const mimoBaseUrl = ref<string>(load().mimoBaseUrl)
|
||||
const mimoModel = ref<string>(load().mimoModel)
|
||||
const mimoVoice = ref<string>(load().mimoVoice)
|
||||
const mimoVoiceDesignDesc = ref<string>(load().mimoVoiceDesignDesc)
|
||||
const mimoStylePrompt = ref<string>(load().mimoStylePrompt)
|
||||
|
||||
// Auto-persist on change
|
||||
watch(
|
||||
[provider, webspeechVoice, openaiApiKey, openaiBaseUrl, openaiModel, openaiVoice,
|
||||
customUrl, customApiKey, edgeUrl, edgeVoice, edgeRate, edgePitchHz],
|
||||
customUrl, customApiKey, edgeUrl, edgeVoice, edgeRate, edgePitchHz,
|
||||
mimoApiKey, mimoBaseUrl, mimoModel, mimoVoice, mimoVoiceDesignDesc, mimoStylePrompt],
|
||||
() => {
|
||||
localStorage.setItem(STORAGE_KEY, JSON.stringify({
|
||||
provider: provider.value,
|
||||
@@ -128,6 +152,12 @@ watch(
|
||||
edgeVoice: edgeVoice.value,
|
||||
edgeRate: edgeRate.value,
|
||||
edgePitchHz: edgePitchHz.value,
|
||||
mimoApiKey: mimoApiKey.value,
|
||||
mimoBaseUrl: mimoBaseUrl.value,
|
||||
mimoModel: mimoModel.value,
|
||||
mimoVoice: mimoVoice.value,
|
||||
mimoVoiceDesignDesc: mimoVoiceDesignDesc.value,
|
||||
mimoStylePrompt: mimoStylePrompt.value,
|
||||
}))
|
||||
},
|
||||
)
|
||||
@@ -146,6 +176,12 @@ export function useVoiceSettings() {
|
||||
edgeVoice,
|
||||
edgeRate,
|
||||
edgePitchHz,
|
||||
mimoApiKey,
|
||||
mimoBaseUrl,
|
||||
mimoModel,
|
||||
mimoVoice,
|
||||
mimoVoiceDesignDesc,
|
||||
mimoStylePrompt,
|
||||
|
||||
setProvider(v: TtsProvider) { provider.value = v },
|
||||
setWebSpeechVoice(v: string) { webspeechVoice.value = v },
|
||||
@@ -159,6 +195,12 @@ export function useVoiceSettings() {
|
||||
setEdgeVoice(v: string) { edgeVoice.value = v },
|
||||
setEdgeRate(v: number) { edgeRate.value = v },
|
||||
setEdgePitchHz(v: number) { edgePitchHz.value = v },
|
||||
setMimoApiKey(v: string) { mimoApiKey.value = v },
|
||||
setMimoBaseUrl(v: string) { mimoBaseUrl.value = v },
|
||||
setMimoModel(v: string) { mimoModel.value = v },
|
||||
setMimoVoice(v: string) { mimoVoice.value = v },
|
||||
setMimoVoiceDesignDesc(v: string) { mimoVoiceDesignDesc.value = v },
|
||||
setMimoStylePrompt(v: string) { mimoStylePrompt.value = v },
|
||||
|
||||
reset() {
|
||||
provider.value = DEFAULT.provider
|
||||
@@ -173,6 +215,12 @@ export function useVoiceSettings() {
|
||||
edgeVoice.value = DEFAULT.edgeVoice
|
||||
edgeRate.value = DEFAULT.edgeRate
|
||||
edgePitchHz.value = DEFAULT.edgePitchHz
|
||||
mimoApiKey.value = DEFAULT.mimoApiKey
|
||||
mimoBaseUrl.value = DEFAULT.mimoBaseUrl
|
||||
mimoModel.value = DEFAULT.mimoModel
|
||||
mimoVoice.value = DEFAULT.mimoVoice
|
||||
mimoVoiceDesignDesc.value = DEFAULT.mimoVoiceDesignDesc
|
||||
mimoStylePrompt.value = DEFAULT.mimoStylePrompt
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user