feat: add Edge TTS rate/pitch sliders to voice settings (#629)

Add speed (rate) and pitch controls for the Edge TTS provider:
- Frontend: speedToEdgeRate()/hzToEdgePitch() helpers + UI sliders
- Backend: rate/pitch passthrough in OpenaiTtsRequest and the proxy controller
- i18n: add edgeRate/edgePitch keys (and their hints) across all 8 languages
- Rate: 0.5x to 2.0x slider; Pitch: -20 Hz to +20 Hz slider
2026-05-11 21:56:11 +08:00
parent 5e608ea338
commit a68b9bf01f
16 changed files with 142 additions and 4 deletions
@@ -1,6 +1,8 @@
 export interface TtsOptions {
  text: string
  lang?: string
+  rate?: string   // Edge TTS rate, a signed percent string: "+NN%" or "-NN%" (e.g. "+20%")
+  pitch?: string  // Edge TTS pitch, a signed Hz offset string: "+NNHz" or "-NNHz" (e.g. "-8Hz")
 }

 export async function generateSpeech(opts: TtsOptions): Promise<{ audio: Blob; engine: string }> {
@@ -17,6 +17,7 @@ import {
 } from "./highlight";
 import { useGlobalSpeech } from "@/composables/useSpeech";
 import { useVoiceSettings } from "@/composables/useVoiceSettings";
+import { speedToEdgeRate, hzToEdgePitch } from "@/utils/ttsHelpers";

 const TOOL_PAYLOAD_DISPLAY_LIMIT = 2000;

@@ -420,6 +421,8 @@ function handleSpeechToggle() {
    speech.openaiToggle(props.message.id, content, {
      baseUrl: apiUrl,
      voice: voiceSettings.edgeVoice.value,
+      rate: speedToEdgeRate(voiceSettings.edgeRate.value),
+      pitch: hzToEdgePitch(voiceSettings.edgePitchHz.value),
    })
    return
  }
@@ -466,6 +469,8 @@ onMounted(() => {
        speech.openaiPlay(props.message.id, content, {
          baseUrl: '/api/tts/proxy',
          voice: voiceSettings.edgeVoice.value,
+          rate: speedToEdgeRate(voiceSettings.edgeRate.value),
+          pitch: hzToEdgePitch(voiceSettings.edgePitchHz.value),
        })
      } else if (voiceSettings.provider.value === 'webspeech') {
        const text = speech.extractReadableText(content)
@@ -1,9 +1,10 @@
 <script setup lang="ts">
 import { ref, onMounted } from 'vue'
-import { NSelect, NInput, NButton } from 'naive-ui'
+import { NSelect, NInput, NButton, NSlider } from 'naive-ui'
 import { useI18n } from 'vue-i18n'
 import { useVoiceSettings } from '@/composables/useVoiceSettings'
 import { useSpeech } from '@/composables/useSpeech'
+import { speedToEdgeRate, hzToEdgePitch } from '@/utils/ttsHelpers'
 import SettingRow from './SettingRow.vue'

 const { t } = useI18n()
@@ -103,6 +104,8 @@ async function handleTest() {
      await speech.openaiPlay('__test__', text, {
        baseUrl: '/api/tts/proxy',
        voice: vs.edgeVoice.value,
+        rate: speedToEdgeRate(vs.edgeRate.value),
+        pitch: hzToEdgePitch(vs.edgePitchHz.value),
      })
    }
  } catch (err) {
@@ -267,6 +270,40 @@ async function handleTest() {
        />
      </SettingRow>

+      <SettingRow
+        :label="t('settings.voice.edgeRate')"
+        :hint="t('settings.voice.edgeRateHint')"
+      >
+        <div class="slider-row">
+          <NSlider
+            :value="vs.edgeRate.value"
+            :min="0.5"
+            :max="2.0"
+            :step="0.05"
+            style="width: 200px"
+            @update:value="vs.setEdgeRate"
+          />
+          <span class="slider-value">{{ vs.edgeRate.value.toFixed(2) }}x ({{ speedToEdgeRate(vs.edgeRate.value) }})</span>
+        </div>
+      </SettingRow>
+
+      <SettingRow
+        :label="t('settings.voice.edgePitch')"
+        :hint="t('settings.voice.edgePitchHint')"
+      >
+        <div class="slider-row">
+          <NSlider
+            :value="vs.edgePitchHz.value"
+            :min="-20"
+            :max="20"
+            :step="1"
+            style="width: 200px"
+            @update:value="vs.setEdgePitchHz"
+          />
+          <span class="slider-value">{{ vs.edgePitchHz.value > 0 ? '+' : '' }}{{ vs.edgePitchHz.value }} Hz ({{ hzToEdgePitch(vs.edgePitchHz.value) }})</span>
+        </div>
+      </SettingRow>
+
    </template>

    <!-- ─── Test / Audition ─── -->
@@ -324,4 +361,17 @@ async function handleTest() {
    align-items: center;
  }
 }
+
+.slider-row {
+  display: flex;
+  align-items: center;
+  gap: 12px;
+}
+
+.slider-value {
+  font-size: 12px;
+  color: #999;
+  white-space: nowrap;
+  min-width: 120px;
+}
 </style>
@@ -11,6 +11,8 @@ export interface OpenaiTtsOptions {
  apiKey?: string
  model?: string
  voice?: string
+  rate?: string   // Edge TTS rate format, e.g. "+20%"
+  pitch?: string  // Edge TTS pitch format, e.g. "-8Hz"
 }

 export interface SpeechState {
@@ -266,6 +268,9 @@ export function useSpeech() {
      input: text,
      voice: opts.voice || 'alloy',
    }
+    // Edge TTS proxy 支持 rate/pitch 参数
+    if (opts.rate) body.rate = opts.rate
+    if (opts.pitch) body.pitch = opts.pitch

    const headers: Record<string, string> = {
      'Content-Type': 'application/json',
@@ -21,6 +21,8 @@ export interface VoiceSettingsData {
  // Edge TTS
  edgeUrl: string
  edgeVoice: string
+  edgeRate: number    // 语速倍率 0.5~2.0，1.0 = 正常
+  edgePitchHz: number // 音调偏移 Hz，-20~20，0 = 正常
 }

 const STORAGE_KEY = 'hermes-tts-settings-v2'
@@ -63,6 +65,8 @@ const DEFAULT: VoiceSettingsData = {

  edgeUrl: '',
  edgeVoice: 'zh-CN-XiaoxiaoNeural',
+  edgeRate: 1.0,
+  edgePitchHz: 0,
 }

 function sanitize(data: VoiceSettingsData): VoiceSettingsData {
@@ -103,11 +107,13 @@ const customApiKey = ref<string>(load().customApiKey)
 // Edge TTS
 const edgeUrl = ref<string>(load().edgeUrl)
 const edgeVoice = ref<string>(load().edgeVoice)
+const edgeRate = ref<number>(load().edgeRate)
+const edgePitchHz = ref<number>(load().edgePitchHz)

 // Auto-persist on change
 watch(
  [provider, webspeechVoice, openaiApiKey, openaiBaseUrl, openaiModel, openaiVoice,
-   customUrl, customApiKey, edgeUrl, edgeVoice],
+   customUrl, customApiKey, edgeUrl, edgeVoice, edgeRate, edgePitchHz],
  () => {
    localStorage.setItem(STORAGE_KEY, JSON.stringify({
      provider: provider.value,
@@ -120,6 +126,8 @@ watch(
      customApiKey: customApiKey.value,
      edgeUrl: edgeUrl.value,
      edgeVoice: edgeVoice.value,
+      edgeRate: edgeRate.value,
+      edgePitchHz: edgePitchHz.value,
    }))
  },
 )
@@ -136,6 +144,8 @@ export function useVoiceSettings() {
    customApiKey,
    edgeUrl,
    edgeVoice,
+    edgeRate,
+    edgePitchHz,

    setProvider(v: TtsProvider) { provider.value = v },
    setWebSpeechVoice(v: string) { webspeechVoice.value = v },
@@ -147,6 +157,8 @@ export function useVoiceSettings() {
    setCustomApiKey(v: string) { customApiKey.value = v },
    setEdgeUrl(v: string) { edgeUrl.value = v },
    setEdgeVoice(v: string) { edgeVoice.value = v },
+    setEdgeRate(v: number) { edgeRate.value = v },
+    setEdgePitchHz(v: number) { edgePitchHz.value = v },

    reset() {
      provider.value = DEFAULT.provider
@@ -159,6 +171,8 @@ export function useVoiceSettings() {
      customApiKey.value = DEFAULT.customApiKey
      edgeUrl.value = DEFAULT.edgeUrl
      edgeVoice.value = DEFAULT.edgeVoice
+      edgeRate.value = DEFAULT.edgeRate
+      edgePitchHz.value = DEFAULT.edgePitchHz
    },
  }
 }
@@ -630,6 +630,11 @@ jobTriggered: 'Job ausgelost',
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: 'Stimme',
      edgeVoiceHint: 'Waehlen Sie eine Stimme fuer die Sprachsynthese',
+      edgeRate: 'Geschwindigkeit',
+      edgeRateHint: 'Sprachgeschwindigkeit anpassen (0,5x ~ 2,0x)',
+      edgePitch: 'Tonhöhe',
+      edgePitchHint: 'Tonhöhe anpassen (-20 ~ +20 Hz)',
+

      // Test
      testTitle: 'Sprachtest',
@@ -793,6 +793,11 @@ export default {
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: 'Voice',
      edgeVoiceHint: 'Select a voice for speech synthesis',
+      edgeRate: 'Speed',
+      edgeRateHint: 'Adjust speech speed (0.5x ~ 2.0x)',
+      edgePitch: 'Pitch',
+      edgePitchHint: 'Adjust speech pitch (-20 ~ +20 Hz)',
+

      // Test
      testTitle: 'Test Voice',
@@ -630,6 +630,11 @@ jobTriggered: 'Job ejecutado',
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: 'Voz',
      edgeVoiceHint: 'Seleccione una voz para la sintesis de voz',
+      edgeRate: 'Velocidad',
+      edgeRateHint: 'Ajustar velocidad del habla (0.5x ~ 2.0x)',
+      edgePitch: 'Tono',
+      edgePitchHint: 'Ajustar tono del habla (-20 ~ +20 Hz)',
+

      // Test
      testTitle: 'Prueba de voz',
@@ -630,6 +630,11 @@ jobTriggered: 'Job declenche',
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: 'Voix',
      edgeVoiceHint: 'Choisir une voix pour la synthese vocale',
+      edgeRate: 'Vitesse',
+      edgeRateHint: "Ajuster la vitesse de la voix (0.5x ~ 2.0x)",
+      edgePitch: 'Hauteur',
+      edgePitchHint: "Ajuster la hauteur de la voix (-20 ~ +20 Hz)",
+

      // Test
      testTitle: 'Test vocal',
@@ -630,6 +630,11 @@ export default {
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: '音色',
      edgeVoiceHint: '音声合成に使用する音色を選択',
+      edgeRate: '速度',
+      edgeRateHint: '音声の速度を調整（0.5～2.0倍）',
+      edgePitch: 'ピッチ',
+      edgePitchHint: '音声のピッチを調整（-20～+20 Hz）',
+

      // Test
      testTitle: '音声テスト',
@@ -630,6 +630,11 @@ export default {
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: '음색',
      edgeVoiceHint: '음성 합성에 사용할 음색 선택',
+      edgeRate: '속도',
+      edgeRateHint: '음성 속도 조절 (0.5~2.0배)',
+      edgePitch: '음높이',
+      edgePitchHint: '음성 음높이 조절 (-20~+20 Hz)',
+

      // Test
      testTitle: '음성 테스트',
@@ -630,6 +630,11 @@ jobTriggered: 'Job acionado',
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: 'Voz',
      edgeVoiceHint: 'Selecione uma voz para sintese de fala',
+      edgeRate: 'Velocidade',
+      edgeRateHint: 'Ajustar velocidade da fala (0.5x ~ 2.0x)',
+      edgePitch: 'Tom',
+      edgePitchHint: 'Ajustar tom da fala (-20 ~ +20 Hz)',
+

      // Test
      testTitle: 'Teste de voz',
@@ -785,6 +785,11 @@ export default {
      edgeUrlPlaceholder: 'http://127.0.0.1:9882',
      edgeVoice: '音色',
      edgeVoiceHint: '选择用于语音合成的音色',
+      edgeRate: '语速',
+      edgeRateHint: '调整语音速度（0.5～2.0 倍）',
+      edgePitch: '音调',
+      edgePitchHint: '调整语音音调（-20～+20 Hz）',
+

      // 试听
      testTitle: '试听测试',
@@ -0,0 +1,16 @@
+/**
+ * Converts a speed multiplier into an Edge TTS rate string: a signed, integer-rounded percent offset from 1.0.
+ * 1.0 → "+0%" (zero takes the "+" branch), 1.2 → "+20%", 0.5 → "-50%"
+ */
+export function speedToEdgeRate(speed: number): string {
+  const percent = Math.round((speed - 1) * 100)
+  return percent >= 0 ? `+${percent}%` : `${percent}%`
+}
+
+/**
+ * Converts a Hz offset into an Edge TTS pitch string; the number is interpolated as-is (no rounding), with "+" for values >= 0.
+ * 0 → "+0Hz", 12 → "+12Hz", -8 → "-8Hz"
+ */
+export function hzToEdgePitch(hz: number): string {
+  return hz >= 0 ? `+${hz}Hz` : `${hz}Hz`
+}
@@ -38,6 +38,8 @@ export async function openaiProxy(ctx: Context) {
    voice?: string
    speed?: number
    model?: string
+    rate?: string
+    pitch?: string
  }

  if (!body.input || typeof body.input !== 'string') {
@@ -57,6 +59,8 @@ export async function openaiProxy(ctx: Context) {
    voice: body.voice,
    speed: body.speed,
    model: body.model,
+    rate: body.rate,
+    pitch: body.pitch,
  })

  ctx.set('Content-Type', 'audio/mpeg')
@@ -64,6 +64,8 @@ export interface OpenaiTtsRequest {
  input: string
  voice?: string
  speed?: number
+  rate?: string   // Edge TTS rate format, e.g. "+20%". Takes priority over speed.
+  pitch?: string  // Edge TTS pitch format, e.g. "-8Hz"
 }

 export async function openaiCompatibleTts(
@@ -72,7 +74,7 @@ export async function openaiCompatibleTts(
  return textToSpeech({
    text: body.input,
    voice: body.voice || FIXED_VOICE,
-    rate: body.speed ? speedToEdgeRate(body.speed) : FIXED_RATE,
-    pitch: FIXED_PITCH,
+    rate: body.rate || (body.speed ? speedToEdgeRate(body.speed) : FIXED_RATE),
+    pitch: body.pitch || FIXED_PITCH,
  })
 }