feat(hermes): add tts voice controls

2026-07-16 01:42:19 +08:00 · 2026-05-27 04:02:36 +08:00
parent 68320496b3
commit c196cef026
8 changed files with 1149 additions and 1 deletions
--- a/scripts/dev-api.js
+++ b/scripts/dev-api.js
@@ -3330,6 +3330,8 @@ const HERMES_STT_PROVIDERS = new Set(['auto', 'local', 'groq', 'openai', 'mistra
 const HERMES_STT_LOCAL_MODELS = new Set(['tiny', 'base', 'small', 'medium', 'large-v3', 'turbo'])
 const HERMES_STT_OPENAI_MODELS = new Set(['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'])
 const HERMES_STT_MISTRAL_MODELS = new Set(['voxtral-mini-latest', 'voxtral-mini-2602'])
+const HERMES_TTS_PROVIDERS = new Set(['edge', 'elevenlabs', 'openai', 'xai', 'minimax', 'mistral', 'gemini', 'neutts', 'kittentts', 'piper']) // Allowlist for tts.provider; consulted by normalizeHermesTtsProvider.
+const HERMES_TTS_OPENAI_VOICES = new Set(['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer']) // Allowlist for tts.openai.voice; consulted by normalizeHermesTtsOpenaiVoice. NOTE(review): confirm this covers every voice the configured OpenAI TTS model accepts.
 const HERMES_AUXILIARY_PROVIDERS = new Set(['auto', 'openrouter', 'nous', 'gemini', 'ollama-cloud', 'codex', 'main'])
 const HERMES_APPROVAL_MODES = new Set(['manual', 'smart', 'off'])
 const HERMES_APPROVAL_CRON_MODES = new Set(['deny', 'approve'])
@@ -3506,6 +3508,28 @@ function normalizeHermesSttLanguage(value, strict = false) {
  return ''
 }

+function normalizeHermesTtsProvider(value, strict = false) {
+  const provider = String(value ?? '').trim().toLowerCase() || 'edge'
+  if (HERMES_TTS_PROVIDERS.has(provider)) return provider
+  if (strict) throw new Error('tts.provider 必须是 edge、elevenlabs、openai、xai、minimax、mistral、gemini、neutts、kittentts 或 piper')
+  return 'edge'
+}
+
+function normalizeHermesTtsOpenaiVoice(value, strict = false) {
+  const voice = String(value ?? '').trim().toLowerCase() || 'alloy'
+  if (HERMES_TTS_OPENAI_VOICES.has(voice)) return voice
+  if (strict) throw new Error('tts.openai.voice 必须是 alloy、echo、fable、onyx、nova 或 shimmer')
+  return 'alloy'
+}
+
+function normalizeHermesVoiceLanguage(value, strict = false, key = 'tts.xai.language') {
+  const language = String(value ?? '').trim()
+  if (!language) return 'en'
+  if (/^[a-z]{2,3}(-[A-Za-z0-9]+)?$/.test(language)) return language
+  if (strict) throw new Error(`${key} 必须是合法语言标签，例如 en、zh、pt-BR`)
+  return 'en'
+}
+
 function normalizeHermesApprovalMode(value, strict = false) {
  const mode = String(value ?? '').trim().toLowerCase() || 'manual'
  if (HERMES_APPROVAL_MODES.has(mode)) return mode
@@ -5479,6 +5503,92 @@ export function buildHermesSttConfigValues(config = {}) {
  }
 }

+export function buildHermesTtsVoiceConfigValues(config = {}) {
+  const root = config && typeof config === 'object' && !Array.isArray(config) ? config : {}
+  const tts = root.tts && typeof root.tts === 'object' && !Array.isArray(root.tts) ? root.tts : {}
+  const edge = tts.edge && typeof tts.edge === 'object' && !Array.isArray(tts.edge) ? tts.edge : {}
+  const openai = tts.openai && typeof tts.openai === 'object' && !Array.isArray(tts.openai) ? tts.openai : {}
+  const elevenlabs = tts.elevenlabs && typeof tts.elevenlabs === 'object' && !Array.isArray(tts.elevenlabs) ? tts.elevenlabs : {}
+  const xai = tts.xai && typeof tts.xai === 'object' && !Array.isArray(tts.xai) ? tts.xai : {}
+  const mistral = tts.mistral && typeof tts.mistral === 'object' && !Array.isArray(tts.mistral) ? tts.mistral : {}
+  const piper = tts.piper && typeof tts.piper === 'object' && !Array.isArray(tts.piper) ? tts.piper : {}
+  const voice = root.voice && typeof root.voice === 'object' && !Array.isArray(root.voice) ? root.voice : {}
+  return {
+    ttsProvider: normalizeHermesTtsProvider(tts.provider, false),
+    ttsEdgeVoice: typeof edge.voice === 'string' ? edge.voice.trim() : 'en-US-AriaNeural',
+    ttsOpenaiModel: typeof openai.model === 'string' && openai.model.trim() ? openai.model.trim() : 'gpt-4o-mini-tts',
+    ttsOpenaiVoice: normalizeHermesTtsOpenaiVoice(openai.voice, false),
+    ttsElevenlabsVoiceId: typeof elevenlabs.voice_id === 'string' ? elevenlabs.voice_id.trim() : 'pNInz6obpgDQGcFmaJgB',
+    ttsElevenlabsModelId: typeof elevenlabs.model_id === 'string' ? elevenlabs.model_id.trim() : 'eleven_multilingual_v2',
+    ttsXaiVoiceId: typeof xai.voice_id === 'string' && xai.voice_id.trim() ? xai.voice_id.trim() : 'eve',
+    ttsXaiLanguage: normalizeHermesVoiceLanguage(xai.language, false, 'tts.xai.language'),
+    ttsXaiSampleRate: parseHermesInteger(xai.sample_rate, 'tts.xai.sample_rate', 24000, 8000, 192000, false),
+    ttsXaiBitRate: parseHermesInteger(xai.bit_rate, 'tts.xai.bit_rate', 128000, 16000, 512000, false),
+    ttsMistralModel: typeof mistral.model === 'string' && mistral.model.trim() ? mistral.model.trim() : 'voxtral-mini-tts-2603',
+    ttsMistralVoiceId: typeof mistral.voice_id === 'string' ? mistral.voice_id.trim() : 'c69964a6-ab8b-4f8a-9465-ec0925096ec8',
+    ttsPiperVoice: typeof piper.voice === 'string' ? piper.voice.trim() : 'en_US-lessac-medium',
+    voiceRecordKey: typeof voice.record_key === 'string' ? voice.record_key.trim() : 'ctrl+b',
+    voiceMaxRecordingSeconds: parseHermesInteger(voice.max_recording_seconds, 'voice.max_recording_seconds', 120, 1, 3600, false),
+    voiceAutoTts: readHermesBool(voice.auto_tts, false),
+    voiceBeepEnabled: readHermesBool(voice.beep_enabled, true),
+    voiceSilenceThreshold: parseHermesInteger(voice.silence_threshold, 'voice.silence_threshold', 200, 0, 32767, false),
+    voiceSilenceDuration: parseHermesFloat(voice.silence_duration, 'voice.silence_duration', 3, 0.1, 60, false),
+  }
+}
+
+export function mergeHermesTtsVoiceConfig(config = {}, form = {}) {
+  const next = mergeConfigsPreservingFields({}, config && typeof config === 'object' && !Array.isArray(config) ? config : {})
+  const currentValues = buildHermesTtsVoiceConfigValues(next)
+  const tts = next.tts && typeof next.tts === 'object' && !Array.isArray(next.tts) ? mergeConfigsPreservingFields(next.tts, {}) : {}
+  const edge = tts.edge && typeof tts.edge === 'object' && !Array.isArray(tts.edge) ? mergeConfigsPreservingFields(tts.edge, {}) : {}
+  const openai = tts.openai && typeof tts.openai === 'object' && !Array.isArray(tts.openai) ? mergeConfigsPreservingFields(tts.openai, {}) : {}
+  const elevenlabs = tts.elevenlabs && typeof tts.elevenlabs === 'object' && !Array.isArray(tts.elevenlabs) ? mergeConfigsPreservingFields(tts.elevenlabs, {}) : {}
+  const xai = tts.xai && typeof tts.xai === 'object' && !Array.isArray(tts.xai) ? mergeConfigsPreservingFields(tts.xai, {}) : {}
+  const mistral = tts.mistral && typeof tts.mistral === 'object' && !Array.isArray(tts.mistral) ? mergeConfigsPreservingFields(tts.mistral, {}) : {}
+  const piper = tts.piper && typeof tts.piper === 'object' && !Array.isArray(tts.piper) ? mergeConfigsPreservingFields(tts.piper, {}) : {}
+  const voice = next.voice && typeof next.voice === 'object' && !Array.isArray(next.voice) ? mergeConfigsPreservingFields(next.voice, {}) : {}
+  tts.provider = normalizeHermesTtsProvider(Object.hasOwn(form, 'ttsProvider') ? form.ttsProvider : currentValues.ttsProvider, true)
+  const edgeVoice = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsEdgeVoice') ? form.ttsEdgeVoice : currentValues.ttsEdgeVoice, 'tts.edge.voice')
+  if (edgeVoice) edge.voice = edgeVoice
+  else delete edge.voice
+  openai.model = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsOpenaiModel') ? form.ttsOpenaiModel : currentValues.ttsOpenaiModel, 'tts.openai.model') || 'gpt-4o-mini-tts'
+  openai.voice = normalizeHermesTtsOpenaiVoice(Object.hasOwn(form, 'ttsOpenaiVoice') ? form.ttsOpenaiVoice : currentValues.ttsOpenaiVoice, true)
+  const elevenlabsVoiceId = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsElevenlabsVoiceId') ? form.ttsElevenlabsVoiceId : currentValues.ttsElevenlabsVoiceId, 'tts.elevenlabs.voice_id')
+  if (elevenlabsVoiceId) elevenlabs.voice_id = elevenlabsVoiceId
+  else delete elevenlabs.voice_id
+  const elevenlabsModelId = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsElevenlabsModelId') ? form.ttsElevenlabsModelId : currentValues.ttsElevenlabsModelId, 'tts.elevenlabs.model_id')
+  if (elevenlabsModelId) elevenlabs.model_id = elevenlabsModelId
+  else delete elevenlabs.model_id
+  xai.voice_id = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsXaiVoiceId') ? form.ttsXaiVoiceId : currentValues.ttsXaiVoiceId, 'tts.xai.voice_id') || 'eve'
+  xai.language = normalizeHermesVoiceLanguage(Object.hasOwn(form, 'ttsXaiLanguage') ? form.ttsXaiLanguage : currentValues.ttsXaiLanguage, true, 'tts.xai.language')
+  xai.sample_rate = parseHermesInteger(Object.hasOwn(form, 'ttsXaiSampleRate') ? form.ttsXaiSampleRate : currentValues.ttsXaiSampleRate, 'tts.xai.sample_rate', 24000, 8000, 192000, true)
+  xai.bit_rate = parseHermesInteger(Object.hasOwn(form, 'ttsXaiBitRate') ? form.ttsXaiBitRate : currentValues.ttsXaiBitRate, 'tts.xai.bit_rate', 128000, 16000, 512000, true)
+  mistral.model = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsMistralModel') ? form.ttsMistralModel : currentValues.ttsMistralModel, 'tts.mistral.model') || 'voxtral-mini-tts-2603'
+  const mistralVoiceId = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsMistralVoiceId') ? form.ttsMistralVoiceId : currentValues.ttsMistralVoiceId, 'tts.mistral.voice_id')
+  if (mistralVoiceId) mistral.voice_id = mistralVoiceId
+  else delete mistral.voice_id
+  const piperVoice = normalizeHermesOptionalString(Object.hasOwn(form, 'ttsPiperVoice') ? form.ttsPiperVoice : currentValues.ttsPiperVoice, 'tts.piper.voice')
+  if (piperVoice) piper.voice = piperVoice
+  else delete piper.voice
+  const recordKey = normalizeHermesOptionalString(Object.hasOwn(form, 'voiceRecordKey') ? form.voiceRecordKey : currentValues.voiceRecordKey, 'voice.record_key')
+  if (recordKey) voice.record_key = recordKey
+  else delete voice.record_key
+  voice.max_recording_seconds = parseHermesInteger(Object.hasOwn(form, 'voiceMaxRecordingSeconds') ? form.voiceMaxRecordingSeconds : currentValues.voiceMaxRecordingSeconds, 'voice.max_recording_seconds', 120, 1, 3600, true)
+  voice.auto_tts = formHermesBool(form, 'voiceAutoTts', currentValues.voiceAutoTts)
+  voice.beep_enabled = formHermesBool(form, 'voiceBeepEnabled', currentValues.voiceBeepEnabled)
+  voice.silence_threshold = parseHermesInteger(Object.hasOwn(form, 'voiceSilenceThreshold') ? form.voiceSilenceThreshold : currentValues.voiceSilenceThreshold, 'voice.silence_threshold', 200, 0, 32767, true)
+  voice.silence_duration = parseHermesFloat(Object.hasOwn(form, 'voiceSilenceDuration') ? form.voiceSilenceDuration : currentValues.voiceSilenceDuration, 'voice.silence_duration', 3, 0.1, 60, true)
+  tts.edge = edge
+  tts.openai = openai
+  tts.elevenlabs = elevenlabs
+  tts.xai = xai
+  tts.mistral = mistral
+  tts.piper = piper
+  next.tts = tts
+  next.voice = voice
+  return next
+}
+
 export function mergeHermesSttConfig(config = {}, form = {}) {
  const next = mergeConfigsPreservingFields({}, config && typeof config === 'object' && !Array.isArray(config) ? config : {})
  const currentValues = buildHermesSttConfigValues(next)
@@ -12668,6 +12778,27 @@ const handlers = {
    }
  },

+  hermes_tts_voice_config_read() {
+    const { configPath, exists, config } = readHermesConfigYamlObject()
+    return {
+      exists,
+      configPath,
+      values: buildHermesTtsVoiceConfigValues(config),
+    }
+  },
+
+  hermes_tts_voice_config_save({ form } = {}) {
+    const { configPath, config } = readHermesConfigYamlObject()
+    const next = mergeHermesTtsVoiceConfig(config, form || {})
+    const backup = writeHermesConfigYamlObject(configPath, next)
+    return {
+      ok: true,
+      configPath,
+      backup,
+      values: buildHermesTtsVoiceConfigValues(next),
+    }
+  },
+
  hermes_terminal_config_read() {
    const { configPath, exists, config } = readHermesConfigYamlObject()
    return {