/** Provider ids accepted for `tts.provider`. */
const HERMES_TTS_PROVIDERS = new Set(['edge', 'elevenlabs', 'openai', 'xai', 'minimax', 'mistral', 'gemini', 'neutts', 'kittentts', 'piper'])
/** Voice names accepted for `tts.openai.voice`. */
const HERMES_TTS_OPENAI_VOICES = new Set(['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'])

/**
 * Normalizes a `tts.provider` value.
 *
 * The input is stringified, trimmed and lower-cased; a blank value becomes `edge`.
 *
 * @param {unknown} value Raw provider value (config field or form input).
 * @param {boolean} [strict=false] When true an unknown provider throws; otherwise it falls back to `edge`.
 * @returns {string} A member of `HERMES_TTS_PROVIDERS`.
 * @throws {Error} In strict mode when the provider is not recognized.
 */
function normalizeHermesTtsProvider(value, strict = false) {
  const provider = String(value ?? '').trim().toLowerCase() || 'edge'
  if (HERMES_TTS_PROVIDERS.has(provider)) return provider
  if (strict) throw new Error('tts.provider 必须是 edge、elevenlabs、openai、xai、minimax、mistral、gemini、neutts、kittentts 或 piper')
  return 'edge'
}

/**
 * Normalizes a `tts.openai.voice` value.
 *
 * The input is stringified, trimmed and lower-cased; a blank value becomes `alloy`.
 *
 * @param {unknown} value Raw voice value (config field or form input).
 * @param {boolean} [strict=false] When true an unknown voice throws; otherwise it falls back to `alloy`.
 * @returns {string} A member of `HERMES_TTS_OPENAI_VOICES`.
 * @throws {Error} In strict mode when the voice is not recognized.
 */
function normalizeHermesTtsOpenaiVoice(value, strict = false) {
  const voice = String(value ?? '').trim().toLowerCase() || 'alloy'
  if (HERMES_TTS_OPENAI_VOICES.has(voice)) return voice
  if (strict) throw new Error('tts.openai.voice 必须是 alloy、echo、fable、onyx、nova 或 shimmer')
  return 'alloy'
}

/**
 * Normalizes a language tag such as `en`, `zh` or `pt-BR`.
 *
 * Accepted shape: a 2-3 letter lower-case primary subtag followed by any number of
 * `-`-separated, non-empty alphanumeric subtags. This mirrors the Rust backend's
 * `normalize_hermes_voice_language`, which also accepts several subtags
 * (for example `zh-Hant-TW`), so the dev API and the Tauri command agree.
 *
 * @param {unknown} value Raw language value; blank input yields `en`.
 * @param {boolean} [strict=false] When true an invalid tag throws; otherwise it falls back to `en`.
 * @param {string} [key='tts.xai.language'] Config path used as the prefix of the error message.
 * @returns {string} The trimmed tag (case preserved), or `en`.
 * @throws {Error} In strict mode when the tag does not match the accepted shape.
 */
function normalizeHermesVoiceLanguage(value, strict = false, key = 'tts.xai.language') {
  const language = String(value ?? '').trim()
  if (!language) return 'en'
  // `*` instead of `?`: allow any number of trailing subtags, like the Rust implementation.
  if (/^[a-z]{2,3}(?:-[A-Za-z0-9]+)*$/.test(language)) return language
  if (strict) throw new Error(`${key} 必须是合法语言标签,例如 en、zh、pt-BR`)
  return 'en'
}
/**
 * Flattens the `tts` and `voice` sections of a Hermes config object into the
 * value map used by the TTS / voice settings form.
 *
 * Sections that are missing, or that are not plain (non-array) objects, are read
 * as empty. Every normalizer / parser is invoked with `strict = false`.
 *
 * @param {object} [config] Parsed Hermes config.
 * @returns {object} Flat form values keyed by camelCase field name.
 */
export function buildHermesTtsVoiceConfigValues(config = {}) {
  // Anything that is not a plain (non-array) object is replaced by an empty section.
  const section = (candidate) => (candidate && typeof candidate === 'object' && !Array.isArray(candidate) ? candidate : {})
  // Trimmed string as stored (possibly empty); the default applies only to non-string values.
  const trimmedOr = (raw, fallback) => (typeof raw === 'string' ? raw.trim() : fallback)
  // Trimmed string where a blank string is also replaced by the default.
  const filledOr = (raw, fallback) => (typeof raw === 'string' && raw.trim() ? raw.trim() : fallback)

  const root = section(config)
  const tts = section(root.tts)
  const edge = section(tts.edge)
  const openai = section(tts.openai)
  const elevenlabs = section(tts.elevenlabs)
  const xai = section(tts.xai)
  const mistral = section(tts.mistral)
  const piper = section(tts.piper)
  const voice = section(root.voice)

  return {
    ttsProvider: normalizeHermesTtsProvider(tts.provider, false),
    ttsEdgeVoice: trimmedOr(edge.voice, 'en-US-AriaNeural'),
    ttsOpenaiModel: filledOr(openai.model, 'gpt-4o-mini-tts'),
    ttsOpenaiVoice: normalizeHermesTtsOpenaiVoice(openai.voice, false),
    ttsElevenlabsVoiceId: trimmedOr(elevenlabs.voice_id, 'pNInz6obpgDQGcFmaJgB'),
    ttsElevenlabsModelId: trimmedOr(elevenlabs.model_id, 'eleven_multilingual_v2'),
    ttsXaiVoiceId: filledOr(xai.voice_id, 'eve'),
    ttsXaiLanguage: normalizeHermesVoiceLanguage(xai.language, false, 'tts.xai.language'),
    ttsXaiSampleRate: parseHermesInteger(xai.sample_rate, 'tts.xai.sample_rate', 24000, 8000, 192000, false),
    ttsXaiBitRate: parseHermesInteger(xai.bit_rate, 'tts.xai.bit_rate', 128000, 16000, 512000, false),
    ttsMistralModel: filledOr(mistral.model, 'voxtral-mini-tts-2603'),
    ttsMistralVoiceId: trimmedOr(mistral.voice_id, 'c69964a6-ab8b-4f8a-9465-ec0925096ec8'),
    ttsPiperVoice: trimmedOr(piper.voice, 'en_US-lessac-medium'),
    voiceRecordKey: trimmedOr(voice.record_key, 'ctrl+b'),
    voiceMaxRecordingSeconds: parseHermesInteger(voice.max_recording_seconds, 'voice.max_recording_seconds', 120, 1, 3600, false),
    voiceAutoTts: readHermesBool(voice.auto_tts, false),
    voiceBeepEnabled: readHermesBool(voice.beep_enabled, true),
    voiceSilenceThreshold: parseHermesInteger(voice.silence_threshold, 'voice.silence_threshold', 200, 0, 32767, false),
    voiceSilenceDuration: parseHermesFloat(voice.silence_duration, 'voice.silence_duration', 3, 0.1, 60, false),
  }
}

/**
 * Applies TTS / voice form values onto a copy of a Hermes config object.
 *
 * Fields absent from `form` keep the value currently derived from `config`
 * (via `buildHermesTtsVoiceConfigValues`). Normalizers and parsers are called
 * with `strict = true`. Keys in the touched sections that this function does not
 * manage are left in place; optional string overrides that end up empty are
 * deleted from their section.
 *
 * @param {object} [config] Current Hermes config; it is copied through `mergeConfigsPreservingFields`.
 * @param {object} [form] Submitted form values keyed by camelCase field name.
 * @returns {object} The merged config object.
 */
export function mergeHermesTtsVoiceConfig(config = {}, form = {}) {
  const isSection = (candidate) => candidate && typeof candidate === 'object' && !Array.isArray(candidate)
  // Copy of an existing section, or a fresh object when the section is absent / malformed.
  const cloneSection = (candidate) => (isSection(candidate) ? mergeConfigsPreservingFields(candidate, {}) : {})

  const next = mergeConfigsPreservingFields({}, isSection(config) ? config : {})
  const currentValues = buildHermesTtsVoiceConfigValues(next)

  // A submitted field wins (even when blank); otherwise the current value is kept.
  const submitted = (field) => (Object.hasOwn(form, field) ? form[field] : currentValues[field])
  // Optional overrides: store a non-empty string, remove the key otherwise.
  const setOrDrop = (target, property, text) => {
    if (text) target[property] = text
    else delete target[property]
  }

  const tts = cloneSection(next.tts)
  const edge = cloneSection(tts.edge)
  const openai = cloneSection(tts.openai)
  const elevenlabs = cloneSection(tts.elevenlabs)
  const xai = cloneSection(tts.xai)
  const mistral = cloneSection(tts.mistral)
  const piper = cloneSection(tts.piper)
  const voice = cloneSection(next.voice)

  tts.provider = normalizeHermesTtsProvider(submitted('ttsProvider'), true)

  setOrDrop(edge, 'voice', normalizeHermesOptionalString(submitted('ttsEdgeVoice'), 'tts.edge.voice'))

  openai.model = normalizeHermesOptionalString(submitted('ttsOpenaiModel'), 'tts.openai.model') || 'gpt-4o-mini-tts'
  openai.voice = normalizeHermesTtsOpenaiVoice(submitted('ttsOpenaiVoice'), true)

  setOrDrop(elevenlabs, 'voice_id', normalizeHermesOptionalString(submitted('ttsElevenlabsVoiceId'), 'tts.elevenlabs.voice_id'))
  setOrDrop(elevenlabs, 'model_id', normalizeHermesOptionalString(submitted('ttsElevenlabsModelId'), 'tts.elevenlabs.model_id'))

  xai.voice_id = normalizeHermesOptionalString(submitted('ttsXaiVoiceId'), 'tts.xai.voice_id') || 'eve'
  xai.language = normalizeHermesVoiceLanguage(submitted('ttsXaiLanguage'), true, 'tts.xai.language')
  xai.sample_rate = parseHermesInteger(submitted('ttsXaiSampleRate'), 'tts.xai.sample_rate', 24000, 8000, 192000, true)
  xai.bit_rate = parseHermesInteger(submitted('ttsXaiBitRate'), 'tts.xai.bit_rate', 128000, 16000, 512000, true)

  mistral.model = normalizeHermesOptionalString(submitted('ttsMistralModel'), 'tts.mistral.model') || 'voxtral-mini-tts-2603'
  setOrDrop(mistral, 'voice_id', normalizeHermesOptionalString(submitted('ttsMistralVoiceId'), 'tts.mistral.voice_id'))

  setOrDrop(piper, 'voice', normalizeHermesOptionalString(submitted('ttsPiperVoice'), 'tts.piper.voice'))

  setOrDrop(voice, 'record_key', normalizeHermesOptionalString(submitted('voiceRecordKey'), 'voice.record_key'))
  voice.max_recording_seconds = parseHermesInteger(submitted('voiceMaxRecordingSeconds'), 'voice.max_recording_seconds', 120, 1, 3600, true)
  voice.auto_tts = formHermesBool(form, 'voiceAutoTts', currentValues.voiceAutoTts)
  voice.beep_enabled = formHermesBool(form, 'voiceBeepEnabled', currentValues.voiceBeepEnabled)
  voice.silence_threshold = parseHermesInteger(submitted('voiceSilenceThreshold'), 'voice.silence_threshold', 200, 0, 32767, true)
  voice.silence_duration = parseHermesFloat(submitted('voiceSilenceDuration'), 'voice.silence_duration', 3, 0.1, 60, true)

  // Re-attach the edited copies; assignment order fixes the key order of newly created sections.
  Object.assign(tts, { edge, openai, elevenlabs, xai, mistral, piper })
  next.tts = tts
  next.voice = voice
  return next
}
config : {}) const currentValues = buildHermesSttConfigValues(next) @@ -12668,6 +12778,27 @@ const handlers = { } }, + hermes_tts_voice_config_read() { + const { configPath, exists, config } = readHermesConfigYamlObject() + return { + exists, + configPath, + values: buildHermesTtsVoiceConfigValues(config), + } + }, + + hermes_tts_voice_config_save({ form } = {}) { + const { configPath, config } = readHermesConfigYamlObject() + const next = mergeHermesTtsVoiceConfig(config, form || {}) + const backup = writeHermesConfigYamlObject(configPath, next) + return { + ok: true, + configPath, + backup, + values: buildHermesTtsVoiceConfigValues(next), + } + }, + hermes_terminal_config_read() { const { configPath, exists, config } = readHermesConfigYamlObject() return { diff --git a/src-tauri/src/commands/hermes.rs b/src-tauri/src/commands/hermes.rs index 03d05ae..77eaa5a 100644 --- a/src-tauri/src/commands/hermes.rs +++ b/src-tauri/src/commands/hermes.rs @@ -7228,6 +7228,85 @@ fn normalize_hermes_stt_language(value: Option, strict: bool) -> Result< } } +fn normalize_hermes_tts_provider(value: Option, strict: bool) -> Result { + let provider = value.unwrap_or_default().trim().to_ascii_lowercase(); + let provider = if provider.is_empty() { + "edge".to_string() + } else { + provider + }; + if matches!( + provider.as_str(), + "edge" + | "elevenlabs" + | "openai" + | "xai" + | "minimax" + | "mistral" + | "gemini" + | "neutts" + | "kittentts" + | "piper" + ) { + return Ok(provider); + } + if strict { + Err("tts.provider 必须是 edge、elevenlabs、openai、xai、minimax、mistral、gemini、neutts、kittentts 或 piper".to_string()) + } else { + Ok("edge".to_string()) + } +} + +fn normalize_hermes_tts_openai_voice( + value: Option, + strict: bool, +) -> Result { + let voice = value.unwrap_or_default().trim().to_ascii_lowercase(); + let voice = if voice.is_empty() { + "alloy".to_string() + } else { + voice + }; + if matches!( + voice.as_str(), + "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer" + 
/// Normalizes a `tts.provider` value.
///
/// The value is trimmed and ASCII-lower-cased; a missing or blank value becomes `edge`.
/// Returns `Err` with a user-facing message only when `strict` is set and the provider
/// is unknown; in lenient mode an unknown provider falls back to `edge`.
fn normalize_hermes_tts_provider(value: Option<String>, strict: bool) -> Result<String, String> {
    let provider = value.unwrap_or_default().trim().to_ascii_lowercase();
    let provider = if provider.is_empty() {
        "edge".to_string()
    } else {
        provider
    };
    if matches!(
        provider.as_str(),
        "edge"
            | "elevenlabs"
            | "openai"
            | "xai"
            | "minimax"
            | "mistral"
            | "gemini"
            | "neutts"
            | "kittentts"
            | "piper"
    ) {
        return Ok(provider);
    }
    if strict {
        Err("tts.provider 必须是 edge、elevenlabs、openai、xai、minimax、mistral、gemini、neutts、kittentts 或 piper".to_string())
    } else {
        Ok("edge".to_string())
    }
}

/// Normalizes a `tts.openai.voice` value.
///
/// The value is trimmed and ASCII-lower-cased; a missing or blank value becomes `alloy`.
/// Returns `Err` only when `strict` is set and the voice is unknown; in lenient mode
/// an unknown voice falls back to `alloy`.
fn normalize_hermes_tts_openai_voice(
    value: Option<String>,
    strict: bool,
) -> Result<String, String> {
    let voice = value.unwrap_or_default().trim().to_ascii_lowercase();
    let voice = if voice.is_empty() {
        "alloy".to_string()
    } else {
        voice
    };
    if matches!(
        voice.as_str(),
        "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer"
    ) {
        return Ok(voice);
    }
    if strict {
        Err("tts.openai.voice 必须是 alloy、echo、fable、onyx、nova 或 shimmer".to_string())
    } else {
        Ok("alloy".to_string())
    }
}

/// Normalizes a language tag such as `en`, `zh` or `pt-BR`.
///
/// A tag is accepted when its first `-`-separated part is 2-3 ASCII lower-case letters
/// and every following part is non-empty and ASCII-alphanumeric. A missing or blank value
/// yields `en`. An invalid tag yields `Err` (prefixed with `key`) when `strict` is set and
/// `en` otherwise. The accepted tag is returned trimmed with its case preserved.
fn normalize_hermes_voice_language(
    value: Option<String>,
    strict: bool,
    key: &str,
) -> Result<String, String> {
    let language = value.unwrap_or_default().trim().to_string();
    if language.is_empty() {
        return Ok("en".to_string());
    }
    let mut parts = language.split('-');
    let Some(first) = parts.next() else {
        return Ok("en".to_string());
    };
    // `len()` counts bytes, which equals the character count once all chars are ASCII.
    let first_valid =
        (2..=3).contains(&first.len()) && first.chars().all(|ch| ch.is_ascii_lowercase());
    let rest_valid =
        parts.all(|part| !part.is_empty() && part.chars().all(|ch| ch.is_ascii_alphanumeric()));
    if first_valid && rest_valid {
        return Ok(language);
    }
    if strict {
        Err(format!("{key} 必须是合法语言标签,例如 en、zh、pt-BR"))
    } else {
        Ok("en".to_string())
    }
}
value.trim().to_string()) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| fallback.to_string()) + }; + serde_json::json!({ + "ttsProvider": normalize_hermes_tts_provider(tts.and_then(|map| yaml_string_field(map, "provider")), false).unwrap_or_else(|_| "edge".to_string()), + "ttsEdgeVoice": tts_string(edge, "voice", "en-US-AriaNeural"), + "ttsOpenaiModel": tts_string(openai, "model", "gpt-4o-mini-tts"), + "ttsOpenaiVoice": normalize_hermes_tts_openai_voice(openai.and_then(|map| yaml_string_field(map, "voice")), false).unwrap_or_else(|_| "alloy".to_string()), + "ttsElevenlabsVoiceId": tts_string(elevenlabs, "voice_id", "pNInz6obpgDQGcFmaJgB"), + "ttsElevenlabsModelId": tts_string(elevenlabs, "model_id", "eleven_multilingual_v2"), + "ttsXaiVoiceId": tts_string(xai, "voice_id", "eve"), + "ttsXaiLanguage": normalize_hermes_voice_language(xai.and_then(|map| yaml_string_field(map, "language")), false, "tts.xai.language").unwrap_or_else(|_| "en".to_string()), + "ttsXaiSampleRate": xai.map(|map| bounded_hermes_i64(yaml_i64_field(map, "sample_rate"), 24000, 8000, 192000)).unwrap_or(24000), + "ttsXaiBitRate": xai.map(|map| bounded_hermes_i64(yaml_i64_field(map, "bit_rate"), 128000, 16000, 512000)).unwrap_or(128000), + "ttsMistralModel": tts_string(mistral, "model", "voxtral-mini-tts-2603"), + "ttsMistralVoiceId": tts_string(mistral, "voice_id", "c69964a6-ab8b-4f8a-9465-ec0925096ec8"), + "ttsPiperVoice": tts_string(piper, "voice", "en_US-lessac-medium"), + "voiceRecordKey": voice.and_then(|map| yaml_string_field(map, "record_key")).map(|value| value.trim().to_string()).unwrap_or_else(|| "ctrl+b".to_string()), + "voiceMaxRecordingSeconds": voice.map(|map| bounded_hermes_i64(yaml_i64_field(map, "max_recording_seconds"), 120, 1, 3600)).unwrap_or(120), + "voiceAutoTts": voice.and_then(|map| yaml_bool_field(map, "auto_tts")).unwrap_or(false), + "voiceBeepEnabled": voice.and_then(|map| yaml_bool_field(map, "beep_enabled")).unwrap_or(true), + "voiceSilenceThreshold": 
voice.map(|map| bounded_hermes_i64(yaml_i64_field(map, "silence_threshold"), 200, 0, 32767)).unwrap_or(200), + "voiceSilenceDuration": voice.map(|map| bounded_hermes_f64(yaml_f64_field(map, "silence_duration"), 3.0, 0.1, 60.0)).unwrap_or(3.0), + }) +} + +fn merge_hermes_tts_voice_config( + config: &mut serde_yaml::Value, + form: &Value, +) -> Result<(), String> { + let current = build_hermes_tts_voice_config_values(config); + let form_or_current_string = |key: &str| { + if form.get(key).is_some() { + form_string(form, key) + } else { + current[key].as_str().map(ToString::to_string) + } + }; + let tts_provider = normalize_hermes_tts_provider(form_or_current_string("ttsProvider"), true)?; + let tts_edge_voice = form_or_current_string("ttsEdgeVoice") + .unwrap_or_default() + .trim() + .to_string(); + let tts_openai_model = form_or_current_string("ttsOpenaiModel") + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| "gpt-4o-mini-tts".to_string()); + let tts_openai_voice = + normalize_hermes_tts_openai_voice(form_or_current_string("ttsOpenaiVoice"), true)?; + let tts_elevenlabs_voice_id = form_or_current_string("ttsElevenlabsVoiceId") + .unwrap_or_default() + .trim() + .to_string(); + let tts_elevenlabs_model_id = form_or_current_string("ttsElevenlabsModelId") + .unwrap_or_default() + .trim() + .to_string(); + let tts_xai_voice_id = form_or_current_string("ttsXaiVoiceId") + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| "eve".to_string()); + let tts_xai_language = normalize_hermes_voice_language( + form_or_current_string("ttsXaiLanguage"), + true, + "tts.xai.language", + )?; + let tts_xai_sample_rate = validate_hermes_i64( + if form.get("ttsXaiSampleRate").is_some() { + form_i64(form, "ttsXaiSampleRate") + } else { + current["ttsXaiSampleRate"].as_i64() + }, + "tts.xai.sample_rate", + 24000, + 8000, + 192000, + )?; + let tts_xai_bit_rate = validate_hermes_i64( + if 
form.get("ttsXaiBitRate").is_some() { + form_i64(form, "ttsXaiBitRate") + } else { + current["ttsXaiBitRate"].as_i64() + }, + "tts.xai.bit_rate", + 128000, + 16000, + 512000, + )?; + let tts_mistral_model = form_or_current_string("ttsMistralModel") + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) + .unwrap_or_else(|| "voxtral-mini-tts-2603".to_string()); + let tts_mistral_voice_id = form_or_current_string("ttsMistralVoiceId") + .unwrap_or_default() + .trim() + .to_string(); + let tts_piper_voice = form_or_current_string("ttsPiperVoice") + .unwrap_or_default() + .trim() + .to_string(); + let voice_record_key = form_or_current_string("voiceRecordKey") + .unwrap_or_default() + .trim() + .to_string(); + let voice_max_recording_seconds = validate_hermes_i64( + if form.get("voiceMaxRecordingSeconds").is_some() { + form_i64(form, "voiceMaxRecordingSeconds") + } else { + current["voiceMaxRecordingSeconds"].as_i64() + }, + "voice.max_recording_seconds", + 120, + 1, + 3600, + )?; + let voice_auto_tts = form_bool(form, "voiceAutoTts") + .unwrap_or_else(|| current["voiceAutoTts"].as_bool().unwrap_or(false)); + let voice_beep_enabled = form_bool(form, "voiceBeepEnabled") + .unwrap_or_else(|| current["voiceBeepEnabled"].as_bool().unwrap_or(true)); + let voice_silence_threshold = validate_hermes_i64( + if form.get("voiceSilenceThreshold").is_some() { + form_i64(form, "voiceSilenceThreshold") + } else { + current["voiceSilenceThreshold"].as_i64() + }, + "voice.silence_threshold", + 200, + 0, + 32767, + )?; + let voice_silence_duration = validate_hermes_f64( + if form.get("voiceSilenceDuration").is_some() { + form_f64(form, "voiceSilenceDuration") + } else { + current["voiceSilenceDuration"].as_f64() + }, + "voice.silence_duration", + 3.0, + 0.1, + 60.0, + )?; + + let root = ensure_yaml_object(config)?; + let tts = yaml_child_object(root, "tts")?; + tts.insert( + yaml_key("provider"), + serde_yaml::Value::String(tts_provider), + ); + let edge = 
yaml_child_object(tts, "edge")?; + set_optional_yaml_string(edge, "voice", tts_edge_voice); + let openai = yaml_child_object(tts, "openai")?; + openai.insert( + yaml_key("model"), + serde_yaml::Value::String(tts_openai_model), + ); + openai.insert( + yaml_key("voice"), + serde_yaml::Value::String(tts_openai_voice), + ); + let elevenlabs = yaml_child_object(tts, "elevenlabs")?; + set_optional_yaml_string(elevenlabs, "voice_id", tts_elevenlabs_voice_id); + set_optional_yaml_string(elevenlabs, "model_id", tts_elevenlabs_model_id); + let xai = yaml_child_object(tts, "xai")?; + xai.insert( + yaml_key("voice_id"), + serde_yaml::Value::String(tts_xai_voice_id), + ); + xai.insert( + yaml_key("language"), + serde_yaml::Value::String(tts_xai_language), + ); + xai.insert( + yaml_key("sample_rate"), + serde_yaml::Value::Number(tts_xai_sample_rate.into()), + ); + xai.insert( + yaml_key("bit_rate"), + serde_yaml::Value::Number(tts_xai_bit_rate.into()), + ); + let mistral = yaml_child_object(tts, "mistral")?; + mistral.insert( + yaml_key("model"), + serde_yaml::Value::String(tts_mistral_model), + ); + set_optional_yaml_string(mistral, "voice_id", tts_mistral_voice_id); + let piper = yaml_child_object(tts, "piper")?; + set_optional_yaml_string(piper, "voice", tts_piper_voice); + + let voice = yaml_child_object(root, "voice")?; + set_optional_yaml_string(voice, "record_key", voice_record_key); + voice.insert( + yaml_key("max_recording_seconds"), + serde_yaml::Value::Number(voice_max_recording_seconds.into()), + ); + voice.insert( + yaml_key("auto_tts"), + serde_yaml::Value::Bool(voice_auto_tts), + ); + voice.insert( + yaml_key("beep_enabled"), + serde_yaml::Value::Bool(voice_beep_enabled), + ); + voice.insert( + yaml_key("silence_threshold"), + serde_yaml::Value::Number(voice_silence_threshold.into()), + ); + voice.insert( + yaml_key("silence_duration"), + serde_yaml::to_value(voice_silence_duration).map_err(|err| err.to_string())?, + ); + Ok(()) +} + fn 
merge_hermes_execution_limits_config( config: &mut serde_yaml::Value, form: &Value, @@ -10688,6 +10994,30 @@ pub fn hermes_stt_config_save(form: Value) -> Result { })) } +#[tauri::command] +pub fn hermes_tts_voice_config_read() -> Result { + let (config_path, exists, config) = read_hermes_channel_yaml_config()?; + ensure_yaml_object(&mut config.clone())?; + Ok(serde_json::json!({ + "exists": exists, + "configPath": config_path.to_string_lossy(), + "values": build_hermes_tts_voice_config_values(&config), + })) +} + +#[tauri::command] +pub fn hermes_tts_voice_config_save(form: Value) -> Result { + let (config_path, _exists, mut config) = read_hermes_channel_yaml_config()?; + merge_hermes_tts_voice_config(&mut config, &form)?; + let backup = write_hermes_yaml_config(&config_path, &config)?; + Ok(serde_json::json!({ + "ok": true, + "configPath": config_path.to_string_lossy(), + "backup": backup, + "values": build_hermes_tts_voice_config_values(&config), + })) +} + #[tauri::command] pub fn hermes_terminal_config_read() -> Result { let (config_path, exists, config) = read_hermes_channel_yaml_config()?; @@ -17241,6 +17571,242 @@ memory: } } +#[cfg(test)] +mod hermes_tts_voice_config_tests { + use super::{build_hermes_tts_voice_config_values, merge_hermes_tts_voice_config}; + use serde_json::json; + + #[test] + fn tts_voice_values_have_upstream_defaults() { + let config: serde_yaml::Value = serde_yaml::from_str("{}").unwrap(); + let values = build_hermes_tts_voice_config_values(&config); + assert_eq!(values["ttsProvider"], "edge"); + assert_eq!(values["ttsEdgeVoice"], "en-US-AriaNeural"); + assert_eq!(values["ttsOpenaiModel"], "gpt-4o-mini-tts"); + assert_eq!(values["ttsOpenaiVoice"], "alloy"); + assert_eq!(values["ttsElevenlabsVoiceId"], "pNInz6obpgDQGcFmaJgB"); + assert_eq!(values["ttsElevenlabsModelId"], "eleven_multilingual_v2"); + assert_eq!(values["ttsXaiVoiceId"], "eve"); + assert_eq!(values["ttsXaiLanguage"], "en"); + assert_eq!(values["ttsXaiSampleRate"], 24000); 
#[cfg(test)]
mod hermes_tts_voice_config_tests {
    use super::{build_hermes_tts_voice_config_values, merge_hermes_tts_voice_config};
    use serde_json::json;

    /// Parses an inline YAML fixture, panicking on malformed input.
    fn parse_yaml(source: &str) -> serde_yaml::Value {
        serde_yaml::from_str(source).unwrap()
    }

    /// An empty config reports the default for every field.
    #[test]
    fn tts_voice_values_have_upstream_defaults() {
        let empty = parse_yaml("{}");
        let values = build_hermes_tts_voice_config_values(&empty);
        assert_eq!(values["ttsProvider"], "edge");
        assert_eq!(values["ttsEdgeVoice"], "en-US-AriaNeural");
        assert_eq!(values["ttsOpenaiModel"], "gpt-4o-mini-tts");
        assert_eq!(values["ttsOpenaiVoice"], "alloy");
        assert_eq!(values["ttsElevenlabsVoiceId"], "pNInz6obpgDQGcFmaJgB");
        assert_eq!(values["ttsElevenlabsModelId"], "eleven_multilingual_v2");
        assert_eq!(values["ttsXaiVoiceId"], "eve");
        assert_eq!(values["ttsXaiLanguage"], "en");
        assert_eq!(values["ttsXaiSampleRate"], 24000);
        assert_eq!(values["ttsXaiBitRate"], 128000);
        assert_eq!(values["ttsMistralModel"], "voxtral-mini-tts-2603");
        assert_eq!(
            values["ttsMistralVoiceId"],
            "c69964a6-ab8b-4f8a-9465-ec0925096ec8"
        );
        assert_eq!(values["ttsPiperVoice"], "en_US-lessac-medium");
        assert_eq!(values["voiceRecordKey"], "ctrl+b");
        assert_eq!(values["voiceMaxRecordingSeconds"], 120);
        assert_eq!(values["voiceAutoTts"], false);
        assert_eq!(values["voiceBeepEnabled"], true);
        assert_eq!(values["voiceSilenceThreshold"], 200);
        assert_eq!(values["voiceSilenceDuration"], 3.0);
    }

    /// Values present in the YAML are reported instead of the defaults.
    #[test]
    fn tts_voice_values_read_yaml_fields() {
        let populated = parse_yaml(
            r#"
tts:
  provider: openai
  edge:
    voice: zh-CN-XiaoxiaoNeural
  openai:
    model: gpt-4o-mini-tts
    voice: nova
  elevenlabs:
    voice_id: voice-123
    model_id: eleven_turbo_v2_5
  xai:
    voice_id: custom-eve
    language: zh
    sample_rate: 48000
    bit_rate: 192000
  mistral:
    model: voxtral-mini-tts-2603
    voice_id: mistral-voice
  piper:
    voice: zh_CN-huayan-medium
voice:
  record_key: ctrl+shift+v
  max_recording_seconds: 240
  auto_tts: true
  beep_enabled: false
  silence_threshold: 350
  silence_duration: 1.5
"#,
        );
        let values = build_hermes_tts_voice_config_values(&populated);
        assert_eq!(values["ttsProvider"], "openai");
        assert_eq!(values["ttsEdgeVoice"], "zh-CN-XiaoxiaoNeural");
        assert_eq!(values["ttsOpenaiVoice"], "nova");
        assert_eq!(values["ttsElevenlabsVoiceId"], "voice-123");
        assert_eq!(values["ttsXaiLanguage"], "zh");
        assert_eq!(values["ttsXaiSampleRate"], 48000);
        assert_eq!(values["ttsMistralVoiceId"], "mistral-voice");
        assert_eq!(values["ttsPiperVoice"], "zh_CN-huayan-medium");
        assert_eq!(values["voiceRecordKey"], "ctrl+shift+v");
        assert_eq!(values["voiceAutoTts"], true);
        assert_eq!(values["voiceBeepEnabled"], false);
        assert_eq!(values["voiceSilenceDuration"], 1.5);
    }

    /// Merging writes the managed fields and leaves unrelated keys and sections alone.
    #[test]
    fn merge_tts_voice_config_preserves_unknown_fields() {
        let mut config = parse_yaml(
            r#"
model:
  provider: anthropic
tts:
  provider: edge
  custom_flag: keep-tts
  openai:
    custom_flag: keep-openai
  piper:
    voices_dir: /cache/piper
voice:
  custom_flag: keep-voice
streaming:
  enabled: true
"#,
        );
        let form = json!({
            "ttsProvider": "openai",
            "ttsEdgeVoice": "zh-CN-XiaoxiaoNeural",
            "ttsOpenaiModel": "gpt-4o-mini-tts",
            "ttsOpenaiVoice": "nova",
            "ttsElevenlabsVoiceId": "voice-123",
            "ttsElevenlabsModelId": "eleven_turbo_v2_5",
            "ttsXaiVoiceId": "eve-pro",
            "ttsXaiLanguage": "zh",
            "ttsXaiSampleRate": "48000",
            "ttsXaiBitRate": "192000",
            "ttsMistralModel": "voxtral-mini-tts-2603",
            "ttsMistralVoiceId": "mistral-voice",
            "ttsPiperVoice": "zh_CN-huayan-medium",
            "voiceRecordKey": "ctrl+shift+v",
            "voiceMaxRecordingSeconds": "240",
            "voiceAutoTts": true,
            "voiceBeepEnabled": false,
            "voiceSilenceThreshold": "350",
            "voiceSilenceDuration": "1.5",
        });

        merge_hermes_tts_voice_config(&mut config, &form).unwrap();

        assert_eq!(config["model"]["provider"].as_str(), Some("anthropic"));
        assert_eq!(config["streaming"]["enabled"].as_bool(), Some(true));
        assert_eq!(config["tts"]["provider"].as_str(), Some("openai"));
        assert_eq!(
            config["tts"]["edge"]["voice"].as_str(),
            Some("zh-CN-XiaoxiaoNeural")
        );
        assert_eq!(config["tts"]["openai"]["voice"].as_str(), Some("nova"));
        assert_eq!(
            config["tts"]["openai"]["custom_flag"].as_str(),
            Some("keep-openai")
        );
        assert_eq!(
            config["tts"]["elevenlabs"]["voice_id"].as_str(),
            Some("voice-123")
        );
        assert_eq!(config["tts"]["xai"]["sample_rate"].as_i64(), Some(48000));
        assert_eq!(config["tts"]["xai"]["bit_rate"].as_i64(), Some(192000));
        assert_eq!(
            config["tts"]["mistral"]["voice_id"].as_str(),
            Some("mistral-voice")
        );
        assert_eq!(
            config["tts"]["piper"]["voice"].as_str(),
            Some("zh_CN-huayan-medium")
        );
        assert_eq!(
            config["tts"]["piper"]["voices_dir"].as_str(),
            Some("/cache/piper")
        );
        assert_eq!(config["tts"]["custom_flag"].as_str(), Some("keep-tts"));
        assert_eq!(config["voice"]["record_key"].as_str(), Some("ctrl+shift+v"));
        assert_eq!(config["voice"]["max_recording_seconds"].as_i64(), Some(240));
        assert_eq!(config["voice"]["auto_tts"].as_bool(), Some(true));
        assert_eq!(config["voice"]["beep_enabled"].as_bool(), Some(false));
        assert_eq!(config["voice"]["silence_threshold"].as_i64(), Some(350));
        assert_eq!(config["voice"]["silence_duration"].as_f64(), Some(1.5));
        assert_eq!(config["voice"]["custom_flag"].as_str(), Some("keep-voice"));
    }

    /// Blank optional overrides are removed from the config; sibling keys stay.
    #[test]
    fn merge_tts_voice_config_removes_empty_optional_overrides() {
        let mut config = parse_yaml(
            r#"
tts:
  edge:
    voice: custom-edge
  elevenlabs:
    voice_id: voice-123
    model_id: model-123
  piper:
    voice: custom-piper
    voices_dir: /cache/piper
voice:
  record_key: ctrl+shift+v
  custom_flag: keep-voice
"#,
        );
        let form = json!({
            "ttsEdgeVoice": "",
            "ttsElevenlabsVoiceId": " ",
            "ttsElevenlabsModelId": "",
            "ttsPiperVoice": "",
            "voiceRecordKey": "",
        });

        merge_hermes_tts_voice_config(&mut config, &form).unwrap();

        assert!(config["tts"]["edge"]["voice"].is_null());
        assert!(config["tts"]["elevenlabs"]["voice_id"].is_null());
        assert!(config["tts"]["elevenlabs"]["model_id"].is_null());
        assert!(config["tts"]["piper"]["voice"].is_null());
        assert_eq!(
            config["tts"]["piper"]["voices_dir"].as_str(),
            Some("/cache/piper")
        );
        assert!(config["voice"]["record_key"].is_null());
        assert_eq!(config["voice"]["custom_flag"].as_str(), Some("keep-voice"));
    }

    /// Each invalid field is rejected with an error naming its config path.
    #[test]
    fn merge_tts_voice_config_rejects_invalid_values() {
        let mut config = serde_yaml::Value::Mapping(serde_yaml::Mapping::new());
        let cases = [
            (json!({ "ttsProvider": "bad" }), "tts.provider"),
            (json!({ "ttsOpenaiVoice": "robot" }), "tts.openai.voice"),
            (json!({ "ttsXaiSampleRate": "0" }), "tts.xai.sample_rate"),
            (
                json!({ "voiceMaxRecordingSeconds": "0" }),
                "voice.max_recording_seconds",
            ),
            (
                json!({ "voiceSilenceDuration": "-1" }),
                "voice.silence_duration",
            ),
        ];
        for (form, config_path) in cases {
            let err = merge_hermes_tts_voice_config(&mut config, &form).unwrap_err();
            assert!(err.contains(config_path));
        }
    }
}
voiceRecordKey: 'ctrl+b', + voiceMaxRecordingSeconds: 120, + voiceAutoTts: false, + voiceBeepEnabled: true, + voiceSilenceThreshold: 200, + voiceSilenceDuration: 3, +} + const TERMINAL_DEFAULTS = { terminalBackend: 'local', terminalCwd: '.', @@ -339,6 +361,8 @@ const STT_PROVIDERS = ['auto', 'local', 'groq', 'openai', 'mistral'] const STT_LOCAL_MODELS = ['tiny', 'base', 'small', 'medium', 'large-v3', 'turbo'] const STT_OPENAI_MODELS = ['whisper-1', 'gpt-4o-mini-transcribe', 'gpt-4o-transcribe'] const STT_MISTRAL_MODELS = ['voxtral-mini-latest', 'voxtral-mini-2602'] +const TTS_PROVIDERS = ['edge', 'elevenlabs', 'openai', 'xai', 'minimax', 'mistral', 'gemini', 'neutts', 'kittentts', 'piper'] +const TTS_OPENAI_VOICES = ['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'] const UNAUTHORIZED_DM_BEHAVIORS = ['pair', 'ignore'] const IMAGE_INPUT_MODES = ['auto', 'native', 'text'] const REASONING_EFFORTS = ['xhigh', 'high', 'medium', 'low', 'minimal', 'none'] @@ -399,6 +423,7 @@ export function render() { let privacyValues = { ...PRIVACY_DEFAULTS } let browserValues = { ...BROWSER_DEFAULTS } let sttValues = { ...STT_DEFAULTS } + let ttsVoiceValues = { ...TTS_VOICE_DEFAULTS } let terminalValues = { ...TERMINAL_DEFAULTS } let loading = true let runtimeLoading = true @@ -437,6 +462,7 @@ export function render() { let privacyLoading = true let browserLoading = true let sttLoading = true + let ttsVoiceLoading = true let terminalLoading = true let saving = false let runtimeSaving = false @@ -475,6 +501,7 @@ export function render() { let privacySaving = false let browserSaving = false let sttSaving = false + let ttsVoiceSaving = false let terminalSaving = false let error = null let runtimeError = null @@ -513,6 +540,7 @@ export function render() { let privacyError = null let browserError = null let sttError = null + let ttsVoiceError = null let terminalError = null function esc(value) { @@ -524,7 +552,7 @@ export function render() { } function isBusy() { - return loading || 
runtimeLoading || sessionsMaintenanceLoading || updatesLoading || compressionLoading || promptCachingLoading || openrouterCacheLoading || providerRoutingLoading || auxiliaryLoading || toolGuardrailsLoading || memoryLoading || skillsLoading || curatorLoading || quickCommandsLoading || modelLoading || modelAliasesLoading || hooksLoading || providerOverridesLoading || mcpServersLoading || agentToolsetsLoading || platformToolsetsLoading || agentRuntimeLoading || unauthorizedDmLoading || securityLoading || displayLoading || humanDelayLoading || kanbanLoading || streamingLoading || executionLimitsLoading || ioSafetyLoading || checkpointsLoading || cronLoading || loggingLoading || approvalsLoading || privacyLoading || browserLoading || sttLoading || terminalLoading || saving || runtimeSaving || sessionsMaintenanceSaving || updatesSaving || compressionSaving || promptCachingSaving || openrouterCacheSaving || providerRoutingSaving || auxiliarySaving || toolGuardrailsSaving || memorySaving || skillsSaving || curatorSaving || quickCommandsSaving || modelSaving || modelAliasesSaving || hooksSaving || providerOverridesSaving || mcpServersSaving || agentToolsetsSaving || platformToolsetsSaving || agentRuntimeSaving || unauthorizedDmSaving || securitySaving || displaySaving || humanDelaySaving || kanbanSaving || streamingSaving || executionLimitsSaving || ioSafetySaving || checkpointsSaving || cronSaving || loggingSaving || approvalsSaving || privacySaving || browserSaving || sttSaving || terminalSaving + return loading || runtimeLoading || sessionsMaintenanceLoading || updatesLoading || compressionLoading || promptCachingLoading || openrouterCacheLoading || providerRoutingLoading || auxiliaryLoading || toolGuardrailsLoading || memoryLoading || skillsLoading || curatorLoading || quickCommandsLoading || modelLoading || modelAliasesLoading || hooksLoading || providerOverridesLoading || mcpServersLoading || agentToolsetsLoading || platformToolsetsLoading || agentRuntimeLoading || 
unauthorizedDmLoading || securityLoading || displayLoading || humanDelayLoading || kanbanLoading || streamingLoading || executionLimitsLoading || ioSafetyLoading || checkpointsLoading || cronLoading || loggingLoading || approvalsLoading || privacyLoading || browserLoading || sttLoading || ttsVoiceLoading || terminalLoading || saving || runtimeSaving || sessionsMaintenanceSaving || updatesSaving || compressionSaving || promptCachingSaving || openrouterCacheSaving || providerRoutingSaving || auxiliarySaving || toolGuardrailsSaving || memorySaving || skillsSaving || curatorSaving || quickCommandsSaving || modelSaving || modelAliasesSaving || hooksSaving || providerOverridesSaving || mcpServersSaving || agentToolsetsSaving || platformToolsetsSaving || agentRuntimeSaving || unauthorizedDmSaving || securitySaving || displaySaving || humanDelaySaving || kanbanSaving || streamingSaving || executionLimitsSaving || ioSafetySaving || checkpointsSaving || cronSaving || loggingSaving || approvalsSaving || privacySaving || browserSaving || sttSaving || ttsVoiceSaving || terminalSaving } function option(labelKey, value, selected) { @@ -2271,6 +2299,112 @@ export function render() { ` } + function renderTtsVoicePanel() { + const disabled = loading || saving || ttsVoiceLoading || ttsVoiceSaving || sttSaving || approvalsSaving || cronSaving || loggingSaving || privacySaving || browserSaving || terminalSaving || runtimeSaving || compressionSaving || promptCachingSaving || openrouterCacheSaving || providerRoutingSaving || auxiliarySaving || toolGuardrailsSaving || memorySaving || skillsSaving || quickCommandsSaving || providerOverridesSaving || agentToolsetsSaving || agentRuntimeSaving || unauthorizedDmSaving || streamingSaving || executionLimitsSaving || ioSafetySaving || checkpointsSaving + return ` +
+
+
+
${t('engine.hermesTtsVoiceConfigTitle')}
+
${t('engine.hermesTtsVoiceConfigDesc')}
+
+
+ ${ttsVoiceSaving ? t('engine.hermesConfigStatusSaving') : ttsVoiceLoading ? t('engine.hermesConfigStatusLoading') : t('engine.hermesTtsVoiceConfigStatusReady')} + +
+
+
+ ${renderError(ttsVoiceError)} +
+ + +
+
+ + + + + + + + + + + + + + + + + +
+
${t('engine.hermesTtsVoiceConfigFootnote')}
+
+
+ ` + } + function renderTerminalPanel() { const disabled = loading || saving || terminalLoading || terminalSaving || approvalsSaving || cronSaving || loggingSaving || browserSaving || sttSaving || runtimeSaving || compressionSaving || promptCachingSaving || openrouterCacheSaving || providerRoutingSaving || auxiliarySaving || toolGuardrailsSaving || memorySaving || skillsSaving || quickCommandsSaving || providerOverridesSaving || agentToolsetsSaving || agentRuntimeSaving || unauthorizedDmSaving || streamingSaving || executionLimitsSaving || checkpointsSaving return ` @@ -2425,6 +2559,7 @@ export function render() { ${renderPrivacyPanel()} ${renderBrowserPanel()} ${renderSttPanel()} + ${renderTtsVoicePanel()} ${renderCompressionPanel()} ${renderPromptCachingPanel()} ${renderOpenrouterCachePanel()} @@ -2503,6 +2638,7 @@ export function render() { el.querySelector('#hm-privacy-save')?.addEventListener('click', savePrivacyConfig) el.querySelector('#hm-browser-save')?.addEventListener('click', saveBrowserConfig) el.querySelector('#hm-stt-save')?.addEventListener('click', saveSttConfig) + el.querySelector('#hm-tts-voice-save')?.addEventListener('click', saveTtsVoiceConfig) el.querySelector('#hm-terminal-save')?.addEventListener('click', saveTerminal) } @@ -2691,6 +2827,11 @@ export function render() { sttValues = { ...STT_DEFAULTS, ...(data?.values || {}) } } + async function loadTtsVoiceConfig() { + const data = await api.hermesTtsVoiceConfigRead() + ttsVoiceValues = { ...TTS_VOICE_DEFAULTS, ...(data?.values || {}) } + } + async function loadTerminal() { const data = await api.hermesTerminalConfigRead() terminalValues = { ...TERMINAL_DEFAULTS, ...(data?.values || {}) } @@ -2734,6 +2875,7 @@ export function render() { privacyLoading = true browserLoading = true sttLoading = true + ttsVoiceLoading = true terminalLoading = true error = null runtimeError = null @@ -2771,6 +2913,7 @@ export function render() { privacyError = null browserError = null sttError = null + 
ttsVoiceError = null terminalError = null draw() try { @@ -2932,6 +3075,14 @@ export function render() { sttLoading = false draw() } + try { + await loadTtsVoiceConfig() + } catch (err) { + ttsVoiceError = humanizeError(err, t('engine.hermesTtsVoiceConfigLoadFailed') || 'Load speech output config failed') + } finally { + ttsVoiceLoading = false + draw() + } try { await loadTerminal() } catch (err) { @@ -4264,6 +4415,49 @@ export function render() { } } + async function saveTtsVoiceConfig() { + const form = { + ttsProvider: el.querySelector('#hm-tts-provider')?.value || 'edge', + ttsEdgeVoice: el.querySelector('#hm-tts-edge-voice')?.value || '', + ttsOpenaiModel: el.querySelector('#hm-tts-openai-model')?.value || 'gpt-4o-mini-tts', + ttsOpenaiVoice: el.querySelector('#hm-tts-openai-voice')?.value || 'alloy', + ttsElevenlabsVoiceId: el.querySelector('#hm-tts-elevenlabs-voice-id')?.value || '', + ttsElevenlabsModelId: el.querySelector('#hm-tts-elevenlabs-model-id')?.value || '', + ttsXaiVoiceId: el.querySelector('#hm-tts-xai-voice-id')?.value || 'eve', + ttsXaiLanguage: el.querySelector('#hm-tts-xai-language')?.value || 'en', + ttsXaiSampleRate: el.querySelector('#hm-tts-xai-sample-rate')?.value || '24000', + ttsXaiBitRate: el.querySelector('#hm-tts-xai-bit-rate')?.value || '128000', + ttsMistralModel: el.querySelector('#hm-tts-mistral-model')?.value || 'voxtral-mini-tts-2603', + ttsMistralVoiceId: el.querySelector('#hm-tts-mistral-voice-id')?.value || '', + ttsPiperVoice: el.querySelector('#hm-tts-piper-voice')?.value || '', + voiceRecordKey: el.querySelector('#hm-voice-record-key')?.value || '', + voiceMaxRecordingSeconds: el.querySelector('#hm-voice-max-recording-seconds')?.value || '120', + voiceAutoTts: !!el.querySelector('#hm-voice-auto-tts')?.checked, + voiceBeepEnabled: !!el.querySelector('#hm-voice-beep-enabled')?.checked, + voiceSilenceThreshold: el.querySelector('#hm-voice-silence-threshold')?.value || '200', + voiceSilenceDuration: 
el.querySelector('#hm-voice-silence-duration')?.value || '3', + } + ttsVoiceSaving = true + ttsVoiceError = null + draw() + try { + const result = await api.hermesTtsVoiceConfigSave(form) + ttsVoiceValues = { ...TTS_VOICE_DEFAULTS, ...(result?.values || form) } + await refreshRawAfterStructuredSave() + const backup = result?.backup || '' + toast({ + message: t('engine.hermesTtsVoiceConfigSaveSuccess'), + hint: backup ? t('engine.hermesConfigBackupHint', { path: backup }) : '', + }, 'success') + } catch (err) { + ttsVoiceError = humanizeError(err, t('engine.hermesTtsVoiceConfigSaveFailed') || 'Save speech output config failed') + toast(ttsVoiceError, 'error') + } finally { + ttsVoiceSaving = false + draw() + } + } + async function saveTerminal() { const form = { terminalBackend: el.querySelector('#hm-terminal-backend')?.value || 'local', diff --git a/src/lib/tauri-api.js b/src/lib/tauri-api.js index 6a32337..b4bfc57 100644 --- a/src/lib/tauri-api.js +++ b/src/lib/tauri-api.js @@ -581,6 +581,8 @@ export const api = { hermesBrowserConfigSave: (form) => invoke('hermes_browser_config_save', { form }), hermesSttConfigRead: () => invoke('hermes_stt_config_read'), hermesSttConfigSave: (form) => invoke('hermes_stt_config_save', { form }), + hermesTtsVoiceConfigRead: () => invoke('hermes_tts_voice_config_read'), + hermesTtsVoiceConfigSave: (form) => invoke('hermes_tts_voice_config_save', { form }), hermesTerminalConfigRead: () => invoke('hermes_terminal_config_read'), hermesTerminalConfigSave: (form) => invoke('hermes_terminal_config_save', { form }), hermesLazyDepsFeatures: () => cachedInvoke('hermes_lazy_deps_features', {}, 600000), diff --git a/src/locales/modules/engine.js b/src/locales/modules/engine.js index b8469ec..09dac23 100644 --- a/src/locales/modules/engine.js +++ b/src/locales/modules/engine.js @@ -739,6 +739,49 @@ export default { 'hermesSttConfigMistralModel_voxtral-mini-latest': _('voxtral-mini-latest(推荐)', 'voxtral-mini-latest (recommended)', 
'voxtral-mini-latest(建議)'), 'hermesSttConfigMistralModel_voxtral-mini-2602': _('voxtral-mini-2602(固定版本)', 'voxtral-mini-2602 (pinned version)', 'voxtral-mini-2602(固定版本)'), hermesSttConfigFootnote: _('这里写入 stt.*。API Key 仍通过 .env 管理;Groq 使用上游默认模型,其他 provider 高级字段会保留在 raw YAML 中。', 'This writes stt.*. API keys are still managed through .env. Groq uses the upstream default model, and other provider advanced fields stay in raw YAML.', '這裡寫入 stt.*。API Key 仍透過 .env 管理;Groq 使用上游預設模型,其他 provider 進階欄位會保留在 raw YAML 中。'), + hermesTtsVoiceConfigTitle: _('语音输出与录音', 'Speech output and recording', '語音輸出與錄音'), + hermesTtsVoiceConfigDesc: _('控制 Hermes 的 TTS 朗读 provider、主流语音模型,以及 CLI 语音录制热键和静音自动停止。', 'Control Hermes TTS providers and common voice models, plus CLI recording hotkey and silence auto-stop.', '控制 Hermes 的 TTS 朗讀 provider、主流語音模型,以及 CLI 語音錄製熱鍵和靜音自動停止。'), + hermesTtsVoiceConfigStatusReady: _('结构化配置', 'structured settings', '結構化設定'), + hermesTtsVoiceConfigSave: _('保存语音输出配置', 'Save speech output settings', '儲存語音輸出設定'), + hermesTtsVoiceConfigSaveSuccess: _('语音输出配置已保存,建议重启 Hermes Gateway 或 CLI 会话生效', 'Speech output settings saved. 
Restart Hermes Gateway or CLI sessions to take effect.', '語音輸出設定已儲存,建議重啟 Hermes Gateway 或 CLI 工作階段生效'), + hermesTtsVoiceConfigLoadFailed: _('加载语音输出配置失败', 'Load speech output settings failed', '載入語音輸出設定失敗'), + hermesTtsVoiceConfigSaveFailed: _('保存语音输出配置失败', 'Save speech output settings failed', '儲存語音輸出設定失敗'), + hermesTtsConfigProvider: _('TTS 服务', 'TTS provider', 'TTS 服務'), + hermesTtsConfigProvider_edge: _('Edge TTS(免费)', 'Edge TTS (free)', 'Edge TTS(免費)'), + hermesTtsConfigProvider_elevenlabs: _('ElevenLabs(高质量)', 'ElevenLabs (high quality)', 'ElevenLabs(高品質)'), + hermesTtsConfigProvider_openai: _('OpenAI TTS', 'OpenAI TTS', 'OpenAI TTS'), + hermesTtsConfigProvider_xai: _('xAI TTS', 'xAI TTS', 'xAI TTS'), + hermesTtsConfigProvider_minimax: _('MiniMax TTS', 'MiniMax TTS', 'MiniMax TTS'), + hermesTtsConfigProvider_mistral: _('Mistral Voxtral TTS', 'Mistral Voxtral TTS', 'Mistral Voxtral TTS'), + hermesTtsConfigProvider_gemini: _('Gemini TTS', 'Gemini TTS', 'Gemini TTS'), + hermesTtsConfigProvider_neutts: _('NeuTTS(本地)', 'NeuTTS (local)', 'NeuTTS(本機)'), + hermesTtsConfigProvider_kittentts: _('KittenTTS(本地)', 'KittenTTS (local)', 'KittenTTS(本機)'), + hermesTtsConfigProvider_piper: _('Piper(本地)', 'Piper (local)', 'Piper(本機)'), + hermesTtsConfigEdgeVoice: _('Edge 声音', 'Edge voice', 'Edge 聲音'), + hermesTtsConfigOpenaiModel: _('OpenAI 模型', 'OpenAI model', 'OpenAI 模型'), + hermesTtsConfigOpenaiVoice: _('OpenAI 声音', 'OpenAI voice', 'OpenAI 聲音'), + hermesTtsConfigOpenaiVoice_alloy: _('alloy', 'alloy', 'alloy'), + hermesTtsConfigOpenaiVoice_echo: _('echo', 'echo', 'echo'), + hermesTtsConfigOpenaiVoice_fable: _('fable', 'fable', 'fable'), + hermesTtsConfigOpenaiVoice_onyx: _('onyx', 'onyx', 'onyx'), + hermesTtsConfigOpenaiVoice_nova: _('nova', 'nova', 'nova'), + hermesTtsConfigOpenaiVoice_shimmer: _('shimmer', 'shimmer', 'shimmer'), + hermesTtsConfigElevenlabsVoiceId: _('ElevenLabs Voice ID', 'ElevenLabs voice ID', 'ElevenLabs Voice ID'), + hermesTtsConfigElevenlabsModelId: 
_('ElevenLabs 模型 ID', 'ElevenLabs model ID', 'ElevenLabs 模型 ID'), + hermesTtsConfigXaiVoiceId: _('xAI Voice ID', 'xAI voice ID', 'xAI Voice ID'), + hermesTtsConfigXaiLanguage: _('xAI 语言', 'xAI language', 'xAI 語言'), + hermesTtsConfigXaiSampleRate: _('xAI 采样率', 'xAI sample rate', 'xAI 取樣率'), + hermesTtsConfigXaiBitRate: _('xAI 比特率', 'xAI bit rate', 'xAI 位元率'), + hermesTtsConfigMistralModel: _('Mistral 模型', 'Mistral model', 'Mistral 模型'), + hermesTtsConfigMistralVoiceId: _('Mistral Voice ID', 'Mistral voice ID', 'Mistral Voice ID'), + hermesTtsConfigPiperVoice: _('Piper 声音', 'Piper voice', 'Piper 聲音'), + hermesVoiceConfigRecordKey: _('录音热键', 'Recording hotkey', '錄音快捷鍵'), + hermesVoiceConfigMaxRecordingSeconds: _('最长录音秒数', 'Max recording seconds', '最長錄音秒數'), + hermesVoiceConfigAutoTts: _('CLI 回复后自动朗读', 'Auto-speak CLI replies', 'CLI 回覆後自動朗讀'), + hermesVoiceConfigBeepEnabled: _('录音开始/结束提示音', 'Recording start/stop beeps', '錄音開始/結束提示音'), + hermesVoiceConfigSilenceThreshold: _('静音阈值 RMS', 'Silence RMS threshold', '靜音閾值 RMS'), + hermesVoiceConfigSilenceDuration: _('静音自动停止秒数', 'Silence auto-stop seconds', '靜音自動停止秒數'), + hermesTtsVoiceConfigFootnote: _('这里写入 tts.* 与 voice.*。API Key 仍通过 .env 管理;留空的可选声音 ID 会删除覆盖并使用 Hermes/provider 默认值;provider 的高级字段和 max_text_length 会保留在 raw YAML 中。', 'This writes tts.* and voice.*. API keys are still managed through .env. Empty optional voice IDs remove overrides and fall back to Hermes/provider defaults. 
Advanced provider fields and max_text_length stay in raw YAML.', '這裡寫入 tts.* 與 voice.*。API Key 仍透過 .env 管理;留空的可選聲音 ID 會刪除覆蓋並使用 Hermes/provider 預設值;provider 的進階欄位和 max_text_length 會保留在 raw YAML 中。'), hermesCompressionTitle: _('上下文压缩', 'Context compression', '上下文壓縮'), hermesCompressionDesc: _('控制长对话何时触发压缩、压缩目标和保留范围,降低上下文过长导致的失败与费用浪费。', 'Control when long conversations are compressed, the target size, and protected message ranges to reduce failures and wasted cost from oversized context.', '控制長對話何時觸發壓縮、壓縮目標和保留範圍,降低上下文過長導致的失敗與費用浪費。'), hermesCompressionStatusReady: _('结构化配置', 'structured settings', '結構化設定'), diff --git a/tests/hermes-config-page-ui.test.js b/tests/hermes-config-page-ui.test.js index b42682f..3ced1be 100644 --- a/tests/hermes-config-page-ui.test.js +++ b/tests/hermes-config-page-ui.test.js @@ -476,6 +476,33 @@ test('Hermes 配置页会暴露语音转写结构化配置字段', () => { } }) +test('Hermes 配置页会暴露语音输出与录音结构化配置字段', () => { + for (const id of [ + 'hm-tts-voice-save', + 'hm-tts-provider', + 'hm-tts-edge-voice', + 'hm-tts-openai-model', + 'hm-tts-openai-voice', + 'hm-tts-elevenlabs-voice-id', + 'hm-tts-elevenlabs-model-id', + 'hm-tts-xai-voice-id', + 'hm-tts-xai-language', + 'hm-tts-xai-sample-rate', + 'hm-tts-xai-bit-rate', + 'hm-tts-mistral-model', + 'hm-tts-mistral-voice-id', + 'hm-tts-piper-voice', + 'hm-voice-record-key', + 'hm-voice-max-recording-seconds', + 'hm-voice-auto-tts', + 'hm-voice-beep-enabled', + 'hm-voice-silence-threshold', + 'hm-voice-silence-duration', + ]) { + assert.match(source, new RegExp(`id="${id}"`), `缺少 ${id}`) + } +}) + test('Hermes 配置页会暴露 Kanban 调度稳定性结构化配置字段', () => { for (const id of [ 'hm-kanban-config-save', diff --git a/tests/hermes-tts-voice-config.test.js b/tests/hermes-tts-voice-config.test.js new file mode 100644 index 0000000..4c7e5a3 --- /dev/null +++ b/tests/hermes-tts-voice-config.test.js @@ -0,0 +1,183 @@ +import test from 'node:test' +import assert from 'node:assert/strict' + +import { + buildHermesTtsVoiceConfigValues, + 
mergeHermesTtsVoiceConfig, +} from '../scripts/dev-api.js' + +test('Hermes TTS/Voice 配置读取会提供上游默认值', () => { + const values = buildHermesTtsVoiceConfigValues({}) + + assert.deepEqual(values, { + ttsProvider: 'edge', + ttsEdgeVoice: 'en-US-AriaNeural', + ttsOpenaiModel: 'gpt-4o-mini-tts', + ttsOpenaiVoice: 'alloy', + ttsElevenlabsVoiceId: 'pNInz6obpgDQGcFmaJgB', + ttsElevenlabsModelId: 'eleven_multilingual_v2', + ttsXaiVoiceId: 'eve', + ttsXaiLanguage: 'en', + ttsXaiSampleRate: 24000, + ttsXaiBitRate: 128000, + ttsMistralModel: 'voxtral-mini-tts-2603', + ttsMistralVoiceId: 'c69964a6-ab8b-4f8a-9465-ec0925096ec8', + ttsPiperVoice: 'en_US-lessac-medium', + voiceRecordKey: 'ctrl+b', + voiceMaxRecordingSeconds: 120, + voiceAutoTts: false, + voiceBeepEnabled: true, + voiceSilenceThreshold: 200, + voiceSilenceDuration: 3, + }) +}) + +test('Hermes TTS/Voice 配置读取会回显 YAML 字段', () => { + const values = buildHermesTtsVoiceConfigValues({ + tts: { + provider: 'openai', + edge: { voice: 'zh-CN-XiaoxiaoNeural' }, + openai: { model: 'gpt-4o-mini-tts', voice: 'nova' }, + elevenlabs: { voice_id: 'voice-123', model_id: 'eleven_turbo_v2_5' }, + xai: { + voice_id: 'custom-eve', + language: 'zh', + sample_rate: 48000, + bit_rate: 192000, + }, + mistral: { model: 'voxtral-mini-tts-2603', voice_id: 'mistral-voice' }, + piper: { voice: 'zh_CN-huayan-medium' }, + }, + voice: { + record_key: 'ctrl+shift+v', + max_recording_seconds: 240, + auto_tts: true, + beep_enabled: false, + silence_threshold: 350, + silence_duration: 1.5, + }, + }) + + assert.equal(values.ttsProvider, 'openai') + assert.equal(values.ttsEdgeVoice, 'zh-CN-XiaoxiaoNeural') + assert.equal(values.ttsOpenaiVoice, 'nova') + assert.equal(values.ttsElevenlabsVoiceId, 'voice-123') + assert.equal(values.ttsXaiLanguage, 'zh') + assert.equal(values.ttsXaiSampleRate, 48000) + assert.equal(values.ttsMistralVoiceId, 'mistral-voice') + assert.equal(values.ttsPiperVoice, 'zh_CN-huayan-medium') + assert.equal(values.voiceRecordKey, 
'ctrl+shift+v') + assert.equal(values.voiceAutoTts, true) + assert.equal(values.voiceBeepEnabled, false) + assert.equal(values.voiceSilenceDuration, 1.5) +}) + +test('Hermes TTS/Voice 配置保存会保留未知字段并写入上游结构', () => { + const next = mergeHermesTtsVoiceConfig({ + model: { provider: 'anthropic' }, + tts: { + provider: 'edge', + custom_flag: 'keep-tts', + openai: { custom_flag: 'keep-openai' }, + piper: { voices_dir: '/cache/piper' }, + }, + voice: { + custom_flag: 'keep-voice', + }, + streaming: { enabled: true }, + }, { + ttsProvider: 'openai', + ttsEdgeVoice: 'zh-CN-XiaoxiaoNeural', + ttsOpenaiModel: 'gpt-4o-mini-tts', + ttsOpenaiVoice: 'nova', + ttsElevenlabsVoiceId: 'voice-123', + ttsElevenlabsModelId: 'eleven_turbo_v2_5', + ttsXaiVoiceId: 'eve-pro', + ttsXaiLanguage: 'zh', + ttsXaiSampleRate: '48000', + ttsXaiBitRate: '192000', + ttsMistralModel: 'voxtral-mini-tts-2603', + ttsMistralVoiceId: 'mistral-voice', + ttsPiperVoice: 'zh_CN-huayan-medium', + voiceRecordKey: 'ctrl+shift+v', + voiceMaxRecordingSeconds: '240', + voiceAutoTts: true, + voiceBeepEnabled: false, + voiceSilenceThreshold: '350', + voiceSilenceDuration: '1.5', + }) + + assert.deepEqual(next.model, { provider: 'anthropic' }) + assert.deepEqual(next.streaming, { enabled: true }) + assert.equal(next.tts.provider, 'openai') + assert.equal(next.tts.edge.voice, 'zh-CN-XiaoxiaoNeural') + assert.equal(next.tts.openai.model, 'gpt-4o-mini-tts') + assert.equal(next.tts.openai.voice, 'nova') + assert.equal(next.tts.openai.custom_flag, 'keep-openai') + assert.equal(next.tts.elevenlabs.voice_id, 'voice-123') + assert.equal(next.tts.elevenlabs.model_id, 'eleven_turbo_v2_5') + assert.equal(next.tts.xai.sample_rate, 48000) + assert.equal(next.tts.xai.bit_rate, 192000) + assert.equal(next.tts.mistral.voice_id, 'mistral-voice') + assert.equal(next.tts.piper.voice, 'zh_CN-huayan-medium') + assert.equal(next.tts.piper.voices_dir, '/cache/piper') + assert.equal(next.tts.custom_flag, 'keep-tts') + 
assert.equal(next.voice.record_key, 'ctrl+shift+v') + assert.equal(next.voice.max_recording_seconds, 240) + assert.equal(next.voice.auto_tts, true) + assert.equal(next.voice.beep_enabled, false) + assert.equal(next.voice.silence_threshold, 350) + assert.equal(next.voice.silence_duration, 1.5) + assert.equal(next.voice.custom_flag, 'keep-voice') +}) + +test('Hermes TTS/Voice 配置保存空可选字段会删除对应覆盖', () => { + const next = mergeHermesTtsVoiceConfig({ + tts: { + edge: { voice: 'custom-edge' }, + elevenlabs: { voice_id: 'voice-123', model_id: 'model-123' }, + piper: { voice: 'custom-piper', voices_dir: '/cache/piper' }, + }, + voice: { + record_key: 'ctrl+shift+v', + custom_flag: 'keep-voice', + }, + }, { + ttsEdgeVoice: '', + ttsElevenlabsVoiceId: ' ', + ttsElevenlabsModelId: '', + ttsPiperVoice: '', + voiceRecordKey: '', + }) + + assert.equal(Object.hasOwn(next.tts.edge, 'voice'), false) + assert.equal(Object.hasOwn(next.tts.elevenlabs, 'voice_id'), false) + assert.equal(Object.hasOwn(next.tts.elevenlabs, 'model_id'), false) + assert.equal(Object.hasOwn(next.tts.piper, 'voice'), false) + assert.equal(next.tts.piper.voices_dir, '/cache/piper') + assert.equal(Object.hasOwn(next.voice, 'record_key'), false) + assert.equal(next.voice.custom_flag, 'keep-voice') +}) + +test('Hermes TTS/Voice 配置保存会拒绝非法枚举和越界值', () => { + assert.throws( + () => mergeHermesTtsVoiceConfig({}, { ttsProvider: 'bad' }), + /tts\.provider/, + ) + assert.throws( + () => mergeHermesTtsVoiceConfig({}, { ttsOpenaiVoice: 'robot' }), + /tts\.openai\.voice/, + ) + assert.throws( + () => mergeHermesTtsVoiceConfig({}, { ttsXaiSampleRate: '0' }), + /tts\.xai\.sample_rate/, + ) + assert.throws( + () => mergeHermesTtsVoiceConfig({}, { voiceMaxRecordingSeconds: '0' }), + /voice\.max_recording_seconds/, + ) + assert.throws( + () => mergeHermesTtsVoiceConfig({}, { voiceSilenceDuration: '-1' }), + /voice\.silence_duration/, + ) +})