mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-09 17:50:23 +08:00
feat(agent): add voice message support with TTS/STT for Telegram and WeChat
- Integrate voice message handling: detect and extract audio references from Telegram and WeChat messages, and route them to the agent with a voice reply preference.
- Add a voice provider abstraction and an OpenAI-based TTS/STT implementation.
- Implement the agent tool `send_voice_message` for generating and sending voice replies, with fallback to text if voice is unavailable.
- Extend the agent prompt and context to support voice reply instructions.
- Update notification and message schemas to support audio fields.
- Add Telegram and WeChat voice sending logic, including audio file conversion and temporary media upload for WeChat.
- Add tests for the voice helper and agent voice routing.
This commit is contained in:
@@ -214,17 +214,19 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
text = self._append_reply_markup_links(text, msg.get("reply_markup"))
|
||||
|
||||
images = self._extract_images(msg)
|
||||
audio_refs = self._extract_audio_refs(msg)
|
||||
|
||||
if user_id:
|
||||
if not text and not images:
|
||||
if not text and not images and not audio_refs:
|
||||
logger.debug(
|
||||
f"收到来自 {client_config.name} 的Telegram消息无文本和图片"
|
||||
f"收到来自 {client_config.name} 的Telegram消息无文本、图片和语音"
|
||||
)
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
f"收到来自 {client_config.name} 的Telegram消息:"
|
||||
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, images={len(images) if images else 0}"
|
||||
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, "
|
||||
f"images={len(images) if images else 0}, audios={len(audio_refs) if audio_refs else 0}"
|
||||
)
|
||||
|
||||
cleaned_text = (
|
||||
@@ -263,6 +265,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
text=cleaned_text,
|
||||
chat_id=str(chat_id) if chat_id else None,
|
||||
images=images if images else None,
|
||||
audio_refs=audio_refs if audio_refs else None,
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -288,6 +291,26 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
|
||||
return images if images else None
|
||||
|
||||
@staticmethod
def _extract_audio_refs(msg: dict) -> Optional[List[str]]:
    """
    Collect voice/audio file references from a Telegram message.

    :param msg: Telegram message payload as a dict
    :return: list of ``tg://voice_file_id/<id>`` and/or
             ``tg://audio_file_id/<id>`` references (voice first), or None
             when the message carries neither
    """
    # (message key, reference prefix) pairs, in the order they are emitted.
    sources = (
        ("voice", "tg://voice_file_id/"),
        ("audio", "tg://audio_file_id/"),
    )
    refs = []
    for key, prefix in sources:
        payload = msg.get(key)
        if not payload:
            continue
        file_id = payload.get("file_id")
        if file_id:
            refs.append(f"{prefix}{file_id}")
    return refs or None
|
||||
|
||||
@staticmethod
|
||||
def _embed_entity_links(text: str, entities: Optional[List[dict]]) -> str:
|
||||
"""
|
||||
@@ -389,17 +412,25 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
return
|
||||
client: Telegram = self.get_instance(conf.name)
|
||||
if client:
|
||||
client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
buttons=message.buttons,
|
||||
original_message_id=message.original_message_id,
|
||||
original_chat_id=message.original_chat_id,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if message.voice_path:
|
||||
client.send_voice(
|
||||
voice_path=message.voice_path,
|
||||
userid=userid,
|
||||
caption=message.voice_caption,
|
||||
original_chat_id=message.original_chat_id,
|
||||
)
|
||||
else:
|
||||
client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
buttons=message.buttons,
|
||||
original_message_id=message.original_message_id,
|
||||
original_chat_id=message.original_chat_id,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
|
||||
def post_medias_message(
|
||||
self, message: Notification, medias: List[MediaInfo]
|
||||
@@ -531,14 +562,22 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
return None
|
||||
client: Telegram = self.get_instance(conf.name)
|
||||
if client:
|
||||
result = client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if message.voice_path:
|
||||
result = client.send_voice(
|
||||
voice_path=message.voice_path,
|
||||
userid=userid,
|
||||
caption=message.voice_caption,
|
||||
original_chat_id=message.original_chat_id,
|
||||
)
|
||||
else:
|
||||
result = client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if result and result.get("success"):
|
||||
return MessageResponse(
|
||||
message_id=result.get("message_id"),
|
||||
@@ -601,7 +640,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
)
|
||||
client.register_commands(filtered_scoped_commands)
|
||||
|
||||
def download_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
|
||||
def download_telegram_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
|
||||
"""
|
||||
下载Telegram文件并转为base64
|
||||
:param file_id: Telegram文件ID
|
||||
@@ -620,3 +659,15 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
|
||||
return base64.b64encode(file_content).decode()
|
||||
return None
|
||||
|
||||
def download_telegram_file_bytes(self, file_id: str, source: str) -> Optional[bytes]:
    """
    Download a Telegram file and return its raw content.

    :param file_id: Telegram file ID, handed to the client's ``download_file``
    :param source: identifier used to look up the configuration via ``get_config``
    :return: the result of the client's ``download_file``, or None when no
             configuration or no client instance is found for ``source``
    """
    conf = self.get_config(source)
    # A missing configuration short-circuits the instance lookup.
    bot = self.get_instance(conf.name) if conf else None
    return bot.download_file(file_id) if bot else None
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, List, Dict, Callable, Union
|
||||
from urllib.parse import urljoin, quote
|
||||
|
||||
@@ -461,6 +462,51 @@ class Telegram:
|
||||
self._stop_typing_task(chat_id)
|
||||
return {"success": False}
|
||||
|
||||
def send_voice(
    self,
    voice_path: str,
    userid: Optional[str] = None,
    caption: Optional[str] = None,
    original_chat_id: Optional[str] = None,
) -> Optional[dict]:
    """
    Send a Telegram voice message from a local audio file.

    Once a send has been attempted the file at ``voice_path`` is deleted,
    whether or not the send succeeded.

    :param voice_path: path of the local audio file to upload
    :param userid: target user ID, passed to ``_determine_target_chat_id``
    :param caption: optional caption; passed through ``standardize`` and sent
                    with ``parse_mode="MarkdownV2"``
    :param original_chat_id: originating chat ID, passed to
                             ``_determine_target_chat_id``
    :return: None when the bot is unavailable or ``voice_path`` is empty;
             otherwise a dict with ``success`` and, when the sent message
             exposes a ``message_id``, also ``message_id`` and ``chat_id``
    """
    if not self._bot or not voice_path:
        return None

    chat_id = self._determine_target_chat_id(userid, original_chat_id)
    voice_file = Path(voice_path)
    # is_file() also rejects directories, which exists() would let through
    # only to fail later inside open().
    if not voice_file.is_file():
        logger.error(f"语音文件不存在: {voice_file}")
        # Every other exit past this point stops the typing indicator;
        # do the same here so it is not left running after a failed send.
        self._stop_typing_task(chat_id)
        return {"success": False}

    try:
        with voice_file.open("rb") as fp:
            sent = self._bot.send_voice(
                chat_id=chat_id,
                voice=fp,
                caption=standardize(caption) if caption else None,
                parse_mode="MarkdownV2" if caption else None,
            )
        self._stop_typing_task(chat_id)
        if sent and hasattr(sent, "message_id"):
            return {
                "success": True,
                "message_id": sent.message_id,
                "chat_id": sent.chat.id if hasattr(sent, "chat") else chat_id,
            }
        return {"success": bool(sent)}
    except Exception as err:
        logger.error(f"发送语音消息失败:{err}")
        self._stop_typing_task(chat_id)
        return {"success": False}
    finally:
        # Best-effort removal of the audio file; a cleanup failure must not
        # mask the send result.
        try:
            voice_file.unlink(missing_ok=True)
        except Exception as cleanup_err:
            logger.debug(f"清理语音临时文件失败: {cleanup_err}")
|
||||
|
||||
def _determine_target_chat_id(
|
||||
self, userid: Optional[str] = None, original_chat_id: Optional[str] = None
|
||||
) -> str:
|
||||
|
||||
Reference in New Issue
Block a user