feat(agent): add voice message support with TTS/STT for Telegram and WeChat

- Integrate voice message handling: detect and extract audio references from Telegram and WeChat messages, and route them to the agent with a voice reply preference.
- Add a voice provider abstraction and an OpenAI-based TTS/STT implementation.
- Implement the agent tool `send_voice_message` for generating and sending voice replies, with fallback to text if voice is unavailable.
- Extend the agent prompt and context to support voice reply instructions.
- Update notification and message schemas to support audio fields.
- Add Telegram and WeChat voice sending logic, including audio file conversion and temporary media upload for WeChat.
- Add tests for the voice helper and agent voice routing.
2026-06-09 17:50:23 +08:00 · 2026-04-12 12:30:02 +08:00
parent 9dababbcfd
commit e5f97cd299
17 changed files with 945 additions and 167 deletions
--- a/app/modules/telegram/init.py
+++ b/app/modules/telegram/init.py
@@ -214,17 +214,19 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
        text = self._append_reply_markup_links(text, msg.get("reply_markup"))

        images = self._extract_images(msg)
+        audio_refs = self._extract_audio_refs(msg)

        if user_id:
-            if not text and not images:
+            if not text and not images and not audio_refs:
                logger.debug(
-                    f"收到来自 {client_config.name} 的Telegram消息无文本和图片"
+                    f"收到来自 {client_config.name} 的Telegram消息无文本、图片和语音"
                )
                return None

            logger.info(
                f"收到来自 {client_config.name} 的Telegram消息："
-                f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, images={len(images) if images else 0}"
+                f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, "
+                f"images={len(images) if images else 0}, audios={len(audio_refs) if audio_refs else 0}"
            )

            cleaned_text = (
@@ -263,6 +265,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                text=cleaned_text,
                chat_id=str(chat_id) if chat_id else None,
                images=images if images else None,
+                audio_refs=audio_refs if audio_refs else None,
            )
        return None

@@ -288,6 +291,26 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):

        return images if images else None

+    @staticmethod
+    def _extract_audio_refs(msg: dict) -> Optional[List[str]]:
+        """
+        Collect the file_id of the message's "voice" and "audio" entries as tg:// reference strings; returns None when neither yields a file_id.
+        """
+        audio_refs = []
+        voice = msg.get("voice")
+        if voice:
+            file_id = voice.get("file_id")
+            if file_id:
+                audio_refs.append(f"tg://voice_file_id/{file_id}")  # "voice" entry -> voice_file_id reference
+
+        audio = msg.get("audio")
+        if audio:
+            file_id = audio.get("file_id")
+            if file_id:
+                audio_refs.append(f"tg://audio_file_id/{file_id}")  # "audio" entry -> audio_file_id reference
+
+        return audio_refs if audio_refs else None
+
    @staticmethod
    def _embed_entity_links(text: str, entities: Optional[List[dict]]) -> str:
        """
@@ -389,17 +412,25 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                    return
            client: Telegram = self.get_instance(conf.name)
            if client:
-                client.send_msg(
-                    title=message.title,
-                    text=message.text,
-                    image=message.image,
-                    userid=userid,
-                    link=message.link,
-                    buttons=message.buttons,
-                    original_message_id=message.original_message_id,
-                    original_chat_id=message.original_chat_id,
-                    disable_web_page_preview=message.disable_web_page_preview,
-                )
+                if message.voice_path:
+                    client.send_voice(
+                        voice_path=message.voice_path,
+                        userid=userid,
+                        caption=message.voice_caption,
+                        original_chat_id=message.original_chat_id,
+                    )
+                else:
+                    client.send_msg(
+                        title=message.title,
+                        text=message.text,
+                        image=message.image,
+                        userid=userid,
+                        link=message.link,
+                        buttons=message.buttons,
+                        original_message_id=message.original_message_id,
+                        original_chat_id=message.original_chat_id,
+                        disable_web_page_preview=message.disable_web_page_preview,
+                    )

    def post_medias_message(
        self, message: Notification, medias: List[MediaInfo]
@@ -531,14 +562,22 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                    return None
            client: Telegram = self.get_instance(conf.name)
            if client:
-                result = client.send_msg(
-                    title=message.title,
-                    text=message.text,
-                    image=message.image,
-                    userid=userid,
-                    link=message.link,
-                    disable_web_page_preview=message.disable_web_page_preview,
-                )
+                if message.voice_path:
+                    result = client.send_voice(
+                        voice_path=message.voice_path,
+                        userid=userid,
+                        caption=message.voice_caption,
+                        original_chat_id=message.original_chat_id,
+                    )
+                else:
+                    result = client.send_msg(
+                        title=message.title,
+                        text=message.text,
+                        image=message.image,
+                        userid=userid,
+                        link=message.link,
+                        disable_web_page_preview=message.disable_web_page_preview,
+                    )
                if result and result.get("success"):
                    return MessageResponse(
                        message_id=result.get("message_id"),
@@ -601,7 +640,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                )
            client.register_commands(filtered_scoped_commands)

-    def download_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
+    def download_telegram_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
        """
        下载Telegram文件并转为base64
        :param file_id: Telegram文件ID
@@ -620,3 +659,15 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):

            return base64.b64encode(file_content).decode()
        return None
+
+    def download_telegram_file_bytes(self, file_id: str, source: str) -> Optional[bytes]:
+        """
+        Download a Telegram file and return its raw bytes; returns None when `source` has no config or no client instance is available for it.
+        """
+        config = self.get_config(source)
+        if not config:
+            return None
+        client = self.get_instance(config.name)
+        if not client:
+            return None
+        return client.download_file(file_id)
--- a/app/modules/telegram/telegram.py
+++ b/app/modules/telegram/telegram.py
@@ -3,6 +3,7 @@ import json
 import re
 import threading
 import time
+from pathlib import Path
 from typing import Any, Optional, List, Dict, Callable, Union
 from urllib.parse import urljoin, quote

@@ -461,6 +462,51 @@ class Telegram:
            self._stop_typing_task(chat_id)
            return {"success": False}

+    def send_voice(
+            self,
+            voice_path: str,
+            userid: Optional[str] = None,
+            caption: Optional[str] = None,
+            original_chat_id: Optional[str] = None,
+    ) -> Optional[dict]:
+        """
+        Send the local file at `voice_path` as a Telegram voice message; returns None when the bot or path is missing, otherwise a dict with "success" (plus "message_id"/"chat_id" when available).
+        """
+        if not self._bot or not voice_path:
+            return None  # NOTE(review): returns before the try/finally below, so no file cleanup happens on this path
+
+        chat_id = self._determine_target_chat_id(userid, original_chat_id)
+        voice_file = Path(voice_path)
+        if not voice_file.exists():
+            logger.error(f"语音文件不存在: {voice_file}")
+            return {"success": False}
+
+        try:
+            with voice_file.open("rb") as fp:
+                sent = self._bot.send_voice(
+                    chat_id=chat_id,
+                    voice=fp,
+                    caption=standardize(caption) if caption else None,
+                    parse_mode="MarkdownV2" if caption else None,  # parse mode is only set when a caption is sent
+                )
+            self._stop_typing_task(chat_id)
+            if sent and hasattr(sent, "message_id"):
+                return {
+                    "success": True,
+                    "message_id": sent.message_id,
+                    "chat_id": sent.chat.id if hasattr(sent, "chat") else chat_id,  # fall back to the computed target chat
+                }
+            return {"success": bool(sent)}
+        except Exception as err:
+            logger.error(f"发送语音消息失败：{err}")
+            self._stop_typing_task(chat_id)
+            return {"success": False}
+        finally:  # runs on success and failure alike: the voice file is deleted after any send attempt
+            try:
+                voice_file.unlink(missing_ok=True)
+            except Exception as cleanup_err:
+                logger.debug(f"清理语音临时文件失败: {cleanup_err}")  # cleanup failure is only logged, never raised
+
    def _determine_target_chat_id(
            self, userid: Optional[str] = None, original_chat_id: Optional[str] = None
    ) -> str: