feat(agent): add voice message support with TTS/STT for Telegram and WeChat

- Integrate voice message handling: detect and extract audio references from Telegram and WeChat messages, and route them to the agent with a voice-reply preference.
- Add a voice provider abstraction and an OpenAI-based TTS/STT implementation.
- Implement the agent tool `send_voice_message` for generating and sending voice replies, falling back to text if voice is unavailable.
- Extend the agent prompt and context to support voice-reply instructions.
- Update the notification and message schemas to support audio fields.
- Add Telegram and WeChat voice-sending logic, including audio file conversion and temporary media upload for WeChat.
- Add tests for the voice helper and agent voice routing.
2026-07-21 12:41:59 +08:00 · 2026-04-12 12:30:02 +08:00
parent 9dababbcfd
commit e5f97cd299
17 changed files with 945 additions and 167 deletions
--- a/app/modules/slack/init.py
+++ b/app/modules/slack/init.py
@@ -297,7 +297,7 @@ class SlackModule(_ModuleBase, _MessageBase[Slack]):
                    images.append(url)
        return images if images else None

-    def download_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
+    def download_slack_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
        """
        下载Slack文件并转为data URL
        :param file_url: Slack私有文件URL
--- a/app/modules/telegram/init.py
+++ b/app/modules/telegram/init.py
@@ -214,17 +214,19 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
        text = self._append_reply_markup_links(text, msg.get("reply_markup"))

        images = self._extract_images(msg)
+        audio_refs = self._extract_audio_refs(msg)

        if user_id:
-            if not text and not images:
+            if not text and not images and not audio_refs:
                logger.debug(
-                    f"收到来自 {client_config.name} 的Telegram消息无文本和图片"
+                    f"收到来自 {client_config.name} 的Telegram消息无文本、图片和语音"
                )
                return None

            logger.info(
                f"收到来自 {client_config.name} 的Telegram消息："
-                f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, images={len(images) if images else 0}"
+                f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, "
+                f"images={len(images) if images else 0}, audios={len(audio_refs) if audio_refs else 0}"
            )

            cleaned_text = (
@@ -263,6 +265,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                text=cleaned_text,
                chat_id=str(chat_id) if chat_id else None,
                images=images if images else None,
+                audio_refs=audio_refs if audio_refs else None,
            )
        return None

@@ -288,6 +291,26 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):

        return images if images else None

+    @staticmethod
+    def _extract_audio_refs(msg: dict) -> Optional[List[str]]:
+        """
+        从Telegram消息中提取语音/音频 file_id。
+        """
+        audio_refs = []
+        voice = msg.get("voice")
+        if voice:
+            file_id = voice.get("file_id")
+            if file_id:
+                audio_refs.append(f"tg://voice_file_id/{file_id}")
+
+        audio = msg.get("audio")
+        if audio:
+            file_id = audio.get("file_id")
+            if file_id:
+                audio_refs.append(f"tg://audio_file_id/{file_id}")
+
+        return audio_refs if audio_refs else None
+
    @staticmethod
    def _embed_entity_links(text: str, entities: Optional[List[dict]]) -> str:
        """
@@ -389,17 +412,25 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                    return
            client: Telegram = self.get_instance(conf.name)
            if client:
-                client.send_msg(
-                    title=message.title,
-                    text=message.text,
-                    image=message.image,
-                    userid=userid,
-                    link=message.link,
-                    buttons=message.buttons,
-                    original_message_id=message.original_message_id,
-                    original_chat_id=message.original_chat_id,
-                    disable_web_page_preview=message.disable_web_page_preview,
-                )
+                if message.voice_path:
+                    client.send_voice(
+                        voice_path=message.voice_path,
+                        userid=userid,
+                        caption=message.voice_caption,
+                        original_chat_id=message.original_chat_id,
+                    )
+                else:
+                    client.send_msg(
+                        title=message.title,
+                        text=message.text,
+                        image=message.image,
+                        userid=userid,
+                        link=message.link,
+                        buttons=message.buttons,
+                        original_message_id=message.original_message_id,
+                        original_chat_id=message.original_chat_id,
+                        disable_web_page_preview=message.disable_web_page_preview,
+                    )

    def post_medias_message(
        self, message: Notification, medias: List[MediaInfo]
@@ -531,14 +562,22 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                    return None
            client: Telegram = self.get_instance(conf.name)
            if client:
-                result = client.send_msg(
-                    title=message.title,
-                    text=message.text,
-                    image=message.image,
-                    userid=userid,
-                    link=message.link,
-                    disable_web_page_preview=message.disable_web_page_preview,
-                )
+                if message.voice_path:
+                    result = client.send_voice(
+                        voice_path=message.voice_path,
+                        userid=userid,
+                        caption=message.voice_caption,
+                        original_chat_id=message.original_chat_id,
+                    )
+                else:
+                    result = client.send_msg(
+                        title=message.title,
+                        text=message.text,
+                        image=message.image,
+                        userid=userid,
+                        link=message.link,
+                        disable_web_page_preview=message.disable_web_page_preview,
+                    )
                if result and result.get("success"):
                    return MessageResponse(
                        message_id=result.get("message_id"),
@@ -601,7 +640,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
                )
            client.register_commands(filtered_scoped_commands)

-    def download_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
+    def download_telegram_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
        """
        下载Telegram文件并转为base64
        :param file_id: Telegram文件ID
@@ -620,3 +659,15 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):

            return base64.b64encode(file_content).decode()
        return None
+
+    def download_telegram_file_bytes(self, file_id: str, source: str) -> Optional[bytes]:
+        """
+        下载Telegram文件并返回原始字节。
+        """
+        config = self.get_config(source)
+        if not config:
+            return None
+        client = self.get_instance(config.name)
+        if not client:
+            return None
+        return client.download_file(file_id)
--- a/app/modules/telegram/telegram.py
+++ b/app/modules/telegram/telegram.py
@@ -3,6 +3,7 @@ import json
 import re
 import threading
 import time
+from pathlib import Path
 from typing import Any, Optional, List, Dict, Callable, Union
 from urllib.parse import urljoin, quote

@@ -461,6 +462,51 @@ class Telegram:
            self._stop_typing_task(chat_id)
            return {"success": False}

+    def send_voice(
+            self,
+            voice_path: str,
+            userid: Optional[str] = None,
+            caption: Optional[str] = None,
+            original_chat_id: Optional[str] = None,
+    ) -> Optional[dict]:
+        """
+        发送Telegram语音消息。
+        """
+        if not self._bot or not voice_path:
+            return None
+
+        chat_id = self._determine_target_chat_id(userid, original_chat_id)
+        voice_file = Path(voice_path)
+        if not voice_file.exists():
+            logger.error(f"语音文件不存在: {voice_file}")
+            return {"success": False}
+
+        try:
+            with voice_file.open("rb") as fp:
+                sent = self._bot.send_voice(
+                    chat_id=chat_id,
+                    voice=fp,
+                    caption=standardize(caption) if caption else None,
+                    parse_mode="MarkdownV2" if caption else None,
+                )
+            self._stop_typing_task(chat_id)
+            if sent and hasattr(sent, "message_id"):
+                return {
+                    "success": True,
+                    "message_id": sent.message_id,
+                    "chat_id": sent.chat.id if hasattr(sent, "chat") else chat_id,
+                }
+            return {"success": bool(sent)}
+        except Exception as err:
+            logger.error(f"发送语音消息失败：{err}")
+            self._stop_typing_task(chat_id)
+            return {"success": False}
+        finally:
+            try:
+                voice_file.unlink(missing_ok=True)
+            except Exception as cleanup_err:
+                logger.debug(f"清理语音临时文件失败: {cleanup_err}")
+
    def _determine_target_chat_id(
            self, userid: Optional[str] = None, original_chat_id: Optional[str] = None
    ) -> str:
--- a/app/modules/wechat/init.py
+++ b/app/modules/wechat/init.py
@@ -167,6 +167,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
            # 解析消息内容
            content = None
            images = None
+            audio_refs = None
            if msg_type == "event" and event == "click":
                # 校验用户有权限执行交互命令
                if client_config.config.get('WECHAT_ADMINS'):
@@ -192,14 +193,24 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
                logger.info(
                    f"收到来自 {client_config.name} 的微信图片消息：userid={user_id}, images={len(images) if images else 0}"
                )
+            elif msg_type == "voice":
+                media_id = DomUtils.tag_value(root_node, "MediaId")
+                recognition = DomUtils.tag_value(root_node, "Recognition", default="")
+                content = (recognition or "").strip()
+                if media_id:
+                    audio_refs = [f"wxwork://voice_media_id/{media_id}"]
+                logger.info(
+                    f"收到来自 {client_config.name} 的微信语音消息：userid={user_id}, "
+                    f"text={content}, audios={len(audio_refs) if audio_refs else 0}"
+                )
            else:
                return None

-            if content or images:
+            if content or images or audio_refs:
                # 处理消息内容
                return CommingMessage(channel=MessageChannel.Wechat, source=client_config.name,
                                      userid=user_id, username=user_id, text=content or "",
-                                      images=images)
+                                      images=images, audio_refs=audio_refs)
        except Exception as err:
            logger.error(f"微信消息处理发生错误：{str(err)}")
        return None
@@ -230,6 +241,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):

        text = WeChatBot._extract_text_from_body(payload_body)
        images = WeChatBot._extract_images_from_body(payload_body)
+        audio_refs = ["wxbot://voice"] if payload_body.get("msgtype") == "voice" else None
        if text:
            text = re.sub(r"@\S+", "", text).strip()

@@ -245,7 +257,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
                    client.send_msg(title="只有管理员才有权限执行此命令", userid=sender)
                return None

-        if not text and not images:
+        if not text and not images and not audio_refs:
            return None

        logger.info(
@@ -259,6 +271,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
            username=sender,
            text=text or "",
            images=images,
+            audio_refs=audio_refs,
        )

    def post_message(self, message: Notification, **kwargs) -> None:
@@ -279,8 +292,17 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
                    return
            client: WeChat = self.get_instance(conf.name)
            if client:
-                client.send_msg(title=message.title, text=message.text,
-                                image=message.image, userid=userid, link=message.link)
+                if message.voice_path and hasattr(client, "send_voice"):
+                    sent = client.send_voice(
+                        voice_path=message.voice_path,
+                        userid=userid,
+                    )
+                    if not sent:
+                        client.send_msg(title=message.title, text=message.text,
+                                        image=message.image, userid=userid, link=message.link)
+                else:
+                    client.send_msg(title=message.title, text=message.text,
+                                    image=message.image, userid=userid, link=message.link)

    def download_wechat_image_to_data_url(self, image_ref: str, source: str) -> Optional[str]:
        """
@@ -301,6 +323,23 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
            return client.download_image_to_data_url(image_ref)
        return None

+    def download_wechat_media_bytes(self, media_ref: str, source: str) -> Optional[bytes]:
+        """
+        下载企业微信语音媒体并返回原始字节。
+        """
+        if not media_ref:
+            return None
+        client_config = self.get_config(source)
+        if not client_config:
+            return None
+        client = self.get_instance(client_config.name)
+        if not client or not hasattr(client, "download_media_bytes"):
+            return None
+        if media_ref.startswith("wxwork://voice_media_id/"):
+            media_id = media_ref.replace("wxwork://voice_media_id/", "", 1)
+            return client.download_media_bytes(media_id)
+        return None
+
    def post_medias_message(self, message: Notification, medias: List[MediaInfo]) -> None:
        """
        发送媒体信息选择列表
--- a/app/modules/wechat/wechat.py
+++ b/app/modules/wechat/wechat.py
@@ -2,7 +2,9 @@ import json
 import re
 import threading
 import base64
+import subprocess
 from datetime import datetime
+from pathlib import Path
 from typing import Optional, List, Dict

 from app.core.context import MediaInfo, Context
@@ -46,6 +48,8 @@ class WeChat:
    _delete_menu_url = "cgi-bin/menu/delete?access_token={access_token}&agentid={agentid}"
    # 企业微信下载媒体URL
    _download_media_url = "cgi-bin/media/get?access_token={access_token}&media_id={media_id}"
+    # 企业微信上传临时素材URL
+    _upload_media_url = "cgi-bin/media/upload?access_token={access_token}&type={media_type}"

    def __init__(self, WECHAT_CORPID: Optional[str] = None, WECHAT_APP_SECRET: Optional[str] = None,
                 WECHAT_APP_ID: Optional[str] = None, WECHAT_PROXY: Optional[str] = None, **kwargs):
@@ -66,6 +70,7 @@ class WeChat:
            self._create_menu_url = UrlUtils.adapt_request_url(self._proxy, self._create_menu_url)
            self._delete_menu_url = UrlUtils.adapt_request_url(self._proxy, self._delete_menu_url)
            self._download_media_url = UrlUtils.adapt_request_url(self._proxy, self._download_media_url)
+            self._upload_media_url = UrlUtils.adapt_request_url(self._proxy, self._upload_media_url)

        if self._corpid and self._appsecret and self._appid:
            self.__get_access_token()
@@ -323,6 +328,168 @@ class WeChat:
        mime_type = self._guess_mime_type(res.content, content_type or "image/jpeg")
        return f"data:{mime_type};base64,{base64.b64encode(res.content).decode()}"

+    def download_media_bytes(self, media_id: str) -> Optional[bytes]:
+        """
+        下载企业微信媒体文件并返回原始字节。
+        """
+        if not media_id:
+            return None
+        access_token = self.__get_access_token()
+        if not access_token:
+            logger.error("下载企业微信媒体失败：access_token 获取失败")
+            return None
+        req_url = self._download_media_url.format(
+            access_token=access_token,
+            media_id=media_id,
+        )
+        try:
+            res = RequestUtils(timeout=30).get_res(req_url)
+        except Exception as err:
+            logger.error(f"下载企业微信媒体失败：{err}")
+            return None
+        if not res or not res.content:
+            return None
+        content_type = (res.headers.get("Content-Type") or "").split(";")[0].strip()
+        if content_type == "application/json":
+            try:
+                logger.error(f"企业微信媒体下载失败：{res.json()}")
+            except Exception:
+                logger.error(f"企业微信媒体下载失败：{res.text}")
+            return None
+        return res.content
+
+    @staticmethod
+    def _convert_voice_to_amr(voice_path: str) -> Optional[Path]:
+        """
+        将语音文件转换为企业微信要求的 AMR 格式（<=60s）。
+        """
+        src_path = Path(voice_path)
+        if not src_path.exists():
+            logger.error(f"语音文件不存在：{src_path}")
+            return None
+
+        dst_path = src_path.with_suffix(".amr")
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-i",
+            str(src_path),
+            "-ar",
+            "8000",
+            "-ac",
+            "1",
+            "-t",
+            "60",
+            str(dst_path),
+        ]
+        try:
+            result = subprocess.run(
+                cmd,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+        except Exception as err:
+            logger.error(f"调用 ffmpeg 转换 AMR 失败：{err}")
+            return None
+
+        if result.returncode != 0 or not dst_path.exists():
+            logger.error(
+                "ffmpeg 转换 AMR 失败: returncode=%s, stderr=%s",
+                result.returncode,
+                (result.stderr or "").strip()[:500],
+            )
+            return None
+
+        if dst_path.stat().st_size > 2 * 1024 * 1024:
+            logger.error("AMR 语音文件超过 2MB，无法发送到企业微信")
+            dst_path.unlink(missing_ok=True)
+            return None
+        return dst_path
+
+    def _upload_temp_media(self, media_path: Path, media_type: str = "voice") -> Optional[str]:
+        """
+        上传企业微信临时素材，返回 media_id。
+        """
+        access_token = self.__get_access_token()
+        if not access_token:
+            return None
+        req_url = self._upload_media_url.format(
+            access_token=access_token,
+            media_type=media_type,
+        )
+        try:
+            with media_path.open("rb") as media_file:
+                response = RequestUtils(timeout=60).request(
+                    method="post",
+                    url=req_url,
+                    files={
+                        "media": (
+                            media_path.name,
+                            media_file,
+                            "voice/amr" if media_type == "voice" else "application/octet-stream",
+                        )
+                    },
+                )
+        except Exception as err:
+            logger.error(f"上传企业微信临时素材失败：{err}")
+            return None
+
+        if not response:
+            return None
+
+        try:
+            ret_json = response.json()
+        except Exception as err:
+            logger.error(f"解析企业微信临时素材响应失败：{err}")
+            return None
+
+        if ret_json.get("errcode") != 0:
+            logger.error(f"上传企业微信临时素材失败：{ret_json}")
+            return None
+        return ret_json.get("media_id")
+
+    def send_voice(self, voice_path: str, userid: Optional[str] = None) -> Optional[bool]:
+        """
+        发送企业微信语音消息。仅自建应用模式支持。
+        """
+        if not voice_path:
+            return False
+        if not self.__get_access_token():
+            logger.error("获取微信access_token失败，请检查参数配置")
+            return None
+        if not userid:
+            userid = "@all"
+
+        source_path = Path(voice_path)
+        converted_path = self._convert_voice_to_amr(voice_path)
+        if not converted_path:
+            return False
+
+        try:
+            media_id = self._upload_temp_media(converted_path, media_type="voice")
+            if not media_id:
+                return False
+
+            req_json = {
+                "touser": userid,
+                "msgtype": "voice",
+                "agentid": self._appid,
+                "voice": {
+                    "media_id": media_id
+                },
+                "safe": 0,
+                "enable_id_trans": 0,
+                "enable_duplicate_check": 0
+            }
+            return self.__post_request(self._send_msg_url, req_json)
+        except Exception as err:
+            logger.error(f"发送企业微信语音消息失败：{err}")
+            return False
+        finally:
+            converted_path.unlink(missing_ok=True)
+            source_path.unlink(missing_ok=True)
+
    def send_medias_msg(self, medias: List[MediaInfo], userid: Optional[str] = None) -> Optional[bool]:
        """
        发送列表类消息