feat(agent): add voice message support with TTS/STT for Telegram and WeChat

- Integrate voice message handling: detect and extract audio references from Telegram and WeChat messages, route to agent with voice reply preference.
- Add voice provider abstraction and OpenAI-based TTS/STT implementation.
- Implement agent tool `send_voice_message` for generating and sending voice replies, with fallback to text if voice is unavailable.
- Extend agent prompt and context to support voice reply instructions.
- Update notification and message schemas to support audio fields.
- Add Telegram and WeChat voice sending logic, including audio file conversion and temporary media upload for WeChat.
- Add tests for voice helper and agent voice routing.
This commit is contained in:
jxxghp
2026-04-12 12:30:02 +08:00
parent 9dababbcfd
commit e5f97cd299
17 changed files with 945 additions and 167 deletions

View File

@@ -297,7 +297,7 @@ class SlackModule(_ModuleBase, _MessageBase[Slack]):
images.append(url)
return images if images else None
def download_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
def download_slack_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
"""
下载Slack文件并转为data URL
:param file_url: Slack私有文件URL

View File

@@ -214,17 +214,19 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
text = self._append_reply_markup_links(text, msg.get("reply_markup"))
images = self._extract_images(msg)
audio_refs = self._extract_audio_refs(msg)
if user_id:
if not text and not images:
if not text and not images and not audio_refs:
logger.debug(
f"收到来自 {client_config.name} 的Telegram消息无文本图片"
f"收到来自 {client_config.name} 的Telegram消息无文本图片和语音"
)
return None
logger.info(
f"收到来自 {client_config.name} 的Telegram消息"
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, images={len(images) if images else 0}"
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, "
f"images={len(images) if images else 0}, audios={len(audio_refs) if audio_refs else 0}"
)
cleaned_text = (
@@ -263,6 +265,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
text=cleaned_text,
chat_id=str(chat_id) if chat_id else None,
images=images if images else None,
audio_refs=audio_refs if audio_refs else None,
)
return None
@@ -288,6 +291,26 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
return images if images else None
@staticmethod
def _extract_audio_refs(msg: dict) -> Optional[List[str]]:
    """
    Collect references to the voice/audio attachments of a Telegram message.

    :param msg: Telegram message payload
    :return: ``tg://voice_file_id/<file_id>`` and/or ``tg://audio_file_id/<file_id>``
        references (voice first), or None when the message carries neither
    """
    refs: List[str] = []
    # The attachment key doubles as the kind segment of the reference.
    for kind in ("voice", "audio"):
        attachment = msg.get(kind)
        file_id = attachment.get("file_id") if attachment else None
        if file_id:
            refs.append(f"tg://{kind}_file_id/{file_id}")
    return refs or None
@staticmethod
def _embed_entity_links(text: str, entities: Optional[List[dict]]) -> str:
"""
@@ -389,17 +412,25 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
return
client: Telegram = self.get_instance(conf.name)
if client:
client.send_msg(
title=message.title,
text=message.text,
image=message.image,
userid=userid,
link=message.link,
buttons=message.buttons,
original_message_id=message.original_message_id,
original_chat_id=message.original_chat_id,
disable_web_page_preview=message.disable_web_page_preview,
)
if message.voice_path:
client.send_voice(
voice_path=message.voice_path,
userid=userid,
caption=message.voice_caption,
original_chat_id=message.original_chat_id,
)
else:
client.send_msg(
title=message.title,
text=message.text,
image=message.image,
userid=userid,
link=message.link,
buttons=message.buttons,
original_message_id=message.original_message_id,
original_chat_id=message.original_chat_id,
disable_web_page_preview=message.disable_web_page_preview,
)
def post_medias_message(
self, message: Notification, medias: List[MediaInfo]
@@ -531,14 +562,22 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
return None
client: Telegram = self.get_instance(conf.name)
if client:
result = client.send_msg(
title=message.title,
text=message.text,
image=message.image,
userid=userid,
link=message.link,
disable_web_page_preview=message.disable_web_page_preview,
)
if message.voice_path:
result = client.send_voice(
voice_path=message.voice_path,
userid=userid,
caption=message.voice_caption,
original_chat_id=message.original_chat_id,
)
else:
result = client.send_msg(
title=message.title,
text=message.text,
image=message.image,
userid=userid,
link=message.link,
disable_web_page_preview=message.disable_web_page_preview,
)
if result and result.get("success"):
return MessageResponse(
message_id=result.get("message_id"),
@@ -601,7 +640,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
)
client.register_commands(filtered_scoped_commands)
def download_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
def download_telegram_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
"""
下载Telegram文件并转为base64
:param file_id: Telegram文件ID
@@ -620,3 +659,15 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
return base64.b64encode(file_content).decode()
return None
def download_telegram_file_bytes(self, file_id: str, source: str) -> Optional[bytes]:
    """
    Download a Telegram file through the client configured for ``source``.

    :param file_id: Telegram file ID
    :param source: message source name used to look up the client configuration
    :return: the result of the client's ``download_file``, or None when no
        configuration or client instance is found
    """
    conf = self.get_config(source)
    client = self.get_instance(conf.name) if conf else None
    return client.download_file(file_id) if client else None

View File

@@ -3,6 +3,7 @@ import json
import re
import threading
import time
from pathlib import Path
from typing import Any, Optional, List, Dict, Callable, Union
from urllib.parse import urljoin, quote
@@ -461,6 +462,51 @@ class Telegram:
self._stop_typing_task(chat_id)
return {"success": False}
def send_voice(
    self,
    voice_path: str,
    userid: Optional[str] = None,
    caption: Optional[str] = None,
    original_chat_id: Optional[str] = None,
) -> Optional[dict]:
    """
    Send a voice message through the Telegram bot.

    Once an upload has been attempted, the file at ``voice_path`` is deleted
    whether or not the send succeeded.

    :param voice_path: path of the audio file to upload
    :param userid: target user ID, passed to ``_determine_target_chat_id``
    :param caption: optional caption; passed through ``standardize`` and sent
        with MarkdownV2 parse mode
    :param original_chat_id: originating chat ID, passed to
        ``_determine_target_chat_id``
    :return: None when the bot is not initialised or no path is given;
        otherwise a dict with ``success`` and, when the bot returns a message
        object, ``message_id`` and ``chat_id``
    """
    if not self._bot or not voice_path:
        return None

    chat_id = self._determine_target_chat_id(userid, original_chat_id)
    voice_file = Path(voice_path)
    if not voice_file.exists():
        logger.error(f"语音文件不存在: {voice_file}")
        # Every other exit after chat_id is resolved stops the typing task;
        # do the same here so it does not keep running when nothing is sent.
        self._stop_typing_task(chat_id)
        return {"success": False}

    try:
        with voice_file.open("rb") as fp:
            sent = self._bot.send_voice(
                chat_id=chat_id,
                voice=fp,
                caption=standardize(caption) if caption else None,
                parse_mode="MarkdownV2" if caption else None,
            )
        self._stop_typing_task(chat_id)
        if sent and hasattr(sent, "message_id"):
            return {
                "success": True,
                "message_id": sent.message_id,
                "chat_id": sent.chat.id if hasattr(sent, "chat") else chat_id,
            }
        return {"success": bool(sent)}
    except Exception as err:
        logger.error(f"发送语音消息失败:{err}")
        self._stop_typing_task(chat_id)
        return {"success": False}
    finally:
        # Best-effort removal of the voice file; a failure here must not
        # replace the send result.
        try:
            voice_file.unlink(missing_ok=True)
        except Exception as cleanup_err:
            logger.debug(f"清理语音临时文件失败: {cleanup_err}")
def _determine_target_chat_id(
self, userid: Optional[str] = None, original_chat_id: Optional[str] = None
) -> str:

View File

@@ -167,6 +167,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
# 解析消息内容
content = None
images = None
audio_refs = None
if msg_type == "event" and event == "click":
# 校验用户有权限执行交互命令
if client_config.config.get('WECHAT_ADMINS'):
@@ -192,14 +193,24 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
logger.info(
f"收到来自 {client_config.name} 的微信图片消息userid={user_id}, images={len(images) if images else 0}"
)
elif msg_type == "voice":
media_id = DomUtils.tag_value(root_node, "MediaId")
recognition = DomUtils.tag_value(root_node, "Recognition", default="")
content = (recognition or "").strip()
if media_id:
audio_refs = [f"wxwork://voice_media_id/{media_id}"]
logger.info(
f"收到来自 {client_config.name} 的微信语音消息userid={user_id}, "
f"text={content}, audios={len(audio_refs) if audio_refs else 0}"
)
else:
return None
if content or images:
if content or images or audio_refs:
# 处理消息内容
return CommingMessage(channel=MessageChannel.Wechat, source=client_config.name,
userid=user_id, username=user_id, text=content or "",
images=images)
images=images, audio_refs=audio_refs)
except Exception as err:
logger.error(f"微信消息处理发生错误:{str(err)}")
return None
@@ -230,6 +241,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
text = WeChatBot._extract_text_from_body(payload_body)
images = WeChatBot._extract_images_from_body(payload_body)
audio_refs = ["wxbot://voice"] if payload_body.get("msgtype") == "voice" else None
if text:
text = re.sub(r"@\S+", "", text).strip()
@@ -245,7 +257,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
client.send_msg(title="只有管理员才有权限执行此命令", userid=sender)
return None
if not text and not images:
if not text and not images and not audio_refs:
return None
logger.info(
@@ -259,6 +271,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
username=sender,
text=text or "",
images=images,
audio_refs=audio_refs,
)
def post_message(self, message: Notification, **kwargs) -> None:
@@ -279,8 +292,17 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
return
client: WeChat = self.get_instance(conf.name)
if client:
client.send_msg(title=message.title, text=message.text,
image=message.image, userid=userid, link=message.link)
if message.voice_path and hasattr(client, "send_voice"):
sent = client.send_voice(
voice_path=message.voice_path,
userid=userid,
)
if not sent:
client.send_msg(title=message.title, text=message.text,
image=message.image, userid=userid, link=message.link)
else:
client.send_msg(title=message.title, text=message.text,
image=message.image, userid=userid, link=message.link)
def download_wechat_image_to_data_url(self, image_ref: str, source: str) -> Optional[str]:
"""
@@ -301,6 +323,23 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
return client.download_image_to_data_url(image_ref)
return None
def download_wechat_media_bytes(self, media_ref: str, source: str) -> Optional[bytes]:
    """
    Download the media behind a WeCom voice reference via the client of ``source``.

    :param media_ref: reference of the form ``wxwork://voice_media_id/<media_id>``
    :param source: message source name used to look up the client configuration
    :return: the result of the client's ``download_media_bytes``; None when the
        reference is empty or has another scheme, or when no configuration or
        capable client is found
    """
    voice_prefix = "wxwork://voice_media_id/"
    if not media_ref:
        return None

    conf = self.get_config(source)
    if not conf:
        return None

    client = self.get_instance(conf.name)
    if client and hasattr(client, "download_media_bytes") and media_ref.startswith(voice_prefix):
        # Everything after the scheme prefix is the media ID.
        return client.download_media_bytes(media_ref[len(voice_prefix):])
    return None
def post_medias_message(self, message: Notification, medias: List[MediaInfo]) -> None:
"""
发送媒体信息选择列表

View File

@@ -2,7 +2,9 @@ import json
import re
import threading
import base64
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict
from app.core.context import MediaInfo, Context
@@ -46,6 +48,8 @@ class WeChat:
_delete_menu_url = "cgi-bin/menu/delete?access_token={access_token}&agentid={agentid}"
# 企业微信下载媒体URL
_download_media_url = "cgi-bin/media/get?access_token={access_token}&media_id={media_id}"
# 企业微信上传临时素材URL
_upload_media_url = "cgi-bin/media/upload?access_token={access_token}&type={media_type}"
def __init__(self, WECHAT_CORPID: Optional[str] = None, WECHAT_APP_SECRET: Optional[str] = None,
WECHAT_APP_ID: Optional[str] = None, WECHAT_PROXY: Optional[str] = None, **kwargs):
@@ -66,6 +70,7 @@ class WeChat:
self._create_menu_url = UrlUtils.adapt_request_url(self._proxy, self._create_menu_url)
self._delete_menu_url = UrlUtils.adapt_request_url(self._proxy, self._delete_menu_url)
self._download_media_url = UrlUtils.adapt_request_url(self._proxy, self._download_media_url)
self._upload_media_url = UrlUtils.adapt_request_url(self._proxy, self._upload_media_url)
if self._corpid and self._appsecret and self._appid:
self.__get_access_token()
@@ -323,6 +328,168 @@ class WeChat:
mime_type = self._guess_mime_type(res.content, content_type or "image/jpeg")
return f"data:{mime_type};base64,{base64.b64encode(res.content).decode()}"
def download_media_bytes(self, media_id: str) -> Optional[bytes]:
    """
    Download a WeCom media file and return its raw bytes.

    :param media_id: media ID to download
    :return: the response body, or None when the ID is empty, no access token
        is available, the request fails or is empty, or the response is JSON
    """
    if not media_id:
        return None

    token = self.__get_access_token()
    if not token:
        logger.error("下载企业微信媒体失败access_token 获取失败")
        return None

    url = self._download_media_url.format(access_token=token, media_id=media_id)
    try:
        resp = RequestUtils(timeout=30).get_res(url)
    except Exception as err:
        logger.error(f"下载企业微信媒体失败:{err}")
        return None
    if not resp or not resp.content:
        return None

    mime = (resp.headers.get("Content-Type") or "").split(";")[0].strip()
    if mime != "application/json":
        return resp.content

    # A JSON body is treated as an error report rather than media content.
    try:
        logger.error(f"企业微信媒体下载失败:{resp.json()}")
    except Exception:
        logger.error(f"企业微信媒体下载失败:{resp.text}")
    return None
@staticmethod
def _convert_voice_to_amr(voice_path: str) -> Optional[Path]:
    """
    Convert an audio file to AMR for WeCom voice messages.

    ffmpeg re-encodes the input to 8000 Hz mono and truncates it to 60 seconds.
    The result is written next to the source file.

    :param voice_path: path of the source audio file
    :return: path of the converted ``.amr`` file, or None when the source is
        missing, ffmpeg cannot be run or fails, or the result exceeds 2 MB
    """
    src_path = Path(voice_path)
    if not src_path.exists():
        logger.error(f"语音文件不存在:{src_path}")
        return None

    dst_path = src_path.with_suffix(".amr")
    if dst_path == src_path:
        # ffmpeg refuses to write onto its own input, so an .amr source
        # needs a distinct output name.
        dst_path = src_path.with_name(f"{src_path.stem}.converted.amr")

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(src_path),
        "-ar",
        "8000",
        "-ac",
        "1",
        "-t",
        "60",
        str(dst_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            # ffmpeg reads stdin for interactive commands by default; detach
            # it so a background service neither blocks nor consumes input.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception as err:
        logger.error(f"调用 ffmpeg 转换 AMR 失败:{err}")
        return None

    if result.returncode != 0 or not dst_path.exists():
        logger.error(
            "ffmpeg 转换 AMR 失败: returncode=%s, stderr=%s",
            result.returncode,
            (result.stderr or "").strip()[:500],
        )
        # Remove any partial output so a failed conversion leaves no file.
        dst_path.unlink(missing_ok=True)
        return None

    if dst_path.stat().st_size > 2 * 1024 * 1024:
        logger.error("AMR 语音文件超过 2MB无法发送到企业微信")
        dst_path.unlink(missing_ok=True)
        return None
    return dst_path
def _upload_temp_media(self, media_path: Path, media_type: str = "voice") -> Optional[str]:
    """
    Upload a file as WeCom temporary media.

    :param media_path: path of the file to upload
    :param media_type: media type placed in the upload URL; ``voice`` is sent
        as ``voice/amr``, anything else as ``application/octet-stream``
    :return: the ``media_id`` from the response, or None when no access token
        is available, the request or JSON parsing fails, or ``errcode`` is not 0
    """
    token = self.__get_access_token()
    if not token:
        return None

    url = self._upload_media_url.format(access_token=token, media_type=media_type)
    mime = "voice/amr" if media_type == "voice" else "application/octet-stream"
    try:
        with media_path.open("rb") as fh:
            resp = RequestUtils(timeout=60).request(
                method="post",
                url=url,
                files={"media": (media_path.name, fh, mime)},
            )
    except Exception as err:
        logger.error(f"上传企业微信临时素材失败:{err}")
        return None
    if not resp:
        return None

    try:
        payload = resp.json()
    except Exception as err:
        logger.error(f"解析企业微信临时素材响应失败:{err}")
        return None

    if payload.get("errcode") == 0:
        return payload.get("media_id")
    logger.error(f"上传企业微信临时素材失败:{payload}")
    return None
def send_voice(self, voice_path: str, userid: Optional[str] = None) -> Optional[bool]:
    """
    Send a voice message through the WeCom application.

    The source file is converted to AMR, uploaded as temporary media and sent
    as a ``voice`` message. Once conversion has been attempted, the source
    file and any converted file are deleted, whatever the outcome.

    :param voice_path: path of the source audio file
    :param userid: recipient; ``@all`` is used when empty
    :return: False when no path is given, conversion or upload fails, or an
        exception occurs; None when no access token is available; otherwise
        the result of the send request
    """
    if not voice_path:
        return False
    if not self.__get_access_token():
        logger.error("获取微信access_token失败请检查参数配置")
        return None
    if not userid:
        userid = "@all"

    source_path = Path(voice_path)
    converted_path: Optional[Path] = None
    try:
        # Convert inside the try so a failed conversion still removes the
        # source file instead of leaking it.
        converted_path = self._convert_voice_to_amr(voice_path)
        if not converted_path:
            return False
        media_id = self._upload_temp_media(converted_path, media_type="voice")
        if not media_id:
            return False
        req_json = {
            "touser": userid,
            "msgtype": "voice",
            "agentid": self._appid,
            "voice": {
                "media_id": media_id
            },
            "safe": 0,
            "enable_id_trans": 0,
            "enable_duplicate_check": 0
        }
        return self.__post_request(self._send_msg_url, req_json)
    except Exception as err:
        logger.error(f"发送企业微信语音消息失败:{err}")
        return False
    finally:
        # Best-effort cleanup: a failed delete must neither replace the send
        # result nor stop the other file from being removed.
        for leftover in (converted_path, source_path):
            if leftover is None:
                continue
            try:
                leftover.unlink(missing_ok=True)
            except OSError as cleanup_err:
                logger.debug(f"清理语音临时文件失败: {cleanup_err}")
def send_medias_msg(self, medias: List[MediaInfo], userid: Optional[str] = None) -> Optional[bool]:
"""
发送列表类消息