mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-02 14:11:07 +08:00
feat(agent): add voice message support with TTS/STT for Telegram and WeChat
- Integrate voice message handling: detect and extract audio references from Telegram and WeChat messages, and route them to the agent with a voice-reply preference.
- Add a voice provider abstraction and an OpenAI-based TTS/STT implementation.
- Implement the agent tool `send_voice_message` for generating and sending voice replies, with a fallback to text if voice is unavailable.
- Extend the agent prompt and context to support voice reply instructions.
- Update the notification and message schemas to support audio fields.
- Add Telegram and WeChat voice sending logic, including audio file conversion and temporary media upload for WeChat.
- Add tests for the voice helper and agent voice routing.
This commit is contained in:
@@ -297,7 +297,7 @@ class SlackModule(_ModuleBase, _MessageBase[Slack]):
|
||||
images.append(url)
|
||||
return images if images else None
|
||||
|
||||
def download_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
|
||||
def download_slack_file_to_data_url(self, file_url: str, source: str) -> Optional[str]:
|
||||
"""
|
||||
下载Slack文件并转为data URL
|
||||
:param file_url: Slack私有文件URL
|
||||
|
||||
@@ -214,17 +214,19 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
text = self._append_reply_markup_links(text, msg.get("reply_markup"))
|
||||
|
||||
images = self._extract_images(msg)
|
||||
audio_refs = self._extract_audio_refs(msg)
|
||||
|
||||
if user_id:
|
||||
if not text and not images:
|
||||
if not text and not images and not audio_refs:
|
||||
logger.debug(
|
||||
f"收到来自 {client_config.name} 的Telegram消息无文本和图片"
|
||||
f"收到来自 {client_config.name} 的Telegram消息无文本、图片和语音"
|
||||
)
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
f"收到来自 {client_config.name} 的Telegram消息:"
|
||||
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, images={len(images) if images else 0}"
|
||||
f"userid={user_id}, username={user_name}, chat_id={chat_id}, text={text}, "
|
||||
f"images={len(images) if images else 0}, audios={len(audio_refs) if audio_refs else 0}"
|
||||
)
|
||||
|
||||
cleaned_text = (
|
||||
@@ -263,6 +265,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
text=cleaned_text,
|
||||
chat_id=str(chat_id) if chat_id else None,
|
||||
images=images if images else None,
|
||||
audio_refs=audio_refs if audio_refs else None,
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -288,6 +291,26 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
|
||||
return images if images else None
|
||||
|
||||
@staticmethod
def _extract_audio_refs(msg: dict) -> Optional[List[str]]:
    """
    Collect the voice/audio file_id references carried by a Telegram message.

    :param msg: Telegram message payload as a dict
    :return: list of ``tg://...`` reference strings (voice first, then audio),
             or None when the message carries neither
    """
    # (payload key, reference prefix) pairs, in the order references are emitted
    ref_sources = (
        ("voice", "tg://voice_file_id/"),
        ("audio", "tg://audio_file_id/"),
    )
    collected = []
    for payload_key, ref_prefix in ref_sources:
        payload = msg.get(payload_key)
        if not payload:
            continue
        file_id = payload.get("file_id")
        if file_id:
            collected.append(f"{ref_prefix}{file_id}")
    return collected or None
|
||||
|
||||
@staticmethod
|
||||
def _embed_entity_links(text: str, entities: Optional[List[dict]]) -> str:
|
||||
"""
|
||||
@@ -389,17 +412,25 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
return
|
||||
client: Telegram = self.get_instance(conf.name)
|
||||
if client:
|
||||
client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
buttons=message.buttons,
|
||||
original_message_id=message.original_message_id,
|
||||
original_chat_id=message.original_chat_id,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if message.voice_path:
|
||||
client.send_voice(
|
||||
voice_path=message.voice_path,
|
||||
userid=userid,
|
||||
caption=message.voice_caption,
|
||||
original_chat_id=message.original_chat_id,
|
||||
)
|
||||
else:
|
||||
client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
buttons=message.buttons,
|
||||
original_message_id=message.original_message_id,
|
||||
original_chat_id=message.original_chat_id,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
|
||||
def post_medias_message(
|
||||
self, message: Notification, medias: List[MediaInfo]
|
||||
@@ -531,14 +562,22 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
return None
|
||||
client: Telegram = self.get_instance(conf.name)
|
||||
if client:
|
||||
result = client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if message.voice_path:
|
||||
result = client.send_voice(
|
||||
voice_path=message.voice_path,
|
||||
userid=userid,
|
||||
caption=message.voice_caption,
|
||||
original_chat_id=message.original_chat_id,
|
||||
)
|
||||
else:
|
||||
result = client.send_msg(
|
||||
title=message.title,
|
||||
text=message.text,
|
||||
image=message.image,
|
||||
userid=userid,
|
||||
link=message.link,
|
||||
disable_web_page_preview=message.disable_web_page_preview,
|
||||
)
|
||||
if result and result.get("success"):
|
||||
return MessageResponse(
|
||||
message_id=result.get("message_id"),
|
||||
@@ -601,7 +640,7 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
)
|
||||
client.register_commands(filtered_scoped_commands)
|
||||
|
||||
def download_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
|
||||
def download_telegram_file_to_base64(self, file_id: str, source: str) -> Optional[str]:
|
||||
"""
|
||||
下载Telegram文件并转为base64
|
||||
:param file_id: Telegram文件ID
|
||||
@@ -620,3 +659,15 @@ class TelegramModule(_ModuleBase, _MessageBase[Telegram]):
|
||||
|
||||
return base64.b64encode(file_content).decode()
|
||||
return None
|
||||
|
||||
def download_telegram_file_bytes(self, file_id: str, source: str) -> Optional[bytes]:
    """
    Download a Telegram file and return its raw bytes.

    :param file_id: Telegram file ID
    :param source: name used to look up the client configuration
    :return: the result of the client's ``download_file`` call, or None when
             no configuration or client instance is available
    """
    config = self.get_config(source)
    # Only resolve the client once a configuration has actually been found
    client = self.get_instance(config.name) if config else None
    return client.download_file(file_id) if client else None
|
||||
|
||||
@@ -3,6 +3,7 @@ import json
|
||||
import re
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Any, Optional, List, Dict, Callable, Union
|
||||
from urllib.parse import urljoin, quote
|
||||
|
||||
@@ -461,6 +462,51 @@ class Telegram:
|
||||
self._stop_typing_task(chat_id)
|
||||
return {"success": False}
|
||||
|
||||
def send_voice(
    self,
    voice_path: str,
    userid: Optional[str] = None,
    caption: Optional[str] = None,
    original_chat_id: Optional[str] = None,
) -> Optional[dict]:
    """
    Send a Telegram voice message.

    The voice file is treated as temporary: once it has been found on disk it
    is deleted after the send attempt, whether or not the send succeeded.

    :param voice_path: path of the voice file to upload
    :param userid: target user id, forwarded to ``_determine_target_chat_id``
    :param caption: optional caption; passed through ``standardize`` and sent
                    with the MarkdownV2 parse mode
    :param original_chat_id: chat id of the originating message, forwarded to
                             ``_determine_target_chat_id``
    :return: None when the bot is not initialised or ``voice_path`` is empty;
             otherwise a dict with ``success`` and, when the sent message
             exposes them, ``message_id`` and ``chat_id``
    """
    if not self._bot or not voice_path:
        return None

    chat_id = self._determine_target_chat_id(userid, original_chat_id)
    voice_file = Path(voice_path)
    if not voice_file.exists():
        logger.error(f"语音文件不存在: {voice_file}")
        # Every other exit after the chat is resolved stops the typing task;
        # do the same here so a missing file does not leave it running.
        self._stop_typing_task(chat_id)
        return {"success": False}

    try:
        # Close the file handle before the cleanup in ``finally`` removes it
        with voice_file.open("rb") as fp:
            sent = self._bot.send_voice(
                chat_id=chat_id,
                voice=fp,
                caption=standardize(caption) if caption else None,
                parse_mode="MarkdownV2" if caption else None,
            )
        self._stop_typing_task(chat_id)
        if sent and hasattr(sent, "message_id"):
            return {
                "success": True,
                "message_id": sent.message_id,
                "chat_id": sent.chat.id if hasattr(sent, "chat") else chat_id,
            }
        return {"success": bool(sent)}
    except Exception as err:
        logger.error(f"发送语音消息失败:{err}")
        self._stop_typing_task(chat_id)
        return {"success": False}
    finally:
        # Best-effort removal of the temporary voice file; a cleanup failure
        # must not mask the send result.
        try:
            voice_file.unlink(missing_ok=True)
        except Exception as cleanup_err:
            logger.debug(f"清理语音临时文件失败: {cleanup_err}")
|
||||
|
||||
def _determine_target_chat_id(
|
||||
self, userid: Optional[str] = None, original_chat_id: Optional[str] = None
|
||||
) -> str:
|
||||
|
||||
@@ -167,6 +167,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
# 解析消息内容
|
||||
content = None
|
||||
images = None
|
||||
audio_refs = None
|
||||
if msg_type == "event" and event == "click":
|
||||
# 校验用户有权限执行交互命令
|
||||
if client_config.config.get('WECHAT_ADMINS'):
|
||||
@@ -192,14 +193,24 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
logger.info(
|
||||
f"收到来自 {client_config.name} 的微信图片消息:userid={user_id}, images={len(images) if images else 0}"
|
||||
)
|
||||
elif msg_type == "voice":
|
||||
media_id = DomUtils.tag_value(root_node, "MediaId")
|
||||
recognition = DomUtils.tag_value(root_node, "Recognition", default="")
|
||||
content = (recognition or "").strip()
|
||||
if media_id:
|
||||
audio_refs = [f"wxwork://voice_media_id/{media_id}"]
|
||||
logger.info(
|
||||
f"收到来自 {client_config.name} 的微信语音消息:userid={user_id}, "
|
||||
f"text={content}, audios={len(audio_refs) if audio_refs else 0}"
|
||||
)
|
||||
else:
|
||||
return None
|
||||
|
||||
if content or images:
|
||||
if content or images or audio_refs:
|
||||
# 处理消息内容
|
||||
return CommingMessage(channel=MessageChannel.Wechat, source=client_config.name,
|
||||
userid=user_id, username=user_id, text=content or "",
|
||||
images=images)
|
||||
images=images, audio_refs=audio_refs)
|
||||
except Exception as err:
|
||||
logger.error(f"微信消息处理发生错误:{str(err)}")
|
||||
return None
|
||||
@@ -230,6 +241,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
|
||||
text = WeChatBot._extract_text_from_body(payload_body)
|
||||
images = WeChatBot._extract_images_from_body(payload_body)
|
||||
audio_refs = ["wxbot://voice"] if payload_body.get("msgtype") == "voice" else None
|
||||
if text:
|
||||
text = re.sub(r"@\S+", "", text).strip()
|
||||
|
||||
@@ -245,7 +257,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
client.send_msg(title="只有管理员才有权限执行此命令", userid=sender)
|
||||
return None
|
||||
|
||||
if not text and not images:
|
||||
if not text and not images and not audio_refs:
|
||||
return None
|
||||
|
||||
logger.info(
|
||||
@@ -259,6 +271,7 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
username=sender,
|
||||
text=text or "",
|
||||
images=images,
|
||||
audio_refs=audio_refs,
|
||||
)
|
||||
|
||||
def post_message(self, message: Notification, **kwargs) -> None:
|
||||
@@ -279,8 +292,17 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
return
|
||||
client: WeChat = self.get_instance(conf.name)
|
||||
if client:
|
||||
client.send_msg(title=message.title, text=message.text,
|
||||
image=message.image, userid=userid, link=message.link)
|
||||
if message.voice_path and hasattr(client, "send_voice"):
|
||||
sent = client.send_voice(
|
||||
voice_path=message.voice_path,
|
||||
userid=userid,
|
||||
)
|
||||
if not sent:
|
||||
client.send_msg(title=message.title, text=message.text,
|
||||
image=message.image, userid=userid, link=message.link)
|
||||
else:
|
||||
client.send_msg(title=message.title, text=message.text,
|
||||
image=message.image, userid=userid, link=message.link)
|
||||
|
||||
def download_wechat_image_to_data_url(self, image_ref: str, source: str) -> Optional[str]:
|
||||
"""
|
||||
@@ -301,6 +323,23 @@ class WechatModule(_ModuleBase, _MessageBase[WeChat]):
|
||||
return client.download_image_to_data_url(image_ref)
|
||||
return None
|
||||
|
||||
def download_wechat_media_bytes(self, media_ref: str, source: str) -> Optional[bytes]:
    """
    Download a WeCom voice media item and return its raw bytes.

    :param media_ref: media reference; only ``wxwork://voice_media_id/<id>``
                      references are handled
    :param source: name used to look up the client configuration
    :return: the result of the client's ``download_media_bytes`` call, or None
             when the reference, configuration or client is unusable
    """
    voice_prefix = "wxwork://voice_media_id/"
    if not media_ref:
        return None
    client_config = self.get_config(source)
    if not client_config:
        return None
    client = self.get_instance(client_config.name)
    # Not every client type offers media download
    if not (client and hasattr(client, "download_media_bytes")):
        return None
    if not media_ref.startswith(voice_prefix):
        return None
    # Strip the leading scheme prefix to obtain the bare media id
    return client.download_media_bytes(media_ref[len(voice_prefix):])
|
||||
|
||||
def post_medias_message(self, message: Notification, medias: List[MediaInfo]) -> None:
|
||||
"""
|
||||
发送媒体信息选择列表
|
||||
|
||||
@@ -2,7 +2,9 @@ import json
|
||||
import re
|
||||
import threading
|
||||
import base64
|
||||
import subprocess
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, Dict
|
||||
|
||||
from app.core.context import MediaInfo, Context
|
||||
@@ -46,6 +48,8 @@ class WeChat:
|
||||
_delete_menu_url = "cgi-bin/menu/delete?access_token={access_token}&agentid={agentid}"
|
||||
# 企业微信下载媒体URL
|
||||
_download_media_url = "cgi-bin/media/get?access_token={access_token}&media_id={media_id}"
|
||||
# 企业微信上传临时素材URL
|
||||
_upload_media_url = "cgi-bin/media/upload?access_token={access_token}&type={media_type}"
|
||||
|
||||
def __init__(self, WECHAT_CORPID: Optional[str] = None, WECHAT_APP_SECRET: Optional[str] = None,
|
||||
WECHAT_APP_ID: Optional[str] = None, WECHAT_PROXY: Optional[str] = None, **kwargs):
|
||||
@@ -66,6 +70,7 @@ class WeChat:
|
||||
self._create_menu_url = UrlUtils.adapt_request_url(self._proxy, self._create_menu_url)
|
||||
self._delete_menu_url = UrlUtils.adapt_request_url(self._proxy, self._delete_menu_url)
|
||||
self._download_media_url = UrlUtils.adapt_request_url(self._proxy, self._download_media_url)
|
||||
self._upload_media_url = UrlUtils.adapt_request_url(self._proxy, self._upload_media_url)
|
||||
|
||||
if self._corpid and self._appsecret and self._appid:
|
||||
self.__get_access_token()
|
||||
@@ -323,6 +328,168 @@ class WeChat:
|
||||
mime_type = self._guess_mime_type(res.content, content_type or "image/jpeg")
|
||||
return f"data:{mime_type};base64,{base64.b64encode(res.content).decode()}"
|
||||
|
||||
def download_media_bytes(self, media_id: str) -> Optional[bytes]:
    """
    Download a WeCom media file and return its raw bytes.

    :param media_id: id of the media item to fetch
    :return: the response body, or None when the id is empty, no access token
             is available, the request fails, the body is empty, or the
             response is JSON instead of media content
    """
    if not media_id:
        return None

    token = self.__get_access_token()
    if not token:
        logger.error("下载企业微信媒体失败:access_token 获取失败")
        return None

    media_url = self._download_media_url.format(access_token=token, media_id=media_id)
    try:
        res = RequestUtils(timeout=30).get_res(media_url)
    except Exception as err:
        logger.error(f"下载企业微信媒体失败:{err}")
        return None

    if not res or not res.content:
        return None

    # Compare only the bare media type, ignoring parameters such as charset
    raw_type = res.headers.get("Content-Type") or ""
    bare_type = raw_type.split(";")[0].strip()
    if bare_type != "application/json":
        return res.content

    # A JSON body here is treated as an error report rather than media data
    try:
        logger.error(f"企业微信媒体下载失败:{res.json()}")
    except Exception:
        logger.error(f"企业微信媒体下载失败:{res.text}")
    return None
|
||||
|
||||
@staticmethod
def _convert_voice_to_amr(voice_path: str) -> Optional[Path]:
    """
    Convert a voice file to the AMR format required by WeCom (<= 60s).

    ffmpeg is invoked to resample the input to 8 kHz mono and cut it at 60
    seconds. The output is written next to the source file.

    :param voice_path: path of the source audio file
    :return: path of the converted AMR file, or None when the source is
             missing, ffmpeg cannot be run or fails, or the result is larger
             than 2MB
    """
    src_path = Path(voice_path)
    if not src_path.exists():
        logger.error(f"语音文件不存在:{src_path}")
        return None

    if src_path.suffix.lower() == ".amr":
        # The source already carries the .amr suffix: pick a sibling name so
        # ffmpeg is never asked to overwrite its own input file.
        dst_path = src_path.with_name(f"{src_path.stem}.converted.amr")
    else:
        dst_path = src_path.with_suffix(".amr")

    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(src_path),
        "-ar",
        "8000",
        "-ac",
        "1",
        "-t",
        "60",
        str(dst_path),
    ]
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception as err:
        logger.error(f"调用 ffmpeg 转换 AMR 失败:{err}")
        return None

    if result.returncode != 0 or not dst_path.exists():
        logger.error(
            "ffmpeg 转换 AMR 失败: returncode=%s, stderr=%s",
            result.returncode,
            (result.stderr or "").strip()[:500],
        )
        # A failed run may leave a partial output file behind; remove it so it
        # does not accumulate next to the source.
        try:
            dst_path.unlink(missing_ok=True)
        except OSError as cleanup_err:
            logger.debug(f"清理 AMR 临时文件失败: {cleanup_err}")
        return None

    if dst_path.stat().st_size > 2 * 1024 * 1024:
        logger.error("AMR 语音文件超过 2MB,无法发送到企业微信")
        dst_path.unlink(missing_ok=True)
        return None
    return dst_path
|
||||
|
||||
def _upload_temp_media(self, media_path: Path, media_type: str = "voice") -> Optional[str]:
    """
    Upload a file as WeCom temporary media and return its media_id.

    :param media_path: path of the file to upload
    :param media_type: media type placed in the upload URL; "voice" by default
    :return: the ``media_id`` from the response, or None when no access token
             is available, the request fails, the response is falsy or not
             JSON, or ``errcode`` is not 0
    """
    token = self.__get_access_token()
    if not token:
        return None

    upload_url = self._upload_media_url.format(access_token=token, media_type=media_type)
    # Content type attached to the multipart "media" part
    if media_type == "voice":
        part_mime = "voice/amr"
    else:
        part_mime = "application/octet-stream"

    try:
        with media_path.open("rb") as handle:
            response = RequestUtils(timeout=60).request(
                method="post",
                url=upload_url,
                files={"media": (media_path.name, handle, part_mime)},
            )
    except Exception as err:
        logger.error(f"上传企业微信临时素材失败:{err}")
        return None

    if not response:
        return None

    try:
        payload = response.json()
    except Exception as err:
        logger.error(f"解析企业微信临时素材响应失败:{err}")
        return None

    if payload.get("errcode") == 0:
        return payload.get("media_id")
    logger.error(f"上传企业微信临时素材失败:{payload}")
    return None
|
||||
|
||||
def send_voice(self, voice_path: str, userid: Optional[str] = None) -> Optional[bool]:
    """
    Send a WeCom voice message. Only supported in self-built application mode.

    The source file is converted to AMR, uploaded as temporary media and then
    sent as a ``voice`` message. Both the source file and the converted file
    are treated as temporary and removed once conversion has been attempted.

    :param voice_path: path of the source voice file
    :param userid: recipient; falls back to "@all" when empty
    :return: False when ``voice_path`` is empty or conversion/upload/sending
             fails, None when no access token is available, otherwise the
             result of the send request
    """
    if not voice_path:
        return False
    if not self.__get_access_token():
        logger.error("获取微信access_token失败,请检查参数配置")
        return None
    if not userid:
        userid = "@all"

    source_path = Path(voice_path)

    def _discard(*paths: Path) -> None:
        # Best-effort removal; a cleanup failure must not mask the send result
        for path in paths:
            try:
                path.unlink(missing_ok=True)
            except OSError as cleanup_err:
                logger.debug(f"清理语音临时文件失败: {cleanup_err}")

    converted_path = self._convert_voice_to_amr(voice_path)
    if not converted_path:
        # Every later exit path deletes the source file; do the same when the
        # conversion fails so the temporary file is not leaked.
        # NOTE(review): assumes no caller reuses voice_path after a failed
        # send — confirm against callers outside this view.
        _discard(source_path)
        return False

    try:
        media_id = self._upload_temp_media(converted_path, media_type="voice")
        if not media_id:
            return False

        req_json = {
            "touser": userid,
            "msgtype": "voice",
            "agentid": self._appid,
            "voice": {
                "media_id": media_id
            },
            "safe": 0,
            "enable_id_trans": 0,
            "enable_duplicate_check": 0
        }
        return self.__post_request(self._send_msg_url, req_json)
    except Exception as err:
        logger.error(f"发送企业微信语音消息失败:{err}")
        return False
    finally:
        _discard(converted_path, source_path)
|
||||
|
||||
def send_medias_msg(self, medias: List[MediaInfo], userid: Optional[str] = None) -> Optional[bool]:
|
||||
"""
|
||||
发送列表类消息
|
||||
|
||||
Reference in New Issue
Block a user