fix: filter invalid subtitle action links

This commit is contained in:
jxxghp
2026-05-13 17:57:25 +08:00
parent 0959c4ace4
commit fcf6e14ac9
2 changed files with 116 additions and 23 deletions

View File

@@ -1,7 +1,9 @@
import re
import shutil
import time
from pathlib import Path
from typing import Tuple, Union
from typing import List, Optional, Tuple, Union
from urllib.parse import urljoin, urlparse
from lxml import etree
@@ -18,7 +20,6 @@ from app.schemas import TorrentInfo
from app.schemas.file import FileURI
from app.schemas.types import ModuleType, OtherModulesType
from app.utils.http import RequestUtils
from app.utils.string import StringUtils
from app.utils.system import SystemUtils
@@ -27,12 +28,24 @@ class SubtitleModule(_ModuleBase):
字幕下载模块
"""
# 站点详情页字幕下载链接识别XPATH
# 站点详情页字幕下载元素识别XPATH
_SITE_SUBTITLE_XPATH = [
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]/@href',
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a/@href',
'//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]/@href', # 憨憨
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]',
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a',
'//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]', # 憨憨
]
_SUBTITLE_URL_ATTRS = (
"href",
"data-url",
"data-href",
"data-link",
"data-download",
"data-download-url",
)
_SCRIPT_URL_RE = re.compile(
r"""["'](?P<url>(?:https?:)?//[^"']+|/[^"']+|[^"']*(?:download|subtitle|subs?)[^"']*)["']""",
re.IGNORECASE,
)
def init_module(self) -> None:
pass
@@ -71,6 +84,57 @@ class SubtitleModule(_ModuleBase):
def test(self):
pass
@classmethod
def __normalize_subtitle_link(cls, page_url: str, sublink: str) -> Optional[str]:
"""
转换并过滤真实字幕下载链接
"""
if not sublink:
return None
sublink = sublink.strip()
if not sublink or sublink.startswith("#"):
return None
parsed = urlparse(sublink)
if parsed.scheme and parsed.scheme not in ("http", "https"):
return None
if sublink.startswith("//"):
page_scheme = urlparse(page_url).scheme or "https"
sublink = f"{page_scheme}:{sublink}"
else:
sublink = urljoin(page_url, sublink)
parsed = urlparse(sublink)
if parsed.scheme not in ("http", "https") or not parsed.netloc:
return None
return sublink
@classmethod
def _parse_subtitle_links(cls, html, page_url: str) -> List[str]:
"""
从站点详情页中解析字幕下载链接
"""
sublink_list = []
found_links = set()
for xpath in cls._SITE_SUBTITLE_XPATH:
sublink_count = len(sublink_list)
sublink_nodes = html.xpath(xpath)
if sublink_nodes:
for sublink_node in sublink_nodes:
sublinks = [sublink_node.get(attr) for attr in cls._SUBTITLE_URL_ATTRS]
sublinks.extend(
match.group("url")
for match in cls._SCRIPT_URL_RE.finditer(sublink_node.get("onclick") or "")
)
for sublink in sublinks:
sublink = cls.__normalize_subtitle_link(page_url, sublink)
if not sublink or sublink in found_links:
continue
found_links.add(sublink)
sublink_list.append(sublink)
# 已成功匹配字幕区域后续xpath可以忽略
if len(sublink_list) > sublink_count:
break
return sublink_list
def _get_subtitle_links(self, torrent: TorrentInfo):
"""
获取字幕链接
@@ -97,23 +161,7 @@ class SubtitleModule(_ModuleBase):
return []
html = etree.HTML(res.text)
try:
sublink_list = []
for xpath in self._SITE_SUBTITLE_XPATH:
sublinks = html.xpath(xpath)
if sublinks:
for sublink in sublinks:
if not sublink:
continue
if not sublink.startswith("http"):
base_url = StringUtils.get_base_url(torrent.page_url)
if sublink.startswith("/"):
sublink = "%s%s" % (base_url, sublink)
else:
sublink = "%s/%s" % (base_url, sublink)
sublink_list.append(sublink)
# 已成功获取了链接后续xpath可以忽略
break
return sublink_list
return self._parse_subtitle_links(html, torrent.page_url)
finally:
if html is not None:
del html