mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-01 13:40:54 +08:00
fix: filter invalid subtitle action links
This commit is contained in:
@@ -1,7 +1,9 @@
|
||||
import re
|
||||
import shutil
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
from lxml import etree
|
||||
|
||||
@@ -18,7 +20,6 @@ from app.schemas import TorrentInfo
|
||||
from app.schemas.file import FileURI
|
||||
from app.schemas.types import ModuleType, OtherModulesType
|
||||
from app.utils.http import RequestUtils
|
||||
from app.utils.string import StringUtils
|
||||
from app.utils.system import SystemUtils
|
||||
|
||||
|
||||
@@ -27,12 +28,24 @@ class SubtitleModule(_ModuleBase):
|
||||
字幕下载模块
|
||||
"""
|
||||
|
||||
# 站点详情页字幕下载链接识别XPATH
|
||||
# 站点详情页字幕下载元素识别XPATH
|
||||
_SITE_SUBTITLE_XPATH = [
|
||||
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]/@href',
|
||||
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a/@href',
|
||||
'//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]/@href', # 憨憨
|
||||
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a[not(@class)]',
|
||||
'//td[@class="rowhead"][text()="字幕"]/following-sibling::td//a',
|
||||
'//div[contains(@class, "font-bold")][text()="字幕"]/following-sibling::div[1]//a[not(@class)]', # 憨憨
|
||||
]
|
||||
_SUBTITLE_URL_ATTRS = (
|
||||
"href",
|
||||
"data-url",
|
||||
"data-href",
|
||||
"data-link",
|
||||
"data-download",
|
||||
"data-download-url",
|
||||
)
|
||||
_SCRIPT_URL_RE = re.compile(
|
||||
r"""["'](?P<url>(?:https?:)?//[^"']+|/[^"']+|[^"']*(?:download|subtitle|subs?)[^"']*)["']""",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
def init_module(self) -> None:
|
||||
pass
|
||||
@@ -71,6 +84,57 @@ class SubtitleModule(_ModuleBase):
|
||||
def test(self):
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def __normalize_subtitle_link(cls, page_url: str, sublink: str) -> Optional[str]:
|
||||
"""
|
||||
转换并过滤真实字幕下载链接
|
||||
"""
|
||||
if not sublink:
|
||||
return None
|
||||
sublink = sublink.strip()
|
||||
if not sublink or sublink.startswith("#"):
|
||||
return None
|
||||
parsed = urlparse(sublink)
|
||||
if parsed.scheme and parsed.scheme not in ("http", "https"):
|
||||
return None
|
||||
if sublink.startswith("//"):
|
||||
page_scheme = urlparse(page_url).scheme or "https"
|
||||
sublink = f"{page_scheme}:{sublink}"
|
||||
else:
|
||||
sublink = urljoin(page_url, sublink)
|
||||
parsed = urlparse(sublink)
|
||||
if parsed.scheme not in ("http", "https") or not parsed.netloc:
|
||||
return None
|
||||
return sublink
|
||||
|
||||
@classmethod
|
||||
def _parse_subtitle_links(cls, html, page_url: str) -> List[str]:
|
||||
"""
|
||||
从站点详情页中解析字幕下载链接
|
||||
"""
|
||||
sublink_list = []
|
||||
found_links = set()
|
||||
for xpath in cls._SITE_SUBTITLE_XPATH:
|
||||
sublink_count = len(sublink_list)
|
||||
sublink_nodes = html.xpath(xpath)
|
||||
if sublink_nodes:
|
||||
for sublink_node in sublink_nodes:
|
||||
sublinks = [sublink_node.get(attr) for attr in cls._SUBTITLE_URL_ATTRS]
|
||||
sublinks.extend(
|
||||
match.group("url")
|
||||
for match in cls._SCRIPT_URL_RE.finditer(sublink_node.get("onclick") or "")
|
||||
)
|
||||
for sublink in sublinks:
|
||||
sublink = cls.__normalize_subtitle_link(page_url, sublink)
|
||||
if not sublink or sublink in found_links:
|
||||
continue
|
||||
found_links.add(sublink)
|
||||
sublink_list.append(sublink)
|
||||
# 已成功匹配字幕区域,后续xpath可以忽略
|
||||
if len(sublink_list) > sublink_count:
|
||||
break
|
||||
return sublink_list
|
||||
|
||||
def _get_subtitle_links(self, torrent: TorrentInfo):
|
||||
"""
|
||||
获取字幕链接
|
||||
@@ -97,23 +161,7 @@ class SubtitleModule(_ModuleBase):
|
||||
return []
|
||||
html = etree.HTML(res.text)
|
||||
try:
|
||||
sublink_list = []
|
||||
for xpath in self._SITE_SUBTITLE_XPATH:
|
||||
sublinks = html.xpath(xpath)
|
||||
if sublinks:
|
||||
for sublink in sublinks:
|
||||
if not sublink:
|
||||
continue
|
||||
if not sublink.startswith("http"):
|
||||
base_url = StringUtils.get_base_url(torrent.page_url)
|
||||
if sublink.startswith("/"):
|
||||
sublink = "%s%s" % (base_url, sublink)
|
||||
else:
|
||||
sublink = "%s/%s" % (base_url, sublink)
|
||||
sublink_list.append(sublink)
|
||||
# 已成功获取了链接,后续xpath可以忽略
|
||||
break
|
||||
return sublink_list
|
||||
return self._parse_subtitle_links(html, torrent.page_url)
|
||||
finally:
|
||||
if html is not None:
|
||||
del html
|
||||
|
||||
Reference in New Issue
Block a user