diff --git a/app/chain/search.py b/app/chain/search.py
index 0d9d1b44..a7553148 100644
--- a/app/chain/search.py
+++ b/app/chain/search.py
@@ -1480,11 +1480,11 @@ class SearchChain(ChainBase):
@staticmethod
def __build_subtitle_names(subtitle: SubtitleInfo) -> List[str]:
"""
- 提取字幕标题和下载文件名,作为精确匹配的名称候选。
+ Collect the subtitle title, download file name and description as name candidates for exact matching.
"""
+ # dict.fromkeys drops duplicate names while keeping first-seen order.
return list(dict.fromkeys(
name.strip()
- for name in (subtitle.title, subtitle.file_name)
+ for name in (subtitle.title, subtitle.file_name, subtitle.description)
if name and name.strip()
))
diff --git a/app/core/context.py b/app/core/context.py
index 28e08674..44e9a8dd 100644
--- a/app/core/context.py
+++ b/app/core/context.py
@@ -199,6 +199,21 @@ class SubtitleInfo:
# 下载文件名
file_name: str = None
+    def __build_meta_info(self) -> Optional[dict]:
+        """
+        Recognize displayable season/episode info from the subtitle's names.
+
+        The title, file name and description are tried in that order; the first
+        candidate whose parsed meta carries ``season_episode`` or ``episode_list`` wins.
+
+        :return: the recognized meta dict, or None when no candidate yields season/episode info
+        """
+        raw_names = (self.title, self.file_name, self.description)
+        # Strip and de-duplicate (order-preserving) so blank or repeated candidates
+        # are not handed to the recognizer again.
+        candidates = dict.fromkeys(
+            name.strip() for name in raw_names if isinstance(name, str) and name.strip()
+        )
+        for candidate in candidates:
+            try:
+                meta_dict = MetaInfo(title=candidate, subtitle=self.description).to_dict()
+            except Exception:
+                # Best effort: an unparsable candidate must not break serialization.
+                continue
+            if meta_dict.get("season_episode") or meta_dict.get("episode_list"):
+                return meta_dict
+        return None
+
def __setattr__(self, name: str, value: Any):
self.__dict__[name] = value
@@ -213,7 +228,13 @@ class SubtitleInfo:
"""
返回字典。
"""
- return vars(self).copy()
+ dicts = vars(self).copy()
+ meta_info = self.__build_meta_info()
+ if meta_info:
+ dicts["meta_info"] = meta_info
+ dicts["season_episode"] = meta_info.get("season_episode")
+ dicts["episode_list"] = meta_info.get("episode_list")
+ return dicts
@dataclass
diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py
index 09f87fbd..3400967e 100644
--- a/app/modules/indexer/spider/__init__.py
+++ b/app/modules/indexer/spider/__init__.py
@@ -450,7 +450,9 @@ class SiteSpider:
return
selector = self.fields.get('size', {})
item = self._safe_query(torrent, selector)
- if item:
+ if not item:
+ item = self.__get_default_value(selector)
+ if item is not None and item != "":
size_val = item.replace("\n", "").strip()
size_val = self.__filter_text(size_val,
selector.get('filters'))
@@ -492,7 +494,9 @@ class SiteSpider:
return
selector = self.fields.get('grabs', {})
item = self._safe_query(torrent, selector)
- if item:
+ if not item:
+ item = self.__get_default_value(selector)
+ if item is not None and item != "":
grabs_val = item.split("/")[0]
grabs_val = grabs_val.replace(",", "")
grabs_val = self.__filter_text(grabs_val, selector.get('filters'))
@@ -646,6 +650,8 @@ class SiteSpider:
return
item = self._safe_query(torrent, selector)
value = self.__filter_text(item, selector.get('filters'))
+ if not value:
+ value = self.__get_default_value(selector)
if value is not None:
self.torrents_info[field_name] = value
@@ -773,6 +779,8 @@ class SiteSpider:
):
self.__get_subtitle_field(subtitle, field_name)
self.__fill_subtitle_ids()
+ if not self.torrents_info.get("title") or not self.torrents_info.get("enclosure"):
+ return {}
return self.torrents_info.copy() if self.torrents_info else {}
except Exception as err:
logger.error("%s 字幕搜索出现错误:%s" % (self.indexername, str(err)))
@@ -831,6 +839,9 @@ class SiteSpider:
@staticmethod
def __attribute_or_text(item: Any, selector: Optional[dict]) -> list:
+ """
+ 获取查询结果的属性或文本列表。
+ """
if not selector:
return item
if not item:
@@ -843,6 +854,9 @@ class SiteSpider:
@staticmethod
def __index(items: Optional[list], selector: Optional[dict]) -> Optional[str]:
+ """
+ 按配置下标读取查询结果。
+ """
if not items:
return None
if selector:
@@ -858,6 +872,33 @@ class SiteSpider:
item = items[0]
return item
+    @staticmethod
+    def __fallback_rows(html_doc: Any, selector_text: Optional[str]) -> Optional[list]:
+        """
+        Build the row list by hand for the two ``tr:has(td.rowfollow)`` list selectors.
+
+        :param html_doc: document object, called with a CSS selector to list ``tr`` elements
+        :param selector_text: configured list selector
+        :return: the matching ``tr`` elements, or None when the selector has no fallback
+        """
+        if selector_text not in ("tr:has(td.rowfollow)", "table tr:has(td.rowfollow)"):
+            return None
+        rows = []
+        for row in html_doc("tr"):
+            # Query the direct ``td.rowfollow`` cells once per row instead of once per check.
+            cells = PyQuery(row).children("td.rowfollow")
+            if len(cells) < 3:
+                continue
+            # Skip rows whose cells contain a nested table.
+            if len(cells.find("table")) != 0:
+                continue
+            # Keep only rows whose cells carry a subtitle download link.
+            if len(cells.find('a[href*="downloadsubs.php"]')) > 0:
+                rows.append(row)
+        return rows
+
+    @staticmethod
+    def __get_default_value(selector: Optional[dict]) -> Optional[str]:
+        """
+        Read a field's default value, accepting the legacy ``defualt_value`` spelling.
+
+        :param selector: field configuration dict; may be empty or None
+        :return: the default converted to a string, or None when no default is configured
+        """
+        if not selector:
+            return None
+        value = selector.get("default_value")
+        if value is None:
+            # An explicit None under the canonical key must not mask the legacy key.
+            value = selector.get("defualt_value")
+        return None if value is None else str(value)
+
def parse(self, html_text: str) -> List[dict]:
"""
解析整个页面
@@ -896,8 +937,12 @@ class SiteSpider:
html_doc = PyQuery(html_text)
# 种子筛选器
torrents_selector = self.list.get('selector', '')
+            # __fallback_rows covers selectors documented as unsupported by PyQuery, so
+            # consult it first and evaluate the configured selector only when it returns None.
+            rows = self.__fallback_rows(html_doc, torrents_selector)
+            if rows is None:
+                rows = html_doc(torrents_selector)
# 遍历种子html列表
- for i, torn in enumerate(html_doc(torrents_selector)):
+ for i, torn in enumerate(rows):
if i >= int(self.result_num):
break
# 创建临时PyQuery对象进行解析
diff --git a/app/schemas/context.py b/app/schemas/context.py
index 9dce24ed..3ea6976b 100644
--- a/app/schemas/context.py
+++ b/app/schemas/context.py
@@ -288,6 +288,12 @@ class SubtitleInfo(BaseModel):
subtitle_id: Optional[str] = None
# 下载文件名
file_name: Optional[str] = None
+ # 识别元数据
+ meta_info: Optional[MetaInfo] = None
+ # SxxExx
+ season_episode: Optional[str] = None
+ # 集列表
+ episode_list: Optional[List[int]] = Field(default_factory=list)
class Context(BaseModel):
diff --git a/tests/test_subtitle_search.py b/tests/test_subtitle_search.py
index eceaafd6..46af0ca8 100644
--- a/tests/test_subtitle_search.py
+++ b/tests/test_subtitle_search.py
@@ -3,7 +3,148 @@ import pytest
from app.api.endpoints.search import _parse_media_type
from app.chain.search import SearchChain
from app.core.context import MediaInfo, SubtitleInfo
+from app.modules.indexer.spider import SiteSpider
from app.schemas.types import MediaType
+from app.utils import rust_accel
+
+
+# HTML samples passed to SiteSpider.parse by the parametrized subtitle parsing test.
+# NOTE(review): the sample bodies are empty in this view, while the test expects
+# exactly one parsed row per sample — confirm the markup was not lost.
+AUDIENCES_SUBTITLE_HTML = """
+
+"""
+
+HHANCLUB_SUBTITLE_HTML = """
+
+"""
+
+PTTIME_SUBTITLE_HTML = """
+
+"""
+
+
+def _audiences_indexer():
+    """
+    Build the audiences subtitle parsing config.
+    """
+    flag_img = "td:nth-child(1) img"
+    title_link = "td:nth-child(2) a"
+    added_span = "td:nth-child(3) span"
+    # Key order matches the field order of the site configuration.
+    fields = {
+        "language": {"selector": flag_img, "attribute": "title"},
+        "language_icon": {"selector": flag_img, "attribute": "src"},
+        "title": {"selector": title_link},
+        "download": {"selector": title_link, "attribute": "href"},
+        "date_added": {"selector": added_span, "attribute": "title"},
+        "date_elapsed": {"selector": added_span},
+        "size": {"selector": "td:nth-child(4)"},
+        "grabs": {"selector": "td:nth-child(5)"},
+        "uploader": {"selector": "td:nth-child(6)"},
+        "report": {"selector": "td:nth-child(7) a", "attribute": "href"},
+    }
+    subtitles = {"list": {"selector": "tr:has(td.rowfollow)"}, "fields": fields}
+    return {
+        "id": 1,
+        "name": "观众",
+        "domain": "https://audiences.me/",
+        "subtitles": subtitles,
+    }
+
+
+def _hhanclub_indexer():
+    """
+    Build the hhanclub subtitle parsing config.
+    """
+    flag_img = "div:nth-child(1) img"
+    download_link = 'div:nth-child(2) a[href*="downloadsubs.php"]'
+    added_span = "div:nth-child(4) span"
+    # Key order matches the field order of the site configuration.
+    fields = {
+        "language": {"selector": flag_img, "attribute": "title", "default_value": "简体中文"},
+        "language_icon": {"selector": flag_img, "attribute": "src"},
+        "title": {"selector": download_link},
+        "download": {"selector": download_link, "attribute": "href"},
+        "date_added": {"selector": added_span, "attribute": "title"},
+        "date_elapsed": {"selector": added_span},
+        "size": {"selector": "div:nth-child(3)"},
+        # No selector at all: this field carries only a default value.
+        "grabs": {"default_value": 0},
+        "uploader": {"selector": 'div:nth-child(2) a[href*="userdetails.php"]'},
+        "report": {"selector": 'div:nth-child(5) a[href*="report.php"]', "attribute": "href"},
+    }
+    subtitles = {"list": {"selector": "#subtitles-table > div"}, "fields": fields}
+    return {
+        "id": 2,
+        "name": "憨憨",
+        "domain": "https://hhanclub.net/",
+        "subtitles": subtitles,
+    }
+
+
+def _pttime_indexer():
+    """
+    Build the PT时间 subtitle parsing config.
+    """
+    title_link = "td:nth-child(2) a"
+    date_cell = "td:nth-child(4)"
+    # Key order matches the field order of the site configuration.
+    fields = {
+        "language": {"selector": "td:nth-child(1)"},
+        "language_icon": {"selector": "td:nth-child(1) img", "attribute": "src", "optional": True},
+        "title": {"selector": title_link},
+        "download": {"selector": title_link, "attribute": "href"},
+        # Both date fields read the same cell.
+        "date_added": {"selector": date_cell, "optional": True},
+        "date_elapsed": {"selector": date_cell, "optional": True},
+        "size": {"selector": "td:nth-child(5)"},
+        "grabs": {"selector": "td:nth-child(6)"},
+        "uploader": {"selector": "td:nth-child(7)"},
+        "report": {"selector": "td:nth-child(8) a", "attribute": "href"},
+    }
+    subtitles = {"list": {"selector": "table tr:has(td.rowfollow)"}, "fields": fields}
+    return {
+        "id": 3,
+        "name": "PT时间",
+        "domain": "https://www.pttime.org/",
+        "subtitles": subtitles,
+    }
def test_search_media_type_parser_accepts_agent_values():
@@ -114,3 +255,214 @@ def test_subtitle_search_params_keep_episode():
assert params["episode"] == "3"
assert params["result_type"] == "subtitle"
+
+
+def test_subtitle_info_serializes_title_season_episode():
+    """
+    Serializing a subtitle result should return the season/episode info recognized from its name.
+    """
+    release_name = "The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb"
+    serialized = SubtitleInfo(site_name="观众", title=release_name).to_dict()
+
+    expected_tag = "S02 E05"
+    # The tag is exposed both at the top level and inside the nested meta dict.
+    assert serialized["season_episode"] == expected_tag
+    assert serialized["meta_info"]["season_episode"] == expected_tag
+    assert serialized["episode_list"] == [5]
+
+
+# Expected normalized fields for the audiences sample.
+_AUDIENCES_EXPECTED = {
+    "title": "The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb",
+    "language": "English",
+    "language_icon": "https://audiences.me/pic/flag/uk.gif",
+    "enclosure": "https://audiences.me/downloadsubs.php?torrentid=61964&subid=394",
+    "pubdate": "2022-09-18 19:33:11",
+    "date_elapsed": "3年9月",
+    "size": 99011,
+    "grabs": 1,
+    "uploader": "匿名",
+    "report_url": "https://audiences.me/report.php?subtitle=394",
+    "torrent_id": "61964",
+    "subtitle_id": "394",
+}
+
+# Expected normalized fields for the hhanclub sample.
+_HHANCLUB_EXPECTED = {
+    "title": "The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]",
+    "language": "简体中文",
+    "language_icon": "https://hhanclub.net/pic/flag/china.gif",
+    "enclosure": "https://hhanclub.net/downloadsubs.php?torrentid=1435&subid=1733",
+    "pubdate": "2026-03-25 23:26:37",
+    "date_elapsed": "2月15天",
+    "size": 184801,
+    "grabs": 0,
+    "uploader": "qfsong",
+    "report_url": "https://hhanclub.net/report.php?subtitle=1733",
+    "torrent_id": "1435",
+    "subtitle_id": "1733",
+}
+
+# Expected normalized fields for the PT时间 sample (no language_icon expectation).
+_PTTIME_EXPECTED = {
+    "title": "The.Capture.S02.1080p.iP.WEBRip.AAC2.0.x264-PlayWEB.zip",
+    "language": "简体中文",
+    "enclosure": "https://www.pttime.org/downloadsubs.php?torrentid=33968&subid=1242",
+    "pubdate": "2022-09-25 13:36:44",
+    "date_elapsed": "2022-09-25 13:36:44",
+    "size": 253952,
+    "grabs": 27,
+    "uploader": "匿名",
+    "report_url": "https://www.pttime.org/report.php?subtitle=1242",
+    "torrent_id": "33968",
+    "subtitle_id": "1242",
+}
+
+
+@pytest.mark.parametrize(
+    ("indexer", "html", "expected"),
+    [
+        (_audiences_indexer(), AUDIENCES_SUBTITLE_HTML, _AUDIENCES_EXPECTED),
+        (_hhanclub_indexer(), HHANCLUB_SUBTITLE_HTML, _HHANCLUB_EXPECTED),
+        (_pttime_indexer(), PTTIME_SUBTITLE_HTML, _PTTIME_EXPECTED),
+    ],
+)
+def test_subtitle_site_spider_parses_standard_fields(monkeypatch, indexer, html, expected):
+    """
+    The Python subtitle parser should normalize site-configured fields into the standard
+    fields the frontend needs.
+    """
+    # Stub the rust_accel parser to return None (presumably so the Python path runs — confirm).
+    monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)
+
+    spider = SiteSpider(indexer, keyword="The.Capture", search_type="subtitles")
+    parsed = spider.parse(html)
+
+    assert len(parsed) == 1
+    row = parsed[0]
+    for field, value in expected.items():
+        assert row[field] == value
+
+
+def test_subtitle_site_spider_skips_empty_rows(monkeypatch):
+ """
+ The Python subtitle parser should drop stray table rows that have no title or download link.
+ """
+ # Stub the rust_accel parser so it returns None for this test.
+ monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)
+ # NOTE(review): the HTML sample is empty in this view — confirm the markup was not lost.
+ html = """
+
+ """
+
+ result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html)
+
+ # Exactly one row is expected to survive, titled "The.Capture.S01".
+ assert len(result) == 1
+ assert result[0]["title"] == "The.Capture.S01"
+
+
+def test_subtitle_site_spider_uses_direct_nexus_row(monkeypatch):
+ """
+ The Python subtitle parser should use only the inner NexusPHP subtitle rows, so that
+ outer layout rows do not shift the fields.
+ """
+ # Stub the rust_accel parser so it returns None for this test.
+ monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)
+ # NOTE(review): the HTML sample is empty in this view — confirm the markup was not lost.
+ html = """
+
+ """
+
+ result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html)
+
+ assert [item["title"] for item in result] == ["The.Capture.S01", "The.Capture.S02"]
+ # NOTE(review): "添加时间" reads like a column label rather than a language — confirm this expectation is intended.
+ assert result[0]["language"] == "添加时间"
+ assert result[1]["language"] == "English"
+
+
+def test_exact_subtitle_match_uses_torrent_helper_for_media_names():
+    """
+    Exact subtitle search should reuse the resource matching logic to recognize the media
+    name inside subtitle titles.
+    """
+    media = MediaInfo(
+        type=MediaType.TV,
+        title="真相捕捉",
+        original_title="The Capture",
+        en_title="The Capture",
+        year="2019",
+        season=1,
+        names=["The.Capture", "The Capture"],
+        season_years={1: "2019"},
+    )
+    matching = SubtitleInfo(
+        site_name="憨憨",
+        title="The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]",
+        subtitle_id="1733",
+    )
+    unrelated = SubtitleInfo(
+        site_name="观众",
+        title="Other.Show.S01.1080p.WEB-DL.CHS",
+        subtitle_id="404",
+    )
+    # Create the chain without running its __init__.
+    searcher = object.__new__(SearchChain)
+
+    kept = searcher._SearchChain__parse_subtitle_result(
+        subtitles=[matching, unrelated],
+        mediainfo=media,
+        season_episodes={1: []},
+    )
+
+    # Only the subtitle whose title names the media is expected to remain.
+    kept_ids = [item.subtitle_id for item in kept]
+    assert kept_ids == ["1733"]
+
+
+def test_exact_subtitle_match_rejects_partial_name_match():
+    """
+    Exact subtitle search should not match when the media name only appears in the
+    middle of a subtitle title.
+    """
+    media = MediaInfo(
+        type=MediaType.TV,
+        title="真相捕捉",
+        original_title="The Capture",
+        en_title="The Capture",
+        year="2019",
+        season=1,
+        names=["The.Capture", "The Capture"],
+        season_years={1: "2019"},
+    )
+    # The media name is preceded by an extra leading word in this title.
+    decoy = SubtitleInfo(
+        site_name="观众",
+        title="Not.The.Capture.S01.1080p.WEB-DL.CHS",
+        subtitle_id="404",
+    )
+    # Create the chain without running its __init__.
+    searcher = object.__new__(SearchChain)
+
+    matched = searcher._SearchChain__parse_subtitle_result(
+        subtitles=[decoy],
+        mediainfo=media,
+        season_episodes={1: []},
+    )
+
+    assert matched == []