From fa06d5d8619dce4f84299d46a41d5cf6af0e364b Mon Sep 17 00:00:00 2001 From: jxxghp Date: Wed, 10 Jun 2026 00:54:58 +0800 Subject: [PATCH] fix: improve subtitle parsing and matching --- app/chain/search.py | 4 +- app/core/context.py | 23 +- app/modules/indexer/spider/__init__.py | 51 +++- app/schemas/context.py | 6 + tests/test_subtitle_search.py | 352 +++++++++++++++++++++++++ 5 files changed, 430 insertions(+), 6 deletions(-) diff --git a/app/chain/search.py b/app/chain/search.py index 0d9d1b44..a7553148 100644 --- a/app/chain/search.py +++ b/app/chain/search.py @@ -1480,11 +1480,11 @@ class SearchChain(ChainBase): @staticmethod def __build_subtitle_names(subtitle: SubtitleInfo) -> List[str]: """ - 提取字幕标题和下载文件名,作为精确匹配的名称候选。 + 提取字幕标题、下载文件名和描述,作为精确匹配的名称候选。 """ return list(dict.fromkeys( name.strip() - for name in (subtitle.title, subtitle.file_name) + for name in (subtitle.title, subtitle.file_name, subtitle.description) if name and name.strip() )) diff --git a/app/core/context.py b/app/core/context.py index 28e08674..44e9a8dd 100644 --- a/app/core/context.py +++ b/app/core/context.py @@ -199,6 +199,21 @@ class SubtitleInfo: # 下载文件名 file_name: str = None + def __build_meta_info(self) -> Optional[dict]: + """ + 从字幕标题、文件名和描述中识别可展示的季集信息。 + """ + for title in (self.title, self.file_name, self.description): + if not title: + continue + try: + meta_dict = MetaInfo(title=title, subtitle=self.description).to_dict() + except Exception: + continue + if meta_dict.get("season_episode") or meta_dict.get("episode_list"): + return meta_dict + return None + def __setattr__(self, name: str, value: Any): self.__dict__[name] = value @@ -213,7 +228,13 @@ class SubtitleInfo: """ 返回字典。 """ - return vars(self).copy() + dicts = vars(self).copy() + meta_info = self.__build_meta_info() + if meta_info: + dicts["meta_info"] = meta_info + dicts["season_episode"] = meta_info.get("season_episode") + dicts["episode_list"] = meta_info.get("episode_list") + return dicts @dataclass diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 09f87fbd..3400967e 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -450,7 +450,9 @@ class SiteSpider: return selector = self.fields.get('size', {}) item = self._safe_query(torrent, selector) - if item: + if not item: + item = self.__get_default_value(selector) + if item is not None and item != "": size_val = item.replace("\n", "").strip() size_val = self.__filter_text(size_val, selector.get('filters')) @@ -492,7 +494,9 @@ class SiteSpider: return selector = self.fields.get('grabs', {}) item = self._safe_query(torrent, selector) - if item: + if not item: + item = self.__get_default_value(selector) + if item is not None and item != "": grabs_val = item.split("/")[0] grabs_val = grabs_val.replace(",", "") grabs_val = self.__filter_text(grabs_val, selector.get('filters')) @@ -646,6 +650,8 @@ class SiteSpider: return item = self._safe_query(torrent, selector) value = self.__filter_text(item, selector.get('filters')) + if not value: + value = self.__get_default_value(selector) if value is not None: self.torrents_info[field_name] = value @@ -773,6 +779,8 @@ class SiteSpider: ): self.__get_subtitle_field(subtitle, field_name) self.__fill_subtitle_ids() + if not self.torrents_info.get("title") or not self.torrents_info.get("enclosure"): + return {} return self.torrents_info.copy() if self.torrents_info else {} except Exception as err: logger.error("%s 字幕搜索出现错误:%s" % (self.indexername, str(err))) @@ -831,6 +839,9 @@ class SiteSpider: @staticmethod def __attribute_or_text(item: Any, selector: Optional[dict]) -> list: + """ + 获取查询结果的属性或文本列表。 + """ if not selector: return item if not item: @@ -843,6 +854,9 @@ class SiteSpider: @staticmethod def __index(items: Optional[list], selector: Optional[dict]) -> Optional[str]: + """ + 按配置下标读取查询结果。 + """ if not items: return None if selector: @@ -858,6 +872,33 @@ class SiteSpider: item = items[0] return item + @staticmethod + def __fallback_rows(html_doc: Any, selector_text: Optional[str]) -> Optional[list]: + """ + 为 PyQuery 不支持的列表选择器提供等价行集合。 + """ + if selector_text in ("tr:has(td.rowfollow)", "table tr:has(td.rowfollow)"): + return [ + row + for row in html_doc("tr") + if len(PyQuery(row).children("td.rowfollow")) >= 3 + and len(PyQuery(row).children("td.rowfollow").find("table")) == 0 + and len(PyQuery(row).children("td.rowfollow").find('a[href*="downloadsubs.php"]')) > 0 + ] + return None + + @staticmethod + def __get_default_value(selector: Optional[dict]) -> Optional[str]: + """ + 读取字段默认值,兼容历史配置里的 defualt_value 拼写。 + """ + if not selector: + return None + value = selector.get("default_value", selector.get("defualt_value")) + if value is None: + return None + return str(value) + def parse(self, html_text: str) -> List[dict]: """ 解析整个页面 @@ -896,8 +937,12 @@ class SiteSpider: html_doc = PyQuery(html_text) # 种子筛选器 torrents_selector = self.list.get('selector', '') + rows = html_doc(torrents_selector) + fallback_rows = self.__fallback_rows(html_doc, torrents_selector) + if fallback_rows is not None: + rows = fallback_rows # 遍历种子html列表 - for i, torn in enumerate(html_doc(torrents_selector)): + for i, torn in enumerate(rows): if i >= int(self.result_num): break # 创建临时PyQuery对象进行解析 diff --git a/app/schemas/context.py b/app/schemas/context.py index 9dce24ed..3ea6976b 100644 --- a/app/schemas/context.py +++ b/app/schemas/context.py @@ -288,6 +288,12 @@ class SubtitleInfo(BaseModel): subtitle_id: Optional[str] = None # 下载文件名 file_name: Optional[str] = None + # 识别元数据 + meta_info: Optional[MetaInfo] = None + # SxxExx + season_episode: Optional[str] = None + # 集列表 + episode_list: Optional[List[int]] = Field(default_factory=list) class Context(BaseModel): diff --git a/tests/test_subtitle_search.py b/tests/test_subtitle_search.py index eceaafd6..46af0ca8 100644 --- a/tests/test_subtitle_search.py +++ b/tests/test_subtitle_search.py @@ -3,7 +3,148 @@ import pytest from app.api.endpoints.search import _parse_media_type from app.chain.search import SearchChain from app.core.context import MediaInfo, SubtitleInfo +from app.modules.indexer.spider import SiteSpider from app.schemas.types import MediaType +from app.utils import rust_accel + + +AUDIENCES_SUBTITLE_HTML = """ + + + + + + + + + + + +
语言标题timesize点击上传者举报
EnglishThe.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb3年9月96.69 KB1匿名Report
+""" + +HHANCLUB_SUBTITLE_HTML = """ +
+
+
+ +
180.47 KB
+
2月15天
+
举报
+
+
+""" + +PTTIME_SUBTITLE_HTML = """ + + + + + + + + + + + +
简体中文The.Capture.S02.1080p.iP.WEBRip.AAC2.0.x264-PlayWEB.zip339682022-09-25 13:36:44248KB27匿名举报
+""" + + +def _audiences_indexer(): + """ + 构造 audiences 字幕解析配置。 + """ + return { + "id": 1, + "name": "观众", + "domain": "https://audiences.me/", + "subtitles": { + "list": {"selector": "tr:has(td.rowfollow)"}, + "fields": { + "language": {"selector": "td:nth-child(1) img", "attribute": "title"}, + "language_icon": {"selector": "td:nth-child(1) img", "attribute": "src"}, + "title": {"selector": "td:nth-child(2) a"}, + "download": {"selector": "td:nth-child(2) a", "attribute": "href"}, + "date_added": {"selector": "td:nth-child(3) span", "attribute": "title"}, + "date_elapsed": {"selector": "td:nth-child(3) span"}, + "size": {"selector": "td:nth-child(4)"}, + "grabs": {"selector": "td:nth-child(5)"}, + "uploader": {"selector": "td:nth-child(6)"}, + "report": {"selector": "td:nth-child(7) a", "attribute": "href"}, + }, + }, + } + + +def _hhanclub_indexer(): + """ + 构造 hhanclub 字幕解析配置。 + """ + return { + "id": 2, + "name": "憨憨", + "domain": "https://hhanclub.net/", + "subtitles": { + "list": {"selector": "#subtitles-table > div"}, + "fields": { + "language": { + "selector": "div:nth-child(1) img", + "attribute": "title", + "default_value": "简体中文", + }, + "language_icon": {"selector": "div:nth-child(1) img", "attribute": "src"}, + "title": {"selector": 'div:nth-child(2) a[href*="downloadsubs.php"]'}, + "download": { + "selector": 'div:nth-child(2) a[href*="downloadsubs.php"]', + "attribute": "href", + }, + "date_added": {"selector": "div:nth-child(4) span", "attribute": "title"}, + "date_elapsed": {"selector": "div:nth-child(4) span"}, + "size": {"selector": "div:nth-child(3)"}, + "grabs": {"default_value": 0}, + "uploader": {"selector": 'div:nth-child(2) a[href*="userdetails.php"]'}, + "report": {"selector": 'div:nth-child(5) a[href*="report.php"]', "attribute": "href"}, + }, + }, + } + + +def _pttime_indexer(): + """ + 构造 PT时间 字幕解析配置。 + """ + return { + "id": 3, + "name": "PT时间", + "domain": "https://www.pttime.org/", + "subtitles": { + "list": {"selector": "table tr:has(td.rowfollow)"}, + "fields": { + "language": {"selector": "td:nth-child(1)"}, + "language_icon": { + "selector": "td:nth-child(1) img", + "attribute": "src", + "optional": True, + }, + "title": {"selector": "td:nth-child(2) a"}, + "download": {"selector": "td:nth-child(2) a", "attribute": "href"}, + "date_added": {"selector": "td:nth-child(4)", "optional": True}, + "date_elapsed": {"selector": "td:nth-child(4)", "optional": True}, + "size": {"selector": "td:nth-child(5)"}, + "grabs": {"selector": "td:nth-child(6)"}, + "uploader": {"selector": "td:nth-child(7)"}, + "report": {"selector": "td:nth-child(8) a", "attribute": "href"}, + }, + }, + } def test_search_media_type_parser_accepts_agent_values(): @@ -114,3 +255,214 @@ def test_subtitle_search_params_keep_episode(): assert params["episode"] == "3" assert params["result_type"] == "subtitle" + + +def test_subtitle_info_serializes_title_season_episode(): + """ + 字幕结果序列化应返回从字幕名称中识别出的季集信息。 + """ + subtitle = SubtitleInfo( + site_name="观众", + title="The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb", + ) + + result = subtitle.to_dict() + + assert result["season_episode"] == "S02 E05" + assert result["meta_info"]["season_episode"] == "S02 E05" + assert result["episode_list"] == [5] + + +@pytest.mark.parametrize( + ("indexer", "html", "expected"), + [ + ( + _audiences_indexer(), + AUDIENCES_SUBTITLE_HTML, + { + "title": "The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb", + "language": "English", + "language_icon": "https://audiences.me/pic/flag/uk.gif", + "enclosure": "https://audiences.me/downloadsubs.php?torrentid=61964&subid=394", + "pubdate": "2022-09-18 19:33:11", + "date_elapsed": "3年9月", + "size": 99011, + "grabs": 1, + "uploader": "匿名", + "report_url": "https://audiences.me/report.php?subtitle=394", + "torrent_id": "61964", + "subtitle_id": "394", + }, + ), + ( + _hhanclub_indexer(), + HHANCLUB_SUBTITLE_HTML, + { + "title": "The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]", + "language": "简体中文", + "language_icon": "https://hhanclub.net/pic/flag/china.gif", + "enclosure": "https://hhanclub.net/downloadsubs.php?torrentid=1435&subid=1733", + "pubdate": "2026-03-25 23:26:37", + "date_elapsed": "2月15天", + "size": 184801, + "grabs": 0, + "uploader": "qfsong", + "report_url": "https://hhanclub.net/report.php?subtitle=1733", + "torrent_id": "1435", + "subtitle_id": "1733", + }, + ), + ( + _pttime_indexer(), + PTTIME_SUBTITLE_HTML, + { + "title": "The.Capture.S02.1080p.iP.WEBRip.AAC2.0.x264-PlayWEB.zip", + "language": "简体中文", + "enclosure": "https://www.pttime.org/downloadsubs.php?torrentid=33968&subid=1242", + "pubdate": "2022-09-25 13:36:44", + "date_elapsed": "2022-09-25 13:36:44", + "size": 253952, + "grabs": 27, + "uploader": "匿名", + "report_url": "https://www.pttime.org/report.php?subtitle=1242", + "torrent_id": "33968", + "subtitle_id": "1242", + }, + ), + ], +) +def test_subtitle_site_spider_parses_standard_fields(monkeypatch, indexer, html, expected): + """ + Python 字幕解析应把站点配置字段归一化为前端需要的标准字段。 + """ + monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None) + + result = SiteSpider(indexer, keyword="The.Capture", search_type="subtitles").parse(html) + + assert len(result) == 1 + for field, value in expected.items(): + assert result[0][field] == value + + +def test_subtitle_site_spider_skips_empty_rows(monkeypatch): + """ + Python 字幕解析应丢弃没有标题或下载链接的表格杂项行。 + """ + monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None) + html = """ + + + + + + + +
not languagenot title
The.Capture.S011天1 KB0匿名report
+ """ + + result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html) + + assert len(result) == 1 + assert result[0]["title"] == "The.Capture.S01" + + +def test_subtitle_site_spider_uses_direct_nexus_row(monkeypatch): + """ + Python 字幕解析应只使用 NexusPHP 内层字幕行,避免外层布局行字段错位。 + """ + monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None) + html = """ +
+ + + + + + + + + + + + + + + +
The.Capture.S011天1 KB0上传者report
The.Capture.S022天2 KB1匿名report
+
+ """ + + result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html) + + assert [item["title"] for item in result] == ["The.Capture.S01", "The.Capture.S02"] + assert result[0]["language"] == "添加时间" + assert result[1]["language"] == "English" + + +def test_exact_subtitle_match_uses_torrent_helper_for_media_names(): + """ + 精确字幕搜索应复用资源匹配逻辑识别字幕标题中的媒体名称。 + """ + chain = object.__new__(SearchChain) + mediainfo = MediaInfo( + type=MediaType.TV, + title="真相捕捉", + original_title="The Capture", + en_title="The Capture", + year="2019", + season=1, + names=["The.Capture", "The Capture"], + season_years={1: "2019"}, + ) + subtitles = [ + SubtitleInfo( + site_name="憨憨", + title="The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]", + subtitle_id="1733", + ), + SubtitleInfo( + site_name="观众", + title="Other.Show.S01.1080p.WEB-DL.CHS", + subtitle_id="404", + ), + ] + + result = chain._SearchChain__parse_subtitle_result( + subtitles=subtitles, + mediainfo=mediainfo, + season_episodes={1: []}, + ) + + assert [item.subtitle_id for item in result] == ["1733"] + + +def test_exact_subtitle_match_rejects_partial_name_match(): + """ + 精确字幕搜索应避免媒体名称只在中间出现时误判。 + """ + chain = object.__new__(SearchChain) + mediainfo = MediaInfo( + type=MediaType.TV, + title="真相捕捉", + original_title="The Capture", + en_title="The Capture", + year="2019", + season=1, + names=["The.Capture", "The Capture"], + season_years={1: "2019"}, + ) + subtitles = [ + SubtitleInfo( + site_name="观众", + title="Not.The.Capture.S01.1080p.WEB-DL.CHS", + subtitle_id="404", + ), + ] + + result = chain._SearchChain__parse_subtitle_result( + subtitles=subtitles, + mediainfo=mediainfo, + season_episodes={1: []}, + ) + + assert result == []