fix: improve subtitle parsing and matching

2026-07-31 10:47:16 +08:00 · 2026-06-10 00:54:58 +08:00
parent 0f468f67c1
commit fa06d5d861
5 changed files with 430 additions and 6 deletions
--- a/app/chain/search.py
+++ b/app/chain/search.py
@@ -1480,11 +1480,11 @@ class SearchChain(ChainBase):
    @staticmethod
    def __build_subtitle_names(subtitle: SubtitleInfo) -> List[str]:
        """
-        提取字幕标题和下载文件名，作为精确匹配的名称候选。
+        Collect the stripped, de-duplicated title, download file name and description of a subtitle as name candidates for exact matching.
        """
        return list(dict.fromkeys(
            name.strip()
-            for name in (subtitle.title, subtitle.file_name)
+            for name in (subtitle.title, subtitle.file_name, subtitle.description)
            if name and name.strip()
        ))

--- a/app/core/context.py
+++ b/app/core/context.py
@@ -199,6 +199,21 @@ class SubtitleInfo:
    # 下载文件名
    file_name: str = None

+    def __build_meta_info(self) -> Optional[dict]:
+        """
+        Recognise displayable season/episode info from the subtitle title, file name and description.
+        """
+        candidates = [text for text in (self.title, self.file_name, self.description) if text]
+        for candidate in candidates:
+            try:
+                parsed = MetaInfo(title=candidate, subtitle=self.description).to_dict()
+            except Exception:  # best effort: skip a candidate MetaInfo fails on and try the next one
+                continue
+            has_episode_info = parsed.get("season_episode") or parsed.get("episode_list")
+            if has_episode_info:
+                return parsed  # first candidate carrying season/episode data wins
+        return None
+
    def __setattr__(self, name: str, value: Any):
        self.__dict__[name] = value

@@ -213,7 +228,13 @@ class SubtitleInfo:
        """
        返回字典。
        """
-        return vars(self).copy()
+        dicts = vars(self).copy()
+        meta_info = self.__build_meta_info()
+        if meta_info:
+            dicts["meta_info"] = meta_info
+            dicts["season_episode"] = meta_info.get("season_episode")
+            dicts["episode_list"] = meta_info.get("episode_list")
+        return dicts


@dataclass
--- a/app/modules/indexer/spider/init.py
+++ b/app/modules/indexer/spider/init.py
@@ -450,7 +450,9 @@ class SiteSpider:
            return
        selector = self.fields.get('size', {})
        item = self._safe_query(torrent, selector)
-        if item:
+        if not item:
+            item = self.__get_default_value(selector)
+        if item is not None and item != "":
            size_val = item.replace("\n", "").strip()
            size_val = self.__filter_text(size_val,
                                          selector.get('filters'))
@@ -492,7 +494,9 @@ class SiteSpider:
            return
        selector = self.fields.get('grabs', {})
        item = self._safe_query(torrent, selector)
-        if item:
+        if not item:
+            item = self.__get_default_value(selector)
+        if item is not None and item != "":
            grabs_val = item.split("/")[0]
            grabs_val = grabs_val.replace(",", "")
            grabs_val = self.__filter_text(grabs_val, selector.get('filters'))
@@ -646,6 +650,8 @@ class SiteSpider:
            return
        item = self._safe_query(torrent, selector)
        value = self.__filter_text(item, selector.get('filters'))
+        if not value:
+            value = self.__get_default_value(selector)
        if value is not None:
            self.torrents_info[field_name] = value

@@ -773,6 +779,8 @@ class SiteSpider:
            ):
                self.__get_subtitle_field(subtitle, field_name)
            self.__fill_subtitle_ids()
+            if not self.torrents_info.get("title") or not self.torrents_info.get("enclosure"):
+                return {}
            return self.torrents_info.copy() if self.torrents_info else {}
        except Exception as err:
            logger.error("%s 字幕搜索出现错误：%s" % (self.indexername, str(err)))
@@ -831,6 +839,9 @@ class SiteSpider:

    @staticmethod
    def __attribute_or_text(item: Any, selector: Optional[dict]) -> list:
+        """
+        获取查询结果的属性或文本列表。
+        """
        if not selector:
            return item
        if not item:
@@ -843,6 +854,9 @@ class SiteSpider:

    @staticmethod
    def __index(items: Optional[list], selector: Optional[dict]) -> Optional[str]:
+        """
+        按配置下标读取查询结果。
+        """
        if not items:
            return None
        if selector:
@@ -858,6 +872,33 @@ class SiteSpider:
            item = items[0]
        return item

+    @staticmethod
+    def __fallback_rows(html_doc: Any, selector_text: Optional[str]) -> Optional[list]:
+        """
+        Provide an equivalent row set for list selectors PyQuery does not support (None for any other selector).
+        """
+        if selector_text in ("tr:has(td.rowfollow)", "table tr:has(td.rowfollow)"):
+            row_cells = ((row, PyQuery(row).children("td.rowfollow")) for row in html_doc("tr"))
+            return [
+                row for row, cells in row_cells  # direct td.rowfollow cells, queried once per row
+                if len(cells) >= 3
+                and len(cells.find("table")) == 0  # skip outer layout rows that wrap a nested table
+                and len(cells.find('a[href*="downloadsubs.php"]')) > 0  # must carry a subtitle download link
+            ]
+        return None
+
+    @staticmethod
+    def __get_default_value(selector: Optional[dict]) -> Optional[str]:
+        """
+        Return the field's configured default as a string, or None when there is none.
+        The misspelled legacy key ``defualt_value`` is still honoured when ``default_value`` is absent.
+        """
+        if not selector:
+            return None
+        legacy_default = selector.get("defualt_value")
+        configured = selector.get("default_value", legacy_default)
+        return None if configured is None else str(configured)
+
    def parse(self, html_text: str) -> List[dict]:
        """
        解析整个页面
@@ -896,8 +937,12 @@ class SiteSpider:
            html_doc = PyQuery(html_text)
            # 种子筛选器
            torrents_selector = self.list.get('selector', '')
+            rows = html_doc(torrents_selector)
+            fallback_rows = self.__fallback_rows(html_doc, torrents_selector)
+            if fallback_rows is not None:
+                rows = fallback_rows
            # 遍历种子html列表
-            for i, torn in enumerate(html_doc(torrents_selector)):
+            for i, torn in enumerate(rows):
                if i >= int(self.result_num):
                    break
                # 创建临时PyQuery对象进行解析
--- a/app/schemas/context.py
+++ b/app/schemas/context.py
@@ -288,6 +288,12 @@ class SubtitleInfo(BaseModel):
    subtitle_id: Optional[str] = None
    # 下载文件名
    file_name: Optional[str] = None
+    # 识别元数据
+    meta_info: Optional[MetaInfo] = None
+    # SxxExx
+    season_episode: Optional[str] = None
+    # 集列表
+    episode_list: Optional[List[int]] = Field(default_factory=list)


 class Context(BaseModel):
--- a/tests/test_subtitle_search.py
+++ b/tests/test_subtitle_search.py
@@ -3,7 +3,148 @@ import pytest
 from app.api.endpoints.search import _parse_media_type
 from app.chain.search import SearchChain
 from app.core.context import MediaInfo, SubtitleInfo
+from app.modules.indexer.spider import SiteSpider
 from app.schemas.types import MediaType
+from app.utils import rust_accel
+
+
+AUDIENCES_SUBTITLE_HTML = """
+<table width="940" border="1" cellspacing="0" cellpadding="5">
+<tbody><tr><td class="colhead">语言</td><td width="100%" class="colhead" align="center">标题</td><td class="colhead" align="center"><img class="time" src="pic/trans.gif" alt="time" title="添加时间"></td>
+        <td class="colhead" align="center"><img class="size" src="pic/trans.gif" alt="size" title="大小"></td><td class="colhead" align="center">点击</td><td class="colhead" align="center">上传者</td><td class="colhead" align="center">举报</td></tr>
+<tr><td class="rowfollow" align="center" valign="middle"><img border="0" src="pic/flag/uk.gif" alt="English" title="English"></td>
+<td class="rowfollow" align="left"><a href="downloadsubs.php?torrentid=61964&amp;subid=394" <b="">The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb</a></td>
+<td class="rowfollow" align="center"><nobr><span title="2022-09-18 19:33:11">3年9月</span></nobr></td>
+<td class="rowfollow" align="center">96.69&nbsp;KB</td>
+<td class="rowfollow" align="center">1</td>
+<td class="rowfollow" align="center"><i>匿名</i></td>
+<td class="rowfollow" align="center"><a href="report.php?subtitle=394"><img class="f_report" src="pic/trans.gif" alt="Report" title="举报该字幕"></a></td>
+</tr>
+</tbody></table>
+"""
+
+HHANCLUB_SUBTITLE_HTML = """
+<div class="flex flex-col w-full items-center mt-[25px] gap-y-[10px] bg-[#F1F3F5] !rounded-md p-5" id="subtitles-table">
+  <div class="grid grid-cols-[10%_60%_10%_10%_10%] w-[95%] !rounded-md py-1 items-center bg-[#FFFFFF]/[0.7]">
+    <div><img class="w-[70px] h-[46px] pl-5" src="pic/flag/china.gif"></div>
+    <div class="flex flex-col">
+      <div class="flex flex-row gap-x-[45px]">
+        <a href="downloadsubs.php?torrentid=1435&amp;subid=1733" class="!text-[#000000] !text-[16px] !font-[700px] leading-[24px] hover:!text-orange-400 w-[80%]">The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&amp;eng]</a>
+      </div>
+      <div class="flex flex-row items-center !text-[#888A8D] !text-[15px] !font-[500px] leading-[22px]">
+        <div class="flex flex-wrap items-center"><a href="https://hhanclub.net/userdetails.php?id=26202" class="User_Name"><b>qfsong</b></a></div>
+      </div>
+    </div>
+    <div><div class="!text-[#000000] !text-[16px] !font-[700px] leading-[24px]">180.47&nbsp;KB</div></div>
+    <div><div class="!text-[#000000] !text-[16px] !font-[700px] leading-[24px]"><span title="2026-03-25 23:26:37">2月15天</span></div></div>
+    <div><div><a href="report.php?subtitle=1733"><img src="styles/HHan/icons/icon-report.svg" alt="举报"></a></div></div>
+  </div>
+</div>
+"""
+
+PTTIME_SUBTITLE_HTML = """
+<table>
+  <tr>
+    <td class="rowfollow">简体中文</td>
+    <td class="rowfollow"><a href="downloadsubs.php?torrentid=33968&amp;subid=1242">The.Capture.S02.1080p.iP.WEBRip.AAC2.0.x264-PlayWEB.zip</a></td>
+    <td class="rowfollow"><a href="/details.php?id=33968">33968</a></td>
+    <td class="rowfollow">2022-09-25 13:36:44</td>
+    <td class="rowfollow">248KB</td>
+    <td class="rowfollow">27</td>
+    <td class="rowfollow">匿名</td>
+    <td class="rowfollow"><a href="report.php?subtitle=1242">举报</a></td>
+  </tr>
+</table>
+"""
+
+
+def _audiences_indexer():
+    """
+    Build the audiences subtitle parsing configuration.
+    """
+    return {
+        "id": 1,
+        "name": "观众",
+        "domain": "https://audiences.me/",
+        "subtitles": {
+            "list": {"selector": "tr:has(td.rowfollow)"},
+            "fields": {
+                "language": {"selector": "td:nth-child(1) img", "attribute": "title"},
+                "language_icon": {"selector": "td:nth-child(1) img", "attribute": "src"},
+                "title": {"selector": "td:nth-child(2) a"},
+                "download": {"selector": "td:nth-child(2) a", "attribute": "href"},
+                "date_added": {"selector": "td:nth-child(3) span", "attribute": "title"},
+                "date_elapsed": {"selector": "td:nth-child(3) span"},
+                "size": {"selector": "td:nth-child(4)"},
+                "grabs": {"selector": "td:nth-child(5)"},
+                "uploader": {"selector": "td:nth-child(6)"},
+                "report": {"selector": "td:nth-child(7) a", "attribute": "href"},
+            },
+        },
+    }
+
+
+def _hhanclub_indexer():
+    """
+    Build the hhanclub subtitle parsing configuration.
+    """
+    return {
+        "id": 2,
+        "name": "憨憨",
+        "domain": "https://hhanclub.net/",
+        "subtitles": {
+            "list": {"selector": "#subtitles-table > div"},
+            "fields": {
+                "language": {
+                    "selector": "div:nth-child(1) img",
+                    "attribute": "title",
+                    "default_value": "简体中文",
+                },
+                "language_icon": {"selector": "div:nth-child(1) img", "attribute": "src"},
+                "title": {"selector": 'div:nth-child(2) a[href*="downloadsubs.php"]'},
+                "download": {
+                    "selector": 'div:nth-child(2) a[href*="downloadsubs.php"]',
+                    "attribute": "href",
+                },
+                "date_added": {"selector": "div:nth-child(4) span", "attribute": "title"},
+                "date_elapsed": {"selector": "div:nth-child(4) span"},
+                "size": {"selector": "div:nth-child(3)"},
+                "grabs": {"default_value": 0},  # no selector: the value comes from the default only
+                "uploader": {"selector": 'div:nth-child(2) a[href*="userdetails.php"]'},
+                "report": {"selector": 'div:nth-child(5) a[href*="report.php"]', "attribute": "href"},
+            },
+        },
+    }
+
+
+def _pttime_indexer():
+    """
+    Build the PTTime subtitle parsing configuration.
+    """
+    return {
+        "id": 3,
+        "name": "PT时间",
+        "domain": "https://www.pttime.org/",
+        "subtitles": {
+            "list": {"selector": "table tr:has(td.rowfollow)"},
+            "fields": {
+                "language": {"selector": "td:nth-child(1)"},
+                "language_icon": {
+                    "selector": "td:nth-child(1) img",
+                    "attribute": "src",
+                    "optional": True,
+                },
+                "title": {"selector": "td:nth-child(2) a"},
+                "download": {"selector": "td:nth-child(2) a", "attribute": "href"},
+                "date_added": {"selector": "td:nth-child(4)", "optional": True},
+                "date_elapsed": {"selector": "td:nth-child(4)", "optional": True},
+                "size": {"selector": "td:nth-child(5)"},
+                "grabs": {"selector": "td:nth-child(6)"},
+                "uploader": {"selector": "td:nth-child(7)"},
+                "report": {"selector": "td:nth-child(8) a", "attribute": "href"},
+            },
+        },
+    }


 def test_search_media_type_parser_accepts_agent_values():
@@ -114,3 +255,214 @@ def test_subtitle_search_params_keep_episode():

    assert params["episode"] == "3"
    assert params["result_type"] == "subtitle"
+
+
+def test_subtitle_info_serializes_title_season_episode():
+    """
+    Serialising a subtitle result should return the season/episode info recognised from the subtitle name.
+    """
+    subtitle = SubtitleInfo(
+        site_name="观众",
+        title="The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb",
+    )
+
+    result = subtitle.to_dict()
+
+    assert result["season_episode"] == "S02 E05"
+    assert result["meta_info"]["season_episode"] == "S02 E05"
+    assert result["episode_list"] == [5]
+
+
+@pytest.mark.parametrize(
+    ("indexer", "html", "expected"),
+    [
+        (
+            _audiences_indexer(),
+            AUDIENCES_SUBTITLE_HTML,
+            {
+                "title": "The.Capture.S02E05.2022.1080p.iP.WEB-DL.x264.AAC-ADWeb",
+                "language": "English",
+                "language_icon": "https://audiences.me/pic/flag/uk.gif",
+                "enclosure": "https://audiences.me/downloadsubs.php?torrentid=61964&subid=394",
+                "pubdate": "2022-09-18 19:33:11",
+                "date_elapsed": "3年9月",
+                "size": 99011,
+                "grabs": 1,
+                "uploader": "匿名",
+                "report_url": "https://audiences.me/report.php?subtitle=394",
+                "torrent_id": "61964",
+                "subtitle_id": "394",
+            },
+        ),
+        (
+            _hhanclub_indexer(),
+            HHANCLUB_SUBTITLE_HTML,
+            {
+                "title": "The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]",
+                "language": "简体中文",
+                "language_icon": "https://hhanclub.net/pic/flag/china.gif",
+                "enclosure": "https://hhanclub.net/downloadsubs.php?torrentid=1435&subid=1733",
+                "pubdate": "2026-03-25 23:26:37",
+                "date_elapsed": "2月15天",
+                "size": 184801,
+                "grabs": 0,
+                "uploader": "qfsong",
+                "report_url": "https://hhanclub.net/report.php?subtitle=1733",
+                "torrent_id": "1435",
+                "subtitle_id": "1733",
+            },
+        ),
+        (
+            _pttime_indexer(),
+            PTTIME_SUBTITLE_HTML,
+            {
+                "title": "The.Capture.S02.1080p.iP.WEBRip.AAC2.0.x264-PlayWEB.zip",
+                "language": "简体中文",
+                "enclosure": "https://www.pttime.org/downloadsubs.php?torrentid=33968&subid=1242",
+                "pubdate": "2022-09-25 13:36:44",
+                "date_elapsed": "2022-09-25 13:36:44",
+                "size": 253952,
+                "grabs": 27,
+                "uploader": "匿名",
+                "report_url": "https://www.pttime.org/report.php?subtitle=1242",
+                "torrent_id": "33968",
+                "subtitle_id": "1242",
+            },
+        ),
+    ],
+)
+def test_subtitle_site_spider_parses_standard_fields(monkeypatch, indexer, html, expected):
+    """
+    The Python subtitle parser should normalise site-configured fields into the standard fields the frontend needs.
+    """
+    monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)  # stub returns None; presumably forces the Python parser - verify
+
+    result = SiteSpider(indexer, keyword="The.Capture", search_type="subtitles").parse(html)
+
+    assert len(result) == 1
+    for field, value in expected.items():  # only the listed fields are checked; extra keys are ignored
+        assert result[0][field] == value
+
+
+def test_subtitle_site_spider_skips_empty_rows(monkeypatch):
+    """
+    The Python subtitle parser should drop miscellaneous table rows that have no title or download link.
+    """
+    monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)  # stub returns None; presumably forces the Python parser - verify
+    html = """
+    <table>
+    <tr><td class="rowfollow">not language</td><td class="rowfollow">not title</td></tr>
+    <tr><td class="rowfollow"><img src="pic/flag/uk.gif" title="English"></td>
+    <td class="rowfollow"><a href="downloadsubs.php?torrentid=1&amp;subid=2">The.Capture.S01</a></td>
+    <td class="rowfollow"><span title="2026-01-01 00:00:00">1天</span></td>
+    <td class="rowfollow">1 KB</td><td class="rowfollow">0</td><td class="rowfollow">匿名</td>
+    <td class="rowfollow"><a href="report.php?subtitle=2">report</a></td></tr>
+    </table>
+    """
+
+    result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html)
+
+    assert len(result) == 1
+    assert result[0]["title"] == "The.Capture.S01"
+
+
+def test_subtitle_site_spider_uses_direct_nexus_row(monkeypatch):
+    """
+    The Python subtitle parser should use only the inner NexusPHP subtitle rows so outer layout rows cannot misalign fields.
+    """
+    monkeypatch.setattr(rust_accel, "parse_indexer_subtitles", lambda **_kwargs: None)  # stub returns None; presumably forces the Python parser - verify
+    html = """
+    <table><tr><td class="rowfollow">
+      <table>
+        <tr>
+          <td class="rowfollow"><img src="data:image/svg+xml;base64,xxx" title="添加时间"></td>
+          <td class="rowfollow"><a href="downloadsubs.php?torrentid=1&amp;subid=2">The.Capture.S01</a></td>
+          <td class="rowfollow"><span title="2026-01-01 00:00:00">1天</span></td>
+          <td class="rowfollow">1 KB</td><td class="rowfollow">0</td><td class="rowfollow">上传者</td>
+          <td class="rowfollow"><a href="report.php?subtitle=2">report</a></td>
+        </tr>
+        <tr>
+          <td class="rowfollow"><img src="pic/flag/uk.gif" title="English"></td>
+          <td class="rowfollow"><a href="downloadsubs.php?torrentid=3&amp;subid=4">The.Capture.S02</a></td>
+          <td class="rowfollow"><span title="2026-01-02 00:00:00">2天</span></td>
+          <td class="rowfollow">2 KB</td><td class="rowfollow">1</td><td class="rowfollow">匿名</td>
+          <td class="rowfollow"><a href="report.php?subtitle=4">report</a></td>
+        </tr>
+      </table>
+    </td></tr></table>
+    """
+
+    result = SiteSpider(_audiences_indexer(), keyword="The.Capture", search_type="subtitles").parse(html)
+
+    assert [item["title"] for item in result] == ["The.Capture.S01", "The.Capture.S02"]
+    assert result[0]["language"] == "添加时间"
+    assert result[1]["language"] == "English"
+
+
+def test_exact_subtitle_match_uses_torrent_helper_for_media_names():
+    """
+    Exact subtitle search should reuse the resource-matching logic to recognise media names in subtitle titles.
+    """
+    chain = object.__new__(SearchChain)  # bare instance: SearchChain.__init__ is not run
+    mediainfo = MediaInfo(
+        type=MediaType.TV,
+        title="真相捕捉",
+        original_title="The Capture",
+        en_title="The Capture",
+        year="2019",
+        season=1,
+        names=["The.Capture", "The Capture"],
+        season_years={1: "2019"},
+    )
+    subtitles = [
+        SubtitleInfo(
+            site_name="憨憨",
+            title="The.Capture.S01.1080p.AMZN.WEB-DL.DDP5.1.H.264-NTb[chs&eng]",
+            subtitle_id="1733",
+        ),
+        SubtitleInfo(
+            site_name="观众",
+            title="Other.Show.S01.1080p.WEB-DL.CHS",
+            subtitle_id="404",
+        ),
+    ]
+
+    result = chain._SearchChain__parse_subtitle_result(  # name-mangled access to the private method
+        subtitles=subtitles,
+        mediainfo=mediainfo,
+        season_episodes={1: []},
+    )
+
+    assert [item.subtitle_id for item in result] == ["1733"]
+
+
+def test_exact_subtitle_match_rejects_partial_name_match():
+    """
+    Exact subtitle search should avoid a false match when the media name only appears in the middle of the title.
+    """
+    chain = object.__new__(SearchChain)  # bare instance: SearchChain.__init__ is not run
+    mediainfo = MediaInfo(
+        type=MediaType.TV,
+        title="真相捕捉",
+        original_title="The Capture",
+        en_title="The Capture",
+        year="2019",
+        season=1,
+        names=["The.Capture", "The Capture"],
+        season_years={1: "2019"},
+    )
+    subtitles = [
+        SubtitleInfo(
+            site_name="观众",
+            title="Not.The.Capture.S01.1080p.WEB-DL.CHS",
+            subtitle_id="404",
+        ),
+    ]
+
+    result = chain._SearchChain__parse_subtitle_result(  # name-mangled access to the private method
+        subtitles=subtitles,
+        mediainfo=mediainfo,
+        season_episodes={1: []},
+    )
+
+    assert result == []