fix: improve subtitle parsing and matching

This commit is contained in:
jxxghp
2026-06-10 00:54:58 +08:00
parent 0f468f67c1
commit fa06d5d861
5 changed files with 430 additions and 6 deletions

View File

@@ -450,7 +450,9 @@ class SiteSpider:
return
selector = self.fields.get('size', {})
item = self._safe_query(torrent, selector)
if item:
if not item:
item = self.__get_default_value(selector)
if item is not None and item != "":
size_val = item.replace("\n", "").strip()
size_val = self.__filter_text(size_val,
selector.get('filters'))
@@ -492,7 +494,9 @@ class SiteSpider:
return
selector = self.fields.get('grabs', {})
item = self._safe_query(torrent, selector)
if item:
if not item:
item = self.__get_default_value(selector)
if item is not None and item != "":
grabs_val = item.split("/")[0]
grabs_val = grabs_val.replace(",", "")
grabs_val = self.__filter_text(grabs_val, selector.get('filters'))
@@ -646,6 +650,8 @@ class SiteSpider:
return
item = self._safe_query(torrent, selector)
value = self.__filter_text(item, selector.get('filters'))
if not value:
value = self.__get_default_value(selector)
if value is not None:
self.torrents_info[field_name] = value
@@ -773,6 +779,8 @@ class SiteSpider:
):
self.__get_subtitle_field(subtitle, field_name)
self.__fill_subtitle_ids()
if not self.torrents_info.get("title") or not self.torrents_info.get("enclosure"):
return {}
return self.torrents_info.copy() if self.torrents_info else {}
except Exception as err:
logger.error("%s 字幕搜索出现错误:%s" % (self.indexername, str(err)))
@@ -831,6 +839,9 @@ class SiteSpider:
@staticmethod
def __attribute_or_text(item: Any, selector: Optional[dict]) -> list:
"""
获取查询结果的属性或文本列表。
"""
if not selector:
return item
if not item:
@@ -843,6 +854,9 @@ class SiteSpider:
@staticmethod
def __index(items: Optional[list], selector: Optional[dict]) -> Optional[str]:
"""
按配置下标读取查询结果。
"""
if not items:
return None
if selector:
@@ -858,6 +872,33 @@ class SiteSpider:
item = items[0]
return item
@staticmethod
def __fallback_rows(html_doc: Any, selector_text: Optional[str]) -> Optional[list]:
"""
为 PyQuery 不支持的列表选择器提供等价行集合。
"""
if selector_text in ("tr:has(td.rowfollow)", "table tr:has(td.rowfollow)"):
return [
row
for row in html_doc("tr")
if len(PyQuery(row).children("td.rowfollow")) >= 3
and len(PyQuery(row).children("td.rowfollow").find("table")) == 0
and len(PyQuery(row).children("td.rowfollow").find('a[href*="downloadsubs.php"]')) > 0
]
return None
@staticmethod
def __get_default_value(selector: Optional[dict]) -> Optional[str]:
"""
读取字段默认值,兼容历史配置里的 defualt_value 拼写。
"""
if not selector:
return None
value = selector.get("default_value", selector.get("defualt_value"))
if value is None:
return None
return str(value)
def parse(self, html_text: str) -> List[dict]:
"""
解析整个页面
@@ -896,8 +937,12 @@ class SiteSpider:
html_doc = PyQuery(html_text)
# 种子筛选器
torrents_selector = self.list.get('selector', '')
rows = html_doc(torrents_selector)
fallback_rows = self.__fallback_rows(html_doc, torrents_selector)
if fallback_rows is not None:
rows = fallback_rows
# 遍历种子html列表
for i, torn in enumerate(html_doc(torrents_selector)):
for i, torn in enumerate(rows):
if i >= int(self.result_num):
break
# 创建临时PyQuery对象进行解析