From 014dc2884cdf6a97ffe3bf0403c106fa29990ea8 Mon Sep 17 00:00:00 2001 From: jxxghp Date: Wed, 20 May 2026 21:42:53 +0800 Subject: [PATCH] fix: simplify audiences unread pagination --- app/modules/indexer/parser/nexus_audiences.py | 138 ++---------------- tests/test_nexus_audiences_parser.py | 31 ++-- 2 files changed, 25 insertions(+), 144 deletions(-) diff --git a/app/modules/indexer/parser/nexus_audiences.py b/app/modules/indexer/parser/nexus_audiences.py index 12e64979..8358e651 100644 --- a/app/modules/indexer/parser/nexus_audiences.py +++ b/app/modules/indexer/parser/nexus_audiences.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- import json import re -from typing import Optional -from urllib.parse import parse_qsl, urlencode, urljoin, urlsplit, urlunsplit +from urllib.parse import urljoin from lxml import etree @@ -20,7 +19,8 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): """ super().__init__(*args, **kwargs) self._user_mail_unread_page = self.__build_unread_mailbox_page(box=1) - self._sys_mail_unread_page = self.__build_unread_mailbox_page(box=-2) + self._sys_mail_unread_page = None + self.__next_mail_page = 1 def _parse_message_unread(self, html_text): """ @@ -64,7 +64,7 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): 'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href' ) msg_links.extend(message_links) - next_page = self.__parse_next_message_page(html) + next_page = self.__build_next_unread_mailbox_page(bool(message_links)) finally: if html is not None: del html @@ -78,134 +78,16 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): """ return f"messages.php?action=viewmailbox&box={box}&unread=yes" - @classmethod - def __parse_next_message_page(cls, html) -> Optional[str]: + def __build_next_unread_mailbox_page(self, has_unread: bool) -> str: """ - 解析 Audiences 新版分页中的下一页链接,兼容图标按钮和英文无障碍标签。 + 当前页存在未读消息时按 Audiences 的 page 参数规则生成下一页地址。 """ - next_pages = [] - for link in html.xpath('//a[@href]'): - if cls.__is_next_message_page_link(link): - next_pages.append(cls.__normalize_message_page_link(link.get("href").strip())) - if next_pages: - return next_pages[-1] - - return cls.__parse_next_numeric_message_page(html) - - @classmethod - def __parse_next_numeric_message_page(cls, html) -> Optional[str]: - """ - 从数字分页中推断下一页,兼容只展示页码按钮的私信列表。 - """ - current_page = None - page_links = [] - for link in html.xpath('//a[@href]'): - page = cls.__extract_message_page(link.get("href")) - if page is None: - continue - page_links.append((page, link.get("href").strip())) - if cls.__is_current_page_link(link): - current_page = page - - if current_page is None: - current_page = cls.__extract_current_page_from_markup(html) - if current_page is None or not page_links: + if not has_unread: return None - next_links = [(page, href) for page, href in page_links if page > current_page] - if not next_links: - return None - - _, next_link = sorted(next_links, key=lambda item: item[0])[0] - return cls.__normalize_message_page_link(next_link) - - @staticmethod - def __is_next_message_page_link(link) -> bool: - """ - 判断分页链接是否指向下一页,避免只识别中文“下一页”文本。 - """ - link_class = f" {link.get('class') or ''} ".lower() - if any(flag in link_class for flag in (" disabled ", " disable ", " inactive ")): - return False - - link_text = re.sub(r"\s+", " ", link.xpath("string(.)") or "").strip().lower() - title = (link.get("title") or "").strip().lower() - aria_label = (link.get("aria-label") or "").strip().lower() - rel = (link.get("rel") or "").strip().lower() - signal_text = " ".join([link_text, title, aria_label, rel]) - if any(keyword in signal_text for keyword in ("下一页", "下一頁", "next")): - return True - - if link_text in {">", "›", "»"}: - return True - - return any(flag in link_class for flag in (" next ", " pager-next ", " page-next ")) - - @classmethod - def __extract_message_page(cls, href: str) -> Optional[int]: - """ - 从 Audiences 私信分页链接中提取页码。 - """ - if not href: - return None - - query_params = dict(parse_qsl(urlsplit(href).query, keep_blank_values=True)) - if query_params.get("action") != "viewmailbox": - return None - return StringUtils.str_int(query_params.get("page")) - - @staticmethod - def __is_current_page_link(link) -> bool: - """ - 判断页码链接是否表示当前页。 - """ - link_class = f" {link.get('class') or ''} ".lower() - parent_class = f" {link.getparent().get('class') or ''} ".lower() if link.getparent() is not None else "" - aria_current = (link.get("aria-current") or "").strip().lower() - current_flags = (" active ", " current ", " selected ") - return aria_current == "page" or any(flag in link_class or flag in parent_class for flag in current_flags) - - @staticmethod - def __extract_current_page_from_markup(html) -> Optional[int]: - """ - 从非链接的当前页标记中提取 Audiences 的 page 参数值。 - """ - current_texts = html.xpath( - '//*[contains(concat(" ", normalize-space(@class), " "), " active ") ' - 'or contains(concat(" ", normalize-space(@class), " "), " current ") ' - 'or contains(concat(" ", normalize-space(@class), " "), " selected ") ' - 'or @aria-current="page"]/text()' - ) - for current_text in current_texts: - page_match = re.search(r"\d+", str(current_text)) - if page_match: - # Audiences 页码展示从 1 开始,但 URL 中 page=1 表示第二页。 - return max(StringUtils.str_int(page_match.group()) - 1, 0) - return None - - @classmethod - def __normalize_message_page_link(cls, href: str) -> str: - """ - 给翻页链接补齐未读过滤,避免后续页退回站点完整收件箱。 - """ - if not href: - return href - - url_info = urlsplit(href) - query_params = dict(parse_qsl(url_info.query, keep_blank_values=True)) - if query_params.get("action") != "viewmailbox": - return href - - query_params.setdefault("unread", "yes") - return urlunsplit( - ( - url_info.scheme, - url_info.netloc, - url_info.path, - urlencode(query_params), - url_info.fragment, - ) - ) + next_page = self.__next_mail_page + self.__next_mail_page += 1 + return f"{self._user_mail_unread_page}&page={next_page}" def _parse_user_traffic_info(self, html_text): """ diff --git a/tests/test_nexus_audiences_parser.py b/tests/test_nexus_audiences_parser.py index fcb7329e..e450c61e 100644 --- a/tests/test_nexus_audiences_parser.py +++ b/tests/test_nexus_audiences_parser.py @@ -149,7 +149,7 @@ def test_audiences_table_unread_links_ignore_content_rows(): next_page = parser._parse_message_unread_links(html_text, msg_links) assert msg_links == ["messages.php?action=viewmessage&id=4318225"] - assert next_page is None + assert next_page == "messages.php?action=viewmailbox&box=1&unread=yes&page=1" def test_audiences_readpm_row_is_not_unread_message(): @@ -183,9 +183,9 @@ def test_audiences_readpm_row_is_not_unread_message(): assert msg_links == [] -def test_audiences_unread_mailbox_first_page_omits_page_param(): +def test_audiences_unread_mailbox_only_uses_user_box(): """ - Audiences 私信首页不传 page,page=1 实际表示第二页。 + Audiences 只使用用户消息箱,首页不传 page,page=1 实际表示第二页。 """ parser = NexusAudiencesSiteUserInfo( site_name="Audiences", @@ -196,12 +196,12 @@ def test_audiences_unread_mailbox_first_page_omits_page_param(): ) assert parser._user_mail_unread_page == "messages.php?action=viewmailbox&box=1&unread=yes" - assert parser._sys_mail_unread_page == "messages.php?action=viewmailbox&box=-2&unread=yes" + assert parser._sys_mail_unread_page is None -def test_audiences_unread_links_follow_icon_next_page(): +def test_audiences_unread_links_increment_page_until_empty(): """ - Audiences 新版分页可能只有图标和 aria-label,不能只依赖中文下一页文本。 + Audiences 每页固定 10 条,有未读行时按 page 参数自增继续翻页。 """ parser = NexusAudiencesSiteUserInfo( site_name="Audiences", @@ -223,20 +223,19 @@ def test_audiences_unread_links_follow_icon_next_page(): - """ msg_links = [] next_page = parser._parse_message_unread_links(html_text, msg_links) + next_next_page = parser._parse_message_unread_links(html_text, msg_links) + stop_page = parser._parse_message_unread_links("
", msg_links) - assert msg_links == ["messages.php?action=viewmessage&id=4318225"] - assert next_page == "messages.php?action=viewmailbox&box=1&page=1&unread=yes" + assert msg_links == [ + "messages.php?action=viewmessage&id=4318225", + "messages.php?action=viewmessage&id=4318225", + ] + assert next_page == "messages.php?action=viewmailbox&box=1&unread=yes&page=1" + assert next_next_page == "messages.php?action=viewmailbox&box=1&unread=yes&page=2" + assert stop_page is None