diff --git a/app/modules/indexer/parser/nexus_audiences.py b/app/modules/indexer/parser/nexus_audiences.py
index 2403ca84..12e64979 100644
--- a/app/modules/indexer/parser/nexus_audiences.py
+++ b/app/modules/indexer/parser/nexus_audiences.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import json
 import re
-from urllib.parse import urljoin
+from typing import Optional
+from urllib.parse import parse_qsl, urlencode, urljoin, urlsplit, urlunsplit
 
 from lxml import etree
 
@@ -13,6 +14,14 @@ from app.utils.string import StringUtils
 class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
     schema = SiteSchema.NexusAudiences
 
+    def __init__(self, *args, **kwargs):
+        """
+        Initialise the Audiences unread mailbox URLs; the first page must not carry a ``page`` parameter.
+        """
+        super().__init__(*args, **kwargs)
+        self._user_mail_unread_page = self.__build_unread_mailbox_page(box=1)
+        self._sys_mail_unread_page = self.__build_unread_mailbox_page(box=-2)
+
     def _parse_message_unread(self, html_text):
         """
         解析 Audiences 新版顶部用户栏中的未读消息数。
@@ -55,16 +64,205 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
                 'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href'
             )
             msg_links.extend(message_links)
-            next_page = None
-            next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
-            if next_page_text:
-                next_page = next_page_text[-1].strip()
+            next_page = self.__parse_next_message_page(html)
         finally:
             if html is not None:
                 del html
 
         return next_page
 
+    @classmethod
+    def __build_unread_mailbox_page(cls, box: int) -> str:
+        """
+        Build the first-page URL of an Audiences unread mailbox listing.
+
+        :param box: mailbox id (this parser uses 1 for user mail and -2 for system mail)
+        :return: relative URL that deliberately carries no ``page`` parameter
+        """
+        return f"messages.php?action=viewmailbox&box={box}&unread=yes"
+
+    @staticmethod
+    def __parse_mailbox_query(href: str) -> Optional[dict]:
+        """
+        Parse the query string of a mailbox listing link.
+
+        :param href: raw ``href`` value of an anchor
+        :return: query parameters when ``action=viewmailbox``, otherwise ``None``
+        """
+        if not href:
+            return None
+
+        query_params = dict(parse_qsl(urlsplit(href).query, keep_blank_values=True))
+        if query_params.get("action") != "viewmailbox":
+            return None
+        return query_params
+
+    @classmethod
+    def __parse_next_message_page(cls, html) -> Optional[str]:
+        """
+        Find the "next page" link of the new Audiences pager.
+
+        Icon-only buttons and English accessibility labels are supported. Only mailbox
+        listing links are considered, so an unrelated anchor whose text merely contains
+        "next" (for example a message subject) is never followed as a page.
+
+        :param html: parsed lxml document of a mailbox page
+        :return: normalized URL of the next page, or ``None`` when no such link exists
+        """
+        next_pages = []
+        for link in html.xpath('//a[@href]'):
+            href = link.get("href").strip()
+            if cls.__parse_mailbox_query(href) is None:
+                continue
+            if cls.__is_next_message_page_link(link):
+                next_pages.append(cls.__normalize_message_page_link(href))
+        if next_pages:
+            # Keep the last match, as the previous text-based lookup did.
+            return next_pages[-1]
+
+        return cls.__parse_next_numeric_message_page(html)
+
+    @classmethod
+    def __parse_next_numeric_message_page(cls, html) -> Optional[str]:
+        """
+        Infer the next page from numeric pager links.
+
+        Covers mailbox listings that only render page-number buttons.
+
+        :param html: parsed lxml document of a mailbox page
+        :return: normalized URL of the lowest page above the current one, or ``None``
+        """
+        current_page = None
+        page_links = []
+        for link in html.xpath('//a[@href]'):
+            href = link.get("href").strip()
+            page = cls.__extract_message_page(href)
+            if page is None:
+                continue
+            page_links.append((page, href))
+            if cls.__is_current_page_link(link):
+                current_page = page
+
+        if current_page is None:
+            current_page = cls.__extract_current_page_from_markup(html)
+        if current_page is None or not page_links:
+            return None
+
+        next_links = [(page, href) for page, href in page_links if page > current_page]
+        if not next_links:
+            return None
+
+        _, next_link = min(next_links, key=lambda item: item[0])
+        return cls.__normalize_message_page_link(next_link)
+
+    @staticmethod
+    def __is_next_message_page_link(link) -> bool:
+        """
+        Tell whether a pager anchor points to the next page.
+
+        Visible text, ``title``, ``aria-label``, ``rel`` and CSS classes are inspected, so
+        detection does not depend on the Chinese "下一页" caption alone.
+
+        :param link: lxml ``<a>`` element
+        :return: ``True`` when the anchor is an enabled "next" control
+        """
+        link_class = f" {link.get('class') or ''} ".lower()
+        if any(flag in link_class for flag in (" disabled ", " disable ", " inactive ")):
+            return False
+        if (link.get("aria-disabled") or "").strip().lower() == "true":
+            return False
+
+        link_text = re.sub(r"\s+", " ", link.xpath("string(.)") or "").strip().lower()
+        title = (link.get("title") or "").strip().lower()
+        aria_label = (link.get("aria-label") or "").strip().lower()
+        rel = (link.get("rel") or "").strip().lower()
+        signal_text = " ".join([link_text, title, aria_label, rel])
+        if any(keyword in signal_text for keyword in ("下一页", "下一頁", "next")):
+            return True
+
+        if link_text in {">", "›", "»"}:
+            return True
+
+        return any(flag in link_class for flag in (" next ", " pager-next ", " page-next "))
+
+    @classmethod
+    def __extract_message_page(cls, href: str) -> Optional[int]:
+        """
+        Extract the page number from an Audiences mailbox pager link.
+
+        :param href: raw ``href`` value of an anchor
+        :return: ``None`` when the link is not a mailbox listing link, otherwise the
+                 ``page`` parameter converted by ``StringUtils.str_int``
+        """
+        query_params = cls.__parse_mailbox_query(href)
+        if query_params is None:
+            return None
+        return StringUtils.str_int(query_params.get("page"))
+
+    @staticmethod
+    def __is_current_page_link(link) -> bool:
+        """
+        Tell whether a page-number anchor marks the current page.
+
+        :param link: lxml ``<a>`` element
+        :return: ``True`` for ``aria-current="page"`` or an active/current/selected class
+                 on the anchor or its parent element
+        """
+        parent = link.getparent()
+        link_class = f" {link.get('class') or ''} ".lower()
+        parent_class = f" {parent.get('class') or ''} ".lower() if parent is not None else ""
+        aria_current = (link.get("aria-current") or "").strip().lower()
+        current_flags = (" active ", " current ", " selected ")
+        return aria_current == "page" or any(flag in link_class or flag in parent_class for flag in current_flags)
+
+    @staticmethod
+    def __extract_current_page_from_markup(html) -> Optional[int]:
+        """
+        Read the current page from a non-link "current page" marker.
+
+        :param html: parsed lxml document of a mailbox page
+        :return: value of the URL ``page`` parameter for the current page, or ``None``
+        """
+        current_texts = html.xpath(
+            '//*[contains(concat(" ", normalize-space(@class), " "), " active ") '
+            'or contains(concat(" ", normalize-space(@class), " "), " current ") '
+            'or contains(concat(" ", normalize-space(@class), " "), " selected ") '
+            'or @aria-current="page"]/text()'
+        )
+        for current_text in current_texts:
+            page_match = re.search(r"\d+", str(current_text))
+            if page_match:
+                # Audiences displays page numbers starting at 1, while page=1 in the URL means the second page.
+                return max(StringUtils.str_int(page_match.group()) - 1, 0)
+        return None
+
+    @classmethod
+    def __normalize_message_page_link(cls, href: str) -> str:
+        """
+        Add the unread filter to a pager link.
+
+        Without it, following pages would fall back to the site's full inbox.
+
+        :param href: pager link
+        :return: the link with ``unread=yes`` added when missing; links that are not
+                 mailbox listing links are returned unchanged
+        """
+        query_params = cls.__parse_mailbox_query(href)
+        if query_params is None:
+            return href
+
+        query_params.setdefault("unread", "yes")
+        url_info = urlsplit(href)
+        return urlunsplit(
+            (
+                url_info.scheme,
+                url_info.netloc,
+                url_info.path,
+                urlencode(query_params),
+                url_info.fragment,
+            )
+        )
+
     def _parse_user_traffic_info(self, html_text):
         """
         解析用户流量信息
diff --git a/tests/test_nexus_audiences_parser.py b/tests/test_nexus_audiences_parser.py
index 1982394a..fcb7329e 100644
--- a/tests/test_nexus_audiences_parser.py
+++ b/tests/test_nexus_audiences_parser.py
@@ -181,3 +181,62 @@ def test_audiences_readpm_row_is_not_unread_message():
parser._parse_message_unread_links(html_text, msg_links) assert msg_links == [] + + +def test_audiences_unread_mailbox_first_page_omits_page_param(): + """ + The first page of the Audiences mailbox carries no page parameter; page=1 actually means the second page. + """ + parser = NexusAudiencesSiteUserInfo( + site_name="Audiences", + url="https://audiences.me/", + site_cookie="", + apikey=None, + token=None, + ) + + assert parser._user_mail_unread_page == "messages.php?action=viewmailbox&box=1&unread=yes" + assert parser._sys_mail_unread_page == "messages.php?action=viewmailbox&box=-2&unread=yes" + + +def test_audiences_unread_links_follow_icon_next_page(): + """ + The new Audiences pager may consist only of icons and aria-label, so detection must not rely on the Chinese next-page text alone. + """ + parser = NexusAudiencesSiteUserInfo( + site_name="Audiences", + url="https://audiences.me/", + site_cookie="", + apikey=None, + token=None, + ) + html_text = """ + + + + + + + +
+ Unread + + 种子被删除
+ + + + """ + msg_links = [] + + next_page = parser._parse_message_unread_links(html_text, msg_links) + + assert msg_links == ["messages.php?action=viewmessage&id=4318225"] + assert next_page == "messages.php?action=viewmailbox&box=1&page=1&unread=yes"