fix: improve audiences message pagination

2026-06-01 21:50:14 +08:00 · 2026-05-20 21:30:14 +08:00
parent 0fb9d18b30
commit 284c272001
2 changed files with 206 additions and 5 deletions
--- a/app/modules/indexer/parser/nexus_audiences.py
+++ b/app/modules/indexer/parser/nexus_audiences.py
@@ -1,7 +1,8 @@
 # -*- coding: utf-8 -*-
 import json
 import re
-from urllib.parse import urljoin
+from typing import Optional
+from urllib.parse import parse_qsl, urlencode, urljoin, urlsplit, urlunsplit

 from lxml import etree

@@ -13,6 +14,14 @@ from app.utils.string import StringUtils
 class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
    schema = SiteSchema.NexusAudiences

+    def __init__(self, *args, **kwargs):
+        """
+        Initialize the Audiences unread-mailbox list URLs; the first page must not carry a page parameter.
+        """
+        super().__init__(*args, **kwargs)
+        self._user_mail_unread_page = self.__build_unread_mailbox_page(box=1)
+        self._sys_mail_unread_page = self.__build_unread_mailbox_page(box=-2)
+
    def _parse_message_unread(self, html_text):
        """
        解析 Audiences 新版顶部用户栏中的未读消息数。
@@ -55,16 +64,149 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
                'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href'
            )
            msg_links.extend(message_links)
-            next_page = None
-            next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
-            if next_page_text:
-                next_page = next_page_text[-1].strip()
+            next_page = self.__parse_next_message_page(html)
        finally:
            if html is not None:
                del html

        return next_page

+    @classmethod
+    def __build_unread_mailbox_page(cls, box: int) -> str:
+        """
+        Build the relative first-page URL of the Audiences unread mailbox list for the given ``box`` id.
+        """
+        return f"messages.php?action=viewmailbox&box={box}&unread=yes"
+
+    @classmethod
+    def __parse_next_message_page(cls, html) -> Optional[str]:
+        """
+        Parse the next-page link from the new Audiences pagination, supporting icon buttons and English accessibility labels.
+        """
+        next_pages = []
+        for link in html.xpath('//a[@href]'):
+            if cls.__is_next_message_page_link(link):
+                next_pages.append(cls.__normalize_message_page_link(link.get("href").strip()))
+        if next_pages:
+            return next_pages[-1]  # when several links match, the last one in document order wins
+
+        return cls.__parse_next_numeric_message_page(html)  # fallback: infer from numbered page links
+
+    @classmethod
+    def __parse_next_numeric_message_page(cls, html) -> Optional[str]:
+        """
+        Infer the next page from numeric pagination, for mailbox lists that only show page-number buttons.
+        """
+        current_page = None
+        page_links = []
+        for link in html.xpath('//a[@href]'):
+            page = cls.__extract_message_page(link.get("href"))
+            if page is None:
+                continue
+            page_links.append((page, link.get("href").strip()))
+            if cls.__is_current_page_link(link):
+                current_page = page
+
+        if current_page is None:
+            current_page = cls.__extract_current_page_from_markup(html)  # current page may be a non-link element
+        if current_page is None or not page_links:
+            return None
+
+        next_links = [(page, href) for page, href in page_links if page > current_page]
+        if not next_links:
+            return None
+
+        _, next_link = sorted(next_links, key=lambda item: item[0])[0]  # smallest page above the current one
+        return cls.__normalize_message_page_link(next_link)
+
+    @staticmethod
+    def __is_next_message_page_link(link) -> bool:
+        """
+        Decide whether a pagination link points to the next page, instead of recognising only the Chinese next-page text.
+        """
+        link_class = f" {link.get('class') or ''} ".lower()  # space-padded so whole class names can be matched
+        if any(flag in link_class for flag in (" disabled ", " disable ", " inactive ")):
+            return False
+
+        link_text = re.sub(r"\s+", " ", link.xpath("string(.)") or "").strip().lower()
+        title = (link.get("title") or "").strip().lower()
+        aria_label = (link.get("aria-label") or "").strip().lower()
+        rel = (link.get("rel") or "").strip().lower()
+        signal_text = " ".join([link_text, title, aria_label, rel])
+        if any(keyword in signal_text for keyword in ("下一页", "下一頁", "next")):
+            return True
+
+        if link_text in {">", "›", "»"}:
+            return True
+
+        return any(flag in link_class for flag in (" next ", " pager-next ", " page-next "))
+
+    @classmethod
+    def __extract_message_page(cls, href: str) -> Optional[int]:
+        """
+        Extract the page number from an Audiences mailbox pagination link; None if it is not a viewmailbox link.
+        """
+        if not href:
+            return None
+
+        query_params = dict(parse_qsl(urlsplit(href).query, keep_blank_values=True))
+        if query_params.get("action") != "viewmailbox":
+            return None
+        return StringUtils.str_int(query_params.get("page"))  # NOTE(review): result for a missing page param depends on str_int - confirm
+
+    @staticmethod
+    def __is_current_page_link(link) -> bool:
+        """
+        Decide whether a page-number link represents the current page (via aria-current or a class on the link or its parent).
+        """
+        link_class = f" {link.get('class') or ''} ".lower()
+        parent_class = f" {link.getparent().get('class') or ''} ".lower() if link.getparent() is not None else ""
+        aria_current = (link.get("aria-current") or "").strip().lower()
+        current_flags = (" active ", " current ", " selected ")
+        return aria_current == "page" or any(flag in link_class or flag in parent_class for flag in current_flags)
+
+    @staticmethod
+    def __extract_current_page_from_markup(html) -> Optional[int]:
+        """
+        Extract the Audiences ``page`` parameter value from a non-link current-page marker; None if no marker has a number.
+        """
+        current_texts = html.xpath(
+            '//*[contains(concat(" ", normalize-space(@class), " "), " active ") '
+            'or contains(concat(" ", normalize-space(@class), " "), " current ") '
+            'or contains(concat(" ", normalize-space(@class), " "), " selected ") '
+            'or @aria-current="page"]/text()'
+        )
+        for current_text in current_texts:
+            page_match = re.search(r"\d+", str(current_text))
+            if page_match:
+                # Audiences displays page numbers starting from 1, but page=1 in the URL means the second page.
+                return max(StringUtils.str_int(page_match.group()) - 1, 0)
+        return None
+
+    @classmethod
+    def __normalize_message_page_link(cls, href: str) -> str:
+        """
+        Add the unread filter to a pagination link so later pages do not fall back to the site's full inbox.
+        """
+        if not href:
+            return href
+
+        url_info = urlsplit(href)
+        query_params = dict(parse_qsl(url_info.query, keep_blank_values=True))
+        if query_params.get("action") != "viewmailbox":
+            return href  # links that are not mailbox listings are returned untouched
+
+        query_params.setdefault("unread", "yes")  # keep an existing unread value; only add it when absent
+        return urlunsplit(
+            (
+                url_info.scheme,
+                url_info.netloc,
+                url_info.path,
+                urlencode(query_params),
+                url_info.fragment,
+            )
+        )
+
    def _parse_user_traffic_info(self, html_text):
        """
        解析用户流量信息