mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-16 05:01:16 +08:00
876 lines
31 KiB
Python
876 lines
31 KiB
Python
import ipaddress
|
||
import threading
|
||
import time
|
||
import uuid
|
||
from dataclasses import dataclass, field
|
||
from typing import Any, Callable, Optional, Protocol
|
||
from urllib.parse import urlparse
|
||
|
||
from app.core.config import settings
|
||
from app.log import logger
|
||
from app.utils.http import RequestUtils, cookie_parse
|
||
|
||
|
||
class BrowserElement(Protocol):
|
||
"""
|
||
页面元素的最小接口,避免为了类型标注直接导入 Playwright。
|
||
"""
|
||
|
||
def is_visible(self) -> bool:
|
||
"""判断元素是否可见。"""
|
||
...
|
||
|
||
def fill(self, value: str) -> None:
|
||
"""向元素输入文本。"""
|
||
...
|
||
|
||
def inner_text(self) -> str:
|
||
"""获取元素可见文本。"""
|
||
...
|
||
|
||
|
||
class BrowserContext(Protocol):
|
||
"""
|
||
CloakBrowser 返回的上下文只需要满足这些能力即可。
|
||
"""
|
||
|
||
def new_page(self) -> "BrowserPage":
|
||
"""创建新的浏览器页面。"""
|
||
...
|
||
|
||
def cookies(self) -> list[dict[str, Any]]:
|
||
"""返回当前上下文 Cookie。"""
|
||
...
|
||
|
||
def close(self) -> None:
|
||
"""关闭浏览器上下文。"""
|
||
...
|
||
|
||
|
||
class BrowserPage(Protocol):
|
||
"""
|
||
CloakBrowser 页面对象的最小接口,覆盖 helper 和登录流程当前用到的方法。
|
||
"""
|
||
|
||
context: BrowserContext
|
||
url: str
|
||
|
||
def set_extra_http_headers(self, headers: dict[str, str]) -> None:
|
||
"""设置页面额外请求头。"""
|
||
...
|
||
|
||
def set_default_timeout(self, timeout: int) -> None:
|
||
"""设置页面默认超时时间。"""
|
||
...
|
||
|
||
def goto(self, url: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""导航到指定 URL。"""
|
||
...
|
||
|
||
def wait_for_load_state(self, state: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""等待页面加载状态。"""
|
||
...
|
||
|
||
def wait_for_selector(self, selector: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""等待指定选择器出现。"""
|
||
...
|
||
|
||
def fill(self, selector: str, value: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""向指定选择器输入文本。"""
|
||
...
|
||
|
||
def click(self, selector: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""点击指定选择器。"""
|
||
...
|
||
|
||
def select_option(self, selector: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""选择下拉框选项。"""
|
||
...
|
||
|
||
def query_selector(self, selector: str) -> Optional[BrowserElement]:
|
||
"""查询指定选择器元素。"""
|
||
...
|
||
|
||
def title(self) -> str:
|
||
"""返回页面标题。"""
|
||
...
|
||
|
||
def inner_text(self, selector: str) -> str:
|
||
"""返回指定选择器的可见文本。"""
|
||
...
|
||
|
||
def content(self) -> str:
|
||
"""返回页面 HTML 内容。"""
|
||
...
|
||
|
||
def evaluate(self, expression: str, *args: Any, **kwargs: Any) -> Any:
|
||
"""执行页面 JavaScript 表达式。"""
|
||
...
|
||
|
||
def screenshot(self, *args: Any, **kwargs: Any) -> bytes:
|
||
"""截取页面截图。"""
|
||
...
|
||
|
||
def close(self) -> None:
|
||
"""关闭浏览器页面。"""
|
||
...
|
||
|
||
|
||
@dataclass
|
||
class _BrowserSessionState:
|
||
session_key: str
|
||
context: BrowserContext
|
||
pages: list[BrowserPage]
|
||
active_index: int = 0
|
||
user_agent: Optional[str] = None
|
||
cookies: Optional[str] = None
|
||
created_at: float = field(default_factory=time.monotonic)
|
||
last_used_at: float = field(default_factory=time.monotonic)
|
||
lock: threading.RLock = field(default_factory=threading.RLock)
|
||
|
||
@property
|
||
def active_page(self) -> BrowserPage:
|
||
return self.pages[self.active_index]
|
||
|
||
|
||
class BrowserSessionHelper:
|
||
"""
|
||
Agent 浏览器会话辅助类,负责复用 CloakBrowser 上下文并生成可操作页面快照。
|
||
"""
|
||
|
||
SESSION_TTL_SECONDS = 15 * 60
|
||
MAX_SESSIONS = 8
|
||
DEFAULT_VIEWPORT = {"width": 1280, "height": 720}
|
||
PRIVATE_HOST_SUFFIXES = (".localhost", ".local", ".lan", ".home", ".internal")
|
||
PRIVATE_HOSTNAMES = {"localhost", "ip6-localhost", "ip6-loopback"}
|
||
REF_ATTRIBUTE = "data-moviepilot-agent-ref"
|
||
|
||
_sessions: dict[str, _BrowserSessionState] = {}
|
||
_sessions_lock = threading.RLock()
|
||
|
||
def __init__(self, headless: bool = True, viewport: Optional[dict[str, int]] = None):
|
||
"""
|
||
初始化浏览器会话辅助类。
|
||
|
||
:param headless: 是否使用无头浏览器
|
||
:param viewport: 默认视口大小
|
||
"""
|
||
self.headless = headless
|
||
self.viewport = viewport or self.DEFAULT_VIEWPORT
|
||
|
||
@classmethod
|
||
def validate_url(cls, url: str, allow_private_network: bool = False) -> str:
|
||
"""
|
||
校验浏览器可访问的 URL,默认拒绝本机、私网和非 HTTP 协议。
|
||
|
||
:param url: 待访问的 URL
|
||
:param allow_private_network: 是否允许访问本机或私网地址
|
||
:return: 原始 URL
|
||
"""
|
||
parsed = urlparse(url or "")
|
||
if parsed.scheme not in {"http", "https"}:
|
||
raise ValueError("仅支持 http/https URL")
|
||
if not parsed.hostname:
|
||
raise ValueError("URL 缺少主机名")
|
||
|
||
hostname = parsed.hostname.lower().rstrip(".")
|
||
if allow_private_network:
|
||
return url
|
||
|
||
if hostname in cls.PRIVATE_HOSTNAMES or hostname.endswith(
|
||
cls.PRIVATE_HOST_SUFFIXES
|
||
):
|
||
raise ValueError("默认不允许访问本机或私网地址")
|
||
|
||
try:
|
||
ip_address = ipaddress.ip_address(hostname)
|
||
except ValueError:
|
||
return url
|
||
|
||
if not ip_address.is_global:
|
||
raise ValueError("默认不允许访问本机或私网地址")
|
||
return url
|
||
|
||
@classmethod
|
||
def ref_to_selector(cls, ref: str) -> str:
|
||
"""
|
||
将页面快照中的元素引用转换为稳定选择器。
|
||
|
||
:param ref: 快照返回的元素引用
|
||
:return: 可传给浏览器的属性选择器
|
||
"""
|
||
clean_ref = (ref or "").strip()
|
||
if not clean_ref:
|
||
raise ValueError("元素 ref 不能为空")
|
||
escaped_ref = clean_ref.replace("\\", "\\\\").replace('"', '\\"')
|
||
return f'[{cls.REF_ATTRIBUTE}="{escaped_ref}"]'
|
||
|
||
@classmethod
|
||
def close_all_sessions(cls) -> None:
|
||
"""
|
||
关闭所有 Agent 浏览器会话。
|
||
"""
|
||
with cls._sessions_lock:
|
||
session_keys = list(cls._sessions.keys())
|
||
for session_key in session_keys:
|
||
cls.close_session(session_key)
|
||
|
||
@classmethod
|
||
def close_session(cls, session_key: str) -> bool:
|
||
"""
|
||
关闭指定 Agent 浏览器会话。
|
||
|
||
:param session_key: 会话标识
|
||
:return: 找到并关闭会话时返回 True
|
||
"""
|
||
with cls._sessions_lock:
|
||
session = cls._sessions.pop(session_key, None)
|
||
if not session:
|
||
return False
|
||
cls._close_session_state(session)
|
||
return True
|
||
|
||
def with_session(
|
||
self,
|
||
session_key: str,
|
||
callback: Callable[[_BrowserSessionState], Any],
|
||
user_agent: Optional[str] = None,
|
||
cookies: Optional[str] = None,
|
||
timeout: Optional[int] = 30,
|
||
) -> Any:
|
||
"""
|
||
获取或创建浏览器会话,并在持有会话锁时执行回调。
|
||
|
||
:param session_key: 会话标识
|
||
:param callback: 使用浏览器会话执行操作的回调函数
|
||
:param user_agent: 新建会话时使用的 User-Agent
|
||
:param cookies: 本次操作要注入的 Cookie 请求头
|
||
:param timeout: 默认操作超时时间,单位秒
|
||
:return: 回调函数返回值
|
||
"""
|
||
self._prune_sessions()
|
||
session = self._get_or_create_session(
|
||
session_key=session_key,
|
||
user_agent=user_agent,
|
||
cookies=cookies,
|
||
)
|
||
with session.lock:
|
||
session.last_used_at = time.monotonic()
|
||
if timeout and hasattr(session.active_page, "set_default_timeout"):
|
||
session.active_page.set_default_timeout(int(timeout) * 1000)
|
||
if cookies:
|
||
session.cookies = cookies
|
||
session.active_page.set_extra_http_headers({"cookie": cookies})
|
||
return callback(session)
|
||
|
||
def open_tab(
|
||
self,
|
||
session: _BrowserSessionState,
|
||
url: Optional[str] = None,
|
||
timeout: Optional[int] = 30,
|
||
allow_private_network: bool = False,
|
||
) -> BrowserPage:
|
||
"""
|
||
在当前会话中新建标签页,并可选导航到指定 URL。
|
||
|
||
:param session: 当前浏览器会话
|
||
:param url: 可选的目标 URL
|
||
:param timeout: 导航超时时间,单位秒
|
||
:param allow_private_network: 是否允许访问本机或私网地址
|
||
:return: 新建的页面对象
|
||
"""
|
||
page = session.context.new_page()
|
||
if timeout and hasattr(page, "set_default_timeout"):
|
||
page.set_default_timeout(int(timeout) * 1000)
|
||
if session.cookies:
|
||
page.set_extra_http_headers({"cookie": session.cookies})
|
||
session.pages.append(page)
|
||
session.active_index = len(session.pages) - 1
|
||
if url:
|
||
self.goto(
|
||
page,
|
||
url,
|
||
timeout=timeout,
|
||
allow_private_network=allow_private_network,
|
||
)
|
||
return page
|
||
|
||
@staticmethod
|
||
def list_tabs(session: _BrowserSessionState) -> list[dict[str, Any]]:
|
||
"""
|
||
列出当前浏览器会话中的标签页。
|
||
|
||
:param session: 当前浏览器会话
|
||
:return: 标签页摘要列表
|
||
"""
|
||
tabs = []
|
||
for index, page in enumerate(session.pages):
|
||
tabs.append(
|
||
{
|
||
"index": index,
|
||
"active": index == session.active_index,
|
||
"url": getattr(page, "url", ""),
|
||
"title": BrowserSessionHelper._safe_page_title(page),
|
||
}
|
||
)
|
||
return tabs
|
||
|
||
@staticmethod
|
||
def focus_tab(session: _BrowserSessionState, tab_index: int) -> BrowserPage:
|
||
"""
|
||
切换当前会话的活动标签页。
|
||
|
||
:param session: 当前浏览器会话
|
||
:param tab_index: 标签页索引
|
||
:return: 切换后的页面对象
|
||
"""
|
||
if tab_index < 0 or tab_index >= len(session.pages):
|
||
raise ValueError(f"标签页索引不存在: {tab_index}")
|
||
session.active_index = tab_index
|
||
return session.active_page
|
||
|
||
@staticmethod
|
||
def close_tab(session: _BrowserSessionState, tab_index: int) -> list[dict[str, Any]]:
|
||
"""
|
||
关闭当前会话中的指定标签页。
|
||
|
||
:param session: 当前浏览器会话
|
||
:param tab_index: 标签页索引
|
||
:return: 关闭后的标签页列表
|
||
"""
|
||
if tab_index < 0 or tab_index >= len(session.pages):
|
||
raise ValueError(f"标签页索引不存在: {tab_index}")
|
||
page = session.pages.pop(tab_index)
|
||
try:
|
||
page.close()
|
||
except Exception as err:
|
||
logger.warning(f"关闭浏览器标签页失败: {str(err)}")
|
||
if not session.pages:
|
||
session.pages.append(session.context.new_page())
|
||
session.active_index = min(session.active_index, len(session.pages) - 1)
|
||
return BrowserSessionHelper.list_tabs(session)
|
||
|
||
def goto(
|
||
self,
|
||
page: BrowserPage,
|
||
url: str,
|
||
timeout: Optional[int] = 30,
|
||
allow_private_network: bool = False,
|
||
) -> Any:
|
||
"""
|
||
校验并导航页面到指定 URL。
|
||
|
||
:param page: 页面对象
|
||
:param url: 目标 URL
|
||
:param timeout: 导航超时时间,单位秒
|
||
:param allow_private_network: 是否允许访问本机或私网地址
|
||
:return: 浏览器导航响应对象
|
||
"""
|
||
self.validate_url(url, allow_private_network=allow_private_network)
|
||
response = page.goto(
|
||
url,
|
||
wait_until="domcontentloaded",
|
||
timeout=int(timeout or 30) * 1000,
|
||
)
|
||
try:
|
||
page.wait_for_load_state(
|
||
"networkidle",
|
||
timeout=min(int(timeout or 30), 15) * 1000,
|
||
)
|
||
except Exception:
|
||
pass
|
||
self.validate_current_url(page, allow_private_network=allow_private_network)
|
||
return response
|
||
|
||
@classmethod
|
||
def validate_current_url(
|
||
cls, page: BrowserPage, allow_private_network: bool = False
|
||
) -> None:
|
||
"""
|
||
校验当前页面地址,捕获跳转后的不安全目标。
|
||
|
||
:param page: 页面对象
|
||
:param allow_private_network: 是否允许访问本机或私网地址
|
||
"""
|
||
current_url = getattr(page, "url", "")
|
||
if current_url and current_url.startswith(("http://", "https://")):
|
||
cls.validate_url(current_url, allow_private_network=allow_private_network)
|
||
|
||
@classmethod
|
||
def build_snapshot(
|
||
cls,
|
||
page: BrowserPage,
|
||
status: Optional[Any] = None,
|
||
max_text_chars: int = 8000,
|
||
max_elements: int = 40,
|
||
) -> dict[str, Any]:
|
||
"""
|
||
构建包含可读文本和可交互元素 ref 的页面快照。
|
||
|
||
:param page: 页面对象
|
||
:param status: 可选的导航状态码
|
||
:param max_text_chars: 页面文本最大返回长度
|
||
:param max_elements: 最大可交互元素数量
|
||
:return: 页面快照字典
|
||
"""
|
||
text_content = cls._safe_inner_text(page, "body")
|
||
result = {
|
||
"url": getattr(page, "url", ""),
|
||
"title": cls._safe_page_title(page),
|
||
"text_content": cls._truncate_text(text_content, max_text_chars),
|
||
"interactive_elements": cls._extract_interactive_elements(
|
||
page, max_elements=max_elements
|
||
),
|
||
}
|
||
if status is not None:
|
||
result["status"] = status
|
||
|
||
links = [
|
||
{
|
||
"ref": element.get("ref"),
|
||
"text": element.get("text"),
|
||
"href": element.get("href"),
|
||
}
|
||
for element in result["interactive_elements"]
|
||
if element.get("tag") == "a" and element.get("href")
|
||
][:30]
|
||
forms = [
|
||
element
|
||
for element in result["interactive_elements"]
|
||
if element.get("tag") in {"input", "textarea", "select", "button"}
|
||
][:30]
|
||
if links:
|
||
result["links"] = links
|
||
if forms:
|
||
result["form_elements"] = forms
|
||
return result
|
||
|
||
@staticmethod
|
||
def _launch_context(
|
||
headless: bool,
|
||
user_agent: Optional[str] = None,
|
||
viewport: Optional[dict[str, int]] = None,
|
||
) -> BrowserContext:
|
||
from cloakbrowser import launch_context
|
||
|
||
context_kwargs = {
|
||
"headless": headless,
|
||
"humanize": settings.CLOAKBROWSER_HUMANIZE,
|
||
"human_preset": settings.CLOAKBROWSER_HUMAN_PRESET,
|
||
}
|
||
if user_agent:
|
||
context_kwargs["user_agent"] = user_agent
|
||
if viewport:
|
||
context_kwargs["viewport"] = viewport
|
||
return launch_context(**context_kwargs)
|
||
|
||
def _get_or_create_session(
|
||
self,
|
||
session_key: str,
|
||
user_agent: Optional[str] = None,
|
||
cookies: Optional[str] = None,
|
||
) -> _BrowserSessionState:
|
||
with self._sessions_lock:
|
||
session = self._sessions.get(session_key)
|
||
if session and user_agent and session.user_agent != user_agent:
|
||
self._sessions.pop(session_key, None)
|
||
self._close_session_state(session)
|
||
session = None
|
||
if session:
|
||
return session
|
||
|
||
context = self._launch_context(
|
||
headless=self.headless,
|
||
user_agent=user_agent,
|
||
viewport=self.viewport,
|
||
)
|
||
page = context.new_page()
|
||
if cookies:
|
||
page.set_extra_http_headers({"cookie": cookies})
|
||
session = _BrowserSessionState(
|
||
session_key=session_key,
|
||
context=context,
|
||
pages=[page],
|
||
user_agent=user_agent,
|
||
cookies=cookies,
|
||
)
|
||
self._sessions[session_key] = session
|
||
self._enforce_session_limit()
|
||
return session
|
||
|
||
@classmethod
|
||
def _prune_sessions(cls) -> None:
|
||
now = time.monotonic()
|
||
with cls._sessions_lock:
|
||
expired_keys = [
|
||
session_key
|
||
for session_key, session in cls._sessions.items()
|
||
if now - session.last_used_at > cls.SESSION_TTL_SECONDS
|
||
]
|
||
for session_key in expired_keys:
|
||
cls.close_session(session_key)
|
||
|
||
@classmethod
|
||
def _enforce_session_limit(cls) -> None:
|
||
while len(cls._sessions) > cls.MAX_SESSIONS:
|
||
oldest_key = min(
|
||
cls._sessions,
|
||
key=lambda key: cls._sessions[key].last_used_at,
|
||
)
|
||
session = cls._sessions.pop(oldest_key)
|
||
cls._close_session_state(session)
|
||
|
||
@staticmethod
|
||
def _close_session_state(session: _BrowserSessionState) -> None:
|
||
with session.lock:
|
||
for page in list(session.pages):
|
||
try:
|
||
page.close()
|
||
except Exception as err:
|
||
logger.warning(f"关闭浏览器页面失败: {str(err)}")
|
||
try:
|
||
session.context.close()
|
||
except Exception as err:
|
||
logger.warning(f"关闭浏览器上下文失败: {str(err)}")
|
||
|
||
@staticmethod
|
||
def _safe_page_title(page: BrowserPage) -> str:
|
||
try:
|
||
return page.title()
|
||
except Exception:
|
||
return ""
|
||
|
||
@staticmethod
|
||
def _safe_inner_text(page: BrowserPage, selector: str) -> str:
|
||
try:
|
||
return page.inner_text(selector)
|
||
except Exception:
|
||
return ""
|
||
|
||
@staticmethod
|
||
def _truncate_text(text: Optional[str], max_chars: int) -> str:
|
||
if not text:
|
||
return ""
|
||
if len(text) <= max_chars:
|
||
return text
|
||
return text[:max_chars] + "\n\n...(内容已截断)"
|
||
|
||
@classmethod
|
||
def _extract_interactive_elements(
|
||
cls, page: BrowserPage, max_elements: int
|
||
) -> list[dict[str, Any]]:
|
||
script = f"""
|
||
() => {{
|
||
const limit = {int(max_elements)};
|
||
const selector = [
|
||
'a[href]',
|
||
'button',
|
||
'input',
|
||
'textarea',
|
||
'select',
|
||
'[role="button"]',
|
||
'[role="link"]',
|
||
'[onclick]',
|
||
'summary'
|
||
].join(',');
|
||
const isVisible = (el) => {{
|
||
const style = window.getComputedStyle(el);
|
||
const rect = el.getBoundingClientRect();
|
||
return style && style.visibility !== 'hidden'
|
||
&& style.display !== 'none'
|
||
&& rect.width > 0
|
||
&& rect.height > 0;
|
||
}};
|
||
return Array.from(document.querySelectorAll(selector))
|
||
.filter(isVisible)
|
||
.slice(0, limit)
|
||
.map((el, index) => {{
|
||
const ref = `e${{index + 1}}`;
|
||
el.setAttribute('{cls.REF_ATTRIBUTE}', ref);
|
||
const tag = el.tagName.toLowerCase();
|
||
const text = (
|
||
el.innerText
|
||
|| el.value
|
||
|| el.getAttribute('aria-label')
|
||
|| el.getAttribute('title')
|
||
|| el.getAttribute('placeholder')
|
||
|| ''
|
||
).trim();
|
||
return {{
|
||
ref,
|
||
tag,
|
||
type: el.type || '',
|
||
text: text.substring(0, 120),
|
||
name: el.name || '',
|
||
id: el.id || '',
|
||
role: el.getAttribute('role') || '',
|
||
placeholder: el.getAttribute('placeholder') || '',
|
||
href: el.href || '',
|
||
value: tag === 'select' ? '' : (el.value || '').substring(0, 80),
|
||
selector: `[${cls.REF_ATTRIBUTE}="${{ref}}"]`
|
||
}};
|
||
}});
|
||
}}
|
||
"""
|
||
try:
|
||
elements = page.evaluate(script)
|
||
except Exception as err:
|
||
logger.debug(f"提取页面可交互元素失败: {str(err)}")
|
||
return []
|
||
if not isinstance(elements, list):
|
||
return []
|
||
return elements
|
||
|
||
|
||
class PlaywrightHelper:
|
||
def __init__(self, browser_type: Optional[str] = None, *args, **kwargs):
|
||
"""
|
||
兼容旧的 PlaywrightHelper(browser_type=...) 构造方式。
|
||
"""
|
||
self.browser_type = browser_type or settings.PLAYWRIGHT_BROWSER_TYPE
|
||
|
||
@staticmethod
|
||
def __browser_emulation() -> str:
|
||
"""
|
||
当前浏览器仿真类型。
|
||
"""
|
||
return (settings.BROWSER_EMULATION or "cloakbrowser").lower()
|
||
|
||
@staticmethod
|
||
def __launch_cloakbrowser_context(headless: bool,
|
||
user_agent: Optional[str] = None,
|
||
proxies: Optional[dict] = None) -> BrowserContext:
|
||
"""
|
||
启动 CloakBrowser 上下文。
|
||
"""
|
||
from cloakbrowser import launch_context
|
||
|
||
return launch_context(headless=headless,
|
||
proxy=proxies,
|
||
user_agent=user_agent,
|
||
humanize=settings.CLOAKBROWSER_HUMANIZE,
|
||
human_preset=settings.CLOAKBROWSER_HUMAN_PRESET)
|
||
|
||
@staticmethod
|
||
def __fs_cookie_str(cookies: list) -> str:
|
||
if not cookies:
|
||
return ""
|
||
return "; ".join([f"{c.get('name')}={c.get('value')}" for c in cookies if c and c.get('name') is not None])
|
||
|
||
@staticmethod
|
||
def __flaresolverr_request(url: str,
|
||
cookies: Optional[str] = None,
|
||
proxy_config: Optional[dict] = None,
|
||
timeout: Optional[int] = 60) -> Optional[dict]:
|
||
"""
|
||
调用 FlareSolverr 解决 Cloudflare 并返回 solution 结果
|
||
参考: https://github.com/FlareSolverr/FlareSolverr
|
||
"""
|
||
if not settings.FLARESOLVERR_URL:
|
||
logger.warn("未配置 FLARESOLVERR_URL,无法使用 FlareSolverr")
|
||
return None
|
||
|
||
fs_api = settings.FLARESOLVERR_URL.rstrip("/") + "/v1"
|
||
session_id = None
|
||
|
||
try:
|
||
# 检查是否需要代理认证
|
||
need_proxy_auth = (proxy_config and proxy_config.get("server") and
|
||
(proxy_config.get("username") or proxy_config.get("password")))
|
||
|
||
if need_proxy_auth:
|
||
# 使用 session 模式支持代理认证
|
||
logger.debug("检测到flaresolverr代理需要认证,使用 session 模式")
|
||
|
||
# 1. 创建会话
|
||
session_id = str(uuid.uuid4())
|
||
create_payload: dict = {
|
||
"cmd": "sessions.create",
|
||
"session": session_id
|
||
}
|
||
|
||
# 添加代理配置到会话创建请求
|
||
if proxy_config and proxy_config.get("server"):
|
||
proxy_payload: dict = {"url": proxy_config["server"]}
|
||
if proxy_config.get("username"):
|
||
proxy_payload["username"] = proxy_config["username"]
|
||
if proxy_config.get("password"):
|
||
proxy_payload["password"] = proxy_config["password"]
|
||
create_payload["proxy"] = proxy_payload
|
||
|
||
# 创建会话
|
||
create_result = RequestUtils(content_type="application/json",
|
||
timeout=timeout or 60).post_json(url=fs_api, json=create_payload)
|
||
if not create_result or create_result.get("status") != "ok":
|
||
logger.error(
|
||
f"创建 FlareSolverr 会话失败: {create_result.get('message') if create_result else '无响应'}")
|
||
return None
|
||
|
||
# 2. 使用会话发送请求
|
||
request_payload = {
|
||
"cmd": "request.get",
|
||
"url": url,
|
||
"session": session_id,
|
||
"maxTimeout": int(timeout or 60) * 1000,
|
||
}
|
||
else:
|
||
# 使用普通模式(无代理认证)
|
||
request_payload = {
|
||
"cmd": "request.get",
|
||
"url": url,
|
||
"maxTimeout": int(timeout or 60) * 1000,
|
||
}
|
||
# 添加代理配置(仅 URL,无认证)
|
||
if proxy_config and proxy_config.get("server"):
|
||
request_payload["proxy"] = {"url": proxy_config["server"]}
|
||
|
||
# 将 cookies 以数组形式传递给 FlareSolverr
|
||
if cookies:
|
||
try:
|
||
request_payload["cookies"] = cookie_parse(cookies, array=True)
|
||
except Exception as e:
|
||
logger.debug(f"解析 cookies 失败,忽略: {str(e)}")
|
||
|
||
# 发送请求
|
||
data = RequestUtils(content_type="application/json",
|
||
timeout=timeout or 60).post_json(url=fs_api, json=request_payload)
|
||
if not data:
|
||
logger.error("FlareSolverr 返回空响应")
|
||
return None
|
||
if data.get("status") != "ok":
|
||
logger.error(f"FlareSolverr 调用失败: {data.get('message')}")
|
||
return None
|
||
return data.get("solution")
|
||
except Exception as e:
|
||
logger.error(f"调用 FlareSolverr 失败: {str(e)}")
|
||
return None
|
||
finally:
|
||
# 清理会话
|
||
if session_id:
|
||
try:
|
||
destroy_payload = {
|
||
"cmd": "sessions.destroy",
|
||
"session": session_id
|
||
}
|
||
RequestUtils(content_type="application/json",
|
||
timeout=10).post_json(url=fs_api, json=destroy_payload)
|
||
logger.debug(f"已清理 FlareSolverr 会话: {session_id}")
|
||
except Exception as e:
|
||
logger.warning(f"清理 FlareSolverr 会话失败: {str(e)}")
|
||
|
||
def action(self, url: str,
|
||
callback: Callable[[BrowserPage], Any],
|
||
cookies: Optional[str] = None,
|
||
ua: Optional[str] = None,
|
||
proxies: Optional[dict] = None,
|
||
headless: Optional[bool] = False,
|
||
timeout: Optional[int] = 60) -> Any:
|
||
"""
|
||
访问网页,接收Page对象并执行操作
|
||
:param url: 网页地址
|
||
:param callback: 回调函数,需要接收page对象
|
||
:param cookies: cookies
|
||
:param ua: user-agent
|
||
:param proxies: 代理
|
||
:param headless: 是否无头模式
|
||
:param timeout: 超时时间
|
||
"""
|
||
result = None
|
||
try:
|
||
context = None
|
||
page = None
|
||
try:
|
||
# 如果配置使用 FlareSolverr,先通过其获取清除后的 cookies 与 UA
|
||
fs_cookie_header = None
|
||
fs_ua = None
|
||
if self.__browser_emulation() == "flaresolverr":
|
||
solution = self.__flaresolverr_request(url=url, cookies=cookies,
|
||
proxy_config=proxies, timeout=timeout)
|
||
if solution:
|
||
fs_cookie_header = self.__fs_cookie_str(solution.get("cookies", []))
|
||
fs_ua = solution.get("userAgent")
|
||
|
||
context = self.__launch_cloakbrowser_context(headless=headless,
|
||
user_agent=fs_ua or ua,
|
||
proxies=proxies)
|
||
page = context.new_page()
|
||
|
||
# 优先使用 FlareSolverr 返回,其次使用入参
|
||
merged_cookie = fs_cookie_header or cookies
|
||
if merged_cookie:
|
||
page.set_extra_http_headers({"cookie": merged_cookie})
|
||
|
||
page.goto(url)
|
||
page.wait_for_load_state("networkidle", timeout=timeout * 1000)
|
||
|
||
# 回调函数
|
||
result = callback(page)
|
||
|
||
except Exception as e:
|
||
logger.error(f"网页操作失败: {str(e)}")
|
||
finally:
|
||
if page:
|
||
page.close()
|
||
if context:
|
||
context.close()
|
||
except Exception as e:
|
||
logger.error(f"CloakBrowser初始化失败: {str(e)}")
|
||
|
||
return result
|
||
|
||
def get_page_source(self, url: str,
|
||
cookies: Optional[str] = None,
|
||
ua: Optional[str] = None,
|
||
proxies: Optional[dict] = None,
|
||
headless: Optional[bool] = False,
|
||
timeout: Optional[int] = 60) -> Optional[str]:
|
||
"""
|
||
获取网页源码
|
||
:param url: 网页地址
|
||
:param cookies: cookies
|
||
:param ua: user-agent
|
||
:param proxies: 代理
|
||
:param headless: 是否无头模式
|
||
:param timeout: 超时时间
|
||
"""
|
||
source = None
|
||
# 如果配置为 FlareSolverr,则直接调用获取页面源码
|
||
if self.__browser_emulation() == "flaresolverr":
|
||
try:
|
||
solution = self.__flaresolverr_request(url=url, cookies=cookies,
|
||
proxy_config=proxies, timeout=timeout)
|
||
if solution:
|
||
return solution.get("response")
|
||
except Exception as e:
|
||
logger.error(f"FlareSolverr 获取源码失败: {str(e)}")
|
||
try:
|
||
context = None
|
||
page = None
|
||
try:
|
||
context = self.__launch_cloakbrowser_context(headless=headless,
|
||
user_agent=ua,
|
||
proxies=proxies)
|
||
page = context.new_page()
|
||
|
||
if cookies:
|
||
page.set_extra_http_headers({"cookie": cookies})
|
||
|
||
page.goto(url)
|
||
page.wait_for_load_state("networkidle", timeout=timeout * 1000)
|
||
|
||
source = page.content()
|
||
|
||
except Exception as e:
|
||
logger.error(f"获取网页源码失败: {str(e)}")
|
||
source = None
|
||
finally:
|
||
# 确保资源被正确清理
|
||
if page:
|
||
page.close()
|
||
if context:
|
||
context.close()
|
||
except Exception as e:
|
||
logger.error(f"CloakBrowser初始化失败: {str(e)}")
|
||
|
||
return source
|