Compare commits

...

16 Commits

Author SHA1 Message Date
jxxghp
6156b9a481 Merge pull request #4561 from jxxghp/cursor/move-media-files-to-season-directory-6ee0 2025-07-08 18:00:50 +08:00
Cursor Agent
8c516c5691 Fix: Ensure parent item exists before saving NFO file
Co-authored-by: jxxghp <jxxghp@163.com>
2025-07-08 09:51:43 +00:00
Cursor Agent
bf9a149898 Fix TV show metadata scraping to use correct parent directory
Co-authored-by: jxxghp <jxxghp@163.com>
2025-07-08 09:31:35 +00:00
jxxghp
277cde8db2 更新 version.py 2025-07-08 12:17:57 +08:00
jxxghp
e06bdaf53e fix:资源包升级失败时一直重启的问题 2025-07-08 12:06:30 +08:00
jxxghp
da367bd138 fix spider 2025-07-08 11:25:36 +08:00
jxxghp
d336bcbf1f fix etree 2025-07-08 11:00:38 +08:00
jxxghp
a8aedba6ff fix https://github.com/jxxghp/MoviePilot/issues/4552 2025-07-08 09:34:24 +08:00
jxxghp
9ede86c6a3 Merge pull request #4555 from cddjr/fix_local_exists 2025-07-07 23:30:51 +08:00
景大侠
1468f2b082 fix 本地媒体文件检查时首选含影视标题的目录
避免了以年份、分辨率等作为重命名第一层目录时的误判问题
2025-07-07 23:24:04 +08:00
jxxghp
e04ae70f89 Merge pull request #4553 from cddjr/fix_trim_task 2025-07-07 22:15:12 +08:00
景大侠
7f7d2c9ba8 fix 飞牛刷新媒体库报错Task duplicate 2025-07-07 21:46:17 +08:00
jxxghp
d73deef8dc Merge pull request #4549 from cddjr/fix_tr 2025-07-07 17:28:28 +08:00
景大侠
f93a1540af fix TR模块报错找不到_protocol属性
v2.5.9引入的bug
2025-07-07 17:05:28 +08:00
jxxghp
c8bd9cb716 Merge pull request #4548 from cddjr/set_lock_timeout 2025-07-07 12:04:46 +08:00
景大侠
2ed13c7e5b fix 订阅匹配锁增加超时,避免罕见的长时间卡任务问题 2025-07-07 11:51:58 +08:00
29 changed files with 1753 additions and 806 deletions

View File

@@ -19,7 +19,6 @@ from app.utils.string import StringUtils
recognize_lock = Lock()
scraping_lock = Lock()
scraping_files = []
class MediaChain(ChainBase):
@@ -35,25 +34,25 @@ class MediaChain(ChainBase):
switchs = SystemConfigOper().get(SystemConfigKey.ScrapingSwitchs) or {}
# 默认配置
default_switchs = {
'movie_nfo': True, # 电影NFO
'movie_poster': True, # 电影海报
'movie_backdrop': True, # 电影背景图
'movie_logo': True, # 电影Logo
'movie_disc': True, # 电影光盘图
'movie_banner': True, # 电影横幅图
'movie_thumb': True, # 电影缩略图
'tv_nfo': True, # 电视剧NFO
'tv_poster': True, # 电视剧海报
'tv_backdrop': True, # 电视剧背景图
'tv_banner': True, # 电视剧横幅图
'tv_logo': True, # 电视剧Logo
'tv_thumb': True, # 电视剧缩略图
'season_nfo': True, # 季NFO
'season_poster': True, # 季海报
'season_banner': True, # 季横幅图
'season_thumb': True, # 季缩略图
'episode_nfo': True, # 集NFO
'episode_thumb': True # 集缩略图
'movie_nfo': True, # 电影NFO
'movie_poster': True, # 电影海报
'movie_backdrop': True, # 电影背景图
'movie_logo': True, # 电影Logo
'movie_disc': True, # 电影光盘图
'movie_banner': True, # 电影横幅图
'movie_thumb': True, # 电影缩略图
'tv_nfo': True, # 电视剧NFO
'tv_poster': True, # 电视剧海报
'tv_backdrop': True, # 电视剧背景图
'tv_banner': True, # 电视剧横幅图
'tv_logo': True, # 电视剧Logo
'tv_thumb': True, # 电视剧缩略图
'season_nfo': True, # 季NFO
'season_poster': True, # 季海报
'season_banner': True, # 季横幅图
'season_thumb': True, # 季缩略图
'episode_nfo': True, # 集NFO
'episode_thumb': True # 集缩略图
}
# 合并用户配置和默认配置
for key, default_value in default_switchs.items():
@@ -344,23 +343,49 @@ class MediaChain(ChainBase):
return
event_data = event.event_data or {}
fileitem: FileItem = event_data.get("fileitem")
file_list: List[str] = event_data.get("file_list", [])
meta: MetaBase = event_data.get("meta")
mediainfo: MediaInfo = event_data.get("mediainfo")
overwrite = event_data.get("overwrite", False)
if not fileitem:
return
# 刮削锁
with scraping_lock:
if fileitem.path in scraping_files:
# 检查文件项是否存在
storagechain = StorageChain()
if not storagechain.get_item(fileitem):
logger.warn(f"文件项不存在:{fileitem.path}")
return
scraping_files.append(fileitem.path)
try:
# 执行刮削
self.scrape_metadata(fileitem=fileitem, meta=meta, mediainfo=mediainfo, overwrite=overwrite)
finally:
# 释放锁
with scraping_lock:
scraping_files.remove(fileitem.path)
# 检查是否为目录
if fileitem.type == "file":
# 单个文件刮削
self.scrape_metadata(fileitem=fileitem,
mediainfo=mediainfo,
init_folder=False,
parent=storagechain.get_parent_item(fileitem),
overwrite=overwrite)
else:
# 检查目的目录下是否已经有nfo刮削文件
sub_files = storagechain.list_files(fileitem)
if any(f.name.endswith('.nfo') for f in sub_files):
logger.info(f"目录 {fileitem.path} 已有NFO文件开始增量刮削...")
for file_path in file_list:
file_item = storagechain.get_file_item(storage=fileitem.storage,
path=Path(file_path))
if file_item:
# 对于电视剧文件,应该保存到与视频文件相同的目录
# 而不是电视剧根目录
self.scrape_metadata(fileitem=file_item,
mediainfo=mediainfo,
init_folder=False,
parent=None, # 让函数内部自动获取正确的父目录
overwrite=overwrite)
else:
# 执行全量刮削
logger.info(f"开始全量刮削目录 {fileitem.path} ...")
self.scrape_metadata(fileitem=fileitem, meta=meta, init_folder=True,
mediainfo=mediainfo, overwrite=overwrite)
def scrape_metadata(self, fileitem: schemas.FileItem,
meta: MetaBase = None, mediainfo: MediaInfo = None,
@@ -436,6 +461,9 @@ class MediaChain(ChainBase):
logger.error(f"{_url} 图片下载失败:{str(err)}")
return None
if not fileitem:
return
# 当前文件路径
filepath = Path(fileitem.path)
if fileitem.type == "file" \
@@ -464,6 +492,8 @@ class MediaChain(ChainBase):
movie_nfo = self.metadata_nfo(meta=meta, mediainfo=mediainfo)
if movie_nfo:
# 保存或上传nfo文件到上级目录
if not parent:
parent = storagechain.get_parent_item(fileitem)
__save_file(_fileitem=parent, _path=nfo_path, _content=movie_nfo)
else:
logger.warn(f"{filepath.name} nfo文件生成失败")
@@ -494,8 +524,9 @@ class MediaChain(ChainBase):
files = __list_files(_fileitem=fileitem)
for file in files:
self.scrape_metadata(fileitem=file,
meta=meta, mediainfo=mediainfo,
init_folder=False, parent=fileitem,
mediainfo=mediainfo,
init_folder=False,
parent=fileitem,
overwrite=overwrite)
# 生成目录内图片文件
if init_folder:
@@ -587,11 +618,11 @@ class MediaChain(ChainBase):
else:
logger.info("集缩略图刮削已关闭,跳过")
else:
# 当前为目录,处理目录内的文件
# 当前为电视剧目录,处理目录内的文件
files = __list_files(_fileitem=fileitem)
for file in files:
self.scrape_metadata(fileitem=file,
meta=meta, mediainfo=mediainfo,
mediainfo=mediainfo,
parent=fileitem if file.type == "file" else None,
init_folder=True if file.type == "dir" else False,
overwrite=overwrite)
@@ -659,7 +690,8 @@ class MediaChain(ChainBase):
# 只下载当前刮削季的图片
image_season = "00" if "specials" in image_name else image_name[6:8]
if image_season != str(season_meta.begin_season).rjust(2, '0'):
logger.info(f"当前刮削季为:{season_meta.begin_season},跳过文件:{image_path}")
logger.info(
f"当前刮削季为:{season_meta.begin_season},跳过文件:{image_path}")
continue
if overwrite or not storagechain.get_file_item(storage=fileitem.storage,
path=image_path):

View File

@@ -271,16 +271,20 @@ class SiteChain(ChainBase):
logger.error(f"获取站点页面失败:{url}")
return favicon_url, None
html = etree.HTML(html_text)
if StringUtils.is_valid_html_element(html):
fav_link = html.xpath('//head/link[contains(@rel, "icon")]/@href')
if fav_link:
favicon_url = urljoin(url, fav_link[0])
try:
if StringUtils.is_valid_html_element(html):
fav_link = html.xpath('//head/link[contains(@rel, "icon")]/@href')
if fav_link:
favicon_url = urljoin(url, fav_link[0])
res = RequestUtils(cookies=cookie, timeout=15, ua=ua).get_res(url=favicon_url)
if res:
return favicon_url, base64.b64encode(res.content).decode()
else:
logger.error(f"获取站点图标失败:{favicon_url}")
res = RequestUtils(cookies=cookie, timeout=15, ua=ua).get_res(url=favicon_url)
if res:
return favicon_url, base64.b64encode(res.content).decode()
else:
logger.error(f"获取站点图标失败:{favicon_url}")
finally:
if html is not None:
del html
return favicon_url, None
def sync_cookies(self, manual=False) -> Tuple[bool, str]:

View File

@@ -39,6 +39,8 @@ class SubscribeChain(ChainBase):
"""
_rlock = threading.RLock()
# 避免莫名原因导致长时间持有锁
_LOCK_TIMOUT = 3600 * 2
def add(self, title: str, year: str,
mtype: MediaType = None,
@@ -279,8 +281,15 @@ class SubscribeChain(ChainBase):
:param manual: 是否手动搜索
:return: 更新订阅状态为R或删除订阅
"""
with self._rlock:
logger.debug(f"search lock acquired at {datetime.now()}")
lock_acquired = False
try:
if lock_acquired := self._rlock.acquire(
blocking=True, timeout=self._LOCK_TIMOUT
):
logger.debug(f"search lock acquired at {datetime.now()}")
else:
logger.warn("search上锁超时")
subscribeoper = SubscribeOper()
if sid:
subscribe = subscribeoper.get(sid)
@@ -437,8 +446,10 @@ class SubscribeChain(ChainBase):
finally:
subscribes.clear()
del subscribes
logger.debug(f"search Lock released at {datetime.now()}")
finally:
if lock_acquired:
self._rlock.release()
logger.debug(f"search Lock released at {datetime.now()}")
# 如果不是大内存模式,进行垃圾回收
if not settings.BIG_MEMORY_MODE:
@@ -565,8 +576,14 @@ class SubscribeChain(ChainBase):
logger.warn('没有缓存资源,无法匹配订阅')
return
with self._rlock:
logger.debug(f"match lock acquired at {datetime.now()}")
lock_acquired = False
try:
if lock_acquired := self._rlock.acquire(
blocking=True, timeout=self._LOCK_TIMOUT
):
logger.debug(f"match lock acquired at {datetime.now()}")
else:
logger.warn("match上锁超时")
# 预识别所有未识别的种子
processed_torrents: Dict[str, List[Context]] = {}
@@ -822,8 +839,10 @@ class SubscribeChain(ChainBase):
del processed_torrents
subscribes.clear()
del subscribes
logger.debug(f"match Lock released at {datetime.now()}")
finally:
if lock_acquired:
self._rlock.release()
logger.debug(f"match Lock released at {datetime.now()}")
def check(self):
"""

View File

@@ -488,7 +488,9 @@ class TransferChain(ChainBase, metaclass=Singleton):
self.eventmanager.send_event(EventType.MetadataScrape, {
'meta': task.meta,
'mediainfo': task.mediainfo,
'fileitem': transferinfo.target_diritem
'fileitem': transferinfo.target_diritem,
'file_list': transferinfo.file_list_new,
'overwrite': False
})
# 移除已完成的任务

View File

@@ -312,4 +312,3 @@ class StreamingPlatforms(metaclass=Singleton):
if name is None:
return False
return name.upper() in self._lookup_cache

View File

@@ -96,53 +96,58 @@ class CookieHelper:
return None, None, "获取源码失败"
# 查找用户名输入框
html = etree.HTML(html_text)
username_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("username"):
if html.xpath(xpath):
username_xpath = xpath
break
if not username_xpath:
return None, None, "未找到用户名输入框"
# 查找密码输入框
password_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("password"):
if html.xpath(xpath):
password_xpath = xpath
break
if not password_xpath:
return None, None, "未找到密码输入框"
# 处理二步验证码
otp_code = TwoFactorAuth(two_step_code).get_code()
# 查找二步验证码输入框
twostep_xpath = None
if otp_code:
for xpath in self._SITE_LOGIN_XPATH.get("twostep"):
try:
username_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("username"):
if html.xpath(xpath):
twostep_xpath = xpath
username_xpath = xpath
break
# 查找验证码输入框
captcha_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("captcha"):
if html.xpath(xpath):
captcha_xpath = xpath
break
# 查找验证码图片
captcha_img_url = None
if captcha_xpath:
for xpath in self._SITE_LOGIN_XPATH.get("captcha_img"):
if not username_xpath:
return None, None, "未找到用户名输入框"
# 查找密码输入框
password_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("password"):
if html.xpath(xpath):
captcha_img_url = html.xpath(xpath)[0]
password_xpath = xpath
break
if not captcha_img_url:
return None, None, "未找到验证码图片"
# 查找登录按钮
submit_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("submit"):
if html.xpath(xpath):
submit_xpath = xpath
break
if not submit_xpath:
return None, None, "未找到登录按钮"
if not password_xpath:
return None, None, "未找到密码输入框"
# 处理二步验证码
otp_code = TwoFactorAuth(two_step_code).get_code()
# 查找二步验证码输入框
twostep_xpath = None
if otp_code:
for xpath in self._SITE_LOGIN_XPATH.get("twostep"):
if html.xpath(xpath):
twostep_xpath = xpath
break
# 查找验证码输入框
captcha_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("captcha"):
if html.xpath(xpath):
captcha_xpath = xpath
break
# 查找验证码图片
captcha_img_url = None
if captcha_xpath:
for xpath in self._SITE_LOGIN_XPATH.get("captcha_img"):
if html.xpath(xpath):
captcha_img_url = html.xpath(xpath)[0]
break
if not captcha_img_url:
return None, None, "未找到验证码图片"
# 查找登录按钮
submit_xpath = None
for xpath in self._SITE_LOGIN_XPATH.get("submit"):
if html.xpath(xpath):
submit_xpath = xpath
break
if not submit_xpath:
return None, None, "未找到登录按钮"
finally:
if html is not None:
del html
# 点击登录按钮
try:
# 等待登录按钮准备好
@@ -185,19 +190,23 @@ class CookieHelper:
if not otp_code:
return None, None, "需要二次验证码"
html = etree.HTML(page.content())
for xpath in self._SITE_LOGIN_XPATH.get("twostep"):
if html.xpath(xpath):
try:
# 刷新一下 2fa code
otp_code = TwoFactorAuth(two_step_code).get_code()
page.fill(xpath, otp_code)
# 登录按钮 xpath 理论上相同,不再重复查找
page.click(submit_xpath)
page.wait_for_load_state("networkidle", timeout=30 * 1000)
except Exception as e:
logger.error(f"二次验证码输入失败:{str(e)}")
return None, None, f"二次验证码输入失败:{str(e)}"
break
try:
for xpath in self._SITE_LOGIN_XPATH.get("twostep"):
if html.xpath(xpath):
try:
# 刷新一下 2fa code
otp_code = TwoFactorAuth(two_step_code).get_code()
page.fill(xpath, otp_code)
# 登录按钮 xpath 理论上相同,不再重复查找
page.click(submit_xpath)
page.wait_for_load_state("networkidle", timeout=30 * 1000)
except Exception as e:
logger.error(f"二次验证码输入失败:{str(e)}")
return None, None, f"二次验证码输入失败:{str(e)}"
break
finally:
if html is not None:
del html
# 登录后的源码
html_text = page.content()
if not html_text:

View File

@@ -8,6 +8,7 @@ from app.log import logger
from app.utils.http import RequestUtils
from app.utils.string import StringUtils
from app.utils.system import SystemUtils
from version import APP_VERSION
class ResourceHelper:
@@ -58,15 +59,15 @@ class ResourceHelper:
if rtype == "auth":
# 站点认证资源
local_version = SitesHelper().auth_version
# 阻断v2.3.0以下的版本直接更新,避免无限重启
# 阻断站点认证资源v2.3.0以下的版本直接更新,避免无限重启
if StringUtils.compare_version(local_version, "<", "2.3.0"):
continue
# 阻断主程序版本v2.6.3以下的版本直接更新,避免搜索异常
if StringUtils.compare_version(APP_VERSION, "<", "2.6.3"):
continue
elif rtype == "sites":
# 站点索引资源
local_version = SitesHelper().indexer_version
# 阻断v2.0.0以下的版本直接更新,避免无限重启
if StringUtils.compare_version(local_version, "<", "2.0.0"):
continue
else:
continue
if StringUtils.compare_version(version, ">", local_version):
@@ -84,6 +85,8 @@ class ResourceHelper:
elif not r:
return None, "连接仓库失败"
files_info = r.json()
# 下载资源文件
success = True
for item in files_info:
save_path = need_updates.get(item.get("name"))
if not save_path:
@@ -96,16 +99,23 @@ class ResourceHelper:
timeout=180).get_res(download_url)
if not res:
logger.error(f"文件 {item.get('name')} 下载失败!")
success = False
break
elif res.status_code != 200:
logger.error(f"下载文件 {item.get('name')} 失败:{res.status_code} - {res.reason}")
success = False
break
# 创建插件文件夹
file_path = self._base_dir / save_path / item.get("name")
if not file_path.parent.exists():
file_path.parent.mkdir(parents=True, exist_ok=True)
# 写入文件
file_path.write_bytes(res.content)
logger.info("资源包更新完成,开始重启服务...")
SystemHelper.restart()
if success:
logger.info("资源包更新完成,开始重启服务...")
SystemHelper.restart()
else:
logger.warn("资源包更新失败,跳过升级!")
else:
logger.info("所有资源已最新,无需更新")
except json.JSONDecodeError:

View File

@@ -530,7 +530,14 @@ class FileManagerModule(_ModuleBase):
mediainfo=mediainfo)
)
# 计算重命名中的文件夹层数
rename_format_level = len(rename_format.split("/")) - 1
rename_list = rename_format.split("/")
rename_format_level = len(rename_list) - 1
for level, name in enumerate(rename_list):
# 处理特例,有的人重命名第一层是年份、分辨率
if "{{title}}" in name:
# 找出含标题的这一层作为扫描路径
rename_format_level -= level
break
# 取相对路径的第1层目录
media_path = target_path.parents[rename_format_level - 1]
if dir_path.is_relative_to(media_path):
@@ -562,9 +569,12 @@ class FileManagerModule(_ModuleBase):
if not settings.LOCAL_EXISTS_SEARCH:
return None
logger.debug(f"正在本地媒体库中查找 {mediainfo.title_year}...")
# 检查媒体库
fileitems = self.media_files(mediainfo)
if not fileitems:
logger.debug(f"{mediainfo.title_year} 不在本地媒体库中")
return None
if mediainfo.type == MediaType.MOVIE:

View File

@@ -219,8 +219,11 @@ class U115Pan(StorageBase, metaclass=Singleton):
# 处理速率限制
if resp.status_code == 429:
reset_time = int(resp.headers.get("X-RateLimit-Reset", 60))
time.sleep(reset_time + 5)
reset_time = 5 + int(resp.headers.get("X-RateLimit-Reset", 60))
logger.debug(
f"【115】{method} 请求 {endpoint} 限流,等待{reset_time}秒后重试"
)
time.sleep(reset_time)
return self._request_api(method, endpoint, result_key, **kwargs)
# 处理请求错误

View File

@@ -5,10 +5,11 @@ from app.core.config import settings
from app.core.context import TorrentInfo
from app.db.site_oper import SiteOper
from app.helper.module import ModuleHelper
from app.helper.sites import SitesHelper, SiteSpider
from app.helper.sites import SitesHelper
from app.log import logger
from app.modules import _ModuleBase
from app.modules.indexer.parser import SiteParserBase
from app.modules.indexer.spider import SiteSpider
from app.modules.indexer.spider.haidan import HaiDanSpider
from app.modules.indexer.spider.hddolby import HddolbySpider
from app.modules.indexer.spider.mtorrent import MTorrentSpider

View File

@@ -14,15 +14,18 @@ class DiscuzUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
user_info = html.xpath('//a[contains(@href, "&uid=")]')
if user_info:
user_id_match = re.search(r"&uid=(\d+)", user_info[0].attrib['href'])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._torrent_seeding_page = f"forum.php?&mod=torrents&cat_5up=on"
self._user_detail_page = user_info[0].attrib['href']
self.username = user_info[0].text.strip()
try:
user_info = html.xpath('//a[contains(@href, "&uid=")]')
if user_info:
user_id_match = re.search(r"&uid=(\d+)", user_info[0].attrib['href'])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._torrent_seeding_page = f"forum.php?&mod=torrents&cat_5up=on"
self._user_detail_page = user_info[0].attrib['href']
self.username = user_info[0].text.strip()
finally:
if html is not None:
del html
def _parse_site_page(self, html_text: str):
pass
@@ -34,40 +37,44 @@ class DiscuzUserInfo(SiteParserBase):
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
# 用户等级
user_levels_text = html.xpath('//a[contains(@href, "usergroup")]/text()')
if user_levels_text:
self.user_level = user_levels_text[-1].strip()
# 用户等级
user_levels_text = html.xpath('//a[contains(@href, "usergroup")]/text()')
if user_levels_text:
self.user_level = user_levels_text[-1].strip()
# 加入日期
join_at_text = html.xpath('//li[em[text()="注册时间"]]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].strip())
# 加入日期
join_at_text = html.xpath('//li[em[text()="注册时间"]]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].strip())
# 分享率
ratio_text = html.xpath('//li[contains(.//text(), "分享率")]//text()')
if ratio_text:
ratio_match = re.search(r"\(([\d,.]+)\)", ratio_text[0])
if ratio_match and ratio_match.group(1).strip():
self.bonus = StringUtils.str_float(ratio_match.group(1))
# 分享率
ratio_text = html.xpath('//li[contains(.//text(), "分享率")]//text()')
if ratio_text:
ratio_match = re.search(r"\(([\d,.]+)\)", ratio_text[0])
if ratio_match and ratio_match.group(1).strip():
self.bonus = StringUtils.str_float(ratio_match.group(1))
# 积分
bouns_text = html.xpath('//li[em[text()="积分"]]/text()')
if bouns_text:
self.bonus = StringUtils.str_float(bouns_text[0].strip())
# 积分
bouns_text = html.xpath('//li[em[text()="积分"]]/text()')
if bouns_text:
self.bonus = StringUtils.str_float(bouns_text[0].strip())
# 上传
upload_text = html.xpath('//li[em[contains(text(),"上传量")]]/text()')
if upload_text:
self.upload = StringUtils.num_filesize(upload_text[0].strip().split('/')[-1])
# 上传
upload_text = html.xpath('//li[em[contains(text(),"上传量")]]/text()')
if upload_text:
self.upload = StringUtils.num_filesize(upload_text[0].strip().split('/')[-1])
# 下载
download_text = html.xpath('//li[em[contains(text(),"下载量")]]/text()')
if download_text:
self.download = StringUtils.num_filesize(download_text[0].strip().split('/')[-1])
# 下载
download_text = html.xpath('//li[em[contains(text(),"下载量")]]/text()')
if download_text:
self.download = StringUtils.num_filesize(download_text[0].strip().split('/')[-1])
finally:
if html is not None:
del html
def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: bool = False) -> Optional[str]:
"""
@@ -77,44 +84,48 @@ class DiscuzUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 3
seeders_col = 4
# 搜索size列
if html.xpath('//tr[position()=1]/td[.//img[@class="size"] and .//img[@alt="size"]]'):
size_col = len(html.xpath('//tr[position()=1]/td[.//img[@class="size"] '
'and .//img[@alt="size"]]/preceding-sibling::td')) + 1
# 搜索seeders列
if html.xpath('//tr[position()=1]/td[.//img[@class="seeders"] and .//img[@alt="seeders"]]'):
seeders_col = len(html.xpath('//tr[position()=1]/td[.//img[@class="seeders"] '
'and .//img[@alt="seeders"]]/preceding-sibling::td')) + 1
size_col = 3
seeders_col = 4
# 搜索size列
if html.xpath('//tr[position()=1]/td[.//img[@class="size"] and .//img[@alt="size"]]'):
size_col = len(html.xpath('//tr[position()=1]/td[.//img[@class="size"] '
'and .//img[@alt="size"]]/preceding-sibling::td')) + 1
# 搜索seeders列
if html.xpath('//tr[position()=1]/td[.//img[@class="seeders"] and .//img[@alt="seeders"]]'):
seeders_col = len(html.xpath('//tr[position()=1]/td[.//img[@class="seeders"] '
'and .//img[@alt="seeders"]]/preceding-sibling::td')) + 1
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//tr[position()>1]/td[{seeders_col}]//text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//tr[position()>1]/td[{seeders_col}]//text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
finally:
if html is not None:
del html
return next_page

View File

@@ -24,10 +24,13 @@ class FileListSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//text()')
if ret:
self.username = str(ret[0])
try:
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//text()')
if ret:
self.username = str(ret[0])
finally:
if html is not None:
del html
def _parse_user_traffic_info(self, html_text: str):
"""
@@ -40,39 +43,41 @@ class FileListSiteUserInfo(SiteParserBase):
def _parse_user_detail_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
try:
upload_html = html.xpath('//table//tr/td[text()="Uploaded"]/following-sibling::td//text()')
if upload_html:
self.upload = StringUtils.num_filesize(upload_html[0])
download_html = html.xpath('//table//tr/td[text()="Downloaded"]/following-sibling::td//text()')
if download_html:
self.download = StringUtils.num_filesize(download_html[0])
upload_html = html.xpath('//table//tr/td[text()="Uploaded"]/following-sibling::td//text()')
if upload_html:
self.upload = StringUtils.num_filesize(upload_html[0])
download_html = html.xpath('//table//tr/td[text()="Downloaded"]/following-sibling::td//text()')
if download_html:
self.download = StringUtils.num_filesize(download_html[0])
ratio_html = html.xpath('//table//tr/td[text()="Share ratio"]/following-sibling::td//text()')
if ratio_html:
share_ratio = StringUtils.str_float(ratio_html[0])
else:
share_ratio = 0
self.ratio = 0 if self.download == 0 else share_ratio
ratio_html = html.xpath('//table//tr/td[text()="Share ratio"]/following-sibling::td//text()')
if ratio_html:
share_ratio = StringUtils.str_float(ratio_html[0])
else:
share_ratio = 0
self.ratio = 0 if self.download == 0 else share_ratio
seed_html = html.xpath('//table//tr/td[text()="Seed bonus"]/following-sibling::td//text()')
if seed_html:
self.seeding = StringUtils.str_int(seed_html[1])
self.seeding_size = StringUtils.num_filesize(seed_html[3])
seed_html = html.xpath('//table//tr/td[text()="Seed bonus"]/following-sibling::td//text()')
if seed_html:
self.seeding = StringUtils.str_int(seed_html[1])
self.seeding_size = StringUtils.num_filesize(seed_html[3])
user_level_html = html.xpath('//table//tr/td[text()="Class"]/following-sibling::td//text()')
if user_level_html:
self.user_level = user_level_html[0].strip()
user_level_html = html.xpath('//table//tr/td[text()="Class"]/following-sibling::td//text()')
if user_level_html:
self.user_level = user_level_html[0].strip()
join_at_html = html.xpath('//table//tr/td[contains(text(), "Join")]/following-sibling::td//text()')
if join_at_html:
join_at = (join_at_html[0].split("("))[0].strip()
self.join_at = StringUtils.unify_datetime_str(join_at)
join_at_html = html.xpath('//table//tr/td[contains(text(), "Join")]/following-sibling::td//text()')
if join_at_html:
join_at = (join_at_html[0].split("("))[0].strip()
self.join_at = StringUtils.unify_datetime_str(join_at)
bonus_html = html.xpath('//a[contains(@href, "shop.php")]')
if bonus_html:
self.bonus = StringUtils.str_float(bonus_html[0].xpath("string(.)").strip())
pass
bonus_html = html.xpath('//a[contains(@href, "shop.php")]')
if bonus_html:
self.bonus = StringUtils.str_float(bonus_html[0].xpath("string(.)").strip())
finally:
if html is not None:
del html
def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: Optional[bool] = False) -> Optional[str]:
"""
@@ -82,28 +87,32 @@ class FileListSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 6
seeders_col = 7
size_col = 6
seeders_col = 7
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table/tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table/tr[position()>1]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table/tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table/tr[position()>1]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding_info.extend(page_seeding_info)
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
# 是否存在下页数据
next_page = None
finally:
if html is not None:
del html
return next_page

View File

@@ -14,46 +14,49 @@ class GazelleSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
try:
tmps = html.xpath('//a[contains(@href, "user.php?id=")]')
if tmps:
user_id_match = re.search(r"user.php\?id=(\d+)", tmps[0].attrib['href'])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._torrent_seeding_page = f"torrents.php?type=seeding&userid={self.userid}"
self._user_detail_page = f"user.php?id={self.userid}"
self.username = tmps[0].text.strip()
tmps = html.xpath('//a[contains(@href, "user.php?id=")]')
if tmps:
user_id_match = re.search(r"user.php\?id=(\d+)", tmps[0].attrib['href'])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._torrent_seeding_page = f"torrents.php?type=seeding&userid={self.userid}"
self._user_detail_page = f"user.php?id={self.userid}"
self.username = tmps[0].text.strip()
tmps = html.xpath('//*[@id="header-uploaded-value"]/@data-value')
if tmps:
self.upload = StringUtils.num_filesize(tmps[0])
else:
tmps = html.xpath('//li[@id="stats_seeding"]/span/text()')
tmps = html.xpath('//*[@id="header-uploaded-value"]/@data-value')
if tmps:
self.upload = StringUtils.num_filesize(tmps[0])
else:
tmps = html.xpath('//li[@id="stats_seeding"]/span/text()')
if tmps:
self.upload = StringUtils.num_filesize(tmps[0])
tmps = html.xpath('//*[@id="header-downloaded-value"]/@data-value')
if tmps:
self.download = StringUtils.num_filesize(tmps[0])
else:
tmps = html.xpath('//li[@id="stats_leeching"]/span/text()')
tmps = html.xpath('//*[@id="header-downloaded-value"]/@data-value')
if tmps:
self.download = StringUtils.num_filesize(tmps[0])
else:
tmps = html.xpath('//li[@id="stats_leeching"]/span/text()')
if tmps:
self.download = StringUtils.num_filesize(tmps[0])
self.ratio = 0.0 if self.download <= 0.0 else round(self.upload / self.download, 3)
self.ratio = 0.0 if self.download <= 0.0 else round(self.upload / self.download, 3)
tmps = html.xpath('//a[contains(@href, "bonus.php")]/@data-tooltip')
if tmps:
bonus_match = re.search(r"([\d,.]+)", tmps[0])
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
else:
tmps = html.xpath('//a[contains(@href, "bonus.php")]')
tmps = html.xpath('//a[contains(@href, "bonus.php")]/@data-tooltip')
if tmps:
bonus_text = tmps[0].xpath("string(.)")
bonus_match = re.search(r"([\d,.]+)", bonus_text)
bonus_match = re.search(r"([\d,.]+)", tmps[0])
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
else:
tmps = html.xpath('//a[contains(@href, "bonus.php")]')
if tmps:
bonus_text = tmps[0].xpath("string(.)")
bonus_match = re.search(r"([\d,.]+)", bonus_text)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
finally:
if html is not None:
del html
def _parse_site_page(self, html_text: str):
pass
@@ -65,27 +68,31 @@ class GazelleSiteUserInfo(SiteParserBase):
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
# 用户等级
user_levels_text = html.xpath('//*[@id="class-value"]/@data-value')
if user_levels_text:
self.user_level = user_levels_text[0].strip()
else:
user_levels_text = html.xpath('//li[contains(text(), "用户等级")]/text()')
# 用户等级
user_levels_text = html.xpath('//*[@id="class-value"]/@data-value')
if user_levels_text:
self.user_level = user_levels_text[0].split(':')[1].strip()
self.user_level = user_levels_text[0].strip()
else:
user_levels_text = html.xpath('//li[contains(text(), "用户等级")]/text()')
if user_levels_text:
self.user_level = user_levels_text[0].split(':')[1].strip()
# 加入日期
join_at_text = html.xpath('//*[@id="join-date-value"]/@data-value')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].strip())
else:
join_at_text = html.xpath(
'//div[contains(@class, "box_userinfo_stats")]//li[contains(text(), "加入时间")]/span/text()')
# 加入日期
join_at_text = html.xpath('//*[@id="join-date-value"]/@data-value')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].strip())
else:
join_at_text = html.xpath(
'//div[contains(@class, "box_userinfo_stats")]//li[contains(text(), "加入时间")]/span/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].strip())
finally:
if html is not None:
del html
def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: Optional[bool] = False) -> Optional[str]:
"""
@@ -95,48 +102,52 @@ class GazelleSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 3
# 搜索size列
if html.xpath('//table[contains(@id, "torrent")]//tr[1]/td'):
size_col = len(html.xpath('//table[contains(@id, "torrent")]//tr[1]/td')) - 3
# 搜索seeders列
seeders_col = size_col + 2
size_col = 3
# 搜索size列
if html.xpath('//table[contains(@id, "torrent")]//tr[1]/td'):
size_col = len(html.xpath('//table[contains(@id, "torrent")]//tr[1]/td')) - 3
# 搜索seeders列
seeders_col = size_col + 2
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table[contains(@id, "torrent")]//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table[contains(@id, "torrent")]//tr[position()>1]/td[{seeders_col}]/text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table[contains(@id, "torrent")]//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table[contains(@id, "torrent")]//tr[position()>1]/td[{seeders_col}]/text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = int(seeding_seeders[i])
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = int(seeding_seeders[i])
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
if multi_page:
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
else:
if not self.seeding:
self.seeding = page_seeding
if not self.seeding_size:
self.seeding_size = page_seeding_size
if not self.seeding_info:
self.seeding_info = page_seeding_info
if multi_page:
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
else:
if not self.seeding:
self.seeding = page_seeding
if not self.seeding_size:
self.seeding_size = page_seeding_size
if not self.seeding_info:
self.seeding_info = page_seeding_info
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "Next") or contains(.//text(), "下一页")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "Next") or contains(.//text(), "下一页")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
finally:
if html is not None:
del html
return next_page

View File

@@ -14,67 +14,79 @@ class IptSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
tmps = html.xpath('//a[contains(@href, "/u/")]//text()')
tmps_id = html.xpath('//a[contains(@href, "/u/")]/@href')
if tmps:
self.username = str(tmps[-1])
if tmps_id:
user_id_match = re.search(r"/u/(\d+)", tmps_id[0])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._user_detail_page = f"user.php?u={self.userid}"
self._torrent_seeding_page = f"peers?u={self.userid}"
try:
tmps = html.xpath('//a[contains(@href, "/u/")]//text()')
tmps_id = html.xpath('//a[contains(@href, "/u/")]/@href')
if tmps:
self.username = str(tmps[-1])
if tmps_id:
user_id_match = re.search(r"/u/(\d+)", tmps_id[0])
if user_id_match and user_id_match.group().strip():
self.userid = user_id_match.group(1)
self._user_detail_page = f"user.php?u={self.userid}"
self._torrent_seeding_page = f"peers?u={self.userid}"
tmps = html.xpath('//div[@class = "stats"]/div/div')
if tmps:
self.upload = StringUtils.num_filesize(str(tmps[0].xpath('span/text()')[1]).strip())
self.download = StringUtils.num_filesize(str(tmps[0].xpath('span/text()')[2]).strip())
self.seeding = StringUtils.str_int(tmps[0].xpath('a')[2].xpath('text()')[0])
self.leeching = StringUtils.str_int(tmps[0].xpath('a')[2].xpath('text()')[1])
self.ratio = StringUtils.str_float(str(tmps[0].xpath('span/text()')[0]).strip().replace('-', '0'))
self.bonus = StringUtils.str_float(tmps[0].xpath('a')[3].xpath('text()')[0])
tmps = html.xpath('//div[@class = "stats"]/div/div')
if tmps:
self.upload = StringUtils.num_filesize(str(tmps[0].xpath('span/text()')[1]).strip())
self.download = StringUtils.num_filesize(str(tmps[0].xpath('span/text()')[2]).strip())
self.seeding = StringUtils.str_int(tmps[0].xpath('a')[2].xpath('text()')[0])
self.leeching = StringUtils.str_int(tmps[0].xpath('a')[2].xpath('text()')[1])
self.ratio = StringUtils.str_float(str(tmps[0].xpath('span/text()')[0]).strip().replace('-', '0'))
self.bonus = StringUtils.str_float(tmps[0].xpath('a')[3].xpath('text()')[0])
finally:
if html is not None:
del html
def _parse_site_page(self, html_text: str):
pass
def _parse_user_detail_info(self, html_text: str):
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
try:
if not StringUtils.is_valid_html_element(html):
return
user_levels_text = html.xpath('//tr/th[text()="Class"]/following-sibling::td[1]/text()')
if user_levels_text:
self.user_level = user_levels_text[0].strip()
user_levels_text = html.xpath('//tr/th[text()="Class"]/following-sibling::td[1]/text()')
if user_levels_text:
self.user_level = user_levels_text[0].strip()
# 加入日期
join_at_text = html.xpath('//tr/th[text()="Join date"]/following-sibling::td[1]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0])
# 加入日期
join_at_text = html.xpath('//tr/th[text()="Join date"]/following-sibling::td[1]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0])
finally:
if html is not None:
del html
def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: bool = False) -> Optional[str]:
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
# seeding start
seeding_end_pos = 3
if html.xpath('//tr/td[text() = "Leechers"]'):
seeding_end_pos = len(html.xpath('//tr/td[text() = "Leechers"]/../preceding-sibling::tr')) + 1
seeding_end_pos = seeding_end_pos - 3
try:
if not StringUtils.is_valid_html_element(html):
return None
# seeding start
seeding_end_pos = 3
if html.xpath('//tr/td[text() = "Leechers"]'):
seeding_end_pos = len(html.xpath('//tr/td[text() = "Leechers"]/../preceding-sibling::tr')) + 1
seeding_end_pos = seeding_end_pos - 3
page_seeding = 0
page_seeding_size = 0
seeding_torrents = html.xpath('//tr/td[text() = "Seeders"]/../following-sibling::tr/td[position()=6]/text()')
if seeding_torrents:
page_seeding = seeding_end_pos
for per_size in seeding_torrents[:seeding_end_pos]:
if '(' in per_size and ')' in per_size:
per_size = per_size.split('(')[-1]
per_size = per_size.split(')')[0]
page_seeding = 0
page_seeding_size = 0
seeding_torrents = html.xpath('//tr/td[text() = "Seeders"]/../following-sibling::tr/td[position()=6]/text()')
if seeding_torrents:
page_seeding = seeding_end_pos
for per_size in seeding_torrents[:seeding_end_pos]:
if '(' in per_size and ')' in per_size:
per_size = per_size.split('(')[-1]
per_size = per_size.split(')')[0]
page_seeding_size += StringUtils.num_filesize(per_size)
page_seeding_size += StringUtils.num_filesize(per_size)
self.seeding = page_seeding
self.seeding_size = page_seeding_size
self.seeding = page_seeding
self.seeding_size = page_seeding_size
finally:
if html is not None:
del html
def _parse_user_traffic_info(self, html_text: str):
pass

View File

@@ -23,12 +23,16 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo):
if not html_text:
return
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
total_row = html.xpath('//table[@class="table table-bordered"]//tr[td[1][normalize-space()="Total"]]')
if not total_row:
return
seeding_count = total_row[0].xpath('./td[2]/text()')
seeding_size = total_row[0].xpath('./td[3]/text()')
self.seeding = StringUtils.str_int(seeding_count[0]) if seeding_count else 0
self.seeding_size = StringUtils.num_filesize(seeding_size[0].strip()) if seeding_size else 0
try:
if not StringUtils.is_valid_html_element(html):
return
total_row = html.xpath('//table[@class="table table-bordered"]//tr[td[1][normalize-space()="Total"]]')
if not total_row:
return
seeding_count = total_row[0].xpath('./td[2]/text()')
seeding_size = total_row[0].xpath('./td[3]/text()')
self.seeding = StringUtils.str_int(seeding_count[0]) if seeding_count else 0
self.seeding_size = StringUtils.num_filesize(seeding_size[0].strip()) if seeding_size else 0
finally:
if html is not None:
del html

View File

@@ -17,21 +17,25 @@ class NexusHhanclubSiteUserInfo(NexusPhpSiteUserInfo):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
# 上传、下载、分享率
upload_match = re.search(r"[_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+[KMGTPI]*B)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[2]/div[4]/text()')[0])
download_match = re.search(r"[_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+[KMGTPI]*B)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[2]/div[5]/text()')[0])
ratio_match = re.search(r"分享率][:_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[1]/div[1]/div/text()')[0])
try:
# 上传、下载、分享率
upload_match = re.search(r"[_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+[KMGTPI]*B)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[2]/div[4]/text()')[0])
download_match = re.search(r"[_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+[KMGTPI]*B)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[2]/div[5]/text()')[0])
ratio_match = re.search(r"分享率][:_<>/a-zA-Z-=\"'\s#;]+([\d,.\s]+)",
html.xpath('//*[@id="user-info-panel"]/div[2]/div[1]/div[1]/div/text()')[0])
# 计算分享率
self.upload = StringUtils.num_filesize(upload_match.group(1).strip()) if upload_match else 0
self.download = StringUtils.num_filesize(download_match.group(1).strip()) if download_match else 0
# 优先使用页面上的分享率
calc_ratio = 0.0 if self.download <= 0.0 else round(self.upload / self.download, 3)
self.ratio = StringUtils.str_float(ratio_match.group(1)) if (
ratio_match and ratio_match.group(1).strip()) else calc_ratio
# 计算分享率
self.upload = StringUtils.num_filesize(upload_match.group(1).strip()) if upload_match else 0
self.download = StringUtils.num_filesize(download_match.group(1).strip()) if download_match else 0
# 优先使用页面上的分享率
calc_ratio = 0.0 if self.download <= 0.0 else round(self.upload / self.download, 3)
self.ratio = StringUtils.str_float(ratio_match.group(1)) if (
ratio_match and ratio_match.group(1).strip()) else calc_ratio
finally:
if html is not None:
del html
def _parse_user_detail_info(self, html_text: str):
"""
@@ -42,12 +46,16 @@ class NexusHhanclubSiteUserInfo(NexusPhpSiteUserInfo):
super()._parse_user_detail_info(html_text)
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
# 加入时间
join_at_text = html.xpath('//*[@id="mainContent"]/div/div[2]/div[4]/div[3]/span[2]/text()[1]')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0].strip())
try:
if not StringUtils.is_valid_html_element(html):
return
# 加入时间
join_at_text = html.xpath('//*[@id="mainContent"]/div/div[2]/div[4]/div[3]/span[2]/text()[1]')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0].strip())
finally:
if html is not None:
del html
def _get_user_level(self, html):
super()._get_user_level(html)

View File

@@ -34,21 +34,25 @@ class NexusPhpSiteUserInfo(SiteParserBase):
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
try:
if not StringUtils.is_valid_html_element(html):
return
message_labels = html.xpath('//a[@href="messages.php"]/..')
message_labels.extend(html.xpath('//a[contains(@href, "messages.php")]/..'))
if message_labels:
message_text = message_labels[0].xpath("string(.)")
message_labels = html.xpath('//a[@href="messages.php"]/..')
message_labels.extend(html.xpath('//a[contains(@href, "messages.php")]/..'))
if message_labels:
message_text = message_labels[0].xpath("string(.)")
logger.debug(f"{self._site_name} 消息原始信息 {message_text}")
message_unread_match = re.findall(r"[^Date](信息箱\s*|\((?![^)]*:)|你有\xa0)(\d+)", message_text)
logger.debug(f"{self._site_name} 消息原始信息 {message_text}")
message_unread_match = re.findall(r"[^Date](信息箱\s*|\((?![^)]*:)|你有\xa0)(\d+)", message_text)
if message_unread_match and len(message_unread_match[-1]) == 2:
self.message_unread = StringUtils.str_int(message_unread_match[-1][1])
elif message_text.isdigit():
self.message_unread = StringUtils.str_int(message_text)
if message_unread_match and len(message_unread_match[-1]) == 2:
self.message_unread = StringUtils.str_int(message_unread_match[-1][1])
elif message_text.isdigit():
self.message_unread = StringUtils.str_int(message_text)
finally:
if html is not None:
del html
def _parse_user_base_info(self, html_text: str):
"""
@@ -61,18 +65,23 @@ class NexusPhpSiteUserInfo(SiteParserBase):
self._parse_message_unread(html_text)
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
try:
if not StringUtils.is_valid_html_element(html):
return
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//b//text()')
if ret:
self.username = str(ret[0])
return
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//text()')
if ret:
self.username = str(ret[0])
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//b//text()')
if ret:
self.username = str(ret[0])
return
ret = html.xpath(f'//a[contains(@href, "userdetails") and contains(@href, "{self.userid}")]//text()')
if ret:
self.username = str(ret[0])
ret = html.xpath('//a[contains(@href, "userdetails")]//strong//text()')
finally:
if html is not None:
del html
ret = html.xpath('//a[contains(@href, "userdetails")]//strong//text()')
if ret:
self.username = str(ret[0])
return
@@ -98,28 +107,32 @@ class NexusPhpSiteUserInfo(SiteParserBase):
self.leeching = StringUtils.str_int(leeching_match.group(2)) if leeching_match and leeching_match.group(
2).strip() else 0
html = etree.HTML(html_text)
has_ucoin, self.bonus = self._parse_ucoin(html)
if has_ucoin:
return
tmps = html.xpath('//a[contains(@href,"mybonus")]/text()') if html else None
if tmps:
bonus_text = str(tmps[0]).strip()
bonus_match = re.search(r"([\d,.]+)", bonus_text)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
return
bonus_match = re.search(r"mybonus.[\[\]:<>/a-zA-Z_\-=\"'\s#;.(使用魔力值豆]+\s*([\d,.]+)[<()&\s]", html_text)
try:
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
has_ucoin, self.bonus = self._parse_ucoin(html)
if has_ucoin:
return
bonus_match = re.search(r"[魔力值|\]][\[\]:<>/a-zA-Z_\-=\"'\s#;]+\s*([\d,.]+|\"[\d,.]+\")[<>()&\s]",
html_text,
flags=re.S)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1).strip('"'))
except Exception as err:
logger.error(f"{self._site_name} 解析魔力值出错, 错误信息: {str(err)}")
tmps = html.xpath('//a[contains(@href,"mybonus")]/text()') if html else None
if tmps:
bonus_text = str(tmps[0]).strip()
bonus_match = re.search(r"([\d,.]+)", bonus_text)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
return
bonus_match = re.search(r"mybonus.[\[\]:<>/a-zA-Z_\-=\"'\s#;.(使用魔力值豆]+\s*([\d,.]+)[<()&\s]", html_text)
try:
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
return
bonus_match = re.search(r"[魔力值|\]][\[\]:<>/a-zA-Z_\-=\"'\s#;]+\s*([\d,.]+|\"[\d,.]+\")[<>()&\s]",
html_text,
flags=re.S)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1).strip('"'))
except Exception as err:
logger.error(f"{self._site_name} 解析魔力值出错, 错误信息: {str(err)}")
finally:
if html is not None:
del html
@staticmethod
def _parse_ucoin(html):
@@ -155,72 +168,76 @@ class NexusPhpSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(str(html_text).replace(r'\/', '/'))
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
# 首页存在扩展链接,使用扩展链接
seeding_url_text = html.xpath('//a[contains(@href,"torrents.php") '
'and contains(@href,"seeding")]/@href')
if multi_page is False and seeding_url_text and seeding_url_text[0].strip():
self._torrent_seeding_page = seeding_url_text[0].strip()
return self._torrent_seeding_page
# 首页存在扩展链接,使用扩展链接
seeding_url_text = html.xpath('//a[contains(@href,"torrents.php") '
'and contains(@href,"seeding")]/@href')
if multi_page is False and seeding_url_text and seeding_url_text[0].strip():
self._torrent_seeding_page = seeding_url_text[0].strip()
return self._torrent_seeding_page
size_col = 3
seeders_col = 4
# 搜索size列
size_col_xpath = '//tr[position()=1]/' \
'td[(img[@class="size"] and img[@alt="size"])' \
' or (text() = "大小")' \
' or (a/img[@class="size" and @alt="size"])]'
if html.xpath(size_col_xpath):
size_col = len(html.xpath(f'{size_col_xpath}/preceding-sibling::td')) + 1
# 搜索seeders列
seeders_col_xpath = '//tr[position()=1]/' \
'td[(img[@class="seeders"] and img[@alt="seeders"])' \
' or (text() = "在做种")' \
' or (a/img[@class="seeders" and @alt="seeders"])]'
if html.xpath(seeders_col_xpath):
seeders_col = len(html.xpath(f'{seeders_col_xpath}/preceding-sibling::td')) + 1
size_col = 3
seeders_col = 4
# 搜索size列
size_col_xpath = '//tr[position()=1]/' \
'td[(img[@class="size"] and img[@alt="size"])' \
' or (text() = "大小")' \
' or (a/img[@class="size" and @alt="size"])]'
if html.xpath(size_col_xpath):
size_col = len(html.xpath(f'{size_col_xpath}/preceding-sibling::td')) + 1
# 搜索seeders列
seeders_col_xpath = '//tr[position()=1]/' \
'td[(img[@class="seeders"] and img[@alt="seeders"])' \
' or (text() = "在做种")' \
' or (a/img[@class="seeders" and @alt="seeders"])]'
if html.xpath(seeders_col_xpath):
seeders_col = len(html.xpath(f'{seeders_col_xpath}/preceding-sibling::td')) + 1
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
# 如果 table class="torrents"则增加table[@class="torrents"]
table_class = '//table[@class="torrents"]' if html.xpath('//table[@class="torrents"]') else ''
seeding_sizes = html.xpath(f'{table_class}//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'{table_class}//tr[position()>1]/td[{seeders_col}]/b/a/text()')
if not seeding_seeders:
seeding_seeders = html.xpath(f'{table_class}//tr[position()>1]/td[{seeders_col}]//text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
# 如果 table class="torrents"则增加table[@class="torrents"]
table_class = '//table[@class="torrents"]' if html.xpath('//table[@class="torrents"]') else ''
seeding_sizes = html.xpath(f'{table_class}//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'{table_class}//tr[position()>1]/td[{seeders_col}]/b/a/text()')
if not seeding_seeders:
seeding_seeders = html.xpath(f'{table_class}//tr[position()>1]/td[{seeders_col}]//text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
next_page_text = html.xpath(
'//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁") or contains(.//text(), ">")]/@href')
# 防止识别到详情页
while next_page_text:
next_page = next_page_text.pop().strip()
if not next_page.startswith('details.php'):
break
# 是否存在下页数据
next_page = None
next_page_text = html.xpath(
'//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁") or contains(.//text(), ">")]/@href')
# fix up page url
if next_page:
if self.userid not in next_page:
next_page = f'{next_page}&userid={self.userid}&type=seeding'
# 防止识别到详情页
while next_page_text:
next_page = next_page_text.pop().strip()
if not next_page.startswith('details.php'):
break
next_page = None
# fix up page url
if next_page:
if self.userid not in next_page:
next_page = f'{next_page}&userid={self.userid}&type=seeding'
finally:
if html is not None:
del html
return next_page
@@ -231,57 +248,61 @@ class NexusPhpSiteUserInfo(SiteParserBase):
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
try:
if not StringUtils.is_valid_html_element(html):
return
self._get_user_level(html)
self._get_user_level(html)
self._fixup_traffic_info(html)
self._fixup_traffic_info(html)
# 加入日期
join_at_text = html.xpath(
'//tr/td[text()="加入日期" or text()="注册日期" or *[text()="加入日期"]]/following-sibling::td[1]//text()'
'|//div/b[text()="加入日期"]/../text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0].strip())
# 加入日期
join_at_text = html.xpath(
'//tr/td[text()="加入日期" or text()="注册日期" or *[text()="加入日期"]]/following-sibling::td[1]//text()'
'|//div/b[text()="加入日期"]/../text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(join_at_text[0].split(' (')[0].strip())
# 做种体积 & 做种数
# seeding 页面获取不到的话,此处再获取一次
seeding_sizes = html.xpath('//tr/td[text()="当前上传"]/following-sibling::td[1]//'
'table[tr[1][td[4 and text()="尺寸"]]]//tr[position()>1]/td[4]')
seeding_seeders = html.xpath('//tr/td[text()="当前上传"]/following-sibling::td[1]//'
'table[tr[1][td[5 and text()="做种者"]]]//tr[position()>1]/td[5]//text()')
tmp_seeding = len(seeding_sizes)
tmp_seeding_size = 0
tmp_seeding_info = []
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
# 做种体积 & 做种数
# seeding 页面获取不到的话,此处再获取一次
seeding_sizes = html.xpath('//tr/td[text()="当前上传"]/following-sibling::td[1]//'
'table[tr[1][td[4 and text()="尺寸"]]]//tr[position()>1]/td[4]')
seeding_seeders = html.xpath('//tr/td[text()="当前上传"]/following-sibling::td[1]//'
'table[tr[1][td[5 and text()="做种者"]]]//tr[position()>1]/td[5]//text()')
tmp_seeding = len(seeding_sizes)
tmp_seeding_size = 0
tmp_seeding_info = []
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
tmp_seeding_size += size
tmp_seeding_info.append([seeders, size])
tmp_seeding_size += size
tmp_seeding_info.append([seeders, size])
if not self.seeding_size:
self.seeding_size = tmp_seeding_size
if not self.seeding:
self.seeding = tmp_seeding
if not self.seeding_info:
self.seeding_info = tmp_seeding_info
if not self.seeding_size:
self.seeding_size = tmp_seeding_size
if not self.seeding:
self.seeding = tmp_seeding
if not self.seeding_info:
self.seeding_info = tmp_seeding_info
seeding_sizes = html.xpath('//tr/td[text()="做种统计"]/following-sibling::td[1]//text()')
if seeding_sizes:
seeding_match = re.search(r"总做种数:\s+(\d+)", seeding_sizes[0], re.IGNORECASE)
seeding_size_match = re.search(r"总做种体积:\s+([\d,.\s]+[KMGTPI]*B)", seeding_sizes[0], re.IGNORECASE)
tmp_seeding = StringUtils.str_int(seeding_match.group(1)) if (
seeding_match and seeding_match.group(1)) else 0
tmp_seeding_size = StringUtils.num_filesize(
seeding_size_match.group(1).strip()) if seeding_size_match else 0
if not self.seeding_size:
self.seeding_size = tmp_seeding_size
if not self.seeding:
self.seeding = tmp_seeding
seeding_sizes = html.xpath('//tr/td[text()="做种统计"]/following-sibling::td[1]//text()')
if seeding_sizes:
seeding_match = re.search(r"总做种数:\s+(\d+)", seeding_sizes[0], re.IGNORECASE)
seeding_size_match = re.search(r"总做种体积:\s+([\d,.\s]+[KMGTPI]*B)", seeding_sizes[0], re.IGNORECASE)
tmp_seeding = StringUtils.str_int(seeding_match.group(1)) if (
seeding_match and seeding_match.group(1)) else 0
tmp_seeding_size = StringUtils.num_filesize(
seeding_size_match.group(1).strip()) if seeding_size_match else 0
if not self.seeding_size:
self.seeding_size = tmp_seeding_size
if not self.seeding:
self.seeding = tmp_seeding
self._fixup_torrent_seeding_page(html)
self._fixup_torrent_seeding_page(html)
finally:
if html is not None:
del html
def _fixup_torrent_seeding_page(self, html):
"""
@@ -348,43 +369,51 @@ class NexusPhpSiteUserInfo(SiteParserBase):
def _parse_message_unread_links(self, html_text: str, msg_links: list) -> Optional[str]:
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
message_links = html.xpath('//tr[not(./td/img[@alt="Read"])]/td/a[contains(@href, "viewmessage")]/@href')
msg_links.extend(message_links)
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
message_links = html.xpath('//tr[not(./td/img[@alt="Read"])]/td/a[contains(@href, "viewmessage")]/@href')
msg_links.extend(message_links)
# 是否存在下页数据
next_page = None
next_page_text = html.xpath('//a[contains(.//text(), "下一页") or contains(.//text(), "下一頁")]/@href')
if next_page_text:
next_page = next_page_text[-1].strip()
finally:
if html is not None:
del html
return next_page
def _parse_message_content(self, html_text):
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None, None, None
# 标题
message_head_text = None
message_head = html.xpath('//h1/text()'
'|//div[@class="layui-card-header"]/span[1]/text()')
if message_head:
message_head_text = message_head[-1].strip()
try:
if not StringUtils.is_valid_html_element(html):
return None, None, None
# 标题
message_head_text = None
message_head = html.xpath('//h1/text()'
'|//div[@class="layui-card-header"]/span[1]/text()')
if message_head:
message_head_text = message_head[-1].strip()
# 消息时间
message_date_text = None
message_date = html.xpath('//h1/following-sibling::table[.//tr/td[@class="colhead"]]//tr[2]/td[2]'
'|//div[@class="layui-card-header"]/span[2]/span[2]')
if message_date:
message_date_text = message_date[0].xpath("string(.)").strip()
# 消息时间
message_date_text = None
message_date = html.xpath('//h1/following-sibling::table[.//tr/td[@class="colhead"]]//tr[2]/td[2]'
'|//div[@class="layui-card-header"]/span[2]/span[2]')
if message_date:
message_date_text = message_date[0].xpath("string(.)").strip()
# 消息内容
message_content_text = None
message_content = html.xpath('//h1/following-sibling::table[.//tr/td[@class="colhead"]]//tr[3]/td'
'|//div[contains(@class,"layui-card-body")]')
if message_content:
message_content_text = message_content[0].xpath("string(.)").strip()
# 消息内容
message_content_text = None
message_content = html.xpath('//h1/following-sibling::table[.//tr/td[@class="colhead"]]//tr[3]/td'
'|//div[contains(@class,"layui-card-body")]')
if message_content:
message_content_text = message_content[0].xpath("string(.)").strip()
finally:
if html is not None:
del html
return message_head_text, message_date_text, message_content_text

View File

@@ -114,48 +114,56 @@ class NexusRabbitSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
"""只有奶糖余额才需要在 base 中获取,其它均可以在详情页拿到"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
bonus = html.xpath(
'//div[contains(text(), "奶糖余额")]/following-sibling::div[1]/text()'
)
if bonus:
self.bonus = StringUtils.str_float(bonus[0].strip())
try:
if not StringUtils.is_valid_html_element(html):
return
bonus = html.xpath(
'//div[contains(text(), "奶糖余额")]/following-sibling::div[1]/text()'
)
if bonus:
self.bonus = StringUtils.str_float(bonus[0].strip())
finally:
if html is not None:
del html
def _parse_user_detail_info(self, html_text: str):
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return
# 缩小一下查找范围,所有的信息都在这个 div 里
user_info = html.xpath('//div[contains(@class, "layui-hares-user-info-right")]')
if not user_info:
return
user_info = user_info[0]
# 用户名
if username := user_info.xpath(
'.//span[contains(text(), "用户名")]/a/span/text()'
):
self.username = username[0].strip()
# 等级
if user_level := user_info.xpath('.//span[contains(text(), "等级")]/b/text()'):
self.user_level = user_level[0].strip()
# 加入日期
if join_date := user_info.xpath('.//span[contains(text(), "注册日期")]/text()'):
join_date = join_date[0].strip().split("\r")[0].removeprefix("注册日期:")
self.join_at = StringUtils.unify_datetime_str(join_date)
# 上传量
if upload := user_info.xpath('.//span[contains(text(), "上传量")]/text()'):
self.upload = StringUtils.num_filesize(
upload[0].strip().removeprefix("上传量:")
)
# 下载量
if download := user_info.xpath('.//span[contains(text(), "下载量")]/text()'):
self.download = StringUtils.num_filesize(
download[0].strip().removeprefix("下载量:")
)
# 分享率
if ratio := user_info.xpath('.//span[contains(text(), "分享率")]/em/text()'):
self.ratio = StringUtils.str_float(ratio[0].strip())
try:
if not StringUtils.is_valid_html_element(html):
return
# 缩小一下查找范围,所有的信息都在这个 div 里
user_info = html.xpath('//div[contains(@class, "layui-hares-user-info-right")]')
if not user_info:
return
user_info = user_info[0]
# 用户名
if username := user_info.xpath(
'.//span[contains(text(), "用户名")]/a/span/text()'
):
self.username = username[0].strip()
# 等级
if user_level := user_info.xpath('.//span[contains(text(), "等级")]/b/text()'):
self.user_level = user_level[0].strip()
# 加入日期
if join_date := user_info.xpath('.//span[contains(text(), "注册日期")]/text()'):
join_date = join_date[0].strip().split("\r")[0].removeprefix("注册日期:")
self.join_at = StringUtils.unify_datetime_str(join_date)
# 上传量
if upload := user_info.xpath('.//span[contains(text(), "上传量")]/text()'):
self.upload = StringUtils.num_filesize(
upload[0].strip().removeprefix("上传量:")
)
# 下载量
if download := user_info.xpath('.//span[contains(text(), "下载量")]/text()'):
self.download = StringUtils.num_filesize(
download[0].strip().removeprefix("下载量:")
)
# 分享率
if ratio := user_info.xpath('.//span[contains(text(), "分享率")]/em/text()'):
self.ratio = StringUtils.str_float(ratio[0].strip())
finally:
if html is not None:
del html
def _parse_message_content(self, html_text):
"""

View File

@@ -24,9 +24,13 @@ class SmallHorseSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
ret = html.xpath('//a[contains(@href, "user.php")]//text()')
if ret:
self.username = str(ret[0])
try:
ret = html.xpath('//a[contains(@href, "user.php")]//text()')
if ret:
self.username = str(ret[0])
finally:
if html is not None:
del html
def _parse_user_traffic_info(self, html_text: str):
"""
@@ -36,21 +40,25 @@ class SmallHorseSiteUserInfo(SiteParserBase):
"""
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
tmps = html.xpath('//ul[@class = "stats nobullet"]')
if tmps:
if tmps[1].xpath("li") and tmps[1].xpath("li")[0].xpath("span//text()"):
self.join_at = StringUtils.unify_datetime_str(tmps[1].xpath("li")[0].xpath("span//text()")[0])
self.upload = StringUtils.num_filesize(str(tmps[1].xpath("li")[2].xpath("text()")[0]).split(":")[1].strip())
self.download = StringUtils.num_filesize(
str(tmps[1].xpath("li")[3].xpath("text()")[0]).split(":")[1].strip())
if tmps[1].xpath("li")[4].xpath("span//text()"):
self.ratio = StringUtils.str_float(str(tmps[1].xpath("li")[4].xpath("span//text()")[0]).replace('', '0'))
else:
self.ratio = StringUtils.str_float(str(tmps[1].xpath("li")[5].xpath("text()")[0]).split(":")[1])
self.bonus = StringUtils.str_float(str(tmps[1].xpath("li")[5].xpath("text()")[0]).split(":")[1])
self.user_level = str(tmps[3].xpath("li")[0].xpath("text()")[0]).split(":")[1].strip()
self.leeching = StringUtils.str_int(
(tmps[4].xpath("li")[6].xpath("text()")[0]).split(":")[1].replace("[", ""))
try:
tmps = html.xpath('//ul[@class = "stats nobullet"]')
if tmps:
if tmps[1].xpath("li") and tmps[1].xpath("li")[0].xpath("span//text()"):
self.join_at = StringUtils.unify_datetime_str(tmps[1].xpath("li")[0].xpath("span//text()")[0])
self.upload = StringUtils.num_filesize(str(tmps[1].xpath("li")[2].xpath("text()")[0]).split(":")[1].strip())
self.download = StringUtils.num_filesize(
str(tmps[1].xpath("li")[3].xpath("text()")[0]).split(":")[1].strip())
if tmps[1].xpath("li")[4].xpath("span//text()"):
self.ratio = StringUtils.str_float(str(tmps[1].xpath("li")[4].xpath("span//text()")[0]).replace('', '0'))
else:
self.ratio = StringUtils.str_float(str(tmps[1].xpath("li")[5].xpath("text()")[0]).split(":")[1])
self.bonus = StringUtils.str_float(str(tmps[1].xpath("li")[5].xpath("text()")[0]).split(":")[1])
self.user_level = str(tmps[3].xpath("li")[0].xpath("text()")[0]).split(":")[1].strip()
self.leeching = StringUtils.str_int(
(tmps[4].xpath("li")[6].xpath("text()")[0]).split(":")[1].replace("[", ""))
finally:
if html is not None:
del html
def _parse_user_detail_info(self, html_text: str):
pass
@@ -63,39 +71,42 @@ class SmallHorseSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 6
seeders_col = 8
size_col = 6
seeders_col = 8
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table[@id="torrent_table"]//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table[@id="torrent_table"]//tr[position()>1]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//table[@id="torrent_table"]//tr[position()>1]/td[{size_col}]')
seeding_seeders = html.xpath(f'//table[@id="torrent_table"]//tr[position()>1]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
next_pages = html.xpath('//ul[@class="pagination"]/li[contains(@class,"active")]/following-sibling::li')
if next_pages and len(next_pages) > 1:
page_num = next_pages[0].xpath("string(.)").strip()
if page_num.isdigit():
next_page = f"{self._torrent_seeding_page}&page={page_num}"
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
next_pages = html.xpath('//ul[@class="pagination"]/li[contains(@class,"active")]/following-sibling::li')
if next_pages and len(next_pages) > 1:
page_num = next_pages[0].xpath("string(.)").strip()
if page_num.isdigit():
next_page = f"{self._torrent_seeding_page}&page={page_num}"
finally:
if html is not None:
del html
return next_page
def _parse_message_unread_links(self, html_text: str, msg_links: list) -> Optional[str]:

View File

@@ -32,29 +32,33 @@ class TorrentLeechSiteUserInfo(SiteParserBase):
"""
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
upload_html = html.xpath('//div[contains(@class,"profile-uploaded")]//span/text()')
if upload_html:
self.upload = StringUtils.num_filesize(upload_html[0])
download_html = html.xpath('//div[contains(@class,"profile-downloaded")]//span/text()')
if download_html:
self.download = StringUtils.num_filesize(download_html[0])
ratio_html = html.xpath('//div[contains(@class,"profile-ratio")]//span/text()')
if ratio_html:
self.ratio = StringUtils.str_float(ratio_html[0].replace('', '0'))
try:
upload_html = html.xpath('//div[contains(@class,"profile-uploaded")]//span/text()')
if upload_html:
self.upload = StringUtils.num_filesize(upload_html[0])
download_html = html.xpath('//div[contains(@class,"profile-downloaded")]//span/text()')
if download_html:
self.download = StringUtils.num_filesize(download_html[0])
ratio_html = html.xpath('//div[contains(@class,"profile-ratio")]//span/text()')
if ratio_html:
self.ratio = StringUtils.str_float(ratio_html[0].replace('', '0'))
user_level_html = html.xpath('//table[contains(@class, "profileViewTable")]'
'//tr/td[text()="Class"]/following-sibling::td/text()')
if user_level_html:
self.user_level = user_level_html[0].strip()
user_level_html = html.xpath('//table[contains(@class, "profileViewTable")]'
'//tr/td[text()="Class"]/following-sibling::td/text()')
if user_level_html:
self.user_level = user_level_html[0].strip()
join_at_html = html.xpath('//table[contains(@class, "profileViewTable")]'
'//tr/td[text()="Registration date"]/following-sibling::td/text()')
if join_at_html:
self.join_at = StringUtils.unify_datetime_str(join_at_html[0].strip())
join_at_html = html.xpath('//table[contains(@class, "profileViewTable")]'
'//tr/td[text()="Registration date"]/following-sibling::td/text()')
if join_at_html:
self.join_at = StringUtils.unify_datetime_str(join_at_html[0].strip())
bonus_html = html.xpath('//span[contains(@class, "total-TL-points")]/text()')
if bonus_html:
self.bonus = StringUtils.str_float(bonus_html[0].strip())
bonus_html = html.xpath('//span[contains(@class, "total-TL-points")]/text()')
if bonus_html:
self.bonus = StringUtils.str_float(bonus_html[0].strip())
finally:
if html is not None:
del html
def _parse_user_detail_info(self, html_text: str):
pass
@@ -67,33 +71,37 @@ class TorrentLeechSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 2
seeders_col = 7
size_col = 2
seeders_col = 7
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tbody/tr/td[{size_col}]')
seeding_seeders = html.xpath(f'//tbody/tr/td[{seeders_col}]/text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tbody/tr/td[{size_col}]')
seeding_seeders = html.xpath(f'//tbody/tr/td[{seeders_col}]/text()')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i])
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
# 是否存在下页数据
next_page = None
finally:
if html is not None:
del html
return next_page

View File

@@ -14,21 +14,24 @@ class Unit3dSiteUserInfo(SiteParserBase):
def _parse_user_base_info(self, html_text: str):
html_text = self._prepare_html_text(html_text)
html = etree.HTML(html_text)
try:
tmps = html.xpath('//a[contains(@href, "/users/") and contains(@href, "settings")]/@href')
if tmps:
user_name_match = re.search(r"/users/(.+)/settings", tmps[0])
if user_name_match and user_name_match.group().strip():
self.username = user_name_match.group(1)
self._torrent_seeding_page = f"/users/{self.username}/active?perPage=100&client=&seeding=include"
self._user_detail_page = f"/users/{self.username}"
tmps = html.xpath('//a[contains(@href, "/users/") and contains(@href, "settings")]/@href')
if tmps:
user_name_match = re.search(r"/users/(.+)/settings", tmps[0])
if user_name_match and user_name_match.group().strip():
self.username = user_name_match.group(1)
self._torrent_seeding_page = f"/users/{self.username}/active?perPage=100&client=&seeding=include"
self._user_detail_page = f"/users/{self.username}"
tmps = html.xpath('//a[contains(@href, "bonus/earnings")]')
if tmps:
bonus_text = tmps[0].xpath("string(.)")
bonus_match = re.search(r"([\d,.]+)", bonus_text)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
tmps = html.xpath('//a[contains(@href, "bonus/earnings")]')
if tmps:
bonus_text = tmps[0].xpath("string(.)")
bonus_match = re.search(r"([\d,.]+)", bonus_text)
if bonus_match and bonus_match.group(1).strip():
self.bonus = StringUtils.str_float(bonus_match.group(1))
finally:
if html is not None:
del html
def _parse_site_page(self, html_text: str):
pass
@@ -40,21 +43,25 @@ class Unit3dSiteUserInfo(SiteParserBase):
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
# 用户等级
user_levels_text = html.xpath('//div[contains(@class, "content")]//span[contains(@class, "badge-user")]/text()')
if user_levels_text:
self.user_level = user_levels_text[0].strip()
# 用户等级
user_levels_text = html.xpath('//div[contains(@class, "content")]//span[contains(@class, "badge-user")]/text()')
if user_levels_text:
self.user_level = user_levels_text[0].strip()
# 加入日期
join_at_text = html.xpath('//div[contains(@class, "content")]//h4[contains(text(), "注册日期") '
'or contains(text(), "註冊日期") '
'or contains(text(), "Registration date")]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(
join_at_text[0].replace('注册日期', '').replace('註冊日期', '').replace('Registration date', ''))
# 加入日期
join_at_text = html.xpath('//div[contains(@class, "content")]//h4[contains(text(), "注册日期") '
'or contains(text(), "註冊日期") '
'or contains(text(), "Registration date")]/text()')
if join_at_text:
self.join_at = StringUtils.unify_datetime_str(
join_at_text[0].replace('注册日期', '').replace('註冊日期', '').replace('Registration date', ''))
finally:
if html is not None:
del html
def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: Optional[bool] = False) -> Optional[str]:
"""
@@ -64,44 +71,48 @@ class Unit3dSiteUserInfo(SiteParserBase):
:return: 下页地址
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return None
try:
if not StringUtils.is_valid_html_element(html):
return None
size_col = 9
seeders_col = 2
# 搜索size列
if html.xpath('//thead//th[contains(@class,"size")]'):
size_col = len(html.xpath('//thead//th[contains(@class,"size")][1]/preceding-sibling::th')) + 1
# 搜索seeders列
if html.xpath('//thead//th[contains(@class,"seeders")]'):
seeders_col = len(html.xpath('//thead//th[contains(@class,"seeders")]/preceding-sibling::th')) + 1
size_col = 9
seeders_col = 2
# 搜索size列
if html.xpath('//thead//th[contains(@class,"size")]'):
size_col = len(html.xpath('//thead//th[contains(@class,"size")][1]/preceding-sibling::th')) + 1
# 搜索seeders列
if html.xpath('//thead//th[contains(@class,"seeders")]'):
seeders_col = len(html.xpath('//thead//th[contains(@class,"seeders")]/preceding-sibling::th')) + 1
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tr[position()]/td[{size_col}]')
seeding_seeders = html.xpath(f'//tr[position()]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
page_seeding = 0
page_seeding_size = 0
page_seeding_info = []
seeding_sizes = html.xpath(f'//tr[position()]/td[{size_col}]')
seeding_seeders = html.xpath(f'//tr[position()]/td[{seeders_col}]')
if seeding_sizes and seeding_seeders:
page_seeding = len(seeding_sizes)
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
for i in range(0, len(seeding_sizes)):
size = StringUtils.num_filesize(seeding_sizes[i].xpath("string(.)").strip())
seeders = StringUtils.str_int(seeding_seeders[i].xpath("string(.)").strip())
page_seeding_size += size
page_seeding_info.append([seeders, size])
page_seeding_size += size
page_seeding_info.append([seeders, size])
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
self.seeding += page_seeding
self.seeding_size += page_seeding_size
self.seeding_info.extend(page_seeding_info)
# 是否存在下页数据
next_page = None
next_pages = html.xpath('//ul[@class="pagination"]/li[contains(@class,"active")]/following-sibling::li')
if next_pages and len(next_pages) > 1:
page_num = next_pages[0].xpath("string(.)").strip()
if page_num.isdigit():
next_page = f"{self._torrent_seeding_page}&page={page_num}"
# 是否存在下页数据
next_page = None
next_pages = html.xpath('//ul[@class="pagination"]/li[contains(@class,"active")]/following-sibling::li')
if next_pages and len(next_pages) > 1:
page_num = next_pages[0].xpath("string(.)").strip()
if page_num.isdigit():
next_page = f"{self._torrent_seeding_page}&page={page_num}"
finally:
if html is not None:
del html
return next_page

View File

@@ -0,0 +1,704 @@
import datetime
import re
import traceback
from typing import Any, Optional
from typing import List
from urllib.parse import quote, urlencode, urlparse, parse_qs
from jinja2 import Template
from pyquery import PyQuery
from app.core.config import settings
from app.log import logger
from app.schemas.types import MediaType
from app.utils.http import RequestUtils
from app.utils.string import StringUtils
class SiteSpider:
"""
站点爬虫
"""
@property
def __class__(self):
    """
    Report ``object`` as the class of every instance.

    NOTE(review): presumably part of the same introspection hardening as the
    ``__dict__`` and ``__dir__`` overrides that follow - confirm the intent.
    """
    return object
@property
def __dict__(self):
    """
    Expose an empty mapping whenever ``__dict__`` is read from an instance.

    NOTE(review): presumably meant to keep attributes such as the cookie out of
    ``vars()``-style dumps - confirm the intent.
    """
    return {}
@property
def __dir__(self):
    """
    Refuse attribute listing: reading ``__dir__`` raises ``AttributeError``.

    :raises AttributeError: always
    """
    raise AttributeError(f"Cannot read protected attribute!")
def __init__(self,
             indexer: dict,
             keyword: Optional[str] = None,
             mtype: MediaType = None,
             cat: Optional[str] = None,
             page: Optional[int] = 0,
             referer: Optional[str] = None):
    """
    Store the query parameters and the indexer configuration.

    :param indexer: indexer configuration dict
    :param keyword: search keyword; a list means a batch search
    :param mtype: media type
    :param cat: search category
    :param page: page number
    :param referer: Referer header value
    """
    # Per-request state is created first so it exists even for an empty indexer.
    self.is_error = False
    self.torrents_info = {}
    self.torrents_info_array = []
    # get_torrents() tests these two before anything else; defining them up
    # front lets it return [] instead of raising AttributeError.
    self.search = None
    self.domain = None
    if not indexer:
        return
    self.keyword = keyword
    self.cat = cat
    self.mtype = mtype
    self.indexerid = indexer.get('id')
    self.indexername = indexer.get('name')
    self.search = indexer.get('search')
    self.batch = indexer.get('batch')
    self.browse = indexer.get('browse')
    self.category = indexer.get('category')
    # A missing or null "torrents" section must not crash construction.
    torrents = indexer.get('torrents') or {}
    self.list = torrents.get('list', {})
    self.fields = torrents.get('fields')
    if not keyword and self.browse:
        # Browsing (no keyword) may use its own list/field selectors.
        self.list = self.browse.get('list') or self.list
        self.fields = self.browse.get('fields') or self.fields
    self.domain = indexer.get('domain')
    self.result_num = int(indexer.get('result_num') or 100)
    self._timeout = int(indexer.get('timeout') or 15)
    self.page = page
    # URLs are built by plain concatenation, so the domain must end with "/".
    if self.domain and not str(self.domain).endswith("/"):
        self.domain = self.domain + "/"
    self.ua = indexer.get('ua') or settings.USER_AGENT
    use_proxy = indexer.get('proxy')
    self.proxies = settings.PROXY if use_proxy else None
    self.proxy_server = settings.PROXY_SERVER if use_proxy else None
    self.cookie = indexer.get('cookie')
    self.referer = referer
def get_torrents(self) -> List[dict]:
    """
    Build the search or browse URL, request it and parse the response.

    :return: the result of ``self.parse`` on the decoded page, or an empty
             list when the indexer has no search config or no domain
    """
    if not self.search or not self.domain:
        return []

    # Relative path of the torrent search page
    paths = self.search.get('paths', [])
    torrentspath = ""
    if len(paths) == 1:
        torrentspath = paths[0].get('path', '')
    else:
        for path in paths:
            if path.get("type") == "all" and not self.mtype:
                torrentspath = path.get('path')
                break
            elif path.get("type") == "movie" and self.mtype == MediaType.MOVIE:
                torrentspath = path.get('path')
                break
            elif path.get("type") == "tv" and self.mtype == MediaType.TV:
                torrentspath = path.get('path')
                break

    if self.keyword:
        # Keyword search
        if isinstance(self.keyword, list):
            # Batch query
            if self.batch:
                delimiter = self.batch.get('delimiter') or ' '
                space_replace = self.batch.get('space_replace') or ' '
                search_word = delimiter.join(
                    [str(k).replace(' ', space_replace) for k in self.keyword]
                )
            else:
                # str() so that non-string keywords cannot break the join
                search_word = " ".join(str(k) for k in self.keyword)
            # Search mode: OR
            search_mode = "1"
        else:
            # Single query
            search_word = self.keyword
            # Search mode: AND
            search_mode = "0"

        # Search URL
        indexer_params = self.search.get("params", {}).copy()
        if indexer_params:
            search_area = indexer_params.get('search_area')
            # Only a single string keyword can be an IMDb id; a batch (list)
            # keyword has no startswith() and must not be probed with it.
            keyword_is_imdbid = (isinstance(self.keyword, str)
                                 and self.keyword.startswith('tt'))
            # A non-zero search_area means the site supports IMDb-id search;
            # drop it when the keyword is not an IMDb id.
            if search_area and not keyword_is_imdbid:
                indexer_params.pop('search_area')
            # Template variables
            inputs_dict = {
                "keyword": search_word
            }
            # Query parameters; search titles by default
            params = {
                "search_mode": search_mode,
                "search_area": 0,
                "page": self.page or 0,
                "notnewword": 1
            }
            # Extra parameters from the indexer config
            for key, value in indexer_params.items():
                params[f"{key}"] = str(value).format(**inputs_dict)
            # Category filter
            if self.category:
                if self.mtype == MediaType.TV:
                    cats = self.category.get("tv") or []
                elif self.mtype == MediaType.MOVIE:
                    cats = self.category.get("movie") or []
                else:
                    cats = (self.category.get("movie") or []) + (self.category.get("tv") or [])
                for cat in cats:
                    if self.cat and str(cat.get("id")) not in self.cat:
                        continue
                    field = self.category.get("field")
                    if field:
                        value = params.get(field, "")
                        delimiter = self.category.get("delimiter", ' ')
                        # Formatting instead of "+" keeps numeric ids/values
                        # from raising TypeError.
                        params[f"{field}"] = f"{value}{delimiter}{cat.get('id')}"
                    else:
                        params[f"cat{cat.get('id')}"] = 1
            searchurl = self.domain + torrentspath + "?" + urlencode(params)
        else:
            # Template variables
            inputs_dict = {
                "keyword": quote(search_word),
                "page": self.page or 0
            }
            # No extra parameters: the path itself is the template
            searchurl = self.domain + str(torrentspath).format(**inputs_dict)
    else:
        # List browsing
        inputs_dict = {
            "page": self.page or 0,
            "keyword": ""
        }
        if self.browse:
            # The indexer has a dedicated browse path
            torrentspath = self.browse.get("path")
            if self.browse.get("start"):
                start_page = int(self.browse.get("start")) + int(self.page or 0)
                inputs_dict.update({
                    "page": start_page
                })
        elif self.page:
            torrentspath = torrentspath + f"?page={self.page}"
        # Browse URL
        searchurl = self.domain + str(torrentspath).format(**inputs_dict)

    logger.info(f"开始请求:{searchurl}")

    # HTTP request
    ret = RequestUtils(
        ua=self.ua,
        cookies=self.cookie,
        timeout=self._timeout,
        referer=self.referer,
        proxies=self.proxies
    ).get_res(searchurl, allow_redirects=True)
    # Decode and parse the response
    return self.parse(
        RequestUtils.get_decoded_html_content(
            ret,
            performance_mode=settings.ENCODING_DETECTION_PERFORMANCE_MODE,
            confidence_threshold=settings.ENCODING_DETECTION_MIN_CONFIDENCE
        )
    )
def __get_title(self, torrent: Any):
    """
    Fill ``torrents_info['title']`` from the ``title`` field configuration.

    The title comes either from a direct selector or from a Jinja2 ``text``
    template rendered with the optional ``title_default`` / ``title_optional``
    fields; the configured filters are applied afterwards.

    :param torrent: query object for one torrent row
    """
    if 'title' not in self.fields:
        return
    selector = self.fields.get('title', {})
    if 'selector' in selector:
        self.torrents_info['title'] = self._safe_query(torrent, selector)
    elif 'text' in selector:
        render_dict = {}
        for part in ('title_default', 'title_optional'):
            if part in self.fields:
                render_dict[part] = self._safe_query(torrent, self.fields.get(part, {}))
        template = Template(selector.get('text'))
        self.torrents_info['title'] = template.render(fields=render_dict)
    raw_title = self.torrents_info.get('title')
    self.torrents_info['title'] = self.__filter_text(raw_title, selector.get('filters'))
def __get_description(self, torrent: Any):
    """
    Fill ``torrents_info['description']`` from the ``description`` field.

    A ``selector`` (or ``selectors``) entry is queried directly; a ``text``
    entry is rendered as a Jinja2 template from the optional helper fields.
    The configured filters are applied afterwards.

    :param torrent: query object for one torrent row
    """
    if 'description' not in self.fields:
        return
    selector = self.fields.get('description', {})
    has_single = "selector" in selector
    has_multi = "selectors" in selector
    if has_single or has_multi:
        # _safe_query only reads "selector", so mirror "selectors" into it
        desc_selector = selector.copy()
        if has_multi and not has_single:
            desc_selector["selector"] = selector.get("selectors", "")
        self.torrents_info['description'] = self._safe_query(torrent, desc_selector)
    elif "text" in selector:
        render_dict = {}
        helper_fields = ("tags", "subject", "description_free_forever", "description_normal")
        for name in helper_fields:
            if name in self.fields:
                render_dict[name] = self._safe_query(torrent, self.fields.get(name, {}))
        template = Template(selector.get('text'))
        self.torrents_info['description'] = template.render(fields=render_dict)
    raw_description = self.torrents_info.get('description')
    self.torrents_info['description'] = self.__filter_text(raw_description,
                                                           selector.get('filters'))
def __get_detail(self, torrent: Any):
    """
    Fill ``torrents_info['page_url']`` with the absolute details-page URL.

    Protocol-relative (``//host/...``), root-relative (``/path``) and plain
    relative links are completed from ``self.domain``; ``http...`` links are
    stored unchanged. Nothing is stored when no link is found.

    :param torrent: query object for one torrent row
    """
    if 'details' not in self.fields:
        return
    selector = self.fields.get('details', {})
    detail_link = self.__filter_text(self._safe_query(torrent, selector),
                                     selector.get('filters'))
    if not detail_link:
        return
    if detail_link.startswith("http"):
        page_url = detail_link
    elif detail_link.startswith("//"):
        # Reuse only the scheme of the configured domain
        page_url = self.domain.split(":")[0] + ":" + detail_link
    elif detail_link.startswith("/"):
        # The domain already ends with "/", so drop the link's leading one
        page_url = self.domain + detail_link[1:]
    else:
        page_url = self.domain + detail_link
    self.torrents_info['page_url'] = page_url
def __get_download(self, torrent: Any):
    """
    Fill ``torrents_info['enclosure']`` with the download link.

    ``http...`` and ``magnet...`` links are stored unchanged. Any other link
    is completed from ``self.domain``: when it already contains the site's
    host only the scheme is prepended, otherwise the whole domain is.

    :param torrent: query object for one torrent row
    """
    if 'download' not in self.fields:
        return
    selector = self.fields.get('download', {})
    download_link = self.__filter_text(self._safe_query(torrent, selector),
                                       selector.get('filters'))
    if not download_link:
        return
    if download_link.startswith("http") or download_link.startswith("magnet"):
        enclosure = download_link
    else:
        _scheme, _domain = StringUtils.get_url_netloc(self.domain)
        rooted = download_link.startswith("/")
        if _domain in download_link:
            # The link already carries the host: only the scheme is missing
            enclosure = f"{_scheme}:{download_link}" if rooted else f"{_scheme}://{download_link}"
        elif rooted:
            enclosure = f"{self.domain}{download_link[1:]}"
        else:
            enclosure = f"{self.domain}{download_link}"
    self.torrents_info['enclosure'] = enclosure
def __get_imdbid(self, torrent: Any):
    """
    Fill ``torrents_info['imdbid']`` from the ``imdbid`` field, after filters.

    :param torrent: query object for one torrent row
    """
    if "imdbid" not in self.fields:
        return
    selector = self.fields.get('imdbid', {})
    raw_value = self._safe_query(torrent, selector)
    imdbid = self.__filter_text(raw_value, selector.get('filters'))
    self.torrents_info['imdbid'] = imdbid
def __get_size(self, torrent: Any):
    """
    Fill ``torrents_info['size']`` from the ``size`` field.

    The queried text has newlines removed, is filtered and then converted with
    ``StringUtils.num_filesize``; ``0`` is stored when nothing is found.

    :param torrent: query object for one torrent row
    """
    if 'size' not in self.fields:
        return
    selector = self.fields.get('size', {})
    item = self._safe_query(torrent, selector)
    if not item:
        self.torrents_info['size'] = 0
        return
    size_text = self.__filter_text(item.replace("\n", "").strip(), selector.get('filters'))
    self.torrents_info['size'] = StringUtils.num_filesize(size_text)
def __get_leechers(self, torrent: Any):
    """
    Fill ``torrents_info['peers']`` from the ``leechers`` field.

    Only the part before the first "/" is used, thousands separators are
    removed and filters applied; anything that is not all digits becomes ``0``.

    :param torrent: query object for one torrent row
    """
    if 'leechers' not in self.fields:
        return
    selector = self.fields.get('leechers', {})
    item = self._safe_query(torrent, selector)
    peers = 0
    if item:
        leading = item.split("/")[0].replace(",", "")
        cleaned = self.__filter_text(leading, selector.get('filters'))
        if cleaned and cleaned.isdigit():
            peers = int(cleaned)
    self.torrents_info['peers'] = peers
def __get_seeders(self, torrent: Any):
    """
    Fill ``torrents_info['seeders']`` from the ``seeders`` field.

    Only the part before the first "/" is used, thousands separators are
    removed and filters applied; anything that is not all digits becomes ``0``.

    :param torrent: query object for one torrent row
    """
    if 'seeders' not in self.fields:
        return
    selector = self.fields.get('seeders', {})
    item = self._safe_query(torrent, selector)
    seeders = 0
    if item:
        leading = item.split("/")[0].replace(",", "")
        cleaned = self.__filter_text(leading, selector.get('filters'))
        if cleaned and cleaned.isdigit():
            seeders = int(cleaned)
    self.torrents_info['seeders'] = seeders
def __get_grabs(self, torrent: Any):
    """
    Fill ``torrents_info['grabs']`` from the ``grabs`` field.

    Only the part before the first "/" is used, thousands separators are
    removed and filters applied; anything that is not all digits becomes ``0``.

    :param torrent: query object for one torrent row
    """
    if 'grabs' not in self.fields:
        return
    selector = self.fields.get('grabs', {})
    item = self._safe_query(torrent, selector)
    grabs = 0
    if item:
        leading = item.split("/")[0].replace(",", "")
        cleaned = self.__filter_text(leading, selector.get('filters'))
        if cleaned and cleaned.isdigit():
            grabs = int(cleaned)
    self.torrents_info['grabs'] = grabs
def __get_pubdate(self, torrent: Any):
    """
    Fill ``torrents_info['pubdate']`` from the ``date_added`` field.

    Newlines in the queried text are turned into spaces and the text trimmed
    before the configured filters run.

    :param torrent: query object for one torrent row
    """
    if 'date_added' not in self.fields:
        return
    selector = self.fields.get('date_added', {})
    raw_date = self._safe_query(torrent, selector)
    pubdate_str = raw_date.replace('\n', ' ').strip() if raw_date else raw_date
    self.torrents_info['pubdate'] = self.__filter_text(pubdate_str, selector.get('filters'))
def __get_date_elapsed(self, torrent: Any):
    """
    Fill ``torrents_info['date_elapsed']`` from the ``date_elapsed`` field,
    after the configured filters.

    :param torrent: query object for one torrent row
    """
    if 'date_elapsed' not in self.fields:
        return
    selector = self.fields.get('date_elapsed', {})
    elapsed_text = self._safe_query(torrent, selector)
    filtered = self.__filter_text(elapsed_text, selector.get('filters'))
    self.torrents_info['date_elapsed'] = filtered
def __get_downloadvolumefactor(self, torrent: Any):
    """
    Fill ``torrents_info['downloadvolumefactor']`` (defaults to ``1``).

    With a ``case`` mapping, the first CSS selector that matches the row
    decides the factor. With a ``selector``, the first number in the queried
    text is used; fractional values such as ``0.5`` are kept as floats rather
    than raising and discarding the whole torrent.

    :param torrent: query object for one torrent row
    """
    selector = self.fields.get('downloadvolumefactor', {})
    if not selector:
        return
    self.torrents_info['downloadvolumefactor'] = 1
    if 'case' in selector:
        cases = selector.get('case', {})
        for css in list(cases.keys()):
            matched = torrent(css)
            try:
                if len(matched) > 0:
                    self.torrents_info['downloadvolumefactor'] = cases.get(css)
                    break
            finally:
                # Empty the query result straight away
                matched.clear()
                del matched
    elif "selector" in selector:
        item = self._safe_query(torrent, selector)
        if item:
            found = re.search(r'(\d+\.?\d*)', item)
            if found:
                # The pattern allows decimals, so int() alone would raise
                factor = float(found.group(1))
                self.torrents_info['downloadvolumefactor'] = (
                    int(factor) if factor.is_integer() else factor
                )
def __get_uploadvolumefactor(self, torrent: Any):
    """
    Fill ``torrents_info['uploadvolumefactor']`` (defaults to ``1``).

    With a ``case`` mapping, the first CSS selector that matches the row
    decides the factor. With a ``selector``, the first number in the queried
    text is used; fractional values such as ``1.5`` are kept as floats rather
    than raising and discarding the whole torrent.

    :param torrent: query object for one torrent row
    """
    selector = self.fields.get('uploadvolumefactor', {})
    if not selector:
        return
    self.torrents_info['uploadvolumefactor'] = 1
    if 'case' in selector:
        cases = selector.get('case', {})
        for css in list(cases.keys()):
            matched = torrent(css)
            try:
                if len(matched) > 0:
                    self.torrents_info['uploadvolumefactor'] = cases.get(css)
                    break
            finally:
                # Empty the query result straight away
                matched.clear()
                del matched
    elif "selector" in selector:
        item = self._safe_query(torrent, selector)
        if item:
            found = re.search(r'(\d+\.?\d*)', item)
            if found:
                # The pattern allows decimals, so int() alone would raise
                factor = float(found.group(1))
                self.torrents_info['uploadvolumefactor'] = (
                    int(factor) if factor.is_integer() else factor
                )
def __get_labels(self, torrent: Any):
    """
    Fill ``torrents_info['labels']`` with the list of non-empty label values.

    Labels are handled apart from the single-value fields because the result
    is a list; an empty list is stored when no selector is configured or
    nothing matches.

    :param torrent: query object for one torrent row
    """
    if 'labels' not in self.fields:
        return
    selector = self.fields.get('labels', {})
    css = selector.get('selector')
    if not css:
        self.torrents_info['labels'] = []
        return
    labels = torrent(css).clone()
    try:
        self.__remove(labels, selector)
        items = self.__attribute_or_text(labels, selector)
        self.torrents_info['labels'] = [value for value in items if value] if items else []
    finally:
        # Empty the cloned query result straight away
        labels.clear()
        del labels
def __get_free_date(self, torrent: Any):
    """
    Fill ``torrents_info['freedate']`` from the ``freedate`` field, after the
    configured filters.

    :param torrent: query object for one torrent row
    """
    if 'freedate' not in self.fields:
        return
    selector = self.fields.get('freedate', {})
    raw_value = self._safe_query(torrent, selector)
    filtered = self.__filter_text(raw_value, selector.get('filters'))
    self.torrents_info['freedate'] = filtered
def __get_hit_and_run(self, torrent: Any):
    """
    Fill ``torrents_info['hit_and_run']`` with ``True`` when the ``hr``
    selector matches something in the row, ``False`` otherwise.

    :param torrent: query object for one torrent row
    """
    if 'hr' not in self.fields:
        return
    selector = self.fields.get('hr', {})
    matched = torrent(selector.get('selector', ''))
    try:
        self.torrents_info['hit_and_run'] = bool(matched)
    finally:
        # Empty the query result straight away
        matched.clear()
        del matched
def __get_category(self, torrent: Any):
    """
    Fill ``torrents_info['category']`` with a media-type value.

    The queried category id is compared with the indexer's ``tv`` and
    ``movie`` category ids. An id listed under both counts as movie; anything
    unmatched, or a missing value/config, is stored as unknown.

    :param torrent: query object for one torrent row
    """
    if 'category' not in self.fields:
        return
    selector = self.fields.get('category', {})
    category_value = self.__filter_text(self._safe_query(torrent, selector),
                                        selector.get('filters'))
    media_type = MediaType.UNKNOWN
    if category_value and self.category:
        tv_cats = [str(cat.get("id")) for cat in self.category.get("tv") or []]
        movie_cats = [str(cat.get("id")) for cat in self.category.get("movie") or []]
        is_movie = category_value in movie_cats
        if category_value in tv_cats and not is_movie:
            media_type = MediaType.TV
        elif is_movie:
            media_type = MediaType.MOVIE
    self.torrents_info['category'] = media_type.value
def _safe_query(self, torrent: Any, selector_config: Optional[dict]) -> Optional[str]:
    """
    Run one selector against a torrent row and release the query result.

    The matched elements are cloned, the configured ``remove`` children are
    dropped, attribute or text values are extracted and the configured entry
    is picked; the clone is emptied before returning.

    :param torrent: PyQuery object for one torrent row
    :param selector_config: selector configuration
    :return: the extracted value, or None without a usable ``selector``
    """
    css = selector_config.get('selector') if selector_config else None
    if not css:
        return None
    query_obj = torrent(css).clone()
    try:
        self.__remove(query_obj, selector_config)
        values = self.__attribute_or_text(query_obj, selector_config)
        return self.__index(values, selector_config)
    finally:
        query_obj.clear()
        del query_obj
def get_info(self, torrent: Any) -> dict:
    """
    Parse a single torrent row into a dict.

    ``self.torrents_info`` is reset for every call, filled by the field
    extractors in a fixed order, copied for the caller and then cleared.

    :param torrent: query object for one torrent row
    :return: a copy of the collected fields, or ``{}`` when nothing was
             collected or an extractor raised
    """
    # Start from a fresh dict so data from earlier rows cannot accumulate
    self.torrents_info = {}
    try:
        extractors = (
            self.__get_title,                 # title
            self.__get_description,           # description
            self.__get_detail,                # details page
            self.__get_download,              # download link
            self.__get_grabs,                 # completed count
            self.__get_leechers,              # leechers
            self.__get_seeders,               # seeders
            self.__get_size,                  # size
            self.__get_imdbid,                # IMDb id
            self.__get_downloadvolumefactor,  # download factor
            self.__get_uploadvolumefactor,    # upload factor
            self.__get_pubdate,               # publish time
            self.__get_date_elapsed,          # time since publish
            self.__get_free_date,             # free-until time
            self.__get_labels,                # labels
            self.__get_hit_and_run,           # H&R
            self.__get_category,              # category
        )
        for extract in extractors:
            extract(torrent)
        # Hand back a copy, not a reference to the dict cleared below
        return self.torrents_info.copy() if self.torrents_info else {}
    except Exception as err:
        logger.error("%s 搜索出现错误:%s" % (self.indexername, str(err)))
        return {}
    finally:
        self.torrents_info.clear()
@staticmethod
def __filter_text(text: Optional[str], filters: Optional[List[dict]]) -> str:
    """
    Apply the configured filter chain to a piece of text.

    Supported filter names: ``re_search``, ``split``, ``replace``,
    ``dateparse``, ``strip``, ``appendleft`` and ``querystring``. A filter
    that raises is logged at debug level and skipped.

    :param text: value to process; returned unchanged when empty
    :param filters: list of ``{"name": ..., "args": ...}`` dicts; anything
                    that is not a non-empty list leaves ``text`` untouched
    :return: the filtered text, stripped of surrounding whitespace
    """
    if not text or not filters or not isinstance(filters, list):
        return text
    if not isinstance(text, str):
        text = str(text)
    for filter_item in filters:
        if not text:
            break
        method_name = filter_item.get("name")
        try:
            args = filter_item.get("args")
            if method_name == "re_search" and isinstance(args, list):
                rematch = re.search(r"%s" % args[0], text)
                if rematch:
                    text = rematch.group(args[-1])
            elif method_name == "split" and isinstance(args, list):
                text = text.split(r"%s" % args[0])[args[-1]]
            elif method_name == "replace" and isinstance(args, list):
                text = text.replace(r"%s" % args[0], r"%s" % args[-1])
            elif method_name == "dateparse" and isinstance(args, str):
                text = text.replace("\n", " ").strip()
                parsed = datetime.datetime.strptime(text, r"%s" % args)
                # Keep the value a string: a datetime here used to make the
                # final strip() raise and discard the whole torrent.
                text = parsed.strftime("%Y-%m-%d %H:%M:%S")
            elif method_name == "strip":
                text = text.strip()
            elif method_name == "appendleft":
                text = f"{args}{text}"
            elif method_name == "querystring":
                parsed_url = urlparse(str(text))
                query_params = parse_qs(parsed_url.query)
                param_value = query_params.get(args)
                text = param_value[0] if param_value else ''
        except Exception as err:
            logger.debug(f'过滤器 {method_name} 处理失败:{str(err)} - {traceback.format_exc()}')
    # A regex group that did not participate leaves None behind; only strings
    # can be stripped.
    return text.strip() if isinstance(text, str) else text
@staticmethod
def __remove(item: Any, selector: Optional[dict]):
    """
    Remove the elements named by the selector's 'remove' entry from item.

    :param item: object exposing remove(expr); called once per listed expression
    :param selector: selector configuration; 'remove' holds expressions separated by ', '
    """
    if not selector or "remove" not in selector:
        return
    for expression in selector.get('remove', '').split(', '):
        item.remove(expression)
@staticmethod
def __attribute_or_text(item: Any, selector: Optional[dict]) -> list:
    """
    Collect one value per matched element: an attribute value or the element text.

    :param item: query result exposing items(); returned untouched when selector is empty
    :param selector: selector configuration; when it has an 'attribute' key that
                     attribute is read, otherwise the element text is used
    :return: list of extracted values, [] when item is empty
    """
    if not selector:
        return item
    if not item:
        return []
    # Falsy (empty) sub-queries are skipped in both modes
    nodes = (node for node in item.items() if node)
    if 'attribute' in selector:
        attribute_name = selector.get('attribute')
        return [node.attr(attribute_name) for node in nodes]
    return [node.text() for node in nodes]
@staticmethod
def __index(items: Optional[list], selector: Optional[dict]) -> Optional[str]:
    """
    Pick a single value out of the extracted items.

    - 'contents': take that line (split on newlines) of the first item
    - 'index': take the item at that position
    - otherwise, or whenever the requested position does not exist: the first item

    :param items: extracted values; None or empty yields None
    :param selector: selector configuration, may carry 'contents' or 'index'
    :return: the selected value, or None when there are no items
    """
    if not items:
        return None
    first = items[0]
    if not selector:
        return first
    # NOTE(review): the 'contents' gate compares against the number of items,
    # not the number of lines; kept as-is so existing configs select the same branch
    if "contents" in selector and len(items) > int(selector.get("contents")):
        line_no = int(selector.get("contents"))
        # A missing attribute yields a non-string first item, which cannot be split
        if not isinstance(first, str):
            return first
        lines = first.split("\n")
        # Fall back to the whole first item instead of raising IndexError
        if -len(lines) <= line_no < len(lines):
            return lines[line_no]
        return first
    if "index" in selector and len(items) > int(selector.get("index")):
        position = int(selector.get("index"))
        # Negative positions pass the gate above, so check the lower bound too
        if position >= -len(items):
            return items[position]
    return first
def parse(self, html_text: str) -> List[dict]:
    """
    Parse a whole result page into a list of torrent dicts.

    Sets self.is_error when html_text is empty or when parsing raises.

    :param html_text: HTML text of the result page
    :return: list of torrent dicts, [] on empty input or on error
    """
    if not html_text:
        self.is_error = True
        return []
    # Drop results left over from a previous call
    self.torrents_info_array = []
    document = None
    try:
        # Build the document object for the page
        document = PyQuery(html_text)
        # Selector that matches the torrent rows
        row_selector = self.list.get('selector', '')
        # Walk the torrent rows, stopping at the configured result limit
        for position, row in enumerate(document(row_selector)):
            if position >= int(self.result_num):
                break
            # Temporary PyQuery wrapper used only while this row is parsed
            row_query = PyQuery(row)
            try:
                # get_info already returns a copy, so it is stored directly
                parsed = self.get_info(row_query)
                if parsed:
                    self.torrents_info_array.append(parsed)
            finally:
                # Explicitly release the temporary wrapper
                row_query.clear()
                del row_query
        # Return a copy so the cleanup in the finally clause cannot empty the result
        return list(self.torrents_info_array)
    except Exception as err:
        self.is_error = True
        logger.warn(f"错误:{self.indexername} {str(err)}")
        return []
    finally:
        # Empty the per-call result cache
        self.torrents_info_array.clear()
        # Release the document object
        if document is not None:
            document.clear()
            del document
        # Drop the local reference to the page text
        del html_text

View File

@@ -104,20 +104,24 @@ class SubtitleModule(_ModuleBase):
logger.warn(f"读取页面代码失败:{torrent.page_url}")
return
html = etree.HTML(res.text)
sublink_list = []
for xpath in self._SITE_SUBTITLE_XPATH:
sublinks = html.xpath(xpath)
if sublinks:
for sublink in sublinks:
if not sublink:
continue
if not sublink.startswith("http"):
base_url = StringUtils.get_base_url(torrent.page_url)
if sublink.startswith("/"):
sublink = "%s%s" % (base_url, sublink)
else:
sublink = "%s/%s" % (base_url, sublink)
sublink_list.append(sublink)
try:
sublink_list = []
for xpath in self._SITE_SUBTITLE_XPATH:
sublinks = html.xpath(xpath)
if sublinks:
for sublink in sublinks:
if not sublink:
continue
if not sublink.startswith("http"):
base_url = StringUtils.get_base_url(torrent.page_url)
if sublink.startswith("/"):
sublink = "%s%s" % (base_url, sublink)
else:
sublink = "%s/%s" % (base_url, sublink)
sublink_list.append(sublink)
finally:
if html is not None:
del html
# 下载所有字幕文件
for sublink in sublink_list:
logger.info(f"找到字幕下载链接:{sublink},开始下载...")

View File

@@ -563,6 +563,9 @@ class TmdbApi:
except Exception as err:
logger.error(f"从TheDbMovie网站查询出错{str(err)}")
return {}
finally:
if html is not None:
del html
return {}
def get_info(self,

View File

@@ -25,7 +25,7 @@ class Transmission:
若不设置参数,则创建配置文件设置的下载器
"""
if host and port:
self._protocol, self._host, self._port = kwargs.get("protocol", self._protocol), host, port
self._protocol, self._host, self._port = kwargs.get("protocol", "http"), host, port
elif host:
result = UrlUtils.parse_url_params(url=host)
if result:

View File

@@ -111,7 +111,7 @@ class Api:
"_api_path",
"_request_utils",
"_version",
"_session"
"_session",
)
@property
@@ -287,6 +287,18 @@ class Api:
return True
return False
def task_running(self):
    """
    Query the /task/running endpoint.

    :return: True when the request succeeds and returns non-empty data, otherwise False
    """
    res = self.__request_api("/task/running")
    if not res or not res.success:
        return False
    # TODO: report which tasks are running instead of a bare flag
    return True if res.data else False
def __build_item(self, info: dict) -> Item:
"""
构造媒体Item

View File

@@ -111,6 +111,8 @@ class TrimeMedia:
if self._userinfo is None:
return False
logger.debug(f"{self._username} 成功登录飞牛影视")
# 刷新媒体库列表
self.get_librarys()
return True
def disconnect(self):
@@ -311,6 +313,8 @@ class TrimeMedia:
logger.error("飞牛仅支持管理员账号刷新媒体库")
return False
# 必须调用 否则容易误报 -14 Task duplicate
self._api.task_running()
logger.info("刷新所有媒体库")
return self._api.mdb_scanall()
@@ -337,6 +341,8 @@ class TrimeMedia:
# 媒体库去重
libraries.add(lib.guid)
# 必须调用 否则容易误报 -14 Task duplicate
self._api.task_running()
for lib_guid in libraries:
# 逐个刷新
lib = self._libraries[lib_guid]

View File

@@ -13,27 +13,31 @@ class SiteUtils:
:return:
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
try:
if not StringUtils.is_valid_html_element(html):
return False
# 存在明显的密码输入框,说明未登录
if html.xpath("//input[@type='password']"):
return False
# 是否存在登出和用户面板等链接
xpaths = [
'//a[contains(@href, "logout")'
' or contains(@data-url, "logout")'
' or contains(@href, "mybonus") '
' or contains(@onclick, "logout")'
' or contains(@href, "usercp")'
' or contains(@lay-on, "logout")]',
'//form[contains(@action, "logout")]',
'//div[@class="user-info-side"]',
'//a[@id="myitem"]'
]
for xpath in xpaths:
if html.xpath(xpath):
return True
return False
# 存在明显的密码输入框,说明未登录
if html.xpath("//input[@type='password']"):
return False
# 是否存在登出和用户面板等链接
xpaths = [
'//a[contains(@href, "logout")'
' or contains(@data-url, "logout")'
' or contains(@href, "mybonus") '
' or contains(@onclick, "logout")'
' or contains(@href, "usercp")'
' or contains(@lay-on, "logout")]',
'//form[contains(@action, "logout")]',
'//div[@class="user-info-side"]',
'//a[@id="myitem"]'
]
for xpath in xpaths:
if html.xpath(xpath):
return True
return False
finally:
if html is not None:
del html
@classmethod
def is_checkin(cls, html_text: str) -> bool:
@@ -42,24 +46,27 @@ class SiteUtils:
:return True已签到 False未签到
"""
html = etree.HTML(html_text)
if not StringUtils.is_valid_html_element(html):
return False
# 站点签到支持的识别XPATH
xpaths = [
'//a[@id="signed"]',
'//a[contains(@href, "attendance")]',
'//a[contains(text(), "签到")]',
'//a/b[contains(text(), "签 到")]',
'//span[@id="sign_in"]/a',
'//a[contains(@href, "addbonus")]',
'//input[@class="dt_button"][contains(@value, "打卡")]',
'//a[contains(@href, "sign_in")]',
'//a[contains(@onclick, "do_signin")]',
'//a[@id="do-attendance"]',
'//shark-icon-button[@href="attendance.php"]'
]
for xpath in xpaths:
if html.xpath(xpath):
try:
if not StringUtils.is_valid_html_element(html):
return False
return True
# 站点签到支持的识别XPATH
xpaths = [
'//a[@id="signed"]',
'//a[contains(@href, "attendance")]',
'//a[contains(text(), "签到")]',
'//a/b[contains(text(), "签 到")]',
'//span[@id="sign_in"]/a',
'//a[contains(@href, "addbonus")]',
'//input[@class="dt_button"][contains(@value, "打卡")]',
'//a[contains(@href, "sign_in")]',
'//a[contains(@onclick, "do_signin")]',
'//a[@id="do-attendance"]',
'//shark-icon-button[@href="attendance.php"]'
]
for xpath in xpaths:
if html.xpath(xpath):
return False
return True
finally:
if html is not None:
del html

View File

@@ -1,2 +1,2 @@
APP_VERSION = 'v2.6.2'
FRONTEND_VERSION = 'v2.6.2'
APP_VERSION = 'v2.6.3'
FRONTEND_VERSION = 'v2.6.3'