Files
IP-Sentinel/scripts/fetch_trust_urls.py

163 lines
8.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import urllib.request
import xml.etree.ElementTree as ET
import os
import json
import random
import re
# ====================================================
# 脚本功能: 5+25 混合流量注入矩阵 (自适应解耦版)
# 核心指标: 5条基石 + 25条活体新闻 = 30条确保双栈机型完全可达的白名单
# ====================================================
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_ROOT = os.path.dirname(SCRIPT_DIR)
REGIONS_DIR = os.path.join(PROJECT_ROOT, "data", "regions")
# ====================================================
# 2025 年高保真多平台 UA 轮换池 (防风控装甲)
# 包含 Windows, macOS, iOS, Android 的最新骨干指纹
# ====================================================
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36 Edg/133.0.0.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/618.1.15 (KHTML, like Gecko) Version/18.3 Safari/618.1.15',
'Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_1 like Mac OS X) AppleWebKit/618.1.15 (KHTML, like Gecko) Version/18.3 Mobile/15E148 Safari/604.1',
'Mozilla/5.0 (Linux; Android 15; Pixel 9 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Mobile Safari/537.36'
]
# ====================================================
# 全球骨干新闻 RSS 监听矩阵 (双栈强穿透版)
# 全面采用 Google News 本土化入口,确保提取的链接支持 IPv4+IPv6
# ====================================================
RSS_FEEDS = {
"US": ["https://news.google.com/rss?hl=en-US&gl=US&ceid=US:en"],
"UK": ["https://news.google.com/rss?hl=en-GB&gl=GB&ceid=GB:en"],
"AU": ["https://news.google.com/rss?hl=en-AU&gl=AU&ceid=AU:en"],
"CA": ["https://news.google.com/rss?hl=en-CA&gl=CA&ceid=CA:en"],
"DE": ["https://news.google.com/rss?hl=de&gl=DE&ceid=DE:de"],
"FR": ["https://news.google.com/rss?hl=fr&gl=FR&ceid=FR:fr"],
"ES": ["https://news.google.com/rss?hl=es-419&gl=US&ceid=US:es_419"],
"JP": ["https://news.google.com/rss?hl=ja&gl=JP&ceid=JP:ja"],
"HK": ["https://news.google.com/rss?hl=zh-HK&gl=HK&ceid=HK:zh-Hant"],
"MO": ["https://news.google.com/rss?hl=zh-HK&gl=HK&ceid=HK:zh-Hant"],
"TW": ["https://news.google.com/rss?hl=zh-TW&gl=TW&ceid=TW:zh-Hant"],
"KR": ["https://news.google.com/rss?hl=ko&gl=KR&ceid=KR:ko"],
"SG": ["https://news.google.com/rss?hl=en-SG&gl=SG&ceid=SG:en"],
"NL": ["https://news.google.com/rss?hl=nl&gl=NL&ceid=NL:nl"],
"VN": ["https://news.google.com/rss?hl=vi&gl=VN&ceid=VN:vi"],
"MY": ["https://news.google.com/rss?hl=en-MY&gl=MY&ceid=MY:en"],
"NG": ["https://news.google.com/rss?hl=en-NG&gl=NG&ceid=NG:en"],
"TR": ["https://news.google.com/rss?hl=tr&gl=TR&ceid=TR:tr"],
"PH": ["https://news.google.com/rss?hl=en-PH&gl=PH"],
"TH": ["https://news.google.com/rss?hl=th&gl=TH&ceid=TH:th"],
"ID": ["https://news.google.com/rss?hl=id&gl=ID&ceid=ID:id"],
"IN": ["https://news.google.com/rss?hl=en-IN&gl=IN&ceid=IN:en"],
"AE": ["https://news.google.com/rss?hl=en-AE&gl=AE"],
"SA": ["https://news.google.com/rss?hl=ar-SA&gl=SA"],
"BD": ["https://news.google.com/rss?hl=en-BD&gl=BD"],
"NP": ["https://news.google.com/rss?hl=en-NP&gl=NP"],
"KH": ["https://news.google.com/rss?hl=en-KH&gl=KH"],
"MM": ["https://news.google.com/rss?hl=en-MM&gl=MM"],
"LA": ["https://news.google.com/rss?hl=en-LA&gl=LA"],
"MN": ["https://news.google.com/rss?hl=en-MN&gl=MN"]
}
def is_dual_stack_safe(url):
"""网络可达性过滤: 阻断任何不支持双栈(IPv4+IPv6)的动态新闻域名"""
DUAL_STACK_SAFE_DOMAINS = [
"google.com", "wikipedia.org", "apple.com", "microsoft.com",
"wikimedia.org", "blogspot.com", "yahoo.com"
]
return any(domain in url for domain in DUAL_STACK_SAFE_DOMAINS)
def fetch_rss_links(lang_params, region_name, max_items=25):
"""根据 JSON 提供的语言参数,动态拼接对应国家的 Google 新闻源"""
# [核心解耦]:直接使用 json 文件中的 lang_params彻底淘汰 RSS_FEEDS 字典
url = f"https://news.google.com/rss?{lang_params}"
links = []
try:
dynamic_headers = {'User-Agent': random.choice(USER_AGENTS)}
req = urllib.request.Request(url, headers=dynamic_headers)
with urllib.request.urlopen(req, timeout=10) as response:
xml_data = response.read()
root = ET.fromstring(xml_data)
for item in root.findall('.//item'):
link = item.find('link')
if link is not None and link.text:
clean_link = link.text.strip()
# 仅对动态获取的不可控新闻链接应用双栈过滤
if clean_link.startswith('http') and is_dual_stack_safe(clean_link):
links.append(clean_link)
except Exception as e:
print(f"⚠️ [{region_name}] 本土 RSS 抓取异常 ({url}): {e}")
unique_links = list(set(links))
random.shuffle(unique_links)
return unique_links[:max_items]
def process_json_file(file_path):
"""解析单节点配置并执行流量注入"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
trust_mod = data.get("trust_module", {})
google_mod = data.get("google_module", {})
region_name = data.get("region_name", "Unknown")
if not trust_mod or not google_mod:
return
# 1. 提取语言参数 (如 hl=en-AE&gl=AE)
lang_params = google_mod.get("lang_params", "hl=en-US&gl=US")
# 2. 智能提取语种前缀,用于维基百科的本土化 (如 hl=ja 提取出 ja)
hl_match = re.search(r'hl=([a-zA-Z]+)', lang_params)
# 如果带有地区后缀(如 zh-TW, en-GB),则只取前面代表语种的字母
lang_prefix = hl_match.group(1).split('-')[0].lower() if hl_match else 'en'
# 3. 基石处理:【修复点】直接提取静态基石,不做双栈硬编码过滤,保护本地化域名!
static_urls = trust_mod.get("static_urls", [])
if len(static_urls) < 5:
# 基础数据残缺时,采用本土化的 Wikipedia 域名垫底
static_urls += [f"https://{lang_prefix}.wikipedia.org/wiki/Special:Random", "https://www.apple.com/", "https://www.microsoft.com/"]
random.shuffle(static_urls)
final_static = list(set(static_urls))[:5]
# 4. 将本地语言参数传给 RSS 爬虫,拉取活体新闻
final_news = fetch_rss_links(lang_params, region_name, max_items=25)
combined_urls = list(set(final_static + final_news))
# 5. 采用完全本土化的维基百科执行最终充能
while len(combined_urls) < 30:
combined_urls.append(f"https://{lang_prefix}.wikipedia.org/wiki/Special:Random?r={random.randint(1,100000)}")
combined_urls = list(set(combined_urls))
final_white_list = combined_urls[:30]
random.shuffle(final_white_list)
trust_mod["white_urls"] = final_white_list
data["trust_module"] = trust_mod
with open(file_path, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
print(f"✅ [信用融合] {os.path.basename(file_path)} (语系: {lang_prefix}): 固化基石 {len(final_static)} 条 + 活体新闻 {len(final_news)} 条 = 统合满编 {len(final_white_list)}")
except Exception as e:
print(f"❌ [处理失败] {file_path}: {e}")
if __name__ == '__main__':
print("========== 启动 IP-Sentinel 活体新闻流融合引擎 (自适应架构版) ==========")
for root_dir, _, files in os.walk(REGIONS_DIR):
for file in files:
if file.endswith(".json"):
file_path = os.path.join(root_dir, file)
process_json_file(file_path)
print("========== 融合引擎执行完毕 ==========")