fix: prevent Linux gateway mis-kill and calibration data resurrection

Linux cleanup_zombie_gateway_processes previously used fuser + kill -9 on
every PID bound to the gateway port, with no cmdline or /health checks.
This could terminate unrelated services or healthy Gateways during restart
timeouts. Align Linux with the Windows strategy: verify openclaw gateway
cmdline, retry /health, only kill unresponsive zombies, and adopt healthy
external instances.

Calibration inherit mode preferred the richer openclaw.json.bak over the
current file. Because every write copies the previous config to .bak,
intentional removals (providers/channels) could be resurrected on the next
calibration. Prefer current whenever it is non-empty; only fall back to
backup when current is effectively empty.

Add regression tests for calibration source selection and mirror the fix
in dev-api.js.

Co-authored-by: 晴天 <1186258278@users.noreply.github.com>
This commit is contained in:
Cursor Agent
2026-06-14 11:09:42 +00:00
parent c86382adfa
commit 67ebd2410e
3 changed files with 226 additions and 21 deletions

View File

@@ -2443,9 +2443,11 @@ function calibrationRichnessScore(config) {
function selectCalibrationSource(current, backup) {
if (current && backup) {
return calibrationRichnessScore(backup) > calibrationRichnessScore(current)
? ['backup', backup]
: ['current', current]
const currentScore = calibrationRichnessScore(current)
if (currentScore === 0 && calibrationRichnessScore(backup) > 0) {
return ['backup', backup]
}
return ['current', current]
}
if (current) return ['current', current]
if (backup) return ['backup', backup]

View File

@@ -1278,12 +1278,16 @@ fn select_calibration_source(current: Option<Value>, backup: Option<Value>) -> (
match (current, backup) {
(Some(current), Some(backup)) => {
let current_score = calibration_richness_score(&current);
let backup_score = calibration_richness_score(&backup);
if backup_score > current_score {
("backup".into(), backup)
} else {
("current".into(), current)
// write_openclaw_config copies the previous file to .bak on every save, so a
// slimmer intentional edit still leaves a richer backup. Only fall back when
// current is effectively empty, not when the user removed providers/channels.
if current_score == 0 {
let backup_score = calibration_richness_score(&backup);
if backup_score > 0 {
return ("backup".into(), backup);
}
}
("current".into(), current)
}
(Some(current), None) => ("current".into(), current),
(None, Some(backup)) => ("backup".into(), backup),
@@ -7796,3 +7800,61 @@ mod write_openclaw_config_merge_tests {
assert_eq!(resolved, Some(node_bin));
}
}
#[cfg(test)]
mod calibration_source_tests {
    //! Regression tests for which config (`current` vs. `.bak`) seeds calibration.

    use super::{calibration_richness_score, select_calibration_source};
    use serde_json::json;

    /// A non-empty current config wins even when the backup scores higher,
    /// so providers/channels removed on purpose are not brought back.
    #[test]
    fn select_calibration_source_prefers_current_over_richer_backup() {
        // Both fixtures share the same gateway section; only providers/channels differ.
        let gateway = json!({
            "auth": { "mode": "token", "token": "secret" },
            "controlUi": { "allowedOrigins": ["http://localhost:1420"] }
        });
        let trimmed = json!({
            "models": { "providers": {} },
            "gateway": gateway.clone()
        });
        let richer_backup = json!({
            "models": {
                "providers": {
                    "openai": { "apiKey": "sk-test", "baseUrl": "https://api.openai.com" }
                }
            },
            "channels": { "telegram": { "enabled": true } },
            "gateway": gateway
        });

        // Guard the fixture itself: the scenario is only meaningful if the backup scores higher.
        let backup_score = calibration_richness_score(&richer_backup);
        let trimmed_score = calibration_richness_score(&trimmed);
        assert!(
            backup_score > trimmed_score,
            "backup fixture should be richer than current"
        );

        let (source, seed) = select_calibration_source(Some(trimmed.clone()), Some(richer_backup));
        assert_eq!(source, "current");
        assert_eq!(seed, trimmed);
    }

    /// An empty current config falls back to a backup that has content.
    #[test]
    fn select_calibration_source_uses_backup_when_current_empty() {
        let populated_backup = json!({
            "models": {
                "providers": {
                    "openai": { "apiKey": "sk-test" }
                }
            }
        });

        let (source, seed) =
            select_calibration_source(Some(json!({})), Some(populated_backup.clone()));
        assert_eq!(source, "backup");
        assert_eq!(seed, populated_backup);
    }
}

View File

@@ -2023,24 +2023,165 @@ mod platform {
}
}
/// 清理残留的 Gateway 进程Linux 版:通过 fuser 查端口占用进程并 kill
fn cleanup_zombie_gateway_processes() {
let port = crate::commands::gateway_listen_port();
// 尝试用 fuser 找到端口占用进程
if let Ok(output) = std::process::Command::new("fuser")
.args([&format!("{port}/tcp")])
static LAST_KNOWN_GATEWAY_PID: Mutex<Option<u32>> = Mutex::new(None);
/// Checks whether the Gateway answers on `port` (blocking HTTP `GET /health`).
///
/// Connects to `127.0.0.1:{port}` with a 3 s connect timeout, sends an HTTP/1.0
/// `GET /health` request (2 s write timeout) and performs a single read of up to
/// 256 bytes (3 s read timeout).
///
/// Returns `true` only when the response's status line is an HTTP status line
/// carrying code `200`. Any connect, write or read failure, an empty read, or any
/// other status yields `false`.
fn is_gateway_port_responsive(port: u16) -> bool {
    use std::io::{Read, Write as IoWrite};
    use std::net::{SocketAddr, TcpStream};

    // Built directly from octets, so there is no string parse that could fail.
    let addr = SocketAddr::from(([127, 0, 0, 1], port));
    let Ok(mut stream) = TcpStream::connect_timeout(&addr, Duration::from_secs(3)) else {
        return false;
    };
    // Best effort: if a timeout cannot be set the probe still runs.
    let _ = stream.set_read_timeout(Some(Duration::from_secs(3)));
    let _ = stream.set_write_timeout(Some(Duration::from_secs(2)));

    let req = format!("GET /health HTTP/1.0\r\nHost: 127.0.0.1:{port}\r\n\r\n");
    if stream.write_all(req.as_bytes()).is_err() {
        return false;
    }

    let mut buf = [0u8; 256];
    let n = match stream.read(&mut buf) {
        Ok(n) if n > 0 => n,
        _ => return false,
    };
    let resp = String::from_utf8_lossy(&buf[..n]);

    // Inspect only the status line ("HTTP/x.y <code> <reason>"). Searching the whole
    // buffer for "200"/"OK" would also accept error responses that merely mention
    // them, e.g. `503` with `Content-Length: 200`.
    let mut status_line = resp.lines().next().unwrap_or("").split_whitespace();
    matches!(
        (status_line.next(), status_line.next()),
        (Some(version), Some("200")) if version.starts_with("HTTP/")
    )
}
/// Probes the Gateway health endpoint up to `retries` times.
///
/// Sleeps for `interval` before every attempt except the first and returns `true`
/// as soon as one probe succeeds. Returns `false` when all attempts fail, or when
/// `retries` is 0 (no probe is made).
fn is_gateway_port_responsive_with_retry(port: u16, retries: u32, interval: Duration) -> bool {
    // `any` short-circuits, so no further sleeps or probes happen after a success.
    (0..retries).any(|attempt| {
        if attempt != 0 {
            std::thread::sleep(interval);
        }
        is_gateway_port_responsive(port)
    })
}
fn find_listening_pids(port: u16) -> Vec<u32> {
let mut pids = Vec::new();
let filter = format!("sport = :{port}");
if let Ok(output) = std::process::Command::new("ss")
.args(["-ltnp", &filter])
.output()
{
let pids = String::from_utf8_lossy(&output.stdout);
for pid_str in pids.split_whitespace() {
if let Ok(pid) = pid_str.trim().parse::<u32>() {
let _ = std::process::Command::new("kill")
.args(["-9", &pid.to_string()])
.output();
eprintln!("[cleanup_zombie] killed PID {pid} on port {port}");
let text = String::from_utf8_lossy(&output.stdout);
for line in text.lines() {
let Some(idx) = line.find("pid=") else {
continue;
};
let digits: String = line[idx + 4..]
.chars()
.take_while(|c| c.is_ascii_digit())
.collect();
if let Ok(pid) = digits.parse::<u32>() {
if pid > 0 && !pids.contains(&pid) {
pids.push(pid);
}
}
}
}
if pids.is_empty() {
if let Ok(output) = std::process::Command::new("lsof")
.args(["-i", &format!("TCP:{port}"), "-sTCP:LISTEN", "-t"])
.output()
{
let text = String::from_utf8_lossy(&output.stdout);
for line in text.lines() {
if let Ok(pid) = line.trim().parse::<u32>() {
if pid > 0 && !pids.contains(&pid) {
pids.push(pid);
}
}
}
}
}
pids
}
/// Reads `/proc/{pid}/cmdline` and returns the arguments joined by single spaces.
///
/// The file's contents are split on NUL bytes; empty segments are dropped and
/// invalid UTF-8 is replaced lossily. Returns `None` when the file cannot be read
/// or contains no non-empty argument.
fn read_process_command_line(pid: u32) -> Option<String> {
    let bytes = std::fs::read(format!("/proc/{pid}/cmdline")).ok()?;

    let mut joined = String::new();
    for arg in bytes.split(|&byte| byte == 0) {
        if arg.is_empty() {
            continue;
        }
        // Every kept argument is non-empty, so an empty buffer means "first argument".
        if !joined.is_empty() {
            joined.push(' ');
        }
        joined.push_str(&String::from_utf8_lossy(arg));
    }

    (!joined.is_empty()).then_some(joined)
}
/// Reports whether a `/proc/{pid}` entry exists for `pid`.
fn is_process_alive(pid: u32) -> bool {
    let proc_entry = format!("/proc/{pid}");
    std::fs::metadata(proc_entry).is_ok()
}
/// Terminates `pid` and its direct children: SIGTERM first, SIGKILL as a fallback.
///
/// Sends `TERM` to the children (`pkill -P`) and then to `pid` itself, waits
/// 500 ms, and repeats with `KILL` if `/proc/{pid}` still exists. Failures of the
/// `kill`/`pkill` commands are ignored (best effort).
fn kill_process_tree(pid: u32) {
    let pid_arg = pid.to_string();

    // Children are signalled BEFORE the parent: `pkill -P <pid>` selects by parent
    // PID, and once the parent has exited its children are re-parented, so they
    // could no longer be matched and would be left running.
    let signal_children_then_parent = |signal: &str| {
        let _ = std::process::Command::new("pkill")
            .args([signal, "-P", pid_arg.as_str()])
            .output();
        let _ = std::process::Command::new("kill")
            .args([signal, pid_arg.as_str()])
            .output();
    };

    signal_children_then_parent("-TERM");
    // Grace period for a clean shutdown before escalating.
    std::thread::sleep(Duration::from_millis(500));
    if is_process_alive(pid) {
        signal_children_then_parent("-KILL");
    }
}
/// Cleans up leftover Gateway processes (Linux version, aligned with the Windows
/// safety strategy).
///
/// Only processes whose command line is confirmed to be an openclaw gateway AND
/// whose `/health` probe fails on every retry are terminated; a healthy Gateway
/// (including one started externally) is never killed and is adopted instead.
fn cleanup_zombie_gateway_processes() {
    let port = crate::commands::gateway_listen_port();
    let listeners = find_listening_pids(port);
    if listeners.is_empty() {
        return;
    }

    // Health is probed once per cleanup pass (3 attempts, 800 ms apart), not per PID.
    let healthy = is_gateway_port_responsive_with_retry(port, 3, Duration::from_millis(800));

    // A listener counts as a Gateway only if its command line is readable and
    // mentions both "openclaw" and "gateway" (case-insensitive).
    let looks_like_gateway = |pid: u32| -> bool {
        match read_process_command_line(pid) {
            Some(cmdline) => {
                let lowered = cmdline.to_lowercase();
                lowered.contains("openclaw") && lowered.contains("gateway")
            }
            None => false,
        }
    };

    for pid in listeners.into_iter().filter(|&pid| looks_like_gateway(pid)) {
        // Re-read for every PID: an earlier iteration may have adopted a new one.
        let tracked_pid = *LAST_KNOWN_GATEWAY_PID.lock().unwrap();

        if healthy {
            if tracked_pid != Some(pid) {
                super::guardian_log(&format!(
                    "检测到健康的 Gateway 进程 (PID {pid})/health 正常响应,已采纳"
                ));
                *LAST_KNOWN_GATEWAY_PID.lock().unwrap() = Some(pid);
            }
        } else {
            super::guardian_log(&format!(
                "检测到僵尸 Gateway 进程 (PID {pid}):端口 {port} 占用但 /health 连续 3 次无响应,强制终止"
            ));
            kill_process_tree(pid);
        }
    }
}
async fn gateway_command(action: &str) -> Result<(), String> {