功能: 修复并实现多节点集群部署 (#38)

基础修复:
- 新增节点离线检测:每 15s 扫描,超 45s 未心跳的远程节点自动置离线
- 节点删除前检查关联任务,避免孤立备份任务
- BackupTaskRepository 新增 CountByNodeID/ListByNodeID

Master 端 Agent 协议:
- 新增 AgentCommand 模型与命令队列仓储(pending/dispatched/succeeded/failed/timeout)
- 新增 AgentService:任务下发、命令轮询、结果回收、超时扫描
- 新增专用 Agent HTTP API(X-Agent-Token 认证):
  /api/agent/heartbeat
  /api/agent/commands/poll
  /api/agent/commands/:id/result
  /api/agent/tasks/:id
  /api/agent/records/:id
- BackupExecutionService 支持 node 路由:task.NodeID 指向远程节点时自动入队派发

Agent CLI(backupx agent 子命令):
- 配置:YAML 文件 / 环境变量 / CLI 参数,优先级 CLI > 文件 > 环境
- 心跳循环 + 命令轮询循环 + 优雅退出
- 本地复用 BackupRunner 与 storage registry 执行备份并直接上传
- 支持 run_task 和 list_dir 两种命令

远程目录浏览:
- NodeService 支持通过 Agent RPC 列出远程节点目录(15s 超时)

前端:
- NodesPage 添加节点后展示 Agent 启动命令和环境变量配置

文档:
- README 中英文重写"多节点集群"章节,含架构图、步骤、限制、CLI 参考
This commit is contained in:
Wu Qing
2026-04-17 12:29:08 +08:00
committed by GitHub
parent e04774ff68
commit 757b0fa5ed
27 changed files with 2224 additions and 24 deletions

View File

@@ -0,0 +1,120 @@
package repository
import (
"context"
"testing"
"time"
"backupx/server/internal/model"
"github.com/glebarez/sqlite"
"gorm.io/gorm"
gormlogger "gorm.io/gorm/logger"
)
// newTestDB opens an in-memory SQLite database with GORM logging silenced and
// migrates the AgentCommand schema into it. Any failure aborts the calling
// test immediately.
func newTestDB(t *testing.T) *gorm.DB {
	t.Helper()

	// Keep test output clean: GORM would otherwise log every statement.
	cfg := &gorm.Config{
		Logger: gormlogger.Default.LogMode(gormlogger.Silent),
	}

	conn, openErr := gorm.Open(sqlite.Open(":memory:"), cfg)
	if openErr != nil {
		t.Fatalf("open: %v", openErr)
	}

	migrateErr := conn.AutoMigrate(&model.AgentCommand{})
	if migrateErr != nil {
		t.Fatalf("migrate: %v", migrateErr)
	}

	return conn
}
// TestAgentCommandRepository_ClaimPending checks that ClaimPending hands out a
// node's pending commands one at a time (c1 before c2), marks the claimed
// command as dispatched, returns nil once the node has nothing left, and leaves
// commands queued for a different node untouched.
func TestAgentCommandRepository_ClaimPending(t *testing.T) {
	db := newTestDB(t)
	repo := NewAgentCommandRepository(db)
	ctx := context.Background()

	// Seed three pending commands: two for node 5 and one for node 99.
	c1 := &model.AgentCommand{NodeID: 5, Type: "run_task", Status: model.AgentCommandStatusPending, Payload: `{"taskId":1}`}
	c2 := &model.AgentCommand{NodeID: 5, Type: "list_dir", Status: model.AgentCommandStatusPending, Payload: `{"path":"/"}`}
	c3 := &model.AgentCommand{NodeID: 99, Type: "run_task", Status: model.AgentCommandStatusPending}
	for _, c := range []*model.AgentCommand{c1, c2, c3} {
		if err := repo.Create(ctx, c); err != nil {
			t.Fatal(err)
		}
	}

	// The first claim for node 5 must return c1, now marked dispatched.
	claimed, err := repo.ClaimPending(ctx, 5)
	if err != nil {
		t.Fatalf("claim: %v", err)
	}
	if claimed == nil || claimed.ID != c1.ID || claimed.Status != model.AgentCommandStatusDispatched {
		t.Fatalf("expected c1 dispatched: %+v", claimed)
	}

	// The second claim must return c2.
	claimed2, err := repo.ClaimPending(ctx, 5)
	if err != nil || claimed2 == nil || claimed2.ID != c2.ID {
		t.Fatalf("expected c2: %+v %v", claimed2, err)
	}

	// The third claim finds nothing pending for node 5 and returns nil.
	claimed3, err := repo.ClaimPending(ctx, 5)
	if err != nil || claimed3 != nil {
		t.Fatalf("expected nil, got %+v %v", claimed3, err)
	}

	// Draining node 5 must not have claimed node 99's command: c3 has to be
	// pending still. (The previous version re-polled node 5 here, which only
	// repeated the third claim and never looked at c3.)
	untouched, err := repo.FindByID(ctx, c3.ID)
	if err != nil {
		t.Fatalf("find c3: %v", err)
	}
	if untouched == nil || untouched.Status != model.AgentCommandStatusPending {
		t.Fatalf("expected c3 to remain pending: %+v", untouched)
	}

	// Node 99 itself must still be able to claim c3.
	other, err := repo.ClaimPending(ctx, 99)
	if err != nil || other == nil || other.ID != c3.ID {
		t.Fatalf("expected c3 for node 99: %+v %v", other, err)
	}
}
// TestAgentCommandRepository_Update checks that changes made to a stored
// command (status and result) are persisted by Update and visible through a
// subsequent FindByID.
func TestAgentCommandRepository_Update(t *testing.T) {
	db := newTestDB(t)
	repo := NewAgentCommandRepository(db)
	ctx := context.Background()

	cmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusPending}
	// A failed insert would make every later assertion meaningless, so stop here.
	if err := repo.Create(ctx, cmd); err != nil {
		t.Fatalf("create: %v", err)
	}

	cmd.Status = model.AgentCommandStatusSucceeded
	cmd.Result = `{"ok":true}`
	now := time.Now().UTC()
	cmd.CompletedAt = &now
	if err := repo.Update(ctx, cmd); err != nil {
		t.Fatal(err)
	}

	got, err := repo.FindByID(ctx, cmd.ID)
	if err != nil {
		t.Fatalf("find: %v", err)
	}
	// Reported separately from the error case so the failure message is not
	// just "<nil>" when the lookup succeeds but returns no row.
	if got == nil {
		t.Fatalf("command %v not found after update", cmd.ID)
	}
	if got.Status != model.AgentCommandStatusSucceeded || got.Result != `{"ok":true}` {
		t.Errorf("mismatch: %+v", got)
	}
}
// TestAgentCommandRepository_MarkStaleTimeout checks that MarkStaleTimeout,
// given a cutoff 30 minutes in the past, reports one affected row, marks the
// command dispatched an hour ago as timed out, and leaves the command
// dispatched just now in the dispatched state.
func TestAgentCommandRepository_MarkStaleTimeout(t *testing.T) {
	db := newTestDB(t)
	repo := NewAgentCommandRepository(db)
	ctx := context.Background()

	old := time.Now().Add(-time.Hour)
	recent := time.Now()

	// Two dispatched commands: one older than the cutoff, one newer.
	oldCmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusDispatched, DispatchedAt: &old}
	newCmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusDispatched, DispatchedAt: &recent}
	for _, c := range []*model.AgentCommand{oldCmd, newCmd} {
		if err := repo.Create(ctx, c); err != nil {
			t.Fatalf("create: %v", err)
		}
	}

	n, err := repo.MarkStaleTimeout(ctx, time.Now().Add(-30*time.Minute))
	if err != nil {
		t.Fatal(err)
	}
	if n != 1 {
		t.Errorf("expected 1 row, got %d", n)
	}

	// Check lookups explicitly: dereferencing a nil result would panic and
	// hide the real cause of the failure.
	oldGot, err := repo.FindByID(ctx, oldCmd.ID)
	if err != nil || oldGot == nil {
		t.Fatalf("find old command: %+v %v", oldGot, err)
	}
	newGot, err := repo.FindByID(ctx, newCmd.ID)
	if err != nil || newGot == nil {
		t.Fatalf("find new command: %+v %v", newGot, err)
	}

	if oldGot.Status != model.AgentCommandStatusTimeout {
		t.Errorf("old should be timeout: %+v", oldGot)
	}
	if newGot.Status != model.AgentCommandStatusDispatched {
		t.Errorf("new should stay dispatched: %+v", newGot)
	}
}