mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-06-02 16:29:40 +08:00
功能: 修复并实现多节点集群部署 (#38)
基础修复:
- 新增节点离线检测:每 15s 扫描,超 45s 未心跳的远程节点自动置离线
- 节点删除前检查关联任务,避免孤立备份任务
- BackupTaskRepository 新增 CountByNodeID/ListByNodeID

Master 端 Agent 协议:
- 新增 AgentCommand 模型与命令队列仓储(pending/dispatched/succeeded/failed/timeout)
- 新增 AgentService:任务下发、命令轮询、结果回收、超时扫描
- 新增专用 Agent HTTP API(X-Agent-Token 认证):
  /api/agent/heartbeat
  /api/agent/commands/poll
  /api/agent/commands/:id/result
  /api/agent/tasks/:id
  /api/agent/records/:id
- BackupExecutionService 支持 node 路由:task.NodeID 指向远程节点时自动入队派发

Agent CLI(backupx agent 子命令):
- 配置:YAML 文件 / 环境变量 / CLI 参数,优先级 CLI > 文件 > 环境
- 心跳循环 + 命令轮询循环 + 优雅退出
- 本地复用 BackupRunner 与 storage registry 执行备份并直接上传
- 支持 run_task 和 list_dir 两种命令

远程目录浏览:
- NodeService 支持通过 Agent RPC 列出远程节点目录(15s 超时)

前端:
- NodesPage 添加节点后展示 Agent 启动命令和环境变量配置

文档:
- README 中英文重写"多节点集群"章节,含架构图、步骤、限制、CLI 参考
This commit is contained in:
101
server/internal/repository/agent_command_repository.go
Normal file
101
server/internal/repository/agent_command_repository.go
Normal file
@@ -0,0 +1,101 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
"backupx/server/internal/model"
|
||||
"gorm.io/gorm"
|
||||
)
|
||||
|
||||
// AgentCommandRepository 维护 Agent 命令队列。
|
||||
type AgentCommandRepository interface {
|
||||
Create(ctx context.Context, cmd *model.AgentCommand) error
|
||||
FindByID(ctx context.Context, id uint) (*model.AgentCommand, error)
|
||||
// ClaimPending 以原子方式把该节点一条 pending 命令置为 dispatched,
|
||||
// 并返回领取到的命令。无命令时返回 (nil, nil)。
|
||||
ClaimPending(ctx context.Context, nodeID uint) (*model.AgentCommand, error)
|
||||
Update(ctx context.Context, cmd *model.AgentCommand) error
|
||||
// MarkStaleTimeout 把 dispatched 状态但超时未完成的命令标记为 timeout。
|
||||
MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error)
|
||||
}
|
||||
|
||||
type GormAgentCommandRepository struct {
|
||||
db *gorm.DB
|
||||
}
|
||||
|
||||
func NewAgentCommandRepository(db *gorm.DB) *GormAgentCommandRepository {
|
||||
return &GormAgentCommandRepository{db: db}
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) Create(ctx context.Context, cmd *model.AgentCommand) error {
|
||||
return r.db.WithContext(ctx).Create(cmd).Error
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) FindByID(ctx context.Context, id uint) (*model.AgentCommand, error) {
|
||||
var item model.AgentCommand
|
||||
if err := r.db.WithContext(ctx).First(&item, id).Error; err != nil {
|
||||
if errors.Is(err, gorm.ErrRecordNotFound) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
// ClaimPending 使用 UPDATE...WHERE id=(SELECT...) 的两步方式实现原子领取。
|
||||
// SQLite 不支持 SELECT FOR UPDATE,这里用事务 + 乐观锁。
|
||||
func (r *GormAgentCommandRepository) ClaimPending(ctx context.Context, nodeID uint) (*model.AgentCommand, error) {
|
||||
var claimed *model.AgentCommand
|
||||
err := r.db.WithContext(ctx).Transaction(func(tx *gorm.DB) error {
|
||||
var item model.AgentCommand
|
||||
err := tx.Where("node_id = ? AND status = ?", nodeID, model.AgentCommandStatusPending).
|
||||
Order("id asc").First(&item).Error
|
||||
if err != nil {
|
||||
if errors.Is(err, gorm.ErrRecordNotFound) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
now := time.Now().UTC()
|
||||
result := tx.Model(&model.AgentCommand{}).
|
||||
Where("id = ? AND status = ?", item.ID, model.AgentCommandStatusPending).
|
||||
Updates(map[string]any{
|
||||
"status": model.AgentCommandStatusDispatched,
|
||||
"dispatched_at": &now,
|
||||
})
|
||||
if result.Error != nil {
|
||||
return result.Error
|
||||
}
|
||||
if result.RowsAffected == 0 {
|
||||
// 被其它 worker 抢占,放弃
|
||||
return nil
|
||||
}
|
||||
item.Status = model.AgentCommandStatusDispatched
|
||||
item.DispatchedAt = &now
|
||||
claimed = &item
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return claimed, nil
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) Update(ctx context.Context, cmd *model.AgentCommand) error {
|
||||
return r.db.WithContext(ctx).Save(cmd).Error
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error) {
|
||||
result := r.db.WithContext(ctx).Model(&model.AgentCommand{}).
|
||||
Where("status = ? AND dispatched_at < ?", model.AgentCommandStatusDispatched, threshold).
|
||||
Updates(map[string]any{
|
||||
"status": model.AgentCommandStatusTimeout,
|
||||
"error_message": "agent did not report result before timeout",
|
||||
})
|
||||
if result.Error != nil {
|
||||
return 0, result.Error
|
||||
}
|
||||
return result.RowsAffected, nil
|
||||
}
|
||||
120
server/internal/repository/agent_command_repository_test.go
Normal file
120
server/internal/repository/agent_command_repository_test.go
Normal file
@@ -0,0 +1,120 @@
|
||||
package repository
|
||||
|
||||
import (
|
||||
"context"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"backupx/server/internal/model"
|
||||
"github.com/glebarez/sqlite"
|
||||
"gorm.io/gorm"
|
||||
gormlogger "gorm.io/gorm/logger"
|
||||
)
|
||||
|
||||
func newTestDB(t *testing.T) *gorm.DB {
|
||||
t.Helper()
|
||||
db, err := gorm.Open(sqlite.Open(":memory:"), &gorm.Config{Logger: gormlogger.Default.LogMode(gormlogger.Silent)})
|
||||
if err != nil {
|
||||
t.Fatalf("open: %v", err)
|
||||
}
|
||||
if err := db.AutoMigrate(&model.AgentCommand{}); err != nil {
|
||||
t.Fatalf("migrate: %v", err)
|
||||
}
|
||||
return db
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_ClaimPending(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
|
||||
// 插入两条 pending 命令
|
||||
c1 := &model.AgentCommand{NodeID: 5, Type: "run_task", Status: model.AgentCommandStatusPending, Payload: `{"taskId":1}`}
|
||||
c2 := &model.AgentCommand{NodeID: 5, Type: "list_dir", Status: model.AgentCommandStatusPending, Payload: `{"path":"/"}`}
|
||||
c3 := &model.AgentCommand{NodeID: 99, Type: "run_task", Status: model.AgentCommandStatusPending}
|
||||
for _, c := range []*model.AgentCommand{c1, c2, c3} {
|
||||
if err := repo.Create(ctx, c); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
|
||||
// 第一次 Claim 应拿到 c1
|
||||
claimed, err := repo.ClaimPending(ctx, 5)
|
||||
if err != nil {
|
||||
t.Fatalf("claim: %v", err)
|
||||
}
|
||||
if claimed == nil || claimed.ID != c1.ID || claimed.Status != model.AgentCommandStatusDispatched {
|
||||
t.Fatalf("expected c1 dispatched: %+v", claimed)
|
||||
}
|
||||
|
||||
// 第二次应拿到 c2
|
||||
claimed2, err := repo.ClaimPending(ctx, 5)
|
||||
if err != nil || claimed2 == nil || claimed2.ID != c2.ID {
|
||||
t.Fatalf("expected c2: %+v %v", claimed2, err)
|
||||
}
|
||||
|
||||
// 第三次无 pending,返回 nil
|
||||
claimed3, err := repo.ClaimPending(ctx, 5)
|
||||
if err != nil || claimed3 != nil {
|
||||
t.Fatalf("expected nil, got %+v", claimed3)
|
||||
}
|
||||
|
||||
// 不同 node 的命令不应被抢到
|
||||
other, err := repo.ClaimPending(ctx, 5)
|
||||
if err != nil || other != nil {
|
||||
t.Fatalf("expected nil: %+v", other)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_Update(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
cmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusPending}
|
||||
_ = repo.Create(ctx, cmd)
|
||||
|
||||
cmd.Status = model.AgentCommandStatusSucceeded
|
||||
cmd.Result = `{"ok":true}`
|
||||
now := time.Now().UTC()
|
||||
cmd.CompletedAt = &now
|
||||
if err := repo.Update(ctx, cmd); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
got, err := repo.FindByID(ctx, cmd.ID)
|
||||
if err != nil || got == nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if got.Status != model.AgentCommandStatusSucceeded || got.Result != `{"ok":true}` {
|
||||
t.Errorf("mismatch: %+v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_MarkStaleTimeout(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
old := time.Now().Add(-time.Hour)
|
||||
recent := time.Now()
|
||||
// 两条 dispatched:一条旧、一条新
|
||||
oldCmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusDispatched, DispatchedAt: &old}
|
||||
newCmd := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusDispatched, DispatchedAt: &recent}
|
||||
_ = repo.Create(ctx, oldCmd)
|
||||
_ = repo.Create(ctx, newCmd)
|
||||
|
||||
n, err := repo.MarkStaleTimeout(ctx, time.Now().Add(-30*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if n != 1 {
|
||||
t.Errorf("expected 1 row, got %d", n)
|
||||
}
|
||||
oldGot, _ := repo.FindByID(ctx, oldCmd.ID)
|
||||
newGot, _ := repo.FindByID(ctx, newCmd.ID)
|
||||
if oldGot.Status != model.AgentCommandStatusTimeout {
|
||||
t.Errorf("old should be timeout: %+v", oldGot)
|
||||
}
|
||||
if newGot.Status != model.AgentCommandStatusDispatched {
|
||||
t.Errorf("new should stay dispatched: %+v", newGot)
|
||||
}
|
||||
}
|
||||
@@ -21,6 +21,8 @@ type BackupTaskRepository interface {
|
||||
Count(context.Context) (int64, error)
|
||||
CountEnabled(context.Context) (int64, error)
|
||||
CountByStorageTargetID(context.Context, uint) (int64, error)
|
||||
CountByNodeID(context.Context, uint) (int64, error)
|
||||
ListByNodeID(context.Context, uint) ([]model.BackupTask, error)
|
||||
Create(context.Context, *model.BackupTask) error
|
||||
Update(context.Context, *model.BackupTask) error
|
||||
Delete(context.Context, uint) error
|
||||
@@ -103,6 +105,24 @@ func (r *GormBackupTaskRepository) CountByStorageTargetID(ctx context.Context, s
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// CountByNodeID 统计绑定到指定节点的任务数。用于删除节点前的引用检查。
|
||||
func (r *GormBackupTaskRepository) CountByNodeID(ctx context.Context, nodeID uint) (int64, error) {
|
||||
var count int64
|
||||
if err := r.db.WithContext(ctx).Model(&model.BackupTask{}).Where("node_id = ?", nodeID).Count(&count).Error; err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return count, nil
|
||||
}
|
||||
|
||||
// ListByNodeID 列出绑定到指定节点的任务。用于 Agent 拉取本节点待执行任务。
|
||||
func (r *GormBackupTaskRepository) ListByNodeID(ctx context.Context, nodeID uint) ([]model.BackupTask, error) {
|
||||
var items []model.BackupTask
|
||||
if err := r.db.WithContext(ctx).Preload("StorageTarget").Preload("StorageTargets").Where("node_id = ?", nodeID).Order("id asc").Find(&items).Error; err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (r *GormBackupTaskRepository) Create(ctx context.Context, item *model.BackupTask) error {
|
||||
if err := r.db.WithContext(ctx).Create(item).Error; err != nil {
|
||||
return err
|
||||
|
||||
@@ -3,6 +3,7 @@ package repository
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"time"
|
||||
|
||||
"backupx/server/internal/model"
|
||||
"gorm.io/gorm"
|
||||
@@ -16,6 +17,7 @@ type NodeRepository interface {
|
||||
Create(context.Context, *model.Node) error
|
||||
Update(context.Context, *model.Node) error
|
||||
Delete(context.Context, uint) error
|
||||
MarkStaleOffline(ctx context.Context, threshold time.Time) (int64, error)
|
||||
}
|
||||
|
||||
type GormNodeRepository struct {
|
||||
@@ -78,3 +80,16 @@ func (r *GormNodeRepository) Update(ctx context.Context, item *model.Node) error
|
||||
func (r *GormNodeRepository) Delete(ctx context.Context, id uint) error {
|
||||
return r.db.WithContext(ctx).Delete(&model.Node{}, id).Error
|
||||
}
|
||||
|
||||
// MarkStaleOffline 把最近心跳早于 threshold 的在线远程节点标记为离线。
|
||||
// 本机节点 (is_local=true) 不受影响,由主程序自己维护 online 状态。
|
||||
// 返回受影响行数。
|
||||
func (r *GormNodeRepository) MarkStaleOffline(ctx context.Context, threshold time.Time) (int64, error) {
|
||||
result := r.db.WithContext(ctx).Model(&model.Node{}).
|
||||
Where("is_local = ? AND status = ? AND last_seen < ?", false, model.NodeStatusOnline, threshold).
|
||||
Update("status", model.NodeStatusOffline)
|
||||
if result.Error != nil {
|
||||
return 0, result.Error
|
||||
}
|
||||
return result.RowsAffected, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user