Files
BackupX/server/internal/repository/agent_command_repository.go
Wu Qing 757b0fa5ed 功能: 修复并实现多节点集群部署 (#38)
基础修复:
- 新增节点离线检测:每 15s 扫描,超 45s 未心跳的远程节点自动置离线
- 节点删除前检查关联任务,避免孤立备份任务
- BackupTaskRepository 新增 CountByNodeID/ListByNodeID

Master 端 Agent 协议:
- 新增 AgentCommand 模型与命令队列仓储(pending/dispatched/succeeded/failed/timeout)
- 新增 AgentService:任务下发、命令轮询、结果回收、超时扫描
- 新增专用 Agent HTTP API(X-Agent-Token 认证):
  /api/agent/heartbeat
  /api/agent/commands/poll
  /api/agent/commands/:id/result
  /api/agent/tasks/:id
  /api/agent/records/:id
- BackupExecutionService 支持 node 路由:task.NodeID 指向远程节点时自动入队派发

Agent CLI(backupx agent 子命令):
- 配置:YAML 文件 / 环境变量 / CLI 参数,优先级 CLI > 文件 > 环境
- 心跳循环 + 命令轮询循环 + 优雅退出
- 本地复用 BackupRunner 与 storage registry 执行备份并直接上传
- 支持 run_task 和 list_dir 两种命令

远程目录浏览:
- NodeService 支持通过 Agent RPC 列出远程节点目录(15s 超时)

前端:
- NodesPage 添加节点后展示 Agent 启动命令和环境变量配置

文档:
- README 中英文重写"多节点集群"章节,含架构图、步骤、限制、CLI 参考
2026-04-17 12:29:08 +08:00

102 lines
3.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package repository
import (
"context"
"errors"
"time"
"backupx/server/internal/model"
"gorm.io/gorm"
)
// AgentCommandRepository maintains the queue of commands addressed to agents.
type AgentCommandRepository interface {
	// Create stores cmd as a new command.
	Create(ctx context.Context, cmd *model.AgentCommand) error

	// FindByID looks up a single command by its id.
	FindByID(ctx context.Context, id uint) (*model.AgentCommand, error)

	// ClaimPending atomically switches one pending command of the given node
	// to dispatched and returns the claimed command. It returns (nil, nil)
	// when there is no command to claim.
	ClaimPending(ctx context.Context, nodeID uint) (*model.AgentCommand, error)

	// Update writes the current state of cmd back to the store.
	Update(ctx context.Context, cmd *model.AgentCommand) error

	// MarkStaleTimeout marks commands that are still in the dispatched state
	// but did not complete in time as timed out, and reports how many
	// commands were marked.
	MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error)
}
// GormAgentCommandRepository is the AgentCommandRepository implementation
// that runs its queries through a GORM database handle.
type GormAgentCommandRepository struct {
	// db is the handle every method of this repository issues its queries on.
	db *gorm.DB
}
// NewAgentCommandRepository returns a GormAgentCommandRepository that issues
// its queries through db.
func NewAgentCommandRepository(db *gorm.DB) *GormAgentCommandRepository {
	repo := new(GormAgentCommandRepository)
	repo.db = db
	return repo
}
// Create inserts cmd through GORM using a session bound to ctx and returns
// the error reported by that insert, if any.
func (r *GormAgentCommandRepository) Create(ctx context.Context, cmd *model.AgentCommand) error {
	result := r.db.WithContext(ctx).Create(cmd)
	return result.Error
}
// FindByID loads the command identified by id.
//
// A missing row is not treated as an error: when GORM reports
// gorm.ErrRecordNotFound the method returns (nil, nil). Any other error is
// returned unchanged together with a nil command.
func (r *GormAgentCommandRepository) FindByID(ctx context.Context, id uint) (*model.AgentCommand, error) {
	found := new(model.AgentCommand)
	err := r.db.WithContext(ctx).First(found, id).Error
	switch {
	case err == nil:
		return found, nil
	case errors.Is(err, gorm.ErrRecordNotFound):
		return nil, nil
	default:
		return nil, err
	}
}
// ClaimPending claims the pending command with the lowest id for nodeID.
//
// The claim is done in two steps inside a single transaction: first the
// candidate row is selected, then it is updated to dispatched with a WHERE
// clause that re-checks it is still pending. SQLite does not support
// SELECT ... FOR UPDATE, so this conditional update (optimistic locking) is
// used instead of a row lock.
//
// It returns (nil, nil) when the node has no pending command, and also when
// the selected command was taken by another worker before the update ran.
// On success the returned command already carries the dispatched status and
// the UTC dispatch time that were written to the database.
func (r *GormAgentCommandRepository) ClaimPending(ctx context.Context, nodeID uint) (*model.AgentCommand, error) {
	var claimed *model.AgentCommand

	claim := func(tx *gorm.DB) error {
		// Step 1: pick the oldest pending command for this node.
		var candidate model.AgentCommand
		lookupErr := tx.
			Where("node_id = ? AND status = ?", nodeID, model.AgentCommandStatusPending).
			Order("id asc").
			First(&candidate).Error
		switch {
		case errors.Is(lookupErr, gorm.ErrRecordNotFound):
			// Nothing queued; leave claimed nil.
			return nil
		case lookupErr != nil:
			return lookupErr
		}

		// Step 2: flip it to dispatched only if it is still pending.
		dispatchedAt := time.Now().UTC()
		update := tx.Model(&model.AgentCommand{}).
			Where("id = ? AND status = ?", candidate.ID, model.AgentCommandStatusPending).
			Updates(map[string]any{
				"status":        model.AgentCommandStatusDispatched,
				"dispatched_at": &dispatchedAt,
			})
		if update.Error != nil {
			return update.Error
		}
		if update.RowsAffected == 0 {
			// Another worker claimed the row first; give up without an error.
			return nil
		}

		// Mirror the persisted changes on the in-memory copy handed back.
		candidate.Status = model.AgentCommandStatusDispatched
		candidate.DispatchedAt = &dispatchedAt
		claimed = &candidate
		return nil
	}

	if err := r.db.WithContext(ctx).Transaction(claim); err != nil {
		return nil, err
	}
	return claimed, nil
}
// Update persists cmd with GORM's Save using a session bound to ctx and
// returns the error reported by that call, if any.
func (r *GormAgentCommandRepository) Update(ctx context.Context, cmd *model.AgentCommand) error {
	result := r.db.WithContext(ctx).Save(cmd)
	return result.Error
}
// MarkStaleTimeout moves every command that is in the dispatched state and
// whose dispatched_at is earlier than threshold to the timeout status,
// recording a fixed error message on each of them.
//
// It returns the number of rows the update affected, or 0 and the error if
// the update failed.
func (r *GormAgentCommandRepository) MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error) {
	changes := map[string]any{
		"status":        model.AgentCommandStatusTimeout,
		"error_message": "agent did not report result before timeout",
	}

	res := r.db.WithContext(ctx).
		Model(&model.AgentCommand{}).
		Where("status = ? AND dispatched_at < ?", model.AgentCommandStatusDispatched, threshold).
		Updates(changes)
	if err := res.Error; err != nil {
		return 0, err
	}
	return res.RowsAffected, nil
}