Files
BackupX/server/internal/service/agent_service.go
Wu Qing 757b0fa5ed 功能: 修复并实现多节点集群部署 (#38)
基础修复:
- 新增节点离线检测:每 15s 扫描,超 45s 未心跳的远程节点自动置离线
- 节点删除前检查关联任务,避免孤立备份任务
- BackupTaskRepository 新增 CountByNodeID/ListByNodeID

Master 端 Agent 协议:
- 新增 AgentCommand 模型与命令队列仓储(pending/dispatched/succeeded/failed/timeout)
- 新增 AgentService:任务下发、命令轮询、结果回收、超时扫描
- 新增专用 Agent HTTP API(X-Agent-Token 认证):
  /api/agent/heartbeat
  /api/agent/commands/poll
  /api/agent/commands/:id/result
  /api/agent/tasks/:id
  /api/agent/records/:id
- BackupExecutionService 支持 node 路由:task.NodeID 指向远程节点时自动入队派发

Agent CLI(backupx agent 子命令):
- 配置:YAML 文件 / 环境变量 / CLI 参数,优先级 CLI > 文件 > 环境
- 心跳循环 + 命令轮询循环 + 优雅退出
- 本地复用 BackupRunner 与 storage registry 执行备份并直接上传
- 支持 run_task 和 list_dir 两种命令

远程目录浏览:
- NodeService 支持通过 Agent RPC 列出远程节点目录(15s 超时)

前端:
- NodesPage 添加节点后展示 Agent 启动命令和环境变量配置

文档:
- README 中英文重写"多节点集群"章节,含架构图、步骤、限制、CLI 参考
2026-04-17 12:29:08 +08:00

349 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"backupx/server/internal/apperror"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage/codec"
)
// AgentService implements the master-side Agent protocol that remote agents
// call over HTTP.
// All methods authenticate the node with its Agent Token, which avoids
// exposing JWTs to agents.
type AgentService struct {
	// nodeRepo looks up nodes (used to resolve a node from its token).
	nodeRepo repository.NodeRepository
	// taskRepo loads and updates backup tasks.
	taskRepo repository.BackupTaskRepository
	// recordRepo loads and updates backup records.
	recordRepo repository.BackupRecordRepository
	// storageRepo loads the storage targets referenced by a task.
	storageRepo repository.StorageTargetRepository
	// cmdRepo is the agent command queue (create, claim, find, update, expire).
	cmdRepo repository.AgentCommandRepository
	// cipher decrypts stored DB passwords and storage target configs.
	cipher *codec.ConfigCipher
}
// NewAgentService builds an AgentService from the given repositories and
// config cipher. The arguments are stored as-is; nothing is validated here.
func NewAgentService(
	nodeRepo repository.NodeRepository,
	taskRepo repository.BackupTaskRepository,
	recordRepo repository.BackupRecordRepository,
	storageRepo repository.StorageTargetRepository,
	cmdRepo repository.AgentCommandRepository,
	cipher *codec.ConfigCipher,
) *AgentService {
	svc := new(AgentService)
	svc.nodeRepo = nodeRepo
	svc.taskRepo = taskRepo
	svc.recordRepo = recordRepo
	svc.storageRepo = storageRepo
	svc.cmdRepo = cmdRepo
	svc.cipher = cipher
	return svc
}
// AuthenticatedNode resolves the node that owns the given agent token.
//
// A token that is empty or whitespace-only, or one that matches no node,
// yields an Unauthorized error with code NODE_INVALID_TOKEN. Errors from the
// node repository are returned unchanged. The lookup itself uses the token
// exactly as passed in (it is not trimmed).
func (s *AgentService) AuthenticatedNode(ctx context.Context, token string) (*model.Node, error) {
	if len(strings.TrimSpace(token)) == 0 {
		return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "缺少认证令牌", nil)
	}

	found, lookupErr := s.nodeRepo.FindByToken(ctx, token)
	switch {
	case lookupErr != nil:
		return nil, lookupErr
	case found == nil:
		return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "无效的节点认证令牌", nil)
	}
	return found, nil
}
// AgentCommandPayload describes a command as it is returned to an agent.
type AgentCommandPayload struct {
	// ID is the ID of the queued command.
	ID uint `json:"id"`
	// Type is the command type string.
	Type string `json:"type"`
	// Payload is the command's stored payload passed through as raw JSON;
	// it is omitted from the output when empty.
	Payload json.RawMessage `json:"payload,omitempty"`
}
// PollCommand claims one pending command for the given node and returns it in
// the shape sent to the agent. When the repository has nothing to hand out the
// result is (nil, nil); repository errors are returned unchanged.
func (s *AgentService) PollCommand(ctx context.Context, node *model.Node) (*AgentCommandPayload, error) {
	claimed, err := s.cmdRepo.ClaimPending(ctx, node.ID)
	if err != nil || claimed == nil {
		// Either a repository failure, or no command is waiting (err is nil).
		return nil, err
	}

	out := new(AgentCommandPayload)
	out.ID = claimed.ID
	out.Type = claimed.Type
	out.Payload = json.RawMessage(claimed.Payload)
	return out, nil
}
// AgentCommandResult is the execution result an agent reports for a command.
type AgentCommandResult struct {
	// Success selects the final command status: succeeded when true, failed
	// otherwise.
	Success bool `json:"success"`
	// ErrorMessage is stored on the command as-is.
	ErrorMessage string `json:"errorMessage,omitempty"`
	// Result is an optional raw JSON result; it is stored only when non-empty.
	Result json.RawMessage `json:"result,omitempty"`
}
// SubmitCommandResult records the outcome an agent reports for a command.
//
// The command must exist (otherwise a 404 AGENT_COMMAND_NOT_FOUND error is
// returned) and must belong to the calling node (otherwise an Unauthorized
// AGENT_COMMAND_FORBIDDEN error is returned). The command's status, error
// message, optional result and completion time (UTC) are then persisted.
func (s *AgentService) SubmitCommandResult(ctx context.Context, node *model.Node, cmdID uint, result AgentCommandResult) error {
	stored, err := s.cmdRepo.FindByID(ctx, cmdID)
	switch {
	case err != nil:
		return err
	case stored == nil:
		return apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", fmt.Errorf("command %d not found", cmdID))
	case stored.NodeID != node.ID:
		return apperror.Unauthorized("AGENT_COMMAND_FORBIDDEN", "命令不属于当前节点", nil)
	}

	// Default to failed; promote to succeeded only on an explicit success.
	stored.Status = model.AgentCommandStatusFailed
	if result.Success {
		stored.Status = model.AgentCommandStatusSucceeded
	}
	stored.ErrorMessage = result.ErrorMessage
	// An empty result leaves whatever is already stored untouched.
	if len(result.Result) != 0 {
		stored.Result = string(result.Result)
	}
	finishedAt := time.Now().UTC()
	stored.CompletedAt = &finishedAt

	return s.cmdRepo.Update(ctx, stored)
}
// AgentTaskSpec is the task specification returned to an agent. It includes
// decrypted storage configuration so the agent can execute the task directly.
// Sensitive: this data is intended only for agent calls (token-authenticated),
// so that it is not leaked through the public API.
type AgentTaskSpec struct {
	// Identity and kind of the backup task.
	TaskID uint `json:"taskId"`
	Name string `json:"name"`
	Type string `json:"type"`
	// Source selection, copied verbatim from the task. The string encoding of
	// SourcePaths and ExcludePatterns is not visible here.
	SourcePath string `json:"sourcePath,omitempty"`
	SourcePaths string `json:"sourcePaths,omitempty"`
	ExcludePatterns string `json:"excludePatterns,omitempty"`
	// Database settings copied from the task. DBPassword is plaintext:
	// GetTaskSpec decrypts the task's stored password ciphertext into it.
	DBHost string `json:"dbHost,omitempty"`
	DBPort int `json:"dbPort,omitempty"`
	DBUser string `json:"dbUser,omitempty"`
	DBPassword string `json:"dbPassword,omitempty"`
	DBName string `json:"dbName,omitempty"`
	DBPath string `json:"dbPath,omitempty"`
	// ExtraConfig is copied verbatim from the task.
	ExtraConfig string `json:"extraConfig,omitempty"`
	// Compression and Encrypt mirror the task's settings.
	Compression string `json:"compression"`
	Encrypt bool `json:"encrypt"`
	// StorageTargets lists the task's storage targets with decrypted config.
	StorageTargets []AgentStorageTargetConfig `json:"storageTargets"`
}
// AgentStorageTargetConfig is a storage target's configuration, already
// decrypted.
type AgentStorageTargetConfig struct {
	// ID, Type and Name are copied from the storage target.
	ID uint `json:"id"`
	Type string `json:"type"`
	Name string `json:"name"`
	// Config is the decrypted target configuration, passed through as raw JSON.
	Config json.RawMessage `json:"config"`
}
// GetTaskSpec returns everything an agent needs to execute the given task.
//
// The task must exist (otherwise a 404 BACKUP_TASK_NOT_FOUND error is
// returned) and must be assigned to the calling node (otherwise an
// Unauthorized BACKUP_TASK_FORBIDDEN error is returned). The stored DB
// password and each storage target's config are decrypted into the result.
// Storage targets that the repository no longer finds are skipped silently.
// Repository errors are returned unchanged; decryption errors are wrapped.
func (s *AgentService) GetTaskSpec(ctx context.Context, node *model.Node, taskID uint) (*AgentTaskSpec, error) {
	task, err := s.taskRepo.FindByID(ctx, taskID)
	switch {
	case err != nil:
		return nil, err
	case task == nil:
		return nil, apperror.New(404, "BACKUP_TASK_NOT_FOUND", "任务不存在", nil)
	case task.NodeID != node.ID:
		return nil, apperror.Unauthorized("BACKUP_TASK_FORBIDDEN", "任务不属于当前节点", nil)
	}

	// Decrypt the database password, if the task has one.
	var plainPassword string
	if ciphertext := task.DBPasswordCiphertext; ciphertext != "" {
		decrypted, decErr := s.cipher.Decrypt(ciphertext)
		if decErr != nil {
			return nil, fmt.Errorf("decrypt db password: %w", decErr)
		}
		plainPassword = string(decrypted)
	}

	// Resolve and decrypt every storage target referenced by the task.
	targetIDs := collectTargetIDs(task)
	resolved := make([]AgentStorageTargetConfig, 0, len(targetIDs))
	for _, id := range targetIDs {
		target, findErr := s.storageRepo.FindByID(ctx, id)
		if findErr != nil {
			return nil, findErr
		}
		if target == nil {
			continue
		}
		plainConfig, decErr := s.cipher.Decrypt(target.ConfigCiphertext)
		if decErr != nil {
			return nil, fmt.Errorf("decrypt storage config: %w", decErr)
		}
		entry := AgentStorageTargetConfig{
			ID:     target.ID,
			Type:   target.Type,
			Name:   target.Name,
			Config: json.RawMessage(plainConfig),
		}
		resolved = append(resolved, entry)
	}

	spec := &AgentTaskSpec{
		TaskID:          task.ID,
		Name:            task.Name,
		Type:            task.Type,
		SourcePath:      task.SourcePath,
		SourcePaths:     task.SourcePaths,
		ExcludePatterns: task.ExcludePatterns,
		DBHost:          task.DBHost,
		DBPort:          task.DBPort,
		DBUser:          task.DBUser,
		DBPassword:      plainPassword,
		DBName:          task.DBName,
		DBPath:          task.DBPath,
		ExtraConfig:     task.ExtraConfig,
		Compression:     task.Compression,
		Encrypt:         task.Encrypt,
		StorageTargets:  resolved,
	}
	return spec, nil
}
// AgentRecordUpdate is the payload an agent sends to report the state of a
// backup record. UpdateRecord ignores string fields that are empty and a
// FileSize that is not positive, so a partial update leaves the other stored
// values untouched.
type AgentRecordUpdate struct {
	Status string `json:"status"` // running | success | failed
	// FileName, FileSize, Checksum and StoragePath describe the produced
	// backup artifact.
	FileName string `json:"fileName,omitempty"`
	FileSize int64 `json:"fileSize,omitempty"`
	Checksum string `json:"checksum,omitempty"`
	StoragePath string `json:"storagePath,omitempty"`
	// ErrorMessage replaces the record's error message when non-empty.
	ErrorMessage string `json:"errorMessage,omitempty"`
	LogAppend string `json:"logAppend,omitempty"` // incremental log text, appended to record.log_content
}
// UpdateRecord applies an agent-reported update (status, artifact details,
// log text) to a backup record. An agent may call it repeatedly while a task
// is running.
//
// The record must exist (otherwise a 404 BACKUP_RECORD_NOT_FOUND error is
// returned) and its task must be assigned to the calling node (otherwise an
// Unauthorized BACKUP_RECORD_FORBIDDEN error is returned). Empty string fields
// and a non-positive FileSize are ignored. When the reported status is success
// or failed, the completion time (UTC) and duration are filled in and the
// task's last status is updated as well.
func (s *AgentService) UpdateRecord(ctx context.Context, node *model.Node, recordID uint, update AgentRecordUpdate) error {
	record, err := s.recordRepo.FindByID(ctx, recordID)
	if err != nil {
		return err
	}
	if record == nil {
		return apperror.New(404, "BACKUP_RECORD_NOT_FOUND", "记录不存在", nil)
	}

	// Ownership is decided through the record's task: it must be assigned to
	// the calling node.
	task, err := s.taskRepo.FindByID(ctx, record.TaskID)
	if err != nil {
		return err
	}
	if ownedByNode := task != nil && task.NodeID == node.ID; !ownedByNode {
		return apperror.Unauthorized("BACKUP_RECORD_FORBIDDEN", "记录不属于当前节点", nil)
	}

	// overwrite copies src into dst only when src carries a value.
	overwrite := func(dst *string, src string) {
		if src != "" {
			*dst = src
		}
	}
	overwrite(&record.Status, update.Status)
	overwrite(&record.FileName, update.FileName)
	if update.FileSize > 0 {
		record.FileSize = update.FileSize
	}
	overwrite(&record.Checksum, update.Checksum)
	overwrite(&record.StoragePath, update.StoragePath)
	overwrite(&record.ErrorMessage, update.ErrorMessage)
	if update.LogAppend != "" {
		record.LogContent += update.LogAppend
	}

	finished := update.Status == model.BackupRecordStatusSuccess || update.Status == model.BackupRecordStatusFailed
	if finished {
		completedAt := time.Now().UTC()
		record.CompletedAt = &completedAt
		record.DurationSeconds = int(completedAt.Sub(record.StartedAt).Seconds())
	}

	if saveErr := s.recordRepo.Update(ctx, record); saveErr != nil {
		return saveErr
	}

	// Mirror the final status onto the task's last_status. The outcome of
	// this task update is discarded: the record itself is already saved.
	if finished {
		task.LastStatus = update.Status
		_ = s.taskRepo.Update(ctx, task)
	}
	return nil
}
// EnqueueCommand is called on the master side to queue a pending command for
// the given node. The payload is JSON-encoded and stored with the command.
//
// It returns the ID of the created command. A nodeID of 0 is rejected, a
// payload that cannot be marshalled yields a wrapped error, and repository
// errors are returned unchanged.
func (s *AgentService) EnqueueCommand(ctx context.Context, nodeID uint, cmdType string, payload any) (uint, error) {
	if nodeID == 0 {
		return 0, errors.New("nodeID is required")
	}

	encoded, marshalErr := json.Marshal(payload)
	if marshalErr != nil {
		return 0, fmt.Errorf("marshal payload: %w", marshalErr)
	}

	queued := model.AgentCommand{
		NodeID:  nodeID,
		Type:    cmdType,
		Status:  model.AgentCommandStatusPending,
		Payload: string(encoded),
	}
	if createErr := s.cmdRepo.Create(ctx, &queued); createErr != nil {
		return 0, createErr
	}
	return queued.ID, nil
}
// WaitForCommandResult blocks until the given command reaches a terminal
// status (succeeded, failed or timeout) and then returns it. It is meant for
// RPC-style calls such as list_dir.
//
// The command is re-read from the repository every 300ms. A timeout of 0 (or
// less) means no time limit, in which case the call ends only on a terminal
// status, an error, or ctx cancellation; 10-30s is the suggested range.
//
// Errors: repository errors are returned unchanged; a missing command yields
// a 404 AGENT_COMMAND_NOT_FOUND error; an elapsed timeout yields a 504
// AGENT_COMMAND_TIMEOUT error; a cancelled ctx yields ctx.Err().
func (s *AgentService) WaitForCommandResult(ctx context.Context, cmdID uint, timeout time.Duration) (*model.AgentCommand, error) {
	const pollInterval = 300 * time.Millisecond

	deadline := time.Now().Add(timeout)

	// A single timer is reused for every pause between polls instead of
	// allocating a new one per iteration.
	var pause *time.Timer
	defer func() {
		if pause != nil {
			pause.Stop()
		}
	}()

	for {
		cmd, err := s.cmdRepo.FindByID(ctx, cmdID)
		if err != nil {
			return nil, err
		}
		if cmd == nil {
			return nil, apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", nil)
		}
		switch cmd.Status {
		case model.AgentCommandStatusSucceeded, model.AgentCommandStatusFailed, model.AgentCommandStatusTimeout:
			return cmd, nil
		}

		wait := pollInterval
		if timeout > 0 {
			remaining := time.Until(deadline)
			if remaining <= 0 {
				return nil, apperror.New(504, "AGENT_COMMAND_TIMEOUT", "等待 Agent 响应超时", nil)
			}
			// Never sleep past the deadline, so the call does not overshoot
			// the requested timeout by up to a full poll interval.
			if remaining < wait {
				wait = remaining
			}
		}

		if pause == nil {
			pause = time.NewTimer(wait)
		} else {
			// Reset is safe here: the previous expiry was already received
			// from pause.C in the select below.
			pause.Reset(wait)
		}
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-pause.C:
		}
	}
}
// StartCommandTimeoutMonitor starts a background goroutine that periodically
// asks the command repository to mark stale commands as timed out.
//
// interval is the sweep period (30s when not positive) and timeout is the age
// threshold handed to the repository (10 minutes when not positive). The
// goroutine stops when ctx is done. The result of each sweep, including any
// error, is discarded.
func (s *AgentService) StartCommandTimeoutMonitor(ctx context.Context, interval time.Duration, timeout time.Duration) {
	if interval <= 0 {
		interval = 30 * time.Second
	}
	if timeout <= 0 {
		timeout = 10 * time.Minute
	}

	ticker := time.NewTicker(interval)

	// sweep marks every command older than the timeout threshold.
	sweep := func() {
		cutoff := time.Now().UTC().Add(-timeout)
		_, _ = s.cmdRepo.MarkStaleTimeout(ctx, cutoff)
	}

	go func() {
		defer ticker.Stop()
		for {
			select {
			case <-ticker.C:
				sweep()
			case <-ctx.Done():
				return
			}
		}
	}()
}