Files
BackupX/server/internal/service/agent_service.go
2026-05-13 14:24:45 +08:00

543 lines
18 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"strings"
"time"
"backupx/server/internal/apperror"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage/codec"
)
// AgentService 实现 Master 端 Agent 协议,提供给远程 Agent 通过 HTTP 调用。
// 所有方法使用 Agent Token 进行节点认证,避免暴露 JWT 给 Agent。
type AgentService struct {
nodeRepo repository.NodeRepository
taskRepo repository.BackupTaskRepository
recordRepo repository.BackupRecordRepository
storageRepo repository.StorageTargetRepository
cmdRepo repository.AgentCommandRepository
restoreRepo repository.RestoreRecordRepository
cipher *codec.ConfigCipher
}
func NewAgentService(
nodeRepo repository.NodeRepository,
taskRepo repository.BackupTaskRepository,
recordRepo repository.BackupRecordRepository,
storageRepo repository.StorageTargetRepository,
cmdRepo repository.AgentCommandRepository,
cipher *codec.ConfigCipher,
) *AgentService {
return &AgentService{
nodeRepo: nodeRepo,
taskRepo: taskRepo,
recordRepo: recordRepo,
storageRepo: storageRepo,
cmdRepo: cmdRepo,
cipher: cipher,
}
}
// SetRestoreRepository 注入恢复记录仓储,用于命令超时时联动 restore_record 状态。
// 可选注入:未注入时恢复命令超时仅标记命令 timeout记录需另行查验。
func (s *AgentService) SetRestoreRepository(repo repository.RestoreRecordRepository) {
s.restoreRepo = repo
}
// AuthenticatedNode 通过 token 解析并返回节点。失败返回 401。
func (s *AgentService) AuthenticatedNode(ctx context.Context, token string) (*model.Node, error) {
if strings.TrimSpace(token) == "" {
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "缺少认证令牌", nil)
}
node, err := s.nodeRepo.FindByToken(ctx, token)
if err != nil {
return nil, err
}
if node == nil {
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "无效的节点认证令牌", nil)
}
return node, nil
}
// AgentCommandPayload 给 Agent 返回的命令描述
type AgentCommandPayload struct {
ID uint `json:"id"`
Type string `json:"type"`
Payload json.RawMessage `json:"payload,omitempty"`
}
// PollCommand 为指定节点拉取一条 pending 命令;无命令时返回 (nil, nil)。
func (s *AgentService) PollCommand(ctx context.Context, node *model.Node) (*AgentCommandPayload, error) {
cmd, err := s.cmdRepo.ClaimPending(ctx, node.ID)
if err != nil {
return nil, err
}
if cmd == nil {
return nil, nil
}
return &AgentCommandPayload{
ID: cmd.ID,
Type: cmd.Type,
Payload: json.RawMessage(cmd.Payload),
}, nil
}
// AgentCommandResult Agent 上报命令执行结果
type AgentCommandResult struct {
Success bool `json:"success"`
ErrorMessage string `json:"errorMessage,omitempty"`
Result json.RawMessage `json:"result,omitempty"`
}
// SubmitCommandResult 接收 Agent 上报的命令结果。
func (s *AgentService) SubmitCommandResult(ctx context.Context, node *model.Node, cmdID uint, result AgentCommandResult) error {
cmd, err := s.cmdRepo.FindByID(ctx, cmdID)
if err != nil {
return err
}
if cmd == nil {
return apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", fmt.Errorf("command %d not found", cmdID))
}
if cmd.NodeID != node.ID {
return apperror.Unauthorized("AGENT_COMMAND_FORBIDDEN", "命令不属于当前节点", nil)
}
now := time.Now().UTC()
if result.Success {
cmd.Status = model.AgentCommandStatusSucceeded
} else {
cmd.Status = model.AgentCommandStatusFailed
}
cmd.ErrorMessage = result.ErrorMessage
if len(result.Result) > 0 {
cmd.Result = string(result.Result)
}
cmd.CompletedAt = &now
_, err = s.cmdRepo.CompleteDispatched(ctx, cmd)
return err
}
// AgentTaskSpec 给 Agent 返回的任务规格,包含解密后的存储配置,供 Agent 直接执行。
// 敏感信息:此接口仅供 Agent 调用token 认证),避免通过公共 API 泄露。
type AgentTaskSpec struct {
TaskID uint `json:"taskId"`
Name string `json:"name"`
Type string `json:"type"`
SourcePath string `json:"sourcePath,omitempty"`
SourcePaths string `json:"sourcePaths,omitempty"`
ExcludePatterns string `json:"excludePatterns,omitempty"`
DBHost string `json:"dbHost,omitempty"`
DBPort int `json:"dbPort,omitempty"`
DBUser string `json:"dbUser,omitempty"`
DBPassword string `json:"dbPassword,omitempty"`
DBName string `json:"dbName,omitempty"`
DBPath string `json:"dbPath,omitempty"`
ExtraConfig string `json:"extraConfig,omitempty"`
Compression string `json:"compression"`
Encrypt bool `json:"encrypt"`
StorageTargets []AgentStorageTargetConfig `json:"storageTargets"`
}
// AgentStorageTargetConfig 存储目标配置(已解密)
type AgentStorageTargetConfig struct {
ID uint `json:"id"`
Type string `json:"type"`
Name string `json:"name"`
Config json.RawMessage `json:"config"`
}
// GetTaskSpec 返回 Agent 执行任务所需的完整规格。
func (s *AgentService) GetTaskSpec(ctx context.Context, node *model.Node, taskID uint) (*AgentTaskSpec, error) {
task, err := s.taskRepo.FindByID(ctx, taskID)
if err != nil {
return nil, err
}
if task == nil {
return nil, apperror.New(404, "BACKUP_TASK_NOT_FOUND", "任务不存在", nil)
}
if err := s.ensureTaskSpecAccess(ctx, node, task); err != nil {
return nil, err
}
// 解密数据库密码(若有)
dbPassword := ""
if task.DBPasswordCiphertext != "" {
plain, decErr := s.cipher.Decrypt(task.DBPasswordCiphertext)
if decErr != nil {
return nil, fmt.Errorf("decrypt db password: %w", decErr)
}
dbPassword = string(plain)
}
// 解密存储目标配置
targets := collectTargetIDs(task)
storageTargets := make([]AgentStorageTargetConfig, 0, len(targets))
for _, tid := range targets {
target, err := s.storageRepo.FindByID(ctx, tid)
if err != nil {
return nil, err
}
if target == nil {
continue
}
configRaw, err := s.cipher.Decrypt(target.ConfigCiphertext)
if err != nil {
return nil, fmt.Errorf("decrypt storage config: %w", err)
}
storageTargets = append(storageTargets, AgentStorageTargetConfig{
ID: target.ID,
Type: target.Type,
Name: target.Name,
Config: json.RawMessage(configRaw),
})
}
return &AgentTaskSpec{
TaskID: task.ID,
Name: task.Name,
Type: task.Type,
SourcePath: task.SourcePath,
SourcePaths: task.SourcePaths,
ExcludePatterns: task.ExcludePatterns,
DBHost: task.DBHost,
DBPort: task.DBPort,
DBUser: task.DBUser,
DBPassword: dbPassword,
DBName: task.DBName,
DBPath: task.DBPath,
ExtraConfig: task.ExtraConfig,
Compression: task.Compression,
Encrypt: task.Encrypt,
StorageTargets: storageTargets,
}, nil
}
func (s *AgentService) ensureTaskSpecAccess(ctx context.Context, node *model.Node, task *model.BackupTask) error {
if task.NodeID == node.ID {
return nil
}
record, err := s.recordRepo.FindRunningByTaskAndNode(ctx, task.ID, node.ID)
if err != nil {
return err
}
if record == nil {
return apperror.Unauthorized("BACKUP_TASK_FORBIDDEN", "任务不属于当前节点", nil)
}
return nil
}
// AgentRecordUpdate Agent 上报备份记录的最终状态。
type AgentRecordUpdate struct {
Status string `json:"status"` // running | success | failed
FileName string `json:"fileName,omitempty"`
FileSize int64 `json:"fileSize,omitempty"`
Checksum string `json:"checksum,omitempty"`
StoragePath string `json:"storagePath,omitempty"`
StorageTargetID uint `json:"storageTargetId,omitempty"`
StorageUploadResults []StorageUploadResultItem `json:"storageUploadResults,omitempty"`
ErrorMessage string `json:"errorMessage,omitempty"`
LogAppend string `json:"logAppend,omitempty"` // 增量日志,追加到 record.log_content
}
// UpdateRecord 更新备份记录的状态/日志。Agent 在执行过程中可多次调用。
func (s *AgentService) UpdateRecord(ctx context.Context, node *model.Node, recordID uint, update AgentRecordUpdate) error {
record, err := s.recordRepo.FindByID(ctx, recordID)
if err != nil {
return err
}
if record == nil {
return apperror.New(404, "BACKUP_RECORD_NOT_FOUND", "记录不存在", nil)
}
task, err := s.taskRepo.FindByID(ctx, record.TaskID)
if err != nil {
return err
}
if task == nil || !recordBelongsToNode(record, task, node.ID) {
return apperror.Unauthorized("BACKUP_RECORD_FORBIDDEN", "记录不属于当前节点", nil)
}
if isBackupRecordTerminal(record.Status) {
return nil
}
if update.Status != "" {
record.Status = update.Status
}
if update.FileName != "" {
record.FileName = update.FileName
}
if update.FileSize > 0 {
record.FileSize = update.FileSize
}
if update.Checksum != "" {
record.Checksum = update.Checksum
}
if update.StoragePath != "" {
record.StoragePath = update.StoragePath
}
if update.StorageTargetID > 0 {
record.StorageTargetID = update.StorageTargetID
}
if len(update.StorageUploadResults) > 0 {
if resultsJSON, marshalErr := json.Marshal(update.StorageUploadResults); marshalErr == nil {
record.StorageUploadResults = string(resultsJSON)
}
}
if update.ErrorMessage != "" {
record.ErrorMessage = update.ErrorMessage
}
if update.LogAppend != "" {
if record.LogContent == "" {
record.LogContent = update.LogAppend
} else {
record.LogContent += update.LogAppend
}
}
if update.Status == model.BackupRecordStatusSuccess || update.Status == model.BackupRecordStatusFailed {
now := time.Now().UTC()
record.CompletedAt = &now
record.DurationSeconds = int(now.Sub(record.StartedAt).Seconds())
}
if err := s.recordRepo.Update(ctx, record); err != nil {
return err
}
// 同步更新任务的 last_status
if update.Status == model.BackupRecordStatusSuccess || update.Status == model.BackupRecordStatusFailed {
task.LastStatus = update.Status
task.LastRunAt = &record.StartedAt
if err := s.taskRepo.Update(ctx, task); err != nil {
return fmt.Errorf("update backup task summary: %w", err)
}
}
return nil
}
func recordBelongsToNode(record *model.BackupRecord, task *model.BackupTask, nodeID uint) bool {
if record.NodeID != 0 {
return record.NodeID == nodeID
}
return task.NodeID == nodeID
}
func isBackupRecordTerminal(status string) bool {
return status == model.BackupRecordStatusSuccess || status == model.BackupRecordStatusFailed
}
// EnqueueCommand Master 端调用:给指定节点插入一条待执行命令。
// 返回命令 ID。
func (s *AgentService) EnqueueCommand(ctx context.Context, nodeID uint, cmdType string, payload any) (uint, error) {
if nodeID == 0 {
return 0, errors.New("nodeID is required")
}
payloadBytes, err := json.Marshal(payload)
if err != nil {
return 0, fmt.Errorf("marshal payload: %w", err)
}
cmd := &model.AgentCommand{
NodeID: nodeID,
Type: cmdType,
Status: model.AgentCommandStatusPending,
Payload: string(payloadBytes),
}
if err := s.cmdRepo.Create(ctx, cmd); err != nil {
return 0, err
}
return cmd.ID, nil
}
// WaitForCommandResult 同步等待指定命令完成(用于 list_dir 这类 RPC 式调用)。
// timeout 为 0 表示不限,建议传 10~30s。
func (s *AgentService) WaitForCommandResult(ctx context.Context, cmdID uint, timeout time.Duration) (*model.AgentCommand, error) {
deadline := time.Now().Add(timeout)
for {
cmd, err := s.cmdRepo.FindByID(ctx, cmdID)
if err != nil {
return nil, err
}
if cmd == nil {
return nil, apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", nil)
}
switch cmd.Status {
case model.AgentCommandStatusSucceeded, model.AgentCommandStatusFailed, model.AgentCommandStatusTimeout:
return cmd, nil
}
if timeout > 0 && time.Now().After(deadline) {
return nil, apperror.New(504, "AGENT_COMMAND_TIMEOUT", "等待 Agent 响应超时", nil)
}
select {
case <-ctx.Done():
return nil, ctx.Err()
case <-time.After(300 * time.Millisecond):
}
}
}
// StartCommandTimeoutMonitor 启动后台定时任务,把超时命令标记为 timeout。
// 对于 run_task / restore_record 命令,同时把关联的 BackupRecord / RestoreRecord
// 标记为 failed避免 Agent 离线/崩溃时记录永远卡在 running。
func (s *AgentService) StartCommandTimeoutMonitor(ctx context.Context, interval time.Duration, timeout time.Duration) {
if interval <= 0 {
interval = 30 * time.Second
}
if timeout <= 0 {
timeout = 10 * time.Minute
}
ticker := time.NewTicker(interval)
go func() {
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
threshold := time.Now().UTC().Add(-timeout)
s.processStaleCommands(ctx, threshold)
}
}
}()
}
// processStaleCommands 扫描已超时的 pending/dispatched 命令并联动关联记录。
// 流程:先取超时候选 → 条件式把命令置为 timeout → 对抢到的命令联动 backup/restore 记录。
// 单条失败不影响后续处理。
func (s *AgentService) processStaleCommands(ctx context.Context, threshold time.Time) {
commands, err := s.cmdRepo.ListStaleActive(ctx, threshold)
if err != nil || len(commands) == 0 {
return
}
for i := range commands {
cmd := commands[i]
if s.commandStillActive(ctx, &cmd, threshold) {
continue
}
now := time.Now().UTC()
cmd.Status = model.AgentCommandStatusTimeout
cmd.ErrorMessage = "agent did not report result before timeout"
cmd.CompletedAt = &now
timedOut, err := s.cmdRepo.TimeoutActive(ctx, &cmd)
if err != nil || !timedOut {
continue
}
s.failLinkedRecord(ctx, &cmd)
}
}
// commandStillActive 用关联记录状态、记录更新时间和节点心跳作为长任务续租信号。
// 仅 run_task / restore_record 允许续租,避免短 RPC 命令被在线节点长期保留。
func (s *AgentService) commandStillActive(ctx context.Context, cmd *model.AgentCommand, threshold time.Time) bool {
if cmd.Status != model.AgentCommandStatusDispatched {
return false
}
switch cmd.Type {
case model.AgentCommandTypeRunTask:
var payload struct {
RecordID uint `json:"recordId"`
}
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RecordID == 0 {
return false
}
record, err := s.recordRepo.FindByID(ctx, payload.RecordID)
if err != nil || record == nil || record.Status != model.BackupRecordStatusRunning {
return false
}
if s.nodeRecentlySeen(ctx, cmd.NodeID, threshold) {
return true
}
return record.UpdatedAt.After(threshold)
case model.AgentCommandTypeRestoreRecord:
if s.restoreRepo == nil {
return false
}
var payload struct {
RestoreRecordID uint `json:"restoreRecordId"`
}
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RestoreRecordID == 0 {
return false
}
restore, err := s.restoreRepo.FindByID(ctx, payload.RestoreRecordID)
if err != nil || restore == nil || restore.Status != model.RestoreRecordStatusRunning {
return false
}
if s.nodeRecentlySeen(ctx, cmd.NodeID, threshold) {
return true
}
return restore.UpdatedAt.After(threshold)
default:
return false
}
}
func (s *AgentService) nodeRecentlySeen(ctx context.Context, nodeID uint, threshold time.Time) bool {
node, err := s.nodeRepo.FindByID(ctx, nodeID)
if err != nil || node == nil {
return false
}
return node.Status == model.NodeStatusOnline && node.LastSeen.After(threshold)
}
// failLinkedRecord 根据命令类型把关联记录标记为 failed。
// 只对仍然处于 running 状态的记录生效,避免覆盖已完成的结果。
func (s *AgentService) failLinkedRecord(ctx context.Context, cmd *model.AgentCommand) {
const failureMessage = "Agent 未在超时前回传状态(节点可能已离线或崩溃)"
switch cmd.Type {
case model.AgentCommandTypeRunTask:
var payload struct {
RecordID uint `json:"recordId"`
}
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RecordID == 0 {
return
}
record, err := s.recordRepo.FindByID(ctx, payload.RecordID)
if err != nil || record == nil || record.Status != model.BackupRecordStatusRunning {
return
}
completedAt := time.Now().UTC()
record.Status = model.BackupRecordStatusFailed
record.ErrorMessage = failureMessage
record.CompletedAt = &completedAt
record.DurationSeconds = int(completedAt.Sub(record.StartedAt).Seconds())
_ = s.recordRepo.Update(ctx, record)
case model.AgentCommandTypeRestoreRecord:
if s.restoreRepo == nil {
return
}
var payload struct {
RestoreRecordID uint `json:"restoreRecordId"`
}
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RestoreRecordID == 0 {
return
}
restore, err := s.restoreRepo.FindByID(ctx, payload.RestoreRecordID)
if err != nil || restore == nil || restore.Status != model.RestoreRecordStatusRunning {
return
}
completedAt := time.Now().UTC()
restore.Status = model.RestoreRecordStatusFailed
restore.ErrorMessage = failureMessage
restore.CompletedAt = &completedAt
restore.DurationSeconds = int(completedAt.Sub(restore.StartedAt).Seconds())
_ = s.restoreRepo.Update(ctx, restore)
}
}
// AgentSelfStatus 是 /api/v1/agent/self 端点返回给 Agent 的轻量状态摘要。
type AgentSelfStatus struct {
ID uint `json:"id"`
Name string `json:"name"`
Status string `json:"status"`
LastSeen time.Time `json:"lastSeen"`
}
// SelfStatus 返回 Agent token 所属节点的当前状态,供安装脚本末尾探活。
func (s *AgentService) SelfStatus(ctx context.Context, node *model.Node) (*AgentSelfStatus, error) {
if node == nil {
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "节点不存在", nil)
}
return &AgentSelfStatus{
ID: node.ID,
Name: node.Name,
Status: node.Status,
LastSeen: node.LastSeen,
}, nil
}