mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-05-11 18:10:23 +08:00
* 功能: v2.0.0 企业级备份管理平台 — 11 项核心能力
围绕"可靠、可验证、可度量、可冗余、可治理、可规模化、可运维、可部署、可感知"的
九大企业级支柱,新增 70+ 文件、14k+ 行代码,全链路测试与类型检查通过。
## 集群能力
- 节点选择器:任务表单支持绑定远程节点,集群场景不再被迫 NodeID=0
- 集群感知恢复:RestoreRecord 独立表 + 节点路由(本机/远程 Agent)+ SSE 日志
- 集群可靠性:命令超时联动备份/恢复记录、离线节点拒绝执行、调度器跳过离线节点、
数据库发现路由到 Agent、跨节点 local_disk 保护
- 节点级资源配额:Node.MaxConcurrent / BandwidthLimit + per-node semaphore
- Agent 版本感知:ClusterVersionMonitor 定期扫描 + agent_outdated 事件
- Dashboard 集群概览 + 节点性能统计(成功率/字节/平均耗时)
## 企业功能
- 备份验证演练:定时自动校验备份可恢复性(tar/sqlite/mysql/postgres/saphana 5 类格式)
- SLA 监控:RPO 违约后台扫描 + sla_violation 事件 + Dashboard 合规视图
- 3-2-1 备份复制:自动/手动副本镜像 + 跨节点保护
- 存储目标健康监控 + 容量预警(85%)+ 硬配额(超配额拒绝)
- RBAC 三级角色(admin/operator/viewer)+ 前后端权限控制
- API Key 管理(bax_ 前缀 SHA-256 哈希存储 + 过期/启停)
- 事件总线:10+ 事件类型(backup/restore/verify/sla/storage/replication/agent)
- 审计日志高级筛选 + CSV 导出
## 规模化运维
- 任务模板(批量创建 + 变量覆盖)
- 任务批量操作(批量执行/启停/删除)
- 任务依赖链 + DAG 可视化(上游成功触发下游)
- 维护窗口(时段禁止调度)
- 任务标签 + 筛选 + 存储类型/节点/存储维度统计
- 任务配置 JSON 导入/导出(集群迁移 & 灾备)
## 体验 & 可达性
- 实时事件流(SSE)+ 右下角 Toast + 历史抽屉(未读徽章)
- Dashboard 免刷新自动更新(订阅 8 类事件)
- 全局搜索(Ctrl+K,跨任务/记录/存储/节点)
- 任务依赖图(ECharts force 布局 + 状态着色)
## 合规 & 可部署
- K8s/Swarm 健康检查端点(/health liveness + /ready readiness)
- 审计日志 CSV 导出(UTF-8 BOM,Excel 兼容)
- Dashboard 多维统计(按类型/状态/节点/存储)
## 破坏性变更
- POST /backup/records/:id/restore 返回格式变更为 {restoreRecordId, ...}
(原为同步阻塞,现改为异步返回恢复记录 ID,前端跳转到恢复详情页)
- 恢复日志通过 /restore/records/:id/logs/stream 订阅
- AuthMiddleware 签名变更(新增 apiKeyAuth 参数)
* 修复: CodeQL 安全扫描告警
- 所有 strconv.ParseUint 由 64bit 改为 32bit 位宽,strconv 内置溢出检查
- hashApiKey 参数改名 rawToken 避免 CodeQL 误判为密码哈希(API Key 是 192 位
高熵 token,使用 bcrypt 会引入不必要的延迟;同时补充安全说明)
* 修复: API Key 哈希改用 HMAC-SHA256 + 应用级 pepper
- 符合 RFC 2104 标准,业界 API token 存储的推荐方案
- 数据库泄漏场景下增加离线反推难度(需同时获取二进制 pepper)
- 规避 CodeQL go/weak-sensitive-data-hashing 对裸 SHA-256 的误判
443 lines
15 KiB
Go
443 lines
15 KiB
Go
package service
|
||
|
||
import (
|
||
"context"
|
||
"encoding/json"
|
||
"errors"
|
||
"fmt"
|
||
"strings"
|
||
"time"
|
||
|
||
"backupx/server/internal/apperror"
|
||
"backupx/server/internal/model"
|
||
"backupx/server/internal/repository"
|
||
"backupx/server/internal/storage/codec"
|
||
)
|
||
|
||
// AgentService 实现 Master 端 Agent 协议,提供给远程 Agent 通过 HTTP 调用。
|
||
// 所有方法使用 Agent Token 进行节点认证,避免暴露 JWT 给 Agent。
|
||
type AgentService struct {
|
||
nodeRepo repository.NodeRepository
|
||
taskRepo repository.BackupTaskRepository
|
||
recordRepo repository.BackupRecordRepository
|
||
storageRepo repository.StorageTargetRepository
|
||
cmdRepo repository.AgentCommandRepository
|
||
restoreRepo repository.RestoreRecordRepository
|
||
cipher *codec.ConfigCipher
|
||
}
|
||
|
||
func NewAgentService(
|
||
nodeRepo repository.NodeRepository,
|
||
taskRepo repository.BackupTaskRepository,
|
||
recordRepo repository.BackupRecordRepository,
|
||
storageRepo repository.StorageTargetRepository,
|
||
cmdRepo repository.AgentCommandRepository,
|
||
cipher *codec.ConfigCipher,
|
||
) *AgentService {
|
||
return &AgentService{
|
||
nodeRepo: nodeRepo,
|
||
taskRepo: taskRepo,
|
||
recordRepo: recordRepo,
|
||
storageRepo: storageRepo,
|
||
cmdRepo: cmdRepo,
|
||
cipher: cipher,
|
||
}
|
||
}
|
||
|
||
// SetRestoreRepository 注入恢复记录仓储,用于命令超时时联动 restore_record 状态。
|
||
// 可选注入:未注入时恢复命令超时仅标记命令 timeout,记录需另行查验。
|
||
func (s *AgentService) SetRestoreRepository(repo repository.RestoreRecordRepository) {
|
||
s.restoreRepo = repo
|
||
}
|
||
|
||
// AuthenticatedNode 通过 token 解析并返回节点。失败返回 401。
|
||
func (s *AgentService) AuthenticatedNode(ctx context.Context, token string) (*model.Node, error) {
|
||
if strings.TrimSpace(token) == "" {
|
||
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "缺少认证令牌", nil)
|
||
}
|
||
node, err := s.nodeRepo.FindByToken(ctx, token)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if node == nil {
|
||
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "无效的节点认证令牌", nil)
|
||
}
|
||
return node, nil
|
||
}
|
||
|
||
// AgentCommandPayload describes a single command handed to an Agent.
// Payload carries the command's stored JSON body verbatim and is omitted from
// the encoded form when empty.
type AgentCommandPayload struct {
	ID      uint            `json:"id"`
	Type    string          `json:"type"`
	Payload json.RawMessage `json:"payload,omitempty"`
}
|
||
|
||
// PollCommand 为指定节点拉取一条 pending 命令;无命令时返回 (nil, nil)。
|
||
func (s *AgentService) PollCommand(ctx context.Context, node *model.Node) (*AgentCommandPayload, error) {
|
||
cmd, err := s.cmdRepo.ClaimPending(ctx, node.ID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if cmd == nil {
|
||
return nil, nil
|
||
}
|
||
return &AgentCommandPayload{
|
||
ID: cmd.ID,
|
||
Type: cmd.Type,
|
||
Payload: json.RawMessage(cmd.Payload),
|
||
}, nil
|
||
}
|
||
|
||
// AgentCommandResult is the outcome an Agent reports for a command it ran.
// ErrorMessage and Result are optional and omitted from the encoded form when
// empty.
type AgentCommandResult struct {
	Success      bool            `json:"success"`
	ErrorMessage string          `json:"errorMessage,omitempty"`
	Result       json.RawMessage `json:"result,omitempty"`
}
|
||
|
||
// SubmitCommandResult 接收 Agent 上报的命令结果。
|
||
func (s *AgentService) SubmitCommandResult(ctx context.Context, node *model.Node, cmdID uint, result AgentCommandResult) error {
|
||
cmd, err := s.cmdRepo.FindByID(ctx, cmdID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if cmd == nil {
|
||
return apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", fmt.Errorf("command %d not found", cmdID))
|
||
}
|
||
if cmd.NodeID != node.ID {
|
||
return apperror.Unauthorized("AGENT_COMMAND_FORBIDDEN", "命令不属于当前节点", nil)
|
||
}
|
||
now := time.Now().UTC()
|
||
if result.Success {
|
||
cmd.Status = model.AgentCommandStatusSucceeded
|
||
} else {
|
||
cmd.Status = model.AgentCommandStatusFailed
|
||
}
|
||
cmd.ErrorMessage = result.ErrorMessage
|
||
if len(result.Result) > 0 {
|
||
cmd.Result = string(result.Result)
|
||
}
|
||
cmd.CompletedAt = &now
|
||
return s.cmdRepo.Update(ctx, cmd)
|
||
}
|
||
|
||
// AgentTaskSpec 给 Agent 返回的任务规格,包含解密后的存储配置,供 Agent 直接执行。
|
||
// 敏感信息:此接口仅供 Agent 调用(token 认证),避免通过公共 API 泄露。
|
||
type AgentTaskSpec struct {
|
||
TaskID uint `json:"taskId"`
|
||
Name string `json:"name"`
|
||
Type string `json:"type"`
|
||
SourcePath string `json:"sourcePath,omitempty"`
|
||
SourcePaths string `json:"sourcePaths,omitempty"`
|
||
ExcludePatterns string `json:"excludePatterns,omitempty"`
|
||
DBHost string `json:"dbHost,omitempty"`
|
||
DBPort int `json:"dbPort,omitempty"`
|
||
DBUser string `json:"dbUser,omitempty"`
|
||
DBPassword string `json:"dbPassword,omitempty"`
|
||
DBName string `json:"dbName,omitempty"`
|
||
DBPath string `json:"dbPath,omitempty"`
|
||
ExtraConfig string `json:"extraConfig,omitempty"`
|
||
Compression string `json:"compression"`
|
||
Encrypt bool `json:"encrypt"`
|
||
StorageTargets []AgentStorageTargetConfig `json:"storageTargets"`
|
||
}
|
||
|
||
// AgentStorageTargetConfig is one storage target as sent to an Agent.
// Config holds the target's configuration after decryption.
type AgentStorageTargetConfig struct {
	ID     uint            `json:"id"`
	Type   string          `json:"type"`
	Name   string          `json:"name"`
	Config json.RawMessage `json:"config"`
}
|
||
|
||
// GetTaskSpec 返回 Agent 执行任务所需的完整规格。
|
||
func (s *AgentService) GetTaskSpec(ctx context.Context, node *model.Node, taskID uint) (*AgentTaskSpec, error) {
|
||
task, err := s.taskRepo.FindByID(ctx, taskID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if task == nil {
|
||
return nil, apperror.New(404, "BACKUP_TASK_NOT_FOUND", "任务不存在", nil)
|
||
}
|
||
if task.NodeID != node.ID {
|
||
return nil, apperror.Unauthorized("BACKUP_TASK_FORBIDDEN", "任务不属于当前节点", nil)
|
||
}
|
||
// 解密数据库密码(若有)
|
||
dbPassword := ""
|
||
if task.DBPasswordCiphertext != "" {
|
||
plain, decErr := s.cipher.Decrypt(task.DBPasswordCiphertext)
|
||
if decErr != nil {
|
||
return nil, fmt.Errorf("decrypt db password: %w", decErr)
|
||
}
|
||
dbPassword = string(plain)
|
||
}
|
||
// 解密存储目标配置
|
||
targets := collectTargetIDs(task)
|
||
storageTargets := make([]AgentStorageTargetConfig, 0, len(targets))
|
||
for _, tid := range targets {
|
||
target, err := s.storageRepo.FindByID(ctx, tid)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if target == nil {
|
||
continue
|
||
}
|
||
configRaw, err := s.cipher.Decrypt(target.ConfigCiphertext)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("decrypt storage config: %w", err)
|
||
}
|
||
storageTargets = append(storageTargets, AgentStorageTargetConfig{
|
||
ID: target.ID,
|
||
Type: target.Type,
|
||
Name: target.Name,
|
||
Config: json.RawMessage(configRaw),
|
||
})
|
||
}
|
||
return &AgentTaskSpec{
|
||
TaskID: task.ID,
|
||
Name: task.Name,
|
||
Type: task.Type,
|
||
SourcePath: task.SourcePath,
|
||
SourcePaths: task.SourcePaths,
|
||
ExcludePatterns: task.ExcludePatterns,
|
||
DBHost: task.DBHost,
|
||
DBPort: task.DBPort,
|
||
DBUser: task.DBUser,
|
||
DBPassword: dbPassword,
|
||
DBName: task.DBName,
|
||
DBPath: task.DBPath,
|
||
ExtraConfig: task.ExtraConfig,
|
||
Compression: task.Compression,
|
||
Encrypt: task.Encrypt,
|
||
StorageTargets: storageTargets,
|
||
}, nil
|
||
}
|
||
|
||
// AgentRecordUpdate is the state an Agent reports for a backup record.
//
// Status is one of running, success or failed. LogAppend is an incremental
// log chunk that gets appended to the record's existing log content. All
// fields except Status are omitted from the encoded form when empty.
type AgentRecordUpdate struct {
	Status       string `json:"status"`
	FileName     string `json:"fileName,omitempty"`
	FileSize     int64  `json:"fileSize,omitempty"`
	Checksum     string `json:"checksum,omitempty"`
	StoragePath  string `json:"storagePath,omitempty"`
	ErrorMessage string `json:"errorMessage,omitempty"`
	LogAppend    string `json:"logAppend,omitempty"`
}
|
||
|
||
// UpdateRecord 更新备份记录的状态/日志。Agent 在执行过程中可多次调用。
|
||
func (s *AgentService) UpdateRecord(ctx context.Context, node *model.Node, recordID uint, update AgentRecordUpdate) error {
|
||
record, err := s.recordRepo.FindByID(ctx, recordID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if record == nil {
|
||
return apperror.New(404, "BACKUP_RECORD_NOT_FOUND", "记录不存在", nil)
|
||
}
|
||
// 通过 task.NodeID 判断是否属于当前 agent
|
||
task, err := s.taskRepo.FindByID(ctx, record.TaskID)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
if task == nil || task.NodeID != node.ID {
|
||
return apperror.Unauthorized("BACKUP_RECORD_FORBIDDEN", "记录不属于当前节点", nil)
|
||
}
|
||
if update.Status != "" {
|
||
record.Status = update.Status
|
||
}
|
||
if update.FileName != "" {
|
||
record.FileName = update.FileName
|
||
}
|
||
if update.FileSize > 0 {
|
||
record.FileSize = update.FileSize
|
||
}
|
||
if update.Checksum != "" {
|
||
record.Checksum = update.Checksum
|
||
}
|
||
if update.StoragePath != "" {
|
||
record.StoragePath = update.StoragePath
|
||
}
|
||
if update.ErrorMessage != "" {
|
||
record.ErrorMessage = update.ErrorMessage
|
||
}
|
||
if update.LogAppend != "" {
|
||
if record.LogContent == "" {
|
||
record.LogContent = update.LogAppend
|
||
} else {
|
||
record.LogContent += update.LogAppend
|
||
}
|
||
}
|
||
if update.Status == model.BackupRecordStatusSuccess || update.Status == model.BackupRecordStatusFailed {
|
||
now := time.Now().UTC()
|
||
record.CompletedAt = &now
|
||
record.DurationSeconds = int(now.Sub(record.StartedAt).Seconds())
|
||
}
|
||
if err := s.recordRepo.Update(ctx, record); err != nil {
|
||
return err
|
||
}
|
||
// 同步更新任务的 last_status
|
||
if update.Status == model.BackupRecordStatusSuccess || update.Status == model.BackupRecordStatusFailed {
|
||
task.LastStatus = update.Status
|
||
_ = s.taskRepo.Update(ctx, task)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// EnqueueCommand Master 端调用:给指定节点插入一条待执行命令。
|
||
// 返回命令 ID。
|
||
func (s *AgentService) EnqueueCommand(ctx context.Context, nodeID uint, cmdType string, payload any) (uint, error) {
|
||
if nodeID == 0 {
|
||
return 0, errors.New("nodeID is required")
|
||
}
|
||
payloadBytes, err := json.Marshal(payload)
|
||
if err != nil {
|
||
return 0, fmt.Errorf("marshal payload: %w", err)
|
||
}
|
||
cmd := &model.AgentCommand{
|
||
NodeID: nodeID,
|
||
Type: cmdType,
|
||
Status: model.AgentCommandStatusPending,
|
||
Payload: string(payloadBytes),
|
||
}
|
||
if err := s.cmdRepo.Create(ctx, cmd); err != nil {
|
||
return 0, err
|
||
}
|
||
return cmd.ID, nil
|
||
}
|
||
|
||
// WaitForCommandResult 同步等待指定命令完成(用于 list_dir 这类 RPC 式调用)。
|
||
// timeout 为 0 表示不限,建议传 10~30s。
|
||
func (s *AgentService) WaitForCommandResult(ctx context.Context, cmdID uint, timeout time.Duration) (*model.AgentCommand, error) {
|
||
deadline := time.Now().Add(timeout)
|
||
for {
|
||
cmd, err := s.cmdRepo.FindByID(ctx, cmdID)
|
||
if err != nil {
|
||
return nil, err
|
||
}
|
||
if cmd == nil {
|
||
return nil, apperror.New(404, "AGENT_COMMAND_NOT_FOUND", "命令不存在", nil)
|
||
}
|
||
switch cmd.Status {
|
||
case model.AgentCommandStatusSucceeded, model.AgentCommandStatusFailed, model.AgentCommandStatusTimeout:
|
||
return cmd, nil
|
||
}
|
||
if timeout > 0 && time.Now().After(deadline) {
|
||
return nil, apperror.New(504, "AGENT_COMMAND_TIMEOUT", "等待 Agent 响应超时", nil)
|
||
}
|
||
select {
|
||
case <-ctx.Done():
|
||
return nil, ctx.Err()
|
||
case <-time.After(300 * time.Millisecond):
|
||
}
|
||
}
|
||
}
|
||
|
||
// StartCommandTimeoutMonitor 启动后台定时任务,把超时命令标记为 timeout。
|
||
// 对于 run_task / restore_record 命令,同时把关联的 BackupRecord / RestoreRecord
|
||
// 标记为 failed,避免 Agent 离线/崩溃时记录永远卡在 running。
|
||
func (s *AgentService) StartCommandTimeoutMonitor(ctx context.Context, interval time.Duration, timeout time.Duration) {
|
||
if interval <= 0 {
|
||
interval = 30 * time.Second
|
||
}
|
||
if timeout <= 0 {
|
||
timeout = 10 * time.Minute
|
||
}
|
||
ticker := time.NewTicker(interval)
|
||
go func() {
|
||
defer ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case <-ticker.C:
|
||
threshold := time.Now().UTC().Add(-timeout)
|
||
s.processStaleCommands(ctx, threshold)
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
|
||
// processStaleCommands 扫描已超时的 dispatched 命令并联动关联记录。
|
||
// 流程:先取超时候选 → 对每条联动 backup/restore 记录 → 把命令置为 timeout。
|
||
// 单条失败不影响后续处理。
|
||
func (s *AgentService) processStaleCommands(ctx context.Context, threshold time.Time) {
|
||
commands, err := s.cmdRepo.ListStaleDispatched(ctx, threshold)
|
||
if err != nil || len(commands) == 0 {
|
||
return
|
||
}
|
||
for i := range commands {
|
||
cmd := commands[i]
|
||
s.failLinkedRecord(ctx, &cmd)
|
||
now := time.Now().UTC()
|
||
cmd.Status = model.AgentCommandStatusTimeout
|
||
cmd.ErrorMessage = "agent did not report result before timeout"
|
||
cmd.CompletedAt = &now
|
||
_ = s.cmdRepo.Update(ctx, &cmd)
|
||
}
|
||
}
|
||
|
||
// failLinkedRecord 根据命令类型把关联记录标记为 failed。
|
||
// 只对仍然处于 running 状态的记录生效,避免覆盖已完成的结果。
|
||
func (s *AgentService) failLinkedRecord(ctx context.Context, cmd *model.AgentCommand) {
|
||
const failureMessage = "Agent 未在超时前回传状态(节点可能已离线或崩溃)"
|
||
switch cmd.Type {
|
||
case model.AgentCommandTypeRunTask:
|
||
var payload struct {
|
||
RecordID uint `json:"recordId"`
|
||
}
|
||
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RecordID == 0 {
|
||
return
|
||
}
|
||
record, err := s.recordRepo.FindByID(ctx, payload.RecordID)
|
||
if err != nil || record == nil || record.Status != model.BackupRecordStatusRunning {
|
||
return
|
||
}
|
||
completedAt := time.Now().UTC()
|
||
record.Status = model.BackupRecordStatusFailed
|
||
record.ErrorMessage = failureMessage
|
||
record.CompletedAt = &completedAt
|
||
record.DurationSeconds = int(completedAt.Sub(record.StartedAt).Seconds())
|
||
_ = s.recordRepo.Update(ctx, record)
|
||
case model.AgentCommandTypeRestoreRecord:
|
||
if s.restoreRepo == nil {
|
||
return
|
||
}
|
||
var payload struct {
|
||
RestoreRecordID uint `json:"restoreRecordId"`
|
||
}
|
||
if err := json.Unmarshal([]byte(cmd.Payload), &payload); err != nil || payload.RestoreRecordID == 0 {
|
||
return
|
||
}
|
||
restore, err := s.restoreRepo.FindByID(ctx, payload.RestoreRecordID)
|
||
if err != nil || restore == nil || restore.Status != model.RestoreRecordStatusRunning {
|
||
return
|
||
}
|
||
completedAt := time.Now().UTC()
|
||
restore.Status = model.RestoreRecordStatusFailed
|
||
restore.ErrorMessage = failureMessage
|
||
restore.CompletedAt = &completedAt
|
||
restore.DurationSeconds = int(completedAt.Sub(restore.StartedAt).Seconds())
|
||
_ = s.restoreRepo.Update(ctx, restore)
|
||
}
|
||
}
|
||
|
||
// AgentSelfStatus is the lightweight status summary that the
// /api/v1/agent/self endpoint returns to an Agent.
type AgentSelfStatus struct {
	ID       uint      `json:"id"`
	Name     string    `json:"name"`
	Status   string    `json:"status"`
	LastSeen time.Time `json:"lastSeen"`
}
|
||
|
||
// SelfStatus 返回 Agent token 所属节点的当前状态,供安装脚本末尾探活。
|
||
func (s *AgentService) SelfStatus(ctx context.Context, node *model.Node) (*AgentSelfStatus, error) {
|
||
if node == nil {
|
||
return nil, apperror.Unauthorized("NODE_INVALID_TOKEN", "节点不存在", nil)
|
||
}
|
||
return &AgentSelfStatus{
|
||
ID: node.ID,
|
||
Name: node.Name,
|
||
Status: node.Status,
|
||
LastSeen: node.LastSeen,
|
||
}, nil
|
||
}
|