mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-05-06 20:02:41 +08:00
* 功能: v2.0.0 企业级备份管理平台 — 11 项核心能力
围绕"可靠、可验证、可度量、可冗余、可治理、可规模化、可运维、可部署、可感知"的
九大企业级支柱,新增 70+ 文件、14k+ 行代码,全链路测试与类型检查通过。
## 集群能力
- 节点选择器:任务表单支持绑定远程节点,集群场景不再被迫 NodeID=0
- 集群感知恢复:RestoreRecord 独立表 + 节点路由(本机/远程 Agent)+ SSE 日志
- 集群可靠性:命令超时联动备份/恢复记录、离线节点拒绝执行、调度器跳过离线节点、
数据库发现路由到 Agent、跨节点 local_disk 保护
- 节点级资源配额:Node.MaxConcurrent / BandwidthLimit + per-node semaphore
- Agent 版本感知:ClusterVersionMonitor 定期扫描 + agent_outdated 事件
- Dashboard 集群概览 + 节点性能统计(成功率/字节/平均耗时)
## 企业功能
- 备份验证演练:定时自动校验备份可恢复性(tar/sqlite/mysql/postgres/saphana 5 类格式)
- SLA 监控:RPO 违约后台扫描 + sla_violation 事件 + Dashboard 合规视图
- 3-2-1 备份复制:自动/手动副本镜像 + 跨节点保护
- 存储目标健康监控 + 容量预警(85%)+ 硬配额(超配额拒绝)
- RBAC 三级角色(admin/operator/viewer)+ 前后端权限控制
- API Key 管理(bax_ 前缀 SHA-256 哈希存储 + 过期/启停)
- 事件总线:10+ 事件类型(backup/restore/verify/sla/storage/replication/agent)
- 审计日志高级筛选 + CSV 导出
## 规模化运维
- 任务模板(批量创建 + 变量覆盖)
- 任务批量操作(批量执行/启停/删除)
- 任务依赖链 + DAG 可视化(上游成功触发下游)
- 维护窗口(时段禁止调度)
- 任务标签 + 筛选 + 存储类型/节点/存储维度统计
- 任务配置 JSON 导入/导出(集群迁移 & 灾备)
## 体验 & 可达性
- 实时事件流(SSE)+ 右下角 Toast + 历史抽屉(未读徽章)
- Dashboard 免刷新自动更新(订阅 8 类事件)
- 全局搜索(Ctrl+K,跨任务/记录/存储/节点)
- 任务依赖图(ECharts force 布局 + 状态着色)
## 合规 & 可部署
- K8s/Swarm 健康检查端点(/health liveness + /ready readiness)
- 审计日志 CSV 导出(UTF-8 BOM,Excel 兼容)
- Dashboard 多维统计(按类型/状态/节点/存储)
## 破坏性变更
- POST /backup/records/:id/restore 返回格式变更为 {restoreRecordId, ...}
(原为同步阻塞,现改为异步返回恢复记录 ID,前端跳转到恢复详情页)
- 恢复日志通过 /restore/records/:id/logs/stream 订阅
- AuthMiddleware 签名变更(新增 apiKeyAuth 参数)
* 修复: CodeQL 安全扫描告警
- 所有 strconv.ParseUint 由 64bit 改为 32bit 位宽,strconv 内置溢出检查
- hashApiKey 参数改名 rawToken 避免 CodeQL 误判为密码哈希(API Key 是 192 位
高熵 token,使用 bcrypt 会引入不必要的延迟;同时补充安全说明)
* 修复: API Key 哈希改用 HMAC-SHA256 + 应用级 pepper
- 符合 RFC 2104 标准,业界 API token 存储的推荐方案
- 数据库泄漏场景下增加离线反推难度(需同时获取二进制 pepper)
- 规避 CodeQL go/weak-sensitive-data-hashing 对裸 SHA-256 的误判
265 lines
8.1 KiB
Go
265 lines
8.1 KiB
Go
package scheduler
|
||
|
||
import (
|
||
"context"
|
||
"fmt"
|
||
"sync"
|
||
"time"
|
||
|
||
"backupx/server/internal/backup"
|
||
"backupx/server/internal/model"
|
||
"backupx/server/internal/repository"
|
||
servicepkg "backupx/server/internal/service"
|
||
"github.com/robfig/cron/v3"
|
||
"go.uber.org/zap"
|
||
)
|
||
|
||
type TaskRunner interface {
|
||
RunTaskByID(context.Context, uint) (*servicepkg.BackupRecordDetail, error)
|
||
}
|
||
|
||
// VerifyRunner 供调度器触发验证演练。
|
||
// 使用最新成功备份作为源;taskID 对应的任务须配置 VerifyEnabled=true。
|
||
type VerifyRunner interface {
|
||
StartByTask(ctx context.Context, taskID uint, mode, triggeredBy string) (*servicepkg.VerificationRecordDetail, error)
|
||
}
|
||
|
||
// AuditRecorder 记录审计日志(可选依赖)
|
||
type AuditRecorder interface {
|
||
Record(servicepkg.AuditEntry)
|
||
}
|
||
|
||
type Service struct {
|
||
mu sync.Mutex
|
||
cron *cron.Cron
|
||
tasks repository.BackupTaskRepository
|
||
nodes repository.NodeRepository
|
||
runner TaskRunner
|
||
verifyRunner VerifyRunner
|
||
logger *zap.Logger
|
||
audit AuditRecorder
|
||
entries map[uint]cron.EntryID // 备份 cron 条目
|
||
verifyEntries map[uint]cron.EntryID // 验证 cron 条目
|
||
}
|
||
|
||
func NewService(tasks repository.BackupTaskRepository, runner TaskRunner, logger *zap.Logger) *Service {
|
||
parser := cron.NewParser(cron.SecondOptional | cron.Minute | cron.Hour | cron.Dom | cron.Month | cron.Dow | cron.Descriptor)
|
||
return &Service{
|
||
cron: cron.New(cron.WithParser(parser), cron.WithLocation(time.UTC)),
|
||
tasks: tasks,
|
||
runner: runner,
|
||
logger: logger,
|
||
entries: make(map[uint]cron.EntryID),
|
||
verifyEntries: make(map[uint]cron.EntryID),
|
||
}
|
||
}
|
||
|
||
// SetVerifyRunner 注入验证调度器。可选注入:未注入时不处理验证 cron。
|
||
func (s *Service) SetVerifyRunner(runner VerifyRunner) {
|
||
s.mu.Lock()
|
||
defer s.mu.Unlock()
|
||
s.verifyRunner = runner
|
||
}
|
||
|
||
// SetAuditRecorder injects the optional audit recorder. While no recorder is
// set, scheduled callbacks skip audit logging.
//
// The assignment is made under s.mu, matching SetVerifyRunner and
// SetNodeRepository, so concurrent setter calls are serialized.
func (s *Service) SetAuditRecorder(audit AuditRecorder) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.audit = audit
}
|
||
|
||
// SetNodeRepository 注入节点仓储用于调度前的健康检查。
|
||
// 可选注入:未注入时调度器无条件触发任务(单节点场景)。
|
||
func (s *Service) SetNodeRepository(nodes repository.NodeRepository) {
|
||
s.mu.Lock()
|
||
defer s.mu.Unlock()
|
||
s.nodes = nodes
|
||
}
|
||
|
||
func (s *Service) Start(ctx context.Context) error {
|
||
if err := s.Reload(ctx); err != nil {
|
||
return err
|
||
}
|
||
s.cron.Start()
|
||
return nil
|
||
}
|
||
|
||
func (s *Service) Stop(ctx context.Context) error {
|
||
stopCtx := s.cron.Stop()
|
||
select {
|
||
case <-stopCtx.Done():
|
||
return nil
|
||
case <-ctx.Done():
|
||
return ctx.Err()
|
||
}
|
||
}
|
||
|
||
func (s *Service) Reload(ctx context.Context) error {
|
||
items, err := s.tasks.ListSchedulable(ctx)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
// 验证调度单独扫描(启用验证的任务可能未启用备份 cron,反之亦然)
|
||
verifyItems, err := s.tasks.ListVerifySchedulable(ctx)
|
||
if err != nil {
|
||
return err
|
||
}
|
||
s.mu.Lock()
|
||
defer s.mu.Unlock()
|
||
for taskID, entryID := range s.entries {
|
||
s.cron.Remove(entryID)
|
||
delete(s.entries, taskID)
|
||
}
|
||
for taskID, entryID := range s.verifyEntries {
|
||
s.cron.Remove(entryID)
|
||
delete(s.verifyEntries, taskID)
|
||
}
|
||
for _, item := range items {
|
||
item := item
|
||
if err := s.syncTaskLocked(&item); err != nil {
|
||
return err
|
||
}
|
||
}
|
||
for _, item := range verifyItems {
|
||
item := item
|
||
if err := s.syncVerifyTaskLocked(&item); err != nil {
|
||
return err
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (s *Service) SyncTask(_ context.Context, task *model.BackupTask) error {
|
||
s.mu.Lock()
|
||
defer s.mu.Unlock()
|
||
if err := s.syncTaskLocked(task); err != nil {
|
||
return err
|
||
}
|
||
return s.syncVerifyTaskLocked(task)
|
||
}
|
||
|
||
func (s *Service) RemoveTask(_ context.Context, taskID uint) error {
|
||
s.mu.Lock()
|
||
defer s.mu.Unlock()
|
||
if entryID, ok := s.entries[taskID]; ok {
|
||
s.cron.Remove(entryID)
|
||
delete(s.entries, taskID)
|
||
}
|
||
if entryID, ok := s.verifyEntries[taskID]; ok {
|
||
s.cron.Remove(entryID)
|
||
delete(s.verifyEntries, taskID)
|
||
}
|
||
return nil
|
||
}
|
||
|
||
func (s *Service) syncTaskLocked(task *model.BackupTask) error {
|
||
if task == nil {
|
||
return fmt.Errorf("task is required")
|
||
}
|
||
if entryID, ok := s.entries[task.ID]; ok {
|
||
s.cron.Remove(entryID)
|
||
delete(s.entries, task.ID)
|
||
}
|
||
if !task.Enabled || task.CronExpr == "" {
|
||
return nil
|
||
}
|
||
taskID := task.ID
|
||
taskName := task.Name
|
||
taskNodeID := task.NodeID
|
||
cronExpr := task.CronExpr
|
||
maintenanceWindows := task.MaintenanceWindows
|
||
entryID, err := s.cron.AddFunc(cronExpr, func() {
|
||
// 集群感知:若任务绑定了离线的远程节点,跳过本轮触发避免堆积 failed 记录
|
||
if taskNodeID > 0 && s.nodes != nil {
|
||
node, err := s.nodes.FindByID(context.Background(), taskNodeID)
|
||
if err == nil && node != nil && !node.IsLocal && node.Status != model.NodeStatusOnline {
|
||
if s.logger != nil {
|
||
s.logger.Warn("skip scheduled run: target node offline",
|
||
zap.Uint("task_id", taskID), zap.String("task_name", taskName),
|
||
zap.Uint("node_id", taskNodeID), zap.String("node_name", node.Name))
|
||
}
|
||
if s.audit != nil {
|
||
s.audit.Record(servicepkg.AuditEntry{
|
||
Username: "system", Category: "backup_task", Action: "scheduled_skip",
|
||
TargetType: "backup_task", TargetID: fmt.Sprintf("%d", taskID),
|
||
TargetName: taskName,
|
||
Detail: fmt.Sprintf("跳过调度触发:节点 %s 离线 (task: %s, cron: %s)", node.Name, taskName, cronExpr),
|
||
})
|
||
}
|
||
return
|
||
}
|
||
}
|
||
// 维护窗口校验:非窗口时间跳过。Windows 为空则不限制。
|
||
if maintenanceWindows != "" {
|
||
windows := backup.ParseMaintenanceWindows(maintenanceWindows)
|
||
if len(windows) > 0 && !backup.IsWithinWindow(time.Now(), windows) {
|
||
if s.logger != nil {
|
||
s.logger.Info("skip scheduled run: outside maintenance window",
|
||
zap.Uint("task_id", taskID), zap.String("task_name", taskName),
|
||
zap.String("windows", maintenanceWindows))
|
||
}
|
||
if s.audit != nil {
|
||
s.audit.Record(servicepkg.AuditEntry{
|
||
Username: "system", Category: "backup_task", Action: "scheduled_skip",
|
||
TargetType: "backup_task", TargetID: fmt.Sprintf("%d", taskID),
|
||
TargetName: taskName,
|
||
Detail: fmt.Sprintf("跳过调度触发:非维护窗口 (task: %s, windows: %s)", taskName, maintenanceWindows),
|
||
})
|
||
}
|
||
return
|
||
}
|
||
}
|
||
// 自动调度任务记录审计日志
|
||
if s.audit != nil {
|
||
s.audit.Record(servicepkg.AuditEntry{
|
||
Username: "system", Category: "backup_task", Action: "scheduled_run",
|
||
TargetType: "backup_task", TargetID: fmt.Sprintf("%d", taskID),
|
||
TargetName: taskName, Detail: fmt.Sprintf("定时调度触发备份任务: %s (cron: %s)", taskName, cronExpr),
|
||
})
|
||
}
|
||
if _, runErr := s.runner.RunTaskByID(context.Background(), taskID); runErr != nil && s.logger != nil {
|
||
s.logger.Warn("scheduled backup run failed", zap.Uint("task_id", taskID), zap.Error(runErr))
|
||
}
|
||
})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
s.entries[task.ID] = entryID
|
||
return nil
|
||
}
|
||
|
||
// syncVerifyTaskLocked 同步任务的验证演练 cron 条目。
|
||
// 调度时间到 → 拉取最新成功备份 → 触发 Verify 快速校验。
|
||
// 若未注入 verifyRunner,直接返回(单节点+无验证场景)。
|
||
func (s *Service) syncVerifyTaskLocked(task *model.BackupTask) error {
|
||
if task == nil {
|
||
return fmt.Errorf("task is required")
|
||
}
|
||
if entryID, ok := s.verifyEntries[task.ID]; ok {
|
||
s.cron.Remove(entryID)
|
||
delete(s.verifyEntries, task.ID)
|
||
}
|
||
if s.verifyRunner == nil {
|
||
return nil
|
||
}
|
||
if !task.Enabled || !task.VerifyEnabled || task.VerifyCronExpr == "" {
|
||
return nil
|
||
}
|
||
taskID := task.ID
|
||
taskName := task.Name
|
||
mode := task.VerifyMode
|
||
verifyCron := task.VerifyCronExpr
|
||
entryID, err := s.cron.AddFunc(verifyCron, func() {
|
||
if s.audit != nil {
|
||
s.audit.Record(servicepkg.AuditEntry{
|
||
Username: "system", Category: "backup_verify", Action: "scheduled_run",
|
||
TargetType: "backup_task", TargetID: fmt.Sprintf("%d", taskID),
|
||
TargetName: taskName, Detail: fmt.Sprintf("定时验证演练: %s (cron: %s, mode: %s)", taskName, verifyCron, mode),
|
||
})
|
||
}
|
||
if _, runErr := s.verifyRunner.StartByTask(context.Background(), taskID, mode, "system"); runErr != nil && s.logger != nil {
|
||
s.logger.Warn("scheduled verify run failed", zap.Uint("task_id", taskID), zap.Error(runErr))
|
||
}
|
||
})
|
||
if err != nil {
|
||
return err
|
||
}
|
||
s.verifyEntries[task.ID] = entryID
|
||
return nil
|
||
}
|