mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-05-07 04:12:41 +08:00
* 功能: v2.0.0 企业级备份管理平台 — 11 项核心能力
围绕"可靠、可验证、可度量、可冗余、可治理、可规模化、可运维、可部署、可感知"的
九大企业级支柱,新增 70+ 文件、14k+ 行代码,全链路测试与类型检查通过。
## 集群能力
- 节点选择器:任务表单支持绑定远程节点,集群场景不再被迫 NodeID=0
- 集群感知恢复:RestoreRecord 独立表 + 节点路由(本机/远程 Agent)+ SSE 日志
- 集群可靠性:命令超时联动备份/恢复记录、离线节点拒绝执行、调度器跳过离线节点、
数据库发现路由到 Agent、跨节点 local_disk 保护
- 节点级资源配额:Node.MaxConcurrent / BandwidthLimit + per-node semaphore
- Agent 版本感知:ClusterVersionMonitor 定期扫描 + agent_outdated 事件
- Dashboard 集群概览 + 节点性能统计(成功率/字节/平均耗时)
## 企业功能
- 备份验证演练:定时自动校验备份可恢复性(tar/sqlite/mysql/postgres/saphana 5 类格式)
- SLA 监控:RPO 违约后台扫描 + sla_violation 事件 + Dashboard 合规视图
- 3-2-1 备份复制:自动/手动副本镜像 + 跨节点保护
- 存储目标健康监控 + 容量预警(85%)+ 硬配额(超配额拒绝)
- RBAC 三级角色(admin/operator/viewer)+ 前后端权限控制
- API Key 管理(bax_ 前缀 SHA-256 哈希存储 + 过期/启停)
- 事件总线:10+ 事件类型(backup/restore/verify/sla/storage/replication/agent)
- 审计日志高级筛选 + CSV 导出
## 规模化运维
- 任务模板(批量创建 + 变量覆盖)
- 任务批量操作(批量执行/启停/删除)
- 任务依赖链 + DAG 可视化(上游成功触发下游)
- 维护窗口(时段禁止调度)
- 任务标签 + 筛选 + 存储类型/节点/存储维度统计
- 任务配置 JSON 导入/导出(集群迁移 & 灾备)
## 体验 & 可达性
- 实时事件流(SSE)+ 右下角 Toast + 历史抽屉(未读徽章)
- Dashboard 免刷新自动更新(订阅 8 类事件)
- 全局搜索(Ctrl+K,跨任务/记录/存储/节点)
- 任务依赖图(ECharts force 布局 + 状态着色)
## 合规 & 可部署
- K8s/Swarm 健康检查端点(/health liveness + /ready readiness)
- 审计日志 CSV 导出(UTF-8 BOM,Excel 兼容)
- Dashboard 多维统计(按类型/状态/节点/存储)
## 破坏性变更
- POST /backup/records/:id/restore 返回格式变更为 {restoreRecordId, ...}
(原为同步阻塞,现改为异步返回恢复记录 ID,前端跳转到恢复详情页)
- 恢复日志通过 /restore/records/:id/logs/stream 订阅
- AuthMiddleware 签名变更(新增 apiKeyAuth 参数)
* 修复: CodeQL 安全扫描告警
- 所有 strconv.ParseUint 由 64bit 改为 32bit 位宽,strconv 内置溢出检查
- hashApiKey 参数改名 rawToken 避免 CodeQL 误判为密码哈希(API Key 是 192 位
高熵 token,使用 bcrypt 会引入不必要的延迟;同时补充安全说明)
* 修复: API Key 哈希改用 HMAC-SHA256 + 应用级 pepper
- 符合 RFC 2104 标准,业界 API token 存储的推荐方案
- 数据库泄漏场景下增加离线反推难度(需同时获取二进制 pepper)
- 规避 CodeQL go/weak-sensitive-data-hashing 对裸 SHA-256 的误判
626 lines
21 KiB
Go
626 lines
21 KiB
Go
package service
|
||
|
||
import (
	"context"
	"fmt"
	"math"
	"sort"
	"sync"
	"time"

	"backupx/server/internal/apperror"
	"backupx/server/internal/model"
	"backupx/server/internal/repository"
)
|
||
|
||
// DashboardStorageUsageItem is one row of the dashboard's per-storage-target
// usage list.
type DashboardStorageUsageItem struct {
	// StorageTargetID identifies the storage target the row belongs to.
	StorageTargetID uint `json:"storageTargetId"`
	// TargetName is the target's display name; it is empty when the target
	// could not be resolved by ID.
	TargetName string `json:"targetName"`
	// TotalSize is the aggregated size reported by the record repository
	// (presumably bytes — confirm against the repository query).
	TotalSize int64 `json:"totalSize"`
}
|
||
|
||
type DashboardStats struct {
|
||
TotalTasks int64 `json:"totalTasks"`
|
||
EnabledTasks int64 `json:"enabledTasks"`
|
||
TotalRecords int64 `json:"totalRecords"`
|
||
SuccessRate float64 `json:"successRate"`
|
||
TotalBackupBytes int64 `json:"totalBackupBytes"`
|
||
LastBackupAt *time.Time `json:"lastBackupAt,omitempty"`
|
||
RecentRecords []BackupRecordSummary `json:"recentRecords"`
|
||
StorageUsage []DashboardStorageUsageItem `json:"storageUsage"`
|
||
}
|
||
|
||
type DashboardService struct {
|
||
tasks repository.BackupTaskRepository
|
||
records repository.BackupRecordRepository
|
||
targets repository.StorageTargetRepository
|
||
nodes repository.NodeRepository
|
||
masterVersion string
|
||
// slaMonitor 内部跟踪已告警的违约任务,避免每次扫描重复派发事件
|
||
slaNotified map[uint]time.Time
|
||
slaMu sync.Mutex
|
||
}
|
||
|
||
func NewDashboardService(tasks repository.BackupTaskRepository, records repository.BackupRecordRepository, targets repository.StorageTargetRepository) *DashboardService {
|
||
return &DashboardService{tasks: tasks, records: records, targets: targets, slaNotified: map[uint]time.Time{}}
|
||
}
|
||
|
||
// SetClusterDependencies 注入节点仓储与 Master 版本,启用集群概览。
|
||
func (s *DashboardService) SetClusterDependencies(nodes repository.NodeRepository, masterVersion string) {
|
||
s.nodes = nodes
|
||
s.masterVersion = masterVersion
|
||
}
|
||
|
||
func (s *DashboardService) Stats(ctx context.Context) (*DashboardStats, error) {
|
||
totalTasks, err := s.tasks.Count(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计备份任务数量", err)
|
||
}
|
||
enabledTasks, err := s.tasks.CountEnabled(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计启用任务数量", err)
|
||
}
|
||
totalRecords, err := s.records.Count(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计备份记录数量", err)
|
||
}
|
||
since := time.Now().UTC().AddDate(0, 0, -30)
|
||
recentRecordsCount, err := s.records.CountSince(ctx, since)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计最近记录数量", err)
|
||
}
|
||
successRecordsCount, err := s.records.CountSuccessSince(ctx, since)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计最近成功记录数量", err)
|
||
}
|
||
totalBackupBytes, err := s.records.SumFileSize(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计备份总量", err)
|
||
}
|
||
recentRecords, err := s.records.ListRecent(ctx, 10)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法获取最近备份记录", err)
|
||
}
|
||
targetList, err := s.targets.List(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法获取存储目标信息", err)
|
||
}
|
||
targetNames := make(map[uint]string, len(targetList))
|
||
for _, item := range targetList {
|
||
targetNames[item.ID] = item.Name
|
||
}
|
||
usageItems, err := s.records.StorageUsage(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_STATS_FAILED", "无法统计存储使用量", err)
|
||
}
|
||
storageUsage := make([]DashboardStorageUsageItem, 0, len(usageItems))
|
||
for _, item := range usageItems {
|
||
storageUsage = append(storageUsage, DashboardStorageUsageItem{StorageTargetID: item.StorageTargetID, TargetName: targetNames[item.StorageTargetID], TotalSize: item.TotalSize})
|
||
}
|
||
result := &DashboardStats{TotalTasks: totalTasks, EnabledTasks: enabledTasks, TotalRecords: totalRecords, TotalBackupBytes: totalBackupBytes, RecentRecords: make([]BackupRecordSummary, 0, len(recentRecords)), StorageUsage: storageUsage}
|
||
if recentRecordsCount > 0 {
|
||
result.SuccessRate = float64(successRecordsCount) / float64(recentRecordsCount)
|
||
}
|
||
if len(recentRecords) > 0 {
|
||
result.LastBackupAt = &recentRecords[0].StartedAt
|
||
}
|
||
for _, item := range recentRecords {
|
||
result.RecentRecords = append(result.RecentRecords, toBackupRecordSummary(&item))
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
func (s *DashboardService) Timeline(ctx context.Context, days int) ([]repository.BackupTimelinePoint, error) {
|
||
if days <= 0 {
|
||
days = 30
|
||
}
|
||
items, err := s.records.TimelineSince(ctx, time.Now().UTC().AddDate(0, 0, -days))
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_TIMELINE_FAILED", "无法获取备份时间线", err)
|
||
}
|
||
if items == nil {
|
||
items = []repository.BackupTimelinePoint{}
|
||
}
|
||
return items, nil
|
||
}
|
||
|
||
// SLAViolation describes one task that currently violates its SLA.
//
// A task is in violation when SLAHoursRPO > 0 and more than SLAHoursRPO hours
// have passed since its most recent successful backup. An enabled task that
// has never succeeded is measured from its creation time instead.
type SLAViolation struct {
	TaskID   uint   `json:"taskId"`
	TaskName string `json:"taskName"`
	NodeID   uint   `json:"nodeId"`
	NodeName string `json:"nodeName,omitempty"`
	// SLAHoursRPO is the task's RPO target in hours.
	SLAHoursRPO int `json:"slaHoursRpo"`
	// LastSuccessAt is nil when no successful backup was found.
	LastSuccessAt *time.Time `json:"lastSuccessAt,omitempty"`
	// HoursSinceLastSuccess is rounded to two decimals.
	HoursSinceLastSuccess float64 `json:"hoursSinceLastSuccess"`
	// NeverSucceeded is true when LastSuccessAt is nil.
	NeverSucceeded bool `json:"neverSucceeded"`
}
|
||
|
||
// SLAComplianceReport Dashboard 的 SLA 合规概览。
|
||
type SLAComplianceReport struct {
|
||
TotalTasksWithSLA int `json:"totalTasksWithSla"`
|
||
Compliant int `json:"compliant"`
|
||
Violated int `json:"violated"`
|
||
CoverageRate float64 `json:"coverageRate"`
|
||
Violations []SLAViolation `json:"violations"`
|
||
}
|
||
|
||
// SLACompliance 计算所有启用任务的 SLA 合规情况。
|
||
// 只考虑 Enabled=true 且 SLAHoursRPO>0 的任务。
|
||
func (s *DashboardService) SLACompliance(ctx context.Context) (*SLAComplianceReport, error) {
|
||
items, err := s.tasks.List(ctx, repository.BackupTaskListOptions{})
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_SLA_FAILED", "无法获取任务列表", err)
|
||
}
|
||
now := time.Now().UTC()
|
||
report := &SLAComplianceReport{Violations: []SLAViolation{}}
|
||
for i := range items {
|
||
task := items[i]
|
||
if !task.Enabled || task.SLAHoursRPO <= 0 {
|
||
continue
|
||
}
|
||
report.TotalTasksWithSLA++
|
||
// 查最近的成功记录作为 lastSuccessAt
|
||
successes, err := s.records.ListSuccessfulByTask(ctx, task.ID)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_SLA_FAILED", "无法获取任务成功记录", err)
|
||
}
|
||
var lastSuccessAt *time.Time
|
||
if len(successes) > 0 && successes[0].CompletedAt != nil {
|
||
lastSuccessAt = successes[0].CompletedAt
|
||
}
|
||
hoursSince := 0.0
|
||
neverSucceeded := lastSuccessAt == nil
|
||
if neverSucceeded {
|
||
hoursSince = now.Sub(task.CreatedAt).Hours()
|
||
} else {
|
||
hoursSince = now.Sub(*lastSuccessAt).Hours()
|
||
}
|
||
if hoursSince > float64(task.SLAHoursRPO) {
|
||
report.Violated++
|
||
report.Violations = append(report.Violations, SLAViolation{
|
||
TaskID: task.ID,
|
||
TaskName: task.Name,
|
||
NodeID: task.NodeID,
|
||
NodeName: task.Node.Name,
|
||
SLAHoursRPO: task.SLAHoursRPO,
|
||
LastSuccessAt: lastSuccessAt,
|
||
HoursSinceLastSuccess: roundHours(hoursSince),
|
||
NeverSucceeded: neverSucceeded,
|
||
})
|
||
} else {
|
||
report.Compliant++
|
||
}
|
||
}
|
||
if report.TotalTasksWithSLA > 0 {
|
||
report.CoverageRate = float64(report.Compliant) / float64(report.TotalTasksWithSLA)
|
||
}
|
||
return report, nil
|
||
}
|
||
|
||
// roundHours rounds value to two decimal places, with halves rounded away
// from zero.
//
// math.Round is used instead of an int conversion so that negative inputs
// round correctly and very large, NaN or infinite inputs pass through without
// an out-of-range float-to-int conversion.
func roundHours(value float64) float64 {
	return math.Round(value*100) / 100
}
|
||
|
||
// ClusterNodeSummary is the per-node entry of the dashboard cluster overview.
type ClusterNodeSummary struct {
	ID           uint   `json:"id"`
	Name         string `json:"name"`
	Hostname     string `json:"hostname"`
	Status       string `json:"status"`
	IsLocal      bool   `json:"isLocal"`
	AgentVersion string `json:"agentVersion"`
	// VersionStatus is one of: current | outdated | unknown.
	VersionStatus string    `json:"versionStatus"`
	LastSeen      time.Time `json:"lastSeen"`
	// TaskCount is the number of tasks bound to the node; it is 0 when the
	// count could not be obtained.
	TaskCount int64 `json:"taskCount"`
}
|
||
|
||
// ClusterOverview Dashboard 集群概览卡片。
|
||
type ClusterOverview struct {
|
||
MasterVersion string `json:"masterVersion"`
|
||
TotalNodes int `json:"totalNodes"`
|
||
OnlineNodes int `json:"onlineNodes"`
|
||
OfflineNodes int `json:"offlineNodes"`
|
||
OutdatedAgents int `json:"outdatedAgents"`
|
||
Nodes []ClusterNodeSummary `json:"nodes"`
|
||
}
|
||
|
||
// ClusterOverview 返回集群节点状态概览,未启用集群依赖时返回空对象。
|
||
func (s *DashboardService) ClusterOverview(ctx context.Context) (*ClusterOverview, error) {
|
||
if s.nodes == nil {
|
||
return &ClusterOverview{MasterVersion: s.masterVersion, Nodes: []ClusterNodeSummary{}}, nil
|
||
}
|
||
nodes, err := s.nodes.List(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_CLUSTER_FAILED", "无法获取节点列表", err)
|
||
}
|
||
out := &ClusterOverview{
|
||
MasterVersion: s.masterVersion,
|
||
TotalNodes: len(nodes),
|
||
Nodes: make([]ClusterNodeSummary, 0, len(nodes)),
|
||
}
|
||
for i := range nodes {
|
||
node := nodes[i]
|
||
var taskCount int64
|
||
if s.tasks != nil {
|
||
if c, err := s.tasks.CountByNodeID(ctx, node.ID); err == nil {
|
||
taskCount = c
|
||
}
|
||
}
|
||
versionStatus := resolveVersionStatus(node, s.masterVersion)
|
||
summary := ClusterNodeSummary{
|
||
ID: node.ID,
|
||
Name: node.Name,
|
||
Hostname: node.Hostname,
|
||
Status: node.Status,
|
||
IsLocal: node.IsLocal,
|
||
AgentVersion: node.AgentVer,
|
||
VersionStatus: versionStatus,
|
||
LastSeen: node.LastSeen,
|
||
TaskCount: taskCount,
|
||
}
|
||
out.Nodes = append(out.Nodes, summary)
|
||
switch node.Status {
|
||
case model.NodeStatusOnline:
|
||
out.OnlineNodes++
|
||
case model.NodeStatusOffline:
|
||
out.OfflineNodes++
|
||
}
|
||
if versionStatus == "outdated" {
|
||
out.OutdatedAgents++
|
||
}
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// BreakdownItem is a single group in a breakdown statistic.
type BreakdownItem struct {
	// Key is the machine-readable group key; Label is its display text.
	Key   string `json:"key"`
	Label string `json:"label"`
	// Count is the number of entries in the group.
	Count int64 `json:"count"`
	// TotalSize is an optional size figure, omitted from JSON when zero.
	TotalSize int64 `json:"totalSize,omitempty"`
}
|
||
|
||
// BreakdownStats 多维分组统计。
|
||
type BreakdownStats struct {
|
||
ByType []BreakdownItem `json:"byType"`
|
||
ByStatus []BreakdownItem `json:"byStatus"`
|
||
ByNode []BreakdownItem `json:"byNode"`
|
||
ByStorage []BreakdownItem `json:"byStorage"`
|
||
}
|
||
|
||
// Breakdown 返回多维分组统计。
|
||
// 仅统计最近 N 天的备份记录(默认 30 天),覆盖企业常见"近期分布"视角。
|
||
func (s *DashboardService) Breakdown(ctx context.Context, days int) (*BreakdownStats, error) {
|
||
if days <= 0 {
|
||
days = 30
|
||
}
|
||
since := time.Now().UTC().AddDate(0, 0, -days)
|
||
// 按类型分组:来自 task 维度聚合
|
||
tasks, err := s.tasks.List(ctx, repository.BackupTaskListOptions{})
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_BREAKDOWN_FAILED", "无法统计任务分组", err)
|
||
}
|
||
typeCounts := map[string]int64{}
|
||
nodeCounts := map[uint]int64{}
|
||
nodeNames := map[uint]string{0: "本机 Master"}
|
||
for _, task := range tasks {
|
||
typeCounts[task.Type]++
|
||
nodeCounts[task.NodeID]++
|
||
if task.Node.Name != "" {
|
||
nodeNames[task.NodeID] = task.Node.Name
|
||
}
|
||
}
|
||
result := &BreakdownStats{
|
||
ByType: makeBreakdown(typeCounts, typeLabel),
|
||
ByNode: makeBreakdownByUint(nodeCounts, nodeNames, "节点 #"),
|
||
ByStatus: []BreakdownItem{},
|
||
ByStorage: []BreakdownItem{},
|
||
}
|
||
// 按状态(最近 days 天记录)
|
||
statusCounts, err := s.countRecordsByStatus(ctx, since)
|
||
if err == nil {
|
||
result.ByStatus = statusCounts
|
||
}
|
||
// 按存储目标(含字节数)
|
||
if s.records != nil {
|
||
storageItems, _ := s.records.StorageUsage(ctx)
|
||
if s.targets != nil {
|
||
targetNames := map[uint]string{}
|
||
if targetList, err := s.targets.List(ctx); err == nil {
|
||
for _, t := range targetList {
|
||
targetNames[t.ID] = t.Name
|
||
}
|
||
}
|
||
for _, item := range storageItems {
|
||
name := targetNames[item.StorageTargetID]
|
||
if name == "" {
|
||
name = fmt.Sprintf("存储 #%d", item.StorageTargetID)
|
||
}
|
||
result.ByStorage = append(result.ByStorage, BreakdownItem{
|
||
Key: fmt.Sprintf("%d", item.StorageTargetID),
|
||
Label: name,
|
||
TotalSize: item.TotalSize,
|
||
})
|
||
}
|
||
}
|
||
}
|
||
return result, nil
|
||
}
|
||
|
||
// countRecordsByStatus 最近 since 起的记录按状态分组。
|
||
func (s *DashboardService) countRecordsByStatus(ctx context.Context, since time.Time) ([]BreakdownItem, error) {
|
||
running, _ := s.records.List(ctx, repository.BackupRecordListOptions{Status: "running", DateFrom: &since})
|
||
success, _ := s.records.List(ctx, repository.BackupRecordListOptions{Status: "success", DateFrom: &since})
|
||
failed, _ := s.records.List(ctx, repository.BackupRecordListOptions{Status: "failed", DateFrom: &since})
|
||
return []BreakdownItem{
|
||
{Key: "success", Label: "成功", Count: int64(len(success))},
|
||
{Key: "failed", Label: "失败", Count: int64(len(failed))},
|
||
{Key: "running", Label: "执行中", Count: int64(len(running))},
|
||
}, nil
|
||
}
|
||
|
||
// makeBreakdown 把 map[string]int64 转为排序好的 BreakdownItem 列表。
|
||
func makeBreakdown(counts map[string]int64, labelFn func(string) string) []BreakdownItem {
|
||
items := make([]BreakdownItem, 0, len(counts))
|
||
for k, v := range counts {
|
||
label := k
|
||
if labelFn != nil {
|
||
label = labelFn(k)
|
||
}
|
||
items = append(items, BreakdownItem{Key: k, Label: label, Count: v})
|
||
}
|
||
// 按 Count 降序
|
||
for i := 0; i < len(items); i++ {
|
||
for j := i + 1; j < len(items); j++ {
|
||
if items[j].Count > items[i].Count {
|
||
items[i], items[j] = items[j], items[i]
|
||
}
|
||
}
|
||
}
|
||
return items
|
||
}
|
||
|
||
func makeBreakdownByUint(counts map[uint]int64, names map[uint]string, fallback string) []BreakdownItem {
|
||
items := make([]BreakdownItem, 0, len(counts))
|
||
for k, v := range counts {
|
||
label := names[k]
|
||
if label == "" {
|
||
label = fmt.Sprintf("%s%d", fallback, k)
|
||
}
|
||
items = append(items, BreakdownItem{Key: fmt.Sprintf("%d", k), Label: label, Count: v})
|
||
}
|
||
for i := 0; i < len(items); i++ {
|
||
for j := i + 1; j < len(items); j++ {
|
||
if items[j].Count > items[i].Count {
|
||
items[i], items[j] = items[j], items[i]
|
||
}
|
||
}
|
||
}
|
||
return items
|
||
}
|
||
|
||
// typeLabel maps a backup task type key to its display label. Unknown keys are
// returned unchanged.
func typeLabel(key string) string {
	label := key
	switch key {
	case "file":
		label = "文件"
	case "mysql":
		label = "MySQL"
	case "postgresql":
		label = "PostgreSQL"
	case "sqlite":
		label = "SQLite"
	case "saphana":
		label = "SAP HANA"
	}
	return label
}
|
||
|
||
// NodePerformance holds one node's execution metrics over a recent window.
// It lets operators spot at a glance which node is busy, failing or slow.
type NodePerformance struct {
	NodeID   uint   `json:"nodeId"`
	NodeName string `json:"nodeName"`
	IsLocal  bool   `json:"isLocal"`
	// TotalRuns counts every record in the window, whatever its status.
	TotalRuns   int `json:"totalRuns"`
	SuccessRuns int `json:"successRuns"`
	FailedRuns  int `json:"failedRuns"`
	// SuccessRate is SuccessRuns / TotalRuns, or 0 without runs.
	SuccessRate float64 `json:"successRate"`
	// TotalBytes sums the file size of successful runs.
	TotalBytes int64 `json:"totalBytes"`
	// AvgDurationSecs averages the duration of successful runs, or 0 if none.
	AvgDurationSecs float64 `json:"avgDurationSecs"`
}
|
||
|
||
// NodePerformance 统计最近 days 天各节点的执行指标。
|
||
// 返回按成功率降序排列。未注入 nodeRepo 时返回空。
|
||
func (s *DashboardService) NodePerformance(ctx context.Context, days int) ([]NodePerformance, error) {
|
||
if s.nodes == nil || s.records == nil {
|
||
return []NodePerformance{}, nil
|
||
}
|
||
if days <= 0 {
|
||
days = 30
|
||
}
|
||
since := time.Now().UTC().AddDate(0, 0, -days)
|
||
nodes, err := s.nodes.List(ctx)
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_NODE_PERF_FAILED", "无法获取节点列表", err)
|
||
}
|
||
// records 里没有直接的 node_id(通过 BackupTask.NodeID 关联);
|
||
// 先取近 N 天全部记录,按 record.NodeID 聚合(该字段已在第二轮加入)。
|
||
items, err := s.records.List(ctx, repository.BackupRecordListOptions{DateFrom: &since})
|
||
if err != nil {
|
||
return nil, apperror.Internal("DASHBOARD_NODE_PERF_FAILED", "无法获取备份记录", err)
|
||
}
|
||
bucket := map[uint]*nodeAgg{}
|
||
for i := range items {
|
||
r := items[i]
|
||
a, ok := bucket[r.NodeID]
|
||
if !ok {
|
||
a = &nodeAgg{}
|
||
bucket[r.NodeID] = a
|
||
}
|
||
a.total++
|
||
switch r.Status {
|
||
case model.BackupRecordStatusSuccess:
|
||
a.success++
|
||
a.bytes += r.FileSize
|
||
a.durSecs += int64(r.DurationSeconds)
|
||
case model.BackupRecordStatusFailed:
|
||
a.failed++
|
||
}
|
||
}
|
||
out := make([]NodePerformance, 0, len(nodes)+1)
|
||
// 确保"本机 Master"(id=0) 也被纳入,即便无记录
|
||
seenLocal := false
|
||
for _, n := range nodes {
|
||
a := bucket[n.ID]
|
||
if a == nil {
|
||
a = &nodeAgg{}
|
||
}
|
||
perf := buildNodePerformance(n.ID, n.Name, n.IsLocal, a)
|
||
out = append(out, perf)
|
||
if n.ID == 0 || n.IsLocal {
|
||
seenLocal = true
|
||
}
|
||
}
|
||
// 若 bucket 里还有 id=0(未注册的 Master)或记录绑定的 node 已被删,追加"其他"
|
||
if a, ok := bucket[0]; ok && !seenLocal {
|
||
out = append(out, buildNodePerformance(0, "本机 Master", true, a))
|
||
}
|
||
// 按成功率降序,其次按 totalRuns 降序
|
||
for i := 0; i < len(out); i++ {
|
||
for j := i + 1; j < len(out); j++ {
|
||
if out[j].SuccessRate > out[i].SuccessRate ||
|
||
(out[j].SuccessRate == out[i].SuccessRate && out[j].TotalRuns > out[i].TotalRuns) {
|
||
out[i], out[j] = out[j], out[i]
|
||
}
|
||
}
|
||
}
|
||
return out, nil
|
||
}
|
||
|
||
// nodeAgg is the intermediate per-node accumulator used while computing node
// performance statistics.
type nodeAgg struct {
	// Run counters: all records, successful ones, failed ones.
	total, success, failed int
	// bytes and durSecs are summed over successful runs only.
	bytes   int64
	durSecs int64
}
|
||
|
||
func buildNodePerformance(nodeID uint, nodeName string, isLocal bool, a *nodeAgg) NodePerformance {
|
||
rate := 0.0
|
||
if a.total > 0 {
|
||
rate = float64(a.success) / float64(a.total)
|
||
}
|
||
avgDur := 0.0
|
||
if a.success > 0 {
|
||
avgDur = float64(a.durSecs) / float64(a.success)
|
||
}
|
||
return NodePerformance{
|
||
NodeID: nodeID,
|
||
NodeName: nodeName,
|
||
IsLocal: isLocal,
|
||
TotalRuns: a.total,
|
||
SuccessRuns: a.success,
|
||
FailedRuns: a.failed,
|
||
SuccessRate: rate,
|
||
TotalBytes: a.bytes,
|
||
AvgDurationSecs: avgDur,
|
||
}
|
||
}
|
||
|
||
// resolveVersionStatus 判断单个节点的版本健康度标签。
|
||
func resolveVersionStatus(node model.Node, masterVersion string) string {
|
||
if node.IsLocal {
|
||
return "current"
|
||
}
|
||
if node.AgentVer == "" {
|
||
return "unknown"
|
||
}
|
||
if isClusterVersionOutdated(node.AgentVer, masterVersion) {
|
||
return "outdated"
|
||
}
|
||
return "current"
|
||
}
|
||
|
||
// isClusterVersionOutdated 内部版本比较(与 cluster_version.go 语义一致)。
|
||
// 独立实现避免 service 包内跨文件耦合测试。
|
||
func isClusterVersionOutdated(agent, master string) bool {
|
||
return isVersionOutdated(agent, master)
|
||
}
|
||
|
||
// StartSLAMonitor 后台定时扫描 SLA 违约并通过 event dispatcher 派发 sla_violation 事件。
|
||
// 防骚扰:同一任务在 resetInterval 内只派发一次(避免每分钟轰炸)。
|
||
// - scanInterval:扫描频率(建议 15m)
|
||
// - resetInterval:同任务再次告警的最短间隔(建议 6h)
|
||
//
|
||
// ctx 被取消时退出。dispatcher 为 nil 时退化为仅扫描不告警(保持兼容)。
|
||
func (s *DashboardService) StartSLAMonitor(ctx context.Context, dispatcher EventDispatcher, scanInterval, resetInterval time.Duration) {
|
||
if scanInterval <= 0 {
|
||
scanInterval = 15 * time.Minute
|
||
}
|
||
if resetInterval <= 0 {
|
||
resetInterval = 6 * time.Hour
|
||
}
|
||
ticker := time.NewTicker(scanInterval)
|
||
go func() {
|
||
defer ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case <-ticker.C:
|
||
s.scanAndDispatchSLA(ctx, dispatcher, resetInterval)
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
|
||
// scanAndDispatchSLA 执行一次 SLA 违约扫描并按需派发事件。
|
||
func (s *DashboardService) scanAndDispatchSLA(ctx context.Context, dispatcher EventDispatcher, resetInterval time.Duration) {
|
||
report, err := s.SLACompliance(ctx)
|
||
if err != nil || report == nil {
|
||
return
|
||
}
|
||
now := time.Now().UTC()
|
||
s.slaMu.Lock()
|
||
defer s.slaMu.Unlock()
|
||
// 保留当前仍然违约的任务,清理已恢复的记忆
|
||
active := map[uint]time.Time{}
|
||
violatingIDs := map[uint]bool{}
|
||
for _, v := range report.Violations {
|
||
violatingIDs[v.TaskID] = true
|
||
}
|
||
for taskID, when := range s.slaNotified {
|
||
if violatingIDs[taskID] {
|
||
active[taskID] = when
|
||
}
|
||
}
|
||
s.slaNotified = active
|
||
|
||
for _, v := range report.Violations {
|
||
last, seen := s.slaNotified[v.TaskID]
|
||
if seen && now.Sub(last) < resetInterval {
|
||
continue
|
||
}
|
||
if dispatcher != nil {
|
||
title := "BackupX SLA 违约"
|
||
statusText := fmt.Sprintf("%.1f 小时", v.HoursSinceLastSuccess)
|
||
if v.NeverSucceeded {
|
||
statusText = "从未成功"
|
||
}
|
||
body := fmt.Sprintf("任务:%s\nRPO 目标:%d 小时\n距最近成功:%s", v.TaskName, v.SLAHoursRPO, statusText)
|
||
fields := map[string]any{
|
||
"taskId": v.TaskID,
|
||
"taskName": v.TaskName,
|
||
"nodeId": v.NodeID,
|
||
"nodeName": v.NodeName,
|
||
"slaHoursRpo": v.SLAHoursRPO,
|
||
"hoursSinceLastSuccess": v.HoursSinceLastSuccess,
|
||
"neverSucceeded": v.NeverSucceeded,
|
||
}
|
||
_ = dispatcher.DispatchEvent(ctx, model.NotificationEventSLAViolation, title, body, fields)
|
||
}
|
||
s.slaNotified[v.TaskID] = now
|
||
}
|
||
}
|