mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-05-06 20:02:41 +08:00
* 功能: v2.1 可观测性与流控 — Prometheus + 节点带宽 + 审计 Webhook 核心能力: - Prometheus /metrics 端点:11 类指标(任务/存储/节点/SLA/验证/恢复/复制) - 节点级带宽限速生效:model.Node.BandwidthLimit 覆盖全局默认 - 审计日志 Webhook 外输:HMAC-SHA256 签名,配合 SIEM 合规留档 实现: - server/internal/metrics/ 独立 Registry + 异步 Gauge Collector(30s) - backup/restore/verify/replication 服务注入 metrics 钩子,nil 安全 - resolveProviderForNode() 按 task.NodeID 解析 BandwidthLimit - AuditService.SetWebhook + 动态 settings 推送,无需重启 测试: - metrics/registry_test.go: 注册/采集/nil safety/HTTP handler - service/audit_service_webhook_test.go: 签名正确性/异步投递/禁用路径 - go test ./... 全部通过 * chore: 触发 CodeQL 扫描
153 lines
4.3 KiB
Go
153 lines
4.3 KiB
Go
package metrics
|
||
|
||
import (
|
||
"context"
|
||
"time"
|
||
|
||
"backupx/server/internal/model"
|
||
"backupx/server/internal/repository"
|
||
)
|
||
|
||
// SampleSource 抽象 Collector 需要的仓储访问,便于单测替换。
|
||
type SampleSource interface {
|
||
ListStorageTargets(ctx context.Context) ([]model.StorageTarget, error)
|
||
StorageUsage(ctx context.Context) ([]repository.BackupStorageUsageItem, error)
|
||
ListNodes(ctx context.Context) ([]model.Node, error)
|
||
CountSLABreach(ctx context.Context) (int, error)
|
||
}
|
||
|
||
// repoSource 把 repository 适配到 SampleSource。
|
||
type repoSource struct {
|
||
targets repository.StorageTargetRepository
|
||
records repository.BackupRecordRepository
|
||
nodes repository.NodeRepository
|
||
tasks repository.BackupTaskRepository
|
||
now func() time.Time
|
||
}
|
||
|
||
// NewRepoSource 用仓储实例构造 SampleSource。
|
||
func NewRepoSource(
|
||
targets repository.StorageTargetRepository,
|
||
records repository.BackupRecordRepository,
|
||
nodes repository.NodeRepository,
|
||
tasks repository.BackupTaskRepository,
|
||
) SampleSource {
|
||
return &repoSource{
|
||
targets: targets,
|
||
records: records,
|
||
nodes: nodes,
|
||
tasks: tasks,
|
||
now: func() time.Time { return time.Now().UTC() },
|
||
}
|
||
}
|
||
|
||
func (s *repoSource) ListStorageTargets(ctx context.Context) ([]model.StorageTarget, error) {
|
||
return s.targets.List(ctx)
|
||
}
|
||
|
||
func (s *repoSource) StorageUsage(ctx context.Context) ([]repository.BackupStorageUsageItem, error) {
|
||
return s.records.StorageUsage(ctx)
|
||
}
|
||
|
||
func (s *repoSource) ListNodes(ctx context.Context) ([]model.Node, error) {
|
||
return s.nodes.List(ctx)
|
||
}
|
||
|
||
// CountSLABreach 统计当前违反 RPO 的任务:
|
||
// - 任务启用且配置了 SLAHoursRPO > 0
|
||
// - 最近一次成功备份距今超出 SLA 时间窗,或从未成功过
|
||
func (s *repoSource) CountSLABreach(ctx context.Context) (int, error) {
|
||
tasks, err := s.tasks.List(ctx, repository.BackupTaskListOptions{})
|
||
if err != nil {
|
||
return 0, err
|
||
}
|
||
now := s.now()
|
||
count := 0
|
||
for i := range tasks {
|
||
task := &tasks[i]
|
||
if task.SLAHoursRPO <= 0 || !task.Enabled {
|
||
continue
|
||
}
|
||
threshold := now.Add(-time.Duration(task.SLAHoursRPO) * time.Hour)
|
||
if task.LastRunAt == nil || task.LastRunAt.Before(threshold) {
|
||
count++
|
||
}
|
||
}
|
||
return count, nil
|
||
}
|
||
|
||
// Collector 周期性采集 gauge 类指标(存储用量、节点在线、SLA 违约)。
|
||
// 用后台 goroutine 驱动,避免在 /metrics 请求路径做慢 IO。
|
||
type Collector struct {
|
||
metrics *Metrics
|
||
source SampleSource
|
||
interval time.Duration
|
||
}
|
||
|
||
// NewCollector 创建周期采集器。interval=0 走默认 30s。
|
||
func NewCollector(m *Metrics, source SampleSource, interval time.Duration) *Collector {
|
||
if interval <= 0 {
|
||
interval = 30 * time.Second
|
||
}
|
||
return &Collector{metrics: m, source: source, interval: interval}
|
||
}
|
||
|
||
// Start 在后台运行采集循环;随 ctx 取消而终止。
|
||
// 启动时立即采一次,之后按 interval 轮询。
|
||
func (c *Collector) Start(ctx context.Context) {
|
||
if c == nil || c.metrics == nil || c.source == nil {
|
||
return
|
||
}
|
||
go func() {
|
||
c.collect(ctx)
|
||
ticker := time.NewTicker(c.interval)
|
||
defer ticker.Stop()
|
||
for {
|
||
select {
|
||
case <-ctx.Done():
|
||
return
|
||
case <-ticker.C:
|
||
c.collect(ctx)
|
||
}
|
||
}
|
||
}()
|
||
}
|
||
|
||
// collect 执行一次采样;单轮失败不影响下次。
|
||
func (c *Collector) collect(ctx context.Context) {
|
||
// 存储用量:按 StorageTargetID 聚合 file_size,对应 target name/type
|
||
if targets, err := c.source.ListStorageTargets(ctx); err == nil {
|
||
nameByID := make(map[uint]string, len(targets))
|
||
typeByID := make(map[uint]string, len(targets))
|
||
for i := range targets {
|
||
nameByID[targets[i].ID] = targets[i].Name
|
||
typeByID[targets[i].ID] = targets[i].Type
|
||
}
|
||
if usage, uerr := c.source.StorageUsage(ctx); uerr == nil {
|
||
c.metrics.ResetStorageUsed()
|
||
for _, item := range usage {
|
||
name := nameByID[item.StorageTargetID]
|
||
if name == "" {
|
||
continue
|
||
}
|
||
c.metrics.SetStorageUsed(name, typeByID[item.StorageTargetID], item.TotalSize)
|
||
}
|
||
}
|
||
}
|
||
// 节点在线状态:role 约定为 master / agent
|
||
if nodes, err := c.source.ListNodes(ctx); err == nil {
|
||
c.metrics.ResetNodeOnline()
|
||
for i := range nodes {
|
||
n := &nodes[i]
|
||
role := "agent"
|
||
if n.IsLocal {
|
||
role = "master"
|
||
}
|
||
c.metrics.SetNodeOnline(n.Name, role, n.Status == model.NodeStatusOnline)
|
||
}
|
||
}
|
||
if breach, err := c.source.CountSLABreach(ctx); err == nil {
|
||
c.metrics.SetSLABreach(breach)
|
||
}
|
||
}
|