Files
BackupX/server/internal/metrics/registry.go
Wu Qing 1b73f19eb1 功能: v2.1 可观测性与流控 (#47)
* 功能: v2.1 可观测性与流控 — Prometheus + 节点带宽 + 审计 Webhook

核心能力:
- Prometheus /metrics 端点:11 类指标(任务/存储/节点/SLA/验证/恢复/复制)
- 节点级带宽限速生效:model.Node.BandwidthLimit 覆盖全局默认
- 审计日志 Webhook 外输:HMAC-SHA256 签名,配合 SIEM 合规留档

实现:
- server/internal/metrics/  独立 Registry + 异步 Gauge Collector(30s)
- backup/restore/verify/replication 服务注入 metrics 钩子,nil 安全
- resolveProviderForNode() 按 task.NodeID 解析 BandwidthLimit
- AuditService.SetWebhook + 动态 settings 推送,无需重启

测试:
- metrics/registry_test.go: 注册/采集/nil safety/HTTP handler
- service/audit_service_webhook_test.go: 签名正确性/异步投递/禁用路径
- go test ./... 全部通过

* chore: 触发 CodeQL 扫描
2026-04-20 23:26:04 +08:00

226 lines
7.0 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
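// Package metrics exposes BackupX's Prometheus collectors.
//
// Design notes:
//   - Uses a dedicated Registry to avoid confusion with the Go runtime metrics in the default registry.
//   - Every Counter/Gauge/Histogram is prefixed with backupx_, following Prometheus naming conventions.
//   - All metrics support the zero value: when no Metrics is injected, method calls are no-ops and do not panic.
//   - Components depend only on this package; it never imports service/repository back, which avoids import cycles.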
package metrics
import (
"net/http"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
)
// Metrics bundles every collector exposed by this package. It is meant to be
// assembled once by the app layer and injected into services as needed.
type Metrics struct {
	// registry is the private Prometheus registry that all collectors below
	// are registered with and that Handler serves.
	registry *prometheus.Registry
	// Count of task runs; labels: status, task_type.
	TaskRunTotal *prometheus.CounterVec
	// Distribution of task run durations; labels: task_type.
	TaskRunDuration *prometheus.HistogramVec
	// Bytes produced by tasks; labels: task_type.
	TaskBytesTotal *prometheus.CounterVec
	// Number of tasks currently running.
	TaskRunningGauge prometheus.Gauge
	// Storage target usage; labels: target_name, target_type.
	StorageUsedBytes *prometheus.GaugeVec
	// Node online state; labels: node_name, role; value: 0 or 1.
	NodeOnline *prometheus.GaugeVec
	// Verification drill results; labels: status.
	VerifyRunTotal *prometheus.CounterVec
	// Restore operation results; labels: status.
	RestoreRunTotal *prometheus.CounterVec
	// Replica copy results; labels: status.
	ReplicationRunTotal *prometheus.CounterVec
	// Number of SLA breaches (gauge).
	SLABreachGauge prometheus.Gauge
	// Application info; label: version.
	AppInfo *prometheus.GaugeVec
}
// New builds every collector, registers it with a fresh private registry and
// returns the resulting Metrics.
//
// version is published as the "version" label of backupx_app_info, whose
// value is set to 1.
//
// New panics if any registration fails: a collector that cannot be registered
// is a start-up programming error with no sensible fallback.
func New(version string) *Metrics {
	reg := prometheus.NewRegistry()
	// Expose the standard Go runtime and process metrics from this registry too.
	reg.MustRegister(collectors.NewGoCollector())
	reg.MustRegister(collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}))

	m := &Metrics{
		registry: reg,
		TaskRunTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "backupx_task_run_total",
			Help: "备份任务执行总数,按状态和任务类型细分",
		}, []string{"status", "task_type"}),
		TaskRunDuration: prometheus.NewHistogramVec(prometheus.HistogramOpts{
			Name: "backupx_task_run_duration_seconds",
			Help: "备份任务耗时分布",
			// Bucket upper bounds in seconds, from 1 second up to 2 hours.
			Buckets: []float64{1, 5, 15, 30, 60, 120, 300, 600, 1800, 3600, 7200},
		}, []string{"task_type"}),
		TaskBytesTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "backupx_task_bytes_total",
			Help: "备份任务累计产出字节数",
		}, []string{"task_type"}),
		TaskRunningGauge: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "backupx_task_running",
			Help: "当前正在执行的备份任务数",
		}),
		StorageUsedBytes: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "backupx_storage_used_bytes",
			Help: "存储目标已用字节数",
		}, []string{"target_name", "target_type"}),
		NodeOnline: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "backupx_node_online",
			Help: "集群节点在线状态(1 在线 / 0 离线)",
		}, []string{"node_name", "role"}),
		VerifyRunTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "backupx_verify_run_total",
			Help: "备份验证演练执行总数",
		}, []string{"status"}),
		RestoreRunTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "backupx_restore_run_total",
			Help: "恢复操作执行总数",
		}, []string{"status"}),
		ReplicationRunTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
			Name: "backupx_replication_run_total",
			Help: "备份副本复制执行总数",
		}, []string{"status"}),
		SLABreachGauge: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "backupx_sla_breach_tasks",
			Help: "当前违反 SLA/RPO 的任务数",
		}),
		AppInfo: prometheus.NewGaugeVec(prometheus.GaugeOpts{
			Name: "backupx_app_info",
			Help: "BackupX 应用元信息(恒为 1,通过 label 暴露版本号)",
		}, []string{"version"}),
	}

	reg.MustRegister(
		m.TaskRunTotal,
		m.TaskRunDuration,
		m.TaskBytesTotal,
		m.TaskRunningGauge,
		m.StorageUsedBytes,
		m.NodeOnline,
		m.VerifyRunTotal,
		m.RestoreRunTotal,
		m.ReplicationRunTotal,
		m.SLABreachGauge,
		m.AppInfo,
	)

	// The info metric carries its payload in the label; the sample value is always 1.
	m.AppInfo.WithLabelValues(version).Set(1)
	return m
}
// Handler returns the http.Handler intended to serve the /metrics endpoint.
//
// The handler gathers from this Metrics' own registry only, with OpenMetrics
// negotiation disabled. On a nil receiver it returns a handler that answers
// every request with 503 and the body "metrics disabled".
func (m *Metrics) Handler() http.Handler {
	if m == nil {
		disabled := func(w http.ResponseWriter, _ *http.Request) {
			http.Error(w, "metrics disabled", http.StatusServiceUnavailable)
		}
		return http.HandlerFunc(disabled)
	}

	opts := promhttp.HandlerOpts{EnableOpenMetrics: false}
	return promhttp.HandlerFor(m.registry, opts)
}
// ObserveTaskRun records the outcome of a single task run.
//
// It increments the run counter for (status, taskType), observes durationSec
// in the duration histogram for taskType, and, only when bytes is positive,
// adds bytes to the byte counter for taskType. Common status values are
// success / failed / cancelled. Safe to call on a nil receiver.
func (m *Metrics) ObserveTaskRun(taskType, status string, durationSec float64, bytes int64) {
	if m == nil {
		return
	}

	runs := m.TaskRunTotal.WithLabelValues(status, taskType)
	runs.Inc()

	elapsed := m.TaskRunDuration.WithLabelValues(taskType)
	elapsed.Observe(durationSec)

	// Zero or negative sizes carry no information for a counter; skip them.
	if bytes <= 0 {
		return
	}
	produced := m.TaskBytesTotal.WithLabelValues(taskType)
	produced.Add(float64(bytes))
}
// IncTaskRunning increments the running-task gauge by one. Pair every call
// with DecTaskRunning so the gauge reflects the number of tasks in flight.
// Safe to call on a nil receiver.
func (m *Metrics) IncTaskRunning() {
	if m != nil {
		m.TaskRunningGauge.Inc()
	}
}
// DecTaskRunning decrements the running-task gauge by one; it is the
// counterpart of IncTaskRunning. Safe to call on a nil receiver.
func (m *Metrics) DecTaskRunning() {
	if m != nil {
		m.TaskRunningGauge.Dec()
	}
}
// ObserveRestore counts one restore operation under the given status label.
// Like ObserveVerify and ObserveReplication, it is safe on a nil receiver:
// without an injected Metrics the call silently does nothing instead of
// panicking.
func (m *Metrics) ObserveRestore(status string) {
	if m != nil {
		counter := m.RestoreRunTotal.WithLabelValues(status)
		counter.Inc()
	}
}
// ObserveVerify counts one verification run under the given status label.
// Safe to call on a nil receiver.
func (m *Metrics) ObserveVerify(status string) {
	if m != nil {
		counter := m.VerifyRunTotal.WithLabelValues(status)
		counter.Inc()
	}
}
// ObserveReplication counts one replication run under the given status label.
// Safe to call on a nil receiver.
func (m *Metrics) ObserveReplication(status string) {
	if m != nil {
		counter := m.ReplicationRunTotal.WithLabelValues(status)
		counter.Inc()
	}
}
// SetStorageUsed overwrites the usage gauge of the storage target identified
// by (name, targetType) with bytes. The caller is responsible for sampling
// periodically. Safe to call on a nil receiver.
func (m *Metrics) SetStorageUsed(name, targetType string, bytes int64) {
	if m != nil {
		used := float64(bytes)
		m.StorageUsedBytes.WithLabelValues(name, targetType).Set(used)
	}
}
// SetNodeOnline updates the online gauge of the node identified by
// (name, role): 1 when online is true, 0 otherwise. Safe to call on a nil
// receiver.
func (m *Metrics) SetNodeOnline(name, role string, online bool) {
	if m == nil {
		return
	}

	gauge := m.NodeOnline.WithLabelValues(name, role)
	if online {
		gauge.Set(1)
		return
	}
	gauge.Set(0)
}
// ResetNodeOnline clears the node-online gauge vector so that nodes which
// have been deleted do not leave stale series behind. Safe to call on a nil
// receiver.
func (m *Metrics) ResetNodeOnline() {
	if m != nil {
		m.NodeOnline.Reset()
	}
}
// ResetStorageUsed clears the storage-usage gauge vector. Safe to call on a
// nil receiver.
func (m *Metrics) ResetStorageUsed() {
	if m != nil {
		m.StorageUsedBytes.Reset()
	}
}
// SetSLABreach overwrites the SLA-breach gauge with count, the number of
// tasks currently in breach. Safe to call on a nil receiver.
func (m *Metrics) SetSLABreach(count int) {
	if m != nil {
		breaches := float64(count)
		m.SLABreachGauge.Set(breaches)
	}
}