功能: v2.1 可观测性与流控 (#47)

* 功能: v2.1 可观测性与流控 — Prometheus + 节点带宽 + 审计 Webhook

核心能力:
- Prometheus /metrics 端点:11 类指标(任务/存储/节点/SLA/验证/恢复/复制)
- 节点级带宽限速生效:model.Node.BandwidthLimit 覆盖全局默认
- 审计日志 Webhook 外输:HMAC-SHA256 签名,配合 SIEM 合规留档

实现:
- server/internal/metrics/  独立 Registry + 异步 Gauge Collector(30s)
- backup/restore/verify/replication 服务注入 metrics 钩子,nil 安全
- resolveProviderForNode() 按 task.NodeID 解析 BandwidthLimit
- AuditService.SetWebhook + 动态 settings 推送,无需重启

测试:
- metrics/registry_test.go: 注册/采集/nil safety/HTTP handler
- service/audit_service_webhook_test.go: 签名正确性/异步投递/禁用路径
- go test ./... 全部通过

* chore: 触发 CodeQL 扫描
This commit is contained in:
Wu Qing
2026-04-20 23:26:04 +08:00
committed by GitHub
parent f7596bd319
commit 5021fe665e
16 changed files with 970 additions and 14 deletions

View File

@@ -1,9 +1,18 @@
package service
import (
"bytes"
"context"
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"fmt"
"log"
"net/http"
"strings"
"sync"
"time"
"backupx/server/internal/apperror"
"backupx/server/internal/model"
@@ -25,10 +34,39 @@ type AuditEntry struct {
type AuditService struct {
repo repository.AuditLogRepository
// webhook 外输配置(可选)
webhookMu sync.RWMutex
webhookURL string
webhookSecret string
httpClient *http.Client
}
func NewAuditService(repo repository.AuditLogRepository) *AuditService {
return &AuditService{repo: repo}
return &AuditService{
repo: repo,
httpClient: &http.Client{
Timeout: 3 * time.Second, // 短超时:审计 webhook 不应拖慢业务
},
}
}
// SetWebhook 动态配置审计事件转发 URL 与签名密钥。
// - url 为空字符串时禁用转发
// - secret 非空时对 payload 计算 HMAC-SHA256作为 X-BackupX-Signature header
//
// 适用场景:
// - 企业 SIEM 集成Splunk HEC、ELK、Loki
// - 安全审计留痕到第三方 WORM 存储
// - 合规日志归档GDPR / SOC2
func (s *AuditService) SetWebhook(url, secret string) {
if s == nil {
return
}
s.webhookMu.Lock()
defer s.webhookMu.Unlock()
s.webhookURL = strings.TrimSpace(url)
s.webhookSecret = strings.TrimSpace(secret)
}
// Record 异步 fire-and-forget 写入审计日志,不阻塞业务逻辑
@@ -51,9 +89,65 @@ func (s *AuditService) Record(entry AuditEntry) {
if err := s.repo.Create(context.Background(), record); err != nil {
log.Printf("[audit] failed to write audit log: %v", err)
}
s.fireWebhook(record)
}()
}
// fireWebhook 异步向外部系统转发审计事件。失败降级到本地日志,永不影响主流程。
func (s *AuditService) fireWebhook(record *model.AuditLog) {
if s == nil {
return
}
s.webhookMu.RLock()
url := s.webhookURL
secret := s.webhookSecret
s.webhookMu.RUnlock()
if url == "" {
return
}
payload := map[string]any{
"eventType": "audit.log",
"occurredAt": record.CreatedAt.UTC().Format(time.RFC3339),
"actor": map[string]any{
"userId": record.UserID,
"username": record.Username,
},
"category": record.Category,
"action": record.Action,
"targetType": record.TargetType,
"targetId": record.TargetID,
"targetName": record.TargetName,
"detail": record.Detail,
"clientIp": record.ClientIP,
}
body, err := json.Marshal(payload)
if err != nil {
log.Printf("[audit] webhook marshal failed: %v", err)
return
}
req, err := http.NewRequestWithContext(context.Background(), http.MethodPost, url, bytes.NewReader(body))
if err != nil {
log.Printf("[audit] webhook build request failed: %v", err)
return
}
req.Header.Set("Content-Type", "application/json")
req.Header.Set("User-Agent", "BackupX-Audit/1.0")
if secret != "" {
mac := hmac.New(sha256.New, []byte(secret))
mac.Write(body)
req.Header.Set("X-BackupX-Signature", "sha256="+hex.EncodeToString(mac.Sum(nil)))
}
resp, err := s.httpClient.Do(req)
if err != nil {
log.Printf("[audit] webhook POST failed: %v", err)
return
}
defer resp.Body.Close()
if resp.StatusCode >= 400 {
log.Printf("[audit] webhook returned status %d", resp.StatusCode)
}
}
// List 分页查询审计日志
func (s *AuditService) List(ctx context.Context, category string, limit, offset int) (*repository.AuditLogListResult, error) {
result, err := s.repo.List(ctx, repository.AuditLogListOptions{

View File

@@ -0,0 +1,129 @@
package service
import (
"context"
"crypto/hmac"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"io"
"net/http"
"net/http/httptest"
"sync"
"sync/atomic"
"testing"
"time"
"backupx/server/internal/model"
"backupx/server/internal/repository"
)
// fakeAuditRepo 用通道同步等待异步写入,避免 sleep。
type fakeAuditRepo struct {
mu sync.Mutex
logs []model.AuditLog
created chan struct{}
}
func newFakeAuditRepo() *fakeAuditRepo {
return &fakeAuditRepo{created: make(chan struct{}, 4)}
}
func (r *fakeAuditRepo) Create(_ context.Context, log *model.AuditLog) error {
r.mu.Lock()
log.CreatedAt = time.Now().UTC()
r.logs = append(r.logs, *log)
r.mu.Unlock()
r.created <- struct{}{}
return nil
}
func (r *fakeAuditRepo) List(context.Context, repository.AuditLogListOptions) (*repository.AuditLogListResult, error) {
return &repository.AuditLogListResult{}, nil
}
func (r *fakeAuditRepo) ListAll(context.Context, repository.AuditLogListOptions) ([]model.AuditLog, error) {
return nil, nil
}
func TestAuditService_WebhookDeliversSignedPayload(t *testing.T) {
var hits atomic.Int32
var got struct {
sig string
payload map[string]any
received chan struct{}
}
got.received = make(chan struct{}, 1)
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
hits.Add(1)
body, _ := io.ReadAll(r.Body)
got.sig = r.Header.Get("X-BackupX-Signature")
_ = json.Unmarshal(body, &got.payload)
// 验证 HMAC 正确
mac := hmac.New(sha256.New, []byte("s3cret"))
mac.Write(body)
expected := "sha256=" + hex.EncodeToString(mac.Sum(nil))
if got.sig != expected {
t.Errorf("signature mismatch: expected %s, got %s", expected, got.sig)
}
w.WriteHeader(http.StatusOK)
got.received <- struct{}{}
}))
defer server.Close()
repo := newFakeAuditRepo()
svc := NewAuditService(repo)
svc.SetWebhook(server.URL, "s3cret")
svc.Record(AuditEntry{
Username: "alice",
Category: "auth",
Action: "login_success",
ClientIP: "10.0.0.1",
Detail: "admin login",
})
// 等待异步写入 + webhook
select {
case <-repo.created:
case <-time.After(time.Second):
t.Fatal("audit log not written within 1s")
}
select {
case <-got.received:
case <-time.After(time.Second):
t.Fatal("webhook not invoked within 1s")
}
if hits.Load() != 1 {
t.Fatalf("expected 1 webhook hit, got %d", hits.Load())
}
if got.payload["eventType"] != "audit.log" {
t.Errorf("eventType wrong: %v", got.payload["eventType"])
}
actor, ok := got.payload["actor"].(map[string]any)
if !ok || actor["username"] != "alice" {
t.Errorf("actor.username mismatch: %v", got.payload["actor"])
}
if got.payload["action"] != "login_success" {
t.Errorf("action mismatch: %v", got.payload["action"])
}
}
func TestAuditService_WebhookDisabledWhenURLEmpty(t *testing.T) {
repo := newFakeAuditRepo()
svc := NewAuditService(repo)
// 不调用 SetWebhook应该不发送任何请求
svc.Record(AuditEntry{Username: "bob", Action: "logout"})
select {
case <-repo.created:
case <-time.After(time.Second):
t.Fatal("audit log not written within 1s")
}
// 给 webhook 一些时间(即便它不会被调用)
time.Sleep(100 * time.Millisecond)
// 无显式断言:能不 panic 即算通过
}

View File

@@ -17,6 +17,7 @@ import (
"backupx/server/internal/apperror"
"backupx/server/internal/backup"
backupretention "backupx/server/internal/backup/retention"
"backupx/server/internal/metrics"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage"
@@ -92,8 +93,14 @@ type BackupExecutionService struct {
// nodeSemaphores 节点级并发限制(按 NodeID 映射)。
// 没命中的 NodeID 走全局 semaphore节点配置 MaxConcurrent>0 时按该节点独立排队。
nodeSemaphores sync.Map
retries int // rclone 底层重试次数
bandwidthLimit string // rclone 带宽限制
retries int // rclone 底层重试次数
bandwidthLimit string // rclone 带宽限制(全局默认,节点配置可覆盖)
metrics *metrics.Metrics
}
// SetMetrics 注入 Prometheus 采集器。nil 时所有埋点退化为 no-op。
func (s *BackupExecutionService) SetMetrics(m *metrics.Metrics) {
s.metrics = m
}
// ReplicationTrigger 抽象备份成功后的副本派发实现者ReplicationService
@@ -407,6 +414,22 @@ func (s *BackupExecutionService) shouldNotify(ctx context.Context, task *model.B
return true
}
// effectiveBandwidth 返回当前上下文应用的带宽限速字符串。
// 优先级Node.BandwidthLimit非空 > 全局 s.bandwidthLimit。
func (s *BackupExecutionService) effectiveBandwidth(ctx context.Context, nodeID uint) string {
if nodeID == 0 || s.nodeRepo == nil {
return s.bandwidthLimit
}
node, err := s.nodeRepo.FindByID(ctx, nodeID)
if err != nil || node == nil {
return s.bandwidthLimit
}
if strings.TrimSpace(node.BandwidthLimit) != "" {
return node.BandwidthLimit
}
return s.bandwidthLimit
}
// acquireNodeSemaphore 返回节点级并发通道。懒初始化:第一次为某节点排队时创建。
// 如果节点未配置 MaxConcurrent 或 nodeRepo 未注入,返回 nil调用方走全局 semaphore
// 节点容量仅在首次创建时采用,后续变更需重启服务才生效(避免运行时 resize 通道的复杂度)。
@@ -456,6 +479,10 @@ func (s *BackupExecutionService) executeTask(ctx context.Context, task *model.Ba
s.semaphore <- struct{}{}
defer func() { <-s.semaphore }()
// Prometheus: running gauge + 完成时 observe 耗时/字节/状态
s.metrics.IncTaskRunning()
defer s.metrics.DecTaskRunning()
logger := backup.NewExecutionLogger(recordID, s.logHub)
status := "failed"
errMessage := ""
@@ -468,6 +495,8 @@ func (s *BackupExecutionService) executeTask(ctx context.Context, task *model.Ba
if finalizeErr := s.finalizeRecord(ctx, task, recordID, startedAt, status, errMessage, logger.String(), fileName, fileSize, checksum, storagePath); finalizeErr != nil {
logger.Errorf("写回备份记录失败:%v", finalizeErr)
}
// 采集任务执行结果到 Prometheus耗时 + 产出字节 + 状态计数)
s.metrics.ObserveTaskRun(task.Type, status, time.Since(startedAt).Seconds(), fileSize)
// 写入多目标上传结果
if len(uploadResults) > 0 {
if resultsJSON, marshalErr := json.Marshal(uploadResults); marshalErr == nil {
@@ -559,7 +588,8 @@ func (s *BackupExecutionService) executeTask(ctx context.Context, task *model.Ba
if findErr == nil && target != nil {
targetName = target.Name
}
provider, resolveErr := s.resolveProvider(ctx, targetID)
// 节点级带宽覆盖:若 task 绑定节点并配置了 BandwidthLimit覆盖全局限速
provider, resolveErr := s.resolveProviderForNode(ctx, targetID, task.NodeID)
if resolveErr != nil {
uploadResults[index] = StorageUploadResultItem{StorageTargetID: targetID, StorageTargetName: targetName, Status: "failed", Error: resolveErr.Error()}
logger.Warnf("存储目标 %s 创建客户端失败:%v", targetName, resolveErr)
@@ -742,10 +772,17 @@ func (s *BackupExecutionService) finalizeRecord(ctx context.Context, task *model
}
func (s *BackupExecutionService) resolveProvider(ctx context.Context, targetID uint) (storage.StorageProvider, error) {
// 注入 rclone 传输配置(重试、带宽限制)
return s.resolveProviderForNode(ctx, targetID, 0)
}
// resolveProviderForNode 根据节点的 BandwidthLimit 覆盖全局默认。
// nodeID=0 或节点未配置时退化为全局默认。
// 仅在 Master 本地执行生效Agent 会收到自身 Node 配置,并在独立 runtime 中应用。
func (s *BackupExecutionService) resolveProviderForNode(ctx context.Context, targetID uint, nodeID uint) (storage.StorageProvider, error) {
// 注入 rclone 传输配置(重试、节点级带宽覆盖全局)
ctx = rclone.ConfiguredContext(ctx, rclone.TransferConfig{
LowLevelRetries: s.retries,
BandwidthLimit: s.bandwidthLimit,
BandwidthLimit: s.effectiveBandwidth(ctx, nodeID),
})
target, err := s.targets.FindByID(ctx, targetID)
if err != nil {

View File

@@ -10,6 +10,7 @@ import (
"time"
"backupx/server/internal/apperror"
"backupx/server/internal/metrics"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage"
@@ -37,6 +38,12 @@ type ReplicationService struct {
semaphore chan struct{}
async func(func())
now func() time.Time
metrics *metrics.Metrics
}
// SetMetrics 注入 Prometheus 采集器。
func (s *ReplicationService) SetMetrics(m *metrics.Metrics) {
s.metrics = m
}
func NewReplicationService(
@@ -193,6 +200,7 @@ func (s *ReplicationService) executeReplication(ctx context.Context, repID uint)
rep.DurationSeconds = int(completedAt.Sub(rep.StartedAt).Seconds())
rep.CompletedAt = &completedAt
_ = s.replications.Update(ctx, rep)
s.metrics.ObserveReplication(status)
if status == model.ReplicationStatusFailed {
s.dispatchFailed(ctx, rep, errMessage)
}

View File

@@ -11,6 +11,7 @@ import (
"backupx/server/internal/apperror"
"backupx/server/internal/backup"
"backupx/server/internal/metrics"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage"
@@ -41,6 +42,12 @@ type RestoreService struct {
semaphore chan struct{}
async func(func())
now func() time.Time
metrics *metrics.Metrics
}
// SetMetrics 注入 Prometheus 采集器。
func (s *RestoreService) SetMetrics(m *metrics.Metrics) {
s.metrics = m
}
// NewRestoreService 构造恢复服务。maxConcurrent 控制本地并发恢复数。
@@ -432,6 +439,7 @@ func (s *RestoreService) finalizeWithLog(ctx context.Context, restoreID uint, st
}
record.DurationSeconds = int(completedAt.Sub(record.StartedAt).Seconds())
record.CompletedAt = &completedAt
s.metrics.ObserveRestore(status)
return s.restores.Update(ctx, record)
}

View File

@@ -8,21 +8,55 @@ import (
"backupx/server/internal/repository"
)
// AuditWebhookConfigurer 抽象审计 webhook 配置接口,由 AuditService 实现。
// 用接口解耦避免 settings_service 直接依赖 AuditService 具体类型。
type AuditWebhookConfigurer interface {
SetWebhook(url, secret string)
}
type SettingsService struct {
configs repository.SystemConfigRepository
configs repository.SystemConfigRepository
auditWebhook AuditWebhookConfigurer
}
func NewSettingsService(configs repository.SystemConfigRepository) *SettingsService {
return &SettingsService{configs: configs}
}
// settingsKeys lists all user-editable setting keys.
// SetAuditWebhookConfigurer 注入 audit webhook 配置接收方。
// 启动时立即用当前 DB 中的设置调用一次,后续每次 Update 变更 webhook key 时同步推送。
func (s *SettingsService) SetAuditWebhookConfigurer(ctx context.Context, configurer AuditWebhookConfigurer) {
if s == nil || configurer == nil {
return
}
s.auditWebhook = configurer
// 启动时同步一次,保证重启后配置不丢失
all, err := s.GetAll(ctx)
if err == nil {
configurer.SetWebhook(all[SettingKeyAuditWebhookURL], all[SettingKeyAuditWebhookSecret])
}
}
// 可被前端写入的系统设置键。新增键必须同步加入此清单,
// 否则 Update 会忽略(安全原则:显式 allow-list
const (
SettingKeySiteName = "site_name"
SettingKeyLanguage = "language"
SettingKeyTimezone = "timezone"
SettingKeyBackupNotificationEnabled = "backup_notification_enabled"
SettingKeyBandwidthLimit = "bandwidth_limit"
SettingKeyAuditWebhookURL = "audit_webhook_url"
SettingKeyAuditWebhookSecret = "audit_webhook_secret"
)
var settingsKeys = []string{
"site_name",
"language",
"timezone",
"backup_notification_enabled",
"bandwidth_limit",
SettingKeySiteName,
SettingKeyLanguage,
SettingKeyTimezone,
SettingKeyBackupNotificationEnabled,
SettingKeyBandwidthLimit,
SettingKeyAuditWebhookURL,
SettingKeyAuditWebhookSecret,
}
func (s *SettingsService) GetAll(ctx context.Context) (map[string]string, error) {
@@ -42,6 +76,7 @@ func (s *SettingsService) Update(ctx context.Context, settings map[string]string
for _, key := range settingsKeys {
allowed[key] = true
}
auditWebhookTouched := false
for key, value := range settings {
if !allowed[key] {
continue
@@ -50,6 +85,14 @@ func (s *SettingsService) Update(ctx context.Context, settings map[string]string
if err := s.configs.Upsert(ctx, item); err != nil {
return nil, apperror.Internal("SETTINGS_UPDATE_FAILED", "无法更新系统设置", err)
}
if key == SettingKeyAuditWebhookURL || key == SettingKeyAuditWebhookSecret {
auditWebhookTouched = true
}
}
// audit webhook 配置变化:立即同步到 AuditService避免重启才生效
if auditWebhookTouched && s.auditWebhook != nil {
all, _ := s.GetAll(ctx)
s.auditWebhook.SetWebhook(all[SettingKeyAuditWebhookURL], all[SettingKeyAuditWebhookSecret])
}
return s.GetAll(ctx)
}

View File

@@ -10,6 +10,7 @@ import (
"backupx/server/internal/apperror"
"backupx/server/internal/backup"
"backupx/server/internal/metrics"
"backupx/server/internal/model"
"backupx/server/internal/repository"
"backupx/server/internal/storage"
@@ -42,6 +43,12 @@ type VerificationService struct {
semaphore chan struct{}
async func(func())
now func() time.Time
metrics *metrics.Metrics
}
// SetMetrics 注入 Prometheus 采集器。
func (s *VerificationService) SetMetrics(m *metrics.Metrics) {
s.metrics = m
}
// VerificationNotifier 给用户推送验证完成/失败通知。
@@ -413,6 +420,7 @@ func (s *VerificationService) finalize(ctx context.Context, verID uint, status,
}
record.DurationSeconds = int(completedAt.Sub(record.StartedAt).Seconds())
record.CompletedAt = &completedAt
s.metrics.ObserveVerify(status)
return s.verifications.Update(ctx, record)
}