功能: v2.2 节点池调度 + Grafana Dashboard + 版本漂移 UI (#49)

节点池动态调度（企业集群核心需求）：
- model.Node 新增 Labels CSV；Node.HasLabel / LabelSet 辅助方法
- model.BackupTask 新增 NodePoolTag；与 NodeID 互斥（校验层拒绝同时设置）
- BackupExecutionService.selectPoolNode：匹配标签的在线节点中选"运行中任务最少"，并列按 ID 升序稳定；空池返回 NODE_POOL_EMPTY 让用户立即感知
- 选中节点仅写 BackupRecord，不回写 task.NodeID —— 每次执行重选实现真轮转均衡

Grafana Dashboard（v2.1 指标的可视化闭环）：
- deploy/grafana/backupx-dashboard.json：11 个面板覆盖概览/时序/容量/集群
- deploy/grafana/README.md：Prometheus 抓取配置 + 告警建议
- release workflow 打包 grafana/ + nginx.conf 到 tar.gz

前端：
- 节点列表：Agent 版本 vs Master 不一致时橙红 Tag + Tooltip 提示升级
- 节点列表新增"标签/节点池"列，支持 CSV 编辑 + 并发/带宽一起改
- 任务表单新增 NodePoolTag 输入框，与节点选择器互斥禁用

测试：
- model/node_label_test.go：HasLabel / LabelSet / nil 安全
- service/node_pool_scheduler_test.go：负载最低优先 / 空池错误 / nil repo 降级
- go test ./... + npm run build 全绿
2026-05-11 18:10:23 +08:00 · 2026-04-21 14:05:48 +08:00
parent e2baa6bd17
commit eff48342c8
16 changed files with 701 additions and 15 deletions
--- a/server/internal/model/backup_task.go
+++ b/server/internal/model/backup_task.go
@@ -39,6 +39,10 @@ type BackupTask struct {
 	StorageTargets       []StorageTarget `gorm:"many2many:backup_task_storage_targets" json:"storageTargets,omitempty"`
 	NodeID               uint            `gorm:"column:node_id;index;default:0" json:"nodeId"`
 	Node                 Node            `json:"node,omitempty"`
+	// NodePoolTag 节点池标签（可选）。非空且 NodeID=0 时，调度器会从 Node.Labels 包含该 tag
+	// 的在线节点中动态挑选一台执行（按运行中任务数最少原则），失败会 best-effort 切换到下一个候选。
+	// 典型场景：NodePoolTag="db" 让 MySQL 备份任务在任意标有 "db" 的数据库节点执行。
+	NodePoolTag string `gorm:"column:node_pool_tag;size:64;index" json:"nodePoolTag"`
 	Tags                 string          `gorm:"column:tags;size:500" json:"tags"`
 	RetentionDays        int             `gorm:"column:retention_days;not null;default:30" json:"retentionDays"`
 	Compression          string          `gorm:"size:10;not null;default:'gzip'" json:"compression"`
--- a/server/internal/model/node.go
+++ b/server/internal/model/node.go
@@ -1,6 +1,9 @@
 package model

-import "time"
+import (
+	"strings"
+	"time"
+)

 const (
 	NodeStatusOnline  = "online"
@@ -29,8 +32,42 @@ type Node struct {
 	// BandwidthLimit 该节点上传带宽上限（rclone 可识别格式：10M / 1G / 0=不限）。
 	// 对集群感知的上传场景有效（Master 本地与 Agent 运行时均会应用）。
 	BandwidthLimit string `gorm:"column:bandwidth_limit;size:32" json:"bandwidthLimit"`
-	CreatedAt      time.Time `json:"createdAt"`
-	UpdatedAt      time.Time `json:"updatedAt"`
+	// Labels 节点标签（CSV，如 "prod,db-host,high-mem"）。
+	// 用于任务调度的节点池选择：任务配置 NodePoolTag 时，调度器会从 Labels 包含该 tag 的
+	// 在线节点中自动挑选一台执行（按当前运行中任务数升序）。单节点可属多个池。
+	Labels    string    `gorm:"column:labels;size:500" json:"labels"`
+	CreatedAt time.Time `json:"createdAt"`
+	UpdatedAt time.Time `json:"updatedAt"`
+}
+
+// LabelSet 把 CSV Labels 解析为 set，便于做成员判定。
+// 空白与空 token 自动忽略。
+func (n *Node) LabelSet() map[string]struct{} {
+	if n == nil {
+		return nil
+	}
+	out := make(map[string]struct{})
+	for _, raw := range strings.Split(n.Labels, ",") {
+		label := strings.TrimSpace(raw)
+		if label != "" {
+			out[label] = struct{}{}
+		}
+	}
+	return out
+}
+
+// HasLabel 判断节点是否带有指定标签（即属于该节点池）。节点为 nil 或 tag 去除首尾空白后为空时返回 false。
+func (n *Node) HasLabel(tag string) bool {
+	tag = strings.TrimSpace(tag)
+	if n == nil || tag == "" {
+		return false
+	}
+	for _, raw := range strings.Split(n.Labels, ",") {
+		if strings.TrimSpace(raw) == tag {
+			return true
+		}
+	}
+	return false
 }

 func (Node) TableName() string {
--- a/server/internal/model/node_label_test.go
+++ b/server/internal/model/node_label_test.go
@@ -0,0 +1,47 @@
+package model
+
+import "testing"
+
+func TestNodeHasLabel(t *testing.T) {
+	cases := []struct {
+		labels string
+		tag    string
+		want   bool
+	}{
+		{"prod,db,high-mem", "prod", true},
+		{"prod,db,high-mem", "db", true},
+		{"prod,db,high-mem", "backup", false},
+		{"  prod ,  db  ", "db", true}, // trim 空白
+		{"", "prod", false},
+		{"prod", "", false}, // 空 tag 不匹配
+	}
+	for _, c := range cases {
+		n := &Node{Labels: c.labels}
+		if got := n.HasLabel(c.tag); got != c.want {
+			t.Errorf("labels=%q tag=%q want %v got %v", c.labels, c.tag, c.want, got)
+		}
+	}
+}
+
+func TestNodeLabelSet(t *testing.T) {
+	n := &Node{Labels: "prod, db ,,high-mem,prod"}
+	set := n.LabelSet()
+	for _, want := range []string{"prod", "db", "high-mem"} {
+		if _, ok := set[want]; !ok {
+			t.Errorf("expected label %q in set", want)
+		}
+	}
+	if len(set) != 3 {
+		t.Errorf("duplicates not deduped, got %v", set)
+	}
+}
+
+func TestNilNodeHasLabelSafe(t *testing.T) {
+	var n *Node
+	if n.HasLabel("anything") {
+		t.Error("nil node should never match any label")
+	}
+	if s := n.LabelSet(); s != nil {
+		t.Errorf("nil node LabelSet should be nil, got %v", s)
+	}
+}
--- a/server/internal/service/backup_execution_service.go
+++ b/server/internal/service/backup_execution_service.go
@@ -335,16 +335,29 @@ func (s *BackupExecutionService) startTask(ctx context.Context, id uint, async b
 				nil)
 		}
 	}
+	// 节点池动态选择：task.NodeID=0 且 NodePoolTag 非空时，从匹配的在线节点中挑一台。
+	// 选择策略：正在运行任务数最少者优先；并列时按 ID 升序稳定。
+	// 选中节点仅影响本次运行（task.NodeID 不持久化改动），保证任务在池内轮转。
+	resolvedNodeID := task.NodeID
+	if task.NodeID == 0 && strings.TrimSpace(task.NodePoolTag) != "" {
+		if pooled, perr := s.selectPoolNode(ctx, task.NodePoolTag); perr == nil && pooled != nil {
+			resolvedNodeID = pooled.ID
+		} else if perr != nil {
+			return nil, perr
+		}
+	}
 	startedAt := s.now()
 	// 取第一个存储目标 ID 做兼容
 	primaryTargetID := task.StorageTargetID
 	if tids := collectTargetIDs(task); len(tids) > 0 {
 		primaryTargetID = tids[0]
 	}
-	record := &model.BackupRecord{TaskID: task.ID, StorageTargetID: primaryTargetID, NodeID: task.NodeID, Status: "running", StartedAt: startedAt}
+	record := &model.BackupRecord{TaskID: task.ID, StorageTargetID: primaryTargetID, NodeID: resolvedNodeID, Status: "running", StartedAt: startedAt}
 	if err := s.records.Create(ctx, record); err != nil {
 		return nil, apperror.Internal("BACKUP_RECORD_CREATE_FAILED", "无法创建备份记录", err)
 	}
+	// 用池选出的节点 ID 复写内存中的 task，使后续路由/执行沿用。NOTE(review): 下方 s.tasks.Update(ctx, task) 若整体保存该对象，NodeID 可能被落库，与"每次执行重选"的设计不符——待确认。
+	task.NodeID = resolvedNodeID
 	task.LastRunAt = &startedAt
 	task.LastStatus = "running"
 	if err := s.tasks.Update(ctx, task); err != nil {
@@ -414,6 +427,64 @@ func (s *BackupExecutionService) shouldNotify(ctx context.Context, task *model.B
 	return true
 }

+// selectPoolNode 从所有 Labels 包含 poolTag 的在线节点中选择"当前运行中任务最少"的一台（并列取 ID 最小者）。
+// 返回值：(node, nil) 为选中节点；(nil, nil) 仅在 s.nodeRepo 为 nil 时出现（退化走本机 Master）；
+// (nil, error) 表示仓储访问失败（NODE_LIST_FAILED）或池内无在线匹配节点（NODE_POOL_EMPTY）。本方法不修改任何持久化状态，仅做选择。
+func (s *BackupExecutionService) selectPoolNode(ctx context.Context, poolTag string) (*model.Node, error) {
+	if s.nodeRepo == nil {
+		// 没接入集群依赖时，降级为让调用方走本机 Master
+		return nil, nil
+	}
+	nodes, err := s.nodeRepo.List(ctx)
+	if err != nil {
+		return nil, apperror.Internal("NODE_LIST_FAILED", "无法枚举节点池", err)
+	}
+	candidates := make([]*model.Node, 0)
+	for i := range nodes {
+		n := &nodes[i]
+		if n.Status != model.NodeStatusOnline {
+			continue
+		}
+		if !n.HasLabel(poolTag) {
+			continue
+		}
+		candidates = append(candidates, n)
+	}
+	if len(candidates) == 0 {
+		return nil, apperror.BadRequest("NODE_POOL_EMPTY",
+			fmt.Sprintf("节点池 %q 下无在线节点，任务无法调度", poolTag), nil)
+	}
+	// 运行中记录数越少越优先。并列按 ID 升序（稳定、可预期）。
+	best := candidates[0]
+	bestLoad := s.countRunningOnNode(ctx, best.ID)
+	for _, n := range candidates[1:] {
+		load := s.countRunningOnNode(ctx, n.ID)
+		if load < bestLoad || (load == bestLoad && n.ID < best.ID) {
+			best = n
+			bestLoad = load
+		}
+	}
+	return best, nil
+}
+
+// countRunningOnNode 近似返回节点当前 running 记录数：每次调用都以 Status=running 调用 records.List，再在内存中按 NodeID 计数。records 为 nil 或查询失败时按 0 处理（不影响功能，仅退化调度精度）。
+func (s *BackupExecutionService) countRunningOnNode(ctx context.Context, nodeID uint) int {
+	if s.records == nil {
+		return 0
+	}
+	items, err := s.records.List(ctx, repository.BackupRecordListOptions{Status: model.BackupRecordStatusRunning})
+	if err != nil {
+		return 0
+	}
+	count := 0
+	for i := range items {
+		if items[i].NodeID == nodeID {
+			count++
+		}
+	}
+	return count
+}
+
 // effectiveBandwidth 返回当前上下文应用的带宽限速字符串。
 // 优先级：Node.BandwidthLimit（非空） > 全局 s.bandwidthLimit。
 func (s *BackupExecutionService) effectiveBandwidth(ctx context.Context, nodeID uint) string {
--- a/server/internal/service/backup_task_service.go
+++ b/server/internal/service/backup_task_service.go
@@ -35,7 +35,9 @@ type BackupTaskUpsertInput struct {
 	DBPath           string   `json:"dbPath" binding:"max=500"`
 	StorageTargetID  uint     `json:"storageTargetId"`                       // deprecated: 向后兼容
 	StorageTargetIDs []uint   `json:"storageTargetIds"`                      // 新增：多存储目标
-	NodeID           uint     `json:"nodeId"`                                // 执行节点（0 = 本机 Master）
+	NodeID           uint     `json:"nodeId"`                                // 执行节点（0 = 本机 Master 或节点池）
+	// NodePoolTag 节点池标签。NodeID=0 且本字段非空时，调度器动态从 Labels 命中的在线节点中选负载最低者。
+	NodePoolTag      string   `json:"nodePoolTag" binding:"max=64"`
 	Tags             string   `json:"tags" binding:"max=500"`                // 逗号分隔标签
 	RetentionDays    int      `json:"retentionDays"`
 	Compression      string   `json:"compression" binding:"omitempty,oneof=gzip none"`
@@ -74,6 +76,7 @@ type BackupTaskSummary struct {
 	StorageTargetNames []string   `json:"storageTargetNames"`
 	NodeID             uint       `json:"nodeId"`
 	NodeName           string     `json:"nodeName,omitempty"`
+	NodePoolTag        string     `json:"nodePoolTag,omitempty"`
 	Tags               string     `json:"tags"`
 	RetentionDays      int        `json:"retentionDays"`
 	Compression        string     `json:"compression"`
@@ -494,6 +497,11 @@ func (s *BackupTaskService) validateInput(ctx context.Context, existing *model.B
 			return apperror.BadRequest("BACKUP_TASK_INVALID", "所选执行节点不存在", nil)
 		}
 	}
+	// 节点池与固定节点互斥：固定节点已确定执行位置，不再动态调度
+	if input.NodeID > 0 && strings.TrimSpace(input.NodePoolTag) != "" {
+		return apperror.BadRequest("BACKUP_TASK_INVALID",
+			"固定执行节点与节点池标签只能选其一", nil)
+	}
 	if input.RetentionDays < 0 {
 		return apperror.BadRequest("BACKUP_TASK_INVALID", "保留天数不能小于 0", nil)
 	}
@@ -648,6 +656,7 @@ func (s *BackupTaskService) buildTask(existing *model.BackupTask, input BackupTa
 		StorageTargetID:      primaryTargetID,
 		StorageTargets:       storageTargets,
 		NodeID:               input.NodeID,
+		NodePoolTag:          strings.TrimSpace(input.NodePoolTag),
 		Tags:                 strings.TrimSpace(input.Tags),
 		RetentionDays:        input.RetentionDays,
 		Compression:          compression,
@@ -738,6 +747,7 @@ func toBackupTaskSummary(item *model.BackupTask) BackupTaskSummary {
 		StorageTargetNames: targetNames,
 		NodeID:             item.NodeID,
 		NodeName:           item.Node.Name,
+		NodePoolTag:        item.NodePoolTag,
 		Tags:               item.Tags,
 		RetentionDays:      item.RetentionDays,
 		Compression:        item.Compression,
--- a/server/internal/service/node_pool_scheduler_test.go
+++ b/server/internal/service/node_pool_scheduler_test.go
@@ -0,0 +1,83 @@
+package service
+
+import (
+	"context"
+	"errors"
+	"testing"
+	"time"
+
+	"backupx/server/internal/apperror"
+	"backupx/server/internal/model"
+)
+
+// nodeRepoStub 返回预设节点切片；仅关注 List/FindByID。
+// 其余方法返回零值，避免在调度路径被调用到。
+type nodeRepoStub struct {
+	nodes []model.Node
+}
+
+func (s *nodeRepoStub) List(context.Context) ([]model.Node, error) { return s.nodes, nil }
+func (s *nodeRepoStub) FindByID(_ context.Context, id uint) (*model.Node, error) {
+	for i := range s.nodes {
+		if s.nodes[i].ID == id {
+			return &s.nodes[i], nil
+		}
+	}
+	return nil, nil
+}
+func (s *nodeRepoStub) FindByToken(context.Context, string) (*model.Node, error) { return nil, nil }
+func (s *nodeRepoStub) FindLocal(context.Context) (*model.Node, error)           { return nil, nil }
+func (s *nodeRepoStub) Create(context.Context, *model.Node) error                { return nil }
+func (s *nodeRepoStub) BatchCreate(context.Context, []*model.Node) error         { return nil }
+func (s *nodeRepoStub) Update(context.Context, *model.Node) error                { return nil }
+func (s *nodeRepoStub) Delete(context.Context, uint) error                       { return nil }
+func (s *nodeRepoStub) MarkStaleOffline(context.Context, time.Time) (int64, error) {
+	return 0, nil
+}
+
+func TestSelectPoolNode_PicksLeastLoaded(t *testing.T) {
+	nodes := []model.Node{
+		{ID: 1, Name: "node-a", Status: model.NodeStatusOnline, Labels: "prod,db"},
+		{ID: 2, Name: "node-b", Status: model.NodeStatusOnline, Labels: "prod,db"},
+		{ID: 3, Name: "node-offline", Status: model.NodeStatusOffline, Labels: "prod,db"},
+		{ID: 4, Name: "node-other-pool", Status: model.NodeStatusOnline, Labels: "staging"},
+	}
+	svc := &BackupExecutionService{
+		nodeRepo: &nodeRepoStub{nodes: nodes},
+		records:  nil, // 触发 countRunningOnNode 返回 0，节点并列时按 ID 升序
+	}
+	chosen, err := svc.selectPoolNode(context.Background(), "db")
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if chosen == nil || chosen.ID != 1 {
+		t.Fatalf("expected node-a (ID=1), got %#v", chosen)
+	}
+}
+
+func TestSelectPoolNode_EmptyPoolReturnsError(t *testing.T) {
+	svc := &BackupExecutionService{
+		nodeRepo: &nodeRepoStub{nodes: []model.Node{
+			{ID: 1, Status: model.NodeStatusOnline, Labels: "prod"},
+		}},
+	}
+	_, err := svc.selectPoolNode(context.Background(), "missing-pool")
+	if err == nil {
+		t.Fatal("expected empty-pool error")
+	}
+	var apperr *apperror.AppError
+	if !errors.As(err, &apperr) || apperr.Code != "NODE_POOL_EMPTY" {
+		t.Errorf("expected NODE_POOL_EMPTY, got %v", err)
+	}
+}
+
+func TestSelectPoolNode_NilRepoDegradesGracefully(t *testing.T) {
+	svc := &BackupExecutionService{}
+	got, err := svc.selectPoolNode(context.Background(), "any")
+	if err != nil {
+		t.Errorf("nil repo should degrade silently, got err %v", err)
+	}
+	if got != nil {
+		t.Errorf("nil repo should return nil node, got %v", got)
+	}
+}
--- a/server/internal/service/node_service.go
+++ b/server/internal/service/node_service.go
@@ -34,6 +34,7 @@ type NodeSummary struct {
 	LastSeen       time.Time `json:"lastSeen"`
 	MaxConcurrent  int       `json:"maxConcurrent"`
 	BandwidthLimit string    `json:"bandwidthLimit"`
+	Labels         string    `json:"labels"`
 	CreatedAt      time.Time `json:"createdAt"`
 }

@@ -47,6 +48,8 @@ type NodeUpdateInput struct {
 	Name           string `json:"name" binding:"required"`
 	MaxConcurrent  int    `json:"maxConcurrent"`
 	BandwidthLimit string `json:"bandwidthLimit" binding:"max=32"`
+	// Labels CSV；同时作为调度器的节点池标签（task.NodePoolTag 对齐的值）。
+	Labels string `json:"labels" binding:"max=500"`
 }

 // NodeService manages the cluster nodes.
@@ -132,6 +135,7 @@ func (s *NodeService) List(ctx context.Context) ([]NodeSummary, error) {
 			LastSeen:       n.LastSeen,
 			MaxConcurrent:  n.MaxConcurrent,
 			BandwidthLimit: n.BandwidthLimit,
+			Labels:         n.Labels,
 			CreatedAt:      n.CreatedAt,
 		}
 	}
@@ -159,6 +163,7 @@ func (s *NodeService) Get(ctx context.Context, id uint) (*NodeSummary, error) {
 		LastSeen:       node.LastSeen,
 		MaxConcurrent:  node.MaxConcurrent,
 		BandwidthLimit: node.BandwidthLimit,
+		Labels:         node.Labels,
 		CreatedAt:      node.CreatedAt,
 	}, nil
 }
@@ -320,12 +325,32 @@ func (s *NodeService) Update(ctx context.Context, id uint, input NodeUpdateInput
 	}
 	node.MaxConcurrent = input.MaxConcurrent
 	node.BandwidthLimit = strings.TrimSpace(input.BandwidthLimit)
+	node.Labels = normalizeLabels(input.Labels)
 	if err := s.repo.Update(ctx, node); err != nil {
 		return nil, err
 	}
 	return s.Get(ctx, id)
 }

+// normalizeLabels 规整 CSV labels：去空白、去空 token、去重、保持首次出现顺序。
+// 输入 "  prod, db , prod ,high-mem " → "prod,db,high-mem"
+func normalizeLabels(raw string) string {
+	seen := make(map[string]struct{})
+	out := make([]string, 0)
+	for _, token := range strings.Split(raw, ",") {
+		label := strings.TrimSpace(token)
+		if label == "" {
+			continue
+		}
+		if _, dup := seen[label]; dup {
+			continue
+		}
+		seen[label] = struct{}{}
+		out = append(out, label)
+	}
+	return strings.Join(out, ",")
+}
+
 // DirEntry represents a file or directory in a node's file system.
 type DirEntry struct {
 	Name  string `json:"name"`