Files
BackupX/server/internal/service/node_service.go
Wu Qing 757b0fa5ed 功能: 修复并实现多节点集群部署 (#38)
基础修复:
- 新增节点离线检测:每 15s 扫描,超 45s 未心跳的远程节点自动置离线
- 节点删除前检查关联任务,避免孤立备份任务
- BackupTaskRepository 新增 CountByNodeID/ListByNodeID

Master 端 Agent 协议:
- 新增 AgentCommand 模型与命令队列仓储(pending/dispatched/succeeded/failed/timeout)
- 新增 AgentService:任务下发、命令轮询、结果回收、超时扫描
- 新增专用 Agent HTTP API(X-Agent-Token 认证):
  /api/agent/heartbeat
  /api/agent/commands/poll
  /api/agent/commands/:id/result
  /api/agent/tasks/:id
  /api/agent/records/:id
- BackupExecutionService 支持 node 路由:task.NodeID 指向远程节点时自动入队派发

Agent CLI(backupx agent 子命令):
- 配置:YAML 文件 / 环境变量 / CLI 参数,优先级 CLI > 文件 > 环境
- 心跳循环 + 命令轮询循环 + 优雅退出
- 本地复用 BackupRunner 与 storage registry 执行备份并直接上传
- 支持 run_task 和 list_dir 两种命令

远程目录浏览:
- NodeService 支持通过 Agent RPC 列出远程节点目录(15s 超时)

前端:
- NodesPage 添加节点后展示 Agent 启动命令和环境变量配置

文档:
- README 中英文重写"多节点集群"章节,含架构图、步骤、限制、CLI 参考
2026-04-17 12:29:08 +08:00

383 lines
11 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
package service
import (
"context"
"crypto/rand"
"encoding/hex"
"encoding/json"
"fmt"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"sort"
"strings"
"time"
"backupx/server/internal/apperror"
"backupx/server/internal/model"
"backupx/server/internal/repository"
)
// NodeSummary is the API response for node listings.
type NodeSummary struct {
ID uint `json:"id"`
Name string `json:"name"`
Hostname string `json:"hostname"`
IPAddress string `json:"ipAddress"`
Status string `json:"status"`
IsLocal bool `json:"isLocal"`
OS string `json:"os"`
Arch string `json:"arch"`
AgentVersion string `json:"agentVersion"`
LastSeen time.Time `json:"lastSeen"`
CreatedAt time.Time `json:"createdAt"`
}
// NodeCreateInput is the input for creating a new remote node.
type NodeCreateInput struct {
Name string `json:"name" binding:"required"`
}
// NodeUpdateInput 是编辑节点的输入。
type NodeUpdateInput struct {
Name string `json:"name" binding:"required"`
}
// NodeService manages the cluster nodes.
type NodeService struct {
repo repository.NodeRepository
taskRepo repository.BackupTaskRepository
agentRPC NodeAgentRPC
version string
}
// NodeAgentRPC 抽象 Agent 远程调用能力(避免 service 内循环依赖)。
// 由 AgentService 实现;当 Agent 未启用时可不注入,远程目录浏览返回提示。
type NodeAgentRPC interface {
EnqueueCommand(ctx context.Context, nodeID uint, cmdType string, payload any) (uint, error)
WaitForCommandResult(ctx context.Context, cmdID uint, timeout time.Duration) (*model.AgentCommand, error)
}
func NewNodeService(repo repository.NodeRepository, version string) *NodeService {
return &NodeService{repo: repo, version: version}
}
// SetTaskRepository 注入任务仓储以支持删除前引用检查。可选注入,便于测试。
func (s *NodeService) SetTaskRepository(taskRepo repository.BackupTaskRepository) {
s.taskRepo = taskRepo
}
// SetAgentRPC 注入 Agent RPC 能力,启用远程目录浏览。
func (s *NodeService) SetAgentRPC(rpc NodeAgentRPC) {
s.agentRPC = rpc
}
// EnsureLocalNode creates the default "local" node if it does not exist.
func (s *NodeService) EnsureLocalNode(ctx context.Context) error {
existing, err := s.repo.FindLocal(ctx)
if err != nil {
return err
}
if existing != nil {
existing.Status = model.NodeStatusOnline
existing.LastSeen = time.Now().UTC()
hostname, _ := os.Hostname()
existing.Hostname = hostname
existing.IPAddress = detectLocalIP()
existing.AgentVer = s.version
existing.OS = runtime.GOOS
existing.Arch = runtime.GOARCH
return s.repo.Update(ctx, existing)
}
hostname, _ := os.Hostname()
token, _ := generateToken()
node := &model.Node{
Name: "本机 (Local)",
Hostname: hostname,
IPAddress: detectLocalIP(),
Token: token,
Status: model.NodeStatusOnline,
IsLocal: true,
OS: runtime.GOOS,
Arch: runtime.GOARCH,
AgentVer: s.version,
LastSeen: time.Now().UTC(),
}
return s.repo.Create(ctx, node)
}
func (s *NodeService) List(ctx context.Context) ([]NodeSummary, error) {
nodes, err := s.repo.List(ctx)
if err != nil {
return nil, err
}
result := make([]NodeSummary, len(nodes))
for i, n := range nodes {
result[i] = NodeSummary{
ID: n.ID,
Name: n.Name,
Hostname: n.Hostname,
IPAddress: n.IPAddress,
Status: n.Status,
IsLocal: n.IsLocal,
OS: n.OS,
Arch: n.Arch,
AgentVersion: n.AgentVer,
LastSeen: n.LastSeen,
CreatedAt: n.CreatedAt,
}
}
return result, nil
}
func (s *NodeService) Get(ctx context.Context, id uint) (*NodeSummary, error) {
node, err := s.repo.FindByID(ctx, id)
if err != nil {
return nil, err
}
if node == nil {
return nil, apperror.New(http.StatusNotFound, "NODE_NOT_FOUND", "节点不存在", nil)
}
return &NodeSummary{
ID: node.ID,
Name: node.Name,
Hostname: node.Hostname,
IPAddress: node.IPAddress,
Status: node.Status,
IsLocal: node.IsLocal,
OS: node.OS,
Arch: node.Arch,
AgentVersion: node.AgentVer,
LastSeen: node.LastSeen,
CreatedAt: node.CreatedAt,
}, nil
}
// Create registers a new remote node and returns its authentication token.
func (s *NodeService) Create(ctx context.Context, input NodeCreateInput) (string, error) {
token, err := generateToken()
if err != nil {
return "", fmt.Errorf("generate token: %w", err)
}
node := &model.Node{
Name: input.Name,
Token: token,
Status: model.NodeStatusOffline,
IsLocal: false,
LastSeen: time.Now().UTC(),
}
if err := s.repo.Create(ctx, node); err != nil {
return "", err
}
return token, nil
}
func (s *NodeService) Delete(ctx context.Context, id uint) error {
node, err := s.repo.FindByID(ctx, id)
if err != nil {
return err
}
if node == nil {
return apperror.New(http.StatusNotFound, "NODE_NOT_FOUND", "节点不存在", nil)
}
if node.IsLocal {
return apperror.BadRequest("NODE_DELETE_LOCAL", "无法删除本机节点", nil)
}
// 删除前检查是否有关联备份任务,避免孤立任务
if s.taskRepo != nil {
count, err := s.taskRepo.CountByNodeID(ctx, id)
if err != nil {
return err
}
if count > 0 {
return apperror.BadRequest(
"NODE_HAS_TASKS",
fmt.Sprintf("无法删除:该节点上还有 %d 个备份任务,请先删除或迁移", count),
nil,
)
}
}
return s.repo.Delete(ctx, id)
}
// ListDirectory lists the contents of a directory on the local node.
func (s *NodeService) ListDirectory(ctx context.Context, nodeID uint, path string) ([]DirEntry, error) {
node, err := s.repo.FindByID(ctx, nodeID)
if err != nil {
return nil, err
}
if node == nil {
return nil, apperror.New(http.StatusNotFound, "NODE_NOT_FOUND", "节点不存在", nil)
}
if !node.IsLocal {
// 远程节点:通过 Agent 命令队列做同步 RPC
return s.remoteListDirectory(ctx, node, path)
}
cleanPath := filepath.Clean(path)
entries, err := os.ReadDir(cleanPath)
if err != nil {
return nil, apperror.BadRequest("NODE_FS_READ_ERROR", fmt.Sprintf("无法读取目录: %s", err.Error()), err)
}
result := make([]DirEntry, 0, len(entries))
for _, entry := range entries {
info, _ := entry.Info()
size := int64(0)
if info != nil {
size = info.Size()
}
result = append(result, DirEntry{
Name: entry.Name(),
Path: filepath.Join(cleanPath, entry.Name()),
IsDir: entry.IsDir(),
Size: size,
})
}
sort.Slice(result, func(i, j int) bool {
if result[i].IsDir != result[j].IsDir {
return result[i].IsDir
}
return result[i].Name < result[j].Name
})
return result, nil
}
// OfflineThreshold 节点被判定为离线的心跳超时阈值。
// Agent 默认 15s 心跳一次45s 未见视为离线,预留 3 次重试空间。
const OfflineThreshold = 45 * time.Second
// StartOfflineMonitor 启动后台 goroutine定期把超时未心跳的节点标记为离线。
// 传入的 ctx 被取消后退出。
func (s *NodeService) StartOfflineMonitor(ctx context.Context, interval time.Duration) {
if interval <= 0 {
interval = 15 * time.Second
}
ticker := time.NewTicker(interval)
go func() {
defer ticker.Stop()
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
threshold := time.Now().UTC().Add(-OfflineThreshold)
_, _ = s.repo.MarkStaleOffline(ctx, threshold)
}
}
}()
}
// Heartbeat updates the node status when an agent reports in.
func (s *NodeService) Heartbeat(ctx context.Context, token string, hostname string, ip string, agentVer string, osName string, archName string) error {
node, err := s.repo.FindByToken(ctx, token)
if err != nil {
return err
}
if node == nil {
return apperror.Unauthorized("NODE_INVALID_TOKEN", "无效的节点认证令牌", nil)
}
node.Status = model.NodeStatusOnline
node.Hostname = hostname
node.IPAddress = ip
node.AgentVer = agentVer
if strings.TrimSpace(osName) != "" {
node.OS = osName
} else {
node.OS = runtime.GOOS
}
if strings.TrimSpace(archName) != "" {
node.Arch = archName
} else {
node.Arch = runtime.GOARCH
}
node.LastSeen = time.Now().UTC()
return s.repo.Update(ctx, node)
}
// Update 编辑节点名称。
func (s *NodeService) Update(ctx context.Context, id uint, input NodeUpdateInput) (*NodeSummary, error) {
node, err := s.repo.FindByID(ctx, id)
if err != nil {
return nil, err
}
if node == nil {
return nil, apperror.New(http.StatusNotFound, "NODE_NOT_FOUND", "节点不存在", nil)
}
node.Name = strings.TrimSpace(input.Name)
if err := s.repo.Update(ctx, node); err != nil {
return nil, err
}
return s.Get(ctx, id)
}
// DirEntry represents a file or directory in a node's file system.
type DirEntry struct {
Name string `json:"name"`
Path string `json:"path"`
IsDir bool `json:"isDir"`
Size int64 `json:"size"`
}
// remoteListDirectory 通过命令队列下发 list_dir 给 Agent 并同步等待结果。
// Agent 必须在线,且响应需在 15s 内返回,否则返回超时错误。
func (s *NodeService) remoteListDirectory(ctx context.Context, node *model.Node, path string) ([]DirEntry, error) {
if s.agentRPC == nil {
return nil, apperror.BadRequest("NODE_REMOTE_FS_NOT_SUPPORTED", "远程目录浏览未启用,需要 Master 启用 Agent 服务", nil)
}
if node.Status != model.NodeStatusOnline {
return nil, apperror.BadRequest("NODE_OFFLINE", "节点当前不在线,无法浏览其目录", nil)
}
if strings.TrimSpace(path) == "" {
path = "/"
}
cmdID, err := s.agentRPC.EnqueueCommand(ctx, node.ID, model.AgentCommandTypeListDir, map[string]any{"path": path})
if err != nil {
return nil, apperror.Internal("AGENT_COMMAND_ENQUEUE_FAILED", "下发目录浏览命令失败", err)
}
cmd, err := s.agentRPC.WaitForCommandResult(ctx, cmdID, 15*time.Second)
if err != nil {
return nil, err
}
if cmd.Status != model.AgentCommandStatusSucceeded {
msg := cmd.ErrorMessage
if msg == "" {
msg = fmt.Sprintf("command status: %s", cmd.Status)
}
return nil, apperror.BadRequest("NODE_FS_READ_ERROR", fmt.Sprintf("远程目录浏览失败: %s", msg), nil)
}
var result struct {
Entries []DirEntry `json:"entries"`
}
if err := json.Unmarshal([]byte(cmd.Result), &result); err != nil {
return nil, apperror.Internal("AGENT_RESULT_INVALID", "Agent 返回结果格式错误", err)
}
return result.Entries, nil
}
// detectLocalIP 获取本机第一个非回环 IPv4 地址。
func detectLocalIP() string {
addrs, err := net.InterfaceAddrs()
if err != nil {
return ""
}
for _, addr := range addrs {
if ipNet, ok := addr.(*net.IPNet); ok && !ipNet.IP.IsLoopback() {
if ipNet.IP.To4() != nil {
return ipNet.IP.String()
}
}
}
return ""
}
func generateToken() (string, error) {
b := make([]byte, 32)
if _, err := rand.Read(b); err != nil {
return "", err
}
return hex.EncodeToString(b), nil
}