mirror of
https://github.com/Awuqing/BackupX.git
synced 2026-06-07 18:59:34 +08:00
feat(BackupX): 修复跨节点备份恢复终态处理 (#60)
* feat(BackupX): 修复集群部署管理逻辑 * feat(BackupX): 修复节点池任务运行归属 * feat(BackupX): 修复跨节点恢复路由 * feat(BackupX): 修复跨节点备份恢复终态处理 * test(BackupX): 稳定安装流HTTP测试
This commit is contained in:
@@ -17,12 +17,21 @@ type AgentCommandRepository interface {
|
||||
// 并返回领取到的命令。无命令时返回 (nil, nil)。
|
||||
ClaimPending(ctx context.Context, nodeID uint) (*model.AgentCommand, error)
|
||||
Update(ctx context.Context, cmd *model.AgentCommand) error
|
||||
// CompleteDispatched 只在命令仍处于 dispatched 时写入终态。
|
||||
// 返回 false 表示命令已被超时监控或其它流程终结,调用方不应覆盖。
|
||||
CompleteDispatched(ctx context.Context, cmd *model.AgentCommand) (bool, error)
|
||||
// MarkStaleTimeout 把 dispatched 状态但超时未完成的命令标记为 timeout。
|
||||
// 返回被标记的行数。不返回具体命令(供背景监控简单调用)。
|
||||
MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error)
|
||||
// TimeoutActive 只在命令仍处于 pending/dispatched 时写入 timeout。
|
||||
// 返回 false 表示命令已被 Agent 回写为终态,调用方不应覆盖。
|
||||
TimeoutActive(ctx context.Context, cmd *model.AgentCommand) (bool, error)
|
||||
// ListStaleDispatched 列出 dispatched 但已超时、尚未被标记的命令。
|
||||
// 调用方需要把它们逐一标记 timeout 并联动关联记录状态。
|
||||
ListStaleDispatched(ctx context.Context, threshold time.Time) ([]model.AgentCommand, error)
|
||||
// ListStaleActive 列出 pending/dispatched 但已超时、尚未完成的命令。
|
||||
// pending 使用 created_at 判定,dispatched 使用 dispatched_at 判定。
|
||||
ListStaleActive(ctx context.Context, threshold time.Time) ([]model.AgentCommand, error)
|
||||
// ListPendingByNode 列出某节点下的所有 pending/dispatched 命令。
|
||||
// 用于删除节点或节点离线时的清理。
|
||||
ListPendingByNode(ctx context.Context, nodeID uint) ([]model.AgentCommand, error)
|
||||
@@ -94,6 +103,21 @@ func (r *GormAgentCommandRepository) Update(ctx context.Context, cmd *model.Agen
|
||||
return r.db.WithContext(ctx).Save(cmd).Error
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) CompleteDispatched(ctx context.Context, cmd *model.AgentCommand) (bool, error) {
|
||||
result := r.db.WithContext(ctx).Model(&model.AgentCommand{}).
|
||||
Where("id = ? AND node_id = ? AND status = ?", cmd.ID, cmd.NodeID, model.AgentCommandStatusDispatched).
|
||||
Updates(map[string]any{
|
||||
"status": cmd.Status,
|
||||
"error_message": cmd.ErrorMessage,
|
||||
"result": cmd.Result,
|
||||
"completed_at": cmd.CompletedAt,
|
||||
})
|
||||
if result.Error != nil {
|
||||
return false, result.Error
|
||||
}
|
||||
return result.RowsAffected > 0, nil
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) MarkStaleTimeout(ctx context.Context, threshold time.Time) (int64, error) {
|
||||
result := r.db.WithContext(ctx).Model(&model.AgentCommand{}).
|
||||
Where("status = ? AND dispatched_at < ?", model.AgentCommandStatusDispatched, threshold).
|
||||
@@ -107,6 +131,20 @@ func (r *GormAgentCommandRepository) MarkStaleTimeout(ctx context.Context, thres
|
||||
return result.RowsAffected, nil
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) TimeoutActive(ctx context.Context, cmd *model.AgentCommand) (bool, error) {
|
||||
result := r.db.WithContext(ctx).Model(&model.AgentCommand{}).
|
||||
Where("id = ? AND status IN ?", cmd.ID, []string{model.AgentCommandStatusPending, model.AgentCommandStatusDispatched}).
|
||||
Updates(map[string]any{
|
||||
"status": model.AgentCommandStatusTimeout,
|
||||
"error_message": cmd.ErrorMessage,
|
||||
"completed_at": cmd.CompletedAt,
|
||||
})
|
||||
if result.Error != nil {
|
||||
return false, result.Error
|
||||
}
|
||||
return result.RowsAffected > 0, nil
|
||||
}
|
||||
|
||||
// ListStaleDispatched 列出 dispatched 但 dispatched_at 早于 threshold 的命令。
|
||||
func (r *GormAgentCommandRepository) ListStaleDispatched(ctx context.Context, threshold time.Time) ([]model.AgentCommand, error) {
|
||||
var items []model.AgentCommand
|
||||
@@ -119,6 +157,21 @@ func (r *GormAgentCommandRepository) ListStaleDispatched(ctx context.Context, th
|
||||
return items, nil
|
||||
}
|
||||
|
||||
func (r *GormAgentCommandRepository) ListStaleActive(ctx context.Context, threshold time.Time) ([]model.AgentCommand, error) {
|
||||
var items []model.AgentCommand
|
||||
if err := r.db.WithContext(ctx).
|
||||
Where(
|
||||
"(status = ? AND created_at < ?) OR (status = ? AND dispatched_at < ?)",
|
||||
model.AgentCommandStatusPending, threshold,
|
||||
model.AgentCommandStatusDispatched, threshold,
|
||||
).
|
||||
Order("id asc").
|
||||
Find(&items).Error; err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return items, nil
|
||||
}
|
||||
|
||||
// ListPendingByNode 列出某节点下所有待执行(pending 或 dispatched)命令。
|
||||
func (r *GormAgentCommandRepository) ListPendingByNode(ctx context.Context, nodeID uint) ([]model.AgentCommand, error) {
|
||||
var items []model.AgentCommand
|
||||
|
||||
@@ -90,6 +90,78 @@ func TestAgentCommandRepository_Update(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_CompleteDispatchedOnlyUpdatesDispatchedCommand(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
dispatched := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusDispatched}
|
||||
timeout := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusTimeout, ErrorMessage: "timeout"}
|
||||
if err := repo.Create(ctx, dispatched); err != nil {
|
||||
t.Fatalf("Create dispatched returned error: %v", err)
|
||||
}
|
||||
if err := repo.Create(ctx, timeout); err != nil {
|
||||
t.Fatalf("Create timeout returned error: %v", err)
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
dispatched.Status = model.AgentCommandStatusSucceeded
|
||||
dispatched.Result = `{"ok":true}`
|
||||
dispatched.CompletedAt = &now
|
||||
updated, err := repo.CompleteDispatched(ctx, dispatched)
|
||||
if err != nil {
|
||||
t.Fatalf("CompleteDispatched returned error: %v", err)
|
||||
}
|
||||
if !updated {
|
||||
t.Fatal("expected dispatched command to be updated")
|
||||
}
|
||||
|
||||
timeout.Status = model.AgentCommandStatusSucceeded
|
||||
timeout.Result = `{"late":true}`
|
||||
timeout.CompletedAt = &now
|
||||
updated, err = repo.CompleteDispatched(ctx, timeout)
|
||||
if err != nil {
|
||||
t.Fatalf("CompleteDispatched terminal returned error: %v", err)
|
||||
}
|
||||
if updated {
|
||||
t.Fatal("expected terminal command not to be updated")
|
||||
}
|
||||
gotTimeout, err := repo.FindByID(ctx, timeout.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("FindByID timeout returned error: %v", err)
|
||||
}
|
||||
if gotTimeout.Status != model.AgentCommandStatusTimeout || gotTimeout.Result != "" {
|
||||
t.Fatalf("expected timeout command unchanged, got %#v", gotTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_TimeoutActiveDoesNotOverwriteTerminalCommand(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
succeeded := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusSucceeded, Result: `{"ok":true}`}
|
||||
if err := repo.Create(ctx, succeeded); err != nil {
|
||||
t.Fatalf("Create succeeded returned error: %v", err)
|
||||
}
|
||||
|
||||
now := time.Now().UTC()
|
||||
succeeded.ErrorMessage = "timeout"
|
||||
succeeded.CompletedAt = &now
|
||||
updated, err := repo.TimeoutActive(ctx, succeeded)
|
||||
if err != nil {
|
||||
t.Fatalf("TimeoutActive returned error: %v", err)
|
||||
}
|
||||
if updated {
|
||||
t.Fatal("expected terminal command not to be timed out")
|
||||
}
|
||||
got, err := repo.FindByID(ctx, succeeded.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("FindByID returned error: %v", err)
|
||||
}
|
||||
if got.Status != model.AgentCommandStatusSucceeded || got.ErrorMessage != "" || got.Result != `{"ok":true}` {
|
||||
t.Fatalf("expected succeeded command unchanged, got %#v", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_MarkStaleTimeout(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
@@ -118,3 +190,31 @@ func TestAgentCommandRepository_MarkStaleTimeout(t *testing.T) {
|
||||
t.Errorf("new should stay dispatched: %+v", newGot)
|
||||
}
|
||||
}
|
||||
|
||||
func TestAgentCommandRepository_ListStaleActiveIncludesPendingAndDispatched(t *testing.T) {
|
||||
db := newTestDB(t)
|
||||
repo := NewAgentCommandRepository(db)
|
||||
ctx := context.Background()
|
||||
old := time.Now().Add(-time.Hour)
|
||||
recent := time.Now()
|
||||
oldPending := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusPending, CreatedAt: old}
|
||||
oldDispatched := &model.AgentCommand{NodeID: 1, Type: "restore_record", Status: model.AgentCommandStatusDispatched, DispatchedAt: &old}
|
||||
recentPending := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusPending, CreatedAt: recent}
|
||||
succeeded := &model.AgentCommand{NodeID: 1, Type: "run_task", Status: model.AgentCommandStatusSucceeded, CreatedAt: old}
|
||||
for _, cmd := range []*model.AgentCommand{oldPending, oldDispatched, recentPending, succeeded} {
|
||||
if err := repo.Create(ctx, cmd); err != nil {
|
||||
t.Fatalf("Create returned error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
items, err := repo.ListStaleActive(ctx, time.Now().Add(-30*time.Minute))
|
||||
if err != nil {
|
||||
t.Fatalf("ListStaleActive returned error: %v", err)
|
||||
}
|
||||
if len(items) != 2 {
|
||||
t.Fatalf("expected 2 stale active commands, got %#v", items)
|
||||
}
|
||||
if items[0].ID != oldPending.ID || items[1].ID != oldDispatched.ID {
|
||||
t.Fatalf("unexpected stale active order/items: %#v", items)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,6 +3,7 @@ package repository
|
||||
import (
|
||||
"context"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -83,6 +84,59 @@ func TestInstallTokenConsumeExpired(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallTokenConsumeConcurrentOnlyOneWins(t *testing.T) {
|
||||
db := openTestInstallTokenDB(t)
|
||||
repo := NewAgentInstallTokenRepository(db)
|
||||
ctx := context.Background()
|
||||
|
||||
tok := &model.AgentInstallToken{
|
||||
Token: "concurrent", NodeID: 1, Mode: model.InstallModeSystemd,
|
||||
Arch: model.InstallArchAuto, AgentVer: "v1.7.0",
|
||||
DownloadSrc: model.InstallSourceGitHub,
|
||||
ExpiresAt: time.Now().UTC().Add(15 * time.Minute),
|
||||
CreatedByID: 1,
|
||||
}
|
||||
if err := repo.Create(ctx, tok); err != nil {
|
||||
t.Fatalf("create: %v", err)
|
||||
}
|
||||
|
||||
const workers = 8
|
||||
var wg sync.WaitGroup
|
||||
start := make(chan struct{})
|
||||
results := make(chan *model.AgentInstallToken, workers)
|
||||
errs := make(chan error, workers)
|
||||
for i := 0; i < workers; i++ {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
<-start
|
||||
got, err := repo.ConsumeByToken(ctx, "concurrent")
|
||||
if err != nil {
|
||||
errs <- err
|
||||
return
|
||||
}
|
||||
results <- got
|
||||
}()
|
||||
}
|
||||
close(start)
|
||||
wg.Wait()
|
||||
close(results)
|
||||
close(errs)
|
||||
|
||||
for err := range errs {
|
||||
t.Fatalf("consume err: %v", err)
|
||||
}
|
||||
success := 0
|
||||
for got := range results {
|
||||
if got != nil {
|
||||
success++
|
||||
}
|
||||
}
|
||||
if success != 1 {
|
||||
t.Fatalf("expected exactly one successful consume, got %d", success)
|
||||
}
|
||||
}
|
||||
|
||||
func TestInstallTokenGC(t *testing.T) {
|
||||
db := openTestInstallTokenDB(t)
|
||||
repo := NewAgentInstallTokenRepository(db)
|
||||
|
||||
@@ -33,6 +33,7 @@ type BackupStorageUsageItem struct {
|
||||
type BackupRecordRepository interface {
|
||||
List(context.Context, BackupRecordListOptions) ([]model.BackupRecord, error)
|
||||
FindByID(context.Context, uint) (*model.BackupRecord, error)
|
||||
FindRunningByTaskAndNode(context.Context, uint, uint) (*model.BackupRecord, error)
|
||||
Create(context.Context, *model.BackupRecord) error
|
||||
Update(context.Context, *model.BackupRecord) error
|
||||
Delete(context.Context, uint) error
|
||||
@@ -93,6 +94,20 @@ func (r *GormBackupRecordRepository) FindByID(ctx context.Context, id uint) (*mo
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
func (r *GormBackupRecordRepository) FindRunningByTaskAndNode(ctx context.Context, taskID uint, nodeID uint) (*model.BackupRecord, error) {
|
||||
var item model.BackupRecord
|
||||
if err := r.db.WithContext(ctx).
|
||||
Where("task_id = ? AND node_id = ? AND status = ?", taskID, nodeID, model.BackupRecordStatusRunning).
|
||||
Order("id desc").
|
||||
First(&item).Error; err != nil {
|
||||
if errors.Is(err, gorm.ErrRecordNotFound) {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
return &item, nil
|
||||
}
|
||||
|
||||
func (r *GormBackupRecordRepository) Create(ctx context.Context, item *model.BackupRecord) error {
|
||||
return r.db.WithContext(ctx).Create(item).Error
|
||||
}
|
||||
|
||||
@@ -226,7 +226,7 @@ func (r *GormBackupTaskRepository) Create(ctx context.Context, item *model.Backu
|
||||
}
|
||||
|
||||
func (r *GormBackupTaskRepository) Update(ctx context.Context, item *model.BackupTask) error {
|
||||
if err := r.db.WithContext(ctx).Save(item).Error; err != nil {
|
||||
if err := r.db.WithContext(ctx).Omit("StorageTarget", "StorageTargets", "Node").Save(item).Error; err != nil {
|
||||
return err
|
||||
}
|
||||
if len(item.StorageTargets) > 0 {
|
||||
|
||||
@@ -92,3 +92,49 @@ func TestBackupTaskRepositoryCRUD(t *testing.T) {
|
||||
t.Fatalf("expected task deleted, got %#v", deleted)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBackupTaskRepositoryUpdateCanClearNodeIDAfterPreload(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
repo := newBackupTaskTestRepository(t)
|
||||
remoteNode := &model.Node{Name: "edge-1", Token: "edge-token", Status: model.NodeStatusOnline, IsLocal: false}
|
||||
if err := repo.db.WithContext(ctx).Create(remoteNode).Error; err != nil {
|
||||
t.Fatalf("create node: %v", err)
|
||||
}
|
||||
task := &model.BackupTask{
|
||||
Name: "pooled-source",
|
||||
Type: "file",
|
||||
Enabled: true,
|
||||
SourcePath: "/srv/www/site",
|
||||
StorageTargetID: 1,
|
||||
NodeID: remoteNode.ID,
|
||||
RetentionDays: 30,
|
||||
Compression: "gzip",
|
||||
MaxBackups: 10,
|
||||
LastStatus: "idle",
|
||||
}
|
||||
if err := repo.Create(ctx, task); err != nil {
|
||||
t.Fatalf("Create returned error: %v", err)
|
||||
}
|
||||
loaded, err := repo.FindByID(ctx, task.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("FindByID returned error: %v", err)
|
||||
}
|
||||
if loaded == nil || loaded.Node.ID != remoteNode.ID {
|
||||
t.Fatalf("expected preloaded node %d, got %#v", remoteNode.ID, loaded)
|
||||
}
|
||||
loaded.NodeID = 0
|
||||
loaded.NodePoolTag = "db"
|
||||
if err := repo.Update(ctx, loaded); err != nil {
|
||||
t.Fatalf("Update returned error: %v", err)
|
||||
}
|
||||
stored, err := repo.FindByID(ctx, task.ID)
|
||||
if err != nil {
|
||||
t.Fatalf("FindByID after update returned error: %v", err)
|
||||
}
|
||||
if stored.NodeID != 0 {
|
||||
t.Fatalf("expected NodeID to be cleared, got %d", stored.NodeID)
|
||||
}
|
||||
if stored.NodePoolTag != "db" {
|
||||
t.Fatalf("expected NodePoolTag db, got %q", stored.NodePoolTag)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user