mirror of
https://github.com/Syngnat/GoNavi.git
synced 2026-05-07 06:13:03 +08:00
🐛 fix(app): 为稳定期首次连接增加瞬时网络重试保护 (#309)
## 问题背景

在 app 启动后等待 20s 以上,再手动触发数据库连接时,遇到瞬时网络错误(如 `no route to host`)会立即失败,用户体感为“没有重试”。

相关讨论与上下文参考:

- https://github.com/Syngnat/GoNavi/pull/294

## 问题描述

此前重试保护逻辑主要以“应用启动窗口(20s)”为边界:

- 启动窗口内:瞬时网络失败会自动重试
- 启动窗口外:即使是瞬时网络失败也不重试

这导致“用户首次手动连接发生在稳定期”时,行为与预期不一致。

## 本地复现关键日志(节选)

```log
2026/03/27 15:21:04.792462 [INFO] 应用启动完成(首次连接保护窗口=20s,最多重试=4 次)
2026/03/27 15:22:29.196794 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=1m24.405s)
2026/03/27 15:22:29.208920 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 15:22:29.212453 [ERROR] DBGetDatabases 获取连接失败:... connect: no route to host
2026/03/27 16:07:45.463959 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=21s)
2026/03/27 16:07:45.470744 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 16:07:45.473604 [WARN] 检测到瞬时网络失败,准备重试连接:... 尝试=1/4 延迟=800ms
2026/03/27 16:07:46.277658 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=21.814s)
2026/03/27 16:07:46.281761 [INFO] 创建数据库驱动实例:... 尝试=2/4
2026/03/27 16:07:46.284741 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 16:20:59.298636 [INFO] 应用启动完成(首次连接保护窗口=20s,最多重试=4 次)
2026/03/27 16:23:26.180978 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=2m26.883s)
2026/03/27 16:23:26.201478 [INFO] 数据库连接成功并写入缓存:...
```

## 变更内容

- 调整连接重试判定逻辑:
  - 启动窗口内:保持原有重试预算(最多 4 次)
  - 启动窗口外:若为瞬时网络错误,补充一次保护重试(总计 2 次尝试)
  - 非瞬时错误(如认证失败)在稳定期不重试
- 日志文案泛化,避免“仅启动期”误导:
  - 数据库连接在重试后成功
  - 检测到瞬时网络失败,准备重试连接

## 测试与验证

### 新增/更新单元测试覆盖以下场景:

- 启动期瞬时错误重试并成功
- 稳定期瞬时错误重试一次并成功
- 稳定期瞬时错误持续失败时,仅重试一次后停止
- 稳定期非瞬时错误不重试
- 稳定期重试路径输出重试提示日志
- 启动期瞬时错误失败时使用完整重试预算

### 本地执行:

- go test ./internal/app -run StartupRetry -count=1
- go test ./internal/app -count=1

### 影响范围

- 连接建立重试策略(internal/app/app.go)
- 启动重试相关测试(internal/app/app_startup_connect_retry_test.go)

## 风险与回滚

- 风险:稳定期瞬时网络错误会增加一次重试等待(约 800ms)
- 回滚:可回退本 PR 即恢复“仅启动窗口重试”的旧策略
This commit is contained in:
@@ -608,7 +608,7 @@ func (a *App) connectDatabaseWithStartupRetry(rawConfig connection.ConnectionCon
|
||||
|
||||
if err := dbInst.Connect(connectConfig); err == nil {
|
||||
if attempt > 1 {
|
||||
logger.Warnf("数据库连接在启动保护重试后成功:%s 缓存Key=%s 尝试=%d/%d", formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts)
|
||||
logger.Warnf("数据库连接在重试后成功:%s 缓存Key=%s 尝试=%d/%d", formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts)
|
||||
}
|
||||
return dbInst, effectiveConfig, nil
|
||||
} else {
|
||||
@@ -616,10 +616,10 @@ func (a *App) connectDatabaseWithStartupRetry(rawConfig connection.ConnectionCon
|
||||
wrapped := wrapConnectError(effectiveConfig, err)
|
||||
lastErr = wrapped
|
||||
logger.Error(wrapped, "建立数据库连接失败:%s 缓存Key=%s", formatConnSummary(effectiveConfig), cacheKey)
|
||||
if !a.shouldRetryStartupConnect(err, attempt) {
|
||||
if !a.shouldRetryConnect(err, attempt) {
|
||||
return nil, effectiveConfig, wrapped
|
||||
}
|
||||
logger.Warnf("检测到启动期瞬时网络失败,准备重试连接:%s 缓存Key=%s 尝试=%d/%d 延迟=%s 原因=%s",
|
||||
logger.Warnf("检测到瞬时网络失败,准备重试连接:%s 缓存Key=%s 尝试=%d/%d 延迟=%s 原因=%s",
|
||||
formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts, startupConnectRetryDelay, normalizeErrorMessage(err))
|
||||
time.Sleep(startupConnectRetryDelay)
|
||||
}
|
||||
@@ -650,18 +650,21 @@ func (a *App) startupPhaseLabel() string {
|
||||
return fmt.Sprintf("稳定期(age=%s)", age)
|
||||
}
|
||||
|
||||
func (a *App) shouldRetryStartupConnect(err error, attempt int) bool {
|
||||
func (a *App) shouldRetryConnect(err error, attempt int) bool {
|
||||
if attempt >= startupConnectRetryAttempts {
|
||||
return false
|
||||
}
|
||||
if a == nil || a.startedAt.IsZero() {
|
||||
if !isTransientStartupConnectError(err) {
|
||||
return false
|
||||
}
|
||||
age := time.Since(a.startedAt)
|
||||
if age < 0 || age > startupConnectRetryWindow {
|
||||
return false
|
||||
if a != nil && !a.startedAt.IsZero() {
|
||||
age := time.Since(a.startedAt)
|
||||
if age >= 0 && age <= startupConnectRetryWindow {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return isTransientStartupConnectError(err)
|
||||
// Outside startup window, still grant one retry for transient network glitches.
|
||||
return attempt == 1
|
||||
}
|
||||
|
||||
func isTransientStartupConnectError(err error) bool {
|
||||
|
||||
@@ -2,11 +2,14 @@ package app
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"GoNavi-Wails/internal/connection"
|
||||
"GoNavi-Wails/internal/db"
|
||||
"GoNavi-Wails/internal/logger"
|
||||
)
|
||||
|
||||
type fakeStartupRetryDB struct {
|
||||
@@ -106,7 +109,7 @@ func TestConnectDatabaseWithStartupRetry_RetriesTransientFailureAndReappliesGlob
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindow(t *testing.T) {
|
||||
func TestConnectDatabaseWithStartupRetry_RetriesOnceOutsideStartupWindow(t *testing.T) {
|
||||
originalNewDatabaseFunc := newDatabaseFunc
|
||||
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
|
||||
defer func() {
|
||||
@@ -130,12 +133,165 @@ func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindow(t *tes
|
||||
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
|
||||
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
|
||||
|
||||
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if connectCalls != 2 {
|
||||
t.Fatalf("expected 2 connect attempts outside startup window, got %d", connectCalls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindowForNonTransientError(t *testing.T) {
|
||||
originalNewDatabaseFunc := newDatabaseFunc
|
||||
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
|
||||
defer func() {
|
||||
newDatabaseFunc = originalNewDatabaseFunc
|
||||
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
|
||||
}()
|
||||
|
||||
connectCalls := 0
|
||||
newDatabaseFunc = func(dbType string) (db.Database, error) {
|
||||
return &fakeStartupRetryDB{
|
||||
connect: func(config connection.ConnectionConfig) error {
|
||||
connectCalls++
|
||||
return errors.New("pq: password authentication failed")
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
|
||||
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
|
||||
|
||||
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if connectCalls != 1 {
|
||||
t.Fatalf("expected 1 connect attempt outside startup window, got %d", connectCalls)
|
||||
t.Fatalf("expected 1 connect attempt outside startup window for non-transient error, got %d", connectCalls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectDatabaseWithStartupRetry_LogsRetryHintOutsideStartupWindow(t *testing.T) {
|
||||
originalNewDatabaseFunc := newDatabaseFunc
|
||||
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
|
||||
defer func() {
|
||||
newDatabaseFunc = originalNewDatabaseFunc
|
||||
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
|
||||
}()
|
||||
|
||||
logPath := logger.Path()
|
||||
beforeSize := int64(0)
|
||||
if fi, err := os.Stat(logPath); err == nil {
|
||||
beforeSize = fi.Size()
|
||||
}
|
||||
|
||||
connectCalls := 0
|
||||
newDatabaseFunc = func(dbType string) (db.Database, error) {
|
||||
return &fakeStartupRetryDB{
|
||||
connect: func(config connection.ConnectionConfig) error {
|
||||
connectCalls++
|
||||
if connectCalls == 1 {
|
||||
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
|
||||
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
|
||||
|
||||
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
|
||||
if err != nil {
|
||||
t.Fatalf("expected success after retry, got error: %v", err)
|
||||
}
|
||||
if connectCalls != 2 {
|
||||
t.Fatalf("expected 2 connect attempts, got %d", connectCalls)
|
||||
}
|
||||
|
||||
logContent, readErr := os.ReadFile(logPath)
|
||||
if readErr != nil {
|
||||
t.Fatalf("read log failed: %v", readErr)
|
||||
}
|
||||
if int64(len(logContent)) < beforeSize {
|
||||
t.Fatalf("expected log file to grow, before=%d after=%d", beforeSize, len(logContent))
|
||||
}
|
||||
appended := string(logContent[beforeSize:])
|
||||
if !strings.Contains(appended, "检测到瞬时网络失败,准备重试连接") {
|
||||
t.Fatalf("expected retry hint log in appended segment, got: %s", appended)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectDatabaseWithStartupRetry_OutsideStartupWindowTransientFailureStopsAfterOneRetry(t *testing.T) {
|
||||
originalNewDatabaseFunc := newDatabaseFunc
|
||||
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
|
||||
defer func() {
|
||||
newDatabaseFunc = originalNewDatabaseFunc
|
||||
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
|
||||
}()
|
||||
|
||||
connectCalls := 0
|
||||
newDatabaseFunc = func(dbType string) (db.Database, error) {
|
||||
return &fakeStartupRetryDB{
|
||||
connect: func(config connection.ConnectionConfig) error {
|
||||
connectCalls++
|
||||
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
|
||||
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
|
||||
|
||||
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if connectCalls != 2 {
|
||||
t.Fatalf("expected 2 connect attempts outside startup window for transient error, got %d", connectCalls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestConnectDatabaseWithStartupRetry_StartupWindowTransientFailureUsesFullRetryBudget(t *testing.T) {
|
||||
originalNewDatabaseFunc := newDatabaseFunc
|
||||
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
|
||||
defer func() {
|
||||
newDatabaseFunc = originalNewDatabaseFunc
|
||||
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
|
||||
}()
|
||||
|
||||
connectCalls := 0
|
||||
newDatabaseFunc = func(dbType string) (db.Database, error) {
|
||||
return &fakeStartupRetryDB{
|
||||
connect: func(config connection.ConnectionConfig) error {
|
||||
connectCalls++
|
||||
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
|
||||
return raw, nil
|
||||
}
|
||||
|
||||
a := &App{startedAt: time.Now()}
|
||||
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
|
||||
|
||||
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
|
||||
if err == nil {
|
||||
t.Fatal("expected error, got nil")
|
||||
}
|
||||
if connectCalls != startupConnectRetryAttempts {
|
||||
t.Fatalf("expected %d connect attempts in startup window, got %d", startupConnectRetryAttempts, connectCalls)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user