🐛 fix(app): 为稳定期首次连接增加瞬时网络重试保护 (#309)

## 问题背景
在 app 启动后等待 20s 以上,再手动触发数据库连接时,遇到瞬时网络错误(如 `no route to
host`)会立即失败,用户体感为“没有重试”。

相关讨论与上下文参考:
- https://github.com/Syngnat/GoNavi/pull/294

## 问题描述
此前重试保护逻辑主要以“应用启动窗口(20s)”为边界:
- 启动窗口内:瞬时网络失败会自动重试
- 启动窗口外:即使是瞬时网络失败也不重试
这导致“用户首次手动连接发生在稳定期”时,行为与预期不一致。

## 本地复现关键日志(节选)
```log
2026/03/27 15:21:04.792462 [INFO] 应用启动完成(首次连接保护窗口=20s,最多重试=4 次)
2026/03/27 15:22:29.196794 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=1m24.405s)
2026/03/27 15:22:29.208920 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 15:22:29.212453 [ERROR] DBGetDatabases 获取连接失败:... connect: no route to host


2026/03/27 16:07:45.463959 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=21s)
2026/03/27 16:07:45.470744 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 16:07:45.473604 [WARN] 检测到瞬时网络失败,准备重试连接:... 尝试=1/4 延迟=800ms
2026/03/27 16:07:46.277658 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=21.814s)
2026/03/27 16:07:46.281761 [INFO] 创建数据库驱动实例:... 尝试=2/4
2026/03/27 16:07:46.284741 [ERROR] 建立数据库连接失败:... connect: no route to host
2026/03/27 16:20:59.298636 [INFO] 应用启动完成(首次连接保护窗口=20s,最多重试=4 次)
2026/03/27 16:23:26.180978 [INFO] 获取数据库连接:... 启动阶段=稳定期(age=2m26.883s)
2026/03/27 16:23:26.201478 [INFO] 数据库连接成功并写入缓存:...
```

## 变更内容
- 调整连接重试判定逻辑:
  - 启动窗口内:保持原有重试预算(最多 4 次)
  - 启动窗口外:若为瞬时网络错误,补充一次保护重试(总计 2 次尝试)
  - 非瞬时错误(如认证失败)在稳定期不重试
- 日志文案泛化,避免“仅启动期”误导:
  - 数据库连接在重试后成功
  - 检测到瞬时网络失败,准备重试连接
## 测试与验证
### 新增/更新单元测试覆盖以下场景:
- 启动期瞬时错误重试并成功
- 稳定期瞬时错误重试一次并成功
- 稳定期瞬时错误持续失败时,仅重试一次后停止
- 稳定期非瞬时错误不重试
- 稳定期重试路径输出重试提示日志
- 启动期瞬时错误失败时使用完整重试预算
### 本地执行:
- go test ./internal/app -run StartupRetry -count=1
- go test ./internal/app -count=1
### 影响范围
- 连接建立重试策略(internal/app/app.go)
- 启动重试相关测试(internal/app/app_startup_connect_retry_test.go)
## 风险与回滚
- 风险:稳定期瞬时网络错误会增加一次重试等待(约 800ms)
- 回滚:可回退本 PR 即恢复“仅启动窗口重试”的旧策略
This commit is contained in:
Syngnat
2026-03-27 17:30:14 +08:00
committed by GitHub
2 changed files with 170 additions and 11 deletions

View File

@@ -608,7 +608,7 @@ func (a *App) connectDatabaseWithStartupRetry(rawConfig connection.ConnectionCon
if err := dbInst.Connect(connectConfig); err == nil {
if attempt > 1 {
logger.Warnf("数据库连接在启动保护重试后成功:%s 缓存Key=%s 尝试=%d/%d", formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts)
logger.Warnf("数据库连接在重试后成功:%s 缓存Key=%s 尝试=%d/%d", formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts)
}
return dbInst, effectiveConfig, nil
} else {
@@ -616,10 +616,10 @@ func (a *App) connectDatabaseWithStartupRetry(rawConfig connection.ConnectionCon
wrapped := wrapConnectError(effectiveConfig, err)
lastErr = wrapped
logger.Error(wrapped, "建立数据库连接失败:%s 缓存Key=%s", formatConnSummary(effectiveConfig), cacheKey)
if !a.shouldRetryStartupConnect(err, attempt) {
if !a.shouldRetryConnect(err, attempt) {
return nil, effectiveConfig, wrapped
}
logger.Warnf("检测到启动期瞬时网络失败,准备重试连接:%s 缓存Key=%s 尝试=%d/%d 延迟=%s 原因=%s",
logger.Warnf("检测到瞬时网络失败,准备重试连接:%s 缓存Key=%s 尝试=%d/%d 延迟=%s 原因=%s",
formatConnSummary(effectiveConfig), cacheKey, attempt, startupConnectRetryAttempts, startupConnectRetryDelay, normalizeErrorMessage(err))
time.Sleep(startupConnectRetryDelay)
}
@@ -650,18 +650,21 @@ func (a *App) startupPhaseLabel() string {
return fmt.Sprintf("稳定期(age=%s)", age)
}
func (a *App) shouldRetryStartupConnect(err error, attempt int) bool {
func (a *App) shouldRetryConnect(err error, attempt int) bool {
if attempt >= startupConnectRetryAttempts {
return false
}
if a == nil || a.startedAt.IsZero() {
if !isTransientStartupConnectError(err) {
return false
}
age := time.Since(a.startedAt)
if age < 0 || age > startupConnectRetryWindow {
return false
if a != nil && !a.startedAt.IsZero() {
age := time.Since(a.startedAt)
if age >= 0 && age <= startupConnectRetryWindow {
return true
}
}
return isTransientStartupConnectError(err)
// Outside startup window, still grant one retry for transient network glitches.
return attempt == 1
}
func isTransientStartupConnectError(err error) bool {

View File

@@ -2,11 +2,14 @@ package app
import (
"errors"
"os"
"strings"
"testing"
"time"
"GoNavi-Wails/internal/connection"
"GoNavi-Wails/internal/db"
"GoNavi-Wails/internal/logger"
)
type fakeStartupRetryDB struct {
@@ -106,7 +109,7 @@ func TestConnectDatabaseWithStartupRetry_RetriesTransientFailureAndReappliesGlob
}
}
func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindow(t *testing.T) {
func TestConnectDatabaseWithStartupRetry_RetriesOnceOutsideStartupWindow(t *testing.T) {
originalNewDatabaseFunc := newDatabaseFunc
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
defer func() {
@@ -130,12 +133,165 @@ func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindow(t *tes
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
if err == nil {
t.Fatal("expected error, got nil")
}
if connectCalls != 2 {
t.Fatalf("expected 2 connect attempts outside startup window, got %d", connectCalls)
}
}
func TestConnectDatabaseWithStartupRetry_DoesNotRetryOutsideStartupWindowForNonTransientError(t *testing.T) {
originalNewDatabaseFunc := newDatabaseFunc
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
defer func() {
newDatabaseFunc = originalNewDatabaseFunc
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
}()
connectCalls := 0
newDatabaseFunc = func(dbType string) (db.Database, error) {
return &fakeStartupRetryDB{
connect: func(config connection.ConnectionConfig) error {
connectCalls++
return errors.New("pq: password authentication failed")
},
}, nil
}
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
return raw, nil
}
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
if err == nil {
t.Fatal("expected error, got nil")
}
if connectCalls != 1 {
t.Fatalf("expected 1 connect attempt outside startup window, got %d", connectCalls)
t.Fatalf("expected 1 connect attempt outside startup window for non-transient error, got %d", connectCalls)
}
}
func TestConnectDatabaseWithStartupRetry_LogsRetryHintOutsideStartupWindow(t *testing.T) {
originalNewDatabaseFunc := newDatabaseFunc
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
defer func() {
newDatabaseFunc = originalNewDatabaseFunc
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
}()
logPath := logger.Path()
beforeSize := int64(0)
if fi, err := os.Stat(logPath); err == nil {
beforeSize = fi.Size()
}
connectCalls := 0
newDatabaseFunc = func(dbType string) (db.Database, error) {
return &fakeStartupRetryDB{
connect: func(config connection.ConnectionConfig) error {
connectCalls++
if connectCalls == 1 {
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
}
return nil
},
}, nil
}
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
return raw, nil
}
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
if err != nil {
t.Fatalf("expected success after retry, got error: %v", err)
}
if connectCalls != 2 {
t.Fatalf("expected 2 connect attempts, got %d", connectCalls)
}
logContent, readErr := os.ReadFile(logPath)
if readErr != nil {
t.Fatalf("read log failed: %v", readErr)
}
if int64(len(logContent)) < beforeSize {
t.Fatalf("expected log file to grow, before=%d after=%d", beforeSize, len(logContent))
}
appended := string(logContent[beforeSize:])
if !strings.Contains(appended, "检测到瞬时网络失败,准备重试连接") {
t.Fatalf("expected retry hint log in appended segment, got: %s", appended)
}
}
func TestConnectDatabaseWithStartupRetry_OutsideStartupWindowTransientFailureStopsAfterOneRetry(t *testing.T) {
originalNewDatabaseFunc := newDatabaseFunc
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
defer func() {
newDatabaseFunc = originalNewDatabaseFunc
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
}()
connectCalls := 0
newDatabaseFunc = func(dbType string) (db.Database, error) {
return &fakeStartupRetryDB{
connect: func(config connection.ConnectionConfig) error {
connectCalls++
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
},
}, nil
}
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
return raw, nil
}
a := &App{startedAt: time.Now().Add(-startupConnectRetryWindow - time.Second)}
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
if err == nil {
t.Fatal("expected error, got nil")
}
if connectCalls != 2 {
t.Fatalf("expected 2 connect attempts outside startup window for transient error, got %d", connectCalls)
}
}
func TestConnectDatabaseWithStartupRetry_StartupWindowTransientFailureUsesFullRetryBudget(t *testing.T) {
originalNewDatabaseFunc := newDatabaseFunc
originalResolveDialConfigWithProxyFunc := resolveDialConfigWithProxyFunc
defer func() {
newDatabaseFunc = originalNewDatabaseFunc
resolveDialConfigWithProxyFunc = originalResolveDialConfigWithProxyFunc
}()
connectCalls := 0
newDatabaseFunc = func(dbType string) (db.Database, error) {
return &fakeStartupRetryDB{
connect: func(config connection.ConnectionConfig) error {
connectCalls++
return errors.New("dial tcp 10.1.131.86:5432: connect: no route to host")
},
}, nil
}
resolveDialConfigWithProxyFunc = func(raw connection.ConnectionConfig) (connection.ConnectionConfig, error) {
return raw, nil
}
a := &App{startedAt: time.Now()}
rawConfig := connection.ConnectionConfig{Type: "postgres", Host: "10.1.131.86", Port: 5432, User: "postgres"}
_, _, err := a.connectDatabaseWithStartupRetry(rawConfig)
if err == nil {
t.Fatal("expected error, got nil")
}
if connectCalls != startupConnectRetryAttempts {
t.Fatalf("expected %d connect attempts in startup window, got %d", startupConnectRetryAttempts, connectCalls)
}
}