Skip to content

Commit 1a774ab

Browse files
fix(tailnet): retry after transport dial timeouts (#22977) (cherry-pick/v2.31) (#22992)
Backport of #22977 to 2.31
1 parent 581e956 commit 1a774ab

File tree

2 files changed

+104
-1
lines changed

2 files changed

+104
-1
lines changed

tailnet/controllers.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1429,7 +1429,7 @@ func (c *Controller) Run(ctx context.Context) {
14291429

14301430
tailnetClients, err := c.Dialer.Dial(c.ctx, c.ResumeTokenCtrl)
14311431
if err != nil {
1432-
if xerrors.Is(err, context.Canceled) || xerrors.Is(err, context.DeadlineExceeded) {
1432+
if c.ctx.Err() != nil {
14331433
return
14341434
}
14351435

tailnet/controllers_test.go

Lines changed: 103 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1075,6 +1075,84 @@ func TestController_Disconnects(t *testing.T) {
10751075
_ = testutil.TryReceive(testCtx, t, uut.Closed())
10761076
}
10771077

1078+
// TestController_RetriesWrappedDeadlineExceeded checks that the controller
// dials a second time when the first dial fails with context.DeadlineExceeded
// wrapped in a *net.OpError while the context passed to Run has not been
// cancelled, and that the controller only closes once that context is cancelled.
func TestController_RetriesWrappedDeadlineExceeded(t *testing.T) {
1079+
t.Parallel()
1080+
testCtx := testutil.Context(t, testutil.WaitShort)
1081+
ctx, cancel := context.WithCancel(testCtx)
1082+
defer cancel()
1083+
1084+
logger := testutil.Logger(t)
1085+
dialer := &scriptedDialer{
1086+
attempts: make(chan int, 10),
1087+
dialFn: func(ctx context.Context, attempt int) (tailnet.ControlProtocolClients, error) {
1088+
// First attempt: fail with a deadline error constructed here, i.e. one
// that does not come from the dial context being done.
if attempt == 1 {
1089+
return tailnet.ControlProtocolClients{}, &net.OpError{
1090+
Op: "dial",
1091+
Net: "tcp",
1092+
Err: context.DeadlineExceeded,
1093+
}
1094+
}
1095+
1096+
// Later attempts block until the context passed to this dial is done,
// then report that context's error.
<-ctx.Done()
1097+
return tailnet.ControlProtocolClients{}, ctx.Err()
1098+
},
1099+
}
1100+
1101+
uut := tailnet.NewController(logger.Named("ctrl"), dialer)
1102+
uut.Run(ctx)
1103+
1104+
require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))
1105+
// Seeing attempt 2 shows the controller dialed again after the wrapped
// deadline error.
require.Equal(t, 2, testutil.TryReceive(testCtx, t, dialer.attempts))
1106+
1107+
// Non-blocking check: the controller must not have closed yet.
select {
1108+
case <-uut.Closed():
1109+
t.Fatal("controller exited after wrapped deadline exceeded")
1110+
default:
1111+
}
1112+
1113+
// Cancel the run context and wait for the controller to report closed.
cancel()
1114+
_ = testutil.TryReceive(testCtx, t, uut.Closed())
1115+
}
1116+
1117+
// TestController_DoesNotRedialAfterCancel checks that once the context passed
// to Run is cancelled and the controller has closed, no dial attempt beyond
// the first one has been recorded.
func TestController_DoesNotRedialAfterCancel(t *testing.T) {
1118+
t.Parallel()
1119+
testCtx := testutil.Context(t, testutil.WaitShort)
1120+
ctx, cancel := context.WithCancel(testCtx)
1121+
logger := testutil.Logger(t)
1122+
1123+
fClient := newFakeWorkspaceUpdateClient(testCtx, t)
1124+
dialer := &scriptedDialer{
1125+
attempts: make(chan int, 10),
1126+
// Every dial succeeds and hands back the fake workspace-updates client.
dialFn: func(_ context.Context, _ int) (tailnet.ControlProtocolClients, error) {
1127+
return tailnet.ControlProtocolClients{
1128+
WorkspaceUpdates: fClient,
1129+
Closer: fakeCloser{},
1130+
}, nil
1131+
},
1132+
}
1133+
fCtrl := newFakeUpdatesController(testCtx, t)
1134+
1135+
uut := tailnet.NewController(logger.Named("ctrl"), dialer)
1136+
uut.WorkspaceUpdatesCtrl = fCtrl
1137+
uut.Run(ctx)
1138+
1139+
require.Equal(t, 1, testutil.TryReceive(testCtx, t, dialer.attempts))
1140+
// The fake updates controller is expected to be called with the dialed
// client; answer that call with a fake CloserWaiter.
call := testutil.TryReceive(testCtx, t, fCtrl.calls)
1141+
require.Equal(t, fClient, call.client)
1142+
testutil.RequireSend[tailnet.CloserWaiter](testCtx, t, call.resp, newFakeCloserWaiter())
1143+
1144+
// Cancel the run context, answer the fake client's close call with a nil
// error, then wait for the controller to report closed.
cancel()
1145+
closeCall := testutil.TryReceive(testCtx, t, fClient.close)
1146+
testutil.RequireSend(testCtx, t, closeCall, nil)
1147+
_ = testutil.TryReceive(testCtx, t, uut.Closed())
1148+
1149+
// Non-blocking check: no further attempt number may be queued on the
// buffered attempts channel.
select {
1150+
case attempt := <-dialer.attempts:
1151+
t.Fatalf("unexpected redial attempt after cancel: %d", attempt)
1152+
default:
1153+
}
1154+
}
1155+
10781156
func TestController_TelemetrySuccess(t *testing.T) {
10791157
t.Parallel()
10801158
ctx := testutil.Context(t, testutil.WaitShort)
@@ -2070,6 +2148,31 @@ func newFakeCloserWaiter() *fakeCloserWaiter {
20702148
}
20712149
}
20722150

2151+
// scriptedDialer is a test dialer whose result for each Dial call is produced
// by dialFn, keyed on the call's attempt number.
type scriptedDialer struct {
2152+
// attempts, when non-nil, receives the attempt number of each Dial call.
attempts chan int
2153+
// dialFn produces the result of a Dial call from the dial context and the
// attempt number.
dialFn func(context.Context, int) (tailnet.ControlProtocolClients, error)
2154+
2155+
// mu is held by Dial while it updates attemptN.
mu sync.Mutex
2156+
// attemptN is the number of Dial calls made so far.
attemptN int
2157+
}
2158+
2159+
// Dial increments the attempt counter, reports the new attempt number on
// d.attempts when that channel is non-nil, and then returns whatever d.dialFn
// returns for this context and attempt number. If ctx is done before the
// attempt number can be sent, it returns empty clients and ctx.Err() without
// calling d.dialFn.
func (d *scriptedDialer) Dial(ctx context.Context, _ tailnet.ResumeTokenController) (tailnet.ControlProtocolClients, error) {
2160+
// Take the next attempt number under the lock; the first call yields 1.
d.mu.Lock()
2161+
d.attemptN++
2162+
attempt := d.attemptN
2163+
d.mu.Unlock()
2164+
2165+
if d.attempts != nil {
2166+
// Do not block past ctx if nobody is receiving attempt numbers.
select {
2167+
case d.attempts <- attempt:
2168+
case <-ctx.Done():
2169+
return tailnet.ControlProtocolClients{}, ctx.Err()
2170+
}
2171+
}
2172+
2173+
return d.dialFn(ctx, attempt)
2174+
}
2175+
20732176
// fakeWorkspaceUpdatesDialer holds a single tailnet.WorkspaceUpdatesClient.
// NOTE(review): presumably a test dialer that hands back client; its methods
// are not visible in this diff — confirm against the full file.
type fakeWorkspaceUpdatesDialer struct {
20742177
client tailnet.WorkspaceUpdatesClient
20752178
}

0 commit comments

Comments
 (0)