Skip to content

Commit a3e7160

Browse files
committed
feat(coderd/agentconnectionbatcher): add batched agent heartbeat writes
Reduces agent connection heartbeat DB writes from ~667/s to ~2/s at 10k agents by batching UpdateWorkspaceAgentConnectionByID calls. Disconnect writes bypass the batcher for immediate visibility.
1 parent b8a5344 commit a3e7160

File tree

11 files changed

+714
-11
lines changed

11 files changed

+714
-11
lines changed
Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
package agentconnectionbatcher
2+
3+
import (
4+
"context"
5+
"database/sql"
6+
"time"
7+
8+
"github.com/google/uuid"
9+
10+
"cdr.dev/slog/v3"
11+
"github.com/coder/coder/v2/coderd/database"
12+
"github.com/coder/coder/v2/coderd/database/dbauthz"
13+
"github.com/coder/quartz"
14+
)
15+
16+
const (
	// defaultBatchSize is the maximum number of agent connection updates
	// to batch before forcing a flush. With one entry per agent, this
	// accommodates 500 concurrently connected agents per batch.
	defaultBatchSize = 500

	// defaultChannelBufferMultiplier is the multiplier for the channel
	// buffer size relative to the batch size. A 5x multiplier provides
	// significant headroom for bursts while the batch is being flushed.
	defaultChannelBufferMultiplier = 5

	// defaultFlushInterval is how frequently to flush batched connection
	// updates to the database. 5 seconds provides a good balance between
	// reducing database load and keeping connection state reasonably
	// current.
	defaultFlushInterval = 5 * time.Second

	// finalFlushTimeout is the timeout for the final flush when the
	// batcher is shutting down. It bounds how long Close blocks waiting
	// on the last database write.
	finalFlushTimeout = 15 * time.Second
)
37+
38+
// Update represents a single agent connection state update to be batched.
type Update struct {
	// ID is the workspace agent ID. It is also the deduplication key:
	// the batcher keeps only the newest pending Update per ID.
	ID uuid.UUID
	// FirstConnectedAt/LastConnectedAt/DisconnectedAt mirror the
	// nullable timestamp columns on the workspace agent row.
	FirstConnectedAt sql.NullTime
	LastConnectedAt  sql.NullTime
	// LastConnectedReplicaID records which replica the agent last
	// connected through; invalid means NULL.
	LastConnectedReplicaID uuid.NullUUID
	DisconnectedAt         sql.NullTime
	// UpdatedAt orders competing updates for the same agent; an update
	// older than the one already batched is discarded.
	UpdatedAt time.Time
}
47+
48+
// Batcher accumulates agent connection updates and periodically flushes
// them to the database in a single batch query. This reduces per-heartbeat
// database write pressure from O(n) queries to O(1).
type Batcher struct {
	store database.Store
	log   slog.Logger

	// updateCh carries updates from Add into the run goroutine.
	updateCh chan Update
	// batch holds the newest pending update per agent ID. It is owned
	// exclusively by the run goroutine; no locking is needed.
	batch map[uuid.UUID]Update
	// maxBatchSize triggers an early flush when the batch reaches it.
	maxBatchSize int

	clock quartz.Clock
	// timer drives periodic flushes every interval.
	timer    *quartz.Timer
	interval time.Duration

	// ctx/cancel bound the run goroutine's lifetime; done is closed
	// once run has returned (after its final flush).
	ctx    context.Context
	cancel context.CancelFunc
	done   chan struct{}
}
67+
68+
// Option is a functional option for configuring a Batcher. Options are
// applied in order by New before defaults are filled in.
type Option func(b *Batcher)
70+
71+
// WithBatchSize sets the maximum number of updates to accumulate before
72+
// forcing a flush.
73+
func WithBatchSize(size int) Option {
74+
return func(b *Batcher) {
75+
b.maxBatchSize = size
76+
}
77+
}
78+
79+
// WithInterval sets how frequently the batcher flushes to the database.
80+
func WithInterval(d time.Duration) Option {
81+
return func(b *Batcher) {
82+
b.interval = d
83+
}
84+
}
85+
86+
// WithLogger sets the logger for the batcher.
87+
func WithLogger(log slog.Logger) Option {
88+
return func(b *Batcher) {
89+
b.log = log
90+
}
91+
}
92+
93+
// WithClock sets the clock for the batcher, useful for testing.
94+
func WithClock(clock quartz.Clock) Option {
95+
return func(b *Batcher) {
96+
b.clock = clock
97+
}
98+
}
99+
100+
// New creates a new Batcher and starts its background processing loop.
101+
// The provided context controls the lifetime of the batcher.
102+
func New(ctx context.Context, store database.Store, opts ...Option) *Batcher {
103+
b := &Batcher{
104+
store: store,
105+
done: make(chan struct{}),
106+
log: slog.Logger{},
107+
clock: quartz.NewReal(),
108+
}
109+
110+
for _, opt := range opts {
111+
opt(b)
112+
}
113+
114+
if b.interval == 0 {
115+
b.interval = defaultFlushInterval
116+
}
117+
if b.maxBatchSize == 0 {
118+
b.maxBatchSize = defaultBatchSize
119+
}
120+
121+
b.timer = b.clock.NewTimer(b.interval)
122+
channelSize := b.maxBatchSize * defaultChannelBufferMultiplier
123+
b.updateCh = make(chan Update, channelSize)
124+
b.batch = make(map[uuid.UUID]Update)
125+
126+
b.ctx, b.cancel = context.WithCancel(ctx)
127+
go func() {
128+
b.run(b.ctx)
129+
close(b.done)
130+
}()
131+
132+
return b
133+
}
134+
135+
// Close cancels the batcher context and waits for the final flush to
// complete. It blocks until the run goroutine has exited, which is
// bounded by finalFlushTimeout on the final database write.
func (b *Batcher) Close() {
	b.cancel()
	// timer is always set by New; the nil check guards a zero-value
	// Batcher that was never started.
	if b.timer != nil {
		b.timer.Stop()
	}
	// done is closed by the goroutine started in New after run returns.
	<-b.done
}
144+
145+
// Add enqueues an agent connection update for batching. If the
// channel is full, a direct (unbatched) DB write is performed as a
// fallback so that heartbeats are never silently lost.
//
// NOTE: the fallback path is synchronous and can block the caller for
// up to the 10-second timeout used by writeDirect.
func (b *Batcher) Add(u Update) {
	select {
	case b.updateCh <- u:
	default:
		// Non-blocking send failed: the run loop has fallen behind.
		// Write directly rather than dropping the heartbeat, which
		// would eventually mark the agent disconnected.
		b.log.Warn(context.Background(), "connection batcher channel full, falling back to direct write",
			slog.F("agent_id", u.ID),
		)
		b.writeDirect(u)
	}
}
158+
159+
// writeDirect performs a single-item batch write as a fallback when
// the channel is full. This ensures heartbeats are never lost, which
// would cause agents to be erroneously marked as disconnected.
//
// A fresh background context is used (rather than the batcher's ctx)
// so the write survives batcher shutdown; it is bounded by a
// 10-second timeout.
func (b *Batcher) writeDirect(u Update) {
	//nolint:gocritic // System-level fallback for agent heartbeats.
	ctx, cancel := context.WithTimeout(dbauthz.AsSystemRestricted(context.Background()), 10*time.Second)
	defer cancel()
	// Reuse the batch query with single-element slices so both paths
	// share one code path in the store.
	err := b.store.BatchUpdateWorkspaceAgentConnections(ctx, database.BatchUpdateWorkspaceAgentConnectionsParams{
		ID:                     []uuid.UUID{u.ID},
		FirstConnectedAt:       []time.Time{nullTimeToTime(u.FirstConnectedAt)},
		LastConnectedAt:        []time.Time{nullTimeToTime(u.LastConnectedAt)},
		LastConnectedReplicaID: []uuid.UUID{nullUUIDToUUID(u.LastConnectedReplicaID)},
		DisconnectedAt:         []time.Time{nullTimeToTime(u.DisconnectedAt)},
		UpdatedAt:              []time.Time{u.UpdatedAt},
	})
	if err != nil {
		// Log only; the agent's next heartbeat will retry the update.
		b.log.Error(context.Background(), "direct heartbeat write failed",
			slog.F("agent_id", u.ID), slog.Error(err))
	}
}
179+
180+
func (b *Batcher) processUpdate(u Update) {
181+
existing, exists := b.batch[u.ID]
182+
if exists && u.UpdatedAt.Before(existing.UpdatedAt) {
183+
return
184+
}
185+
b.batch[u.ID] = u
186+
}
187+
188+
func (b *Batcher) run(ctx context.Context) {
189+
//nolint:gocritic // System-level batch operation for agent connections.
190+
authCtx := dbauthz.AsSystemRestricted(ctx)
191+
for {
192+
select {
193+
case u := <-b.updateCh:
194+
b.processUpdate(u)
195+
196+
if len(b.batch) >= b.maxBatchSize {
197+
b.flush(authCtx)
198+
b.timer.Reset(b.interval, "connectionBatcher", "capacityFlush")
199+
}
200+
201+
case <-b.timer.C:
202+
b.flush(authCtx)
203+
b.timer.Reset(b.interval, "connectionBatcher", "scheduledFlush")
204+
205+
case <-ctx.Done():
206+
b.log.Debug(ctx, "context done, flushing before exit")
207+
208+
ctxTimeout, cancel := context.WithTimeout(context.Background(), finalFlushTimeout)
209+
defer cancel() //nolint:revive // Returning after this.
210+
211+
//nolint:gocritic // System-level batch operation for agent connections.
212+
b.flush(dbauthz.AsSystemRestricted(ctxTimeout))
213+
return
214+
}
215+
}
216+
}
217+
218+
// flush writes all pending updates to the database in one batch query
// and resets the batch. It is a no-op when the batch is empty. flush is
// only ever called from the run goroutine, so the batch map needs no
// locking.
func (b *Batcher) flush(ctx context.Context) {
	count := len(b.batch)
	if count == 0 {
		return
	}

	b.log.Debug(ctx, "flushing connection batch", slog.F("count", count))

	// The batch query takes parallel arrays (unnest-style), one entry
	// per agent, all pre-sized to the batch length.
	var (
		ids                    = make([]uuid.UUID, 0, count)
		firstConnectedAt       = make([]time.Time, 0, count)
		lastConnectedAt        = make([]time.Time, 0, count)
		lastConnectedReplicaID = make([]uuid.UUID, 0, count)
		disconnectedAt         = make([]time.Time, 0, count)
		updatedAt              = make([]time.Time, 0, count)
	)

	for _, u := range b.batch {
		ids = append(ids, u.ID)
		firstConnectedAt = append(firstConnectedAt, nullTimeToTime(u.FirstConnectedAt))
		lastConnectedAt = append(lastConnectedAt, nullTimeToTime(u.LastConnectedAt))
		lastConnectedReplicaID = append(lastConnectedReplicaID, nullUUIDToUUID(u.LastConnectedReplicaID))
		disconnectedAt = append(disconnectedAt, nullTimeToTime(u.DisconnectedAt))
		updatedAt = append(updatedAt, u.UpdatedAt)
	}

	// Clear batch before the DB call. Losing a batch of heartbeat
	// timestamps is acceptable; the next heartbeat will update them.
	b.batch = make(map[uuid.UUID]Update)

	err := b.store.BatchUpdateWorkspaceAgentConnections(ctx, database.BatchUpdateWorkspaceAgentConnectionsParams{
		ID:                     ids,
		FirstConnectedAt:       firstConnectedAt,
		LastConnectedAt:        lastConnectedAt,
		LastConnectedReplicaID: lastConnectedReplicaID,
		DisconnectedAt:         disconnectedAt,
		UpdatedAt:              updatedAt,
	})
	if err != nil {
		// Cancellation is expected during shutdown; log quietly.
		if database.IsQueryCanceledError(err) {
			b.log.Debug(ctx, "query canceled, skipping connection batch update")
			return
		}
		b.log.Error(ctx, "failed to batch update agent connections", slog.Error(err))
		return
	}

	b.log.Debug(ctx, "connection batch flush complete", slog.F("count", count))
}
267+
268+
// nullTimeToTime converts a sql.NullTime to a time.Time. When the
269+
// NullTime is not valid, the zero time is returned which PostgreSQL
270+
// will store as the epoch. The batch query uses unnest over plain
271+
// time arrays, so we cannot pass NULL directly.
272+
func nullTimeToTime(nt sql.NullTime) time.Time {
273+
if nt.Valid {
274+
return nt.Time
275+
}
276+
return time.Time{}
277+
}
278+
279+
// nullUUIDToUUID converts a uuid.NullUUID to a uuid.UUID. When the
280+
// NullUUID is not valid, uuid.Nil is returned.
281+
func nullUUIDToUUID(nu uuid.NullUUID) uuid.UUID {
282+
if nu.Valid {
283+
return nu.UUID
284+
}
285+
return uuid.Nil
286+
}

0 commit comments

Comments
 (0)