Skip to content

Commit d35c20d

Browse files
Daniel McCarneyjsha
authored andcommitted
boulder-janitor: switch workbatch gauge to counter. (letsencrypt#4477)
A gauge wasn't the appropriate stat type choice for this usage. Switching the stat to be a counter instead of a gauge means we can't detect when the janitor is finished its work in the integration test by watching for this stat to drop to zero for all the table labels we're concerned with. Instead the test is updated to watch for the counter value to stabilize for a period longer than the workbatch sleep.
1 parent 83882ab commit d35c20d

File tree

3 files changed

+25
-29
lines changed

3 files changed

+25
-29
lines changed

cmd/boulder-janitor/job.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ var (
3131
Help: "Number of deletions by table the boulder-janitor has performed.",
3232
},
3333
[]string{"table"})
34-
// workStat is a prometheus gauge vector tracking the number of rows found
34+
// workStat is a prometheus counter vector tracking the number of rows found
3535
// during a batchedJob's getWork stage and queued into the work channel sliced
3636
// by a table label.
37-
workStat = prometheus.NewGaugeVec(
38-
prometheus.GaugeOpts{
37+
workStat = prometheus.NewCounterVec(
38+
prometheus.CounterOpts{
3939
Name: "janitor_workbatch",
4040
Help: "Number of items of work by table the boulder-janitor queued for deletion.",
4141
},
@@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
113113
rows++
114114
lastID = v.ID
115115
}
116-
workStat.WithLabelValues(j.table).Set(float64(rows))
116+
workStat.WithLabelValues(j.table).Add(float64(rows))
117117
return lastID, nil
118118
}
119119

cmd/boulder-janitor/job_test.go

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ import (
1010
"github.com/jmhodges/clock"
1111
blog "github.com/letsencrypt/boulder/log"
1212
"github.com/letsencrypt/boulder/test"
13-
"github.com/prometheus/client_golang/prometheus"
1413
)
1514

1615
func setup() (*blog.Mock, clock.FakeClock) {
@@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
122121
}
123122

124123
// We expect the work gauge for this table has been updated
125-
workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
126-
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
124+
workCount := test.CountCounterVec("table", table, workStat)
127125
test.AssertEquals(t, workCount, len(mockIDs))
128126

129127
// Set the third item in mockIDs to have an expiry after the purge cutoff
@@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
140138
got := <-workChan
141139
test.AssertEquals(t, got, mockIDs[i].ID)
142140
}
143-
workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
144-
test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
141+
workCount = test.CountCounterVec("table", table, workStat)
145142
test.AssertEquals(t, workCount, 2)
146143
}
147144

test/integration-test.py

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -123,32 +123,31 @@ def stat_value(line):
123123
raise Exception("stat line {0} was missing required parts".format(line))
124124
return parts[1]
125125

126-
# Wait for the janitor to report it isn't finding new work
127-
print("waiting for boulder-janitor work to complete...\n")
128-
workDone = False
129-
for i in range(10):
130-
certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
126+
# Wait for the janitor to finish its work. The easiest way to tell this
127+
# externally is to watch for the work batch counters to stabilize for
128+
# a period longer than the configured workSleep.
129+
attempts = 0
130+
while True:
131+
if attempts > 5:
132+
raise Exception("timed out waiting for janitor workbatch counts to stabilize")
133+
134+
certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
131135
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
132136
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
133-
if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
134-
print("not done after check {0}. Sleeping".format(i))
135-
time.sleep(2)
136-
continue
137137

138-
allReady = True
139-
for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
140-
if stat_value(line) != "0":
141-
allReady = False
138+
# sleep for double the configured workSleep for each job
139+
time.sleep(1)
142140

143-
if allReady is False:
144-
print("not done after check {0}. Sleeping".format(i))
145-
time.sleep(2)
146-
else:
147-
workDone = True
141+
newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
142+
newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
143+
newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
144+
145+
if (certStatusWorkBatch == newCertStatusWorkBatch
146+
and certsWorkBatch == newCertsWorkBatch
147+
and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
148148
break
149149

150-
if workDone is False:
151-
raise Exception("Timed out waiting for janitor to report all work completed\n")
150+
attempts = attempts + 1
152151

153152
# Check deletion stats are not empty/zero
154153
for i in range(10):

0 commit comments

Comments
 (0)