Skip to content

Commit bb005e1

Browse files
author
Daniel McCarney
authored
integration: add test for boulder-janitor. (letsencrypt#4364)
1 parent 98677b8 commit bb005e1

File tree

8 files changed

+120
-10
lines changed

8 files changed

+120
-10
lines changed

cmd/boulder-janitor/certs.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ func newCertificatesJob(
2424
db: db,
2525
log: log,
2626
purgeBefore: purgeBefore,
27+
workSleep: config.Janitor.Certificates.WorkSleep.Duration,
2728
batchSize: config.Janitor.Certificates.BatchSize,
2829
maxDPS: config.Janitor.Certificates.MaxDPS,
2930
parallelism: config.Janitor.Certificates.Parallelism,

cmd/boulder-janitor/certsPerName.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ func newCertificatesPerNameJob(
2424
db: db,
2525
log: log,
2626
purgeBefore: purgeBefore,
27+
workSleep: config.Janitor.CertificatesPerName.WorkSleep.Duration,
2728
batchSize: config.Janitor.CertificatesPerName.BatchSize,
2829
maxDPS: config.Janitor.CertificatesPerName.MaxDPS,
2930
parallelism: config.Janitor.CertificatesPerName.Parallelism,

cmd/boulder-janitor/certstatus.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ func newCertificateStatusJob(
2424
db: db,
2525
log: log,
2626
purgeBefore: purgeBefore,
27+
workSleep: config.Janitor.CertificateStatus.WorkSleep.Duration,
2728
batchSize: config.Janitor.CertificateStatus.BatchSize,
2829
maxDPS: config.Janitor.CertificateStatus.MaxDPS,
2930
parallelism: config.Janitor.CertificateStatus.Parallelism,

cmd/boulder-janitor/config.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@ type CleanupConfig struct {
1313
Enabled bool
1414
// GracePeriod controls when a resource is old enough to be cleaned up.
1515
GracePeriod cmd.ConfigDuration
16+
// WorkSleep controls how long the janitor's work threads sleep between
17+
// finding no work and trying again. Defaults to a minute if not provided.
18+
WorkSleep cmd.ConfigDuration
1619
// BatchSize controls how many rows of the resource will be read from the DB
1720
// per-query.
1821
BatchSize int64

cmd/boulder-janitor/janitor.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,9 @@ func New(clk clock.Clock, config Config) (*janitor, error) {
4747

4848
// Setup logging and stats
4949
scope, logger := cmd.StatsAndLogging(config.Janitor.Syslog, config.Janitor.DebugAddr)
50+
scope.MustRegister(errStat)
5051
scope.MustRegister(deletedStat)
52+
scope.MustRegister(workStat)
5153
defer logger.AuditPanic()
5254
logger.Info(cmd.VersionString())
5355

cmd/boulder-janitor/job.go

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ type batchedDBJob struct {
5252
// purgeBefore indicates the cut-off for the resource being cleaned up by
5353
// the job. Rows that are older than now - purgeBefore are deleted.
5454
purgeBefore time.Time
55+
// workSleep is a duration that the job will sleep between getWork() calls
56+
// when no new work is found. If not provided, defaults to a minute.
57+
workSleep time.Duration
5558
// batchSize indicates how many database rows of work should be returned per query.
5659
batchSize int64
5760
// maxDPS optionally indicates a maximum rate of deletes to run per second.
@@ -171,7 +174,11 @@ func (j batchedDBJob) RunForever() {
171174
j.log.Debugf(
172175
"made no new progress on table %q. Sleeping for a minute",
173176
j.table)
174-
time.Sleep(time.Minute)
177+
if j.workSleep.Seconds() == 0 {
178+
time.Sleep(time.Minute)
179+
} else {
180+
time.Sleep(j.workSleep)
181+
}
175182
}
176183
id = lastID
177184
}

test/config-next/janitor.json

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,34 @@
11
{
22
"janitor": {
33
"syslog": {
4-
"stdoutLevel": 7
4+
"stdoutLevel": 6
55
},
66
"dbConnectFile": "test/secrets/janitor_dburl",
77
"maxDBConns": 10,
88
"debugAddr": ":8014",
99
"certificates": {
1010
"enabled": true,
1111
"gracePeriod": "1h",
12-
"batchSize": 10,
12+
"batchSize": 100,
13+
"workSleep": "500ms",
1314
"parallelism": 2,
14-
"maxDPS": 1
15+
"maxDPS": 50
1516
},
1617
"certificateStatus": {
1718
"enabled": true,
1819
"gracePeriod": "1h",
19-
"batchSize": 10,
20+
"batchSize": 100,
21+
"workSleep": "500ms",
2022
"parallelism": 2,
21-
"maxDPS": 1
23+
"maxDPS": 50
2224
},
2325
"certificatesPerName": {
2426
"enabled": true,
2527
"gracePeriod": "1h",
26-
"batchSize": 10,
28+
"batchSize": 100,
29+
"workSleep": "500ms",
2730
"parallelism": 2,
28-
"maxDPS": 1
31+
"maxDPS": 50
2932
}
3033
}
3134
}

test/integration-test.py

Lines changed: 94 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,6 @@
3030

3131
from acme import challenges
3232

33-
import requests
34-
3533
def run_client_tests():
3634
root = os.environ.get("CERTBOT_PATH")
3735
assert root is not None, (
@@ -83,6 +81,94 @@ def expect(target_time, num, table):
8381
expect(now, 0, "authz")
8482
expect(after_grace_period, 1, "authz")
8583

84+
def run_janitor():
85+
# Set the fake clock to a year in the future such that all of the database
86+
# rows created during the integration tests are older than the grace period.
87+
now = datetime.datetime.utcnow()
88+
target_time = now+datetime.timedelta(days=+365)
89+
90+
e = os.environ.copy()
91+
e.setdefault("GORACE", "halt_on_error=1")
92+
e.setdefault("FAKECLOCK", fakeclock(target_time))
93+
94+
# Note: Must use exec here so that killing this process kills the command.
95+
cmdline = "exec ./bin/boulder-janitor --config test/config-next/janitor.json"
96+
p = subprocess.Popen(cmdline, shell=True, env=e)
97+
98+
# Wait for the janitor to come up
99+
waitport(8014, "boulder-janitor", None)
100+
101+
def statline(statname, table):
102+
# NOTE: we omit the trailing "}}" to make this match general enough to
103+
# permit new labels in the future.
104+
return "janitor_{0}{{table=\"{1}\"".format(statname, table)
105+
106+
def get_stat_line(port, stat):
107+
url = "http://localhost:%d/metrics" % port
108+
response = requests.get(url)
109+
for l in response.content.split("\n"):
110+
if l.strip().startswith(stat):
111+
return l
112+
return None
113+
114+
def stat_value(line):
115+
parts = line.split(" ")
116+
if len(parts) != 2:
117+
raise Exception("stat line {0} was missing required parts".format(line))
118+
return parts[1]
119+
120+
# Wait for the janitor to report it isn't finding new work
121+
print("waiting for boulder-janitor work to complete...\n")
122+
workDone = False
123+
for i in range(10):
124+
certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
125+
certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
126+
certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
127+
128+
allReady = True
129+
for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
130+
if stat_value(line) != "0":
131+
allReady = False
132+
133+
if allReady is False:
134+
print("not done after check {0}. Sleeping".format(i))
135+
time.sleep(2)
136+
else:
137+
workDone = True
138+
break
139+
140+
if workDone is False:
141+
raise Exception("Timed out waiting for janitor to report all work completed\n")
142+
143+
# Check deletion stats are not empty/zero
144+
for i in range(10):
145+
certStatusDeletes = get_stat_line(8014, statline("deletions", "certificateStatus"))
146+
certsDeletes = get_stat_line(8014, statline("deletions", "certificates"))
147+
certsPerNameDeletes = get_stat_line(8014, statline("deletions", "certificatesPerName"))
148+
149+
if certStatusDeletes is None or certsDeletes is None or certsPerNameDeletes is None:
150+
print("delete stats not present after check {0}. Sleeping".format(i))
151+
time.sleep(2)
152+
continue
153+
154+
for l in [certStatusDeletes, certsDeletes, certsPerNameDeletes]:
155+
if stat_value(l) == "0":
156+
raise Exception("Expected a non-zero number of deletes to be performed. Found {0}".format(l))
157+
158+
# Check that all error stats are empty
159+
errorStats = [
160+
statline("errors", "certificateStatus"),
161+
statline("errors", "certificates"),
162+
statline("errors", "certificatesPerName"),
163+
]
164+
for eStat in errorStats:
165+
actual = get_stat_line(8014, eStat)
166+
if actual is not None:
167+
raise Exception("Expected to find no error stat lines but found {0}\n".format(eStat))
168+
169+
# Terminate the janitor
170+
p.terminate()
171+
86172
def test_single_ocsp():
87173
"""Run the single-ocsp command, which is used to generate OCSP responses for
88174
intermediate certificates on a manual basis. Then start up an
@@ -187,6 +273,12 @@ def main():
187273
check_balance()
188274
if not CONFIG_NEXT:
189275
run_expired_authz_purger()
276+
277+
# Run the boulder-janitor. This should happen after all other tests because
278+
# it runs with the fake clock set to the future and deletes rows that may
279+
# otherwise be referenced by tests.
280+
run_janitor()
281+
190282
# Run the load-generator last. run_loadtest will stop the
191283
# pebble-challtestsrv before running the load-generator and will not restart
192284
# it.

0 commit comments

Comments
 (0)