integration: add test for boulder-janitor. (letsencrypt#4364)

Daniel McCarney · web-flow · commit bb005e1c79f3 · 2019-07-29T16:13:10.000-04:00
diff --git a/cmd/boulder-janitor/certs.go b/cmd/boulder-janitor/certs.go
@@ -24,6 +24,7 @@ func newCertificatesJob(
 		db:          db,
 		log:         log,
 		purgeBefore: purgeBefore,
+		workSleep:   config.Janitor.Certificates.WorkSleep.Duration,
 		batchSize:   config.Janitor.Certificates.BatchSize,
 		maxDPS:      config.Janitor.Certificates.MaxDPS,
 		parallelism: config.Janitor.Certificates.Parallelism,
diff --git a/cmd/boulder-janitor/certsPerName.go b/cmd/boulder-janitor/certsPerName.go
@@ -24,6 +24,7 @@ func newCertificatesPerNameJob(
 		db:          db,
 		log:         log,
 		purgeBefore: purgeBefore,
+		workSleep:   config.Janitor.CertificatesPerName.WorkSleep.Duration,
 		batchSize:   config.Janitor.CertificatesPerName.BatchSize,
 		maxDPS:      config.Janitor.CertificatesPerName.MaxDPS,
 		parallelism: config.Janitor.CertificatesPerName.Parallelism,
diff --git a/cmd/boulder-janitor/certstatus.go b/cmd/boulder-janitor/certstatus.go
@@ -24,6 +24,7 @@ func newCertificateStatusJob(
 		db:          db,
 		log:         log,
 		purgeBefore: purgeBefore,
+		workSleep:   config.Janitor.CertificateStatus.WorkSleep.Duration,
 		batchSize:   config.Janitor.CertificateStatus.BatchSize,
 		maxDPS:      config.Janitor.CertificateStatus.MaxDPS,
 		parallelism: config.Janitor.CertificateStatus.Parallelism,
diff --git a/cmd/boulder-janitor/config.go b/cmd/boulder-janitor/config.go
@@ -13,6 +13,9 @@ type CleanupConfig struct {
 	Enabled bool
 	// GracePeriod controls when a resource is old enough to be cleaned up.
 	GracePeriod cmd.ConfigDuration
+	// WorkSleep controls how long the janitor's work threads sleep between
+	// finding no work and trying again. Defaults to a minute if not provided.
+	WorkSleep cmd.ConfigDuration
 	// BatchSize controls how many rows of the resource will be read from the DB
 	// per-query.
 	BatchSize int64
diff --git a/cmd/boulder-janitor/janitor.go b/cmd/boulder-janitor/janitor.go
@@ -47,7 +47,9 @@ func New(clk clock.Clock, config Config) (*janitor, error) {
 
 	// Setup logging and stats
 	scope, logger := cmd.StatsAndLogging(config.Janitor.Syslog, config.Janitor.DebugAddr)
+	scope.MustRegister(errStat)
 	scope.MustRegister(deletedStat)
+	scope.MustRegister(workStat)
 	defer logger.AuditPanic()
 	logger.Info(cmd.VersionString())
 
diff --git a/cmd/boulder-janitor/job.go b/cmd/boulder-janitor/job.go
@@ -52,6 +52,9 @@ type batchedDBJob struct {
 	// purgeBefore indicates the cut-off for the the resoruce being cleaned up by
 	// the job. Rows that older than now - purgeBefore are deleted.
 	purgeBefore time.Time
+	// workSleep is a duration that the job will sleep between getWork() calls
+	// when no new work is found. If not provided, defaults to a minute.
+	workSleep time.Duration
 	// batchSize indicates how many database rows of work should be returned per query.
 	batchSize int64
 	// maxDPS optionally indicates a maximum rate of deletes to run per second.
@@ -171,7 +174,11 @@ func (j batchedDBJob) RunForever() {
 				j.log.Debugf(
 					"made no new progress on table %q. Sleeping for a minute",
 					j.table)
-				time.Sleep(time.Minute)
+				if j.workSleep.Seconds() == 0 {
+					time.Sleep(time.Minute)
+				} else {
+					time.Sleep(j.workSleep)
+				}
 			}
 			id = lastID
 		}
diff --git a/test/config-next/janitor.json b/test/config-next/janitor.json
@@ -1,31 +1,34 @@
 {
   "janitor": {
     "syslog": {
-      "stdoutLevel": 7
+      "stdoutLevel": 6
     },
     "dbConnectFile": "test/secrets/janitor_dburl",
     "maxDBConns": 10,
     "debugAddr": ":8014",
     "certificates": {
         "enabled": true,
         "gracePeriod": "1h",
-        "batchSize": 10,
+        "batchSize": 100,
+        "workSleep": "500ms",
         "parallelism": 2,
-        "maxDPS": 1
+        "maxDPS": 50
     },
     "certificateStatus": {
         "enabled": true,
         "gracePeriod": "1h",
-        "batchSize": 10,
+        "batchSize": 100,
+        "workSleep": "500ms",
         "parallelism": 2,
-        "maxDPS": 1
+        "maxDPS": 50
     },
     "certificatesPerName": {
         "enabled": true,
         "gracePeriod": "1h",
-        "batchSize": 10,
+        "batchSize": 100,
+        "workSleep": "500ms",
         "parallelism": 2,
-        "maxDPS": 1
+        "maxDPS": 50
     }
   }
 }
diff --git a/test/integration-test.py b/test/integration-test.py
@@ -30,8 +30,6 @@
 
 from acme import challenges
 
-import requests
-
 def run_client_tests():
     root = os.environ.get("CERTBOT_PATH")
     assert root is not None, (
@@ -83,6 +81,94 @@ def expect(target_time, num, table):
     expect(now, 0, "authz")
     expect(after_grace_period, 1, "authz")
 
+def run_janitor():
+    # Set the fake clock to a year in the future such that all of the database
+    # rows created during the integration tests are older than the grace period.
+    now = datetime.datetime.utcnow()
+    target_time = now+datetime.timedelta(days=+365)
+
+    e = os.environ.copy()
+    e.setdefault("GORACE", "halt_on_error=1")
+    e.setdefault("FAKECLOCK", fakeclock(target_time))
+
+    # Note: Must use exec here so that killing this process kills the command.
+    cmdline = "exec ./bin/boulder-janitor --config test/config-next/janitor.json"
+    p = subprocess.Popen(cmdline, shell=True, env=e)
+
+    # Wait for the janitor to come up
+    waitport(8014, "boulder-janitor", None)
+
+    def statline(statname, table):
+        # NOTE: we omit the trailing "}}" to make this match general enough to
+        # permit new labels in the future.
+        return "janitor_{0}{{table=\"{1}\"".format(statname, table)
+
+    def get_stat_line(port, stat):
+        url = "http://localhost:%d/metrics" % port
+        response = requests.get(url)
+        for l in response.content.split("\n"):
+            if l.strip().startswith(stat):
+                return l
+        return None
+
+    def stat_value(line):
+        parts = line.split(" ")
+        if len(parts) != 2:
+            raise Exception("stat line {0} was missing required parts".format(line))
+        return parts[1]
+
+    # Wait for the janitor to report it isn't finding new work
+    print("waiting for boulder-janitor work to complete...\n")
+    workDone = False
+    for i in range(10):
+        certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+        certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
+        certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
+
+        allReady = True
+        for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
+            if stat_value(line) != "0":
+                allReady = False
+
+        if allReady is False:
+            print("not done after check {0}. Sleeping".format(i))
+            time.sleep(2)
+        else:
+            workDone = True
+            break
+
+    if workDone is False:
+        raise Exception("Timed out waiting for janitor to report all work completed\n")
+
+    # Check deletion stats are not empty/zero
+    for i in range(10):
+        certStatusDeletes = get_stat_line(8014, statline("deletions", "certificateStatus"))
+        certsDeletes = get_stat_line(8014, statline("deletions", "certificates"))
+        certsPerNameDeletes = get_stat_line(8014, statline("deletions", "certificatesPerName"))
+
+        if certStatusDeletes is None or certsDeletes is None or certsPerNameDeletes is None:
+            print("delete stats not present after check {0}. Sleeping".format(i))
+            time.sleep(2)
+            continue
+
+        for l in [certStatusDeletes, certsDeletes, certsPerNameDeletes]:
+            if stat_value(l) == "0":
+                raise Exception("Expected a non-zero number of deletes to be performed. Found {0}".format(l))
+
+    # Check that all error stats are empty
+    errorStats = [
+      statline("errors", "certificateStatus"),
+      statline("errors", "certificates"),
+      statline("errors", "certificatesPerName"),
+    ]
+    for eStat in errorStats:
+        actual = get_stat_line(8014, eStat)
+        if actual is not None:
+            raise Exception("Expected to find no error stat lines but found {0}\n".format(eStat))
+
+    # Terminate the janitor
+    p.terminate()
+
 def test_single_ocsp():
     """Run the single-ocsp command, which is used to generate OCSP responses for
        intermediate certificates on a manual basis. Then start up an
@@ -187,6 +273,12 @@ def main():
         check_balance()
         if not CONFIG_NEXT:
             run_expired_authz_purger()
+
+        # Run the boulder-janitor. This should happen after all other tests because
+        # it runs with the fake clock set to the future and deletes rows that may
+        # otherwise be referenced by tests.
+        run_janitor()
+
         # Run the load-generator last. run_loadtest will stop the
         # pebble-challtestsrv before running the load-generator and will not restart
         # it.