boulder-janitor: switch workbatch gauge to counter. (letsencrypt#4477)

Daniel McCarney · jsha · commit d35c20db75a0 · 2019-10-11T14:40:59.000-07:00
A gauge wasn't the appropriate stat type choice for this usage.

Switching the stat to be a counter instead of a gauge means we can't
detect when the janitor is finished its work in the integration test by
watching for this stat to drop to zero for all the table labels we're
concerned with. Instead the test is updated to watch for the counter
value to stabilize for a period longer than the workbatch sleep.
diff --git a/cmd/boulder-janitor/job.go b/cmd/boulder-janitor/job.go
@@ -31,11 +31,11 @@ var (
 			Help: "Number of deletions by table the boulder-janitor has performed.",
 		},
 		[]string{"table"})
-	// workStat is a prometheus gauge vector tracking the number of rows found
+	// workStat is a prometheus counter vector tracking the number of rows found
 	// during a batchedJob's getWork stage and queued into the work channel sliced
 	// by a table label.
-	workStat = prometheus.NewGaugeVec(
-		prometheus.GaugeOpts{
+	workStat = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
 			Name: "janitor_workbatch",
 			Help: "Number of items of work by table the boulder-janitor queued for deletion.",
 		},
@@ -113,7 +113,7 @@ func (j batchedDBJob) getWork(work chan<- int64, startID int64) (int64, error) {
 		rows++
 		lastID = v.ID
 	}
-	workStat.WithLabelValues(j.table).Set(float64(rows))
+	workStat.WithLabelValues(j.table).Add(float64(rows))
 	return lastID, nil
 }
 
diff --git a/cmd/boulder-janitor/job_test.go b/cmd/boulder-janitor/job_test.go
@@ -10,7 +10,6 @@ import (
 	"github.com/jmhodges/clock"
 	blog "github.com/letsencrypt/boulder/log"
 	"github.com/letsencrypt/boulder/test"
-	"github.com/prometheus/client_golang/prometheus"
 )
 
 func setup() (*blog.Mock, clock.FakeClock) {
@@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {
 	}
 
 	// We expect the work gauge for this table has been updated
-	workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount := test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, len(mockIDs))
 
 	// Set the third item in mockIDs to have an expiry after the purge cutoff
@@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {
 		got := <-workChan
 		test.AssertEquals(t, got, mockIDs[i].ID)
 	}
-	workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})
-	test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")
+	workCount = test.CountCounterVec("table", table, workStat)
 	test.AssertEquals(t, workCount, 2)
 }
 
diff --git a/test/integration-test.py b/test/integration-test.py
@@ -123,32 +123,31 @@ def stat_value(line):
             raise Exception("stat line {0} was missing required parts".format(line))
         return parts[1]
 
-    # Wait for the janitor to report it isn't finding new work
-    print("waiting for boulder-janitor work to complete...\n")
-    workDone = False
-    for i in range(10):
-        certStatusWorkbatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+    # Wait for the janitor to finish its work. The easiest way to tell this
+    # externally is to watch for the work batch counters to stabilize for
+    # a period longer than the configured workSleep.
+    attempts = 0
+    while True:
+        if attempts > 5:
+            raise Exception("timed out waiting for janitor workbatch counts to stabilize")
+
+        certStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
         certsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
         certsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
-        if not certStatusWorkbatch or not certsWorkBatch or not certsPerNameWorkBatch:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-            continue
 
-        allReady = True
-        for line in [certStatusWorkbatch, certsWorkBatch, certsPerNameWorkBatch]:
-            if stat_value(line) != "0":
-                allReady = False
+        # sleep for double the configured workSleep for each job
+        time.sleep(1)
 
-        if allReady is False:
-            print("not done after check {0}. Sleeping".format(i))
-            time.sleep(2)
-        else:
-            workDone = True
+        newCertStatusWorkBatch = get_stat_line(8014, statline("workbatch", "certificateStatus"))
+        newCertsWorkBatch = get_stat_line(8014, statline("workbatch", "certificates"))
+        newCertsPerNameWorkBatch = get_stat_line(8014, statline("workbatch", "certificatesPerName"))
+
+        if (certStatusWorkBatch == newCertStatusWorkBatch 
+            and certsWorkBatch == newCertsWorkBatch 
+            and certsPerNameWorkBatch == newCertsPerNameWorkBatch):
             break
 
-    if workDone is False:
-        raise Exception("Timed out waiting for janitor to report all work completed\n")
+        attempts = attempts + 1
 
     # Check deletion stats are not empty/zero
     for i in range(10):

Original file line number	Diff line number	Diff line change
`@@ -10,7 +10,6 @@ import (`
`10`	`10`	`"github.com/jmhodges/clock"`
`11`	`11`	`blog "github.com/letsencrypt/boulder/log"`
`12`	`12`	`"github.com/letsencrypt/boulder/test"`
`13`		`- "github.com/prometheus/client_golang/prometheus"`
`14`	`13`	`)`
`15`	`14`
`16`	`15`	`func setup() (*blog.Mock, clock.FakeClock) {`
`@@ -122,8 +121,7 @@ func TestGetWork(t *testing.T) {`
`122`	`121`	`}`
`123`	`122`
`124`	`123`	`// We expect the work gauge for this table has been updated`
`125`		`- workCount, err := test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})`
`126`		`- test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")`
	`124`	`+ workCount := test.CountCounterVec("table", table, workStat)`
`127`	`125`	`test.AssertEquals(t, workCount, len(mockIDs))`
`128`	`126`
`129`	`127`	`// Set the third item in mockIDs to have an expiry after the purge cutoff`
`@@ -140,8 +138,7 @@ func TestGetWork(t *testing.T) {`
`140`	`138`	`got := <-workChan`
`141`	`139`	`test.AssertEquals(t, got, mockIDs[i].ID)`
`142`	`140`	`}`
`143`		`- workCount, err = test.GaugeValueWithLabels(workStat, prometheus.Labels{"table": table})`
`144`		`- test.AssertNotError(t, err, "unexpected error from GaugeValueWithLabels")`
	`141`	`+ workCount = test.CountCounterVec("table", table, workStat)`
`145`	`142`	`test.AssertEquals(t, workCount, 2)`
`146`	`143`	`}`
`147`	`144`