Commit 160ad3b

perf: data files record their hash in the file name
1 parent 9928c51 commit 160ad3b

7 files changed: +92, -20 lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ Unreleased
   third-party code is installed, and avoids measuring it. This shouldn't change
   any behavior. If you find that it does, please get in touch.
 
+- Perf: datafiles that will be combined now record their hash as part of the
+  file name. This lets us skip duplicate data more quickly, speeding the
+  combining step.
+
 
 .. start-releases
 
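The changelog entry is terse, so it may help to spell out the idea: each combinable data file now carries a short hash of its own data in its file name, which lets the combining step recognize duplicate data by name alone instead of opening and hashing every file. Below is a minimal Python sketch of that trick; the skip_duplicates helper and the file names are invented for illustration and are not coverage.py's actual combiner, though the .H...h marker format matches this commit.

import re

# Hypothetical helper illustrating the duplicate-skipping idea: the hash is
# embedded between ".H" and a trailing "h" at the end of a parallel data
# file's name, so two files with the same hash can be skipped by name alone.
HASH_IN_NAME = re.compile(r"\.H(\w{10})h$")

def skip_duplicates(filenames):
    """Yield only the first file seen for each embedded hash."""
    seen = set()
    for name in filenames:
        m = HASH_IN_NAME.search(name)
        key = m[1] if m else name   # no hash in the name: fall back to the name
        if key in seen:
            continue                # duplicate data, skip without reading it
        seen.add(key)
        yield name

# The second file repeats the first file's hash, so only two names survive.
names = [
    ".coverage.host_a.pid101.Xabc123x.Hq7Zz01aaQQh",
    ".coverage.host_b.pid202.Xdef456x.Hq7Zz01aaQQh",
    ".coverage.host_a.pid103.Xghi789x.Hp0Pp22bbRRh",
]
print(list(skip_duplicates(names)))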
coverage/data.py

Lines changed: 11 additions & 6 deletions
@@ -23,6 +23,7 @@
 from coverage.files import PathAliases
 from coverage.misc import Hasher, file_be_gone, human_sorted, plural
 from coverage.sqldata import CoverageData as CoverageData  # pylint: disable=useless-import-alias
+from coverage.sqldata import filename_match
 
 
 def line_counts(data: CoverageData, fullpath: bool = False) -> dict[str, int]:
@@ -95,19 +96,23 @@ def combinable_files(data_file: str, data_paths: Iterable[str] | None = None) ->
     return sorted(files_to_combine)
 
 
-def hash_for_data_file(f: str) -> bytes:
+def hash_for_data_file(dbfilename: str) -> str:
     """Get the hash of the data in the file."""
-    with open(f, "rb") as fobj:
-        hasher = hashlib.new("sha3_256", usedforsecurity=False)
-        hasher.update(fobj.read())
-        return hasher.digest()
+    m = filename_match(dbfilename)
+    if m and m["hash"]:
+        return m["hash"]
+    else:
+        with open(dbfilename, "rb") as fobj:
+            hasher = hashlib.new("sha3_256", usedforsecurity=False)
+            hasher.update(fobj.read())
+            return hasher.hexdigest()
 
 
 class DataFileClassifier:
     """Track what files to combine and which to skip."""
 
     def __init__(self) -> None:
-        self.file_hashes: set[bytes] = set()
+        self.file_hashes: set[str] = set()
 
     def classify(self, f: str) -> Literal["combine", "skip"]:
         """Determine whether to combine or skip this file."""

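To make the new fallback in hash_for_data_file easier to follow outside the diff: the function now prefers a hash already recorded in the file name and only hashes the file's bytes when the name carries none. Here is a self-contained sketch of that two-step lookup, using a simplified stand-in for filename_match (the real pattern is defined in coverage/sqldata.py below); it is an illustration, not the library's code.

import hashlib
import re

# Simplified stand-in for coverage.sqldata.filename_match: pull the embedded
# hash, if any, out of a parallel data file's name.
_SUFFIX = re.compile(r"\.H(?P<hash>\w{10}h)$")

def hash_for_data_file(dbfilename: str) -> str:
    """Prefer the hash recorded in the file name; otherwise hash the contents."""
    m = _SUFFIX.search(dbfilename)
    if m:
        return m["hash"]          # cheap path: no file I/O needed
    with open(dbfilename, "rb") as fobj:
        hasher = hashlib.new("sha3_256", usedforsecurity=False)
        hasher.update(fobj.read())
    return hasher.hexdigest()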
coverage/sqldata.py

Lines changed: 65 additions & 6 deletions
@@ -5,13 +5,15 @@
 
 from __future__ import annotations
 
+import base64
 import collections
 import datetime
 import functools
 import glob
 import itertools
 import os
 import random
+import re
 import socket
 import sqlite3
 import string
@@ -25,7 +27,7 @@
 
 from coverage.debug import NoDebugging, auto_repr, file_summary
 from coverage.exceptions import CoverageException, DataError
-from coverage.misc import file_be_gone, isolate_module
+from coverage.misc import Hasher, file_be_gone, isolate_module
 from coverage.numbits import numbits_to_nums, numbits_union, nums_to_numbits
 from coverage.sqlitedb import SqliteDb
 from coverage.types import AnyCallable, FilePath, TArc, TDebugCtl, TLineNo, TWarnFn
@@ -63,6 +65,7 @@
 -- 'sys_argv' text -- The coverage command line that recorded the data.
 -- 'version' text -- The version of coverage.py that made the file.
 -- 'when' text -- Datetime when the file was created.
+-- 'hash' text -- Hash of the data.
 );
 
 CREATE TABLE file (
@@ -250,6 +253,7 @@ def __init__(
         self._no_disk = no_disk
         self._basename = os.path.abspath(basename or ".coverage")
         self._suffix = suffix
+        self._our_suffix = suffix is True
         self._warn = warn
         self._debug = debug or NoDebugging()
 
@@ -262,6 +266,9 @@ def __init__(
         # Synchronize the operations used during collection.
         self._lock = threading.RLock()
 
+        self._wrote_hash = False
+        self._hasher = Hasher()
+
         # Are we in sync with the data file?
         self._have_used = False
 
@@ -355,10 +362,13 @@ def _init_db(self, db: SqliteDb) -> None:
 
         # When writing metadata, avoid information that will needlessly change
         # the hash of the data file, unless we're debugging processes.
+        # If we control the suffix, then the hash is in the file name, and we
+        # can write any metadata without affecting the hash determination
+        # later.
         meta_data = [
             ("version", __version__),
         ]
-        if self._debug.should("process"):
+        if self._our_suffix or self._debug.should("process"):
             meta_data.extend(
                 [
                     ("sys_argv", str(getattr(sys, "argv", None))),
@@ -472,6 +482,7 @@ def set_context(self, context: str | None) -> None:
             self._debug.write(f"Setting coverage context: {context!r}")
         self._current_context = context
         self._current_context_id = None
+        self._hasher.update(context)
 
     def _set_context_id(self) -> None:
         """Use the _current_context to set _current_context_id."""
@@ -529,7 +540,9 @@ def add_lines(self, line_data: Mapping[str, Collection[TLineNo]]) -> None:
         with self._connect() as con:
             self._set_context_id()
             for filename, linenos in line_data.items():
+                self._hasher.update(filename)
                 line_bits = nums_to_numbits(linenos)
+                self._hasher.update(line_bits)
                 file_id = self._file_id(filename, add=True)
                 query = "SELECT numbits FROM line_bits WHERE file_id = ? AND context_id = ?"
                 with con.execute(query, (file_id, self._current_context_id)) as cur:
@@ -573,6 +586,8 @@ def add_arcs(self, arc_data: Mapping[str, Collection[TArc]]) -> None:
         with self._connect() as con:
             self._set_context_id()
             for filename, arcs in arc_data.items():
+                self._hasher.update(filename)
+                self._hasher.update(arcs)
                 if not arcs:
                     continue
                 file_id = self._file_id(filename, add=True)
@@ -620,6 +635,8 @@ def add_file_tracers(self, file_tracers: Mapping[str, str]) -> None:
         self._start_using()
         with self._connect() as con:
             for filename, plugin_name in file_tracers.items():
+                self._hasher.update(filename)
+                self._hasher.update(plugin_name)
                 file_id = self._file_id(filename, add=True)
                 existing_plugin = self.file_tracer(filename)
                 if existing_plugin:
@@ -897,7 +914,22 @@ def read(self) -> None:
 
     def write(self) -> None:
         """Ensure the data is written to the data file."""
-        self._debug_dataio("Writing (no-op) data file", self._filename)
+        if self._our_suffix and not self._wrote_hash:
+            self._debug_dataio("Finishing data file", self._filename)
+            with self._connect() as con:
+                con.execute_void(
+                    "INSERT OR IGNORE INTO meta (key, value) VALUES ('hash', ?)",
+                    (self._hasher.hexdigest(),),
+                )
+            self.close()
+            data_hash = base64.b64encode(self._hasher.digest(), altchars=b"01").decode()[:NHASH]
+            current_filename = self._filename
+            self._filename += f".H{data_hash}h"
+            self._debug_dataio("Renaming data file to", self._filename)
+            os.rename(current_filename, self._filename)
+            self._wrote_hash = True
+        else:
+            self._debug_dataio("Writing (no-op) data file", self._filename)
 
     def _start_using(self) -> None:
         """Call this before using the database at all."""
@@ -1129,6 +1161,11 @@ def sys_info(cls) -> list[tuple[str, Any]]:
         ]
 
 
+ASCII = string.ascii_letters + string.digits
+NRAND = 6
+NHASH = 10
+
+
 def filename_suffix(suffix: str | bool | None) -> str | None:
     """Compute a filename suffix for a data file.
 
@@ -1145,9 +1182,31 @@ def filename_suffix(suffix: str | bool | None) -> str | None:
        # `save()` at the last minute so that the pid will be correct even
        # if the process forks.
        die = random.Random(os.urandom(8))
-        letters = string.ascii_uppercase + string.ascii_lowercase
-        rolls = "".join(die.choice(letters) for _ in range(6))
-        suffix = f"{socket.gethostname()}.{os.getpid()}.X{rolls}x"
+        rolls = "".join(die.choice(ASCII) for _ in range(NRAND))
+        host = socket.gethostname().replace(".", "_")
+        suffix = f"{host}.pid{os.getpid()}.X{rolls}x"
     elif suffix is False:
         suffix = None
     return suffix
+
+
+# A regex to match parallel file name suffixes, with named groups.
+# We combine this with other regexes, so can't use verbose syntax.
+SUFFIX_PATTERN = (
+    r"\.(?P<host>[^.]+)"
+    + r"\.pid(?P<pid>\d+)"
+    + rf"\.X(?P<random>\w{{{NRAND}}})x"
+    + rf"(\.H(?P<hash>\w{{{NHASH}}}h))?"
+)
+
+
+def filename_match(filename: str) -> re.Match[str] | None:
+    """Return a match object to pick apart the filename."""
+    return re.search(f"{SUFFIX_PATTERN}$", filename)
+
+
+def good_filename_match(filename: str) -> re.Match[str]:
+    """Match the filename where we know it will match."""
+    m = filename_match(filename)
+    assert m is not None
+    return m
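Taken together, these changes mean a parallel data file is named .coverage.<host>.pid<pid>.X<six random chars>x while collection runs, and write() renames it to append .H<ten hash chars>h once the data hash is known. The snippet below checks a pattern equivalent to SUFFIX_PATTERN against invented names (host, pid, random part, and hash are all made up) to show which named groups come back.

import re

# A copy of the suffix pattern above with NRAND=6 and NHASH=10 filled in;
# the names being matched are invented for illustration.
SUFFIX_PATTERN = (
    r"\.(?P<host>[^.]+)"
    + r"\.pid(?P<pid>\d+)"
    + r"\.X(?P<random>\w{6})x"
    + r"(\.H(?P<hash>\w{10}h))?"
)

name_during_run = ".coverage.build_host_7.pid4242.XaB3k9Zx"
name_after_write = name_during_run + ".HQz18wLm0r4h"

for name in (name_during_run, name_after_write):
    m = re.search(f"{SUFFIX_PATTERN}$", name)
    assert m is not None
    # Before write() renames the file, the hash group is None; afterwards it
    # holds the hash segment, so combining can dedupe without reading the file.
    print(m["host"], m["pid"], m["random"], m["hash"])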

doc/dbschema.rst

Lines changed: 2 additions & 1 deletion
@@ -71,6 +71,7 @@ This is the database schema:
 -- 'sys_argv' text -- The coverage command line that recorded the data.
 -- 'version' text -- The version of coverage.py that made the file.
 -- 'when' text -- Datetime when the file was created.
+-- 'hash' text -- Hash of the data.
 );
 
 CREATE TABLE file (
@@ -116,7 +117,7 @@ This is the database schema:
     foreign key (file_id) references file (id)
 );
 
-.. [[[end]]] (sum: agTRSwfwj4)
+.. [[[end]]] (sum: 7dE2ATKbel)
 
 
 .. _numbits:
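Because write() also stores the hash as a row in the meta table (when coverage controls the suffix), it can be read back from a finished data file with plain sqlite3. A minimal sketch; the file name below is invented, and on a real checkout you would point it at one of your own .coverage.* files.

import sqlite3

# Path to one finished parallel data file -- an invented example name.
data_file = ".coverage.myhost.pid1234.Xabc123x.H0123456789h"

with sqlite3.connect(data_file) as con:
    row = con.execute("SELECT value FROM meta WHERE key = 'hash'").fetchone()
print(row[0] if row else "no hash recorded")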

tests/test_concurrency.py

Lines changed: 2 additions & 1 deletion
@@ -26,6 +26,7 @@
 from coverage.exceptions import ConfigError
 from coverage.files import abs_file
 from coverage.misc import import_local_file
+from coverage.sqldata import SUFFIX_PATTERN
 
 from tests import testenv
 from tests.coveragetest import CoverageTest
@@ -505,7 +506,7 @@ def try_multiprocessing_code(
        assert len(out_lines) == nprocs + 1
        assert all(
            re.fullmatch(
-                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.X\w{6}x",
+                rf"(Combined data file|Skipping duplicate data) \.coverage{SUFFIX_PATTERN}",
                line,
            )
            for line in out_lines
tests/test_data.py

Lines changed: 5 additions & 4 deletions
@@ -991,6 +991,7 @@ def test_combining_with_crazy_filename(self, dpart: str, fpart: str) -> None:
         self.assert_file_count(glob.escape(basename) + ".*", 0)
 
     def test_meta_data(self) -> None:
+        # TODO: do we care about this?
         # The metadata written to the data file shouldn't interfere with
         # hashing to remove duplicates, except for debug=process, which
         # writes debugging info as metadata.
@@ -999,16 +1000,16 @@ def test_meta_data(self) -> None:
         covdata1.add_lines(LINES_1)
         covdata1.write()
         with sqlite3.connect("meta.1") as con:
-            data = sorted(k for (k,) in con.execute("select key from meta"))
-            assert data == ["has_arcs", "version"]
+            data = {k for (k,) in con.execute("select key from meta")}
+            assert {"has_arcs", "version"} <= data
 
         debug = DebugControlString(options=["process"])
         covdata2 = CoverageData(basename="meta.2", debug=debug)
         covdata2.add_lines(LINES_1)
         covdata2.write()
         with sqlite3.connect("meta.2") as con:
-            data = sorted(k for (k,) in con.execute("select key from meta"))
-            assert data == ["has_arcs", "sys_argv", "version", "when"]
+            data = {k for (k,) in con.execute("select key from meta")}
+            assert {"has_arcs", "sys_argv", "version", "when"} <= data
 
     def make_data_files(self, spec: str, arcs: bool) -> list[CoverageData]:
         """Make a number data files.

tests/test_process.py

Lines changed: 3 additions & 2 deletions
@@ -26,6 +26,7 @@
 from coverage import env
 from coverage.data import line_counts
 from coverage.files import abs_file, python_reported_file
+from coverage.sqldata import good_filename_match
 
 from tests import testenv
 from tests.coveragetest import CoverageTest, TESTS_DIR
@@ -441,9 +442,9 @@ def test_fork(self) -> None:
         # end of the file name.
         self.assert_file_count(".coverage.*", 2)
         data_files = glob.glob(".coverage.*")
-        filepids = {int(name.split(".")[-2]) for name in data_files}
+        filepids = {int(good_filename_match(name)["pid"]) for name in data_files}
         assert filepids == set(pids.values())
-        suffixes = {name.split(".")[-1] for name in data_files}
+        suffixes = {good_filename_match(name)["random"] for name in data_files}
         assert len(suffixes) == 2, f"Same random suffix: {data_files}"
 
         # Each data file should have a subset of the lines.
