Commit 160ad3b

perf: data files record their hash in the file name
1 parent 9928c51 commit 160ad3b

7 files changed: +92, -20 lines changed

CHANGES.rst

Lines changed: 4 additions & 0 deletions
@@ -27,6 +27,10 @@ Unreleased
   third-party code is installed, and avoids measuring it. This shouldn't change
   any behavior. If you find that it does, please get in touch.
 
+- Perf: datafiles that will be combined now record their hash as part of the
+  file name. This lets us skip duplicate data more quickly, speeding the
+  combining step.
+
 
 .. start-releases
 
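The changelog entry is terse, so it may help to spell out the idea: each combinable data file now carries a short hash of its own data in its file name, which lets the combining step recognize duplicate data by name alone instead of opening and hashing every file. Below is a minimal Python sketch of that trick; the skip_duplicates helper and the file names are invented for illustration and are not coverage.py's actual combiner, though the .H...h marker format matches this commit.

import re

# Hypothetical helper illustrating the duplicate-skipping idea: the hash is
# embedded between ".H" and a trailing "h" at the end of a parallel data
# file's name, so two files with the same hash can be skipped by name alone.
HASH_IN_NAME = re.compile(r"\.H(\w{10})h$")

def skip_duplicates(filenames):
    """Yield only the first file seen for each embedded hash."""
    seen = set()
    for name in filenames:
        m = HASH_IN_NAME.search(name)
        key = m[1] if m else name   # no hash in the name: fall back to the name
        if key in seen:
            continue                # duplicate data, skip without reading it
        seen.add(key)
        yield name

# The second file repeats the first file's hash, so only two names survive.
names = [
    ".coverage.host_a.pid101.Xabc123x.Hq7Zz01aaQQh",
    ".coverage.host_b.pid202.Xdef456x.Hq7Zz01aaQQh",
    ".coverage.host_a.pid103.Xghi789x.Hp0Pp22bbRRh",
]
print(list(skip_duplicates(names)))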
coverage/data.py

Lines changed: 11 additions & 6 deletions
@@ -23,6 +23,7 @@
 from coverage.files import PathAliases
 from coverage.misc import Hasher, file_be_gone, human_sorted, plural
 from coverage.sqldata import CoverageData as CoverageData  # pylint: disable=useless-import-alias
+from coverage.sqldata import filename_match
 
 
 def line_counts(data: CoverageData, fullpath: bool = False) -> dict[str, int]:
@@ -95,19 +96,23 @@ def combinable_files(data_file: str, data_paths: Iterable[str] | None = None) ->
     return sorted(files_to_combine)
 
 
-def hash_for_data_file(f: str) -> bytes:
+def hash_for_data_file(dbfilename: str) -> str:
     """Get the hash of the data in the file."""
-    with open(f, "rb") as fobj:
-        hasher = hashlib.new("sha3_256", usedforsecurity=False)
-        hasher.update(fobj.read())
-        return hasher.digest()
+    m = filename_match(dbfilename)
+    if m and m["hash"]:
+        return m["hash"]
+    else:
+        with open(dbfilename, "rb") as fobj:
+            hasher = hashlib.new("sha3_256", usedforsecurity=False)
+            hasher.update(fobj.read())
+            return hasher.hexdigest()
 
 
 class DataFileClassifier:
     """Track what files to combine and which to skip."""
 
     def __init__(self) -> None:
-        self.file_hashes: set[bytes] = set()
+        self.file_hashes: set[str] = set()
 
     def classify(self, f: str) -> Literal["combine", "skip"]:
         """Determine whether to combine or skip this file."""

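To make the new fallback in hash_for_data_file easier to follow outside the diff: the function now prefers a hash already recorded in the file name and only hashes the file's bytes when the name carries none. Here is a self-contained sketch of that two-step lookup, using a simplified stand-in for filename_match (the real pattern is defined in coverage/sqldata.py below); it is an illustration, not the library's code.

import hashlib
import re

# Simplified stand-in for coverage.sqldata.filename_match: pull the embedded
# hash, if any, out of a parallel data file's name.
_SUFFIX = re.compile(r"\.H(?P<hash>\w{10}h)$")

def hash_for_data_file(dbfilename: str) -> str:
    """Prefer the hash recorded in the file name; otherwise hash the contents."""
    m = _SUFFIX.search(dbfilename)
    if m:
        return m["hash"]          # cheap path: no file I/O needed
    with open(dbfilename, "rb") as fobj:
        hasher = hashlib.new("sha3_256", usedforsecurity=False)
        hasher.update(fobj.read())
    return hasher.hexdigest()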
coverage/sqldata.py

Lines changed: 65 additions & 6 deletions
@@ -5,13 +5,15 @@
 
 from __future__ import annotations
 
+import base64
 import collections
 import datetime
 import functools
 import glob
 import itertools
 import os
 import random
+import re
 import socket
 import sqlite3
 import string
@@ -25,7 +27,7 @@
 
 from coverage.debug import NoDebugging, auto_repr, file_summary
 from coverage.exceptions import CoverageException, DataError
-from coverage.misc import file_be_gone, isolate_module
+from coverage.misc import Hasher, file_be_gone, isolate_module
 from coverage.numbits import numbits_to_nums, numbits_union, nums_to_numbits
 from coverage.sqlitedb import SqliteDb
 from coverage.types import AnyCallable, FilePath, TArc, TDebugCtl, TLineNo, TWarnFn
@@ -63,6 +65,7 @@
 -- 'sys_argv' text -- The coverage command line that recorded the data.
 -- 'version' text -- The version of coverage.py that made the file.
 -- 'when' text -- Datetime when the file was created.
+-- 'hash' text -- Hash of the data.
 );
 
 CREATE TABLE file (
@@ -250,6 +253,7 @@ def __init__(
         self._no_disk = no_disk
         self._basename = os.path.abspath(basename or ".coverage")
         self._suffix = suffix
+        self._our_suffix = suffix is True
         self._warn = warn
         self._debug = debug or NoDebugging()
 
@@ -262,6 +266,9 @@ def __init__(
         # Synchronize the operations used during collection.
         self._lock = threading.RLock()
 
+        self._wrote_hash = False
+        self._hasher = Hasher()
+
         # Are we in sync with the data file?
         self._have_used = False
 
@@ -355,10 +362,13 @@ def _init_db(self, db: SqliteDb) -> None:
 
         # When writing metadata, avoid information that will needlessly change
         # the hash of the data file, unless we're debugging processes.
+        # If we control the suffix, then the hash is in the file name, and we
+        # can write any metadata without affecting the hash determination
+        # later.
         meta_data = [
             ("version", __version__),
         ]
-        if self._debug.should("process"):
+        if self._our_suffix or self._debug.should("process"):
             meta_data.extend(
                 [
                     ("sys_argv", str(getattr(sys, "argv", None))),
@@ -472,6 +482,7 @@ def set_context(self, context: str | None) -> None:
             self._debug.write(f"Setting coverage context: {context!r}")
         self._current_context = context
         self._current_context_id = None
+        self._hasher.update(context)
 
     def _set_context_id(self) -> None:
         """Use the _current_context to set _current_context_id."""
@@ -529,7 +540,9 @@ def add_lines(self, line_data: Mapping[str, Collection[TLineNo]]) -> None:
         with self._connect() as con:
             self._set_context_id()
             for filename, linenos in line_data.items():
+                self._hasher.update(filename)
                 line_bits = nums_to_numbits(linenos)
+                self._hasher.update(line_bits)
                 file_id = self._file_id(filename, add=True)
                 query = "SELECT numbits FROM line_bits WHERE file_id = ? AND context_id = ?"
                 with con.execute(query, (file_id, self._current_context_id)) as cur:
@@ -573,6 +586,8 @@ def add_arcs(self, arc_data: Mapping[str, Collection[TArc]]) -> None:
         with self._connect() as con:
             self._set_context_id()
             for filename, arcs in arc_data.items():
+                self._hasher.update(filename)
+                self._hasher.update(arcs)
                 if not arcs:
                     continue
                 file_id = self._file_id(filename, add=True)
@@ -620,6 +635,8 @@ def add_file_tracers(self, file_tracers: Mapping[str, str]) -> None:
         self._start_using()
         with self._connect() as con:
             for filename, plugin_name in file_tracers.items():
+                self._hasher.update(filename)
+                self._hasher.update(plugin_name)
                 file_id = self._file_id(filename, add=True)
                 existing_plugin = self.file_tracer(filename)
                 if existing_plugin:
@@ -897,7 +914,22 @@ def read(self) -> None:
 
     def write(self) -> None:
         """Ensure the data is written to the data file."""
-        self._debug_dataio("Writing (no-op) data file", self._filename)
+        if self._our_suffix and not self._wrote_hash:
+            self._debug_dataio("Finishing data file", self._filename)
+            with self._connect() as con:
+                con.execute_void(
+                    "INSERT OR IGNORE INTO meta (key, value) VALUES ('hash', ?)",
+                    (self._hasher.hexdigest(),),
+                )
+            self.close()
+            data_hash = base64.b64encode(self._hasher.digest(), altchars=b"01").decode()[:NHASH]
+            current_filename = self._filename
+            self._filename += f".H{data_hash}h"
+            self._debug_dataio("Renaming data file to", self._filename)
+            os.rename(current_filename, self._filename)
+            self._wrote_hash = True
+        else:
+            self._debug_dataio("Writing (no-op) data file", self._filename)
 
     def _start_using(self) -> None:
         """Call this before using the database at all."""
@@ -1129,6 +1161,11 @@ def sys_info(cls) -> list[tuple[str, Any]]:
         ]
 
 
+ASCII = string.ascii_letters + string.digits
+NRAND = 6
+NHASH = 10
+
+
 def filename_suffix(suffix: str | bool | None) -> str | None:
     """Compute a filename suffix for a data file.
 
@@ -1145,9 +1182,31 @@ def filename_suffix(suffix: str | bool | None) -> str | None:
        # `save()` at the last minute so that the pid will be correct even
        # if the process forks.
        die = random.Random(os.urandom(8))
-        letters = string.ascii_uppercase + string.ascii_lowercase
-        rolls = "".join(die.choice(letters) for _ in range(6))
-        suffix = f"{socket.gethostname()}.{os.getpid()}.X{rolls}x"
+        rolls = "".join(die.choice(ASCII) for _ in range(NRAND))
+        host = socket.gethostname().replace(".", "_")
+        suffix = f"{host}.pid{os.getpid()}.X{rolls}x"
     elif suffix is False:
         suffix = None
     return suffix
+
+
+# A regex to match parallel file name suffixes, with named groups.
+# We combine this with other regexes, so can't use verbose syntax.
+SUFFIX_PATTERN = (
+    r"\.(?P<host>[^.]+)"
+    + r"\.pid(?P<pid>\d+)"
+    + rf"\.X(?P<random>\w{{{NRAND}}})x"
+    + rf"(\.H(?P<hash>\w{{{NHASH}}}h))?"
+)
+
+
+def filename_match(filename: str) -> re.Match[str] | None:
+    """Return a match object to pick apart the filename."""
+    return re.search(f"{SUFFIX_PATTERN}$", filename)
+
+
+def good_filename_match(filename: str) -> re.Match[str]:
+    """Match the filename where we know it will match."""
+    m = filename_match(filename)
+    assert m is not None
+    return m
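Taken together, these changes mean a parallel data file is named .coverage.<host>.pid<pid>.X<six random chars>x while collection runs, and write() renames it to append .H<ten hash chars>h once the data hash is known. The snippet below checks a pattern equivalent to SUFFIX_PATTERN against invented names (host, pid, random part, and hash are all made up) to show which named groups come back.

import re

# A copy of the suffix pattern above with NRAND=6 and NHASH=10 filled in;
# the names being matched are invented for illustration.
SUFFIX_PATTERN = (
    r"\.(?P<host>[^.]+)"
    + r"\.pid(?P<pid>\d+)"
    + r"\.X(?P<random>\w{6})x"
    + r"(\.H(?P<hash>\w{10}h))?"
)

name_during_run = ".coverage.build_host_7.pid4242.XaB3k9Zx"
name_after_write = name_during_run + ".HQz18wLm0r4h"

for name in (name_during_run, name_after_write):
    m = re.search(f"{SUFFIX_PATTERN}$", name)
    assert m is not None
    # Before write() renames the file, the hash group is None; afterwards it
    # holds the hash segment, so combining can dedupe without reading the file.
    print(m["host"], m["pid"], m["random"], m["hash"])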

doc/dbschema.rst

Lines changed: 2 additions & 1 deletion
@@ -71,6 +71,7 @@ This is the database schema:
 -- 'sys_argv' text -- The coverage command line that recorded the data.
 -- 'version' text -- The version of coverage.py that made the file.
 -- 'when' text -- Datetime when the file was created.
+-- 'hash' text -- Hash of the data.
 );
 
 CREATE TABLE file (
@@ -116,7 +117,7 @@ This is the database schema:
     foreign key (file_id) references file (id)
 );
 
-.. [[[end]]] (sum: agTRSwfwj4)
+.. [[[end]]] (sum: 7dE2ATKbel)
 
 
 .. _numbits:
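Because write() also stores the hash as a row in the meta table (when coverage controls the suffix), it can be read back from a finished data file with plain sqlite3. A minimal sketch; the file name below is invented, and on a real checkout you would point it at one of your own .coverage.* files.

import sqlite3

# Path to one finished parallel data file -- an invented example name.
data_file = ".coverage.myhost.pid1234.Xabc123x.H0123456789h"

with sqlite3.connect(data_file) as con:
    row = con.execute("SELECT value FROM meta WHERE key = 'hash'").fetchone()
print(row[0] if row else "no hash recorded")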

tests/test_concurrency.py

Lines changed: 2 additions & 1 deletion
@@ -26,6 +26,7 @@
 from coverage.exceptions import ConfigError
 from coverage.files import abs_file
 from coverage.misc import import_local_file
+from coverage.sqldata import SUFFIX_PATTERN
 
 from tests import testenv
 from tests.coveragetest import CoverageTest
@@ -505,7 +506,7 @@ def try_multiprocessing_code(
        assert len(out_lines) == nprocs + 1
        assert all(
            re.fullmatch(
-                r"(Combined data file|Skipping duplicate data) \.coverage\..*\.\d+\.X\w{6}x",
+                rf"(Combined data file|Skipping duplicate data) \.coverage{SUFFIX_PATTERN}",
                line,
            )
            for line in out_lines
tests/test_data.py

Lines changed: 5 additions & 4 deletions
@@ -991,6 +991,7 @@ def test_combining_with_crazy_filename(self, dpart: str, fpart: str) -> None:
         self.assert_file_count(glob.escape(basename) + ".*", 0)
 
     def test_meta_data(self) -> None:
+        # TODO: do we care about this?
         # The metadata written to the data file shouldn't interfere with
         # hashing to remove duplicates, except for debug=process, which
         # writes debugging info as metadata.
@@ -999,16 +1000,16 @@ def test_meta_data(self) -> None:
         covdata1.add_lines(LINES_1)
         covdata1.write()
         with sqlite3.connect("meta.1") as con:
-            data = sorted(k for (k,) in con.execute("select key from meta"))
-            assert data == ["has_arcs", "version"]
+            data = {k for (k,) in con.execute("select key from meta")}
+            assert {"has_arcs", "version"} <= data
 
         debug = DebugControlString(options=["process"])
         covdata2 = CoverageData(basename="meta.2", debug=debug)
         covdata2.add_lines(LINES_1)
         covdata2.write()
         with sqlite3.connect("meta.2") as con:
-            data = sorted(k for (k,) in con.execute("select key from meta"))
-            assert data == ["has_arcs", "sys_argv", "version", "when"]
+            data = {k for (k,) in con.execute("select key from meta")}
+            assert {"has_arcs", "sys_argv", "version", "when"} <= data
 
     def make_data_files(self, spec: str, arcs: bool) -> list[CoverageData]:
         """Make a number data files.

tests/test_process.py

Lines changed: 3 additions & 2 deletions
@@ -26,6 +26,7 @@
 from coverage import env
 from coverage.data import line_counts
 from coverage.files import abs_file, python_reported_file
+from coverage.sqldata import good_filename_match
 
 from tests import testenv
 from tests.coveragetest import CoverageTest, TESTS_DIR
@@ -441,9 +442,9 @@ def test_fork(self) -> None:
         # end of the file name.
         self.assert_file_count(".coverage.*", 2)
         data_files = glob.glob(".coverage.*")
-        filepids = {int(name.split(".")[-2]) for name in data_files}
+        filepids = {int(good_filename_match(name)["pid"]) for name in data_files}
         assert filepids == set(pids.values())
-        suffixes = {name.split(".")[-1] for name in data_files}
+        suffixes = {good_filename_match(name)["random"] for name in data_files}
         assert len(suffixes) == 2, f"Same random suffix: {data_files}"
 
         # Each data file should have a subset of the lines.
