
Commit 7190147

refactor: move data duplication detection, and test it more directly
1 parent b71e6c8 commit 7190147

3 files changed: +70 -12 lines changed


coverage/data.py

Lines changed: 29 additions & 10 deletions
@@ -17,7 +17,7 @@
 import hashlib
 import os.path
 from collections.abc import Iterable
-from typing import Callable
+from typing import Callable, Literal

 from coverage.exceptions import CoverageException, NoDataError
 from coverage.files import PathAliases
@@ -95,6 +95,30 @@ def combinable_files(data_file: str, data_paths: Iterable[str] | None = None) ->
     return sorted(files_to_combine)


+def hash_for_data_file(f: str) -> bytes:
+    """Get the hash of the data in the file."""
+    with open(f, "rb") as fobj:
+        hasher = hashlib.new("sha3_256", usedforsecurity=False)
+        hasher.update(fobj.read())
+    return hasher.digest()
+
+
+class DataFileClassifier:
+    """Track what files to combine and which to skip."""
+
+    def __init__(self) -> None:
+        self.file_hashes: set[bytes] = set()
+
+    def classify(self, f: str) -> Literal["combine", "skip"]:
+        """Determine whether to combine or skip this file."""
+        sha = hash_for_data_file(f)
+        if sha in self.file_hashes:
+            return "skip"
+        else:
+            self.file_hashes.add(sha)
+            return "combine"
+
+
 def combine_parallel_data(
     data: CoverageData,
     aliases: PathAliases | None = None,
@@ -140,7 +164,7 @@ def combine_parallel_data(
     else:
         map_path = functools.cache(aliases.map)

-    file_hashes = set()
+    classifier = DataFileClassifier()
    combined_any = False

     for f in files_to_combine:
@@ -156,20 +180,15 @@ def combine_parallel_data(
         except ValueError:
             # ValueError can be raised under Windows when os.getcwd() returns a
             # folder from a different drive than the drive of f, in which case
-            # we print the original value of f instead of its relative path
+            # we print the original value of f instead of its relative path.
             rel_file_name = f

-        with open(f, "rb") as fobj:
-            hasher = hashlib.new("sha3_256", usedforsecurity=False)
-            hasher.update(fobj.read())
-            sha = hasher.digest()
-        combine_this_one = sha not in file_hashes
+        file_action = classifier.classify(f)

         delete_this_one = not keep
-        if combine_this_one:
+        if file_action == "combine":
             if data._debug.should("dataio"):
                 data._debug.write(f"Combining data file {f!r}")
-            file_hashes.add(sha)
             try:
                 new_data = CoverageData(f, debug=data._debug)
                 new_data.read()
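For orientation (not part of the commit), here is a minimal sketch of the extracted classifier in isolation. The file names and contents below are arbitrary, since classify only hashes raw bytes and never parses the files:

import os
import tempfile

from coverage.data import DataFileClassifier

with tempfile.TemporaryDirectory() as tmp:
    a = os.path.join(tmp, ".coverage.1")
    b = os.path.join(tmp, ".coverage.2")
    c = os.path.join(tmp, ".coverage.3")
    with open(a, "wb") as f:
        f.write(b"same bytes")
    with open(b, "wb") as f:
        f.write(b"same bytes")      # byte-for-byte copy of a
    with open(c, "wb") as f:
        f.write(b"different bytes")

    classifier = DataFileClassifier()
    assert classifier.classify(a) == "combine"  # first time these bytes are seen
    assert classifier.classify(b) == "skip"     # same hash as a, so a duplicate
    assert classifier.classify(c) == "combine"  # new contents, new hash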

igor.py

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ def do_combine_html():
     import coverage

     os.environ["COVERAGE_HOME"] = os.getcwd()
-    cov = coverage.Coverage(config_file="metacov.ini")
+    cov = coverage.Coverage(config_file="metacov.ini", messages=True)
     cov.load()
     cov.combine()
     cov.save()
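The igor.py change only opts the meta-coverage combine step into coverage's console messages. A hedged sketch of the same call pattern outside igor.py (the config path is illustrative; exact message text varies by coverage version):

import coverage

# messages=True asks coverage to print progress messages while it works,
# e.g. which data files were combined or skipped, instead of staying silent.
cov = coverage.Coverage(config_file="metacov.ini", messages=True)
cov.load()
cov.combine()
cov.save()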

tests/test_data.py

Lines changed: 40 additions & 1 deletion
@@ -18,7 +18,7 @@

 import pytest

-from coverage.data import CoverageData, combine_parallel_data
+from coverage.data import CoverageData, DataFileClassifier, combine_parallel_data
 from coverage.data import add_data_to_hash, line_counts
 from coverage.exceptions import DataError, NoDataError
 from coverage.files import PathAliases, canonical_filename
@@ -1010,6 +1010,45 @@ def test_meta_data(self) -> None:
         data = sorted(k for (k,) in con.execute("select key from meta"))
         assert data == ["has_arcs", "sys_argv", "version", "when"]

+    def make_data_files(self, spec: str, arcs: bool) -> list[CoverageData]:
+        """Make a number of data files.
+
+        `spec` is a string dictating the data for each file. Same characters
+        in spec produce identical data in the corresponding files.
+        """
+        datas = []
+        for ifile, c in enumerate(spec):
+            files_lines = {f"code_{i}.py": list(range(1, 100)) for i in range(10)}
+            files_lines[f"more_code_{c}.py"] = list(range(1, 10, 2))
+            if arcs:
+                files_arcs = {
+                    fname: [(l, 1000) for l in lines] for fname, lines in files_lines.items()
+                }
+                kwargs: dict[str, Any] = {"arcs": files_arcs}
+            else:
+                kwargs = {"lines": files_lines}
+            datas.append(self.make_data_file(".coverage", suffix=str(ifile), **kwargs))
+        return datas
+
+    @pytest.mark.parametrize(
+        "spec, combine_or_skip",
+        [
+            ("abcdef", "cccccc"),
+            ("aaaaaa", "csssss"),
+            ("ababac", "ccsssc"),
+            ("aaaaab", "cssssc"),
+        ],
+    )
+    @pytest.mark.parametrize("arcs", [False, True])
+    def test_skipping_duplicates(self, spec: str, combine_or_skip: str, arcs: bool) -> None:
+        # Check that DataFileClassifier correctly notices when data is
+        # duplicated, and tells us to combine new data and skip duplicates.
+        datas = self.make_data_files(spec, arcs=arcs)
+        classifier = DataFileClassifier()
+        for data_file, c_or_s in zip(datas, combine_or_skip):
+            file_action = classifier.classify(data_file.data_filename())
+            assert file_action[0] == c_or_s
+

 class DumpsLoadsTest(CoverageTest):
     """Tests of CoverageData.dumps and loads."""
