
Commit 7190147

refactor: move data duplication detection, and test it more directly
1 parent b71e6c8 commit 7190147

3 files changed: +70 -12 lines changed


coverage/data.py

Lines changed: 29 additions & 10 deletions
@@ -17,7 +17,7 @@
 import hashlib
 import os.path
 from collections.abc import Iterable
-from typing import Callable
+from typing import Callable, Literal

 from coverage.exceptions import CoverageException, NoDataError
 from coverage.files import PathAliases
@@ -95,6 +95,30 @@ def combinable_files(data_file: str, data_paths: Iterable[str] | None = None) ->
     return sorted(files_to_combine)


+def hash_for_data_file(f: str) -> bytes:
+    """Get the hash of the data in the file."""
+    with open(f, "rb") as fobj:
+        hasher = hashlib.new("sha3_256", usedforsecurity=False)
+        hasher.update(fobj.read())
+    return hasher.digest()
+
+
+class DataFileClassifier:
+    """Track what files to combine and which to skip."""
+
+    def __init__(self) -> None:
+        self.file_hashes: set[bytes] = set()
+
+    def classify(self, f: str) -> Literal["combine", "skip"]:
+        """Determine whether to combine or skip this file."""
+        sha = hash_for_data_file(f)
+        if sha in self.file_hashes:
+            return "skip"
+        else:
+            self.file_hashes.add(sha)
+            return "combine"
+
+
 def combine_parallel_data(
     data: CoverageData,
     aliases: PathAliases | None = None,
@@ -140,7 +164,7 @@ def combine_parallel_data(
     else:
         map_path = functools.cache(aliases.map)

-    file_hashes = set()
+    classifier = DataFileClassifier()
    combined_any = False

     for f in files_to_combine:
@@ -156,20 +180,15 @@ def combine_parallel_data(
         except ValueError:
             # ValueError can be raised under Windows when os.getcwd() returns a
             # folder from a different drive than the drive of f, in which case
-            # we print the original value of f instead of its relative path
+            # we print the original value of f instead of its relative path.
             rel_file_name = f

-        with open(f, "rb") as fobj:
-            hasher = hashlib.new("sha3_256", usedforsecurity=False)
-            hasher.update(fobj.read())
-            sha = hasher.digest()
-        combine_this_one = sha not in file_hashes
+        file_action = classifier.classify(f)

         delete_this_one = not keep
-        if combine_this_one:
+        if file_action == "combine":
             if data._debug.should("dataio"):
                 data._debug.write(f"Combining data file {f!r}")
-            file_hashes.add(sha)
             try:
                 new_data = CoverageData(f, debug=data._debug)
                 new_data.read()
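For orientation (not part of the commit), here is a minimal sketch of the extracted classifier in isolation. The file names and contents below are arbitrary, since classify only hashes raw bytes and never parses the files:

import os
import tempfile

from coverage.data import DataFileClassifier

with tempfile.TemporaryDirectory() as tmp:
    a = os.path.join(tmp, ".coverage.1")
    b = os.path.join(tmp, ".coverage.2")
    c = os.path.join(tmp, ".coverage.3")
    with open(a, "wb") as f:
        f.write(b"same bytes")
    with open(b, "wb") as f:
        f.write(b"same bytes")      # byte-for-byte copy of a
    with open(c, "wb") as f:
        f.write(b"different bytes")

    classifier = DataFileClassifier()
    assert classifier.classify(a) == "combine"  # first time these bytes are seen
    assert classifier.classify(b) == "skip"     # same hash as a, so a duplicate
    assert classifier.classify(c) == "combine"  # new contents, new hash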

igor.py

Lines changed: 1 addition & 1 deletion
@@ -231,7 +231,7 @@ def do_combine_html():
     import coverage

     os.environ["COVERAGE_HOME"] = os.getcwd()
-    cov = coverage.Coverage(config_file="metacov.ini")
+    cov = coverage.Coverage(config_file="metacov.ini", messages=True)
     cov.load()
     cov.combine()
     cov.save()
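The igor.py change only opts the meta-coverage combine step into coverage's console messages. A hedged sketch of the same call pattern outside igor.py (the config path is illustrative; exact message text varies by coverage version):

import coverage

# messages=True asks coverage to print progress messages while it works,
# e.g. which data files were combined or skipped, instead of staying silent.
cov = coverage.Coverage(config_file="metacov.ini", messages=True)
cov.load()
cov.combine()
cov.save()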

tests/test_data.py

Lines changed: 40 additions & 1 deletion
@@ -18,7 +18,7 @@

 import pytest

-from coverage.data import CoverageData, combine_parallel_data
+from coverage.data import CoverageData, DataFileClassifier, combine_parallel_data
 from coverage.data import add_data_to_hash, line_counts
 from coverage.exceptions import DataError, NoDataError
 from coverage.files import PathAliases, canonical_filename
@@ -1010,6 +1010,45 @@ def test_meta_data(self) -> None:
         data = sorted(k for (k,) in con.execute("select key from meta"))
         assert data == ["has_arcs", "sys_argv", "version", "when"]

+    def make_data_files(self, spec: str, arcs: bool) -> list[CoverageData]:
+        """Make a number of data files.
+
+        `spec` is a string dictating the data for each file. Same characters
+        in spec produce identical data in the corresponding files.
+        """
+        datas = []
+        for ifile, c in enumerate(spec):
+            files_lines = {f"code_{i}.py": list(range(1, 100)) for i in range(10)}
+            files_lines[f"more_code_{c}.py"] = list(range(1, 10, 2))
+            if arcs:
+                files_arcs = {
+                    fname: [(l, 1000) for l in lines] for fname, lines in files_lines.items()
+                }
+                kwargs: dict[str, Any] = {"arcs": files_arcs}
+            else:
+                kwargs = {"lines": files_lines}
+            datas.append(self.make_data_file(".coverage", suffix=str(ifile), **kwargs))
+        return datas
+
+    @pytest.mark.parametrize(
+        "spec, combine_or_skip",
+        [
+            ("abcdef", "cccccc"),
+            ("aaaaaa", "csssss"),
+            ("ababac", "ccsssc"),
+            ("aaaaab", "cssssc"),
+        ],
+    )
+    @pytest.mark.parametrize("arcs", [False, True])
+    def test_skipping_duplicates(self, spec: str, combine_or_skip: str, arcs: bool) -> None:
+        # Check that DataFileClassifier correctly notices when data is
+        # duplicated, and tells us to combine new data and skip duplicates.
+        datas = self.make_data_files(spec, arcs=arcs)
+        classifier = DataFileClassifier()
+        for data_file, c_or_s in zip(datas, combine_or_skip):
+            file_action = classifier.classify(data_file.data_filename())
+            assert file_action[0] == c_or_s
+

 class DumpsLoadsTest(CoverageTest):
     """Tests of CoverageData.dumps and loads."""
