Skip to content

Commit bb0cbfa

Browse files
committed
split input tests
1 parent afa6c5d commit bb0cbfa

File tree

11 files changed

+269
-207
lines changed

11 files changed

+269
-207
lines changed

examples/auto_invoice_splitter_extraction_example.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
def parse_invoice(file_path):
1212
input_source = PathInput(file_path)
1313

14-
if input_source.is_pdf() and input_source.count_doc_pages() > 1:
14+
if input_source.is_pdf() and input_source.page_count > 1:
1515
parse_multi_page(input_source)
1616
else:
1717
parse_single_page(input_source)

mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def extract_receipts(
2424
raise MindeeError(
2525
"No possible receipts candidates found for MultiReceipts extraction."
2626
)
27-
for page_id in range(input_source.count_doc_pages()):
27+
for page_id in range(input_source.page_count):
2828
receipt_positions = [
2929
receipt.bounding_box
3030
for receipt in inference.pages[page_id].prediction.receipts

mindee/input/sources/local_input_source.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -100,18 +100,23 @@ def is_pdf(self) -> bool:
100100
""":return: True if the file is a PDF."""
101101
return self.file_mimetype == "application/pdf"
102102

103-
def count_doc_pages(self) -> int:
103+
@property
104+
def page_count(self) -> int:
104105
"""
105-
Count the pages in the PDF.
106+
Count the pages in the document.
106107
107-
:return: the number of pages.
108+
:return: The number of pages.
108109
"""
109110
if self.is_pdf():
110111
self.file_object.seek(0)
111112
pdf = pdfium.PdfDocument(self.file_object)
112113
return len(pdf)
113114
return 1
114115

116+
def count_doc_pages(self) -> int:
117+
"""Deprecated. Use ``page_count`` instead."""
118+
return self.page_count
119+
115120
def apply_page_options(self, page_options: PageOptions) -> None:
116121
"""Apply cut and merge options on multipage documents."""
117122
if not self.is_pdf():
@@ -131,10 +136,10 @@ def process_pdf(
131136
"""Run any required processing on a PDF file."""
132137
if self.is_pdf_empty():
133138
raise MindeeSourceError(f"PDF pages are empty in: {self.filename}")
134-
pages_count = self.count_doc_pages()
135-
if on_min_pages > pages_count:
139+
page_count = self.page_count
140+
if on_min_pages > page_count:
136141
return
137-
all_pages = list(range(pages_count))
142+
all_pages = list(range(page_count))
138143
if behavior == KEEP_ONLY:
139144
pages_to_keep = set()
140145
for page_id in page_indexes:
@@ -161,7 +166,7 @@ def merge_pdf_pages(self, page_numbers: set) -> None:
161166
"""
162167
Create a new PDF from pages and set it to ``file_object``.
163168
164-
:param page_numbers: List of pages number to use for merging in the original PDF.
169+
:param page_numbers: List of page numbers to use for merging in the original PDF.
165170
:return: None
166171
"""
167172
self.file_object.seek(0)

tests/extraction/test_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from mindee.extraction.common.image_extractor import extract_multiple_images_from_source
77
from mindee.input.sources.path_input import PathInput
88
from mindee.product.barcode_reader.barcode_reader_v1 import BarcodeReaderV1
9-
from tests.input.test_inputs import PRODUCT_DATA_DIR
9+
from tests.utils import PRODUCT_DATA_DIR
1010

1111

1212
@pytest.fixture

tests/extraction/test_invoice_splitter_auto_extraction.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,8 @@
88
from mindee.parsing.common.document import Document
99
from mindee.product.invoice.invoice_v4 import InvoiceV4
1010
from mindee.product.invoice_splitter.invoice_splitter_v1 import InvoiceSplitterV1
11-
from tests.input.test_inputs import PRODUCT_DATA_DIR
1211
from tests.product import get_id, get_version
13-
from tests.utils import levenshtein_ratio
12+
from tests.utils import PRODUCT_DATA_DIR, levenshtein_ratio
1413

1514

1615
@pytest.fixture

tests/extraction/test_multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from mindee.product.multi_receipts_detector.multi_receipts_detector_v1 import (
1111
MultiReceiptsDetectorV1,
1212
)
13-
from tests.input.test_inputs import PRODUCT_DATA_DIR
13+
from tests.utils import PRODUCT_DATA_DIR
1414

1515

1616
@pytest.fixture

tests/extraction/test_pdf_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from mindee.product.invoice_splitter.invoice_splitter_v1_document import (
99
InvoiceSplitterV1Document,
1010
)
11-
from tests.input.test_inputs import PRODUCT_DATA_DIR
11+
from tests.utils import PRODUCT_DATA_DIR
1212

1313

1414
@pytest.fixture
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
import io
2+
3+
import pypdfium2 as pdfium
4+
import pytest
5+
6+
from mindee.error import MindeeError
7+
from mindee.input.page_options import KEEP_ONLY, REMOVE, PageOptions
8+
from mindee.input.sources import (
9+
Base64Input,
10+
BytesInput,
11+
FileInput,
12+
LocalInputSource,
13+
PathInput,
14+
)
15+
from tests.utils import FILE_TYPES_DIR, PRODUCT_DATA_DIR
16+
17+
18+
def _assert_page_options(input_source: LocalInputSource, numb_pages: int):
19+
assert input_source.is_pdf() is True
20+
# Currently the least verbose way of comparing pages with pypdfium2
21+
# I.e., each page is read and rendered as a rasterized image.
22+
# These images are then compared as raw byte sequences.
23+
cut_pdf = pdfium.PdfDocument(input_source.file_object)
24+
pdf = pdfium.PdfDocument(FILE_TYPES_DIR / "pdf" / f"multipage_cut-{numb_pages}.pdf")
25+
for idx in range(len(pdf)):
26+
pdf_page = pdf.get_page(idx)
27+
pdf_page_render = pdfium.PdfPage.render(pdf_page)
28+
cut_pdf_page = cut_pdf.get_page(idx)
29+
cut_pdf_page_render = pdfium.PdfPage.render(cut_pdf_page)
30+
31+
assert bytes(pdf_page_render.buffer) == bytes(cut_pdf_page_render.buffer)
32+
cut_pdf.close()
33+
pdf.close()
34+
35+
36+
def test_pdf_reconstruct_ok():
37+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
38+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=range(5))
39+
assert isinstance(input_source.file_object, io.BytesIO)
40+
41+
42+
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
43+
def test_process_pdf_cut_n_pages(numb_pages: int):
44+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
45+
input_source.process_pdf(
46+
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages]
47+
)
48+
assert input_source.page_count == numb_pages
49+
_assert_page_options(input_source, numb_pages)
50+
51+
52+
@pytest.mark.parametrize("numb_pages", [1, 2, 3])
53+
def test_apply_pages_pdf_cut_n_pages(numb_pages: int):
54+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
55+
input_source.apply_page_options(
56+
PageOptions(on_min_pages=2, page_indexes=[0, -2, -1][:numb_pages])
57+
)
58+
assert input_source.count_doc_pages() == numb_pages
59+
_assert_page_options(input_source, numb_pages)
60+
61+
62+
def test_pdf_keep_5_first_pages():
63+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
64+
assert input_source.is_pdf() is True
65+
input_source.process_pdf(
66+
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, 1, 2, 3, 4]
67+
)
68+
assert input_source.count_doc_pages() == 5
69+
70+
71+
def test_pdf_keep_invalid_pages():
72+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
73+
assert input_source.is_pdf() is True
74+
input_source.process_pdf(
75+
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0, 1, 17]
76+
)
77+
assert input_source.count_doc_pages() == 2
78+
79+
80+
def test_pdf_remove_5_last_pages():
81+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
82+
assert input_source.is_pdf() is True
83+
input_source.process_pdf(
84+
behavior=REMOVE, on_min_pages=2, page_indexes=[-5, -4, -3, -2, -1]
85+
)
86+
assert input_source.count_doc_pages() == 7
87+
88+
89+
def test_pdf_remove_5_first_pages():
90+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
91+
assert input_source.is_pdf() is True
92+
input_source.process_pdf(
93+
behavior=REMOVE, on_min_pages=2, page_indexes=list(range(5))
94+
)
95+
assert input_source.count_doc_pages() == 7
96+
97+
98+
def test_pdf_remove_invalid_pages():
99+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
100+
assert input_source.is_pdf() is True
101+
input_source.process_pdf(behavior=REMOVE, on_min_pages=2, page_indexes=[16])
102+
assert input_source.count_doc_pages() == 12
103+
104+
105+
def test_pdf_keep_no_pages():
106+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
107+
assert input_source.is_pdf() is True
108+
# empty page indexes
109+
with pytest.raises(RuntimeError):
110+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[])
111+
# all invalid pages
112+
with pytest.raises(RuntimeError):
113+
input_source.process_pdf(
114+
behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[16, 17]
115+
)
116+
117+
118+
def test_pdf_remove_all_pages():
119+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "multipage.pdf")
120+
assert input_source.is_pdf() is True
121+
with pytest.raises(RuntimeError):
122+
input_source.process_pdf(
123+
behavior=REMOVE, on_min_pages=2, page_indexes=list(range(15))
124+
)
125+
126+
127+
def test_pdf_input_from_file():
128+
with open(FILE_TYPES_DIR / "pdf" / "multipage.pdf", "rb") as fp:
129+
input_source = FileInput(fp)
130+
assert input_source.is_pdf() is True
131+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
132+
assert input_source.count_doc_pages() == 1
133+
134+
135+
def test_pdf_input_from_base64():
136+
with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.txt", "rt") as fp:
137+
input_source = Base64Input(fp.read(), filename="invoice_10p.pdf")
138+
assert input_source.is_pdf() is True
139+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
140+
assert input_source.count_doc_pages() == 1
141+
142+
143+
def test_pdf_input_from_bytes():
144+
with open(PRODUCT_DATA_DIR / "invoices" / "invoice_10p.pdf", "rb") as fp:
145+
input_source = BytesInput(fp.read(), filename="invoice_10p.pdf")
146+
assert input_source.is_pdf() is True
147+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
148+
assert input_source.count_doc_pages() == 1
149+
150+
151+
def test_pdf_blank_check():
152+
with pytest.raises(MindeeError):
153+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "blank.pdf")
154+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
155+
156+
with pytest.raises(MindeeError):
157+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "blank_1.pdf")
158+
input_source.process_pdf(behavior=KEEP_ONLY, on_min_pages=2, page_indexes=[0])
159+
160+
input_not_blank = PathInput(FILE_TYPES_DIR / "pdf" / "not_blank_image_only.pdf")
161+
assert input_not_blank.count_doc_pages() == 1

tests/input/test_fix_pdf.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
import pytest
2+
3+
from mindee import PathInput
4+
from mindee.error import MimeTypeError
5+
from tests.utils import FILE_TYPES_DIR
6+
7+
8+
def test_broken_unfixable_pdf():
9+
with pytest.raises(MimeTypeError):
10+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "broken_unfixable.pdf")
11+
input_source.fix_pdf()
12+
13+
14+
def test_broken_fixable_pdf():
15+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "broken_fixable.pdf")
16+
input_source.fix_pdf()
17+
assert input_source.page_count == 1
18+
19+
20+
def test_broken_fixable_invoice_pdf():
21+
input_source = PathInput(FILE_TYPES_DIR / "pdf" / "broken_invoice.pdf")
22+
input_source.fix_pdf()

0 commit comments

Comments
 (0)