Skip to content
Merged
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates

.PHONY: install-test
install-test:
pip install pytest requests_mock pypdf deepdiff requests-toolbelt
pip install pytest pytest-asyncio pytest-mock requests_mock pypdf deepdiff requests-toolbelt

.PHONY: install-dev
install-dev:
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,21 @@ req = shared.PartitionParameters(
)
```

#### Splitting PDF by pages - strict mode

When `split_pdf_allow_failed=False` (the default), any error encountered while sending parallel requests will break the process and raise an exception.
When `split_pdf_allow_failed=True`, the process will continue even if some requests fail, and the results will be combined at the end (the output from the errored pages will not be included).

Example:
```python
req = shared.PartitionParameters(
files=files,
strategy="fast",
languages=["eng"],
split_pdf_allow_failed=True,
)
```

<!-- Start Retries [retries] -->
## Retries

Expand Down
1 change: 1 addition & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ res = s.general.partition(request=operations.PartitionRequest(
1,
10,
],
split_pdf_allow_failed=False,
strategy=shared.Strategy.HI_RES,
),
))
Expand Down
73 changes: 73 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,76 @@ def test_integration_split_pdf_with_page_range(

assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}"
assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}"


@pytest.mark.parametrize("concurrency_level", [2, 3])
@pytest.mark.parametrize("allow_failed", [True, False])
@pytest.mark.parametrize(
    ("filename", "expected_ok", "strategy"),
    [
        ("_sample_docs/list-item-example-1.pdf", True, "fast"),  # 1 page
        ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"),  # 2 pages
        ("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES),  # 16 pages
    ],
)
def test_integration_split_pdf_strict_mode(
    concurrency_level: int,
    allow_failed: bool,
    filename: str,
    expected_ok: bool,
    strategy: shared.Strategy,
    caplog
):
    """Compare split (parallel) and single-request partitioning for both
    ``split_pdf_allow_failed`` modes.

    Requires a live unstructured-api on localhost:8000. For inputs expected
    to succeed, asserts that the split and non-split responses agree on
    elements, content type and status code (ignoring the volatile
    ``parent_id`` metadata). For inputs expected to fail, asserts the
    invalid-PDF error is surfaced both in the logs and in the exception.
    """
    try:
        response = requests.get("http://localhost:8000/general/docs")
        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
    except requests.exceptions.ConnectionError:
        # pytest.fail instead of `assert False`: asserts are stripped under -O.
        pytest.fail("The unstructured-api is not running on localhost:8000")

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    if not expected_ok:
        # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
        files.file_name += ".pdf"

    req = shared.PartitionParameters(
        files=files,
        strategy=strategy,
        languages=["eng"],
        split_pdf_page=True,
        split_pdf_concurrency_level=concurrency_level,
        split_pdf_allow_failed=allow_failed,
    )

    try:
        resp_split = client.general.partition(req)
    except (HTTPValidationError, AttributeError) as exc:
        if not expected_ok:
            assert "The file does not appear to be a valid PDF." in caplog.text
            assert "File does not appear to be a valid PDF" in str(exc)
            return
        # Unexpected failure: re-raise to preserve the original traceback
        # instead of obscuring it behind an assertion on the exception object.
        raise

    # Re-run the same request without splitting to get the reference output.
    req.split_pdf_page = False
    resp_single = client.general.partition(req)

    assert len(resp_split.elements) == len(resp_single.elements)
    assert resp_split.content_type == resp_single.content_type
    assert resp_split.status_code == resp_single.status_code

    # parent_id values differ between runs, so exclude them from the diff.
    diff = DeepDiff(
        t1=resp_split.elements,
        t2=resp_single.elements,
        exclude_regex_paths=[
            r"root\[\d+\]\['metadata'\]\['parent_id'\]",
        ],
    )
    assert len(diff) == 0
140 changes: 118 additions & 22 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import asyncio
import io
import logging
from concurrent.futures import Future
from asyncio import Task
from collections import Counter
from typing import Coroutine

import pytest
import requests
from requests_toolbelt import MultipartDecoder, MultipartEncoder

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.form_utils import (
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
Expand All @@ -18,7 +22,7 @@
MAX_PAGES_PER_SPLIT,
MIN_PAGES_PER_SPLIT,
SplitPdfHook,
get_optimal_split_size,
get_optimal_split_size, run_tasks,
)
from unstructured_client.models import shared

Expand Down Expand Up @@ -224,7 +228,6 @@ def test_unit_parse_form_data():
b"--boundary--\r\n"
)


decoded_data = MultipartDecoder(
test_form_data,
"multipart/form-data; boundary=boundary",
Expand Down Expand Up @@ -361,22 +364,22 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz
({}, DEFAULT_CONCURRENCY_LEVEL), # no value
({"split_pdf_concurrency_level": 10}, 10), # valid number
(
# exceeds max value
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL+1}"},
MAX_CONCURRENCY_LEVEL,
# exceeds max value
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"},
MAX_CONCURRENCY_LEVEL,
),
({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative value
],
)
def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result):
assert (
form_utils.get_split_pdf_concurrency_level_param(
form_data,
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
max_allowed=MAX_CONCURRENCY_LEVEL,
)
== expected_result
form_utils.get_split_pdf_concurrency_level_param(
form_data,
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
max_allowed=MAX_CONCURRENCY_LEVEL,
)
== expected_result
)


Expand Down Expand Up @@ -404,16 +407,16 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
@pytest.mark.parametrize(
"page_range, expected_result",
[
(["1", "14"], (1, 14)), # Valid range, start on boundary
(["4", "16"], (4, 16)), # Valid range, end on boundary
(None, (1, 20)), # Range not specified, defaults to full range
(["1", "14"], (1, 14)), # Valid range, start on boundary
(["4", "16"], (4, 16)), # Valid range, end on boundary
(None, (1, 20)), # Range not specified, defaults to full range
(["2", "5"], (2, 5)), # Valid range within boundary
(["2", "100"], None), # End page too high
(["50", "100"], None), # Range too high
(["-50", "5"], None), # Start page too low
(["-50", "-2"], None), # Range too low
(["10", "2"], None), # Backwards range
(["foo", "foo"], None), # Parse error
(["2", "100"], None), # End page too high
(["50", "100"], None), # Range too high
(["-50", "5"], None), # Start page too low
(["-50", "-2"], None), # Range too low
(["10", "2"], None), # Backwards range
(["foo", "foo"], None), # Parse error
],
)
def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
Expand All @@ -432,3 +435,96 @@ def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
return

assert result == expected_result


async def _request_mock(fails: bool, content: str) -> requests.Response:
    """Build a canned ``requests.Response``: HTTP 500 when *fails*, else 200."""
    mocked = requests.Response()
    if fails:
        mocked.status_code = 500
    else:
        mocked.status_code = 200
    mocked._content = content.encode()
    return mocked


@pytest.mark.parametrize(
    ("allow_failed", "tasks", "expected_responses"), [
        pytest.param(
            True, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=False, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="no failures, fails allowed"
        ),
        pytest.param(
            True, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=True, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=True, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="failures, fails allowed"
        ),
        pytest.param(
            False, [
                _request_mock(fails=True, content="failure"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=True, content="failure"),
                _request_mock(fails=False, content="4"),
            ],
            ["failure"],
            id="failures, fails disallowed"
        ),
        pytest.param(
            False, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=False, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="no failures, fails disallowed"
        ),
    ]
)
@pytest.mark.asyncio
async def test_unit_disallow_failed_coroutines(
    allow_failed: bool,
    tasks: list[Coroutine],
    expected_responses: list[str],
):
    """Check that ``run_tasks`` honors the ``allow_failed`` flag.

    With ``allow_failed=True`` every response is returned, failed or not.
    With ``allow_failed=False`` and a failing response, only the failure is
    returned (presumably run_tasks stops at the first failure — see the
    single expected "failure" entry).
    """
    responses = await run_tasks(tasks, allow_failed=allow_failed)
    # run_tasks yields (index, response) pairs; compare decoded bodies in order.
    response_contents = [response[1].content.decode() for response in responses]
    assert response_contents == expected_responses


async def _fetch_canceller_error(fails: bool, content: str, cancelled_counter: Counter):
    """Return a mocked response, tallying into *cancelled_counter* if cancelled.

    Failing calls respond immediately; succeeding ones sleep briefly first so
    that an earlier failure has a chance to cancel them.
    """
    try:
        if fails:
            print("Fails")
        else:
            await asyncio.sleep(0.01)
            print("Doesn't fail")
        mocked_response = await _request_mock(fails=fails, content=content)
        return mocked_response
    except asyncio.CancelledError:
        cancelled_counter.update(["cancelled"])
        print(cancelled_counter["cancelled"])
        print("Cancelled")


@pytest.mark.asyncio
async def test_remaining_tasks_cancelled_when_fails_disallowed():
    """A single failing task in strict mode should cancel sibling tasks."""
    cancelled_counter = Counter()
    # One immediate failure followed by many slower, succeeding coroutines.
    tasks = [_fetch_canceller_error(fails=True, content="1", cancelled_counter=cancelled_counter)]
    for page in range(2, 200):
        tasks.append(
            _fetch_canceller_error(fails=False, content=f"{page}", cancelled_counter=cancelled_counter)
        )

    await run_tasks(tasks, allow_failed=False)
    # give some time to actually cancel the tasks in background
    await asyncio.sleep(1)
    print("Cancelled amount: ", cancelled_counter["cancelled"])
    assert len(tasks) > cancelled_counter["cancelled"] > 0
Loading