Skip to content
Merged
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates

.PHONY: install-test
install-test:
pip install pytest requests_mock pypdf deepdiff requests-toolbelt
pip install pytest pytest-asyncio pytest-mock requests_mock pypdf deepdiff requests-toolbelt

.PHONY: install-dev
install-dev:
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,21 @@ req = shared.PartitionParameters(
)
```

#### Splitting PDF by pages - strict mode

When `split_pdf_allow_failed=False` (the default), any error encountered while sending parallel requests will break the process and raise an exception.
When `split_pdf_allow_failed=True`, the process will continue even if some requests fail, and the results will be combined at the end (the output from the errored pages will not be included).

Example:
```python
req = shared.PartitionParameters(
files=files,
strategy="fast",
languages=["eng"],
split_pdf_allow_failed=True,
)
```

<!-- Start Retries [retries] -->
## Retries

Expand Down
1 change: 1 addition & 0 deletions USAGE.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ res = s.general.partition(request=operations.PartitionRequest(
1,
10,
],
split_pdf_allow_failed=False,
strategy=shared.Strategy.HI_RES,
),
))
Expand Down
73 changes: 73 additions & 0 deletions _test_unstructured_client/integration/test_decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,76 @@ def test_integration_split_pdf_with_page_range(

assert min(page_numbers) == min_page_number, f"Result should start at page {min_page_number}"
assert max(page_numbers) == max_page_number, f"Result should end at page {max_page_number}"


@pytest.mark.parametrize("concurrency_level", [2, 3])
@pytest.mark.parametrize("allow_failed", [True, False])
@pytest.mark.parametrize(
    ("filename", "expected_ok", "strategy"),
    [
        ("_sample_docs/list-item-example-1.pdf", True, "fast"),  # 1 page
        ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"),  # 2 pages
        ("_sample_docs/layout-parser-paper.pdf", True, shared.Strategy.HI_RES),  # 16 pages
    ],
)
def test_integration_split_pdf_strict_mode(
    concurrency_level: int,
    allow_failed: bool,
    filename: str,
    expected_ok: bool,
    strategy: shared.Strategy,
    caplog
):
    """Compare split (parallel) and single-request partitioning for both
    ``split_pdf_allow_failed`` modes.

    Requires a live unstructured-api on localhost:8000. For inputs expected
    to succeed, asserts that the split and non-split responses agree on
    elements, content type and status code (ignoring the volatile
    ``parent_id`` metadata). For inputs expected to fail, asserts the
    invalid-PDF error is surfaced both in the logs and in the exception.
    """
    try:
        response = requests.get("http://localhost:8000/general/docs")
        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
    except requests.exceptions.ConnectionError:
        # pytest.fail instead of `assert False`: asserts are stripped under -O.
        pytest.fail("The unstructured-api is not running on localhost:8000")

    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")

    with open(filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )

    if not expected_ok:
        # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
        files.file_name += ".pdf"

    req = shared.PartitionParameters(
        files=files,
        strategy=strategy,
        languages=["eng"],
        split_pdf_page=True,
        split_pdf_concurrency_level=concurrency_level,
        split_pdf_allow_failed=allow_failed,
    )

    try:
        resp_split = client.general.partition(req)
    except (HTTPValidationError, AttributeError) as exc:
        if not expected_ok:
            assert "The file does not appear to be a valid PDF." in caplog.text
            assert "File does not appear to be a valid PDF" in str(exc)
            return
        # Unexpected failure: re-raise to preserve the original traceback
        # instead of obscuring it behind an assertion on the exception object.
        raise

    # Re-run the same request without splitting to get the reference output.
    req.split_pdf_page = False
    resp_single = client.general.partition(req)

    assert len(resp_split.elements) == len(resp_single.elements)
    assert resp_split.content_type == resp_single.content_type
    assert resp_split.status_code == resp_single.status_code

    # parent_id values differ between runs, so exclude them from the diff.
    diff = DeepDiff(
        t1=resp_split.elements,
        t2=resp_single.elements,
        exclude_regex_paths=[
            r"root\[\d+\]\['metadata'\]\['parent_id'\]",
        ],
    )
    assert len(diff) == 0
140 changes: 118 additions & 22 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
import asyncio
import io
import logging
from concurrent.futures import Future
from asyncio import Task
from collections import Counter
from typing import Coroutine

import pytest
import requests
from requests_toolbelt import MultipartDecoder, MultipartEncoder

from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
from unstructured_client._hooks.custom.form_utils import (
PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
Expand All @@ -18,7 +22,7 @@
MAX_PAGES_PER_SPLIT,
MIN_PAGES_PER_SPLIT,
SplitPdfHook,
get_optimal_split_size,
get_optimal_split_size, run_tasks,
)
from unstructured_client.models import shared

Expand Down Expand Up @@ -224,7 +228,6 @@ def test_unit_parse_form_data():
b"--boundary--\r\n"
)


decoded_data = MultipartDecoder(
test_form_data,
"multipart/form-data; boundary=boundary",
Expand Down Expand Up @@ -361,22 +364,22 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz
({}, DEFAULT_CONCURRENCY_LEVEL), # no value
({"split_pdf_concurrency_level": 10}, 10), # valid number
(
# exceeds max value
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL+1}"},
MAX_CONCURRENCY_LEVEL,
# exceeds max value
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"},
MAX_CONCURRENCY_LEVEL,
),
({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative value
],
)
def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result):
assert (
form_utils.get_split_pdf_concurrency_level_param(
form_data,
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
max_allowed=MAX_CONCURRENCY_LEVEL,
)
== expected_result
form_utils.get_split_pdf_concurrency_level_param(
form_data,
key=PARTITION_FORM_CONCURRENCY_LEVEL_KEY,
fallback_value=DEFAULT_CONCURRENCY_LEVEL,
max_allowed=MAX_CONCURRENCY_LEVEL,
)
== expected_result
)


Expand Down Expand Up @@ -404,16 +407,16 @@ def test_unit_get_starting_page_number(starting_page_number, expected_result):
@pytest.mark.parametrize(
"page_range, expected_result",
[
(["1", "14"], (1, 14)), # Valid range, start on boundary
(["4", "16"], (4, 16)), # Valid range, end on boundary
(None, (1, 20)), # Range not specified, defaults to full range
(["1", "14"], (1, 14)), # Valid range, start on boundary
(["4", "16"], (4, 16)), # Valid range, end on boundary
(None, (1, 20)), # Range not specified, defaults to full range
(["2", "5"], (2, 5)), # Valid range within boundary
(["2", "100"], None), # End page too high
(["50", "100"], None), # Range too high
(["-50", "5"], None), # Start page too low
(["-50", "-2"], None), # Range too low
(["10", "2"], None), # Backwards range
(["foo", "foo"], None), # Parse error
(["2", "100"], None), # End page too high
(["50", "100"], None), # Range too high
(["-50", "5"], None), # Start page too low
(["-50", "-2"], None), # Range too low
(["10", "2"], None), # Backwards range
(["foo", "foo"], None), # Parse error
],
)
def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
Expand All @@ -432,3 +435,96 @@ def test_unit_get_page_range_returns_valid_range(page_range, expected_result):
return

assert result == expected_result


async def _request_mock(fails: bool, content: str) -> requests.Response:
    """Build a canned ``requests.Response``: HTTP 500 when *fails*, else 200."""
    mocked = requests.Response()
    if fails:
        mocked.status_code = 500
    else:
        mocked.status_code = 200
    mocked._content = content.encode()
    return mocked


@pytest.mark.parametrize(
    ("allow_failed", "tasks", "expected_responses"), [
        pytest.param(
            True, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=False, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="no failures, fails allowed"
        ),
        pytest.param(
            True, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=True, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=True, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="failures, fails allowed"
        ),
        pytest.param(
            False, [
                _request_mock(fails=True, content="failure"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=True, content="failure"),
                _request_mock(fails=False, content="4"),
            ],
            ["failure"],
            id="failures, fails disallowed"
        ),
        pytest.param(
            False, [
                _request_mock(fails=False, content="1"),
                _request_mock(fails=False, content="2"),
                _request_mock(fails=False, content="3"),
                _request_mock(fails=False, content="4"),
            ],
            ["1", "2", "3", "4"],
            id="no failures, fails disallowed"
        ),
    ]
)
@pytest.mark.asyncio
async def test_unit_disallow_failed_coroutines(
    allow_failed: bool,
    tasks: list[Coroutine],
    expected_responses: list[str],
):
    """Check that ``run_tasks`` honors the ``allow_failed`` flag.

    With ``allow_failed=True`` every response is returned, failed or not.
    With ``allow_failed=False`` and a failing response, only the failure is
    returned (presumably run_tasks stops at the first failure — see the
    single expected "failure" entry).
    """
    responses = await run_tasks(tasks, allow_failed=allow_failed)
    # run_tasks yields (index, response) pairs; compare decoded bodies in order.
    response_contents = [response[1].content.decode() for response in responses]
    assert response_contents == expected_responses


async def _fetch_canceller_error(fails: bool, content: str, cancelled_counter: Counter):
    """Return a mocked response, tallying into *cancelled_counter* if cancelled.

    Failing calls respond immediately; succeeding ones sleep briefly first so
    that an earlier failure has a chance to cancel them.
    """
    try:
        if fails:
            print("Fails")
        else:
            await asyncio.sleep(0.01)
            print("Doesn't fail")
        mocked_response = await _request_mock(fails=fails, content=content)
        return mocked_response
    except asyncio.CancelledError:
        cancelled_counter.update(["cancelled"])
        print(cancelled_counter["cancelled"])
        print("Cancelled")


@pytest.mark.asyncio
async def test_remaining_tasks_cancelled_when_fails_disallowed():
    """A single failing task in strict mode should cancel sibling tasks."""
    cancelled_counter = Counter()
    # One immediate failure followed by many slower, succeeding coroutines.
    tasks = [_fetch_canceller_error(fails=True, content="1", cancelled_counter=cancelled_counter)]
    for page in range(2, 200):
        tasks.append(
            _fetch_canceller_error(fails=False, content=f"{page}", cancelled_counter=cancelled_counter)
        )

    await run_tasks(tasks, allow_failed=False)
    # give some time to actually cancel the tasks in background
    await asyncio.sleep(1)
    print("Cancelled amount: ", cancelled_counter["cancelled"])
    assert len(tasks) > cancelled_counter["cancelled"] > 0
Loading