Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions .github/workflows/speakeasy_sdk_generation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,3 @@ jobs:
github_access_token: ${{ secrets.GITHUB_TOKEN }}
pypi_token: ${{ secrets.PYPI_TOKEN }}
speakeasy_api_key: ${{ secrets.SPEAKEASY_API_KEY }}
patch-custom-code:
runs-on: ubuntu-latest
needs: [generate]
steps:
- name: Patch in custom code after regenerating
run: make patch-custom-code

4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,7 @@ install-test:

.PHONY: install-dev
install-dev:
pip install jupyter
pip install pylint
pip install jupyter uvloop pylint mypy

## install: installs all test, dev, and experimental requirements
.PHONY: install
Expand Down Expand Up @@ -48,6 +47,7 @@ test-integration-docker:
.PHONY: lint
lint:
pylint --rcfile=pylintrc src
mypy src

#############
# Speakeasy #
Expand Down
55 changes: 2 additions & 53 deletions _test_unstructured_client/unit/test_split_pdf_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,57 +126,6 @@ def test_unit_create_response():
assert response.headers.get("Content-Length"), expected_content_length


def test_unit_create_request():
"""Test create request method properly sets file, Content-Type and Content-Length headers.
List parameters should be flattened in the body."""

# Prepare test data
request = requests.PreparedRequest()
request.headers = {
"Content-Type": "application/json",
"Authorization": "Bearer token",
}
form_data = {
"parameter_1": "value_1",
"parameter_2": "value_2",
"list_parameter": ["value_1", "value_2"],
}
page = (io.BytesIO(b"page_content"), 1)
filename = "test_file.pdf"

# Expected results
expected_page_filename = "test_file.pdf"
expected_body = MultipartEncoder(
fields=[
("parameter_1", "value_1"),
("parameter_2", "value_2"),
("list_parameter", "value_1"),
("list_parameter", "value_2"),
("split_pdf_page", "false"),
("starting_page_number", "7"),
("files", (
expected_page_filename,
page[0],
"application/pdf",
)),
]
)
expected_url = ""

# Create request
body = request_utils.create_request_body(form_data, page[0], filename, 7)
request_obj = request_utils.create_request(request, body)
request_content_type: str = request_obj.headers.get("Content-Type")
# Assert the request object
assert request_obj.method == "POST"
assert request_obj.url == expected_url

# Validate fields ignoring order
assert set(request_obj.data.fields) == set(expected_body.fields)

assert request_content_type.startswith("multipart/form-data")


def test_unit_decode_content_disposition():
"""Test decode content disposition method properly decodes Content-Disposition header."""

Expand Down Expand Up @@ -362,13 +311,13 @@ def test_get_optimal_split_size(num_pages, concurrency_level, expected_split_siz
("form_data", "expected_result"),
[
({}, DEFAULT_CONCURRENCY_LEVEL), # no value
({"split_pdf_concurrency_level": 10}, 10), # valid number
({"split_pdf_concurrency_level": "10"}, 10), # valid number
(
# exceeds max value
{"split_pdf_concurrency_level": f"{MAX_CONCURRENCY_LEVEL + 1}"},
MAX_CONCURRENCY_LEVEL,
),
({"split_pdf_concurrency_level": -3}, DEFAULT_CONCURRENCY_LEVEL), # negative value
({"split_pdf_concurrency_level": "-3"}, DEFAULT_CONCURRENCY_LEVEL), # negative value
],
)
def test_unit_get_split_pdf_concurrency_level_returns_valid_number(form_data, expected_result):
Expand Down
17 changes: 10 additions & 7 deletions src/unstructured_client/_hooks/custom/form_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
from typing import Union

from requests_toolbelt.multipart.decoder import MultipartDecoder
from requests_toolbelt.multipart.decoder import MultipartDecoder # type: ignore

from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client.models import shared
Expand Down Expand Up @@ -35,7 +35,7 @@ def get_page_range(form_data: FormData, key: str, max_pages: int) -> tuple[int,
try:
_page_range = form_data.get(key)

if _page_range is not None:
if isinstance(_page_range, list):
page_range = (int(_page_range[0]), int(_page_range[1]))
else:
page_range = (1, max_pages)
Expand Down Expand Up @@ -108,7 +108,7 @@ def get_split_pdf_allow_failed_param(
"""
allow_failed = form_data.get(key)

if allow_failed is None:
if not isinstance(allow_failed, str):
return fallback_value

if allow_failed.lower() not in ["true", "false"]:
Expand All @@ -121,6 +121,7 @@ def get_split_pdf_allow_failed_param(

return allow_failed.lower() == "true"


def get_split_pdf_concurrency_level_param(
form_data: FormData, key: str, fallback_value: int, max_allowed: int
) -> int:
Expand All @@ -140,7 +141,7 @@ def get_split_pdf_concurrency_level_param(
"""
concurrency_level_str = form_data.get(key)

if concurrency_level_str is None:
if not isinstance(concurrency_level_str, str):
return fallback_value

try:
Expand Down Expand Up @@ -218,10 +219,12 @@ def parse_form_data(decoded_data: MultipartDecoder) -> FormData:
else:
content = part.content.decode()
if name in form_data:
if isinstance(form_data[name], list):
form_data[name].append(content)
form_data_value = form_data[name]
if isinstance(form_data_value, list):
form_data_value.append(content)
else:
form_data[name] = [form_data[name], content]
new_list = [form_data_value, content]
form_data[name] = new_list
else:
form_data[name] = content

Expand Down
5 changes: 3 additions & 2 deletions src/unstructured_client/_hooks/custom/pdf_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
import logging
from typing import Generator, Tuple, Optional
from typing import cast, Generator, Tuple, Optional

from pypdf import PdfReader, PdfWriter
from pypdf.errors import PdfReadError
Expand Down Expand Up @@ -70,7 +70,8 @@ def is_pdf(file: shared.Files) -> bool:
return False

try:
PdfReader(io.BytesIO(file.content), strict=True)
content = cast(bytes, file.content)
PdfReader(io.BytesIO(content), strict=True)
except (PdfReadError, UnicodeDecodeError) as exc:
logger.error(exc)
logger.warning("The file does not appear to be a valid PDF.")
Expand Down
87 changes: 21 additions & 66 deletions src/unstructured_client/_hooks/custom/request_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import io
import json
import logging
from typing import Optional, Tuple, Any
from typing import Tuple, Any

import httpx
import requests
from requests.structures import CaseInsensitiveDict
from requests_toolbelt.multipart.encoder import MultipartEncoder
from requests_toolbelt.multipart.encoder import MultipartEncoder # type: ignore

from unstructured_client._hooks.custom.common import UNSTRUCTURED_CLIENT_LOGGER_NAME
from unstructured_client._hooks.custom.form_utils import (
Expand Down Expand Up @@ -51,76 +49,33 @@ def create_request_body(
return body


def create_httpx_request(
original_request: requests.Request, body: MultipartEncoder
) -> httpx.Request:
headers = prepare_request_headers(original_request.headers)
return httpx.Request(
method="POST",
url=original_request.url or "",
content=body.to_string(),
headers={**headers, "Content-Type": body.content_type},
)


def create_request(
request: requests.PreparedRequest,
body: MultipartEncoder,
) -> requests.Request:
headers = prepare_request_headers(request.headers)
return requests.Request(
method="POST",
url=request.url or "",
data=body,
headers={**headers, "Content-Type": body.content_type},
)


async def call_api_async(
client: httpx.AsyncClient,
page: Tuple[io.BytesIO, int],
original_request: requests.Request,
original_request: httpx.Request,
form_data: FormData,
filename: str,
limiter: asyncio.Semaphore,
) -> tuple[int, dict]:
) -> httpx.Response:
page_content, page_number = page
body = create_request_body(form_data, page_content, filename, page_number)
new_request = create_httpx_request(original_request, body)
async with limiter:
try:
response = await client.send(new_request)
return response.status_code, response.json()
except Exception:
logger.error("Failed to send request for page %d", page_number)
return 500, {}

original_headers = prepare_request_headers(original_request.headers)

def call_api(
client: Optional[requests.Session],
page: Tuple[io.BytesIO, int],
request: requests.PreparedRequest,
form_data: FormData,
filename: str,
) -> requests.Response:
if client is None:
raise RuntimeError("HTTP client not accessible!")
page_content, page_number = page

body = create_request_body(form_data, page_content, filename, page_number)
new_request = create_request(request, body)
prepared_request = client.prepare_request(new_request)
new_request = httpx.Request(
method="POST",
url=original_request.url or "",
content=body.to_string(),
headers={**original_headers, "Content-Type": body.content_type},
)

try:
return client.send(prepared_request)
except Exception:
logger.error("Failed to send request for page %d", page_number)
return requests.Response()
async with limiter:
response = await client.send(new_request)
return response


def prepare_request_headers(
headers: CaseInsensitiveDict[str],
) -> CaseInsensitiveDict[str]:
headers: httpx.Headers,
) -> httpx.Headers:
"""Prepare the request headers by removing the 'Content-Type' and 'Content-Length' headers.

Args:
Expand All @@ -129,10 +84,10 @@ def prepare_request_headers(
Returns:
The modified request headers.
"""
headers = copy.deepcopy(headers)
headers.pop("Content-Type", None)
headers.pop("Content-Length", None)
return headers
new_headers = headers.copy()
new_headers.pop("Content-Type", None)
new_headers.pop("Content-Length", None)
return new_headers


def prepare_request_payload(form_data: FormData) -> FormData:
Expand All @@ -157,7 +112,7 @@ def prepare_request_payload(form_data: FormData) -> FormData:
return payload


def create_response(response: requests.Response, elements: list) -> requests.Response:
def create_response(response: httpx.Response, elements: list) -> httpx.Response:
"""
Creates a modified response object with updated content.

Expand Down
Loading