Skip to content

Correct / recommended way of using user_data #563

@tlinhart

Description

@tlinhart

Since this PR was merged, I get type errors when working with user_data. Consider this sample:

import asyncio

from crawlee import Request
from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute
from crawlee.configuration import Configuration
from crawlee.parsel_crawler import ParselCrawler, ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()  # handlers below register on this router; main() passes it to the crawler as request_handler


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:  # builds one "detail" request per matched category link
    for category in context.selector.xpath(  # each match is an <a> element inside the "side_categories" div
        '//div[@class="side_categories"]//ul/li/ul/li/a'
    ):
        item = {"title": category.xpath("normalize-space()").get()}  # .get() can yield None, hence "str | None" in the mypy output
        url = category.xpath("./@href").get()  # the link's href attribute, or None
        if url is not None:
            if not is_url_absolute(url):
                url = str(convert_to_absolute_url(context.request.url, url))  # presumably resolves the href against the current request URL
            request = Request.from_url(url, method="GET", label="detail")  # same label that detail_handler is registered under
            request.user_data["item"] = item  # <--- TYPE ERROR
            await context.add_requests([request])


@router.handler("detail")
async def detail_handler(context: ParselCrawlingContext) -> None:  # registered for the "detail" label set in default_handler
    item = context.request.user_data["item"]  # the dict that default_handler stored on the request
    item["results"] = context.selector.xpath("normalize-space(//form//strong[1])").get()  # <-- TYPE ERROR
    # Push the item, which now carries both the "title" and "results" keys.
    await context.push_data(item)


async def main() -> None:
    """Crawl https://books.toscrape.com with the module-level router and print the collected items."""
    # Set persist_storage and write_metadata to False on the global
    # configuration before the crawler is constructed.
    settings = Configuration.get_global_configuration()
    settings.persist_storage = False
    settings.write_metadata = False

    start_urls = ["https://books.toscrape.com"]
    parsel_crawler = ParselCrawler(request_handler=router)
    await parsel_crawler.run(start_urls)

    # Fetch whatever the handlers pushed and print its items.
    collected = await parsel_crawler.get_data()
    print(collected.items)


# Run the crawl only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

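Both in VS Code (with Pylance) and on the CLI (mypy), I get type errors at the highlighted spots. Mypy reports the following (lines 23 and 30 of main.py are the two lines marked TYPE ERROR in the sample above):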

./venv/bin/mypy main.py 
main.py:23: error: Incompatible types in assignment (expression has type "dict[str, str | None]", target has type "JsonValue")  [assignment]
main.py:30: error: Unsupported target for indexed assignment ("list[JsonValue] | dict[str, JsonValue] | str | bool | int | float | None")  [index]
main.py:30: error: No overload variant of "__setitem__" of "list" matches argument types "str", "str | None"  [call-overload]
main.py:30: note: Possible overload variants:
main.py:30: note:     def __setitem__(self, SupportsIndex, JsonValue, /) -> None
main.py:30: note:     def __setitem__(self, slice, Iterable[JsonValue], /) -> None
Found 3 errors in 1 file (checked 1 source file)

Metadata

Metadata

Assignees

Labels

bug: Something isn't working. · t-tooling: Issues with this label are in the ownership of the tooling team.

Type

No type

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions