Package scrapfly

Sub-modules

scrapfly.api_config
scrapfly.api_response
scrapfly.browser_config
scrapfly.client
scrapfly.crawler

Scrapfly Crawler API …

scrapfly.errors
scrapfly.extraction_config
scrapfly.frozen_dict
scrapfly.polyfill
scrapfly.reporter
scrapfly.scrape_config
scrapfly.scrapy
scrapfly.screenshot_config
scrapfly.webhook

Functions

def parse_warc(warc_data: bytes | BinaryIO) ‑> WarcParser
Expand source code
def parse_warc(warc_data: Union[bytes, BinaryIO]) -> WarcParser:
    """
    Build a :class:`WarcParser` for the given WARC payload.

    Args:
        warc_data: Raw WARC bytes or an open binary file-like object

    Returns:
        WarcParser: Parser wrapping ``warc_data``

    Example:
        ```python
        from scrapfly import parse_warc

        # Quick way to get all pages
        pages = parse_warc(warc_bytes).get_pages()
        for page in pages:
            print(f"{page['url']}: {page['status_code']}")
        ```
    """
    parser = WarcParser(warc_data)
    return parser

Convenience function to create a WARC parser

Args

warc_data
WARC data as bytes or file-like object

Returns

WarcParser
Parser instance

Example

from scrapfly import parse_warc

# Quick way to get all pages
pages = parse_warc(warc_bytes).get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")
def webhook_from_payload(payload: Dict[str, Any],
signing_secrets: Tuple[str, ...] | None = None,
signature: str | None = None) ‑> CrawlerLifecycleWebhook | CrawlerUrlVisitedWebhook | CrawlerUrlSkippedWebhook | CrawlerUrlDiscoveredWebhook | CrawlerUrlFailedWebhook
Expand source code
def webhook_from_payload(
    payload: Dict[str, Any],
    signing_secrets: Optional[Tuple[str, ...]] = None,
    signature: Optional[str] = None,
) -> CrawlerWebhook:
    """
    Turn a raw crawler webhook envelope into the matching typed dataclass.

    The envelope has the shape ``{"event": <name>, "payload": {...}}``; the
    ``event`` field selects which :data:`CrawlerWebhook` variant is built.

    Args:
        payload: Full webhook body as a dict (i.e. what you get from
            ``request.json``).
        signing_secrets: Optional tuple of signing secrets (hex strings) for
            signature verification.
        signature: Optional webhook signature header value
            (``X-Scrapfly-Webhook-Signature``).

    Returns:
        A typed webhook instance matching the event.

    Raises:
        KeyError: If the envelope is missing required fields.
        ValueError: If ``event`` is not one of the known crawler events.
        WebhookSignatureMissMatch: If signature verification fails.

    Note:
        Verification only runs when BOTH ``signing_secrets`` and
        ``signature`` are given; with only one of the two, the payload is
        parsed without being verified.

    Example:
        >>> from flask import Flask, request
        >>> from scrapfly import webhook_from_payload, CrawlerLifecycleWebhook
        >>> app = Flask(__name__)
        >>> @app.route('/webhook', methods=['POST'])
        ... def handle_webhook():
        ...     wh = webhook_from_payload(
        ...         request.json,
        ...         signing_secrets=('your-secret-hex',),
        ...         signature=request.headers.get('X-Scrapfly-Webhook-Signature'),
        ...     )
        ...     if isinstance(wh, CrawlerLifecycleWebhook) and wh.event == 'crawler_finished':
        ...         print(f"Crawl {wh.crawler_uuid} finished — "
        ...               f"{wh.state.urls_visited} URLs visited")
        ...     return '', 200
    """
    if signing_secrets and signature:
        # Deferred imports: only needed on the verification path.
        from json import dumps

        from ..api_response import ResponseBodyHandler
        from ..errors import WebhookSignatureMissMatch

        body = dumps(payload, separators=(',', ':')).encode('utf-8')
        verifier = ResponseBodyHandler(signing_secrets=signing_secrets)
        if not verifier.verify(body, signature):
            raise WebhookSignatureMissMatch()

    event_name = payload['event']
    event_body = payload['payload']

    handler = _DISPATCH.get(event_name)
    if handler is None:
        raise ValueError(
            f"Unknown crawler webhook event: {event_name!r}. "
            f"Expected one of: {sorted(_DISPATCH.keys())}"
        )
    return handler.from_payload(event_name, event_body)

Parse a raw crawler webhook envelope into a typed dataclass.

The envelope shape is {"event": <name>, "payload": {...}}. This function inspects event and returns the corresponding typed dataclass — one of :data:CrawlerWebhook.

Args

payload
The full webhook body as a dict (i.e. what you get from request.json).
signing_secrets
Optional tuple of signing secrets (hex strings) for signature verification.
signature
Optional webhook signature header value (X-Scrapfly-Webhook-Signature).

Returns

A typed webhook instance matching the event.

Raises

KeyError
If the envelope is missing required fields.
ValueError
If event is not one of the known crawler events.
WebhookSignatureMissMatch
If signature verification fails.

Example

>>> from flask import Flask, request
>>> from scrapfly import webhook_from_payload, CrawlerLifecycleWebhook
>>> app = Flask(__name__)
>>> @app.route('/webhook', methods=['POST'])
... def handle_webhook():
...     wh = webhook_from_payload(
...         request.json,
...         signing_secrets=('your-secret-hex',),
...         signature=request.headers.get('X-Scrapfly-Webhook-Signature'),
...     )
...     if isinstance(wh, CrawlerLifecycleWebhook) and wh.event == 'crawler_finished':
...         print(f"Crawl {wh.crawler_uuid} finished — "
...               f"{wh.state.urls_visited} URLs visited")
...     return '', 200

Classes

class ApiHttpClientError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ApiHttpClientError(HttpError):
    """
    HTTP error raised when talking to the Scrapfly API.

    Base class for more specific API errors (bad API key, payment required,
    too many requests, server errors); request/response details come from
    the ``HttpError`` ancestor.
    """
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException

Subclasses

  • ApiHttpServerError
  • scrapfly.errors.BadApiKeyError
  • scrapfly.errors.PaymentRequired
  • scrapfly.errors.TooManyRequest
class ApiHttpServerError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ApiHttpServerError(ApiHttpClientError):
    """
    Server-originated HTTP error returned by the Scrapfly API.

    Specialization of ``ApiHttpClientError``; no additional behavior.
    """
    pass

Common base class for all non-exit exceptions.

Ancestors

class BrowserConfig (proxy_pool: str | ProxyPool | None = None,
os: str | OperatingSystem | None = None,
session: str | None = None,
country: str | None = None,
auto_close: bool | None = None,
timeout: int | None = None,
debug: bool | None = None,
extensions: List[str] | None = None,
block_images: bool | None = None,
block_styles: bool | None = None,
block_fonts: bool | None = None,
block_media: bool | None = None,
screenshot: bool | None = None,
resolution: str | None = None,
target_url: str | None = None,
cache: bool | None = None,
blacklist: bool | None = None,
unblock: bool | None = None,
unblock_timeout: int | None = None,
browser_brand: str | None = None,
byop_proxy: str | None = None)
Expand source code
class BrowserConfig(BaseApiConfig):
    """
    Configuration for a Scrapfly Cloud Browser session.

    Collects every connection option, renders them as websocket query
    parameters (:meth:`websocket_url`) and supports dict round-tripping
    via :meth:`to_dict` / :meth:`from_dict`.
    """

    CLOUD_BROWSER_HOST = 'wss://browser.scrapfly.io'

    # Every configuration attribute, in canonical serialization order.
    # Shared by websocket_url(), to_dict() and from_dict() so the three
    # stay consistent when a new option is added.
    _FIELDS = (
        'proxy_pool', 'os', 'session', 'country', 'auto_close', 'timeout',
        'debug', 'extensions', 'block_images', 'block_styles', 'block_fonts',
        'block_media', 'screenshot', 'resolution', 'target_url', 'cache',
        'blacklist', 'unblock', 'unblock_timeout', 'browser_brand',
        'byop_proxy',
    )

    # Boolean options that must go through _bool_to_http for the URL.
    _BOOL_FIELDS = frozenset((
        'auto_close', 'debug', 'block_images', 'block_styles', 'block_fonts',
        'block_media', 'screenshot', 'cache', 'blacklist', 'unblock',
    ))

    def __init__(
        self,
        proxy_pool: Optional[Union[str, ProxyPool]] = None,
        os: Optional[Union[str, OperatingSystem]] = None,
        session: Optional[str] = None,
        country: Optional[str] = None,
        auto_close: Optional[bool] = None,
        timeout: Optional[int] = None,
        debug: Optional[bool] = None,
        extensions: Optional[List[str]] = None,
        block_images: Optional[bool] = None,
        block_styles: Optional[bool] = None,
        block_fonts: Optional[bool] = None,
        block_media: Optional[bool] = None,
        screenshot: Optional[bool] = None,
        resolution: Optional[str] = None,
        target_url: Optional[str] = None,
        cache: Optional[bool] = None,
        blacklist: Optional[bool] = None,
        unblock: Optional[bool] = None,
        unblock_timeout: Optional[int] = None,
        browser_brand: Optional[str] = None,
        byop_proxy: Optional[str] = None,
    ):
        """
        Initialize a browser configuration.

        String values for ``proxy_pool``/``os`` are coerced to their enum
        types. All options are optional; unset (None) options are omitted
        from the websocket URL.

        Raises:
            ValueError: If ``timeout`` exceeds 1800 seconds (30 minutes).
        """
        if timeout is not None and timeout > 1800:
            raise ValueError('timeout cannot exceed 1800 seconds (30 minutes)')

        if proxy_pool is not None and isinstance(proxy_pool, str):
            proxy_pool = ProxyPool(proxy_pool)

        if os is not None and isinstance(os, str):
            os = OperatingSystem(os)

        self.proxy_pool = proxy_pool
        self.os = os
        self.session = session
        self.country = country
        self.auto_close = auto_close
        self.timeout = timeout
        self.debug = debug
        self.extensions = extensions
        self.block_images = block_images
        self.block_styles = block_styles
        self.block_fonts = block_fonts
        self.block_media = block_media
        self.screenshot = screenshot
        self.resolution = resolution
        self.target_url = target_url
        self.cache = cache
        self.blacklist = blacklist
        self.unblock = unblock
        self.unblock_timeout = unblock_timeout
        self.browser_brand = browser_brand
        # BYOP (Bring Your Own Proxy): full proxy URL
        # Format: {protocol}://{user}:{pass}@{host}:{port}
        # Supported protocols: http, https, socks5, socks5h, socks5+udp, socks5h+udp
        # The +udp variants enable HTTP/3 (QUIC) via SOCKS5 UDP ASSOCIATE — only
        # works with proxy providers that implement RFC 1928 UDP ASSOCIATE.
        # Requires a Custom plan subscription. See:
        # https://scrapfly.io/docs/cloud-browser-api/byop
        self.byop_proxy = byop_proxy

    def websocket_url(self, api_key: str, host: Optional[str] = None) -> str:
        """
        Build the websocket connection URL for the cloud browser.

        Args:
            api_key: Scrapfly API key (always included as a query parameter).
            host: Optional override for :data:`CLOUD_BROWSER_HOST`.

        Returns:
            Full websocket URL with every configured option URL-encoded.
        """
        params = {'api_key': api_key}
        for name in self._FIELDS:
            value = getattr(self, name)
            if name == 'extensions':
                # Comma-joined list; empty or unset lists are omitted entirely.
                if value:
                    params[name] = ','.join(value)
                continue
            if value is None:
                continue
            if name in ('proxy_pool', 'os'):
                # Enum members serialize as their raw value.
                value = value.value if isinstance(value, (ProxyPool, OperatingSystem)) else value
            elif name in self._BOOL_FIELDS:
                value = self._bool_to_http(value)
            params[name] = value

        base_host = host or self.CLOUD_BROWSER_HOST
        return base_host + '?' + urlencode(params)

    def to_dict(self) -> Dict:
        """Serialize the configuration; enum-typed fields are emitted as raw values."""
        data = {name: getattr(self, name) for name in self._FIELDS}
        if isinstance(data['proxy_pool'], ProxyPool):
            data['proxy_pool'] = data['proxy_pool'].value
        if isinstance(data['os'], OperatingSystem):
            data['os'] = data['os'].value
        return data

    @staticmethod
    def from_dict(browser_config_dict: Dict) -> 'BrowserConfig':
        """
        Recreate a :class:`BrowserConfig` from a :meth:`to_dict` payload.

        Unknown keys are ignored; missing keys default to None. The
        'proxy_pool' and 'os' values are coerced back to their enum types.
        """
        kwargs = {name: browser_config_dict.get(name, None) for name in BrowserConfig._FIELDS}
        if kwargs['proxy_pool'] is not None:
            kwargs['proxy_pool'] = ProxyPool(kwargs['proxy_pool'])
        if kwargs['os'] is not None:
            kwargs['os'] = OperatingSystem(kwargs['os'])
        return BrowserConfig(**kwargs)

Ancestors

Class variables

var CLOUD_BROWSER_HOST

Static methods

def from_dict(browser_config_dict: Dict) ‑> BrowserConfig
Expand source code
@staticmethod
def from_dict(browser_config_dict: Dict) -> 'BrowserConfig':
    """Build a BrowserConfig from a plain dict, restoring enum-typed fields."""
    raw_pool = browser_config_dict.get('proxy_pool', None)
    raw_os = browser_config_dict.get('os', None)

    # Remaining options pass through unchanged; absent keys default to None.
    plain_keys = (
        'session', 'country', 'auto_close', 'timeout', 'debug', 'extensions',
        'block_images', 'block_styles', 'block_fonts', 'block_media',
        'screenshot', 'resolution', 'target_url', 'cache', 'blacklist',
        'unblock', 'unblock_timeout', 'browser_brand', 'byop_proxy',
    )
    kwargs = {key: browser_config_dict.get(key, None) for key in plain_keys}

    return BrowserConfig(
        proxy_pool=ProxyPool(raw_pool) if raw_pool is not None else None,
        os=OperatingSystem(raw_os) if raw_os is not None else None,
        **kwargs,
    )

Methods

def to_dict(self) ‑> Dict
Expand source code
def to_dict(self) -> Dict:
    """Serialize every configuration attribute; enum fields become their raw values."""
    pool = self.proxy_pool
    if isinstance(pool, ProxyPool):
        pool = pool.value

    operating_system = self.os
    if isinstance(operating_system, OperatingSystem):
        operating_system = operating_system.value

    serialized = {'proxy_pool': pool, 'os': operating_system}
    for attr in (
        'session', 'country', 'auto_close', 'timeout', 'debug', 'extensions',
        'block_images', 'block_styles', 'block_fonts', 'block_media',
        'screenshot', 'resolution', 'target_url', 'cache', 'blacklist',
        'unblock', 'unblock_timeout', 'browser_brand', 'byop_proxy',
    ):
        serialized[attr] = getattr(self, attr)
    return serialized
def websocket_url(self, api_key: str, host: str | None = None) ‑> str
Expand source code
def websocket_url(self, api_key: str, host: Optional[str] = None) -> str:
    """Compose the cloud-browser websocket URL from the configured options."""
    query = {'api_key': api_key}

    pool = self.proxy_pool
    if pool is not None:
        query['proxy_pool'] = pool.value if isinstance(pool, ProxyPool) else pool

    operating_system = self.os
    if operating_system is not None:
        query['os'] = operating_system.value if isinstance(operating_system, OperatingSystem) else operating_system

    # Options serialized with the boolean-to-HTTP conversion.
    bool_options = {
        'auto_close', 'debug', 'block_images', 'block_styles', 'block_fonts',
        'block_media', 'screenshot', 'cache', 'blacklist', 'unblock',
    }
    # Remaining options in emission order.
    ordered = (
        'session', 'country', 'auto_close', 'timeout', 'debug', 'extensions',
        'block_images', 'block_styles', 'block_fonts', 'block_media',
        'screenshot', 'resolution', 'target_url', 'cache', 'blacklist',
        'unblock', 'unblock_timeout', 'browser_brand', 'byop_proxy',
    )
    for name in ordered:
        value = getattr(self, name)
        if name == 'extensions':
            # Comma-joined list; an empty list is treated the same as unset.
            if value:
                query['extensions'] = ','.join(value)
        elif value is not None:
            query[name] = self._bool_to_http(value) if name in bool_options else value

    return (host or self.CLOUD_BROWSER_HOST) + '?' + urlencode(query)
class Crawl (client: ScrapflyClient,
config: CrawlerConfig)
Expand source code
class Crawl:
    """
    High-level abstraction for managing a crawler job

    The Crawl object maintains the state of a crawler job and provides
    convenient methods for managing its lifecycle.

    Example:
        ```python
        from scrapfly import ScrapflyClient, CrawlerConfig, Crawl

        client = ScrapflyClient(key='your-key')
        config = CrawlerConfig(url='https://example.com', page_limit=10)

        # Create and start crawl
        crawl = Crawl(client, config)
        crawl.crawl()  # Start the crawler

        # Wait for completion
        crawl.wait()

        # Get results
        pages = crawl.warc().get_pages()
        for page in pages:
            print(f"{page['url']}: {page['status_code']}")

        # Or read specific URLs
        html = crawl.read('https://example.com/page1', format='html')
        ```
    """

    def __init__(self, client: 'ScrapflyClient', config: CrawlerConfig):
        """
        Initialize a Crawl object

        Args:
            client: ScrapflyClient instance used for all crawler API calls
            config: CrawlerConfig with crawler settings
        """
        self._client = client
        self._config = config
        # Job UUID assigned by crawl(); None until the job is started.
        self._uuid: Optional[str] = None
        # Last status fetched from the API (reused by status(refresh=False)).
        self._status_cache: Optional[CrawlerStatusResponse] = None
        # Downloaded artifact, cached by warc().
        self._artifact_cache: Optional[CrawlerArtifactResponse] = None

    @property
    def uuid(self) -> Optional[str]:
        """UUID of the crawler job, or ``None`` when the job has not been started."""
        return self._uuid

    @property
    def started(self) -> bool:
        """Whether ``crawl()`` has been called and a job UUID was assigned."""
        return self.uuid is not None

    def crawl(self) -> 'Crawl':
        """
        Start the crawler job

        Returns:
            Self for method chaining

        Raises:
            ScrapflyCrawlerError: If the crawler was already started
                (code='ALREADY_STARTED').

        Example:
            ```python
            crawl = Crawl(client, config)
            crawl.crawl()  # Start crawling
            ```
        """
        if self._uuid is not None:
            raise ScrapflyCrawlerError(
                message="Crawler already started",
                code="ALREADY_STARTED",
                http_status_code=400
            )

        response = self._client.start_crawl(self._config)
        self._uuid = response.uuid
        return self

    def status(self, refresh: bool = True) -> CrawlerStatusResponse:
        """
        Get current crawler status

        Args:
            refresh: If True, fetch fresh status from API. If False, return
                the cached status (the API is still queried once when no
                status has been cached yet).

        Returns:
            CrawlerStatusResponse with current status

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet
                (code='NOT_STARTED').

        Example:
            ```python
            status = crawl.status()
            print(f"Progress: {status.progress_pct}%")
            print(f"URLs visited: {status.state.urls_visited}")
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        if refresh or self._status_cache is None:
            self._status_cache = self._client.get_crawl_status(self._uuid)

        return self._status_cache

    def wait(
        self,
        poll_interval: int = 5,
        max_wait: Optional[int] = None,
        verbose: bool = False,
        allow_cancelled: bool = False,
    ) -> 'Crawl':
        """
        Wait for crawler to complete

        Polls the status endpoint until the crawler finishes.

        Args:
            poll_interval: Seconds between status checks (default: 5)
            max_wait: Maximum seconds to wait (None = wait forever)
            verbose: If True, print progress updates
            allow_cancelled: If True, return normally when the crawler reaches
                CANCELLED instead of raising. Useful for the cancel-then-wait
                pattern where the caller already knows they triggered the
                cancellation. Defaults to False (raises ScrapflyCrawlerError
                with code='CANCELLED' on user_cancelled), preserving prior
                behavior for callers that observe external cancellations.

        Returns:
            Self for method chaining

        Raises:
            ScrapflyCrawlerError: If crawler not started, failed, or timed out.
                Also raised on cancellation when ``allow_cancelled=False``.

        Example:
            ```python
            # Wait with progress updates
            crawl.crawl().wait(verbose=True)

            # Wait with timeout
            crawl.crawl().wait(max_wait=300)  # 5 minutes max

            # Cancel from the same call site, then wait without re-raising
            crawl.cancel()
            crawl.wait(allow_cancelled=True)
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        start_time = time.time()
        poll_count = 0

        while True:
            status = self.status(refresh=True)
            poll_count += 1

            if verbose:
                # Lazy %-style args: the message is only formatted when the
                # INFO level is actually enabled.
                logger.info(
                    "Poll #%d: %s - %.1f%% - %s/%s URLs",
                    poll_count, status.status, status.progress_pct,
                    status.state.urls_visited, status.state.urls_extracted,
                )

            if status.is_complete:
                if verbose:
                    logger.info("✓ Crawler completed successfully!")
                return self
            if status.is_failed:
                raise ScrapflyCrawlerError(
                    message=f"Crawler failed with status: {status.status}",
                    code="FAILED",
                    http_status_code=400
                )
            if status.is_cancelled:
                if allow_cancelled:
                    if verbose:
                        logger.info("Crawler was cancelled (allow_cancelled=True)")
                    return self
                raise ScrapflyCrawlerError(
                    message="Crawler was cancelled",
                    code="CANCELLED",
                    http_status_code=400
                )

            # Enforce max_wait before sleeping so a timeout is reported
            # promptly rather than after one extra poll_interval.
            if max_wait is not None and time.time() - start_time > max_wait:
                raise ScrapflyCrawlerError(
                    message=f"Timeout waiting for crawler (>{max_wait}s)",
                    code="TIMEOUT",
                    http_status_code=400
                )

            time.sleep(poll_interval)

    def cancel(self) -> bool:
        """
        Cancel the running crawler job

        Returns:
            True if cancelled successfully

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet

        Example:
            ```python
            # Start a crawl
            crawl = Crawl(client, config).crawl()

            # Cancel it
            crawl.cancel()
            ```
        """
        if not self.started:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )
        return self._client.cancel_crawl(self._uuid)

    def urls(
        self,
        status: Optional[Literal['visited', 'pending', 'failed']] = None,
        page: int = 1,
        per_page: int = 100,
    ) -> CrawlerUrlsResponse:
        """
        List crawled URLs with pagination, optionally filtered by status.

        NEW in 0.8.28 — convenience wrapper around
        :meth:`ScrapflyClient.get_crawl_urls` that supplies this crawl's UUID.

        Args:
            status: Filter by URL status — 'visited', 'pending', or 'failed'.
                When None, the server defaults to 'visited'.
            page: 1-based page number (default 1)
            per_page: Page size (default 100, max 1000)

        Returns:
            CrawlerUrlsResponse with the URL records, total count and pagination metadata.

        Raises:
            ScrapflyCrawlerError: if the crawler has not been started yet.

        Example:
            ```python
            crawl = Crawl(client, config).crawl().wait()
            for entry in crawl.urls(status='visited'):
                print(f"{entry.url} ({entry.status})")
            ```
        """
        if not self.started:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400,
            )
        return self._client.get_crawl_urls(
            uuid=self._uuid,
            status=status,
            page=page,
            per_page=per_page,
        )

    def warc(self, artifact_type: str = 'warc') -> CrawlerArtifactResponse:
        """
        Download the crawler artifact (WARC file by default)

        The default 'warc' artifact is cached after the first download;
        any other ``artifact_type`` is fetched fresh on every call.

        Args:
            artifact_type: Type of artifact to download (default: 'warc')

        Returns:
            CrawlerArtifactResponse with parsed WARC data

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet
                (code='NOT_STARTED')

        Example:
            ```python
            # Get WARC artifact
            artifact = crawl.warc()

            # Get all pages
            pages = artifact.get_pages()

            # Iterate through responses
            for record in artifact.iter_responses():
                print(record.url)
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        # Bug fix: the single cache slot previously stored whichever type was
        # fetched first, so a later call with a different artifact_type could
        # be served the wrong artifact. Only the default 'warc' type is cached.
        if artifact_type != 'warc':
            return self._client.get_crawl_artifact(
                self._uuid,
                artifact_type=artifact_type
            )

        if self._artifact_cache is None:
            self._artifact_cache = self._client.get_crawl_artifact(
                self._uuid,
                artifact_type=artifact_type
            )

        return self._artifact_cache

    def har(self) -> CrawlerArtifactResponse:
        """
        Download the crawler artifact in HAR (HTTP Archive) format

        Unlike :meth:`warc`, the result is not cached — each call fetches
        the artifact again.

        Returns:
            CrawlerArtifactResponse with parsed HAR data

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet
                (code='NOT_STARTED')

        Example:
            ```python
            # Get HAR artifact
            artifact = crawl.har()

            # Get all pages
            pages = artifact.get_pages()

            # Iterate through HAR entries
            for entry in artifact.iter_responses():
                print(f"{entry.url}: {entry.status_code}")
                print(f"Timing: {entry.time}ms")
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        return self._client.get_crawl_artifact(
            self._uuid,
            artifact_type='har'
        )

    def read(self, url: str, format: ContentFormat = 'html') -> Optional[CrawlContent]:
        """
        Read content from a specific URL in the crawl results

        Args:
            url: The URL to retrieve content for
            format: Content format - 'html', 'markdown', 'text', 'clean_html', 'json',
                   'extracted_data', 'page_metadata'

        Returns:
            CrawlContent object with content and metadata, or None if the URL is
            not found (or, for non-HTML formats, if the contents API call fails)

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet

        Example:
            ```python
            # Get HTML content for a specific URL
            content = crawl.read('https://example.com/page1')
            if content:
                print(f"URL: {content.url}")
                print(f"Status: {content.status_code}")
                print(f"Duration: {content.duration}s")
                print(content.content)

            # Get markdown content
            content = crawl.read('https://example.com/page1', format='markdown')
            if content:
                print(content.content)

            # Check if URL was crawled
            if crawl.read('https://example.com/missing') is None:
                print("URL not found in crawl results")
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        # HTML is available straight from the WARC artifact, which avoids an
        # extra contents-API round trip.
        if format == 'html':
            for record in self.warc().iter_responses():
                if record.url != url:
                    continue
                # Scrape metadata travels in WARC-* extension headers.
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                return CrawlContent(
                    url=record.url,
                    content=record.content.decode('utf-8', errors='replace'),
                    status_code=record.status_code,
                    headers=record.headers,
                    duration=float(duration_str) if duration_str else None,
                    log_id=warc_headers.get('WARC-Scrape-Log-Id'),
                    country=warc_headers.get('WARC-Scrape-Country'),
                    crawl_uuid=self._uuid
                )
            return None

        # Other formats (markdown, text, ...) go through the contents API.
        # An API failure is deliberately treated the same as "URL not found".
        try:
            result = self._client.get_crawl_contents(self._uuid, format=format)
            # Response shape: {"contents": {url: {format: content, ...}, ...}, "links": {...}}
            content_data = result.get('contents', {}).get(url)
            content_str = content_data.get(format) if content_data else None
        except Exception:
            return None

        if not content_str:
            return None

        # The contents API carries no scrape metadata; best-effort enrich from
        # the WARC artifact, keeping sensible defaults when that fails too.
        status_code = 200
        headers = {}
        duration = None
        log_id = None
        country = None
        try:
            for record in self.warc().iter_responses():
                if record.url == url:
                    status_code = record.status_code
                    headers = record.headers
                    warc_headers = record.warc_headers or {}
                    duration_str = warc_headers.get('WARC-Scrape-Duration')
                    duration = float(duration_str) if duration_str else None
                    log_id = warc_headers.get('WARC-Scrape-Log-Id')
                    country = warc_headers.get('WARC-Scrape-Country')
                    break
        except Exception:  # was a bare `except:` — never mask SystemExit/KeyboardInterrupt
            pass

        return CrawlContent(
            url=url,
            content=content_str,
            status_code=status_code,
            headers=headers,
            duration=duration,
            log_id=log_id,
            country=country,
            crawl_uuid=self._uuid
        )

    def read_iter(
        self,
        pattern: str,
        format: ContentFormat = 'html'
    ) -> Iterator[CrawlContent]:
        """
        Iterate through URLs matching a pattern and yield their content

        Supports wildcard patterns using * and ? for flexible URL matching.

        Args:
            pattern: URL pattern with wildcards (* matches any characters, ? matches one)
                    Examples: "/products?page=*", "https://example.com/*/detail", "*/product/*"
            format: Content format to retrieve

        Yields:
            CrawlContent objects for each matching URL

        Raises:
            ScrapflyCrawlerError: If the crawler has not been started yet

        Example:
            ```python
            # Get all product pages in markdown
            for content in crawl.read_iter(pattern="*/products?page=*", format="markdown"):
                print(f"{content.url}: {len(content.content)} chars")
                print(f"Duration: {content.duration}s")

            # Get all detail pages
            for content in crawl.read_iter(pattern="*/detail/*"):
                process(content.content)

            # Pattern matching examples:
            # "/products?page=*" matches /products?page=1, /products?page=2, etc.
            # "*/product/*" matches any URL with /product/ in the path
            # "https://example.com/page?" matches https://example.com/page1, page2, etc.
            ```
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        # HTML comes straight from the WARC artifact (no contents-API call).
        if format == 'html':
            for record in self.warc().iter_responses():
                if not fnmatch.fnmatch(record.url, pattern):
                    continue
                # Scrape metadata travels in WARC-* extension headers.
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                yield CrawlContent(
                    url=record.url,
                    content=record.content.decode('utf-8', errors='replace'),
                    status_code=record.status_code,
                    headers=record.headers,
                    duration=float(duration_str) if duration_str else None,
                    log_id=warc_headers.get('WARC-Scrape-Log-Id'),
                    country=warc_headers.get('WARC-Scrape-Country'),
                    crawl_uuid=self._uuid
                )
            return

        # Other formats use the contents API. Only the API call itself is
        # guarded: a failed call yields nothing, but errors raised while the
        # caller consumes the generator are no longer silently swallowed.
        try:
            result = self._client.get_crawl_contents(self._uuid, format=format)
        except Exception:
            return

        contents = result.get('contents', {})

        # Best-effort per-URL metadata (status, headers, timing) from WARC.
        metadata_cache = {}
        try:
            for record in self.warc().iter_responses():
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                metadata_cache[record.url] = {
                    'status_code': record.status_code,
                    'headers': record.headers,
                    'duration': float(duration_str) if duration_str else None,
                    'log_id': warc_headers.get('WARC-Scrape-Log-Id'),
                    'country': warc_headers.get('WARC-Scrape-Country')
                }
        except Exception:  # was a bare `except:` — never mask SystemExit/KeyboardInterrupt
            pass

        for url, content_data in contents.items():
            if not fnmatch.fnmatch(url, pattern):
                continue
            # content_data maps format name -> content string
            content = content_data.get(format)
            if not content:
                continue
            metadata = metadata_cache.get(url, {})
            yield CrawlContent(
                url=url,
                content=content,
                status_code=metadata.get('status_code', 200),
                headers=metadata.get('headers', {}),
                duration=metadata.get('duration'),
                log_id=metadata.get('log_id'),
                country=metadata.get('country'),
                crawl_uuid=self._uuid
            )

    def read_batch(
        self,
        urls: List[str],
        formats: Optional[List[ContentFormat]] = None
    ) -> Dict[str, Dict[str, str]]:
        """
        Retrieve content for multiple URLs in a single batch request

        This is more efficient than calling read() multiple times as it retrieves
        all content in a single API call. Maximum 100 URLs per request.

        Args:
            urls: List of URLs to retrieve (max 100)
            formats: List of content formats to retrieve (e.g., ['markdown', 'text'])
                    If None, defaults to ['html']

        Returns:
            Dictionary mapping URLs to their content in requested formats:
            {
                'https://example.com/page1': {
                    'markdown': '# Page 1...',
                    'text': 'Page 1...'
                },
                'https://example.com/page2': {
                    'markdown': '# Page 2...',
                    'text': 'Page 2...'
                }
            }

        Example:
            ```python
            # Get markdown and text for multiple URLs
            urls = ['https://example.com/page1', 'https://example.com/page2']
            contents = crawl.read_batch(urls, formats=['markdown', 'text'])

            for url, formats in contents.items():
                markdown = formats.get('markdown', '')
                text = formats.get('text', '')
                print(f"{url}: {len(markdown)} chars markdown, {len(text)} chars text")
            ```

        Raises:
            ValueError: If more than 100 URLs are provided
            ScrapflyCrawlerError: If crawler not started or request fails
        """
        if self._uuid is None:
            raise ScrapflyCrawlerError(
                message="Crawler not started yet. Call crawl() first.",
                code="NOT_STARTED",
                http_status_code=400
            )

        if len(urls) > 100:
            raise ValueError("Maximum 100 URLs per batch request")

        if not urls:
            return {}

        # Default to html if no formats specified
        if formats is None:
            formats = ['html']

        # The batch endpoint takes formats as a comma-separated query parameter
        # and the URL list as a newline-separated plain-text body.
        url = f"{self._client.host}/crawl/{self._uuid}/contents/batch"
        params = {
            'key': self._client.key,
            'formats': ','.join(formats)
        }
        body = '\n'.join(urls)

        import requests
        response = requests.post(
            url,
            params=params,
            data=body.encode('utf-8'),
            headers={'Content-Type': 'text/plain'},
            verify=self._client.verify
        )

        if response.status_code != 200:
            raise ScrapflyCrawlerError(
                message=f"Batch content request failed: {response.status_code}",
                code="BATCH_REQUEST_FAILED",
                http_status_code=response.status_code
            )

        # The endpoint answers with a multipart/related document: one part per
        # (URL, format) pair, identified by Content-Location + Content-Type.
        content_type = response.headers.get('Content-Type', '')
        if not content_type.startswith('multipart/related'):
            raise ScrapflyCrawlerError(
                message=f"Unexpected content type: {content_type}",
                code="INVALID_RESPONSE",
                http_status_code=500
            )

        # Sanity-check that a boundary is present before handing the payload to
        # the email parser (which performs the actual boundary splitting).
        boundary = None
        for part in content_type.split(';'):
            part = part.strip()
            if part.startswith('boundary='):
                boundary = part.split('=', 1)[1]
                break

        if not boundary:
            raise ScrapflyCrawlerError(
                message="No boundary found in multipart response",
                code="INVALID_RESPONSE",
                http_status_code=500
            )

        # Prepend the Content-Type header so the body parses as a valid
        # RFC 2045 message.
        message_bytes = f"Content-Type: {content_type}\r\n\r\n".encode('utf-8') + response.content
        message = BytesParser(policy=default).parsebytes(message_bytes)

        result = {}

        for part in message.walk():
            # Skip the multipart container itself.
            if part.get_content_maintype() == 'multipart':
                continue

            # Each part names its URL via Content-Location.
            content_location = part.get('Content-Location')
            if not content_location:
                continue

            # Map the part's MIME type back to a format name.
            part_content_type = part.get_content_type()
            if 'markdown' in part_content_type:
                format_type = 'markdown'
            elif 'plain' in part_content_type:
                format_type = 'text'
            elif 'html' in part_content_type:
                format_type = 'html'
            elif 'json' in part_content_type:
                format_type = 'json'
            else:
                # Unrecognized MIME type — skip this part.
                continue

            content = part.get_content()
            if isinstance(content, bytes):
                content = content.decode('utf-8', errors='replace')

            result.setdefault(content_location, {})[format_type] = content

        return result

    def stats(self) -> Dict[str, Any]:
        """
        Build a dictionary of crawl statistics from the cached status and,
        when present, the cached artifact.

        Returns:
            Dictionary with crawl counters, progress flags, and (if an
            artifact has been downloaded) page-size aggregates.

        Example:
            ```python
            stats = crawl.stats()
            print(f"URLs extracted: {stats['urls_extracted']}")
            print(f"URLs visited: {stats['urls_visited']}")
            print(f"Crawl rate: {stats['crawl_rate']:.1f}%")
            print(f"Total size: {stats['total_size_kb']:.2f} KB")
            ```
        """
        status = self.status(refresh=False)
        state = status.state
        extracted = state.urls_extracted

        # Counters and flags straight from the status payload — field names
        # follow the scrape-engine wire format.
        report = {
            'uuid': self._uuid,
            'status': status.status,
            'urls_extracted': extracted,
            'urls_visited': state.urls_visited,
            'urls_to_crawl': state.urls_to_crawl,
            'urls_failed': state.urls_failed,
            'urls_skipped': state.urls_skipped,
            'progress_pct': status.progress_pct,
            'is_complete': status.is_complete,
            'is_running': status.is_running,
            'is_failed': status.is_failed,
        }

        # Ratio of visited to extracted URLs, as a percentage.
        if extracted > 0:
            report['crawl_rate'] = (state.urls_visited / extracted) * 100

        # Size aggregates are only possible once an artifact has been cached.
        if self._artifact_cache is not None:
            pages = self._artifact_cache.get_pages()
            total_bytes = sum(len(page['content']) for page in pages)
            mean_bytes = total_bytes / len(pages) if pages else 0

            report['pages_downloaded'] = len(pages)
            report['total_size_bytes'] = total_bytes
            report['total_size_kb'] = total_bytes / 1024
            report['total_size_mb'] = total_bytes / (1024 * 1024)
            report['avg_page_size_bytes'] = mean_bytes
            report['avg_page_size_kb'] = mean_bytes / 1024

            # Ratio of downloaded pages to extracted URLs, as a percentage.
            if extracted > 0:
                report['download_rate'] = (len(pages) / extracted) * 100

        return report

    def __repr__(self):
        """Compact representation: target URL plus UUID/status once started."""
        target = self._config._params['url']

        # Before crawl() there is no UUID or status to report.
        if self._uuid is None:
            return f"Crawl(not started, url={target})"

        state = self._status_cache.status if self._status_cache else "unknown"
        return f"Crawl(uuid={self._uuid}, url={target}, status={state})"

High-level abstraction for managing a crawler job

The Crawl object maintains the state of a crawler job and provides convenient methods for managing its lifecycle.

Example

from scrapfly import ScrapflyClient, CrawlerConfig, Crawl

client = ScrapflyClient(key='your-key')
config = CrawlerConfig(url='https://example.com', page_limit=10)

# Create and start crawl
crawl = Crawl(client, config)
crawl.crawl()  # Start the crawler

# Wait for completion
crawl.wait()

# Get results
pages = crawl.warc().get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")

# Or read specific URLs
html = crawl.read('https://example.com/page1', format='html')

Initialize a Crawl object

Args

client
ScrapflyClient instance
config
CrawlerConfig with crawler settings

Instance variables

prop started : bool
Expand source code
@property
def started(self) -> bool:
    """Check if the crawler has been started (True once a job UUID exists)"""
    return self._uuid is not None

Check if the crawler has been started

prop uuid : str | None
Expand source code
@property
def uuid(self) -> Optional[str]:
    """Get the crawler job UUID (None if the crawler has not been started)"""
    return self._uuid

Get the crawler job UUID (None if not started)

Methods

def cancel(self) ‑> bool
Expand source code
def cancel(self) -> bool:
    """
    Cancel the crawler job that is currently running.

    Returns:
        True if cancelled successfully

    Raises:
        ScrapflyCrawlerError: If the crawler was never started

    Example:
        ```python
        # Start a crawl
        crawl = Crawl(client, config).crawl()

        # Cancel it
        crawl.cancel()
        ```
    """
    # Cancellation needs a job UUID, which only exists after crawl().
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            code="NOT_STARTED",
            message="Crawler not started yet. Call crawl() first.",
            http_status_code=400,
        )

    return self._client.cancel_crawl(self._uuid)

Cancel the running crawler job

Returns

True if cancelled successfully

Raises

ScrapflyCrawlerError
If crawler not started yet

Example

# Start a crawl
crawl = Crawl(client, config).crawl()

# Cancel it
crawl.cancel()
def crawl(self) ‑> Crawl
Expand source code
def crawl(self) -> 'Crawl':
    """
    Start the crawler job

    Returns:
        Self for method chaining

    Raises:
        ScrapflyCrawlerError: If the crawler has already been started

    Example:
        ```python
        crawl = Crawl(client, config)
        crawl.crawl()  # Start crawling
        ```
    """
    # A non-None UUID means a job was already submitted; starting twice
    # would orphan the first job, so it is rejected.
    if self._uuid is not None:
        raise ScrapflyCrawlerError(
            message="Crawler already started",
            code="ALREADY_STARTED",
            http_status_code=400
        )

    response = self._client.start_crawl(self._config)
    self._uuid = response.uuid
    return self

Start the crawler job

Returns

Self for method chaining

Raises

ScrapflyCrawlerError
If crawler already started

Example

crawl = Crawl(client, config)
crawl.crawl()  # Start crawling
def har(self) ‑> CrawlerArtifactResponse
Expand source code
def har(self) -> CrawlerArtifactResponse:
    """
    Download the crawler artifact in HAR (HTTP Archive) format

    Returns:
        CrawlerArtifactResponse with parsed HAR data

    Raises:
        ScrapflyCrawlerError: If the crawler has not been started yet
            (``crawl()`` was never called)

    Example:
        ```python
        # Get HAR artifact
        artifact = crawl.har()

        # Get all pages
        pages = artifact.get_pages()

        # Iterate through HAR entries
        for entry in artifact.iter_responses():
            print(f"{entry.url}: {entry.status_code}")
            print(f"Timing: {entry.time}ms")
        ```
    """
    # Artifact downloads need a job UUID, which only exists after crawl().
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    return self._client.get_crawl_artifact(
        self._uuid,
        artifact_type='har'
    )

Download the crawler artifact in HAR (HTTP Archive) format

Returns

CrawlerArtifactResponse with parsed HAR data

Raises

ScrapflyCrawlerError
If crawler not started yet

Example

# Get HAR artifact
artifact = crawl.har()

# Get all pages
pages = artifact.get_pages()

# Iterate through HAR entries
for entry in artifact.iter_responses():
    print(f"{entry.url}: {entry.status_code}")
    print(f"Timing: {entry.time}ms")
def read(self,
url: str,
format: Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata'] = 'html') ‑> CrawlContent | None
Expand source code
def read(self, url: str, format: ContentFormat = 'html') -> Optional[CrawlContent]:
    """
    Read content from a specific URL in the crawl results

    Args:
        url: The URL to retrieve content for
        format: Content format - 'html', 'markdown', 'text', 'clean_html', 'json',
               'extracted_data', 'page_metadata'

    Returns:
        CrawlContent object with content and metadata, or None if the URL is
        not found (or, for non-HTML formats, if the contents API call fails)

    Raises:
        ScrapflyCrawlerError: If the crawler has not been started yet

    Example:
        ```python
        # Get HTML content for a specific URL
        content = crawl.read('https://example.com/page1')
        if content:
            print(f"URL: {content.url}")
            print(f"Status: {content.status_code}")
            print(f"Duration: {content.duration}s")
            print(content.content)

        # Get markdown content
        content = crawl.read('https://example.com/page1', format='markdown')
        if content:
            print(content.content)

        # Check if URL was crawled
        if crawl.read('https://example.com/missing') is None:
            print("URL not found in crawl results")
        ```
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    # HTML is available straight from the WARC artifact, which avoids an
    # extra contents-API round trip.
    if format == 'html':
        for record in self.warc().iter_responses():
            if record.url != url:
                continue
            # Scrape metadata travels in WARC-* extension headers.
            warc_headers = record.warc_headers or {}
            duration_str = warc_headers.get('WARC-Scrape-Duration')
            return CrawlContent(
                url=record.url,
                content=record.content.decode('utf-8', errors='replace'),
                status_code=record.status_code,
                headers=record.headers,
                duration=float(duration_str) if duration_str else None,
                log_id=warc_headers.get('WARC-Scrape-Log-Id'),
                country=warc_headers.get('WARC-Scrape-Country'),
                crawl_uuid=self._uuid
            )
        return None

    # Other formats (markdown, text, ...) go through the contents API.
    # An API failure is deliberately treated the same as "URL not found".
    try:
        result = self._client.get_crawl_contents(self._uuid, format=format)
        # Response shape: {"contents": {url: {format: content, ...}, ...}, "links": {...}}
        content_data = result.get('contents', {}).get(url)
        content_str = content_data.get(format) if content_data else None
    except Exception:
        return None

    if not content_str:
        return None

    # The contents API carries no scrape metadata; best-effort enrich from
    # the WARC artifact, keeping sensible defaults when that fails too.
    status_code = 200
    headers = {}
    duration = None
    log_id = None
    country = None
    try:
        for record in self.warc().iter_responses():
            if record.url == url:
                status_code = record.status_code
                headers = record.headers
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                duration = float(duration_str) if duration_str else None
                log_id = warc_headers.get('WARC-Scrape-Log-Id')
                country = warc_headers.get('WARC-Scrape-Country')
                break
    except Exception:  # was a bare `except:` — never mask SystemExit/KeyboardInterrupt
        pass

    return CrawlContent(
        url=url,
        content=content_str,
        status_code=status_code,
        headers=headers,
        duration=duration,
        log_id=log_id,
        country=country,
        crawl_uuid=self._uuid
    )

Read content from a specific URL in the crawl results

Args

url
The URL to retrieve content for
format
Content format - 'html', 'markdown', 'text', 'clean_html', 'json', 'extracted_data', 'page_metadata'

Returns

CrawlContent object with content and metadata, or None if URL not found

Example

# Get HTML content for a specific URL
content = crawl.read('https://example.com/page1')
if content:
    print(f"URL: {content.url}")
    print(f"Status: {content.status_code}")
    print(f"Duration: {content.duration}s")
    print(content.content)

# Get markdown content
content = crawl.read('https://example.com/page1', format='markdown')
if content:
    print(content.content)

# Check if URL was crawled
if crawl.read('https://example.com/missing') is None:
    print("URL not found in crawl results")
def read_batch(self,
urls: List[str],
formats: List[Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata']] = None) ‑> Dict[str, Dict[str, str]]
Expand source code
def read_batch(
    self,
    urls: List[str],
    formats: Optional[List[ContentFormat]] = None
) -> Dict[str, Dict[str, str]]:
    """
    Retrieve content for multiple URLs in a single batch request

    This is more efficient than calling read() multiple times as it retrieves
    all content in a single API call. Maximum 100 URLs per request.

    Args:
        urls: List of URLs to retrieve (max 100)
        formats: List of content formats to retrieve (e.g., ['markdown', 'text'])
                If None, defaults to ['html']

    Returns:
        Dictionary mapping URLs to their content in requested formats:
        {
            'https://example.com/page1': {
                'markdown': '# Page 1...',
                'text': 'Page 1...'
            },
            'https://example.com/page2': {
                'markdown': '# Page 2...',
                'text': 'Page 2...'
            }
        }

    Example:
        ```python
        # Get markdown and text for multiple URLs
        urls = ['https://example.com/page1', 'https://example.com/page2']
        contents = crawl.read_batch(urls, formats=['markdown', 'text'])

        for url, formats in contents.items():
            markdown = formats.get('markdown', '')
            text = formats.get('text', '')
            print(f"{url}: {len(markdown)} chars markdown, {len(text)} chars text")
        ```

    Raises:
        ValueError: If more than 100 URLs are provided
        ScrapflyCrawlerError: If crawler not started or request fails
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    if len(urls) > 100:
        raise ValueError("Maximum 100 URLs per batch request")

    if not urls:
        return {}

    # Default to html if no formats specified
    if formats is None:
        formats = ['html']

    # The batch endpoint takes formats as a comma-separated query parameter
    # and the URL list as a newline-separated plain-text body.
    url = f"{self._client.host}/crawl/{self._uuid}/contents/batch"
    params = {
        'key': self._client.key,
        'formats': ','.join(formats)
    }
    body = '\n'.join(urls)

    import requests
    response = requests.post(
        url,
        params=params,
        data=body.encode('utf-8'),
        headers={'Content-Type': 'text/plain'},
        verify=self._client.verify
    )

    if response.status_code != 200:
        raise ScrapflyCrawlerError(
            message=f"Batch content request failed: {response.status_code}",
            code="BATCH_REQUEST_FAILED",
            http_status_code=response.status_code
        )

    # The endpoint answers with a multipart/related document: one part per
    # (URL, format) pair, identified by Content-Location + Content-Type.
    content_type = response.headers.get('Content-Type', '')
    if not content_type.startswith('multipart/related'):
        raise ScrapflyCrawlerError(
            message=f"Unexpected content type: {content_type}",
            code="INVALID_RESPONSE",
            http_status_code=500
        )

    # Sanity-check that a boundary is present before handing the payload to
    # the email parser (which performs the actual boundary splitting).
    boundary = None
    for part in content_type.split(';'):
        part = part.strip()
        if part.startswith('boundary='):
            boundary = part.split('=', 1)[1]
            break

    if not boundary:
        raise ScrapflyCrawlerError(
            message="No boundary found in multipart response",
            code="INVALID_RESPONSE",
            http_status_code=500
        )

    # Prepend the Content-Type header so the body parses as a valid
    # RFC 2045 message.
    message_bytes = f"Content-Type: {content_type}\r\n\r\n".encode('utf-8') + response.content
    message = BytesParser(policy=default).parsebytes(message_bytes)

    result = {}

    for part in message.walk():
        # Skip the multipart container itself.
        if part.get_content_maintype() == 'multipart':
            continue

        # Each part names its URL via Content-Location.
        content_location = part.get('Content-Location')
        if not content_location:
            continue

        # Map the part's MIME type back to a format name.
        part_content_type = part.get_content_type()
        if 'markdown' in part_content_type:
            format_type = 'markdown'
        elif 'plain' in part_content_type:
            format_type = 'text'
        elif 'html' in part_content_type:
            format_type = 'html'
        elif 'json' in part_content_type:
            format_type = 'json'
        else:
            # Unrecognized MIME type — skip this part.
            continue

        content = part.get_content()
        if isinstance(content, bytes):
            content = content.decode('utf-8', errors='replace')

        result.setdefault(content_location, {})[format_type] = content

    return result

Retrieve content for multiple URLs in a single batch request

This is more efficient than calling read() multiple times as it retrieves all content in a single API call. Maximum 100 URLs per request.

Args

urls
List of URLs to retrieve (max 100)
formats
List of content formats to retrieve (e.g., ['markdown', 'text']) If None, defaults to ['html']

Returns

Dictionary mapping URLs to their content in requested formats: { 'https://example.com/page1': { 'markdown': '# Page 1…', 'text': 'Page 1…' }, 'https://example.com/page2': { 'markdown': '# Page 2…', 'text': 'Page 2…' } }

Example

# Get markdown and text for multiple URLs
urls = ['https://example.com/page1', 'https://example.com/page2']
contents = crawl.read_batch(urls, formats=['markdown', 'text'])

for url, formats in contents.items():
    markdown = formats.get('markdown', '')
    text = formats.get('text', '')
    print(f"{url}: {len(markdown)} chars markdown, {len(text)} chars text")

Raises

ValueError
If more than 100 URLs are provided
ScrapflyCrawlerError
If crawler not started or request fails
def read_iter(self,
pattern: str,
format: Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata'] = 'html') ‑> Iterator[CrawlContent]
Expand source code
def read_iter(
    self,
    pattern: str,
    format: ContentFormat = 'html'
) -> Iterator[CrawlContent]:
    """
    Iterate through URLs matching a pattern and yield their content

    Supports wildcard patterns using * and ? for flexible URL matching.

    Args:
        pattern: URL pattern with wildcards (* matches any characters, ? matches one)
                Examples: "/products?page=*", "https://example.com/*/detail", "*/product/*"
        format: Content format to retrieve

    Yields:
        CrawlContent objects for each matching URL

    Raises:
        ScrapflyCrawlerError: If the crawler has not been started yet.

    Example:
        ```python
        # Get all product pages in markdown
        for content in crawl.read_iter(pattern="*/products?page=*", format="markdown"):
            print(f"{content.url}: {len(content.content)} chars")
            print(f"Duration: {content.duration}s")

        # Get all detail pages
        for content in crawl.read_iter(pattern="*/detail/*"):
            process(content.content)

        # Pattern matching examples:
        # "/products?page=*" matches /products?page=1, /products?page=2, etc.
        # "*/product/*" matches any URL with /product/ in the path
        # "https://example.com/page?" matches https://example.com/page1, page2, etc.
        ```
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    # For HTML format, use WARC artifact (faster)
    if format == 'html':
        artifact = self.warc()
        for record in artifact.iter_responses():
            if fnmatch.fnmatch(record.url, pattern):
                # Extract metadata from WARC headers
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                duration = float(duration_str) if duration_str else None

                yield CrawlContent(
                    url=record.url,
                    content=record.content.decode('utf-8', errors='replace'),
                    status_code=record.status_code,
                    headers=record.headers,
                    duration=duration,
                    log_id=warc_headers.get('WARC-Scrape-Log-Id'),
                    country=warc_headers.get('WARC-Scrape-Country'),
                    crawl_uuid=self._uuid
                )
    else:
        # For other formats, use contents API.
        # Only the API call itself is best-effort: previously the whole
        # matching/yield loop sat inside this try, so any error raised
        # while iterating or yielding was silently swallowed and the
        # generator just ended early. Now such errors propagate.
        try:
            result = self._client.get_crawl_contents(
                self._uuid,
                format=format
            )
        except Exception:
            # If contents API fails, yield nothing (best-effort)
            return

        contents = result.get('contents', {})

        # Build a metadata cache from WARC for non-HTML formats
        metadata_cache = {}
        try:
            artifact = self.warc()
            for record in artifact.iter_responses():
                warc_headers = record.warc_headers or {}
                duration_str = warc_headers.get('WARC-Scrape-Duration')
                metadata_cache[record.url] = {
                    'status_code': record.status_code,
                    'headers': record.headers,
                    'duration': float(duration_str) if duration_str else None,
                    'log_id': warc_headers.get('WARC-Scrape-Log-Id'),
                    'country': warc_headers.get('WARC-Scrape-Country')
                }
        except Exception:
            # Metadata is optional enrichment; fall back to defaults below.
            # (Was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            pass

        # Iterate through matching URLs
        for url, content_data in contents.items():
            if fnmatch.fnmatch(url, pattern):
                # Content is always a dict with format keys (e.g., {"html": "...", "markdown": "..."})
                content = content_data.get(format)

                if content:
                    # Get metadata from cache or use defaults
                    metadata = metadata_cache.get(url, {})
                    yield CrawlContent(
                        url=url,
                        content=content,
                        status_code=metadata.get('status_code', 200),
                        headers=metadata.get('headers', {}),
                        duration=metadata.get('duration'),
                        log_id=metadata.get('log_id'),
                        country=metadata.get('country'),
                        crawl_uuid=self._uuid
                    )

Iterate through URLs matching a pattern and yield their content

Supports wildcard patterns using * and ? for flexible URL matching.

Args

pattern
URL pattern with wildcards (`*` matches any characters, `?` matches one). Examples: `"/products?page=*"`, `"https://example.com/*/detail"`, `"*/product/*"`
format
Content format to retrieve

Yields

CrawlContent objects for each matching URL

Example

# Get all product pages in markdown
for content in crawl.read_iter(pattern="*/products?page=*", format="markdown"):
    print(f"{content.url}: {len(content.content)} chars")
    print(f"Duration: {content.duration}s")

# Get all detail pages
for content in crawl.read_iter(pattern="*/detail/*"):
    process(content.content)

# Pattern matching examples:
# "/products?page=*" matches /products?page=1, /products?page=2, etc.
# "*/product/*" matches any URL with /product/ in the path
# "https://example.com/page?" matches https://example.com/page1, page2, etc.
def stats(self) ‑> Dict[str, Any]
Expand source code
def stats(self) -> Dict[str, Any]:
    """
    Summarize the crawl as a flat dictionary of statistics.

    Combines the cached status response with artifact-derived size metrics
    when the WARC artifact has already been downloaded.

    Returns:
        Dictionary with crawl statistics. Always contains the URL counters
        and state flags; rate percentages are added once at least one URL
        was extracted, and page/size metrics once the artifact is cached.

    Example:
        ```python
        stats = crawl.stats()
        print(f"URLs extracted: {stats['urls_extracted']}")
        print(f"URLs visited: {stats['urls_visited']}")
        print(f"Crawl rate: {stats['crawl_rate']:.1f}%")
        print(f"Total size: {stats['total_size_kb']:.2f} KB")
        ```
    """
    snapshot = self.status(refresh=False)
    state = snapshot.state

    # Wire field names follow the scrape-engine source of truth.
    report: Dict[str, Any] = {
        'uuid': self._uuid,
        'status': snapshot.status,
        'urls_extracted': state.urls_extracted,
        'urls_visited': state.urls_visited,
        'urls_to_crawl': state.urls_to_crawl,
        'urls_failed': state.urls_failed,
        'urls_skipped': state.urls_skipped,
        'progress_pct': snapshot.progress_pct,
        'is_complete': snapshot.is_complete,
        'is_running': snapshot.is_running,
        'is_failed': snapshot.is_failed,
    }

    # Visited-vs-extracted ratio is only meaningful once extraction began.
    if state.urls_extracted > 0:
        report['crawl_rate'] = (state.urls_visited / state.urls_extracted) * 100

    # Artifact metrics require a previously downloaded artifact.
    if self._artifact_cache is not None:
        pages = self._artifact_cache.get_pages()
        total_size = sum(len(page['content']) for page in pages)
        avg_size = total_size / len(pages) if pages else 0

        report['pages_downloaded'] = len(pages)
        report['total_size_bytes'] = total_size
        report['total_size_kb'] = total_size / 1024
        report['total_size_mb'] = total_size / (1024 * 1024)
        report['avg_page_size_bytes'] = avg_size
        report['avg_page_size_kb'] = avg_size / 1024

        # Downloaded-vs-extracted ratio.
        if state.urls_extracted > 0:
            report['download_rate'] = (len(pages) / state.urls_extracted) * 100

    return report

Get comprehensive statistics about the crawl

Returns

Dictionary with crawl statistics

Example

stats = crawl.stats()
print(f"URLs extracted: {stats['urls_extracted']}")
print(f"URLs visited: {stats['urls_visited']}")
print(f"Crawl rate: {stats['crawl_rate']:.1f}%")
print(f"Total size: {stats['total_size_kb']:.2f} KB")
def status(self, refresh: bool = True) ‑> CrawlerStatusResponse
Expand source code
def status(self, refresh: bool = True) -> CrawlerStatusResponse:
    """
    Return the crawler's current status.

    Args:
        refresh: When True, always fetch a fresh status from the API;
            when False, reuse the cached status if one exists.

    Returns:
        CrawlerStatusResponse with the current status.

    Raises:
        ScrapflyCrawlerError: If the crawler has not been started yet.

    Example:
        ```python
        status = crawl.status()
        print(f"Progress: {status.progress_pct}%")
        print(f"URLs visited: {status.state.urls_visited}")
        ```
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    cached = self._status_cache
    if cached is not None and not refresh:
        return cached

    # Fetch and memoize a fresh status.
    self._status_cache = self._client.get_crawl_status(self._uuid)
    return self._status_cache

Get current crawler status

Args

refresh
If True, fetch fresh status from API. If False, return cached status.

Returns

CrawlerStatusResponse with current status

Raises

RuntimeError
If crawler not started yet

Example

status = crawl.status()
print(f"Progress: {status.progress_pct}%")
print(f"URLs visited: {status.state.urls_visited}")
def urls(self,
status: Literal['visited', 'pending', 'failed'] | None = None,
page: int = 1,
per_page: int = 100) ‑> CrawlerUrlsResponse
Expand source code
def urls(
    self,
    status: Optional[Literal['visited', 'pending', 'failed']] = None,
    page: int = 1,
    per_page: int = 100,
) -> CrawlerUrlsResponse:
    """
    Paginated listing of the crawl's URLs, optionally filtered by status.

    NEW in 0.8.28 — convenience wrapper around
    :meth:`ScrapflyClient.get_crawl_urls` that pre-fills the crawler UUID.

    Args:
        status: 'visited', 'pending' or 'failed'; when None the server
            defaults to 'visited'.
        page: 1-based page number (default 1)
        per_page: Page size (default 100, max 1000)

    Returns:
        CrawlerUrlsResponse with the URL records, total count and pagination metadata.

    Raises:
        ScrapflyCrawlerError: if the crawler has not been started yet.

    Example:
        ```python
        crawl = Crawl(client, config).crawl().wait()
        for entry in crawl.urls(status='visited'):
            print(f"{entry.url} ({entry.status})")
        ```
    """
    uuid = self._uuid
    if uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400,
        )
    # Delegate straight to the client with the stored crawler UUID.
    return self._client.get_crawl_urls(
        uuid=uuid,
        status=status,
        page=page,
        per_page=per_page,
    )

List the crawled URLs (paginated, optionally filtered by status).

NEW in 0.8.28 — convenience wrapper around :meth:ScrapflyClient.get_crawl_urls that pre-fills the crawler UUID.

Args

status
Filter by URL status — 'visited', 'pending', or 'failed'. When None, the server defaults to 'visited'.
page
1-based page number (default 1)
per_page
Page size (default 100, max 1000)

Returns

CrawlerUrlsResponse with the URL records, total count and pagination metadata.

Raises

ScrapflyCrawlerError
if the crawler has not been started yet.

Example

crawl = Crawl(client, config).crawl().wait()
for entry in crawl.urls(status='visited'):
    print(f"{entry.url} ({entry.status})")
def wait(self,
poll_interval: int = 5,
max_wait: int | None = None,
verbose: bool = False,
allow_cancelled: bool = False) ‑> Crawl
Expand source code
def wait(
    self,
    poll_interval: int = 5,
    max_wait: Optional[int] = None,
    verbose: bool = False,
    allow_cancelled: bool = False,
) -> 'Crawl':
    """
    Block until the crawler reaches a terminal state.

    Repeatedly polls the status endpoint until the crawl completes,
    fails, is cancelled, or the optional timeout elapses.

    Args:
        poll_interval: Seconds between status checks (default: 5)
        max_wait: Maximum seconds to wait (None = wait forever)
        verbose: If True, print progress updates
        allow_cancelled: If True, return normally when the crawler reaches
            CANCELLED instead of raising. Useful for the cancel-then-wait
            pattern where the caller already knows they triggered the
            cancellation. Defaults to False (raises ScrapflyCrawlerError
            with code='CANCELLED' on user_cancelled), preserving prior
            behavior for callers that observe external cancellations.

    Returns:
        Self for method chaining

    Raises:
        ScrapflyCrawlerError: If crawler not started, failed, or timed out.
            Also raised on cancellation when ``allow_cancelled=False``.

    Example:
        ```python
        # Wait with progress updates
        crawl.crawl().wait(verbose=True)

        # Wait with timeout
        crawl.crawl().wait(max_wait=300)  # 5 minutes max

        # Cancel from the same call site, then wait without re-raising
        crawl.cancel()
        crawl.wait(allow_cancelled=True)
        ```
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    started_at = time.time()
    attempts = 0

    while True:
        current = self.status(refresh=True)
        attempts += 1

        if verbose:
            logger.info(
                f"Poll #{attempts}: {current.status} - "
                f"{current.progress_pct:.1f}% - "
                f"{current.state.urls_visited}/{current.state.urls_extracted} URLs"
            )

        # Terminal states, checked in the same order as before:
        # complete -> failed -> cancelled.
        if current.is_complete:
            if verbose:
                logger.info("✓ Crawler completed successfully!")
            return self

        if current.is_failed:
            raise ScrapflyCrawlerError(
                message=f"Crawler failed with status: {current.status}",
                code="FAILED",
                http_status_code=400
            )

        if current.is_cancelled:
            if not allow_cancelled:
                raise ScrapflyCrawlerError(
                    message="Crawler was cancelled",
                    code="CANCELLED",
                    http_status_code=400
                )
            if verbose:
                logger.info("Crawler was cancelled (allow_cancelled=True)")
            return self

        # Still running — enforce the optional timeout before sleeping.
        if max_wait is not None and time.time() - started_at > max_wait:
            raise ScrapflyCrawlerError(
                message=f"Timeout waiting for crawler (>{max_wait}s)",
                code="TIMEOUT",
                http_status_code=400
            )

        time.sleep(poll_interval)

Wait for crawler to complete

Polls the status endpoint until the crawler finishes.

Args

poll_interval
Seconds between status checks (default: 5)
max_wait
Maximum seconds to wait (None = wait forever)
verbose
If True, print progress updates
allow_cancelled
If True, return normally when the crawler reaches CANCELLED instead of raising. Useful for the cancel-then-wait pattern where the caller already knows they triggered the cancellation. Defaults to False (raises ScrapflyCrawlerError with code='CANCELLED' on user_cancelled), preserving prior behavior for callers that observe external cancellations.

Returns

Self for method chaining

Raises

ScrapflyCrawlerError
If crawler not started, failed, or timed out. Also raised on cancellation when allow_cancelled=False.

Example

# Wait with progress updates
crawl.crawl().wait(verbose=True)

# Wait with timeout
crawl.crawl().wait(max_wait=300)  # 5 minutes max

# Cancel from the same call site, then wait without re-raising
crawl.cancel()
crawl.wait(allow_cancelled=True)
def warc(self, artifact_type: str = 'warc') ‑> CrawlerArtifactResponse
Expand source code
def warc(self, artifact_type: str = 'warc') -> CrawlerArtifactResponse:
    """
    Download (and cache) the crawler artifact, a WARC file by default.

    Args:
        artifact_type: Type of artifact to download (default: 'warc')

    Returns:
        CrawlerArtifactResponse with parsed WARC data

    Raises:
        ScrapflyCrawlerError: If the crawler has not been started yet.

    Example:
        ```python
        # Get WARC artifact
        artifact = crawl.warc()

        # Get all pages
        pages = artifact.get_pages()

        # Iterate through responses
        for record in artifact.iter_responses():
            print(record.url)
        ```
    """
    if self._uuid is None:
        raise ScrapflyCrawlerError(
            message="Crawler not started yet. Call crawl() first.",
            code="NOT_STARTED",
            http_status_code=400
        )

    # Download once, then serve from the instance cache.
    cached = self._artifact_cache
    if cached is None:
        cached = self._client.get_crawl_artifact(
            self._uuid,
            artifact_type=artifact_type
        )
        self._artifact_cache = cached

    return cached

Download the crawler artifact (WARC file)

Args

artifact_type
Type of artifact to download (default: 'warc')

Returns

CrawlerArtifactResponse with parsed WARC data

Raises

RuntimeError
If crawler not started yet

Example

# Get WARC artifact
artifact = crawl.warc()

# Get all pages
pages = artifact.get_pages()

# Iterate through responses
for record in artifact.iter_responses():
    print(record.url)
class CrawlContent (url: str,
content: str,
status_code: int,
headers: Dict[str, str] | None = None,
duration: float | None = None,
log_id: str | None = None,
country: str | None = None,
crawl_uuid: str | None = None)
Expand source code
class CrawlContent:
    """
    Result object for a single URL fetched by the crawler.

    Bundles the page body with the response metadata for one crawled
    page — a crawler-flavored counterpart to ScrapeApiResponse.

    Attributes:
        url: The crawled URL (mandatory)
        content: Page content in requested format (mandatory)
        status_code: HTTP response status code (mandatory)
        headers: HTTP response headers (optional)
        duration: Request duration in seconds (optional)
        log_id: Scrape log ID for debugging (optional)
        log_url: URL to view scrape logs (optional)
        country: Country the request was made from (optional)

    Example:
        ```python
        # Get content for a URL
        content = crawl.read('https://example.com', format='markdown')

        print(f"URL: {content.url}")
        print(f"Status: {content.status_code}")
        print(f"Duration: {content.duration}s")
        print(f"Content: {content.content}")

        # Access metadata
        if content.log_url:
            print(f"View logs: {content.log_url}")
        ```
    """

    def __init__(
        self,
        url: str,
        content: str,
        status_code: int,
        headers: Optional[Dict[str, str]] = None,
        duration: Optional[float] = None,
        log_id: Optional[str] = None,
        country: Optional[str] = None,
        crawl_uuid: Optional[str] = None
    ):
        """
        Initialize CrawlContent.

        Args:
            url: The crawled URL
            content: Page content in requested format
            status_code: HTTP response status code
            headers: HTTP response headers
            duration: Request duration in seconds
            log_id: Scrape log ID
            country: Country the request was made from
            crawl_uuid: Crawl job UUID
        """
        self.url = url
        self.content = content
        self.status_code = status_code
        # Guarantee a dict so callers never need a None check.
        self.headers = headers or {}
        self.duration = duration
        self.log_id = log_id
        self.country = country
        self._crawl_uuid = crawl_uuid

    @property
    def success(self) -> bool:
        """True when the response status is in the 2xx range."""
        return 200 <= self.status_code < 300

    @property
    def error(self) -> bool:
        """True when the response status is a 4xx/5xx error."""
        return self.status_code >= 400

    @property
    def log_url(self) -> Optional[str]:
        """
        Dashboard URL for the scrape log.

        Returns:
            Log URL if log_id is available, None otherwise
        """
        return f"https://scrapfly.io/dashboard/logs/{self.log_id}" if self.log_id else None

    def __len__(self) -> int:
        """Length of the content payload."""
        return len(self.content)

    def __str__(self) -> str:
        return self.content

    def __repr__(self) -> str:
        return (f"CrawlContent(url={self.url!r}, status={self.status_code}, "
                f"content_length={len(self.content)})")

Response object for a single crawled URL

Provides access to content and metadata for a crawled page. Similar to ScrapeApiResponse but for crawler results.

Attributes

url
The crawled URL (mandatory)
content
Page content in requested format (mandatory)
status_code
HTTP response status code (mandatory)
headers
HTTP response headers (optional)
duration
Request duration in seconds (optional)
log_id
Scrape log ID for debugging (optional)
log_url
URL to view scrape logs (optional)
country
Country the request was made from (optional)

Example

# Get content for a URL
content = crawl.read('https://example.com', format='markdown')

print(f"URL: {content.url}")
print(f"Status: {content.status_code}")
print(f"Duration: {content.duration}s")
print(f"Content: {content.content}")

# Access metadata
if content.log_url:
    print(f"View logs: {content.log_url}")

Initialize CrawlContent

Args

url
The crawled URL
content
Page content in requested format
status_code
HTTP response status code
headers
HTTP response headers
duration
Request duration in seconds
log_id
Scrape log ID
country
Country the request was made from
crawl_uuid
Crawl job UUID

Instance variables

prop error : bool
Expand source code
@property
def error(self) -> bool:
    """Check if the request resulted in an error (4xx/5xx status code)"""
    return self.status_code >= 400

Check if the request resulted in an error (4xx/5xx status code)

prop log_url : str | None
Expand source code
@property
def log_url(self) -> Optional[str]:
    """
    Get URL to view scrape logs

    Returns:
        Log URL if log_id is available, None otherwise
    """
    if self.log_id:
        return f"https://scrapfly.io/dashboard/logs/{self.log_id}"
    return None

Get URL to view scrape logs

Returns

Log URL if log_id is available, None otherwise

prop success : bool
Expand source code
@property
def success(self) -> bool:
    """Check if the request was successful (2xx status code)"""
    return 200 <= self.status_code < 300

Check if the request was successful (2xx status code)

class CrawlerArtifactResponse (artifact_data: bytes, artifact_type: str = 'warc')
Expand source code
class CrawlerArtifactResponse:
    """
    Wrapper around a downloaded crawler artifact.

    Returned by ScrapflyClient.get_crawl_artifact() method.

    Offers a uniform high-level API over both WARC and HAR artifacts so
    callers never have to touch the underlying formats directly; parsing
    happens lazily on first access.

    Example:
        ```python
        # Get WARC artifact (default)
        artifact = client.get_crawl_artifact(uuid)

        # Get HAR artifact
        artifact = client.get_crawl_artifact(uuid, artifact_type='har')

        # Easy mode: get all pages as dicts
        pages = artifact.get_pages()
        for page in pages:
            print(f"{page['url']}: {page['status_code']}")
            html = page['content'].decode('utf-8')

        # Memory-efficient: iterate one page at a time
        for record in artifact.iter_responses():
            print(f"{record.url}: {record.status_code}")
            process(record.content)

        # Save to file
        artifact.save('crawl_results.warc.gz')
        ```
    """

    def __init__(self, artifact_data: bytes, artifact_type: str = 'warc'):
        """
        Initialize from artifact data.

        Args:
            artifact_data: Raw artifact file bytes
            artifact_type: Type of artifact ('warc' or 'har')
        """
        self._artifact_data = artifact_data
        self._artifact_type = artifact_type
        # Parsers are created lazily by the `parser` property.
        self._warc_parser: Optional[WarcParser] = None
        self._har_parser: Optional[HarArchive] = None

    @property
    def artifact_type(self) -> str:
        """Get artifact type ('warc' or 'har')"""
        return self._artifact_type

    @property
    def artifact_data(self) -> bytes:
        """Get raw artifact data (for advanced users)"""
        return self._artifact_data

    @property
    def warc_data(self) -> bytes:
        """Get raw WARC data (deprecated, use artifact_data)"""
        return self._artifact_data

    @property
    def parser(self) -> Union[WarcParser, HarArchive]:
        """Lazily construct and return the parser matching the artifact type."""
        if self._artifact_type == 'har':
            if self._har_parser is None:
                self._har_parser = HarArchive(self._artifact_data)
            return self._har_parser
        if self._warc_parser is None:
            self._warc_parser = parse_warc(self._artifact_data)
        return self._warc_parser

    def iter_records(self) -> Iterator[Union[WarcRecord, HarEntry]]:
        """
        Iterate through all records.

        For WARC: iterates through all WARC records
        For HAR: iterates through all HAR entries

        Yields:
            WarcRecord or HarEntry: Each record in the artifact
        """
        if self._artifact_type == 'har':
            return self.parser.iter_entries()
        return self.parser.iter_records()

    def iter_responses(self) -> Iterator[Union[WarcRecord, HarEntry]]:
        """
        Iterate through HTTP response records only.

        This is more memory-efficient than get_pages() for large crawls.

        For WARC: iterates through response records
        For HAR: iterates through all entries (HAR only contains responses)

        Yields:
            WarcRecord or HarEntry: HTTP response records with url, status_code, headers, content
        """
        if self._artifact_type == 'har':
            return self.parser.iter_entries()
        return self.parser.iter_responses()

    def get_pages(self) -> List[Dict]:
        """
        Get all crawled pages as simple dictionaries.

        This is the easiest way to access crawl results.
        Works with both WARC and HAR formats.

        Returns:
            List of dicts with keys: url, status_code, headers, content

        Example:
            ```python
            pages = artifact.get_pages()
            for page in pages:
                print(f"{page['url']}: {len(page['content'])} bytes")
                html = page['content'].decode('utf-8')
            ```
        """
        if self._artifact_type == 'har':
            # Normalize HAR entries into the same page-dict shape as WARC.
            return [
                {
                    'url': entry.url,
                    'status_code': entry.status_code,
                    'headers': entry.response_headers,
                    'content': entry.content,
                }
                for entry in self.parser.iter_entries()
            ]
        return self.parser.get_pages()

    @property
    def total_pages(self) -> int:
        """Get total number of pages in the artifact"""
        return len(self.get_pages())

    def save(self, filepath: str):
        """
        Save WARC data to file.

        Args:
            filepath: Path to save the WARC file

        Example:
            ```python
            artifact.save('crawl_results.warc.gz')
            ```
        """
        with open(filepath, 'wb') as handle:
            handle.write(self.warc_data)

    def __repr__(self):
        return f"CrawlerArtifactResponse(size={len(self.warc_data)} bytes)"

Response from downloading crawler artifacts

Returned by ScrapflyClient.get_crawl_artifact() method.

Provides high-level access to crawl results with automatic WARC/HAR parsing. Users don't need to understand WARC or HAR format to use this class.

Example

# Get WARC artifact (default)
artifact = client.get_crawl_artifact(uuid)

# Get HAR artifact
artifact = client.get_crawl_artifact(uuid, artifact_type='har')

# Easy mode: get all pages as dicts
pages = artifact.get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")
    html = page['content'].decode('utf-8')

# Memory-efficient: iterate one page at a time
for record in artifact.iter_responses():
    print(f"{record.url}: {record.status_code}")
    process(record.content)

# Save to file
artifact.save('crawl_results.warc.gz')

Initialize from artifact data

Args

artifact_data
Raw artifact file bytes
artifact_type
Type of artifact ('warc' or 'har')

Instance variables

prop artifact_data : bytes
Expand source code
@property
def artifact_data(self) -> bytes:
    """Get raw artifact data (for advanced users)"""
    return self._artifact_data

Get raw artifact data (for advanced users)

prop artifact_type : str
Expand source code
@property
def artifact_type(self) -> str:
    """Get artifact type ('warc' or 'har')"""
    return self._artifact_type

Get artifact type ('warc' or 'har')

prop parserWarcParser | HarArchive
Expand source code
@property
def parser(self) -> Union[WarcParser, HarArchive]:
    """Get artifact parser instance (lazy-loaded)"""
    if self._artifact_type == 'har':
        if self._har_parser is None:
            self._har_parser = HarArchive(self._artifact_data)
        return self._har_parser
    else:
        if self._warc_parser is None:
            self._warc_parser = parse_warc(self._artifact_data)
        return self._warc_parser

Get artifact parser instance (lazy-loaded)

prop total_pages : int
Expand source code
@property
def total_pages(self) -> int:
    """Get total number of pages in the artifact"""
    return len(self.get_pages())

Get total number of pages in the artifact

prop warc_data : bytes
Expand source code
@property
def warc_data(self) -> bytes:
    """Get raw WARC data (deprecated, use artifact_data)"""
    return self._artifact_data

Get raw WARC data (deprecated, use artifact_data)

Methods

def get_pages(self) ‑> List[Dict]
Expand source code
def get_pages(self) -> List[Dict]:
    """
    Get all crawled pages as simple dictionaries

    This is the easiest way to access crawl results.
    Works with both WARC and HAR formats.

    Returns:
        List of dicts with keys: url, status_code, headers, content

    Example:
        ```python
        pages = artifact.get_pages()
        for page in pages:
            print(f"{page['url']}: {len(page['content'])} bytes")
            html = page['content'].decode('utf-8')
        ```
    """
    if self._artifact_type == 'har':
        # Convert HAR entries to page dicts
        pages = []
        for entry in self.parser.iter_entries():
            pages.append({
                'url': entry.url,
                'status_code': entry.status_code,
                'headers': entry.response_headers,
                'content': entry.content
            })
        return pages
    else:
        return self.parser.get_pages()

Get all crawled pages as simple dictionaries

This is the easiest way to access crawl results. Works with both WARC and HAR formats.

Returns

List of dicts with keys: url, status_code, headers, content

Example

pages = artifact.get_pages()
for page in pages:
    print(f"{page['url']}: {len(page['content'])} bytes")
    html = page['content'].decode('utf-8')
def iter_records(self) ‑> Iterator[WarcRecord | HarEntry]
Expand source code
def iter_records(self) -> Iterator[Union[WarcRecord, HarEntry]]:
    """
    Iterate over every record in the artifact.

    WARC artifacts yield their WARC records; HAR artifacts yield HAR
    entries.

    Yields:
        WarcRecord or HarEntry: Each record in the artifact
    """
    is_har = self._artifact_type == 'har'
    return self.parser.iter_entries() if is_har else self.parser.iter_records()

Iterate through all records

For WARC: iterates through all WARC records For HAR: iterates through all HAR entries

Yields

WarcRecord or HarEntry
Each record in the artifact
def iter_responses(self) ‑> Iterator[WarcRecord | HarEntry]
Expand source code
def iter_responses(self) -> Iterator[Union[WarcRecord, HarEntry]]:
    """
    Iterate over HTTP response records only.

    More memory-efficient than get_pages() for large crawls.

    WARC artifacts yield their response records; HAR artifacts yield all
    entries, since HAR files only ever contain responses.

    Yields:
        WarcRecord or HarEntry: HTTP response records with url, status_code, headers, content
    """
    parser = self.parser
    if self._artifact_type != 'har':
        return parser.iter_responses()
    return parser.iter_entries()

Iterate through HTTP response records only

This is more memory-efficient than get_pages() for large crawls.

For WARC: iterates through response records For HAR: iterates through all entries (HAR only contains responses)

Yields

WarcRecord or HarEntry
HTTP response records with url, status_code, headers, content
def save(self, filepath: str)
Expand source code
def save(self, filepath: str):
    """
    Write the artifact bytes to disk.

    Args:
        filepath: Destination path for the WARC file

    Example:
        ```python
        artifact.save('crawl_results.warc.gz')
        ```
    """
    with open(filepath, 'wb') as out:
        out.write(self.warc_data)

Save WARC data to file

Args

filepath
Path to save the WARC file

Example

artifact.save('crawl_results.warc.gz')
class CrawlerConfig (url: str,
page_limit: int | None = None,
max_depth: int | None = None,
max_duration: int | None = None,
exclude_paths: List[str] | None = None,
include_only_paths: List[str] | None = None,
ignore_base_path_restriction: bool = False,
follow_external_links: bool = False,
allowed_external_domains: List[str] | None = None,
follow_internal_subdomains: bool | None = None,
allowed_internal_subdomains: List[str] | None = None,
headers: Dict[str, str] | None = None,
delay: int | None = None,
user_agent: str | None = None,
max_concurrency: int | None = None,
rendering_delay: int | None = None,
use_sitemaps: bool = False,
respect_robots_txt: bool | None = None,
ignore_no_follow: bool = False,
cache: bool = False,
cache_ttl: int | None = None,
cache_clear: bool = False,
content_formats: List[Literal['html', 'markdown', 'text', 'clean_html']] | None = None,
extraction_rules: Dict | None = None,
asp: bool = False,
proxy_pool: str | None = None,
country: str | None = None,
webhook_name: str | None = None,
webhook_events: List[str] | None = None,
max_api_credit: int | None = None)
Expand source code
class CrawlerConfig(BaseApiConfig):
    """
    Configuration for Scrapfly Crawler API

    The Crawler API performs recursive website crawling with advanced
    configuration, content extraction, and artifact storage.

    Example:
        ```python
        from scrapfly import ScrapflyClient, CrawlerConfig
        client = ScrapflyClient(key='YOUR_API_KEY')
        config = CrawlerConfig(
            url='https://example.com',
            page_limit=100,
            max_depth=3,
            content_formats=['markdown', 'html']
        )

        # Start crawl
        start_response = client.start_crawl(config)
        uuid = start_response.uuid

        # Poll status
        status = client.get_crawl_status(uuid)

        # Get results when complete
        if status.is_complete:
            artifact = client.get_crawl_artifact(uuid)
            pages = artifact.get_pages()
        ```
    """

    WEBHOOK_CRAWLER_STARTED = 'crawler_started'
    WEBHOOK_CRAWLER_URL_VISITED = 'crawler_url_visited'
    WEBHOOK_CRAWLER_URL_SKIPPED = 'crawler_url_skipped'
    WEBHOOK_CRAWLER_URL_DISCOVERED = 'crawler_url_discovered'
    WEBHOOK_CRAWLER_URL_FAILED = 'crawler_url_failed'
    WEBHOOK_CRAWLER_STOPPED = 'crawler_stopped'
    WEBHOOK_CRAWLER_CANCELLED = 'crawler_cancelled'
    WEBHOOK_CRAWLER_FINISHED = 'crawler_finished'

    ALL_WEBHOOK_EVENTS = [
        WEBHOOK_CRAWLER_STARTED,
        WEBHOOK_CRAWLER_URL_VISITED,
        WEBHOOK_CRAWLER_URL_SKIPPED,
        WEBHOOK_CRAWLER_URL_DISCOVERED,
        WEBHOOK_CRAWLER_URL_FAILED,
        WEBHOOK_CRAWLER_STOPPED,
        WEBHOOK_CRAWLER_CANCELLED,
        WEBHOOK_CRAWLER_FINISHED,
    ]

    def __init__(
        self,
        url: str,
        # Crawl limits
        page_limit: Optional[int] = None,
        max_depth: Optional[int] = None,
        max_duration: Optional[int] = None,

        # Path filtering (mutually exclusive)
        exclude_paths: Optional[List[str]] = None,
        include_only_paths: Optional[List[str]] = None,

        # Advanced crawl options
        ignore_base_path_restriction: bool = False,
        follow_external_links: bool = False,
        allowed_external_domains: Optional[List[str]] = None,
        # Subdomain control (NEW — added in 0.8.28 to match the documented public API).
        # Server-side default for follow_internal_subdomains is True; we leave the
        # field unset by default so the server applies its own default.
        follow_internal_subdomains: Optional[bool] = None,
        allowed_internal_subdomains: Optional[List[str]] = None,

        # Request configuration
        headers: Optional[Dict[str, str]] = None,
        delay: Optional[int] = None,
        user_agent: Optional[str] = None,
        max_concurrency: Optional[int] = None,
        rendering_delay: Optional[int] = None,

        # Crawl strategy options
        use_sitemaps: bool = False,
        # respect_robots_txt: server default is True. Leave unset (None) so the
        # server applies its own default rather than forcing False on every request.
        respect_robots_txt: Optional[bool] = None,
        ignore_no_follow: bool = False,

        # Cache options
        cache: bool = False,
        cache_ttl: Optional[int] = None,
        cache_clear: bool = False,

        # Content extraction
        content_formats: Optional[List[Literal['html', 'markdown', 'text', 'clean_html']]] = None,
        extraction_rules: Optional[Dict] = None,

        # Web scraping features
        asp: bool = False,
        proxy_pool: Optional[str] = None,
        country: Optional[str] = None,

        # Webhook integration
        webhook_name: Optional[str] = None,
        webhook_events: Optional[List[str]] = None,

        # Cost control
        max_api_credit: Optional[int] = None
    ):
        """
        Initialize a CrawlerConfig

        Args:
            url: Starting URL for the crawl (required)
            page_limit: Maximum number of pages to crawl
            max_depth: Maximum crawl depth from starting URL
            max_duration: Maximum crawl duration in seconds

            exclude_paths: List of path patterns to exclude (mutually exclusive with include_only_paths)
            include_only_paths: List of path patterns to include only (mutually exclusive with exclude_paths)

            ignore_base_path_restriction: Allow crawling outside the base path
            follow_external_links: Follow links to external domains
            allowed_external_domains: List of external domains allowed when follow_external_links is True
            follow_internal_subdomains: Follow links to subdomains of the crawled domain
                (tri-state: None lets the server apply its default, which is True)
            allowed_internal_subdomains: List of internal subdomains allowed to be crawled

            headers: Custom HTTP headers for requests
            delay: Delay between requests in milliseconds
            user_agent: Custom user agent string
            max_concurrency: Maximum concurrent requests
            rendering_delay: Delay for JavaScript rendering in milliseconds

            use_sitemaps: Use sitemap.xml to discover URLs
            respect_robots_txt: Respect robots.txt rules (tri-state: None lets the
                server apply its default, which is True)
            ignore_no_follow: Ignore rel="nofollow" attributes

            cache: Enable caching
            cache_ttl: Cache time-to-live in seconds
            cache_clear: Clear cache before crawling

            content_formats: List of content formats to extract ('html', 'markdown', 'text', 'clean_html')
            extraction_rules: Custom extraction rules

            asp: Enable Anti-Scraping Protection bypass
            proxy_pool: Proxy pool to use (e.g., 'public_residential_pool')
            country: Target country for geo-located content

            webhook_name: Webhook name for event notifications
            webhook_events: List of webhook events to trigger (must be a subset
                of ALL_WEBHOOK_EVENTS)

            max_api_credit: Maximum API credits to spend on this crawl

        Raises:
            ValueError: if both exclude_paths and include_only_paths are given,
                or if webhook_events contains an unknown event name
        """
        if exclude_paths and include_only_paths:
            raise ValueError("exclude_paths and include_only_paths are mutually exclusive")

        params = {
            'url': url,
        }

        # Add optional parameters
        if page_limit is not None:
            params['page_limit'] = page_limit
        if max_depth is not None:
            params['max_depth'] = max_depth
        if max_duration is not None:
            params['max_duration'] = max_duration

        # Path filtering
        if exclude_paths:
            params['exclude_paths'] = exclude_paths
        if include_only_paths:
            params['include_only_paths'] = include_only_paths

        # Advanced options
        if ignore_base_path_restriction:
            params['ignore_base_path_restriction'] = True
        if follow_external_links:
            params['follow_external_links'] = True
        if allowed_external_domains:
            params['allowed_external_domains'] = allowed_external_domains
        # Subdomain control (NEW). Both fields are tri-state: None means
        # "unset" (server default applies); explicit True/False / list overrides.
        if follow_internal_subdomains is not None:
            params['follow_internal_subdomains'] = follow_internal_subdomains
        if allowed_internal_subdomains:
            params['allowed_internal_subdomains'] = allowed_internal_subdomains

        # Request configuration
        if headers:
            params['headers'] = headers
        if delay is not None:
            params['delay'] = delay
        if user_agent:
            params['user_agent'] = user_agent
        if max_concurrency is not None:
            params['max_concurrency'] = max_concurrency
        if rendering_delay is not None:
            params['rendering_delay'] = rendering_delay

        # Crawl strategy
        if use_sitemaps:
            params['use_sitemaps'] = True
        # Tri-state: None = let server default win (default True). Explicit
        # True/False overrides.
        if respect_robots_txt is not None:
            params['respect_robots_txt'] = respect_robots_txt
        if ignore_no_follow:
            params['ignore_no_follow'] = True

        # Cache
        if cache:
            params['cache'] = True
        if cache_ttl is not None:
            params['cache_ttl'] = cache_ttl
        if cache_clear:
            params['cache_clear'] = True

        # Content extraction
        if content_formats:
            params['content_formats'] = content_formats
        if extraction_rules:
            params['extraction_rules'] = extraction_rules

        # Web scraping features
        if asp:
            params['asp'] = True
        if proxy_pool:
            params['proxy_pool'] = proxy_pool
        if country:
            params['country'] = country

        # Webhooks
        if webhook_name:
            params['webhook_name'] = webhook_name

        if webhook_events:
            # Raise ValueError (not `assert`, which is stripped under `python -O`)
            # so invalid events are always rejected, consistently with the
            # exclude_paths/include_only_paths check above.
            invalid_events = [e for e in webhook_events if e not in self.ALL_WEBHOOK_EVENTS]
            if invalid_events:
                raise ValueError(
                    f"Invalid webhook events: {invalid_events}. "
                    f"Valid events are: {self.ALL_WEBHOOK_EVENTS}"
                )
            params['webhook_events'] = webhook_events

        # Cost control
        if max_api_credit is not None:
            params['max_api_credit'] = max_api_credit

        self._params = params

    def to_api_params(self, key: Optional[str] = None) -> Dict:
        """
        Convert config to API parameters

        :param key: API key (optional, can be added by client)
        :return: Dictionary of API parameters
        """
        params = self._params.copy()
        if key:
            params['key'] = key
        return params

Configuration for Scrapfly Crawler API

The Crawler API performs recursive website crawling with advanced configuration, content extraction, and artifact storage.

Example

from scrapfly import ScrapflyClient, CrawlerConfig
client = ScrapflyClient(key='YOUR_API_KEY')
config = CrawlerConfig(
    url='https://example.com',
    page_limit=100,
    max_depth=3,
    content_formats=['markdown', 'html']
)

# Start crawl
start_response = client.start_crawl(config)
uuid = start_response.uuid

# Poll status
status = client.get_crawl_status(uuid)

# Get results when complete
if status.is_complete:
    artifact = client.get_crawl_artifact(uuid)
    pages = artifact.get_pages()

Initialize a CrawlerConfig

Args

url
Starting URL for the crawl (required)
page_limit
Maximum number of pages to crawl
max_depth
Maximum crawl depth from starting URL
max_duration
Maximum crawl duration in seconds
exclude_paths
List of path patterns to exclude (mutually exclusive with include_only_paths)
include_only_paths
List of path patterns to include only (mutually exclusive with exclude_paths)
ignore_base_path_restriction
Allow crawling outside the base path
follow_external_links
Follow links to external domains
allowed_external_domains
List of external domains allowed when follow_external_links is True
headers
Custom HTTP headers for requests
delay
Delay between requests in milliseconds
user_agent
Custom user agent string
max_concurrency
Maximum concurrent requests
rendering_delay
Delay for JavaScript rendering in milliseconds
use_sitemaps
Use sitemap.xml to discover URLs
respect_robots_txt
Respect robots.txt rules
ignore_no_follow
Ignore rel="nofollow" attributes
cache
Enable caching
cache_ttl
Cache time-to-live in seconds
cache_clear
Clear cache before crawling
content_formats
List of content formats to extract ('html', 'markdown', 'text', 'clean_html')
extraction_rules
Custom extraction rules
asp
Enable Anti-Scraping Protection bypass
proxy_pool
Proxy pool to use (e.g., 'public_residential_pool')
country
Target country for geo-located content
webhook_name
Webhook name for event notifications
webhook_events
List of webhook events to trigger
max_api_credit
Maximum API credits to spend on this crawl

Ancestors

Class variables

var ALL_WEBHOOK_EVENTS
var WEBHOOK_CRAWLER_CANCELLED
var WEBHOOK_CRAWLER_FINISHED
var WEBHOOK_CRAWLER_STARTED
var WEBHOOK_CRAWLER_STOPPED
var WEBHOOK_CRAWLER_URL_DISCOVERED
var WEBHOOK_CRAWLER_URL_FAILED
var WEBHOOK_CRAWLER_URL_SKIPPED
var WEBHOOK_CRAWLER_URL_VISITED

Methods

def to_api_params(self, key: str | None = None) ‑> Dict
Expand source code
def to_api_params(self, key: Optional[str] = None) -> Dict:
    """
    Build the API parameter dictionary from this config.

    :param key: API key (optional, can be added by client)
    :return: Dictionary of API parameters
    """
    result = dict(self._params)
    if key:
        result['key'] = key
    return result

Convert config to API parameters

:param key: API key (optional, can be added by client)

:return: Dictionary of API parameters

class CrawlerError (message: str,
code: str,
http_status_code: int,
resource: str | None = None,
is_retryable: bool = False,
retry_delay: int | None = None,
retry_times: int | None = None,
documentation_url: str | None = None,
api_response: ForwardRef('ApiResponse') | None = None)
Expand source code
# Root of the crawler-specific exception hierarchy: callers can catch this
# single type to handle any crawler failure, while subclasses narrow the mode.
class CrawlerError(ScrapflyError):
    """Base exception for Crawler API errors.

    Adds no fields of its own — it inherits the ScrapflyError constructor
    (message, code, http_status_code, retry metadata, ...) unchanged.
    """
    pass

Base exception for Crawler API errors

Ancestors

Subclasses

class CrawlerLifecycleWebhook (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState,
seed_url: str,
status_link: str)
Expand source code
@dataclass
class CrawlerLifecycleWebhook(CrawlerWebhookBase):
    """
    Payload for the 4 lifecycle events: ``crawler_started``,
    ``crawler_stopped``, ``crawler_cancelled``, ``crawler_finished``.

    All four events share one shape: the common base fields
    (crawler_uuid / project / env / action / state), the seed URL, and a
    ``links.status`` URL pointing at the crawl status endpoint. Tell the
    events apart by inspecting ``self.event`` (use
    :class:`CrawlerWebhookEvent`).

    Attributes:
        seed_url: The root URL the crawl was started from.
        status_link: URL to fetch the live crawler status.
    """

    seed_url: str
    status_link: str

    @classmethod
    def from_payload(cls, event: str, payload: Dict[str, Any]) -> 'CrawlerLifecycleWebhook':
        common = cls._parse_base(event, payload)
        links = payload['links']
        return cls(
            seed_url=payload['seed_url'],
            status_link=links['status'],
            **common,
        )

Payload for the 4 lifecycle events: crawler_started, crawler_stopped, crawler_cancelled, crawler_finished.

These events all carry the same fields: the seed URL, the common base (crawler_uuid / project / env / action / state), and a links.status URL pointing at the crawl status endpoint. Disambiguate by inspecting self.event (use :class:CrawlerWebhookEvent).

Attributes

seed_url
The root URL the crawl was started from.
status_link
URL to fetch the live crawler status.

Ancestors

Static methods

def from_payload(event: str, payload: Dict[str, Any]) ‑> CrawlerLifecycleWebhook

Instance variables

var seed_url : str
class CrawlerScrapeResult (status_code: int,
country: str,
log_uuid: str,
log_url: str,
content: Dict[str, Any])
Expand source code
@dataclass
class CrawlerScrapeResult:
    """
    The ``scrape`` sub-object of a ``crawler_url_visited`` payload.

    Attributes:
        status_code: HTTP status code returned by the target URL.
        country: 2-letter country code of the proxy that performed the scrape.
        log_uuid: ULID of the scrape log (used to fetch the full log later).
        log_url: Human-browseable dashboard URL for the log.
        content: Map of requested content format (``html``, ``text``,
            ``markdown``, ``clean_html``, ``json``, etc.) to the actual
            rendered string. The keys depend on what the caller requested
            in ``content_formats``.
    """

    status_code: int
    country: str
    log_uuid: str
    log_url: str
    content: Dict[str, Any]

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'CrawlerScrapeResult':
        # Every field is required; a missing key raises KeyError, same as
        # explicit indexing would.
        wanted = ('status_code', 'country', 'log_uuid', 'log_url', 'content')
        return cls(**{name: data[name] for name in wanted})

The scrape sub-object of a crawler_url_visited payload.

Attributes

status_code
HTTP status code returned by the target URL.
country
2-letter country code of the proxy that performed the scrape.
log_uuid
ULID of the scrape log (used to fetch the full log later).
log_url
Human-browseable dashboard URL for the log.
content
Map of requested content format (html, text, markdown, clean_html, json, etc.) to the actual rendered string. The keys depend on what the caller requested in content_formats.

Static methods

def from_dict(data: Dict[str, Any]) ‑> CrawlerScrapeResult

Instance variables

var content : Dict[str, Any]
var country : str
var log_url : str
var log_uuid : str
var status_code : int
class CrawlerStartResponse (response_data: Dict[str, Any])
Expand source code
class CrawlerStartResponse:
    """
    Response from starting a crawler job

    Returned by ScrapflyClient.start_crawl() method.

    Strict parsing: ``uuid`` and ``status`` are part of the documented contract
    and are required. A missing field raises ``KeyError`` so the caller knows
    immediately that the API contract changed; a malformed field raises
    ``ValueError``.

    Attributes:
        uuid: Unique identifier for the crawler job
        status: Initial status (typically 'PENDING')
    """

    def __init__(self, response_data: Dict[str, Any]):
        """
        Initialize from API response

        Args:
            response_data: Raw API response dictionary

        Raises:
            KeyError: if 'crawler_uuid' (or legacy 'uuid') or 'status' is missing
            ValueError: if uuid or status is not a non-empty string
        """
        self._data = response_data
        # API canonical name is `crawler_uuid`; we accept `uuid` only as a
        # legacy fallback, in case an older server emits the short form.
        if 'crawler_uuid' in response_data:
            self.uuid = response_data['crawler_uuid']
        elif 'uuid' in response_data:
            self.uuid = response_data['uuid']
        else:
            raise KeyError(
                "CrawlerStartResponse: required field 'crawler_uuid' (or legacy 'uuid') is missing"
            )
        self.status = response_data['status']
        # Validate with explicit raises rather than `assert`, which would be
        # stripped under `python -O` and silently accept malformed responses.
        if not isinstance(self.uuid, str) or not self.uuid:
            raise ValueError(
                f"CrawlerStartResponse: uuid must be a non-empty string, got {self.uuid!r}"
            )
        if not isinstance(self.status, str) or not self.status:
            raise ValueError(
                f"CrawlerStartResponse: status must be a non-empty string, got {self.status!r}"
            )

    def __repr__(self):
        return f"CrawlerStartResponse(uuid={self.uuid}, status={self.status})"

Response from starting a crawler job

Returned by ScrapflyClient.start_crawl() method.

Strict parsing: uuid and status are part of the documented contract and are required. A missing field raises KeyError so the caller knows immediately that the API contract changed.

Attributes

uuid
Unique identifier for the crawler job
status
Initial status (typically 'PENDING')

Initialize from API response

Args

response_data
Raw API response dictionary
class CrawlerState (state: Dict[str, Any])
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.
    """

    # __slots__ avoids a per-instance __dict__ (status is polled repeatedly)
    # and rejects writes to unexpected attributes.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        """
        Parse the ``state`` dict of a status response.

        Raises:
            TypeError: if ``state`` is not a dict
            KeyError: if a required counter field is missing (wire format changed)
        """
        # Explicit raise (not `assert`) so the check survives `python -O`.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Counters are required — a KeyError here should surface immediately.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )

Nested state block of a crawler status response.

Field names match the wire format emitted by the scrape-engine (apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py), which is the single source of truth. Go and TypeScript SDKs expose the same names on their status.state object.

Attributes

urls_visited
Number of URLs successfully crawled.
urls_extracted
Total URLs discovered (seed + links + sitemaps).
urls_to_crawl
Derived as urls_extracted - urls_skipped server-side.
urls_failed
URLs that failed to crawl.
urls_skipped
URLs skipped (filtered by exclude rules, robots.txt, etc.).
api_credit_used
Total API credits consumed by this crawl.
duration
Elapsed time in seconds.
start_time
Unix epoch seconds when the first worker picked up the job, or None while the job is still in PENDING.
stop_time
Unix epoch seconds when the crawler reached a terminal state, or None while still running.
stop_reason
Reason for stop (page_limit, max_duration, etc.), or None while still running.

Instance variables

var api_credit_used
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.
    """

    # __slots__ avoids a per-instance __dict__ (status objects may be created
    # on every poll) and rejects writes to unexpected attributes.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # NOTE(review): `assert` is stripped under `python -O`, so this type
        # check is best-effort only.
        assert isinstance(state, dict), (
            f"CrawlerState: expected dict, got {type(state).__name__}"
        )
        # Counters are required fields — a KeyError here means the wire
        # format changed and should surface immediately to the caller.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        # Numeric fields; exact type (int vs float for duration) is not pinned here.
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        # Compact summary of the volatile counters only.
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var duration
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.
    """

    # __slots__ avoids a per-instance __dict__ (status objects may be created
    # on every poll) and rejects writes to unexpected attributes.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # NOTE(review): `assert` is stripped under `python -O`, so this type
        # check is best-effort only.
        assert isinstance(state, dict), (
            f"CrawlerState: expected dict, got {type(state).__name__}"
        )
        # Counters are required fields — a KeyError here means the wire
        # format changed and should surface immediately to the caller.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        # Numeric fields; exact type (int vs float for duration) is not pinned here.
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        # Compact summary of the volatile counters only.
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var start_time
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.
    """

    # __slots__ avoids a per-instance __dict__ (status objects may be created
    # on every poll) and rejects writes to unexpected attributes.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # NOTE(review): `assert` is stripped under `python -O`, so this type
        # check is best-effort only.
        assert isinstance(state, dict), (
            f"CrawlerState: expected dict, got {type(state).__name__}"
        )
        # Counters are required fields — a KeyError here means the wire
        # format changed and should surface immediately to the caller.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        # Numeric fields; exact type (int vs float for duration) is not pinned here.
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        # Compact summary of the volatile counters only.
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var stop_reason
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var stop_time
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var urls_extracted
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var urls_failed
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var urls_skipped
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var urls_to_crawl
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
var urls_visited
Expand source code
class CrawlerState:
    """
    Nested ``state`` block of a crawler status response.

    Field names match the wire format emitted by the scrape-engine
    (``apps/scrapfly/scrape-engine/scrape_engine/crawler/config.py``), which
    is the single source of truth. Go and TypeScript SDKs expose the same
    names on their ``status.state`` object.

    Attributes:
        urls_visited: Number of URLs successfully crawled.
        urls_extracted: Total URLs discovered (seed + links + sitemaps).
        urls_to_crawl: Derived as ``urls_extracted - urls_skipped`` server-side.
        urls_failed: URLs that failed to crawl.
        urls_skipped: URLs skipped (filtered by exclude rules, robots.txt, etc.).
        api_credit_used: Total API credits consumed by this crawl.
        duration: Elapsed time in seconds.
        start_time: Unix epoch seconds when the first worker picked up the job,
            or ``None`` while the job is still in ``PENDING``.
        stop_time: Unix epoch seconds when the crawler reached a terminal state,
            or ``None`` while still running.
        stop_reason: Reason for stop (``page_limit``, ``max_duration``, etc.),
            or ``None`` while still running.

    Raises:
        TypeError: If the ``state`` argument is not a dict.
        KeyError: If a required counter key is missing (API contract drift).
    """

    # __slots__ fixes the attribute set and keeps instances lightweight.
    __slots__ = (
        'urls_visited', 'urls_extracted', 'urls_to_crawl',
        'urls_failed', 'urls_skipped',
        'api_credit_used', 'duration',
        'start_time', 'stop_time', 'stop_reason',
    )

    def __init__(self, state: Dict[str, Any]):
        # Explicit raise instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(state, dict):
            raise TypeError(
                f"CrawlerState: expected dict, got {type(state).__name__}"
            )
        # Required counters — direct access so a missing key fails loudly.
        self.urls_visited: int = state['urls_visited']
        self.urls_extracted: int = state['urls_extracted']
        self.urls_to_crawl: int = state['urls_to_crawl']
        self.urls_failed: int = state['urls_failed']
        self.urls_skipped: int = state['urls_skipped']
        self.api_credit_used = state['api_credit_used']
        self.duration = state['duration']
        # Nullable during PENDING — before a worker has picked up the job.
        self.start_time: Optional[int] = state.get('start_time')
        self.stop_time: Optional[int] = state.get('stop_time')
        self.stop_reason: Optional[str] = state.get('stop_reason')

    def __repr__(self):
        return (
            f"CrawlerState(visited={self.urls_visited}, extracted={self.urls_extracted}, "
            f"to_crawl={self.urls_to_crawl}, failed={self.urls_failed}, "
            f"skipped={self.urls_skipped})"
        )
class CrawlerStatusResponse (response_data: Dict[str, Any])
Expand source code
class CrawlerStatusResponse:
    """
    Response from checking crawler job status.

    Returned by :py:meth:`ScrapflyClient.get_crawl_status`. Provides real-time
    progress tracking for crawler jobs.

    **Field names match the wire format.** The scrape-engine is the source of
    truth; the Go and TypeScript SDKs expose identical names. Access state
    counters via the nested ``state`` attribute:

        >>> status.state.urls_visited
        12
        >>> status.state.urls_extracted
        34

    Attributes:
        uuid: Crawler job UUID.
        status: Current status (``PENDING``, ``RUNNING``, ``DONE``, ``CANCELLED``).
        is_success: Whether the crawler job completed successfully (``None`` while running).
        is_finished: Whether the crawler job has finished (regardless of success/failure).
        state: :class:`CrawlerState` — all the per-crawl counters and timings.
    """

    # Status constants
    STATUS_PENDING = 'PENDING'
    STATUS_RUNNING = 'RUNNING'
    STATUS_DONE = 'DONE'
    STATUS_CANCELLED = 'CANCELLED'

    def __init__(self, response_data: Dict[str, Any]):
        """
        Initialize from API response.

        Strict parsing: required fields (``crawler_uuid``, ``status``,
        ``is_success``, ``is_finished``, and the documented ``state.*``
        metrics) are read with direct access so missing keys raise
        ``KeyError`` at parse time. This catches API contract drift loud and
        early.

        Args:
            response_data: Raw API response dictionary.

        Raises:
            KeyError: If a required field is missing.
            TypeError: If a field has an unexpected type. Raised explicitly
                rather than via ``assert`` so validation survives ``python -O``.
        """
        self._data = response_data

        # Identification — accept legacy `uuid` only as fallback.
        if 'crawler_uuid' in response_data:
            self.uuid = response_data['crawler_uuid']
        elif 'uuid' in response_data:
            self.uuid = response_data['uuid']
        else:
            raise KeyError(
                "CrawlerStatusResponse: required field 'crawler_uuid' (or legacy 'uuid') is missing"
            )
        self.status = response_data['status']
        # `is_success` may legitimately be `null` while still running.
        self.is_success = response_data['is_success']
        self.is_finished = response_data['is_finished']

        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not (isinstance(self.uuid, str) and self.uuid):
            raise TypeError(
                f"CrawlerStatusResponse: uuid must be a non-empty string, got {self.uuid!r}"
            )
        if not (isinstance(self.status, str) and self.status):
            raise TypeError(
                f"CrawlerStatusResponse: status must be a non-empty string, got {self.status!r}"
            )
        if not isinstance(self.is_finished, bool):
            raise TypeError(
                f"CrawlerStatusResponse: is_finished must be bool, got {type(self.is_finished).__name__}"
            )
        if not (self.is_success is None or isinstance(self.is_success, bool)):
            raise TypeError(
                f"CrawlerStatusResponse: is_success must be bool or None, got {type(self.is_success).__name__}"
            )

        # Nested state — canonical shape matching Go / TS SDKs.
        self.state = CrawlerState(response_data['state'])

    @property
    def is_complete(self) -> bool:
        """Whether the crawler reached DONE with is_success=True."""
        return self.status == self.STATUS_DONE and self.is_success is True

    @property
    def is_running(self) -> bool:
        """Whether the crawler is currently PENDING or RUNNING."""
        return self.status in (self.STATUS_PENDING, self.STATUS_RUNNING)

    @property
    def is_failed(self) -> bool:
        """Whether the crawler reached DONE with is_success=False."""
        return self.status == self.STATUS_DONE and self.is_success is False

    @property
    def is_cancelled(self) -> bool:
        """Whether the crawler was cancelled."""
        return self.status == self.STATUS_CANCELLED

    @property
    def progress_pct(self) -> float:
        """
        Visited/extracted ratio as a percentage (0-100).

        Returns 0.0 when no URLs have been extracted yet.
        """
        if self.state.urls_extracted == 0:
            return 0.0
        return (self.state.urls_visited / self.state.urls_extracted) * 100

    def __repr__(self):
        return (f"CrawlerStatusResponse(uuid={self.uuid}, status={self.status}, "
                f"progress={self.progress_pct:.1f}%, "
                f"visited={self.state.urls_visited}/{self.state.urls_extracted})")

Response from checking crawler job status.

Returned by ScrapflyClient.get_crawl_status(). Provides real-time progress tracking for crawler jobs.

Field names match the wire format. The scrape-engine is the source of truth; the Go and TypeScript SDKs expose identical names. Access state counters via the nested state attribute:

>>> status.state.urls_visited
12
>>> status.state.urls_extracted
34

Attributes

uuid
Crawler job UUID.
status
Current status (PENDING, RUNNING, DONE, CANCELLED).
is_success
Whether the crawler job completed successfully (None while running).
is_finished
Whether the crawler job has finished (regardless of success/failure).
state
CrawlerState — all the per-crawl counters and timings.

Initialize from API response.

Strict parsing: required fields (crawler_uuid, status, is_success, is_finished, and the documented state.* metrics) are read with direct access so missing keys raise KeyError at parse time. This catches API contract drift loud and early.

Args

response_data
Raw API response dictionary.

Class variables

var STATUS_CANCELLED
var STATUS_DONE
var STATUS_PENDING
var STATUS_RUNNING

Instance variables

prop is_cancelled : bool
Expand source code
@property
def is_cancelled(self) -> bool:
    """Whether the crawler was cancelled."""
    return self.status == self.STATUS_CANCELLED

Whether the crawler was cancelled.

prop is_complete : bool
Expand source code
@property
def is_complete(self) -> bool:
    """Whether the crawler reached DONE with is_success=True."""
    return self.status == self.STATUS_DONE and self.is_success is True

Whether the crawler reached DONE with is_success=True.

prop is_failed : bool
Expand source code
@property
def is_failed(self) -> bool:
    """Whether the crawler reached DONE with is_success=False."""
    return self.status == self.STATUS_DONE and self.is_success is False

Whether the crawler reached DONE with is_success=False.

prop is_running : bool
Expand source code
@property
def is_running(self) -> bool:
    """Whether the crawler is currently PENDING or RUNNING."""
    return self.status in (self.STATUS_PENDING, self.STATUS_RUNNING)

Whether the crawler is currently PENDING or RUNNING.

prop progress_pct : float
Expand source code
@property
def progress_pct(self) -> float:
    """
    Visited/extracted ratio as a percentage (0-100).

    Returns 0.0 when no URLs have been extracted yet.
    """
    if self.state.urls_extracted == 0:
        return 0.0
    return (self.state.urls_visited / self.state.urls_extracted) * 100

Visited/extracted ratio as a percentage (0-100).

Returns 0.0 when no URLs have been extracted yet.

class CrawlerUrlDiscoveredWebhook (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState,
origin: str,
discovered_urls: List[str])
Expand source code
@dataclass
class CrawlerUrlDiscoveredWebhook(CrawlerWebhookBase):
    """
    Payload for the ``crawler_url_discovered`` event.

    Emitted when the crawler extracts one or more new URLs from a source.

    Attributes:
        origin: How the URLs were discovered (e.g. ``"navigation"``,
            ``"sitemap"``).
        discovered_urls: The newly-discovered URLs as a list.
    """

    origin: str
    discovered_urls: List[str]

    @classmethod
    def from_payload(cls, event: str, payload: Dict[str, Any]) -> 'CrawlerUrlDiscoveredWebhook':
        # Start from the shared base fields, then layer on the
        # event-specific ones before constructing the dataclass.
        kwargs = dict(cls._parse_base(event, payload))
        kwargs['origin'] = payload['origin']
        kwargs['discovered_urls'] = payload['discovered_urls']
        return cls(**kwargs)

Payload for the crawler_url_discovered event.

Emitted when the crawler extracts one or more new URLs from a source.

Attributes

origin
How the URLs were discovered (e.g. "navigation", "sitemap").
discovered_urls
The newly-discovered URLs as a list.

Ancestors

Static methods

def from_payload(event: str, payload: Dict[str, Any]) ‑> CrawlerUrlDiscoveredWebhook

Instance variables

var discovered_urls : List[str]
var origin : str
class CrawlerUrlEntry (url: str, status: str, reason: str | None = None)
Expand source code
class CrawlerUrlEntry:
    """
    Single URL entry from ``GET /crawl/{uuid}/urls``.

    The endpoint streams one record per line as ``text/plain``. For
    ``visited`` and ``pending`` URLs each line is just the URL; for ``failed``
    or ``skipped`` URLs the line is ``url,reason``. Streaming text is used
    because this endpoint is expected to scale to millions of records per
    job — JSON is not a suitable wire format at that volume.

    Attributes:
        url: The crawled URL.
        status: The filter status used by the caller (``visited``, ``pending``,
            ``failed`` or ``skipped``). Echoed from the request parameter so
            downstream code can disambiguate mixed buffers.
        reason: Only set for ``failed`` / ``skipped`` URLs; ``None`` otherwise.
    """

    # __slots__ keeps per-entry memory low — one instance per streamed line.
    __slots__ = ('url', 'status', 'reason')

    def __init__(self, url: str, status: str, reason: Optional[str] = None):
        """
        Args:
            url: The crawled URL; must be a non-empty string.
            status: Caller's filter status; must be a non-empty string.
            reason: Failure/skip reason, if any.

        Raises:
            TypeError: If ``url`` or ``status`` is not a string.
            ValueError: If ``url`` or ``status`` is an empty string.
        """
        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(url, str):
            raise TypeError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not url:
            raise ValueError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not isinstance(status, str):
            raise TypeError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        if not status:
            raise ValueError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        self.url = url
        self.status = status
        self.reason = reason

    def __repr__(self):
        if self.reason is not None:
            return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r}, reason={self.reason!r})"
        return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r})"

Single URL entry from GET /crawl/{uuid}/urls.

The endpoint streams one record per line as text/plain. For visited and pending URLs each line is just the URL; for failed or skipped URLs the line is url,reason. Streaming text is used because this endpoint is expected to scale to millions of records per job — JSON is not a suitable wire format at that volume.

Attributes

url
The crawled URL.
status
The filter status used by the caller (visited, pending, failed or skipped). Echoed from the request parameter so downstream code can disambiguate mixed buffers.
reason
Only set for failed / skipped URLs; None otherwise.

Instance variables

var reason
Expand source code
class CrawlerUrlEntry:
    """
    Single URL entry from ``GET /crawl/{uuid}/urls``.

    The endpoint streams one record per line as ``text/plain``. For
    ``visited`` and ``pending`` URLs each line is just the URL; for ``failed``
    or ``skipped`` URLs the line is ``url,reason``. Streaming text is used
    because this endpoint is expected to scale to millions of records per
    job — JSON is not a suitable wire format at that volume.

    Attributes:
        url: The crawled URL.
        status: The filter status used by the caller (``visited``, ``pending``,
            ``failed`` or ``skipped``). Echoed from the request parameter so
            downstream code can disambiguate mixed buffers.
        reason: Only set for ``failed`` / ``skipped`` URLs; ``None`` otherwise.
    """

    # __slots__ keeps per-entry memory low — one instance per streamed line.
    __slots__ = ('url', 'status', 'reason')

    def __init__(self, url: str, status: str, reason: Optional[str] = None):
        """
        Args:
            url: The crawled URL; must be a non-empty string.
            status: Caller's filter status; must be a non-empty string.
            reason: Failure/skip reason, if any.

        Raises:
            TypeError: If ``url`` or ``status`` is not a string.
            ValueError: If ``url`` or ``status`` is an empty string.
        """
        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(url, str):
            raise TypeError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not url:
            raise ValueError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not isinstance(status, str):
            raise TypeError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        if not status:
            raise ValueError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        self.url = url
        self.status = status
        self.reason = reason

    def __repr__(self):
        if self.reason is not None:
            return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r}, reason={self.reason!r})"
        return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r})"
var status
Expand source code
class CrawlerUrlEntry:
    """
    Single URL entry from ``GET /crawl/{uuid}/urls``.

    The endpoint streams one record per line as ``text/plain``. For
    ``visited`` and ``pending`` URLs each line is just the URL; for ``failed``
    or ``skipped`` URLs the line is ``url,reason``. Streaming text is used
    because this endpoint is expected to scale to millions of records per
    job — JSON is not a suitable wire format at that volume.

    Attributes:
        url: The crawled URL.
        status: The filter status used by the caller (``visited``, ``pending``,
            ``failed`` or ``skipped``). Echoed from the request parameter so
            downstream code can disambiguate mixed buffers.
        reason: Only set for ``failed`` / ``skipped`` URLs; ``None`` otherwise.
    """

    # __slots__ keeps per-entry memory low — one instance per streamed line.
    __slots__ = ('url', 'status', 'reason')

    def __init__(self, url: str, status: str, reason: Optional[str] = None):
        """
        Args:
            url: The crawled URL; must be a non-empty string.
            status: Caller's filter status; must be a non-empty string.
            reason: Failure/skip reason, if any.

        Raises:
            TypeError: If ``url`` or ``status`` is not a string.
            ValueError: If ``url`` or ``status`` is an empty string.
        """
        # Explicit raises instead of `assert`: assertions are stripped under
        # `python -O`, which would silently disable this validation.
        if not isinstance(url, str):
            raise TypeError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not url:
            raise ValueError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not isinstance(status, str):
            raise TypeError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        if not status:
            raise ValueError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        self.url = url
        self.status = status
        self.reason = reason

    def __repr__(self):
        if self.reason is not None:
            return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r}, reason={self.reason!r})"
        return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r})"
var url
Expand source code
class CrawlerUrlEntry:
    """
    Single URL entry from ``GET /crawl/{uuid}/urls``.

    The endpoint streams one record per line as ``text/plain``. For
    ``visited`` and ``pending`` URLs each line is just the URL; for ``failed``
    or ``skipped`` URLs the line is ``url,reason``. Streaming text is used
    because this endpoint is expected to scale to millions of records per
    job — JSON is not a suitable wire format at that volume.

    Attributes:
        url: The crawled URL
        status: The filter status used by the caller (``visited``, ``pending``,
            ``failed`` or ``skipped``). Echoed from the request parameter so
            downstream code can disambiguate mixed buffers.
        reason: Only set for ``failed`` / ``skipped`` URLs; ``None`` otherwise.

    Raises:
        ValueError: If ``url`` or ``status`` is not a non-empty string.
    """

    # __slots__ keeps per-instance memory low — one instance is created per
    # crawled URL and jobs can reach millions of records.
    __slots__ = ('url', 'status', 'reason')

    def __init__(self, url: str, status: str, reason: Optional[str] = None):
        # Validate with real exceptions rather than `assert`: assertions are
        # stripped when Python runs with -O, which would silently disable
        # these checks.
        if not isinstance(url, str) or not url:
            raise ValueError(
                f"CrawlerUrlEntry: url must be a non-empty string, got {url!r}"
            )
        if not isinstance(status, str) or not status:
            raise ValueError(
                f"CrawlerUrlEntry: status must be a non-empty string, got {status!r}"
            )
        self.url = url
        self.status = status
        self.reason = reason

    def __repr__(self):
        if self.reason is not None:
            return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r}, reason={self.reason!r})"
        return f"CrawlerUrlEntry(url={self.url!r}, status={self.status!r})"
class CrawlerUrlFailedWebhook (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState,
url: str,
error: str,
scrape_config: Dict[str, Any],
log_link: str | None,
scrape_link: str)
Expand source code
@dataclass
class CrawlerUrlFailedWebhook(CrawlerWebhookBase):
    """
    Payload for the ``crawler_url_failed`` event.

    Sent whenever a URL could not be crawled (network error, scrape error,
    blocked, etc.).

    Attributes:
        url: The URL that failed.
        error: The scrapfly error code (e.g. ``ERR::SCRAPE::NETWORK_ERROR``).
        scrape_config: The scrape config that was used for the failed attempt.
        log_link: URL to the full scrape log for this failure. May be
            ``None`` — the scrape-engine emits ``null`` when no log was
            recorded (e.g. the failure happened before the request was ever
            executed). See
            ``scrape_engine/crawler/webhook_manager.py::dispatch_url_failed``
            line 57.
        scrape_link: URL that re-runs the same scrape as a one-off. Always
            present on the wire (non-nullable). See line 58 of the engine.
    """

    url: str
    error: str
    scrape_config: Dict[str, Any]
    log_link: Optional[str]
    scrape_link: str

    @classmethod
    def from_payload(cls, event: str, payload: Dict[str, Any]) -> 'CrawlerUrlFailedWebhook':
        """Build a typed instance from the raw webhook payload dict."""
        links = payload['links']
        kwargs = dict(cls._parse_base(event, payload))
        kwargs.update(
            url=payload['url'],
            error=payload['error'],
            scrape_config=payload['scrape_config'],
            log_link=links.get('log'),
            scrape_link=links['scrape'],
        )
        return cls(**kwargs)

Payload for the crawler_url_failed event.

Emitted when a URL cannot be crawled (network error, scrape error, blocked, etc.).

Attributes

url
The URL that failed.
error
The scrapfly error code (e.g. ERR::SCRAPE::NETWORK_ERROR).
scrape_config
The scrape config that was used for the failed attempt.
log_link
URL to the full scrape log for this failure. Can be None — the scrape-engine emits null when no log was recorded (e.g. the failure happened before the request was ever executed). See scrape_engine/crawler/webhook_manager.py::dispatch_url_failed line 57.
scrape_link
URL that re-runs the same scrape as a one-off. Always present on the wire (non-nullable). See line 58 of the engine.

Ancestors

Static methods

def from_payload(event: str, payload: Dict[str, Any]) ‑> CrawlerUrlFailedWebhook

Instance variables

var error : str
var scrape_config : Dict[str, Any]
var url : str
class CrawlerUrlSkippedWebhook (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState,
urls: Dict[str, str])
Expand source code
@dataclass
class CrawlerUrlSkippedWebhook(CrawlerWebhookBase):
    """
    Payload for the ``crawler_url_skipped`` event.

    Delivered as a single batch when the crawler drops a set of URLs from
    the queue (e.g. when ``page_limit`` is reached while discovered-but-
    unvisited URLs are still pending).

    Attributes:
        urls: Mapping from URL to the reason it was skipped
            (e.g. ``"page_limit"``, ``"excluded"``, ``"robots_txt"``).
    """

    urls: Dict[str, str]

    @classmethod
    def from_payload(cls, event: str, payload: Dict[str, Any]) -> 'CrawlerUrlSkippedWebhook':
        """Build a typed instance from the raw webhook payload dict."""
        common = cls._parse_base(event, payload)
        return cls(urls=payload['urls'], **common)

Payload for the crawler_url_skipped event.

Emitted in a single batch when the crawler decides to skip a set of URLs (e.g. when reaching page_limit with discovered-but-unvisited URLs still in the queue).

Attributes

urls
Mapping from URL to the reason it was skipped (e.g. "page_limit", "excluded", "robots_txt").

Ancestors

Static methods

def from_payload(event: str, payload: Dict[str, Any]) ‑> CrawlerUrlSkippedWebhook

Instance variables

var urls : Dict[str, str]
class CrawlerUrlVisitedWebhook (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState,
url: str,
scrape: CrawlerScrapeResult)
Expand source code
@dataclass
class CrawlerUrlVisitedWebhook(CrawlerWebhookBase):
    """
    Payload for the ``crawler_url_visited`` event.

    Sent once for every URL the crawler scraped successfully.

    Attributes:
        url: The URL that was just visited.
        scrape: Scrape result details (status code, country, log link, content).
    """

    url: str
    scrape: CrawlerScrapeResult

    @classmethod
    def from_payload(cls, event: str, payload: Dict[str, Any]) -> 'CrawlerUrlVisitedWebhook':
        """Build a typed instance from the raw webhook payload dict."""
        common = cls._parse_base(event, payload)
        scrape_result = CrawlerScrapeResult.from_dict(payload['scrape'])
        return cls(url=payload['url'], scrape=scrape_result, **common)

Payload for the crawler_url_visited event.

Emitted after each URL has been successfully scraped.

Attributes

url
The URL that was just visited.
scrape
Scrape result details (status code, country, log link, content).

Ancestors

Static methods

def from_payload(event: str, payload: Dict[str, Any]) ‑> CrawlerUrlVisitedWebhook

Instance variables

var scrape : CrawlerScrapeResult
var url : str
class CrawlerUrlsResponse (urls: List[ForwardRef('CrawlerUrlEntry')],
page: int,
per_page: int)
Expand source code
class CrawlerUrlsResponse:
    """
    Response from ``GET /crawl/{crawler_uuid}/urls``.

    The endpoint streams a ``text/plain`` body with one record per line;
    this wrapper materialises that stream into a ``List`` of
    :class:`CrawlerUrlEntry` records for caller convenience.

    Pagination: the wire protocol carries no global ``total`` — ``page`` and
    ``per_page`` simply echo the request parameters. Fetch subsequent pages
    by incrementing ``page`` until a response comes back empty.

    Attributes:
        urls: List of :class:`CrawlerUrlEntry` records on this page
        page: 1-based page number (echoed from the request)
        per_page: Page size (echoed from the request)
    """

    __slots__ = ('urls', 'page', 'per_page')

    def __init__(self, urls: List['CrawlerUrlEntry'], page: int, per_page: int):
        self.urls = urls
        self.page = page
        self.per_page = per_page

    @classmethod
    def from_text(
        cls,
        body: str,
        status_hint: str,
        page: int,
        per_page: int,
    ) -> 'CrawlerUrlsResponse':
        """
        Parse the raw text body returned by ``GET /crawl/{uuid}/urls``.

        - Empty lines are ignored (trailing newlines, blank records).
        - For ``visited`` / ``pending`` status each line is one URL.
        - For ``failed`` / ``skipped`` status each line is ``url,reason``.
        - When the caller passed no ``status`` filter, the server defaults to
          ``visited``; the caller is expected to pass that as ``status_hint``
          so every parsed record gets the right status tag.

        Args:
            body: Raw response body text.
            status_hint: The status filter the caller used.
            page: Caller-provided page (echoed on the response object).
            per_page: Caller-provided per_page (echoed on the response object).
        """
        records: List[CrawlerUrlEntry] = []
        for raw in body.splitlines():
            record = raw.strip()
            if not record:
                continue  # blank record / trailing newline
            if status_hint in ('visited', 'pending'):
                records.append(CrawlerUrlEntry(url=record, status=status_hint))
                continue
            # `url,reason` — split on the first comma only. URLs never
            # contain an unencoded comma in the path/query, so the first
            # comma is unambiguously the delimiter.
            url_part, sep, reason_part = record.partition(',')
            records.append(
                CrawlerUrlEntry(
                    url=url_part,
                    status=status_hint,
                    # no comma, or empty reason after the comma -> None
                    reason=reason_part if sep and reason_part else None,
                )
            )
        return cls(records, page, per_page)

    def __len__(self) -> int:
        return len(self.urls)

    def __iter__(self) -> Iterator[CrawlerUrlEntry]:
        yield from self.urls

    def __repr__(self):
        return "CrawlerUrlsResponse(page={}, per_page={}, urls={})".format(
            self.page, self.per_page, len(self.urls)
        )

Response from GET /crawl/{crawler_uuid}/urls.

The server returns a streaming text/plain body with one record per line. This class parses that stream into a materialised List of :class:CrawlerUrlEntry records for caller convenience.

Pagination: the wire protocol carries no global total. page and per_page are echoes of the caller's request parameters — request further pages by incrementing page until the response has no records.

Attributes

urls
List of :class:CrawlerUrlEntry records on this page
page
1-based page number (echoed from the request)
per_page
Page size (echoed from the request)

Static methods

def from_text(body: str, status_hint: str, page: int, per_page: int) ‑> CrawlerUrlsResponse

Parse the raw text body returned by GET /crawl/{uuid}/urls.

  • Empty lines are ignored (trailing newlines, blank records).
  • For visited / pending status each line is one URL.
  • For failed / skipped status each line is url,reason.
  • When the caller passed no status filter, the server defaults to visited; the caller is expected to pass that as status_hint so every parsed record gets the right status tag.

Args

body
Raw response body text.
status_hint
The status filter the caller used.
page
Caller-provided page (echoed on the response object).
per_page
Caller-provided per_page (echoed on the response object).

Instance variables

var page
Expand source code
class CrawlerUrlsResponse:
    """
    Response from ``GET /crawl/{crawler_uuid}/urls``.

    The endpoint streams a ``text/plain`` body with one record per line;
    this wrapper materialises that stream into a ``List`` of
    :class:`CrawlerUrlEntry` records for caller convenience.

    Pagination: the wire protocol carries no global ``total`` — ``page`` and
    ``per_page`` simply echo the request parameters. Fetch subsequent pages
    by incrementing ``page`` until a response comes back empty.

    Attributes:
        urls: List of :class:`CrawlerUrlEntry` records on this page
        page: 1-based page number (echoed from the request)
        per_page: Page size (echoed from the request)
    """

    __slots__ = ('urls', 'page', 'per_page')

    def __init__(self, urls: List['CrawlerUrlEntry'], page: int, per_page: int):
        self.urls = urls
        self.page = page
        self.per_page = per_page

    @classmethod
    def from_text(
        cls,
        body: str,
        status_hint: str,
        page: int,
        per_page: int,
    ) -> 'CrawlerUrlsResponse':
        """
        Parse the raw text body returned by ``GET /crawl/{uuid}/urls``.

        - Empty lines are ignored (trailing newlines, blank records).
        - For ``visited`` / ``pending`` status each line is one URL.
        - For ``failed`` / ``skipped`` status each line is ``url,reason``.
        - When the caller passed no ``status`` filter, the server defaults to
          ``visited``; the caller is expected to pass that as ``status_hint``
          so every parsed record gets the right status tag.

        Args:
            body: Raw response body text.
            status_hint: The status filter the caller used.
            page: Caller-provided page (echoed on the response object).
            per_page: Caller-provided per_page (echoed on the response object).
        """
        records: List[CrawlerUrlEntry] = []
        for raw in body.splitlines():
            record = raw.strip()
            if not record:
                continue  # blank record / trailing newline
            if status_hint in ('visited', 'pending'):
                records.append(CrawlerUrlEntry(url=record, status=status_hint))
                continue
            # `url,reason` — split on the first comma only. URLs never
            # contain an unencoded comma in the path/query, so the first
            # comma is unambiguously the delimiter.
            url_part, sep, reason_part = record.partition(',')
            records.append(
                CrawlerUrlEntry(
                    url=url_part,
                    status=status_hint,
                    # no comma, or empty reason after the comma -> None
                    reason=reason_part if sep and reason_part else None,
                )
            )
        return cls(records, page, per_page)

    def __len__(self) -> int:
        return len(self.urls)

    def __iter__(self) -> Iterator[CrawlerUrlEntry]:
        yield from self.urls

    def __repr__(self):
        return "CrawlerUrlsResponse(page={}, per_page={}, urls={})".format(
            self.page, self.per_page, len(self.urls)
        )
var per_page
Expand source code
class CrawlerUrlsResponse:
    """
    Response from ``GET /crawl/{crawler_uuid}/urls``.

    The endpoint streams a ``text/plain`` body with one record per line;
    this wrapper materialises that stream into a ``List`` of
    :class:`CrawlerUrlEntry` records for caller convenience.

    Pagination: the wire protocol carries no global ``total`` — ``page`` and
    ``per_page`` simply echo the request parameters. Fetch subsequent pages
    by incrementing ``page`` until a response comes back empty.

    Attributes:
        urls: List of :class:`CrawlerUrlEntry` records on this page
        page: 1-based page number (echoed from the request)
        per_page: Page size (echoed from the request)
    """

    __slots__ = ('urls', 'page', 'per_page')

    def __init__(self, urls: List['CrawlerUrlEntry'], page: int, per_page: int):
        self.urls = urls
        self.page = page
        self.per_page = per_page

    @classmethod
    def from_text(
        cls,
        body: str,
        status_hint: str,
        page: int,
        per_page: int,
    ) -> 'CrawlerUrlsResponse':
        """
        Parse the raw text body returned by ``GET /crawl/{uuid}/urls``.

        - Empty lines are ignored (trailing newlines, blank records).
        - For ``visited`` / ``pending`` status each line is one URL.
        - For ``failed`` / ``skipped`` status each line is ``url,reason``.
        - When the caller passed no ``status`` filter, the server defaults to
          ``visited``; the caller is expected to pass that as ``status_hint``
          so every parsed record gets the right status tag.

        Args:
            body: Raw response body text.
            status_hint: The status filter the caller used.
            page: Caller-provided page (echoed on the response object).
            per_page: Caller-provided per_page (echoed on the response object).
        """
        records: List[CrawlerUrlEntry] = []
        for raw in body.splitlines():
            record = raw.strip()
            if not record:
                continue  # blank record / trailing newline
            if status_hint in ('visited', 'pending'):
                records.append(CrawlerUrlEntry(url=record, status=status_hint))
                continue
            # `url,reason` — split on the first comma only. URLs never
            # contain an unencoded comma in the path/query, so the first
            # comma is unambiguously the delimiter.
            url_part, sep, reason_part = record.partition(',')
            records.append(
                CrawlerUrlEntry(
                    url=url_part,
                    status=status_hint,
                    # no comma, or empty reason after the comma -> None
                    reason=reason_part if sep and reason_part else None,
                )
            )
        return cls(records, page, per_page)

    def __len__(self) -> int:
        return len(self.urls)

    def __iter__(self) -> Iterator[CrawlerUrlEntry]:
        yield from self.urls

    def __repr__(self):
        return "CrawlerUrlsResponse(page={}, per_page={}, urls={})".format(
            self.page, self.per_page, len(self.urls)
        )
var urls
Expand source code
class CrawlerUrlsResponse:
    """
    Response from ``GET /crawl/{crawler_uuid}/urls``.

    The endpoint streams a ``text/plain`` body with one record per line;
    this wrapper materialises that stream into a ``List`` of
    :class:`CrawlerUrlEntry` records for caller convenience.

    Pagination: the wire protocol carries no global ``total`` — ``page`` and
    ``per_page`` simply echo the request parameters. Fetch subsequent pages
    by incrementing ``page`` until a response comes back empty.

    Attributes:
        urls: List of :class:`CrawlerUrlEntry` records on this page
        page: 1-based page number (echoed from the request)
        per_page: Page size (echoed from the request)
    """

    __slots__ = ('urls', 'page', 'per_page')

    def __init__(self, urls: List['CrawlerUrlEntry'], page: int, per_page: int):
        self.urls = urls
        self.page = page
        self.per_page = per_page

    @classmethod
    def from_text(
        cls,
        body: str,
        status_hint: str,
        page: int,
        per_page: int,
    ) -> 'CrawlerUrlsResponse':
        """
        Parse the raw text body returned by ``GET /crawl/{uuid}/urls``.

        - Empty lines are ignored (trailing newlines, blank records).
        - For ``visited`` / ``pending`` status each line is one URL.
        - For ``failed`` / ``skipped`` status each line is ``url,reason``.
        - When the caller passed no ``status`` filter, the server defaults to
          ``visited``; the caller is expected to pass that as ``status_hint``
          so every parsed record gets the right status tag.

        Args:
            body: Raw response body text.
            status_hint: The status filter the caller used.
            page: Caller-provided page (echoed on the response object).
            per_page: Caller-provided per_page (echoed on the response object).
        """
        records: List[CrawlerUrlEntry] = []
        for raw in body.splitlines():
            record = raw.strip()
            if not record:
                continue  # blank record / trailing newline
            if status_hint in ('visited', 'pending'):
                records.append(CrawlerUrlEntry(url=record, status=status_hint))
                continue
            # `url,reason` — split on the first comma only. URLs never
            # contain an unencoded comma in the path/query, so the first
            # comma is unambiguously the delimiter.
            url_part, sep, reason_part = record.partition(',')
            records.append(
                CrawlerUrlEntry(
                    url=url_part,
                    status=status_hint,
                    # no comma, or empty reason after the comma -> None
                    reason=reason_part if sep and reason_part else None,
                )
            )
        return cls(records, page, per_page)

    def __len__(self) -> int:
        return len(self.urls)

    def __iter__(self) -> Iterator[CrawlerUrlEntry]:
        yield from self.urls

    def __repr__(self):
        return "CrawlerUrlsResponse(page={}, per_page={}, urls={})".format(
            self.page, self.per_page, len(self.urls)
        )
class CrawlerWebhookBase (event: str,
crawler_uuid: str,
project: str,
env: str,
action: str,
state: CrawlerState)
Expand source code
@dataclass
class CrawlerWebhookBase:
    """
    Fields shared by every crawler webhook payload.

    Attributes:
        event: The wire event name (``crawler_started``, etc.).
        crawler_uuid: The crawler job UUID.
        project: Project slug the crawler belongs to.
        env: Environment (``LIVE`` or ``TEST``).
        action: Short action tag emitted by the scrape-engine
            (``started``, ``visited``, ``skipped``, ``url_discovery``,
            ``failed``, ``stopped``, ``cancelled``, ``finished``).
        state: Nested state counters at the moment the webhook was emitted.
    """

    event: str
    crawler_uuid: str
    project: str
    env: str
    action: str
    state: CrawlerState

    @staticmethod
    def _parse_base(event: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """
        Build the kwargs dict for the fields every webhook carries.
        Used by subclass ``from_payload()`` factories.
        """
        common = {key: payload[key] for key in ('crawler_uuid', 'project', 'env', 'action')}
        common['event'] = event
        common['state'] = CrawlerState(payload['state'])
        return common

Common fields carried by every crawler webhook payload.

Attributes

event
The wire event name (crawler_started, etc.).
crawler_uuid
The crawler job UUID.
project
Project slug the crawler belongs to.
env
Environment (LIVE or TEST).
action
Short action tag emitted by the scrape-engine (started, visited, skipped, url_discovery, failed, stopped, cancelled, finished).
state
Nested state counters at the moment the webhook was emitted.

Subclasses

Instance variables

var action : str
var crawler_uuid : str
var env : str
var event : str
var project : str
var state : CrawlerState
class CrawlerWebhookEvent (value, names=None, *, module=None, qualname=None, type=None, start=1)
Expand source code
class CrawlerWebhookEvent(str, Enum):
    """
    Crawler webhook event names.

    Subclasses ``str`` so members compare equal to their wire value
    (e.g. ``CrawlerWebhookEvent.CRAWLER_STARTED == 'crawler_started'``).

    These MUST stay in sync with
    ``apps/scrapfly/scrape-engine/scrape_engine/scrape_engine/crawler/webhook_manager.py``
    class ``WebhookEvents``. The scrape-engine is the source of truth.
    """

    # Job lifecycle events.
    CRAWLER_STARTED = 'crawler_started'
    CRAWLER_STOPPED = 'crawler_stopped'
    CRAWLER_CANCELLED = 'crawler_cancelled'
    CRAWLER_FINISHED = 'crawler_finished'
    # Per-URL events emitted while the crawl progresses.
    CRAWLER_URL_VISITED = 'crawler_url_visited'
    CRAWLER_URL_SKIPPED = 'crawler_url_skipped'
    CRAWLER_URL_DISCOVERED = 'crawler_url_discovered'
    CRAWLER_URL_FAILED = 'crawler_url_failed'

Crawler webhook event names.

These MUST stay in sync with apps/scrapfly/scrape-engine/scrape_engine/scrape_engine/crawler/webhook_manager.py class WebhookEvents. The scrape-engine is the source of truth.

Ancestors

  • builtins.str
  • enum.Enum

Class variables

var CRAWLER_CANCELLED
var CRAWLER_FINISHED
var CRAWLER_STARTED
var CRAWLER_STOPPED
var CRAWLER_URL_DISCOVERED
var CRAWLER_URL_FAILED
var CRAWLER_URL_SKIPPED
var CRAWLER_URL_VISITED
class EncoderError (content: str)
Expand source code
class EncoderError(Exception):
    """
    Raised when a payload is invalid and cannot be encoded.

    Attributes:
        content: Description of the invalid payload.
    """

    def __init__(self, content: str):
        # Derive from Exception rather than BaseException: BaseException is
        # reserved for interpreter-exit signals (SystemExit,
        # KeyboardInterrupt) and subclassing it let this error slip past
        # `except Exception` handlers.
        self.content = content
        # Forward the message to the base class so e.args and generic
        # logging/formatting see it as well.
        super().__init__(content)

    def __str__(self) -> str:
        return self.content

    def __repr__(self):
        return "Invalid payload: %s" % self.content

Common base class for all exceptions

Ancestors

  • builtins.BaseException
class ErrorFactory
Expand source code
class ErrorFactory:
    """
    Translates a failed API response into the most specific exception class.

    Resolution order: HTTP 5xx -> ApiHttpServerError; notable HTTP statuses
    (401/402/429) -> dedicated classes; resource-scoped codes
    (``ERR::<RESOURCE>::...``) -> per-resource error classes; otherwise a
    generic ApiHttpClientError / ScrapflyError.
    """

    RESOURCE_TO_ERROR = {
        ScrapflyError.RESOURCE_SCRAPE: ScrapflyScrapeError,
        ScrapflyError.RESOURCE_WEBHOOK: ScrapflyWebhookError,
        ScrapflyError.RESOURCE_PROXY: ScrapflyProxyError,
        ScrapflyError.RESOURCE_SCHEDULE: ScrapflyScheduleError,
        ScrapflyError.RESOURCE_ASP: ScrapflyAspError,
        ScrapflyError.RESOURCE_SESSION: ScrapflySessionError
    }

    # Notable http error has own class for more convenience
    # Only applicable for generic API error
    HTTP_STATUS_TO_ERROR = {
        401: BadApiKeyError,
        402: PaymentRequired,
        429: TooManyRequest
    }

    @staticmethod
    def _get_resource(code: str) -> Optional[str]:
        """
        Extract the resource segment from an error code such as
        ``ERR::SCRAPE::NETWORK_ERROR`` (-> ``SCRAPE``).

        Returns None when the code does not follow the
        ``KIND::RESOURCE::NAME`` convention.
        """
        if isinstance(code, str) and '::' in code:
            # Take the second segment by index instead of strict 3-way
            # unpacking, so codes with extra `::` separators do not raise
            # ValueError. (Original annotation claimed Tuple[str, str] but
            # a plain string is returned.)
            return code.split('::')[1]

        return None

    @staticmethod
    def create(api_response: 'ScrapeApiResponse'):
        """
        Build (not raise) the exception instance matching *api_response*.

        Returns:
            A ScrapflyError subclass instance describing the failure;
            callers decide whether to raise it.
        """
        is_retryable = False
        kind = ScrapflyError.KIND_HTTP_BAD_RESPONSE if api_response.success is False else ScrapflyError.KIND_SCRAPFLY_ERROR
        http_code = api_response.status_code
        retry_delay = 5  # default retry delay (seconds) when the API supplies none
        retry_times = 3
        description = None
        error_url = 'https://scrapfly.io/docs/scrape-api/errors#api'
        code = api_response.error['code']

        # For upstream failures, surface the upstream status code instead of
        # the API's own status code.
        if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
            http_code = api_response.scrape_result['status_code']

        if 'description' in api_response.error:
            description = api_response.error['description']

        message = '%s %s %s' % (str(http_code), code, api_response.error['message'])

        if 'doc_url' in api_response.error:
            error_url = api_response.error['doc_url']

        if 'retryable' in api_response.error:
            is_retryable = api_response.error['retryable']

        resource = ErrorFactory._get_resource(code=code)

        if is_retryable is True:
            # BUG FIX: previously this tested for the 'X-Retry' header but
            # read 'Retry-After', raising KeyError whenever 'X-Retry' was
            # present without 'Retry-After'. Test and read the same header.
            if 'Retry-After' in api_response.headers:
                retry_delay = int(api_response.headers['Retry-After'])

        message = '%s: %s' % (message, description) if description else message

        if retry_delay is not None and is_retryable is True:
            message = '%s. Retry delay : %s seconds' % (message, str(retry_delay))

        args = {
            'message': message,
            'code': code,
            'http_status_code': http_code,
            'is_retryable': is_retryable,
            'api_response': api_response,
            'resource': resource,
            'retry_delay': retry_delay,
            'retry_times': retry_times,
            'documentation_url': error_url,
            'request': api_response.request,
            'response': api_response.response
        }

        if kind == ScrapflyError.KIND_HTTP_BAD_RESPONSE:
            if http_code >= 500:
                return ApiHttpServerError(**args)

            is_scraper_api_error = resource in ErrorFactory.RESOURCE_TO_ERROR

            if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR and not is_scraper_api_error:
                return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)

            if is_scraper_api_error:
                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

            return ApiHttpClientError(**args)

        elif kind == ScrapflyError.KIND_SCRAPFLY_ERROR:
            if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
                if http_code >= 500:
                    return UpstreamHttpServerError(**args)

                if http_code >= 400:
                    return UpstreamHttpClientError(**args)

            if resource in ErrorFactory.RESOURCE_TO_ERROR:
                return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

            return ScrapflyError(**args)

Class variables

var HTTP_STATUS_TO_ERROR
var RESOURCE_TO_ERROR

Static methods

def create(api_response: ScrapeApiResponse)
Expand source code
@staticmethod
def create(api_response: 'ScrapeApiResponse'):
    """
    Build (not raise) the exception instance matching *api_response*.

    Returns:
        A ScrapflyError subclass instance describing the failure;
        callers decide whether to raise it.
    """
    is_retryable = False
    kind = ScrapflyError.KIND_HTTP_BAD_RESPONSE if api_response.success is False else ScrapflyError.KIND_SCRAPFLY_ERROR
    http_code = api_response.status_code
    retry_delay = 5  # default retry delay (seconds) when the API supplies none
    retry_times = 3
    description = None
    error_url = 'https://scrapfly.io/docs/scrape-api/errors#api'
    code = api_response.error['code']

    # For upstream failures, surface the upstream status code instead of
    # the API's own status code.
    if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
        http_code = api_response.scrape_result['status_code']

    if 'description' in api_response.error:
        description = api_response.error['description']

    message = '%s %s %s' % (str(http_code), code, api_response.error['message'])

    if 'doc_url' in api_response.error:
        error_url = api_response.error['doc_url']

    if 'retryable' in api_response.error:
        is_retryable = api_response.error['retryable']

    resource = ErrorFactory._get_resource(code=code)

    if is_retryable is True:
        # BUG FIX: previously this tested for the 'X-Retry' header but
        # read 'Retry-After', raising KeyError whenever 'X-Retry' was
        # present without 'Retry-After'. Test and read the same header.
        if 'Retry-After' in api_response.headers:
            retry_delay = int(api_response.headers['Retry-After'])

    message = '%s: %s' % (message, description) if description else message

    if retry_delay is not None and is_retryable is True:
        message = '%s. Retry delay : %s seconds' % (message, str(retry_delay))

    args = {
        'message': message,
        'code': code,
        'http_status_code': http_code,
        'is_retryable': is_retryable,
        'api_response': api_response,
        'resource': resource,
        'retry_delay': retry_delay,
        'retry_times': retry_times,
        'documentation_url': error_url,
        'request': api_response.request,
        'response': api_response.response
    }

    if kind == ScrapflyError.KIND_HTTP_BAD_RESPONSE:
        if http_code >= 500:
            return ApiHttpServerError(**args)

        is_scraper_api_error = resource in ErrorFactory.RESOURCE_TO_ERROR

        if http_code in ErrorFactory.HTTP_STATUS_TO_ERROR and not is_scraper_api_error:
            return ErrorFactory.HTTP_STATUS_TO_ERROR[http_code](**args)

        if is_scraper_api_error:
            return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

        return ApiHttpClientError(**args)

    elif kind == ScrapflyError.KIND_SCRAPFLY_ERROR:
        if code == 'ERR::SCRAPE::BAD_UPSTREAM_RESPONSE':
            if http_code >= 500:
                return UpstreamHttpServerError(**args)

            if http_code >= 400:
                return UpstreamHttpClientError(**args)

        if resource in ErrorFactory.RESOURCE_TO_ERROR:
            return ErrorFactory.RESOURCE_TO_ERROR[resource](**args)

        return ScrapflyError(**args)
class ExtractionAPIError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ExtractionAPIError(HttpError):
    """HTTP-level error raised for failed Extraction API responses
    (used as the default ``error_class`` of
    ``ExtractionApiResponse.raise_for_result``)."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ExtractionApiResponse (request: requests.models.Request,
response: requests.models.Response,
extraction_config: ExtractionConfig,
api_result: bytes | None = None)
Expand source code
class ExtractionApiResponse(ApiResponse):
    """Typed wrapper around an Extraction API response.

    Normalizes the raw API payload into ``self.result`` (a ``FrozenDict``)
    and exposes convenience accessors for the extracted data, its content
    type and any error envelope returned by the API.
    """

    def __init__(self, request: Request, response: Response, extraction_config: ExtractionConfig, api_result: Optional[bytes] = None):
        # Keep the config used for this call so callers can correlate
        # request options with the parsed result.
        super().__init__(request, response)
        self.extraction_config = extraction_config
        # Either {'result': <payload>} on success or the raw error envelope.
        self.result = self.handle_api_result(api_result)

    @property
    def extraction_result(self) -> Optional[Dict]:
        """Inner ``result`` object; empty responses yield a null placeholder."""
        extraction_result = self.result.get('result', None)
        if not extraction_result:  # handle empty extraction responses
            return {'data': None, 'content_type': None}
        else:
            return extraction_result

    @property
    def data(self) -> Union[Dict, List, str]:  # depends on the LLM prompt
        """Extracted payload, or ``None`` when the API reported an error."""
        if self.error is None:
            return self.extraction_result['data']

        return None

    @property
    def content_type(self) -> Optional[str]:
        """Content type of the extracted payload, or ``None`` on error."""
        if self.error is None:
            return self.extraction_result['content_type']

        return None

    @property
    def extraction_success(self) -> bool:
        """``True`` when the response carries non-empty extracted data."""
        extraction_result = self.extraction_result
        if extraction_result is None or extraction_result['data'] is None:
            return False

        return True

    @property
    def error(self) -> Optional[Dict]:
        """Raw error envelope when extraction failed, else ``None``.

        NOTE(review): ``extraction_result`` above never returns ``None``
        (empty responses are mapped to a placeholder dict), so this branch
        appears unreachable — confirm the intended error-detection path.
        """
        if self.extraction_result is None:
            return self.result

        return None

    def _is_api_error(self, api_result: Dict) -> bool:
        # A missing payload or an 'error_id' key marks an API-level error.
        if api_result is None:
            return True

        return 'error_id' in api_result

    def handle_api_result(self, api_result: bytes) -> FrozenDict:
        """Freeze the payload; wrap successful results under ``'result'``."""
        if self._is_api_error(api_result=api_result) is True:
            return FrozenDict(api_result)

        return FrozenDict({'result': api_result})

    def raise_for_result(self, raise_on_upstream_error=True, error_class=ExtractionAPIError):
        """Raise :class:`ExtractionAPIError` (by default) on failed results."""
        super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)

Ancestors

Instance variables

prop content_type : str | None
Expand source code
@property
def content_type(self) -> Optional[str]:
    if self.error is None:
        return self.extraction_result['content_type']

    return None
prop data : Dict | List | str
Expand source code
@property
def data(self) -> Union[Dict, List, str]:  # depends on the LLM prompt
    if self.error is None:
        return self.extraction_result['data']

    return None
prop error : Dict | None
Expand source code
@property
def error(self) -> Optional[Dict]:
    if self.extraction_result is None:
        return self.result

    return None
prop extraction_result : Dict | None
Expand source code
@property
def extraction_result(self) -> Optional[Dict]:
    extraction_result = self.result.get('result', None)
    if not extraction_result:  # handle empty extraction responses
        return {'data': None, 'content_type': None}
    else:
        return extraction_result
prop extraction_success : bool
Expand source code
@property
def extraction_success(self) -> bool:
    extraction_result = self.extraction_result
    if extraction_result is None or extraction_result['data'] is None:
        return False

    return True

Methods

def handle_api_result(self, api_result: bytes) ‑> FrozenDict
Expand source code
def handle_api_result(self, api_result: bytes) -> FrozenDict:
    if self._is_api_error(api_result=api_result) is True:
        return FrozenDict(api_result)

    return FrozenDict({'result': api_result})
def raise_for_result(self,
raise_on_upstream_error=True,
error_class=scrapfly.errors.ExtractionAPIError)
Expand source code
def raise_for_result(self, raise_on_upstream_error=True, error_class=ExtractionAPIError):
    super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)

Inherited members

class ExtractionConfig (body: str | bytes,
content_type: str,
url: str | None = None,
charset: str | None = None,
extraction_template: str | None = None,
extraction_ephemeral_template: Dict | None = None,
extraction_prompt: str | None = None,
extraction_model: str | None = None,
is_document_compressed: bool | None = None,
document_compression_format: CompressionFormat | None = None,
webhook: str | None = None,
timeout: int | None = None,
raise_on_upstream_error: bool = True,
template: str | None = None,
ephemeral_template: Dict | None = None)
Expand source code
class ExtractionConfig(BaseApiConfig):
    """Configuration for a Scrapfly Extraction API call.

    Bundles the document to extract from (``body`` + ``content_type``) with
    extraction options (saved or ephemeral template, LLM prompt and model),
    optional document compression and delivery options (webhook, timeout).
    """

    body: Union[str, bytes]
    content_type: str
    url: Optional[str] = None
    charset: Optional[str] = None
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None  # ephemerally declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None
    is_document_compressed: Optional[bool] = None
    document_compression_format: Optional[CompressionFormat] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None
    raise_on_upstream_error: bool = True

    # deprecated options
    template: Optional[str] = None
    ephemeral_template: Optional[Dict] = None

    def __init__(
        self,
        body: Union[str, bytes],
        content_type: str,
        url: Optional[str] = None,
        charset: Optional[str] = None,
        extraction_template: Optional[str] = None,  # a saved template name
        extraction_ephemeral_template: Optional[Dict] = None,  # ephemerally declared json template
        extraction_prompt: Optional[str] = None,
        extraction_model: Optional[str] = None,
        is_document_compressed: Optional[bool] = None,
        document_compression_format: Optional[CompressionFormat] = None,
        webhook: Optional[str] = None,
        timeout: Optional[int] = None,
        raise_on_upstream_error: bool = True,

        # deprecated options
        template: Optional[str] = None,
        ephemeral_template: Optional[Dict] = None
    ):
        # Map deprecated aliases onto their replacements first.
        if template:
            warnings.warn(
                "Deprecation warning: 'template' is deprecated. Use 'extraction_template' instead."
            )
            extraction_template = template

        if ephemeral_template:
            warnings.warn(
                "Deprecation warning: 'ephemeral_template' is deprecated. Use 'extraction_ephemeral_template' instead."
            )
            extraction_ephemeral_template = ephemeral_template

        self.key = None
        self.body = body
        self.content_type = content_type
        self.url = url
        self.charset = charset
        self.extraction_template = extraction_template
        self.extraction_ephemeral_template = extraction_ephemeral_template
        self.extraction_prompt = extraction_prompt
        self.extraction_model = extraction_model
        self.is_document_compressed = is_document_compressed
        self.document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None
        self.webhook = webhook
        self.timeout = timeout
        self.raise_on_upstream_error = raise_on_upstream_error

        # Detect or apply document compression.
        # NOTE(review): detect_compression_format() is also invoked when body
        # is a str (if a compression format was declared) — confirm it accepts
        # str input.
        if isinstance(body, bytes) or document_compression_format:
            compression_format = detect_compression_format(body)

            if compression_format is not None:
                # Body is already compressed; verify it matches any declared format.
                self.is_document_compressed = True

                if self.document_compression_format and compression_format != self.document_compression_format:
                    raise ExtractionConfigError(
                        f'The detected compression format `{compression_format}` does not match declared format `{self.document_compression_format}`. '
                        f'You must pass the compression format or disable compression.'
                    )

                self.document_compression_format = compression_format

            else:
                self.is_document_compressed = False

            if self.is_document_compressed is False:
                # Body is uncompressed: compress it ourselves into the declared format.
                target_format = CompressionFormat(self.document_compression_format) if self.document_compression_format else None

                if isinstance(self.body, str) and target_format:
                    self.body = self.body.encode('utf-8')

                if target_format == CompressionFormat.GZIP:
                    import gzip
                    self.body = gzip.compress(self.body)

                elif target_format == CompressionFormat.ZSTD:
                    try:
                        import zstandard as zstd
                    except ImportError:
                        raise ExtractionConfigError(
                            'zstandard is not installed. You must run pip install zstandard'
                            ' to auto compress into zstd or use compression formats.'
                        )
                    self.body = zstd.compress(self.body)

                elif target_format == CompressionFormat.DEFLATE:
                    import zlib
                    compressor = zlib.compressobj(wbits=-zlib.MAX_WBITS) # raw deflate compression
                    self.body = compressor.compress(self.body) + compressor.flush()

    def to_api_params(self, key: str) -> Dict:
        """Serialize this config into Extraction API query parameters.

        Args:
            key: API key to use when none was set on the instance.

        Raises:
            ExtractionConfigError: if both a saved and an ephemeral template
                are declared.
        """
        params = {
            'key': self.key or key,
            'content_type': self.content_type
        }

        if self.url:
            params['url'] = self.url

        if self.charset:
            params['charset'] = self.charset

        if self.extraction_template and self.extraction_ephemeral_template:
            raise ExtractionConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

        if self.extraction_template:
            params['extraction_template'] = self.extraction_template

        if self.extraction_ephemeral_template:
            # Ephemeral templates are shipped inline, base64-encoded with a marker prefix.
            template_json = json.dumps(self.extraction_ephemeral_template)
            params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(template_json.encode('utf-8')).decode('utf-8')

        if self.extraction_prompt:
            params['extraction_prompt'] = quote_plus(self.extraction_prompt)

        if self.extraction_model:
            params['extraction_model'] = self.extraction_model

        if self.webhook:
            params['webhook_name'] = self.webhook

        if self.timeout is not None:
            params['timeout'] = self.timeout

        return params

    def to_dict(self) -> Dict:
        """
        Export the ExtractionConfig instance to a plain dictionary.

        NOTE(review): when the document is compressed this decompresses
        ``self.body`` in place (mutating the instance) so the exported dict
        carries readable text — confirm callers expect that side effect.
        """

        if self.is_document_compressed is True:
                target_format = CompressionFormat(self.document_compression_format) if self.document_compression_format else None

                if target_format == CompressionFormat.GZIP:
                    import gzip
                    self.body = gzip.decompress(self.body)

                elif target_format == CompressionFormat.ZSTD:
                    import zstandard as zstd
                    self.body = zstd.decompress(self.body)

                elif target_format == CompressionFormat.DEFLATE:
                    import zlib
                    decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
                    self.body = decompressor.decompress(self.body) + decompressor.flush()

                if isinstance(self.body, bytes):
                    self.body = self.body.decode('utf-8')
                    self.is_document_compressed = False

        return {
            'body': self.body,
            'content_type': self.content_type,
            'url': self.url,
            'charset': self.charset,
            'extraction_template': self.extraction_template,
            'extraction_ephemeral_template': self.extraction_ephemeral_template,
            'extraction_prompt': self.extraction_prompt,
            'extraction_model': self.extraction_model,
            'is_document_compressed': self.is_document_compressed,
            'document_compression_format': CompressionFormat(self.document_compression_format).value if self.document_compression_format else None,
            'webhook': self.webhook,
            # include timeout so to_dict()/from_dict() round-trips losslessly
            'timeout': self.timeout,
            'raise_on_upstream_error': self.raise_on_upstream_error,
        }

    @staticmethod
    def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig':
        """Create an ExtractionConfig instance from a dictionary."""
        body = extraction_config_dict.get('body', None)
        content_type = extraction_config_dict.get('content_type', None)
        url = extraction_config_dict.get('url', None)
        charset = extraction_config_dict.get('charset', None)
        extraction_template = extraction_config_dict.get('extraction_template', None)
        extraction_ephemeral_template = extraction_config_dict.get('extraction_ephemeral_template', None)
        extraction_prompt = extraction_config_dict.get('extraction_prompt', None)
        extraction_model = extraction_config_dict.get('extraction_model', None)
        is_document_compressed = extraction_config_dict.get('is_document_compressed', None)

        document_compression_format = extraction_config_dict.get('document_compression_format', None)
        document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None

        webhook = extraction_config_dict.get('webhook', None)
        # restore timeout (previously dropped by the round trip)
        timeout = extraction_config_dict.get('timeout', None)
        raise_on_upstream_error = extraction_config_dict.get('raise_on_upstream_error', True)

        return ExtractionConfig(
            body=body,
            content_type=content_type,
            url=url,
            charset=charset,
            extraction_template=extraction_template,
            extraction_ephemeral_template=extraction_ephemeral_template,
            extraction_prompt=extraction_prompt,
            extraction_model=extraction_model,
            is_document_compressed=is_document_compressed,
            document_compression_format=document_compression_format,
            webhook=webhook,
            timeout=timeout,
            raise_on_upstream_error=raise_on_upstream_error
        )

Ancestors

Class variables

var body : str | bytes
var charset : str | None
var content_type : str
var document_compression_format : CompressionFormat | None
var ephemeral_template : Dict | None
var extraction_ephemeral_template : Dict | None
var extraction_model : str | None
var extraction_prompt : str | None
var extraction_template : str | None
var is_document_compressed : bool | None
var raise_on_upstream_error : bool
var template : str | None
var timeout : int | None
var url : str | None
var webhook : str | None

Static methods

def from_dict(extraction_config_dict: Dict) ‑> ExtractionConfig
Expand source code
@staticmethod
def from_dict(extraction_config_dict: Dict) -> 'ExtractionConfig':
    """Create an ExtractionConfig instance from a dictionary."""
    body = extraction_config_dict.get('body', None)
    content_type = extraction_config_dict.get('content_type', None)
    url = extraction_config_dict.get('url', None)
    charset = extraction_config_dict.get('charset', None)
    extraction_template = extraction_config_dict.get('extraction_template', None)
    extraction_ephemeral_template = extraction_config_dict.get('extraction_ephemeral_template', None)
    extraction_prompt = extraction_config_dict.get('extraction_prompt', None)
    extraction_model = extraction_config_dict.get('extraction_model', None)
    is_document_compressed = extraction_config_dict.get('is_document_compressed', None)

    document_compression_format = extraction_config_dict.get('document_compression_format', None)
    document_compression_format = CompressionFormat(document_compression_format) if document_compression_format else None
    
    webhook = extraction_config_dict.get('webhook', None)
    raise_on_upstream_error = extraction_config_dict.get('raise_on_upstream_error', True)

    return ExtractionConfig(
        body=body,
        content_type=content_type,
        url=url,
        charset=charset,
        extraction_template=extraction_template,
        extraction_ephemeral_template=extraction_ephemeral_template,
        extraction_prompt=extraction_prompt,
        extraction_model=extraction_model,
        is_document_compressed=is_document_compressed,
        document_compression_format=document_compression_format,
        webhook=webhook,
        raise_on_upstream_error=raise_on_upstream_error
    )

Create an ExtractionConfig instance from a dictionary.

Methods

def to_api_params(self, key: str) ‑> Dict
Expand source code
def to_api_params(self, key: str) -> Dict:
    params = {
        'key': self.key or key,
        'content_type': self.content_type
    }

    if self.url:
        params['url'] = self.url

    if self.charset:
        params['charset'] = self.charset

    if self.extraction_template and self.extraction_ephemeral_template:
        raise ExtractionConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

    if self.extraction_template:
        params['extraction_template'] = self.extraction_template

    if self.extraction_ephemeral_template:
        template_json = json.dumps(self.extraction_ephemeral_template)
        params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(template_json.encode('utf-8')).decode('utf-8')

    if self.extraction_prompt:
        params['extraction_prompt'] = quote_plus(self.extraction_prompt)

    if self.extraction_model:
        params['extraction_model'] = self.extraction_model

    if self.webhook:
        params['webhook_name'] = self.webhook

    if self.timeout is not None:
        params['timeout'] = self.timeout

    return params
def to_dict(self) ‑> Dict
Expand source code
def to_dict(self) -> Dict:
    """
    Export the ExtractionConfig instance to a plain dictionary.
    """

    if self.is_document_compressed is True:
            compression_foramt = CompressionFormat(self.document_compression_format) if self.document_compression_format else None

            if compression_foramt == CompressionFormat.GZIP:
                import gzip
                self.body = gzip.decompress(self.body)
                
            elif compression_foramt == CompressionFormat.ZSTD:
                import zstandard as zstd
                self.body = zstd.decompress(self.body)

            elif compression_foramt == CompressionFormat.DEFLATE:
                import zlib
                decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS)
                self.body = decompressor.decompress(self.body) + decompressor.flush()

            if isinstance(self.body, bytes):
                self.body = self.body.decode('utf-8')
                self.is_document_compressed = False

    return {
        'body': self.body,
        'content_type': self.content_type,
        'url': self.url,
        'charset': self.charset,
        'extraction_template': self.extraction_template,
        'extraction_ephemeral_template': self.extraction_ephemeral_template,
        'extraction_prompt': self.extraction_prompt,
        'extraction_model': self.extraction_model,
        'is_document_compressed': self.is_document_compressed,
        'document_compression_format': CompressionFormat(self.document_compression_format).value if self.document_compression_format else None,
        'webhook': self.webhook,
        'raise_on_upstream_error': self.raise_on_upstream_error,
    }

Export the ExtractionConfig instance to a plain dictionary.

class HarArchive (har_data: bytes)
Expand source code
class HarArchive:
    """Parser and accessor for HAR (HTTP Archive) format data"""

    def __init__(self, har_data: bytes):
        """
        Initialize HAR archive from bytes

        Args:
            har_data: HAR file content as bytes (JSON format, may be gzipped)
        """
        # Decompress if gzipped
        if isinstance(har_data, bytes):
            if har_data[:2] == b'\x1f\x8b':  # gzip magic number
                har_data = gzip.decompress(har_data)
            har_data = har_data.decode('utf-8')

        # Parse the special format: {"log":{...,"entries":[]}}{"entry1"}{"entry2"}...
        # First object is HAR log structure, subsequent objects are individual entries
        objects = []
        decoder = json.JSONDecoder()
        idx = 0
        while idx < len(har_data):
            har_data_stripped = har_data[idx:].lstrip()
            if not har_data_stripped:
                break
            try:
                obj, end_idx = decoder.raw_decode(har_data_stripped)
                objects.append(obj)
                idx += len(har_data[idx:]) - len(har_data_stripped) + end_idx
            except json.JSONDecodeError:
                break

        # First object should be the HAR log structure
        if objects and 'log' in objects[0]:
            self._data = objects[0]
            self._log = self._data.get('log', {})
            # Remaining objects are the entries
            self._entries = objects[1:] if len(objects) > 1 else []
        else:
            # Fallback: standard HAR format
            self._data = json.loads(har_data) if isinstance(har_data, str) else {}
            self._log = self._data.get('log', {})
            self._entries = self._log.get('entries', [])

    @property
    def version(self) -> str:
        """Get HAR version"""
        return self._log.get('version', '')

    @property
    def creator(self) -> Dict[str, Any]:
        """Get creator information"""
        return self._log.get('creator', {})

    @property
    def pages(self) -> List[Dict[str, Any]]:
        """Get pages list"""
        return self._log.get('pages', [])

    def get_entries(self) -> List[HarEntry]:
        """
        Get all entries as list

        Returns:
            List of HarEntry objects
        """
        return [HarEntry(entry) for entry in self._entries]

    def iter_entries(self) -> Iterator[HarEntry]:
        """
        Iterate through all HAR entries

        Yields:
            HarEntry objects
        """
        for entry in self._entries:
            yield HarEntry(entry)

    def get_urls(self) -> List[str]:
        """
        Get all URLs in the archive

        Returns:
            List of unique URLs
        """
        urls = []
        for entry in self._entries:
            url = entry.get('request', {}).get('url', '')
            if url and url not in urls:
                urls.append(url)
        return urls

    def find_by_url(self, url: str) -> Optional[HarEntry]:
        """
        Find entry by exact URL match

        Args:
            url: URL to search for

        Returns:
            First matching HarEntry or None
        """
        for entry in self.iter_entries():
            if entry.url == url:
                return entry
        return None

    def filter_by_status(self, status_code: int) -> List[HarEntry]:
        """
        Filter entries by status code

        Args:
            status_code: HTTP status code to filter by

        Returns:
            List of matching HarEntry objects
        """
        return [entry for entry in self.iter_entries()
                if entry.status_code == status_code]

    def filter_by_content_type(self, content_type: str) -> List[HarEntry]:
        """
        Filter entries by content type (substring match)

        Args:
            content_type: Content type to filter by (e.g., 'text/html')

        Returns:
            List of matching HarEntry objects
        """
        return [entry for entry in self.iter_entries()
                if content_type.lower() in entry.content_type.lower()]

    def __len__(self) -> int:
        """Get number of entries"""
        return len(self._entries)

    def __repr__(self) -> str:
        return f"<HarArchive {len(self._entries)} entries>"

Parser and accessor for HAR (HTTP Archive) format data

Initialize HAR archive from bytes

Args

har_data
HAR file content as bytes (JSON format, may be gzipped)

Instance variables

prop creator : Dict[str, Any]
Expand source code
@property
def creator(self) -> Dict[str, Any]:
    """Get creator information"""
    return self._log.get('creator', {})

Get creator information

prop pages : List[Dict[str, Any]]
Expand source code
@property
def pages(self) -> List[Dict[str, Any]]:
    """Get pages list"""
    return self._log.get('pages', [])

Get pages list

prop version : str
Expand source code
@property
def version(self) -> str:
    """Get HAR version"""
    return self._log.get('version', '')

Get HAR version

Methods

def filter_by_content_type(self, content_type: str) ‑> List[HarEntry]
Expand source code
def filter_by_content_type(self, content_type: str) -> List[HarEntry]:
    """
    Filter entries by content type (substring match)

    Args:
        content_type: Content type to filter by (e.g., 'text/html')

    Returns:
        List of matching HarEntry objects
    """
    return [entry for entry in self.iter_entries()
            if content_type.lower() in entry.content_type.lower()]

Filter entries by content type (substring match)

Args

content_type
Content type to filter by (e.g., 'text/html')

Returns

List of matching HarEntry objects

def filter_by_status(self, status_code: int) ‑> List[HarEntry]
Expand source code
def filter_by_status(self, status_code: int) -> List[HarEntry]:
    """
    Filter entries by status code

    Args:
        status_code: HTTP status code to filter by

    Returns:
        List of matching HarEntry objects
    """
    return [entry for entry in self.iter_entries()
            if entry.status_code == status_code]

Filter entries by status code

Args

status_code
HTTP status code to filter by

Returns

List of matching HarEntry objects

def find_by_url(self, url: str) ‑> HarEntry | None
Expand source code
def find_by_url(self, url: str) -> Optional[HarEntry]:
    """
    Find entry by exact URL match

    Args:
        url: URL to search for

    Returns:
        First matching HarEntry or None
    """
    for entry in self.iter_entries():
        if entry.url == url:
            return entry
    return None

Find entry by exact URL match

Args

url
URL to search for

Returns

First matching HarEntry or None

def get_entries(self) ‑> List[HarEntry]
Expand source code
def get_entries(self) -> List[HarEntry]:
    """
    Get all entries as list

    Returns:
        List of HarEntry objects
    """
    return [HarEntry(entry) for entry in self._entries]

Get all entries as list

Returns

List of HarEntry objects

def get_urls(self) ‑> List[str]
Expand source code
def get_urls(self) -> List[str]:
    """
    Get all URLs in the archive

    Returns:
        List of unique URLs
    """
    urls = []
    for entry in self._entries:
        url = entry.get('request', {}).get('url', '')
        if url and url not in urls:
            urls.append(url)
    return urls

Get all URLs in the archive

Returns

List of unique URLs

def iter_entries(self) ‑> Iterator[HarEntry]
Expand source code
def iter_entries(self) -> Iterator[HarEntry]:
    """
    Iterate through all HAR entries

    Yields:
        HarEntry objects
    """
    for entry in self._entries:
        yield HarEntry(entry)

Iterate through all HAR entries

Yields

HarEntry objects

class HarEntry (entry_data: Dict[str, Any])
Expand source code
class HarEntry:
    """One HTTP request/response pair taken from a HAR archive."""

    def __init__(self, entry_data: Dict[str, Any]):
        """
        Wrap a raw HAR entry dictionary.

        Args:
            entry_data: HAR entry dictionary
        """
        self._data = entry_data
        self._request = entry_data.get('request', {})
        self._response = entry_data.get('response', {})

    @property
    def url(self) -> str:
        """URL of the request."""
        return self._request.get('url', '')

    @property
    def method(self) -> str:
        """HTTP method of the request."""
        return self._request.get('method', 'GET')

    @property
    def status_code(self) -> int:
        """Response status code as an int; 0 when absent or unparseable."""
        raw_status = self._response.get('status') if self._response else None
        if raw_status is None:
            return 0
        # HAR data may carry the status as a string; coerce defensively.
        try:
            return int(raw_status)
        except (ValueError, TypeError):
            return 0

    @property
    def status_text(self) -> str:
        """Response status text."""
        return self._response.get('statusText', '')

    @property
    def request_headers(self) -> Dict[str, str]:
        """Request headers flattened into a name → value dict."""
        return {h['name']: h['value'] for h in self._request.get('headers', [])}

    @property
    def response_headers(self) -> Dict[str, str]:
        """Response headers flattened into a name → value dict."""
        return {h['name']: h['value'] for h in self._response.get('headers', [])}

    @property
    def content(self) -> bytes:
        """Response body as bytes (base64-decoded when flagged as such)."""
        payload = self._response.get('content', {})
        text = payload.get('text', '')

        if payload.get('encoding', '') == 'base64':
            import base64
            return base64.b64decode(text)

        # Plain text bodies are returned UTF-8 encoded.
        return text.encode('utf-8') if isinstance(text, str) else text

    @property
    def content_type(self) -> str:
        """MIME type of the response body."""
        return self._response.get('content', {}).get('mimeType', '')

    @property
    def content_size(self) -> int:
        """Declared size of the response body."""
        return self._response.get('content', {}).get('size', 0)

    @property
    def started_datetime(self) -> str:
        """Request start timestamp (ISO 8601 format)."""
        return self._data.get('startedDateTime', '')

    @property
    def time(self) -> float:
        """Total elapsed time in milliseconds."""
        return self._data.get('time', 0.0)

    @property
    def timings(self) -> Dict[str, float]:
        """Per-phase timing breakdown."""
        return self._data.get('timings', {})

    def __repr__(self) -> str:
        return f"<HarEntry {self.method} {self.url} [{self.status_code}]>"

Represents a single HAR entry (HTTP request/response pair)

Initialize from HAR entry dict

Args

entry_data
HAR entry dictionary

Instance variables

prop content : bytes
Expand source code
@property
def content(self) -> bytes:
    """Get response content as bytes"""
    content_data = self._response.get('content', {})
    text = content_data.get('text', '')

    # Handle base64 encoding if present
    encoding = content_data.get('encoding', '')
    if encoding == 'base64':
        import base64
        return base64.b64decode(text)

    # Return as UTF-8 bytes
    if isinstance(text, str):
        return text.encode('utf-8')
    return text

Get response content as bytes

prop content_size : int
Expand source code
@property
def content_size(self) -> int:
    """Get response content size"""
    return self._response.get('content', {}).get('size', 0)

Get response content size

prop content_type : str
Expand source code
@property
def content_type(self) -> str:
    """Get response content type"""
    return self._response.get('content', {}).get('mimeType', '')

Get response content type

prop method : str
Expand source code
@property
def method(self) -> str:
    """Get HTTP method"""
    return self._request.get('method', 'GET')

Get HTTP method

prop request_headers : Dict[str, str]
Expand source code
@property
def request_headers(self) -> Dict[str, str]:
    """Request headers flattened into a name -> value mapping.

    Later duplicates of the same header name overwrite earlier ones.
    """
    return {h['name']: h['value'] for h in self._request.get('headers', [])}

Get request headers as dict

prop response_headers : Dict[str, str]
Expand source code
@property
def response_headers(self) -> Dict[str, str]:
    """Response headers flattened into a name -> value mapping.

    Later duplicates of the same header name overwrite earlier ones.
    """
    return {h['name']: h['value'] for h in self._response.get('headers', [])}

Get response headers as dict

prop started_datetime : str
Expand source code
@property
def started_datetime(self) -> str:
    """ISO 8601 timestamp of when the request started ('' when absent)."""
    record = self._data
    return record.get('startedDateTime', '')

Get when request was started (ISO 8601 format)

prop status_code : int
Expand source code
@property
def status_code(self) -> int:
    """Response status code as int; 0 when the response or status is missing/malformed."""
    response = self._response
    if not response:
        return 0
    status = response.get('status')
    # HAR producers sometimes emit the status as a string (or omit it,
    # leaving None) — coerce defensively; failures fall back to 0.
    try:
        return int(status)
    except (ValueError, TypeError):
        return 0

Get response status code

prop status_text : str
Expand source code
@property
def status_text(self) -> str:
    """Response status reason phrase ('' when absent)."""
    response = self._response
    return response.get('statusText', '')

Get response status text

prop time : float
Expand source code
@property
def time(self) -> float:
    """Total elapsed time for the entry, in milliseconds (0.0 when absent)."""
    record = self._data
    return record.get('time', 0.0)

Get total elapsed time in milliseconds

prop timings : Dict[str, float]
Expand source code
@property
def timings(self) -> Dict[str, float]:
    """Per-phase timing breakdown for the entry (empty dict when absent)."""
    record = self._data
    return record.get('timings', {})

Get detailed timing information

prop url : str
Expand source code
@property
def url(self) -> str:
    """Request URL ('' when absent)."""
    request = self._request
    return request.get('url', '')

Get request URL

class HttpError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class HttpError(ScrapflyError):
    """HTTP-level error carrying the originating request and optional response."""

    def __init__(self, request: Request, response: Optional[Response] = None, **kwargs):
        self.request = request
        self.response = response
        super().__init__(**kwargs)

    def __str__(self) -> str:
        # Upstream failures report the target website's status, not the API's.
        if isinstance(self, UpstreamHttpError):
            scrape_result = self.api_response.scrape_result
            return f"Target website responded with {scrape_result['status_code']} - {scrape_result['reason']}"

        # Prefer the API's own error message when an API response is attached.
        if self.api_response is not None:
            return self.api_response.error_message

        description = f"{self.response.status_code} - {self.response.reason}"

        # Append the detailed message when one is available.
        if self.message:
            description += f" - {self.message}"

        return description

HTTP-level error raised for failed Scrapfly API calls; it carries the originating request and, when available, the response and API error details.

Ancestors

Subclasses

  • ApiHttpClientError
  • scrapfly.errors.ExtractionAPIError
  • scrapfly.errors.QuotaLimitReached
  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.ScreenshotAPIError
  • scrapfly.errors.TooManyConcurrentRequest
  • scrapfly.errors.UpstreamHttpError
class OperatingSystem (value, names=None, *, module=None, qualname=None, type=None, start=1)
Expand source code
class OperatingSystem(Enum):
    """Supported operating system identifiers (serialized as lowercase strings)."""

    LINUX = "linux"
    WINDOWS = "windows"
    MACOS = "macos"

Enumeration of supported operating system values (linux, windows, macos).

Ancestors

  • enum.Enum

Class variables

var LINUX
var MACOS
var WINDOWS
class ProxyPool (value, names=None, *, module=None, qualname=None, type=None, start=1)
Expand source code
class ProxyPool(Enum):
    """Available proxy pool identifiers."""

    DATACENTER = "datacenter"
    RESIDENTIAL = "residential"

Enumeration of available proxy pool types (datacenter, residential).

Ancestors

  • enum.Enum

Class variables

var DATACENTER
var RESIDENTIAL
class ResponseBodyHandler (use_brotli: bool = False, signing_secrets: Tuple[str] | None = None)
Expand source code
class ResponseBodyHandler:
    """Decode Scrapfly API/webhook bodies: decompress, verify, deserialize.

    Content negotiation prefers msgpack over JSON when the ``msgpack``
    package is importable. Compression capabilities are detected at
    construction time (brotli and zstd are optional extras).
    """

    # NOTE(review): these lists are class-level and mutated in __init__, so
    # capability detection is shared by every instance of this class.
    SUPPORTED_COMPRESSION = ['gzip', 'deflate']
    SUPPORTED_CONTENT_TYPES = ['application/msgpack', 'application/json']

    class JSONDateTimeDecoder(JSONDecoder):
        """JSONDecoder wired to post-process values through ``_date_parser``."""

        def __init__(self, *args, **kargs):
            JSONDecoder.__init__(self, *args, object_hook=_date_parser, **kargs)

    # Brotli underperforms at gzip-comparable levels and higher levels are
    # CPU-expensive, so brotli support is opt-in rather than default.
    def __init__(self, use_brotli: bool = False, signing_secrets: Optional[Tuple[str]] = None):
        """
        Args:
            use_brotli: advertise brotli support if a brotli package is installed.
            signing_secrets: hex-encoded HMAC secrets used to verify webhook payloads.
        """
        if use_brotli is True and 'br' not in self.SUPPORTED_COMPRESSION:
            try:
                try:
                    # Prefer the CFFI binding when available.
                    import brotlicffi as brotli
                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
                except ImportError:
                    import brotli
                    self.SUPPORTED_COMPRESSION.insert(0, 'br')
            except ImportError:
                pass

        # Advertise zstd only when urllib3 reports native support for it.
        try:
            from urllib3.response import HAS_ZSTD
            if HAS_ZSTD and 'zstd' not in self.SUPPORTED_COMPRESSION:
                self.SUPPORTED_COMPRESSION.append('zstd')
        except ImportError:
            pass

        self.content_encoding: str = ', '.join(self.SUPPORTED_COMPRESSION)
        self._signing_secret: Optional[Tuple[str]] = None

        if signing_secrets:
            _secrets = set()

            # Secrets arrive hex-encoded; store raw key bytes, deduplicated.
            for signing_secret in signing_secrets:
                _secrets.add(binascii.unhexlify(signing_secret))

            self._signing_secret = tuple(_secrets)

        try:  # automatically use msgpack if available https://msgpack.org/
            import msgpack
            self.accept = 'application/msgpack;charset=utf-8'
            self.content_type = 'application/msgpack;charset=utf-8'
            self.content_loader = partial(msgpack.loads, object_hook=_date_parser, strict_map_key=False)
        except ImportError:
            self.accept = 'application/json;charset=utf-8'
            self.content_type = 'application/json;charset=utf-8'
            self.content_loader = partial(loads, cls=self.JSONDateTimeDecoder)

    def support(self, headers: Dict) -> bool:
        """Return True when the response content-type is one this handler can decode."""
        if 'content-type' not in headers:
            return False

        for content_type in self.SUPPORTED_CONTENT_TYPES:
            if headers['content-type'].find(content_type) != -1:
                return True

        return False

    def verify(self, message: bytes, signature: str) -> bool:
        """Check ``signature`` (upper-case hex HMAC-SHA256) against every known secret.

        NOTE(review): assumes ``self._signing_secret`` is not None; callers
        guard for that (see ``read``).
        """
        for signing_secret in self._signing_secret:
            if hmac.new(signing_secret, message, hashlib.sha256).hexdigest().upper() == signature:
                return True

        return False

    def read(self, content: bytes, content_encoding: str, content_type: str, signature: Optional[str]) -> Dict:
        """Decompress ``content``, verify its signature, then deserialize it.

        Returns the decoded object for JSON/msgpack bodies; other content
        types are returned as the (decompressed) raw bytes.

        Raises:
            WebhookSignatureMissMatch: when secrets are configured, a
                signature is provided, and verification fails.
        """
        if content_encoding == 'gzip' or content_encoding == 'gz':
            import gzip
            content = gzip.decompress(content)
        elif content_encoding == 'deflate':
            import zlib
            content = zlib.decompress(content)
        elif content_encoding == 'brotli' or content_encoding == 'br':
            import brotli
            content = brotli.decompress(content)
        elif content_encoding == 'zstd':
            try:
                from compression import zstd as _zstd  # Python 3.14+
                content = _zstd.decompress(content)
            except ImportError:
                import zstandard
                content = zstandard.decompress(content)

        # Signature is computed over the decompressed payload.
        if self._signing_secret is not None and signature is not None:
            if not self.verify(content, signature):
                raise WebhookSignatureMissMatch()

        if content_type.startswith('application/json'):
            content = loads(content, cls=self.JSONDateTimeDecoder)
        elif content_type.startswith('application/msgpack'):
            import msgpack
            content = msgpack.loads(content, object_hook=_date_parser, strict_map_key=False)

        return content

    def __call__(self, content: bytes, content_type: str) -> Union[str, Dict]:
        """Deserialize ``content`` according to ``content_type``.

        Raises:
            EncoderError: when decoding fails; carries the payload (base64
                encoded when it is not valid UTF-8).
            Exception: for unsupported content types.
        """
        content_loader = None

        if content_type.find('application/json') != -1:
            content_loader = partial(loads, cls=self.JSONDateTimeDecoder)
        elif content_type.find('application/msgpack') != -1:
            import msgpack
            content_loader = partial(msgpack.loads, object_hook=_date_parser, strict_map_key=False)

        if content_loader is None:
            raise Exception('Unsupported content type')

        try:
            return content_loader(content)
        except Exception as e:
            try:
                raise EncoderError(content=content.decode('utf-8')) from e
            except UnicodeError:
                raise EncoderError(content=base64.b64encode(content).decode('utf-8')) from e

Class variables

var JSONDateTimeDecoder

Simple JSON https://json.org decoder

Performs the following translations in decoding by default:

Performs the following translations in decoding by default: JSON object → dict, array → list, string → str, number (int) → int, number (real) → float, true → True, false → False, null → None.

It also understands NaN, Infinity, and -Infinity as their corresponding float values, which is outside the JSON spec.

var SUPPORTED_COMPRESSION
var SUPPORTED_CONTENT_TYPES

Methods

def read(self,
content: bytes,
content_encoding: str,
content_type: str,
signature: str | None) ‑> Dict
Expand source code
def read(self, content: bytes, content_encoding: str, content_type: str, signature: Optional[str]) -> Dict:
    """Decompress, verify and deserialize a raw response body.

    Returns the decoded object for JSON/msgpack bodies; other content
    types come back as (decompressed) raw bytes.
    """
    # Step 1: undo transport compression according to the declared encoding.
    if content_encoding in ('gzip', 'gz'):
        import gzip
        content = gzip.decompress(content)
    elif content_encoding == 'deflate':
        import zlib
        content = zlib.decompress(content)
    elif content_encoding in ('brotli', 'br'):
        import brotli
        content = brotli.decompress(content)
    elif content_encoding == 'zstd':
        try:
            from compression import zstd as _zstd  # Python 3.14+
            content = _zstd.decompress(content)
        except ImportError:
            import zstandard
            content = zstandard.decompress(content)

    # Step 2: check the HMAC signature when both a secret and a signature exist.
    if self._signing_secret is not None and signature is not None:
        if not self.verify(content, signature):
            raise WebhookSignatureMissMatch()

    # Step 3: deserialize according to the declared content type.
    if content_type.startswith('application/json'):
        content = loads(content, cls=self.JSONDateTimeDecoder)
    elif content_type.startswith('application/msgpack'):
        import msgpack
        content = msgpack.loads(content, object_hook=_date_parser, strict_map_key=False)

    return content
def support(self, headers: Dict) ‑> bool
Expand source code
def support(self, headers: Dict) -> bool:
    """Return True when the headers declare a content type this handler can decode."""
    if 'content-type' not in headers:
        return False

    declared = headers['content-type']
    return any(declared.find(supported) != -1 for supported in self.SUPPORTED_CONTENT_TYPES)
def verify(self, message: bytes, signature: str) ‑> bool
Expand source code
def verify(self, message: bytes, signature: str) -> bool:
    """Return True when the HMAC-SHA256 of ``message`` under any known secret matches."""
    return any(
        hmac.new(secret, message, hashlib.sha256).hexdigest().upper() == signature
        for secret in self._signing_secret
    )
class ScrapeApiResponse (request: requests.models.Request,
response: requests.models.Response,
scrape_config: ScrapeConfig,
api_result: Dict | None = None,
large_object_handler: Callable | None = None)
Expand source code
class ScrapeApiResponse(ApiResponse):
    """Typed wrapper around one Web Scraping API call.

    Exposes the API envelope (``result``/``config``/``context``) plus
    convenience accessors for the upstream page: content, status, parser
    objects, and helpers to persist or convert the scraped result.
    """

    # configuration used for the scrape this response belongs to
    scrape_config:ScrapeConfig
    # optional callback used to resolve clob/blob content references
    large_object_handler:Callable

    def __init__(self, request: Request, response: Response, scrape_config: ScrapeConfig, api_result: Optional[Dict] = None, large_object_handler:Optional[Callable]=None):
        """
        Args:
            request: prepared HTTP request that was sent to the API.
            response: raw HTTP response received from the API.
            scrape_config: configuration used for this scrape.
            api_result: decoded API payload; synthesized from headers for HEAD calls.
            large_object_handler: callback invoked to fetch clob/blob content.
        """
        super().__init__(request, response)
        self.scrape_config = scrape_config
        self.large_object_handler = large_object_handler

        # HEAD responses carry no body, so a minimal envelope is rebuilt
        # from the HTTP status and headers.
        if self.scrape_config.method == 'HEAD':
            api_result = {
                'result': {
                    'request_headers': {},
                    'status': 'DONE',
                    'success': 200 <= self.response.status_code < 300,
                    'response_headers': self.response.headers,
                    'status_code': self.response.status_code,
                    'reason': self.response.reason,
                    'format': 'text',
                    'content': ''
                },
                'context': {},
                'config': self.scrape_config.__dict__
            }

            # Rejections are reported through X-Scrapfly-Reject-* headers.
            if 'X-Scrapfly-Reject-Code' in self.response.headers:
                api_result['result']['error'] = {
                    'code': self.response.headers['X-Scrapfly-Reject-Code'],
                    'http_code': int(self.response.headers['X-Scrapfly-Reject-Http-Code']),
                    'message': self.response.headers['X-Scrapfly-Reject-Description'],
                    'error_id': self.response.headers['X-Scrapfly-Reject-ID'],
                    'retryable': True if self.response.headers['X-Scrapfly-Reject-Retryable'] == 'yes' else False,
                    'doc_url': '',
                    'links': {}
                }

                if 'X-Scrapfly-Reject-Doc' in self.response.headers:
                    api_result['result']['error']['doc_url'] = self.response.headers['X-Scrapfly-Reject-Doc']
                    api_result['result']['error']['links']['Related Docs'] = self.response.headers['X-Scrapfly-Reject-Doc']

        # A plain-string payload means the API never answered with a proper
        # envelope — treat it as a gateway failure.
        if isinstance(api_result, str):
            raise HttpError(
                request=request,
                response=response,
                message='Bad gateway',
                code=502,
                http_status_code=502,
                is_retryable=True
            )

        self.result = self.handle_api_result(api_result=api_result)

    @property
    def scrape_result(self) -> Optional[Dict]:
        """The 'result' section of the API envelope (None when absent)."""
        return self.result.get('result', None)

    @property
    def config(self) -> Optional[Dict]:
        """Scrape configuration echoed back by the API (None when no result)."""
        if self.scrape_result is None:
            return None

        return self.result['config']

    @property
    def context(self) -> Optional[Dict]:
        """Context section of the API envelope (None when no result)."""
        if self.scrape_result is None:
            return None

        return self.result['context']

    @property
    def content(self) -> str:
        """Scraped page content ('' when there is no scrape result)."""
        if self.scrape_result is None:
            return ''

        return self.scrape_result['content']

    @property
    def success(self) -> bool:
        """
            Success means Scrapfly api reply correctly to the call, but the scrape can be unsuccessful if the upstream reply with error status code
        """
        return 200 <= self.response.status_code <= 299

    @property
    def scrape_success(self) -> bool:
        """Whether the upstream scrape itself succeeded (False when no result)."""
        scrape_result = self.scrape_result

        if not scrape_result:
            return False

        return self.scrape_result['success']

    @property
    def error(self) -> Optional[Dict]:
        """Error payload from the scrape result when the scrape failed, else None."""
        if self.scrape_result is None:
            return None

        if self.scrape_success is False:
            return self.scrape_result.get('error')

    @property
    def upstream_status_code(self) -> Optional[int]:
        """Status code returned by the target website, when available."""
        if self.scrape_result is None:
            return None

        if 'status_code' in self.scrape_result:
            return self.scrape_result['status_code']

        return None

    @cached_property
    def soup(self) -> 'BeautifulSoup':
        """BeautifulSoup document built from text content (requires scrapfly[parser]).

        NOTE(review): on ImportError this logs and implicitly returns None,
        unlike `selector` which re-raises — confirm this is intended.
        """
        if self.scrape_result['format'] != 'text':
            raise ContentError("Unable to cast into beautiful soup, the format of data is binary - must be text content")

        try:
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(self.content, "lxml")
            return soup
        except ImportError as e:
            logger.error('You must install scrapfly[parser] to enable this feature')

    @cached_property
    def selector(self) -> 'Selector':
        """Parsel selector built from text content (requires parsel or scrapy)."""
        if self.scrape_result['format'] != 'text':
            raise ContentError("Unable to cast into beautiful soup, the format of data is binary - must be text content")

        try:
            from parsel import Selector
            return Selector(text=self.content)
        except ImportError as e:
            logger.error('You must install parsel or scrapy package to enable this feature')
            raise e

    def handle_api_result(self, api_result: Dict) -> Optional[FrozenDict]:
        """Normalize a raw API payload in place and freeze it for read-only access."""
        # API-level errors are frozen as-is; there is nothing to normalize.
        if self._is_api_error(api_result=api_result) is True:
            return FrozenDict(api_result)

        try:
            # An empty header set may be serialized as a list; normalize to a dict.
            if isinstance(api_result['config']['headers'], list):
                api_result['config']['headers'] = {}
        except TypeError:
            logger.info(api_result)
            raise

        with suppress(KeyError):
            api_result['result']['request_headers'] = CaseInsensitiveDict(api_result['result']['request_headers'])
            api_result['result']['response_headers'] = CaseInsensitiveDict(api_result['result']['response_headers'])

        # Large or binary bodies are delivered indirectly and must be resolved.
        if self.large_object_handler is not None and api_result['result']['content']:
            content_format = api_result['result']['format']

            if content_format in ['clob', 'blob']:
                api_result['result']['content'], api_result['result']['format'] = self.large_object_handler(callback_url=api_result['result']['content'], format=content_format)
            elif content_format == 'binary':
                base64_payload = api_result['result']['content']

                if isinstance(base64_payload, bytes):
                    base64_payload = base64_payload.decode('utf-8')

                api_result['result']['content'] = BytesIO(b64decode(base64_payload))

        return FrozenDict(api_result)

    def _is_api_error(self, api_result: Dict) -> bool:
        """Return True when the payload represents an API-level error.

        NOTE(review): the HEAD branch checks 'X-Reject-Reason' while
        __init__ inspects 'X-Scrapfly-Reject-*' headers — confirm intended.
        """
        if self.scrape_config.method == 'HEAD':
            if 'X-Reject-Reason' in self.response.headers:
                return True
            return False

        if api_result is None:
            return True

        return 'error_id' in api_result

    def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
        """Rebuild a requests.Response mirroring the upstream website's reply.

        Returns None when there is no result or the API call was not a 200.
        """
        if _class != Response:
            raise RuntimeError('only Response from requests package is supported at the moment')

        if self.result is None:
            return None

        if self.response.status_code != 200:
            return None

        response = Response()
        response.status_code = self.scrape_result['status_code']
        response.reason = self.scrape_result['reason']

        # Body may have been decoded into several container types upstream.
        if self.scrape_result['content']:
            if isinstance(self.scrape_result['content'], BytesIO):
                response._content = self.scrape_result['content'].getvalue()
            elif isinstance(self.scrape_result['content'], bytes):
                response._content = self.scrape_result['content']
            elif isinstance(self.scrape_result['content'], str):
                response._content = self.scrape_result['content'].encode('utf-8')
        else:
            response._content = None

        response.headers.update(self.scrape_result['response_headers'])
        response.url = self.scrape_result['url']

        response.request = Request(
            method=self.config['method'],
            url=self.config['url'],
            headers=self.scrape_result['request_headers'],
            data=self.config['body'] if self.config['body'] else None
        )

        # Reconstruct the cookie jar from Set-Cookie headers.
        if 'set-cookie' in response.headers:
            for raw_cookie in response.headers['set-cookie']:
                for name, cookie in SimpleCookie(raw_cookie).items():
                    expires = cookie.get('expires')

                    if expires == '':
                        expires = None

                    # Expiry may be an HTTP date string; convert to a timestamp.
                    if expires:
                        try:
                            expires = parse(expires).timestamp()
                        except ValueError:
                            expires = None

                    if type(expires) == str:
                        if '.' in expires:
                            expires = float(expires)
                        else:
                            expires = int(expires)

                    response.cookies.set_cookie(Cookie(
                        version=cookie.get('version') if cookie.get('version') else None,
                        name=name,
                        value=cookie.value,
                        path=cookie.get('path', ''),
                        expires=expires,
                        comment=cookie.get('comment'),
                        domain=cookie.get('domain', ''),
                        secure=cookie.get('secure'),
                        port=None,
                        port_specified=False,
                        domain_specified=cookie.get('domain') is not None and cookie.get('domain') != '',
                        domain_initial_dot=bool(cookie.get('domain').startswith('.')) if cookie.get('domain') is not None else False,
                        path_specified=cookie.get('path') != '' and cookie.get('path') is not None,
                        discard=False,
                        comment_url=None,
                        rest={
                            'httponly': cookie.get('httponly'),
                            'samesite': cookie.get('samesite'),
                            'max-age': cookie.get('max-age')
                        }
                    ))

        return response

    def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None, content: Optional[Union[str, bytes]] = None):
        """Persist the scraped content (or ``content``) to ``file`` or to disk.

        When no file object is given, the target file name is derived from
        ``name``, the response content-type and the scraped URL.

        NOTE(review): extensions derived from ``name`` lack a leading dot
        while those derived from the MIME type include one — confirm intended.
        """
        file_content = content or self.scrape_result['content']
        file_path = None
        file_extension = None

        if name:
            name_parts = name.split('.')
            if len(name_parts) > 1:
                file_extension = name_parts[-1]

        if not file:
            if file_extension is None:
                try:
                    mime_type = self.scrape_result['response_headers']['content-type']
                except KeyError:
                    mime_type = 'application/octet-stream'

                # Strip any ";charset=..." suffix.
                if ';' in mime_type:
                    mime_type = mime_type.split(';')[0]

                file_extension = '.' + mime_type.split('/')[1]

            if not name:
                name = self.config['url'].split('/')[-1]

            if name.find(file_extension) == -1:
                name += file_extension

            file_path = path + '/' + name if path is not None else name

            # URL ending with '/' yields an empty name; build one from the URL.
            if file_path == file_extension:
                url = re.sub(r'(https|http)?://', '', self.config['url']).replace('/', '-')

                if url[-1] == '-':
                    url = url[:-1]

                url += file_extension

                file_path = url

            file = open(file_path, 'wb')

        if isinstance(file_content, str):
            file_content = BytesIO(file_content.encode('utf-8'))
        elif isinstance(file_content, bytes):
            file_content = BytesIO(file_content)

        file_content.seek(0)
        with file as f:
            shutil.copyfileobj(file_content, f, length=131072)

        logger.info('file %s created' % file_path)

    def raise_for_result(self, raise_on_upstream_error=True, error_class=ApiHttpClientError):
        """Raise a typed error when the API call or the scrape itself failed."""
        super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)
        if self.result['result']['status'] == 'DONE' and self.scrape_success is False:
            error = ErrorFactory.create(api_response=self)
            if error:
                # Upstream errors are only raised when the caller opted in.
                if isinstance(error, UpstreamHttpError):
                    if raise_on_upstream_error is True:
                        raise error
                else:
                    raise error

Ancestors

Class variables

var large_object_handler : Callable
var scrape_configScrapeConfig

Instance variables

prop config : Dict | None
Expand source code
@property
def config(self) -> Optional[Dict]:
    """Scrape configuration echoed back by the API (None when no result)."""
    return None if self.scrape_result is None else self.result['config']
prop content : str
Expand source code
@property
def content(self) -> str:
    """Scraped page content ('' when there is no scrape result)."""
    result = self.scrape_result
    if result is None:
        return ''
    return result['content']
prop context : Dict | None
Expand source code
@property
def context(self) -> Optional[Dict]:
    """Context section of the API envelope (None when no result)."""
    return None if self.scrape_result is None else self.result['context']
prop error : Dict | None
Expand source code
@property
def error(self) -> Optional[Dict]:
    """Error payload from the scrape result when the scrape failed, else None."""
    result = self.scrape_result
    if result is None:
        return None
    if self.scrape_success is False:
        return result.get('error')
    # implicit None when the scrape succeeded
prop scrape_result : Dict | None
Expand source code
@property
def scrape_result(self) -> Optional[Dict]:
    """The 'result' section of the API envelope (None when absent)."""
    return self.result.get('result')
prop scrape_success : bool
Expand source code
@property
def scrape_success(self) -> bool:
    """Whether the upstream scrape itself succeeded (False when no result)."""
    result = self.scrape_result
    if not result:
        return False
    return result['success']
var selector : Selector
Expand source code
@cached_property
def selector(self) -> 'Selector':
    """Parsel selector built from the text content (requires parsel or scrapy)."""
    if self.scrape_result['format'] != 'text':
        raise ContentError("Unable to cast into beautiful soup, the format of data is binary - must be text content")

    try:
        from parsel import Selector
    except ImportError as e:
        logger.error('You must install parsel or scrapy package to enable this feature')
        raise e
    return Selector(text=self.content)
var soup : BeautifulSoup
Expand source code
@cached_property
def soup(self) -> 'BeautifulSoup':
    """BeautifulSoup document built from the text content.

    Requires the optional parser extra (``scrapfly[parser]``).

    Raises:
        ContentError: when the scrape result is binary rather than text.
        ImportError: when bs4 is not installed. Previously the import error
            was swallowed and the property silently evaluated to None.
    """
    if self.scrape_result['format'] != 'text':
        raise ContentError("Unable to cast into beautiful soup, the format of data is binary - must be text content")

    try:
        from bs4 import BeautifulSoup
    except ImportError as e:
        logger.error('You must install scrapfly[parser] to enable this feature')
        # Fix: re-raise instead of implicitly returning None, mirroring
        # the sibling `selector` property's behavior.
        raise e
    return BeautifulSoup(self.content, "lxml")
prop success : bool
Expand source code
@property
def success(self) -> bool:
    """
        True when the Scrapfly API itself answered with a 2xx status; the
        upstream scrape may still have failed (see scrape_success).
    """
    status = self.response.status_code
    return 200 <= status <= 299

Success means Scrapfly api reply correctly to the call, but the scrape can be unsuccessful if the upstream reply with error status code

prop upstream_status_code : int | None
Expand source code
@property
def upstream_status_code(self) -> Optional[int]:
    """Status code returned by the target website, when available."""
    result = self.scrape_result
    if result is None:
        return None
    # dict.get covers both the present and absent key cases identically.
    return result.get('status_code')

Methods

def handle_api_result(self, api_result: Dict) ‑> FrozenDict | None
Expand source code
def handle_api_result(self, api_result: Dict) -> Optional[FrozenDict]:
    """Normalize a raw API payload in place and freeze it for read-only access."""
    # API-level errors are frozen as-is; there is nothing to normalize.
    if self._is_api_error(api_result=api_result) is True:
        return FrozenDict(api_result)

    try:
        # An empty header set may be serialized as a list; normalize to a dict.
        if isinstance(api_result['config']['headers'], list):
            api_result['config']['headers'] = {}
    except TypeError:
        logger.info(api_result)
        raise

    with suppress(KeyError):
        api_result['result']['request_headers'] = CaseInsensitiveDict(api_result['result']['request_headers'])
        api_result['result']['response_headers'] = CaseInsensitiveDict(api_result['result']['response_headers'])

    # Large or binary bodies are delivered indirectly and must be resolved.
    if self.large_object_handler is not None and api_result['result']['content']:
        content_format = api_result['result']['format']

        if content_format in ['clob', 'blob']:
            api_result['result']['content'], api_result['result']['format'] = self.large_object_handler(callback_url=api_result['result']['content'], format=content_format)
        elif content_format == 'binary':
            base64_payload = api_result['result']['content']

            if isinstance(base64_payload, bytes):
                base64_payload = base64_payload.decode('utf-8')

            api_result['result']['content'] = BytesIO(b64decode(base64_payload))

    return FrozenDict(api_result)
def raise_for_result(self,
raise_on_upstream_error=True,
error_class=scrapfly.errors.ApiHttpClientError)
Expand source code
def raise_for_result(self, raise_on_upstream_error=True, error_class=ApiHttpClientError):
    """Raise a typed error when the API call or the scrape itself failed."""
    super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)

    # Only a completed-but-unsuccessful scrape needs further inspection.
    if self.result['result']['status'] != 'DONE' or self.scrape_success is not False:
        return

    error = ErrorFactory.create(api_response=self)
    if not error:
        return

    # Upstream (target website) errors are only raised when the caller opted in.
    if not isinstance(error, UpstreamHttpError) or raise_on_upstream_error is True:
        raise error
def sink(self,
path: str | None = None,
name: str | None = None,
file:  | _io.BytesIO | None = None,
content: str | bytes | None = None)
Expand source code
def sink(self, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None, content: Optional[Union[str, bytes]] = None):
    """Persist the scraped content (or ``content``) to ``file`` or to disk.

    When no file object is given, the target file name is derived from
    ``name``, the response content-type and the scraped URL.

    NOTE(review): extensions derived from ``name`` lack a leading dot while
    those derived from the MIME type include one — confirm intended.
    """
    file_content = content or self.scrape_result['content']
    file_path = None
    file_extension = None

    if name:
        name_parts = name.split('.')
        if len(name_parts) > 1:
            file_extension = name_parts[-1]

    if not file:
        if file_extension is None:
            try:
                mime_type = self.scrape_result['response_headers']['content-type']
            except KeyError:
                mime_type = 'application/octet-stream'

            # Strip any ";charset=..." suffix.
            if ';' in mime_type:
                mime_type = mime_type.split(';')[0]

            file_extension = '.' + mime_type.split('/')[1]

        if not name:
            name = self.config['url'].split('/')[-1]

        if name.find(file_extension) == -1:
            name += file_extension

        file_path = path + '/' + name if path is not None else name

        # URL ending with '/' yields an empty name; build one from the URL.
        if file_path == file_extension:
            url = re.sub(r'(https|http)?://', '', self.config['url']).replace('/', '-')

            if url[-1] == '-':
                url = url[:-1]

            url += file_extension

            file_path = url

        file = open(file_path, 'wb')

    if isinstance(file_content, str):
        file_content = BytesIO(file_content.encode('utf-8'))
    elif isinstance(file_content, bytes):
        file_content = BytesIO(file_content)

    file_content.seek(0)
    with file as f:
        shutil.copyfileobj(file_content, f, length=131072)

    logger.info('file %s created' % file_path)
def upstream_result_into_response(self) ‑> requests.models.Response | None
Expand source code
def upstream_result_into_response(self, _class=Response) -> Optional[Response]:
    """
    Convert the upstream scrape result into a ``requests.Response`` object,
    rebuilding status line, body, headers and cookie jar.

    :param _class: response class to build; only ``requests.Response`` is supported
    :return: the rebuilt response, or ``None`` when there is no result or the
             Scrapfly API call itself did not return HTTP 200
    :raises RuntimeError: when ``_class`` is not ``requests.Response``
    """
    if _class != Response:
        raise RuntimeError('only Response from requests package is supported at the moment')

    if self.result is None:
        return None

    # This is the Scrapfly API status, not the upstream website status.
    if self.response.status_code != 200:
        return None

    response = Response()
    # Mirror the upstream website's status line.
    response.status_code = self.scrape_result['status_code']
    response.reason = self.scrape_result['reason']

    # Normalise the body to bytes regardless of how the result stores it.
    if self.scrape_result['content']:
        if isinstance(self.scrape_result['content'], BytesIO):
            response._content = self.scrape_result['content'].getvalue()
        elif isinstance(self.scrape_result['content'], bytes):
            response._content = self.scrape_result['content']
        elif isinstance(self.scrape_result['content'], str):
            response._content = self.scrape_result['content'].encode('utf-8')
    else:
        response._content = None

    response.headers.update(self.scrape_result['response_headers'])
    response.url = self.scrape_result['url']

    # Attach an (unprepared) request describing the original upstream call.
    response.request = Request(
        method=self.config['method'],
        url=self.config['url'],
        headers=self.scrape_result['request_headers'],
        data=self.config['body'] if self.config['body'] else None
    )

    if 'set-cookie' in response.headers:
        # NOTE(review): assumes the 'set-cookie' header value is a list of raw
        # header strings — confirm upstream response-header format.
        for raw_cookie in response.headers['set-cookie']:
            for name, cookie in SimpleCookie(raw_cookie).items():
                expires = cookie.get('expires')

                # Empty string means "no expiry" in the parsed morsel.
                if expires == '':
                    expires = None

                # Convert an HTTP date into a unix timestamp when possible.
                if expires:
                    try:
                        expires = parse(expires).timestamp()
                    except ValueError:
                        expires = None

                # Defensive coercion in case expires is still a numeric string.
                if type(expires) == str:
                    if '.' in expires:
                        expires = float(expires)
                    else:
                        expires = int(expires)

                response.cookies.set_cookie(Cookie(
                    version=cookie.get('version') if cookie.get('version') else None,
                    name=name,
                    value=cookie.value,
                    path=cookie.get('path', ''),
                    expires=expires,
                    comment=cookie.get('comment'),
                    domain=cookie.get('domain', ''),
                    secure=cookie.get('secure'),
                    port=None,
                    port_specified=False,
                    domain_specified=cookie.get('domain') is not None and cookie.get('domain') != '',
                    domain_initial_dot=bool(cookie.get('domain').startswith('.')) if cookie.get('domain') is not None else False,
                    path_specified=cookie.get('path') != '' and cookie.get('path') is not None,
                    discard=False,
                    comment_url=None,
                    rest={
                        'httponly': cookie.get('httponly'),
                        'samesite': cookie.get('samesite'),
                        'max-age': cookie.get('max-age')
                    }
                ))

    return response

Inherited members

class ScrapeConfig (url: str,
retry: bool = True,
method: str = 'GET',
country: str | None = None,
render_js: bool = False,
cache: bool = False,
cache_clear: bool = False,
ssl: bool = False,
dns: bool = False,
asp: bool = False,
debug: bool = False,
raise_on_upstream_error: bool = True,
cache_ttl: int | None = None,
proxy_pool: str | None = None,
session: str | None = None,
tags: List[str] | Set[str] | None = None,
format: Format | None = None,
format_options: List[FormatOption] | None = None,
extraction_template: str | None = None,
extraction_ephemeral_template: Dict | None = None,
extraction_prompt: str | None = None,
extraction_model: str | None = None,
correlation_id: str | None = None,
cookies: requests.structures.CaseInsensitiveDict | None = None,
body: str | None = None,
data: Dict | None = None,
headers: requests.structures.CaseInsensitiveDict | Dict[str, str] | None = None,
js: str = None,
rendering_wait: int = None,
rendering_stage: Literal['complete', 'domcontentloaded'] = 'complete',
wait_for_selector: str | None = None,
screenshots: Dict | None = None,
screenshot_flags: List[ScreenshotFlag] | None = None,
session_sticky_proxy: bool | None = None,
webhook: str | None = None,
timeout: int | None = None,
js_scenario: List | None = None,
extract: Dict | None = None,
os: str | None = None,
lang: List[str] | None = None,
auto_scroll: bool | None = None,
cost_budget: int | None = None,
browser_brand: str | None = None,
geolocation: str | None = None,
proxified_response: bool | None = None)
Expand source code
class ScrapeConfig(BaseApiConfig):
    """
    Configuration for a single Scrapfly scrape API request.

    Collects every per-request option (proxying, sessions, browser rendering,
    caching, extraction, screenshots, ...) and serializes them into the API
    query-string via :meth:`to_api_params`.
    """

    PUBLIC_DATACENTER_POOL = 'public_datacenter_pool'
    PUBLIC_RESIDENTIAL_POOL = 'public_residential_pool'

    url: str
    retry: bool = True
    method: str = 'GET'
    country: Optional[str] = None
    render_js: bool = False
    cache: bool = False
    cache_clear: bool = False
    ssl: bool = False
    dns: bool = False
    asp: bool = False
    debug: bool = False
    raise_on_upstream_error: bool = True
    cache_ttl: Optional[int] = None
    proxy_pool: Optional[str] = None
    session: Optional[str] = None
    tags: Optional[List[str]] = None
    # Fixed: these defaults previously ended with a stray trailing comma,
    # turning the class attributes into one-element tuples instead of None.
    format: Optional[Format] = None  # raw(unchanged)
    format_options: Optional[List[FormatOption]] = None
    extraction_template: Optional[str] = None  # a saved template name
    extraction_ephemeral_template: Optional[Dict] = None  # ephemerally declared json template
    extraction_prompt: Optional[str] = None
    extraction_model: Optional[str] = None
    correlation_id: Optional[str] = None
    cookies: Optional[CaseInsensitiveDict] = None
    body: Optional[str] = None
    data: Optional[Dict] = None
    headers: Optional[CaseInsensitiveDict] = None
    js: Optional[str] = None
    rendering_wait: Optional[int] = None
    rendering_stage: Literal["complete", "domcontentloaded"] = "complete"
    wait_for_selector: Optional[str] = None
    session_sticky_proxy: bool = True
    screenshots: Optional[Dict] = None
    screenshot_flags: Optional[List[ScreenshotFlag]] = None
    webhook: Optional[str] = None
    timeout: Optional[int] = None  # in milliseconds
    js_scenario: Optional[List] = None
    extract: Optional[Dict] = None
    lang: Optional[List[str]] = None
    os: Optional[str] = None
    auto_scroll: Optional[bool] = None
    cost_budget: Optional[int] = None
    browser_brand: Optional[str] = None
    geolocation: Optional[str] = None
    proxified_response: Optional[bool] = None

    def __init__(
        self,
        url: str,
        retry: bool = True,
        method: str = 'GET',
        country: Optional[str] = None,
        render_js: bool = False,
        cache: bool = False,
        cache_clear: bool = False,
        ssl: bool = False,
        dns: bool = False,
        asp: bool = False,
        debug: bool = False,
        raise_on_upstream_error: bool = True,
        cache_ttl: Optional[int] = None,
        proxy_pool: Optional[str] = None,
        session: Optional[str] = None,
        tags: Optional[Union[List[str], Set[str]]] = None,
        format: Optional[Format] = None,  # raw(unchanged)
        format_options: Optional[List[FormatOption]] = None,  # raw(unchanged)
        extraction_template: Optional[str] = None,  # a saved template name
        extraction_ephemeral_template: Optional[Dict] = None,  # ephemerally declared json template
        extraction_prompt: Optional[str] = None,
        extraction_model: Optional[str] = None,
        correlation_id: Optional[str] = None,
        cookies: Optional[CaseInsensitiveDict] = None,
        body: Optional[str] = None,
        data: Optional[Dict] = None,
        headers: Optional[Union[CaseInsensitiveDict, Dict[str, str]]] = None,
        js: Optional[str] = None,
        rendering_wait: Optional[int] = None,
        rendering_stage: Literal["complete", "domcontentloaded"] = "complete",
        wait_for_selector: Optional[str] = None,
        screenshots: Optional[Dict] = None,
        screenshot_flags: Optional[List[ScreenshotFlag]] = None,
        session_sticky_proxy: Optional[bool] = None,
        webhook: Optional[str] = None,
        timeout: Optional[int] = None,  # in milliseconds
        js_scenario: Optional[List] = None,
        extract: Optional[Dict] = None,
        os: Optional[str] = None,
        lang: Optional[List[str]] = None,
        auto_scroll: Optional[bool] = None,
        cost_budget: Optional[int] = None,
        browser_brand: Optional[str] = None,
        geolocation: Optional[str] = None,
        proxified_response: Optional[bool] = None
    ):
        """
        Build a scrape configuration for ``url``.

        Cookies are folded into the ``cookie`` header; ``data`` is encoded
        into ``body`` for POST/PUT/PATCH according to the content-type header.

        :raises ScrapeConfigError: when both ``body`` and ``data`` are given,
            or when ``data`` cannot be encoded for the declared content-type.
        """
        assert(type(url) is str)

        # Tags are kept as a set to de-duplicate.
        if isinstance(tags, List):
            tags = set(tags)

        cookies = cookies or {}
        headers = headers or {}

        self.cookies = CaseInsensitiveDict(cookies)
        self.headers = CaseInsensitiveDict(headers)
        self.url = url
        self.retry = retry
        self.method = method
        self.country = country
        self.session_sticky_proxy = session_sticky_proxy
        self.render_js = render_js
        self.cache = cache
        self.cache_clear = cache_clear
        self.asp = asp
        self.webhook = webhook
        self.session = session
        self.debug = debug
        self.cache_ttl = cache_ttl
        self.proxy_pool = proxy_pool
        self.tags = tags or set()
        self.format = format
        self.format_options = format_options
        self.extraction_template = extraction_template
        self.extraction_ephemeral_template = extraction_ephemeral_template
        self.extraction_prompt = extraction_prompt
        self.extraction_model = extraction_model
        self.correlation_id = correlation_id
        self.wait_for_selector = wait_for_selector
        self.body = body
        self.data = data
        self.js = js
        self.rendering_wait = rendering_wait
        self.rendering_stage = rendering_stage
        self.raise_on_upstream_error = raise_on_upstream_error
        self.screenshots = screenshots
        self.screenshot_flags = screenshot_flags
        self.key = None  # per-request API key override used by to_api_params
        self.dns = dns
        self.ssl = ssl
        self.js_scenario = js_scenario
        self.timeout = timeout
        self.extract = extract
        self.lang = lang
        self.os = os
        self.auto_scroll = auto_scroll
        self.cost_budget = cost_budget
        self.browser_brand = browser_brand
        self.geolocation = geolocation
        self.proxified_response = proxified_response

        # Fold the cookies dict into the 'cookie' request header.
        if cookies:
            _cookies = []

            for name, value in cookies.items():
                _cookies.append(name + '=' + value)

            if 'cookie' in self.headers:
                if self.headers['cookie'][-1] != ';':
                    self.headers['cookie'] += ';'
            else:
                self.headers['cookie'] = ''

            self.headers['cookie'] += '; '.join(_cookies)

        if self.body and self.data:
            raise ScrapeConfigError('You cannot pass both parameters body and data. You must choose')

        # Encode `data` into `body` according to the declared content-type.
        if method in ['POST', 'PUT', 'PATCH']:
            if self.body is None and self.data is not None:
                if 'content-type' not in self.headers:
                    self.headers['content-type'] = 'application/x-www-form-urlencoded'
                    self.body = urlencode(data)
                else:
                    if self.headers['content-type'].find('application/json') != -1:
                        self.body = json.dumps(data)
                    elif self.headers['content-type'].find('application/x-www-form-urlencoded') != -1:
                        self.body = urlencode(data)
                    else:
                        raise ScrapeConfigError('Content-Type "%s" not supported, use body parameter to pass pre encoded body according to your content type' % self.headers['content-type'])
            elif self.body is None and self.data is None:
                self.headers['content-type'] = 'text/plain'

    def to_api_params(self, key: str) -> Dict:
        """
        Serialize this configuration into Scrapfly API query parameters.

        :param key: API key used unless a per-request ``self.key`` is set
        :return: flat dict of query-string parameters
        :raises ScrapeConfigError: when both extraction_template and
            extraction_ephemeral_template are set
        """
        params = {
            'key': self.key or key,
            'url': self.url
        }

        if self.country is not None:
            params['country'] = self.country

        # Headers are passed as indexed params: headers[<name>]=<value>.
        for name, value in self.headers.items():
            params['headers[%s]' % name] = value

        if self.webhook is not None:
            params['webhook_name'] = self.webhook

        if self.timeout is not None:
            params['timeout'] = self.timeout

        # Structured payloads are base64-encoded to survive the query string.
        if self.extract is not None:
            params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

        if self.cost_budget is not None:
            params['cost_budget'] = self.cost_budget

        if self.proxified_response is not None:
            params['proxified_response'] = self._bool_to_http(self.proxified_response)

        if self.render_js is True:
            params['render_js'] = self._bool_to_http(self.render_js)

            if self.wait_for_selector is not None:
                params['wait_for_selector'] = self.wait_for_selector

            if self.js:
                params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

            if self.js_scenario:
                params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

            if self.rendering_wait:
                params['rendering_wait'] = self.rendering_wait

            if self.rendering_stage:
                params['rendering_stage'] = self.rendering_stage

            if self.screenshots is not None:
                for name, element in self.screenshots.items():
                    params['screenshots[%s]' % name] = element

            # Normalize flags to enum members, then send as a CSV string.
            # (A previously unreachable `else: if self.screenshot_flags is not
            # None` warning branch was removed here.)
            if self.screenshot_flags is not None:
                self.screenshot_flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
                params["screenshot_flags"] = ",".join(flag.value for flag in self.screenshot_flags)

            if self.auto_scroll is True:
                params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
        else:
            # Browser-only options are dropped (with a warning) when render_js is off.
            if self.wait_for_selector is not None:
                logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

            if self.screenshots:
                logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

            if self.js_scenario:
                logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

            if self.js:
                logging.warning('Params "js" is ignored. Works only if render_js is enabled')

            if self.rendering_wait:
                logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

        if self.asp is True:
            params['asp'] = self._bool_to_http(self.asp)

        if self.retry is False:
            params['retry'] = self._bool_to_http(self.retry)

        if self.cache is True:
            params['cache'] = self._bool_to_http(self.cache)

            if self.cache_clear is True:
                params['cache_clear'] = self._bool_to_http(self.cache_clear)

            if self.cache_ttl is not None:
                params['cache_ttl'] = self.cache_ttl
        else:
            if self.cache_clear is True:
                logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

            if self.cache_ttl is not None:
                logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

        if self.dns is True:
            params['dns'] = self._bool_to_http(self.dns)

        if self.ssl is True:
            params['ssl'] = self._bool_to_http(self.ssl)

        if self.tags:
            params['tags'] = ','.join(self.tags)

        # Format options are appended to the format value: "<format>:<opt,opt>".
        if self.format:
            params['format'] = Format(self.format).value
            if self.format_options:
                params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

        if self.extraction_template and self.extraction_ephemeral_template:
            raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

        if self.extraction_template:
            params['extraction_template'] = self.extraction_template

        if self.extraction_ephemeral_template:
            self.extraction_ephemeral_template = json.dumps(self.extraction_ephemeral_template)
            params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(self.extraction_ephemeral_template.encode('utf-8')).decode('utf-8')

        if self.extraction_prompt:
            params['extraction_prompt'] = quote_plus(self.extraction_prompt)

        if self.extraction_model:
            params['extraction_model'] = self.extraction_model

        if self.correlation_id:
            params['correlation_id'] = self.correlation_id

        if self.session:
            params['session'] = self.session

            # Only sent when explicitly enabled by the caller.
            if self.session_sticky_proxy is True:
                params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
        else:
            if self.session_sticky_proxy:
                logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

        if self.debug is True:
            params['debug'] = self._bool_to_http(self.debug)

        if self.proxy_pool is not None:
            params['proxy_pool'] = self.proxy_pool

        if self.lang is not None:
            params['lang'] = ','.join(self.lang)

        if self.os is not None:
            params['os'] = self.os

        if self.browser_brand is not None:
            params['browser_brand'] = self.browser_brand
        if self.geolocation is not None:
            params['geolocation'] = self.geolocation

        return params

    @staticmethod
    def from_exported_config(config: str) -> 'ScrapeConfig':
        """
        Rebuild a ScrapeConfig from a base64-encoded msgpack export.

        :raises ImportError: when the optional msgpack dependency is missing
        """
        try:
            from msgpack import loads as msgpack_loads
        except ImportError as e:
            print('You must install msgpack package - run: pip install "scrapfly-sdk[seepdup] or pip install msgpack')
            raise

        data = msgpack_loads(base64.b64decode(config))

        # Multi-valued headers are flattened into a single '; '-joined string.
        headers = {}

        for name, value in data['headers'].items():
            if isinstance(value, Iterable):
                headers[name] = '; '.join(value)
            else:
                headers[name] = value

        return ScrapeConfig(
            url=data['url'],
            retry=data['retry'],
            headers=headers,
            session=data['session'],
            session_sticky_proxy=data['session_sticky_proxy'],
            cache=data['cache'],
            cache_ttl=data['cache_ttl'],
            cache_clear=data['cache_clear'],
            render_js=data['render_js'],
            method=data['method'],
            asp=data['asp'],
            body=data['body'],
            ssl=data['ssl'],
            dns=data['dns'],
            country=data['country'],
            debug=data['debug'],
            correlation_id=data['correlation_id'],
            tags=data['tags'],
            format=data['format'],
            js=data['js'],
            rendering_wait=data['rendering_wait'],
            screenshots=data['screenshots'] or {},
            screenshot_flags=data['screenshot_flags'],
            proxy_pool=data['proxy_pool'],
            auto_scroll=data['auto_scroll'],
            cost_budget=data['cost_budget']
        )

    def to_dict(self) -> Dict:
        """
        Export the ScrapeConfig instance to a plain dictionary.
        Useful for JSON-serialization or other external storage.
        """

        return {
            'url': self.url,
            'retry': self.retry,
            'method': self.method,
            'country': self.country,
            'render_js': self.render_js,
            'cache': self.cache,
            'cache_clear': self.cache_clear,
            'ssl': self.ssl,
            'dns': self.dns,
            'asp': self.asp,
            'debug': self.debug,
            'raise_on_upstream_error': self.raise_on_upstream_error,
            'cache_ttl': self.cache_ttl,
            'proxy_pool': self.proxy_pool,
            'session': self.session,
            'tags': list(self.tags),
            # Enums are exported as their raw string values.
            'format': Format(self.format).value if self.format else None,
            'format_options': [FormatOption(option).value for option in self.format_options] if self.format_options else None,
            'extraction_template': self.extraction_template,
            'extraction_ephemeral_template': self.extraction_ephemeral_template,
            'extraction_prompt': self.extraction_prompt,
            'extraction_model': self.extraction_model,
            'correlation_id': self.correlation_id,
            'cookies': CaseInsensitiveDict(self.cookies),
            'body': self.body,
            # `data` was consumed into `body` at construction time, so it is
            # suppressed whenever a body exists.
            'data': None if self.body else self.data,
            'headers': CaseInsensitiveDict(self.headers),
            'js': self.js,
            'rendering_wait': self.rendering_wait,
            'wait_for_selector': self.wait_for_selector,
            'session_sticky_proxy': self.session_sticky_proxy,
            'screenshots': self.screenshots,
            'screenshot_flags': [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None,
            'webhook': self.webhook,
            'timeout': self.timeout,
            'js_scenario': self.js_scenario,
            'extract': self.extract,
            'lang': self.lang,
            'os': self.os,
            'auto_scroll': self.auto_scroll,
            'cost_budget': self.cost_budget,
            'browser_brand': self.browser_brand,
        }

    @staticmethod
    def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
        """Create a ScrapeConfig instance from a dictionary (inverse of :meth:`to_dict`)."""
        url = scrape_config_dict.get('url', None)
        retry = scrape_config_dict.get('retry', False)
        method = scrape_config_dict.get('method', 'GET')
        country = scrape_config_dict.get('country', None)
        render_js = scrape_config_dict.get('render_js', False)
        cache = scrape_config_dict.get('cache', False)
        cache_clear = scrape_config_dict.get('cache_clear', False)
        ssl = scrape_config_dict.get('ssl', False)
        dns = scrape_config_dict.get('dns', False)
        asp = scrape_config_dict.get('asp', False)
        debug = scrape_config_dict.get('debug', False)
        raise_on_upstream_error = scrape_config_dict.get('raise_on_upstream_error', True)
        cache_ttl = scrape_config_dict.get('cache_ttl', None)
        proxy_pool = scrape_config_dict.get('proxy_pool', None)
        session = scrape_config_dict.get('session', None)
        tags = scrape_config_dict.get('tags', [])

        # Raw string values are revived into enum members.
        format = scrape_config_dict.get('format', None)
        format = Format(format) if format else None

        format_options = scrape_config_dict.get('format_options', None)
        format_options = [FormatOption(option) for option in format_options] if format_options else None

        extraction_template = scrape_config_dict.get('extraction_template', None)
        extraction_ephemeral_template = scrape_config_dict.get('extraction_ephemeral_template', None)
        extraction_prompt = scrape_config_dict.get('extraction_prompt', None)
        extraction_model = scrape_config_dict.get('extraction_model', None)
        correlation_id = scrape_config_dict.get('correlation_id', None)
        cookies = scrape_config_dict.get('cookies', {})
        body = scrape_config_dict.get('body', None)
        data = scrape_config_dict.get('data', None)
        headers = scrape_config_dict.get('headers', {})
        js = scrape_config_dict.get('js', None)
        rendering_wait = scrape_config_dict.get('rendering_wait', None)
        wait_for_selector = scrape_config_dict.get('wait_for_selector', None)
        screenshots = scrape_config_dict.get('screenshots', [])

        screenshot_flags = scrape_config_dict.get('screenshot_flags', [])
        screenshot_flags = [ScreenshotFlag(flag) for flag in screenshot_flags] if screenshot_flags else None

        session_sticky_proxy = scrape_config_dict.get('session_sticky_proxy', False)
        webhook = scrape_config_dict.get('webhook', None)
        timeout = scrape_config_dict.get('timeout', None)
        js_scenario = scrape_config_dict.get('js_scenario', None)
        extract = scrape_config_dict.get('extract', None)
        os = scrape_config_dict.get('os', None)
        lang = scrape_config_dict.get('lang', None)
        auto_scroll = scrape_config_dict.get('auto_scroll', None)
        cost_budget = scrape_config_dict.get('cost_budget', None)
        browser_brand = scrape_config_dict.get('browser_brand', None)

        return ScrapeConfig(
            url=url,
            retry=retry,
            method=method,
            country=country,
            render_js=render_js,
            cache=cache,
            cache_clear=cache_clear,
            ssl=ssl,
            dns=dns,
            asp=asp,
            debug=debug,
            raise_on_upstream_error=raise_on_upstream_error,
            cache_ttl=cache_ttl,
            proxy_pool=proxy_pool,
            session=session,
            tags=tags,
            format=format,
            format_options=format_options,
            extraction_template=extraction_template,
            extraction_ephemeral_template=extraction_ephemeral_template,
            extraction_prompt=extraction_prompt,
            extraction_model=extraction_model,
            correlation_id=correlation_id,
            cookies=cookies,
            body=body,
            data=data,
            headers=headers,
            js=js,
            rendering_wait=rendering_wait,
            wait_for_selector=wait_for_selector,
            screenshots=screenshots,
            screenshot_flags=screenshot_flags,
            session_sticky_proxy=session_sticky_proxy,
            webhook=webhook,
            timeout=timeout,
            js_scenario=js_scenario,
            extract=extract,
            os=os,
            lang=lang,
            auto_scroll=auto_scroll,
            cost_budget=cost_budget,
            browser_brand=browser_brand,
        )

Ancestors

Class variables

var PUBLIC_DATACENTER_POOL
var PUBLIC_RESIDENTIAL_POOL
var asp : bool
var auto_scroll : bool | None
var body : str | None
var browser_brand : str | None
var cache : bool
var cache_clear : bool
var cache_ttl : int | None
var cookies : requests.structures.CaseInsensitiveDict | None
var correlation_id : str | None
var cost_budget : int | None
var country : str | None
var data : Dict | None
var debug : bool
var dns : bool
var extract : Dict
var extraction_ephemeral_template : Dict | None
var extraction_model : str | None
var extraction_prompt : str | None
var extraction_template : str | None
var format : Format | None
var format_options : List[FormatOption] | None
var geolocation : str | None
var headers : requests.structures.CaseInsensitiveDict | None
var js : str
var js_scenario : Dict
var lang : List[str] | None
var method : str
var os : str | None
var proxified_response : bool | None
var proxy_pool : str | None
var raise_on_upstream_error : bool
var render_js : bool
var rendering_stage : Literal['complete', 'domcontentloaded']
var rendering_wait : int
var retry : bool
var screenshot_flags : List[ScreenshotFlag] | None
var screenshots : Dict | None
var session : str | None
var session_sticky_proxy : bool
var ssl : bool
var tags : List[str] | None
var timeout : int | None
var url : str
var wait_for_selector : str | None
var webhook : str | None

Static methods

def from_dict(scrape_config_dict: Dict) ‑> ScrapeConfig
Expand source code
@staticmethod
def from_dict(scrape_config_dict: Dict) -> 'ScrapeConfig':
    """Create a ScrapeConfig instance from a dictionary."""
    get = scrape_config_dict.get

    # Revive raw string values into their enum counterparts before building.
    raw_format = get('format', None)
    raw_format_options = get('format_options', None)
    raw_screenshot_flags = get('screenshot_flags', [])

    return ScrapeConfig(
        url=get('url', None),
        retry=get('retry', False),
        method=get('method', 'GET'),
        country=get('country', None),
        render_js=get('render_js', False),
        cache=get('cache', False),
        cache_clear=get('cache_clear', False),
        ssl=get('ssl', False),
        dns=get('dns', False),
        asp=get('asp', False),
        debug=get('debug', False),
        raise_on_upstream_error=get('raise_on_upstream_error', True),
        cache_ttl=get('cache_ttl', None),
        proxy_pool=get('proxy_pool', None),
        session=get('session', None),
        tags=get('tags', []),
        format=Format(raw_format) if raw_format else None,
        format_options=[FormatOption(option) for option in raw_format_options] if raw_format_options else None,
        extraction_template=get('extraction_template', None),
        extraction_ephemeral_template=get('extraction_ephemeral_template', None),
        extraction_prompt=get('extraction_prompt', None),
        extraction_model=get('extraction_model', None),
        correlation_id=get('correlation_id', None),
        cookies=get('cookies', {}),
        body=get('body', None),
        data=get('data', None),
        headers=get('headers', {}),
        js=get('js', None),
        rendering_wait=get('rendering_wait', None),
        wait_for_selector=get('wait_for_selector', None),
        screenshots=get('screenshots', []),
        screenshot_flags=[ScreenshotFlag(flag) for flag in raw_screenshot_flags] if raw_screenshot_flags else None,
        session_sticky_proxy=get('session_sticky_proxy', False),
        webhook=get('webhook', None),
        timeout=get('timeout', None),
        js_scenario=get('js_scenario', None),
        extract=get('extract', None),
        os=get('os', None),
        lang=get('lang', None),
        auto_scroll=get('auto_scroll', None),
        cost_budget=get('cost_budget', None),
        browser_brand=get('browser_brand', None),
    )

Create a ScrapeConfig instance from a dictionary.

def from_exported_config(config: str) ‑> ScrapeConfig
Expand source code
@staticmethod
def from_exported_config(config:str) -> 'ScrapeConfig':
    """
    Restore a ScrapeConfig from a base64-encoded msgpack blob produced by
    the config export feature.

    :param config: base64 string wrapping a msgpack-encoded config dict
    :return: reconstructed ScrapeConfig
    :raises ImportError: when the optional msgpack dependency is missing
    """
    try:
        from msgpack import loads as msgpack_loads
    except ImportError:
        # Fixed message: quotes were unbalanced and "speedup" was misspelled.
        print('You must install the msgpack package - run: pip install "scrapfly-sdk[speedup]" or pip install msgpack')
        raise

    data = msgpack_loads(base64.b64decode(config))

    headers = {}

    for name, value in data['headers'].items():
        # Multi-value headers are exported as sequences and joined here.
        # Plain strings are Iterable too, so they must be excluded explicitly,
        # otherwise they would be joined character by character ("a; b; c").
        if isinstance(value, Iterable) and not isinstance(value, (str, bytes)):
            headers[name] = '; '.join(value)
        else:
            headers[name] = value

    return ScrapeConfig(
        url=data['url'],
        retry=data['retry'],
        headers=headers,
        session=data['session'],
        session_sticky_proxy=data['session_sticky_proxy'],
        cache=data['cache'],
        cache_ttl=data['cache_ttl'],
        cache_clear=data['cache_clear'],
        render_js=data['render_js'],
        method=data['method'],
        asp=data['asp'],
        body=data['body'],
        ssl=data['ssl'],
        dns=data['dns'],
        country=data['country'],
        debug=data['debug'],
        correlation_id=data['correlation_id'],
        tags=data['tags'],
        format=data['format'],
        js=data['js'],
        rendering_wait=data['rendering_wait'],
        screenshots=data['screenshots'] or {},
        screenshot_flags=data['screenshot_flags'],
        proxy_pool=data['proxy_pool'],
        auto_scroll=data['auto_scroll'],
        cost_budget=data['cost_budget']
    )

Methods

def to_api_params(self, key: str) ‑> Dict
Expand source code
def to_api_params(self, key:str) -> Dict:
    """
    Serialize this config into the flat query-parameter dict expected by the
    Scrape API. Boolean flags are sent only when they differ from the API
    default; dependent parameters (render_js / cache / session sub-options)
    are forwarded only when their parent feature is enabled, otherwise a
    warning is logged.

    Fixes vs previous revision:
    - the "screenshot_flags is ignored" warning sat in a dead `else` branch
      (it tested a condition already known false) and could never fire;
    - `self.screenshot_flags` and `self.extraction_ephemeral_template` were
      mutated in place, so calling this method twice double-encoded the
      ephemeral template and re-wrapped the flags. Locals are used instead,
      making the method idempotent.

    :param key: API key used when the instance has none set
    :return: dict of API query parameters
    :raises ScrapeConfigError: when both extraction_template and
        extraction_ephemeral_template are given
    """
    params = {
        'key': self.key or key,
        'url': self.url
    }

    if self.country is not None:
        params['country'] = self.country

    for name, value in self.headers.items():
        params['headers[%s]' % name] = value

    if self.webhook is not None:
        params['webhook_name'] = self.webhook

    if self.timeout is not None:
        params['timeout'] = self.timeout

    if self.extract is not None:
        params['extract'] = base64.urlsafe_b64encode(json.dumps(self.extract).encode('utf-8')).decode('utf-8')

    if self.cost_budget is not None:
        params['cost_budget'] = self.cost_budget

    if self.proxified_response is not None:
        params['proxified_response'] = self._bool_to_http(self.proxified_response)

    if self.render_js is True:
        params['render_js'] = self._bool_to_http(self.render_js)

        if self.wait_for_selector is not None:
            params['wait_for_selector'] = self.wait_for_selector

        if self.js:
            params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

        if self.js_scenario:
            params['js_scenario'] = base64.urlsafe_b64encode(json.dumps(self.js_scenario).encode('utf-8')).decode('utf-8')

        if self.rendering_wait:
            params['rendering_wait'] = self.rendering_wait

        if self.rendering_stage:
            params['rendering_stage'] = self.rendering_stage

        if self.screenshots is not None:
            for name, element in self.screenshots.items():
                params['screenshots[%s]' % name] = element

        if self.screenshot_flags is not None:
            # Normalize into a local list — mutating self.screenshot_flags
            # here made repeated calls re-wrap already-wrapped enums.
            flags = [ScreenshotFlag(flag) for flag in self.screenshot_flags]
            params["screenshot_flags"] = ",".join(flag.value for flag in flags)
            if not self.screenshots:
                # Previously unreachable: the warning lived in an `else`
                # branch where screenshot_flags was known to be None.
                logging.warning('Params "screenshot_flags" is ignored. Works only if screenshots is enabled')

        if self.auto_scroll is True:
            params['auto_scroll'] = self._bool_to_http(self.auto_scroll)
    else:
        if self.wait_for_selector is not None:
            logging.warning('Params "wait_for_selector" is ignored. Works only if render_js is enabled')

        if self.screenshots:
            logging.warning('Params "screenshots" is ignored. Works only if render_js is enabled')

        if self.js_scenario:
            logging.warning('Params "js_scenario" is ignored. Works only if render_js is enabled')

        if self.js:
            logging.warning('Params "js" is ignored. Works only if render_js is enabled')

        if self.rendering_wait:
            logging.warning('Params "rendering_wait" is ignored. Works only if render_js is enabled')

    if self.asp is True:
        params['asp'] = self._bool_to_http(self.asp)

    if self.retry is False:
        params['retry'] = self._bool_to_http(self.retry)

    if self.cache is True:
        params['cache'] = self._bool_to_http(self.cache)

        if self.cache_clear is True:
            params['cache_clear'] = self._bool_to_http(self.cache_clear)

        if self.cache_ttl is not None:
            params['cache_ttl'] = self.cache_ttl
    else:
        if self.cache_clear is True:
            logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

        if self.cache_ttl is not None:
            logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

    if self.dns is True:
        params['dns'] = self._bool_to_http(self.dns)

    if self.ssl is True:
        params['ssl'] = self._bool_to_http(self.ssl)

    if self.tags:
        params['tags'] = ','.join(self.tags)

    if self.format:
        params['format'] = Format(self.format).value
        if self.format_options:
            params['format'] += ':' + ','.join(FormatOption(option).value for option in self.format_options)

    if self.extraction_template and self.extraction_ephemeral_template:
        raise ScrapeConfigError('You cannot pass both parameters extraction_template and extraction_ephemeral_template. You must choose')

    if self.extraction_template:
        params['extraction_template'] = self.extraction_template

    if self.extraction_ephemeral_template:
        # Serialize into a local — the old in-place json.dumps overwrote the
        # attribute, so a second call double-encoded the template.
        ephemeral_template = json.dumps(self.extraction_ephemeral_template)
        params['extraction_template'] = 'ephemeral:' + urlsafe_b64encode(ephemeral_template.encode('utf-8')).decode('utf-8')

    if self.extraction_prompt:
        params['extraction_prompt'] = quote_plus(self.extraction_prompt)

    if self.extraction_model:
        params['extraction_model'] = self.extraction_model

    if self.correlation_id:
        params['correlation_id'] = self.correlation_id

    if self.session:
        params['session'] = self.session

        if self.session_sticky_proxy is True: # false by default
            params['session_sticky_proxy'] = self._bool_to_http(self.session_sticky_proxy)
    else:
        if self.session_sticky_proxy:
            logging.warning('Params "session_sticky_proxy" is ignored. Works only if session is enabled')

    if self.debug is True:
        params['debug'] = self._bool_to_http(self.debug)

    if self.proxy_pool is not None:
        params['proxy_pool'] = self.proxy_pool

    if self.lang is not None:
        params['lang'] = ','.join(self.lang)

    if self.os is not None:
        params['os'] = self.os

    if self.browser_brand is not None:
        params['browser_brand'] = self.browser_brand
    if self.geolocation is not None:
        params['geolocation'] = self.geolocation

    return params
def to_dict(self) ‑> Dict
Expand source code
def to_dict(self) -> Dict:
    """
    Serialize this ScrapeConfig into a plain dictionary, e.g. for JSON
    storage. Enum-valued fields (format, format_options, screenshot_flags)
    are exported as their raw string values.
    """
    # Pre-compute the enum-normalized fields.
    fmt = Format(self.format).value if self.format else None
    fmt_options = [FormatOption(option).value for option in self.format_options] if self.format_options else None
    flag_values = [ScreenshotFlag(flag).value for flag in self.screenshot_flags] if self.screenshot_flags else None

    payload = {}
    payload['url'] = self.url
    payload['retry'] = self.retry
    payload['method'] = self.method
    payload['country'] = self.country
    payload['render_js'] = self.render_js
    payload['cache'] = self.cache
    payload['cache_clear'] = self.cache_clear
    payload['ssl'] = self.ssl
    payload['dns'] = self.dns
    payload['asp'] = self.asp
    payload['debug'] = self.debug
    payload['raise_on_upstream_error'] = self.raise_on_upstream_error
    payload['cache_ttl'] = self.cache_ttl
    payload['proxy_pool'] = self.proxy_pool
    payload['session'] = self.session
    payload['tags'] = list(self.tags)
    payload['format'] = fmt
    payload['format_options'] = fmt_options
    payload['extraction_template'] = self.extraction_template
    payload['extraction_ephemeral_template'] = self.extraction_ephemeral_template
    payload['extraction_prompt'] = self.extraction_prompt
    payload['extraction_model'] = self.extraction_model
    payload['correlation_id'] = self.correlation_id
    payload['cookies'] = CaseInsensitiveDict(self.cookies)
    payload['body'] = self.body
    # `data` is mutually exclusive with `body`; export None once a body is set.
    payload['data'] = None if self.body else self.data
    payload['headers'] = CaseInsensitiveDict(self.headers)
    payload['js'] = self.js
    payload['rendering_wait'] = self.rendering_wait
    payload['wait_for_selector'] = self.wait_for_selector
    payload['session_sticky_proxy'] = self.session_sticky_proxy
    payload['screenshots'] = self.screenshots
    payload['screenshot_flags'] = flag_values
    payload['webhook'] = self.webhook
    payload['timeout'] = self.timeout
    payload['js_scenario'] = self.js_scenario
    payload['extract'] = self.extract
    payload['lang'] = self.lang
    payload['os'] = self.os
    payload['auto_scroll'] = self.auto_scroll
    payload['cost_budget'] = self.cost_budget
    payload['browser_brand'] = self.browser_brand
    return payload

Export the ScrapeConfig instance to a plain dictionary. Useful for JSON-serialization or other external storage.

class ScraperAPI
Expand source code
class ScraperAPI:
    """Constant values accepted by the Scraper API monitoring endpoints
    (data format, reporting period and aggregation level)."""

    MONITORING_DATA_FORMAT_STRUCTURED = 'structured'
    MONITORING_DATA_FORMAT_PROMETHEUS = 'prometheus'

    MONITORING_PERIOD_SUBSCRIPTION = 'subscription'
    MONITORING_PERIOD_LAST_7D = 'last7d'
    MONITORING_PERIOD_LAST_24H = 'last24h'
    MONITORING_PERIOD_LAST_1H = 'last1h'
    # Properly-cased name, consistent with LAST_7D / LAST_24H / LAST_1H.
    MONITORING_PERIOD_LAST_5M = 'last5m'
    # Backward-compatible alias: the historical name used a lowercase 'm'.
    MONITORING_PERIOD_LAST_5m = MONITORING_PERIOD_LAST_5M

    MONITORING_ACCOUNT_AGGREGATION = 'account'
    MONITORING_PROJECT_AGGREGATION = 'project'
    MONITORING_TARGET_AGGREGATION = 'target'
Class variables

var MONITORING_ACCOUNT_AGGREGATION
var MONITORING_DATA_FORMAT_PROMETHEUS
var MONITORING_DATA_FORMAT_STRUCTURED
var MONITORING_PERIOD_LAST_1H
var MONITORING_PERIOD_LAST_24H
var MONITORING_PERIOD_LAST_5m
var MONITORING_PERIOD_LAST_7D
var MONITORING_PERIOD_SUBSCRIPTION
var MONITORING_PROJECT_AGGREGATION
var MONITORING_TARGET_AGGREGATION
class ScrapflyAspError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyAspError(ScraperAPIError):
    """Scraper API error raised for Anti Scraping Protection (ASP) failures,
    e.g. ERR::ASP::SHIELD_PROTECTION_FAILED."""
    pass

Scraper API error raised for Anti Scraping Protection (ASP) failures.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflyClient (key: str,
host: str = 'https://api.scrapfly.io',
verify=True,
debug: bool = False,
max_concurrency: int = 1,
connect_timeout: int = 30,
web_scraping_api_read_timeout: int = 160,
extraction_api_read_timeout: int = 35,
screenshot_api_read_timeout: int = 60,
read_timeout: int = 30,
default_read_timeout: int = 30,
reporter: Callable | None = None,
cloud_browser_host: str | None = None,
**kwargs)
Expand source code
class ScrapflyClient:
    """Client for the Scrapfly APIs: scrape, screenshot, extraction and
    monitoring, with both blocking and asyncio entry points."""

    HOST = 'https://api.scrapfly.io'
    CLOUD_BROWSER_HOST = 'wss://browser.scrapfly.io'
    CLOUD_BROWSER_API_HOST = 'https://browser.scrapfly.io'
    DEFAULT_CONNECT_TIMEOUT = 30
    DEFAULT_READ_TIMEOUT = 30

    # Per-API read timeouts in seconds; inline comments note the
    # server-side ("real") limits they leave headroom for.
    DEFAULT_WEBSCRAPING_API_READ_TIMEOUT = 160 # 155 real
    DEFAULT_SCREENSHOT_API_READ_TIMEOUT = 60  # 30 real
    DEFAULT_EXTRACTION_API_READ_TIMEOUT = 35 # 30 real
    DEFAULT_CRAWLER_API_READ_TIMEOUT = 30

    # Instance attribute annotations; values are assigned in __init__.
    host:str
    key:str
    max_concurrency:int
    verify:bool
    debug:bool
    distributed_mode:bool
    connect_timeout:int
    web_scraping_api_read_timeout:int
    screenshot_api_read_timeout:int
    extraction_api_read_timeout:int
    monitoring_api_read_timeout:int
    default_read_timeout:int
    brotli: bool
    reporter:Reporter
    version:str

    # @deprecated
    read_timeout:int

    CONCURRENCY_AUTO = 'auto' # retrieve the allowed concurrency from your account
    DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(
        self,
        key: str,
        host: str = HOST,
        verify=True,
        debug: bool = False,
        max_concurrency:int=1,
        connect_timeout:int = DEFAULT_CONNECT_TIMEOUT,
        web_scraping_api_read_timeout: int = DEFAULT_WEBSCRAPING_API_READ_TIMEOUT,
        extraction_api_read_timeout: int = DEFAULT_EXTRACTION_API_READ_TIMEOUT,
        screenshot_api_read_timeout: int = DEFAULT_SCREENSHOT_API_READ_TIMEOUT,

        # @deprecated
        read_timeout:int = DEFAULT_READ_TIMEOUT,
        default_read_timeout:int = DEFAULT_READ_TIMEOUT,
        reporter:Optional[Callable]=None,
        cloud_browser_host: Optional[str] = None,
        **kwargs
    ):
        """
        :param key: Scrapfly API key
        :param host: API host, default HOST; trailing slashes are stripped
        :param verify: verify TLS certificates when calling the API
        :param debug: enable verbose http.client wire logging
        :param reporter: optional callable wrapped into a Reporter; NoopReporter when None
        :param cloud_browser_host: optional wss:// host override for the cloud browser
        :param kwargs: accepts deprecated 'distributed_mode' and 'brotli' flags (warned, ignored)
        """
        # Normalize the host: tolerate trailing slash(es). The previous
        # `host[-1] == '/'` check raised IndexError on an empty string.
        host = host.rstrip('/')

        if 'distributed_mode' in kwargs:
            warnings.warn("distributed mode is deprecated and will be removed in the next version -"
              " users should handle the session name themselves based on the concurrency",
              DeprecationWarning,
              stacklevel=2
            )

        if 'brotli' in kwargs:
            warnings.warn("brotli arg is deprecated and will be removed in the next version - "
                "brotli is disabled by default",
                DeprecationWarning,
                stacklevel=2
            )

        self.version = __version__
        self.host = host
        self.key = key
        self.verify = verify
        self.cloud_browser_host = cloud_browser_host or self.CLOUD_BROWSER_HOST
        # Derive the HTTPS API host from a custom websocket host when one is given.
        self.cloud_browser_api_host = cloud_browser_host.replace('wss://', 'https://') if cloud_browser_host else self.CLOUD_BROWSER_API_HOST
        self.debug = debug
        self.connect_timeout = connect_timeout
        self.web_scraping_api_read_timeout = web_scraping_api_read_timeout
        self.screenshot_api_read_timeout = screenshot_api_read_timeout
        self.extraction_api_read_timeout = extraction_api_read_timeout
        self.monitoring_api_read_timeout = default_read_timeout
        self.default_read_timeout = default_read_timeout

        # @deprecated
        self.read_timeout = default_read_timeout

        self.max_concurrency = max_concurrency
        self.body_handler = ResponseBodyHandler(use_brotli=False)
        self.async_executor = ThreadPoolExecutor()
        self.http_session = None

        # Fix: compare against the *configured* host, not the class constant
        # self.HOST — the constant never ends with '.local', so the previous
        # check was inert and warnings were always disabled when verify=False.
        if not self.verify and not self.host.endswith('.local'):
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

        if self.debug is True:
            http.client.HTTPConnection.debuglevel = 5

        if reporter is None:
            from .reporter import NoopReporter

            reporter = NoopReporter()

        self.reporter = Reporter(reporter)

    @property
    def ua(self) -> str:
        """User-Agent header value: SDK version, Python version, OS and machine."""
        uname = platform.uname()
        return f'ScrapflySDK/{self.version} (Python {platform.python_version()}, {uname.system}, {uname.machine})'

    @property
    def _http_handler(self):
        # Fix: this was a @cached_property wrapping a no-argument partial().
        # The bare partial() added nothing, and caching meant a session opened
        # via open() *after* the first request was never picked up (the stale
        # requests.request callable stayed cached). Resolve on every access.
        return self.http_session.request if self.http_session else requests.request

    @property
    def http(self):
        # Public accessor for the request callable used to reach the API
        # (the pooled session's request when open, else requests.request).
        return self._http_handler

    def _scrape_request(self, scrape_config:ScrapeConfig):
        """Build the kwargs dict for a Web Scraping API request."""
        # When the method carries a body (POST/PUT/PATCH) and the caller set
        # an explicit Content-Type, forward it; otherwise use the
        # body_handler default so callers omitting the header don't KeyError.
        content_type = self.body_handler.content_type
        if scrape_config.method in ['POST', 'PUT', 'PATCH']:
            content_type = scrape_config.headers.get('content-type', content_type)

        headers = {
            'content-type': content_type,
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }

        return {
            'method': scrape_config.method,
            'url': self.host + '/scrape',
            'data': scrape_config.body,
            'verify': self.verify,
            'timeout': (self.connect_timeout, self.web_scraping_api_read_timeout),
            'headers': headers,
            'params': scrape_config.to_api_params(key=self.key),
        }

    def _screenshot_request(self, screenshot_config:ScreenshotConfig):
        """Build the kwargs dict for a Screenshot API request."""
        headers = {
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }

        return {
            'method': 'GET',
            'url': self.host + '/screenshot',
            'timeout': (self.connect_timeout, self.screenshot_api_read_timeout),
            'verify': self.verify,
            'headers': headers,
            'params': screenshot_config.to_api_params(key=self.key),
        }

    def _extraction_request(self, extraction_config:ExtractionConfig):
        """
        Build the kwargs dict for an Extraction API request.

        Fix: the previous version first stored the raw compression-format
        enum (or None) under 'content-encoding' and then immediately
        overwrote it with the enum's .value. Set the header once, and only
        when a compression format is actually configured. (requests drops
        None-valued headers, so omitting the key is wire-equivalent to the
        old None entry.)
        """
        headers = {
            'content-type': extraction_config.content_type,
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }

        if extraction_config.document_compression_format:
            headers['content-encoding'] = extraction_config.document_compression_format.value

        return {
            'method': 'POST',
            'url': self.host + '/extraction',
            'data': extraction_config.body,
            'timeout': (self.connect_timeout, self.extraction_api_read_timeout),
            'verify': self.verify,
            'headers': headers,
            'params': extraction_config.to_api_params(key=self.key)
        }


    def account(self) -> Union[str, Dict]:
        """Fetch account details; returns the decoded payload when the
        body handler supports the response content type, else raw text."""
        headers = {
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }
        response = self._http_handler(
            method='GET',
            url=self.host + '/account',
            params={'key': self.key},
            verify=self.verify,
            headers=headers,
        )
        response.raise_for_status()

        if not self.body_handler.support(response.headers):
            return response.content.decode('utf-8')

        return self.body_handler(response.content, response.headers['content-type'])

    def get_monitoring_metrics(self, format:str=ScraperAPI.MONITORING_DATA_FORMAT_STRUCTURED, period:Optional[str]=None, aggregation:Optional[List[MonitoringAggregation]]=None):
        """Fetch scrape monitoring metrics.

        :param format: output format (structured or prometheus)
        :param period: optional reporting period
        :param aggregation: optional aggregation levels, joined comma-separated
        """
        params = {'key': self.key, 'format': format}

        if period is not None:
            params['period'] = period
        if aggregation is not None:
            params['aggregation'] = ','.join(aggregation)

        headers = {
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }
        response = self._http_handler(
            method='GET',
            url=self.host + '/scrape/monitoring/metrics',
            params=params,
            timeout=(self.connect_timeout, self.monitoring_api_read_timeout),
            verify=self.verify,
            headers=headers,
        )
        response.raise_for_status()

        if not self.body_handler.support(response.headers):
            return response.content.decode('utf-8')

        return self.body_handler(response.content, response.headers['content-type'])

    def get_monitoring_target_metrics(
            self,
            domain:str,
            group_subdomain:bool=False,
            period:Optional[MonitoringTargetPeriod]=ScraperAPI.MONITORING_PERIOD_LAST_24H,
            start:Optional[datetime.datetime]=None,
            end:Optional[datetime.datetime]=None,
    ):
        """Fetch per-target (domain) monitoring metrics.

        Either a relative ``period`` or an explicit ``start``/``end`` pair may
        be used; an explicit range overrides the period.

        :raises ValueError: when only one of start/end is given
        """
        # start and end must be provided together (XOR check).
        if (start is None) != (end is None):
            raise ValueError('You must provide both start and end date')

        params = {
            'key': self.key,
            'domain': domain,
            'group_subdomain': group_subdomain
        }

        if start is not None:
            params['start'] = start.strftime(self.DATETIME_FORMAT)
            params['end'] = end.strftime(self.DATETIME_FORMAT)
            period = None  # explicit range takes precedence

        params['period'] = period

        headers = {
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        }
        response = self._http_handler(
            method='GET',
            url=self.host + '/scrape/monitoring/metrics/target',
            timeout=(self.connect_timeout, self.monitoring_api_read_timeout),
            params=params,
            verify=self.verify,
            headers=headers,
        )
        response.raise_for_status()

        if not self.body_handler.support(response.headers):
            return response.content.decode('utf-8')

        return self.body_handler(response.content, response.headers['content-type'])


    def resilient_scrape(
        self,
        scrape_config:ScrapeConfig,
        retry_on_errors:Optional[Set[Exception]]=None,
        retry_on_status_code:Optional[List[int]]=None,
        tries: int = 5,
        delay: int = 20,
    ) -> ScrapeApiResponse:
        """
        Scrape with automatic exponential-backoff retries.

        :param scrape_config: scrape configuration to (re-)submit
        :param retry_on_errors: exception types that trigger a retry (default {ScrapflyError})
        :param retry_on_status_code: upstream HTTP status codes that should be retried
        :param tries: maximum number of attempts
        :param delay: forwarded to backoff as ``max_time`` — NOTE(review): that
            caps the *total* elapsed seconds, not the per-attempt delay; confirm intent.
        """
        if retry_on_errors is None:
            retry_on_errors = {ScrapflyError}
        assert isinstance(retry_on_errors, set), 'retry_on_errors is not a set()'

        @backoff.on_exception(backoff.expo, exception=tuple(retry_on_errors), max_tries=tries, max_time=delay)
        def inner() -> ScrapeApiResponse:

            try:
                return self.scrape(scrape_config=scrape_config)
            except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
                if retry_on_status_code is not None and e.api_response:
                    # Re-raise (so backoff retries) only for the configured
                    # upstream status codes; other upstream errors are
                    # returned to the caller as their api_response.
                    if e.api_response.upstream_status_code in retry_on_status_code:
                        raise e
                    else:
                        return e.api_response

                raise e

        return inner()

    def open(self):
        """Open a persistent requests Session for subsequent API calls (idempotent)."""
        if self.http_session is not None:
            return

        session = Session()
        session.verify = self.verify
        session.timeout = (self.connect_timeout, self.default_read_timeout)
        session.params['key'] = self.key
        session.headers['accept-encoding'] = self.body_handler.content_encoding
        session.headers['accept'] = self.body_handler.accept
        session.headers['user-agent'] = self.ua
        self.http_session = session

    def close(self):
        """Close the persistent session, if any (safe to call without open() or twice)."""
        # Guard: previously this raised AttributeError ('NoneType' has no
        # attribute 'close') when called before open() or a second time.
        if self.http_session is not None:
            self.http_session.close()
            self.http_session = None

    def __enter__(self) -> 'ScrapflyClient':
        # Context-manager entry: opens the shared HTTP session.
        self.open()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Context-manager exit: releases the shared HTTP session.
        self.close()

    async def async_scrape(self, scrape_config:ScrapeConfig, loop:Optional[AbstractEventLoop]=None) -> ScrapeApiResponse:
        """Run the blocking scrape() on the client's thread-pool executor."""
        event_loop = loop if loop is not None else asyncio.get_running_loop()
        return await event_loop.run_in_executor(self.async_executor, self.scrape, scrape_config)

    async def concurrent_scrape(self, scrape_configs:List[ScrapeConfig], concurrency:Optional[int]=None):
        """
        Async generator yielding results as scrapes complete, keeping at most
        ``concurrency`` scrapes in flight. Yields either a ScrapeApiResponse
        or the raised exception object; completion order is not input order.

        :param scrape_configs: configs to process; the list is consumed (popped)
        :param concurrency: max in-flight scrapes; defaults to max_concurrency,
            or the account's allowed concurrency when CONCURRENCY_AUTO
        """
        if concurrency is None:
            concurrency = self.max_concurrency
        elif concurrency == self.CONCURRENCY_AUTO:
            # Resolve the account's allowed concurrency from the API.
            concurrency = self.account()['subscription']['max_concurrency']

        loop = asyncio.get_running_loop()
        processing_tasks = []
        results = []
        processed_tasks = 0
        expected_tasks = len(scrape_configs)

        def scrape_done_callback(task:Task):
            nonlocal processed_tasks

            try:
                if task.cancelled() is True:
                    return

                error = task.exception()

                if error is not None:
                    # Errors are queued for yielding, not raised here.
                    results.append(error)
                else:
                    results.append(task.result())
            finally:
                processing_tasks.remove(task)
                processed_tasks += 1

        while scrape_configs or results or processing_tasks:
            logger.info("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))

            if scrape_configs:
                if len(processing_tasks) < concurrency:
                    # @todo handle backpressure
                    for _ in range(0, concurrency - len(processing_tasks)):
                        try:
                            scrape_config = scrape_configs.pop()
                        except IndexError:
                            break

                        # Upstream errors must not raise inside the task; they
                        # are surfaced through the results list instead.
                        scrape_config.raise_on_upstream_error = False
                        task = loop.create_task(self.async_scrape(scrape_config=scrape_config, loop=loop))
                        processing_tasks.append(task)
                        task.add_done_callback(scrape_done_callback)

            for _ in results:
                result = results.pop()
                yield result

            await asyncio.sleep(.5)

        logger.debug("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def scrape(self, scrape_config:ScrapeConfig, no_raise:bool=False) -> ScrapeApiResponse:
        """
        Scrape a website
        :param scrape_config: ScrapeConfig
        :param no_raise: bool - if True, do not raise exception on error while the api response is a ScrapflyError for seamless integration
        :return: ScrapeApiResponse

        If you use no_raise=True, make sure to check the api_response.scrape_result.error attribute to handle the error.
        If the error is not none, you will get the following structure for example

        'error': {
            'code': 'ERR::ASP::SHIELD_PROTECTION_FAILED',
            'message': 'The ASP shield failed to solve the challenge against the anti scrapping protection - heuristic_engine bypass failed, please retry in few seconds',
            'retryable': False,
            'http_code': 422,
            'links': {
                'Checkout ASP documentation': 'https://scrapfly.io/docs/scrape-api/anti-scraping-protection#maximize_success_rate', 'Related Error Doc': 'https://scrapfly.io/docs/scrape-api/error/ERR::ASP::SHIELD_PROTECTION_FAILED'
            }
        }
        """

        try:
            logger.debug('--> %s Scrapping %s' % (scrape_config.method, scrape_config.url))
            request_data = self._scrape_request(scrape_config=scrape_config)
            response = self._http_handler(**request_data)

            if scrape_config.proxified_response is True:
                # Proxified mode: the API returns the raw upstream response
                # (target's status, headers, body) instead of the JSON
                # envelope. Error restoration: if X-Scrapfly-Reject-Code is
                # present, the scrape failed and the SDK must raise a typed
                # error with the code/message/retryable from the headers.
                reject_code = response.headers.get('X-Scrapfly-Reject-Code')
                if reject_code:
                    from scrapfly.errors import HttpError
                    reject_desc = response.headers.get('X-Scrapfly-Reject-Description', '')
                    reject_retryable = response.headers.get('X-Scrapfly-Reject-Retryable', 'false').lower() == 'true'
                    retry_after = None
                    if reject_retryable:
                        try:
                            retry_after = int(response.headers.get('Retry-After', '0'))
                        except (ValueError, TypeError):
                            retry_after = None
                    raise HttpError(
                        request=response.request,
                        response=response,
                        code=reject_code,
                        http_status_code=response.status_code,
                        message=reject_desc,
                        is_retryable=reject_retryable,
                        retry_delay=retry_after,
                    )
                # NOTE: in proxified mode the raw requests.Response is
                # returned, not a ScrapeApiResponse — callers must handle both.
                self.reporter.report(scrape_api_response=None)
                return response

            scrape_api_response = self._handle_response(response=response, scrape_config=scrape_config)

            self.reporter.report(scrape_api_response=scrape_api_response)

            return scrape_api_response
        except BaseException as e:
            self.reporter.report(error=e)

            # no_raise contract: swallow only Scrapfly errors carrying an
            # api_response; everything else still propagates.
            if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
                return e.api_response

            raise e

    async def async_screenshot(self, screenshot_config:ScreenshotConfig, loop:Optional[AbstractEventLoop]=None) -> ScreenshotApiResponse:
        """Run the blocking screenshot() on the client's thread-pool executor."""
        event_loop = loop if loop is not None else asyncio.get_running_loop()
        return await event_loop.run_in_executor(self.async_executor, self.screenshot, screenshot_config)

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def screenshot(self, screenshot_config:ScreenshotConfig, no_raise:bool=False) -> ScreenshotApiResponse:
        """
        Take a screenshot through the Screenshot API.

        :param screenshot_config: ScreenshotConfig describing the capture
        :param no_raise: when True, return the error's api_response instead of
            raising a ScrapflyError
        :return: ScreenshotApiResponse

        With no_raise=True, check ``screenshot_api_response.error`` — on
        failure it holds the error code (e.g.
        'ERR::SCREENSHOT::UNABLE_TO_TAKE_SCREENSHOT'), message, http_code and
        documentation links.
        """
        try:
            logger.debug('--> %s Screenshoting' % (screenshot_config.url))
            response = self._http_handler(**self._screenshot_request(screenshot_config=screenshot_config))
            return self._handle_screenshot_response(response=response, screenshot_config=screenshot_config)
        except BaseException as e:
            self.reporter.report(error=e)

            # Swallow only Scrapfly errors that carry an api_response.
            if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
                return e.api_response

            raise e

    async def async_extraction(self, extraction_config:ExtractionConfig, loop:Optional[AbstractEventLoop]=None) -> ExtractionApiResponse:
        """Run the blocking extract() on the client's thread-pool executor."""
        event_loop = loop if loop is not None else asyncio.get_running_loop()
        return await event_loop.run_in_executor(self.async_executor, self.extract, extraction_config)

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def extract(self, extraction_config:ExtractionConfig, no_raise:bool=False) -> ExtractionApiResponse:
        """
        Extract structured data from text content through the Extraction API.

        :param extraction_config: ExtractionConfig describing the document and method
        :param no_raise: when True, return the error's api_response instead of
            raising a ScrapflyError
        :return: ExtractionApiResponse

        With no_raise=True, check ``extraction_api_response.error`` — on
        failure it holds the error code (e.g.
        'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED'), message, http_code and
        documentation links.
        """
        try:
            logger.debug('--> %s Extracting data from' % (extraction_config.content_type))
            response = self._http_handler(**self._extraction_request(extraction_config=extraction_config))
            return self._handle_extraction_response(response=response, extraction_config=extraction_config)
        except BaseException as e:
            self.reporter.report(error=e)

            # Swallow only Scrapfly errors that carry an api_response.
            if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
                return e.api_response

            raise e

    def _handle_response(self, response:Response, scrape_config:ScrapeConfig) -> ScrapeApiResponse:
        """
        Turn a raw Scrape API HTTP response into a ScrapeApiResponse, logging
        the outcome. API errors are logged at critical level and re-raised.

        :param response: raw HTTP response from the Scrape API
        :param scrape_config: config used for this scrape
        :return: ScrapeApiResponse
        """
        try:
            api_response = self._handle_api_response(
                response=response,
                scrape_config=scrape_config,
                raise_on_upstream_error=scrape_config.raise_on_upstream_error
            )

            if scrape_config.method == 'HEAD':
                # HEAD responses carry no API result body, so log from the
                # HTTP response itself; no duration is available (logged as 0).
                logger.debug('<-- [%s %s] %s | %ss' % (
                    api_response.response.status_code,
                    api_response.response.reason,
                    api_response.response.request.url,
                    0
                ))
            else:
                logger.debug('<-- [%s %s] %s | %ss' % (
                    api_response.result['result']['status_code'],
                    api_response.result['result']['reason'],
                    api_response.result['config']['url'],
                    api_response.result['result']['duration'])
                )

                logger.debug('Log url: %s' % api_response.result['result']['log_url'])

            return api_response
        except UpstreamHttpError as e:
            # Error from the target website itself; api_response is attached.
            logger.critical(e.api_response.error_message)
            raise
        except HttpError as e:
            # api_response can be missing when the failure happened before a
            # full API response was parsed.
            if e.api_response is not None:
                logger.critical(e.api_response.error_message)
            else:
                logger.critical(e.message)
            raise
        except ScrapflyError as e:
            logger.critical('<-- %s | Docs: %s' % (str(e), e.documentation_url))
            raise

    def _handle_screenshot_response(self, response:Response, screenshot_config:ScreenshotConfig) -> ScreenshotApiResponse:
        """
        Convert a raw Screenshot API HTTP response into a ScreenshotApiResponse,
        logging any API error at critical level before re-raising it.
        """
        try:
            return self._handle_screenshot_api_response(
                response=response,
                screenshot_config=screenshot_config,
                raise_on_upstream_error=screenshot_config.raise_on_upstream_error,
            )
        except UpstreamHttpError as e:
            logger.critical(e.api_response.error_message)
            raise
        except HttpError as e:
            detail = e.api_response.error_message if e.api_response is not None else e.message
            logger.critical(detail)
            raise
        except ScrapflyError as e:
            logger.critical('<-- %s | Docs: %s' % (str(e), e.documentation_url))
            raise

    def _handle_extraction_response(self, response:Response, extraction_config:ExtractionConfig) -> ExtractionApiResponse:
        """
        Convert a raw Extraction API HTTP response into an ExtractionApiResponse,
        logging any API error at critical level before re-raising it.
        """
        try:
            return self._handle_extraction_api_response(
                response=response,
                extraction_config=extraction_config,
                raise_on_upstream_error=extraction_config.raise_on_upstream_error,
            )
        except UpstreamHttpError as e:
            logger.critical(e.api_response.error_message)
            raise
        except HttpError as e:
            detail = e.api_response.error_message if e.api_response is not None else e.message
            logger.critical(detail)
            raise
        except ScrapflyError as e:
            logger.critical('<-- %s | Docs: %s' % (str(e), e.documentation_url))
            raise

    def save_screenshot(self, screenshot_api_response:ScreenshotApiResponse, name:str, path:Optional[str]=None):
        """
        Persist the binary image of a screenshot API response to disk.

        :param screenshot_api_response: ScreenshotApiResponse holding the image
        :param name: file name (without extension) to save the screenshot as
        :param path: optional target directory, created if missing
        :raises RuntimeError: when the screenshot failed or carries no image data
        """
        if screenshot_api_response.screenshot_success is not True:
            raise RuntimeError('Screenshot was not successful')

        if not screenshot_api_response.image:
            raise RuntimeError('Screenshot binary does not exist')

        extension_name = screenshot_api_response.metadata['extension_name']
        target = f'{name}.{extension_name}'

        if path:
            os.makedirs(path, exist_ok=True)
            target = os.path.join(path, target)

        image = screenshot_api_response.image
        if isinstance(image, bytes):
            image = BytesIO(image)

        with open(target, 'wb') as out:
            shutil.copyfileobj(image, out, length=131072)

    def save_scrape_screenshot(self, api_response:ScrapeApiResponse, name:str, path:Optional[str]=None):
        """
        Save a screenshot from a scrape result

        :param api_response: ScrapeApiResponse
        :param name: str - name of the screenshot given in the scrape config
        :param path: Optional[str]
        :raises RuntimeError: if no screenshot with that name exists in the result
        """
        screenshots = api_response.scrape_result['screenshots']

        # A single membership test replaces the previous duplicated
        # truthiness check + try/KeyError; also fixes the garbled
        # error message ("do no exists").
        if not screenshots or name not in screenshots:
            raise RuntimeError('Screenshot %s does not exist' % name)

        screenshot_response = self._http_handler(
            method='GET',
            url=screenshots[name]['url'],
            params={'key': self.key},
            verify=self.verify
        )

        screenshot_response.raise_for_status()

        if not name.endswith('.jpg'):
            name += '.jpg'

        api_response.sink(path=path, name=name, content=screenshot_response.content)

    def sink(self, api_response:ScrapeApiResponse, content:Optional[Union[str, bytes]]=None, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None) -> str:
        """
        Write scraped content to disk (or into a provided file object) and
        return the file path that was used.

        :param api_response: ScrapeApiResponse whose content is written
        :param content: optional override for the scraped content
        :param path: optional directory prefix for the output file
        :param name: optional file name; its extension (if any) takes priority
        :param file: optional already-open binary file object; when given,
                     path/name-based resolution is skipped
        :return: the resolved file path (stays None when ``file`` was provided)
        """
        scrape_result = api_response.result['result']
        scrape_config = api_response.result['config']

        file_content = content or scrape_result['content']
        file_path = None
        file_extension = None

        if name:
            # Extension taken from the caller-supplied name (no leading dot).
            name_parts = name.split('.')
            if len(name_parts) > 1:
                file_extension = name_parts[-1]

        if not file:
            if file_extension is None:
                # Infer the extension from the content-type header,
                # e.g. 'text/html; charset=utf-8' -> '.html'.
                try:
                    mime_type = scrape_result['response_headers']['content-type']
                except KeyError:
                    mime_type = 'application/octet-stream'

                if ';' in mime_type:
                    mime_type = mime_type.split(';')[0]

                file_extension = '.' + mime_type.split('/')[1]

            if not name:
                # Default name: last path segment of the scraped URL.
                name = scrape_config['url'].split('/')[-1]

            if name.find(file_extension) == -1:
                name += file_extension

            file_path = path + '/' + name if path else name

            if file_path == file_extension:
                # The URL ended with '/', leaving an empty name: fall back to
                # a slug built from the full URL.
                url = re.sub(r'(https|http)?://', '', api_response.config['url']).replace('/', '-')

                if url[-1] == '-':
                    url = url[:-1]

                url += file_extension

                file_path = url

            file = open(file_path, 'wb')

        if isinstance(file_content, str):
            file_content = BytesIO(file_content.encode('utf-8'))
        elif isinstance(file_content, bytes):
            file_content = BytesIO(file_content)

        file_content.seek(0)
        # NOTE(review): the with-block also closes a caller-supplied ``file``
        # object — confirm that callers expect their handle to be closed here.
        with file as f:
            shutil.copyfileobj(file_content, f, length=131072)

        logger.info('file %s created' % file_path)
        return file_path

    def _handle_scrape_large_objects(
        self,
        callback_url:str,
        format: Literal['clob', 'blob']
    ) -> Tuple[Union[BytesIO, str], str]:
        """
        Download a large object (clob/blob) that the Scrape API offloaded to a
        callback URL and return it alongside its content kind.

        :param callback_url: URL the large object must be fetched from
        :param format: 'clob' for text content, 'blob' for binary content
        :return: tuple of (content, kind) where kind is 'text' or 'binary'
        :raises ContentError: when format is neither 'clob' nor 'blob'
        """
        if format not in ['clob', 'blob']:
            # Fixed garbled error message ("handle can handles format format").
            raise ContentError('Large object handler supports formats [clob, blob], given: %s' % format)

        response = self._http_handler(**{
            'method': 'GET',
            'url': callback_url,
            'verify': self.verify,
            'timeout': (self.connect_timeout, self.default_read_timeout),
            'headers': {
                'accept-encoding': self.body_handler.content_encoding,
                'accept': self.body_handler.accept,
                'user-agent': self.ua
            },
            'params': {'key': self.key}
        })

        # body_handler decodes supported content types; otherwise keep raw bytes.
        if self.body_handler.support(headers=response.headers):
            content = self.body_handler(content=response.content, content_type=response.headers['content-type'])
        else:
            content = response.content

        if format == 'clob':
            return content.decode('utf-8'), 'text'

        return BytesIO(content), 'binary'

    def _handle_api_response(
        self,
        response: Response,
        scrape_config:ScrapeConfig,
        raise_on_upstream_error: Optional[bool] = True
    ) -> ScrapeApiResponse:
        """
        Decode the raw Scrape API HTTP response body and wrap it in a
        validated ScrapeApiResponse.

        :param response: raw HTTP response from the Scrape API
        :param scrape_config: config used for the scrape (drives HEAD handling)
        :param raise_on_upstream_error: forwarded to raise_for_result()
        :return: ScrapeApiResponse
        """

        if scrape_config.method == 'HEAD':
            # HEAD responses have no body by definition.
            body = None
        else:
            if self.body_handler.support(headers=response.headers):
                body = self.body_handler(content=response.content, content_type=response.headers['content-type'])
            else:
                # body_handler rejected — content-type not in SUPPORTED_CONTENT_TYPES.
                # Response may still be compressed (zstd/brotli) if requests did
                # not transparently decompress. Probe content-encoding and try
                # the handler's read() anyway before falling back to a tolerant
                # utf-8 decode. Previously this branch raised UnicodeDecodeError
                # on valid zstd/br responses with a non-json/msgpack content-type.
                raw = response.content
                content_encoding = response.headers.get('content-encoding', '').lower()
                if content_encoding in ('gzip', 'gz', 'deflate', 'br', 'brotli', 'zstd'):
                    try:
                        raw = self.body_handler.read(
                            content=raw,
                            content_encoding=content_encoding,
                            content_type=response.headers.get('content-type', ''),
                            signature=None,
                        )
                    except Exception:
                        # Fall through to tolerant decode below; don't mask the
                        # real error with a decoder crash.
                        pass
                if isinstance(raw, (bytes, bytearray)):
                    # errors='replace' keeps partial/odd payloads readable
                    # instead of raising on undecodable bytes.
                    body = raw.decode('utf-8', errors='replace')
                else:
                    body = raw

        api_response:ScrapeApiResponse = ScrapeApiResponse(
            response=response,
            request=response.request,
            api_result=body,
            scrape_config=scrape_config,
            large_object_handler=self._handle_scrape_large_objects
        )

        api_response.raise_for_result(raise_on_upstream_error=raise_on_upstream_error)

        return api_response

    def _handle_screenshot_api_response(
        self,
        response: Response,
        screenshot_config:ScreenshotConfig,
        raise_on_upstream_error: Optional[bool] = True
    ) -> ScreenshotApiResponse:
        """
        Build a ScreenshotApiResponse from the raw HTTP response and validate it.

        The body is decoded via body_handler when the content type is supported;
        otherwise the raw bytes are wrapped as {'result': ...}.
        """
        if self.body_handler.support(headers=response.headers):
            body = self.body_handler(content=response.content, content_type=response.headers['content-type'])
        else:
            body = {'result': response.content}

        api_response = ScreenshotApiResponse(
            response=response,
            request=response.request,
            api_result=body,
            screenshot_config=screenshot_config
        )
        api_response.raise_for_result(raise_on_upstream_error=raise_on_upstream_error)
        return api_response

    def _handle_extraction_api_response(
        self,
        response: Response,
        extraction_config:ExtractionConfig,
        raise_on_upstream_error: Optional[bool] = True
    ) -> ExtractionApiResponse:
        """
        Build an ExtractionApiResponse from the raw HTTP response and validate it.

        The body is decoded via body_handler when the content type is supported;
        otherwise the raw bytes are decoded as utf-8 text.
        """
        if self.body_handler.support(headers=response.headers):
            body = self.body_handler(content=response.content, content_type=response.headers['content-type'])
        else:
            body = response.content.decode('utf-8')

        api_response = ExtractionApiResponse(
            response=response,
            request=response.request,
            api_result=body,
            extraction_config=extraction_config
        )
        api_response.raise_for_result(raise_on_upstream_error=raise_on_upstream_error)
        return api_response

    @backoff.on_exception(backoff.expo, exception=ConnectionError, max_tries=5)
    def start_crawl(self, crawler_config: CrawlerConfig) -> CrawlerStartResponse:
        """
        Start a crawler job

        :param crawler_config: CrawlerConfig
        :return: CrawlerStartResponse with UUID and initial status
        :raises HttpError: when the Crawler API rejects the request

        Example:
            ```python
            from scrapfly import ScrapflyClient, CrawlerConfig

            client = ScrapflyClient(key='YOUR_API_KEY')
            config = CrawlerConfig(
                url='https://example.com',
                page_limit=100,
                max_depth=3
            )

            response = client.start_crawl(config)
            print(f"Crawler started: {response.uuid}")
            ```
        """
        # Get crawler config params (without key)
        body_params = crawler_config.to_api_params()

        # API key must be passed as query parameter, not in body
        query_params = {'key': self.key}

        timeout = (self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT)

        url = f'{self.host}/crawl'
        logger.debug(f"Crawler API POST {url}?key=***")
        logger.debug(f"Crawler API body: {body_params}")

        response = self._http_handler(
            method='POST',
            url=url,
            params=query_params,  # key as query param
            json=body_params,      # config in body
            timeout=timeout,
            headers={'User-Agent': self.ua},
            verify=self.verify
        )

        if response.status_code not in (200, 201):
            # Log error details for debugging.
            # Fixed `except (ValueError, Exception)`: Exception already
            # subsumes ValueError, so the tuple was redundant and confusing.
            try:
                error_detail = response.json()
            except Exception:
                error_detail = response.text
            logger.debug(f"Crawler API error ({response.status_code}): {error_detail}")
            self._handle_crawler_error_response(response)

        result = response.json()
        return CrawlerStartResponse(result)

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def get_crawl_status(self, uuid: str) -> CrawlerStatusResponse:
        """
        Fetch the current status of a crawler job.

        :param uuid: Crawler job UUID
        :return: CrawlerStatusResponse with progress information

        Example:
            ```python
            status = client.get_crawl_status(uuid)
            print(f"Status: {status.status}")
            print(f"Progress: {status.progress_pct:.1f}%")
            print(f"Crawled: {status.urls_crawled}/{status.urls_discovered}")

            if status.is_complete:
                print("Crawl completed!")
            ```
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.host}/crawl/{uuid}/status',
            params={'key': self.key},  # key as query param
            timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
            headers={'User-Agent': self.ua},
            verify=self.verify
        )

        if response.status_code != 200:
            self._handle_crawler_error_response(response)

        return CrawlerStatusResponse(response.json())

    def cancel_crawl(self, crawl_uuid: str) -> bool:
        """
        Cancel a running crawler job.

        :param crawl_uuid: Crawler job UUID to cancel
        :return: True if cancelled successfully

        Example:
            ```python
            # Start a crawl
            crawl = client.start_crawl(config)

            # Cancel it
            client.cancel_crawl(crawl.uuid)
            ```
        """
        response = self._http_handler(
            method='DELETE',
            url=f'{self.host}/crawl/{crawl_uuid}',
            params={'key': self.key},
            timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
            headers={'User-Agent': self.ua},
            verify=self.verify
        )

        if response.status_code not in (200, 204):
            self._handle_crawler_error_response(response)

        return True

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def get_crawl_artifact(
        self,
        uuid: str,
        artifact_type: str = 'warc'
    ) -> CrawlerArtifactResponse:
        """
        Download a crawler job artifact.

        :param uuid: Crawler job UUID
        :param artifact_type: Artifact type ('warc' or 'har')
        :return: CrawlerArtifactResponse with WARC data and parsing utilities

        Example:
            ```python
            # Wait for crawl to complete
            while True:
                status = client.get_crawl_status(uuid)
                if status.is_complete:
                    break
                time.sleep(5)

            # Download artifact
            artifact = client.get_crawl_artifact(uuid)

            # Easy mode: get all pages
            pages = artifact.get_pages()
            for page in pages:
                print(f"{page['url']}: {page['status_code']}")

            # Memory-efficient: iterate
            for record in artifact.iter_responses():
                process(record.content)

            # Save to file
            artifact.save('crawl.warc.gz')
            ```
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.host}/crawl/{uuid}/artifact',
            params={'key': self.key, 'type': artifact_type},
            timeout=(self.connect_timeout, 300),  # 5 minutes for large downloads
            headers={'User-Agent': self.ua},
            verify=self.verify
        )

        if response.status_code != 200:
            self._handle_crawler_error_response(response)

        return CrawlerArtifactResponse(response.content, artifact_type=artifact_type)

    @backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
    def get_crawl_contents(
        self,
        uuid: str,
        format: Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata'] = 'html'
    ) -> Dict[str, Any]:
        """
        Get crawl contents in a specific format.

        Retrieves extracted content from crawled pages in the format(s)
        specified in your crawl configuration (via content_formats parameter).

        :param uuid: Crawler job UUID
        :param format: Content format - 'html', 'clean_html', 'markdown', 'json', 'text',
                      'extracted_data', 'page_metadata'
        :return: Dictionary with format {"contents": {url: content, ...}, "links": {...}}

        Example:
            ```python
            # Get all content in markdown format
            result = client.get_crawl_contents(uuid, format='markdown')
            contents = result['contents']

            # Access specific URL
            for url, content in contents.items():
                print(f"{url}: {len(content)} chars")
            ```
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.host}/crawl/{uuid}/contents',
            params={'key': self.key, 'format': format},
            timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
            headers={'User-Agent': self.ua},
            verify=self.verify
        )

        if response.status_code != 200:
            self._handle_crawler_error_response(response)

        return response.json()

    def _handle_crawler_error_response(self, response: Response):
        """
        Raise an HttpError describing a Crawler API error response.

        Falls back to the raw response text and a generic error code when the
        body is not parseable JSON.
        """
        try:
            payload = response.json()
            error_msg = payload.get('message', 'Unknown error')
            error_code = payload.get('code', 'ERR::CRAWLER::UNKNOWN')
        except Exception:
            error_msg, error_code = response.text, 'ERR::CRAWLER::UNKNOWN'

        raise HttpError(
            message=f"Crawler API error ({response.status_code}): {error_msg}",
            code=error_code,
            http_status_code=response.status_code,
            request=response.request,
            response=response
        )

    def cloud_browser(self, browser_config: Optional[BrowserConfig] = None) -> str:
        """
        Get the WebSocket URL for a Cloud Browser session.

        :param browser_config: Optional BrowserConfig - connection parameters;
                               a default config is used when omitted
        :return: str - the full wss:// URL for CDP connection
        """
        config = browser_config if browser_config is not None else BrowserConfig()
        return config.websocket_url(api_key=self.key, host=self.cloud_browser_host)

    def cloud_browser_unblock(
        self,
        url: str,
        proxy_pool: Optional[str] = None,
        country: Optional[str] = None,
        os: Optional[str] = None,
        timeout: Optional[int] = None,
        browser_timeout: Optional[int] = None,
        headers: Optional[Dict] = None,
        body: Optional[str] = None,
        method: Optional[str] = None,
    ) -> Dict:
        """
        Bypass anti-bot protection and get a ready-to-use browser session.

        :param url: Target URL to navigate to and bypass protection
        :param proxy_pool: Proxy pool: 'datacenter' or 'residential'
        :param country: ISO country code for proxy geolocation
        :param os: Operating system fingerprint: 'linux', 'windows', 'macos'
        :param timeout: Navigation timeout in seconds (max 300)
        :param browser_timeout: Browser session timeout in seconds (max 1800)
        :param headers: Custom request headers
        :param body: Request body for POST/PUT/PATCH requests
        :param method: HTTP method: GET, POST, PUT, PATCH, DELETE
        :return: dict with ws_url, session_id, run_id
        """
        # Short aliases are expanded to the full pool names; unknown values
        # are forwarded verbatim.
        proxy_pool_map = {
            'datacenter': 'public_datacenter_pool',
            'residential': 'public_residential_pool',
        }

        json_body = {'url': url}

        if proxy_pool is not None:
            json_body['proxy_pool'] = proxy_pool_map.get(proxy_pool, proxy_pool)

        optional_fields = (
            ('country', country),
            ('os', os),
            ('timeout', timeout),
            ('browser_timeout', browser_timeout),
            ('headers', headers),
            ('body', body),
            ('method', method),
        )
        json_body.update({key: value for key, value in optional_fields if value is not None})

        response = self._http_handler(
            method='POST',
            url=self.cloud_browser_api_host + '/unblock',
            json=json_body,
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, 155),
            headers={
                'content-type': 'application/json',
                'user-agent': self.ua
            },
        )

        response.raise_for_status()

        return response.json()

    def cloud_browser_session_stop(self, session_id: str) -> None:
        """
        Terminate a Cloud Browser session.

        :param session_id: The session identifier to terminate
        """
        response = self._http_handler(
            method='POST',
            url=f'{self.cloud_browser_api_host}/session/{session_id}/stop',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()

    def cloud_browser_playback(self, run_id: str) -> Dict:
        """
        Get playback info for a debug session recording.

        :param run_id: The unique run identifier
        :return: dict with available, metadata, video_url
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.cloud_browser_api_host}/run/{run_id}/playback',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()
        return response.json()

    def cloud_browser_video(self, run_id: str, save_path: Optional[str] = None) -> bytes:
        """
        Download a debug session recording video.

        :param run_id: The unique run identifier
        :param save_path: Optional file path to save the video (e.g. 'recording.webm')
        :return: bytes - raw video data
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.cloud_browser_api_host}/run/{run_id}/video',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, 120),  # videos can be large
            headers={'user-agent': self.ua},
            stream=True,
        )
        response.raise_for_status()

        data = response.content
        if save_path:
            with open(save_path, 'wb') as out:
                out.write(data)

        return data

    # --- Cloud Browser Extension Management ---

    def cloud_browser_extension_list(self) -> Dict:
        """
        List all browser extensions for the current account.

        :return: dict with 'extensions' list and 'quota' info (used, limit)
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.cloud_browser_api_host}/extension',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()
        return response.json()

    def cloud_browser_extension_get(self, extension_id: str) -> Dict:
        """
        Get details of a specific browser extension.

        :param extension_id: The extension identifier
        :return: dict with extension details
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.cloud_browser_api_host}/extension/{extension_id}',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()
        return response.json()

    def cloud_browser_extension_upload(self, file_path: str) -> Dict:
        """
        Upload a browser extension from a local file (.zip or .crx).

        :param file_path: Path to the extension file
        :return: dict with 'extension' details and 'is_update' flag
        """
        file_name = os.path.basename(file_path)
        with open(file_path, 'rb') as handle:
            response = self._http_handler(
                method='POST',
                url=f'{self.cloud_browser_api_host}/extension',
                params={'key': self.key},
                files={'file': (file_name, handle)},
                verify=self.verify,
                timeout=(self.connect_timeout, self.default_read_timeout),
                headers={'user-agent': self.ua},
            )

        response.raise_for_status()
        return response.json()

    def cloud_browser_extension_upload_from_url(self, extension_url: str) -> Dict:
        """
        Install a browser extension from a URL pointing to a .crx file.

        URL-based extensions auto-update on each browser session start.

        :param extension_url: URL to the .crx extension file
        :return: dict with 'extension' details and 'is_update' flag
        """
        response = self._http_handler(
            method='POST',
            url=f'{self.cloud_browser_api_host}/extension',
            params={'key': self.key},
            json={'extension_url': extension_url},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={
                'content-type': 'application/json',
                'user-agent': self.ua
            },
        )
        response.raise_for_status()
        return response.json()

    def cloud_browser_extension_delete(self, extension_id: str) -> Dict:
        """
        Delete a browser extension.

        :param extension_id: The extension identifier to delete
        :return: dict with success status
        """
        response = self._http_handler(
            method='DELETE',
            url=f'{self.cloud_browser_api_host}/extension/{extension_id}',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()
        return response.json()

    def cloud_browser_sessions(self) -> Dict:
        """
        List all running Cloud Browser sessions.

        :return: dict with 'sessions' list and 'total' count
        """
        response = self._http_handler(
            method='GET',
            url=f'{self.cloud_browser_api_host}/sessions',
            params={'key': self.key},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )
        response.raise_for_status()
        return response.json()

Class variables

var CLOUD_BROWSER_API_HOST
var CLOUD_BROWSER_HOST
var CONCURRENCY_AUTO
var DATETIME_FORMAT
var DEFAULT_CONNECT_TIMEOUT
var DEFAULT_CRAWLER_API_READ_TIMEOUT
var DEFAULT_EXTRACTION_API_READ_TIMEOUT
var DEFAULT_READ_TIMEOUT
var DEFAULT_SCREENSHOT_API_READ_TIMEOUT
var DEFAULT_WEBSCRAPING_API_READ_TIMEOUT
var HOST
var brotli : bool
var connect_timeout : int
var debug : bool
var default_read_timeout : int
var distributed_mode : bool
var extraction_api_read_timeout : int
var host : str
var key : str
var max_concurrency : int
var monitoring_api_read_timeout : int
var read_timeout : int
var reporter : scrapfly.reporter.Reporter
var screenshot_api_read_timeout : int
var verify : bool
var version : str
var web_scraping_api_read_timeout : int

Instance variables

prop http
Expand source code
@property
def http(self):
    """Expose the raw HTTP handler callable used internally for all API requests."""
    return self._http_handler
prop ua : str
Expand source code
@property
def ua(self) -> str:
    """User-Agent string identifying the SDK version and host platform."""
    host_info = platform.uname()
    return 'ScrapflySDK/%s (Python %s, %s, %s)' % (
        self.version,
        platform.python_version(),
        host_info.system,
        host_info.machine,
    )

Methods

def account(self) ‑> str | Dict
Expand source code
def account(self) -> Union[str, Dict]:
    """Fetch account/subscription details from the /account endpoint.

    Returns the decoded payload when the body handler supports the response
    content type, otherwise the raw body as a UTF-8 string.
    """
    request_headers = {
        'accept-encoding': self.body_handler.content_encoding,
        'accept': self.body_handler.accept,
        'user-agent': self.ua,
    }

    response = self._http_handler(
        method='GET',
        url=f'{self.host}/account',
        params={'key': self.key},
        verify=self.verify,
        headers=request_headers,
    )

    response.raise_for_status()

    if not self.body_handler.support(response.headers):
        return response.content.decode('utf-8')

    return self.body_handler(response.content, response.headers['content-type'])
async def async_extraction(self,
extraction_config: ExtractionConfig,
loop: asyncio.events.AbstractEventLoop | None = None) ‑> ExtractionApiResponse
Expand source code
async def async_extraction(self, extraction_config:ExtractionConfig, loop:Optional[AbstractEventLoop]=None) -> ExtractionApiResponse:
    """Run :meth:`extract` in the client's executor without blocking the event loop."""
    target_loop = asyncio.get_running_loop() if loop is None else loop
    return await target_loop.run_in_executor(self.async_executor, self.extract, extraction_config)
async def async_scrape(self,
scrape_config: ScrapeConfig,
loop: asyncio.events.AbstractEventLoop | None = None) ‑> ScrapeApiResponse
Expand source code
async def async_scrape(self, scrape_config:ScrapeConfig, loop:Optional[AbstractEventLoop]=None) -> ScrapeApiResponse:
    """Run :meth:`scrape` in the client's executor without blocking the event loop."""
    target_loop = asyncio.get_running_loop() if loop is None else loop
    return await target_loop.run_in_executor(self.async_executor, self.scrape, scrape_config)
async def async_screenshot(self,
screenshot_config: ScreenshotConfig,
loop: asyncio.events.AbstractEventLoop | None = None) ‑> ScreenshotApiResponse
Expand source code
async def async_screenshot(self, screenshot_config:ScreenshotConfig, loop:Optional[AbstractEventLoop]=None) -> ScreenshotApiResponse:
    """Run :meth:`screenshot` in the client's executor without blocking the event loop."""
    target_loop = asyncio.get_running_loop() if loop is None else loop
    return await target_loop.run_in_executor(self.async_executor, self.screenshot, screenshot_config)
def cancel_crawl(self, crawl_uuid: str) ‑> bool
Expand source code
def cancel_crawl(self, crawl_uuid: str) -> bool:
    """
    Cancel a running crawler job.

    :param crawl_uuid: Crawler job UUID to cancel
    :return: True if cancelled successfully

    Example:
        ```python
        crawl = client.start_crawl(config)
        client.cancel_crawl(crawl.uuid)
        ```
    """
    response = self._http_handler(
        method='DELETE',
        url=f'{self.host}/crawl/{crawl_uuid}',
        params={'key': self.key},
        timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
        headers={'User-Agent': self.ua},
        verify=self.verify
    )

    # 200/204 both mean the cancellation was accepted; anything else is
    # surfaced as a typed crawler error.
    if response.status_code in (200, 204):
        return True

    self._handle_crawler_error_response(response)
    return True

Cancel a running crawler job

:param crawl_uuid: Crawler job UUID to cancel :return: True if cancelled successfully

Example

# Start a crawl
crawl = client.start_crawl(config)

# Cancel it
client.cancel_crawl(crawl.uuid)
def close(self)
Expand source code
def close(self):
    """Close the underlying HTTP session and drop the reference to it."""
    session = self.http_session
    session.close()
    self.http_session = None
def cloud_browser(self,
browser_config: BrowserConfig | None = None) ‑> str
Expand source code
def cloud_browser(self, browser_config: Optional[BrowserConfig] = None) -> str:
    """
    Get the WebSocket URL for a Cloud Browser session.
    :param browser_config: Optional BrowserConfig - connection parameters
    :return: str - the full wss:// URL for CDP connection
    """
    # Fall back to default connection parameters when none are given.
    config = browser_config if browser_config is not None else BrowserConfig()
    return config.websocket_url(api_key=self.key, host=self.cloud_browser_host)

Get the WebSocket URL for a Cloud Browser session. :param browser_config: Optional BrowserConfig - connection parameters :return: str - the full wss:// URL for CDP connection

def cloud_browser_extension_delete(self, extension_id: str) ‑> Dict
Expand source code
def cloud_browser_extension_delete(self, extension_id: str) -> Dict:
    """
    Delete a browser extension.
    :param extension_id: The extension identifier to delete
    :return: dict with success status
    """
    # DELETE /extension/<id> on the Cloud Browser API host; the API key is
    # always sent as a query parameter.
    response = self._http_handler(
        method='DELETE',
        url=self.cloud_browser_api_host + '/extension/' + extension_id,
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={
            'user-agent': self.ua
        },
    )

    # Raises on any non-2xx status before attempting to decode the body.
    response.raise_for_status()
    return response.json()

Delete a browser extension. :param extension_id: The extension identifier to delete :return: dict with success status

def cloud_browser_extension_get(self, extension_id: str) ‑> Dict
Expand source code
def cloud_browser_extension_get(self, extension_id: str) -> Dict:
    """
    Get details of a specific browser extension.
    :param extension_id: The extension identifier
    :return: dict with extension details
    """
    api_response = self._http_handler(
        method='GET',
        url=f'{self.cloud_browser_api_host}/extension/{extension_id}',
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={'user-agent': self.ua},
    )
    api_response.raise_for_status()

    return api_response.json()

Get details of a specific browser extension. :param extension_id: The extension identifier :return: dict with extension details

def cloud_browser_extension_list(self) ‑> Dict
Expand source code
def cloud_browser_extension_list(self) -> Dict:
    """
    List all browser extensions for the current account.
    :return: dict with 'extensions' list and 'quota' info (used, limit)
    """
    api_response = self._http_handler(
        method='GET',
        url=f'{self.cloud_browser_api_host}/extension',
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={'user-agent': self.ua},
    )
    api_response.raise_for_status()

    return api_response.json()

List all browser extensions for the current account. :return: dict with 'extensions' list and 'quota' info (used, limit)

def cloud_browser_extension_upload(self, file_path: str) ‑> Dict
Expand source code
def cloud_browser_extension_upload(self, file_path: str) -> Dict:
    """
    Upload a browser extension from a local file (.zip or .crx).
    :param file_path: Path to the extension file
    :return: dict with 'extension' details and 'is_update' flag
    """
    with open(file_path, 'rb') as extension_file:
        upload_response = self._http_handler(
            method='POST',
            url=f'{self.cloud_browser_api_host}/extension',
            params={'key': self.key},
            files={'file': (os.path.basename(file_path), extension_file)},
            verify=self.verify,
            timeout=(self.connect_timeout, self.default_read_timeout),
            headers={'user-agent': self.ua},
        )

    upload_response.raise_for_status()
    return upload_response.json()

Upload a browser extension from a local file (.zip or .crx). :param file_path: Path to the extension file :return: dict with 'extension' details and 'is_update' flag

def cloud_browser_extension_upload_from_url(self, extension_url: str) ‑> Dict
Expand source code
def cloud_browser_extension_upload_from_url(self, extension_url: str) -> Dict:
    """
    Install a browser extension from a URL pointing to a .crx file.
    URL-based extensions auto-update on each browser session start.
    :param extension_url: URL to the .crx extension file
    :return: dict with 'extension' details and 'is_update' flag
    """
    install_response = self._http_handler(
        method='POST',
        url=f'{self.cloud_browser_api_host}/extension',
        params={'key': self.key},
        json={'extension_url': extension_url},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={
            'content-type': 'application/json',
            'user-agent': self.ua,
        },
    )
    install_response.raise_for_status()

    return install_response.json()

Install a browser extension from a URL pointing to a .crx file. URL-based extensions auto-update on each browser session start. :param extension_url: URL to the .crx extension file :return: dict with 'extension' details and 'is_update' flag

def cloud_browser_playback(self, run_id: str) ‑> Dict
Expand source code
def cloud_browser_playback(self, run_id: str) -> Dict:
    """
    Get playback info for a debug session recording.
    :param run_id: The unique run identifier
    :return: dict with available, metadata, video_url
    """
    playback_response = self._http_handler(
        method='GET',
        url=f'{self.cloud_browser_api_host}/run/{run_id}/playback',
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={'user-agent': self.ua},
    )
    playback_response.raise_for_status()

    return playback_response.json()

Get playback info for a debug session recording. :param run_id: The unique run identifier :return: dict with available, metadata, video_url

def cloud_browser_session_stop(self, session_id: str) ‑> None
Expand source code
def cloud_browser_session_stop(self, session_id: str) -> None:
    """
    Terminate a Cloud Browser session.
    :param session_id: The session identifier to terminate
    """
    stop_url = f'{self.cloud_browser_api_host}/session/{session_id}/stop'

    api_response = self._http_handler(
        method='POST',
        url=stop_url,
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={'user-agent': self.ua},
    )
    api_response.raise_for_status()

Terminate a Cloud Browser session. :param session_id: The session identifier to terminate

def cloud_browser_sessions(self) ‑> Dict
Expand source code
def cloud_browser_sessions(self) -> Dict:
    """
    List all running Cloud Browser sessions.
    :return: dict with 'sessions' list and 'total' count
    """
    # GET /sessions on the Cloud Browser API host; API key goes as a query
    # parameter, like every other Cloud Browser endpoint in this client.
    response = self._http_handler(
        method='GET',
        url=self.cloud_browser_api_host + '/sessions',
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, self.default_read_timeout),
        headers={
            'user-agent': self.ua
        },
    )

    # Raises on any non-2xx status before decoding the JSON body.
    response.raise_for_status()
    return response.json()

List all running Cloud Browser sessions. :return: dict with 'sessions' list and 'total' count

def cloud_browser_unblock(self,
url: str,
proxy_pool: str | None = None,
country: str | None = None,
os: str | None = None,
timeout: int | None = None,
browser_timeout: int | None = None,
headers: Dict | None = None,
body: str | None = None,
method: str | None = None) ‑> Dict
Expand source code
def cloud_browser_unblock(
    self,
    url: str,
    proxy_pool: Optional[str] = None,
    country: Optional[str] = None,
    os: Optional[str] = None,
    timeout: Optional[int] = None,
    browser_timeout: Optional[int] = None,
    headers: Optional[Dict] = None,
    body: Optional[str] = None,
    method: Optional[str] = None,
) -> Dict:
    """
    Bypass anti-bot protection and get a ready-to-use browser session.
    :param url: Target URL to navigate to and bypass protection
    :param proxy_pool: Proxy pool: 'datacenter' or 'residential'
    :param country: ISO country code for proxy geolocation
    :param os: Operating system fingerprint: 'linux', 'windows', 'macos'
    :param timeout: Navigation timeout in seconds (max 300)
    :param browser_timeout: Browser session timeout in seconds (max 1800)
    :param headers: Custom request headers
    :param body: Request body for POST/PUT/PATCH requests
    :param method: HTTP method: GET, POST, PUT, PATCH, DELETE
    :return: dict with ws_url, session_id, run_id
    """
    # Short pool aliases map to the API's full pool identifiers; any other
    # value is forwarded unchanged.
    pool_aliases = {
        'datacenter': 'public_datacenter_pool',
        'residential': 'public_residential_pool',
    }

    json_body = {'url': url}

    if proxy_pool is not None:
        json_body['proxy_pool'] = pool_aliases.get(proxy_pool, proxy_pool)

    # Only include optional fields the caller actually supplied.
    optional_fields = (
        ('country', country),
        ('os', os),
        ('timeout', timeout),
        ('browser_timeout', browser_timeout),
        ('headers', headers),
        ('body', body),
        ('method', method),
    )
    json_body.update({field: value for field, value in optional_fields if value is not None})

    response = self._http_handler(
        method='POST',
        url=self.cloud_browser_api_host + '/unblock',
        json=json_body,
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, 155),
        headers={
            'content-type': 'application/json',
            'user-agent': self.ua
        },
    )

    response.raise_for_status()

    return response.json()

Bypass anti-bot protection and get a ready-to-use browser session. :param url: Target URL to navigate to and bypass protection :param proxy_pool: Proxy pool: 'datacenter' or 'residential' :param country: ISO country code for proxy geolocation :param os: Operating system fingerprint: 'linux', 'windows', 'macos' :param timeout: Navigation timeout in seconds (max 300) :param browser_timeout: Browser session timeout in seconds (max 1800) :param headers: Custom request headers :param body: Request body for POST/PUT/PATCH requests :param method: HTTP method: GET, POST, PUT, PATCH, DELETE :return: dict with ws_url, session_id, run_id

def cloud_browser_video(self, run_id: str, save_path: str | None = None) ‑> bytes
Expand source code
def cloud_browser_video(self, run_id: str, save_path: Optional[str] = None) -> bytes:
    """
    Download a debug session recording video.
    :param run_id: The unique run identifier
    :param save_path: Optional file path to save the video (e.g. 'recording.webm')
    :return: bytes - raw video data
    """
    video_response = self._http_handler(
        method='GET',
        url=f'{self.cloud_browser_api_host}/run/{run_id}/video',
        params={'key': self.key},
        verify=self.verify,
        timeout=(self.connect_timeout, 120),  # Videos can be large
        headers={'user-agent': self.ua},
        stream=True,
    )
    video_response.raise_for_status()

    video_bytes = video_response.content

    # Optionally persist to disk before returning.
    if save_path:
        with open(save_path, 'wb') as recording_file:
            recording_file.write(video_bytes)

    return video_bytes

Download a debug session recording video. :param run_id: The unique run identifier :param save_path: Optional file path to save the video (e.g. 'recording.webm') :return: bytes - raw video data

async def concurrent_scrape(self,
scrape_configs: List[ScrapeConfig],
concurrency: int | None = None)
Expand source code
async def concurrent_scrape(self, scrape_configs:List[ScrapeConfig], concurrency:Optional[int]=None):
    """Async generator: scrape many configs with bounded concurrency, yielding
    results (or the raised exception objects) as each task finishes.

    NOTE(review): ``scrape_configs`` is consumed via ``pop()`` — the caller's
    list is emptied as a side effect; confirm callers expect this.
    """
    if concurrency is None:
        concurrency = self.max_concurrency
    elif concurrency == self.CONCURRENCY_AUTO:
        # Resolve the allowed concurrency from the account's subscription.
        concurrency = self.account()['subscription']['max_concurrency']

    loop = asyncio.get_running_loop()
    processing_tasks = []   # tasks currently in flight
    results = []            # finished results/exceptions awaiting yield
    processed_tasks = 0
    expected_tasks = len(scrape_configs)

    def scrape_done_callback(task:Task):
        nonlocal processed_tasks

        try:
            if task.cancelled() is True:
                return

            error = task.exception()

            # Failed scrapes are collected as exception objects, not raised,
            # so one failure does not abort the whole batch.
            if error is not None:
                results.append(error)
            else:
                results.append(task.result())
        finally:
            processing_tasks.remove(task)
            processed_tasks += 1

    # Keep pumping until every config is scheduled, finished, and yielded.
    while scrape_configs or results or processing_tasks:
        logger.info("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))

        if scrape_configs:
            if len(processing_tasks) < concurrency:
                # @todo handle backpressure
                # Top up the in-flight pool to the concurrency limit.
                for _ in range(0, concurrency - len(processing_tasks)):
                    try:
                        scrape_config = scrape_configs.pop()
                    except IndexError:
                        break

                    scrape_config.raise_on_upstream_error = False
                    task = loop.create_task(self.async_scrape(scrape_config=scrape_config, loop=loop))
                    processing_tasks.append(task)
                    task.add_done_callback(scrape_done_callback)

        # Drain whatever completed since the last pass.
        for _ in results:
            result = results.pop()
            yield result

        await asyncio.sleep(.5)

    logger.debug("Scrape %d/%d - %d running" % (processed_tasks, expected_tasks, len(processing_tasks)))
def extract(self,
extraction_config: ExtractionConfig,
no_raise: bool = False) ‑> ExtractionApiResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def extract(self, extraction_config:ExtractionConfig, no_raise:bool=False) -> ExtractionApiResponse:
    """
    Extract structured data from text content.

    :param extraction_config: ExtractionConfig
    :param no_raise: bool - if True, do not raise exception on error while the extraction api response is a ScrapflyError for seamless integration
    :return: str

    With no_raise=True, always check ``extraction_api_response.error`` — on
    failure it holds a structure such as::

        'error': {
            'code': 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED',
            'message': 'The content type of the response is not supported for extraction',
            'http_code': 422,
            'links': {
                'Checkout the related doc: https://scrapfly.io/docs/extraction-api/error/ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED'
            }
        }
    """
    try:
        logger.debug('--> %s Extracting data from' % (extraction_config.content_type))
        request_data = self._extraction_request(extraction_config=extraction_config)
        raw_response = self._http_handler(**request_data)
        return self._handle_extraction_response(response=raw_response, extraction_config=extraction_config)
    except BaseException as e:
        # Report every failure, then either surface the typed API response
        # (no_raise mode) or re-raise.
        self.reporter.report(error=e)

        if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
            return e.api_response

        raise e

Extract structured data from text content :param extraction_config: ExtractionConfig :param no_raise: bool - if True, do not raise exception on error while the extraction api response is a ScrapflyError for seamless integration :return: str

If you use no_raise=True, make sure to check the extraction_api_response.error attribute to handle the error. If the error is not none, you will get the following structure for example

'error': { 'code': 'ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED', 'message': 'The content type of the response is not supported for extraction', 'http_code': 422, 'links': { 'Checkout the related doc: https://scrapfly.io/docs/extraction-api/error/ERR::EXTRACTION::CONTENT_TYPE_NOT_SUPPORTED' } }

def get_crawl_artifact(self, uuid: str, artifact_type: str = 'warc') ‑> CrawlerArtifactResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def get_crawl_artifact(
    self,
    uuid: str,
    artifact_type: str = 'warc'
) -> CrawlerArtifactResponse:
    """
    Download crawler job artifact.

    :param uuid: Crawler job UUID
    :param artifact_type: Artifact type ('warc' or 'har')
    :return: CrawlerArtifactResponse with WARC data and parsing utilities

    Example:
        ```python
        artifact = client.get_crawl_artifact(uuid)
        pages = artifact.get_pages()
        artifact.save('crawl.warc.gz')
        ```
    """
    response = self._http_handler(
        method='GET',
        url=f'{self.host}/crawl/{uuid}/artifact',
        params={
            'key': self.key,
            'type': artifact_type
        },
        timeout=(self.connect_timeout, 300),  # 5 minutes for large downloads
        headers={'User-Agent': self.ua},
        verify=self.verify
    )

    if response.status_code != 200:
        self._handle_crawler_error_response(response)

    return CrawlerArtifactResponse(response.content, artifact_type=artifact_type)

Download crawler job artifact

:param uuid: Crawler job UUID :param artifact_type: Artifact type ('warc' or 'har') :return: CrawlerArtifactResponse with WARC data and parsing utilities

Example

# Wait for crawl to complete
while True:
    status = client.get_crawl_status(uuid)
    if status.is_complete:
        break
    time.sleep(5)

# Download artifact
artifact = client.get_crawl_artifact(uuid)

# Easy mode: get all pages
pages = artifact.get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")

# Memory-efficient: iterate
for record in artifact.iter_responses():
    process(record.content)

# Save to file
artifact.save('crawl.warc.gz')
def get_crawl_contents(self,
uuid: str,
format: Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata'] = 'html') ‑> Dict[str, Any]
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def get_crawl_contents(
    self,
    uuid: str,
    format: Literal['html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata'] = 'html'
) -> Dict[str, Any]:
    """
    Get crawl contents in a specific format.

    Retrieves extracted content from crawled pages in the format(s)
    requested via the crawl's ``content_formats`` configuration.

    :param uuid: Crawler job UUID
    :param format: Content format key
    :return: Dictionary with format {"contents": {url: content, ...}, "links": {...}}
    """
    response = self._http_handler(
        method='GET',
        url=f'{self.host}/crawl/{uuid}/contents',
        params={'key': self.key, 'format': format},
        timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
        headers={'User-Agent': self.ua},
        verify=self.verify
    )

    if response.status_code != 200:
        self._handle_crawler_error_response(response)

    return response.json()

Get crawl contents in a specific format

Retrieves extracted content from crawled pages in the format(s) specified in your crawl configuration (via content_formats parameter).

:param uuid: Crawler job UUID :param format: Content format - 'html', 'clean_html', 'markdown', 'json', 'text', 'extracted_data', 'page_metadata' :return: Dictionary with format {"contents": {url: content, …}, "links": {…}}

Example

# Get all content in markdown format
result = client.get_crawl_contents(uuid, format='markdown')
contents = result['contents']

# Access specific URL
for url, content in contents.items():
    print(f"{url}: {len(content)} chars")
def get_crawl_status(self, uuid: str) ‑> CrawlerStatusResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def get_crawl_status(self, uuid: str) -> CrawlerStatusResponse:
    """
    Get crawler job status.

    :param uuid: Crawler job UUID
    :return: CrawlerStatusResponse with progress information

    Example:
        ```python
        status = client.get_crawl_status(uuid)
        print(f"Progress: {status.progress_pct:.1f}%")
        if status.is_complete:
            print("Crawl completed!")
        ```
    """
    response = self._http_handler(
        method='GET',
        url=f'{self.host}/crawl/{uuid}/status',
        params={'key': self.key},
        timeout=(self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT),
        headers={'User-Agent': self.ua},
        verify=self.verify
    )

    if response.status_code != 200:
        self._handle_crawler_error_response(response)

    return CrawlerStatusResponse(response.json())

Get crawler job status

:param uuid: Crawler job UUID :return: CrawlerStatusResponse with progress information

Example

status = client.get_crawl_status(uuid)
print(f"Status: {status.status}")
print(f"Progress: {status.progress_pct:.1f}%")
print(f"Crawled: {status.urls_crawled}/{status.urls_discovered}")

if status.is_complete:
    print("Crawl completed!")
def get_monitoring_metrics(self,
format: str = 'structured',
period: str | None = None,
aggregation: List[Literal['account', 'project', 'target']] | None = None)
Expand source code
def get_monitoring_metrics(self, format:str=ScraperAPI.MONITORING_DATA_FORMAT_STRUCTURED, period:Optional[str]=None, aggregation:Optional[List[MonitoringAggregation]]=None):
    """Fetch account-level scrape monitoring metrics.

    :param format: response format (structured by default)
    :param period: optional reporting period
    :param aggregation: optional aggregation levels, sent comma-separated
    """
    params = {'key': self.key, 'format': format}

    if period is not None:
        params['period'] = period

    if aggregation is not None:
        params['aggregation'] = ','.join(aggregation)

    response = self._http_handler(
        method='GET',
        url=f'{self.host}/scrape/monitoring/metrics',
        params=params,
        timeout=(self.connect_timeout, self.monitoring_api_read_timeout),
        verify=self.verify,
        headers={
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        },
    )

    response.raise_for_status()

    # Decode through the body handler when it understands the content type,
    # otherwise hand back the raw UTF-8 body.
    if not self.body_handler.support(response.headers):
        return response.content.decode('utf-8')

    return self.body_handler(response.content, response.headers['content-type'])
def get_monitoring_target_metrics(self,
domain: str,
group_subdomain: bool = False,
period: Literal['subscription', 'last7d', 'last24h', 'last1h', 'last5m'] | None = 'last24h',
start: datetime.datetime | None = None,
end: datetime.datetime | None = None)
Expand source code
def get_monitoring_target_metrics(
        self,
        domain:str,
        group_subdomain:bool=False,
        period:Optional[MonitoringTargetPeriod]=ScraperAPI.MONITORING_PERIOD_LAST_24H,
        start:Optional[datetime.datetime]=None,
        end:Optional[datetime.datetime]=None,
):
    """Fetch monitoring metrics for a single target domain.

    Pass either ``period`` or an explicit ``start``/``end`` pair; an explicit
    range takes precedence and clears ``period``.
    """
    # start and end must be provided together (XOR check).
    if (start is None) != (end is None):
        raise ValueError('You must provide both start and end date')

    params = {
        'key': self.key,
        'domain': domain,
        'group_subdomain': group_subdomain
    }

    if start is not None and end is not None:
        params['start'] = start.strftime(self.DATETIME_FORMAT)
        params['end'] = end.strftime(self.DATETIME_FORMAT)
        period = None

    params['period'] = period

    response = self._http_handler(
        method='GET',
        url=f'{self.host}/scrape/monitoring/metrics/target',
        timeout=(self.connect_timeout, self.monitoring_api_read_timeout),
        params=params,
        verify=self.verify,
        headers={
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        },
    )

    response.raise_for_status()

    if not self.body_handler.support(response.headers):
        return response.content.decode('utf-8')

    return self.body_handler(response.content, response.headers['content-type'])
def open(self)
Expand source code
def open(self):
    """Lazily create and configure the shared HTTP session (no-op if open)."""
    if self.http_session is not None:
        return

    session = Session()
    session.verify = self.verify
    session.timeout = (self.connect_timeout, self.default_read_timeout)
    session.params['key'] = self.key
    session.headers['accept-encoding'] = self.body_handler.content_encoding
    session.headers['accept'] = self.body_handler.accept
    session.headers['user-agent'] = self.ua
    self.http_session = session
def resilient_scrape(self,
scrape_config: ScrapeConfig,
retry_on_errors: Set[Exception] | None = None,
retry_on_status_code: List[int] | None = None,
tries: int = 5,
delay: int = 20) ‑> ScrapeApiResponse
Expand source code
def resilient_scrape(
    self,
    scrape_config:ScrapeConfig,
    retry_on_errors:Optional[Set[Exception]]=None,
    retry_on_status_code:Optional[List[int]]=None,
    tries: int = 5,
    delay: int = 20,
) -> ScrapeApiResponse:
    """Scrape with automatic retries on selected errors and upstream status codes.

    NOTE(review): ``delay`` is passed to backoff as ``max_time`` (a total time
    budget in seconds), not a per-retry delay — confirm the parameter name.
    """
    # Default to retrying on any ScrapflyError subclass.
    if retry_on_errors is None:
        retry_on_errors = {ScrapflyError}
    assert isinstance(retry_on_errors, set), 'retry_on_errors is not a set()'

    @backoff.on_exception(backoff.expo, exception=tuple(retry_on_errors), max_tries=tries, max_time=delay)
    def inner() -> ScrapeApiResponse:

        try:
            return self.scrape(scrape_config=scrape_config)
        except (UpstreamHttpClientError, UpstreamHttpServerError) as e:
            # Upstream HTTP errors: re-raise (and thus retry) only when the
            # upstream status code is in the retry list; otherwise return the
            # API response as-is.
            if retry_on_status_code is not None and e.api_response:
                if e.api_response.upstream_status_code in retry_on_status_code:
                    raise e
                else:
                    return e.api_response

            raise e

    return inner()
def save_scrape_screenshot(self,
api_response: ScrapeApiResponse,
name: str,
path: str | None = None)
Expand source code
def save_scrape_screenshot(self, api_response:ScrapeApiResponse, name:str, path:Optional[str]=None):
    """
    Save a screenshot from a scrape result.

    :param api_response: ScrapeApiResponse
    :param name: str - name of the screenshot given in the scrape config
    :param path: Optional[str] - directory to save into (defaults to the sink's default)
    :raises RuntimeError: if no screenshot with the given name exists in the result
    """

    screenshots = api_response.scrape_result['screenshots']

    # Single membership check replaces the previous truthiness test plus
    # try/except KeyError over the same dict; error message grammar fixed
    # ("do no exists" -> "does not exist").
    if not screenshots or name not in screenshots:
        raise RuntimeError('Screenshot %s does not exist' % name)

    screenshot_response = self._http_handler(
        method='GET',
        url=screenshots[name]['url'],
        params={'key': self.key},
        verify=self.verify
    )

    screenshot_response.raise_for_status()

    # Screenshots are always JPEGs; normalize the file name.
    if not name.endswith('.jpg'):
        name += '.jpg'

    api_response.sink(path=path, name=name, content=screenshot_response.content)

Save a screenshot from a scrape result :param api_response: ScrapeApiResponse :param name: str - name of the screenshot given in the scrape config :param path: Optional[str]

def save_screenshot(self,
screenshot_api_response: ScreenshotApiResponse,
name: str,
path: str | None = None)
Expand source code
def save_screenshot(self, screenshot_api_response:ScreenshotApiResponse, name:str, path:Optional[str]=None):
    """
    Persist a screenshot API response to disk.

    :param screenshot_api_response: ScreenshotApiResponse
    :param name: str - file name (without extension) to save the screenshot as
    :param path: Optional[str] - target directory, created if missing
    :raises RuntimeError: when the screenshot failed or carries no binary
    """

    if screenshot_api_response.screenshot_success is not True:
        raise RuntimeError('Screenshot was not successful')

    image = screenshot_api_response.image
    if not image:
        raise RuntimeError('Screenshot binary does not exist')

    # The file extension is derived from the response content-type metadata.
    extension = screenshot_api_response.metadata['extension_name']
    filename = f'{name}.{extension}'

    if path:
        os.makedirs(path, exist_ok=True)
        target = os.path.join(path, filename)
    else:
        target = filename

    # Normalize raw bytes into a stream so the chunked copy below is uniform.
    stream = BytesIO(image) if isinstance(image, bytes) else image

    with open(target, 'wb') as destination:
        shutil.copyfileobj(stream, destination, length=131072)

Save a screenshot from a screenshot API response :param api_response: ScreenshotApiResponse :param name: str - name of the screenshot to save as :param path: Optional[str]

def scrape(self,
scrape_config: ScrapeConfig,
no_raise: bool = False) ‑> ScrapeApiResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def scrape(self, scrape_config:ScrapeConfig, no_raise:bool=False) -> ScrapeApiResponse:
    """
    Scrape a website
    :param scrape_config: ScrapeConfig
    :param no_raise: bool - if True, do not raise exception on error while the api response is a ScrapflyError for seamless integration
    :return: ScrapeApiResponse

    If you use no_raise=True, make sure to check the api_response.scrape_result.error attribute to handle the error.
    If the error is not none, you will get the following structure for example

    'error': {
        'code': 'ERR::ASP::SHIELD_PROTECTION_FAILED',
        'message': 'The ASP shield failed to solve the challenge against the anti scrapping protection - heuristic_engine bypass failed, please retry in few seconds',
        'retryable': False,
        'http_code': 422,
        'links': {
            'Checkout ASP documentation': 'https://scrapfly.io/docs/scrape-api/anti-scraping-protection#maximize_success_rate', 'Related Error Doc': 'https://scrapfly.io/docs/scrape-api/error/ERR::ASP::SHIELD_PROTECTION_FAILED'
        }
    }
    """

    try:
        logger.debug('--> %s Scrapping %s' % (scrape_config.method, scrape_config.url))
        # Translate the ScrapeConfig into keyword arguments for the HTTP
        # handler, then dispatch the request.
        request_data = self._scrape_request(scrape_config=scrape_config)
        response = self._http_handler(**request_data)

        if scrape_config.proxified_response is True:
            # Proxified mode: the API returns the raw upstream response
            # (target's status, headers, body) instead of the JSON
            # envelope. Error restoration: if X-Scrapfly-Reject-Code is
            # present, the scrape failed and the SDK must raise a typed
            # error with the code/message/retryable from the headers.
            reject_code = response.headers.get('X-Scrapfly-Reject-Code')
            if reject_code:
                from scrapfly.errors import HttpError
                reject_desc = response.headers.get('X-Scrapfly-Reject-Description', '')
                reject_retryable = response.headers.get('X-Scrapfly-Reject-Retryable', 'false').lower() == 'true'
                retry_after = None
                if reject_retryable:
                    # Retry-After may be absent or malformed; treat both as
                    # "no delay hint".
                    try:
                        retry_after = int(response.headers.get('Retry-After', '0'))
                    except (ValueError, TypeError):
                        retry_after = None
                raise HttpError(
                    request=response.request,
                    response=response,
                    code=reject_code,
                    http_status_code=response.status_code,
                    message=reject_desc,
                    is_retryable=reject_retryable,
                    retry_delay=retry_after,
                )
            # No API envelope exists in proxified mode, so report without one.
            self.reporter.report(scrape_api_response=None)
            # NOTE(review): proxified mode returns the raw HTTP response
            # object, not a ScrapeApiResponse, despite the return annotation.
            return response

        scrape_api_response = self._handle_response(response=response, scrape_config=scrape_config)

        self.reporter.report(scrape_api_response=scrape_api_response)

        return scrape_api_response
    except BaseException as e:
        # BaseException (not Exception) so even KeyboardInterrupt/SystemExit
        # get reported before being re-raised.
        self.reporter.report(error=e)

        # With no_raise, swallow Scrapfly errors that carry an API response so
        # callers can inspect scrape_result['error'] instead of catching.
        if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
            return e.api_response

        raise e

Scrape a website :param scrape_config: ScrapeConfig :param no_raise: bool - if True, do not raise exception on error while the api response is a ScrapflyError for seamless integration :return: ScrapeApiResponse

If you use no_raise=True, make sure to check the api_response.scrape_result.error attribute to handle the error. If the error is not none, you will get the following structure for example

'error': { 'code': 'ERR::ASP::SHIELD_PROTECTION_FAILED', 'message': 'The ASP shield failed to solve the challenge against the anti scrapping protection - heuristic_engine bypass failed, please retry in few seconds', 'retryable': False, 'http_code': 422, 'links': { 'Checkout ASP documentation': 'https://scrapfly.io/docs/scrape-api/anti-scraping-protection#maximize_success_rate', 'Related Error Doc': 'https://scrapfly.io/docs/scrape-api/error/ERR::ASP::SHIELD_PROTECTION_FAILED' } }

def screenshot(self,
screenshot_config: ScreenshotConfig,
no_raise: bool = False) ‑> ScreenshotApiResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=NetworkError, max_tries=5)
def screenshot(self, screenshot_config:ScreenshotConfig, no_raise:bool=False) -> ScreenshotApiResponse:
    """
    Take a screenshot
    :param screenshot_config: ScreenshotConfig
    :param no_raise: bool - if True, do not raise exception on error while the screenshot api response is a ScrapflyError for seamless integration
    :return: ScreenshotApiResponse

    If you use no_raise=True, make sure to check the screenshot_api_response.error attribute to handle the error.
    If the error is not none, you will get the following structure for example

    'error': {
        'code': 'ERR::SCREENSHOT::UNABLE_TO_TAKE_SCREENSHOT',
        'message': 'For some reason we were unable to take the screenshot',
        'http_code': 422,
        'links': {
            'Checkout the related doc: https://scrapfly.io/docs/screenshot-api/error/ERR::SCREENSHOT::UNABLE_TO_TAKE_SCREENSHOT'
        }
    }
    """

    try:
        logger.debug('--> %s Screenshoting' % (screenshot_config.url))
        # Build the request kwargs for the Screenshot API and dispatch it.
        request_data = self._screenshot_request(screenshot_config=screenshot_config)
        response = self._http_handler(**request_data)
        screenshot_api_response = self._handle_screenshot_response(response=response, screenshot_config=screenshot_config)
        return screenshot_api_response
    except BaseException as e:
        # Every failure (including interrupts) is reported before re-raising.
        self.reporter.report(error=e)

        # With no_raise, return the error-bearing response so the caller can
        # inspect screenshot_api_response.error instead of catching.
        if no_raise and isinstance(e, ScrapflyError) and e.api_response is not None:
            return e.api_response

        raise e

Take a screenshot :param screenshot_config: ScreenshotConfig :param no_raise: bool - if True, do not raise an exception on error while the screenshot api response is a ScrapflyError, for seamless integration :return: ScreenshotApiResponse

If you use no_raise=True, make sure to check the screenshot_api_response.error attribute to handle the error. If the error is not none, you will get the following structure for example

'error': { 'code': 'ERR::SCREENSHOT::UNABLE_TO_TAKE_SCREENSHOT', 'message': 'For some reason we were unable to take the screenshot', 'http_code': 422, 'links': { 'Checkout the related doc: https://scrapfly.io/docs/screenshot-api/error/ERR::SCREENSHOT::UNABLE_TO_TAKE_SCREENSHOT' } }

def sink(self,
api_response: ScrapeApiResponse,
content: str | bytes | None = None,
path: str | None = None,
name: str | None = None,
file:  | _io.BytesIO | None = None) ‑> str
Expand source code
def sink(self, api_response:ScrapeApiResponse, content:Optional[Union[str, bytes]]=None, path: Optional[str] = None, name: Optional[str] = None, file: Optional[Union[TextIO, BytesIO]] = None) -> str:
    """
    Write scraped content to disk (or into an already-open file object).

    :param api_response: ScrapeApiResponse whose content is saved by default
    :param content: Optional[Union[str, bytes]] - overrides the scraped content
    :param path: Optional[str] - directory prefix for the generated file path
    :param name: Optional[str] - file name; extension inferred when absent
    :param file: Optional[Union[TextIO, BytesIO]] - open target file object
    :return: str - the path written to (None when `file` was supplied)
    """
    scrape_result = api_response.result['result']
    scrape_config = api_response.result['config']

    # Explicit content wins over the scraped body.
    file_content = content or scrape_result['content']
    file_path = None
    file_extension = None

    if name:
        name_parts = name.split('.')
        if len(name_parts) > 1:
            # NOTE(review): the extension taken from the name has no leading
            # dot, unlike the MIME-derived one below; the later find() check
            # still works because it is a substring test.
            file_extension = name_parts[-1]

    if not file:
        if file_extension is None:
            # Derive the extension from the upstream content-type header.
            # Assumes the header key is lowercase 'content-type' — TODO confirm.
            try:
                mime_type = scrape_result['response_headers']['content-type']
            except KeyError:
                mime_type = 'application/octet-stream'

            if ';' in mime_type:
                mime_type = mime_type.split(';')[0]

            file_extension = '.' + mime_type.split('/')[1]

        if not name:
            # Fall back to the last URL path segment as the file name.
            name = scrape_config['url'].split('/')[-1]

        # Append the extension only when it is not already part of the name.
        if name.find(file_extension) == -1:
            name += file_extension

        file_path = path + '/' + name if path else name

        # A URL ending in '/' leaves only the extension as the path; build a
        # name from the URL itself instead.
        if file_path == file_extension:
            url = re.sub(r'(https|http)?://', '', api_response.config['url']).replace('/', '-')

            if url[-1] == '-':
                url = url[:-1]

            url += file_extension

            file_path = url

        # Closed below by the `with file as f:` block.
        file = open(file_path, 'wb')

    # Normalize str/bytes content into a binary stream for copyfileobj.
    if isinstance(file_content, str):
        file_content = BytesIO(file_content.encode('utf-8'))
    elif isinstance(file_content, bytes):
        file_content = BytesIO(file_content)

    file_content.seek(0)
    with file as f:
        shutil.copyfileobj(file_content, f, length=131072)

    logger.info('file %s created' % file_path)
    # NOTE(review): returns None when a `file` object was passed in, despite
    # the -> str annotation.
    return file_path
def start_crawl(self,
crawler_config: CrawlerConfig) ‑> CrawlerStartResponse
Expand source code
@backoff.on_exception(backoff.expo, exception=ConnectionError, max_tries=5)
def start_crawl(self, crawler_config: CrawlerConfig) -> CrawlerStartResponse:
    """
    Start a crawler job

    :param crawler_config: CrawlerConfig
    :return: CrawlerStartResponse with UUID and initial status

    Example:
        ```python
        from scrapfly import ScrapflyClient, CrawlerConfig

        client = ScrapflyClient(key='YOUR_API_KEY')
        config = CrawlerConfig(
            url='https://example.com',
            page_limit=100,
            max_depth=3
        )

        response = client.start_crawl(config)
        print(f"Crawler started: {response.uuid}")
        ```
    """
    # Get crawler config params (without key)
    body_params = crawler_config.to_api_params()

    # API key must be passed as query parameter, not in body
    query_params = {'key': self.key}

    # Crawler jobs use a dedicated (longer) read timeout than regular scrapes.
    timeout = (self.connect_timeout, self.DEFAULT_CRAWLER_API_READ_TIMEOUT)

    url = f'{self.host}/crawl'
    logger.debug(f"Crawler API POST {url}?key=***")
    logger.debug(f"Crawler API body: {body_params}")

    response = self._http_handler(
        method='POST',
        url=url,
        params=query_params,  # key as query param
        json=body_params,      # config in body
        timeout=timeout,
        headers={'User-Agent': self.ua},
        verify=self.verify
    )

    if response.status_code not in (200, 201):
        # Log error details for debugging; the error body may not be JSON,
        # so fall back to raw text.
        try:
            error_detail = response.json()
        except Exception:
            # Fixed: was `except (ValueError, Exception)` — listing
            # ValueError alongside its ancestor Exception was redundant.
            error_detail = response.text
        logger.debug(f"Crawler API error ({response.status_code}): {error_detail}")
        self._handle_crawler_error_response(response)

    result = response.json()
    return CrawlerStartResponse(result)

Start a crawler job

:param crawler_config: CrawlerConfig :return: CrawlerStartResponse with UUID and initial status

Example

from scrapfly import ScrapflyClient, CrawlerConfig

client = ScrapflyClient(key='YOUR_API_KEY')
config = CrawlerConfig(
    url='https://example.com',
    page_limit=100,
    max_depth=3
)

response = client.start_crawl(config)
print(f"Crawler started: {response.uuid}")
class ScrapflyCrawlerError (message: str,
code: str,
http_status_code: int,
resource: str | None = None,
is_retryable: bool = False,
retry_delay: int | None = None,
retry_times: int | None = None,
documentation_url: str | None = None,
api_response: ForwardRef('ApiResponse') | None = None)
Expand source code
class ScrapflyCrawlerError(CrawlerError):
    """Exception raised when a crawler job fails or is cancelled"""
    # Thin marker subclass of CrawlerError; adds no state or behavior.
    pass

Exception raised when a crawler job fails or is cancelled

Ancestors

class ScrapflyError (message: str,
code: str,
http_status_code: int,
resource: str | None = None,
is_retryable: bool = False,
retry_delay: int | None = None,
retry_times: int | None = None,
documentation_url: str | None = None,
api_response: ForwardRef('ApiResponse') | None = None)
Expand source code
class ScrapflyError(Exception):
    """
    Root exception for every error raised by the Scrapfly SDK.

    Carries the API error code, HTTP status, retry hints and, when
    available, the originating API response.
    """

    KIND_HTTP_BAD_RESPONSE = 'HTTP_BAD_RESPONSE'
    KIND_SCRAPFLY_ERROR = 'SCRAPFLY_ERROR'

    RESOURCE_PROXY = 'PROXY'
    RESOURCE_THROTTLE = 'THROTTLE'
    RESOURCE_SCRAPE = 'SCRAPE'
    RESOURCE_ASP = 'ASP'
    RESOURCE_SCHEDULE = 'SCHEDULE'
    RESOURCE_WEBHOOK = 'WEBHOOK'
    RESOURCE_SESSION = 'SESSION'

    def __init__(
        self,
        message: str,
        code: str,
        http_status_code: int,
        resource: Optional[str]=None,
        is_retryable: bool = False,
        retry_delay: Optional[int] = None,
        retry_times: Optional[int] = None,
        documentation_url: Optional[str] = None,
        api_response: Optional['ApiResponse'] = None
    ):
        # Core identification of the error.
        self.message = message
        self.code = code
        self.http_status_code = http_status_code
        self.resource = resource

        # Retry guidance supplied by the API.
        self.is_retryable = is_retryable
        self.retry_delay = retry_delay
        self.retry_times = retry_times

        # Extra context for debugging and documentation lookups.
        self.documentation_url = documentation_url
        self.api_response = api_response

        super().__init__(self.message, str(self.code))

    def __str__(self):
        # Append the documentation link when one was provided.
        if self.documentation_url is None:
            return self.message
        return '%s. Learn more: %s' % (self.message, self.documentation_url)

Common base class for all non-exit exceptions.

Ancestors

  • builtins.Exception
  • builtins.BaseException

Subclasses

  • CrawlerError
  • scrapfly.errors.ExtraUsageForbidden
  • scrapfly.errors.HttpError

Class variables

var KIND_HTTP_BAD_RESPONSE
var KIND_SCRAPFLY_ERROR
var RESOURCE_ASP
var RESOURCE_PROXY
var RESOURCE_SCHEDULE
var RESOURCE_SCRAPE
var RESOURCE_SESSION
var RESOURCE_THROTTLE
var RESOURCE_WEBHOOK
class ScrapflyProxyError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyProxyError(ScraperAPIError):
    """Scraper API error raised for proxy-related failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflyScheduleError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyScheduleError(ScraperAPIError):
    """Scraper API error raised for schedule-related failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflyScrapeError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyScrapeError(ScraperAPIError):
    """Scraper API error raised for scrape-related failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflySessionError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflySessionError(ScraperAPIError):
    """Scraper API error raised for session-related failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflyThrottleError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyThrottleError(ScraperAPIError):
    """Scraper API error raised for throttling / rate-limit failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScrapflyWebhookError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScrapflyWebhookError(ScraperAPIError):
    """Scraper API error raised for webhook-related failures."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.ScraperAPIError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScreenshotAPIError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class ScreenshotAPIError(HttpError):
    # Used as the error_class by ScreenshotApiResponse.raise_for_result.
    """HTTP error raised for failed Screenshot API results."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException
class ScreenshotApiResponse (request: requests.models.Request,
response: requests.models.Response,
screenshot_config: ScreenshotConfig,
api_result: bytes | None = None)
Expand source code
class ScreenshotApiResponse(ApiResponse):
    """Typed wrapper around a Screenshot API HTTP response."""

    def __init__(self, request: Request, response: Response, screenshot_config: ScreenshotConfig, api_result: Optional[bytes] = None):
        # api_result holds either the screenshot data on success or the API
        # error envelope on failure (see handle_api_result / _is_api_error).
        super().__init__(request, response)
        self.screenshot_config = screenshot_config
        self.result = self.handle_api_result(api_result)

    @property
    def image(self) -> Optional[str]:
        # NOTE(review): returns '' (not None) when no result is present,
        # despite the Optional[str] annotation; on success this is the raw
        # screenshot binary.
        binary = self.result.get('result', None)
        if binary is None:
            return ''

        return binary

    @property
    def metadata(self) -> Optional[Dict]:
        # Without an image there is nothing to describe.
        if not self.image:
            return {}

        # Extension is the content-type subtype, e.g. 'image/png; ...' -> 'png'.
        content_type = self.response.headers.get('content-type')
        extension_name = content_type[content_type.find('/') + 1:].split(';')[0]

        return {
            'extension_name': extension_name,
            'upstream-status-code': self.response.headers.get('X-Scrapfly-Upstream-Http-Code'),
            'upstream-url': self.response.headers.get('X-Scrapfly-Upstream-Url')
        }

    @property
    def screenshot_success(self) -> bool:
        # Success is defined purely by the presence of image data.
        if not self.image:
            return False

        return True

    @property
    def error(self) -> Optional[Dict]:
        # No error when image data is present.
        if self.image:
            return None

        # Failed screenshot: the result payload is the API error envelope.
        if self.screenshot_success is False:
            return self.result

    def _is_api_error(self, api_result: Dict) -> bool:
        # A missing payload is treated as an error.
        if api_result is None:
            return True

        return 'error_id' in api_result

    def handle_api_result(self, api_result: bytes) -> FrozenDict:
        # NOTE(review): FrozenDict(None) would raise when api_result is None;
        # presumably the caller never passes None for error cases — TODO confirm.
        if self._is_api_error(api_result=api_result) is True:
            return FrozenDict(api_result)

        # NOTE(review): successful results are returned unchanged (not wrapped
        # in FrozenDict), despite the return annotation.
        return api_result

    def raise_for_result(self, raise_on_upstream_error=True, error_class=ScreenshotAPIError):
        # Delegate to the base implementation with the screenshot error class.
        super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)

Ancestors

Instance variables

prop error : Dict | None
Expand source code
@property
def error(self) -> Optional[Dict]:
    # No error when image data is present.
    if self.image:
        return None

    # Failed screenshot: the result payload is the API error envelope.
    if self.screenshot_success is False:
        return self.result
prop image : str | None
Expand source code
@property
def image(self) -> Optional[str]:
    # NOTE(review): returns '' (not None) when no result is present, despite
    # the Optional[str] annotation; on success this is the raw binary.
    binary = self.result.get('result', None)
    if binary is None:
        return ''

    return binary
prop metadata : Dict | None
Expand source code
@property
def metadata(self) -> Optional[Dict]:
    # Without an image there is nothing to describe.
    if not self.image:
        return {}

    # Extension is the content-type subtype, e.g. 'image/png; ...' -> 'png'.
    content_type = self.response.headers.get('content-type')
    extension_name = content_type[content_type.find('/') + 1:].split(';')[0]

    return {
        'extension_name': extension_name,
        'upstream-status-code': self.response.headers.get('X-Scrapfly-Upstream-Http-Code'),
        'upstream-url': self.response.headers.get('X-Scrapfly-Upstream-Url')
    }
prop screenshot_success : bool
Expand source code
@property
def screenshot_success(self) -> bool:
    # Success is defined purely by the presence of image data.
    if not self.image:
        return False

    return True

Methods

def handle_api_result(self, api_result: bytes) ‑> FrozenDict
Expand source code
def handle_api_result(self, api_result: bytes) -> FrozenDict:
    # NOTE(review): FrozenDict(None) would raise when api_result is None;
    # presumably the caller never passes None for error cases — TODO confirm.
    if self._is_api_error(api_result=api_result) is True:
        return FrozenDict(api_result)

    # NOTE(review): successful results are returned unchanged (not wrapped
    # in FrozenDict), despite the return annotation.
    return api_result
def raise_for_result(self,
raise_on_upstream_error=True,
error_class=scrapfly.errors.ScreenshotAPIError)
Expand source code
def raise_for_result(self, raise_on_upstream_error=True, error_class=ScreenshotAPIError):
    # Delegate to the base implementation with the screenshot error class.
    super().raise_for_result(raise_on_upstream_error=raise_on_upstream_error, error_class=error_class)

Inherited members

class ScreenshotConfig (url: str,
format: Format | None = None,
capture: str | None = None,
resolution: str | None = None,
country: str | None = None,
timeout: int | None = None,
rendering_wait: int | None = None,
wait_for_selector: str | None = None,
options: List[Options] | None = None,
auto_scroll: bool | None = None,
js: str | None = None,
cache: bool | None = None,
cache_ttl: int | None = None,
cache_clear: bool | None = None,
vision_deficiency: VisionDeficiency | None = None,
webhook: str | None = None,
raise_on_upstream_error: bool = True)
Expand source code
class ScreenshotConfig(BaseApiConfig):
    """
    Configuration for a Screenshot API request.

    Mirrors the Screenshot API parameters; `to_api_params` serializes an
    instance for the HTTP call, while `to_dict`/`from_dict` round-trip it
    through a plain dictionary.
    """

    url: str
    format: Optional[Format] = None
    capture: Optional[str] = None
    resolution: Optional[str] = None
    country: Optional[str] = None
    timeout: Optional[int] = None # in milliseconds
    rendering_wait: Optional[int] = None # in milliseconds
    wait_for_selector: Optional[str] = None
    options: Optional[List[Options]] = None
    auto_scroll: Optional[bool] = None
    js: Optional[str] = None
    cache: Optional[bool] = None
    cache_ttl: Optional[int] = None
    cache_clear: Optional[bool] = None
    # Fix: vision_deficiency is an __init__ parameter and instance attribute
    # but was missing from these class-level declarations, unlike every
    # other parameter.
    vision_deficiency: Optional[VisionDeficiency] = None
    webhook: Optional[str] = None
    raise_on_upstream_error: bool = True

    def __init__(
        self,
        url: str,
        format: Optional[Format] = None,
        capture: Optional[str] = None,
        resolution: Optional[str] = None,
        country: Optional[str] = None,
        timeout: Optional[int] = None, # in milliseconds
        rendering_wait: Optional[int] = None, # in milliseconds
        wait_for_selector: Optional[str] = None,
        options: Optional[List[Options]] = None,
        auto_scroll: Optional[bool] = None,
        js: Optional[str] = None,
        cache: Optional[bool] = None,
        cache_ttl: Optional[int] = None,
        cache_clear: Optional[bool] = None,
        vision_deficiency: Optional[VisionDeficiency] = None,
        webhook: Optional[str] = None,
        raise_on_upstream_error: bool = True
    ):
        assert(type(url) is str)

        self.url = url
        # Per-instance API key override; falls back to the client key in
        # to_api_params.
        self.key = None
        self.format = format
        self.capture = capture
        self.resolution = resolution
        self.country = country
        self.timeout = timeout
        self.rendering_wait = rendering_wait
        self.wait_for_selector = wait_for_selector
        # Normalize raw option values into Options enum members.
        self.options = [Options(flag) for flag in options] if options else None
        self.auto_scroll = auto_scroll
        self.js = js
        self.cache = cache
        self.cache_ttl = cache_ttl
        self.cache_clear = cache_clear
        self.vision_deficiency = vision_deficiency
        self.webhook = webhook
        self.raise_on_upstream_error = raise_on_upstream_error

    def to_api_params(self, key:str) -> Dict:
        """
        Serialize this config into Screenshot API query parameters.

        :param key: str - API key used unless one was set on the instance
        :return: Dict of query parameters
        """
        params = {
            'key': self.key or key,
            'url': self.url
        }

        if self.format:
            params['format'] = Format(self.format).value

        if self.capture:
            params['capture'] = self.capture

        if self.resolution:
            params['resolution'] = self.resolution

        if self.country is not None:
            params['country'] = self.country

        if self.timeout is not None:
            params['timeout'] = self.timeout

        if self.rendering_wait is not None:
            params['rendering_wait'] = self.rendering_wait

        if self.wait_for_selector is not None:
            params['wait_for_selector'] = self.wait_for_selector

        if self.options is not None:
            params["options"] = ",".join(flag.value for flag in self.options)

        if self.auto_scroll is not None:
            params['auto_scroll'] = self._bool_to_http(self.auto_scroll)

        if self.js:
            # JS snippets travel base64-encoded to stay URL-safe.
            params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

        if self.cache is not None:
            params['cache'] = self._bool_to_http(self.cache)

            # cache_ttl / cache_clear are only meaningful with cache set.
            if self.cache_ttl is not None:
                params['cache_ttl'] = self.cache_ttl

            if self.cache_clear is not None:
                params['cache_clear'] = self._bool_to_http(self.cache_clear)

        else:
            if self.cache_ttl is not None:
                logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')

            if self.cache_clear is not None:
                logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')

        if self.vision_deficiency is not None:
            params['vision_deficiency'] = self.vision_deficiency.value

        if self.webhook is not None:
            # The API expects the webhook under the 'webhook_name' key.
            params['webhook_name'] = self.webhook

        return params

    def to_dict(self) -> Dict:
        """
        Export the ScreenshotConfig instance to a plain dictionary.
        """
        return {
            'url': self.url,
            'format': Format(self.format).value if self.format else None,
            'capture': self.capture,
            'resolution': self.resolution,
            'country': self.country,
            'timeout': self.timeout,
            'rendering_wait': self.rendering_wait,
            'wait_for_selector': self.wait_for_selector,
            'options': [Options(option).value for option in self.options] if self.options else None,
            'auto_scroll': self.auto_scroll,
            'js': self.js,
            'cache': self.cache,
            'cache_ttl': self.cache_ttl,
            'cache_clear': self.cache_clear,
            'vision_deficiency': self.vision_deficiency.value if self.vision_deficiency else None,
            'webhook': self.webhook,
            'raise_on_upstream_error': self.raise_on_upstream_error
        }

    @staticmethod
    def from_dict(screenshot_config_dict: Dict) -> 'ScreenshotConfig':
        """Create a ScreenshotConfig instance from a dictionary."""
        url = screenshot_config_dict.get('url', None)

        # Coerce raw values back into enum types where applicable.
        format = screenshot_config_dict.get('format', None)
        format = Format(format) if format else None

        capture = screenshot_config_dict.get('capture', None)
        resolution = screenshot_config_dict.get('resolution', None)
        country = screenshot_config_dict.get('country', None)
        timeout = screenshot_config_dict.get('timeout', None)
        rendering_wait = screenshot_config_dict.get('rendering_wait', None)
        wait_for_selector = screenshot_config_dict.get('wait_for_selector', None)

        options = screenshot_config_dict.get('options', None)
        options = [Options(option) for option in options] if options else None

        auto_scroll = screenshot_config_dict.get('auto_scroll', None)
        js = screenshot_config_dict.get('js', None)
        cache = screenshot_config_dict.get('cache', None)
        cache_ttl = screenshot_config_dict.get('cache_ttl', None)
        cache_clear = screenshot_config_dict.get('cache_clear', None)
        # NOTE(review): vision_deficiency is not coerced to VisionDeficiency
        # here, unlike format/options — TODO confirm this is intended.
        vision_deficiency = screenshot_config_dict.get('vision_deficiency', None)
        webhook = screenshot_config_dict.get('webhook', None)
        raise_on_upstream_error = screenshot_config_dict.get('raise_on_upstream_error', True)

        return ScreenshotConfig(
            url=url,
            format=format,
            capture=capture,
            resolution=resolution,
            country=country,
            timeout=timeout,
            rendering_wait=rendering_wait,
            wait_for_selector=wait_for_selector,
            options=options,
            auto_scroll=auto_scroll,
            js=js,
            cache=cache,
            cache_ttl=cache_ttl,
            cache_clear=cache_clear,
            vision_deficiency=vision_deficiency,
            webhook=webhook,
            raise_on_upstream_error=raise_on_upstream_error
        )

Ancestors

Class variables

var auto_scroll : bool | None
var cache : bool | None
var cache_clear : bool | None
var cache_ttl : int | None
var capture : str | None
var country : str | None
var formatFormat | None
var js : str | None
var options : List[Options] | None
var raise_on_upstream_error : bool
var rendering_wait : int | None
var resolution : str | None
var timeout : int | None
var url : str
var wait_for_selector : str | None
var webhook : str | None

Static methods

def from_dict(screenshot_config_dict: Dict) ‑> ScreenshotConfig
Expand source code
@staticmethod
def from_dict(screenshot_config_dict: Dict) -> 'ScreenshotConfig':
    """Create a ScreenshotConfig instance from a dictionary."""
    url = screenshot_config_dict.get('url', None)

    # Coerce raw values back into enum types where applicable.
    format = screenshot_config_dict.get('format', None)
    format = Format(format) if format else None

    capture = screenshot_config_dict.get('capture', None)
    resolution = screenshot_config_dict.get('resolution', None)
    country = screenshot_config_dict.get('country', None)
    timeout = screenshot_config_dict.get('timeout', None)
    rendering_wait = screenshot_config_dict.get('rendering_wait', None)
    wait_for_selector = screenshot_config_dict.get('wait_for_selector', None)

    options = screenshot_config_dict.get('options', None)
    options = [Options(option) for option in options] if options else None

    auto_scroll = screenshot_config_dict.get('auto_scroll', None)
    js = screenshot_config_dict.get('js', None)
    cache = screenshot_config_dict.get('cache', None)
    cache_ttl = screenshot_config_dict.get('cache_ttl', None)
    cache_clear = screenshot_config_dict.get('cache_clear', None)
    # NOTE(review): vision_deficiency is not coerced to VisionDeficiency
    # here, unlike format/options — TODO confirm this is intended.
    vision_deficiency = screenshot_config_dict.get('vision_deficiency', None)
    webhook = screenshot_config_dict.get('webhook', None)
    raise_on_upstream_error = screenshot_config_dict.get('raise_on_upstream_error', True)

    return ScreenshotConfig(
        url=url,
        format=format,
        capture=capture,
        resolution=resolution,
        country=country,
        timeout=timeout,
        rendering_wait=rendering_wait,
        wait_for_selector=wait_for_selector,
        options=options,
        auto_scroll=auto_scroll,
        js=js,
        cache=cache,
        cache_ttl=cache_ttl,
        cache_clear=cache_clear,
        vision_deficiency=vision_deficiency,
        webhook=webhook,
        raise_on_upstream_error=raise_on_upstream_error
    )

Create a ScreenshotConfig instance from a dictionary.

Methods

def to_api_params(self, key: str) ‑> Dict
Expand source code
def to_api_params(self, key:str) -> Dict:
    """Serialize this config into Screenshot API query parameters.

    Args:
        key: fallback API key, used when the config carries none itself.

    Returns:
        Dict of query parameters; only explicitly-set fields are included.
    """
    api_params = {
        'key': self.key or key,
        'url': self.url,
    }

    if self.format:
        api_params['format'] = Format(self.format).value
    if self.capture:
        api_params['capture'] = self.capture
    if self.resolution:
        api_params['resolution'] = self.resolution

    # Pass-through fields included whenever explicitly set (None = unset)
    for field in ('country', 'timeout', 'rendering_wait', 'wait_for_selector'):
        field_value = getattr(self, field)
        if field_value is not None:
            api_params[field] = field_value

    if self.options is not None:
        api_params['options'] = ",".join(flag.value for flag in self.options)

    if self.auto_scroll is not None:
        api_params['auto_scroll'] = self._bool_to_http(self.auto_scroll)

    if self.js:
        # JS snippets travel base64-encoded so arbitrary code survives URL transport
        api_params['js'] = base64.urlsafe_b64encode(self.js.encode('utf-8')).decode('utf-8')

    if self.cache is None:
        # cache_ttl / cache_clear only take effect when cache itself is set
        if self.cache_ttl is not None:
            logging.warning('Params "cache_ttl" is ignored. Works only if cache is enabled')
        if self.cache_clear is not None:
            logging.warning('Params "cache_clear" is ignored. Works only if cache is enabled')
    else:
        api_params['cache'] = self._bool_to_http(self.cache)

        if self.cache_ttl is not None:
            api_params['cache_ttl'] = self.cache_ttl

        if self.cache_clear is not None:
            api_params['cache_clear'] = self._bool_to_http(self.cache_clear)

    if self.vision_deficiency is not None:
        api_params['vision_deficiency'] = self.vision_deficiency.value

    if self.webhook is not None:
        api_params['webhook_name'] = self.webhook

    return api_params
def to_dict(self) ‑> Dict
Expand source code
def to_dict(self) -> Dict:
    """
    Export the ScreenshotConfig instance to a plain dictionary.
    """
    # Normalize enum-valued fields to their raw values; unset fields stay None
    format_value = Format(self.format).value if self.format else None
    option_values = [Options(option).value for option in self.options] if self.options else None
    vision_value = self.vision_deficiency.value if self.vision_deficiency else None

    return {
        'url': self.url,
        'format': format_value,
        'capture': self.capture,
        'resolution': self.resolution,
        'country': self.country,
        'timeout': self.timeout,
        'rendering_wait': self.rendering_wait,
        'wait_for_selector': self.wait_for_selector,
        'options': option_values,
        'auto_scroll': self.auto_scroll,
        'js': self.js,
        'cache': self.cache,
        'cache_ttl': self.cache_ttl,
        'cache_clear': self.cache_clear,
        'vision_deficiency': vision_value,
        'webhook': self.webhook,
        'raise_on_upstream_error': self.raise_on_upstream_error,
    }

Export the ScreenshotConfig instance to a plain dictionary.

class UpstreamHttpClientError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class UpstreamHttpClientError(UpstreamHttpError):
    """Error attributed to the scraped (upstream) website rather than the
    Scrapfly API itself; by class name this corresponds to client-style
    (4xx) upstream responses — NOTE(review): confirm the exact mapping in
    scrapfly.errors where these exceptions are raised."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.UpstreamHttpError
  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException

Subclasses

class UpstreamHttpError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class UpstreamHttpError(HttpError):
    """Base class for HTTP errors attributed to the scraped (upstream)
    website, as opposed to errors from the Scrapfly API itself."""
    pass

Common base class for all non-exit exceptions.

Ancestors

  • scrapfly.errors.HttpError
  • ScrapflyError
  • builtins.Exception
  • builtins.BaseException

Subclasses

class UpstreamHttpServerError (request: requests.models.Request,
response: requests.models.Response | None = None,
**kwargs)
Expand source code
class UpstreamHttpServerError(UpstreamHttpClientError):
    """By class name, a server-style (5xx) error from the scraped website.

    NOTE(review): this subclasses UpstreamHttpClientError, so
    ``except UpstreamHttpClientError`` also catches server errors —
    confirm this inheritance is intentional before relying on it.
    """
    pass

Common base class for all non-exit exceptions.

Ancestors

class VisionDeficiency (value, names=None, *, module=None, qualname=None, type=None, start=1)
Expand source code
class VisionDeficiency(Enum):
    """
    Simulate vision deficiency for accessibility testing (WCAG compliance)

    Attributes:
        DEUTERANOPIA: Difficulty distinguishing green from red; green appears beige/gray
        PROTANOPIA: Reduced sensitivity to red light; red appears dark/black
        TRITANOPIA: Difficulty distinguishing blue from yellow and violet from red
        ACHROMATOPSIA: Complete inability to perceive color; sees only in grayscale
        REDUCED_CONTRAST: Simulates reduced contrast due to aging, low light, or other factors
        BLURRED_VISION: Simulates uncorrected refractive errors or age-related vision loss
    """
    # Values are camelCase — presumably mirroring the Chrome DevTools
    # Protocol's Emulation.setEmulatedVisionDeficiency type names; confirm
    # against the API before renaming any of them.
    DEUTERANOPIA = "deuteranopia"
    PROTANOPIA = "protanopia"
    TRITANOPIA = "tritanopia"
    ACHROMATOPSIA = "achromatopsia"
    REDUCED_CONTRAST = "reducedContrast"
    BLURRED_VISION = "blurredVision"

Simulate vision deficiency for accessibility testing (WCAG compliance)

Attributes

DEUTERANOPIA
Difficulty distinguishing green from red; green appears beige/gray
PROTANOPIA
Reduced sensitivity to red light; red appears dark/black
TRITANOPIA
Difficulty distinguishing blue from yellow and violet from red
ACHROMATOPSIA
Complete inability to perceive color; sees only in grayscale
REDUCED_CONTRAST
Simulates reduced contrast due to aging, low light, or other factors
BLURRED_VISION
Simulates uncorrected refractive errors or age-related vision loss

Ancestors

  • enum.Enum

Class variables

var ACHROMATOPSIA
var BLURRED_VISION
var DEUTERANOPIA
var PROTANOPIA
var REDUCED_CONTRAST
var TRITANOPIA
class WarcParser (warc_data: bytes | BinaryIO)
Expand source code
class WarcParser:
    """
    Parser for WARC files with automatic decompression

    Provides methods to iterate through WARC records and extract page data.

    Example:
        ```python
        # From bytes
        parser = WarcParser(warc_bytes)

        # Iterate all records
        for record in parser.iter_records():
            print(f"{record.url}: {record.status_code}")

        # Get only HTTP responses
        for record in parser.iter_responses():
            print(f"Page: {record.url}")
            html = record.content.decode('utf-8')

        # Get all pages as simple dicts
        pages = parser.get_pages()
        for page in pages:
            print(f"{page['url']}: {page['status_code']}")
        ```
    """

    def __init__(self, warc_data: Union[bytes, BinaryIO]):
        """
        Initialize WARC parser

        Args:
            warc_data: WARC data as bytes or file-like object
                      (supports both gzip-compressed and uncompressed;
                      decompression is only attempted for bytes input)
        """
        if isinstance(warc_data, bytes):
            # Try to decompress if gzipped. gzip.decompress handles
            # multi-member streams, the common layout for .warc.gz files.
            if warc_data[:2] == b'\x1f\x8b':  # gzip magic number
                try:
                    warc_data = gzip.decompress(warc_data)
                except Exception:
                    # Best-effort: fall back to treating the input as raw WARC
                    pass
            self._data = BytesIO(warc_data)
        else:
            self._data = warc_data

    def iter_records(self) -> Iterator[WarcRecord]:
        """
        Iterate through all WARC records

        The underlying stream is rewound first, so this can be called
        repeatedly on the same parser.

        Yields:
            WarcRecord: Each record in the WARC file
        """
        self._data.seek(0)

        while True:
            # Each record starts with a version line such as b'WARC/1.0';
            # EOF (empty read) or any other prefix ends iteration
            version_line = self._read_line()
            if not version_line or not version_line.startswith(b'WARC/'):
                break

            # Read WARC headers (name: value pairs up to the blank line)
            warc_headers = self._read_headers()
            if not warc_headers:
                break

            # Content-Length gives the exact size of the content block.
            # A malformed value makes the rest of the stream unparseable,
            # so stop iterating instead of raising mid-stream.
            try:
                content_length = int(warc_headers.get('Content-Length', 0))
            except (TypeError, ValueError):
                break

            # Read content block
            content_block = self._data.read(content_length)

            # Consume the two record-separator newlines that follow the
            # content block in the WARC format
            self._read_line()
            self._read_line()

            # Parse the record (unknown record types yield None and are skipped)
            record = self._parse_record(warc_headers, content_block)
            if record:
                yield record

    def iter_responses(self) -> Iterator[WarcRecord]:
        """
        Iterate through HTTP response records only

        Filters out non-response records (requests, metadata, etc.)

        Yields:
            WarcRecord: HTTP response records only
        """
        for record in self.iter_records():
            if record.record_type == 'response' and record.status_code:
                yield record

    def get_pages(self) -> List[Dict]:
        """
        Get all crawled pages as simple dictionaries

        This is the easiest way to access crawl results without dealing
        with WARC format details.

        Returns:
            List of dicts with keys: url, status_code, headers, content

        Example:
            ```python
            pages = parser.get_pages()
            for page in pages:
                print(f"{page['url']}: {len(page['content'])} bytes")
                html = page['content'].decode('utf-8')
            ```
        """
        pages = []
        for record in self.iter_responses():
            pages.append({
                'url': record.url,
                'status_code': record.status_code,
                'headers': record.headers,
                'content': record.content
            })
        return pages

    def _read_line(self) -> bytes:
        """Read a single line from the WARC stream, stripped of CR/LF."""
        line = self._data.readline()
        return line.rstrip(b'\r\n')

    def _read_headers(self) -> Dict[str, str]:
        """Read 'Name: value' header lines until the first empty line."""
        headers = {}
        while True:
            line = self._read_line()
            if not line:
                break

            # Parse header line; lines without a colon are silently skipped
            if b':' in line:
                key, value = line.split(b':', 1)
                headers[key.decode('utf-8').strip()] = value.decode('utf-8').strip()

        return headers

    def _parse_record(self, warc_headers: Dict[str, str], content_block: bytes) -> Optional[WarcRecord]:
        """Build a WarcRecord from parsed WARC headers and the raw content block.

        Returns None for record types other than response/request/metadata/warcinfo.
        """
        record_type = warc_headers.get('WARC-Type', '')
        url = warc_headers.get('WARC-Target-URI', '')

        if record_type == 'response':
            # Response records embed a full HTTP message; split it into
            # headers/body and pull the status code from the status line
            http_headers, body = self._parse_http_response(content_block)
            status_code = self._extract_status_code(content_block)

            return WarcRecord(
                record_type=record_type,
                url=url,
                headers=http_headers,
                content=body,
                status_code=status_code,
                warc_headers=warc_headers
            )
        elif record_type in ['request', 'metadata', 'warcinfo']:
            # Other record types - store raw content
            return WarcRecord(
                record_type=record_type,
                url=url,
                headers={},
                content=content_block,
                status_code=None,
                warc_headers=warc_headers
            )

        return None

    def _parse_http_response(self, content_block: bytes) -> tuple:
        """Split an embedded HTTP response into (headers dict, body bytes).

        Falls back to ({}, whole block) if the message cannot be parsed.
        """
        try:
            # Split on double newline (end of headers); try CRLF first,
            # then bare LF for lenient inputs
            parts = content_block.split(b'\r\n\r\n', 1)
            if len(parts) < 2:
                parts = content_block.split(b'\n\n', 1)

            if len(parts) == 2:
                header_section, body = parts
            else:
                header_section, body = content_block, b''

            # Parse headers
            headers = {}
            lines = header_section.split(b'\r\n') if b'\r\n' in header_section else header_section.split(b'\n')

            # Skip status line (first line); remaining lines are 'Name: value'
            for line in lines[1:]:
                if b':' in line:
                    key, value = line.split(b':', 1)
                    headers[key.decode('utf-8', errors='ignore').strip()] = value.decode('utf-8', errors='ignore').strip()

            return headers, body

        except Exception:
            return {}, content_block

    def _extract_status_code(self, content_block: bytes) -> Optional[int]:
        """Extract the HTTP status code from the response status line, if any."""
        try:
            # Status line is the first line, e.g. "HTTP/1.1 200 OK" or "HTTP/2 200"
            first_line = content_block.split(b'\r\n', 1)[0] if b'\r\n' in content_block else content_block.split(b'\n', 1)[0]
            # Accept both "HTTP/1.1"-style and major-version-only ("HTTP/2",
            # "HTTP/3") status lines; the old pattern required a minor version
            # and silently returned None for HTTP/2+ responses.
            match = re.match(rb'HTTP/\d+(?:\.\d+)? (\d+)', first_line)
            if match:
                return int(match.group(1))
        except Exception:
            pass
        return None

Parser for WARC files with automatic decompression

Provides methods to iterate through WARC records and extract page data.

Example

# From bytes
parser = WarcParser(warc_bytes)

# Iterate all records
for record in parser.iter_records():
    print(f"{record.url}: {record.status_code}")

# Get only HTTP responses
for record in parser.iter_responses():
    print(f"Page: {record.url}")
    html = record.content.decode('utf-8')

# Get all pages as simple dicts
pages = parser.get_pages()
for page in pages:
    print(f"{page['url']}: {page['status_code']}")

Initialize WARC parser

Args

warc_data
WARC data as bytes or file-like object (supports both gzip-compressed and uncompressed)

Methods

def get_pages(self) ‑> List[Dict]
Expand source code
def get_pages(self) -> List[Dict]:
    """
    Get all crawled pages as simple dictionaries

    This is the easiest way to access crawl results without dealing
    with WARC format details.

    Returns:
        List of dicts with keys: url, status_code, headers, content

    Example:
        ```python
        pages = parser.get_pages()
        for page in pages:
            print(f"{page['url']}: {len(page['content'])} bytes")
            html = page['content'].decode('utf-8')
        ```
    """
    # One dict per HTTP response record, in stream order
    return [
        {
            'url': response.url,
            'status_code': response.status_code,
            'headers': response.headers,
            'content': response.content,
        }
        for response in self.iter_responses()
    ]

Get all crawled pages as simple dictionaries

This is the easiest way to access crawl results without dealing with WARC format details.

Returns

List of dicts with keys: url, status_code, headers, content

Example

pages = parser.get_pages()
for page in pages:
    print(f"{page['url']}: {len(page['content'])} bytes")
    html = page['content'].decode('utf-8')
def iter_records(self) ‑> Iterator[WarcRecord]
Expand source code
def iter_records(self) -> Iterator[WarcRecord]:
    """
    Iterate through all WARC records

    The underlying stream is rewound first, so this can be called
    repeatedly on the same parser instance.

    Yields:
        WarcRecord: Each record in the WARC file
    """
    self._data.seek(0)

    while True:
        # Each record starts with a version line such as b'WARC/1.0';
        # EOF (empty read) or any other prefix ends iteration
        version_line = self._read_line()
        if not version_line or not version_line.startswith(b'WARC/'):
            break

        # Read WARC headers (name: value pairs up to the blank line)
        warc_headers = self._read_headers()
        if not warc_headers:
            break

        # Content-Length gives the exact byte size of the content block
        # (defaults to 0 when absent; a malformed value would raise ValueError)
        content_length = int(warc_headers.get('Content-Length', 0))

        # Read content block
        content_block = self._data.read(content_length)

        # Consume the two record-separator newlines that follow the
        # content block in the WARC format
        self._read_line()
        self._read_line()

        # Parse the record (unknown record types yield None and are skipped)
        record = self._parse_record(warc_headers, content_block)
        if record:
            yield record

Iterate through all WARC records

Yields

WarcRecord
Each record in the WARC file
def iter_responses(self) ‑> Iterator[WarcRecord]
Expand source code
def iter_responses(self) -> Iterator[WarcRecord]:
    """
    Iterate through HTTP response records only

    Filters out non-response records (requests, metadata, etc.)

    Yields:
        WarcRecord: HTTP response records only
    """
    # Keep only response records that actually carry a status code
    yield from (
        record
        for record in self.iter_records()
        if record.record_type == 'response' and record.status_code
    )

Iterate through HTTP response records only

Filters out non-response records (requests, metadata, etc.)

Yields

WarcRecord
HTTP response records only
class WarcRecord (record_type: str,
url: str,
headers: Dict[str, str],
content: bytes,
status_code: int | None,
warc_headers: Dict[str, str])
Expand source code
@dataclass
class WarcRecord:
    """
    Represents a single WARC record

    A WARC file contains multiple records, each representing a captured
    HTTP transaction or metadata.
    """
    # Type of record (response, request, metadata, etc.)
    record_type: str
    # Associated URL
    url: str
    # HTTP headers
    headers: Dict[str, str]
    # Response body/content
    content: bytes
    # HTTP status code (for response records)
    status_code: Optional[int]
    # WARC-specific headers
    warc_headers: Dict[str, str]

    def __repr__(self):
        # Compact human-oriented form; overrides the dataclass-generated repr
        return "WarcRecord(type={}, url={}, status={})".format(
            self.record_type, self.url, self.status_code
        )

Represents a single WARC record

A WARC file contains multiple records, each representing a captured HTTP transaction or metadata.

Instance variables

var content : bytes
var headers : Dict[str, str]
var record_type : str
var status_code : int | None
var url : str
var warc_headers : Dict[str, str]