Add Layout+VLM pipeline with prompt injection, ApiVlmModel updates

cau-git · cau-git · commit 0dbdd1fe1cb3 · 2025-09-10T17:34:58.000+02:00
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
@@ -1,12 +1,15 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
 
 from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
 
+if TYPE_CHECKING:
+    from docling.datamodel.base_models import Page
+
 
 class BaseVlmOptions(BaseModel):
     kind: str
@@ -15,7 +18,7 @@ class BaseVlmOptions(BaseModel):
     max_size: Optional[int] = None
     temperature: float = 0.0
 
-    def build_prompt(self, page: Optional[SegmentedPage]) -> str:
+    def build_prompt(self, page: Optional[Union["Page", SegmentedPage]]) -> str:
         return self.prompt
 
     def decode_response(self, text: str) -> str:
diff --git a/docling/datamodel/threaded_layout_vlm_pipeline_options.py b/docling/datamodel/threaded_layout_vlm_pipeline_options.py
@@ -0,0 +1,30 @@
+"""Options for the threaded layout+VLM pipeline."""
+
+from typing import Union
+
+from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
+from docling.datamodel.pipeline_options import LayoutOptions, PaginatedPipelineOptions
+from docling.datamodel.pipeline_options_vlm_model import (
+    ApiVlmOptions,
+    InlineVlmOptions,
+)
+from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
+
+
+class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
+    """Pipeline options for the threaded layout+VLM pipeline."""
+
+    # Inherit page image generation from PaginatedPipelineOptions but enable by default
+    generate_page_images: bool = True
+
+    # VLM configuration (will be enhanced with layout awareness by the pipeline)
+    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_TRANSFORMERS
+
+    # Layout model configuration
+    layout_options: LayoutOptions = LayoutOptions(model_spec=DOCLING_LAYOUT_HERON)
+
+    # Threading and batching controls
+    layout_batch_size: int = 4
+    vlm_batch_size: int = 2
+    batch_timeout_seconds: float = 2.0
+    queue_max_size: int = 50
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
@@ -1,16 +1,23 @@
 from collections.abc import Iterable
 from concurrent.futures import ThreadPoolExecutor
+from typing import Union
+
+import numpy as np
+from PIL.Image import Image
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
 from docling.exceptions import OperationNotAllowed
-from docling.models.base_model import BasePageModel
+from docling.models.base_model import BaseVlmPageModel
 from docling.utils.api_image_request import api_image_request
 from docling.utils.profiling import TimeRecorder
 
 
-class ApiVlmModel(BasePageModel):
+class ApiVlmModel(BaseVlmPageModel):
+    # Override the vlm_options type annotation from BaseVlmPageModel
+    vlm_options: ApiVlmOptions  # type: ignore[assignment]
+
     def __init__(
         self,
         enabled: bool,
@@ -37,36 +44,104 @@ def __init__(
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-        def _vlm_request(page):
+        page_list = list(page_batch)
+        if not page_list:
+            return
+
+        valid_pages = []
+        invalid_pages = []
+
+        for page in page_list:
             assert page._backend is not None
             if not page._backend.is_valid():
-                return page
+                invalid_pages.append(page)
             else:
-                with TimeRecorder(conv_res, "vlm"):
-                    assert page.size is not None
+                valid_pages.append(page)
+
+        # Process valid pages in batch
+        if valid_pages:
+            with TimeRecorder(conv_res, "vlm"):
+                # Prepare images and prompts for batch processing
+                images = []
+                prompts = []
+                pages_with_images = []
 
+                for page in valid_pages:
+                    assert page.size is not None
                     hi_res_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )
-                    assert hi_res_image is not None
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
-
-                    prompt = self.vlm_options.build_prompt(page.parsed_page)
-                    page_tags = api_image_request(
-                        image=hi_res_image,
-                        prompt=prompt,
-                        url=self.vlm_options.url,
-                        timeout=self.timeout,
-                        headers=self.vlm_options.headers,
-                        **self.params,
-                    )
 
-                    page_tags = self.vlm_options.decode_response(page_tags)
-                    page.predictions.vlm_response = VlmPrediction(text=page_tags)
+                    # Only process pages with valid images
+                    if hi_res_image is not None:
+                        images.append(hi_res_image)
+                        prompt = self.vlm_options.build_prompt(page)
+                        prompts.append(prompt)
+                        pages_with_images.append(page)
+
+                # Use process_images for the actual inference
+                if images:  # Only if we have valid images
+                    predictions = list(self.process_images(images, prompts))
+
+                    # Attach results to pages
+                    for page, prediction in zip(pages_with_images, predictions):
+                        page.predictions.vlm_response = prediction
+
+        # Yield all pages (valid and invalid)
+        for page in invalid_pages:
+            yield page
+        for page in valid_pages:
+            yield page
+
+    def process_images(
+        self,
+        image_batch: Iterable[Union[Image, np.ndarray]],
+        prompt: Union[str, list[str]],
+    ) -> Iterable[VlmPrediction]:
+        """Process raw images without page metadata."""
+        images = list(image_batch)
+
+        # Handle prompt parameter
+        if isinstance(prompt, str):
+            prompts = [prompt] * len(images)
+        elif isinstance(prompt, list):
+            if len(prompt) != len(images):
+                raise ValueError(
+                    f"Prompt list length ({len(prompt)}) must match image count ({len(images)})"
+                )
+            prompts = prompt
+
+        def _process_single_image(image_prompt_pair):
+            image, prompt_text = image_prompt_pair
+
+            # Convert numpy array to PIL Image if needed
+            if isinstance(image, np.ndarray):
+                if image.ndim == 3 and image.shape[2] in [3, 4]:
+                    from PIL import Image as PILImage
+
+                    image = PILImage.fromarray(image.astype(np.uint8))
+                elif image.ndim == 2:
+                    from PIL import Image as PILImage
+
+                    image = PILImage.fromarray(image.astype(np.uint8), mode="L")
+                else:
+                    raise ValueError(f"Unsupported numpy array shape: {image.shape}")
+
+            # Ensure image is in RGB mode
+            if image.mode != "RGB":
+                image = image.convert("RGB")
+
+            page_tags = api_image_request(
+                image=image,
+                prompt=prompt_text,
+                url=self.vlm_options.url,
+                timeout=self.timeout,
+                headers=self.vlm_options.headers,
+                **self.params,
+            )
 
-                return page
+            page_tags = self.vlm_options.decode_response(page_tags)
+            return VlmPrediction(text=page_tags)
 
         with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
-            yield from executor.map(_vlm_request, page_batch)
+            yield from executor.map(_process_single_image, zip(images, prompts))
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -162,7 +162,7 @@ def __call__(
                         images.append(hi_res_image)
 
                         # Define prompt structure
-                        user_prompt = self.vlm_options.build_prompt(page.parsed_page)
+                        user_prompt = self.vlm_options.build_prompt(page)
 
                         user_prompts.append(user_prompt)
                         pages_with_images.append(page)
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
@@ -104,10 +104,7 @@ def __call__(
                         images.append(hi_res_image)
 
                         # Define prompt structure
-                        if callable(self.vlm_options.prompt):
-                            user_prompt = self.vlm_options.prompt(page.parsed_page)
-                        else:
-                            user_prompt = self.vlm_options.prompt
+                        user_prompt = self.vlm_options.build_prompt(page)
 
                         user_prompts.append(user_prompt)
                         pages_with_images.append(page)
diff --git a/docling/models/vlm_models_inline/vllm_model.py b/docling/models/vlm_models_inline/vllm_model.py
@@ -129,10 +129,7 @@ def __call__(
                         images.append(hi_res_image)
 
                         # Define prompt structure
-                        if callable(self.vlm_options.prompt):
-                            user_prompt = self.vlm_options.prompt(page.parsed_page)
-                        else:
-                            user_prompt = self.vlm_options.prompt
+                        user_prompt = self.vlm_options.build_prompt(page)
 
                         user_prompts.append(user_prompt)
                         pages_with_images.append(page)
diff --git a/docling/pipeline/threaded_layout_vlm_pipeline.py b/docling/pipeline/threaded_layout_vlm_pipeline.py