Skip to content

Commit 0dbdd1f

Browse files
committed
Add Layout+VLM pipeline with prompt injection, ApiVlmModel updates
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
1 parent 0e2f370 commit 0dbdd1f

File tree

7 files changed: 513 additions, 35 deletions.

docling/datamodel/pipeline_options_vlm_model.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,15 @@
11
from enum import Enum
2-
from typing import Any, Dict, List, Literal, Optional
2+
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
33

44
from docling_core.types.doc.page import SegmentedPage
55
from pydantic import AnyUrl, BaseModel
66
from typing_extensions import deprecated
77

88
from docling.datamodel.accelerator_options import AcceleratorDevice
99

10+
if TYPE_CHECKING:
11+
from docling.datamodel.base_models import Page
12+
1013

1114
class BaseVlmOptions(BaseModel):
1215
kind: str
@@ -15,7 +18,7 @@ class BaseVlmOptions(BaseModel):
1518
max_size: Optional[int] = None
1619
temperature: float = 0.0
1720

18-
def build_prompt(self, page: Optional[SegmentedPage]) -> str:
21+
def build_prompt(self, page: Optional[Union["Page", SegmentedPage]]) -> str:
1922
return self.prompt
2023

2124
def decode_response(self, text: str) -> str:
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
"""Options for the threaded layout+VLM pipeline."""
2+
3+
from typing import Union
4+
5+
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
6+
from docling.datamodel.pipeline_options import LayoutOptions, PaginatedPipelineOptions
7+
from docling.datamodel.pipeline_options_vlm_model import (
8+
ApiVlmOptions,
9+
InlineVlmOptions,
10+
)
11+
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS
12+
13+
14+
class ThreadedLayoutVlmPipelineOptions(PaginatedPipelineOptions):
    """Configuration for the threaded pipeline that pairs a layout model with a VLM.

    Extends ``PaginatedPipelineOptions`` with the VLM and layout model
    selection and with the threading/batching settings declared below.
    """

    # Field inherited from PaginatedPipelineOptions; this pipeline turns it on
    # by default.
    generate_page_images: bool = True

    # VLM to run, either an inline model or an API-backed one. The pipeline
    # is expected to augment it with layout awareness.
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = SMOLDOCLING_TRANSFORMERS

    # Layout model to run; defaults to the Heron layout model spec.
    layout_options: LayoutOptions = LayoutOptions(model_spec=DOCLING_LAYOUT_HERON)

    # Threading and batching controls.
    layout_batch_size: int = 4
    vlm_batch_size: int = 2
    batch_timeout_seconds: float = 2.0
    queue_max_size: int = 50

docling/models/api_vlm_model.py

Lines changed: 99 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,23 @@
11
from collections.abc import Iterable
22
from concurrent.futures import ThreadPoolExecutor
3+
from typing import Union
4+
5+
import numpy as np
6+
from PIL.Image import Image
37

48
from docling.datamodel.base_models import Page, VlmPrediction
59
from docling.datamodel.document import ConversionResult
610
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
711
from docling.exceptions import OperationNotAllowed
8-
from docling.models.base_model import BasePageModel
12+
from docling.models.base_model import BaseVlmPageModel
913
from docling.utils.api_image_request import api_image_request
1014
from docling.utils.profiling import TimeRecorder
1115

1216

13-
class ApiVlmModel(BasePageModel):
17+
class ApiVlmModel(BaseVlmPageModel):
18+
# Override the vlm_options type annotation from BaseVlmPageModel
19+
vlm_options: ApiVlmOptions # type: ignore[assignment]
20+
1421
def __init__(
1522
self,
1623
enabled: bool,
@@ -37,36 +44,104 @@ def __init__(
3744
def __call__(
    self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
    """Attach a VLM prediction to every processable page of a batch.

    Pages whose backend is invalid, or for which no image could be
    rendered, are passed through untouched. All remaining pages are sent
    to the remote VLM via ``process_images`` and receive the result in
    ``page.predictions.vlm_response``.

    Args:
        conv_res: Conversion result used to record the "vlm" timing.
        page_batch: Pages to process.

    Yields:
        Every page of ``page_batch``, in the order it was received.
    """
    page_list = list(page_batch)
    if not page_list:
        return

    valid_pages = []
    for page in page_list:
        assert page._backend is not None
        if page._backend.is_valid():
            valid_pages.append(page)

    # Only open the timing scope when there is actual work to do.
    if valid_pages:
        with TimeRecorder(conv_res, "vlm"):
            images = []
            prompts = []
            pages_with_images = []

            for page in valid_pages:
                assert page.size is not None
                hi_res_image = page.get_image(
                    scale=self.vlm_options.scale,
                    max_size=self.vlm_options.max_size,
                )
                # Pages without a rendered image get no prediction.
                if hi_res_image is None:
                    continue

                images.append(hi_res_image)
                prompts.append(self.vlm_options.build_prompt(page))
                pages_with_images.append(page)

            if images:
                # Drain the generator completely so that all requests have
                # finished (and the worker pool is shut down) before any
                # page is modified.
                predictions = list(self.process_images(images, prompts))
                for page, prediction in zip(pages_with_images, predictions):
                    page.predictions.vlm_response = prediction

    # Preserve the caller's page order instead of grouping invalid pages
    # ahead of valid ones.
    yield from page_list
95+
96+
def process_images(
    self,
    image_batch: Iterable[Union[Image, np.ndarray]],
    prompt: Union[str, list[str]],
) -> Iterable[VlmPrediction]:
    """Run the remote VLM on raw images, without any page metadata.

    Each image is converted to an RGB PIL image and sent, together with
    its prompt, through ``api_image_request``; requests are issued from a
    thread pool of ``self.concurrency`` workers.

    Args:
        image_batch: PIL images or numpy arrays. Arrays must be 2-D
            (grayscale) or 3-D with 3 or 4 channels on the last axis;
            values are cast to ``uint8``.
        prompt: A single prompt applied to every image, or a sequence
            holding exactly one prompt per image.

    Yields:
        One ``VlmPrediction`` per image, in input order.

    Raises:
        TypeError: If ``prompt`` is neither a string nor a sequence.
        ValueError: If the number of prompts differs from the number of
            images, or a numpy array has an unsupported shape.

    Note:
        This is a generator, so the errors above surface when iteration
        starts, not when the method is called.
    """
    # Imported here because the module-level ``Image`` name is the PIL
    # image class, not the ``PIL.Image`` module that provides ``fromarray``.
    from PIL import Image as PILImage

    images = list(image_batch)

    if isinstance(prompt, str):
        prompts = [prompt] * len(images)
    else:
        # Accept any sequence of prompts (list, tuple, ...) instead of
        # leaving ``prompts`` unbound for non-list inputs.
        try:
            prompts = list(prompt)
        except TypeError as err:
            raise TypeError(
                f"prompt must be a str or a sequence of str, got {type(prompt).__name__}"
            ) from err
        if len(prompts) != len(images):
            raise ValueError(
                f"Prompt list length ({len(prompts)}) must match image count ({len(images)})"
            )

    if not images:
        # Nothing to send; avoid spinning up a worker pool.
        return

    def _process_single_image(image_prompt_pair):
        image, prompt_text = image_prompt_pair

        # Convert numpy array to PIL Image if needed
        if isinstance(image, np.ndarray):
            if image.ndim == 3 and image.shape[2] in (3, 4):
                image = PILImage.fromarray(image.astype(np.uint8))
            elif image.ndim == 2:
                # A 2-D uint8 array is decoded as 8-bit grayscale ("L").
                image = PILImage.fromarray(image.astype(np.uint8))
            else:
                raise ValueError(f"Unsupported numpy array shape: {image.shape}")

        # Ensure image is in RGB mode
        if image.mode != "RGB":
            image = image.convert("RGB")

        page_tags = api_image_request(
            image=image,
            prompt=prompt_text,
            url=self.vlm_options.url,
            timeout=self.timeout,
            headers=self.vlm_options.headers,
            **self.params,
        )

        page_tags = self.vlm_options.decode_response(page_tags)
        return VlmPrediction(text=page_tags)

    with ThreadPoolExecutor(max_workers=self.concurrency) as executor:
        # executor.map returns results in submission order.
        yield from executor.map(_process_single_image, zip(images, prompts))

docling/models/vlm_models_inline/hf_transformers_model.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def __call__(
162162
images.append(hi_res_image)
163163

164164
# Define prompt structure
165-
user_prompt = self.vlm_options.build_prompt(page.parsed_page)
165+
user_prompt = self.vlm_options.build_prompt(page)
166166

167167
user_prompts.append(user_prompt)
168168
pages_with_images.append(page)

docling/models/vlm_models_inline/mlx_model.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -104,10 +104,7 @@ def __call__(
104104
images.append(hi_res_image)
105105

106106
# Define prompt structure
107-
if callable(self.vlm_options.prompt):
108-
user_prompt = self.vlm_options.prompt(page.parsed_page)
109-
else:
110-
user_prompt = self.vlm_options.prompt
107+
user_prompt = self.vlm_options.build_prompt(page)
111108

112109
user_prompts.append(user_prompt)
113110
pages_with_images.append(page)

docling/models/vlm_models_inline/vllm_model.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,10 +129,7 @@ def __call__(
129129
images.append(hi_res_image)
130130

131131
# Define prompt structure
132-
if callable(self.vlm_options.prompt):
133-
user_prompt = self.vlm_options.prompt(page.parsed_page)
134-
else:
135-
user_prompt = self.vlm_options.prompt
132+
user_prompt = self.vlm_options.build_prompt(page)
136133

137134
user_prompts.append(user_prompt)
138135
pages_with_images.append(page)

0 commit comments

Comments
 (0)