Skip to content

Commit 900cf9d

Browse files
authored
Fix issue with from pretrained and kwargs in image processors (#41997)
* accept kwargs in image proc from_pretrained
* only use kwargs that are in cls.valid_kwargs
* remove specific logic for _from_auto
* add image_seq_length to ImagesKwargs for backward compatibility
* fix missing image kwargs in pix2struct
1 parent 154d510 commit 900cf9d

File tree

5 files changed

+17
-16
lines changed

5 files changed

+17
-16
lines changed

src/transformers/image_processing_base.py

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -362,25 +362,13 @@ def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
362362
"""
363363
image_processor_dict = image_processor_dict.copy()
364364
return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
365-
366-
# The `size` parameter is a dict and was previously an int or tuple in feature extractors.
367-
# We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
368-
# dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
369-
if "size" in kwargs and "size" in image_processor_dict:
370-
image_processor_dict["size"] = kwargs.pop("size")
371-
if "crop_size" in kwargs and "crop_size" in image_processor_dict:
372-
image_processor_dict["crop_size"] = kwargs.pop("crop_size")
373-
365+
image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
374366
image_processor = cls(**image_processor_dict)
375367

376-
# Update image_processor with kwargs if needed
377-
to_remove = []
378-
for key, value in kwargs.items():
368+
# Remove kwargs that are used to initialize the image processor attributes
369+
for key in list(kwargs):
379370
if hasattr(image_processor, key):
380-
setattr(image_processor, key, value)
381-
to_remove.append(key)
382-
for key in to_remove:
383-
kwargs.pop(key, None)
371+
kwargs.pop(key)
384372

385373
logger.info(f"Image processor {image_processor}")
386374
if return_unused_kwargs:

src/transformers/image_processing_utils_fast.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
185185
input_data_format = None
186186
device = None
187187
model_input_names = ["pixel_values"]
188+
image_seq_length = None
188189
valid_kwargs = ImagesKwargs
189190
unused_kwargs = None
190191

src/transformers/models/pix2struct/image_processing_pix2struct.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
5353
"""
5454
max_patches (`int`, *optional*):
5555
Maximum number of patches to extract.
56+
patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
57+
The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
58+
is_vqa (`bool`, *optional*, defaults to `False`):
59+
Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
60+
rendered onto the input images.
5661
header_text (`Union[list[str], str]`, *optional*):
5762
Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
5863
"""
5964

6065
max_patches: int
66+
patch_size: dict[str, int]
67+
is_vqa: bool
6168
header_text: Optional[Union[list[str], str]]
6269

6370

src/transformers/processing_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ class methods and docstrings.
219219
- `'np'`: Return NumPy `np.ndarray` objects.
220220
disable_grouping (`bool`, *optional*):
221221
Whether to group images by shapes when processing or not, only relevant for fast image processing.
222+
image_seq_length (`int`, *optional*):
223+
The number of image tokens to be used for each image in the input.
224+
Added for backward compatibility but this should be set as a processor attribute in future models.
222225
"""
223226

224227
do_convert_rgb: Optional[bool]
@@ -239,6 +242,7 @@ class methods and docstrings.
239242
device: Annotated[Optional[str], device_validator()]
240243
return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
241244
disable_grouping: Optional[bool]
245+
image_seq_length: Optional[int]
242246

243247

244248
class VideosKwargs(TypedDict, total=False):

tests/models/pix2struct/test_processing_pix2struct.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
172172
if "image_processor" not in self.processor_class.attributes:
173173
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
174174
image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
175+
print("image_processor", image_processor)
175176
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
176177

177178
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)

0 commit comments

Comments
 (0)