Fix issue with `from_pretrained` and kwargs in image processors (#41997)

yonigozlan · web-flow · commit 900cf9d33bc0 · 2025-11-04T10:35:39.000-05:00
* accept kwargs in image processor from_pretrained

* only use kwargs that are in cls.valid_kwargs

* remove specific logic for _from_auto

* add image_seq_length to ImagesKwargs for backward compatibility

* fix missing image kwargs in pix2struct
diff --git a/src/transformers/image_processing_base.py b/src/transformers/image_processing_base.py
@@ -362,25 +362,13 @@ def from_dict(cls, image_processor_dict: dict[str, Any], **kwargs):
         """
         image_processor_dict = image_processor_dict.copy()
         return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
-
-        # The `size` parameter is a dict and was previously an int or tuple in feature extractors.
-        # We set `size` here directly to the `image_processor_dict` so that it is converted to the appropriate
-        # dict within the image processor and isn't overwritten if `size` is passed in as a kwarg.
-        if "size" in kwargs and "size" in image_processor_dict:
-            image_processor_dict["size"] = kwargs.pop("size")
-        if "crop_size" in kwargs and "crop_size" in image_processor_dict:
-            image_processor_dict["crop_size"] = kwargs.pop("crop_size")
-
+        image_processor_dict.update({k: v for k, v in kwargs.items() if k in cls.valid_kwargs.__annotations__})
         image_processor = cls(**image_processor_dict)
 
-        # Update image_processor with kwargs if needed
-        to_remove = []
-        for key, value in kwargs.items():
+        # Remove kwargs that are used to initialize the image processor attributes
+        for key in list(kwargs):
             if hasattr(image_processor, key):
-                setattr(image_processor, key, value)
-                to_remove.append(key)
-        for key in to_remove:
-            kwargs.pop(key, None)
+                kwargs.pop(key)
 
         logger.info(f"Image processor {image_processor}")
         if return_unused_kwargs:
diff --git a/src/transformers/image_processing_utils_fast.py b/src/transformers/image_processing_utils_fast.py
@@ -185,6 +185,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
     input_data_format = None
     device = None
     model_input_names = ["pixel_values"]
+    image_seq_length = None
     valid_kwargs = ImagesKwargs
     unused_kwargs = None
 
diff --git a/src/transformers/models/pix2struct/image_processing_pix2struct.py b/src/transformers/models/pix2struct/image_processing_pix2struct.py
@@ -53,11 +53,18 @@ class Pix2StructImageProcessorKwargs(ImagesKwargs, total=False):
     """
     max_patches (`int`, *optional*):
         Maximum number of patches to extract.
+    patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
+        The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
+    is_vqa (`bool`, *optional*, defaults to `False`):
+        Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
+        rendered onto the input images.
     header_text (`Union[list[str], str]`, *optional*):
         Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
     """
 
     max_patches: int
+    patch_size: dict[str, int]
+    is_vqa: bool
     header_text: Optional[Union[list[str], str]]
 
 
diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py
@@ -219,6 +219,9 @@ class methods and docstrings.
             - `'np'`: Return NumPy `np.ndarray` objects.
         disable_grouping (`bool`, *optional*):
             Whether to group images by shapes when processing or not, only relevant for fast image processing.
+        image_seq_length (`int`, *optional*):
+            The number of image tokens to be used for each image in the input.
+            Added for backward compatibility but this should be set as a processor attribute in future models.
     """
 
     do_convert_rgb: Optional[bool]
@@ -239,6 +242,7 @@ class methods and docstrings.
     device: Annotated[Optional[str], device_validator()]
     return_tensors: Annotated[Optional[Union[str, TensorType]], tensor_type_validator()]
     disable_grouping: Optional[bool]
+    image_seq_length: Optional[int]
 
 
 class VideosKwargs(TypedDict, total=False):
diff --git a/tests/models/pix2struct/test_processing_pix2struct.py b/tests/models/pix2struct/test_processing_pix2struct.py
@@ -172,6 +172,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self):
         if "image_processor" not in self.processor_class.attributes:
             self.skipTest(f"image_processor attribute not present in {self.processor_class}")
         image_processor = self.get_component("image_processor", max_patches=1024, patch_size={"height": 8, "width": 8})
+        print("image_processor", image_processor)
         tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
 
         processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)