Commit 1169e10

feat: improve engine caching and fix bugs (#3932)
1 parent 2e1fba6 commit 1169e10

File tree

7 files changed: +305 -148 lines


py/torch_tensorrt/dynamo/_compiler.py

Lines changed: 0 additions & 2 deletions
@@ -1419,8 +1419,6 @@ def convert_exported_program_to_serialized_trt_engine(
     interpreter_result = interpret_module_to_result(
         gm,
         inputs=flattened_input_list,
-        arg_inputs=list(trt_arg_inputs),
-        kwarg_inputs=trt_kwarg_inputs,
         settings=settings,
         engine_cache=engine_cache,
     )
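For orientation, here is a minimal, hedged usage sketch of convert_exported_program_to_serialized_trt_engine, the function this hunk modifies; the `inputs=` keyword and the exact call shape are assumptions for illustration, not part of this commit.

import torch
from torch_tensorrt.dynamo import convert_exported_program_to_serialized_trt_engine

model = torch.nn.Linear(4, 4).cuda().eval()
example_inputs = (torch.randn(2, 4).cuda(),)
exp_program = torch.export.export(model, example_inputs)

# Produce serialized TRT engine bytes from the exported program.
# The `inputs=` keyword name is an assumption for this sketch.
serialized_engine = convert_exported_program_to_serialized_trt_engine(
    exp_program, inputs=example_inputs
)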

py/torch_tensorrt/dynamo/_refit.py

Lines changed: 6 additions & 5 deletions
@@ -53,7 +53,7 @@
 logger = logging.getLogger(__name__)


-@needs_refit
+@needs_refit  # type: ignore[misc]
 def construct_refit_mapping(
     module: torch.fx.GraphModule,
     inputs: Sequence[Input],

@@ -86,7 +86,7 @@ def construct_refit_mapping(
     return weight_refit_map


-@needs_refit
+@needs_refit  # type: ignore[misc]
 def construct_refit_mapping_from_weight_name_map(
     weight_name_map: dict[Any, Any],
     state_dict: dict[Any, Any],

@@ -131,7 +131,7 @@ def construct_refit_mapping_from_weight_name_map(
     return engine_weight_map


-@needs_refit
+@needs_refit  # type: ignore[misc]
 def _refit_single_trt_engine_with_gm(
     new_gm: torch.fx.GraphModule,
     old_engine: trt.ICudaEngine,

@@ -214,7 +214,7 @@ def _refit_single_trt_engine_with_gm(
     raise AssertionError("Refitting failed.")


-@needs_refit
+@needs_refit  # type: ignore[misc]
 def refit_module_weights(
     compiled_module: torch.fx.GraphModule | ExportedProgram,
     new_weight_module: ExportedProgram,

@@ -554,9 +554,10 @@ def refit_module_weights(
                 weight_name_map=None,
             )

-            # clear EXCLUDE_WEIGHTS flag
+            # clear EXCLUDE_WEIGHTS flag and set INCLUDE_REFIT flag to make the engine refittable
             serialization_config = engine.create_serialization_config()
             serialization_config.clear_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)
+            serialization_config.set_flag(trt.SerializationFlag.INCLUDE_REFIT)
             serialized_engine = engine.serialize_with_config(serialization_config)

             if isinstance(compiled_submodule, PythonTorchTensorRTModule):
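Taken on its own, the serialization-flag pattern added in this hunk looks like the following sketch, assuming `engine` is an already-deserialized, refitted trt.ICudaEngine that was built weight-stripped:

import tensorrt as trt

# `engine` is assumed to be an already-deserialized, refitted, weight-stripped engine.
serialization_config = engine.create_serialization_config()
serialization_config.clear_flag(trt.SerializationFlag.EXCLUDE_WEIGHTS)  # write weights back into the plan
serialization_config.set_flag(trt.SerializationFlag.INCLUDE_REFIT)      # keep the plan refittable after serialization
serialized_engine = engine.serialize_with_config(serialization_config)  # bytes ready to cache or deploy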

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 0 additions & 1 deletion
@@ -200,7 +200,6 @@ def __setstate__(self, state: dict[str, Any]) -> None:
     "engine_capability",
     "hardware_compatible",
     "refit_identical_engine_weights",
-    "strip_engine_weights",  # TODO: @Evan to remove this after implementing caching weight-stripped engines as default?
     "immutable_weights",
     "enable_weight_streaming",
     "tiling_optimization_level",

py/torch_tensorrt/dynamo/backend/backends.py

Lines changed: 4 additions & 2 deletions
@@ -158,9 +158,11 @@ def _pretraced_backend(
             "require_full_compilation arg is not applicable for torch.compile with backend='torch_tensorrt"
         )
         if settings.strip_engine_weights:
-            logger.error(
-                "strip_engine_weights arg is not supported for torch.compile()"
+            logger.warning(
+                "strip_engine_weights=True is not supported for torch.compile(). It will be set to False automatically."
             )
+            settings.strip_engine_weights = False
+
         trt_compiled = compile_module(
             gm,
             torchtrt_inputs,
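A hedged sketch of the user-facing path this hunk changes: passing strip_engine_weights=True through torch.compile options is expected to reach _pretraced_backend, log the warning, and continue with strip_engine_weights=False (the options= plumbing shown here is an assumption for illustration).

import torch
import torch_tensorrt  # registers the "torch_tensorrt" backend

model = torch.nn.Linear(8, 8).cuda().eval()
compiled = torch.compile(
    model,
    backend="torch_tensorrt",
    options={"strip_engine_weights": True},  # now coerced to False with a warning instead of erroring
)
compiled(torch.randn(2, 8).cuda())  # triggers compilation on first call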

py/torch_tensorrt/dynamo/conversion/_TRTInterpreter.py

Lines changed: 1 addition & 94 deletions
@@ -31,7 +31,7 @@
 from torch_tensorrt._utils import is_tensorrt_version_supported
 from torch_tensorrt.dynamo import _defaults
 from torch_tensorrt.dynamo._engine_cache import BaseEngineCache
-from torch_tensorrt.dynamo._settings import CompilationSettings, settings_are_compatible
+from torch_tensorrt.dynamo._settings import CompilationSettings
 from torch_tensorrt.dynamo.conversion._ConversionContext import ConversionContext
 from torch_tensorrt.dynamo.conversion._ConverterRegistry import (
     DYNAMO_CONVERTERS as CONVERTERS,

@@ -594,79 +594,6 @@ def _save_weight_mapping(self) -> None:
         gc.collect()
         torch.cuda.empty_cache()

-    @needs_refit  # type: ignore[misc]
-    def _pull_cached_engine(self, hash_val: str) -> Optional[TRTInterpreterResult]:
-        # query the cached TRT engine
-        cached_data = self.engine_cache.check(hash_val)  # type: ignore[union-attr]
-        if cached_data is not None:  # hit the cache
-            (
-                serialized_engine,
-                self._input_names,
-                self._output_names,
-                cached_engine_input_specs,
-                engine_compilation_settings,
-                self.weight_name_map,
-                self.ctx.requires_output_allocator,
-            ) = cached_data
-
-            setting_compatiblity, incompattible_settings = settings_are_compatible(
-                self.compilation_settings, engine_compilation_settings
-            )
-            assert (
-                setting_compatiblity
-            ), f"Attempted to refit a cached engine with incompatible settings: {incompattible_settings}, (old_settings: {engine_compilation_settings}, new_settings: {self.compilation_settings})"
-
-            for i, e in enumerate(
-                [
-                    Input.equivalent_spec(c, i)
-                    for c, i in zip(cached_engine_input_specs, self.input_specs)
-                ]
-            ):
-                assert (
-                    e
-                ), f"Attempted to refit a cached engine built for a different input size (input: {i}, cached size: {cached_engine_input_specs[i]}, new size: {self.input_specs[i]}"
-
-            _LOGGER.info(
-                "Found the cached engine that corresponds to this graph. It is directly loaded."
-            )
-
-            # refit the cached engine with the new graph module
-            if not self.compilation_settings.strip_engine_weights:
-                runtime = trt.Runtime(TRT_LOGGER)
-                engine = runtime.deserialize_cuda_engine(serialized_engine)
-
-                from torch_tensorrt.dynamo._refit import (
-                    _refit_single_trt_engine_with_gm,
-                )
-
-                _refit_single_trt_engine_with_gm(
-                    new_gm=self.module,
-                    old_engine=engine,
-                    input_list=self.input_specs,
-                    settings=self.compilation_settings,
-                    weight_name_map=self.weight_name_map,
-                )
-
-            # TODO: @Evan is waiting for TRT's feature to load the weight-stripped engine
-            # # EXCLUDE_WEIGHTS flag must be cleared
-            # serialization_config = engine.create_serialization_config()
-            # serialization_config.clear_flag(
-            #     trt.SerializationFlag.EXCLUDE_WEIGHTS
-            # )
-            # serialized_engine = engine.serialize_with_config(
-            #     serialization_config
-            # )
-            # # As of now, the engine becomes non-refittable because when EXCLUDE_WEIGHTS flag is cleared, the REFIT flag is also cleared by TRT to make the plan file smaller
-
-            return TRTInterpreterResult(
-                engine,
-                self._input_names,
-                self._output_names,
-                self.weight_name_map,
-                self.ctx.requires_output_allocator,
-            )
-        return None
-
     def run(
         self,
         strict_type_constraints: bool = False,

@@ -682,26 +609,6 @@ def run(
         Return:
             TRTInterpreterResult
         """
-        # self.engine_cache could be None if:
-        # 1) engine_cache is not passed in when calling this function like convert_exported_program_to_serialized_trt_engine etc., or
-        # 2) both cache_built_engines and reuse_cached_engines are False
-        if (
-            self.engine_cache is not None
-            and not self.compilation_settings.immutable_weights
-        ):
-            if (
-                self.compilation_settings.cache_built_engines
-                or self.compilation_settings.reuse_cached_engines
-            ):
-                hash_val = self.engine_cache.get_hash(
-                    self.module, self.input_specs, self.compilation_settings
-                )
-
-                if self.compilation_settings.reuse_cached_engines:
-                    interpreter_result = self._pull_cached_engine(hash_val)
-                    if interpreter_result is not None:  # hit the cache
-                        return interpreter_result  # type: ignore[no-any-return]
-
         self._construct_trt_network_def()
         _LOGGER.debug(
             f"CPU memory usage after network construction: {get_cpu_memory_usage()} MB"
