16 changes: 8 additions & 8 deletions test/profiler/test_memory_profiler.py
@@ -1305,14 +1305,14 @@ def step_fn(mark_region):
aten::detach 7 (GRADIENT) -> 7 (GRADIENT)

-- Optimizer --------------------------------------------------------------------------------------------
-aten::clone 7 (GRADIENT) -> 10 (???)
-aten::detach 10 (???) -> 10 (???)
-aten::detach 10 (???) -> 10 (???)
-aten::add_.Tensor 2 (PARAMETER), 10 (???) -> 2 (PARAMETER)
-aten::clone 9 (GRADIENT) -> 11 (???)
-aten::detach 11 (???) -> 11 (???)
-aten::detach 11 (???) -> 11 (???)
-aten::add_.Tensor 3 (PARAMETER), 11 (???) -> 3 (PARAMETER)
+aten::clone 7 (GRADIENT) -> 10 (OPTIMIZER_STATE)
+aten::detach 10 (OPTIMIZER_STATE) -> 10 (OPTIMIZER_STATE)
+aten::detach 10 (OPTIMIZER_STATE) -> 10 (OPTIMIZER_STATE)
+aten::add_.Tensor 2 (PARAMETER), 10 (OPTIMIZER_STATE) -> 2 (PARAMETER)
+aten::clone 9 (GRADIENT) -> 11 (OPTIMIZER_STATE)
+aten::detach 11 (OPTIMIZER_STATE) -> 11 (OPTIMIZER_STATE)
+aten::detach 11 (OPTIMIZER_STATE) -> 11 (OPTIMIZER_STATE)
+aten::add_.Tensor 3 (PARAMETER), 11 (OPTIMIZER_STATE) -> 3 (PARAMETER)
aten::zero_ 7 (GRADIENT) -> 7 (GRADIENT)
aten::zero_ 9 (GRADIENT) -> 9 (GRADIENT)""",
)
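For context, the expected trace above comes from profiling one training step with momentum SGD: on the first step the optimizer clones each gradient into a momentum buffer (aten::clone, aten::detach) and then applies the parameter update (aten::add_.Tensor). A minimal repro sketch, assuming the private _memory_profile() accessor on torch.profiler.profile is available in this version:

import torch
from torch.profiler import profile

# One training step with momentum SGD, so the optimizer materializes
# momentum buffers that this PR now labels OPTIMIZER_STATE.
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)

with profile(profile_memory=True, record_shapes=True, with_stack=True) as prof:
    model(torch.ones(1, 2)).sum().backward()
    optimizer.step()

# Private API (an assumption; availability varies across versions): builds
# the MemoryProfile whose categorization the expected output above checks.
memory_profile = prof._memory_profile()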
12 changes: 12 additions & 0 deletions torch/profiler/_memory_profiler.py
@@ -1,6 +1,7 @@
import collections
import dataclasses
import enum
+import itertools as it
from typing import (
    Any,
    cast,
@@ -36,6 +37,7 @@ class Category(enum.Enum):
    ACTIVATION = enum.auto()
    GRADIENT = enum.auto()
    PARAMETER = enum.auto()
+    OPTIMIZER_STATE = enum.auto()


@dataclasses.dataclass
@@ -561,6 +563,7 @@ def __init__(self, result: _ProfilerResult) -> None:
        self._set_inputs()
        self._set_parameters_using_data_flow()
        self._set_activations()
+        self._set_optimizer_state()

    def _is_gradient(self, *args, **kwargs) -> bool:
        return self._categories.get(*args, **kwargs) == Category.GRADIENT
@@ -781,3 +784,12 @@ def _set_activations(self) -> None:
            ):
                for i in node.outputs.items():
                    self._categories.setdefault_by_version(*i, Category.ACTIVATION)

+    def _set_optimizer_state(self) -> None:
+        for event in self._op_tree.dfs():
+            if event.typed[0] == _EventType.PyCall and event.typed[1].optimizer:
+                parameters = event.typed[1].optimizer.parameters
+                for _, t in it.chain(*[state for _, _, state in parameters]):
+                    key = TensorKey.from_tensor(t)
+                    if key is not None:
+                        self._categories.set_by_id(key, Category.OPTIMIZER_STATE)
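The nested comprehension in _set_optimizer_state is dense, so here is a toy sketch of the shape it assumes, with stand-in strings rather than real profiler objects (the shape is inferred from the loop itself, not from a documented API): optimizer.parameters yields (parameter, gradient, state) triples, where state is a list of (name, tensor) pairs such as SGD momentum buffers.

import itertools as it

# Hypothetical stand-in for event.typed[1].optimizer.parameters: two
# parameters, each carrying one piece of optimizer state.
parameters = [
    ("param0", "grad0", [("momentum_buffer", "state_tensor_0")]),
    ("param1", "grad1", [("momentum_buffer", "state_tensor_1")]),
]

# Same flattening as the loop above: chain every parameter's state list
# together and keep only the tensors, which the real code then tags as
# OPTIMIZER_STATE by storage id via set_by_id.
for _, t in it.chain(*[state for _, _, state in parameters]):
    print(t)
# Prints:
# state_tensor_0
# state_tensor_1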