Commit 00eb7b0

[optim] Set defaults to foreach, NOT fused (#95241) (#95415)
Rolling back the default change for Adam and rectifying the docs to reflect that AdamW never defaulted to fused. Since our fused implementations are relatively new, let's give them a longer bake-in time before flipping the switch for every user.

Pull Request resolved: #95241
Approved by: https://github.com/ngimel
1 parent 2180f34 commit 00eb7b0
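
In practice, the change means the fused Adam path is now strictly opt-in. A minimal sketch of the user-facing effect, assuming a CUDA device is available (the toy model and hyperparameters are illustrative, not part of this commit):

```python
import torch

# Hypothetical toy model; any CUDA float parameters would do.
model = torch.nn.Linear(16, 4).cuda()

# With foreach=None and fused=None (the defaults), Adam now selects the
# foreach implementation for CUDA params; fused is no longer picked implicitly.
opt_default = torch.optim.Adam(model.parameters(), lr=1e-3)

# The fused kernels remain available, but only by explicit opt-in.
opt_fused = torch.optim.Adam(model.parameters(), lr=1e-3, fused=True)
```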

12 files changed: +48 −43 lines


torch/optim/adadelta.py
Lines changed: 1 addition & 1 deletion

@@ -194,7 +194,7 @@ def adadelta(
     # We still respect when the user inputs False for foreach.
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, square_avgs, acc_deltas],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")

torch/optim/adagrad.py
Lines changed: 1 addition & 1 deletion

@@ -211,7 +211,7 @@ def adagrad(
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, state_sums, state_steps],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")

torch/optim/adam.py
Lines changed: 9 additions & 19 deletions

@@ -4,7 +4,7 @@
 from torch import Tensor
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _stack_if_compiling,
                         _dispatch_sqrt, _default_to_fused_or_foreach, _capturable_doc,
-                        _differentiable_doc, _foreach_doc, _maximize_doc)
+                        _differentiable_doc, _foreach_doc, _fused_doc, _maximize_doc)
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
 __all__ = ['Adam', 'adam']
@@ -218,28 +218,14 @@ def step(self, closure=None):
         {maximize}
         {capturable}
         {differentiable}
-        fused (bool, optional): whether the fused implementation (CUDA only) is used.
-            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
-            are supported. Since the fused implementation is usually significantly faster than
-            the for-loop implementation, we try to use it whenever possible (all parameters
-            are on CUDA and are of a supported type). Else, we attempt to use the foreach
-            implementation and lastly fall back to the for-loop implementation. (default: None)
-
-    .. note:: The foreach and fused implementations are typically faster than the for-loop,
-              single-tensor implementation, so we will try to default to them IF the user has
-              not specified either flag (i.e., when foreach = fused = None). For example, if
-              the user specifies True for foreach but nothing for fused, we will run the foreach
-              implementation. If the user specifies False for fused but nothing for foreach, we will
-              run the for-loop implementation. If the user specifies True for both foreach and
-              fused, we will prioritize fused over foreach. We attempt to use the fastest, so the
-              hierarchy goes fused -> foreach -> for-loop.
+        {fused}
     .. _Adam\: A Method for Stochastic Optimization:
         https://arxiv.org/abs/1412.6980
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ
 
     """.format(foreach=_foreach_doc, maximize=_maximize_doc, capturable=_capturable_doc,
-               differentiable=_differentiable_doc)
+               differentiable=_differentiable_doc, fused=_fused_doc)
@@ -268,10 +254,14 @@ def adam(params: List[Tensor],
     See :class:`~torch.optim.Adam` for details.
     """
 
+    # Respect when the user inputs False/True for foreach or fused. We only want to change
+    # the default when neither have been user-specified. Note that we default to foreach
+    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
+    # bake-in time before making it the default, even if it is typically faster.
     if fused is None and foreach is None:
-        fused, foreach = _default_to_fused_or_foreach(
+        _, foreach = _default_to_fused_or_foreach(
             [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps],
-            differentiable, has_fused=True)
+            differentiable, use_fused=False)
     if fused is None:
         fused = False
     if foreach is None:

torch/optim/adamax.py
Lines changed: 1 addition & 1 deletion

@@ -207,7 +207,7 @@ def adamax(
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_infs, state_steps],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")

torch/optim/adamw.py
Lines changed: 9 additions & 11 deletions

@@ -2,7 +2,7 @@
 from torch import Tensor
 from .optimizer import (Optimizer, _use_grad_for_differentiable, _get_value, _dispatch_sqrt,
                         _stack_if_compiling, _capturable_doc, _differentiable_doc, _foreach_doc,
-                        _maximize_doc, _default_to_fused_or_foreach)
+                        _fused_doc, _maximize_doc, _default_to_fused_or_foreach)
 from typing import List, Optional
 from torch.utils._foreach_utils import _group_tensors_by_device_and_dtype
 
@@ -248,20 +248,15 @@ def step(self, closure=None):
         {foreach}
         {capturable}
         {differentiable}
-        fused (bool, optional): whether the fused implementation (CUDA only) is used.
-            Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
-            are supported. Since the fused implementation is usually significantly faster than
-            the for-loop implementation, we try to use it whenever possible (all parameters
-            are on CUDA and are of a supported type). Else, we continue with the for-loop
-            implementation. (default: None)
-
+        {fused}
     .. _Decoupled Weight Decay Regularization:
         https://arxiv.org/abs/1711.05101
     .. _On the Convergence of Adam and Beyond:
         https://openreview.net/forum?id=ryQu7f-RZ
 
     """.format(maximize=_maximize_doc,
                foreach=_foreach_doc,
+               fused=_fused_doc,
                capturable=_capturable_doc,
                differentiable=_differentiable_doc)
 
@@ -300,11 +295,14 @@ def adamw(
             "API has changed, `state_steps` argument must contain a list of singleton tensors"
         )
 
-    # Respect when the user inputs False/True for foreach.
+    # Respect when the user inputs False/True for foreach or fused. We only want to change
+    # the default when neither have been user-specified. Note that we default to foreach
+    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
+    # bake-in time before making it the default, even if it is typically faster.
     if fused is None and foreach is None:
-        fused, foreach = _default_to_fused_or_foreach(
+        _, foreach = _default_to_fused_or_foreach(
             [params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps],
-            differentiable, has_fused=False)
+            differentiable, use_fused=False)
     if fused is None:
         fused = False
     if foreach is None:

torch/optim/asgd.py
Lines changed: 1 addition & 1 deletion

@@ -186,7 +186,7 @@ def asgd(
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, axs, mus, etas, state_steps],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")

torch/optim/nadam.py
Lines changed: 1 addition & 1 deletion

@@ -188,7 +188,7 @@ def nadam(params: List[Tensor],
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_avg_sqs, mu_products, state_steps],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError('torch.jit.script not supported with foreach optimizers')

torch/optim/optimizer.py
Lines changed: 21 additions & 4 deletions

@@ -55,20 +55,20 @@ def _dispatch_sqrt(x: float):  # float annotation is needed because of torchscript
     return math.sqrt(x)
 
 # For any optimizer with a faster implementation, we attempt to default to the
-# fastest whenever possible. For foreach, the requirements are to have native
-# tensors all on CUDA. For fused, there's currently the additional requirement
+# fastest + stablest whenever possible. For foreach, the requirements are to have
+# native tensors all on CUDA. For fused, there's currently the additional requirement
 # that the tensors' dtypes must be floating point. Neither alternative supports
 # torch.jit.script nor differentiable, so we fall back to the single tensor
 # implementation in those cases.
 def _default_to_fused_or_foreach(tensorlists: List[List[torch.Tensor]],
                                  differentiable: bool,
-                                 has_fused: bool = False) -> Tuple[bool, bool]:
+                                 use_fused: bool = False) -> Tuple[bool, bool]:
     if torch.jit.is_scripting() or differentiable:
         return False, False
     all_tensors = []
     for tensorlist in tensorlists:
         all_tensors.extend(tensorlist)
-    fused = has_fused and all(
+    fused = use_fused and all(
         p is None or (type(p) == torch.Tensor and p.is_cuda and torch.is_floating_point(p)) for p in all_tensors
     )
     foreach = not fused and all(
@@ -83,6 +83,23 @@ def _default_to_fused_or_foreach(tensorlists: List[List[torch.Tensor]],
               foreach over the for-loop implementation on CUDA, since it is usually
               significantly more performant. (default: None)"""
 
+_fused_doc = r"""fused (bool, optional): whether the fused implementation (CUDA only) is used.
+    Currently, `torch.float64`, `torch.float32`, `torch.float16`, and `torch.bfloat16`
+    are supported. (default: None)
+
+    .. note:: The foreach and fused implementations are typically faster than the for-loop,
+              single-tensor implementation. Thus, if the user has not specified BOTH flags
+              (i.e., when foreach = fused = None), we will attempt defaulting to the foreach
+              implementation when the tensors are all on CUDA. For example, if the user specifies
+              True for fused but nothing for foreach, we will run the fused implementation. If
+              the user specifies False for foreach but nothing for fused (or False for fused but
+              nothing for foreach), we will run the for-loop implementation. If the user specifies
+              True for both foreach and fused, we will prioritize fused over foreach, as it is
+              typically faster. We attempt to use the fastest, so the hierarchy goes fused ->
+              foreach -> for-loop. HOWEVER, since the fused implementation is relatively new,
+              we want to give it sufficient bake-in time, so we default to foreach and NOT
+              fused when the user has not specified either flag."""
+
 _capturable_doc = r"""capturable (bool, optional): whether this instance is safe to
     capture in a CUDA graph. Passing True can impair ungraphed performance,
     so if you don't intend to graph capture this instance, leave it False
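
The selection logic in `_default_to_fused_or_foreach` above is easiest to see by calling the helper directly. A minimal sketch against the signature as of this commit; the function is a private helper, so this is for illustration only and assumes a CUDA device is available:

```python
import torch
from torch.optim.optimizer import _default_to_fused_or_foreach

# Two tensor lists standing in for params and grads; all native CUDA floats.
params = [torch.randn(4, device="cuda")]
grads = [torch.randn(4, device="cuda")]

# With use_fused=False (what the optimizers now pass), foreach is selected.
fused, foreach = _default_to_fused_or_foreach([params, grads],
                                              differentiable=False,
                                              use_fused=False)
print(fused, foreach)  # False True

# Passing use_fused=True lets fused win when all tensors qualify, matching
# the fused -> foreach -> for-loop hierarchy described in _fused_doc.
fused, foreach = _default_to_fused_or_foreach([params, grads],
                                              differentiable=False,
                                              use_fused=True)
print(fused, foreach)  # True False
```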

torch/optim/radam.py
Lines changed: 1 addition & 1 deletion

@@ -210,7 +210,7 @@ def radam(
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, exp_avgs, exp_avg_sqs, state_steps],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")

torch/optim/rmsprop.py
Lines changed: 1 addition & 1 deletion

@@ -221,7 +221,7 @@ def rmsprop(
 
     if foreach is None:
         _, foreach = _default_to_fused_or_foreach([params, grads, square_avgs, grad_avgs, momentum_buffer_list],
-                                                   differentiable, has_fused=False)
+                                                   differentiable, use_fused=False)
 
     if foreach and torch.jit.is_scripting():
         raise RuntimeError("torch.jit.script not supported with foreach optimizers")
