
Commit 17fd1ff

[WIP][dist_optim] introduce distributed functional optimizer
This PR introduces a distributed functional optimizer, so that the distributed optimizer can reuse the functional optimizer APIs while maintaining its own states. This could enable a TorchScript-compatible functional optimizer when using the distributed optimizer, which helps get rid of the GIL and improves the overall performance of training, especially distributed model-parallel training.

ghstack-source-id: 0c75d84
Pull Request resolved: #45221
1 parent e935d1e commit 17fd1ff
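
To make the calling convention concrete, here is a minimal illustrative sketch (not code from this commit) of the difference the message describes: a regular torch.optim optimizer reads gradients from each param.grad inside step(), while a functional optimizer is handed the gradients explicitly, so multiple trainer threads never have to accumulate into the same .grad attribute.

import torch

w = torch.randn(3, 3, requires_grad=True)

# Regular optimizer: backward() writes into w.grad and step() reads it implicitly.
opt = torch.optim.Adagrad([w], lr=0.05)
w.sum().backward()
opt.step()

# Functional style: gradients are returned explicitly and handed to the optimizer,
# so nothing mutates the shared w.grad attribute.
grad, = torch.autograd.grad(w.sum(), [w])
# a functional optimizer such as the FunctionalAdagrad added below would then be
# driven as: functional_opt.step([grad])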

4 files changed: +197 lines added, −14 lines removed

torch/distributed/optim/adagrad.py

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+from typing import List, Dict, Optional
+import torch
+import torch.optim.functional as F
+
+from torch import Tensor
+
+# Define a TorchScript compatible Functional Adagrad Optimizer
+# where we use this optimizer in a functional way.
+# Instead of using the `param.grad` when updating parameters,
+# we explicitly let the user pass gradients to the `step` function;
+# this is so that we can separate the gradients and parameters
+# and allow a multithreaded trainer to update the parameters
+# without data races when accumulating into the same .grad.
+# NOTE: This should only be used by distributed optimizer internals
+# and is not meant to be exposed to the user.
+@torch.jit.script
+class FunctionalAdagrad(object):
+    def __init__(
+        self,
+        params: List[Tensor],
+        lr: float = 1e-2,
+        lr_decay: float = 0.0,
+        weight_decay: float = 0.0,
+        initial_accumulator_value: float = 0.0,
+        warmup_lr_multiplier: float = 1.0,
+        warmup_num_iters: float = 0.0,
+        eps: float = 1e-10,
+        coalesce_grad: bool = True,
+    ):
+        self.defaults = {
+            "lr": lr,
+            "lr_decay": lr_decay,
+            "eps": eps,
+            "weight_decay": weight_decay,
+            "initial_accumulator_value": initial_accumulator_value,
+            "warmup_lr_multiplier": warmup_lr_multiplier,
+            "warmup_num_iters": warmup_num_iters,
+        }
+        self.coalesce_grad = coalesce_grad
+        self.state = torch.jit.annotate(Dict[torch.Tensor, Dict[str, torch.Tensor]], {})
+
+        if len(params) == 0:
+            raise ValueError("optimizer got an empty parameter list")
+
+        # NOTE: we only have one param_group and don't allow the user to add
+        # additional param groups as it's not a common use case.
+        self.param_group = {"params": params}
+
+        # TODO: no union or any types in TorchScript, make step a scalar tensor instead.
+        # This is also needed if we want to share_memory on the step across processes.
+        for p in self.param_group["params"]:
+            self.state[p] = {
+                "sum": torch.full_like(p.data, initial_accumulator_value),
+                "step": torch.tensor(0.0),
+            }
+
+    def step(self, gradients: List[Optional[Tensor]]):
+        params = self.param_group['params']
+        params_with_grad = []
+        grads = []
+        state_sums = []
+        state_steps: List[int] = []
+
+        if len(params) != len(gradients):
+            raise ValueError(
+                "the number of gradients passed in does not equal the number of parameters! "
+                + f"Params length: {len(params)}. "
+                + f"Gradients length: {len(gradients)}"
+            )
+
+        for param, gradient in zip(self.param_group['params'], gradients):
+            if gradient is not None:
+                params_with_grad.append(param)
+                grads.append(gradient)
+                state = self.state[param]
+                state_sums.append(state['sum'])
+                # update the steps for each param group update
+                state['step'] += 1
+                # record the step after step update
+                state_steps.append(state['step'].item())
+
+        with torch.no_grad():
+            F.adagrad(params_with_grad,
+                      grads,
+                      state_sums,
+                      state_steps,
+                      self.defaults['lr'],
+                      self.defaults['weight_decay'],
+                      self.defaults['lr_decay'],
+                      self.defaults['eps'])
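
A minimal usage sketch for the class above (illustrative, not part of the commit): the distributed optimizer is expected to construct it once per worker with the local parameters and then call step() with a gradient list aligned to those parameters, where entries may be None. The import path follows the file added in this commit and may differ in later releases.

import torch
from torch.distributed.optim.adagrad import FunctionalAdagrad  # path introduced by this commit

w1 = torch.randn(3, 3)
w2 = torch.randn(3, 3)
opt = FunctionalAdagrad([w1, w2], lr=0.05)

# Gradients come from elsewhere (e.g. a distributed autograd context) rather
# than from .grad; the list must line up with the parameter list and may
# contain None for parameters that received no gradient.
g1 = torch.ones_like(w1)
opt.step([g1, None])  # updates w1 and its accumulator state; w2 is left untouched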

torch/distributed/optim/optimizer.py

Lines changed: 28 additions & 7 deletions
@@ -1,11 +1,16 @@
+from typing import List, Optional
+
 import torch.distributed.rpc as rpc
+import torch.optim as optim
+from .adagrad import FunctionalAdagrad
 import torch.distributed.autograd as dist_autograd

+
 from collections import defaultdict
 from threading import Lock


-class _LocalOptimizer:
+class _LocalOptimizer(object):
     # Ideally we would only need to share a lock for instances of
     # _LocalOptimizer that deal with the same parameters. We are
     # making a simplifying assumption here that if there is more
@@ -14,20 +19,36 @@ class _LocalOptimizer:
     # trainer will create its own instance of _LocalOptimizer but
     # they will all optimize the same parameters on each worker)
     global_lock = Lock()
+    functional_optim_map = {
+        optim.Adagrad: FunctionalAdagrad,
+        # torch.optim.Adam: torch.distributed.optim.Adam
+    }

     def __init__(self, optim_cls, local_params_rref, *args, **kwargs):
-        self.optim = optim_cls(
-            [rref.local_value() for rref in local_params_rref],
+        optim_ctor = _LocalOptimizer.functional_optim_map.get(optim_cls, optim_cls)
+        self.is_functional_optim = (optim_ctor != optim_cls)
+        self._local_params = [rref.local_value() for rref in local_params_rref]
+        self.optim = optim_ctor(
+            self._local_params,
             *args,
             **kwargs)

     def step(self, autograd_ctx_id):
         all_local_grads = dist_autograd.get_gradients(autograd_ctx_id)

-        with _LocalOptimizer.global_lock:
-            for param, grad in all_local_grads.items():
-                param.grad = grad
-            self.optim.step()
+        if self.is_functional_optim:
+            # apply functional optimizer step with a list of gradients
+            grads: List[Optional[torch.Tensor]] = [
+                all_local_grads[p] if p in all_local_grads else None
+                for p in self._local_params
+            ]
+
+            self.optim.step(grads)
+        else:
+            with _LocalOptimizer.global_lock:
+                for param, grad in all_local_grads.items():
+                    param.grad = grad
+                self.optim.step()


 def _new_local_optimizer(optim_cls, local_params_rref, *args, **kwargs):
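
The heart of the change above is (a) mapping a torch.optim class to its functional counterpart via functional_optim_map and (b) realigning the param-to-grad dict returned by dist_autograd.get_gradients into a list ordered like the local parameters. A small self-contained sketch of that realignment (illustrative; the align_grads helper is hypothetical, and a plain dict stands in for the distributed autograd result):

from typing import Dict, List, Optional
import torch

def align_grads(local_params: List[torch.Tensor],
                grads_by_param: Dict[torch.Tensor, torch.Tensor]) -> List[Optional[torch.Tensor]]:
    # Mirrors the list comprehension in _LocalOptimizer.step: one entry per
    # local parameter, None when no gradient was recorded for it.
    return [grads_by_param[p] if p in grads_by_param else None for p in local_params]

p1, p2 = torch.randn(2, 2), torch.randn(2, 2)
grads = {p1: torch.ones_like(p1)}        # shape of what get_gradients would return
print(align_grads([p1, p2], grads))      # [tensor([...]), None]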

torch/optim/functional.py

Lines changed: 12 additions & 7 deletions
@@ -6,6 +6,16 @@

 # TODO: use foreach API in optim.functional to do all the computation

+def _make_sparse(grad, grad_indices, values):
+    size = grad.size()
+    if grad_indices.numel() == 0 or values.numel() == 0:
+        return torch.empty_like(grad)
+    return torch.sparse_coo_tensor(grad_indices, values, size)
+    # constructor = grad.new
+    # if grad_indices.dim() == 0 or values.dim() == 0:
+    #     return constructor().resize_as_(grad)
+    # return constructor(grad_indices, values, size)
+
 def adagrad(params: List[Tensor],
             grads: List[Tensor],
             state_sums: List[Tensor],
@@ -33,15 +43,10 @@ def adagrad(params: List[Tensor],
             grad_values = grad._values()
             size = grad.size()

-            def make_sparse(values):
-                constructor = grad.new
-                if grad_indices.dim() == 0 or values.dim() == 0:
-                    return constructor().resize_as_(grad)
-                return constructor(grad_indices, values, size)
-            state_sum.add_(make_sparse(grad_values.pow(2)))
+            state_sum.add_(_make_sparse(grad, grad_indices, grad_values.pow(2)))
             std = state_sum.sparse_mask(grad)
             std_values = std._values().sqrt_().add_(eps)
-            param.add_(make_sparse(grad_values / std_values), alpha=-clr)
+            param.add_(_make_sparse(grad, grad_indices, grad_values / std_values), alpha=-clr)
         else:
             state_sum.addcmul_(grad, grad, value=1)
             std = state_sum.sqrt().add_(eps)
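
For reference, the update these dense-path lines implement is the standard Adagrad rule: the state accumulates the squared gradient and the parameter moves by -clr * grad / (sqrt(state_sum) + eps); the sparse branch above does the same thing restricted to the non-zero entries via _make_sparse. A tiny worked dense example (illustrative; clr stands for the learning rate after lr_decay has been applied):

import torch

param = torch.tensor([1.0, 2.0])
grad = torch.tensor([0.5, -1.0])
state_sum = torch.zeros_like(param)
clr, eps = 0.05, 1e-10

state_sum.addcmul_(grad, grad, value=1)   # state_sum += grad * grad
std = state_sum.sqrt().add_(eps)          # sqrt of the accumulated squared gradients
param.addcdiv_(grad, std, value=-clr)     # param -= clr * grad / std
print(param)                              # tensor([0.9500, 2.0500])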

torch/testing/_internal/distributed/rpc/dist_optimizer_test.py

Lines changed: 67 additions & 0 deletions
@@ -198,3 +198,70 @@ def test_dist_optim(self):
             # ensure local equals remote
             self.assertEqual(new_w1, module1.get_w())
             self.assertEqual(new_w2, module2.get_w())
+
+
+    @dist_init
+    def test_dist_optim_functional(self):
+        # local version
+        module1 = MyModule()
+        module2 = MyModule()
+        params = [module1.get_w(), module2.get_w()]
+        local_optim = optim.Adagrad(params, lr=0.05)
+
+        old_w1 = module1.w.clone().detach()
+        old_w2 = module2.w.clone().detach()
+
+        g_cpu = torch.Generator()
+        g_cpu.manual_seed(0)
+        t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
+        t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
+        output1 = module1.forward(t2)
+        output2 = module2.forward(output1)
+        loss = torch.add(output2, t1).sum()
+
+        loss.backward()
+        local_optim.step()
+
+        # distributed version
+        owner1 = "worker%d" % ((self.rank + 1) % self.world_size)
+        owner2 = "worker%d" % ((self.rank + 2) % self.world_size)
+
+        remote_module1 = rpc.remote(owner1, MyModule)
+        remote_module2 = rpc.remote(owner2, MyModule)
+        remote_param1 = remote_method(MyModule.get_w, remote_module1)
+        remote_param2 = remote_method(MyModule.get_w, remote_module2)
+
+        old_w1_remote = remote_param1.to_here()
+
+        # sanity check: local and remote initial weights should match
+        self.assertEqual(old_w1, remote_param1.to_here())
+        self.assertEqual(old_w2, remote_param2.to_here())
+
+        dist_optim = DistributedOptimizer(
+            optim.Adagrad, [remote_param1, remote_param2], lr=0.05
+        )
+
+        with dist_autograd.context() as context_id:
+            g_cpu.manual_seed(0)
+            t1 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
+            t2 = torch.rand((3, 3), requires_grad=True, generator=g_cpu)
+            output1 = rpc_async_method(MyModule.forward, remote_module1, t2)
+            output2 = rpc_async_method(MyModule.forward, remote_module2, output1.wait())
+            loss = torch.add(output2.wait(), t1)
+
+            dist_autograd.backward(context_id, [loss.sum()])
+            dist_optim.step(context_id)
+
+            new_w1 = rpc_async_method(MyModule.get_w, remote_module1).wait()
+            new_w2 = rpc_async_method(MyModule.get_w, remote_module2).wait()
+            print("old w1: ")
+            print(old_w1)
+            print("new w1: ")
+            print(new_w1)
+
+            # ensure optimizer changed weights
+            self.assertNotEqual(old_w1, new_w1)
+            self.assertNotEqual(old_w2, new_w2)
+            # ensure local equals remote
+            self.assertEqual(new_w1, module1.get_w())
+            self.assertEqual(new_w2, module2.get_w())
