Cherry pick dataloader issue fix to 0.3.1 (#5140)

ssnl · soumith · commit 902d57be9fab · 2018-02-09T11:44:58.000-05:00
* cherry pick Fix multiprocessing and dataloader tests on Windows (#4453) * cherry pick Dataloader issues #4643 * fix common IS_WINDOWS
diff --git a/test/common.py b/test/common.py
@@ -31,6 +31,7 @@
 def run_tests():
     unittest.main(argv=UNITTEST_ARGS)
 
+IS_WINDOWS = sys.platform == "win32"
 
 TEST_NUMPY = True
 try:
@@ -332,6 +333,8 @@ def accept_output(update_type):
                 self.assertEqual(s, expected)
 
     if sys.version_info < (3, 2):
+        # assertRegexpMatches renamed assertRegex in 3.2
+        assertRegex = unittest.TestCase.assertRegexpMatches
         # assertRaisesRegexp renamed assertRaisesRegex in 3.2
         assertRaisesRegex = unittest.TestCase.assertRaisesRegexp
 
diff --git a/test/test_dataloader.py b/test/test_dataloader.py
@@ -1,18 +1,24 @@
 import math
 import sys
+import errno
+import os
 import ctypes
+import signal
 import torch
 import time
 import traceback
 import unittest
 from torch import multiprocessing
 from torch.utils.data import Dataset, TensorDataset, DataLoader, ConcatDataset
 from torch.utils.data.dataset import random_split
-from torch.utils.data.dataloader import default_collate
-from common import TestCase, run_tests, TEST_NUMPY
+from torch.utils.data.dataloader import default_collate, ExceptionWrapper
+from common import TestCase, run_tests, TEST_NUMPY, IS_WINDOWS
 from common_nn import TEST_CUDA
 
 
+JOIN_TIMEOUT = 17.0 if IS_WINDOWS else 4.5
+
+
 class TestDatasetRandomSplit(TestCase):
     def test_lengths_must_equal_datset_size(self):
         with self.assertRaises(ValueError):
@@ -100,6 +106,46 @@ def test_add_dataset(self):
         self.assertEqual(0, (d3[0][0] - result[14][0]).abs().sum())
 
 
+# Stores the first encountered exception in .exception.
+# Inspired by https://stackoverflow.com/a/33599967
+class ErrorTrackingProcess(multiprocessing.Process):
+
+    def __init__(self, *args, **kwargs):
+        super(ErrorTrackingProcess, self).__init__(*args, **kwargs)
+        self._pconn, self._cconn = multiprocessing.Pipe()
+        self._exception = None
+
+    def run(self):
+        # Disable stderr printing at the OS level, so that workers do not print
+        # to stderr.
+        # Can't use sys.stderr.close, otherwise Python `raise` will error with
+        # ValueError: I/O operation on closed file.
+        os.close(sys.stderr.fileno())
+        try:
+            super(ErrorTrackingProcess, self).run()
+            self._cconn.send(None)
+        except Exception as e:
+            self._cconn.send(ExceptionWrapper(sys.exc_info()))
+            raise
+
+    @property
+    def exception(self):
+        if self._pconn.poll():
+            self._exception = self._pconn.recv()
+        if self._exception is None:
+            return None
+        else:
+            return self._exception.exc_type(self._exception.exc_msg)
+
+    # ESRCH means that os.kill can't find a live process with the given pid
+    def send_signal(self, signum, ignore_ESRCH=False):
+        try:
+            os.kill(self.pid, signum)
+        except OSError as e:
+            if not ignore_ESRCH or e.errno != errno.ESRCH:
+                raise
+
+
 class ErrorDataset(Dataset):
 
     def __init__(self, size):
@@ -170,6 +216,23 @@ def __len__(self):
         return self.size
 
 
+def _test_timeout():
+    dataset = SleepDataset(10, 10)
+    dataloader = DataLoader(dataset, batch_size=2, num_workers=2, timeout=1)
+    _ = next(iter(dataloader))
+
+
+def _test_segfault():
+    dataset = SegfaultDataset(10)
+    dataloader = DataLoader(dataset, batch_size=2, num_workers=2)
+    _ = next(iter(dataloader))
+
+
+# test custom init function
+def init_fn(worker_id):
+    torch.manual_seed(12345)
+
+
 class TestDataLoader(TestCase):
 
     def setUp(self):
@@ -248,34 +311,30 @@ def test_multiple_dataloaders(self):
 
     @unittest.skipIf(True, "flaky test")
     def test_segfault(self):
-        def _test_segfault():
-            sys.stderr.close()
-            dataset = SegfaultDataset(10)
-            dataloader = DataLoader(dataset, batch_size=2, num_workers=2)
-            _ = next(iter(dataloader))
-
-        p = multiprocessing.Process(target=_test_segfault)
+        p = ErrorTrackingProcess(target=_test_segfault)
         p.start()
-        p.join(1.0)
+        p.join(JOIN_TIMEOUT)
         try:
             self.assertFalse(p.is_alive())
             self.assertNotEqual(p.exitcode, 0)
+            if IS_WINDOWS:
+                self.assertIsInstance(p.exception, OSError)
+                self.assertRegex(str(p.exception), r'access violation reading ')
+            else:
+                self.assertIsInstance(p.exception, RuntimeError)
+                self.assertRegex(str(p.exception), r'DataLoader worker \(pid \d+\) is killed by signal: ')
         finally:
             p.terminate()
 
     def test_timeout(self):
-        def _test_timeout():
-            sys.stderr.close()
-            dataset = SleepDataset(10, 10)
-            dataloader = DataLoader(dataset, batch_size=2, num_workers=2, timeout=1)
-            _ = next(iter(dataloader))
-
-        p = multiprocessing.Process(target=_test_timeout)
+        p = ErrorTrackingProcess(target=_test_timeout)
         p.start()
-        p.join(3.0)
+        p.join(JOIN_TIMEOUT)
         try:
             self.assertFalse(p.is_alive())
             self.assertNotEqual(p.exitcode, 0)
+            self.assertIsInstance(p.exception, RuntimeError)
+            self.assertRegex(str(p.exception), r'DataLoader timed out after \d+ seconds')
         finally:
             p.terminate()
 
@@ -289,10 +348,6 @@ def test_worker_seed(self):
         self.assertEqual(len(seeds), num_workers)
 
     def test_worker_init_fn(self):
-        # test custom init function
-        def init_fn(worker_id):
-            torch.manual_seed(12345)
-
         dataset = SeedDataset(4)
         dataloader = DataLoader(dataset, batch_size=2, num_workers=2,
                                 worker_init_fn=init_fn)
@@ -381,10 +436,10 @@ def test_partial_workers(self):
                 break
         del loader
         for w in workers:
-            w.join(1.0)  # timeout of one second
+            w.join(JOIN_TIMEOUT)
             self.assertFalse(w.is_alive(), 'subprocess not terminated')
             self.assertEqual(w.exitcode, 0)
-        worker_manager_thread.join(1.0)
+        worker_manager_thread.join(JOIN_TIMEOUT)
         self.assertFalse(worker_manager_thread.is_alive())
 
     def test_len(self):
diff --git a/test/test_multiprocessing.py b/test/test_multiprocessing.py
@@ -11,14 +11,15 @@
 import torch.multiprocessing as mp
 from torch.autograd import Variable
 from torch.nn import Parameter
-from common import TestCase, run_tests
+from common import TestCase, run_tests, IS_WINDOWS
 
 
 TEST_REPEATS = 30
 HAS_SHM_FILES = os.path.isdir('/dev/shm')
 TEST_CUDA_IPC = torch.cuda.is_available() and \
     sys.version_info[0] == 3 and \
-    sys.platform != 'darwin'
+    sys.platform != 'darwin' and \
+    sys.platform != 'win32'
 TEST_MULTIGPU = TEST_CUDA_IPC and torch.cuda.device_count() > 1
 
 
@@ -318,6 +319,7 @@ def test_cuda_small_tensors(self):
             self.assertEqual(tensor_size, 5)
             self.assertEqual(storage_size, 5)
 
+    @unittest.skipIf(IS_WINDOWS, 'not applicable to Windows (only fails with fork)')
     @unittest.skipIf(not torch.cuda.is_available(), 'CUDA not available')
     def test_cuda_bad_call(self):
         # Initialize CUDA
diff --git a/torch/csrc/DataLoader.cpp b/torch/csrc/DataLoader.cpp
@@ -40,7 +40,7 @@ static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx)                 \
 
 // signal(2) is really not portable. So use sigaction.
 // http://man7.org/linux/man-pages/man2/signal.2.html
-static void setSignalHandler(int signal, void(*handler)(int, siginfo_t *, void *), struct sigaction *old_sa_ptr)
+static inline void setSignalHandler(int signal, void(*handler)(int, siginfo_t *, void *), struct sigaction *old_sa_ptr)
 {
   struct sigaction sa;
   sa.sa_sigaction = handler;
@@ -56,10 +56,34 @@ SIGNAL_HANDLER(SIGBUS, handler_SIGBUS, "ERROR: Unexpected bus error encountered
   "This might be caused by insufficient shared memory (shm).\n");
 SIGNAL_HANDLER(SIGSEGV, handler_SIGSEGV, "ERROR: Unexpected segmentation fault encountered in worker.\n");
 
+// When an error happens in DataLoader methods and Python starts to exit, the
+// error trace will keep the loader alive, and Python may kill the child
+// processes first before deleting the loader object. Then the cleanup
+// methods in DataLoader.__del__ are not yet called, and the SIGCHLD handler
+// will print an error saying a worker is killed by SIGTERM. So we suppress
+// SIGTERM from the main loader process here by calling _exit(EXIT_SUCCESS).
+// Note that if we exit with a nonzero code, the loader SIGCHLD handler may
+// report RuntimeError again, which defeats the whole purpose.
+static void handler_SIGTERM(int sig, siginfo_t *info, void *ctx)
+{
+  if (info->si_pid == getppid()) {
+    _exit(EXIT_SUCCESS);
+  }
+  struct sigaction sa;
+  sa.sa_handler = SIG_DFL;
+  sa.sa_flags = 0;
+  if (sigemptyset(&sa.sa_mask) != 0 || sigaction(SIGTERM, &sa, NULL) != 0) {
+    _exit(EXIT_FAILURE);
+  } else {
+    raise(SIGTERM);
+  }
+}
+
 PyObject *THPModule_setWorkerSignalHandlers(PyObject *module, PyObject *arg) {
   HANDLE_TH_ERRORS
   setSignalHandler(SIGBUS, &handler_SIGBUS, NULL);
   setSignalHandler(SIGSEGV, &handler_SIGSEGV, NULL);
+  setSignalHandler(SIGTERM, &handler_SIGTERM, NULL);
   Py_RETURN_TRUE;
   END_HANDLE_TH_ERRORS
 }
@@ -70,33 +94,33 @@ PyObject *THPModule_errorIfAnyWorkerFails(PyObject *module) {
   HANDLE_TH_ERRORS
   int error;
   std::set<pid_t> *pid_set;
-  pid_t pid;
+  pid_t worker_pid;
   siginfo_t infop;
 
   // Only check the pids we care about
   for (auto it = worker_pids.begin(); it != worker_pids.end(); ++it) {
     pid_set = &(it->second);
     for (auto pid_it = pid_set->begin(); pid_it != pid_set->end(); ++pid_it) {
-      pid = *pid_it;
+      worker_pid = *pid_it;
       // Use waitid rather than waitpid so that we can set NOWAIT, and that Python
       // and other handlers can get whatever info they want about the child.
       infop.si_pid = 0;
-      error = waitid(P_PID, pid, &infop, WEXITED|WNOHANG|WNOWAIT);
+      error = waitid(P_PID, worker_pid, &infop, WEXITED|WNOHANG|WNOWAIT);
       // ignore errors and case with no waitable child
       if (error < 0 || infop.si_pid == 0)
         continue;
-      if (infop.si_code == CLD_EXITED && infop.si_status != 0) {  // exit with error
+      if (infop.si_code == CLD_EXITED && infop.si_status != EXIT_SUCCESS) {  // exit with error
         std::ostringstream oss;
-        oss << "DataLoader worker (pid " << pid << ") exited unexpectedly "
-            << "with exit code " << infop.si_status << ".";
+        oss << "DataLoader worker (pid " << worker_pid << ") exited "
+            << "unexpectedly with exit code " << infop.si_status << ".";
         // This is necessary. Otherwise, the runtime error will kill the other
         // workers, and trigger this again.
         pid_set->clear();
         throw std::runtime_error(oss.str());
       }  else if (infop.si_code == CLD_KILLED || infop.si_code == CLD_DUMPED) {  // killed by signal
         std::ostringstream oss;
-        oss << "DataLoader worker (pid " << pid << ") is killed by signal: "
-            << strsignal(infop.si_status) << ".";
+        oss << "DataLoader worker (pid " << worker_pid << ") is killed "
+            << "by signal: " << strsignal(infop.si_status) << ".";
         // This is necessary. Otherwise, the runtime error will kill the other
         // workers, and trigger this again.
         pid_set->clear();
diff --git a/torch/utils/data/dataloader.py b/torch/utils/data/dataloader.py