Skip to content

Commit 8e76dcf

Browse files
ssnl authored and facebook-github-bot committed
Prevent raising KeyboardInterrupt in worker (#11718)
Summary: The current behavior is that each process (main and workers) prints a trace from `KeyboardInterrupt`, and the main process also prints ``` RuntimeError: DataLoader worker (pid 46045) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace. ``` due to our SIGCLD handler. Pull Request resolved: #11718 Differential Revision: D9840844 Pulled By: SsnL fbshipit-source-id: 1a05060bb02907fef5aac3f274d2c84f9f42d187
1 parent d24bcfd commit 8e76dcf

File tree

1 file changed

+40
-36
lines changed

1 file changed

+40
-36
lines changed

torch/utils/data/dataloader.py

Lines changed: 40 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -73,49 +73,53 @@ def is_alive(self):
7373

7474

7575
def _worker_loop(dataset, index_queue, data_queue, done_event, collate_fn, seed, init_fn, worker_id):
76-
global _use_shared_memory
77-
_use_shared_memory = True
76+
try:
77+
global _use_shared_memory
78+
_use_shared_memory = True
7879

79-
# Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
80-
# module's handlers are executed after Python returns from C low-level
81-
# handlers, likely when the same fatal signal happened again already.
82-
# https://docs.python.org/3/library/signal.html Sec. 18.8.1.1
83-
_set_worker_signal_handlers()
80+
# Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
81+
# module's handlers are executed after Python returns from C low-level
82+
# handlers, likely when the same fatal signal happened again already.
83+
# https://docs.python.org/3/library/signal.html Sec. 18.8.1.1
84+
_set_worker_signal_handlers()
8485

85-
torch.set_num_threads(1)
86-
random.seed(seed)
87-
torch.manual_seed(seed)
86+
torch.set_num_threads(1)
87+
random.seed(seed)
88+
torch.manual_seed(seed)
8889

89-
# Do not wait for putting thread to join when this worker exits. Otherwise,
90-
# this worker may always be waiting to put and doesn't check index_queue
91-
# and done_event for termination signal.
92-
data_queue.cancel_join_thread()
90+
# Do not wait for putting thread to join when this worker exits.
91+
# Otherwise, this worker may always be waiting to put and doesn't check
92+
# index_queue and done_event for termination signal.
93+
data_queue.cancel_join_thread()
9394

94-
if init_fn is not None:
95-
init_fn(worker_id)
95+
if init_fn is not None:
96+
init_fn(worker_id)
9697

97-
watchdog = ManagerWatchdog()
98+
watchdog = ManagerWatchdog()
9899

99-
while True:
100-
try:
101-
r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
102-
except queue.Empty:
103-
if watchdog.is_alive() and not done_event.is_set():
104-
continue
105-
else:
100+
while True:
101+
try:
102+
r = index_queue.get(timeout=MANAGER_STATUS_CHECK_INTERVAL)
103+
except queue.Empty:
104+
if watchdog.is_alive() and not done_event.is_set():
105+
continue
106+
else:
107+
break
108+
# use done_event so that we can get faster exiting signal even if there
109+
# are still indices in index_queue
110+
if r is None or done_event.is_set():
106111
break
107-
# use done_event so that we can get faster exiting signal even if there
108-
# are still indices in index_queue
109-
if r is None or done_event.is_set():
110-
break
111-
idx, batch_indices = r
112-
try:
113-
samples = collate_fn([dataset[i] for i in batch_indices])
114-
except Exception:
115-
data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
116-
else:
117-
data_queue.put((idx, samples))
118-
del samples
112+
idx, batch_indices = r
113+
try:
114+
samples = collate_fn([dataset[i] for i in batch_indices])
115+
except Exception:
116+
data_queue.put((idx, ExceptionWrapper(sys.exc_info())))
117+
else:
118+
data_queue.put((idx, samples))
119+
del samples
120+
except KeyboardInterrupt:
121+
# Main process will raise KeyboardInterrupt anyways.
122+
pass
119123

120124

121125
def _pin_memory_loop(in_queue, out_queue, done_event, pin_memory, device_id):

0 commit comments

Comments
 (0)