Skip to content

Commit 33bffff

Browse files
skiminkiAloril
authored andcommitted
Handle signals, add logging
Add signal handlers for USR1, HUP, INT, TERM. USR1 requests status and the other signals terminate the process. Add also logging (-l, -la options). This allows capturing and logging the hang-up events. Add INFO to all informal STATUS messages. Add ERROR to all informal STATUS error messages.
1 parent 78bf0e3 commit 33bffff

File tree

2 files changed

+158
-19
lines changed

2 files changed

+158
-19
lines changed

cuteseal-remote-runner/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,3 +57,7 @@ TIMEOUT" message will be sent back to inform that the engine did not
5757
respond in time. Receiving this message triggers the Cutechess server
5858
to immediately forfeit the game due to timeout.
5959

60+
Finally, send USR1 signal to the runner process (note: runner, not
61+
engine!) to request a status report. This can be useful to determine
62+
whether the runner is still alive in case the engine becomes
63+
unresponsive.

cuteseal-remote-runner/main.cc

Lines changed: 154 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11

2+
#include <atomic>
23
#include <cinttypes>
34
#include <cstdarg>
45
#include <cstdio>
@@ -11,6 +12,7 @@
1112

1213
#include <fcntl.h>
1314
#include <poll.h>
15+
#include <signal.h>
1416
#include <sys/types.h>
1517
#include <sys/wait.h>
1618
#include <unistd.h>
@@ -28,15 +30,27 @@ namespace {
2830
uint64_t outCmdCounter { }; // 64 bits should be enough for anyone?
2931
uint64_t clockBaseNs { };
3032

33+
std::atomic<int> sigExitSigNum { -1 }; // non-negative if exit is signaled
34+
std::atomic<bool> sigStatusReport { false }; // status report requested
35+
36+
std::string logPath { };
37+
bool logAppend { };
38+
FILE *logFile { };
39+
3140
void print_usage()
3241
{
33-
puts("Usage: cuteseal-remote-runner <engine> [engine-options ...]\n"
42+
puts("Usage: cuteseal-remote-runner [options] <engine> [engine-options ...]\n"
3443
"\n"
3544
"Run engine and tag all input and output with time stamps. This is\n"
3645
"intended for lag elimination when running engines over a high-latency\n"
3746
"network.\n"
3847
"\n"
39-
"What this script essentially does is as follows:\n"
48+
"Options:\n"
49+
"-h This help.\n"
50+
"-l <file> Log output to a file. Truncate existing log.\n"
51+
"-la <file> Log output to a file. Append to existing log.\n"
52+
"\n"
53+
"What the runner essentially does is as follows:\n"
4054
"- Launches the engine\n"
4155
"- Per input inline:\n"
4256
" o echo the received line to output with timing information attached\n"
@@ -66,7 +80,10 @@ namespace {
6680
"If bestmove is not sent in time, the runner will send 'STATUS TIMEOUT' message,\n"
6781
"which the server-side will consider as a forfeit. This replaces the server-side\n"
6882
"timer-based timeout mechanism. The prefix 'cutechess-deadline <ns>' is not sent\n"
69-
"to the engine.\n");
83+
"to the engine.\n"
84+
"\n"
85+
"Send signal USR1 to cuteseal-remote-runner process to request a status report.\n"
86+
);
7087
}
7188

7289
uint64_t getClockNs()
@@ -77,6 +94,16 @@ namespace {
7794
return (tp.tv_sec * secsPerNs + tp.tv_nsec) - clockBaseNs;
7895
}
7996

97+
void statusSignalHandler(int signum)
98+
{
99+
sigStatusReport.store(true);
100+
}
101+
102+
void terminatingSignalHandler(int signum)
103+
{
104+
sigExitSigNum.store(signum, std::memory_order_relaxed);
105+
}
106+
80107
void timedPrintLine(Stream stream, const char *fmt, ...)
81108
{
82109
constexpr const char *streamNames[] { "STATUS", "STDIN ", "STDOUT", "STDERR" };
@@ -95,14 +122,27 @@ namespace {
95122

96123
puts(""); // newline
97124

125+
if (logFile) {
126+
fprintf(logFile, "%" PRIu64 " %" PRIu64 " %s ",
127+
outCmdCounter,
128+
ns,
129+
streamNames[static_cast<size_t>(stream)]);
130+
va_start(ap, fmt);
131+
vfprintf(logFile, fmt, ap);
132+
va_end(ap);
133+
fputc('\n', logFile);
134+
fflush(logFile);
135+
}
136+
137+
98138
outCmdCounter++;
99139
}
100140

101141
void timedPerror(const char *str)
102142
{
103143
const char *error { strerror(errno) };
104144

105-
timedPrintLine(Stream::STATUS, "%s: %s", str, error);
145+
timedPrintLine(Stream::STATUS, "ERROR %s: %s", str, error);
106146
}
107147

108148
class FdLineBuffer
@@ -168,7 +208,7 @@ namespace {
168208
streamError = ECONNRESET; // we'll use this to mark end of stream
169209
return false;
170210
} else {
171-
if (!(errno == EAGAIN || errno == EWOULDBLOCK)) {
211+
if (!(errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) {
172212
// real error, and not just "no more data at the moment"
173213
streamError = errno;
174214
}
@@ -178,6 +218,18 @@ namespace {
178218
}
179219
};
180220

221+
void printStatus(uint64_t bestmoveDeadlineNs)
222+
{
223+
if (bestmoveDeadlineNs == 0) {
224+
timedPrintLine(Stream::STATUS, "REPORT Runner alive");
225+
}
226+
else {
227+
const int64_t nsLeft = bestmoveDeadlineNs - getClockNs();
228+
timedPrintLine(Stream::STATUS, "REPORT Runner alive, bestmove deadline in %" PRId64 " ns",
229+
std::max<int64_t>(0, nsLeft));
230+
}
231+
}
232+
181233
void runLoop(int childStdin, int childStdout, int childStderr)
182234
{
183235
FdLineBuffer flbIn { STDIN_FILENO };
@@ -221,8 +273,27 @@ namespace {
221273
}
222274

223275
if (poll(fdsToPoll, std::size(fdsToPoll), pollDeadlineMs) < 0) {
224-
timedPerror("Poll failed, aborting");
225-
abort();
276+
277+
if (errno != EINTR) {
278+
timedPerror("Poll failed, aborting");
279+
abort();
280+
}
281+
}
282+
283+
// exit signal occurred?
284+
if (sigExitSigNum.load(std::memory_order_relaxed) != -1) {
285+
const int signum = sigExitSigNum.load(std::memory_order_relaxed);
286+
287+
printStatus(bestmoveDeadlineNs);
288+
timedPrintLine(Stream::STATUS, "INFO Runner received exit signal %d (%s), exitting...", signum, strsignal(signum));
289+
290+
break; // exit
291+
}
292+
293+
// status report requested by signal?
294+
if (sigStatusReport.load(std::memory_order_relaxed)) {
295+
printStatus(bestmoveDeadlineNs);
296+
sigStatusReport.store(false, std::memory_order_relaxed);
226297
}
227298

228299
// go through the streams
@@ -271,13 +342,13 @@ namespace {
271342
// check the streams for errors
272343
for (size_t i = 0; i < std::size(flbs); ++i) {
273344
if (flbs[i]->getError()) {
274-
timedPrintLine(Stream::STATUS, "Stream %s has terminated: %s", pollEntryNames[i], strerror(flbs[i]->getError()));
345+
timedPrintLine(Stream::STATUS, "INFO Stream %s has terminated: %s", pollEntryNames[i], strerror(flbs[i]->getError()));
275346
allStreamsGood = false;
276347
continue; // no need to spam the poll status
277348
}
278349

279350
if (fdsToPoll[i].revents & (POLLHUP | POLLERR | POLLRDHUP)) {
280-
timedPrintLine(Stream::STATUS, "Stream %s has terminated, poll status=%hd", pollEntryNames[i], fdsToPoll[i].revents);
351+
timedPrintLine(Stream::STATUS, "INFO Stream %s has terminated, poll status=%hd", pollEntryNames[i], fdsToPoll[i].revents);
281352
allStreamsGood = false;
282353
}
283354
}
@@ -286,22 +357,62 @@ namespace {
286357
if (toChild) {
287358
fclose(toChild);
288359
}
360+
close(childStdout);
361+
close(childStderr);
289362
}
290363

291364
} // anonymous namespace
292365

293366
int main(int argc, char **argv)
294367
{
295-
if (argc < 2) {
368+
// reset our relative clock
369+
clockBaseNs = getClockNs();
370+
371+
// parse options
372+
if (argc == 0) {
373+
// shouldn't ever happen; this is reserved for ldd
374+
print_usage();
375+
return 127;
376+
}
377+
++argv;
378+
--argc;
379+
380+
while (argc > 0 && argv[0][0] == '-') {
381+
if (strcmp(argv[0], "-l") == 0 && argc >= 2) {
382+
logPath = argv[1];
383+
argv += 2;
384+
argc -= 2;
385+
logAppend = false;
386+
}
387+
else if (strcmp(argv[0], "-la") == 0 && argc >= 2) {
388+
logPath = argv[1];
389+
argv += 2;
390+
argc -= 2;
391+
logAppend = true;
392+
}
393+
else {
394+
print_usage();
395+
return 127;
396+
}
397+
}
398+
399+
// engine specified after options?
400+
if (argc < 1) {
296401
print_usage();
297402
return 127;
298403
}
299404

300405
// ensure we print in line-buffered mode
301406
setlinebuf(stdout);
302407

303-
// reset our relative clock
304-
clockBaseNs = getClockNs();
408+
// open log file if specified
409+
if (!logPath.empty()) {
410+
logFile = fopen(logPath.c_str(), logAppend ? "a" : "w");
411+
if (!logFile) {
412+
timedPerror("Failed to open log file");
413+
return 126;
414+
}
415+
}
305416

306417
// set up the pipes and launch the engine; [0]=read end; [1]=write end
307418
int childIn[2] { -1, -1 };
@@ -346,8 +457,12 @@ int main(int argc, char **argv)
346457
return 126;
347458
}
348459

460+
if (logFile) {
461+
fclose(logFile);
462+
}
463+
349464
// launch the engine
350-
execvp(argv[1], argv + 1);
465+
execvp(argv[0], argv);
351466

352467
// if we get here, something went wrong
353468
perror("Failed to launch the engine");
@@ -359,9 +474,24 @@ int main(int argc, char **argv)
359474
close(childOut[1]);
360475
close(childErr[1]);
361476

362-
timedPrintLine(Stream::STATUS, "Engine launched with pid %d with the following parameters", static_cast<int>(child));
363-
for (int i = 1; i < argc; ++i) {
364-
timedPrintLine(Stream::STATUS, "argv[%d]='%s'", (i - 1), argv[i]);
477+
timedPrintLine(Stream::STATUS, "INFO Engine launched with pid %d with the following parameters", static_cast<int>(child));
478+
for (int i = 0; i < argc; ++i) {
479+
timedPrintLine(Stream::STATUS, "INFO argv[%d]='%s'", i, argv[i]);
480+
}
481+
482+
// assign signal handlers
483+
{
484+
struct sigaction sigact { };
485+
486+
sigact.sa_flags = SA_RESTART;
487+
488+
sigact.sa_handler = &terminatingSignalHandler;
489+
sigaction(SIGTERM, &sigact, NULL);
490+
sigaction(SIGINT, &sigact, NULL);
491+
sigaction(SIGHUP, &sigact, NULL);
492+
493+
sigact.sa_handler = &statusSignalHandler;
494+
sigaction(SIGUSR1, &sigact, NULL);
365495
}
366496

367497
runLoop(childIn[1], childOut[0], childErr[0]);
@@ -373,16 +503,21 @@ int main(int argc, char **argv)
373503
int wstatus { };
374504
if (waitpid(child, &wstatus, 0) == child) {
375505
if (WIFEXITED(wstatus)) {
376-
timedPrintLine(Stream::STATUS, "Engine has terminated with exit code %d", WEXITSTATUS(wstatus));
506+
timedPrintLine(Stream::STATUS, "INFO Engine has terminated with exit code %d", WEXITSTATUS(wstatus));
377507
} else if (WIFSIGNALED(wstatus)) {
378-
timedPrintLine(Stream::STATUS, "Engine has terminated by signal %d (%s)", WTERMSIG(wstatus), strsignal(WTERMSIG(wstatus)));
508+
timedPrintLine(Stream::STATUS, "INFO Engine has terminated by signal %d (%s)", WTERMSIG(wstatus), strsignal(WTERMSIG(wstatus)));
379509
} else {
380-
timedPrintLine(Stream::STATUS, "Engine terminated for unknown reason, waitpid status=%d", wstatus);
510+
timedPrintLine(Stream::STATUS, "INFO Engine terminated for unknown reason, waitpid status=%d", wstatus);
381511
}
382512
} else {
383513
timedPerror("Failed to wait for the child to terminate");
384514
return 126;
385515
}
386516

517+
if (logFile) {
518+
fclose(logFile);
519+
logFile = nullptr;
520+
}
521+
387522
return 0;
388523
}

0 commit comments

Comments
 (0)