Skip to content

Commit b64e51f

Browse files
author
Daniel
committed
Adds reconnect handling for SMTP 421 events.
This commit allows the mailer to treat a SMTP 421 err as an event that should produce a reconnect attempt. Issue letsencrypt#2249 describes a case where we see this SMTP error code from the remote server when our connection has been idle for too long. We now reconnect when this happens rather than failing ungracefully. The logic in the `mail-test-srv` used to force a number of initial connections to be disconnected is changed such that half of these forced disconnects are the normal clean connection close and half are a SMTP 421. This allows the existing integration test for server disconnects to be reused to test the 421 reconnect logic.
1 parent abfdce6 commit b64e51f

File tree

2 files changed

+46
-10
lines changed

2 files changed

+46
-10
lines changed

mail/mailer.go

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"net"
1414
"net/mail"
1515
"net/smtp"
16+
"net/textproto"
1617
"strconv"
1718
"strings"
1819
"time"
@@ -151,7 +152,7 @@ func (m *MailerImpl) generateMessage(to []string, subject, body string) ([]byte,
151152
addrs := []string{}
152153
for _, a := range to {
153154
if !core.IsASCII(a) {
154-
return nil, fmt.Errorf("Non-ASCII email address")
155+
return nil, fmt.Errorf("Non-ASCIR email address")
155156
}
156157
addrs = append(addrs, strconv.Quote(a))
157158
}
@@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error {
292293
m.stats.Inc("SendMail.Reconnects", 1)
293294
continue
294295
} else if err != nil {
295-
// If it wasn't an EOF error it is unexpected and we return from
296-
// SendMail() with an error
297-
m.stats.Inc("SendMail.Errors", 1)
298-
return err
296+
/*
297+
* If the error is an instace of `textproto.Error` with a SMTP error code,
298+
* and that error code is 421 then treat this as a reconnect-able event.
299+
*
300+
* The SMTP RFC defines this error code as:
301+
* 421 <domain> Service not available, closing transmission channel
302+
* (This may be a reply to any command if the service knows it
303+
* must shut down)
304+
*
305+
* In practice we see this code being used by our production SMTP server
306+
* when the connection has gone idle for too long. For more information
307+
* see issue #2249[0].
308+
*
309+
* [0] - https://github.com/letsencrypt/boulder/issues/2249
310+
*/
311+
if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 {
312+
m.stats.Inc("SendMail.Errors.SMTP.421", 1)
313+
m.reconnect()
314+
m.stats.Inc("SendMail.Reconnects", 1)
315+
} else {
316+
// If it wasn't an EOF error or a SMTP 421 it is unexpected and we
317+
// return from SendMail() with an error
318+
m.stats.Inc("SendMail.Errors", 1)
319+
return err
320+
}
299321
}
300322
}
301323

test/mail-test-srv/main.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) {
101101
case "MAIL":
102102
srv.connNumberMutex.RLock()
103103
if srv.connNumber <= srv.closeFirst {
104-
log.Printf(
105-
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
106-
srv.connNumber, srv.closeFirst)
107-
clearState()
108-
conn.Close()
104+
// Half of the time, close cleanly to simulate the server side closing
105+
// unexpectedly.
106+
if srv.connNumber%2 == 0 {
107+
log.Printf(
108+
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
109+
srv.connNumber, srv.closeFirst)
110+
clearState()
111+
conn.Close()
112+
} else {
113+
// The rest of the time, simulate a stale connection timeout by sending
114+
// a SMTP 421 message. This replicates the timeout/close from issue
115+
// 2249 - https://github.com/letsencrypt/boulder/issues/2249
116+
log.Printf(
117+
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n",
118+
srv.connNumber, srv.closeFirst)
119+
clearState()
120+
conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n"))
121+
conn.Close()
122+
}
109123
}
110124
srv.connNumberMutex.RUnlock()
111125
clearState()

0 commit comments

Comments
 (0)