Skip to content

Commit 8efc634

Browse files
authored
Mailer reliability improvements (letsencrypt#2262)
### Connect before sending mail, not at startup Per letsencrypt#2250 when we connect to the remote SMTP server at start-up time by calling `mailer.Connect()` but do not actually call `mailer.SendMail()` until after we have done some potentially expensive/time-consuming work we are liable to have our connection closed due to timeout. This PR moves the `Connect()` call in `expiration-mailer` and `notify-mailer` to be closer to where the actual messages are sent via `SendMail()` and resolves letsencrypt#2250 ### Handle SMTP 421 errors gracefully Issue letsencrypt#2249 describes a case where we see this SMTP error code from the remote server when our connection has been idle for too long. This would manifest when connecting to the remote server at startup, running a very long database query, and then sending mail. This commit allows the mailer to treat SMTP 421 errors as an event that should produce a reconnect attempt and resolves letsencrypt#2249. A unit test is added to the mailer tests to test that reconnection works when the server sends a SMTP 421 error. Prior to b64e51f and support for SMTP 421 reconnection this test failed in a manner matching issue letsencrypt#2249: ``` go test -p 1 -race --test.run TestReconnectSMTP421 github.com/letsencrypt/boulder/mail Wrote goodbye msg: 421 1.2.3 green.eggs.and.spam Error: timeout exceeded Cutting off client early --- FAIL: TestReconnectSMTP421 (0.00s) mailer_test.go:257: Expected SendMail() to not fail. Got err: 421 1.2.3 green.eggs.and.spam Error: timeout exceeded FAIL FAIL github.com/letsencrypt/boulder/mail 0.023s ``` With b64e51f the test passes and the client gracefully reconnects. The existing reconnect testing logic in the `mail-test-srv` integration tests is changed such that half of the forced disconnects are a normal clean connection close and half are a SMTP 421. This allows the existing integration test for server disconnects to be reused to test the 421 reconnect logic.
1 parent 5f1919b commit 8efc634

File tree

5 files changed

+104
-29
lines changed

5 files changed

+104
-29
lines changed

cmd/expiration-mailer/main.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ import (
2424
"github.com/letsencrypt/boulder/cmd"
2525
"github.com/letsencrypt/boulder/core"
2626
blog "github.com/letsencrypt/boulder/log"
27-
"github.com/letsencrypt/boulder/mail"
27+
bmail "github.com/letsencrypt/boulder/mail"
2828
"github.com/letsencrypt/boulder/metrics"
2929
"github.com/letsencrypt/boulder/rpc"
3030
"github.com/letsencrypt/boulder/sa"
@@ -47,7 +47,7 @@ type mailer struct {
4747
log blog.Logger
4848
dbMap *gorp.DbMap
4949
rs regStore
50-
mailer mail.Mailer
50+
mailer bmail.Mailer
5151
emailTemplate *template.Template
5252
subject string
5353
nagTimes []time.Duration
@@ -155,6 +155,15 @@ func (m *mailer) processCerts(allCerts []core.Certificate) {
155155
regIDToCerts[cert.RegistrationID] = cs
156156
}
157157

158+
err := m.mailer.Connect()
159+
if err != nil {
160+
m.log.AuditErr(fmt.Sprintf("Error connecting to send nag emails: %s", err))
161+
return
162+
}
163+
defer func() {
164+
_ = m.mailer.Close()
165+
}()
166+
158167
for regID, certs := range regIDToCerts {
159168
reg, err := m.rs.GetRegistration(ctx, regID)
160169
if err != nil {
@@ -379,7 +388,7 @@ func main() {
379388

380389
smtpPassword, err := c.Mailer.PasswordConfig.Pass()
381390
cmd.FailOnError(err, "Failed to load SMTP password")
382-
mailClient := mail.New(
391+
mailClient := bmail.New(
383392
c.Mailer.Server,
384393
c.Mailer.Port,
385394
c.Mailer.Username,
@@ -389,11 +398,6 @@ func main() {
389398
scope,
390399
*reconnBase,
391400
*reconnMax)
392-
err = mailClient.Connect()
393-
cmd.FailOnError(err, "Couldn't connect to mail server.")
394-
defer func() {
395-
_ = mailClient.Close()
396-
}()
397401

398402
nagCheckInterval := defaultNagCheckInterval
399403
if s := c.Mailer.NagCheckInterval; s != "" {

cmd/notify-mailer/main.go

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,14 @@ func (m *mailer) run() error {
9999
return err
100100
}
101101

102+
err = m.mailer.Connect()
103+
if err != nil {
104+
return err
105+
}
106+
defer func() {
107+
_ = m.mailer.Close()
108+
}()
109+
102110
startTime := m.clk.Now()
103111

104112
for i, dest := range destinations {
@@ -345,13 +353,6 @@ func main() {
345353
*reconnBase,
346354
*reconnMax)
347355
}
348-
err = mailClient.Connect()
349-
cmd.FailOnError(err, fmt.Sprintf("Connecting to %s:%s",
350-
cfg.NotifyMailer.Server, cfg.NotifyMailer.Port))
351-
defer func() {
352-
err = mailClient.Close()
353-
cmd.FailOnError(err, "Closing mail client")
354-
}()
355356

356357
m := mailer{
357358
clk: cmd.Clock(),

mail/mailer.go

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"net"
1414
"net/mail"
1515
"net/smtp"
16+
"net/textproto"
1617
"strconv"
1718
"strings"
1819
"time"
@@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error {
292293
m.stats.Inc("SendMail.Reconnects", 1)
293294
continue
294295
} else if err != nil {
295-
// If it wasn't an EOF error it is unexpected and we return from
296-
// SendMail() with an error
297-
m.stats.Inc("SendMail.Errors", 1)
298-
return err
296+
/*
297+
* If the error is an instance of `textproto.Error` with a SMTP error code,
298+
* and that error code is 421 then treat this as a reconnect-able event.
299+
*
300+
* The SMTP RFC defines this error code as:
301+
* 421 <domain> Service not available, closing transmission channel
302+
* (This may be a reply to any command if the service knows it
303+
* must shut down)
304+
*
305+
* In practice we see this code being used by our production SMTP server
306+
* when the connection has gone idle for too long. For more information
307+
* see issue #2249[0].
308+
*
309+
* [0] - https://github.com/letsencrypt/boulder/issues/2249
310+
*/
311+
if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 {
312+
m.stats.Inc("SendMail.Errors.SMTP.421", 1)
313+
m.reconnect()
314+
m.stats.Inc("SendMail.Reconnects", 1)
315+
} else {
316+
// If it wasn't an EOF error or a SMTP 421 it is unexpected and we
317+
// return from SendMail() with an error
318+
m.stats.Inc("SendMail.Errors", 1)
319+
return err
320+
}
299321
}
300322
}
301323

mail/mailer_test.go

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ func normalHandler(connID int, t *testing.T, conn net.Conn) {
121121
// The disconnectHandler authenticates the client like the normalHandler but
122122
// additionally processes an email flow (e.g. MAIL, RCPT and DATA commands).
123123
// When the `connID` is <= `closeFirst` the connection is closed immediately
124-
// after the MAIL command is received and prior to issuing a 250 response. In
125-
// this way the first `closeFirst` connections will not complete normally and
126-
// can be tested for reconnection logic.
127-
func disconnectHandler(closeFirst int) connHandler {
124+
// after the MAIL command is received and prior to issuing a 250 response. If
125+
// a `goodbyeMsg` is provided, it is written to the client immediately before
126+
// closing. In this way the first `closeFirst` connections will not complete
127+
// normally and can be tested for reconnection logic.
128+
func disconnectHandler(closeFirst int, goodbyeMsg string) connHandler {
128129
return func(connID int, t *testing.T, conn net.Conn) {
129130
defer func() {
130131
err := conn.Close()
@@ -140,6 +141,13 @@ func disconnectHandler(closeFirst int) connHandler {
140141
}
141142

142143
if connID <= closeFirst {
144+
// If there was a `goodbyeMsg` specified, write it to the client before
145+
// closing the connection. This is a good way to deliver a SMTP error
146+
// before closing
147+
if goodbyeMsg != "" {
148+
_, _ = conn.Write([]byte(fmt.Sprintf("%s\r\n", goodbyeMsg)))
149+
fmt.Printf("Wrote goodbye msg: %s\n", goodbyeMsg)
150+
}
143151
fmt.Printf("Cutting off client early\n")
144152
return
145153
}
@@ -215,7 +223,33 @@ func TestReconnectSuccess(t *testing.T) {
215223

216224
// Configure a test server that will disconnect the first `closedConns`
217225
// connections after the MAIL cmd
218-
go listenForever(l, t, disconnectHandler(closedConns))
226+
go listenForever(l, t, disconnectHandler(closedConns, ""))
227+
228+
// With a mailer client that has a max attempt > `closedConns` we expect no
229+
// error. The message should be delivered after `closedConns` reconnect
230+
// attempts.
231+
err := m.Connect()
232+
if err != nil {
233+
t.Errorf("Failed to connect: %s", err)
234+
}
235+
err = m.SendMail([]string{"hi@bye.com"}, "You are already a winner!", "Just kidding")
236+
if err != nil {
237+
t.Errorf("Expected SendMail() to not fail. Got err: %s", err)
238+
}
239+
}
240+
241+
func TestReconnectSMTP421(t *testing.T) {
242+
m, l, cleanUp := setup(t)
243+
defer cleanUp()
244+
const closedConns = 5
245+
246+
// A SMTP 421 can be generated when the server times out an idle connection.
247+
// For more information see https://github.com/letsencrypt/boulder/issues/2249
248+
smtp421 := "421 1.2.3 green.eggs.and.spam Error: timeout exceeded"
249+
250+
// Configure a test server that will disconnect the first `closedConns`
251+
// connections after the MAIL cmd with a SMTP 421 error
252+
go listenForever(l, t, disconnectHandler(closedConns, smtp421))
219253

220254
// With a mailer client that has a max attempt > `closedConns` we expect no
221255
// error. The message should be delivered after `closedConns` reconnect

test/mail-test-srv/main.go

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) {
101101
case "MAIL":
102102
srv.connNumberMutex.RLock()
103103
if srv.connNumber <= srv.closeFirst {
104-
log.Printf(
105-
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
106-
srv.connNumber, srv.closeFirst)
107-
clearState()
108-
conn.Close()
104+
// Half of the time, close cleanly to simulate the server side closing
105+
// unexpectedly.
106+
if srv.connNumber%2 == 0 {
107+
log.Printf(
108+
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
109+
srv.connNumber, srv.closeFirst)
110+
clearState()
111+
conn.Close()
112+
} else {
113+
// The rest of the time, simulate a stale connection timeout by sending
114+
// a SMTP 421 message. This replicates the timeout/close from issue
115+
// 2249 - https://github.com/letsencrypt/boulder/issues/2249
116+
log.Printf(
117+
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n",
118+
srv.connNumber, srv.closeFirst)
119+
clearState()
120+
conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n"))
121+
conn.Close()
122+
}
109123
}
110124
srv.connNumberMutex.RUnlock()
111125
clearState()

0 commit comments

Comments
 (0)