-
Notifications
You must be signed in to change notification settings - Fork 148
Expand file tree
/
Copy pathlambdaFunction.go
More file actions
383 lines (330 loc) · 9.83 KB
/
lambdaFunction.go
File metadata and controls
383 lines (330 loc) · 9.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
package lambda
import (
"bufio"
"container/list"
"errors"
"fmt"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"time"
"github.com/open-lambda/open-lambda/ol/common"
"github.com/open-lambda/open-lambda/ol/worker/lambda/packages"
"github.com/open-lambda/open-lambda/ol/worker/sandbox"
)
// LambdaFunc represents a single lambda function (the code)
type LambdaFunc struct {
	lmgr   *LambdaMgr         // manager that owns this function (puller, tracer, etc.)
	name   string             // function name, used for pulls and log tagging
	rtType common.RuntimeType // runtime of the current code (e.g., Python or native)

	// lambda code
	lastPull *time.Time           // when code was last pulled; nil until the first successful pull
	codeDir  string               // directory holding the current code; "" until first pull
	meta     *sandbox.SandboxMeta // install/import metadata parsed from requirements.txt (Python)

	// lambda execution
	funcChan  chan *Invocation // server to func
	instChan  chan *Invocation // func to instances
	doneChan  chan *Invocation // instances to func
	instances *list.List       // elements are *LambdaInstance; owned by the Task goroutine

	// send chan to the kill chan to destroy the instance, then
	// wait for msg on sent chan to block until it is done
	killChan chan chan bool
}
// Invoke hands an HTTP request to this function's Task goroutine and blocks
// until the invocation has completed. If the function's queue is already at
// capacity, it immediately responds with 429 so the client can back off.
func (f *LambdaFunc) Invoke(w http.ResponseWriter, r *http.Request) {
	t := common.T0("LambdaFunc.Invoke")
	defer t.T1()

	completed := make(chan bool)
	invocation := &Invocation{w: w, r: r, done: completed}

	// non-blocking send: enqueue only if the func task has room
	select {
	case f.funcChan <- invocation:
		// wait here until the func task signals the request is finished
		<-completed
	default:
		// no capacity; tell the client to retry later
		invocation.w.WriteHeader(http.StatusTooManyRequests)
		invocation.w.Write([]byte("lambda function queue is full\n"))
	}
}
// printf logs a message suffixed with the function name so that interleaved
// log output can be attributed to the right LambdaFunc.
func (f *LambdaFunc) printf(format string, args ...any) {
	body := strings.TrimRight(fmt.Sprintf(format, args...), "\n")
	log.Printf("%s [FUNC %s]", body, f.name)
}
// parseMeta reads in a requirements.txt file that was built from pip-compile.
//
// Each non-empty line (after stripping spaces and trailing "#" comments) is
// normalized and appended to meta.Installs. A missing requirements.txt is not
// an error: the returned meta simply has no installs.
//
// Returns nil meta and a non-nil error if the file cannot be opened or read.
func parseMeta(codeDir string) (meta *sandbox.SandboxMeta, err error) {
	meta = &sandbox.SandboxMeta{
		Installs: []string{},
		Imports:  []string{},
	}

	path := filepath.Join(codeDir, "requirements.txt")
	file, err := os.Open(path)
	if errors.Is(err, os.ErrNotExist) {
		// having a requirements.txt is optional
		return meta, nil
	} else if err != nil {
		return nil, err
	}
	defer file.Close()

	scnr := bufio.NewScanner(file)
	for scnr.Scan() {
		// drop all spaces, then anything after a "#" comment marker
		line := strings.ReplaceAll(scnr.Text(), " ", "")
		pkg := strings.Split(line, "#")[0]
		if pkg != "" {
			pkg = packages.NormalizePkg(pkg)
			meta.Installs = append(meta.Installs, pkg)
		}
	}
	// FIX: previously scanner errors (e.g., I/O failures or lines exceeding
	// the scanner's buffer limit) were silently ignored, which could yield a
	// truncated install list; surface them to the caller instead.
	if err := scnr.Err(); err != nil {
		return nil, err
	}

	return meta, nil
}
// pullHandlerIfStale asks the HandlerPuller for newer lambda code (at most
// once per Registry_cache_ms) and, if new code arrives, validates it before
// switching the function over to it.
//
// if there is any error:
// 1. we won't switch to the new code
// 2. we won't update pull time (so we'll check for a fix next time)
func (f *LambdaFunc) pullHandlerIfStale() (err error) {
	// check if there is newer code, download it if necessary
	now := time.Now()
	cacheNs := int64(common.Conf.Registry_cache_ms) * 1000000
	// should we check for new code? (skip if the last pull is still fresh)
	if f.lastPull != nil && int64(now.Sub(*f.lastPull)) < cacheNs {
		return nil
	}
	// is there new code?
	rtType, codeDir, err := f.lmgr.HandlerPuller.Pull(f.name)
	if err != nil {
		return err
	}
	// same directory means the code we already run is current
	if codeDir == f.codeDir {
		return nil
	}
	f.rtType = rtType
	// on any later failure, delete the new code dir instead of switching to
	// it; this reads the named return err, so it sees errors from the
	// returns below
	defer func() {
		if err != nil {
			if err := os.RemoveAll(codeDir); err != nil {
				log.Printf("could not cleanup %s after failed pull\n", codeDir)
			}
			if rtType == common.RT_PYTHON {
				// we dirty this dir (e.g., by setting up
				// symlinks to packages), so we want the
				// HandlerPuller to give us a new one next
				// time, even if the code hasn't changed
				f.lmgr.HandlerPuller.Reset(f.name)
			}
		}
	}()
	if rtType == common.RT_PYTHON {
		// inspect new code for dependencies; if we can install
		// everything necessary, start using new code
		meta, err := parseMeta(codeDir)
		if err != nil {
			return err
		}
		// make sure all specified dependencies are installed
		// (but don't recursively find others)
		for _, pkg := range meta.Installs {
			if _, err := f.lmgr.PackagePuller.GetPkg(pkg); err != nil {
				return err
			}
		}
		f.lmgr.DepTracer.TraceFunction(codeDir, meta.Installs)
		f.meta = meta
	} else if rtType == common.RT_NATIVE {
		log.Printf("Got native function")
	}
	// success: switch over and remember when we pulled
	f.codeDir = codeDir
	f.lastPull = &now
	return nil
}
// this Task receives lambda requests, fetches new lambda code as
// needed, and dispatches to a set of lambda instances. Task also
// monitors outstanding requests, and scales the number of instances
// up or down as needed.
//
// communication for a given request is as follows (each of the four
// transfers are commented within the function):
//
// client -> function -> instance -> function -> client
//
// each of the 4 handoffs above is over a chan. In order, those chans are:
// 1. LambdaFunc.funcChan
// 2. LambdaFunc.instChan
// 3. LambdaFunc.doneChan
// 4. Invocation.done
//
// If either LambdaFunc.funcChan or LambdaFunc.instChan is full, we
// respond to the client with a backoff message: StatusTooManyRequests
func (f *LambdaFunc) Task() {
	f.printf("debug: LambdaFunc.Task() runs on goroutine %d", common.GetGoroutineID())
	// we want to perform various cleanup actions, such as killing
	// instances and deleting old code. We want to do these
	// asynchronously, but in order. Thus, we use a chan to get
	// FIFO behavior and a single cleanup task to get async.
	//
	// two types can be sent to this chan:
	//
	// 1. string: this is a path to be deleted
	//
	// 2. chan: this is a signal chan that corresponds to
	// previously initiated cleanup work. We block until we
	// receive the complete signal, before proceeding to
	// subsequent cleanup tasks in the FIFO.
	cleanupChan := make(chan any, 32)
	cleanupTaskDone := make(chan bool)
	go func() {
		for {
			msg, ok := <-cleanupChan
			if !ok {
				// chan closed (by the kill path below): report done and exit
				cleanupTaskDone <- true
				return
			}
			switch op := msg.(type) {
			case string:
				if err := os.RemoveAll(op); err != nil {
					f.printf("Async code cleanup could not delete %s, even after all instances using it killed: %v", op, err)
				}
			case chan bool:
				// block until a previously initiated instance kill completes
				<-op
			}
		}
	}()
	// stats for autoscaling
	outstandingReqs := 0           // requests handed to instances but not yet done
	execMs := common.NewRollingAvg(10) // rolling average of per-request exec time
	var lastScaling *time.Time     // time of the last up/down scaling action; nil before the first
	timeout := time.NewTimer(0)
	for {
		select {
		case <-timeout.C:
			// timer fired just to re-run the autoscaling logic below;
			// nothing to do until we have code
			if f.codeDir == "" {
				continue
			}
		case req := <-f.funcChan:
			// msg: client -> function
			// check for new code, and cleanup old code
			// (and instances that use it) if necessary
			oldCodeDir := f.codeDir
			if err := f.pullHandlerIfStale(); err != nil {
				f.printf("Error checking for new lambda code at `%s`: %v", f.codeDir, err)
				req.w.WriteHeader(http.StatusInternalServerError)
				req.w.Write([]byte(err.Error() + "\n"))
				req.done <- true
				continue
			}
			if oldCodeDir != "" && oldCodeDir != f.codeDir {
				// code changed: kill every instance running the old code
				el := f.instances.Front()
				for el != nil {
					waitChan := el.Value.(*LambdaInstance).AsyncKill()
					cleanupChan <- waitChan
					el = el.Next()
				}
				f.instances = list.New()
				// cleanupChan is a FIFO, so this will
				// happen after the cleanup task waits
				// for all instance kills to finish
				cleanupChan <- oldCodeDir
			}
			f.lmgr.DepTracer.TraceInvocation(f.codeDir)
			select {
			case f.instChan <- req:
				// msg: function -> instance
				outstandingReqs++
			default:
				// queue cannot accept more, so reply with backoff
				req.w.WriteHeader(http.StatusTooManyRequests)
				req.w.Write([]byte("lambda instance queue is full\n"))
				req.done <- true
			}
		case req := <-f.doneChan:
			// msg: instance -> function
			execMs.Add(req.execMs)
			outstandingReqs--
			// msg: function -> client
			req.done <- true
		case done := <-f.killChan:
			// signal all instances to die, then wait for
			// cleanup task to finish and exit
			el := f.instances.Front()
			for el != nil {
				waitChan := el.Value.(*LambdaInstance).AsyncKill()
				cleanupChan <- waitChan
				el = el.Next()
			}
			if f.codeDir != "" {
				// NOTE(review): code-dir deletion on kill is deliberately(?)
				// disabled here — confirm whether the dir is cleaned up
				// elsewhere before re-enabling.
				//cleanupChan <- f.codeDir
			}
			close(cleanupChan)
			<-cleanupTaskDone
			done <- true
			return
		}
		// POLICY: how many instances (i.e., virtual sandboxes) should we allocate?
		// AUTOSCALING STEP 1: decide how many instances we want
		// let's aim to have 1 sandbox per 10ms of outstanding work
		// TODO make this configurable
		inProgressWorkMs := outstandingReqs * execMs.Avg
		desiredInstances := inProgressWorkMs / 10
		// if we have, say, one job that will take 100
		// seconds, spinning up 100 instances won't do any
		// good, so cap by number of outstanding reqs
		if outstandingReqs < desiredInstances {
			desiredInstances = outstandingReqs
		}
		// always try to have one instance
		if desiredInstances < 1 {
			desiredInstances = 1
		}
		// AUTOSCALING STEP 2: tweak how many instances we have, to get closer to our goal
		// make at most one scaling adjustment per 100ms
		adjustFreq := time.Millisecond * 100
		now := time.Now()
		if lastScaling != nil {
			elapsed := now.Sub(*lastScaling)
			if elapsed < adjustFreq {
				// too soon to adjust; schedule a wakeup for when we may
				if desiredInstances != f.instances.Len() {
					timeout = time.NewTimer(adjustFreq - elapsed)
				}
				continue
			}
		}
		// kill or start at most one instance to get closer to
		// desired number
		if f.instances.Len() < desiredInstances {
			f.printf("increase instances to %d", f.instances.Len()+1)
			f.newInstance()
			lastScaling = &now
		} else if f.instances.Len() > desiredInstances {
			f.printf("reduce instances to %d", f.instances.Len()-1)
			waitChan := f.instances.Back().Value.(*LambdaInstance).AsyncKill()
			f.instances.Remove(f.instances.Back())
			cleanupChan <- waitChan
			lastScaling = &now
		}
		if f.instances.Len() != desiredInstances {
			// we can only adjust slowly, so we want to
			// run through this loop again as soon as
			// possible, even if there are no requests to
			// service.
			timeout = time.NewTimer(adjustFreq)
		}
	}
}
// newInstance creates a LambdaInstance bound to the function's current code
// and metadata, registers it, and starts its Task goroutine. It panics if no
// code has been pulled yet, since instances cannot run without a code dir.
func (f *LambdaFunc) newInstance() {
	if f.codeDir == "" {
		panic("cannot start instance until code has been fetched")
	}

	inst := &LambdaInstance{
		lfunc:    f,
		codeDir:  f.codeDir,
		meta:     f.meta,
		killChan: make(chan chan bool, 1),
	}
	f.instances.PushBack(inst)

	go inst.Task()
}
// Kill tells the function's Task goroutine to shut down (killing all of its
// instances) and blocks until that cleanup has completed.
func (f *LambdaFunc) Kill() {
	ack := make(chan bool)
	f.killChan <- ack
	<-ack
}