-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlambda_function.py
More file actions
134 lines (106 loc) · 3.97 KB
/
lambda_function.py
File metadata and controls
134 lines (106 loc) · 3.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
Filter Bedrock model invocation logs for third-party sharing.
Reads JSONL (plain or gzip-compressed .json.gz) from the customer's raw Bedrock
logging bucket (S3 event trigger) and writes uncompressed filtered JSONL
to DEST_BUCKET with inputBodyJson and outputBodyJson omitted.
"""
from __future__ import annotations
import gzip
import json
import logging
import os
from urllib.parse import unquote_plus
try:
from botocore.exceptions import ClientError
except ImportError: # Allows unit tests to import without Lambda/AWS dependencies installed.
class ClientError(Exception):
pass
from sanitizer import LogSanitizer
# Root logger, with its level set to INFO.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Cached S3 client; stays None until _s3_client() creates it on first use.
S3 = None
# Bucket that filtered output is written to; lambda_handler raises if it is empty.
DEST_BUCKET = os.environ.get("DEST_BUCKET", "")
# Optional key prefix for written objects (trailing slashes removed).
DEST_PREFIX = os.environ.get("DEST_PREFIX", "").rstrip("/")
# Optional key prefix that incoming object keys must fall under (trailing slashes removed).
SOURCE_PREFIX = os.environ.get("SOURCE_PREFIX", "").rstrip("/")
def _s3_client():
    """Return the shared S3 client, creating it the first time it is requested.

    boto3 is imported inside the function rather than at module level, so
    importing this module does not by itself require boto3. The client is
    stored in the module-level ``S3`` variable and reused on later calls.
    """
    global S3
    if S3 is not None:
        return S3

    import boto3

    S3 = boto3.client("s3")
    return S3
def lambda_handler(event, context):
    """Write a sanitized copy of every S3 object referenced by *event*.

    For each entry in ``event["Records"]`` the bucket name and URL-decoded key
    are extracted. Entries that lack those fields are skipped with a warning,
    and keys outside ``SOURCE_PREFIX`` (when it is set) are skipped as well.
    Each remaining object is passed through ``_filter_object`` and uploaded to
    ``DEST_BUCKET`` under the key produced by ``_dest_key``.

    Args:
        event: Event payload; its ``"Records"`` list is read (absent means empty).
        context: Not used.

    Returns:
        A dict with ``statusCode`` 200 and a JSON ``body`` holding the number
        of objects written plus their source and destination URIs.

    Raises:
        ValueError: If ``DEST_BUCKET`` is empty.
        ClientError: Logged with the source location and re-raised; this
            aborts the remaining records. Other exceptions raised while
            processing an object propagate without that log entry.
    """
    if not DEST_BUCKET:
        raise ValueError("DEST_BUCKET environment variable is required")

    sanitizer = LogSanitizer()
    records = event.get("Records", [])
    logger.info("Processing %s S3 record(s)", len(records))

    # When set, uploads are encrypted with this KMS key.
    kms_key = os.environ.get("DEST_KMS_KEY_ID")

    written = []
    for entry in records:
        try:
            s3_info = entry["s3"]
            bucket = s3_info["bucket"]["name"]
            # The key is URL-decoded ('+' becomes a space) before use.
            key = unquote_plus(s3_info["object"]["key"])
        except (KeyError, TypeError) as exc:
            logger.warning("Skipping invalid S3 event record: %s", exc)
            continue

        in_scope = (
            not SOURCE_PREFIX
            or key == SOURCE_PREFIX
            or key.startswith(SOURCE_PREFIX + "/")
        )
        if not in_scope:
            logger.info("Skipping key outside SOURCE_PREFIX: %s", key)
            continue

        try:
            dest_key = _dest_key(key)
            upload = {
                "Bucket": DEST_BUCKET,
                "Key": dest_key,
                "Body": _filter_object(bucket, key, sanitizer),
                "ContentType": "application/x-ndjson",
            }
            if kms_key:
                upload.update(ServerSideEncryption="aws:kms", SSEKMSKeyId=kms_key)
            _s3_client().put_object(**upload)
            logger.info("Wrote shareable log to s3://%s/%s", DEST_BUCKET, dest_key)
            written.append(
                {"source": f"s3://{bucket}/{key}", "dest": f"s3://{DEST_BUCKET}/{dest_key}"}
            )
        except ClientError as exc:
            logger.exception("Failed processing s3://%s/%s: %s", bucket, key, exc)
            raise

    summary = {"processed": len(written), "objects": written}
    return {
        "statusCode": 200,
        "body": json.dumps(summary),
    }
def _strip_gzip_suffix(key: str) -> str:
    """Return *key* without a trailing ``.gz`` extension.

    The extension is matched case-insensitively, so a compressed log named
    ``*.json.gz`` maps to ``*.json``. Keys that do not end in ``.gz`` are
    returned unchanged.
    """
    gz_ext = ".gz"
    if key.lower().endswith(gz_ext):
        return key[: -len(gz_ext)]
    return key
def _dest_key(source_key: str) -> str:
    """Map a source object key to the key used in the destination bucket.

    Any ``.gz`` suffix is removed via ``_strip_gzip_suffix``. When
    ``DEST_PREFIX`` is non-empty the result is placed under ``DEST_PREFIX/``;
    otherwise the stripped key is returned as is.
    """
    stripped = _strip_gzip_suffix(source_key)
    return f"{DEST_PREFIX}/{stripped}" if DEST_PREFIX else stripped
def _read_object_bytes(bucket: str, key: str) -> bytes:
    """Download an S3 object and return its full contents as bytes.

    The whole body is read into memory. If *key* ends in ``.gz``
    (case-insensitive) the bytes are gzip-decompressed before being returned.
    The decision is based on the key name only, not on the content.
    """
    body = _s3_client().get_object(Bucket=bucket, Key=key)["Body"].read()
    is_gzipped = key.lower().endswith(".gz")
    return gzip.decompress(body) if is_gzipped else body
def _filter_object(bucket: str, key: str, sanitizer: LogSanitizer) -> bytes:
    """Fetch an object and return its lines after sanitization, as JSONL bytes.

    The object is read with ``_read_object_bytes`` and split into lines. Empty
    lines are ignored. Every other line is decoded as UTF-8 and passed to
    ``sanitizer.sanitize_line``; a line for which that call raises
    ``json.JSONDecodeError`` is dropped with a warning.

    Returns:
        The sanitized lines joined by newlines with a trailing newline,
        encoded as UTF-8, or ``b""`` when no line was kept.
    """
    sanitized: list[str] = []
    for chunk in _read_object_bytes(bucket, key).splitlines():
        if chunk:
            text = chunk.decode("utf-8")
            try:
                sanitized.append(sanitizer.sanitize_line(text))
            except json.JSONDecodeError:
                logger.warning("Skipping non-JSON line in %s", key)

    if sanitized:
        return ("\n".join(sanitized) + "\n").encode("utf-8")
    return b""