forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathjob.py
More file actions
301 lines (251 loc) · 9.69 KB
/
job.py
File metadata and controls
301 lines (251 loc) · 9.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import tempfile
import time
from datetime import datetime, timedelta
from typing import Iterator
from typing import List
from urllib.parse import urlparse

import fastavro
import pandas as pd
from google.cloud import storage
from google.protobuf.json_format import MessageToJson

from feast.core.CoreService_pb2 import ListIngestionJobsRequest
from feast.core.CoreService_pb2_grpc import CoreServiceStub
from feast.core.IngestionJob_pb2 import IngestionJob as IngestJobProto
from feast.core.IngestionJob_pb2 import IngestionJobStatus
from feast.core.Store_pb2 import Store
from feast.feature_set import FeatureSet
from feast.serving.ServingService_pb2 import (
    DATA_FORMAT_AVRO,
    JOB_STATUS_DONE,
    GetJobRequest,
)
from feast.serving.ServingService_pb2 import Job as JobProto
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
from feast.source import Source
# Maximum number of seconds to wait for a retrieval job's status to become
# DONE in Feast. Currently set to the maximum query execution time limit in
# BigQuery (21600 seconds = 6 hours).
DEFAULT_TIMEOUT_SEC: int = 21600
# Upper bound, in seconds, on the pause between two consecutive reloads of a
# job's status; the polling loops in this module back off exponentially up to
# this value.
MAX_WAIT_INTERVAL_SEC: int = 60
class RetrievalJob:
    """
    A class representing a job for feature retrieval in Feast.

    Wraps a serving ``Job`` proto, polls Feast Serving until the job is done
    and exposes the resulting Avro files as rows or Pandas DataFrames.
    """

    def __init__(self, job_proto: JobProto, serving_stub: ServingServiceStub):
        """
        Args:
            job_proto: Job proto object (wrapped by this job object)
            serving_stub: Stub for Feast serving service
        """
        self.job_proto = job_proto
        self.serving_stub = serving_stub
        self.storage_client = storage.Client(project=None)

    @property
    def id(self):
        """
        Getter for the Job Id
        """
        return self.job_proto.id

    @property
    def status(self):
        """
        Getter for the Job status as recorded on the locally held job proto.

        This does not contact Feast; call reload() first to refresh it.
        """
        return self.job_proto.status

    def reload(self):
        """
        Reload the latest job status from Feast Serving.

        Returns: None
        """
        self.job_proto = self.serving_stub.GetJob(GetJobRequest(job=self.job_proto)).job

    def get_avro_files(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get the file uri to Avro result files on
        Google Cloud Storage.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            list: Parsed URIs (as returned by ``urllib.parse.urlparse``) of
            the Avro result files, one per entry in the job's ``file_uris``.

        Raises:
            Exception: If the timeout is exceeded, if the job reports an
                error, or if the job's data format is not Avro.
        """
        max_wait_datetime = datetime.now() + timedelta(seconds=timeout_sec)
        wait_duration_sec = 2

        while self.status != JOB_STATUS_DONE:
            if datetime.now() > max_wait_datetime:
                raise Exception(
                    "Timeout exceeded while waiting for result. Please retry "
                    "this method or use a longer timeout value."
                )

            self.reload()
            if self.status == JOB_STATUS_DONE:
                # The job just finished: do not sleep another backoff interval.
                break

            time.sleep(wait_duration_sec)
            # Backoff the wait duration exponentially up till MAX_WAIT_INTERVAL_SEC
            wait_duration_sec = min(wait_duration_sec * 2, MAX_WAIT_INTERVAL_SEC)

        if self.job_proto.error:
            raise Exception(self.job_proto.error)

        if self.job_proto.data_format != DATA_FORMAT_AVRO:
            raise Exception(
                "Feast only supports Avro data format for now. Please check "
                "your Feast Serving deployment."
            )

        return [urlparse(uri) for uri in self.job_proto.file_uris]

    def _open_result_file(self, file_uri):
        """
        Open a single result file for binary reading, positioned at offset 0.

        Args:
            file_uri: Parsed URI of the file. ``gs://`` files are downloaded
                into an anonymous temporary file, ``file://`` files are opened
                in place.

        Returns:
            A readable binary file object that the caller must close.

        Raises:
            Exception: If the URI scheme is neither ``gs`` nor ``file``.
        """
        if file_uri.scheme == "gs":
            file_obj = tempfile.TemporaryFile()
            try:
                self.storage_client.download_blob_to_file(file_uri.geturl(), file_obj)
                # Rewind so the reader starts at the beginning of the download.
                file_obj.seek(0)
            except Exception:
                # Do not leak the temporary file when the download fails.
                file_obj.close()
                raise
            return file_obj

        if file_uri.scheme == "file":
            return open(file_uri.path, "rb")

        raise Exception(
            f"Could not identify file URI {file_uri}. Only gs:// and file:// supported"
        )

    def result(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get an iterable rows of result. The row can
        only represent an Avro row in Feast 0.3.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            Iterable of Avro rows.
        """
        uris = self.get_avro_files(timeout_sec)
        for file_uri in uris:
            # The context manager closes each file once its rows are consumed
            # (or when the generator is closed early).
            with self._open_result_file(file_uri) as file_obj:
                yield from fastavro.reader(file_obj)

    def to_dataframe(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC) -> pd.DataFrame:
        """
        Wait until a job is done and load all result rows into a single
        Pandas DataFrame.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            pd.DataFrame:
                Pandas DataFrame of the feature values.
        """
        records = list(self.result(timeout_sec=timeout_sec))
        return pd.DataFrame.from_records(records)

    def to_chunked_dataframe(
        self, max_chunk_size: int = -1, timeout_sec: int = DEFAULT_TIMEOUT_SEC
    ) -> Iterator[pd.DataFrame]:
        """
        Wait until a job is done to get an iterable rows of result. This method
        will split the response into chunked DataFrames of a specified size to
        be yielded to the instance calling it.

        Args:
            max_chunk_size (int):
                Maximum number of rows that each DataFrame should contain.
                With the default of -1 the chunk size is never reached, so
                all rows are yielded as one DataFrame.
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Yields:
            pd.DataFrame:
                Pandas DataFrame holding the next chunk of feature values.
                Nothing is yielded when the job produced no rows.
        """
        records = []
        for result in self.result(timeout_sec=timeout_sec):
            records.append(result)
            if len(records) == max_chunk_size:
                yield pd.DataFrame.from_records(records)
                # Start a fresh list for the next chunk.
                records = []

        # Handle for last chunk that is < max_chunk_size
        if records:
            yield pd.DataFrame.from_records(records)

    def __iter__(self):
        """Iterate over the result rows, waiting with the default timeout."""
        return iter(self.result())
class IngestJob:
    """
    Defines a job for feature ingestion in feast.
    """

    def __init__(self, job_proto: IngestJobProto, core_stub: CoreServiceStub):
        """
        Construct a native ingest job from its protobuf version.

        Args:
            job_proto: Job proto object to construct from.
            core_stub: stub for Feast CoreService
        """
        self.proto = job_proto
        self.core_svc = core_stub

    def reload(self):
        """
        Update this IngestJob with the latest info from Feast
        """
        # pull latest proto from feast core
        response = self.core_svc.ListIngestionJobs(
            ListIngestionJobsRequest(filter=ListIngestionJobsRequest.Filter(id=self.id))
        )
        # NOTE(review): assumes the id filter matches at least one job;
        # indexing an empty job list fails here.
        self.proto = response.jobs[0]

    @property
    def id(self) -> str:
        """
        Getter for IngestJob's job id.
        """
        return self.proto.id

    @property
    def external_id(self) -> str:
        """
        Getter for IngestJob's external job id.

        Reloads the job from Feast Core on every access.
        """
        self.reload()
        return self.proto.external_id

    @property
    def status(self) -> IngestionJobStatus:
        """
        Getter for IngestJob's status

        Reloads the job from Feast Core on every access.
        """
        self.reload()
        return self.proto.status

    @property
    def feature_sets(self) -> List[FeatureSet]:
        """
        Getter for the IngestJob's feature sets
        """
        # convert featureset protos to native objects
        return [FeatureSet.from_proto(fs) for fs in self.proto.feature_sets]

    @property
    def source(self) -> Source:
        """
        Getter for the IngestJob's data source.
        """
        return Source.from_proto(self.proto.source)

    @property
    def store(self) -> Store:
        """
        Getter for the IngestJob's target feast store.
        """
        return self.proto.store

    def wait(self, status: IngestionJobStatus, timeout_secs: float = 300):
        """
        Wait for this IngestJob to transition to the given status.

        The status is polled with an exponentially growing pause that is
        capped at MAX_WAIT_INTERVAL_SEC and never sleeps past the deadline.

        Args:
            status: The IngestionJobStatus to wait for.
            timeout_secs: Maximum seconds to wait before timing out.

        Raises:
            TimeoutError: If the job has not reached the given status once
                the timeout has elapsed.
        """
        # A monotonic clock keeps the deadline immune to wall clock changes.
        deadline = time.monotonic() + timeout_secs
        wait_secs = 2

        # Each evaluation of self.status reloads the job from Feast Core, so
        # the status is always checked once more after the final sleep and a
        # transition seen on that last poll is not reported as a timeout.
        while self.status != status:
            remaining_secs = deadline - time.monotonic()
            if remaining_secs <= 0:
                raise TimeoutError("Wait for IngestJob's status to transition timed out")

            time.sleep(min(wait_secs, remaining_secs))
            # back off wait duration exponentially, capped at MAX_WAIT_INTERVAL_SEC
            wait_secs = min(wait_secs * 2, MAX_WAIT_INTERVAL_SEC)

    def __str__(self):
        """Render the freshly reloaded ingest job proto as a JSON string."""
        self.reload()
        return MessageToJson(self.proto)

    def __repr__(self):
        """Render a short identifier of the ingest job, without reloading."""
        return f"IngestJob<{self.id}>"