forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathjob.py
More file actions
190 lines (156 loc) · 6.29 KB
/
job.py
File metadata and controls
190 lines (156 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import tempfile
import time
from datetime import datetime, timedelta
from typing import Iterable
from urllib.parse import urlparse
import fastavro
import pandas as pd
from google.cloud import storage
from feast.serving.ServingService_pb2 import GetJobRequest
from feast.serving.ServingService_pb2 import (
Job as JobProto,
JOB_STATUS_DONE,
DATA_FORMAT_AVRO,
)
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
# Maximum number of seconds to wait until the job status is DONE in Feast.
# Currently set to the maximum query execution time limit in BigQuery (6 hours).
DEFAULT_TIMEOUT_SEC: int = 21600
# Maximum number of seconds to wait between reloads of the job status in Feast.
MAX_WAIT_INTERVAL_SEC: int = 60
class Job:
    """
    A class representing a job for feature retrieval in Feast.

    Wraps a Feast serving ``Job`` proto and polls the serving service until
    the job is done, then exposes the Avro result files as rows, a single
    DataFrame, or chunked DataFrames.
    """

    def __init__(self, job_proto: JobProto, serving_stub: ServingServiceStub):
        """
        Args:
            job_proto: Job proto object (wrapped by this job object)
            serving_stub: Stub for Feast serving service
        """
        self.job_proto = job_proto
        self.serving_stub = serving_stub
        # GCS client for fetching gs:// result files; the project is
        # resolved from the environment (project=None).
        self.storage_client = storage.Client(project=None)

    @property
    def id(self):
        """
        Getter for the Job Id
        """
        return self.job_proto.id

    @property
    def status(self):
        """
        Getter for the Job status from Feast Core
        """
        return self.job_proto.status

    def reload(self):
        """
        Reload the latest job status

        Returns: None
        """
        self.job_proto = self.serving_stub.GetJob(GetJobRequest(job=self.job_proto)).job

    def get_avro_files(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get the file uri to Avro result files on
        Google Cloud Storage.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            list: Parsed uris (urllib.parse.ParseResult) of the returned
                Avro files.

        Raises:
            Exception: if the timeout is exceeded, if the job reports an
                error, or if the result data format is not Avro.
        """
        max_wait_datetime = datetime.now() + timedelta(seconds=timeout_sec)
        wait_duration_sec = 2
        while self.status != JOB_STATUS_DONE:
            if datetime.now() > max_wait_datetime:
                raise Exception(
                    "Timeout exceeded while waiting for result. Please retry "
                    "this method or use a longer timeout value."
                )
            self.reload()
            # Re-check immediately after reloading so we do not sleep for up
            # to MAX_WAIT_INTERVAL_SEC after the job has already finished.
            if self.status == JOB_STATUS_DONE:
                break
            time.sleep(wait_duration_sec)
            # Backoff the wait duration exponentially up till MAX_WAIT_INTERVAL_SEC
            wait_duration_sec = min(wait_duration_sec * 2, MAX_WAIT_INTERVAL_SEC)
        if self.job_proto.error:
            raise Exception(self.job_proto.error)
        if self.job_proto.data_format != DATA_FORMAT_AVRO:
            raise Exception(
                "Feast only supports Avro data format for now. Please check "
                "your Feast Serving deployment."
            )
        return [urlparse(uri) for uri in self.job_proto.file_uris]

    def result(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC):
        """
        Wait until job is done to get an iterable rows of result. The row can
        only represent an Avro row in Feast 0.3.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            Iterable of Avro rows (dict records).

        Raises:
            Exception: if a file uri scheme other than gs:// or file:// is
                encountered, or on any failure surfaced by get_avro_files.
        """
        uris = self.get_avro_files(timeout_sec)
        for file_uri in uris:
            if file_uri.scheme == "gs":
                file_obj = tempfile.TemporaryFile()
                self.storage_client.download_blob_to_file(file_uri.geturl(), file_obj)
            elif file_uri.scheme == "file":
                file_obj = open(file_uri.path, "rb")
            else:
                raise Exception(
                    f"Could not identify file URI {file_uri}. Only gs:// and file:// supported"
                )
            # Close each file once it is fully consumed; the original
            # implementation leaked every file object it opened.
            with file_obj:
                file_obj.seek(0)
                avro_reader = fastavro.reader(file_obj)
                for record in avro_reader:
                    yield record

    def to_dataframe(self, timeout_sec: int = DEFAULT_TIMEOUT_SEC) -> pd.DataFrame:
        """
        Wait until the job is done and collect the entire result into a
        single pandas DataFrame.

        Args:
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            pd.DataFrame:
                Pandas DataFrame of the feature values.
        """
        # from_records consumes the generator directly; no need to build an
        # intermediate list first.
        return pd.DataFrame.from_records(self.result(timeout_sec=timeout_sec))

    def to_chunked_dataframe(
        self, max_chunk_size: int = -1, timeout_sec: int = DEFAULT_TIMEOUT_SEC
    ) -> Iterable[pd.DataFrame]:
        """
        Wait until the job is done and yield the result split into chunked
        DataFrames of at most "max_chunk_size" rows each.

        Args:
            max_chunk_size (int):
                Maximum number of rows that each yielded DataFrame should
                contain. With the default (-1) no chunk boundary is ever
                reached, so a single DataFrame holding all rows is yielded.
            timeout_sec (int):
                Max no of seconds to wait until job is done. If "timeout_sec"
                is exceeded, an exception will be raised.

        Returns:
            Iterable[pd.DataFrame]:
                Pandas DataFrames of the feature values.
        """
        records = []
        for result in self.result(timeout_sec=timeout_sec):
            # BUG FIX: the original called result.append(records), which
            # mutated each record and never filled the buffer.
            records.append(result)
            if len(records) == max_chunk_size:
                yield pd.DataFrame.from_records(records)
                records = []  # Start a fresh buffer for the next chunk
        # BUG FIX: the original guard was inverted ("if not records"),
        # which dropped the final partial chunk and yielded an empty
        # DataFrame for empty results instead.
        if records:
            yield pd.DataFrame.from_records(records)

    def __iter__(self):
        """Iterate over the result rows using the default timeout."""
        return iter(self.result())