forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbq_util.py
More file actions
288 lines (232 loc) · 9.55 KB
/
Copy pathbq_util.py
File metadata and controls
288 lines (232 loc) · 9.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
# Copyright 2018 The Feast Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import time
from datetime import datetime
import fastavro
import pandas as pd
import pytz
from google.cloud import bigquery, bigquery_storage_v1beta1
from google.cloud import storage
from google.cloud.bigquery.client import Client as BQClient
from google.cloud.bigquery.job import ExtractJobConfig, DestinationFormat
from google.cloud.bigquery.table import Table
from google.cloud.exceptions import NotFound
from google.cloud.storage import Client as GCSClient
from feast.sdk.resources.feature_set import FileType
from feast.sdk.utils.gs_utils import is_gs_path, split_gs_path, gcs_to_df
def head(client, table, max_rows=10):
"""Get the head of the table. Retrieves rows from the given table at
minimum cost
Args:
client (google.cloud.bigquery.client.Client): bigquery client
table (google.cloud.bigquery.table.Table): bigquery table to get the
head of
max_rows (int, optional): Defaults to 10. maximum number of rows to
retrieve
Returns:
pandas.DataFrame: dataframe containing the head of rows
"""
rows = client.list_rows(table, max_results=max_rows)
rows = [x for x in rows]
return pd.DataFrame(
data=[list(x.values()) for x in rows], columns=list(rows[0].keys())
)
def get_table_name(feature_id, storage_spec):
"""
Get fully qualified BigQuery table name from a feature ID and its
storage spec
Args:
feature_id(str): ID of a feature
storage_spec(feast.specs.StorageSpec_pb2.StorageSpec): storage spec of
the feature
Returns:
str: fully qualified table name of the feature.
"""
if "bigquery" != storage_spec.type:
raise ValueError("storage spec is not BigQuery storage spec")
try:
project = storage_spec.options["project"]
dataset = storage_spec.options["dataset"]
except KeyError:
raise ValueError("storage spec has empty project or dataset option")
table_name = feature_id.split(".")[0]
return ".".join([project, dataset, table_name])
def get_default_templocation(bigquery_client, project=None):
if project is None:
project = bigquery_client.project
assert isinstance(project, str)
assert len(project) > 0
storage_client = storage.Client()
default_bucket_name = f"feast-templocation-{project}"
try:
storage_client.get_bucket(default_bucket_name)
except NotFound:
print(
f'Default bucket "{default_bucket_name}" not found. Attempting to create it.'
)
storage_client.create_bucket(bucket_name=default_bucket_name, project=project)
return f"gs://{default_bucket_name}"
def query_to_dataframe(
query: str,
bigquery_client: bigquery.Client = None,
storage_client: storage.Client = None,
project: str = None,
templocation: str = None,
) -> pd.DataFrame:
"""
Run a query job on BigQuery and return the result in Pandas DataFrame format
Args:
query: BigQuery query e.g. "SELECT * FROM dataset.table"
bigquery_client:
storage_client:
project: Google Cloud project id
templocation: Google Cloud Storage location to store intermediate files, must start with "gs://"
Returns: Pandas DataFrame of the query result
"""
if isinstance(templocation, str) and not templocation.startswith("gs://"):
raise RuntimeError('templocation must start with "gs://"')
if bigquery_client is None:
bigquery_client = bigquery.Client(project=project)
if project is None:
project = bigquery_client.project
query_job = bigquery_client.query(query, project=project)
query_job_state = ""
while not query_job.done():
if query_job.state != query_job_state:
print(f"Query status: {query_job.state}")
query_job_state = query_job.state
time.sleep(5)
if query_job.state != query_job_state:
print(f"Query status: {query_job.state}")
if query_job.exception():
raise query_job.exception()
if not templocation:
templocation = get_default_templocation(bigquery_client, project=project)
if templocation.endswith("/"):
templocation += templocation[:-1]
destination_uri = (
f"{templocation}/bq-{datetime.now(pytz.utc).strftime('%Y%m%dT%H%M%SZ')}.avro"
)
extract_job_config = bigquery.job.ExtractJobConfig(destination_format="AVRO")
extract_job = bigquery_client.extract_table(
query_job.destination, destination_uri, job_config=extract_job_config
)
while not extract_job.done():
time.sleep(5)
if extract_job.exception():
raise extract_job.exception()
if not storage_client:
storage_client = storage.Client(project=project)
print("Reading query result into DataFrame")
bucket_name, blob_name = (
destination_uri.split("/")[2],
"/".join(destination_uri.split("/")[3:]),
)
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.get_blob(blob_name)
downloaded_avro_filename = tempfile.NamedTemporaryFile().name
blob.download_to_filename(downloaded_avro_filename)
with open(downloaded_avro_filename, "rb") as avro_file:
avro_reader = fastavro.reader(avro_file)
df = pd.DataFrame.from_records(avro_reader)
return df
class TableDownloader:
def __init__(self):
self._bqclient = None
self._storageclient = None
self._bqstorageclient = None
@property
def storageclient(self):
if self._storageclient is None:
self._storageclient = GCSClient()
return self._storageclient
@property
def bqclient(self):
if self._bqclient is None:
self._bqclient = BQClient()
return self._bqclient
@property
def bqstorageclient(self):
if self._bqstorageclient is None:
self._bqstorageclient = bigquery_storage_v1beta1.BigQueryStorageClient()
return self._bqstorageclient
def download_table_as_file(
self, full_table_id, dest, file_type, staging_location=None
):
"""
Download a bigquery table as file
Args:
full_table_id (str): fully qualified BigQuery table id
dest (str): destination filename
file_type (feast.sdk.resources.feature_set.FileType): (default:
FileType.CSV) exported file format
staging_location (str, optional): url to staging_location (currently
support a folder in GCS)
Returns: (str) path to the downloaded file
"""
if not staging_location:
df = self.download_table_as_df(full_table_id)
if file_type == FileType.CSV:
df.to_csv(dest, index=False)
elif file_type == FileType.JSON:
df.to_json(dest, index=False)
else:
raise ValueError(
"Only FileType: CSV and JSON are supported for download_table_as_file without staging location"
)
return dest
if not is_gs_path(staging_location):
raise ValueError("staging_uri must be a directory in GCS")
temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
staging_file_path = os.path.join(staging_location, temp_file_name)
job_config = ExtractJobConfig()
job_config.destination_format = file_type
src_table = Table.from_string(full_table_id)
job = self.bqclient.extract_table(
src_table, staging_file_path, job_config=job_config
)
# await completion
job.result()
bucket_name, blob_name = split_gs_path(staging_file_path)
bucket = self.storageclient.get_bucket(bucket_name)
blob = bucket.blob(blob_name)
blob.download_to_filename(dest)
return dest
def download_table_as_df(self, full_table_id, staging_location=None):
"""
Download a BigQuery table as Pandas Dataframe
Args:
full_table_id (src) : fully qualified BigQuery table id
staging_location: url to staging_location (currently
support a folder in GCS)
Returns: pandas.DataFrame: dataframe of the training dataset
"""
if not staging_location:
table = bigquery.TableReference.from_string(full_table_id)
rows = self.bqclient.list_rows(table)
return rows.to_dataframe(bqstorage_client=self.bqstorageclient)
if not is_gs_path(staging_location):
raise ValueError("staging_uri must be a directory in GCS")
temp_file_name = "temp_{}".format(int(round(time.time() * 1000)))
staging_file_path = os.path.join(staging_location, temp_file_name)
job_config = ExtractJobConfig()
job_config.destination_format = DestinationFormat.CSV
job = self.bqclient.extract_table(
Table.from_string(full_table_id), staging_file_path, job_config=job_config
)
# await completion
job.result()
return gcs_to_df(staging_file_path)