forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_source_utils.py
More file actions
56 lines (46 loc) · 1.74 KB
/
data_source_utils.py
File metadata and controls
56 lines (46 loc) · 1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import contextlib
import random
import tempfile
import time
from typing import Iterator

from google.cloud import bigquery

from feast import BigQuerySource, FileSource
from feast.data_format import ParquetFormat
@contextlib.contextmanager
def prep_file_source(df, event_timestamp_column=None) -> Iterator[FileSource]:
    """Context manager that yields a FileSource backed by a temporary parquet file.

    Fix: the function is a ``@contextlib.contextmanager`` generator, so it
    returns an iterator/context manager, not a bare ``FileSource`` — the
    annotation now reflects that.

    Args:
        df: DataFrame to persist as parquet (must support ``to_parquet``).
        event_timestamp_column: Optional column name forwarded to FileSource.

    Yields:
        A FileSource pointing at the temporary parquet file. The file is
        deleted when the ``with`` block around NamedTemporaryFile exits.
    """
    with tempfile.NamedTemporaryFile(suffix=".parquet") as f:
        # Close the handle immediately so pandas can reopen the path by name;
        # the outer `with` still removes the file on exit.
        # NOTE(review): closing before rewriting relies on POSIX unlink
        # semantics — this pattern does not work on Windows. TODO confirm
        # Windows support is not required here.
        f.close()
        df.to_parquet(f.name)
        file_source = FileSource(
            file_format=ParquetFormat(),
            path=f.name,
            event_timestamp_column=event_timestamp_column,
        )
        yield file_source
def simple_bq_source_using_table_ref_arg(
    df, event_timestamp_column=None
) -> BigQuerySource:
    """Upload `df` to a fresh BigQuery table and return a table-ref BigQuerySource.

    Creates a uniquely named dataset (timestamped) in the client's default
    project, sets a 60-minute default table expiration so test data is
    cleaned up automatically, loads the dataframe into a randomly named
    table, and returns a BigQuerySource referencing that table.

    Args:
        df: DataFrame to load into BigQuery.
        event_timestamp_column: optional column name forwarded to BigQuerySource.

    Returns:
        A BigQuerySource whose ``table_ref`` points at the newly loaded table.
    """
    client = bigquery.Client()
    # Uses the client's default project (from the environment's credentials).
    gcp_project = client.project
    # time_ns keeps dataset names unique across rapid successive test runs.
    bigquery_dataset = f"ds_{time.time_ns()}"
    dataset = bigquery.Dataset(f"{gcp_project}.{bigquery_dataset}")
    client.create_dataset(dataset, exists_ok=True)
    dataset.default_table_expiration_ms = (
        1000
        * 60
        * 60  # 60 minutes in milliseconds (seems to be minimum limit for gcloud)
    )
    # Expiration is set after creation, so it must be pushed via update_dataset.
    client.update_dataset(dataset, ["default_table_expiration_ms"])
    table_ref = f"{gcp_project}.{bigquery_dataset}.table_{random.randrange(100, 999)}"
    job = client.load_table_from_dataframe(df, table_ref)
    # Block until the load job finishes so the table exists before we return.
    job.result()
    return BigQuerySource(
        table_ref=table_ref, event_timestamp_column=event_timestamp_column,
    )
def simple_bq_source_using_query_arg(df, event_timestamp_column=None) -> BigQuerySource:
    """Upload `df` to BigQuery and return a query-based BigQuerySource.

    Delegates the table creation to ``simple_bq_source_using_table_ref_arg``
    and wraps the resulting table in a ``SELECT *`` query source.

    Args:
        df: DataFrame to load into BigQuery.
        event_timestamp_column: optional column name forwarded to BigQuerySource.

    Returns:
        A BigQuerySource driven by a query over the newly loaded table.
    """
    table_backed_source = simple_bq_source_using_table_ref_arg(
        df, event_timestamp_column
    )
    select_all = f"SELECT * FROM {table_backed_source.table_ref}"
    return BigQuerySource(
        query=select_all,
        event_timestamp_column=event_timestamp_column,
    )