forked from feast-dev/feast
-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathingest.py
More file actions
145 lines (113 loc) · 4.53 KB
/
ingest.py
File metadata and controls
145 lines (113 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import logging
from functools import partial
from multiprocessing import Pool
from typing import Iterable, List
import pandas as pd
import pyarrow.parquet as pq
from feast.constants import DATETIME_COLUMN
from feast.feature_set import FeatureSet
from feast.type_map import (
pa_column_to_timestamp_proto_column,
pa_column_to_proto_column,
)
from feast.types import Field_pb2 as FieldProto
from feast.types.FeatureRow_pb2 import FeatureRow
_logger = logging.getLogger(__name__)
GRPC_CONNECTION_TIMEOUT_DEFAULT = 3 # type: int
GRPC_CONNECTION_TIMEOUT_APPLY = 300 # type: int
FEAST_SERVING_URL_ENV_KEY = "FEAST_SERVING_URL" # type: str
FEAST_CORE_URL_ENV_KEY = "FEAST_CORE_URL" # type: str
BATCH_FEATURE_REQUEST_WAIT_TIME_SECONDS = 300
KAFKA_CHUNK_PRODUCTION_TIMEOUT = 120 # type: int
def _encode_pa_tables(file: str, fs: FeatureSet, row_group_idx: int) -> List[bytes]:
    """
    Encode one row group of a parquet file into serialized FeatureRow protobufs.

    Args:
        file (str):
            Path of the parquet file to read. The file must contain at
            least one row group.
        fs (feast.feature_set.FeatureSet):
            FeatureSet describing the parquet file's columns.
        row_group_idx (int):
            Index of the row group to read and encode.

    Returns:
        List[bytes]:
            Serialized FeatureRow protobufs, one per row in the row group.
    """
    # Read only the requested row group as a PyArrow table.
    table = pq.ParquetFile(file).read_row_group(row_group_idx)

    # Event timestamps come from the dedicated datetime column.
    timestamps = pa_column_to_timestamp_proto_column(table.column(DATETIME_COLUMN))

    # Convert each feature column to proto values up front — one pass per
    # column instead of one conversion per cell inside the row loop.
    converted = {
        name: pa_column_to_proto_column(field.dtype, table.column(name))
        for name, field in fs.fields.items()
    }

    feature_set_ref = f"{fs.project}/{fs.name}:{fs.version}"

    # Hoist attribute lookups out of the hot per-row loop.
    make_field = FieldProto.Field
    column_items = converted.items()

    encoded_rows = []
    for idx in range(table.num_rows):
        row = FeatureRow(
            event_timestamp=timestamps[idx], feature_set=feature_set_ref
        )
        add_fields = row.fields.extend
        for name, values in column_items:
            add_fields([make_field(name=name, value=values[idx])])
        encoded_rows.append(row.SerializeToString())
    return encoded_rows
def get_feature_row_chunks(
    file: str, row_groups: List[int], fs: FeatureSet, max_workers: int
) -> Iterable[List[bytes]]:
    """
    Iterator function to encode a PyArrow table read from a parquet file to
    FeatureRow(s).

    Each requested row group is encoded in a worker process; chunks are
    yielded in row-group order as they become available.

    Args:
        file (str):
            File directory of the parquet file. The parquet file must have
            more than one row group.
        row_groups (List[int]):
            Specific row group indexes to be read and transformed in the
            parquet file.
        fs (feast.feature_set.FeatureSet):
            FeatureSet describing parquet files.
        max_workers (int):
            Maximum number of workers to spawn.

    Returns:
        Iterable[List[bytes]]:
            Iterable list of byte encoded FeatureRow(s).
    """
    pool = Pool(max_workers)
    try:
        func = partial(_encode_pa_tables, file, fs)
        # imap preserves row-group order while encoding in parallel.
        yield from pool.imap(func, row_groups)
    finally:
        # Fix: the original never released the pool, leaking worker
        # processes. terminate() (rather than close()) also covers the case
        # where the consumer abandons the generator before exhausting it.
        pool.terminate()
        pool.join()
def validate_dataframe(dataframe: pd.DataFrame, feature_set: "FeatureSet"):
    """
    Validate that a dataframe contains every column a feature set requires.

    Checks, in order, for the "datetime" column, then each entity column,
    then each feature column, raising on the first missing one.

    Args:
        dataframe (pd.DataFrame):
            Dataframe to validate.
        feature_set (feast.feature_set.FeatureSet):
            FeatureSet whose entities and features must be present as
            dataframe columns.

    Raises:
        ValueError: If the "datetime" column, an entity column, or a
            feature column is missing from the dataframe.
    """
    columns = dataframe.columns
    if "datetime" not in columns:
        # Fix: the original message called "datetime" an "entity"; it is
        # the required event-timestamp column, not an entity.
        raise ValueError(
            f'Dataframe does not contain column "datetime" in columns {columns}'
        )
    for entity in feature_set.entities:
        if entity.name not in columns:
            raise ValueError(
                f"Dataframe does not contain entity {entity.name} in columns {columns}"
            )
    for feature in feature_set.features:
        if feature.name not in columns:
            raise ValueError(
                f"Dataframe does not contain feature {feature.name} in columns {columns}"
            )