-
Notifications
You must be signed in to change notification settings - Fork 242
Expand file tree
/
Copy path time_partition_pattern_samples.py
More file actions
108 lines (97 loc) · 4.7 KB
/
time_partition_pattern_samples.py
File metadata and controls
108 lines (97 loc) · 4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Code samples for time partition pattern
from feathr import AvroJsonSchema
from feathr import KafKaSource
from feathr import KafkaConfig
from typing import List
import os
import random
from datetime import datetime, timedelta
from feathr import (BOOLEAN, FLOAT, INPUT_CONTEXT, INT32, STRING,
FeatureQuery, ObservationSettings, Feature, FeatureAnchor, HdfsSource,
TypedKey, ValueType, WindowAggTransformation, BackfillTime,
MaterializationSettings,HdfsSink, Constants)
from feathr import FeathrClient
from feathr.utils.job_utils import get_result_df
# --- Shared configuration ------------------------------------------------
# replace by your config path
config_path = "feathr_config.yaml"
# replace by your own data source path
data_source_path = 'dbfs:/timePartitionPattern_postfix_test/df0/daily/'
# replace by your own postfix path
postfix_path = "postfixPath"

# --- Example: materialization job using 'time_partition_pattern' ---------
# The source path is resolved per day as <path>/<yyyy/MM/dd>/<postfix_path>.
client = FeathrClient(config_path=config_path)
batch_source = HdfsSource(
    name="testTimePartitionSource",
    path=data_source_path,
    time_partition_pattern="yyyy/MM/dd",
    postfix_path=postfix_path,
)

# Entity key of the anchored features.
key = TypedKey(key_column="key0", key_column_type=ValueType.INT32)

# Two window-aggregation features over a 3-day window.
agg_features = [
    Feature(
        name="f_loc_avg_output",
        key=[key],
        feature_type=FLOAT,
        transform=WindowAggTransformation(
            agg_expr="f_location_avg_fare", agg_func="AVG", window="3d"
        ),
    ),
    Feature(
        name="f_loc_max_output",
        key=[key],
        feature_type=FLOAT,
        transform=WindowAggTransformation(
            agg_expr="f_location_max_fare", agg_func="MAX", window="3d"
        ),
    ),
]
agg_anchor = FeatureAnchor(
    name="testTimePartitionFeatures",
    source=batch_source,
    features=agg_features,
)
client.build_features(anchor_list=[agg_anchor])

# Backfill a single day (2020-05-02) in one daily step.
backfill_time_pf = BackfillTime(
    start=datetime(2020, 5, 2),
    end=datetime(2020, 5, 2),
    step=timedelta(days=1),
)
now = datetime.now()
# replace by your own output path
output_path_pf = f"dbfs:/feathrazure_cijob_materialize_offline__{now.minute}_{now.second}"
offline_sink_pf = HdfsSink(output_path=output_path_pf)
settings_pf = MaterializationSettings(
    "nycTaxiTable",
    sinks=[offline_sink_pf],
    feature_names=["f_loc_avg_output", "f_loc_max_output"],
    backfill_time=backfill_time_pf,
)
client.materialize_features(settings_pf)
client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

# The sink mirrors the source's partition layout; read back the one
# backfilled day and check that rows were produced.
res_df = get_result_df(
    client,
    data_format="avro",
    res_url=output_path_pf + "/df0/daily/2020/05/02",
)
assert res_df.shape[0] > 0
# --- Example: get-offline-features job using 'time_partition_pattern' ----
client = FeathrClient(config_path=config_path)
batch_source = HdfsSource(
    name="testTimePartitionSource",
    path=data_source_path,
    time_partition_pattern="yyyy/MM/dd",
    postfix_path=postfix_path,
)

# Key and a single LATEST-aggregation feature over a 3-day window.
tpp_key = TypedKey(
    key_column="f_location_max_fare",
    key_column_type=ValueType.FLOAT,
)
tpp_features = [
    Feature(
        name="key0",
        key=tpp_key,
        feature_type=FLOAT,
        transform=WindowAggTransformation(
            agg_expr="key0", agg_func="LATEST", window="3d"
        ),
    )
]
tpp_anchor = FeatureAnchor(
    name="tppFeatures",
    source=batch_source,
    features=tpp_features,
)
client.build_features(anchor_list=[tpp_anchor])

# Join the feature onto the observation data on tpp_key.
feature_query = FeatureQuery(feature_list=["key0"], key=tpp_key)
settings = ObservationSettings(
    observation_path='wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/tpp_source.csv',
    event_timestamp_column="lpep_dropoff_datetime",
    timestamp_format="yyyy-MM-dd HH:mm:ss",
)
# replace by your own output path
output_path = f"dbfs:/feathrazure_cijob_{now.minute}_{now.second}.avro"
client.get_offline_features(
    observation_settings=settings,
    feature_query=feature_query,
    output_path=output_path,
)
client.wait_job_to_finish(timeout_sec=Constants.SPARK_JOB_TIMEOUT_SECONDS)

# Sanity-check that the joined result is non-empty.
res_df = get_result_df(client, data_format="avro", res_url=output_path)
assert res_df.shape[0] > 0