# Samples for feature naming conflicts check and handle import pytest from feathr import (TypedKey, ValueType, FeatureQuery, ObservationSettings, HdfsSource, Feature,WindowAggTransformation, FLOAT) from feathr import (FeathrClient, FeatureAnchor, ConflictsAutoCorrection) from feathr.utils.job_utils import get_result_df from datetime import datetime # Example for feature naming conflicts check from python client side # with no 'auto-correction' enabled # replace by your own config path client = client = FeathrClient("feathr_config.yaml") location_id = TypedKey(key_column="DOLocationID", key_column_type=ValueType.INT32, description="location id in NYC", full_name="nyc_taxi.location_id") feature_query = FeatureQuery( feature_list=["trip_distance","fare_amount"], key=location_id) # Defined feature names conflict with observation data set column names settings = ObservationSettings( observation_path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv", event_timestamp_column="lpep_dropoff_datetime", timestamp_format="yyyy-MM-dd HH:mm:ss") output_path = "wasbs://fake_path" with pytest.raises(RuntimeError) as e: client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path ) assert str(e.value) == "Feature names exist conflicts with dataset column names: trip_distance,fare_amount" # Defined feature names conflict with provided column names settings = ObservationSettings( observation_path="wasbs://public@fake_file", event_timestamp_column="lpep_dropoff_datetime", timestamp_format="yyyy-MM-dd HH:mm:ss") output_path = "wasbs://fakepath" with pytest.raises(RuntimeError) as e: client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path, dataset_column_names=set(('trip_distance','fare_amount')) ) assert str(e.value) == "Feature names exist conflicts with dataset column names: trip_distance,fare_amount" # Example for feature naming conflicts when auto-correction is enabled # replace by yout own confi path client = FeathrClient(config_path='feathr_config.yaml', local_workspace_dir="conflicts_test") batch_source = HdfsSource(name="nycTaxiBatchSource", path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv", event_timestamp_column="lpep_dropoff_datetime", timestamp_format="yyyy-MM-dd HH:mm:ss") location_id = TypedKey(key_column="DOLocationID", key_column_type=ValueType.INT32, description="location id in NYC", full_name="nyc_taxi.location_id") pu_location_id = TypedKey(key_column="PULocationID", key_column_type=ValueType.INT32, description="location id in NYC", full_name="nyc_taxi.location_id") agg_features = [Feature(name="tip_amount", key=[location_id, pu_location_id], feature_type=FLOAT, transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)", agg_func="AVG", window="3d")), Feature(name="total_amount", key=[location_id, pu_location_id], feature_type=FLOAT, transform=WindowAggTransformation(agg_expr="cast_float(fare_amount)", agg_func="MAX", window="3d")), ] agg_anchor = FeatureAnchor(name="aggregationFeatures", source=batch_source, features=agg_features) client.build_features(anchor_list=[agg_anchor]) now = datetime.now() # Feature names 'tip_amount' and 'total_amount' are conflicted with dataset columns # they will be renamed to 'tip_amount_test' and 'total_amoun_test' in the result feature_query = FeatureQuery( feature_list=["tip_amount", "total_amount"], key=location_id) settings = ObservationSettings( observation_path="wasbs://public@azurefeathrstorage.blob.core.windows.net/sample_data/green_tripdata_2020-04_with_index.csv", event_timestamp_column="lpep_dropoff_datetime", timestamp_format="yyyy-MM-dd HH:mm:ss", conflicts_auto_correction=ConflictsAutoCorrection(rename_features=True, suffix="test")) # replace by your own output path output_path = ''.join(['dbfs:/feathrazure_cijob','_', str(now.minute), '_', str(now.second), ".avro"]) client.get_offline_features(observation_settings=settings, feature_query=feature_query, output_path=output_path ) client.wait_job_to_finish(timeout_sec=500) res_df = get_result_df(client, data_format="avro", res_url = output_path) assert res_df.shape[0] > 0