Expected Behavior
The Spark Kafka Processor's `query_timeout` can be set to `None`.
Current Behavior
The Spark Kafka Processor's `query_timeout` cannot be set to `None`: `SparkProcessorConfig` rejects it with a `ValidationError`.
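For context, PySpark's own `StreamingQuery.awaitTermination(timeout=None)` already treats `None` as "block until the query stops", so `None` is a natural value to forward from the config. A minimal illustration of that PySpark behavior (how Feast forwards `query_timeout` internally is an assumption):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("await-termination-demo").getOrCreate()
# A trivial rate-source stream, used only to obtain a StreamingQuery handle.
query = spark.readStream.format("rate").load().writeStream.format("noop").start()

# awaitTermination's signature is awaitTermination(timeout=None): a numeric
# timeout returns after roughly that many seconds, while None blocks until
# the query stops, which is the behavior a None query_timeout would request.
query.awaitTermination(5)
query.stop()
```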
Steps to reproduce
Pass `query_timeout=None` to `SparkProcessorConfig`:
```python
from datetime import datetime
import time
import pandas as pd
from feast import FeatureStore
from feast.data_source import PushMode
import os
from pyspark.sql import SparkSession
from feast.infra.contrib.stream_processor import ProcessorConfig
from feast.infra.contrib.spark_kafka_processor import SparkProcessorConfig
from feast.infra.contrib.stream_processor import get_stream_processor_object
import sys


def preprocess_fn(rows: pd.DataFrame):
    print(f"df columns: {rows.columns}")
    print(f"df size: {rows.size}")
    print(f"df preview:\n{rows.head()}")
    return rows


def start_spark():
    store = FeatureStore(repo_path=".")
    # See https://spark.apache.org/docs/3.1.2/structured-streaming-kafka-integration.html#deploying
    # for notes on why we need this environment variable.
    # os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 pyspark-shell"
    spark = SparkSession.builder.appName("feast-spark").getOrCreate()
    # spark.conf.set("spark.sql.shuffle.partitions", 5)
    ingestion_config = SparkProcessorConfig(
        mode="spark",
        source="kafka",
        spark_session=spark,
        processing_time="5 seconds",
        query_timeout=None,  # raises the ValidationError below
    )
    sfv = store.get_stream_feature_view("driver_hourly_stats_stream")
    processor = get_stream_processor_object(
        config=ingestion_config,
        fs=store,
        sfv=sfv,
        preprocess_fn=preprocess_fn,
    )
    query = processor.ingest_stream_feature_view(PushMode.OFFLINE)


if __name__ == "__main__":
    start_spark()
```
Execute with spark-submit:

```bash
spark-submit \
  --master local[1] \
  --name feast-spark \
  --packages=org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0 \
  --conf spark.sql.shuffle.partitions=5 \
  ./test_spark.py
```
Error:

```
Traceback (most recent call last):
  File "/home/orestis/Desktop/feast_project/feature_streaming/test_spark.py", line 40, in <module>
    start_spark()
  File "/home/orestis/Desktop/feast_project/feature_streaming/test_spark.py", line 27, in start_spark
    ingestion_config = SparkProcessorConfig(mode="spark", source="kafka", spark_session=spark, processing_time="5 seconds",query_timeout=None)
  File "pydantic/main.py", line 342, in pydantic.main.BaseModel.__init__
pydantic.error_wrappers.ValidationError: 1 validation error for SparkProcessorConfig
query_timeout
  none is not an allowed value (type=type_error.none.not_allowed)
```
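This is pydantic v1's standard rejection of `None` for a non-optional field. A minimal reproduction outside Feast (the `Demo` class and its default value are hypothetical; only the field name comes from the traceback):

```python
from pydantic import BaseModel, ValidationError

# In pydantic v1, a field annotated as plain `int` rejects None with exactly
# the error class seen above. (Demo is a stand-in, not Feast's actual model.)
class Demo(BaseModel):
    query_timeout: int = 600

try:
    Demo(query_timeout=None)
except ValidationError as err:
    print(err)
    # 1 validation error for Demo
    # query_timeout
    #   none is not an allowed value (type=type_error.none.not_allowed)
```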
Specifications
- Version: 0.34.1
- Platform: Linux
- Subsystem: Ubuntu
Possible Solution
Allow `None` as a valid value for `query_timeout`, for example by typing the field as `Optional[int]`.
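A minimal sketch of that fix, assuming the config is a pydantic v1 model (the simplified class below is a hypothetical stand-in for `SparkProcessorConfig`, not Feast's actual definition):

```python
from typing import Optional

from pydantic import BaseModel


class SparkProcessorConfigSketch(BaseModel):
    processing_time: str = "30 seconds"
    # Optional[int] admits None; None could then mean "no timeout" and be
    # passed straight through to StreamingQuery.awaitTermination.
    query_timeout: Optional[int] = None


cfg = SparkProcessorConfigSketch(processing_time="5 seconds", query_timeout=None)
assert cfg.query_timeout is None  # accepted, no ValidationError
```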