Changes from all commits
23 commits
8a1a310
Add Synapse Launcher
xiaoyongzhu Apr 25, 2021
8c06c45
Remove unnecessary printout
xiaoyongzhu Apr 26, 2021
b3fd983
Add eventhub support
xiaoyongzhu May 8, 2021
24f0986
Merge branch 'master' of https://github.com/xiaoyongzhu/feast-spark
xiaoyongzhu May 19, 2021
89f48a3
Add EventHub support and Redis Auth support
xiaoyongzhu May 19, 2021
f2cd8be
Adding EventHub support in Spark jobs
xiaoyongzhu May 19, 2021
53d7e20
add ScheduledBatchIngestionJobParameters
xiaoyongzhu Aug 31, 2021
79866e9
Merge pull request #1 from feast-dev/master
xiaoyongzhu Aug 31, 2021
c469ee7
Add Azure specific dependencies
xiaoyongzhu Sep 2, 2021
91c6822
Change azure storage dependencies
xiaoyongzhu Sep 2, 2021
0a7a56c
Comment for removing/adding spaces between brackets
xiaoyongzhu Sep 9, 2021
f4c0d5a
Delete feature_store_debug.py
xiaoyongzhumsft Sep 9, 2021
6bc9260
Update StreamingPipeline.scala
xiaoyongzhumsft Sep 14, 2021
dd53a53
Merge branch 'feast-dev:master' into master
xiaoyongzhumsft Oct 1, 2021
9fde235
Update synapse.py
xiaoyongzhumsft Oct 1, 2021
bb3d6be
Update synapse.py
xiaoyongzhumsft Oct 1, 2021
6885f31
Merge branch 'feast-dev:master' into master
xiaoyongzhumsft Oct 4, 2021
08da84f
Fix Redis auth issue
xiaoyongzhu Oct 12, 2021
0ddbcef
Update Ingestion jobs and add supporting files
xiaoyongzhu Oct 12, 2021
48a1c44
Fix build issues
xiaoyongzhu Oct 12, 2021
762386e
Add support for Kafka ingestion
xiaoyongzhu Oct 12, 2021
41fc406
Add build and push instructions
xiaoyongzhu Oct 12, 2021
0f7d433
Adding License
xiaoyongzhu Oct 27, 2021
10 changes: 9 additions & 1 deletion Makefile
@@ -53,7 +53,7 @@ build-local-test-docker:
docker build -t feast:local -f infra/docker/tests/Dockerfile .

build-ingestion-jar-no-tests:
cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -DskipITs=true -Drevision=${REVISION} clean package
cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -D"spotless.check.skip"=true -DskipITs=true -Drevision=${REVISION} clean package

build-jobservice-docker:
docker build -t $(REGISTRY)/feast-jobservice:$(VERSION) -f infra/docker/jobservice/Dockerfile .
@@ -68,3 +68,11 @@ push-spark-docker:
docker push $(REGISTRY)/feast-spark:$(VERSION)

install-ci-dependencies: install-python-ci-dependencies

build-ingestion-jar-push:
docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/Dockerfile .
rm -f feast-ingestion-spark-latest.jar
docker create -ti --name dummy $(REGISTRY)/feast-spark:latest bash
docker cp dummy:/opt/spark/jars/feast-ingestion-spark-latest.jar feast-ingestion-spark-latest.jar
docker rm -f dummy
python python/feast_spark/copy_to_azure_blob.py
12 changes: 11 additions & 1 deletion README.md
@@ -57,4 +57,14 @@ client.apply(entity, ft)

# Start spark streaming ingestion job that reads from kafka and writes to the online store
feast_spark.Client(client).start_stream_to_online_ingestion(ft)
```

Build and push to Azure Blob storage

To build the Spark ingestion jar and copy it to Azure Blob storage, set these three environment variables:

```bash
export VERSION=latest
export REGISTRY=your_registry_name
export AZURE_STORAGE_CONNECTION_STRING="your_azure_storage_connection_string"
```
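
With these set, running `make build-ingestion-jar-push` builds the ingestion jar inside the `feast-spark` Docker image, copies `feast-ingestion-spark-latest.jar` out of a temporary container, and uploads it to Blob storage via `python/feast_spark/copy_to_azure_blob.py`.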
4 changes: 2 additions & 2 deletions pom.xml
@@ -18,8 +18,8 @@
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.12</scala.version>
<scala.fullVersion>${scala.version}.12</scala.fullVersion>
<spark.version>3.0.2</spark.version>
<scala.fullVersion>${scala.version}.10</scala.fullVersion>
<spark.version>3.1.2</spark.version>
<scala-maven-plugin.version>4.4.0</scala-maven-plugin.version>
<maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
<protobuf.version>3.12.2</protobuf.version>
24 changes: 24 additions & 0 deletions python/feast_spark/constants.py
@@ -93,6 +93,27 @@ class ConfigOptions(metaclass=ConfigMeta):
# SparkApplication resource template
SPARK_K8S_JOB_TEMPLATE_PATH = None

# Synapse dev URL
AZURE_SYNAPSE_DEV_URL: Optional[str] = None

# Synapse pool name
AZURE_SYNAPSE_POOL_NAME: Optional[str] = None

# Data Lake directory linked to the Synapse workspace
AZURE_SYNAPSE_DATALAKE_DIR: Optional[str] = None

# Synapse pool executor size: Small, Medium or Large
AZURE_SYNAPSE_EXECUTOR_SIZE = "Small"

# Synapse pool executor count
AZURE_SYNAPSE_EXECUTORS = "2"

# Azure EventHub Connection String (with Kafka API). See more details here:
# https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-migration-guide
# Code Sample is here:
# https://github.com/Azure/azure-event-hubs-for-kafka/blob/master/tutorials/spark/sparkConsumer.scala
AZURE_EVENTHUB_KAFKA_CONNECTION_STRING = ""

#: File format of historical retrieval features
HISTORICAL_FEATURE_OUTPUT_FORMAT: str = "parquet"

@@ -108,6 +129,9 @@ class ConfigOptions(metaclass=ConfigMeta):
#: Enable or disable TLS/SSL to Redis
REDIS_SSL: Optional[str] = "False"

#: Auth string for redis
REDIS_AUTH: str = ""

#: BigTable Project ID
BIGTABLE_PROJECT: Optional[str] = ""

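As a rough sketch of how these options might be supplied in a deployment — assuming Feast's usual convention of `FEAST_`-prefixed environment variables, which this PR does not itself define — the values below are placeholders:

```python
# Sketch only: assumes FEAST_-prefixed environment variables are read by
# feast_spark's Config; the option names mirror the constants added above,
# and every value here is a placeholder.
import os

os.environ["FEAST_SPARK_LAUNCHER"] = "synapse"
os.environ["FEAST_AZURE_SYNAPSE_DEV_URL"] = "https://<workspace>.dev.azuresynapse.net"
os.environ["FEAST_AZURE_SYNAPSE_POOL_NAME"] = "<spark-pool>"
os.environ["FEAST_AZURE_SYNAPSE_DATALAKE_DIR"] = "abfss://<filesystem>@<account>.dfs.core.windows.net/feast"
os.environ["FEAST_AZURE_SYNAPSE_EXECUTOR_SIZE"] = "Small"
os.environ["FEAST_AZURE_SYNAPSE_EXECUTORS"] = "2"
os.environ["FEAST_REDIS_AUTH"] = "<redis-password>"
os.environ["FEAST_AZURE_EVENTHUB_KAFKA_CONNECTION_STRING"] = (
    "Endpoint=sb://<namespace>.servicebus.windows.net/;"
    "SharedAccessKeyName=<name>;SharedAccessKey=<key>"
)
```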
41 changes: 41 additions & 0 deletions python/feast_spark/copy_to_azure_blob.py
@@ -0,0 +1,41 @@
# coding: utf-8

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: blob_samples_copy_blob.py
DESCRIPTION:
This sample demos how to copy a blob from a URL.
USAGE: python blob_samples_copy_blob.py
Set the environment variables with your own values before running the sample.
1) AZURE_STORAGE_CONNECTION_STRING - the connection string to your storage account
"""

from __future__ import print_function
import os
import sys
from azure.storage.blob import BlobServiceClient

def main():
    try:
        CONNECTION_STRING = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    # Upload target: the "feastjar" container in the configured storage account.
    copied_blob = blob_service_client.get_blob_client("feastjar", "feast-ingestion-spark-latest.jar")
    # The jar is expected in the current working directory.
    SOURCE_FILE = "./feast-ingestion-spark-latest.jar"

    with open(SOURCE_FILE, "rb") as data:
        copied_blob.upload_blob(data, blob_type="BlockBlob", overwrite=True)


if __name__ == "__main__":
    main()
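
This is the script invoked as the final step of the `build-ingestion-jar-push` Makefile target: it expects `feast-ingestion-spark-latest.jar` in the current working directory and uploads it to the `feastjar` container of the storage account identified by `AZURE_STORAGE_CONNECTION_STRING`.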
107 changes: 28 additions & 79 deletions python/feast_spark/pyspark/abc.py
@@ -340,13 +340,10 @@ def __init__(
feature_table: Dict,
source: Dict,
jar: str,
redis_host: Optional[str] = None,
redis_port: Optional[int] = None,
redis_ssl: Optional[bool] = None,
bigtable_project: Optional[str] = None,
bigtable_instance: Optional[str] = None,
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
@@ -359,26 +356,15 @@ def __init__(
self._redis_host = redis_host
self._redis_port = redis_port
self._redis_ssl = redis_ssl
self._bigtable_project = bigtable_project
self._bigtable_instance = bigtable_instance
self._cassandra_host = cassandra_host
self._cassandra_port = cassandra_port
self._redis_auth = redis_auth
self._statsd_host = statsd_host
self._statsd_port = statsd_port
self._deadletter_path = deadletter_path
self._stencil_url = stencil_url
self._drop_invalid_rows = drop_invalid_rows

def _get_redis_config(self):
return dict(host=self._redis_host, port=self._redis_port, ssl=self._redis_ssl)

def _get_bigtable_config(self):
return dict(
project_id=self._bigtable_project, instance_id=self._bigtable_instance
)

def _get_cassandra_config(self):
return dict(host=self._cassandra_host, port=self._cassandra_port)
return dict(host=self._redis_host, port=self._redis_port, ssl=self._redis_ssl, auth=self._redis_auth)

def _get_statsd_config(self):
return (
@@ -405,17 +391,10 @@ def get_arguments(self) -> List[str]:
json.dumps(self._feature_table),
"--source",
json.dumps(self._source),
"--redis",
json.dumps(self._get_redis_config()),
]

if self._redis_host and self._redis_port:
args.extend(["--redis", json.dumps(self._get_redis_config())])

if self._bigtable_project and self._bigtable_instance:
args.extend(["--bigtable", json.dumps(self._get_bigtable_config())])

if self._cassandra_host and self._cassandra_port:
args.extend(["--cassandra", json.dumps(self._get_cassandra_config())])

if self._get_statsd_config():
args.extend(["--statsd", json.dumps(self._get_statsd_config())])

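Net effect of the changes above: the base ingestion parameters now always emit a `--redis` argument whose JSON payload includes the new `auth` field (e.g. `{"host": ..., "port": ..., "ssl": ..., "auth": ...}`), while the BigTable and Cassandra arguments are no longer produced by this class.
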
@@ -444,13 +423,14 @@ def __init__(
start: datetime,
end: datetime,
jar: str,
redis_host: Optional[str],
redis_port: Optional[int],
redis_ssl: Optional[bool],
bigtable_project: Optional[str],
bigtable_instance: Optional[str],
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
bigtable_project: Optional[str] = None,
bigtable_instance: Optional[str] = None,
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
cassandra_port: Optional[str] = None,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
@@ -463,10 +443,7 @@ def __init__(
redis_host,
redis_port,
redis_ssl,
bigtable_project,
bigtable_instance,
cassandra_host,
cassandra_port,
redis_auth,
statsd_host,
statsd_port,
deadletter_path,
@@ -494,7 +471,6 @@ def get_arguments(self) -> List[str]:
self._end.strftime("%Y-%m-%dT%H:%M:%S"),
]


class ScheduledBatchIngestionJobParameters(IngestionJobParameters):
def __init__(
self,
@@ -559,32 +535,28 @@ def __init__(
source: Dict,
jar: str,
extra_jars: List[str],
redis_host: Optional[str],
redis_port: Optional[int],
redis_ssl: Optional[bool],
bigtable_project: Optional[str],
bigtable_instance: Optional[str],
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
checkpoint_path: Optional[str] = None,
stencil_url: Optional[str] = None,
drop_invalid_rows: bool = False,
triggering_interval: Optional[int] = None,
drop_invalid_rows: Optional[bool] = False,
kafka_sasl_auth: Optional[str] = None,
):
super().__init__(
feature_table,
source,
jar,
redis_host,
redis_port,
redis_ssl,
bigtable_project,
bigtable_instance,
cassandra_host,
cassandra_port,
redis_auth,
statsd_host,
statsd_port,
deadletter_path,
@@ -593,7 +565,7 @@ def __init__(
)
self._extra_jars = extra_jars
self._checkpoint_path = checkpoint_path
self._triggering_interval = triggering_interval
self._kafka_sasl_auth = kafka_sasl_auth

def get_name(self) -> str:
return f"{self.get_job_type().to_pascal_case()}-{self.get_feature_table_name()}"
@@ -609,8 +581,8 @@ def get_arguments(self) -> List[str]:
args.extend(["--mode", "online"])
if self._checkpoint_path:
args.extend(["--checkpoint-path", self._checkpoint_path])
if self._triggering_interval:
args.extend(["--triggering-interval", str(self._triggering_interval)])
if self._kafka_sasl_auth:
args.extend(["--kafka_sasl_auth", self._kafka_sasl_auth])
return args

def get_job_hash(self) -> str:
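
In the streaming arguments above, `--triggering-interval` is replaced by `--kafka_sasl_auth`, which passes the Event Hubs (Kafka API) connection string, presumably so the Spark ingestion job can authenticate to Event Hubs over SASL; the checkpoint-path handling is unchanged.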
@@ -705,29 +677,6 @@ def offline_to_online_ingestion(
"""
raise NotImplementedError

@abc.abstractmethod
def schedule_offline_to_online_ingestion(
self, ingestion_job_params: ScheduledBatchIngestionJobParameters
):
"""
Submits a scheduled batch ingestion job to a Spark cluster.

Raises:
SparkJobFailure: The spark job submission failed, encountered error
during execution, or timeout.

Returns:
ScheduledBatchIngestionJob: wrapper around remote job that can be used to check when job completed.
"""
raise NotImplementedError

@abc.abstractmethod
def unschedule_offline_to_online_ingestion(self, project: str, feature_table: str):
"""
Unschedule a scheduled batch ingestion job.
"""
raise NotImplementedError

@abc.abstractmethod
def start_stream_to_online_ingestion(
self, ingestion_job_params: StreamIngestionJobParameters
26 changes: 18 additions & 8 deletions python/feast_spark/pyspark/launcher.py
@@ -83,11 +83,24 @@ def _k8s_launcher(config: Config) -> JobLauncher:
)


def _synapse_launcher(config: Config) -> JobLauncher:
from feast_spark.pyspark.launchers import synapse

return synapse.SynapseJobLauncher(
synapse_dev_url=config.get(opt.AZURE_SYNAPSE_DEV_URL),
pool_name=config.get(opt.AZURE_SYNAPSE_POOL_NAME),
datalake_dir=config.get(opt.AZURE_SYNAPSE_DATALAKE_DIR),
executor_size=config.get(opt.AZURE_SYNAPSE_EXECUTOR_SIZE),
executors=int(config.get(opt.AZURE_SYNAPSE_EXECUTORS))
)


_launchers = {
"standalone": _standalone_launcher,
"dataproc": _dataproc_launcher,
"emr": _emr_launcher,
"k8s": _k8s_launcher,
"synapse": _synapse_launcher,
}


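For reference, the launcher returned by `_synapse_launcher` can also be constructed directly; a minimal sketch with placeholder values:

```python
# Placeholder values for illustration; the keyword arguments mirror
# _synapse_launcher above.
from feast_spark.pyspark.launchers import synapse

launcher = synapse.SynapseJobLauncher(
    synapse_dev_url="https://<workspace>.dev.azuresynapse.net",
    pool_name="<spark-pool-name>",
    datalake_dir="abfss://<filesystem>@<account>.dfs.core.windows.net/feast",
    executor_size="Small",  # Small, Medium or Large (see constants.py)
    executors=2,
)
```
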
@@ -347,6 +360,7 @@ def start_offline_to_online_ingestion(
redis_port=bool(client.config.get(opt.REDIS_HOST))
and client.config.getint(opt.REDIS_PORT),
redis_ssl=client.config.getboolean(opt.REDIS_SSL),
redis_auth=client.config.get(opt.REDIS_AUTH),
bigtable_project=client.config.get(opt.BIGTABLE_PROJECT),
bigtable_instance=client.config.get(opt.BIGTABLE_INSTANCE),
cassandra_host=client.config.get(opt.CASSANDRA_HOST),
@@ -423,11 +437,9 @@ def get_stream_to_online_ingestion_params(
source=_source_to_argument(feature_table.stream_source, client.config),
feature_table=_feature_table_to_argument(client, project, feature_table),
redis_host=client.config.get(opt.REDIS_HOST),
redis_port=bool(client.config.get(opt.REDIS_HOST))
and client.config.getint(opt.REDIS_PORT),
redis_port=client.config.getint(opt.REDIS_PORT),
redis_ssl=client.config.getboolean(opt.REDIS_SSL),
bigtable_project=client.config.get(opt.BIGTABLE_PROJECT),
bigtable_instance=client.config.get(opt.BIGTABLE_INSTANCE),
redis_auth=client.config.get(opt.REDIS_AUTH),
statsd_host=client.config.getboolean(opt.STATSD_ENABLED)
and client.config.get(opt.STATSD_HOST),
statsd_port=client.config.getboolean(opt.STATSD_ENABLED)
@@ -436,11 +448,9 @@ def get_stream_to_online_ingestion_params(
checkpoint_path=client.config.get(opt.CHECKPOINT_PATH),
stencil_url=client.config.get(opt.STENCIL_URL),
drop_invalid_rows=client.config.get(opt.INGESTION_DROP_INVALID_ROWS),
triggering_interval=client.config.getint(
opt.SPARK_STREAMING_TRIGGERING_INTERVAL, default=None
),
)
kafka_sasl_auth=client.config.get(opt.AZURE_EVENTHUB_KAFKA_CONNECTION_STRING),

)

def start_stream_to_online_ingestion(
client: "Client", project: str, feature_table: FeatureTable, extra_jars: List[str]