Changes from all commits
23 commits
8a1a310
Add Synapse Launcher
xiaoyongzhu Apr 25, 2021
8c06c45
Remove unnecessary printout
xiaoyongzhu Apr 26, 2021
b3fd983
Add eventhub support
xiaoyongzhu May 8, 2021
24f0986
Merge branch 'master' of https://github.com/xiaoyongzhu/feast-spark
xiaoyongzhu May 19, 2021
89f48a3
Add EventHub support and Redis Auth support
xiaoyongzhu May 19, 2021
f2cd8be
Adding EventHub support in Spark jobs
xiaoyongzhu May 19, 2021
53d7e20
add ScheduledBatchIngestionJobParameters
xiaoyongzhu Aug 31, 2021
79866e9
Merge pull request #1 from feast-dev/master
xiaoyongzhu Aug 31, 2021
c469ee7
Add Azure specific dependencies
xiaoyongzhu Sep 2, 2021
91c6822
Change azure storage dependencies
xiaoyongzhu Sep 2, 2021
0a7a56c
Comment for removing/adding spaces between brackets
xiaoyongzhu Sep 9, 2021
f4c0d5a
Delete feature_store_debug.py
xiaoyongzhumsft Sep 9, 2021
6bc9260
Update StreamingPipeline.scala
xiaoyongzhumsft Sep 14, 2021
dd53a53
Merge branch 'feast-dev:master' into master
xiaoyongzhumsft Oct 1, 2021
9fde235
Update synapse.py
xiaoyongzhumsft Oct 1, 2021
bb3d6be
Update synapse.py
xiaoyongzhumsft Oct 1, 2021
6885f31
Merge branch 'feast-dev:master' into master
xiaoyongzhumsft Oct 4, 2021
08da84f
Fix Redis auth issue
xiaoyongzhu Oct 12, 2021
0ddbcef
Update Ingestion jobs and add supporting files
xiaoyongzhu Oct 12, 2021
48a1c44
Fix build issues
xiaoyongzhu Oct 12, 2021
762386e
Add support for Kafka ingestion
xiaoyongzhu Oct 12, 2021
41fc406
Add build and push instructions
xiaoyongzhu Oct 12, 2021
0f7d433
Adding License
xiaoyongzhu Oct 27, 2021
10 changes: 9 additions & 1 deletion Makefile
@@ -53,7 +53,7 @@ build-local-test-docker:
docker build -t feast:local -f infra/docker/tests/Dockerfile .

build-ingestion-jar-no-tests:
cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -DskipITs=true -Drevision=${REVISION} clean package
cd spark/ingestion && ${MVN} --no-transfer-progress -Dmaven.javadoc.skip=true -Dgpg.skip -DskipUTs=true -D"spotless.check.skip"=true -DskipITs=true -Drevision=${REVISION} clean package

build-jobservice-docker:
docker build -t $(REGISTRY)/feast-jobservice:$(VERSION) -f infra/docker/jobservice/Dockerfile .
@@ -68,3 +68,11 @@ push-spark-docker:
docker push $(REGISTRY)/feast-spark:$(VERSION)

install-ci-dependencies: install-python-ci-dependencies

build-ingestion-jar-push:
docker build -t $(REGISTRY)/feast-spark:$(VERSION) --build-arg VERSION=$(VERSION) -f infra/docker/spark/Dockerfile .
rm -f feast-ingestion-spark-latest.jar
docker create -ti --name dummy $(REGISTRY)/feast-spark:latest bash
docker cp dummy:/opt/spark/jars/feast-ingestion-spark-latest.jar feast-ingestion-spark-latest.jar
docker rm -f dummy
python python/feast_spark/copy_to_azure_blob.py
12 changes: 11 additions & 1 deletion README.md
@@ -57,4 +57,14 @@ client.apply(entity, ft)

# Start spark streaming ingestion job that reads from kafka and writes to the online store
feast_spark.Client(client).start_stream_to_online_ingestion(ft)
```

Build and push to Azure Blob storage

To build the Spark ingestion jar and copy it to Azure Blob storage, set these three environment variables:

```bash
export VERSION=latest
export REGISTRY=your_registry_name
export AZURE_STORAGE_CONNECTION_STRING="your_azure_storage_connection_string"
```
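
With these set, running `make build-ingestion-jar-push` builds the ingestion jar inside the `feast-spark` Docker image, copies `feast-ingestion-spark-latest.jar` out of a temporary container, and uploads it to Blob storage via `python/feast_spark/copy_to_azure_blob.py`.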
4 changes: 2 additions & 2 deletions pom.xml
@@ -18,8 +18,8 @@
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<scala.version>2.12</scala.version>
<scala.fullVersion>${scala.version}.12</scala.fullVersion>
<spark.version>3.0.2</spark.version>
<scala.fullVersion>${scala.version}.10</scala.fullVersion>
<spark.version>3.1.2</spark.version>
<scala-maven-plugin.version>4.4.0</scala-maven-plugin.version>
<maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
<protobuf.version>3.12.2</protobuf.version>
24 changes: 24 additions & 0 deletions python/feast_spark/constants.py
@@ -93,6 +93,27 @@ class ConfigOptions(metaclass=ConfigMeta):
# SparkApplication resource template
SPARK_K8S_JOB_TEMPLATE_PATH = None

# Synapse dev URL
AZURE_SYNAPSE_DEV_URL: Optional[str] = None

# Synapse pool name
AZURE_SYNAPSE_POOL_NAME: Optional[str] = None

# Data Lake directory linked to the Synapse workspace
AZURE_SYNAPSE_DATALAKE_DIR: Optional[str] = None

# Synapse pool executor size: Small, Medium or Large
AZURE_SYNAPSE_EXECUTOR_SIZE = "Small"

# Synapse pool executor count
AZURE_SYNAPSE_EXECUTORS = "2"

# Azure EventHub Connection String (with Kafka API). See more details here:
# https://docs.microsoft.com/en-us/azure/event-hubs/apache-kafka-migration-guide
# Code Sample is here:
# https://github.com/Azure/azure-event-hubs-for-kafka/blob/master/tutorials/spark/sparkConsumer.scala
AZURE_EVENTHUB_KAFKA_CONNECTION_STRING = ""

#: File format of historical retrieval features
HISTORICAL_FEATURE_OUTPUT_FORMAT: str = "parquet"

@@ -108,6 +129,9 @@ class ConfigOptions(metaclass=ConfigMeta):
#: Enable or disable TLS/SSL to Redis
REDIS_SSL: Optional[str] = "False"

#: Auth string for redis
REDIS_AUTH: str = ""

#: BigTable Project ID
BIGTABLE_PROJECT: Optional[str] = ""

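As a rough sketch of how these options might be supplied in a deployment — assuming Feast's usual convention of `FEAST_`-prefixed environment variables, which this PR does not itself define — the values below are placeholders:

```python
# Sketch only: assumes FEAST_-prefixed environment variables are read by
# feast_spark's Config; the option names mirror the constants added above,
# and every value here is a placeholder.
import os

os.environ["FEAST_SPARK_LAUNCHER"] = "synapse"
os.environ["FEAST_AZURE_SYNAPSE_DEV_URL"] = "https://<workspace>.dev.azuresynapse.net"
os.environ["FEAST_AZURE_SYNAPSE_POOL_NAME"] = "<spark-pool>"
os.environ["FEAST_AZURE_SYNAPSE_DATALAKE_DIR"] = "abfss://<filesystem>@<account>.dfs.core.windows.net/feast"
os.environ["FEAST_AZURE_SYNAPSE_EXECUTOR_SIZE"] = "Small"
os.environ["FEAST_AZURE_SYNAPSE_EXECUTORS"] = "2"
os.environ["FEAST_REDIS_AUTH"] = "<redis-password>"
os.environ["FEAST_AZURE_EVENTHUB_KAFKA_CONNECTION_STRING"] = (
    "Endpoint=sb://<namespace>.servicebus.windows.net/;"
    "SharedAccessKeyName=<name>;SharedAccessKey=<key>"
)
```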
41 changes: 41 additions & 0 deletions python/feast_spark/copy_to_azure_blob.py
@@ -0,0 +1,41 @@
# coding: utf-8

# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------

"""
FILE: blob_samples_copy_blob.py
DESCRIPTION:
This sample demos how to copy a blob from a URL.
USAGE: python blob_samples_copy_blob.py
Set the environment variables with your own values before running the sample.
1) AZURE_STORAGE_CONNECTION_STRING - the connection string to your storage account
"""

from __future__ import print_function
import os
import sys
from azure.storage.blob import BlobServiceClient

def main():
    try:
        CONNECTION_STRING = os.environ["AZURE_STORAGE_CONNECTION_STRING"]
    except KeyError:
        print("AZURE_STORAGE_CONNECTION_STRING must be set.")
        sys.exit(1)

    blob_service_client = BlobServiceClient.from_connection_string(CONNECTION_STRING)
    # Upload target: the "feastjar" container in the configured storage account.
    copied_blob = blob_service_client.get_blob_client("feastjar", "feast-ingestion-spark-latest.jar")
    # The jar is expected in the current working directory.
    SOURCE_FILE = "./feast-ingestion-spark-latest.jar"

    with open(SOURCE_FILE, "rb") as data:
        copied_blob.upload_blob(data, blob_type="BlockBlob", overwrite=True)


if __name__ == "__main__":
    main()
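
This is the script invoked as the final step of the `build-ingestion-jar-push` Makefile target: it expects `feast-ingestion-spark-latest.jar` in the current working directory and uploads it to the `feastjar` container of the storage account identified by `AZURE_STORAGE_CONNECTION_STRING`.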
107 changes: 28 additions & 79 deletions python/feast_spark/pyspark/abc.py
@@ -340,13 +340,10 @@ def __init__(
feature_table: Dict,
source: Dict,
jar: str,
redis_host: Optional[str] = None,
redis_port: Optional[int] = None,
redis_ssl: Optional[bool] = None,
bigtable_project: Optional[str] = None,
bigtable_instance: Optional[str] = None,
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
@@ -359,26 +356,15 @@ def __init__(
self._redis_host = redis_host
self._redis_port = redis_port
self._redis_ssl = redis_ssl
self._bigtable_project = bigtable_project
self._bigtable_instance = bigtable_instance
self._cassandra_host = cassandra_host
self._cassandra_port = cassandra_port
self._redis_auth = redis_auth
self._statsd_host = statsd_host
self._statsd_port = statsd_port
self._deadletter_path = deadletter_path
self._stencil_url = stencil_url
self._drop_invalid_rows = drop_invalid_rows

def _get_redis_config(self):
return dict(host=self._redis_host, port=self._redis_port, ssl=self._redis_ssl)

def _get_bigtable_config(self):
return dict(
project_id=self._bigtable_project, instance_id=self._bigtable_instance
)

def _get_cassandra_config(self):
return dict(host=self._cassandra_host, port=self._cassandra_port)
return dict(host=self._redis_host, port=self._redis_port, ssl=self._redis_ssl, auth=self._redis_auth)

def _get_statsd_config(self):
return (
@@ -405,17 +391,10 @@ def get_arguments(self) -> List[str]:
json.dumps(self._feature_table),
"--source",
json.dumps(self._source),
"--redis",
json.dumps(self._get_redis_config()),
]

if self._redis_host and self._redis_port:
args.extend(["--redis", json.dumps(self._get_redis_config())])

if self._bigtable_project and self._bigtable_instance:
args.extend(["--bigtable", json.dumps(self._get_bigtable_config())])

if self._cassandra_host and self._cassandra_port:
args.extend(["--cassandra", json.dumps(self._get_cassandra_config())])

if self._get_statsd_config():
args.extend(["--statsd", json.dumps(self._get_statsd_config())])

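Net effect of the changes above: the base ingestion parameters now always emit a `--redis` argument whose JSON payload includes the new `auth` field (e.g. `{"host": ..., "port": ..., "ssl": ..., "auth": ...}`), while the BigTable and Cassandra arguments are no longer produced by this class.
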
@@ -444,13 +423,14 @@ def __init__(
start: datetime,
end: datetime,
jar: str,
redis_host: Optional[str],
redis_port: Optional[int],
redis_ssl: Optional[bool],
bigtable_project: Optional[str],
bigtable_instance: Optional[str],
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
bigtable_project: Optional[str] = None,
bigtable_instance: Optional[str] = None,
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
cassandra_port: Optional[str] = None,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
@@ -463,10 +443,7 @@ def __init__(
redis_host,
redis_port,
redis_ssl,
bigtable_project,
bigtable_instance,
cassandra_host,
cassandra_port,
redis_auth,
statsd_host,
statsd_port,
deadletter_path,
@@ -494,7 +471,6 @@ def get_arguments(self) -> List[str]:
self._end.strftime("%Y-%m-%dT%H:%M:%S"),
]


class ScheduledBatchIngestionJobParameters(IngestionJobParameters):
def __init__(
self,
@@ -559,32 +535,28 @@ def __init__(
source: Dict,
jar: str,
extra_jars: List[str],
redis_host: Optional[str],
redis_port: Optional[int],
redis_ssl: Optional[bool],
bigtable_project: Optional[str],
bigtable_instance: Optional[str],
cassandra_host: Optional[str] = None,
cassandra_port: Optional[int] = None,
redis_host: str,
redis_port: int,
redis_ssl: bool,
redis_auth: str,
statsd_host: Optional[str] = None,
statsd_port: Optional[int] = None,
deadletter_path: Optional[str] = None,
checkpoint_path: Optional[str] = None,
stencil_url: Optional[str] = None,
drop_invalid_rows: bool = False,
triggering_interval: Optional[int] = None,
drop_invalid_rows: Optional[bool] = False,
kafka_sasl_auth: Optional[str] = None,
):
super().__init__(
feature_table,
source,
jar,
redis_host,
redis_port,
redis_ssl,
bigtable_project,
bigtable_instance,
cassandra_host,
cassandra_port,
redis_auth,
statsd_host,
statsd_port,
deadletter_path,
@@ -593,7 +565,7 @@ def __init__(
)
self._extra_jars = extra_jars
self._checkpoint_path = checkpoint_path
self._triggering_interval = triggering_interval
self._kafka_sasl_auth = kafka_sasl_auth

def get_name(self) -> str:
return f"{self.get_job_type().to_pascal_case()}-{self.get_feature_table_name()}"
@@ -609,8 +581,8 @@ def get_arguments(self) -> List[str]:
args.extend(["--mode", "online"])
if self._checkpoint_path:
args.extend(["--checkpoint-path", self._checkpoint_path])
if self._triggering_interval:
args.extend(["--triggering-interval", str(self._triggering_interval)])
if self._kafka_sasl_auth:
args.extend(["--kafka_sasl_auth", self._kafka_sasl_auth])
return args

def get_job_hash(self) -> str:
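
In the streaming arguments above, `--triggering-interval` is replaced by `--kafka_sasl_auth`, which passes the Event Hubs (Kafka API) connection string, presumably so the Spark ingestion job can authenticate to Event Hubs over SASL; the checkpoint-path handling is unchanged.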
@@ -705,29 +677,6 @@ def offline_to_online_ingestion(
"""
raise NotImplementedError

@abc.abstractmethod
def schedule_offline_to_online_ingestion(
self, ingestion_job_params: ScheduledBatchIngestionJobParameters
):
"""
Submits a scheduled batch ingestion job to a Spark cluster.

Raises:
SparkJobFailure: The spark job submission failed, encountered error
during execution, or timeout.

Returns:
ScheduledBatchIngestionJob: wrapper around remote job that can be used to check when job completed.
"""
raise NotImplementedError

@abc.abstractmethod
def unschedule_offline_to_online_ingestion(self, project: str, feature_table: str):
"""
Unschedule a scheduled batch ingestion job.
"""
raise NotImplementedError

@abc.abstractmethod
def start_stream_to_online_ingestion(
self, ingestion_job_params: StreamIngestionJobParameters
26 changes: 18 additions & 8 deletions python/feast_spark/pyspark/launcher.py
@@ -83,11 +83,24 @@ def _k8s_launcher(config: Config) -> JobLauncher:
)


def _synapse_launcher(config: Config) -> JobLauncher:
from feast_spark.pyspark.launchers import synapse

return synapse.SynapseJobLauncher(
synapse_dev_url=config.get(opt.AZURE_SYNAPSE_DEV_URL),
pool_name=config.get(opt.AZURE_SYNAPSE_POOL_NAME),
datalake_dir=config.get(opt.AZURE_SYNAPSE_DATALAKE_DIR),
executor_size=config.get(opt.AZURE_SYNAPSE_EXECUTOR_SIZE),
executors=int(config.get(opt.AZURE_SYNAPSE_EXECUTORS))
)


_launchers = {
"standalone": _standalone_launcher,
"dataproc": _dataproc_launcher,
"emr": _emr_launcher,
"k8s": _k8s_launcher,
"synapse": _synapse_launcher,
}


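For reference, the launcher returned by `_synapse_launcher` can also be constructed directly; a minimal sketch with placeholder values:

```python
# Placeholder values for illustration; the keyword arguments mirror
# _synapse_launcher above.
from feast_spark.pyspark.launchers import synapse

launcher = synapse.SynapseJobLauncher(
    synapse_dev_url="https://<workspace>.dev.azuresynapse.net",
    pool_name="<spark-pool-name>",
    datalake_dir="abfss://<filesystem>@<account>.dfs.core.windows.net/feast",
    executor_size="Small",  # Small, Medium or Large (see constants.py)
    executors=2,
)
```
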
@@ -347,6 +360,7 @@ def start_offline_to_online_ingestion(
redis_port=bool(client.config.get(opt.REDIS_HOST))
and client.config.getint(opt.REDIS_PORT),
redis_ssl=client.config.getboolean(opt.REDIS_SSL),
redis_auth=client.config.get(opt.REDIS_AUTH),
bigtable_project=client.config.get(opt.BIGTABLE_PROJECT),
bigtable_instance=client.config.get(opt.BIGTABLE_INSTANCE),
cassandra_host=client.config.get(opt.CASSANDRA_HOST),
@@ -423,11 +437,9 @@ def get_stream_to_online_ingestion_params(
source=_source_to_argument(feature_table.stream_source, client.config),
feature_table=_feature_table_to_argument(client, project, feature_table),
redis_host=client.config.get(opt.REDIS_HOST),
redis_port=bool(client.config.get(opt.REDIS_HOST))
and client.config.getint(opt.REDIS_PORT),
redis_port=client.config.getint(opt.REDIS_PORT),
redis_ssl=client.config.getboolean(opt.REDIS_SSL),
bigtable_project=client.config.get(opt.BIGTABLE_PROJECT),
bigtable_instance=client.config.get(opt.BIGTABLE_INSTANCE),
redis_auth=client.config.get(opt.REDIS_AUTH),
statsd_host=client.config.getboolean(opt.STATSD_ENABLED)
and client.config.get(opt.STATSD_HOST),
statsd_port=client.config.getboolean(opt.STATSD_ENABLED)
@@ -436,11 +448,9 @@ def get_stream_to_online_ingestion_params(
checkpoint_path=client.config.get(opt.CHECKPOINT_PATH),
stencil_url=client.config.get(opt.STENCIL_URL),
drop_invalid_rows=client.config.get(opt.INGESTION_DROP_INVALID_ROWS),
triggering_interval=client.config.getint(
opt.SPARK_STREAMING_TRIGGERING_INTERVAL, default=None
),
)
kafka_sasl_auth=client.config.get(opt.AZURE_EVENTHUB_KAFKA_CONNECTION_STRING),

)

def start_stream_to_online_ingestion(
client: "Client", project: str, feature_table: FeatureTable, extra_jars: List[str]