Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion feathr_project/feathr/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
@abstractmethod
def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str],
reference_files_path: List[str], job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
configuration: Dict[str, str] = None, properties: Dict[str, str] = None):
"""
Submits the feathr job

Expand All @@ -30,6 +30,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's this system property for?

"""
pass
@abstractmethod
Expand Down
8 changes: 6 additions & 2 deletions feathr_project/feathr/_databricks_submission.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import json
import os
import shlex
import time
import traceback
import urllib
Expand Down Expand Up @@ -114,7 +115,7 @@ def upload_file(self, local_path_or_http_path: str) -> str:
local_path_or_http_path, returned_path)
return returned_path

def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None):
def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None, properties: Dict[str, str] = None):
"""
submit the feathr job to databricks
Refer to the databricks doc for more details on the meaning of the parameters:
Expand All @@ -125,9 +126,12 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
"""


if properties is not None:
arguments.append("--system-properties=%s" % shlex.quote(json.dumps(properties)))

if isinstance(self.config_template, str):
# if the input is a string, load it directly
submission_params = json.loads(self.config_template)
Expand Down
40 changes: 33 additions & 7 deletions feathr_project/feathr/_feature_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from feathr.feature import Feature, FeatureType
from feathr.feature_derivations import DerivedFeature
from feathr.repo_definitions import RepoDefinitions
from feathr.source import HdfsSource, InputContext, Source
from feathr.source import HdfsSource, JdbcSource, InputContext, Source
from feathr.transformation import (ExpressionTransformation, Transformation,
WindowAggTransformation)
from feathr.typed_key import TypedKey
Expand Down Expand Up @@ -86,6 +86,14 @@ def _register_feathr_feature_types(self):

AtlasAttributeDef(
name="path", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="url", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="dbtable", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="query", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="auth", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(name="event_timestamp_column",
typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(name="timestamp_format",
Expand Down Expand Up @@ -255,7 +263,7 @@ def _parse_anchors(self, anchor_list: List[FeatureAnchor]) -> List[AtlasEntity]:
anchors_batch.append(anchor_entity)
return anchors_batch

def _parse_source(self, source: Union[Source, HdfsSource]) -> AtlasEntity:
def _parse_source(self, source: Union[Source, HdfsSource, JdbcSource]) -> AtlasEntity:
"""
parse the input sources
"""
Expand All @@ -269,17 +277,35 @@ def _parse_source(self, source: Union[Source, HdfsSource]) -> AtlasEntity:
else:
preprocessing_func = None

source_entity = AtlasEntity(
name=source.name,
qualified_name=self.project_name + self.registry_delimiter + source.name,
attributes={
attrs = {}
if isinstance(source, JdbcSource):
{
"type": INPUT_CONTEXT if input_context else urlparse(source.path).scheme,
"url": INPUT_CONTEXT if input_context else source.url,
"timestamp_format": source.timestamp_format,
"event_timestamp_column": source.event_timestamp_column,
"tags": source.registry_tags,
"preprocessing": preprocessing_func # store the UDF as a string
}
if source.auth is not None:
attrs["auth"] = source.auth
if source.dbtable is not None:
attrs["dbtable"] = source.dbtable
if source.query is not None:
attrs["query"] = source.query
else:
attrs = {
"type": INPUT_CONTEXT if input_context else urlparse(source.path).scheme,
"path": INPUT_CONTEXT if input_context else source.path,
"timestamp_format": source.timestamp_format,
"event_timestamp_column": source.event_timestamp_column,
"tags": source.registry_tags,
"preprocessing": preprocessing_func # store the UDF as a string
},
}
source_entity = AtlasEntity(
name=source.name,
qualified_name=self.project_name + self.registry_delimiter + source.name,
attributes=attrs,
typeName=TYPEDEF_SOURCE,
guid=self.guid.get_guid(),
)
Expand Down
10 changes: 9 additions & 1 deletion feathr_project/feathr/_synapse_submission.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
import re
import shlex
import time
import urllib.request
from pathlib import Path
Expand Down Expand Up @@ -71,7 +73,8 @@ def download_result(self, result_path: str, local_folder: str):

def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None,
python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
configuration: Dict[str, str] = None,
properties: Dict[str, str] = None):
"""
Submits the feathr job
Refer to the Apache Livy doc for more details on the meaning of the parameters:
Expand All @@ -92,7 +95,12 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
"""

if properties is not None:
arguments.append("--system-properties=%s" % shlex.quote(json.dumps(properties)))

assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.'
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
Expand Down
33 changes: 30 additions & 3 deletions feathr_project/feathr/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import json
import logging
import os
import tempfile
Expand Down Expand Up @@ -186,7 +187,10 @@ def _check_required_environment_variables_exist(self):

Some required information has to be set via environment variables so the client can work.
"""
for required_field in self.required_fields:
props = []
if hasattr(self, "system_properties"):
props = self.system_properties
for required_field in (self.required_fields + props):
if required_field not in os.environ:
raise RuntimeError(f'{required_field} is not set in environment variable. All required environment '
f'variables are: {self.required_fields}.')
Expand Down Expand Up @@ -235,6 +239,14 @@ def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_
self.anchor_list = anchor_list
self.derived_feature_list = derived_feature_list

# Check if data source used by every anchor requires additional system properties to be set
props = []
for anchor in self.anchor_list:
if hasattr(anchor.source, "get_required_properties"):
props.extend(anchor.source.get_required_properties())
if len(props)>0:
self.system_properties = props

def list_registered_features(self, project_name: str = None) -> List[str]:
"""List all the already registered features. If project_name is not provided or is None, it will return all
the registered features; otherwise it will only return features under this project
Expand Down Expand Up @@ -480,10 +492,11 @@ def _get_offline_features_with_config(self, feature_join_conf_path='feature_join
'--adls-config', self._get_adls_config_str(),
'--blob-config', self._get_blob_config_str(),
'--sql-config', self._get_sql_config_str(),
'--snowflake-config', self._get_snowflake_config_str()
'--snowflake-config', self._get_snowflake_config_str(),
],
reference_files_path=[],
configuration=execution_configuratons
configuration=execution_configuratons,
system_properties=self._get_system_properties()
)

def get_job_result_uri(self, block=True, timeout_sec=300) -> str:
Expand Down Expand Up @@ -581,6 +594,7 @@ def _materialize_features_with_config(self, feature_gen_conf_path: str = 'featur
],
reference_files_path=[],
configuration=execution_configuratons,
system_properties=self._get_system_properties()
)


Expand Down Expand Up @@ -687,6 +701,19 @@ def _get_snowflake_config_str(self):
""".format(JDBC_SF_URL=sf_url, JDBC_SF_USER=sf_user, JDBC_SF_PASSWORD=sf_password, JDBC_SF_ROLE=sf_role)
return config_str

def _get_system_properties(self):
    """Go through all data sources and fill all required system properties.

    Returns:
        A dict mapping each required system property name (collected on
        ``self.system_properties`` when features were built) to its value
        resolved from the environment, or ``None`` when no data source
        declared any required properties.
    """
    prop_and_value = {}
    if hasattr(self, "system_properties"):
        for prop in self.system_properties:
            # NOTE(review): presumably returns a default when the env var is
            # unset — confirm against EnvConfigReader's behavior.
            prop_and_value[prop] = self.envutils.get_environment_variable_with_default(prop)
        return prop_and_value
    return None

def get_features_from_registry(self, project_name):
    """ Sync features from the registry given a project name """
    # TODO - Add support for customized workspace path
    # Pulls the project's feature definitions into the current working directory.
    self.registry.get_features_from_registry(project_name, os.path.abspath("./"))
def _get_kafka_config_str(self):
"""Construct the Kafka config string. The endpoint, access key, secret key, and other parameters can be set via
environment variables."""
Expand Down
59 changes: 59 additions & 0 deletions feathr_project/feathr/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,65 @@ def __str__(self):
return str(self.preprocessing) + '\n' + self.to_feature_config()


class JdbcSource(Source):
    """A JDBC-backed data source.

    Exactly one of ``dbtable`` or ``query`` should identify the data to read.
    ``dbtable``, ``query`` and ``auth`` are stored as attributes only when
    provided: the feature-config template and the registry code test for their
    presence (Jinja ``is defined`` / ``hasattr``), so absence is meaningful.

    Attributes:
        name: unique name of this source.
        url: JDBC connection URL.
        dbtable: (optional) table to read.
        query: (optional) query whose result is read instead of a table.
        auth: (optional) "USERPASS" or "TOKEN"; credentials themselves are
            supplied at runtime via environment variables (see
            :meth:`get_required_properties`). When omitted the connection is
            anonymous.
        preprocessing: (optional) preprocessing UDF applied before anchoring.
        event_timestamp_column: (optional) timestamp column for time windows.
        timestamp_format: format of the timestamp column, default "epoch".
        registry_tags: (optional) tags recorded in the feature registry.
    """

    def __init__(self, name: str, url: str = "", dbtable: Optional[str] = None,
                 query: Optional[str] = None, auth: Optional[str] = None,
                 preprocessing: Optional[Callable] = None,
                 event_timestamp_column: Optional[str] = None,
                 timestamp_format: Optional[str] = "epoch",
                 registry_tags: Optional[Dict[str, str]] = None) -> None:
        super().__init__(name, event_timestamp_column, timestamp_format, registry_tags)
        self.preprocessing = preprocessing
        self.url = url
        # Set optional attributes only when given — downstream code relies on
        # attribute presence, not on a None value.
        if dbtable is not None:
            self.dbtable = dbtable
        if query is not None:
            self.query = query
        if auth is not None:
            self.auth = auth.upper()
            if self.auth not in ["USERPASS", "TOKEN"]:
                raise ValueError("auth must be None or one of following values: ['userpass', 'token']")

    def get_required_properties(self):
        """Return the environment-variable names this source needs at runtime.

        "USERPASS" auth requires ``<name>_USER`` and ``<name>_PASSWORD``;
        "TOKEN" auth requires ``<name>_TOKEN``; anonymous sources need none.
        """
        if not hasattr(self, "auth"):
            return []
        if self.auth == "USERPASS":
            return ["%s_USER" % self.name, "%s_PASSWORD" % self.name]
        elif self.auth == "TOKEN":
            return ["%s_TOKEN" % self.name]
        # Unreachable while __init__ validates `auth`, but keep the return
        # type consistently a list instead of falling through to None.
        return []

    def to_feature_config(self) -> str:
        """Render this source as a feathr HOCON feature-config snippet."""
        tm = Template("""
            {{source.name}}: {
                location: {
                    type: "jdbc"
                    url: "{{source.url}}"
                    {% if source.dbtable is defined %}
                    dbtable: "{{source.dbtable}}"
                    {% endif %}
                    {% if source.query is defined %}
                    query: "{{source.query}}"
                    {% endif %}
                    {% if source.auth is defined %}
                    {% if source.auth == "USERPASS" %}
                    user: "${{ "{" }}{{source.name}}_USER{{ "}" }}"
                    password: "${{ "{" }}{{source.name}}_PASSWORD{{ "}" }}"
                    {% else %}
                    useToken: true
                    token: "${{ "{" }}{{source.name}}_TOKEN{{ "}" }}"
                    {% endif %}
                    {% else %}
                    anonymous: true
                    {% endif %}
                }
                {% if source.event_timestamp_column is defined %}
                timeWindowParameters: {
                    timestampColumn: "{{source.event_timestamp_column}}"
                    timestampColumnFormat: "{{source.timestamp_format}}"
                }
                {% endif %}
            }
        """)
        msg = tm.render(source=self)
        return msg

    def __str__(self):
        return str(self.preprocessing) + '\n' + self.to_feature_config()


class KafkaConfig:
"""Kafka config for a streaming source
Attributes:
Expand Down
69 changes: 68 additions & 1 deletion feathr_project/test/test_feature_anchor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from feathr import FeatureAnchor
from feathr import Feature
from feathr import HdfsSource
from feathr.source import JdbcSource
from feathr import BOOLEAN, INT32, FLOAT, ValueType
from feathr import INPUT_CONTEXT
from feathr import WindowAggTransformation
Expand Down Expand Up @@ -167,4 +168,70 @@ def test_agg_anchor_to_config():
}
}
"""
assert ''.join(agg_anchor.to_feature_config().split()) == ''.join(expected_agg_feature_config.split())
assert ''.join(agg_anchor.to_feature_config().split()) == ''.join(expected_agg_feature_config.split())

def test_jdbc_source_to_config():
    """JdbcSource renders anonymous, userpass and token auth configs correctly."""

    def _minified(text):
        # Compare configs ignoring all whitespace differences.
        return ''.join(text.split())

    # Case 1: no auth given -> anonymous connection.
    anonymous_source = JdbcSource(name="nycTaxiBatchSource",
                                  url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                                  dbtable="table1",
                                  event_timestamp_column="c1")
    expected_anonymous_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            anonymous: true
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(anonymous_source.to_feature_config()) == _minified(expected_anonymous_config)

    # Case 2: userpass auth -> user/password placeholders derived from the name.
    userpass_source = JdbcSource(name="nycTaxiBatchSource",
                                 url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                                 dbtable="table1",
                                 auth="userpass",
                                 event_timestamp_column="c1")
    expected_userpass_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            user: "${nycTaxiBatchSource_USER}"
            password: "${nycTaxiBatchSource_PASSWORD}"
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(userpass_source.to_feature_config()) == _minified(expected_userpass_config)

    # Case 3: token auth -> useToken flag plus token placeholder.
    token_source = JdbcSource(name="nycTaxiBatchSource",
                              url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                              dbtable="table1",
                              auth="token",
                              event_timestamp_column="c1")
    expected_token_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            useToken:true
            token: "${nycTaxiBatchSource_TOKEN}"
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(token_source.to_feature_config()) == _minified(expected_token_config)
Loading