Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion feathr_project/feathr/_abc.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def upload_or_get_cloud_path(self, local_path_or_http_path: str):
@abstractmethod
def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str],
reference_files_path: List[str], job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
configuration: Dict[str, str] = None, properties: Dict[str, str] = None):
"""
Submits the feathr job

Expand All @@ -30,6 +30,7 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's this system property for?

"""
pass
@abstractmethod
Expand Down
8 changes: 6 additions & 2 deletions feathr_project/feathr/_databricks_submission.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import base64
import json
import os
import shlex
import time
import traceback
import urllib
Expand Down Expand Up @@ -114,7 +115,7 @@ def upload_file(self, local_path_or_http_path: str) -> str:
local_path_or_http_path, returned_path)
return returned_path

def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None):
def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name: str, arguments: List[str], python_files: List[str], reference_files_path: List[str] = [], job_tags: Dict[str, str] = None, configuration: Dict[str, str] = None, properties: Dict[str, str] = None):
"""
submit the feathr job to databricks
Refer to the databricks doc for more details on the meaning of the parameters:
Expand All @@ -125,9 +126,12 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str, main_class_name:
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
"""


if properties is not None:
arguments.append("--system-properties=%s" % shlex.quote(json.dumps(properties)))

if isinstance(self.config_template, str):
# if the input is a string, load it directly
submission_params = json.loads(self.config_template)
Expand Down
40 changes: 33 additions & 7 deletions feathr_project/feathr/_feature_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from feathr.feature import Feature, FeatureType
from feathr.feature_derivations import DerivedFeature
from feathr.repo_definitions import RepoDefinitions
from feathr.source import HdfsSource, InputContext, Source
from feathr.source import HdfsSource, JdbcSource, InputContext, Source
from feathr.transformation import (ExpressionTransformation, Transformation,
WindowAggTransformation)
from feathr.typed_key import TypedKey
Expand Down Expand Up @@ -86,6 +86,14 @@ def _register_feathr_feature_types(self):

AtlasAttributeDef(
name="path", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="url", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="dbtable", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="query", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(
name="auth", typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(name="event_timestamp_column",
typeName="string", cardinality=Cardinality.SINGLE),
AtlasAttributeDef(name="timestamp_format",
Expand Down Expand Up @@ -255,7 +263,7 @@ def _parse_anchors(self, anchor_list: List[FeatureAnchor]) -> List[AtlasEntity]:
anchors_batch.append(anchor_entity)
return anchors_batch

def _parse_source(self, source: Union[Source, HdfsSource]) -> AtlasEntity:
def _parse_source(self, source: Union[Source, HdfsSource, JdbcSource]) -> AtlasEntity:
"""
parse the input sources
"""
Expand All @@ -269,17 +277,35 @@ def _parse_source(self, source: Union[Source, HdfsSource]) -> AtlasEntity:
else:
preprocessing_func = None

source_entity = AtlasEntity(
name=source.name,
qualified_name=self.project_name + self.registry_delimiter + source.name,
attributes={
attrs = {}
if isinstance(source, JdbcSource):
{
"type": INPUT_CONTEXT if input_context else urlparse(source.path).scheme,
"url": INPUT_CONTEXT if input_context else source.url,
"timestamp_format": source.timestamp_format,
"event_timestamp_column": source.event_timestamp_column,
"tags": source.registry_tags,
"preprocessing": preprocessing_func # store the UDF as a string
}
if source.auth is not None:
attrs["auth"] = source.auth
if source.dbtable is not None:
attrs["dbtable"] = source.dbtable
if source.query is not None:
attrs["query"] = source.query
else:
attrs = {
"type": INPUT_CONTEXT if input_context else urlparse(source.path).scheme,
"path": INPUT_CONTEXT if input_context else source.path,
"timestamp_format": source.timestamp_format,
"event_timestamp_column": source.event_timestamp_column,
"tags": source.registry_tags,
"preprocessing": preprocessing_func # store the UDF as a string
},
}
source_entity = AtlasEntity(
name=source.name,
qualified_name=self.project_name + self.registry_delimiter + source.name,
attributes=attrs,
typeName=TYPEDEF_SOURCE,
guid=self.guid.get_guid(),
)
Expand Down
10 changes: 9 additions & 1 deletion feathr_project/feathr/_synapse_submission.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
import re
import shlex
import time
import urllib.request
from pathlib import Path
Expand Down Expand Up @@ -71,7 +73,8 @@ def download_result(self, result_path: str, local_folder: str):

def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_class_name: str = None, arguments: List[str] = None,
python_files: List[str]= None, reference_files_path: List[str] = None, job_tags: Dict[str, str] = None,
configuration: Dict[str, str] = None):
configuration: Dict[str, str] = None,
properties: Dict[str, str] = None):
"""
Submits the feathr job
Refer to the Apache Livy doc for more details on the meaning of the parameters:
Expand All @@ -92,7 +95,12 @@ def submit_feathr_job(self, job_name: str, main_jar_path: str = None, main_clas
arguments (str): all the arguments you want to pass into the spark job
job_tags (str): tags of the job, for example you might want to put your user ID, or a tag with certain information
configuration (Dict[str, str]): Additional configs for the spark job
properties (Dict[str, str]): Additional System Properties for the spark job
"""

if properties is not None:
arguments.append("--system-properties=%s" % shlex.quote(json.dumps(properties)))

assert main_jar_path, 'main_jar_path should not be none or empty but it is none or empty.'
if main_jar_path.startswith('abfs'):
main_jar_cloud_path = main_jar_path
Expand Down
33 changes: 30 additions & 3 deletions feathr_project/feathr/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import json
import logging
import os
import tempfile
Expand Down Expand Up @@ -186,7 +187,10 @@ def _check_required_environment_variables_exist(self):

Some required information has to be set via environment variables so the client can work.
"""
for required_field in self.required_fields:
props = []
if hasattr(self, "system_properties"):
props = self.system_properties
for required_field in (self.required_fields + props):
if required_field not in os.environ:
raise RuntimeError(f'{required_field} is not set in environment variable. All required environment '
f'variables are: {self.required_fields}.')
Expand Down Expand Up @@ -235,6 +239,14 @@ def build_features(self, anchor_list: List[FeatureAnchor] = [], derived_feature_
self.anchor_list = anchor_list
self.derived_feature_list = derived_feature_list

# Check if data source used by every anchor requires additional system properties to be set
props = []
for anchor in self.anchor_list:
if hasattr(anchor.source, "get_required_properties"):
props.extend(anchor.source.get_required_properties())
if len(props)>0:
self.system_properties = props

def list_registered_features(self, project_name: str = None) -> List[str]:
"""List all the already registered features. If project_name is not provided or is None, it will return all
the registered features; otherwise it will only return features under this project
Expand Down Expand Up @@ -480,10 +492,11 @@ def _get_offline_features_with_config(self, feature_join_conf_path='feature_join
'--adls-config', self._get_adls_config_str(),
'--blob-config', self._get_blob_config_str(),
'--sql-config', self._get_sql_config_str(),
'--snowflake-config', self._get_snowflake_config_str()
'--snowflake-config', self._get_snowflake_config_str(),
],
reference_files_path=[],
configuration=execution_configuratons
configuration=execution_configuratons,
system_properties=self._get_system_properties()
)

def get_job_result_uri(self, block=True, timeout_sec=300) -> str:
Expand Down Expand Up @@ -581,6 +594,7 @@ def _materialize_features_with_config(self, feature_gen_conf_path: str = 'featur
],
reference_files_path=[],
configuration=execution_configuratons,
system_properties=self._get_system_properties()
)


Expand Down Expand Up @@ -687,6 +701,19 @@ def _get_snowflake_config_str(self):
""".format(JDBC_SF_URL=sf_url, JDBC_SF_USER=sf_user, JDBC_SF_PASSWORD=sf_password, JDBC_SF_ROLE=sf_role)
return config_str

def _get_system_properties(self):
    """Go through all data sources and fill all required system properties.

    Returns:
        A dict mapping each required system property name (collected on
        ``self.system_properties`` when features were built) to its value
        resolved from the environment, or ``None`` when no data source
        declared any required properties.
    """
    prop_and_value = {}
    if hasattr(self, "system_properties"):
        for prop in self.system_properties:
            # NOTE(review): presumably returns a default when the env var is
            # unset — confirm against EnvConfigReader's behavior.
            prop_and_value[prop] = self.envutils.get_environment_variable_with_default(prop)
        return prop_and_value
    return None

def get_features_from_registry(self, project_name):
    """ Sync features from the registry given a project name """
    # TODO - Add support for customized workspace path
    # Pulls the project's feature definitions into the current working directory.
    self.registry.get_features_from_registry(project_name, os.path.abspath("./"))
def _get_kafka_config_str(self):
"""Construct the Kafka config string. The endpoint, access key, secret key, and other parameters can be set via
environment variables."""
Expand Down
59 changes: 59 additions & 0 deletions feathr_project/feathr/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,65 @@ def __str__(self):
return str(self.preprocessing) + '\n' + self.to_feature_config()


class JdbcSource(Source):
    """A JDBC-backed data source.

    Exactly one of ``dbtable`` or ``query`` should identify the data to read.
    ``dbtable``, ``query`` and ``auth`` are stored as attributes only when
    provided: the feature-config template and the registry code test for their
    presence (Jinja ``is defined`` / ``hasattr``), so absence is meaningful.

    Attributes:
        name: unique name of this source.
        url: JDBC connection URL.
        dbtable: (optional) table to read.
        query: (optional) query whose result is read instead of a table.
        auth: (optional) "USERPASS" or "TOKEN"; credentials themselves are
            supplied at runtime via environment variables (see
            :meth:`get_required_properties`). When omitted the connection is
            anonymous.
        preprocessing: (optional) preprocessing UDF applied before anchoring.
        event_timestamp_column: (optional) timestamp column for time windows.
        timestamp_format: format of the timestamp column, default "epoch".
        registry_tags: (optional) tags recorded in the feature registry.
    """

    def __init__(self, name: str, url: str = "", dbtable: Optional[str] = None,
                 query: Optional[str] = None, auth: Optional[str] = None,
                 preprocessing: Optional[Callable] = None,
                 event_timestamp_column: Optional[str] = None,
                 timestamp_format: Optional[str] = "epoch",
                 registry_tags: Optional[Dict[str, str]] = None) -> None:
        super().__init__(name, event_timestamp_column, timestamp_format, registry_tags)
        self.preprocessing = preprocessing
        self.url = url
        # Set optional attributes only when given — downstream code relies on
        # attribute presence, not on a None value.
        if dbtable is not None:
            self.dbtable = dbtable
        if query is not None:
            self.query = query
        if auth is not None:
            self.auth = auth.upper()
            if self.auth not in ["USERPASS", "TOKEN"]:
                raise ValueError("auth must be None or one of following values: ['userpass', 'token']")

    def get_required_properties(self):
        """Return the environment-variable names this source needs at runtime.

        "USERPASS" auth requires ``<name>_USER`` and ``<name>_PASSWORD``;
        "TOKEN" auth requires ``<name>_TOKEN``; anonymous sources need none.
        """
        if not hasattr(self, "auth"):
            return []
        if self.auth == "USERPASS":
            return ["%s_USER" % self.name, "%s_PASSWORD" % self.name]
        elif self.auth == "TOKEN":
            return ["%s_TOKEN" % self.name]
        # Unreachable while __init__ validates `auth`, but keep the return
        # type consistently a list instead of falling through to None.
        return []

    def to_feature_config(self) -> str:
        """Render this source as a feathr HOCON feature-config snippet."""
        tm = Template("""
            {{source.name}}: {
                location: {
                    type: "jdbc"
                    url: "{{source.url}}"
                    {% if source.dbtable is defined %}
                    dbtable: "{{source.dbtable}}"
                    {% endif %}
                    {% if source.query is defined %}
                    query: "{{source.query}}"
                    {% endif %}
                    {% if source.auth is defined %}
                    {% if source.auth == "USERPASS" %}
                    user: "${{ "{" }}{{source.name}}_USER{{ "}" }}"
                    password: "${{ "{" }}{{source.name}}_PASSWORD{{ "}" }}"
                    {% else %}
                    useToken: true
                    token: "${{ "{" }}{{source.name}}_TOKEN{{ "}" }}"
                    {% endif %}
                    {% else %}
                    anonymous: true
                    {% endif %}
                }
                {% if source.event_timestamp_column is defined %}
                timeWindowParameters: {
                    timestampColumn: "{{source.event_timestamp_column}}"
                    timestampColumnFormat: "{{source.timestamp_format}}"
                }
                {% endif %}
            }
        """)
        msg = tm.render(source=self)
        return msg

    def __str__(self):
        return str(self.preprocessing) + '\n' + self.to_feature_config()


class KafkaConfig:
"""Kafka config for a streaming source
Attributes:
Expand Down
69 changes: 68 additions & 1 deletion feathr_project/test/test_feature_anchor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from feathr import FeatureAnchor
from feathr import Feature
from feathr import HdfsSource
from feathr.source import JdbcSource
from feathr import BOOLEAN, INT32, FLOAT, ValueType
from feathr import INPUT_CONTEXT
from feathr import WindowAggTransformation
Expand Down Expand Up @@ -167,4 +168,70 @@ def test_agg_anchor_to_config():
}
}
"""
assert ''.join(agg_anchor.to_feature_config().split()) == ''.join(expected_agg_feature_config.split())
assert ''.join(agg_anchor.to_feature_config().split()) == ''.join(expected_agg_feature_config.split())

def test_jdbc_source_to_config():
    """JdbcSource renders anonymous, userpass and token auth configs correctly."""

    def _minified(text):
        # Compare configs ignoring all whitespace differences.
        return ''.join(text.split())

    # Case 1: no auth given -> anonymous connection.
    anonymous_source = JdbcSource(name="nycTaxiBatchSource",
                                  url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                                  dbtable="table1",
                                  event_timestamp_column="c1")
    expected_anonymous_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            anonymous: true
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(anonymous_source.to_feature_config()) == _minified(expected_anonymous_config)

    # Case 2: userpass auth -> user/password placeholders derived from the name.
    userpass_source = JdbcSource(name="nycTaxiBatchSource",
                                 url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                                 dbtable="table1",
                                 auth="userpass",
                                 event_timestamp_column="c1")
    expected_userpass_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            user: "${nycTaxiBatchSource_USER}"
            password: "${nycTaxiBatchSource_PASSWORD}"
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(userpass_source.to_feature_config()) == _minified(expected_userpass_config)

    # Case 3: token auth -> useToken flag plus token placeholder.
    token_source = JdbcSource(name="nycTaxiBatchSource",
                              url="jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase",
                              dbtable="table1",
                              auth="token",
                              event_timestamp_column="c1")
    expected_token_config = """
    nycTaxiBatchSource: {
        location: {
            type: "jdbc"
            url: "jdbc:sqlserver://myserver.database.windows.net:1433;database=mydatabase"
            dbtable: "table1"
            useToken:true
            token: "${nycTaxiBatchSource_TOKEN}"
        }
        timeWindowParameters: {
            timestampColumn: "c1"
            timestampColumnFormat: "epoch"
        }
    }
    """
    assert _minified(token_source.to_feature_config()) == _minified(expected_token_config)
Loading