Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions sdk/python/feast/job.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import tempfile
from typing import List
from urllib.parse import urlparse

import fastavro
import pandas as pd
from google.cloud import storage
from google.protobuf.json_format import MessageToJson

from feast.constants import CONFIG_TIMEOUT_KEY
Expand All @@ -23,9 +21,17 @@
from feast.serving.ServingService_pb2 import Job as JobProto
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
from feast.source import Source
from feast.staging.storage_client import get_staging_client
from feast.wait import wait_retry_backoff
from tensorflow_metadata.proto.v0 import statistics_pb2

# Maximum number of seconds to wait until the retrieval job's status is DONE in Feast
# Currently set to the maximum query execution time limit in BigQuery
DEFAULT_TIMEOUT_SEC: int = 21600

# Maximum number of seconds to wait before reloading the job status in Feast
MAX_WAIT_INTERVAL_SEC: int = 60


class RetrievalJob:
"""
Expand All @@ -42,8 +48,6 @@ def __init__(
"""
self.job_proto = job_proto
self.serving_stub = serving_stub
# TODO: abstract away GCP dependency
self.gcs_client = storage.Client(project=None)

@property
def id(self):
Expand Down Expand Up @@ -117,16 +121,7 @@ def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
"""
uris = self.get_avro_files(timeout_sec)
for file_uri in uris:
if file_uri.scheme == "gs":
file_obj = tempfile.TemporaryFile()
self.gcs_client.download_blob_to_file(file_uri.geturl(), file_obj)
elif file_uri.scheme == "file":
file_obj = open(file_uri.path, "rb")
else:
raise Exception(
f"Could not identify file URI {file_uri}. Only gs:// and file:// supported"
)

file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
file_obj.seek(0)
avro_reader = fastavro.reader(file_obj)

Expand Down
119 changes: 21 additions & 98 deletions sdk/python/feast/loaders/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@
# limitations under the License.

import os
import re
import shutil
import tempfile
import uuid
from datetime import datetime
from typing import List, Optional, Tuple, Union
from urllib.parse import ParseResult, urlparse
from urllib.parse import urlparse

import pandas as pd
from google.cloud import storage
from pandavro import to_avro

from feast.staging.storage_client import get_staging_client


def export_source_to_staging_location(
source: Union[pd.DataFrame, str], staging_location_uri: str
Expand All @@ -44,12 +44,14 @@ def export_source_to_staging_location(
* Pandas DataFrame
* Local Avro file
* GCS Avro file
* S3 Avro file


staging_location_uri (str):
Remote staging location where DataFrame should be written.
Examples:
* gs://bucket/path/
* s3://bucket/path/
* file:///data/subfolder/

Returns:
Expand All @@ -66,52 +68,37 @@ def export_source_to_staging_location(
uri_path = None # type: Optional[str]
if uri.scheme == "file":
uri_path = uri.path

# Remote gs staging location provided by serving
dir_path, file_name, source_path = export_dataframe_to_local(
df=source, dir_path=uri_path
)
elif urlparse(source).scheme in ["", "file"]:
# Local file provided as a source
dir_path = ""
file_name = os.path.basename(source)
source_path = os.path.abspath(
os.path.join(urlparse(source).netloc, urlparse(source).path)
)
elif urlparse(source).scheme == "gs":
# Google Cloud Storage path provided
input_source_uri = urlparse(source)
if "*" in source:
# Wildcard path
return _get_files(
bucket=str(input_source_uri.hostname), uri=input_source_uri
elif isinstance(source, str):
source_uri = urlparse(source)
if source_uri.scheme in ["", "file"]:
# Local file provided as a source
dir_path = ""
file_name = os.path.basename(source)
source_path = os.path.abspath(
os.path.join(source_uri.netloc, source_uri.path)
)
else:
return [source]
# gs, s3 file provided as a source.
return get_staging_client(source_uri.scheme).list_files(
bucket=source_uri.hostname, path=source_uri.path
)
else:
raise Exception(
f"Only string and DataFrame types are allowed as a "
f"source, {type(source)} was provided."
)

# Push data to required staging location
if uri.scheme == "gs":
# Staging location is a Google Cloud Storage path
upload_file_to_gcs(
source_path, str(uri.hostname), str(uri.path).strip("/") + "/" + file_name
)
elif uri.scheme == "file":
# Staging location is a file path
# Used for end-to-end test
pass
else:
raise Exception(
f"Staging location {staging_location_uri} does not have a "
f"valid URI. Only gs:// and file:// uri scheme are supported."
)
get_staging_client(uri.scheme).upload_file(
source_path, uri.hostname, str(uri.path).strip("/") + "/" + file_name,
)

# Clean up, remove local staging file
if isinstance(source, pd.DataFrame) and len(str(dir_path)) > 4:
if dir_path and isinstance(source, pd.DataFrame) and len(dir_path) > 4:
shutil.rmtree(dir_path)

return [staging_location_uri.rstrip("/") + "/" + file_name]
Expand Down Expand Up @@ -162,70 +149,6 @@ def export_dataframe_to_local(
return dir_path, file_name, dest_path


def upload_file_to_gcs(local_path: str, bucket: str, remote_path: str) -> None:
    """
    Upload a single local file to Google Cloud Storage (GCS).

    Args:
        local_path (str):
            Path of the file on the local filesystem.

        bucket (str):
            Name of the destination GCS bucket.

        remote_path (str):
            Destination object path (key) within the bucket, including
            the file name.

    Returns:
        None:
            None
    """

    client = storage.Client(project=None)
    destination_bucket = client.get_bucket(bucket)
    destination_bucket.blob(remote_path).upload_from_filename(local_path)


def _get_files(bucket: str, uri: ParseResult) -> List[str]:
    """
    List all available files within a Google storage bucket that match a
    wildcard path.

    Args:
        bucket (str):
            Google Storage bucket to reference.

        uri (urllib.parse.ParseResult):
            Wild card uri path containing the "*" character.
            Example:
                * gs://feast/staging_location/*
                * gs://feast/staging_location/file_*.avro

    Returns:
        List[str]:
            List of all available files matching the wildcard path.

    Raises:
        Exception: If the uri path does not contain a "*" wildcard.
    """

    path = uri.path
    if "*" not in path:
        raise Exception(f"{path} is not a wildcard path")

    # Translate the glob into a regex. Each literal segment is escaped so
    # regex metacharacters in the path (e.g. the "." in ".avro") match
    # literally; each "*" becomes a non-greedy ".*?".
    regex = re.compile(
        ".*?".join(re.escape(part) for part in path.strip("/").split("*"))
    )

    storage_client = storage.Client(project=None)
    bucket_storage = storage_client.get_bucket(bucket)
    blob_list = bucket_storage.list_blobs(
        prefix=path.strip("/").split("*")[0], delimiter="/"
    )

    # fullmatch anchors both ends of the pattern so trailing extra
    # characters (e.g. "file_1.avro.tmp" against "file_*.avro") are
    # rejected. The "name not in path" check excludes the directory
    # placeholder blob itself (file path must be longer than path).
    return [
        f"{uri.scheme}://{uri.hostname}/{name}"
        for name in (blob.name for blob in blob_list)
        if regex.fullmatch(name) and name not in path
    ]


def _get_file_name() -> str:
"""
Create a random file name.
Expand Down
Empty file.
Loading