Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 9 additions & 14 deletions sdk/python/feast/job.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
import tempfile
from typing import List
from urllib.parse import urlparse

import fastavro
import pandas as pd
from google.cloud import storage
from google.protobuf.json_format import MessageToJson

from feast.constants import CONFIG_TIMEOUT_KEY
Expand All @@ -23,9 +21,17 @@
from feast.serving.ServingService_pb2 import Job as JobProto
from feast.serving.ServingService_pb2_grpc import ServingServiceStub
from feast.source import Source
from feast.staging.storage_client import get_staging_client
from feast.wait import wait_retry_backoff
from tensorflow_metadata.proto.v0 import statistics_pb2

# Maximum number of seconds to wait until the retrieval job's status is DONE in Feast
# Currently set to the maximum query execution time limit in BigQuery
DEFAULT_TIMEOUT_SEC: int = 21600

# Maximum number of seconds to wait before reloading the job status in Feast
MAX_WAIT_INTERVAL_SEC: int = 60


class RetrievalJob:
"""
Expand All @@ -42,8 +48,6 @@ def __init__(
"""
self.job_proto = job_proto
self.serving_stub = serving_stub
# TODO: abstract away GCP dependency
self.gcs_client = storage.Client(project=None)

@property
def id(self):
Expand Down Expand Up @@ -117,16 +121,7 @@ def result(self, timeout_sec: int = int(defaults[CONFIG_TIMEOUT_KEY])):
"""
uris = self.get_avro_files(timeout_sec)
for file_uri in uris:
if file_uri.scheme == "gs":
file_obj = tempfile.TemporaryFile()
self.gcs_client.download_blob_to_file(file_uri.geturl(), file_obj)
elif file_uri.scheme == "file":
file_obj = open(file_uri.path, "rb")
else:
raise Exception(
f"Could not identify file URI {file_uri}. Only gs:// and file:// supported"
)

file_obj = get_staging_client(file_uri.scheme).download_file(file_uri)
file_obj.seek(0)
avro_reader = fastavro.reader(file_obj)

Expand Down
119 changes: 21 additions & 98 deletions sdk/python/feast/loaders/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,18 @@
# limitations under the License.

import os
import re
import shutil
import tempfile
import uuid
from datetime import datetime
from typing import List, Optional, Tuple, Union
from urllib.parse import ParseResult, urlparse
from urllib.parse import urlparse

import pandas as pd
from google.cloud import storage
from pandavro import to_avro

from feast.staging.storage_client import get_staging_client


def export_source_to_staging_location(
source: Union[pd.DataFrame, str], staging_location_uri: str
Expand All @@ -44,12 +44,14 @@ def export_source_to_staging_location(
* Pandas DataFrame
* Local Avro file
* GCS Avro file
* S3 Avro file


staging_location_uri (str):
Remote staging location where DataFrame should be written.
Examples:
* gs://bucket/path/
* s3://bucket/path/
* file:///data/subfolder/

Returns:
Expand All @@ -66,52 +68,37 @@ def export_source_to_staging_location(
uri_path = None # type: Optional[str]
if uri.scheme == "file":
uri_path = uri.path

# Remote gs staging location provided by serving
dir_path, file_name, source_path = export_dataframe_to_local(
df=source, dir_path=uri_path
)
elif urlparse(source).scheme in ["", "file"]:
# Local file provided as a source
dir_path = ""
file_name = os.path.basename(source)
source_path = os.path.abspath(
os.path.join(urlparse(source).netloc, urlparse(source).path)
)
elif urlparse(source).scheme == "gs":
# Google Cloud Storage path provided
input_source_uri = urlparse(source)
if "*" in source:
# Wildcard path
return _get_files(
bucket=str(input_source_uri.hostname), uri=input_source_uri
elif isinstance(source, str):
source_uri = urlparse(source)
if source_uri.scheme in ["", "file"]:
# Local file provided as a source
dir_path = ""
file_name = os.path.basename(source)
source_path = os.path.abspath(
os.path.join(source_uri.netloc, source_uri.path)
)
else:
return [source]
# gs, s3 file provided as a source.
return get_staging_client(source_uri.scheme).list_files(
bucket=source_uri.hostname, path=source_uri.path
)
else:
raise Exception(
f"Only string and DataFrame types are allowed as a "
f"source, {type(source)} was provided."
)

# Push data to required staging location
if uri.scheme == "gs":
# Staging location is a Google Cloud Storage path
upload_file_to_gcs(
source_path, str(uri.hostname), str(uri.path).strip("/") + "/" + file_name
)
elif uri.scheme == "file":
# Staging location is a file path
# Used for end-to-end test
pass
else:
raise Exception(
f"Staging location {staging_location_uri} does not have a "
f"valid URI. Only gs:// and file:// uri scheme are supported."
)
get_staging_client(uri.scheme).upload_file(
source_path, uri.hostname, str(uri.path).strip("/") + "/" + file_name,
)

# Clean up, remove local staging file
if isinstance(source, pd.DataFrame) and len(str(dir_path)) > 4:
if dir_path and isinstance(source, pd.DataFrame) and len(dir_path) > 4:
shutil.rmtree(dir_path)

return [staging_location_uri.rstrip("/") + "/" + file_name]
Expand Down Expand Up @@ -162,70 +149,6 @@ def export_dataframe_to_local(
return dir_path, file_name, dest_path


def upload_file_to_gcs(local_path: str, bucket: str, remote_path: str) -> None:
    """
    Upload a single local file to Google Cloud Storage (GCS).

    Args:
        local_path (str):
            Path of the file on the local filesystem.

        bucket (str):
            Name of the destination GCS bucket.

        remote_path (str):
            Destination object path (key) within the bucket, including
            the file name.

    Returns:
        None:
            None
    """

    client = storage.Client(project=None)
    destination_bucket = client.get_bucket(bucket)
    destination_bucket.blob(remote_path).upload_from_filename(local_path)


def _get_files(bucket: str, uri: ParseResult) -> List[str]:
    """
    List all available files within a Google storage bucket that match a
    wildcard path.

    Args:
        bucket (str):
            Google Storage bucket to reference.

        uri (urllib.parse.ParseResult):
            Wild card uri path containing the "*" character.
            Example:
                * gs://feast/staging_location/*
                * gs://feast/staging_location/file_*.avro

    Returns:
        List[str]:
            List of all available files matching the wildcard path.

    Raises:
        Exception: If the uri path does not contain a "*" wildcard.
    """

    path = uri.path
    if "*" not in path:
        raise Exception(f"{path} is not a wildcard path")

    # Translate the glob into a regex. Each literal segment is escaped so
    # regex metacharacters in the path (e.g. the "." in ".avro") match
    # literally; each "*" becomes a non-greedy ".*?".
    regex = re.compile(
        ".*?".join(re.escape(part) for part in path.strip("/").split("*"))
    )

    storage_client = storage.Client(project=None)
    bucket_storage = storage_client.get_bucket(bucket)
    blob_list = bucket_storage.list_blobs(
        prefix=path.strip("/").split("*")[0], delimiter="/"
    )

    # fullmatch anchors both ends of the pattern so trailing extra
    # characters (e.g. "file_1.avro.tmp" against "file_*.avro") are
    # rejected. The "name not in path" check excludes the directory
    # placeholder blob itself (file path must be longer than path).
    return [
        f"{uri.scheme}://{uri.hostname}/{name}"
        for name in (blob.name for blob in blob_list)
        if regex.fullmatch(name) and name not in path
    ]


def _get_file_name() -> str:
"""
Create a random file name.
Expand Down
Empty file.
Loading