Merged
python/feast_spark/client.py (49 changes: 42 additions & 7 deletions)

@@ -1,24 +1,21 @@
 import configparser
 import os
 import uuid
-from datetime import datetime
+from datetime import datetime, timedelta
 from itertools import groupby
 from typing import Dict, List, Optional, Union, cast

 import pandas as pd
 import redis
 from croniter import croniter
+from google.cloud import bigquery

 import feast
 from feast.config import Config
 from feast.constants import ConfigOptions as feast_opt
 from feast.data_source import BigQuerySource, FileSource
 from feast.grpc.grpc import create_grpc_channel
-from feast.staging.entities import (
-    stage_entities_to_bq,
-    stage_entities_to_fs,
-    table_reference_from_string,
-)
+from feast.staging.entities import stage_entities_to_fs, table_reference_from_string
 from feast_spark.api.JobService_pb2 import (
     GetHealthMetricsRequest,
     GetHistoricalFeaturesRequest,
@@ -51,6 +48,44 @@
 )

Review comment from a Contributor, on the function added below: Can we avoid duplication by adding partition as an optional argument in the existing feast.staging.entities file?

Reply from the Collaborator (Author): The issue is that we would have to make this change in the feast repository, and we are having issues publishing the Python SDK for branch 0.9 at the moment.

+def stage_entities_to_bq_with_partition(
+    entity_source: pd.DataFrame, project: str, dataset: str
+) -> BigQuerySource:
+    """
+    Stores given (entity) dataframe as new table in BQ. Name of the table generated based on current time.
+    Table will expire in 1 day.
+    Returns BigQuerySource with reference to created table.
+    """
+
+    bq_client: bigquery.Client = bigquery.Client()
+    destination = bigquery.TableReference(
+        bigquery.DatasetReference(project, dataset),
+        f"_entities_{datetime.now():%Y%m%d%H%M%s}",
+    )
+
+    # prevent casting ns -> ms exception inside pyarrow
+    entity_source["event_timestamp"] = entity_source["event_timestamp"].dt.floor("ms")
+
+    load_job_config = bigquery.LoadJobConfig(
+        time_partitioning=bigquery.TimePartitioning(
+            type_=bigquery.TimePartitioningType.DAY, field="event_timestamp",
+        )
+    )
+    load_job: bigquery.LoadJob = bq_client.load_table_from_dataframe(
+        entity_source, destination, job_config=load_job_config,
+    )
+    load_job.result()  # wait until complete
+
+    dest_table: bigquery.Table = bq_client.get_table(destination)
+    dest_table.expires = datetime.now() + timedelta(days=1)
+    bq_client.update_table(dest_table, fields=["expires"])
+
+    return BigQuerySource(
+        event_timestamp_column="event_timestamp",
+        table_ref=f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
+    )
+
+
 class Client:
     _feast: feast.Client

@@ -197,7 +232,7 @@ def get_historical_features(
             staging_bq_project = source_ref.project
             staging_bq_dataset = source_ref.dataset_id

-            entity_source = stage_entities_to_bq(
+            entity_source = stage_entities_to_bq_with_partition(
                 entity_source, staging_bq_project, staging_bq_dataset
             )
         else:
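The Contributor's suggestion above would fold the partitioning into the existing feast.staging.entities.stage_entities_to_bq helper rather than duplicating it in this client. A minimal sketch of that alternative, assuming a hypothetical partition_by keyword argument (no such parameter exists in the released feast SDK):

from datetime import datetime, timedelta
from typing import Optional

import pandas as pd
from google.cloud import bigquery

from feast.data_source import BigQuerySource


def stage_entities_to_bq(
    entity_source: pd.DataFrame,
    project: str,
    dataset: str,
    partition_by: Optional[str] = None,  # hypothetical, e.g. "event_timestamp"
) -> BigQuerySource:
    client = bigquery.Client()
    destination = bigquery.TableReference(
        bigquery.DatasetReference(project, dataset),
        f"_entities_{datetime.now():%Y%m%d%H%M%S}",  # timestamped staging table
    )

    load_job_config = None
    if partition_by is not None:
        # Floor to ms so pyarrow does not raise on the ns -> ms cast,
        # then day-partition the staging table on the given column.
        entity_source[partition_by] = entity_source[partition_by].dt.floor("ms")
        load_job_config = bigquery.LoadJobConfig(
            time_partitioning=bigquery.TimePartitioning(
                type_=bigquery.TimePartitioningType.DAY, field=partition_by,
            )
        )

    client.load_table_from_dataframe(
        entity_source, destination, job_config=load_job_config
    ).result()  # wait until the load completes

    # Staging tables are temporary; let BQ drop them after a day.
    table = client.get_table(destination)
    table.expires = datetime.now() + timedelta(days=1)
    client.update_table(table, fields=["expires"])

    return BigQuerySource(
        event_timestamp_column="event_timestamp",
        table_ref=f"{destination.project}:{destination.dataset_id}.{destination.table_id}",
    )

The call site in get_historical_features would then pass partition_by="event_timestamp" instead of switching to a separate helper.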
python/feast_spark/job_service.py (10 changes: 9 additions & 1 deletion)

@@ -60,6 +60,7 @@
     start_stream_to_online_ingestion,
     unschedule_offline_to_online_ingestion,
 )
+from feast_spark.pyspark.launchers.k8s.k8s import JobNotFoundException
 from feast_spark.third_party.grpc.health.v1.HealthService_pb2 import (
     HealthCheckResponse,
     ServingStatus,
@@ -437,9 +438,16 @@ def ensure_stream_ingestion_jobs(client: Client, all_projects: bool):
                opt.JOB_SERVICE_RETRY_FAILED_JOBS
            )
        ):
+            status = None
+            try:
+                status = job.get_status()
+            except JobNotFoundException:
+                logger.warning(f"{job.get_id()} was already removed")
+
            if (
                isinstance(job, StreamIngestionJob)
-                and job.get_status() != SparkJobStatus.COMPLETED
+                and status is not None
+                and status != SparkJobStatus.COMPLETED
            ):
                jobs_by_hash[job.get_hash()] = job
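The guarded lookup above tolerates jobs whose Kubernetes resource has already been deleted. Read as a standalone helper, the pattern looks like this sketch (get_status_or_none is a hypothetical name, not code from the PR):

import logging

from feast_spark.pyspark.launchers.k8s.k8s import JobNotFoundException

logger = logging.getLogger(__name__)


def get_status_or_none(job):
    """Return job.get_status(), or None when the underlying Kubernetes
    resource is already gone, so the reconciliation loop can skip the
    job instead of crashing."""
    try:
        return job.get_status()
    except JobNotFoundException:
        logger.warning(f"{job.get_id()} was already removed")
        return None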
python/feast_spark/pyspark/launchers/k8s/k8s.py (13 changes: 10 additions & 3 deletions)

@@ -74,6 +74,10 @@ def _truncate_label(label: str) -> str:
     return label[:63]


+class JobNotFoundException(Exception):
+    pass
+
+
 class KubernetesJobMixin:
     def __init__(self, api: CustomObjectsApi, namespace: str, job_id: str):
         self._api = api

@@ -85,17 +89,20 @@ def get_id(self) -> str:

     def get_error_message(self) -> str:
         job = _get_job_by_id(self._api, self._namespace, self._job_id)
-        assert job is not None
+        if job is None:
+            raise JobNotFoundException()
         return job.job_error_message

     def get_status(self) -> SparkJobStatus:
         job = _get_job_by_id(self._api, self._namespace, self._job_id)
-        assert job is not None
+        if job is None:
+            raise JobNotFoundException
         return job.state

     def get_start_time(self) -> datetime:
         job = _get_job_by_id(self._api, self._namespace, self._job_id)
-        assert job is not None
+        if job is None:
+            raise JobNotFoundException
         return job.start_time

     def cancel(self):
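One way to exercise the new None handling is to stub _get_job_by_id and assert that the exception surfaces; a test sketch under those assumptions (not part of the PR):

from unittest.mock import MagicMock, patch

import pytest

from feast_spark.pyspark.launchers.k8s.k8s import (
    JobNotFoundException,
    KubernetesJobMixin,
)


def test_get_status_raises_for_deleted_job():
    job = KubernetesJobMixin(api=MagicMock(), namespace="spark", job_id="gone")
    # Simulate the CRD lookup finding no such job in the namespace.
    with patch(
        "feast_spark.pyspark.launchers.k8s.k8s._get_job_by_id", return_value=None
    ):
        with pytest.raises(JobNotFoundException):
            job.get_status()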