Skip to content

Commit fe520a9

Browse files
Yanson authored and feast-ci-bot committed
Add retry options to BigQuery (#431)
* Add retry options to BigQuery

  Add two fields to job properties and use them to set retry options in all BigQuery job waitFor calls, as per Google example.

* Change timeout defaults and add config to e2e test to fix failure.
1 parent ba1c828 commit fe520a9

File tree

6 files changed

+90
-12
lines changed

6 files changed

+90
-12
lines changed

.prow/scripts/test-end-to-end-batch.sh

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -204,6 +204,8 @@ feast:
204204
jobs:
205205
staging-location: ${JOBS_STAGING_LOCATION}
206206
store-type: REDIS
207+
bigquery-initial-retry-delay-secs: 1
208+
bigquery-total-timeout-secs: 900
207209
store-options:
208210
host: localhost
209211
port: 6379

serving/src/main/java/feast/serving/FeastProperties.java

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -113,13 +113,23 @@ public void setRedisPoolMaxIdle(int redisPoolMaxIdle) {
113113

114114
public static class JobProperties {
115115
private String stagingLocation;
116+
private int bigqueryInitialRetryDelaySecs;
117+
private int bigqueryTotalTimeoutSecs;
116118
private String storeType;
117119
private Map<String, String> storeOptions;
118120

119121
public String getStagingLocation() {
120122
return this.stagingLocation;
121123
}
122124

125+
public int getBigqueryInitialRetryDelaySecs() {
126+
return bigqueryInitialRetryDelaySecs;
127+
}
128+
129+
public int getBigqueryTotalTimeoutSecs() {
130+
return bigqueryTotalTimeoutSecs;
131+
}
132+
123133
public String getStoreType() {
124134
return this.storeType;
125135
}
@@ -132,6 +142,14 @@ public void setStagingLocation(String stagingLocation) {
132142
this.stagingLocation = stagingLocation;
133143
}
134144

145+
public void setBigqueryInitialRetryDelaySecs(int bigqueryInitialRetryDelaySecs) {
146+
this.bigqueryInitialRetryDelaySecs = bigqueryInitialRetryDelaySecs;
147+
}
148+
149+
public void setBigqueryTotalTimeoutSecs(int bigqueryTotalTimeoutSecs) {
150+
this.bigqueryTotalTimeoutSecs = bigqueryTotalTimeoutSecs;
151+
}
152+
135153
public void setStoreType(String storeType) {
136154
this.storeType = storeType;
137155
}

serving/src/main/java/feast/serving/configuration/ServingServiceConfig.java

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,8 @@ public ServingService servingService(
132132
specService,
133133
jobService,
134134
jobStagingLocation,
135+
feastProperties.getJobs().getBigqueryInitialRetryDelaySecs(),
136+
feastProperties.getJobs().getBigqueryTotalTimeoutSecs(),
135137
storage);
136138
break;
137139
case CASSANDRA:

serving/src/main/java/feast/serving/service/BigQueryServingService.java

Lines changed: 30 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,7 @@
2020
import static feast.serving.store.bigquery.QueryTemplater.generateFullTableName;
2121
import static feast.serving.util.Metrics.requestLatency;
2222

23+
import com.google.cloud.RetryOption;
2324
import com.google.cloud.bigquery.BigQuery;
2425
import com.google.cloud.bigquery.BigQueryException;
2526
import com.google.cloud.bigquery.Field;
@@ -57,12 +58,12 @@
5758
import java.util.Optional;
5859
import java.util.UUID;
5960
import java.util.stream.Collectors;
60-
import org.joda.time.Duration;
6161
import org.slf4j.Logger;
62+
import org.threeten.bp.Duration;
6263

6364
public class BigQueryServingService implements ServingService {
6465

65-
public static final long TEMP_TABLE_EXPIRY_DURATION_MS = Duration.standardDays(1).getMillis();
66+
public static final long TEMP_TABLE_EXPIRY_DURATION_MS = Duration.ofDays(1).toMillis();
6667
private static final Logger log = org.slf4j.LoggerFactory.getLogger(BigQueryServingService.class);
6768

6869
private final BigQuery bigquery;
@@ -71,6 +72,8 @@ public class BigQueryServingService implements ServingService {
7172
private final CachedSpecService specService;
7273
private final JobService jobService;
7374
private final String jobStagingLocation;
75+
private final int initialRetryDelaySecs;
76+
private final int totalTimeoutSecs;
7477
private final Storage storage;
7578

7679
public BigQueryServingService(
@@ -80,13 +83,17 @@ public BigQueryServingService(
8083
CachedSpecService specService,
8184
JobService jobService,
8285
String jobStagingLocation,
86+
int initialRetryDelaySecs,
87+
int totalTimeoutSecs,
8388
Storage storage) {
8489
this.bigquery = bigquery;
8590
this.projectId = projectId;
8691
this.datasetId = datasetId;
8792
this.specService = specService;
8893
this.jobService = jobService;
8994
this.jobStagingLocation = jobStagingLocation;
95+
this.initialRetryDelaySecs = initialRetryDelaySecs;
96+
this.totalTimeoutSecs = totalTimeoutSecs;
9097
this.storage = storage;
9198
}
9299

@@ -156,6 +163,8 @@ public GetBatchFeaturesResponse getBatchFeatures(GetBatchFeaturesRequest getFeat
156163
.setEntityTableColumnNames(entityNames)
157164
.setFeatureSetInfos(featureSetInfos)
158165
.setJobStagingLocation(jobStagingLocation)
166+
.setInitialRetryDelaySecs(initialRetryDelaySecs)
167+
.setTotalTimeoutSecs(totalTimeoutSecs)
159168
.build())
160169
.start();
161170

@@ -199,7 +208,7 @@ private Table loadEntities(DatasetSource datasetSource) {
199208
loadJobConfiguration =
200209
loadJobConfiguration.toBuilder().setUseAvroLogicalTypes(true).build();
201210
Job job = bigquery.create(JobInfo.of(loadJobConfiguration));
202-
job.waitFor();
211+
waitForJob(job);
203212

204213
TableInfo expiry =
205214
bigquery
@@ -239,15 +248,15 @@ private TableId generateUUIDs(Table loadedEntityTable) {
239248
.setDestinationTable(TableId.of(projectId, datasetId, createTempTableName()))
240249
.build();
241250
Job queryJob = bigquery.create(JobInfo.of(queryJobConfig));
242-
queryJob.waitFor();
251+
Job completedJob = waitForJob(queryJob);
243252
TableInfo expiry =
244253
bigquery
245254
.getTable(queryJobConfig.getDestinationTable())
246255
.toBuilder()
247256
.setExpirationTime(System.currentTimeMillis() + TEMP_TABLE_EXPIRY_DURATION_MS)
248257
.build();
249258
bigquery.update(expiry);
250-
queryJobConfig = queryJob.getConfiguration();
259+
queryJobConfig = completedJob.getConfiguration();
251260
return queryJobConfig.getDestinationTable();
252261
} catch (InterruptedException | BigQueryException e) {
253262
throw Status.INTERNAL
@@ -257,6 +266,22 @@ private TableId generateUUIDs(Table loadedEntityTable) {
257266
}
258267
}
259268

269+
private Job waitForJob(Job queryJob) throws InterruptedException {
270+
Job completedJob = queryJob.waitFor(
271+
RetryOption.initialRetryDelay(Duration.ofSeconds(initialRetryDelaySecs)),
272+
RetryOption.totalTimeout(Duration.ofSeconds(totalTimeoutSecs)));
273+
if (completedJob == null) {
274+
throw Status.INTERNAL
275+
.withDescription("Job no longer exists")
276+
.asRuntimeException();
277+
} else if (completedJob.getStatus().getError() != null) {
278+
throw Status.INTERNAL
279+
.withDescription("Job failed: " + completedJob.getStatus().getError())
280+
.asRuntimeException();
281+
}
282+
return completedJob;
283+
}
284+
260285
public static String createTempTableName() {
261286
return "_" + UUID.randomUUID().toString().replace("-", "");
262287
}

serving/src/main/java/feast/serving/store/bigquery/BatchRetrievalQueryRunnable.java

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,7 @@
2121
import static feast.serving.store.bigquery.QueryTemplater.createTimestampLimitQuery;
2222

2323
import com.google.auto.value.AutoValue;
24+
import com.google.cloud.RetryOption;
2425
import com.google.cloud.bigquery.BigQuery;
2526
import com.google.cloud.bigquery.BigQueryException;
2627
import com.google.cloud.bigquery.DatasetId;
@@ -51,6 +52,7 @@
5152
import java.util.concurrent.Executors;
5253
import java.util.concurrent.TimeUnit;
5354
import java.util.concurrent.TimeoutException;
55+
import org.threeten.bp.Duration;
5456

5557
/**
5658
* BatchRetrievalQueryRunnable is a Runnable for running a BigQuery Feast batch retrieval job async.
@@ -96,6 +98,10 @@ public abstract class BatchRetrievalQueryRunnable implements Runnable {
9698

9799
public abstract String jobStagingLocation();
98100

101+
public abstract int initialRetryDelaySecs();
102+
103+
public abstract int totalTimeoutSecs();
104+
99105
public abstract Storage storage();
100106

101107
public static Builder builder() {
@@ -122,6 +128,10 @@ public abstract static class Builder {
122128

123129
public abstract Builder setJobStagingLocation(String jobStagingLocation);
124130

131+
public abstract Builder setInitialRetryDelaySecs(int initialRetryDelaySecs);
132+
133+
public abstract Builder setTotalTimeoutSecs(int totalTimeoutSecs);
134+
125135
public abstract Builder setStorage(Storage storage);
126136

127137
public abstract BatchRetrievalQueryRunnable build();
@@ -151,7 +161,7 @@ public void run() {
151161
ExtractJobConfiguration.of(
152162
queryConfig.getDestinationTable(), exportTableDestinationUri, "Avro");
153163
Job extractJob = bigquery().create(JobInfo.of(extractConfig));
154-
extractJob.waitFor();
164+
waitForJob(extractJob);
155165
} catch (BigQueryException | InterruptedException | IOException e) {
156166
jobService()
157167
.upsert(
@@ -200,7 +210,6 @@ private List<String> parseOutputFileURIs() {
200210

201211
Job runBatchQuery(List<String> featureSetQueries)
202212
throws BigQueryException, InterruptedException, IOException {
203-
Job queryJob;
204213
ExecutorService executorService = Executors.newFixedThreadPool(featureSetQueries.size());
205214
ExecutorCompletionService<FeatureSetInfo> executorCompletionService =
206215
new ExecutorCompletionService<>(executorService);
@@ -257,8 +266,8 @@ Job runBatchQuery(List<String> featureSetQueries)
257266
QueryJobConfiguration.newBuilder(joinQuery)
258267
.setDestinationTable(TableId.of(projectId(), datasetId(), createTempTableName()))
259268
.build();
260-
queryJob = bigquery().create(JobInfo.of(queryJobConfig));
261-
queryJob.waitFor();
269+
Job queryJob = bigquery().create(JobInfo.of(queryJobConfig));
270+
Job completedQueryJob = waitForJob(queryJob);
262271

263272
TableInfo expiry =
264273
bigquery()
@@ -268,7 +277,7 @@ Job runBatchQuery(List<String> featureSetQueries)
268277
.build();
269278
bigquery().update(expiry);
270279

271-
return queryJob;
280+
return completedQueryJob;
272281
}
273282

274283
private List<String> generateQueries(FieldValueList timestampLimits) {
@@ -302,7 +311,7 @@ private FieldValueList getTimestampLimits(String entityTableName) {
302311
.build();
303312
try {
304313
Job job = bigquery().create(JobInfo.of(getTimestampLimitsQuery));
305-
TableResult getTimestampLimitsQueryResult = job.waitFor().getQueryResults();
314+
TableResult getTimestampLimitsQueryResult = waitForJob(job).getQueryResults();
306315
TableInfo expiry =
307316
bigquery()
308317
.getTable(getTimestampLimitsQuery.getDestinationTable())
@@ -325,4 +334,21 @@ private FieldValueList getTimestampLimits(String entityTableName) {
325334
.asRuntimeException();
326335
}
327336
}
337+
338+
private Job waitForJob(Job queryJob) throws InterruptedException {
339+
Job completedJob = queryJob.waitFor(
340+
RetryOption.initialRetryDelay(Duration.ofSeconds(initialRetryDelaySecs())),
341+
RetryOption.totalTimeout(Duration.ofSeconds(totalTimeoutSecs())));
342+
if (completedJob == null) {
343+
throw Status.INTERNAL
344+
.withDescription("Job no longer exists")
345+
.asRuntimeException();
346+
} else if (completedJob.getStatus().getError() != null) {
347+
throw Status.INTERNAL
348+
.withDescription("Job failed: " + completedJob.getStatus().getError())
349+
.asRuntimeException();
350+
}
351+
return completedJob;
352+
}
353+
328354
}

serving/src/main/resources/application.yml

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,13 +26,18 @@ feast:
2626
redis-pool-max-idle: ${FEAST_REDIS_POOL_MAX_IDLE:16}
2727

2828
jobs:
29-
# job-staging-location specifies the URI to store intermediate files for batch serving.
29+
# staging-location specifies the URI to store intermediate files for batch serving.
3030
# Feast Serving client is expected to have read access to this staging location
3131
# to download the batch features.
3232
#
3333
# For example: gs://mybucket/myprefix
3434
# Please omit the trailing slash in the URI.
3535
staging-location: ${FEAST_JOB_STAGING_LOCATION:}
36+
#
37+
# Retry options for BigQuery jobs:
38+
bigquery-initial-retry-delay-secs: 1
39+
bigquery-total-timeout-secs: 21600
40+
#
3641
# Type of store to store job metadata. This only needs to be set if the
3742
# serving store type is Bigquery.
3843
store-type: ${FEAST_JOB_STORE_TYPE:}

0 commit comments

Comments
 (0)