
Commit 73a0290

Merge pull request #19 from baskaranz/feature/kafkaio
KafkaIO implementation for feast
2 parents 3b17d7a + 85c853f commit 73a0290

File tree: 10 files changed (+527 -49 lines)

ingestion/pom.xml

Lines changed: 163 additions & 11 deletions

@@ -34,6 +34,7 @@
     <com.google.cloud.version>1.35.0</com.google.cloud.version>
     <grpcVersion>1.2.0</grpcVersion>
     <guice.version>4.1.0</guice.version>
+    <spring.kafka.version>2.2.2.RELEASE</spring.kafka.version>
   </properties>

   <build>
@@ -66,6 +67,125 @@
     </plugins>
   </build>

+  <profiles>
+    <profile>
+      <id>direct-runner</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.beam</groupId>
+          <artifactId>beam-runners-direct-java</artifactId>
+          <version>${org.apache.beam.version}</version>
+          <scope>runtime</scope>
+        </dependency>
+      </dependencies>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-shade-plugin</artifactId>
+            <version>3.2.1</version>
+            <executions>
+              <execution>
+                <phase>package</phase>
+                <goals>
+                  <goal>shade</goal>
+                </goals>
+                <configuration>
+                  <finalName>${project.artifactId}-direct</finalName>
+                  <transformers>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                      <mainClass>feast.ingestion.ImportJob</mainClass>
+                    </transformer>
+                  </transformers>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+    <profile>
+      <id>flink-runner</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.beam</groupId>
+          <artifactId>beam-runners-flink_2.11</artifactId>
+          <version>${org.apache.beam.version}</version>
+        </dependency>
+      </dependencies>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-shade-plugin</artifactId>
+            <version>3.2.1</version>
+            <executions>
+              <execution>
+                <phase>package</phase>
+                <goals>
+                  <goal>shade</goal>
+                </goals>
+                <configuration>
+                  <finalName>${project.artifactId}-flink</finalName>
+                  <transformers>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                      <mainClass>feast.ingestion.ImportJob</mainClass>
+                    </transformer>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
+                      <resource>reference.conf</resource>
+                    </transformer>
+                  </transformers>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+    <profile>
+      <id>dataflow-runner</id>
+      <activation>
+        <activeByDefault>true</activeByDefault>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.beam</groupId>
+          <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
+          <version>${org.apache.beam.version}</version>
+        </dependency>
+      </dependencies>
+      <build>
+        <plugins>
+          <plugin>
+            <groupId>org.apache.maven.plugins</groupId>
+            <artifactId>maven-shade-plugin</artifactId>
+            <version>3.2.1</version>
+            <executions>
+              <execution>
+                <phase>package</phase>
+                <goals>
+                  <goal>shade</goal>
+                </goals>
+                <configuration>
+                  <finalName>${project.artifactId}-dataflow</finalName>
+                  <transformers>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
+                    <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+                      <mainClass>feast.ingestion.ImportJob</mainClass>
+                    </transformer>
+                  </transformers>
+                </configuration>
+              </execution>
+            </executions>
+          </plugin>
+        </plugins>
+      </build>
+    </profile>
+  </profiles>
+
   <dependencies>
     <dependency>
       <groupId>org.hibernate.validator</groupId>
@@ -214,20 +334,19 @@

     <dependency>
       <groupId>org.apache.beam</groupId>
-      <artifactId>beam-sdks-java-io-jdbc</artifactId>
+      <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
       <version>${org.apache.beam.version}</version>
     </dependency>

-    <!-- Used for local execution (so not in test scope) -->
     <dependency>
       <groupId>org.apache.beam</groupId>
-      <artifactId>beam-runners-direct-java</artifactId>
+      <artifactId>beam-sdks-java-io-jdbc</artifactId>
       <version>${org.apache.beam.version}</version>
     </dependency>

     <dependency>
       <groupId>org.apache.beam</groupId>
-      <artifactId>beam-runners-google-cloud-dataflow-java</artifactId>
+      <artifactId>beam-sdks-java-io-kafka</artifactId>
       <version>${org.apache.beam.version}</version>
     </dependency>

@@ -306,12 +425,6 @@
       <version>42.2.5</version>
     </dependency>

-    <dependency>
-      <groupId>org.apache.beam</groupId>
-      <artifactId>beam-runners-flink_2.11</artifactId>
-      <version>${org.apache.beam.version}</version>
-    </dependency>
-
     <dependency>
       <groupId>com.github.kstyrc</groupId>
       <artifactId>embedded-redis</artifactId>
@@ -325,6 +438,45 @@
       <version>1.9.1</version>
       <scope>test</scope>
     </dependency>
-
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava</artifactId>
+      <version>26.0-jre</version>
+      <scope>compile</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.kafka</groupId>
+      <artifactId>kafka-clients</artifactId>
+      <version>2.0.0</version>
+    </dependency>
+    <dependency>
+      <groupId>org.springframework.boot</groupId>
+      <artifactId>spring-boot-starter-test</artifactId>
+      <version>2.1.1.RELEASE</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.springframework.kafka</groupId>
+      <artifactId>spring-kafka</artifactId>
+      <version>${spring.kafka.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.springframework.kafka</groupId>
+      <artifactId>spring-kafka-test</artifactId>
+      <version>${spring.kafka.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-runners-flink_2.11</artifactId>
+      <version>${org.apache.beam.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.beam</groupId>
+      <artifactId>beam-runners-direct-java</artifactId>
+      <version>${org.apache.beam.version}</version>
+      <scope>runtime</scope>
+    </dependency>
   </dependencies>
 </project>
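
Each profile above shades a different Beam runner into its own fat jar (ingestion-direct, ingestion-flink, ingestion-dataflow, per the <finalName> settings), with dataflow-runner active by default. The runner that actually executes the job is still chosen at launch time through Beam's standard --runner pipeline option. A minimal sketch, not part of this commit, with hypothetical launch arguments:

import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class RunnerSelectionSketch {
  public static void main(String[] args) {
    // Hypothetical launch arguments; the named runner class must be on the
    // classpath, which is exactly what the matching Maven profile guarantees.
    String[] launchArgs = {"--runner=DataflowRunner", "--jobName=feast-import"};
    PipelineOptions options = PipelineOptionsFactory.fromArgs(launchArgs).create();
    System.out.println("Selected runner: " + options.getRunner().getSimpleName());
  }
}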

ingestion/src/main/java/feast/ingestion/ImportJob.java

Lines changed: 10 additions & 7 deletions

@@ -18,6 +18,8 @@
 package feast.ingestion;

 import com.google.api.services.bigquery.model.TableRow;
+import com.google.api.services.dataflow.DataflowScopes;
+import com.google.auth.oauth2.GoogleCredentials;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
@@ -39,7 +41,6 @@
 import lombok.extern.slf4j.Slf4j;
 import org.apache.beam.runners.dataflow.DataflowPipelineJob;
 import org.apache.beam.runners.dataflow.DataflowRunner;
-import org.apache.beam.runners.flink.FlinkRunner;
 import org.apache.beam.sdk.Pipeline;
 import org.apache.beam.sdk.PipelineResult;
 import org.apache.beam.sdk.PipelineRunner;
@@ -60,6 +61,7 @@
 import org.joda.time.Duration;
 import org.slf4j.event.Level;

+import java.io.IOException;
 import java.util.Arrays;
 import java.util.Random;

@@ -104,13 +106,16 @@ public static void main(String[] args) {

   public static PipelineResult mainWithResult(String[] args) {
     log.info("Arguments: " + Arrays.toString(args));
-    ImportJobOptions options =
-        PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportJobOptions.class);
+    ImportJobOptions options = PipelineOptionsFactory.fromArgs(args).withValidation().as(ImportJobOptions.class);
     if (options.getJobName().isEmpty()) {
       options.setJobName(generateName());
     }
-    log.info(options.toString());
-
+    try {
+      options.setGcpCredential(GoogleCredentials.getApplicationDefault().createScoped(DataflowScopes.all()));
+    } catch (IOException e) {
+      log.error("Exception while setting GCP credential manually", e);
+    }
+    log.info("options: " + options.toString());
     ImportSpec importSpec = new ImportSpecSupplier(options).get();
     Injector injector =
         Guice.createInjector(new ImportJobModule(options, importSpec), new PipelineModule());
@@ -206,8 +211,6 @@ private String retrieveId(PipelineResult result) {
     Class<? extends PipelineRunner<?>> runner = options.getRunner();
     if (runner.isAssignableFrom(DataflowRunner.class)) {
       return ((DataflowPipelineJob) result).getJobId();
-    } else if (runner.isAssignableFrom(FlinkRunner.class)) {
-      throw new UnsupportedOperationException("Runner not yet supported.");
     } else {
       return this.options.getJobName();
     }
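
The new try block sets the Dataflow credential explicitly rather than relying on the runner's lazy default. A standalone sketch of that call (assuming google-auth-library and the generated Dataflow API client on the classpath, as this module now has): getApplicationDefault() resolves Application Default Credentials, and createScoped attaches the Dataflow OAuth scopes.

import com.google.api.services.dataflow.DataflowScopes;
import com.google.auth.oauth2.GoogleCredentials;

import java.io.IOException;

public class CredentialSketch {
  public static void main(String[] args) throws IOException {
    // ADC resolution order: the GOOGLE_APPLICATION_CREDENTIALS env var,
    // gcloud user credentials, then the GCE/GKE metadata server.
    GoogleCredentials credentials =
        GoogleCredentials.getApplicationDefault().createScoped(DataflowScopes.all());
    System.out.println("Scoped credential type: " + credentials.getClass().getSimpleName());
  }
}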
ingestion/src/main/java/feast/ingestion/deserializer/FeatureRowDeserializer.java

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+package feast.ingestion.deserializer;
+
+import com.google.protobuf.InvalidProtocolBufferException;
+import feast.types.FeatureRowProto.FeatureRow;
+import org.apache.kafka.common.errors.SerializationException;
+import org.apache.kafka.common.serialization.Deserializer;
+
+import java.util.Map;
+
+/**
+ * Kafka {@link Deserializer} that decodes Protocol Buffers payloads
+ * into {@link FeatureRow} messages.
+ */
+public class FeatureRowDeserializer implements Deserializer<FeatureRow> {
+
+  @Override
+  public void configure(Map<String, ?> configs, boolean isKey) {
+  }
+
+  @Override
+  public FeatureRow deserialize(String topic, byte[] data) {
+    try {
+      return FeatureRow.parseFrom(data);
+    } catch (InvalidProtocolBufferException e) {
+      throw new SerializationException("Error deserializing FeatureRow from Protobuf message", e);
+    }
+  }
+
+  @Override
+  public void close() {
+  }
+}
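
A minimal consumer sketch (not part of this commit) showing how these deserializers plug into the plain kafka-clients API that the pom now pulls in; the broker address, group id, and topic name are hypothetical:

import feast.ingestion.deserializer.FeatureRowDeserializer;
import feast.ingestion.deserializer.FeatureRowKeyDeserializer;
import feast.types.FeatureRowProto.FeatureRow;
import feast.types.FeatureRowProto.FeatureRowKey;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;

public class FeatureRowConsumerSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // hypothetical broker
    props.put(ConsumerConfig.GROUP_ID_CONFIG, "feast-ingestion");         // hypothetical group
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, FeatureRowKeyDeserializer.class.getName());
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, FeatureRowDeserializer.class.getName());

    try (KafkaConsumer<FeatureRowKey, FeatureRow> consumer = new KafkaConsumer<>(props)) {
      consumer.subscribe(Collections.singletonList("feature-rows")); // hypothetical topic
      ConsumerRecords<FeatureRowKey, FeatureRow> records = consumer.poll(Duration.ofSeconds(5));
      for (ConsumerRecord<FeatureRowKey, FeatureRow> record : records) {
        System.out.println(record.value()); // protobuf text form of the FeatureRow
      }
    }
  }
}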
ingestion/src/main/java/feast/ingestion/deserializer/FeatureRowKeyDeserializer.java

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+package feast.ingestion.deserializer;
+
+import com.google.protobuf.InvalidProtocolBufferException;
+import feast.types.FeatureRowProto.FeatureRowKey;
+import org.apache.kafka.common.errors.SerializationException;
+import org.apache.kafka.common.serialization.Deserializer;
+
+import java.util.Map;
+
+/**
+ * Kafka {@link Deserializer} that decodes Protocol Buffers payloads
+ * into {@link FeatureRowKey} messages.
+ */
+public class FeatureRowKeyDeserializer implements Deserializer<FeatureRowKey> {
+
+  @Override
+  public void configure(Map<String, ?> configs, boolean isKey) {
+  }
+
+  @Override
+  public FeatureRowKey deserialize(String topic, byte[] data) {
+    try {
+      return FeatureRowKey.parseFrom(data);
+    } catch (InvalidProtocolBufferException e) {
+      throw new SerializationException("Error deserializing FeatureRowKey from Protobuf message", e);
+    }
+  }
+
+  @Override
+  public void close() {
+  }
+}
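
With both deserializers in place, a Beam pipeline can presumably consume FeatureRow streams through the beam-sdks-java-io-kafka connector added to the pom. A hedged sketch, not the commit's own wiring: the broker and topic are hypothetical, and ProtoCoder (from beam-sdks-java-extensions-protobuf, an assumed extra dependency) pins the coders explicitly rather than relying on inference.

import feast.ingestion.deserializer.FeatureRowDeserializer;
import feast.ingestion.deserializer.FeatureRowKeyDeserializer;
import feast.types.FeatureRowProto.FeatureRow;
import feast.types.FeatureRowProto.FeatureRowKey;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.extensions.protobuf.ProtoCoder;
import org.apache.beam.sdk.io.kafka.KafkaIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class KafkaReadSketch {
  public static void main(String[] args) {
    Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

    // Unbounded read of keyed FeatureRow records; withoutMetadata() drops
    // Kafka offsets/partitions and yields plain key/value pairs.
    PCollection<KV<FeatureRowKey, FeatureRow>> rows = pipeline.apply(
        KafkaIO.<FeatureRowKey, FeatureRow>read()
            .withBootstrapServers("localhost:9092") // hypothetical broker
            .withTopic("feature-rows")              // hypothetical topic
            .withKeyDeserializerAndCoder(
                FeatureRowKeyDeserializer.class, ProtoCoder.of(FeatureRowKey.class))
            .withValueDeserializerAndCoder(
                FeatureRowDeserializer.class, ProtoCoder.of(FeatureRow.class))
            .withoutMetadata());

    pipeline.run();
  }
}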

ingestion/src/main/java/feast/ingestion/options/ImportJobOptions.java

Lines changed: 4 additions & 2 deletions

@@ -19,21 +19,23 @@

 import com.google.auto.service.AutoService;
 import java.util.Collections;
+import org.apache.beam.runners.flink.FlinkPipelineOptions;
+import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.metrics.MetricsSink;
 import org.apache.beam.sdk.options.Default;
 import org.apache.beam.sdk.options.Description;
 import org.apache.beam.sdk.options.PipelineOptions;
 import org.apache.beam.sdk.options.PipelineOptionsRegistrar;
 import org.apache.beam.sdk.options.Validation.Required;

-public interface ImportJobOptions extends PipelineOptions {
+public interface ImportJobOptions extends PipelineOptions, FlinkPipelineOptions, GcpOptions {
   @Description("Import spec yaml file path")
   @Required(groups = {"importSpec"})
   String getImportSpecYamlFile();

   void setImportSpecYamlFile(String value);

-  @Description("Import spec as native proto binary encoding conveted to Base64 string")
+  @Description("Import spec as native proto binary encoding converted to Base64 string")
   @Required(groups = {"importSpec"})
   String getImportSpecBase64();
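
Extending FlinkPipelineOptions and GcpOptions means a single ImportJobOptions instance now carries the runner-specific settings for every profile. A small sketch with hypothetical values:

import feast.ingestion.options.ImportJobOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsSketch {
  public static void main(String[] args) {
    // Parsed without withValidation(), so the required importSpec group may stay unset here.
    ImportJobOptions options = PipelineOptionsFactory.fromArgs(args).as(ImportJobOptions.class);
    options.setFlinkMaster("[auto]");     // inherited from FlinkPipelineOptions
    options.setProject("my-gcp-project"); // inherited from GcpOptions; hypothetical project id
    System.out.println(options);
  }
}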

ingestion/src/main/java/feast/ingestion/transform/FeatureEnums.java

Lines changed: 15 additions & 0 deletions

@@ -0,0 +1,15 @@
+package feast.ingestion.transform;
+
+public class FeatureEnums {
+  public enum InputSource {
+    FILE,
+    BIGQUERY,
+    PUBSUB,
+    KAFKA
+  }
+
+  public enum FileFormat {
+    CSV,
+    JSON
+  }
+}
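
A short sketch (not from this commit) of the kind of source dispatch these enums presumably back; the connector names are the standard Beam ones, but the mapping itself is hypothetical:

import feast.ingestion.transform.FeatureEnums.InputSource;

public class SourceDispatchSketch {
  // Maps an import spec's type string (e.g. "kafka") onto the enum and
  // branches on it; the returned descriptions are illustrative only.
  static String describe(String importSpecType) {
    InputSource source = InputSource.valueOf(importSpecType.toUpperCase());
    switch (source) {
      case KAFKA:
        return "streaming read via KafkaIO";
      case PUBSUB:
        return "streaming read via PubsubIO";
      case BIGQUERY:
        return "batch read via BigQueryIO";
      case FILE:
      default:
        return "batch read via TextIO";
    }
  }

  public static void main(String[] args) {
    System.out.println(describe("kafka"));
  }
}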
