Skip to content

Commit d9078b5

Browse files
feat(pubsublite): attributes field in sink example (GoogleCloudPlatform#7405)
1 parent 266fa38 commit d9078b5

File tree

1 file changed

+24
-4
lines changed

1 file changed

+24
-4
lines changed

pubsublite/spark-connector/spark_streaming_to_pubsublite_example.py

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ def spark_streaming_to_pubsublite(
2020
) -> None:
2121
# [START pubsublite_spark_streaming_to_pubsublite]
2222
from pyspark.sql import SparkSession
23+
from pyspark.sql.functions import array, create_map, col, lit, when
2324
from pyspark.sql.types import BinaryType, StringType
2425
import uuid
2526

@@ -35,13 +36,32 @@ def spark_streaming_to_pubsublite(
3536
# |-- value: long (nullable = true)
3637
sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()
3738

39+
# Transform the dataframe to match the required data fields and data types:
40+
# https://github.com/googleapis/java-pubsublite-spark#data-schema
3841
sdf = (
39-
sdf.withColumn("key", (sdf.value % 5).cast(StringType()).cast(BinaryType()))
40-
.withColumn("event_timestamp", sdf.timestamp)
41-
.withColumn("data", sdf.value.cast(StringType()).cast(BinaryType()))
42-
.drop("value", "timestamp")
42+
sdf.withColumn("key", lit("example").cast(BinaryType()))
43+
.withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
44+
.withColumnRenamed("timestamp", "event_timestamp")
45+
# Populate the attributes field. For example, an even value will
46+
# have {"key1": [b"even"]}.
47+
.withColumn(
48+
"attributes",
49+
create_map(
50+
lit("key1"),
51+
array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
52+
),
53+
)
54+
.drop("value")
4355
)
4456

57+
# After the transformation, the schema of the dataframe should look like:
58+
# |-- key: binary (nullable = false)
59+
# |-- data: binary (nullable = true)
60+
# |-- event_timestamp: timestamp (nullable = true)
61+
# |-- attributes: map (nullable = false)
62+
# | |-- key: string
63+
# | |-- value: array (valueContainsNull = false)
64+
# | | |-- element: binary (containsNull = false)
4565
sdf.printSchema()
4666

4767
query = (

0 commit comments

Comments
 (0)