@@ -20,6 +20,7 @@ def spark_streaming_to_pubsublite(
2020) -> None :
2121 # [START pubsublite_spark_streaming_to_pubsublite]
2222 from pyspark .sql import SparkSession
23+ from pyspark .sql .functions import array , create_map , col , lit , when
2324 from pyspark .sql .types import BinaryType , StringType
2425 import uuid
2526
@@ -35,13 +36,32 @@ def spark_streaming_to_pubsublite(
3536 # |-- value: long (nullable = true)
3637 sdf = spark .readStream .format ("rate" ).option ("rowsPerSecond" , 1 ).load ()
3738
39+ # Transform the dataframe to match the required data fields and data types:
40+ # https://github.com/googleapis/java-pubsublite-spark#data-schema
3841 sdf = (
39- sdf .withColumn ("key" , (sdf .value % 5 ).cast (StringType ()).cast (BinaryType ()))
40- .withColumn ("event_timestamp" , sdf .timestamp )
41- .withColumn ("data" , sdf .value .cast (StringType ()).cast (BinaryType ()))
42- .drop ("value" , "timestamp" )
42+ sdf .withColumn ("key" , lit ("example" ).cast (BinaryType ()))
43+ .withColumn ("data" , col ("value" ).cast (StringType ()).cast (BinaryType ()))
44+ .withColumnRenamed ("timestamp" , "event_timestamp" )
45+ # Populate the attributes field. For example, an even value will
46+ # have {"key1": [b"even"]}.
47+ .withColumn (
48+ "attributes" ,
49+ create_map (
50+ lit ("key1" ),
51+ array (when (col ("value" ) % 2 == 0 , b"even" ).otherwise (b"odd" )),
52+ ),
53+ )
54+ .drop ("value" )
4355 )
4456
57+ # After the transformation, the schema of the dataframe should look like:
58+ # |-- key: binary (nullable = false)
59+ # |-- data: binary (nullable = true)
60+ # |-- event_timestamp: timestamp (nullable = true)
61+ # |-- attributes: map (nullable = false)
62+ # | |-- key: string
63+ # | |-- value: array (valueContainsNull = false)
64+ # | | |-- element: binary (containsNull = false)
4565 sdf .printSchema ()
4666
4767 query = (
0 commit comments