# test_kafka_avro.py
import unittest
from tests import TEST_CLIENT
from feldera import PipelineBuilder
import time
import os
from confluent_kafka.admin import AdminClient
import requests
import re


def env(name: str, default: str) -> str:
    """Get an environment variable for the Kafka broker or the Schema Registry.

    The default values are only meant for internal development; external users must set them."""
    return os.getenv(name, default)


# Set these before running the test.
# Example (terminal/shell):
#   export KAFKA_BOOTSTRAP_SERVERS=localhost:9092
#   export SCHEMA_REGISTRY_URL=http://localhost:8081
KAFKA_BOOTSTRAP = env(
    "KAFKA_BOOTSTRAP_SERVERS", "ci-kafka-bootstrap.korat-vibes.ts.net:9094"
)
SCHEMA_REGISTRY = env(
    "SCHEMA_REGISTRY_URL", "http://ci-schema-registry.korat-vibes.ts.net"
)


def extract_kafka_avro_artifacts(sql: str) -> tuple[list[str], list[str]]:
    """Extract Kafka topics and schema subjects from the SQL program."""
    topics = re.findall(r'"topic"\s*:\s*"([^"]+)"', sql)
    subjects = re.findall(r"create view\s+(\w+)", sql, re.I) + re.findall(
        r"create index\s+(\w+)", sql, re.I
    )
    return list(set(topics)), list(set(subjects))
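
# Illustrative note (not executed by the test): for the SQL program defined in
# TestKafkaAvro.test_check_avro below, extract_kafka_avro_artifacts() would return
#   topics   -> ["my_topic_avro", "my_topic_avro2"]
#   subjects -> ["v", "t_index"]
# (element order is not guaranteed, since sets are used for de-duplication).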


def delete_kafka_topics(bootstrap_servers: str, topics: list[str]):
    admin = AdminClient({"bootstrap.servers": bootstrap_servers})
    # delete_topics() returns a dict mapping each topic name to a future that
    # completes once the broker acknowledges the deletion.
    futures = admin.delete_topics(topics)
    for topic, future in futures.items():
        try:
            future.result()
            print(f"Deleted topic: {topic}")
        except Exception as e:
            print(f"Failed to delete {topic}: {e}")


def delete_schema_subjects(registry_url: str, subjects: list[str]):
    for subject in subjects:
        r = requests.delete(f"{registry_url}/subjects/{subject}")
        print(
            f"Deleted schema subject: {subject}"
            if r.status_code == 200
            else f"Failed to delete {subject}: {r.status_code} {r.text}"
        )


def cleanup_kafka(sql: str, bootstrap_servers: str, registry_url: str):
    """Clean up Kafka topics and Schema Registry subjects after each test run.

    Each run produces new records, so rerunning without cleanup would append data to the same topic(s)."""
    topics, subjects = extract_kafka_avro_artifacts(sql)
    delete_kafka_topics(bootstrap_servers, topics)
    delete_schema_subjects(registry_url, subjects)
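
# Optional sanity check (illustrative only, not used by the test): after cleanup,
# the topics should eventually disappear from the broker metadata. Topic deletion
# is asynchronous, so a short delay may be needed before checking, e.g.:
#
#   admin = AdminClient({"bootstrap.servers": KAFKA_BOOTSTRAP})
#   assert "my_topic_avro" not in admin.list_topics(timeout=10).topics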

# Number of records for the datagen connector to generate.
LIMIT = 1000000


class TestKafkaAvro(unittest.TestCase):
    def test_check_avro(self):
        sql = f"""
        create table t (
            id int,
            str varchar,
            dec decimal,
            reall real,
            dbl double,
            booll boolean,
            tmestmp timestamp,
            datee date,
            tme time
        ) with (
            'materialized' = 'true',
            'connectors' = '[{{
                "transport": {{
                    "name": "datagen",
                    "config": {{ "plan": [{{"limit": {LIMIT}}}], "seed": 1 }}
                }}
            }}]'
        );

        create view v
        with (
            'connectors' = '[{{
                "transport": {{
                    "name": "kafka_output",
                    "config": {{
                        "bootstrap.servers": "{KAFKA_BOOTSTRAP}",
                        "topic": "my_topic_avro"
                    }}
                }},
                "format": {{
                    "name": "avro",
                    "config": {{
                        "update_format": "raw",
                        "registry_urls": ["{SCHEMA_REGISTRY}"]
                    }}
                }}
            }},
            {{
                "index": "t_index",
                "transport": {{
                    "name": "kafka_output",
                    "config": {{
                        "bootstrap.servers": "{KAFKA_BOOTSTRAP}",
                        "topic": "my_topic_avro2"
                    }}
                }},
                "format": {{
                    "name": "avro",
                    "config": {{
                        "update_format": "raw",
                        "registry_urls": ["{SCHEMA_REGISTRY}"]
                    }}
                }}
            }}]'
        )
        as select * from t;

        create index t_index on v(id);

        create table loopback (
            id int,
            str varchar,
            dec decimal,
            reall real,
            dbl double,
            booll boolean,
            tmestmp timestamp,
            datee date,
            tme time
        ) with (
            'materialized' = 'true',
            'connectors' = '[{{
                "transport": {{
                    "name": "kafka_input",
                    "config": {{
                        "topic": "my_topic_avro2",
                        "start_from": "earliest",
                        "bootstrap.servers": "{KAFKA_BOOTSTRAP}"
                    }}
                }},
                "format": {{
                    "name": "avro",
                    "config": {{
                        "update_format": "raw",
                        "registry_urls": ["{SCHEMA_REGISTRY}"]
                    }}
                }}
            }}]'
        );
        """
        pipeline = PipelineBuilder(
            TEST_CLIENT,
            "test_kafka_avro",
            sql=sql,
        ).create_or_replace()

        try:
            pipeline.start()

            # NOTE: total_completed_records counts every row processed through each output:
            #   1. rows written by the view `v` to Kafka, and
            #   2. rows ingested into the loopback table from Kafka.
            # Thus, expected_records = generated_rows * number_of_outputs (2 in this case).
            expected_records = LIMIT * 2
            timeout_s = 1800
            poll_interval_s = 5
            start_time = time.perf_counter()

            # Poll `total_completed_records` every `poll_interval_s` seconds until it reaches `expected_records`.
            while True:
                stats = TEST_CLIENT.get_pipeline_stats(pipeline.name)
                completed = stats["global_metrics"]["total_completed_records"]
                print(f"Processed {completed}/{expected_records} rows so far...")
                if completed >= expected_records:
                    break
                # Prevent infinite polling.
                if time.perf_counter() - start_time > timeout_s:
                    raise AssertionError(
                        f"Timeout: only {completed}/{expected_records} rows processed"
                    )
                time.sleep(poll_interval_s)

            elapsed = time.perf_counter() - start_time
            print(
                f"All {completed}/{expected_records} rows processed in {elapsed:.3f}s"
            )

            # Validation: once finished, the loopback table should contain every generated value.
            # Compare a hash of the source table `t` with a hash of the loopback table.
            expected_hash = pipeline.query_hash("SELECT * FROM t ORDER BY id, str")
            result_hash = pipeline.query_hash("SELECT * FROM loopback ORDER BY id, str")
            assert result_hash == expected_hash, (
                f"Validation failed: loopback table hash mismatch!\n"
                f"Expected: {expected_hash}\nGot: {result_hash}"
            )
            print("Loopback table validated successfully!")
        finally:
            pipeline.stop(force=True)
            # Clean up Kafka topics and Schema Registry subjects.
            cleanup_kafka(sql, KAFKA_BOOTSTRAP, SCHEMA_REGISTRY)
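

# Optional entry point, a small addition assuming the file may also be run directly;
# pytest/unittest test discovery does not require it.
if __name__ == "__main__":
    unittest.main()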