-
Notifications
You must be signed in to change notification settings - Fork 108
Expand file tree
/
Copy pathtest_adhoc_queries.py
More file actions
266 lines (229 loc) · 8.5 KB
/
test_adhoc_queries.py
File metadata and controls
266 lines (229 loc) · 8.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import json
import os
import tempfile
from pathlib import Path
from tests import TEST_CLIENT
from tests.shared_test_pipeline import SharedTestPipeline, sql
class TestAdhocQueries(SharedTestPipeline):
    """Integration tests for the pipeline's ad hoc query endpoints.

    Exercises the text, JSON, and parquet output formats, INSERT through
    the ad hoc endpoint, quoted (case-sensitive) identifiers, error
    reporting, and the rules deciding which tables are materialized
    (primary key, lateness attribute, explicit 'materialized' property)
    as declared in the ``@sql`` DDL attached to each test method.
    """

    @property
    def pipeline(self):
        # Pipeline instance provided by SharedTestPipeline on self.p.
        return self.p

    def _count(self, sql: str) -> int:
        """Run an aggregate query and return its single count value.

        Looks for a column named "c", "count", or "count(*)" in the first
        result row; otherwise falls back to the row's first value.
        Returns 0 when the query yields no rows at all.
        """
        rows = list(self.pipeline.query(sql))
        if not rows:
            return 0
        row = rows[0]
        for k in ("c", "count", "count(*)"):
            if k in row:
                return row[k]
        # Fallback: first value
        return next(iter(row.values()))

    # DDL for this test: a mix of materialized and non-materialized tables
    # (see the inline SQL comments), two datagen-fed source tables (t1, t2),
    # a join view over them, and a view exposing the non-materialized table.
    @sql("""
CREATE TABLE not_materialized(id bigint not null);
-- Materialized because of PK constraint.
CREATE TABLE pk_table_materialized(id bigint not null primary key)
WITH (
'connectors' = '[{
"transport": {
"name": "datagen",
"config": {
"plan": [{
"limit": 5
}]
}
}
}]'
);
-- Not materialized because of lateness attribute on the primary key column.
CREATE TABLE lateness_table1_not_materialized(id bigint not null primary key lateness 0)
WITH (
'connectors' = '[{
"transport": {
"name": "datagen",
"config": {
"plan": [{
"limit": 5
}]
}
}
}]'
);
-- Materialized because of explicit materialized attribute, despite lateness.
CREATE TABLE lateness_table2_materialized(id bigint not null primary key lateness 0)
WITH('materialized' = 'true');
-- Materialized because lateness does not apply to primary key column.
CREATE TABLE lateness_table3_materialized(id bigint not null primary key, other int not null lateness 0);
CREATE TABLE t1 (
id INT NOT NULL,
dt DATE NOT NULL,
uid UUID NOT NULL
) WITH (
'materialized' = 'true',
'connectors' = '[{
"transport": {
"name": "datagen",
"config": {
"plan": [{
"limit": 5
}]
}
}
}]'
);
CREATE TABLE t2 (
id INT NOT NULL,
st VARCHAR NOT NULL
) WITH (
'materialized' = 'true',
'connectors' = '[{
"transport": {
"name": "datagen",
"config": {
"plan": [{
"limit": 5
}]
}
}
}]'
);
CREATE MATERIALIZED VIEW joined AS (
SELECT t1.dt AS c1, t2.st AS c2, t1.uid as c3
FROM t1, t2
WHERE t1.id = t2.id
);
CREATE MATERIALIZED VIEW view_of_not_materialized AS (
SELECT * FROM not_materialized
);
""")
    def test_pipeline_adhoc_query(self):
        """End-to-end checks of the ad hoc query endpoints.

        Covers: lateness-based row dropping on ingest, equivalence of a
        view query vs. its expanded form across text/JSON/parquet output,
        quoted identifiers, error paths (invalid table, division by zero,
        non-materialized table access), and INSERT via the ad hoc endpoint.
        """
        self.pipeline.start()
        self.pipeline.execute(
            "INSERT INTO lateness_table2_materialized VALUES (10),(20),(30)"
        )
        # Dropped due to lateness.
        self.pipeline.execute(
            "INSERT INTO lateness_table2_materialized VALUES (1),(2),(3)"
        )
        self.pipeline.execute(
            "INSERT INTO lateness_table3_materialized VALUES (10, 10), (20, 20), (30, 30)"
        )
        # Dropped due to lateness.
        self.pipeline.execute(
            "INSERT INTO lateness_table3_materialized VALUES (1,1),(2,2),(3,3)"
        )
        # Wait for all datagen and ad hoc connectors to finish.
        self.pipeline.wait_for_completion()
        # Two logically equivalent queries: the view, and its definition inlined.
        ADHOC_SQL_A = "SELECT * FROM joined"
        ADHOC_SQL_B = "SELECT t1.dt AS c1, t2.st AS c2, t1.uid as c3 FROM t1, t2 WHERE t1.id = t2.id"
        ADHOC_SQL_INSERT = (
            "INSERT INTO t1 VALUES "
            "(99,'2020-01-01','c32d330f-5757-4ada-bcf6-1fac2d54e37f'),"
            "(100,'2020-01-01','00000000-0000-0000-0000-000000000000')"
        )
        # Baseline row count: the datagen connector on t1 is limited to 5 rows.
        assert self._count("SELECT COUNT(*) AS c FROM t1") == 5
        # Compare logically equivalent queries in text + json formats.
        # Text format via TEST_CLIENT (ASCII table)
        text_a = "\n".join(TEST_CLIENT.query_as_text(self.pipeline.name, ADHOC_SQL_A))
        text_b = "\n".join(TEST_CLIENT.query_as_text(self.pipeline.name, ADHOC_SQL_B))

        def _extract_table_lines(txt: str):
            # Keep only data rows: drop blank lines, "+---" separators,
            # and the "| c1 ..." header row of the ASCII table.
            return {
                ln.strip()
                for ln in txt.splitlines()
                if ln.strip() and not ln.startswith("+") and not ln.startswith("| c1")
            }

        assert _extract_table_lines(text_a) == _extract_table_lines(text_b)
        # JSON lines (unordered compare)
        json_a = list(TEST_CLIENT.query_as_json(self.pipeline.name, ADHOC_SQL_A))
        json_b = list(TEST_CLIENT.query_as_json(self.pipeline.name, ADHOC_SQL_B))

        def norm(rows):
            # Canonicalize each row (sorted keys) and sort so row order is ignored.
            return sorted((json.dumps(r, sort_keys=True) for r in rows))

        assert norm(json_a) == norm(json_b)
        # Parquet comparison with ORDER BY to establish deterministic ordering.
        pf1 = tempfile.NamedTemporaryFile(
            prefix="adhoc_a_", suffix=".parquet", delete=False
        )
        pf2 = tempfile.NamedTemporaryFile(
            prefix="adhoc_b_", suffix=".parquet", delete=False
        )
        pf1.close()
        pf2.close()
        try:
            # name[:-8] strips the ".parquet" suffix (8 characters); the
            # client apparently re-appends it when writing -- TODO confirm
            # against query_as_parquet (the finally block removes both
            # candidate paths just in case).
            TEST_CLIENT.query_as_parquet(
                self.pipeline.name, f"{ADHOC_SQL_A} ORDER BY c1,c2,c3", pf1.name[:-8]
            )
            TEST_CLIENT.query_as_parquet(
                self.pipeline.name, f"{ADHOC_SQL_B} ORDER BY c1,c2,c3", pf2.name[:-8]
            )
            b1 = Path(pf1.name).read_bytes()
            b2 = Path(pf2.name).read_bytes()
            assert b1, "Parquet output empty"
            assert b1 == b2, "Parquet outputs differ for equivalent queries"
        finally:
            # Best-effort cleanup of every path either call may have produced.
            for p in (
                pf1.name,
                pf2.name,
                pf1.name[:-8] + ".parquet",
                pf2.name[:-8] + ".parquet",
            ):
                try:
                    os.remove(p)
                except OSError:
                    pass
        # Case-sensitive quoted table
        quoted_rows = list(self.pipeline.query('SELECT * FROM "TaBle1"'))
        assert quoted_rows == [], "Expected quoted TaBle1 to be empty initially"
        # Invalid table -> expect error
        invalid_ok = False
        try:
            print(list(self.pipeline.query("SELECT * FROM invalid_table")))
        except Exception:
            invalid_ok = True
        assert invalid_ok, "Expected querying invalid_table to raise"
        # Expression runtime error (division by zero) - expect textual ERROR
        error_text = "\n".join(
            TEST_CLIENT.query_as_text(self.pipeline.name, "SELECT 1/0")
        )
        assert "ERROR" in error_text.upper()
        # Non-materialized table direct access should fail
        res = list(self.pipeline.query("SELECT * FROM not_materialized"))
        assert res and "error" in res[0]
        # INSERT into materialized t1 using JSON query endpoint (to retrieve count row)
        insert_resp = list(
            TEST_CLIENT.query_as_json(self.pipeline.name, ADHOC_SQL_INSERT)
        )
        assert insert_resp and insert_resp[0].get("count") == 2
        assert self._count("SELECT COUNT(*) AS c FROM t1") == 7
        # Non-materialized table via its materialized view
        assert self._count("SELECT COUNT(*) AS c FROM view_of_not_materialized") == 0
        ins_nm = list(
            TEST_CLIENT.query_as_json(
                self.pipeline.name, "INSERT INTO not_materialized VALUES (99),(100)"
            )
        )
        assert ins_nm and ins_nm[0].get("count") == 2
        assert self._count("SELECT COUNT(*) AS c FROM view_of_not_materialized") == 2
        assert self._count("SELECT COUNT(*) AS c FROM pk_table_materialized") == 5
        # Only the first batch of 3 rows survives in each lateness table;
        # the second batch was dropped due to lateness (see inserts above).
        assert (
            self._count("SELECT COUNT(*) AS c FROM lateness_table2_materialized") == 3
        )
        assert (
            self._count("SELECT COUNT(*) AS c FROM lateness_table3_materialized") == 3
        )
        # FIXME: this should raise an exception, but it currently doesn't.
        # https://github.com/feldera/feldera/issues/4973
        result = list(
            self.pipeline.query(
                "SELECT COUNT(*) AS c FROM lateness_table1_not_materialized"
            )
        )
        assert len(result) == 1 and "Execution error" in str(result[0])

    @sql(
        """CREATE TABLE "TaBle1"(id bigint not null) WITH ('materialized' = 'true');"""
    )
    def test_pipeline_adhoc_query_empty(self):
        """A freshly started pipeline reports zero rows for a quoted-name table."""
        self.pipeline.start()
        rows = list(self.pipeline.query('SELECT COUNT(*) AS c FROM "TaBle1"'))
        c = rows[0].get("c")
        print(rows, rows[0], c)
        assert c == 0