-
Notifications
You must be signed in to change notification settings - Fork 109
Expand file tree
/
Copy pathavro.rs
More file actions
280 lines (249 loc) · 10.7 KB
/
avro.rs
File metadata and controls
280 lines (249 loc) · 10.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
use serde::{Deserialize, Serialize};
use std::{collections::HashMap, fmt::Display};
use utoipa::ToSchema;
/// Supported Avro data change event formats.
///
/// The serde rename values (`raw`, `debezium`, `confluent_jdbc`) are part of the
/// public connector configuration format; see also the `Display` impl, which must
/// stay in sync with them.
#[derive(Deserialize, Serialize, Clone, Debug, PartialEq, Eq, ToSchema, Default)]
pub enum AvroUpdateFormat {
/// Raw encoding.
///
/// Each message in the stream represents a single-record update: an insert, upsert, or delete.
///
/// ### Input Connectors
/// Raw Avro encoding can be used for insert and upsert operations, but not deletes.
/// - The message value contains the record to be inserted or updated.
/// - The message key and headers are ignored.
///
/// ### Output Connectors
/// The raw format supports inserts, upserts, and deletes.
/// - The message value contains the record to be inserted or deleted.
/// - The operation type is specified in the `op` message header field, which can be
/// `insert`, `update`, or `delete`.
/// - The message key can optionally store the primary key (see the `key_mode` property).
#[serde(rename = "raw")]
#[default]
Raw,
/// Debezium data change event format.
#[serde(rename = "debezium")]
Debezium,
/// Confluent JDBC connector change event format.
#[serde(rename = "confluent_jdbc")]
ConfluentJdbc,
}
impl Display for AvroUpdateFormat {
    /// Renders the format as its configuration-file spelling.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Keep these strings identical to the `#[serde(rename = ...)]`
        // attributes on the enum so that `Display` output round-trips
        // through deserialization.
        let name = match self {
            Self::Raw => "raw",
            Self::Debezium => "debezium",
            Self::ConfluentJdbc => "confluent_jdbc",
        };
        f.write_str(name)
    }
}
/// Schema registry configuration.
///
/// All fields other than `registry_urls` only take effect when
/// `registry_urls` is non-empty.
#[derive(Clone, Serialize, Deserialize, Debug, Default, ToSchema)]
pub struct AvroSchemaRegistryConfig {
/// List of schema registry URLs.
///
/// * **Input connector**: When non-empty, the connector retrieves Avro
/// message schemas from the registry.
///
/// * **Output connector**: When non-empty, the connector will
/// post the schema to the registry and embed the schema id returned
/// by the registry in Avro messages. Otherwise, schema id 0 is used.
#[serde(default)]
pub registry_urls: Vec<String>,
/// Custom headers that will be added to every call to the schema registry.
///
/// This property is only applicable to output connectors.
///
/// Requires `registry_urls` to be set.
#[serde(default)]
pub registry_headers: HashMap<String, String>,
/// Proxy that will be used to access the schema registry.
///
/// Requires `registry_urls` to be set.
pub registry_proxy: Option<String>,
/// Timeout in seconds used to connect to the registry.
///
/// Requires `registry_urls` to be set.
pub registry_timeout_secs: Option<u64>,
/// Username used to authenticate with the registry.
///
/// Requires `registry_urls` to be set. This option is mutually exclusive with
/// token-based authentication (see `registry_authorization_token`).
pub registry_username: Option<String>,
/// Password used to authenticate with the registry.
///
/// Used together with `registry_username`.
///
/// Requires `registry_urls` to be set.
pub registry_password: Option<String>,
/// Token used to authenticate with the registry.
///
/// Requires `registry_urls` to be set. This option is mutually exclusive with
/// password-based authentication (see `registry_username` and `registry_password`).
pub registry_authorization_token: Option<String>,
}
/// Avro input (parser) format configuration.
// NOTE(review): the original header said "output format", which is a
// copy-paste from `AvroEncoderConfig`; this struct configures parsing.
#[derive(Clone, Serialize, Deserialize, Debug, Default, ToSchema)]
#[serde(deny_unknown_fields)]
pub struct AvroParserConfig {
/// Format used to encode data change events in this stream.
///
/// The default value is 'raw'.
#[serde(default)]
pub update_format: AvroUpdateFormat,
/// Avro schema used to encode all records in this stream, specified as a JSON-encoded string.
///
/// When this property is set, the connector uses the provided schema instead of
/// retrieving the schema from the schema registry. This setting is mutually exclusive
/// with `registry_urls`.
pub schema: Option<String>,
/// `true` if serialized messages only contain raw data without the
/// header carrying schema ID.
///
/// See <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
///
/// The default value is `false`.
#[serde(default)]
pub skip_schema_id: bool,
/// Schema registry configuration.
// Flattened: registry fields appear at the top level of the connector config.
#[serde(flatten)]
pub registry_config: AvroSchemaRegistryConfig,
}
/// Subject name strategies used in registering key and value schemas
/// with the schema registry.
#[derive(Clone, Serialize, Deserialize, Debug, ToSchema)]
pub enum SubjectNameStrategy {
/// The subject name is derived directly from the Kafka topic name.
///
/// For update formats with both key and value components, use subject names
/// `{topic_name}-key` and `{topic_name}-value` for key and value schemas respectively.
/// For update formats without a key (e.g., `raw`), publish value schema
/// under the subject name `{topic_name}`.
///
/// Only applicable when using Kafka as a transport.
#[serde(rename = "topic_name")]
TopicName,
/// The name of the SQL relation that the schema is derived from is used as the subject name:
/// * the SQL view name for the message value schema.
/// * the SQL index name for the message key schema.
#[serde(rename = "record_name")]
RecordName,
/// Combines both the topic name and the record name to form the subject.
///
/// For update formats with both key and value components, use subject names
/// `{topic_name}-{record_name}-key` and `{topic_name}-{record_name}-value` for
/// key and value schemas respectively.
/// For update formats without a key (e.g., `raw`), publish value schema
/// under the subject name `{topic_name}-{record_name}`.
///
/// `{record_name}` is the name of the SQL view or index that this connector
/// is attached to.
///
/// Only applicable when using Kafka as a transport.
#[serde(rename = "topic_record_name")]
TopicRecordName,
}
/// Determines how the message key is generated when the Avro encoder is configured
/// in the `raw` mode.
///
/// See `AvroEncoderConfig::key_mode` for how the default is chosen.
#[derive(Clone, Serialize, Deserialize, Debug, ToSchema, PartialEq, Eq)]
pub enum AvroEncoderKeyMode {
/// Produce messages without a key.
#[serde(rename = "none")]
None,
/// Uses the unique key columns of the view as the message key.
///
/// This setting is supported when the output connector is configured with the `index` property.
/// It utilizes the values of the index columns specified in the associated `CREATE INDEX` statement
/// as the Avro message key.
///
/// A separate Avro schema will be created and registered in the schema registry
/// for the key component of the message.
#[serde(rename = "key_fields")]
KeyFields,
}
/// Avro output format configuration.
#[derive(Clone, Serialize, Deserialize, Debug, ToSchema)]
#[serde(deny_unknown_fields)]
pub struct AvroEncoderConfig {
/// Format used to encode data change events in this stream.
///
/// The default value is `raw`.
#[serde(default)]
pub update_format: AvroUpdateFormat,
/// Determines how the message key is generated when the Avro encoder is configured
/// in the `raw` mode.
///
/// The default is `key_fields` when the `index` property of the connector is configured and `none` otherwise.
pub key_mode: Option<AvroEncoderKeyMode>,
/// Avro schema used to encode output records.
///
/// When specified, the encoder will use this schema; otherwise it will automatically
/// generate an Avro schema based on the SQL view definition.
///
/// Specified as a string containing schema definition in JSON format.
/// This schema must match precisely the SQL view definition, modulo
/// nullability of columns.
pub schema: Option<String>,
/// Optional name of the field used for Change Data Capture (CDC) annotations.
///
/// Use this setting with data sinks that expect operation type
/// (insert, delete, or update) encoded as a column in the Avro record, such
/// as the [Iceberg Sink Kafka Connector](https://docs.feldera.com/connectors/sinks/iceberg).
///
/// When set (e.g., `"cdc_field": "op"`), the specified field will be added to each record
/// to indicate the type of change:
/// - `"I"` for insert operations
/// - `"U"` for upserts
/// - `"D"` for deletions
///
/// If not set, CDC metadata will not be included in the records.
/// Only works with the `raw` update format.
pub cdc_field: Option<String>,
/// Avro namespace for the generated Avro schemas.
pub namespace: Option<String>,
/// Subject name strategy used to publish Avro schemas used by the connector
/// in the schema registry.
///
/// When this property is not specified, the connector chooses subject name strategy automatically:
/// * `topic_name` for `confluent_jdbc` update format
/// * `record_name` for `raw` update format
pub subject_name_strategy: Option<SubjectNameStrategy>,
/// Set to `true` if serialized messages should only contain raw data
/// without the header carrying schema ID.
/// `false` by default.
///
/// See <https://docs.confluent.io/platform/current/schema-registry/fundamentals/serdes-develop/index.html#wire-format>
#[serde(default)]
pub skip_schema_id: bool,
/// Schema registry configuration.
///
/// When configured, the connector will push the Avro schema, whether it is specified as part of
/// connector configuration or generated automatically, to the schema registry and use the schema id
/// assigned by the registry in the messages it produces.
// NOTE(review): the original doc comment was truncated mid-sentence after
// "in the"; completed based on the schema-id embedding behavior described
// on `AvroSchemaRegistryConfig::registry_urls` — confirm wording.
#[serde(flatten)]
pub registry_config: AvroSchemaRegistryConfig,
/// The number of threads to use during encoding.
///
/// Avro encoder supports encoding multiple records in parallel. This configuration specifies
/// the number of threads to run in parallel.
/// Default: 4
#[serde(default = "default_encoder_threads")]
pub threads: usize,
}
impl Default for AvroEncoderConfig {
fn default() -> Self {
Self {
update_format: Default::default(),
key_mode: Default::default(),
schema: Default::default(),
cdc_field: Default::default(),
namespace: Default::default(),
subject_name_strategy: Default::default(),
skip_schema_id: Default::default(),
registry_config: Default::default(),
threads: default_encoder_threads(),
}
}
}
/// Default number of parallel encoder threads used when the `threads`
/// property is not set in the connector configuration.
fn default_encoder_threads() -> usize {
    const DEFAULT_ENCODER_THREADS: usize = 4;
    DEFAULT_ENCODER_THREADS
}