Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions crates/adapters/src/integrated/delta_table/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ use feldera_adapterlib::utils::datafusion::{
use feldera_storage::tokio::TOKIO_DEDICATED_IO;
use feldera_types::config::FtModel;
use feldera_types::program_schema::Relation;
use feldera_types::transport::delta_table::{DeltaTableReaderConfig, DeltaTableTransactionMode};
use feldera_types::transport::delta_table::{
DeltaTableReaderConfig, DeltaTableSnapshotErrorMode, DeltaTableTransactionMode,
};
use futures_util::StreamExt;
use serde::{Deserialize, Serialize};
use serde_json::Value as JsonValue;
Expand Down Expand Up @@ -1714,8 +1716,10 @@ impl DeltaTableInputEndpointInner {
let batch = match batch {
Ok(batch) => batch,
Err(e) => {
let is_fatal =
self.config.snapshot_error_mode == DeltaTableSnapshotErrorMode::Fail;
self.consumer.error(
false,
is_fatal,
anyhow!("error retrieving batch {num_batches} of {descr}: {e:?}"),
Some("delta-batch"),
);
Expand Down
68 changes: 67 additions & 1 deletion crates/feldera-types/src/transport/delta_table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,21 @@ fn default_num_parsers() -> u32 {
4
}

/// Controls how the Delta table input connector handles errors during snapshot loading.
///
/// * `ignore` - Log a non-fatal warning and skip the failed batch (default).
/// * `fail` - Treat the error as fatal and stop the pipeline immediately.
// `Copy` and `Hash` are derived because this is a trivially-copyable, fieldless
// config enum; callers can then pass it by value and use it as a map key without
// cloning. Adding derives is backward-compatible.
#[derive(Default, Debug, Copy, Clone, Eq, PartialEq, Hash, Deserialize, Serialize, ToSchema)]
#[serde(rename_all = "lowercase")]
pub enum DeltaTableSnapshotErrorMode {
    /// Log a non-fatal warning and skip the failed batch.
    #[default]
    Ignore,

    /// Treat the error as fatal and stop the pipeline immediately.
    Fail,
}

/// Delta table transaction mode.
///
/// Determines how the connector breaks up its input into transactions.
Expand Down Expand Up @@ -307,6 +322,13 @@ pub struct DeltaTableReaderConfig {
#[serde(default)]
pub verbose: u32,

/// Controls how the connector handles errors during snapshot loading.
///
/// * `"ignore"` - Log a non-fatal warning and skip the failed batch (default).
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// * `"ignore"` - Log a non-fatal warning and skip the failed batch (default).
/// * `"ignore"` - Log a non-fatal error and skip the failed batch (default).

/// * `"fail"` - Treat the error as fatal and stop the pipeline immediately.
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// * `"fail"` - Treat the error as fatal and stop the pipeline immediately.
/// * `"fail"` - Treat the error as fatal and stop the connector immediately.

#[serde(default)]
pub snapshot_error_mode: DeltaTableSnapshotErrorMode,

/// Storage options for configuring backend object store.
///
/// For specific options available for different storage backends, see:
Expand Down Expand Up @@ -335,13 +357,57 @@ fn test_delta_reader_config_serde() {
let config = serde_json::from_str::<DeltaTableReaderConfig>(config_str).unwrap();

let serialized_config = serde_json::to_string(&config).unwrap();
let expected = r#"{"uri":"protocol:/path/to/somewhere","mode":"follow","transaction_mode":"none","timestamp_column":"ts","filter":null,"skip_unused_columns":false,"snapshot_filter":"ts BETWEEN '2005-01-01 00:00:00' AND '2010-12-31 23:59:59'","version":null,"datetime":"2010-12-31 00:00:00Z","end_version":null,"cdc_delete_filter":null,"cdc_order_by":null,"num_parsers":4,"max_concurrent_readers":null,"customoption1":"val1","customoption2":"val2","verbose":0}"#;
let expected = r#"{"uri":"protocol:/path/to/somewhere","mode":"follow","transaction_mode":"none","timestamp_column":"ts","filter":null,"skip_unused_columns":false,"snapshot_filter":"ts BETWEEN '2005-01-01 00:00:00' AND '2010-12-31 23:59:59'","version":null,"datetime":"2010-12-31 00:00:00Z","end_version":null,"cdc_delete_filter":null,"cdc_order_by":null,"num_parsers":4,"max_concurrent_readers":null,"snapshot_error_mode":"ignore","customoption1":"val1","customoption2":"val2","verbose":0}"#;
assert_eq!(
serde_json::from_str::<serde_json::Value>(&serialized_config).unwrap(),
serde_json::from_str::<serde_json::Value>(expected).unwrap()
);
}

#[cfg(test)]
#[test]
fn test_snapshot_error_mode_default() {
    // A config that omits `snapshot_error_mode` entirely must deserialize
    // with the `Ignore` default.
    let raw = r#"{
        "uri": "s3://bucket/table",
        "mode": "snapshot"
    }"#;
    let parsed: DeltaTableReaderConfig = serde_json::from_str(raw).unwrap();
    assert!(matches!(
        parsed.snapshot_error_mode,
        DeltaTableSnapshotErrorMode::Ignore
    ));
}

#[cfg(test)]
#[test]
fn test_snapshot_error_mode_ignore() {
    // An explicit `"ignore"` value must round-trip to the `Ignore` variant.
    let raw = r#"{
        "uri": "s3://bucket/table",
        "mode": "snapshot",
        "snapshot_error_mode": "ignore"
    }"#;
    let parsed: DeltaTableReaderConfig = serde_json::from_str(raw).unwrap();
    assert!(matches!(
        parsed.snapshot_error_mode,
        DeltaTableSnapshotErrorMode::Ignore
    ));
}

#[cfg(test)]
#[test]
fn test_snapshot_error_mode_fail() {
    // An explicit `"fail"` value must deserialize to the `Fail` variant.
    let raw = r#"{
        "uri": "s3://bucket/table",
        "mode": "snapshot",
        "snapshot_error_mode": "fail"
    }"#;
    let parsed: DeltaTableReaderConfig = serde_json::from_str(raw).unwrap();
    assert!(matches!(
        parsed.snapshot_error_mode,
        DeltaTableSnapshotErrorMode::Fail
    ));
}

impl DeltaTableReaderConfig {
/// `true` if the configuration requires taking an initial snapshot of the table.
pub fn snapshot(&self) -> bool {
Expand Down
3 changes: 3 additions & 0 deletions openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -7803,6 +7803,9 @@
"type": "boolean",
"description": "Don't read unused columns from the Delta table.\n\nWhen set to `true`, this option instructs the connector to avoid reading\ncolumns from the Delta table that are not used in any view definitions.\nTo be skipped, the columns must be either nullable or have default\nvalues. This can improve ingestion performance, especially for wide\ntables.\n\nNote: The simplest way to exclude unused columns is to omit them from the Feldera SQL table\ndeclaration. The connector never reads columns that aren't declared in the SQL schema.\nAdditionally, the SQL compiler emits warnings for declared but unused columns—use these as\na guide to optimize your schema."
},
"snapshot_error_mode": {
"$ref": "#/components/schemas/DeltaTableSnapshotErrorMode"
},
"snapshot_filter": {
"type": "string",
"description": "Optional snapshot filter.\n\nThis option is only valid when `mode` is set to `snapshot` or `snapshot_and_follow`.\n\nWhen specified, only rows that satisfy the filter condition are included in the\nsnapshot. The condition must be a valid SQL Boolean expression that can be used in\nthe `where` clause of the `select * from snapshot where ...` query.\n\nUnlike the `filter` option, which applies to all records retrieved from the table, this\nfilter only applies to rows in the initial snapshot of the table.\nFor instance, it can be used to specify the range of event times to include in the snapshot,\ne.g.: `ts BETWEEN TIMESTAMP '2005-01-01 00:00:00' AND TIMESTAMP '2010-12-31 23:59:59'`.\n\nThis option can be used together with the `filter` option. During the initial snapshot,\nonly rows that satisfy both `filter` and `snapshot_filter` are retrieved from the Delta table.\nWhen subsequently following changes in the transaction log (`mode = snapshot_and_follow`),\nall rows that meet the `filter` condition are ingested, regardless of `snapshot_filter`.",
Expand Down
Loading