Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 33 additions & 20 deletions crates/adapters/src/transport/nats/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,15 @@ enum ConnectorError {
Fatal(AnyError),
}

// Debug output is the variant name wrapping the inner error, with the inner
// error rendered through its alternate (`{:#}`) formatter.
impl std::fmt::Debug for ConnectorError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Retryable(inner) => f.write_fmt(format_args!("Retryable({inner:#})")),
            Self::Fatal(inner) => f.write_fmt(format_args!("Fatal({inner:#})")),
        }
    }
}

impl ConnectorError {
fn with_context(self, context: impl std::fmt::Display + Send + Sync + 'static) -> Self {
match self {
Expand Down Expand Up @@ -318,8 +327,7 @@ impl NatsReader {
&stream_ctx.stream_name,
state.next_sequence.load(Ordering::Acquire),
)
.await
.map_err(ConnectorError::Fatal)?;
.await?;

let nats_consumer = create_nats_consumer(
&jetstream,
Expand Down Expand Up @@ -398,8 +406,7 @@ impl NatsReader {
.map_err(ConnectorError::Retryable)?;

validate_replay_range(&jetstream, &stream_ctx.stream_name, &metadata.sequence_numbers)
.await
.map_err(ConnectorError::Fatal)?;
.await?;

let nats_consumer = create_nats_consumer(
&jetstream,
Expand Down Expand Up @@ -955,7 +962,7 @@ async fn validate_replay_range(
jetstream: &jetstream::Context,
stream_name: &str,
requested_range: &std::ops::Range<u64>,
) -> AnyResult<()> {
) -> Result<(), ConnectorError> {
validate_sequence_bounds(
jetstream,
stream_name,
Expand All @@ -970,7 +977,7 @@ async fn validate_resume_position(
jetstream: &jetstream::Context,
stream_name: &str,
resume_cursor: u64,
) -> AnyResult<()> {
) -> Result<(), ConnectorError> {
validate_sequence_bounds(
jetstream,
stream_name,
Expand All @@ -992,7 +999,7 @@ async fn validate_sequence_bounds(
jetstream: &jetstream::Context,
stream_name: &str,
mode: SequenceValidationMode,
) -> AnyResult<()> {
) -> Result<(), ConnectorError> {
match &mode {
SequenceValidationMode::Replay { requested_range } if requested_range.is_empty() => {
return Ok(());
Expand All @@ -1004,53 +1011,59 @@ async fn validate_sequence_bounds(
_ => {}
}

let stream_state = fetch_stream_state(jetstream, stream_name).await?;
// Fetching stream state is an I/O operation that can fail transiently
// (e.g., timeout, temporary network issues). These should be retryable.
let stream_state = fetch_stream_state(jetstream, stream_name)
.await
.map_err(ConnectorError::Retryable)?;
let available_first = stream_state.first_sequence;
let available_last = stream_state.last_sequence;

// Logical validation errors (data out of bounds, stream empty) are fatal
// because retrying won't change the outcome.
match mode {
SequenceValidationMode::Replay { requested_range } => {
if stream_state.messages == 0 {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Replay requested sequences {:?} from stream '{stream_name}', but the stream is empty",
requested_range
));
)));
}

let requested_first = requested_range.start;
let requested_last = requested_range.end - 1;

if requested_first < available_first || requested_first > available_last {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Replay start sequence {requested_first} is outside available stream range [{available_first}, {available_last}] for stream '{stream_name}'"
));
)));
}

if requested_last > available_last {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Replay end sequence {requested_last} exceeds available stream tail {available_last} for stream '{stream_name}'"
));
)));
}
}
SequenceValidationMode::Resume { resume_cursor } => {
if stream_state.messages == 0 {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Resume sequence {resume_cursor} is invalid for stream '{stream_name}': stream is empty"
));
)));
}

let valid_upper = available_last.saturating_add(1);

if resume_cursor < available_first {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Resume sequence {resume_cursor} is before earliest available sequence {available_first} for stream '{stream_name}'"
));
)));
}

if resume_cursor > valid_upper {
return Err(anyhow!(
return Err(ConnectorError::Fatal(anyhow!(
"Resume sequence {resume_cursor} is after valid upper bound {valid_upper} for stream '{stream_name}'"
));
)));
}
}
}
Expand Down
200 changes: 199 additions & 1 deletion crates/adapters/src/transport/nats/input/test/custom_tests.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
use super::super::ConnectorError;
use super::NatsTestRecord;
use super::util;
use crate::test::mock_input_pipeline;
use async_nats::jetstream;
use feldera_types::program_schema::Relation;
use std::time::Duration;

// ---------------------------------------------------------------------------
// Configuration Validation (No Test Framework)
// Configuration Validation
// ---------------------------------------------------------------------------

/// Test that inactivity_timeout_secs=0 is rejected early by configuration validation.
Expand Down Expand Up @@ -79,3 +83,197 @@ format:
}
}
}

// ---------------------------------------------------------------------------
// Validation Error Classification
// ---------------------------------------------------------------------------
//
// These tests verify that `validate_resume_position` and `validate_replay_range`
// classify errors correctly:
// - Transient I/O errors (server down) → ConnectorError::Retryable
// - Logical failures (out of bounds) → ConnectorError::Fatal
// - Valid inputs → Ok

const STREAM: &str = "validation_test_stream";
const SUBJECT: &str = "validation_test_subject";

/// Lightweight test harness for validation functions.
///
/// Manages a NATS server, JetStream context, and tokio runtime. The context
/// is created once during `start()` and survives `kill_server()`, so tests
/// can verify behavior against a dead server.
struct NatsTestFixture {
// Handle for the spawned NATS server. Wrapped in `Option` so that
// `kill_server()` can take and drop it while the fixture stays alive.
// NOTE(review): presumably dropping the guard terminates the server
// process — confirm against `util::ProcessKillGuard`.
_guard: Option<util::ProcessKillGuard>,
// JetStream context connected in `start()`; passed to the validation
// functions under test.
js: jetstream::Context,
// Runtime used to drive the async helpers from synchronous `#[test]` fns.
rt: tokio::runtime::Runtime,
}

impl NatsTestFixture {
    /// Launch a NATS server, create the test stream on it, and connect the
    /// JetStream context that the fixture keeps for the rest of the test.
    fn start() -> Self {
        let rt = tokio::runtime::Runtime::new().unwrap();
        let (guard, url) = util::start_nats_and_get_address().unwrap();

        let connect = async {
            util::create_stream(&url, STREAM, SUBJECT).await.unwrap();
            let ready = util::wait_for_nats_ready(&url, Duration::from_secs(5)).await;
            jetstream::new(ready.unwrap())
        };
        let js = rt.block_on(connect);

        Self {
            _guard: Some(guard),
            js,
            rt,
        }
    }

    /// Publish `n` dummy JSON messages (`{"x": 0}`, `{"x": 1}`, ...) to the
    /// test subject, waiting for each publish acknowledgement in turn.
    fn publish(&self, n: usize) {
        self.rt.block_on(async {
            for i in 0..n {
                let body = serde_json::to_string(&serde_json::json!({"x": i})).unwrap();
                let pending_ack = self
                    .js
                    .publish(SUBJECT.to_string(), body.into())
                    .await
                    .unwrap();
                pending_ack.await.unwrap();
            }
        });
    }

    /// Purge every message currently held by the test stream.
    fn purge(&self) {
        let purge_all = async {
            let stream = self.js.get_stream(STREAM).await.unwrap();
            stream.purge().await.unwrap();
        };
        self.rt.block_on(purge_all);
    }

    /// Kill the NATS server. The JetStream context stays usable for testing:
    /// operations issued afterwards are expected to fail with transient I/O
    /// errors.
    fn kill_server(&mut self) {
        // Releasing the guard is what takes the server down.
        drop(self._guard.take());
        // NOTE(review): the short pause presumably gives the process time to
        // exit before the test issues its next request — confirm.
        std::thread::sleep(Duration::from_millis(200));
    }
}

/// Run `validate_resume_position` for `cursor` against the fixture's test
/// stream, blocking on the fixture's runtime until it completes.
fn validate_resume(nats: &NatsTestFixture, cursor: u64) -> Result<(), ConnectorError> {
    let check = super::super::validate_resume_position(&nats.js, STREAM, cursor);
    nats.rt.block_on(check)
}

/// Run `validate_replay_range` for `range` against the fixture's test stream,
/// blocking on the fixture's runtime until it completes.
fn validate_replay(
    nats: &NatsTestFixture,
    range: std::ops::Range<u64>,
) -> Result<(), ConnectorError> {
    let check = super::super::validate_replay_range(&nats.js, STREAM, &range);
    nats.rt.block_on(check)
}

// -- Resume validation --

/// With the server killed, resume validation must report a retryable error.
#[test]
fn test_nats_validate_resume_server_down_is_retryable() {
    let mut nats = NatsTestFixture::start();
    nats.publish(1);
    nats.kill_server();

    let outcome = validate_resume(&nats, 1);
    assert!(matches!(outcome, Err(ConnectorError::Retryable(_))));
}

/// A resume cursor that lies before the stream's first sequence is fatal.
#[test]
fn test_nats_validate_resume_before_head_is_fatal() {
    let nats = NatsTestFixture::start();
    nats.publish(5);
    nats.purge();
    nats.publish(3);

    // After the purge the stream's first_sequence is 6, so cursor 2 points
    // at data that is no longer available.
    let outcome = validate_resume(&nats, 2);
    assert!(matches!(outcome, Err(ConnectorError::Fatal(_))));
}

/// A resume cursor that leaves a gap after the stream tail is fatal.
#[test]
fn test_nats_validate_resume_with_gap_from_tail_is_fatal() {
    let nats = NatsTestFixture::start();
    nats.publish(3);

    // The stream holds sequences 1, 2 and 3; resuming from 5 leaves a gap
    // after the tail.
    let outcome = validate_resume(&nats, 5);
    assert!(matches!(outcome, Err(ConnectorError::Fatal(_))));
}

/// A fresh start (resume_cursor=0) does not need any stream sequence
/// validation and should succeed on an empty stream.
#[test]
fn test_nats_validate_resume_fresh_start_is_ok() {
    let nats = NatsTestFixture::start();

    let outcome = validate_resume(&nats, 0);
    assert!(outcome.is_ok());
}

/// Resuming at a sequence that is still present in the stream succeeds.
#[test]
fn test_nats_validate_resume_continue_is_ok() {
    let nats = NatsTestFixture::start();
    nats.publish(3);

    let outcome = validate_resume(&nats, 3);
    assert!(outcome.is_ok());
}

// -- Replay validation --

/// With the server killed, replay validation must report a retryable error.
#[test]
fn test_nats_validate_replay_server_down_is_retryable() {
    let mut nats = NatsTestFixture::start();
    nats.publish(1);
    nats.kill_server();

    let outcome = validate_replay(&nats, 1..2);
    assert!(matches!(outcome, Err(ConnectorError::Retryable(_))));
}

/// A replay range that extends past the stream tail is fatal.
#[test]
fn test_nats_validate_replay_range_exceeds_tail_is_fatal() {
    let nats = NatsTestFixture::start();
    nats.publish(3);

    // last_sequence is 3, so the half-open range [1, 100) runs past the tail.
    let outcome = validate_replay(&nats, 1..100);
    assert!(matches!(outcome, Err(ConnectorError::Fatal(_))));
}

/// A replay range that starts before the stream's first sequence is fatal.
#[test]
fn test_nats_validate_replay_range_precedes_head_is_fatal() {
    let nats = NatsTestFixture::start();
    nats.publish(3);
    nats.purge();
    nats.publish(4);

    // The stream now holds sequences 4, 5, 6 and 7; sequences 1, 2 and 3
    // were purged, so a replay starting at 1 cannot be served.
    let outcome = validate_replay(&nats, 1..8);
    assert!(matches!(outcome, Err(ConnectorError::Fatal(_))));
}

/// An empty replay range is accepted, even on a stream with no messages.
#[test]
fn test_nats_validate_replay_empty_range_is_ok() {
    let nats = NatsTestFixture::start();

    let outcome = validate_replay(&nats, 5..5);
    assert!(outcome.is_ok());
}

/// A replay range that lies fully inside the available sequences succeeds.
#[test]
fn test_nats_validate_replay_range_is_ok() {
    let nats = NatsTestFixture::start();
    nats.publish(6);

    let outcome = validate_replay(&nats, 3..7);
    assert!(outcome.is_ok());
}
4 changes: 4 additions & 0 deletions docs.feldera.com/docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ import TabItem from '@theme/TabItem';
- Health probes now avoid duplicate JetStream stream-info requests,
reducing API pressure during retry and recovery loops.

NATS retry classification during resume and replay validation has also been refined:
transient failures while fetching JetStream stream metadata are now treated as retryable,
while logical sequence-range validation failures remain fatal.

## v0.263.0

Added connector error list to input/output connector stats.
Expand Down
4 changes: 3 additions & 1 deletion docs.feldera.com/docs/connectors/sources/nats.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,13 @@ If not specified, defaults to `"Instant"`.

The connector distinguishes between **retryable** and **fatal** errors:

- **Retryable errors** (temporary network/server issues, missing stream during startup, transient message-stream failures) move the connector into retry mode. It reports non-fatal endpoint errors and retries automatically every `retry_interval_secs`.
- **Retryable errors** (temporary network/server issues, missing stream during startup, transient message-stream failures, and temporary failures while fetching JetStream stream metadata used during startup, resume, or replay validation) move the connector into retry mode. It reports non-fatal endpoint errors and retries automatically every `retry_interval_secs`.
- **Fatal errors** stop the connector and report a fatal endpoint error. An error is treated as fatal when checkpoint/replay metadata is incompatible with the current stream sequence space.

Before it starts reading after a startup or resume, the connector validates the checkpoint resume cursor against the stream's available sequence range. During replay, it validates that the requested replay range still exists.

Only transient I/O failures during these validation checks are retried. Once the connector successfully reads the stream metadata, logical validation failures remain fatal.

Typical fatal scenarios include:

- **Stream deleted or recreated**: The checkpoint references sequence numbers that no longer exist in the current stream. For example, the resume cursor is before the stream's earliest available sequence, or after the stream's latest sequence.
Expand Down