Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ libc = "0.2.153"
like = "0.3.1"
log = "0.4.20"
md-5 = "0.10.6"
memory-stats = { version = "1.2.0", features = ["always_use_statm"] }
metrics = "0.23.0"
metrics-exporter-prometheus = "0.15.3"
metrics-util = "0.17.0"
Expand Down
1 change: 0 additions & 1 deletion crates/adapters/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ postgres = { workspace = true }
postgres-openssl = { workspace = true }
dyn-clone = { workspace = true }
cpu-time = "1.0.0"
memory-stats = { version = "1.2.0", features = ["always_use_statm"] }
feldera-ir = { workspace = true }
base64 = { workspace = true }
aws-msk-iam-sasl-signer = "1.0.1"
Expand Down
2 changes: 1 addition & 1 deletion crates/adapters/src/adhoc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ pub(crate) fn create_session_context(
// Initialize datafusion memory limits
let mut runtime_env_builder = RuntimeEnvBuilder::new();
if let Some(memory_mb_max) = config.global.resources.memory_mb_max {
let memory_bytes_max = memory_mb_max * 1024 * 1024;
let memory_bytes_max = memory_mb_max * 1_000_000;
runtime_env_builder = runtime_env_builder
.with_memory_pool(Arc::new(FairSpillPool::new(memory_bytes_max as usize)));
}
Expand Down
70 changes: 66 additions & 4 deletions crates/adapters/src/controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ use dbsp::circuit::metrics::{
use dbsp::circuit::tokio::TOKIO;
use dbsp::circuit::{CheckpointCommitter, CircuitStorageConfig, DevTweaks, Mode};
use dbsp::storage::backend::{StorageBackend, StoragePath};
use dbsp::utils::process_rss_bytes;
use dbsp::{
DBSPHandle,
circuit::{CircuitConfig, Layout},
Expand Down Expand Up @@ -99,7 +100,6 @@ use governor::Quota;
use governor::RateLimiter;
use itertools::Itertools;
use journal::StepMetadata;
use memory_stats::memory_stats;
use nonzero_ext::nonzero;
use rmpv::Value as RmpValue;
use serde_json::Value as JsonValue;
Expand Down Expand Up @@ -810,10 +810,22 @@ impl Controller {
/// Returns the current controller status in the form used by the external
/// API.
pub fn api_status(&self) -> ExternalControllerStatus {
let runtime = self.inner.runtime.upgrade();
let memory_pressure = runtime
.as_ref()
.map(|runtime| runtime.current_memory_pressure())
.unwrap_or_default();
let memory_pressure_epoch = runtime
.as_ref()
.map(|runtime| runtime.memory_pressure_epoch())
.unwrap_or_default();

self.status().to_api_type(
self.can_suspend(),
self.pipeline_complete(),
self.inner.transaction_info.lock().unwrap().clone(),
memory_pressure,
memory_pressure_epoch,
)
}

Expand Down Expand Up @@ -1188,6 +1200,7 @@ impl Controller {
F: MetricsFormatter,
{
metrics.process_metrics(labels);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change

metrics.gauge(
"records_input_buffered",
"Total amount of data currently buffered by all endpoints, in records.",
Expand Down Expand Up @@ -1346,6 +1359,25 @@ impl Controller {
);

write_fbuf_slab_metrics(metrics, labels, &runtime.fbuf_slabs_stats());

metrics.gauge(
"max_rss_bytes",
"Configured maximum process RSS in bytes. A value of 0 means no limit is configured.",
labels,
runtime.max_rss_bytes().unwrap_or(0),
);
metrics.gauge(
"memory_pressure",
"Current memory pressure level in [0..3]: low=0, moderate=1, high=2, critical=3.",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder what "moderate" and "high" actually do — this seems like a binary decision to me.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

they engage different backpressure mechanisms. Moderate is designed to reduce memory without affecting performance much. high will slow things down at least until the memory pressure has gone down.

labels,
runtime.current_memory_pressure() as u8,
);
metrics.gauge(
"memory_pressure_epoch",
"Monotonic counter incremented whenever memory pressure transitions to high/critical.",
labels,
runtime.memory_pressure_epoch(),
);
}

metrics.counter(
Expand Down Expand Up @@ -4220,6 +4252,7 @@ impl ControllerInit {
// Can't change number of workers or hosts yet.
hosts: checkpoint_config.global.hosts,
workers: checkpoint_config.global.workers,
max_rss_mb: checkpoint_config.global.max_rss_mb,

// The checkpoint determines the fault tolerance model, but the
// pipeline manager can override the details of the
Expand Down Expand Up @@ -4293,21 +4326,50 @@ impl ControllerInit {
}

/// Overwrites this controller's incarnation UUID.
///
/// The incarnation UUID distinguishes successive instantiations of the same
/// pipeline; callers set it after restoring from a checkpoint.
pub fn set_incarnation_uuid(&mut self, incarnation_uuid: Uuid) {
    self.incarnation_uuid = incarnation_uuid;
}

fn circuit_config(
layout: Option<Layout>,
pipeline_config: &PipelineConfig,
storage: Option<CircuitStorageConfig>,
) -> Result<CircuitConfig, ControllerError> {
let dev_tweaks = DevTweaks::from_config(&pipeline_config.global.dev_tweaks);

let mut max_rss_mb = pipeline_config.global.max_rss_mb;

if max_rss_mb.is_none()
&& let Some(memory_mb_max) = &pipeline_config.global.resources.memory_mb_max
{
warn!(
"RSS memory limit ('max_rss_mb') is not set, but a Kubernetes memory limit \
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

print something if max_rss_mb is > than kubernetes limit?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

or if there is a significant difference between the to (>50%)?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a good idea, there's no good reason for that config

('resources.memory_mb_max' = {memory_mb_max} MB) is configured. \
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

one wonders if there is even need for both

Copy link
Copy Markdown
Contributor Author

@ryzhyk ryzhyk Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there are several reasons to keep them separate (I'd love to address them somehow)

  • memory_mb_max may be set in values.yaml instead of in the pipeline config. Can we access that setting somehow?
  • The pipeline may get a pod withmemory_mb_min, not _max. Ideally we'd have a way to find out how much memory the pipeline actually has. I asked about that on slack and you guys told me it's probably impossible.
  • We also have rclone, which runs as a separate process and can use non-trivial amount of memory. max_rss_mb may need to account for that.

Using the Kubernetes limit as the RSS memory limit."
);
max_rss_mb = Some(*memory_mb_max);
} else if max_rss_mb.is_none() && pipeline_config.global.resources.memory_mb_max.is_none() {
warn!(
"RSS memory limit ('max_rss_mb') is not set, and no Kubernetes memory limit \
('resources.memory_mb_max') is configured. We recommend configuring at least one of these settings to avoid out-of-memory failures."
);
} else if let Some(max_rss_mb) = max_rss_mb
&& let Some(memory_mb_max) = &pipeline_config.global.resources.memory_mb_max
&& max_rss_mb > *memory_mb_max
{
warn!(
"RSS memory limit ('max_rss_mb') is set to {max_rss_mb} MB exceeds the Kubernetes memory limit \
('resources.memory_mb_max' = {memory_mb_max} MB) is configured. This will likely cause out-of-memory failures."
);
}

Ok(CircuitConfig {
layout: layout
.unwrap_or_else(|| Layout::new_solo(pipeline_config.global.workers as usize)),
max_rss_bytes: max_rss_mb.map(|mb| mb * 1_000_000),
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be a constant 1_000_000?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't expect the definition of 1MB to ever change. It's probably an overkill

Copy link
Copy Markdown
Contributor

@gz gz Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's more about documenting what you're converting to in the code e.g. you write it as mb * ONE_MB_IN_BYTES it makes it easy to read/understand

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unit mismatch: memory_mb_max is used elsewhere as MiB (e.g. adhoc.rs:44 multiplies by 1024 * 1024), but here it is multiplied by 1_000_000 (SI MB). These will disagree by ~5% at large values. Pick one and be consistent — MiB is what Kubernetes / Linux think of as "megabytes".

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

adhoc.rs is wrong in this case. I don't think it's important the way it's used there, but I'll change it to 1_000_000 there.

pin_cpus: pipeline_config.global.pin_cpus.clone(),
storage,
mode: Mode::Persistent,
dev_tweaks: DevTweaks::from_config(&pipeline_config.global.dev_tweaks),
dev_tweaks,
})
}

Expand Down Expand Up @@ -4513,7 +4575,7 @@ impl StatisticsThread {
total_processed_records: controller_status
.global_metrics
.num_total_processed_records(),
memory_bytes: memory_stats().map_or(0, |stats| stats.physical_mem as u64),
memory_bytes: process_rss_bytes().unwrap_or_default(),
storage_bytes,
};
let mut time_series = controller_status.time_series.lock().unwrap();
Expand Down
13 changes: 8 additions & 5 deletions crates/adapters/src/controller/stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ use base64::{Engine, prelude::BASE64_URL_SAFE_NO_PAD};
use chrono::{DateTime, Utc};
use cpu_time::ProcessTime;
use crossbeam::sync::Unparker;
use dbsp::utils::process_rss_bytes;
use feldera_adapterlib::{
errors::journal::ControllerError,
format::{BufferSize, ParseError},
Expand All @@ -61,11 +62,11 @@ use feldera_types::{
},
config::{FtModel, PipelineConfig},
coordination::Completion,
memory_pressure::MemoryPressure,
suspend::SuspendError,
time_series::SampleStatistics,
transaction::CommitProgressSummary,
};
use memory_stats::memory_stats;
use parking_lot::{RwLock, RwLockReadGuard};
use serde::{Deserialize, Serialize};
use std::{
Expand Down Expand Up @@ -1171,6 +1172,8 @@ impl ControllerStatus {
suspend_error: Result<(), SuspendError>,
pipeline_complete: bool,
transaction_info: TransactionInfo,
memory_pressure: MemoryPressure,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does the epoch need to be a separate field? could it be inside of MemoryPressure? (haven't looked at the structure yet)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They are only loosely related.

memory_pressure_epoch: u64,
) -> feldera_types::adapter_stats::ExternalControllerStatus {
use feldera_types::adapter_stats;

Expand Down Expand Up @@ -1229,12 +1232,12 @@ impl ControllerStatus {
.num_milliseconds()
.try_into()
.unwrap_or(0),
rss_bytes: if let Some(usage) = memory_stats() {
usage.physical_mem as u64
} else {
rss_bytes: process_rss_bytes().unwrap_or_else(|| {
error!("Failed to fetch process RSS");
0
},
}),
memory_pressure,
memory_pressure_epoch,
cpu_msecs: match ProcessTime::try_now() {
Ok(time) => time.as_duration().as_millis() as u64,
Err(e) => {
Expand Down
3 changes: 3 additions & 0 deletions crates/adapters/src/controller/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use csv::{ReaderBuilder as CsvReaderBuilder, WriterBuilder as CsvWriterBuilder};
use feldera_types::{
config::{InputEndpointConfig, OutputEndpointConfig},
constants::STATE_FILE,
memory_pressure::MemoryPressure,
};
use serde_json::json;
use std::{
Expand Down Expand Up @@ -2305,6 +2306,8 @@ fn test_external_controller_status_serialization() {
])),
false,
TransactionInfo::default(),
MemoryPressure::default(),
0,
);
external_status.global_metrics.rss_bytes = 1024 * 1024 * 512; // 512 MB
external_status.global_metrics.cpu_msecs = 45_000;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::test::{TestStruct, init_test_logger, test_circuit, wait};
use crate::{Controller, PipelineConfig};
use anyhow::Result as AnyResult;
use csv::ReaderBuilder as CsvReaderBuilder;
use feldera_types::memory_pressure::MemoryPressure;
use serde_json;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
Expand Down Expand Up @@ -268,6 +269,8 @@ outputs:
Ok(()),
false,
TransactionInfo::default(),
MemoryPressure::default(),
0,
))
.unwrap()
);
Expand Down
1 change: 1 addition & 0 deletions crates/adapters/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -725,6 +725,7 @@ pub(crate) fn run_in_posix_runtime<F>(
let temp = tempfile::tempdir().expect("failed to create temp dir for storage");
let cconf = CircuitConfig {
layout: Layout::new_solo(1),
max_rss_bytes: None,
mode: Mode::Ephemeral,
pin_cpus: Vec::new(),
storage: Some(
Expand Down
1 change: 1 addition & 0 deletions crates/dbsp/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ async-stream = { workspace = true }
futures-util = { workspace = true }
rmp-serde = { workspace = true }
feldera-buffer-cache = { workspace = true }
memory-stats = { workspace = true }

[dev-dependencies]
rand = { workspace = true }
Expand Down
1 change: 1 addition & 0 deletions crates/dbsp/benches/cursor_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ fn bench(storage: bool) {
let temp = tempdir().expect("failed to create temp dir for storage");
let config = CircuitConfig {
layout: Layout::new_solo(1),
max_rss_bytes: None,
mode: Mode::Ephemeral,
pin_cpus: Vec::new(),
storage: Some(
Expand Down
1 change: 1 addition & 0 deletions crates/dbsp/benches/input_map_ingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ fn main() -> Result<()> {
let temp = tempdir().context("failed to create temp directory for storage backend")?;
let config = CircuitConfig {
layout: Layout::new_solo(WORKERS),
max_rss_bytes: None,
mode: Mode::Ephemeral,
pin_cpus: Vec::new(),
storage: Some(
Expand Down
1 change: 1 addition & 0 deletions crates/dbsp/benches/list_merger.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ fn bench(generate_on_storage: bool) {
let temp = tempdir().expect("failed to create temp directory");
let config = CircuitConfig {
layout: Layout::new_solo(1),
max_rss_bytes: None,
mode: Mode::Ephemeral,
pin_cpus: Vec::new(),
storage: Some(
Expand Down
11 changes: 11 additions & 0 deletions crates/dbsp/src/circuit/dbsp_handle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,10 @@ pub struct CircuitConfig {
/// How the circuit is laid out across one or multiple machines.
pub layout: Layout,

/// The maximum amount of memory, in bytes, that the process is allowed to use.
/// Used to calculate the memory pressure level.
pub max_rss_bytes: Option<u64>,

/// Optionally, CPU numbers for pinning the worker threads.
pub pin_cpus: Vec<usize>,

Expand Down Expand Up @@ -565,13 +569,19 @@ impl CircuitConfig {
/// Creates a single-host configuration running `n` worker threads, with
/// every other knob at its default: ephemeral mode, no storage, no RSS
/// limit, no CPU pinning, and default dev tweaks.
pub fn with_workers(n: usize) -> Self {
    let layout = Layout::new_solo(n);
    Self {
        layout,
        mode: Mode::Ephemeral,
        storage: None,
        max_rss_bytes: None,
        pin_cpus: Vec::new(),
        dev_tweaks: DevTweaks::default(),
    }
}

pub fn with_max_rss_bytes(mut self, max_rss: Option<u64>) -> Self {
self.max_rss_bytes = max_rss;
self
}

pub fn with_mode(mut self, mode: Mode) -> Self {
self.mode = mode;
self
Expand Down Expand Up @@ -2130,6 +2140,7 @@ pub(crate) mod tests {
let temp = tempdir().expect("Can't create temp dir for storage");
let cconf = CircuitConfig {
layout: Layout::new_solo(1),
max_rss_bytes: None,
mode: Mode::Ephemeral,
pin_cpus: Vec::new(),
storage: Some(
Expand Down
Loading
Loading