//! Controller configuration.
//!
//! This module defines the controller configuration structure. The leaves of
//! this structure are individual transport-specific and data-format-specific
//! endpoint configs. We represent these configs as opaque JSON values, so
//! that the entire configuration tree can be deserialized from a JSON file.
use crate::preprocess::PreprocessorConfig;
use crate::program_schema::ProgramSchema;
use crate::secret_resolver::default_secrets_directory;
use crate::transport::adhoc::AdHocInputConfig;
use crate::transport::clock::ClockConfig;
use crate::transport::datagen::DatagenInputConfig;
use crate::transport::delta_table::{DeltaTableReaderConfig, DeltaTableWriterConfig};
use crate::transport::file::{FileInputConfig, FileOutputConfig};
use crate::transport::http::HttpInputConfig;
use crate::transport::iceberg::IcebergReaderConfig;
use crate::transport::kafka::{KafkaInputConfig, KafkaOutputConfig};
use crate::transport::nats::NatsInputConfig;
use crate::transport::nexmark::NexmarkInputConfig;
use crate::transport::postgres::{PostgresReaderConfig, PostgresWriterConfig};
use crate::transport::pubsub::PubSubInputConfig;
use crate::transport::redis::RedisOutputConfig;
use crate::transport::s3::S3InputConfig;
use crate::transport::url::UrlInputConfig;
use core::fmt;
use feldera_ir::{MirNode, MirNodeId};
use serde::de::{self, MapAccess, Visitor};
use serde::{Deserialize, Deserializer, Serialize};
use serde_json::Value as JsonValue;
use std::collections::HashMap;
use std::fmt::Display;
use std::path::Path;
use std::str::FromStr;
use std::time::Duration;
use std::{borrow::Cow, cmp::max, collections::BTreeMap};
use utoipa::ToSchema;
use utoipa::openapi::{ObjectBuilder, OneOfBuilder, Ref, RefOr, Schema, SchemaType};
const DEFAULT_MAX_PARALLEL_CONNECTOR_INIT: u64 = 10;
/// Default value of `ConnectorConfig::max_queued_records`.
pub const fn default_max_queued_records() -> u64 {
1_000_000
}
pub const DEFAULT_MAX_WORKER_BATCH_SIZE: u64 = 10_000;
pub const DEFAULT_CLOCK_RESOLUTION_USECS: u64 = 1_000_000;
/// Program information included in the pipeline configuration.
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema, PartialEq, Eq)]
pub struct ProgramIr {
/// The MIR of the program.
pub mir: HashMap<MirNodeId, MirNode>,
/// Program schema.
pub program_schema: ProgramSchema,
}
/// Pipeline deployment configuration.
/// It represents configuration entries directly provided by the user
/// (e.g., runtime configuration) and entries derived from the schema
/// of the compiled program (e.g., connectors). Storage configuration,
/// if applicable, is set by the runner.
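///
/// A minimal illustrative configuration in JSON form (a sketch only; the pipeline name
/// is a placeholder and most optional fields are omitted). Note that the fields of
/// [`RuntimeConfig`] are flattened into the top level:
///
/// ```json
/// {
///   "name": "pipeline-00000000-0000-0000-0000-000000000000",
///   "workers": 8,
///   "inputs": {},
///   "outputs": {}
/// }
/// ```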
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema, PartialEq)]
pub struct PipelineConfig {
/// Global controller configuration.
#[serde(flatten)]
#[schema(inline)]
pub global: RuntimeConfig,
/// Configuration for multihost pipelines.
///
/// The presence of this field indicates that the pipeline is running in
/// multihost mode. In the pod with ordinal 0, this triggers starting the
/// coordinator process. In all pods, this tells the pipeline process to
/// await a connection from the coordinator instead of initializing the
/// pipeline immediately.
pub multihost: Option<MultihostConfig>,
/// Unique system-generated name of the pipeline (format: `pipeline-<uuid>`).
/// It is unique across all tenants and cannot be changed.
///
/// The `<uuid>` is also used in the naming of various resources that back the pipeline,
/// and as such this name is useful to find/identify corresponding resources.
pub name: Option<String>,
/// Name given by the tenant to the pipeline. It is only unique within the same tenant, and can
/// be changed by the tenant when the pipeline is stopped.
///
/// Given a specific tenant, it can be used to find/identify a specific pipeline of theirs.
pub given_name: Option<String>,
/// Configuration for persistent storage
///
/// If `global.storage` is `Some(_)`, this field must be set to some
/// [`StorageConfig`]. If `global.storage` is `None`, the pipeline ignores
/// this field.
#[serde(default)]
pub storage_config: Option<StorageConfig>,
/// Directory containing values of secrets.
///
/// If this is not set, a default directory is used.
pub secrets_dir: Option<String>,
/// Input endpoint configuration.
#[serde(default)]
pub inputs: BTreeMap<Cow<'static, str>, InputEndpointConfig>,
/// Output endpoint configuration.
#[serde(default)]
pub outputs: BTreeMap<Cow<'static, str>, OutputEndpointConfig>,
/// Program information.
#[serde(default)]
pub program_ir: Option<ProgramIr>,
}
impl PipelineConfig {
pub fn max_parallel_connector_init(&self) -> u64 {
max(
self.global
.max_parallel_connector_init
.unwrap_or(DEFAULT_MAX_PARALLEL_CONNECTOR_INIT),
1,
)
}
pub fn with_storage(self, storage: Option<(StorageConfig, StorageOptions)>) -> Self {
let (storage_config, storage_options) = storage.unzip();
Self {
global: RuntimeConfig {
storage: storage_options,
..self.global
},
storage_config,
..self
}
}
pub fn storage(&self) -> Option<(&StorageConfig, &StorageOptions)> {
let storage_options = self.global.storage.as_ref();
let storage_config = self.storage_config.as_ref();
storage_config.zip(storage_options)
}
/// Returns `self.secrets_dir`, or the default secrets directory if it isn't
/// set.
pub fn secrets_dir(&self) -> &Path {
match &self.secrets_dir {
Some(dir) => Path::new(dir.as_str()),
None => default_secrets_directory(),
}
}
/// Abbreviated config that can be printed in the log on pipeline startup.
pub fn display_summary(&self) -> String {
// TODO: we may want to further abbreviate connector config.
let summary = serde_json::json!({
"name": self.name,
"given_name": self.given_name,
"global": self.global,
"storage_config": self.storage_config,
"secrets_dir": self.secrets_dir,
"inputs": self.inputs,
"outputs": self.outputs
});
serde_json::to_string_pretty(&summary).unwrap_or_else(|_| "{}".to_string())
}
}
/// A subset of fields in `PipelineConfig` that are generated by the compiler.
/// These fields are shipped to the pipeline by the compilation server along with
/// the program binary.
// Note: An alternative would be to embed these fields in the program binary itself
// as static strings. This would work well for program IR, but it would require recompiling
// the program anytime a connector config changes, whereas today connector changes
// do not require recompilation.
#[derive(Default, Deserialize, Serialize, Eq, PartialEq, Debug, Clone)]
pub struct PipelineConfigProgramInfo {
/// Input endpoint configuration.
pub inputs: BTreeMap<Cow<'static, str>, InputEndpointConfig>,
/// Output endpoint configuration.
#[serde(default)]
pub outputs: BTreeMap<Cow<'static, str>, OutputEndpointConfig>,
/// Program information.
#[serde(default)]
pub program_ir: Option<ProgramIr>,
}
/// Configuration for a multihost Feldera pipeline.
///
/// This configuration is primarily for the coordinator.
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
pub struct MultihostConfig {
/// Number of hosts to launch.
///
/// For the configuration to be truly multihost, this should be at least 2.
/// A value of 1 still runs the multihost coordinator but it only
/// coordinates a single host.
pub hosts: usize,
}
impl Default for MultihostConfig {
fn default() -> Self {
Self { hosts: 1 }
}
}
/// Configuration for persistent storage in a [`PipelineConfig`].
#[derive(Debug, Clone, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
pub struct StorageConfig {
/// A directory to keep pipeline state, as a path on the filesystem of the
/// machine or container where the pipeline will run.
///
/// When storage is enabled, this directory stores the data for
/// [StorageBackendConfig::Default].
///
/// When fault tolerance is enabled, this directory stores checkpoints and
/// the log.
pub path: String,
/// How to cache access to storage in this pipeline.
#[serde(default)]
pub cache: StorageCacheConfig,
}
impl StorageConfig {
pub fn path(&self) -> &Path {
Path::new(&self.path)
}
}
/// How to cache access to storage within a Feldera pipeline.
#[derive(Copy, Clone, Default, Deserialize, Serialize, Debug, PartialEq, Eq, ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum StorageCacheConfig {
/// Use the operating system's page cache as the primary storage cache.
///
/// This is the default because it currently performs better than
/// `FelderaCache`.
#[default]
PageCache,
/// Use Feldera's internal cache implementation.
///
/// This is under development. It will become the default when its
/// performance exceeds that of `PageCache`.
FelderaCache,
}
impl StorageCacheConfig {
#[cfg(unix)]
pub fn to_custom_open_flags(&self) -> i32 {
match self {
StorageCacheConfig::PageCache => (),
StorageCacheConfig::FelderaCache => {
#[cfg(target_os = "linux")]
return libc::O_DIRECT;
}
}
0
}
}
/// Storage configuration for a pipeline.
#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
#[serde(default)]
pub struct StorageOptions {
/// How to connect to the underlying storage.
pub backend: StorageBackendConfig,
/// For a batch of data maintained as part of a persistent index during a
/// pipeline run, the minimum estimated number of bytes to write it to
/// storage.
///
/// This is provided for debugging and fine-tuning and should ordinarily be
/// left unset.
///
/// A value of 0 will write even empty batches to storage, and nonzero
/// values provide a threshold. `usize::MAX` would effectively disable
/// storage for such batches. The default is 10,485,760 (10 MiB).
pub min_storage_bytes: Option<usize>,
/// For a batch of data passed through the pipeline during a single step,
/// the minimum estimated number of bytes to write it to storage.
///
/// This is provided for debugging and fine-tuning and should ordinarily be
/// left unset. A value of 0 will write even empty batches to storage, and
/// nonzero values provide a threshold. `usize::MAX`, the default,
/// effectively disables storage for such batches. If it is set to another
/// value, it should ordinarily be greater than or equal to
/// `min_storage_bytes`.
pub min_step_storage_bytes: Option<usize>,
/// The form of compression to use in data batches.
///
/// Compression has a CPU cost but it can take better advantage of limited
/// NVMe and network bandwidth, which means that it can increase overall
/// performance.
pub compression: StorageCompression,
/// The maximum size of the in-memory storage cache, in MiB.
///
/// If set, the specified cache size is spread across all the foreground and
/// background threads. If unset, each foreground or background thread cache
/// is limited to 256 MiB.
pub cache_mib: Option<usize>,
}
/// Backend storage configuration.
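///
/// With the adjacently tagged representation declared below (`name`/`config`), a backend
/// appears in configuration as, for example (an illustrative sketch):
///
/// ```json
/// { "name": "object", "config": { "url": "s3://my-bucket/state" } }
/// ```
///
/// The `default` variant carries no payload and is written as `{ "name": "default" }`.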
#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
#[serde(tag = "name", content = "config", rename_all = "snake_case")]
pub enum StorageBackendConfig {
/// Use the default storage configuration.
///
/// This currently uses the local file system.
#[default]
Default,
/// Use the local file system.
///
/// This uses ordinary system file operations.
File(Box<FileBackendConfig>),
/// Object storage.
Object(ObjectStorageConfig),
}
impl Display for StorageBackendConfig {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
StorageBackendConfig::Default => write!(f, "default"),
StorageBackendConfig::File(_) => write!(f, "file"),
StorageBackendConfig::Object(_) => write!(f, "object"),
}
}
}
/// Storage compression algorithm.
#[derive(Debug, Copy, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
#[serde(rename_all = "snake_case")]
pub enum StorageCompression {
/// Use Feldera's default compression algorithm.
///
/// The default may change as Feldera's performance is tuned and new
/// algorithms are introduced.
#[default]
Default,
/// Do not compress.
None,
/// Use [Snappy](https://en.wikipedia.org/wiki/Snappy_(compression)) compression.
Snappy,
}
#[derive(Debug, Clone, Eq, PartialEq)]
pub enum StartFromCheckpoint {
Latest,
Uuid(uuid::Uuid),
}
impl ToSchema<'_> for StartFromCheckpoint {
fn schema() -> (
&'static str,
utoipa::openapi::RefOr<utoipa::openapi::schema::Schema>,
) {
(
"StartFromCheckpoint",
utoipa::openapi::RefOr::T(Schema::OneOf(
OneOfBuilder::new()
.item(
ObjectBuilder::new()
.schema_type(SchemaType::String)
.enum_values(Some(["latest"].into_iter()))
.build(),
)
.item(
ObjectBuilder::new()
.schema_type(SchemaType::String)
.format(Some(utoipa::openapi::SchemaFormat::KnownFormat(
utoipa::openapi::KnownFormat::Uuid,
)))
.build(),
)
.nullable(true)
.build(),
)),
)
}
}
impl<'de> Deserialize<'de> for StartFromCheckpoint {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: Deserializer<'de>,
{
struct StartFromCheckpointVisitor;
impl<'de> Visitor<'de> for StartFromCheckpointVisitor {
type Value = StartFromCheckpoint;
fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
formatter.write_str("a UUID string or the string \"latest\"")
}
fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
where
E: de::Error,
{
if value == "latest" {
Ok(StartFromCheckpoint::Latest)
} else {
uuid::Uuid::parse_str(value)
.map(StartFromCheckpoint::Uuid)
.map_err(|_| E::invalid_value(serde::de::Unexpected::Str(value), &self))
}
}
}
deserializer.deserialize_str(StartFromCheckpointVisitor)
}
}
impl Serialize for StartFromCheckpoint {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
match self {
StartFromCheckpoint::Latest => serializer.serialize_str("latest"),
StartFromCheckpoint::Uuid(uuid) => serializer.serialize_str(&uuid.to_string()),
}
}
}
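// A minimal illustrative check of the accepted forms (a hypothetical test, not part of
// the original suite): the string "latest" and a UUID string are accepted, anything
// else is rejected.
#[cfg(test)]
#[test]
fn start_from_checkpoint_accepts_latest_or_uuid() {
    assert_eq!(
        serde_json::from_str::<StartFromCheckpoint>(r#""latest""#).unwrap(),
        StartFromCheckpoint::Latest
    );
    assert_eq!(
        serde_json::from_str::<StartFromCheckpoint>(r#""00000000-0000-0000-0000-000000000000""#)
            .unwrap(),
        StartFromCheckpoint::Uuid(uuid::Uuid::nil())
    );
    assert!(serde_json::from_str::<StartFromCheckpoint>(r#""not-a-uuid""#).is_err());
}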
#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
pub struct SyncConfig {
/// The endpoint URL for the storage service.
///
/// This is typically required for custom or local S3-compatible storage providers like MinIO.
/// Example: `http://localhost:9000`
///
/// Relevant rclone config key: [`endpoint`](https://rclone.org/s3/#s3-endpoint)
pub endpoint: Option<String>,
/// The name of the storage bucket.
///
/// This may include a path to a folder inside the bucket (e.g., `my-bucket/data`).
pub bucket: String,
/// The region that this bucket is in.
///
/// Leave empty for MinIO or the default region (`us-east-1` for AWS).
pub region: Option<String>,
/// The name of the cloud storage provider (e.g., `"AWS"`, `"Minio"`).
///
/// Used for provider-specific behavior in rclone.
/// If omitted, defaults to `"Other"`.
///
/// See [rclone S3 provider documentation](https://rclone.org/s3/#s3-provider)
pub provider: Option<String>,
/// The access key used to authenticate with the storage provider.
///
/// If not provided, rclone will fall back to environment-based credentials, such as
/// `RCLONE_S3_ACCESS_KEY_ID`. In Kubernetes environments using IRSA (IAM Roles for Service Accounts),
/// this can be left empty to allow automatic authentication via the pod's service account.
pub access_key: Option<String>,
/// The secret key used together with the access key for authentication.
///
/// If not provided, rclone will fall back to environment-based credentials, such as
/// `RCLONE_S3_SECRET_ACCESS_KEY`. In Kubernetes environments using IRSA (IAM Roles for Service Accounts),
/// this can be left empty to allow automatic authentication via the pod's service account.
pub secret_key: Option<String>,
/// When set, the pipeline will try to fetch the specified checkpoint from the
/// object store.
///
/// If fetching the checkpoint fails and `fail_if_no_checkpoint` is `true`, the pipeline
/// will fail to initialize.
pub start_from_checkpoint: Option<StartFromCheckpoint>,
/// When true, the pipeline will fail to initialize if fetching the
/// specified checkpoint fails (missing, download error).
/// When false, the pipeline will start from scratch instead.
///
/// False by default.
#[schema(default = std::primitive::bool::default)]
#[serde(default)]
pub fail_if_no_checkpoint: bool,
/// The number of file transfers to run in parallel.
/// Default: 20
pub transfers: Option<u8>,
/// The number of checkers to run in parallel.
/// Default: 20
pub checkers: Option<u8>,
/// Set to skip the post-copy checksum verification and only check file sizes.
/// This can significantly improve throughput.
/// Default: false
pub ignore_checksum: Option<bool>,
/// Number of streams to use for multi-thread downloads.
/// Default: 10
pub multi_thread_streams: Option<u8>,
/// Use multi-thread download for files above this size.
/// Format: `[size][Suffix]` (Example: 1G, 500M)
/// Supported suffixes: k|M|G|T
/// Default: 100M
pub multi_thread_cutoff: Option<String>,
/// The number of chunks of the same file that are uploaded for multipart uploads.
/// Default: 10
pub upload_concurrency: Option<u8>,
/// When `true`, the pipeline starts in **standby** mode; processing doesn't
/// start until activation (`POST /activate`).
/// If this pipeline was previously activated and its storage has not been
/// cleared, the pipeline auto-activates and no newer checkpoints are
/// fetched.
///
/// Standby behavior depends on `start_from_checkpoint`:
/// - If `latest`, pipeline continuously fetches the latest available
/// checkpoint until activated.
/// - If checkpoint UUID, pipeline fetches this checkpoint once and waits
/// in standby until activated.
///
/// Default: `false`
#[schema(default = std::primitive::bool::default)]
#[serde(default)]
pub standby: bool,
/// The interval (in seconds) between each attempt to fetch the latest
/// checkpoint from object store while in standby mode.
///
/// Applies only when `start_from_checkpoint` is set to `latest`.
///
/// Default: 10 seconds
#[schema(default = default_pull_interval)]
#[serde(default = "default_pull_interval")]
pub pull_interval: u64,
/// The interval (in seconds) between each push of checkpoints to object store.
///
/// Default: disabled (no periodic push).
#[serde(default)]
pub push_interval: Option<u64>,
/// Extra flags to pass to `rclone`.
///
/// WARNING: Supplying incorrect or conflicting flags can break `rclone`.
/// Use with caution.
///
/// Refer to the docs to see the supported flags:
/// - [Global flags](https://rclone.org/flags/)
/// - [S3 specific flags](https://rclone.org/s3/)
pub flags: Option<Vec<String>>,
/// The minimum number of checkpoints to retain in object store.
/// No checkpoints will be deleted if the total count is below this threshold.
///
/// Default: 10
#[schema(default = default_retention_min_count)]
#[serde(default = "default_retention_min_count")]
pub retention_min_count: u32,
/// The minimum age (in days) a checkpoint must reach before it becomes
/// eligible for deletion. All younger checkpoints will be preserved.
///
/// Default: 30
#[schema(default = default_retention_min_age)]
#[serde(default = "default_retention_min_age")]
pub retention_min_age: u32,
/// A read-only bucket used as a fallback checkpoint source.
///
/// When the pipeline has no local checkpoint and `bucket` contains no
/// checkpoint either, it will attempt to fetch the checkpoint from this
/// location instead. All connection settings (`endpoint`, `region`,
/// `provider`, `access_key`, `secret_key`) are shared with `bucket`.
///
/// The pipeline **never writes** to `read_bucket`.
///
/// Must point to a different location than `bucket`.
#[serde(default)]
pub read_bucket: Option<String>,
}
fn default_pull_interval() -> u64 {
10
}
fn default_retention_min_count() -> u32 {
10
}
fn default_retention_min_age() -> u32 {
30
}
impl SyncConfig {
pub fn validate(&self) -> Result<(), String> {
if self.standby && self.start_from_checkpoint.is_none() {
return Err(r#"invalid sync config: `standby` set to `true` but `start_from_checkpoint` not set.
Standby mode requires `start_from_checkpoint` to be set.
Consider setting `start_from_checkpoint` to `"latest"`."#.to_owned());
}
if let Some(ref rb) = self.read_bucket
&& rb == &self.bucket
{
return Err(
"invalid sync config: `read_bucket` and `bucket` must point to different locations"
.to_owned(),
);
}
Ok(())
}
}
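// Illustrative checks of the validation rules above (a hypothetical test, not part of
// the original suite): standby requires a checkpoint source, and `read_bucket` must
// differ from `bucket`.
#[cfg(test)]
#[test]
fn sync_config_validation_rules() {
    let standby_without_checkpoint = SyncConfig {
        standby: true,
        ..SyncConfig::default()
    };
    assert!(standby_without_checkpoint.validate().is_err());
    let same_buckets = SyncConfig {
        bucket: "checkpoints".to_string(),
        read_bucket: Some("checkpoints".to_string()),
        ..SyncConfig::default()
    };
    assert!(same_buckets.validate().is_err());
}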
/// Configuration for supplying a custom pipeline StatefulSet template via a Kubernetes ConfigMap.
///
/// Operators can provide a custom StatefulSet YAML that the Kubernetes runner will use when
/// creating pipeline StatefulSets for a pipeline. The custom template must be stored as the
/// value of a key in a ConfigMap in the same namespace as the pipeline; set `name` to the
/// ConfigMap name and `key` to the entry that contains the template.
///
/// Recommendations and requirements:
/// - **Start from the default template and modify it as needed.** The default template is stored
/// in the ConfigMap named `<release-name>-pipeline-template`, under the key `pipelineTemplate`, in the
/// release namespace; use it as a reference.
/// - The template must contain a valid Kubernetes `StatefulSet` manifest in YAML form. The
/// runner substitutes variables in the template before parsing; therefore the final YAML
/// must be syntactically valid.
/// - The runner performs simple string substitution for the following placeholders. Please ensure these
/// placeholders are placed at locations appropriate to their semantics:
/// - `{id}`: pipeline Kubernetes name (used for object names and labels)
/// - `{namespace}`: Kubernetes namespace where the pipeline runs
/// - `{pipeline_executor_image}`: container image used to run the pipeline executor
/// - `{binary_ref}`: program binary reference passed as an argument
/// - `{program_info_ref}`: program info reference passed as an argument
/// - `{pipeline_storage_path}`: mount path for persistent pipeline storage
/// - `{storage_class_name}`: storage class name to use for PVCs (if applicable)
/// - `{deployment_id}`: UUID identifying the deployment instance
/// - `{deployment_initial}`: initial desired runtime status (e.g., `provisioning`)
/// - `{bootstrap_policy}`: bootstrap policy value when applicable
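///
/// An illustrative reference to such a ConfigMap (the ConfigMap name is a placeholder):
///
/// ```json
/// { "name": "my-pipeline-template", "key": "pipelineTemplate" }
/// ```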
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, ToSchema)]
pub struct PipelineTemplateConfig {
/// Name of the ConfigMap containing the pipeline template.
pub name: String,
/// Key in the ConfigMap containing the pipeline template.
///
/// If not set, defaults to `pipelineTemplate`.
#[schema(default = default_pipeline_template_key)]
#[serde(default = "default_pipeline_template_key")]
pub key: String,
}
fn default_pipeline_template_key() -> String {
"pipelineTemplate".to_string()
}
#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
pub struct ObjectStorageConfig {
/// URL.
///
/// The following URL schemes are supported:
///
/// * S3:
/// - `s3://<bucket>/<path>`
/// - `s3a://<bucket>/<path>`
/// - `https://s3.<region>.amazonaws.com/<bucket>`
/// - `https://<bucket>.s3.<region>.amazonaws.com`
/// - `https://ACCOUNT_ID.r2.cloudflarestorage.com/bucket`
/// * Google Cloud Storage:
/// - `gs://<bucket>/<path>`
/// * Microsoft Azure Blob Storage:
/// - `abfs[s]://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
/// - `abfs[s]://<file_system>@<account_name>.dfs.core.windows.net/<path>`
/// - `abfs[s]://<file_system>@<account_name>.dfs.fabric.microsoft.com/<path>`
/// - `az://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
/// - `adl://<container>/<path>` (according to [fsspec](https://github.com/fsspec/adlfs))
/// - `azure://<container>/<path>` (custom)
/// - `https://<account>.dfs.core.windows.net`
/// - `https://<account>.blob.core.windows.net`
/// - `https://<account>.blob.core.windows.net/<container>`
/// - `https://<account>.dfs.fabric.microsoft.com`
/// - `https://<account>.dfs.fabric.microsoft.com/<container>`
/// - `https://<account>.blob.fabric.microsoft.com`
/// - `https://<account>.blob.fabric.microsoft.com/<container>`
///
/// Settings derived from the URL will override other settings.
pub url: String,
/// Additional options as key-value pairs.
///
/// The following keys are supported:
///
/// * S3:
/// - `access_key_id`: AWS Access Key.
/// - `secret_access_key`: AWS Secret Access Key.
/// - `region`: Region.
/// - `default_region`: Default region.
/// - `endpoint`: Custom endpoint for communicating with S3,
/// e.g. `https://localhost:4566` for testing against a localstack
/// instance.
/// - `token`: Token to use for requests (passed to underlying provider).
/// - [Other keys](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants).
/// * Google Cloud Storage:
/// - `service_account`: Path to the service account file.
/// - `service_account_key`: The serialized service account key.
/// - `google_application_credentials`: Application credentials path.
/// - [Other keys](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html).
/// * Microsoft Azure Blob Storage:
/// - `access_key`: Azure Access Key.
/// - `container_name`: Azure Container Name.
/// - `account`: Azure Account.
/// - `bearer_token_authorization`: Static bearer token for authorizing requests.
/// - `client_id`: Client ID for use in client secret or Kubernetes federated credential flow.
/// - `client_secret`: Client secret for use in client secret flow.
/// - `tenant_id`: Tenant ID for use in client secret or Kubernetes federated credential flow.
/// - `endpoint`: Override the endpoint for communicating with blob storage.
/// - [Other keys](https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants).
///
/// Options set through the URL take precedence over those set with these
/// options.
#[serde(flatten)]
pub other_options: BTreeMap<String, String>,
}
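// Sketch of how the flattened options behave (a hypothetical test, not part of the
// original suite): keys other than `url` are collected into `other_options`.
#[cfg(test)]
#[test]
fn object_storage_config_collects_extra_options() {
    let cfg: ObjectStorageConfig =
        serde_json::from_str(r#"{"url": "s3://my-bucket/state", "region": "us-east-1"}"#).unwrap();
    assert_eq!(cfg.url, "s3://my-bucket/state");
    assert_eq!(
        cfg.other_options.get("region").map(String::as_str),
        Some("us-east-1")
    );
}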
/// Configuration for local file system access.
#[derive(Debug, Clone, Default, Eq, PartialEq, Serialize, Deserialize, ToSchema)]
#[serde(default)]
pub struct FileBackendConfig {
/// Whether to use background threads for file I/O.
///
/// Background threads should improve performance, but they can reduce
/// performance if too few cores are available. This is provided for
/// debugging and fine-tuning and should ordinarily be left unset.
pub async_threads: Option<bool>,
/// Per-I/O operation sleep duration, in milliseconds.
///
/// This is for simulating slow storage devices. Do not use this in
/// production.
pub ioop_delay: Option<u64>,
/// Configuration to synchronize checkpoints to object store.
pub sync: Option<SyncConfig>,
}
/// Global pipeline configuration settings. This is the publicly
/// exposed type for users to configure pipelines.
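///
/// A small illustrative fragment in JSON form (a sketch only; all fields have defaults
/// and may be omitted):
///
/// ```json
/// {
///   "workers": 8,
///   "storage": { "backend": { "name": "default" } },
///   "logging": "info"
/// }
/// ```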
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, ToSchema)]
#[serde(default)]
pub struct RuntimeConfig {
/// Number of DBSP worker threads.
///
/// Each DBSP "foreground" worker thread is paired with a "background"
/// thread for LSM merging, making the total number of threads twice the
/// specified number.
///
/// The typical sweet spot for the number of workers is between 4 and 16.
/// Each worker increases overall memory consumption for data structures
/// used during a step.
pub workers: u16,
/// Number of DBSP hosts.
///
/// The worker threads are evenly divided among the hosts. For single-host
/// deployments, this should be 1 (the default).
///
/// Multihost pipelines are an enterprise-only preview feature.
pub hosts: usize,
/// Storage configuration.
///
/// - If this is `None`, the default, the pipeline's state is kept in
/// in-memory data-structures. This is useful if the pipeline's state
/// will fit in memory and if the pipeline is ephemeral and does not need
/// to be recovered after a restart. The pipeline will most likely run
/// faster since it does not need to access storage.
///
/// - If set, the pipeline's state is kept on storage. This allows the
/// pipeline to work with state that will not fit into memory. It also
/// allows the state to be checkpointed and recovered across restarts.
#[serde(deserialize_with = "deserialize_storage_options")]
pub storage: Option<StorageOptions>,
/// Fault tolerance configuration.
#[serde(deserialize_with = "deserialize_fault_tolerance")]
pub fault_tolerance: FtConfig,
/// Enable CPU profiler.
///
/// The default value is `true`.
pub cpu_profiler: bool,
/// Enable pipeline tracing.
pub tracing: bool,
/// Jaeger tracing endpoint to send tracing information to.
pub tracing_endpoint_jaeger: String,
/// Minimal input batch size.
///
/// The controller delays pushing input records to the circuit until at
/// least `min_batch_size_records` records have been received (total
/// across all endpoints) or `max_buffering_delay_usecs` microseconds
/// have passed since at least one input record has been buffered.
/// Defaults to 0.
pub min_batch_size_records: u64,
/// Maximal delay in microseconds to wait for `min_batch_size_records` to
/// get buffered by the controller, defaults to 0.
pub max_buffering_delay_usecs: u64,
/// Resource reservations and limits. This is enforced
/// only in Feldera Cloud.
pub resources: ResourceConfig,
/// Real-time clock resolution in microseconds.
///
/// This parameter controls the execution of queries that use the `NOW()` function. The output of such
/// queries depends on the real-time clock and can change over time without any external
/// inputs. If the query uses `NOW()`, the pipeline will update the clock value and trigger incremental
/// recomputation at most once every `clock_resolution_usecs` microseconds. If the query does not use
/// `NOW()`, then clock value updates are suppressed and the pipeline ignores this setting.
///
/// It is set to 1 second (1,000,000 microseconds) by default.
pub clock_resolution_usecs: Option<u64>,
/// Optionally, a list of CPU numbers for CPUs to which the pipeline may pin
/// its worker threads. Specify at least twice as many CPU numbers as
/// workers. CPUs are generally numbered starting from 0. The pipeline
/// might not be able to honor CPU pinning requests.
///
/// CPU pinning can make pipelines run faster and perform more consistently,
/// as long as different pipelines running on the same machine are pinned to
/// different CPUs.
pub pin_cpus: Vec<usize>,
/// Timeout in seconds for the `Provisioning` phase of the pipeline.
/// Setting this value will override the default of the runner.
pub provisioning_timeout_secs: Option<u64>,
/// The maximum number of connectors initialized in parallel during pipeline
/// startup.
///
/// At startup, the pipeline must initialize all of its input and output connectors.
/// Depending on the number and types of connectors, this can take a long time.
/// To accelerate the process, multiple connectors are initialized concurrently.
/// This option controls the maximum number of connectors that can be initialized
/// in parallel.
///
/// The default is 10.
pub max_parallel_connector_init: Option<u64>,
/// Specification of additional (sidecar) containers.
pub init_containers: Option<serde_json::Value>,
/// Deprecated: setting this to true or false no longer has any effect.
pub checkpoint_during_suspend: bool,
/// Sets the number of available runtime threads for the http server.
///
/// In most cases, this does not need to be set explicitly and
/// the default is sufficient. Can be increased in case the
/// pipeline HTTP API operations are a bottleneck.
///
/// If not specified, the default is set to `workers`.
pub http_workers: Option<u64>,
/// Sets the number of available runtime threads for async IO tasks.
///
/// This affects some networking and file I/O operations
/// especially adapters and ad-hoc queries.
///
/// In most cases, this does not need to be set explicitly and
/// the default is sufficient. Can be increased in case
/// ingress, egress or ad-hoc queries are a bottleneck.
///
/// If not specified, the default is set to `workers`.
pub io_workers: Option<u64>,
/// Environment variables for the pipeline process.
///
/// These are key-value pairs injected into the pipeline process environment.
/// Some variable names are reserved by the platform and cannot be overridden
/// (for example `RUST_LOG`, and variables in the `FELDERA_`,
/// `KUBERNETES_`, and `TOKIO_` namespaces).
#[serde(default)]
pub env: BTreeMap<String, String>,
/// Optional settings for tweaking Feldera internals.
///
/// The available key-value pairs change from one version of Feldera to
/// another, so users should not depend on particular settings being
/// available, or on their behavior.
pub dev_tweaks: BTreeMap<String, serde_json::Value>,
/// Log filtering directives.
///
/// If set to a valid [tracing-subscriber] filter, this controls the log
/// messages emitted by the pipeline process. Otherwise, or if the filter
/// has invalid syntax, messages at "info" severity and higher are written
/// to the log and all others are discarded.
///
/// [tracing-subscriber]: https://docs.rs/tracing-subscriber/latest/tracing_subscriber/filter/struct.EnvFilter.html#directives
pub logging: Option<String>,
/// ConfigMap containing a custom pipeline template (Enterprise only).
///
/// This feature is only available in Feldera Enterprise. If set, the Kubernetes runner
/// will read the template from the specified ConfigMap and use it instead of the default
/// StatefulSet template for the configured pipeline.
///
/// See the [`PipelineTemplateConfig`] documentation for details.
pub pipeline_template_configmap: Option<PipelineTemplateConfig>,
}
/// Accepts "true" and "false" and converts them to the new format.
fn deserialize_storage_options<'de, D>(deserializer: D) -> Result<Option<StorageOptions>, D::Error>
where
D: Deserializer<'de>,
{
struct BoolOrStruct;
impl<'de> Visitor<'de> for BoolOrStruct {
type Value = Option<StorageOptions>;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("boolean or StorageOptions")
}
fn visit_bool<E>(self, v: bool) -> Result<Self::Value, E>
where
E: de::Error,
{
match v {
false => Ok(None),
true => Ok(Some(StorageOptions::default())),
}
}
fn visit_unit<E>(self) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(None)
}
fn visit_none<E>(self) -> Result<Self::Value, E>
where
E: de::Error,
{
Ok(None)
}
fn visit_map<M>(self, map: M) -> Result<Option<StorageOptions>, M::Error>
where
M: MapAccess<'de>,
{
Deserialize::deserialize(de::value::MapAccessDeserializer::new(map)).map(Some)
}
}
deserializer.deserialize_any(BoolOrStruct)
}
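// Sketch of the backward-compatible forms accepted above (a hypothetical test, not part
// of the original suite): a legacy boolean `storage` value maps onto the new
// `StorageOptions` representation.
#[cfg(test)]
#[test]
fn storage_accepts_legacy_bool() {
    let enabled: RuntimeConfig = serde_json::from_str(r#"{"storage": true}"#).unwrap();
    assert_eq!(enabled.storage, Some(StorageOptions::default()));
    let disabled: RuntimeConfig = serde_json::from_str(r#"{"storage": false}"#).unwrap();
    assert_eq!(disabled.storage, None);
}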
/// Accepts very old 'initial_state' and 'latest_checkpoint' as enabling fault
/// tolerance.
///
/// Accepts `null` as disabling fault tolerance.
///
/// Otherwise, deserializes [FtConfig] in the way that one might otherwise
/// expect.
fn deserialize_fault_tolerance<'de, D>(deserializer: D) -> Result<FtConfig, D::Error>
where
D: Deserializer<'de>,
{
struct StringOrStruct;
impl<'de> Visitor<'de> for StringOrStruct {
type Value = FtConfig;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
formatter.write_str("none or FtConfig or 'initial_state' or 'latest_checkpoint'")
}