-
Notifications
You must be signed in to change notification settings - Fork 108
Expand file tree
/
Copy pathsync.rs
More file actions
244 lines (205 loc) · 7.99 KB
/
sync.rs
File metadata and controls
244 lines (205 loc) · 7.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
#![allow(unused_imports)]
use anyhow::Context;
use std::sync::{
Arc, LazyLock, Mutex, Weak,
atomic::{AtomicU64, Ordering},
};
use dbsp::circuit::{CircuitStorageConfig, checkpointer::Checkpointer};
use feldera_adapterlib::errors::journal::ControllerError;
use feldera_storage::{
StorageBackend, StoragePath, checkpoint_synchronizer::CheckpointSynchronizer,
histogram::ExponentialHistogram,
};
use feldera_types::{
checkpoint::CheckpointMetadata,
config::{FileBackendConfig, StorageBackendConfig, SyncConfig},
constants::ACTIVATION_MARKER_FILE,
};
use crate::server::ServerState;
// Pull metrics
//
// These are updated by `pull_and_gc` after every pull attempt.
/// Bytes transferred when pulling a checkpoint.
///
/// Recorded only when the synchronizer reports transfer metrics for a
/// checkpoint whose UUID differs from the previously pulled one.
pub static CHECKPOINT_SYNC_PULL_TRANSFERRED_BYTES: ExponentialHistogram =
ExponentialHistogram::new();
/// Transfer speed when pulling a checkpoint, in bytes per second.
///
/// Recorded under the same conditions as the transferred-bytes histogram.
pub static CHECKPOINT_SYNC_PULL_TRANSFER_SPEED: ExponentialHistogram = ExponentialHistogram::new();
/// Number of checkpoints pulled successfully.
///
/// Incremented only when the pulled checkpoint's UUID differs from the UUID
/// remembered from the previous pull, so re-pulling the same checkpoint is
/// not counted again.
pub static CHECKPOINT_SYNC_PULL_SUCCESS: AtomicU64 = AtomicU64::new(0);
/// Number of failures when pulling a checkpoint.
///
/// Incremented once for every pull attempt that returns an error.
pub static CHECKPOINT_SYNC_PULL_FAILURES: AtomicU64 = AtomicU64::new(0);
/// Time taken to pull a checkpoint from object store in seconds.
///
/// The value is recorded via `duration.as_secs()`.
pub static CHECKPOINT_SYNC_PULL_DURATION_SECONDS: ExponentialHistogram =
ExponentialHistogram::new();
// Push metrics
//
// NOTE(review): nothing in this part of the file updates the push metrics;
// presumably the checkpoint push path elsewhere records them -- confirm.
/// Bytes transferred when pushing a checkpoint.
pub static CHECKPOINT_SYNC_PUSH_TRANSFERRED_BYTES: ExponentialHistogram =
ExponentialHistogram::new();
/// Transfer speed when pushing a checkpoint, in bytes per second.
pub static CHECKPOINT_SYNC_PUSH_TRANSFER_SPEED: ExponentialHistogram = ExponentialHistogram::new();
/// Number of checkpoints pushed successfully.
pub static CHECKPOINT_SYNC_PUSH_SUCCESS: AtomicU64 = AtomicU64::new(0);
/// Number of failures when pushing a checkpoint.
pub static CHECKPOINT_SYNC_PUSH_FAILURES: AtomicU64 = AtomicU64::new(0);
/// Time taken to push a checkpoint to object store in seconds.
pub static CHECKPOINT_SYNC_PUSH_DURATION_SECONDS: ExponentialHistogram =
ExponentialHistogram::new();
/// The process-wide checkpoint synchronizer, resolved on first access.
///
/// The value is the first `&dyn CheckpointSynchronizer` registered through
/// `inventory`.
///
/// # Panics
///
/// Panics on first access if no synchronizer has been registered. Enterprise
/// builds are expected to always link the checkpoint-sync crate, which is why
/// the missing case is treated as unreachable.
pub(super) static SYNCHRONIZER: LazyLock<&'static dyn CheckpointSynchronizer> =
    LazyLock::new(|| {
        let mut registered = inventory::iter::<&dyn CheckpointSynchronizer>.into_iter();
        match registered.next() {
            Some(first) => *first,
            None => {
                unreachable!("no checkpoint synchronizer found; are enterprise features enabled?")
            }
        }
    });
/// Pulls the checkpoint selected by `sync` into `storage`, then garbage
/// collects older checkpoints.
///
/// `prev` holds the UUID of the checkpoint obtained by the previous call. The
/// success counter and transfer histograms are only updated when the pulled
/// checkpoint differs from it, after which `prev` is overwritten with the new
/// UUID.
///
/// # Errors
///
/// Returns a checkpoint fetch error (also logged and counted as a pull
/// failure) when the synchronizer fails, or the error produced while creating
/// the checkpointer or running its startup garbage collection.
#[cfg(feature = "feldera-enterprise")]
fn pull_and_gc(
    storage: Arc<dyn StorageBackend>,
    sync: &SyncConfig,
    prev: &mut uuid::Uuid,
) -> Result<CheckpointMetadata, ControllerError> {
    let pulled = SYNCHRONIZER
        .pull(storage.clone(), sync.to_owned())
        .map_err(|e| ControllerError::checkpoint_fetch_error(format!("{e:?}")));

    // Failure path: log, count, and hand the error back to the caller.
    let (metadata, transfer) = match pulled {
        Ok(outcome) => outcome,
        Err(failure) => {
            tracing::error!("{:?}", failure.to_string());
            CHECKPOINT_SYNC_PULL_FAILURES.fetch_add(1, Ordering::Relaxed);
            return Err(failure);
        }
    };

    // Only account for checkpoints we have not seen on the previous pull.
    let is_new_checkpoint = metadata.uuid != *prev;
    if is_new_checkpoint {
        CHECKPOINT_SYNC_PULL_SUCCESS.fetch_add(1, Ordering::Relaxed);
        *prev = metadata.uuid;

        if let Some(stats) = transfer {
            CHECKPOINT_SYNC_PULL_TRANSFER_SPEED.record(stats.speed);
            CHECKPOINT_SYNC_PULL_DURATION_SECONDS.record(stats.duration.as_secs());
            CHECKPOINT_SYNC_PULL_TRANSFERRED_BYTES.record(stats.bytes);
        }
    }

    // Garbage collection runs after every successful pull, new checkpoint or not.
    let gc = Checkpointer::new(storage.clone())?;
    gc.gc_startup()?;

    Ok(metadata)
}
/// Returns the sync configuration when a checkpoint has to be pulled before
/// the pipeline starts.
///
/// That is the case only when the storage backend is the file backend, it has
/// a sync configuration, and that configuration names a checkpoint to start
/// from. In every other case `None` is returned.
#[cfg(feature = "feldera-enterprise")]
pub fn is_pull_necessary(storage: &CircuitStorageConfig) -> Option<&SyncConfig> {
    let StorageBackendConfig::File(file_cfg) = &storage.options.backend else {
        return None;
    };

    file_cfg
        .sync
        .as_ref()
        .filter(|candidate| candidate.start_from_checkpoint.is_some())
}
/// Pulls the checkpoint selected by `sync` a single time and garbage collects
/// older checkpoints, discarding the returned checkpoint metadata.
///
/// # Errors
///
/// Propagates any error returned by `pull_and_gc`.
#[cfg(feature = "feldera-enterprise")]
pub fn pull_once(storage: &CircuitStorageConfig, sync: &SyncConfig) -> Result<(), ControllerError> {
    // No checkpoint has been pulled before, so start from the nil UUID.
    let mut last_seen = uuid::Uuid::nil();
    pull_and_gc(storage.backend.clone(), sync, &mut last_seen).map(|_metadata| ())
}
/// Keeps pulling the configured checkpoint until the pipeline is activated.
///
/// The function blocks the calling thread. It pulls (and garbage collects)
/// once per `sync.pull_interval` seconds, polling `is_activated` after every
/// attempt. Once activation is observed, one final pull is performed
/// immediately so that the pipeline starts from the latest checkpoint, and an
/// activation marker file containing the pulled checkpoint metadata is
/// written to storage.
///
/// Special cases:
/// - If the activation marker file already exists, the pipeline was activated
///   before and the function returns right away without pulling.
/// - If `sync.standby` is `false`, the function returns after the first pull
///   attempt, whether or not that attempt succeeded.
///
/// # Errors
///
/// - `ControllerError::InvalidStandby` if the storage backend is not the file
///   backend, has no sync configuration, or the sync configuration does not
///   specify a checkpoint to start from.
/// - A checkpoint fetch error if the sync configuration fails validation.
/// - The pull error if the final pull after activation fails. Failures of
///   earlier pulls are not returned; the loop simply tries again.
/// - A checkpoint fetch error if writing the activation marker fails and
///   `sync.fail_if_no_checkpoint` is set.
#[cfg(feature = "feldera-enterprise")]
pub fn continuous_pull<F>(
    storage: &CircuitStorageConfig,
    is_activated: F,
) -> Result<(), ControllerError>
where
    F: Fn() -> bool,
{
    let StorageBackendConfig::File(ref file_cfg) = storage.options.backend else {
        return Err(ControllerError::InvalidStandby(
            "standby mode requires file storage backend",
        ));
    };
    let FileBackendConfig {
        sync: Some(ref sync),
        ..
    } = **file_cfg
    else {
        return Err(ControllerError::InvalidStandby(
            "standby mode requires file storage backend to have synchronization configured",
        ));
    };

    sync.validate()
        .map_err(ControllerError::checkpoint_fetch_error)?;

    if sync.start_from_checkpoint.is_none() {
        return Err(ControllerError::InvalidStandby(
            "standby mode requires file storage backend to have synchronization configured to start from a checkpoint",
        ));
    }

    let activation_file = StoragePath::from(ACTIVATION_MARKER_FILE);

    // Best effort: a failure to query storage is treated as "not activated".
    let previously_activated = storage.backend.exists(&activation_file).unwrap_or(false);
    if previously_activated {
        // An unreadable or empty marker is reported as activation from scratch.
        let previous_activation = storage
            .backend
            .read_json::<Option<CheckpointMetadata>>(&activation_file)
            .ok()
            .flatten();

        tracing::info!(
            "this pipeline was previously activated from {}, skipping standby mode",
            previous_activation
                .map(|p| format!("checkpoint '{}'", p.uuid))
                .unwrap_or_else(|| "scratch".to_owned())
        );

        return Ok(());
    }

    let mut cpm = None;
    let mut prev = uuid::Uuid::nil();
    let mut pull_once_again_after_activation = false;

    // This should run at least once before checking for activation, to ensure
    // that we pull a checkpoint if one is available.
    // Also, if we receive an activation signal, we run one more iteration to
    // ensure that we have the latest checkpoint before activating.
    loop {
        match pull_and_gc(storage.backend.clone(), sync, &mut prev) {
            Err(err) => {
                // On our final attempt to pull the checkpoint after
                // activation, if we fail, we should error out and not
                // activate with a potentially stale or missing checkpoint.
                // Earlier failures have already been logged by `pull_and_gc`
                // and are retried on the next iteration.
                if pull_once_again_after_activation {
                    return Err(err);
                }
            }
            Ok(c) => {
                cpm = Some(c);
            }
        };

        if !sync.standby {
            return Ok(());
        }

        if is_activated() {
            if pull_once_again_after_activation {
                // We've already done one iteration after activation, now we're done
                break;
            }
            pull_once_again_after_activation = true;
            // Pull one more time to get the latest checkpoint, and do it
            // right away: sleeping here would delay activation by a full
            // pull interval for no benefit.
            continue;
        }

        std::thread::sleep(std::time::Duration::from_secs(sync.pull_interval));
    }

    tracing::debug!("creating activation marker file: {}", activation_file);
    if let Err(marker_err) = storage
        .backend
        .write_json(&activation_file, &cpm)
        .and_then(|marker| marker.commit())
        .context("failed to write activation marker file")
    {
        tracing::error!("{marker_err:?}");
        // A marker write failure is fatal only when the configuration demands
        // it; otherwise it is logged and activation proceeds.
        if sync.fail_if_no_checkpoint {
            return Err(ControllerError::checkpoint_fetch_error(format!(
                "{marker_err:?}"
            )));
        }
    }

    tracing::info!("pipeline activated");
    Ok(())
}