Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 64 additions & 3 deletions crates/dbsp/src/operator/accumulate_trace.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use crate::{
Circuit, DBData, DBWeight, Stream,
circuit::metadata::MetaItem,
dynamic::{DowncastTrait, DynData, Erase},
dynamic::{DowncastTrait, DynData, Erase, WithFactory},
operator::TraceBound,
trace::{BatchReaderFactories, Filter},
typed_batch::{Batch, DynBatch, DynBatchReader, Spine, TypedBatch, TypedBox},
Expand Down Expand Up @@ -72,8 +72,7 @@ where
C: Circuit,
B: Batch<Time = ()>,
{
/// Like `accumulate_integrate_trace`, but additionally applies a retainment policy to
/// keys in the trace.
/// Applies a retainment policy to keys in the integral of `self`.
///
/// ## Background
///
Expand Down Expand Up @@ -222,6 +221,68 @@ where
);
}

/// Applies a retainment policy that keeps all values above the threshold
/// in `bounds_stream` and up to `n` latest values before the threshold.
///
/// Notifies the garbage collector that it should preserve all values that
/// satisfy the predicate and the last `n` values before the first value that
/// satisfies the predicate for each key. If no value associated with a key
/// satisfies the predicate, the last `n` values are preserved.
///
/// Used to garbage collect streams that need to preserve a fixed number of
/// values below a waterline, regardless of how far in the past they are.
/// Examples include the right-hand side of an asof join and inputs to top-k
/// operators.
///
/// IMPORTANT: this method assumes that for each key in `self`, values are
/// sorted in such a way that once the `retain_value_func` predicate is
/// satisfied for a value, it is also satisfied for all subsequent values.
///
/// # Arguments
///
/// * `bounds_stream` - This stream carries scalar values (i.e., single
/// records, not Z-sets). The value retainment condition is defined
/// relative to the last value received from this stream. Typically, this
/// value represents the lowest upper bound of all partially ordered
/// timestamps in `self` or some other stream, computed with the help of
/// the [`waterline`](`Stream::waterline`) operator and adjusted by some
/// constant offsets, dictated, e.g., by window sizes used in the queries
/// and the maximal out-of-orderness of data in the input streams.
///
/// * `retain_value_func` - given a value and the value received from the
/// `bounds_stream` at the last clock cycle, returns `true` if the value
/// should be retained in the trace and `false` if it can be discarded.
///
/// * `n` - the number of latest values to preserve for each key before the
/// first value that satisfies the predicate.
#[track_caller]
pub fn accumulate_integrate_trace_retain_values_last_n<TS, RV>(
&self,
bounds_stream: &Stream<C, TypedBox<TS, DynData>>,
retain_value_func: RV,
n: usize,
) where
TS: DBData + Erase<DynData>,
RV: Fn(&B::Val, &TS) -> bool + Clone + Send + Sync + 'static,
{
self.inner()
.dyn_accumulate_integrate_trace_retain_values_last_n(
WithFactory::<B::Val>::FACTORY,
&bounds_stream.inner_data(),
// Every new bound value received from `bounds_stream` is turned
// into a fresh `Filter` over the type-erased value type.
Box::new(move |ts: &DynData| {
let metadata = MetaItem::String(format!("{ts:?}"));
let ts = clone_box(ts);
let retain_val_func = retain_value_func.clone();
Filter::new(Box::new(move |v: &B::DynV| {
// SAFETY: `v` and `ts` are the erased representations of
// `B::Val` and `TS` respectively (per the bounds on this
// method), so the downcasts restore the original types.
retain_val_func(unsafe { v.downcast::<B::Val>() }, unsafe {
ts.as_ref().downcast::<TS>()
})
}))
.with_metadata(metadata)
}),
n,
);
}

/// Constructs and returns an untimed trace of this stream.
///
/// The trace is unbounded, meaning that data will not be discarded because
Expand Down
28 changes: 25 additions & 3 deletions crates/dbsp/src/operator/dynamic/accumulate_trace.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use crate::Runtime;
use crate::circuit::circuit_builder::{StreamId, register_replay_stream};
use crate::circuit::metadata::{NUM_ALLOCATIONS_LABEL, NUM_INPUTS_LABEL};
use crate::dynamic::{Weight, WeightTrait};
use crate::dynamic::{Factory, Weight, WeightTrait};
use crate::operator::dynamic::trace::{DelayedTraceId, TraceBounds};
use crate::operator::{TraceBound, require_persistent_id};
use crate::trace::spine_async::WithSnapshot;
use crate::trace::{BatchReaderFactories, Builder, MergeCursor};
use crate::trace::{BatchReaderFactories, Builder, GroupFilter, MergeCursor};
use crate::{
Error, Timestamp,
circuit::{
Expand Down Expand Up @@ -157,7 +157,29 @@ where
bounds.set_unique_val_bound_name(bounds_stream.get_persistent_id().as_deref());

bounds_stream.inspect(move |ts| {
let filter = retain_val_func(ts.as_ref());
let filter = GroupFilter::Simple(retain_val_func(ts.as_ref()));
bounds.set_val_filter(filter);
});
}

/// Dynamically typed implementation of
/// [`Stream::accumulate_integrate_trace_retain_values_last_n`]; see that
/// method for the full contract.
///
/// # Arguments
///
/// * `val_factory` - factory for the erased value type, passed on to the
/// `LastN` group filter.
/// * `bounds_stream` - carries the latest bound value; each new value
/// replaces the trace's value filter.
/// * `retain_val_func` - maps the latest bound value to a [`Filter`] over
/// values.
/// * `n` - number of latest values to preserve per key before the filter
/// first matches (see the typed wrapper for details).
#[track_caller]
pub fn dyn_accumulate_integrate_trace_retain_values_last_n<TS>(
&self,
val_factory: &'static dyn Factory<B::Val>,
bounds_stream: &Stream<C, Box<TS>>,
retain_val_func: Box<dyn Fn(&TS) -> Filter<B::Val>>,
n: usize,
) where
B: Batch<Time = ()>,
TS: DataTrait + ?Sized,
Box<TS>: Clone,
{
let bounds = self.accumulate_trace_bounds();
bounds.set_unique_val_bound_name(bounds_stream.get_persistent_id().as_deref());

// Re-derive the retainment filter every time a new bound value arrives.
bounds_stream.inspect(move |ts| {
let filter = GroupFilter::LastN(n, retain_val_func(ts.as_ref()), val_factory);
bounds.set_val_filter(filter);
});
}
Expand Down
165 changes: 148 additions & 17 deletions crates/dbsp/src/operator/dynamic/asof_join.rs
Original file line number Diff line number Diff line change
Expand Up @@ -717,10 +717,14 @@ where

#[cfg(test)]
mod test {
use std::cmp::{max, min};

use crate::{
DBData, DBSPHandle, OrdIndexedZSet, OrdZSet, OutputHandle, Runtime, ZSetHandle, ZWeight,
DBData, DBSPHandle, OrdIndexedZSet, OrdZSet, OutputHandle, Runtime, TypedBox, ZSetHandle,
ZWeight,
algebra::F32,
circuit::CircuitConfig,
dynamic::DowncastTrait,
typed_batch::IndexedZSetReader,
utils::{Tup2, Tup3, Tup4},
zset,
Expand Down Expand Up @@ -782,6 +786,111 @@ mod test {
.unwrap()
}

/// Like `test_circuit`, but additionally garbage collects both sides of the ASOF join.
///
/// Builds a two-worker circuit that asof-joins transactions with users,
/// applies retention policies driven by a waterline to both input traces,
/// and asserts inside the circuit that the GC'd join matches a reference
/// implementation computed over the full (un-GC'd) integrals.
fn test_circuit_with_waterline() -> (
DBSPHandle,
(
ZSetHandle<Transaction>,
ZSetHandle<User>,
OutputHandle<OrdZSet<Output>>,
),
) {
Runtime::init_circuit(CircuitConfig::with_workers(2), |circuit| {
let (transactions, transactions_handle) = circuit.add_input_zset::<Transaction>();
let (users, users_handle) = circuit.add_input_zset::<User>();

// Index both sides by credit-card number (field .1), the join key.
let transactions = transactions.map_index(|transaction| (transaction.1, *transaction));
let users = users.map_index(|user| (user.1, user.clone()));

// Per-stream waterline: the max timestamp seen so far, lowered by
// LATENESS to allow out-of-order inputs.
let user_waterline = users
.waterline(
|| u64::MIN,
|_k, Tup3(ts, _, _)| {
// println!("{} ts: {:?}", Runtime::worker_index(), *ts);
(*ts).saturating_sub(LATENESS)
},
|ts1, ts2| {
// println!("{} max({:?}, {:?})", Runtime::worker_index(), ts1, ts2);
max(*ts1, *ts2)
},
)
/*.inspect(move |waterline: &TypedBox<u64, DynData>| {
println!(
"user waterline: {:?}",
waterline.inner().downcast_checked::<u64>()
);
})*/;

let transaction_waterline = transactions
.waterline(
|| u64::MIN,
|_k, Tup3(ts, _, _)| (*ts).saturating_sub(LATENESS),
|ts1, ts2| max(*ts1, *ts2),
)
/*.inspect(move |waterline: &TypedBox<u64, DynData>| {
println!(
"transaction waterline: {:?}",
waterline.inner().downcast_checked::<u64>()
);
})*/;

// Use the minimum of the two waterlines as the common retention
// bound for both traces.
let waterline = transaction_waterline
.apply2(&user_waterline, |ts1, ts2| {
TypedBox::new(min(unsafe { *ts1.inner().downcast::<u64>() }, unsafe {
*ts2.inner().downcast::<u64>()
}))
})
/*.inspect(move |waterline: &TypedBox<u64, DynData>| {
println!(
"waterline: {:?}",
waterline.inner().downcast_checked::<u64>()
);
})*/;

let join = |_key: &CCNum, transaction: &Transaction, user: Option<&User>| {
Tup4(
transaction.0,
transaction.1,
transaction.2,
user.map(|u| u.2.clone()),
)
};
// Timestamps used by the asof join: field .0 of each record.
let ts_func1 = |transaction: &Transaction| transaction.0;
let ts_func2 = |user: &User| user.0;

let result = transactions.asof_join(&users, join, ts_func1, ts_func2);

// GC the transaction trace: keep only transactions at or above the bound.
transactions.accumulate_integrate_trace_retain_values(
&waterline,
|transaction: &Transaction, ts: &u64| transaction.0 >= *ts,
);

// GC the user trace: keep users at or above the bound plus the
// latest one below it (n = 1), which the asof join may still match.
users.accumulate_integrate_trace_retain_values_last_n(
&waterline,
|user: &User, ts: &u64| user.0 >= *ts,
1,
);

// Reference result computed from the full, un-GC'd integrals.
let expected_result = transactions
.shard()
.integrate()
.apply2(&users.shard().integrate(), move |t, u| {
asof_join_reference(t, u, join, ts_func1, ts_func2)
});

// The GC'd circuit must produce exactly the reference output.
result
.integrate()
.apply2(&expected_result, |actual, expected| {
assert_eq!(actual, expected)
});

let output_handle = result.output();

Ok((transactions_handle, users_handle, output_handle))
})
.unwrap()
}

#[test]
fn asof_join_test() {
let (mut dbsp, (transactions, users, result)) = test_circuit();
Expand Down Expand Up @@ -1011,43 +1120,53 @@ mod test {
OrdZSet::from_keys((), result)
}

/// We generate both input streams to the asof join to have this lateness.
const LATENESS: u64 = 20;

// Generate a transaction for step `step`. Adds the value of `step` to
// a randomly generated timestamp in order to make sure that the waterline
// of the stream moves forward.
prop_compose! {
fn transaction()
(time in 0..100u64,
fn transaction(step: usize)
(time in 0..LATENESS,
cc_num in 0..10u64,
amt in 0..100i32,
w in 1..=2 as ZWeight)
-> Tup2<Transaction, ZWeight> {
Tup2(Tup3(time, cc_num, F32::new(amt as f32)), w)
Tup2(Tup3(step as u64 + time, cc_num, F32::new(amt as f32)), w)
}
}

// Generate a random user for step `step`. Adds the value of `step` to
// a randomly generated timestamp in order to make sure that the waterline
// of the stream moves forward.
prop_compose! {
fn user()
(time in 0..100u64,
fn user(step: usize)
(time in 0..LATENESS,
cc_num in 0..5u64,
name in "[A-Z][a-z]{5}",
w in 1..=2 as ZWeight)
-> Tup2<User, ZWeight> {
Tup2(Tup3(time, cc_num, name), w)
Tup2(Tup3(step as u64 + time, cc_num, name), w)
}
}

// Generates an array of transactions and an array of users to feed to the circuit during step `step`.
prop_compose! {
fn input()
(transactions in vec(transaction(), 0..20),
users in vec(user(), 0..10))
fn input(step: usize)
(transactions in vec(transaction(step), 0..20),
users in vec(user(step), 0..10))
-> (Vec<Tup2<Transaction, ZWeight>>, Vec<Tup2<User, ZWeight>>) {
(transactions, users)
}
}

prop_compose! {
fn inputs(steps: usize)
(inputs in vec(input(), 0..=steps))
-> Vec<(Vec<Tup2<Transaction,ZWeight>>, Vec<Tup2<User, ZWeight>>)> {
inputs
}
/// Generate inputs to the test circuit for `steps` steps.
fn inputs(
steps: usize,
) -> impl Strategy<Value = Vec<(Vec<Tup2<Transaction, ZWeight>>, Vec<Tup2<User, ZWeight>>)>>
{
(0..steps).map(input).collect::<Vec<_>>().prop_map(|v| v)
}

proptest! {
Expand All @@ -1061,7 +1180,7 @@ mod test {
*w = -*w;
}
for Tup2(_u, w) in us.iter_mut() {
*w = -*w;
*w = -*w;
}
}

Expand All @@ -1079,5 +1198,17 @@ mod test {
dbsp.transaction().unwrap();
}
}

#[test]
fn asof_join_with_waterline_proptest(inputs in inputs(100)) {
// Drives the GC-enabled circuit; correctness is asserted inside the
// circuit itself, which compares the join output against a reference
// implementation at every step.
let (mut dbsp, (htransactions, husers, _hresult)) = test_circuit_with_waterline();

for (mut transactions, mut users) in inputs {
htransactions.append(&mut transactions);
husers.append(&mut users);

dbsp.transaction().unwrap();
}
}
}
}
Loading
Loading