Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion crates/dbsp/src/circuit/metadata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ impl BatchSizeStats {
/// General metadata about an operator's execution
#[derive(Debug, Clone, PartialEq, Default, Serialize)]
pub struct OperatorMeta {
entries: Vec<(MetaLabel, MetaItem)>,
pub entries: Vec<(MetaLabel, MetaItem)>,
}

impl OperatorMeta {
Expand All @@ -145,6 +145,9 @@ impl OperatorMeta {
.map(|(_label, item)| item.clone())
}

/// Merges the mergeable entries in `other` into this operator metadata.
///
/// See [MetaItem::merge] to learn about merging metadata.
pub fn merge(&mut self, other: &Self) {
for (label, src) in &other.entries {
if src.is_mergeable() {
Expand Down Expand Up @@ -338,6 +341,7 @@ impl MetaItem {
}
}

/// Returns whether this kind of metadata item is mergeable.
pub fn is_mergeable(&self) -> bool {
matches!(
self,
Expand All @@ -349,6 +353,12 @@ impl MetaItem {
)
}

/// Attempts to merge `self` and `other` and returns the result if there is
/// one.
///
/// Counts are merged by adding them, percents are merged by adding the
/// numerator and denominator, and so on. Arbitrary strings and integers
/// can't be merged.
pub fn merge(&self, other: &Self) -> Option<Self> {
match (self, other) {
(Self::Count(a), Self::Count(b)) => Some(Self::Count(a + b)),
Expand Down
2 changes: 1 addition & 1 deletion crates/dbsp/src/storage/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
//! A "layer file" stores `n > 0` columns of data, each of which has a key type
//! `K[i]` and an auxiliary data type `A[i]`. Each column is arranged into
//! groups of rows, where column 0 forms a single group and each row in column
//! `i` is associated with a group of one or more rows in column `i + 1` (for
//! `i` is associated with a group of zero or more rows in column `i + 1` (for
//! `i + 1 < n`). A group contains sorted, unique values. A group cursor for
//! column `i` can move forward and backward by rows, seek forward and backward
//! by the key type `K[i]` or using a predicate based on `K[i]`, and (when `i +
Expand Down
31 changes: 27 additions & 4 deletions crates/dbsp/src/storage/file/format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@
//! [`IndexBlockHeader::bound_map_offset`] and the format in
//! [`IndexBlockHeader::bound_map_varint`].
//!
//! <a name="omitted-bounds">Omitted bounds</a>: Columns 1 and greater may
//! omit bounds, in which case the offsets of the omitted bounds in the bound
//! map are set to 0. A min-bound (the first in each pair) may be omitted for
//! a given child if the first row in that child is the first row in its row
//! group. A max-bound (the second in each pair) may be omitted for a given
//! child if the last row in that child is the last row in its row group. If
//! any bounds are omitted then [FileTrailer::incompatible_features] must
//! include [FileTrailer::OMITTED_BOUNDS].
//!
//! Column 0 may not omit bounds (it consists of a single row group so it
//! could only omit the very first and very last bound in any case).
//!
//! * An array of "row totals", one for each of
//! [`IndexBlockHeader::n_children`]. The first row total is the total number
//! of rows in the first child tree, the second row total is that plus the
Expand Down Expand Up @@ -181,11 +193,23 @@ pub struct FileTrailer {
///
/// If any of these bits are set, the version number must be at least 3.
///
/// No incompatible features are currently defined. This bitmap is for
/// future expansion.
/// [FileTrailer::OMITTED_BOUNDS] and [FileTrailer::EMPTY_ROW_GROUP] are the
/// only current incompatible features.
pub incompatible_features: u64,
}

impl FileTrailer {
    /// Feature bit for [FileTrailer::incompatible_features]: required
    /// whenever the file contains [omitted] bounds.
    ///
    /// [omitted]: crate::storage::file::format#omitted-bounds
    pub const OMITTED_BOUNDS: u64 = 1 << 0;

    /// Feature bit for [FileTrailer::incompatible_features]: required
    /// whenever a row group in any column other than column 0 has zero
    /// rows.
    pub const EMPTY_ROW_GROUP: u64 = 1 << 1;
}

/// Information about a column.
///
/// Embedded inside the [`FileTrailer`] block.
Expand All @@ -207,8 +231,7 @@ pub struct FileTrailerColumn {
#[brw(align_after = 4)]
pub node_type: NodeType,

/// Number of rows in the column. Column 0 may have any number of rows;
/// subsequent columns must each have more rows than the previous.
/// Number of rows in the column.
pub n_rows: u64,
}

Expand Down
78 changes: 42 additions & 36 deletions crates/dbsp/src/storage/file/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,10 @@ pub enum CorruptionError {
/// Invalid filter block location.
#[error("Invalid file block location ({0}).")]
InvalidFilterLocation(InvalidBlockLocation),

/// Missing bounds for row group that spans data block.
#[error("Missing bounds for row group that spans data block.")]
MissingBounds,
}

/// Reader for an array of [Varint]s in a storage file.
Expand Down Expand Up @@ -576,7 +580,7 @@ where
let row_groups = self.row_groups.as_ref().unwrap();
let start = row_groups.get(&self.raw, index);
let end = row_groups.get(&self.raw, index + 1);
if start < end {
if start <= end {
Ok(start..end)
} else {
Err(CorruptionError::InvalidRowGroup {
Expand Down Expand Up @@ -1069,10 +1073,15 @@ where
Err(CorruptionError::MissingRow(row).into())
}

unsafe fn get_bound(&self, index: usize, bound: &mut K) {
unsafe fn get_bound<'a>(&self, index: usize, bound: &'a mut K) -> Option<&'a mut K> {
unsafe {
let offset = self.bounds.get(&self.raw, index) as usize;
bound.deserialize_from_bytes(&self.raw, offset)
if offset != 0 {
bound.deserialize_from_bytes(&self.raw, offset);
Some(bound)
} else {
None
}
}
}

Expand Down Expand Up @@ -1102,12 +1111,16 @@ where
let row = self.get_row_bound(mid) + self.first_row;
let cmp = match range_compare(target_rows, row) {
Equal => {
self.get_bound(mid, bound);
let cmp = compare(bound);
if cmp == Equal {
return Some(mid / 2);
if let Some(bound) = self.get_bound(mid, bound) {
match compare(bound) {
Equal => return Some(mid / 2),
cmp => cmp,
}
} else if mid % 2 == 1 {
Less
} else {
Greater
}
cmp
}
cmp => cmp,
};
Expand Down Expand Up @@ -1147,8 +1160,10 @@ where
let mut end = self.n_children();
while *start < end {
let mid = start.midpoint(end);
self.get_bound(mid * 2, tmp_key);
if &targets[start_index] < tmp_key {
if self
.get_bound(mid * 2, tmp_key)
.is_some_and(|bound| &targets[start_index] < bound)
{
end = mid;
} else {
*start = mid + 1;
Expand All @@ -1170,15 +1185,16 @@ where
}

/// Returns the comparison of the largest bound key using `compare`, or
/// `None` if that bound was omitted.
unsafe fn compare_max<C>(&self, key_factory: &dyn Factory<K>, compare: &C) -> Ordering
unsafe fn compare_max<C>(&self, key_factory: &dyn Factory<K>, compare: &C) -> Option<Ordering>
where
C: Fn(&K) -> Ordering,
{
unsafe {
let mut ordering = Equal;
let mut ordering = None;
key_factory.with(&mut |key| {
self.get_bound(self.n_children() * 2 - 1, key);
ordering = compare(key);
ordering = self
.get_bound(self.n_children() * 2 - 1, key)
.map(|key| compare(key));
});
ordering
}
Expand Down Expand Up @@ -1527,11 +1543,12 @@ where
);
}

if file_trailer.incompatible_features != 0 {
return Err(CorruptionError::UnsupportedIncompatibleFeatures(
file_trailer.incompatible_features,
)
.into());
let unsupported_features = file_trailer.incompatible_features
& !(FileTrailer::OMITTED_BOUNDS | FileTrailer::EMPTY_ROW_GROUP);
if unsupported_features != 0 {
return Err(
CorruptionError::UnsupportedIncompatibleFeatures(unsupported_features).into(),
);
}

assert_eq!(factories.len(), file_trailer.columns.len());
Expand All @@ -1551,18 +1568,6 @@ where
expected: T::n_columns(),
});
}
for i in 1..columns.len() {
let prev_n_rows = columns[i - 1].n_rows;
let this_n_rows = columns[i].n_rows;
if this_n_rows < prev_n_rows {
return Err(CorruptionError::DecreasingRowCount {
column: i,
prev_n_rows,
this_n_rows,
}
.into());
}
}

let bloom_filter = match bloom_filter {
Some(bloom_filter) => Some(bloom_filter),
Expand Down Expand Up @@ -1799,7 +1804,6 @@ where
/// Returns `true` if the row group contains no rows.
///
/// The row group for column 0 is empty if and only if the layer file is
/// empty. A row group obtained from [`Cursor::next_column`] is never
/// empty.
pub fn is_empty(&self) -> bool {
self.rows.is_empty()
Expand Down Expand Up @@ -2308,9 +2312,8 @@ where
T: ColumnSpec,
{
/// Obtains the row group in the next column associated with the current
/// row. If the cursor is on a row, the returned row group will contain at
/// least one row. If the cursor is before or after the row group, the
/// returned row group will be empty.
/// row. If the cursor is before or after the row group, the returned row
/// group will be empty.
///
/// This method does not do I/O, but it can report [Error::Corruption].
pub fn next_column<'b>(&'b self) -> Result<RowGroup<'a, NK, NA, NN, T>, Error> {
Expand Down Expand Up @@ -2608,7 +2611,10 @@ where
// `index_block` and the greatest value under `index_block` is less
// than the target.
if rows.end > index_block.rows().end
&& index_block.compare_max(row_group.factories.key_factory, compare) == Greater
&& index_block
.compare_max(row_group.factories.key_factory, compare)
.ok_or(CorruptionError::MissingBounds)?
== Greater
{
rows.start = index_block.rows().end;
continue;
Expand Down
Loading