Skip to content

Commit fe4a83d

Browse files
committed
fixup! [HSTACK] Deep column projections
1 parent 1e2ac8d commit fe4a83d

10 files changed

Lines changed: 26 additions & 133 deletions

File tree

datafusion/common/src/deep.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ pub fn data_type_recurs(dt: &DataType) -> bool {
6060
DataType::LargeList(f) => data_type_recurs(f.data_type()),
6161
DataType::LargeListView(f) => data_type_recurs(f.data_type()),
6262
// list of struct
63-
DataType::Map(f, _) => true,
63+
DataType::Map(_, _) => true,
6464
DataType::Struct(_) => true,
6565
}
6666
}
@@ -108,7 +108,7 @@ pub fn rewrite_record_batch_fields(
108108
out_arrays.push(tmp_array);
109109
out_fields.push(dst_field);
110110
} else if error_on_missing_source_fields {
111-
return Err(crate::DataFusionError::Internal(format!(
111+
return Err(DataFusionError::Internal(format!(
112112
"field {dst_name} not found in source"
113113
)));
114114
}
@@ -537,7 +537,7 @@ pub fn has_deep_projection(possible: Option<&HashMap<usize, Vec<String>>>) -> bo
537537
return false;
538538
}
539539
let tmp = possible.unwrap();
540-
!(tmp.is_empty() || tmp.iter().all(|(k, v)| v.len() == 0))
540+
!(tmp.is_empty() || tmp.iter().all(|(_k, v)| v.len() == 0))
541541
}
542542

543543
/// Combines the current projection (numeric indices of top-level columns) with
@@ -667,7 +667,7 @@ pub fn fix_possible_field_accesses(schema: &DFSchemaRef, field_idx: usize, rest:
667667
(false, true, new_field)
668668
}
669669
DataType::Map(inner_map, _) => {
670-
let mut new_field: Option<FieldRef> = None;
670+
let new_field: Option<FieldRef>;
671671
match inner_map.data_type() {
672672
DataType::Struct(inner_map_struct) => {
673673
new_field = Some(inner_map_struct[1].clone());

datafusion/core/src/datasource/listing/table.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,8 @@
1717

1818
//! The table implementation.
1919
20-
use std::collections::HashMap;
2120
use std::{any::Any, str::FromStr, sync::Arc};
22-
21+
use std::collections::HashMap;
2322
use super::helpers::{expr_applicable_for_cols, pruned_partition_list, split_files};
2423
use super::{ListingTableUrl, PartitionedFile};
2524

datafusion/core/src/datasource/physical_plan/file_scan_config.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,6 @@ impl FileScanConfig {
250250
}
251251
Some(projection_deep) => {
252252
trace!("FileScanConfig::project DEEP PROJECT");
253-
let field_arc = Arc::new(self.file_schema.field(idx).clone());
254253
let rewritten_field_arc = rewrite_field_projection(self.file_schema.clone(), idx, &projection_deep);
255254
trace!("FileScanConfig::project DEEP PROJECT {:#?}", rewritten_field_arc);
256255
rewritten_field_arc.as_ref().clone()

datafusion/core/src/datasource/physical_plan/parquet/opener.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ fn create_initial_plan(
345345
// But, we neeed to walk through both the arrow schema (which KNOWS about the map type)
346346
// and the parquet leaves to do this correctly.
347347
fn equivalent_projection_paths_from_parquet_schema(
348-
arrow_schema: SchemaRef,
348+
_arrow_schema: SchemaRef,
349349
parquet_schema: &SchemaDescriptor,
350350
) -> Vec<(usize, (String, String))> {
351351
let mut output: Vec<(usize, (String, String))> = vec![];

datafusion/core/src/datasource/schema_adapter_deep.rs

Lines changed: 11 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,112 +1,11 @@
1-
use crate::datasource::schema_adapter::{SchemaAdapter, SchemaMapper};
2-
use arrow_array::RecordBatch;
3-
use arrow_schema::{Fields, Schema, SchemaRef};
4-
use datafusion_common::deep::{can_rewrite_field, try_rewrite_record_batch, try_rewrite_record_batch_with_mappings};
5-
use datafusion_common::plan_err;
6-
use std::sync::Arc;
7-
use log::trace;
8-
9-
#[derive(Clone, Debug)]
10-
pub(crate) struct NestedSchemaAdapter {
11-
/// The schema for the table, projected to include only the fields being output (projected) by the
12-
/// associated ParquetExec
13-
pub projected_table_schema: SchemaRef,
14-
/// The entire table schema for the table we're using this to adapt.
15-
///
16-
/// This is used to evaluate any filters pushed down into the scan
17-
/// which may refer to columns that are not referred to anywhere
18-
/// else in the plan.
19-
pub table_schema: SchemaRef,
20-
}
21-
22-
impl NestedSchemaAdapter {
23-
fn map_schema_nested(
24-
&self,
25-
fields: &Fields,
26-
) -> datafusion_common::Result<(Arc<NestedSchemaMapping>, Vec<usize>)> {
27-
let mut projection = Vec::with_capacity(fields.len());
28-
let mut field_mappings = vec![None; self.table_schema.fields().len()];
29-
30-
// start from the destination fields
31-
for (table_idx, table_field) in self.table_schema.fields.iter().enumerate() {
32-
// if the file exists in the source, check if we can rewrite it to the destination,
33-
// and add it to the projections
34-
if let Some((file_idx, file_field)) = fields.find(table_field.name()) {
35-
if can_rewrite_field(table_field.clone(), file_field.clone(), true) {
36-
field_mappings[table_idx] = Some(projection.len());
37-
projection.push(file_idx);
38-
} else {
39-
return plan_err!(
40-
"Cannot cast file schema field {} of type {:?} to table schema field of type {:?}",
41-
file_field.name(),
42-
file_field.data_type(),
43-
table_field.data_type()
44-
);
45-
}
46-
}
47-
}
48-
Ok((
49-
Arc::new(NestedSchemaMapping {
50-
projected_table_schema: self.projected_table_schema.clone(),
51-
field_mappings,
52-
table_schema: self.table_schema.clone(),
53-
}),
54-
projection,
55-
))
56-
}
57-
}
58-
59-
impl SchemaAdapter for NestedSchemaAdapter {
60-
fn map_column_index(&self, index: usize, file_schema: &Schema) -> Option<usize> {
61-
let field = self.projected_table_schema.field(index);
62-
Some(file_schema.fields.find(field.name())?.0)
63-
}
1+
//! TODO: module doc
642
65-
fn map_schema(
66-
&self,
67-
file_schema: &Schema,
68-
) -> datafusion_common::Result<(Arc<dyn SchemaMapper>, Vec<usize>)> {
69-
// self.map_schema_nested(file_schema.fields())
70-
// .map(|(s, v)| (s as Arc<dyn SchemaMapper>, v))
71-
trace!(target: "deep", "map_schema: file_schema: {:#?}", file_schema);
72-
trace!(target: "deep", "map_schema: table_schema: {:#?}", self.table_schema);
73-
trace!(target: "deep", "map_schema: projected_table_schema: {:#?}", self.projected_table_schema);
74-
75-
let mut projection = Vec::with_capacity(file_schema.fields().len());
76-
let mut field_mappings = vec![None; self.projected_table_schema.fields().len()];
77-
78-
for (file_idx, file_field) in file_schema.fields.iter().enumerate() {
79-
if let Some((table_idx, table_field)) =
80-
self.projected_table_schema.fields().find(file_field.name())
81-
{
82-
match can_rewrite_field(table_field.clone(), file_field.clone(), true) {
83-
true => {
84-
field_mappings[table_idx] = Some(projection.len());
85-
projection.push(file_idx);
86-
}
87-
false => {
88-
return plan_err!(
89-
"Cannot cast file schema field {} of type {:?} to table schema field of type {:?}",
90-
file_field.name(),
91-
file_field.data_type(),
92-
table_field.data_type()
93-
)
94-
}
95-
}
96-
}
97-
}
98-
99-
Ok((
100-
Arc::new(NestedSchemaMapping {
101-
projected_table_schema: self.projected_table_schema.clone(),
102-
field_mappings,
103-
table_schema: self.table_schema.clone(),
104-
}),
105-
projection,
106-
))
107-
}
108-
}
3+
use crate::datasource::schema_adapter::SchemaMapper;
4+
use arrow_array::RecordBatch;
5+
use arrow_schema::SchemaRef;
6+
use datafusion_common::deep::{try_rewrite_record_batch, try_rewrite_record_batch_with_mappings};
1097

8+
/// TODO: struct doc
1109
#[derive(Debug)]
11110
pub struct NestedSchemaMapping {
11211
/// The schema of the table. This is the expected schema after conversion and it should match
@@ -221,7 +120,7 @@ mod tests {
221120
true,
222121
),
223122
]));
224-
let out = rewrite_schema(
123+
let _ = rewrite_schema(
225124
schema,
226125
&vec![1],
227126
&HashMap::from([
@@ -237,7 +136,7 @@ mod tests {
237136
async fn test_rewrite() -> crate::error::Result<()> {
238137
let _ = env_logger::try_init();
239138

240-
let message_type = "
139+
let _message_type = "
241140
message schema {
242141
REQUIRED INT32 int1;
243142
OPTIONAL INT32 int2;
@@ -634,13 +533,13 @@ mod tests {
634533
let _ = env_logger::try_init();
635534
let ctx = SessionContext::new();
636535

637-
let dfr = ctx
536+
let _dfr = ctx
638537
.sql(
639538
r#"
640539
create external table
641540
test
642541
stored as parquet
643-
location '/Users/adragomi/work/arrow/benchmark/profile_export_prod_delta/part-00001-1b493913-ef97-4da6-9f8c-da1506b378f1-c000.snappy.parquet'
542+
location '../benchmark/adobe_1day_sorted.parquet'
644543
"#,
645544
)
646545
.await
@@ -672,7 +571,7 @@ mod tests {
672571
let results = df
673572
.collect()
674573
.await?;
675-
print_batches(results.as_slice());
574+
print_batches(results.as_slice()).ok();
676575
info!("results: {}", results.len());
677576

678577
Ok(())

datafusion/expr/src/logical_plan/tree_node.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -614,7 +614,7 @@ impl LogicalPlan {
614614
table_name,
615615
source,
616616
projection,
617-
projection_deep
617+
projection_deep,
618618
projected_schema,
619619
filters,
620620
fetch,

datafusion/optimizer/src/optimize_projections/mod.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,10 @@ use std::sync::Arc;
2626
use crate::optimizer::ApplyOrder;
2727
use crate::{OptimizerConfig, OptimizerRule};
2828
use recursive::recursive;
29-
use std::collections::HashSet;
30-
use std::sync::Arc;
3129

3230
use datafusion_common::{
3331
get_required_group_by_exprs_indices, internal_datafusion_err, internal_err, Column,
34-
HashMap, JoinType, Result,
32+
JoinType, Result,
3533
};
3634
use datafusion_expr::expr::Alias;
3735
use datafusion_expr::Unnest;
@@ -281,7 +279,7 @@ fn optimize_projections(
281279

282280
let (new_projection, new_projection_deep) = if projection.is_some() {
283281
let projection_clone = projection.unwrap().clone();
284-
let projection_deep_clone = projection_deep.clone();
282+
let _projection_deep_clone = projection_deep.clone();
285283
indices.into_mapped_indices_deep(|idx| projection_clone[idx])
286284
} else {
287285
indices.into_inner_deep()
@@ -487,7 +485,7 @@ fn merge_consecutive_projections(proj: Projection) -> Result<Transformed<Project
487485
};
488486

489487
// Count usages (referrals) of each projection expression in its input fields:
490-
let mut column_referral_map = HashMap::<&Column, usize>::new();
488+
let mut column_referral_map = datafusion_common::HashMap::<&Column, usize>::new();
491489
expr.iter()
492490
.for_each(|expr| expr.add_column_ref_counts(&mut column_referral_map));
493491

datafusion/optimizer/src/optimize_projections/required_indices.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,6 @@ impl RequiredIndicies {
6666

6767
/// Create a new instance with the specified indices as required
6868
pub fn new_from_indices(indices: Vec<usize>) -> Self {
69-
let indices_len = indices.len();
7069
Self {
7170
indices,
7271
deep_indices: HashMap::new(),

datafusion/physical-plan/src/joins/hash_join.rs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,6 @@ use datafusion_expr::Operator;
7676
use datafusion_physical_expr_common::datum::compare_op_for_nested;
7777
use futures::{ready, Stream, StreamExt, TryStreamExt};
7878
use parking_lot::Mutex;
79-
use crate::joins::utils::project_index_to_exprs;
8079

8180
type SharedBitmapBuilder = Mutex<BooleanBufferBuilder>;
8281

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -709,10 +709,10 @@ pub fn remap_join_projections_join_to_output(
709709
right: Arc<dyn ExecutionPlan>,
710710
join_type: &JoinType,
711711
projection: Option<Vec<usize>>,
712-
) -> datafusion_common::Result<Option<Vec<usize>>> {
712+
) -> Result<Option<Vec<usize>>> {
713713
match projection {
714714
Some(ref projection) => {
715-
let (join_schema, indices) = build_join_schema(
715+
let (join_schema, _) = build_join_schema(
716716
left.schema().as_ref(),
717717
right.schema().as_ref(),
718718
join_type
@@ -729,9 +729,9 @@ pub fn remap_join_projections_join_to_output(
729729
ProjectionMapping::try_new(&projection_exprs, &join_schema)?;
730730

731731
// projection mapping contains from and to, get the second one
732-
let dest_physical_exprs = projection_mapping.map.iter().map(|(f, t)| t.clone()).collect::<Vec<_>>();
732+
let dest_physical_exprs = projection_mapping.map.iter().map(|(_, t)| t.clone()).collect::<Vec<_>>();
733733
let dest_columns = dest_physical_exprs.iter().map(|pe| pe.as_any().downcast_ref::<Column>()).collect::<Vec<_>>();
734-
let output = dest_physical_exprs.iter().enumerate().map(|(idx, pe)| {
734+
let output = dest_physical_exprs.iter().enumerate().map(|(idx, _)| {
735735
// :Vec<(Arc<dyn PhysicalExpr>, String)>
736736
// (pe.clone(), dest_column.name().to_owned())
737737
let dest_column = dest_columns.get(idx).unwrap().unwrap();
@@ -752,7 +752,7 @@ pub fn project_index_to_exprs(
752752
.map(|index| {
753753
let field = schema.field(*index);
754754
(
755-
Arc::new(datafusion_physical_expr::expressions::Column::new(
755+
Arc::new(Column::new(
756756
field.name(),
757757
*index,
758758
)) as Arc<dyn PhysicalExpr>,

0 commit comments

Comments
 (0)