Skip to content

Commit ea296cd

Browse files
AlenkaF, kszucs, and jorisvandenbossche
committed
ARROW-16240: [Python] Support row_group_size/chunk_size keyword in pq.write_to_dataset with use_legacy_dataset=False
Closes apache#12955 from AlenkaF/ARROW-16240

Lead-authored-by: Alenka Frim <frim.alenka@gmail.com>
Co-authored-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
Co-authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Krisztián Szűcs <szucs.krisztian@gmail.com>
1 parent 98b324d commit ea296cd

2 files changed

Lines changed: 9 additions & 2 deletions

File tree

python/pyarrow/parquet/__init__.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3109,6 +3109,12 @@ def file_visitor(written_file):
31093109
# extract non-file format options
31103110
schema = kwargs.pop("schema", None)
31113111
use_threads = kwargs.pop("use_threads", True)
3112+
chunk_size = kwargs.pop("chunk_size", None)
3113+
row_group_size = kwargs.pop("row_group_size", None)
3114+
3115+
row_group_size = (
3116+
row_group_size if row_group_size is not None else chunk_size
3117+
)
31123118

31133119
# raise for unsupported keywords
31143120
msg = (
@@ -3147,7 +3153,8 @@ def file_visitor(written_file):
31473153
partitioning=partitioning, use_threads=use_threads,
31483154
file_visitor=file_visitor,
31493155
basename_template=basename_template,
3150-
existing_data_behavior=existing_data_behavior)
3156+
existing_data_behavior=existing_data_behavior,
3157+
max_rows_per_group=row_group_size)
31513158
return
31523159

31533160
# warnings and errors when using legacy implementation

python/pyarrow/tests/test_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -971,7 +971,7 @@ def _create_dataset_for_fragments(tempdir, chunk_size=None, filesystem=None):
971971
path = str(tempdir / "test_parquet_dataset")
972972

973973
# write_to_dataset currently requires pandas
974-
pq.write_to_dataset(table, path, use_legacy_dataset=True,
974+
pq.write_to_dataset(table, path,
975975
partition_cols=["part"], chunk_size=chunk_size)
976976
dataset = ds.dataset(
977977
path, format="parquet", partitioning="hive", filesystem=filesystem

0 commit comments

Comments
 (0)