@@ -3018,7 +3018,8 @@ def write_to_dataset(table, root_path, partition_cols=None,
30183018 use_threads = None , file_visitor = None ,
30193019 existing_data_behavior = None ,
30203020 ** kwargs ):
3021- """Wrapper around parquet.write_table for writing a Table to
3021+ """Wrapper around dataset.write_dataset (when use_legacy_dataset=False) or
3022+ parquet.write_table (when use_legacy_dataset=True) for writing a Table to
30223023 Parquet format by partitions.
30233024 For each combination of partition columns and values,
30243025 a subdirectory is created in the following
@@ -3052,6 +3053,9 @@ def write_to_dataset(table, root_path, partition_cols=None,
30523053 A callback function that takes the partition key(s) as an argument
30533054 and allows you to override the partition filename. If nothing is
30543055 passed, the filename will consist of a uuid.
3056+ This option is only supported for use_legacy_dataset=True.
3057+ When use_legacy_dataset=None and this option is specified,
3058+ use_legacy_dataset will be set to True.
30553059 use_legacy_dataset : bool
30563060 Default is False. Set to True to use the legacy behaviour
30573061 (this option is deprecated, and the legacy implementation will be
@@ -3061,17 +3065,21 @@ def write_to_dataset(table, root_path, partition_cols=None,
30613065 use_threads : bool, default True
30623066 Write files in parallel. If enabled, then maximum parallelism will be
30633067 used, as determined by the number of available CPU cores.
3068+ This option is only supported for use_legacy_dataset=False.
30643069 schema : Schema, optional
3070+ This option is only supported for use_legacy_dataset=False.
30653071 partitioning : Partitioning or list[str], optional
30663072 The partitioning scheme specified with the
30673073 ``pyarrow.dataset.partitioning()`` function or a list of field names.
30683074 When providing a list of field names, you can use
30693075 ``partitioning_flavor`` to drive which partitioning type should be
30703076 used.
3077+ This option is only supported for use_legacy_dataset=False.
30713078 basename_template : str, optional
30723079 A template string used to generate basenames of written data files.
30733080 The token '{i}' will be replaced with an automatically incremented
30743081 integer. If not specified, it defaults to "guid-{i}.parquet".
3082+ This option is only supported for use_legacy_dataset=False.
30753083 file_visitor : function
30763084 If set, this function will be called with a WrittenFile instance
30773085 for each file created during the call. This object will have both
@@ -3091,16 +3099,12 @@ def write_to_dataset(table, root_path, partition_cols=None,
30913099
30923100 def file_visitor(written_file):
30933101 visited_paths.append(written_file.path)
3102+ This option is only supported for use_legacy_dataset=False.
30943103 existing_data_behavior : 'overwrite_or_ignore' | 'error' | \
30953104 'delete_matching'
30963105 Controls how the dataset will handle data that already exists in
30973106 the destination. The default behaviour is 'overwrite_or_ignore'.
30983107
3099- Only used in the new code path using the new Arrow Dataset API
3100- (``use_legacy_dataset=False``). In case the legacy implementation
3101- is selected the parameter is ignored as the old implementation does
3102- not support it (only has the default behaviour).
3103-
31043108 'overwrite_or_ignore' will ignore any existing data and will
31053109 overwrite files with the same name as an output file. Other
31063110 existing files will be ignored. This behavior, in combination
@@ -3113,9 +3117,15 @@ def file_visitor(written_file):
31133117 dataset. The first time each partition directory is encountered
31143118 the entire directory will be deleted. This allows you to overwrite
31153119 old partitions completely.
3120+ This option is only supported for use_legacy_dataset=False.
31163121 **kwargs : dict
3117- Additional kwargs for write_table function. See docstring for
3118- `write_table` or `ParquetWriter` for more information.
3122+ When use_legacy_dataset=False, used as additional kwargs for
3123+ `dataset.write_dataset` function (passed to
3124+ `ParquetFileFormat.make_write_options`). See the docstring
3125+ of `write_table` for the available options.
3126+ When use_legacy_dataset=True, used as additional kwargs for
3127+ `parquet.write_table` function (See docstring for `write_table`
3128+ or `ParquetWriter` for more information).
31193129 Using `metadata_collector` in kwargs allows one to collect the
31203130 file metadata instances of dataset pieces. The file paths in the
31213131 ColumnChunkMetaData will be set relative to `root_path`.
0 commit comments