@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
257257 ... 'n_legs': [2, 2, 4, 4, 5, 100],
258258 ... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
259259 ... "Brittle stars", "Centipede"]})
260- >>>
260+ >>>
261261 >>> import pyarrow.parquet as pq
262262 >>> pq.write_table(table, "dataset_scanner.parquet")
263263
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
12211221 ----------
12221222 parse_options : pyarrow.csv.ParseOptions
12231223 Options regarding CSV parsing.
1224+ default_fragment_scan_options : CsvFragmentScanOptions
1225+ Default options for fragment scans.
12241226 convert_options : pyarrow.csv.ConvertOptions
12251227 Options regarding value conversion.
12261228 read_options : pyarrow.csv.ReadOptions
12271229 General read options.
1228- default_fragment_scan_options : CsvFragmentScanOptions
1229- Default options for fragments scan.
12301230 """
12311231 cdef:
12321232 CCsvFileFormat* csv_format
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
23152315 projections.
23162316
23172317 The list of columns or expressions may use the special fields
2318- `__batch_index` (the index of the batch within the fragment),
2319- `__fragment_index` (the index of the fragment within the dataset),
2318+ `__batch_index` (the index of the batch within the fragment),
2319+ `__fragment_index` (the index of the fragment within the dataset),
23202320 `__last_in_fragment` (whether the batch is last in fragment), and
2321- `__filename` (the name of the source file or a description of the
2321+ `__filename` (the name of the source file or a description of the
23222322 source fragment).
23232323
23242324 The columns will be passed down to Datasets and corresponding data
23252325 fragments to avoid loading, copying, and deserializing columns
23262326 that will not be required further down the compute chain.
2327- By default all of the available columns are projected.
2328- Raises an exception if any of the referenced column names does
2327+ By default all of the available columns are projected.
2328+ Raises an exception if any of the referenced column names does
23292329 not exist in the dataset's Schema.
23302330 filter : Expression, default None
23312331 Scan will return only the rows matching the filter.
@@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
23382338 record batches are overflowing memory then this method can be
23392339 called to reduce their size.
23402340 batch_readahead : int, default 16
2341- The number of batches to read ahead in a file. Increasing this number
2342- will increase RAM usage but could also improve IO utilization.
2341+ The number of batches to read ahead in a file. This might not work
2342+ for all file formats. Increasing this number will increase
2343+ RAM usage but could also improve IO utilization.
23432344 fragment_readahead : int, default 4
23442345 The number of files to read ahead. Increasing this number will increase
23452346 RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@ cdef class Scanner(_Weakrefable):
23752376 return self .wrapped
23762377
23772378 @staticmethod
2378- def from_dataset (Dataset dataset not None ,
2379- bint use_threads = True , object use_async = None ,
2380- MemoryPool memory_pool = None ,
2381- object columns = None , Expression filter = None ,
2382- int batch_size = _DEFAULT_BATCH_SIZE,
2379+ def from_dataset (Dataset dataset not None , *, object columns = None ,
2380+ Expression filter = None , int batch_size = _DEFAULT_BATCH_SIZE,
23832381 int batch_readahead = _DEFAULT_BATCH_READAHEAD,
23842382 int fragment_readahead = _DEFAULT_FRAGMENT_READAHEAD,
2385- FragmentScanOptions fragment_scan_options = None ):
2383+ FragmentScanOptions fragment_scan_options = None ,
2384+ bint use_threads = True , object use_async = None ,
2385+ MemoryPool memory_pool = None ):
23862386 """
23872387 Create Scanner from Dataset.
23882388
@@ -2397,10 +2397,10 @@ cdef class Scanner(_Weakrefable):
23972397 projections.
23982398
23992399 The list of columns or expressions may use the special fields
2400- `__batch_index` (the index of the batch within the fragment),
2401- `__fragment_index` (the index of the fragment within the dataset),
2400+ `__batch_index` (the index of the batch within the fragment),
2401+ `__fragment_index` (the index of the fragment within the dataset),
24022402 `__last_in_fragment` (whether the batch is last in fragment), and
2403- `__filename` (the name of the source file or a description of the
2403+ `__filename` (the name of the source file or a description of the
24042404 source fragment).
24052405
24062406 The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@ cdef class Scanner(_Weakrefable):
24262426 fragment_readahead : int, default 4
24272427 The number of files to read ahead. Increasing this number will increase
24282428 RAM usage but could also improve IO utilization.
2429+ fragment_scan_options : FragmentScanOptions, default None
2430+ Options specific to a particular scan and fragment type, which
2431+ can change between different scans of the same dataset.
24292432 use_threads : bool, default True
24302433 If enabled, then maximum parallelism will be used determined by
24312434 the number of available CPU cores.
@@ -2436,9 +2439,6 @@ cdef class Scanner(_Weakrefable):
24362439 memory_pool : MemoryPool, default None
24372440 For memory allocations, if required. If not specified, uses the
24382441 default pool.
2439- fragment_scan_options : FragmentScanOptions, default None
2440- Options specific to a particular scan and fragment type, which
2441- can change between different scans of the same dataset.
24422442 """
24432443 cdef:
24442444 shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@ cdef class Scanner(_Weakrefable):
24612461 return Scanner.wrap(scanner)
24622462
24632463 @staticmethod
2464- def from_fragment (Fragment fragment not None , Schema schema = None ,
2465- bint use_threads = True , object use_async = None ,
2466- MemoryPool memory_pool = None ,
2464+ def from_fragment (Fragment fragment not None , *, Schema schema = None ,
24672465 object columns = None , Expression filter = None ,
24682466 int batch_size = _DEFAULT_BATCH_SIZE,
24692467 int batch_readahead = _DEFAULT_BATCH_READAHEAD,
2470- FragmentScanOptions fragment_scan_options = None ):
2468+ FragmentScanOptions fragment_scan_options = None ,
2469+ bint use_threads = True , object use_async = None ,
2470+ MemoryPool memory_pool = None ,):
24712471 """
24722472 Create Scanner from Fragment.
24732473
@@ -2484,10 +2484,10 @@ cdef class Scanner(_Weakrefable):
24842484 projections.
24852485
24862486 The list of columns or expressions may use the special fields
2487- `__batch_index` (the index of the batch within the fragment),
2488- `__fragment_index` (the index of the fragment within the dataset),
2487+ `__batch_index` (the index of the batch within the fragment),
2488+ `__fragment_index` (the index of the fragment within the dataset),
24892489 `__last_in_fragment` (whether the batch is last in fragment), and
2490- `__filename` (the name of the source file or a description of the
2490+ `__filename` (the name of the source file or a description of the
24912491 source fragment).
24922492
24932493 The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@ cdef class Scanner(_Weakrefable):
25102510 The number of batches to read ahead in a file. This might not work
25112511 for all file formats. Increasing this number will increase
25122512 RAM usage but could also improve IO utilization.
2513+ fragment_scan_options : FragmentScanOptions, default None
2514+ Options specific to a particular scan and fragment type, which
2515+ can change between different scans of the same dataset.
25132516 use_threads : bool, default True
25142517 If enabled, then maximum parallelism will be used determined by
25152518 the number of available CPU cores.
@@ -2520,9 +2523,6 @@ cdef class Scanner(_Weakrefable):
25202523 memory_pool : MemoryPool, default None
25212524 For memory allocations, if required. If not specified, uses the
25222525 default pool.
2523- fragment_scan_options : FragmentScanOptions, default None
2524- Options specific to a particular scan and fragment type, which
2525- can change between different scans of the same dataset.
25262526 """
25272527 cdef:
25282528 shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@ cdef class Scanner(_Weakrefable):
25492549 return Scanner.wrap(scanner)
25502550
25512551 @staticmethod
2552- def from_batches (source , Schema schema = None , bint use_threads = True ,
2553- object use_async = None , MemoryPool memory_pool = None ,
2554- object columns = None , Expression filter = None ,
2555- int batch_size = _DEFAULT_BATCH_SIZE ,
2556- FragmentScanOptions fragment_scan_options = None ):
2552+ def from_batches (source , *, Schema schema = None , object columns = None ,
2553+ Expression filter = None , int batch_size = _DEFAULT_BATCH_SIZE ,
2554+ FragmentScanOptions fragment_scan_options = None ,
2555+ bint use_threads = True , object use_async = None ,
2556+ MemoryPool memory_pool = None ):
25572557 """
25582558 Create a Scanner from an iterator of batches.
25592559
@@ -2574,6 +2574,8 @@ cdef class Scanner(_Weakrefable):
25742574 Scan will return only the rows matching the filter.
25752575 batch_size : int, default 128Ki
25762576 The maximum row count for scanned record batches.
2577+ fragment_scan_options : FragmentScanOptions, default None
2578+ Options specific to a particular scan and fragment type.
25772579 use_threads : bool, default True
25782580 If enabled, then maximum parallelism will be used determined by
25792581 the number of available CPU cores.
@@ -2584,8 +2586,6 @@ cdef class Scanner(_Weakrefable):
25842586 memory_pool : MemoryPool, default None
25852587 For memory allocations, if required. If not specified, uses the
25862588 default pool.
2587- fragment_scan_options : FragmentScanOptions
2588- The fragment scan options.
25892589 """
25902590 cdef:
25912591 shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
0 commit comments