Skip to content

Commit 0f91e68

Browse files
authored
ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (apache#13983)
Adds an additional numpydoc check to CI (PR03) and fixes all corresponding violations. Note this does not fully resolve [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006). Authored-by: Bryce Mecum <petridish@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
1 parent f49f8ed commit 0f91e68

7 files changed

Lines changed: 136 additions & 137 deletions

File tree

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1068,7 +1068,7 @@ services:
10681068
["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
10691069
/arrow/ci/scripts/python_build.sh /arrow /build &&
10701070
pip install -e /arrow/dev/archery[numpydoc] &&
1071-
archery numpydoc --allow-rule PR01,PR10 &&
1071+
archery numpydoc --allow-rule PR01,PR03,PR10 &&
10721072
/arrow/ci/scripts/python_test.sh /arrow"]
10731073

10741074
conda-python-dask:

python/pyarrow/_csv.pyx

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -189,22 +189,22 @@ cdef class ReadOptions(_Weakrefable):
189189
self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults()))
190190

191191
def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
192-
column_names=None, autogenerate_column_names=None,
193-
encoding='utf8', skip_rows_after_names=None):
192+
skip_rows_after_names=None, column_names=None,
193+
autogenerate_column_names=None, encoding='utf8'):
194194
if use_threads is not None:
195195
self.use_threads = use_threads
196196
if block_size is not None:
197197
self.block_size = block_size
198198
if skip_rows is not None:
199199
self.skip_rows = skip_rows
200+
if skip_rows_after_names is not None:
201+
self.skip_rows_after_names = skip_rows_after_names
200202
if column_names is not None:
201203
self.column_names = column_names
202204
if autogenerate_column_names is not None:
203205
self.autogenerate_column_names= autogenerate_column_names
204206
# Python-specific option
205207
self.encoding = encoding
206-
if skip_rows_after_names is not None:
207-
self.skip_rows_after_names = skip_rows_after_names
208208

209209
@property
210210
def use_threads(self):
@@ -243,6 +243,23 @@ cdef class ReadOptions(_Weakrefable):
243243
def skip_rows(self, value):
244244
deref(self.options).skip_rows = value
245245

246+
@property
247+
def skip_rows_after_names(self):
248+
"""
249+
The number of rows to skip after the column names.
250+
This number can be larger than the number of rows in one
251+
block, and empty rows are counted.
252+
The order of application is as follows:
253+
- `skip_rows` is applied (if non-zero);
254+
- column names are read (unless `column_names` is set);
255+
- `skip_rows_after_names` is applied (if non-zero).
256+
"""
257+
return deref(self.options).skip_rows_after_names
258+
259+
@skip_rows_after_names.setter
260+
def skip_rows_after_names(self, value):
261+
deref(self.options).skip_rows_after_names = value
262+
246263
@property
247264
def column_names(self):
248265
"""
@@ -271,23 +288,6 @@ cdef class ReadOptions(_Weakrefable):
271288
def autogenerate_column_names(self, value):
272289
deref(self.options).autogenerate_column_names = value
273290

274-
@property
275-
def skip_rows_after_names(self):
276-
"""
277-
The number of rows to skip after the column names.
278-
This number can be larger than the number of rows in one
279-
block, and empty rows are counted.
280-
The order of application is as follows:
281-
- `skip_rows` is applied (if non-zero);
282-
- column names are read (unless `column_names` is set);
283-
- `skip_rows_after_names` is applied (if non-zero).
284-
"""
285-
return deref(self.options).skip_rows_after_names
286-
287-
@skip_rows_after_names.setter
288-
def skip_rows_after_names(self, value):
289-
deref(self.options).skip_rows_after_names = value
290-
291291
def validate(self):
292292
check_status(deref(self.options).Validate())
293293

@@ -296,11 +296,11 @@ cdef class ReadOptions(_Weakrefable):
296296
self.use_threads == other.use_threads and
297297
self.block_size == other.block_size and
298298
self.skip_rows == other.skip_rows and
299+
self.skip_rows_after_names == other.skip_rows_after_names and
299300
self.column_names == other.column_names and
300301
self.autogenerate_column_names ==
301302
other.autogenerate_column_names and
302-
self.encoding == other.encoding and
303-
self.skip_rows_after_names == other.skip_rows_after_names
303+
self.encoding == other.encoding
304304
)
305305

306306
@staticmethod
@@ -605,11 +605,6 @@ cdef class ConvertOptions(_Weakrefable):
605605
decimal_point : 1-character string, optional (default '.')
606606
The character used as decimal point in floating-point and decimal
607607
data.
608-
timestamp_parsers : list, optional
609-
A sequence of strptime()-compatible format strings, tried in order
610-
when attempting to infer or convert timestamp values (the special
611-
value ISO8601() can also be given). By default, a fast built-in
612-
ISO-8601 parser is used.
613608
strings_can_be_null : bool, optional (default False)
614609
Whether string / binary columns can have null values.
615610
If true, then strings in null_values are considered null for
@@ -620,16 +615,6 @@ cdef class ConvertOptions(_Weakrefable):
620615
If true, then strings in "null_values" are also considered null
621616
when they appear quoted in the CSV file. Otherwise, quoted values
622617
are never considered null.
623-
auto_dict_encode : bool, optional (default False)
624-
Whether to try to automatically dict-encode string / binary data.
625-
If true, then when type inference detects a string or binary column,
626-
it is dict-encoded up to `auto_dict_max_cardinality` distinct values
627-
(per chunk), after which it switches to regular encoding.
628-
This setting is ignored for non-inferred columns (those in
629-
`column_types`).
630-
auto_dict_max_cardinality : int, optional
631-
The maximum dictionary cardinality for `auto_dict_encode`.
632-
This value is per chunk.
633618
include_columns : list, optional
634619
The names of columns to include in the Table.
635620
If empty, the Table will include all columns from the CSV file.
@@ -641,6 +626,21 @@ cdef class ConvertOptions(_Weakrefable):
641626
produce a column of nulls (whose type is selected using
642627
`column_types`, or null by default).
643628
This option is ignored if `include_columns` is empty.
629+
auto_dict_encode : bool, optional (default False)
630+
Whether to try to automatically dict-encode string / binary data.
631+
If true, then when type inference detects a string or binary column,
632+
it is dict-encoded up to `auto_dict_max_cardinality` distinct values
633+
(per chunk), after which it switches to regular encoding.
634+
This setting is ignored for non-inferred columns (those in
635+
`column_types`).
636+
auto_dict_max_cardinality : int, optional
637+
The maximum dictionary cardinality for `auto_dict_encode`.
638+
This value is per chunk.
639+
timestamp_parsers : list, optional
640+
A sequence of strptime()-compatible format strings, tried in order
641+
when attempting to infer or convert timestamp values (the special
642+
value ISO8601() can also be given). By default, a fast built-in
643+
ISO-8601 parser is used.
644644
645645
Examples
646646
--------

python/pyarrow/_dataset.pyx

Lines changed: 39 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
257257
... 'n_legs': [2, 2, 4, 4, 5, 100],
258258
... 'animal': ["Flamingo", "Parrot", "Dog", "Horse",
259259
... "Brittle stars", "Centipede"]})
260-
>>>
260+
>>>
261261
>>> import pyarrow.parquet as pq
262262
>>> pq.write_table(table, "dataset_scanner.parquet")
263263
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
12211221
----------
12221222
parse_options : pyarrow.csv.ParseOptions
12231223
Options regarding CSV parsing.
1224+
default_fragment_scan_options : CsvFragmentScanOptions
1225+
Default options for fragments scan.
12241226
convert_options : pyarrow.csv.ConvertOptions
12251227
Options regarding value conversion.
12261228
read_options : pyarrow.csv.ReadOptions
12271229
General read options.
1228-
default_fragment_scan_options : CsvFragmentScanOptions
1229-
Default options for fragments scan.
12301230
"""
12311231
cdef:
12321232
CCsvFileFormat* csv_format
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
23152315
projections.
23162316
23172317
The list of columns or expressions may use the special fields
2318-
`__batch_index` (the index of the batch within the fragment),
2319-
`__fragment_index` (the index of the fragment within the dataset),
2318+
`__batch_index` (the index of the batch within the fragment),
2319+
`__fragment_index` (the index of the fragment within the dataset),
23202320
`__last_in_fragment` (whether the batch is last in fragment), and
2321-
`__filename` (the name of the source file or a description of the
2321+
`__filename` (the name of the source file or a description of the
23222322
source fragment).
23232323
23242324
The columns will be passed down to Datasets and corresponding data
23252325
fragments to avoid loading, copying, and deserializing columns
23262326
that will not be required further down the compute chain.
2327-
By default all of the available columns are projected.
2328-
Raises an exception if any of the referenced column names does
2327+
By default all of the available columns are projected.
2328+
Raises an exception if any of the referenced column names does
23292329
not exist in the dataset's Schema.
23302330
filter : Expression, default None
23312331
Scan will return only the rows matching the filter.
@@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
23382338
record batches are overflowing memory then this method can be
23392339
called to reduce their size.
23402340
batch_readahead : int, default 16
2341-
The number of batches to read ahead in a file. Increasing this number
2342-
will increase RAM usage but could also improve IO utilization.
2341+
The number of batches to read ahead in a file. This might not work
2342+
for all file formats. Increasing this number will increase
2343+
RAM usage but could also improve IO utilization.
23432344
fragment_readahead : int, default 4
23442345
The number of files to read ahead. Increasing this number will increase
23452346
RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@ cdef class Scanner(_Weakrefable):
23752376
return self.wrapped
23762377

23772378
@staticmethod
2378-
def from_dataset(Dataset dataset not None,
2379-
bint use_threads=True, object use_async=None,
2380-
MemoryPool memory_pool=None,
2381-
object columns=None, Expression filter=None,
2382-
int batch_size=_DEFAULT_BATCH_SIZE,
2379+
def from_dataset(Dataset dataset not None, *, object columns=None,
2380+
Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
23832381
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
23842382
int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
2385-
FragmentScanOptions fragment_scan_options=None):
2383+
FragmentScanOptions fragment_scan_options=None,
2384+
bint use_threads=True, object use_async=None,
2385+
MemoryPool memory_pool=None):
23862386
"""
23872387
Create Scanner from Dataset,
23882388
@@ -2397,10 +2397,10 @@ cdef class Scanner(_Weakrefable):
23972397
projections.
23982398
23992399
The list of columns or expressions may use the special fields
2400-
`__batch_index` (the index of the batch within the fragment),
2401-
`__fragment_index` (the index of the fragment within the dataset),
2400+
`__batch_index` (the index of the batch within the fragment),
2401+
`__fragment_index` (the index of the fragment within the dataset),
24022402
`__last_in_fragment` (whether the batch is last in fragment), and
2403-
`__filename` (the name of the source file or a description of the
2403+
`__filename` (the name of the source file or a description of the
24042404
source fragment).
24052405
24062406
The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@ cdef class Scanner(_Weakrefable):
24262426
fragment_readahead : int, default 4
24272427
The number of files to read ahead. Increasing this number will increase
24282428
RAM usage but could also improve IO utilization.
2429+
fragment_scan_options : FragmentScanOptions, default None
2430+
Options specific to a particular scan and fragment type, which
2431+
can change between different scans of the same dataset.
24292432
use_threads : bool, default True
24302433
If enabled, then maximum parallelism will be used determined by
24312434
the number of available CPU cores.
@@ -2436,9 +2439,6 @@ cdef class Scanner(_Weakrefable):
24362439
memory_pool : MemoryPool, default None
24372440
For memory allocations, if required. If not specified, uses the
24382441
default pool.
2439-
fragment_scan_options : FragmentScanOptions, default None
2440-
Options specific to a particular scan and fragment type, which
2441-
can change between different scans of the same dataset.
24422442
"""
24432443
cdef:
24442444
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@ cdef class Scanner(_Weakrefable):
24612461
return Scanner.wrap(scanner)
24622462

24632463
@staticmethod
2464-
def from_fragment(Fragment fragment not None, Schema schema=None,
2465-
bint use_threads=True, object use_async=None,
2466-
MemoryPool memory_pool=None,
2464+
def from_fragment(Fragment fragment not None, *, Schema schema=None,
24672465
object columns=None, Expression filter=None,
24682466
int batch_size=_DEFAULT_BATCH_SIZE,
24692467
int batch_readahead=_DEFAULT_BATCH_READAHEAD,
2470-
FragmentScanOptions fragment_scan_options=None):
2468+
FragmentScanOptions fragment_scan_options=None,
2469+
bint use_threads=True, object use_async=None,
2470+
MemoryPool memory_pool=None,):
24712471
"""
24722472
Create Scanner from Fragment,
24732473
@@ -2484,10 +2484,10 @@ cdef class Scanner(_Weakrefable):
24842484
projections.
24852485
24862486
The list of columns or expressions may use the special fields
2487-
`__batch_index` (the index of the batch within the fragment),
2488-
`__fragment_index` (the index of the fragment within the dataset),
2487+
`__batch_index` (the index of the batch within the fragment),
2488+
`__fragment_index` (the index of the fragment within the dataset),
24892489
`__last_in_fragment` (whether the batch is last in fragment), and
2490-
`__filename` (the name of the source file or a description of the
2490+
`__filename` (the name of the source file or a description of the
24912491
source fragment).
24922492
24932493
The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@ cdef class Scanner(_Weakrefable):
25102510
The number of batches to read ahead in a file. This might not work
25112511
for all file formats. Increasing this number will increase
25122512
RAM usage but could also improve IO utilization.
2513+
fragment_scan_options : FragmentScanOptions, default None
2514+
Options specific to a particular scan and fragment type, which
2515+
can change between different scans of the same dataset.
25132516
use_threads : bool, default True
25142517
If enabled, then maximum parallelism will be used determined by
25152518
the number of available CPU cores.
@@ -2520,9 +2523,6 @@ cdef class Scanner(_Weakrefable):
25202523
memory_pool : MemoryPool, default None
25212524
For memory allocations, if required. If not specified, uses the
25222525
default pool.
2523-
fragment_scan_options : FragmentScanOptions, default None
2524-
Options specific to a particular scan and fragment type, which
2525-
can change between different scans of the same dataset.
25262526
"""
25272527
cdef:
25282528
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@ cdef class Scanner(_Weakrefable):
25492549
return Scanner.wrap(scanner)
25502550

25512551
@staticmethod
2552-
def from_batches(source, Schema schema=None, bint use_threads=True,
2553-
object use_async=None, MemoryPool memory_pool=None,
2554-
object columns=None, Expression filter=None,
2555-
int batch_size=_DEFAULT_BATCH_SIZE,
2556-
FragmentScanOptions fragment_scan_options=None):
2552+
def from_batches(source, *, Schema schema=None, object columns=None,
2553+
Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
2554+
FragmentScanOptions fragment_scan_options=None,
2555+
bint use_threads=True, object use_async=None,
2556+
MemoryPool memory_pool=None):
25572557
"""
25582558
Create a Scanner from an iterator of batches.
25592559
@@ -2574,6 +2574,8 @@ cdef class Scanner(_Weakrefable):
25742574
Scan will return only the rows matching the filter.
25752575
batch_size : int, default 128Ki
25762576
The maximum row count for scanned record batches.
2577+
fragment_scan_options : FragmentScanOptions
2578+
The fragment scan options.
25772579
use_threads : bool, default True
25782580
If enabled, then maximum parallelism will be used determined by
25792581
the number of available CPU cores.
@@ -2584,8 +2586,6 @@ cdef class Scanner(_Weakrefable):
25842586
memory_pool : MemoryPool, default None
25852587
For memory allocations, if required. If not specified, uses the
25862588
default pool.
2587-
fragment_scan_options : FragmentScanOptions
2588-
The fragment scan options.
25892589
"""
25902590
cdef:
25912591
shared_ptr[CScanOptions] options = make_shared[CScanOptions]()

python/pyarrow/array.pxi

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -702,11 +702,11 @@ cdef class _PandasConvertible(_Weakrefable):
702702
memory_pool : MemoryPool, default None
703703
Arrow MemoryPool to use for allocations. Uses the default memory
704704
pool if not passed.
705-
strings_to_categorical : bool, default False
706-
Encode string (UTF8) and binary types to pandas.Categorical.
707705
categories : list, default empty
708706
List of fields that should be returned as pandas.Categorical. Only
709707
applies to table-like data structures.
708+
strings_to_categorical : bool, default False
709+
Encode string (UTF8) and binary types to pandas.Categorical.
710710
zero_copy_only : bool, default False
711711
Raise an ArrowException if this function call would require copying
712712
the underlying data.
@@ -2549,11 +2549,11 @@ cdef class DictionaryArray(Array):
25492549
The array of values referenced by the indices.
25502550
mask : ndarray or pandas.Series, bool type
25512551
True values indicate that indices are actually null.
2552+
ordered : bool, default False
2553+
Set to True if the category values are ordered.
25522554
from_pandas : bool, default False
25532555
If True, the indices should be treated as though they originated in
25542556
a pandas.Categorical (null encoded as -1).
2555-
ordered : bool, default False
2556-
Set to True if the category values are ordered.
25572557
safe : bool, default True
25582558
If True, check that the dictionary indices are in range.
25592559
memory_pool : MemoryPool, default None

python/pyarrow/ipc.pxi

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -106,12 +106,12 @@ cdef class IpcReadOptions(_Weakrefable):
106106
107107
Parameters
108108
----------
109-
use_threads : bool
110-
Whether to use the global CPU thread pool to parallelize any
111-
computational tasks like decompression.
112109
ensure_native_endian : bool
113110
Whether to convert incoming data to platform-native endianness.
114111
Default is true.
112+
use_threads : bool
113+
Whether to use the global CPU thread pool to parallelize any
114+
computational tasks like decompression.
115115
included_fields : list
116116
If empty (the default), return all deserialized fields.
117117
If non-empty, the values are the indices of fields to read on

0 commit comments

Comments
 (0)