ARROW-15006: [Python][CI][Doc] Enable numpydoc check PR03 (apache#13983)

amoeba · web-flow · commit 0f91e684ddda · 2022-10-20T09:41:24.000+02:00
Adds an additional numypdoc check to CI (PR03) and fixes all corresponding violations. Note this does not fully resolve [ARROW-15006](https://issues.apache.org/jira/browse/ARROW-15006). Authored-by: Bryce Mecum <petridish@gmail.com> Signed-off-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -1068,7 +1068,7 @@ services:
       ["/arrow/ci/scripts/cpp_build.sh /arrow /build &&
         /arrow/ci/scripts/python_build.sh /arrow /build &&
         pip install -e /arrow/dev/archery[numpydoc] &&
-        archery numpydoc --allow-rule PR01,PR10 &&
+        archery numpydoc --allow-rule PR01,PR03,PR10 &&
         /arrow/ci/scripts/python_test.sh /arrow"]
 
   conda-python-dask:
diff --git a/python/pyarrow/_csv.pyx b/python/pyarrow/_csv.pyx
@@ -189,22 +189,22 @@ cdef class ReadOptions(_Weakrefable):
         self.options.reset(new CCSVReadOptions(CCSVReadOptions.Defaults()))
 
     def __init__(self, *, use_threads=None, block_size=None, skip_rows=None,
-                 column_names=None, autogenerate_column_names=None,
-                 encoding='utf8', skip_rows_after_names=None):
+                 skip_rows_after_names=None, column_names=None,
+                 autogenerate_column_names=None, encoding='utf8'):
         if use_threads is not None:
             self.use_threads = use_threads
         if block_size is not None:
             self.block_size = block_size
         if skip_rows is not None:
             self.skip_rows = skip_rows
+        if skip_rows_after_names is not None:
+            self.skip_rows_after_names = skip_rows_after_names
         if column_names is not None:
             self.column_names = column_names
         if autogenerate_column_names is not None:
             self.autogenerate_column_names= autogenerate_column_names
         # Python-specific option
         self.encoding = encoding
-        if skip_rows_after_names is not None:
-            self.skip_rows_after_names = skip_rows_after_names
 
     @property
     def use_threads(self):
@@ -243,6 +243,23 @@ cdef class ReadOptions(_Weakrefable):
     def skip_rows(self, value):
         deref(self.options).skip_rows = value
 
+    @property
+    def skip_rows_after_names(self):
+        """
+        The number of rows to skip after the column names.
+        This number can be larger than the number of rows in one
+        block, and empty rows are counted.
+        The order of application is as follows:
+        - `skip_rows` is applied (if non-zero);
+        - column names aread (unless `column_names` is set);
+        - `skip_rows_after_names` is applied (if non-zero).
+        """
+        return deref(self.options).skip_rows_after_names
+
+    @skip_rows_after_names.setter
+    def skip_rows_after_names(self, value):
+        deref(self.options).skip_rows_after_names = value
+
     @property
     def column_names(self):
         """
@@ -271,23 +288,6 @@ cdef class ReadOptions(_Weakrefable):
     def autogenerate_column_names(self, value):
         deref(self.options).autogenerate_column_names = value
 
-    @property
-    def skip_rows_after_names(self):
-        """
-        The number of rows to skip after the column names.
-        This number can be larger than the number of rows in one
-        block, and empty rows are counted.
-        The order of application is as follows:
-        - `skip_rows` is applied (if non-zero);
-        - column names aread (unless `column_names` is set);
-        - `skip_rows_after_names` is applied (if non-zero).
-        """
-        return deref(self.options).skip_rows_after_names
-
-    @skip_rows_after_names.setter
-    def skip_rows_after_names(self, value):
-        deref(self.options).skip_rows_after_names = value
-
     def validate(self):
         check_status(deref(self.options).Validate())
 
@@ -296,11 +296,11 @@ cdef class ReadOptions(_Weakrefable):
             self.use_threads == other.use_threads and
             self.block_size == other.block_size and
             self.skip_rows == other.skip_rows and
+            self.skip_rows_after_names == other.skip_rows_after_names and
             self.column_names == other.column_names and
             self.autogenerate_column_names ==
             other.autogenerate_column_names and
-            self.encoding == other.encoding and
-            self.skip_rows_after_names == other.skip_rows_after_names
+            self.encoding == other.encoding
         )
 
     @staticmethod
@@ -605,11 +605,6 @@ cdef class ConvertOptions(_Weakrefable):
     decimal_point : 1-character string, optional (default '.')
         The character used as decimal point in floating-point and decimal
         data.
-    timestamp_parsers : list, optional
-        A sequence of strptime()-compatible format strings, tried in order
-        when attempting to infer or convert timestamp values (the special
-        value ISO8601() can also be given).  By default, a fast built-in
-        ISO-8601 parser is used.
     strings_can_be_null : bool, optional (default False)
         Whether string / binary columns can have null values.
         If true, then strings in null_values are considered null for
@@ -620,16 +615,6 @@ cdef class ConvertOptions(_Weakrefable):
         If true, then strings in "null_values" are also considered null
         when they appear quoted in the CSV file. Otherwise, quoted values
         are never considered null.
-    auto_dict_encode : bool, optional (default False)
-        Whether to try to automatically dict-encode string / binary data.
-        If true, then when type inference detects a string or binary column,
-        it it dict-encoded up to `auto_dict_max_cardinality` distinct values
-        (per chunk), after which it switches to regular encoding.
-        This setting is ignored for non-inferred columns (those in
-        `column_types`).
-    auto_dict_max_cardinality : int, optional
-        The maximum dictionary cardinality for `auto_dict_encode`.
-        This value is per chunk.
     include_columns : list, optional
         The names of columns to include in the Table.
         If empty, the Table will include all columns from the CSV file.
@@ -641,6 +626,21 @@ cdef class ConvertOptions(_Weakrefable):
         produce a column of nulls (whose type is selected using
         `column_types`, or null by default).
         This option is ignored if `include_columns` is empty.
+    auto_dict_encode : bool, optional (default False)
+        Whether to try to automatically dict-encode string / binary data.
+        If true, then when type inference detects a string or binary column,
+        it it dict-encoded up to `auto_dict_max_cardinality` distinct values
+        (per chunk), after which it switches to regular encoding.
+        This setting is ignored for non-inferred columns (those in
+        `column_types`).
+    auto_dict_max_cardinality : int, optional
+        The maximum dictionary cardinality for `auto_dict_encode`.
+        This value is per chunk.
+    timestamp_parsers : list, optional
+        A sequence of strptime()-compatible format strings, tried in order
+        when attempting to infer or convert timestamp values (the special
+        value ISO8601() can also be given).  By default, a fast built-in
+        ISO-8601 parser is used.
 
     Examples
     --------
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
@@ -257,7 +257,7 @@ cdef class Dataset(_Weakrefable):
         ...                   'n_legs': [2, 2, 4, 4, 5, 100],
         ...                   'animal': ["Flamingo", "Parrot", "Dog", "Horse",
         ...                              "Brittle stars", "Centipede"]})
-        >>> 
+        >>>
         >>> import pyarrow.parquet as pq
         >>> pq.write_table(table, "dataset_scanner.parquet")
 
@@ -1221,12 +1221,12 @@ cdef class CsvFileFormat(FileFormat):
     ----------
     parse_options : pyarrow.csv.ParseOptions
         Options regarding CSV parsing.
+    default_fragment_scan_options : CsvFragmentScanOptions
+        Default options for fragments scan.
     convert_options : pyarrow.csv.ConvertOptions
         Options regarding value conversion.
     read_options : pyarrow.csv.ReadOptions
         General read options.
-    default_fragment_scan_options : CsvFragmentScanOptions
-        Default options for fragments scan.
     """
     cdef:
         CCsvFileFormat* csv_format
@@ -2315,17 +2315,17 @@ cdef class Scanner(_Weakrefable):
         projections.
 
         The list of columns or expressions may use the special fields
-        `__batch_index` (the index of the batch within the fragment), 
-        `__fragment_index` (the index of the fragment within the dataset), 
+        `__batch_index` (the index of the batch within the fragment),
+        `__fragment_index` (the index of the fragment within the dataset),
         `__last_in_fragment` (whether the batch is last in fragment), and
-        `__filename` (the name of the source file or a description of the 
+        `__filename` (the name of the source file or a description of the
         source fragment).
 
         The columns will be passed down to Datasets and corresponding data
         fragments to avoid loading, copying, and deserializing columns
         that will not be required further down the compute chain.
-        By default all of the available columns are projected. 
-        Raises an exception if any of the referenced column names does 
+        By default all of the available columns are projected.
+        Raises an exception if any of the referenced column names does
         not exist in the dataset's Schema.
     filter : Expression, default None
         Scan will return only the rows matching the filter.
@@ -2338,8 +2338,9 @@ cdef class Scanner(_Weakrefable):
         record batches are overflowing memory then this method can be
         called to reduce their size.
     batch_readahead : int, default 16
-        The number of batches to read ahead in a file. Increasing this number 
-        will increase RAM usage but could also improve IO utilization.
+        The number of batches to read ahead in a file. This might not work
+        for all file formats. Increasing this number will increase
+        RAM usage but could also improve IO utilization.
     fragment_readahead : int, default 4
         The number of files to read ahead. Increasing this number will increase
         RAM usage but could also improve IO utilization.
@@ -2375,14 +2376,13 @@ cdef class Scanner(_Weakrefable):
         return self.wrapped
 
     @staticmethod
-    def from_dataset(Dataset dataset not None,
-                     bint use_threads=True, object use_async=None,
-                     MemoryPool memory_pool=None,
-                     object columns=None, Expression filter=None,
-                     int batch_size=_DEFAULT_BATCH_SIZE,
+    def from_dataset(Dataset dataset not None, *, object columns=None,
+                     Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
                      int batch_readahead=_DEFAULT_BATCH_READAHEAD,
                      int fragment_readahead=_DEFAULT_FRAGMENT_READAHEAD,
-                     FragmentScanOptions fragment_scan_options=None):
+                     FragmentScanOptions fragment_scan_options=None,
+                     bint use_threads=True, object use_async=None,
+                     MemoryPool memory_pool=None):
         """
         Create Scanner from Dataset,
 
@@ -2397,10 +2397,10 @@ cdef class Scanner(_Weakrefable):
             projections.
 
             The list of columns or expressions may use the special fields
-            `__batch_index` (the index of the batch within the fragment), 
-            `__fragment_index` (the index of the fragment within the dataset), 
+            `__batch_index` (the index of the batch within the fragment),
+            `__fragment_index` (the index of the fragment within the dataset),
             `__last_in_fragment` (whether the batch is last in fragment), and
-            `__filename` (the name of the source file or a description of the 
+            `__filename` (the name of the source file or a description of the
             source fragment).
 
             The columns will be passed down to Datasets and corresponding data
@@ -2426,6 +2426,9 @@ cdef class Scanner(_Weakrefable):
         fragment_readahead : int, default 4
             The number of files to read ahead. Increasing this number will increase
             RAM usage but could also improve IO utilization.
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2436,9 +2439,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions, default None
-            Options specific to a particular scan and fragment type, which
-            can change between different scans of the same dataset.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2461,13 +2461,13 @@ cdef class Scanner(_Weakrefable):
         return Scanner.wrap(scanner)
 
     @staticmethod
-    def from_fragment(Fragment fragment not None, Schema schema=None,
-                      bint use_threads=True, object use_async=None,
-                      MemoryPool memory_pool=None,
+    def from_fragment(Fragment fragment not None, *, Schema schema=None,
                       object columns=None, Expression filter=None,
                       int batch_size=_DEFAULT_BATCH_SIZE,
                       int batch_readahead=_DEFAULT_BATCH_READAHEAD,
-                      FragmentScanOptions fragment_scan_options=None):
+                      FragmentScanOptions fragment_scan_options=None,
+                      bint use_threads=True, object use_async=None,
+                      MemoryPool memory_pool=None,):
         """
         Create Scanner from Fragment,
 
@@ -2484,10 +2484,10 @@ cdef class Scanner(_Weakrefable):
             projections.
 
             The list of columns or expressions may use the special fields
-            `__batch_index` (the index of the batch within the fragment), 
-            `__fragment_index` (the index of the fragment within the dataset), 
+            `__batch_index` (the index of the batch within the fragment),
+            `__fragment_index` (the index of the fragment within the dataset),
             `__last_in_fragment` (whether the batch is last in fragment), and
-            `__filename` (the name of the source file or a description of the 
+            `__filename` (the name of the source file or a description of the
             source fragment).
 
             The columns will be passed down to Datasets and corresponding data
@@ -2510,6 +2510,9 @@ cdef class Scanner(_Weakrefable):
             The number of batches to read ahead in a file. This might not work
             for all file formats. Increasing this number will increase
             RAM usage but could also improve IO utilization.
+        fragment_scan_options : FragmentScanOptions, default None
+            Options specific to a particular scan and fragment type, which
+            can change between different scans of the same dataset.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2520,9 +2523,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions, default None
-            Options specific to a particular scan and fragment type, which
-            can change between different scans of the same dataset.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
@@ -2549,11 +2549,11 @@ cdef class Scanner(_Weakrefable):
         return Scanner.wrap(scanner)
 
     @staticmethod
-    def from_batches(source, Schema schema=None, bint use_threads=True,
-                     object use_async=None, MemoryPool memory_pool=None,
-                     object columns=None, Expression filter=None,
-                     int batch_size=_DEFAULT_BATCH_SIZE,
-                     FragmentScanOptions fragment_scan_options=None):
+    def from_batches(source, *, Schema schema=None, object columns=None,
+                     Expression filter=None, int batch_size=_DEFAULT_BATCH_SIZE,
+                     FragmentScanOptions fragment_scan_options=None,
+                     bint use_threads=True, object use_async=None,
+                     MemoryPool memory_pool=None):
         """
         Create a Scanner from an iterator of batches.
 
@@ -2574,6 +2574,8 @@ cdef class Scanner(_Weakrefable):
             Scan will return only the rows matching the filter.
         batch_size : int, default 128Ki
             The maximum row count for scanned record batches.
+        fragment_scan_options : FragmentScanOptions
+            The fragment scan options.
         use_threads : bool, default True
             If enabled, then maximum parallelism will be used determined by
             the number of available CPU cores.
@@ -2584,8 +2586,6 @@ cdef class Scanner(_Weakrefable):
         memory_pool : MemoryPool, default None
             For memory allocations, if required. If not specified, uses the
             default pool.
-        fragment_scan_options : FragmentScanOptions
-            The fragment scan options.
         """
         cdef:
             shared_ptr[CScanOptions] options = make_shared[CScanOptions]()
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
@@ -702,11 +702,11 @@ cdef class _PandasConvertible(_Weakrefable):
         memory_pool : MemoryPool, default None
             Arrow MemoryPool to use for allocations. Uses the default memory
             pool is not passed.
-        strings_to_categorical : bool, default False
-            Encode string (UTF8) and binary types to pandas.Categorical.
         categories : list, default empty
             List of fields that should be returned as pandas.Categorical. Only
             applies to table-like data structures.
+        strings_to_categorical : bool, default False
+            Encode string (UTF8) and binary types to pandas.Categorical.
         zero_copy_only : bool, default False
             Raise an ArrowException if this function call would require copying
             the underlying data.
@@ -2549,11 +2549,11 @@ cdef class DictionaryArray(Array):
             The array of values referenced by the indices.
         mask : ndarray or pandas.Series, bool type
             True values indicate that indices are actually null.
+        ordered : bool, default False
+            Set to True if the category values are ordered.
         from_pandas : bool, default False
             If True, the indices should be treated as though they originated in
             a pandas.Categorical (null encoded as -1).
-        ordered : bool, default False
-            Set to True if the category values are ordered.
         safe : bool, default True
             If True, check that the dictionary indices are in range.
         memory_pool : MemoryPool, default None
diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi
@@ -106,12 +106,12 @@ cdef class IpcReadOptions(_Weakrefable):
 
     Parameters
     ----------
-    use_threads : bool
-        Whether to use the global CPU thread pool to parallelize any
-        computational tasks like decompression.
     ensure_native_endian : bool
         Whether to convert incoming data to platform-native endianness.
         Default is true.
+    use_threads : bool
+        Whether to use the global CPU thread pool to parallelize any
+        computational tasks like decompression.
     included_fields : list
         If empty (the default), return all deserialized fields.
         If non-empty, the values are the indices of fields to read on
diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py
diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py