# Source: arrow/python/pyarrow/_csv.pyx at master · cppwanderboy/arrow

# 474 lines (404 loc) · 15.3 KB
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#   http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
# cython: profile=False
# distutils: language = c++
# cython: embedsignature = True
# cython: language_level = 3
from __future__ import absolute_import
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *
from pyarrow.lib cimport (check_status, Field, MemoryPool, ensure_type,
                          maybe_unbox_memory_pool, get_input_stream,
                          pyarrow_wrap_table, pyarrow_wrap_data_type,
                          pyarrow_unwrap_data_type)
from pyarrow.compat import frombytes, tobytes, Mapping
cdef unsigned char _single_char(s) except 0:
    # Convert a one-character string to its ASCII code.
    # NUL is rejected because 0 is this function's error return value
    # (``except 0``); anything above 127 is not ASCII.
    code = ord(s)
    if not (1 <= code <= 127):
        raise ValueError("Expecting an ASCII character")
    return <unsigned char> code
cdef class ReadOptions:
    """
    Options for reading CSV files.

    Parameters
    ----------
    use_threads : bool, optional (default True)
        Whether to use multiple threads to accelerate reading
    block_size : int, optional
        How many bytes to process at a time from the input stream.
        This will determine multi-threading granularity as well as
        the size of individual chunks in the Table.
    skip_rows: int, optional (default 0)
        The number of rows to skip at the start of the CSV data, not
        including the row of column names (if any).
    column_names: list, optional
        The Table column names.  If empty, column names will be
        read from the first row after `skip_rows`.
    """
    cdef:
        CCSVReadOptions options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, use_threads=None, block_size=None, skip_rows=None,
                 column_names=None):
        # Start from the C++ defaults and override only what was given.
        self.options = CCSVReadOptions.Defaults()
        if use_threads is not None:
            self.use_threads = use_threads
        if block_size is not None:
            self.block_size = block_size
        if skip_rows is not None:
            self.skip_rows = skip_rows
        if column_names is not None:
            self.column_names = column_names

    @property
    def use_threads(self):
        """
        Whether to use multiple threads to accelerate reading.
        """
        return self.options.use_threads

    @use_threads.setter
    def use_threads(self, value):
        self.options.use_threads = value

    @property
    def block_size(self):
        """
        How many bytes to process at a time from the input stream.
        This will determine multi-threading granularity as well as
        the size of individual chunks in the Table.
        """
        return self.options.block_size

    @block_size.setter
    def block_size(self, value):
        self.options.block_size = value

    @property
    def skip_rows(self):
        """
        The number of rows to skip at the start of the CSV data, not
        including the row of column names (if any).
        """
        return self.options.skip_rows

    @skip_rows.setter
    def skip_rows(self, value):
        self.options.skip_rows = value

    @property
    def column_names(self):
        """
        The Table column names.  If empty, column names will be
        read from the first row after `skip_rows`.
        """
        return [frombytes(s) for s in self.options.column_names]

    @column_names.setter
    def column_names(self, value):
        # Encode every name before touching the C++ vector, so that a
        # failing conversion leaves the previously set names intact.
        encoded = [tobytes(item) for item in value]
        self.options.column_names.clear()
        for name in encoded:
            self.options.column_names.push_back(name)
cdef class ParseOptions:
    """
    Options for parsing CSV files.

    Parameters
    ----------
    delimiter: 1-character string, optional (default ',')
        The character delimiting individual cells in the CSV data.
    quote_char: 1-character string or False, optional (default '"')
        The character used optionally for quoting CSV values
        (False if quoting is not allowed).
    double_quote: bool, optional (default True)
        Whether two quotes in a quoted CSV value denote a single quote
        in the data.
    escape_char: 1-character string or False, optional (default False)
        The character used optionally for escaping special characters
        (False if escaping is not allowed).
    newlines_in_values: bool, optional (default False)
        Whether newline characters are allowed in CSV values.
        Setting this to True reduces the performance of multi-threaded
        CSV reading.
    ignore_empty_lines: bool, optional (default True)
        Whether empty lines are ignored in CSV input.
        If False, an empty line is interpreted as containing a single empty
        value (assuming a one-column CSV file).
    """
    cdef:
        CCSVParseOptions options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, delimiter=None, quote_char=None, double_quote=None,
                 escape_char=None, newlines_in_values=None,
                 ignore_empty_lines=None):
        # Start from the C++ defaults and override only what was given.
        self.options = CCSVParseOptions.Defaults()
        if delimiter is not None:
            self.delimiter = delimiter
        if quote_char is not None:
            self.quote_char = quote_char
        if double_quote is not None:
            self.double_quote = double_quote
        if escape_char is not None:
            self.escape_char = escape_char
        if newlines_in_values is not None:
            self.newlines_in_values = newlines_in_values
        if ignore_empty_lines is not None:
            self.ignore_empty_lines = ignore_empty_lines

    @property
    def delimiter(self):
        """
        The character delimiting individual cells in the CSV data.
        """
        return chr(self.options.delimiter)

    @delimiter.setter
    def delimiter(self, value):
        self.options.delimiter = _single_char(value)

    @property
    def quote_char(self):
        """
        The character used optionally for quoting CSV values
        (False if quoting is not allowed).
        """
        if self.options.quoting:
            return chr(self.options.quote_char)
        else:
            return False

    @quote_char.setter
    def quote_char(self, value):
        # False only switches quoting off; the stored character is kept.
        if value is False:
            self.options.quoting = False
        else:
            self.options.quote_char = _single_char(value)
            self.options.quoting = True

    @property
    def double_quote(self):
        """
        Whether two quotes in a quoted CSV value denote a single quote
        in the data.
        """
        return self.options.double_quote

    @double_quote.setter
    def double_quote(self, value):
        self.options.double_quote = value

    @property
    def escape_char(self):
        """
        The character used optionally for escaping special characters
        (False if escaping is not allowed).
        """
        if self.options.escaping:
            return chr(self.options.escape_char)
        else:
            return False

    @escape_char.setter
    def escape_char(self, value):
        # False only switches escaping off; the stored character is kept.
        if value is False:
            self.options.escaping = False
        else:
            self.options.escape_char = _single_char(value)
            self.options.escaping = True

    @property
    def newlines_in_values(self):
        """
        Whether newline characters are allowed in CSV values.
        Setting this to True reduces the performance of multi-threaded
        CSV reading.
        """
        return self.options.newlines_in_values

    @newlines_in_values.setter
    def newlines_in_values(self, value):
        self.options.newlines_in_values = value

    @property
    def ignore_empty_lines(self):
        """
        Whether empty lines are ignored in CSV input.
        If False, an empty line is interpreted as containing a single empty
        value (assuming a one-column CSV file).
        """
        return self.options.ignore_empty_lines

    @ignore_empty_lines.setter
    def ignore_empty_lines(self, value):
        self.options.ignore_empty_lines = value
cdef class ConvertOptions:
    """
    Options for converting CSV data.

    Parameters
    ----------
    check_utf8 : bool, optional (default True)
        Whether to check UTF8 validity of string columns.
    column_types: dict, optional
        Map column names to column types
        (disabling type inference on those columns).
    null_values: list, optional
        A sequence of strings that denote nulls in the data
        (defaults are appropriate in most cases).
    true_values: list, optional
        A sequence of strings that denote true booleans in the data
        (defaults are appropriate in most cases).
    false_values: list, optional
        A sequence of strings that denote false booleans in the data
        (defaults are appropriate in most cases).
    strings_can_be_null: bool, optional (default False)
        Whether string / binary columns can have null values.
        If true, then strings in null_values are considered null for
        string columns.
        If false, then all strings are valid string values.
    """
    cdef:
        CCSVConvertOptions options

    # Avoid mistakenly creating attributes
    __slots__ = ()

    def __init__(self, check_utf8=None, column_types=None, null_values=None,
                 true_values=None, false_values=None,
                 strings_can_be_null=None):
        # Start from the C++ defaults and override only what was given.
        self.options = CCSVConvertOptions.Defaults()
        if check_utf8 is not None:
            self.check_utf8 = check_utf8
        if column_types is not None:
            self.column_types = column_types
        if null_values is not None:
            self.null_values = null_values
        if true_values is not None:
            self.true_values = true_values
        if false_values is not None:
            self.false_values = false_values
        if strings_can_be_null is not None:
            self.strings_can_be_null = strings_can_be_null

    @property
    def check_utf8(self):
        """
        Whether to check UTF8 validity of string columns.
        """
        return self.options.check_utf8

    @check_utf8.setter
    def check_utf8(self, value):
        self.options.check_utf8 = value

    @property
    def strings_can_be_null(self):
        """
        Whether string / binary columns can have null values.
        """
        return self.options.strings_can_be_null

    @strings_can_be_null.setter
    def strings_can_be_null(self, value):
        self.options.strings_can_be_null = value

    @property
    def column_types(self):
        """
        Map column names to column types
        (disabling type inference on those columns).
        """
        d = {frombytes(item.first): pyarrow_wrap_data_type(item.second)
             for item in self.options.column_types}
        return d

    @column_types.setter
    def column_types(self, value):
        cdef:
            shared_ptr[CDataType] typ

        # Accept a mapping, an iterable of (name, type) pairs, or an
        # iterable of Field objects.
        if isinstance(value, Mapping):
            value = value.items()

        # Validate and convert every entry first, so that a bad item
        # leaves the previously configured column types untouched.
        converted = []
        for item in value:
            if isinstance(item, Field):
                k = item.name
                v = item.type
            else:
                k, v = item
            converted.append((tobytes(k), ensure_type(v)))

        self.options.column_types.clear()
        for name, dtype in converted:
            typ = pyarrow_unwrap_data_type(dtype)
            assert typ != NULL
            self.options.column_types[name] = typ

    @property
    def null_values(self):
        """
        A sequence of strings that denote nulls in the data.
        """
        return [frombytes(x) for x in self.options.null_values]

    @null_values.setter
    def null_values(self, value):
        self.options.null_values = [tobytes(x) for x in value]

    @property
    def true_values(self):
        """
        A sequence of strings that denote true booleans in the data.
        """
        return [frombytes(x) for x in self.options.true_values]

    @true_values.setter
    def true_values(self, value):
        self.options.true_values = [tobytes(x) for x in value]

    @property
    def false_values(self):
        """
        A sequence of strings that denote false booleans in the data.
        """
        return [frombytes(x) for x in self.options.false_values]

    @false_values.setter
    def false_values(self, value):
        self.options.false_values = [tobytes(x) for x in value]
cdef _get_reader(input_file, shared_ptr[InputStream]* out):
    # Resolve ``input_file`` into an input stream stored in ``out``;
    # memory mapping is not requested.
    memory_mapped = False
    get_input_stream(input_file, memory_mapped, out)
cdef _get_read_options(ReadOptions read_options, CCSVReadOptions* out):
    # Fill ``out[0]`` with the C++ defaults when no options object is
    # given, otherwise with the settings held by ``read_options``.
    if read_options is None:
        out[0] = CCSVReadOptions.Defaults()
    else:
        out[0] = read_options.options
cdef _get_parse_options(ParseOptions parse_options, CCSVParseOptions* out):
    # Fill ``out[0]`` with the C++ defaults when no options object is
    # given, otherwise with the settings held by ``parse_options``.
    if parse_options is None:
        out[0] = CCSVParseOptions.Defaults()
    else:
        out[0] = parse_options.options
cdef _get_convert_options(ConvertOptions convert_options,
                          CCSVConvertOptions* out):
    # Fill ``out[0]`` with the C++ defaults when no options object is
    # given, otherwise with the settings held by ``convert_options``.
    if convert_options is None:
        out[0] = CCSVConvertOptions.Defaults()
    else:
        out[0] = convert_options.options
def read_csv(input_file, read_options=None, parse_options=None,
             convert_options=None, MemoryPool memory_pool=None):
    """
    Read a Table from a stream of CSV data.

    Parameters
    ----------
    input_file: string, path or file-like object
        The location of CSV data.  If a string or path, and if it ends
        with a recognized compressed file extension (e.g. ".gz" or ".bz2"),
        the data is automatically decompressed when reading.
    read_options: pyarrow.csv.ReadOptions, optional
        Options for the CSV reader (see pyarrow.csv.ReadOptions constructor
        for defaults)
    parse_options: pyarrow.csv.ParseOptions, optional
        Options for the CSV parser
        (see pyarrow.csv.ParseOptions constructor for defaults)
    convert_options: pyarrow.csv.ConvertOptions, optional
        Options for converting CSV data
        (see pyarrow.csv.ConvertOptions constructor for defaults)
    memory_pool: MemoryPool, optional
        Pool to allocate Table memory from

    Returns
    -------
    :class:`pyarrow.Table`
        Contents of the CSV file as an in-memory table.
    """
    cdef:
        shared_ptr[InputStream] stream
        CCSVReadOptions c_read_options
        CCSVParseOptions c_parse_options
        CCSVConvertOptions c_convert_options
        shared_ptr[CCSVReader] reader
        shared_ptr[CTable] table

    # Resolve the input and translate the Python-level option objects
    # (or None) into their C++ counterparts.
    _get_reader(input_file, &stream)
    _get_read_options(read_options, &c_read_options)
    _get_parse_options(parse_options, &c_parse_options)
    _get_convert_options(convert_options, &c_convert_options)

    check_status(CCSVReader.Make(maybe_unbox_memory_pool(memory_pool),
                                 stream, c_read_options, c_parse_options,
                                 c_convert_options, &reader))

    # The read itself runs with the GIL released.
    with nogil:
        check_status(reader.get().Read(&table))

    return pyarrow_wrap_table(table)
# --- GitHub page UI text captured with the file (not part of _csv.pyx) ---
# Provide feedback

# Saved searches

# Use saved searches to filter your results more quickly

# Files / Expand file tree

# _csv.pyx

# Latest commit

# History

# _csv.pyx

# File metadata and controls