# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

from pathlib import Path

import pyarrow as pa
import pytest
from datafusion import column
from datafusion.io import read_avro, read_csv, read_json, read_parquet

from .utils import range_table


def test_read_json_global_ctx(ctx):
    path = Path(__file__).parent.resolve()

    # Default
test_data_path = Path(path) / "data_test_context" / "data.json"
df = read_json(test_data_path)
result = df.collect()
assert result[0].column(0) == pa.array(["a", "b", "c"])
    assert result[0].column(1) == pa.array([1, 2, 3])

    # Schema
schema = pa.schema(
[
pa.field("A", pa.string(), nullable=True),
]
)
df = read_json(test_data_path, schema=schema)
result = df.collect()
assert result[0].column(0) == pa.array(["a", "b", "c"])
    assert result[0].schema == schema

    # File extension
test_data_path = Path(path) / "data_test_context" / "data.json"
df = read_json(test_data_path, file_extension=".json")
result = df.collect()
assert result[0].column(0) == pa.array(["a", "b", "c"])
    assert result[0].column(1) == pa.array([1, 2, 3])


def test_read_parquet_global():
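    # Read via a relative path first, then via an absolute path.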
parquet_df = read_parquet(path="parquet/data/alltypes_plain.parquet")
parquet_df.show()
    assert parquet_df is not None

    path = Path.cwd() / "parquet/data/alltypes_plain.parquet"
parquet_df = read_parquet(path=path)
    assert parquet_df is not None


def test_read_csv():
csv_df = read_csv(path="testing/data/csv/aggregate_test_100.csv")
    csv_df.select(column("c1")).show()


def test_read_csv_list():
csv_df = read_csv(path=["testing/data/csv/aggregate_test_100.csv"])
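    # Reading the same file twice should yield exactly double the row count
    # of a single read.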
expected = csv_df.count() * 2
double_csv_df = read_csv(
path=[
"testing/data/csv/aggregate_test_100.csv",
"testing/data/csv/aggregate_test_100.csv",
]
)
actual = double_csv_df.count()
double_csv_df.select(column("c1")).show()
    assert actual == expected


def test_read_avro():
avro_df = read_avro(path="testing/data/avro/alltypes_plain.avro")
avro_df.show()
    assert avro_df is not None

    path = Path.cwd() / "testing/data/avro/alltypes_plain.avro"
avro_df = read_avro(path=path)
    assert avro_df is not None


def test_arrow_c_stream_large_dataset(ctx):
"""DataFrame streaming yields batches incrementally using Arrow APIs.
This test constructs a DataFrame that would be far larger than available
memory if materialized. Use the public API
``pa.RecordBatchReader.from_stream(df)`` (which is same as
``pa.RecordBatchReader._import_from_c_capsule(df.__arrow_c_stream__())``)
to read record batches incrementally without collecting the full dataset,
so reading a handful of batches should not exhaust process memory.
"""
# Create a very large DataFrame using range; this would be terabytes if collected
df = range_table(ctx, 0, 1 << 40)
    reader = pa.RecordBatchReader.from_stream(df)

    # Track RSS before consuming batches.
    # RSS is a practical measure of RAM usage visible to the OS. It excludes
    # memory that has been swapped out and gives a simple, roughly
    # cross-platform indicator (psutil normalizes the per-OS sources).
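    # importorskip skips this test cleanly when psutil is not installed.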
psutil = pytest.importorskip("psutil")
process = psutil.Process()
    start_rss = process.memory_info().rss

    for _ in range(5):
batch = reader.read_next_batch()
assert batch is not None
        assert len(batch) > 0

    current_rss = process.memory_info().rss
# Ensure memory usage hasn't grown substantially (>50MB)
    assert current_rss - start_rss < 50 * 1024 * 1024


def test_table_from_arrow_c_stream(ctx, fail_collect):
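    # fail_collect is assumed to be a fixture that fails the test if
    # DataFrame.collect() is called, so a passing run indicates pa.table()
    # consumed the DataFrame via the Arrow C stream interface instead.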
df = range_table(ctx, 0, 10)
table = pa.table(df)
assert table.shape == (10, 1)
assert table.column_names == ["value"]