forked from lancedb/lancedb
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharrow.py
More file actions
65 lines (51 loc) · 1.87 KB
/
arrow.py
File metadata and controls
65 lines (51 loc) · 1.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from typing import List, Optional, Union
import pyarrow as pa
from ._lancedb import RecordBatchStream
class AsyncRecordBatchReader:
    """
    Asynchronous iterator over the RecordBatches of a stream.

    The schema of the stream is available through the ``schema`` attribute.
    """

    def __init__(
        self,
        inner: Union[RecordBatchStream, pa.Table],
        max_batch_length: Optional[int] = None,
    ):
        """
        Wrap a RecordBatchStream, or a Table served as a stream of batches.

        Parameters
        ----------
        inner : RecordBatchStream or pa.Table
            The source of batches. A Table is split into batches that are
            yielded by an async generator; a RecordBatchStream is used as is.
        max_batch_length : int, optional
            Only used when ``inner`` is a Table, in which case it is passed
            to ``Table.to_batches`` as ``max_chunksize``. It is ignored for a
            RecordBatchStream.

        Attributes
        ----------
        schema : pa.Schema
            The schema of the batches produced by the stream, taken from
            ``inner.schema`` at construction time.
            Accessing the schema does not consume any data from the stream

        Raises
        ------
        TypeError
            If ``inner`` is neither a RecordBatchStream nor a Table.
        """
        if not isinstance(inner, (pa.Table, RecordBatchStream)):
            raise TypeError("inner must be a RecordBatchStream or a Table")

        if isinstance(inner, pa.Table):
            # Expose the Table's batches through an async generator so both
            # kinds of source are driven the same way by __anext__.
            source = self._async_iter_from_table(inner, max_batch_length)
        else:
            source = inner

        self._inner = source
        self.schema: pa.Schema = inner.schema

    async def read_all(self) -> List[pa.RecordBatch]:
        """
        Drain the stream and return every record batch in a list.

        The stream is fully consumed by this call. All batches are held in
        memory at once, so a large result may consume a lot of memory.
        """
        collected: List[pa.RecordBatch] = []
        async for record_batch in self:
            collected.append(record_batch)
        return collected

    def __aiter__(self):
        """Return this reader itself; it is its own async iterator."""
        return self

    async def __anext__(self) -> pa.RecordBatch:
        """
        Return the next batch by delegating to the wrapped source.

        Whatever the source's ``__anext__`` raises (including its
        end-of-stream signal) propagates to the caller unchanged.
        """
        next_batch = await self._inner.__anext__()
        return next_batch

    @staticmethod
    async def _async_iter_from_table(
        table: pa.Table, max_batch_length: Optional[int] = None
    ):
        """
        Async generator yielding the record batches of a Table.

        This lets an in-memory Table be iterated over asynchronously.
        ``max_batch_length`` is forwarded to ``Table.to_batches`` as
        ``max_chunksize``.
        """
        for record_batch in table.to_batches(max_chunksize=max_batch_length):
            yield record_batch