forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsort_df.py
More file actions
204 lines (156 loc) · 6.7 KB
/
Copy pathsort_df.py
File metadata and controls
204 lines (156 loc) · 6.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from multiprocessing import Pool
import numpy as np
import os
import pandas as pd
import pyarrow as pa
import pyarrow.plasma as plasma
import subprocess
import time
import multimerge
# To run this example, you will first need to run "python setup.py install" in
# this directory to build the Cython module.
#
# You will only see speedups if you run this code on more data; this is just a
# small example that can run on a laptop.
#
# The values we used to get a speedup (on an m4.10xlarge instance on EC2) were
# object_store_size = 84 * 10 ** 9
# num_cores = 20
# num_rows = 10 ** 9
# num_cols = 1
# Plasma client handle for the current process; assigned by connect().
client = None

# Capacity requested from the plasma store, in bytes.
object_store_size = 2 * 1000 ** 3  # 2 GB

# Number of worker processes, which is also the number of partitions the
# DataFrame is split into.
num_cores = 8

# Shape of the randomly generated DataFrame.
num_rows = 200000
num_cols = 2

# Columns are named "0", "1", ...; the sort key is the first column.
column_names = list(map(str, range(num_cols)))
column_to_sort = column_names[0]
# Connect to clients
def connect():
    """Connect this process to the plasma store and reseed NumPy's RNG.

    The client is stored in the module-level ``client`` global. This
    function is called directly in the main process and is also used as the
    ``Pool`` initializer, so every worker process runs it once.
    """
    global client
    client = plasma.connect('/tmp/store', '', 0)
    # Object IDs are generated from NumPy's global RNG, so each process needs
    # its own random stream. A clock-derived seed can be identical in two
    # workers that start within the same clock tick; calling seed() with no
    # argument reseeds from fresh OS entropy instead.
    np.random.seed()
def put_df(df):
    """Serialize a pandas DataFrame into the plasma object store.

    The DataFrame is converted to an Arrow record batch and written in the
    Arrow streaming format into a newly created plasma buffer, which is then
    sealed.

    Args:
        df: The pandas DataFrame to store.

    Returns:
        The ``plasma.ObjectID`` under which the serialized DataFrame was
        stored.
    """
    record_batch = pa.RecordBatch.from_pandas(df)

    # Dry run against a mock sink: nothing is stored, we only learn how many
    # bytes the complete stream (schema + batch + end-of-stream) occupies.
    mock_sink = pa.MockOutputStream()
    stream_writer = pa.RecordBatchStreamWriter(mock_sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    # Close the writer so the stream is finalized before measuring it.
    stream_writer.close()
    data_size = mock_sink.size()

    # Generate an ID and allocate a buffer in the object store for the
    # serialized DataFrame.
    object_id = plasma.ObjectID(np.random.bytes(20))
    buf = client.create(object_id, data_size)

    # Write the serialized DataFrame to the object store. The writer is
    # closed the same way as in the dry run so the byte count matches.
    sink = pa.FixedSizeBufferWriter(buf)
    stream_writer = pa.RecordBatchStreamWriter(sink, record_batch.schema)
    stream_writer.write_batch(record_batch)
    stream_writer.close()

    # Seal the object.
    client.seal(object_id)
    return object_id
def get_dfs(object_ids):
    """Fetch objects from the plasma store and decode them as DataFrames.

    Args:
        object_ids: Object IDs of record-batch streams in the object store.

    Returns:
        A list with one pandas DataFrame per object ID, in the same order.
    """
    frames = []
    for buf in client.get_buffers(object_ids):
        # Each object holds a stream whose first record batch is the data.
        reader = pa.RecordBatchStreamReader(buf)
        batch = reader.read_next_batch()
        frames.append(batch.to_pandas())
    return frames
def local_sort(object_id):
    """Sort one partition of the dataframe and sample its sort keys.

    Args:
        object_id: Object ID of the partition in the object store.

    Returns:
        A tuple ``(sorted_object_id, samples)`` where ``sorted_object_id``
        identifies the sorted partition in the object store and ``samples``
        is a 1-D array of up to ``num_cores`` evenly spaced values taken from
        the sorted ``column_to_sort`` column (empty if the partition is
        empty).
    """
    # Get the dataframe from the object store.
    [df] = get_dfs([object_id])

    # Sort the dataframe.
    sorted_df = df.sort_values(by=column_to_sort)

    # Sample from the sort column only. Calling take() on the full 2-D value
    # matrix without an axis indexes the flattened array, which mixes in
    # values from the other columns whenever there is more than one column.
    sort_keys = sorted_df[column_to_sort].values
    if len(sort_keys) == 0:
        # Nothing to sample from an empty partition.
        samples = sort_keys
    else:
        # Get evenly spaced values from the sorted keys.
        indices = np.linspace(0, len(sort_keys) - 1, num=num_cores,
                              dtype=np.int64)
        samples = sort_keys.take(indices)

    # Put the sorted dataframe in the object store and return the
    # corresponding object ID as well as the sampled values.
    return put_df(sorted_df), samples
def local_partitions(object_id_and_pivots):
    """Split a sorted partition at the pivot values and store each piece.

    Args:
        object_id_and_pivots: A pair ``(object_id, pivots)``: the object ID
            of a sorted partition and the pivot values to split it at.

    Returns:
        A list of object IDs, one per piece, in order; there is one more
        piece than there are pivots.
    """
    object_id, pivots = object_id_and_pivots
    [df] = get_dfs([object_id])

    # Row positions at which each pivot would be inserted into the sort
    # column, framed by the start and end of the dataframe.
    cuts = df[column_to_sort].searchsorted(pivots)
    bounds = [0]
    bounds.extend(cuts)
    bounds.append(len(df))

    # Store every slice between consecutive boundaries in the object store.
    piece_ids = []
    for start, stop in zip(bounds[:-1], bounds[1:]):
        piece_ids.append(put_df(df[start:stop]))
    return piece_ids
def merge(object_ids):
    """Merge a number of sorted dataframes into a single sorted dataframe.

    Args:
        object_ids: Object IDs of the sorted dataframes to merge.

    Returns:
        The object ID of the merged dataframe in the object store, or None
        if every input dataframe is empty.
    """
    dfs = get_dfs(object_ids)

    # In order to use our multimerge code, we have to convert the arrays from
    # the Fortran format to the C format. ``.values`` is used instead of
    # ``as_matrix()``, which was deprecated and later removed from pandas.
    arrays = [np.ascontiguousarray(df.values) for df in dfs]
    for a in arrays:
        # Sanity checks on what is handed to the merge routine.
        assert a.dtype == np.float64
        assert not np.isfortran(a)

    # Filter out empty arrays.
    arrays = [a for a in arrays if a.shape[0] > 0]
    if not arrays:
        return None

    resulting_array = multimerge.multimerge2d(*arrays)
    merged_df = pd.DataFrame(resulting_array, columns=column_names)
    return put_df(merged_df)
if __name__ == '__main__':
    # Start the plasma store.
    p = subprocess.Popen(['plasma_store',
                          '-s', '/tmp/store',
                          '-m', str(object_store_size)])
    pool = None
    try:
        # Connect to the plasma store.
        connect()

        # Connect the processes in the pool.
        pool = Pool(initializer=connect, initargs=(), processes=num_cores)

        # Create a DataFrame from a numpy array.
        df = pd.DataFrame(np.random.randn(num_rows, num_cols),
                          columns=column_names)

        partition_ids = [put_df(partition) for partition
                         in np.split(df, num_cores)]

        # Begin timing the parallel sort example.
        parallel_sort_start = time.time()

        # Sort each partition and subsample them. The subsampled values will
        # be used to create buckets.
        sorted_df_ids, pivot_groups = list(zip(*pool.map(local_sort,
                                                         partition_ids)))

        # Choose the pivots.
        all_pivots = np.concatenate(pivot_groups)
        indices = np.linspace(0, len(all_pivots) - 1, num=num_cores,
                              dtype=np.int64)
        pivots = np.take(np.sort(all_pivots), indices)

        # Break all of the sorted partitions into even smaller partitions.
        # Group the object IDs from each bucket together.
        results = list(zip(*pool.map(local_partitions,
                                     zip(sorted_df_ids,
                                         len(sorted_df_ids) * [pivots]))))

        # Merge each of the buckets and store the results in the object
        # store.
        object_ids = pool.map(merge, results)

        resulting_ids = [object_id for object_id in object_ids
                         if object_id is not None]

        # Stop timing the parallel sort example.
        parallel_sort_end = time.time()

        print('Parallel sort took {} seconds.'
              .format(parallel_sort_end - parallel_sort_start))

        serial_sort_start = time.time()

        original_sorted_df = df.sort_values(by=column_to_sort)

        serial_sort_end = time.time()

        # Check that we sorted the DataFrame properly.
        sorted_dfs = get_dfs(resulting_ids)
        sorted_df = pd.concat(sorted_dfs)

        print('Serial sort took {} seconds.'
              .format(serial_sort_end - serial_sort_start))

        assert np.allclose(sorted_df.values, original_sorted_df.values)
    finally:
        # Always release the worker processes and the object store, even if
        # a step above raised (including the correctness assertion).
        if pool is not None:
            pool.terminate()
            pool.join()
        # Kill the object store.
        p.kill()
        p.wait()