-
Notifications
You must be signed in to change notification settings - Fork 244
Expand file tree
/
Copy pathtest_map.py
More file actions
137 lines (106 loc) · 3.94 KB
/
Copy pathtest_map.py
File metadata and controls
137 lines (106 loc) · 3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from time import time
import numpy as np
import pytest
from docarray import BaseDoc, DocArray
from docarray.documents import ImageDoc
from docarray.typing import NdArray
from docarray.utils.map import map_docs, map_docs_batch
from tests.units.typing.test_bytes import IMAGE_PATHS
# Module-level pytest marks: every test in this file is tagged as both
# `benchmark` and `slow`, so it can be selected or deselected with `-m`.
pytestmark = [pytest.mark.benchmark, pytest.mark.slow]
class MyMatrix(BaseDoc):
    # Document schema used by the CPU-bound benchmarks in this module.
    # `matrix` holds the array that `cpu_intensive` repeatedly transforms.
    matrix: NdArray
def cpu_intensive(doc: MyMatrix) -> MyMatrix:
    """Burn CPU time by repeatedly taking the element-wise square root.

    The document's ``matrix`` is replaced by ``np.sqrt(matrix)`` 3000 times
    in a row; the same (mutated) document is returned.
    """
    for _ in range(3000):
        doc.matrix = np.sqrt(doc.matrix)
    return doc
def test_map_docs_multiprocessing():
    """Check that `map_docs` with the process backend speeds up with 2 workers.

    Maps `cpu_intensive` over five 1000x1000 random matrices, once with one
    worker and once with two, and asserts the two-worker run is faster.
    The test is skipped when fewer than two CPUs are available.
    """
    # Imported locally: the module-level import only brings in `time.time`,
    # which follows the (adjustable) system clock and is unsuitable for
    # measuring durations.
    from time import perf_counter

    # `os.cpu_count()` may return None when the count cannot be determined;
    # treat that as a single CPU instead of failing with a TypeError.
    if (os.cpu_count() or 1) < 2:
        pytest.skip('at least two CPUs are required to compare worker counts')

    def time_multiprocessing(num_workers: int) -> float:
        """Return the seconds needed to process a fresh DocArray."""
        n_docs = 5
        # Fixed seed so that both timed runs operate on identical data.
        rng = np.random.RandomState(0)
        matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)]
        da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices])
        start_time = perf_counter()
        # list() exhausts the iterable returned by map_docs so that every
        # document has been processed before the clock is stopped.
        list(
            map_docs(
                da=da, func=cpu_intensive, backend='process', num_worker=num_workers
            )
        )
        return perf_counter() - start_time

    time_1_cpu = time_multiprocessing(num_workers=1)
    time_2_cpu = time_multiprocessing(num_workers=2)

    assert time_2_cpu < time_1_cpu, (
        f'2 workers took {time_2_cpu:.3f}s, 1 worker took {time_1_cpu:.3f}s'
    )
def cpu_intensive_batch(da: DocArray[MyMatrix]) -> DocArray[MyMatrix]:
    """Apply the CPU-heavy square-root loop to every document of a batch.

    For each document in ``da`` the ``matrix`` is replaced by its
    element-wise square root 3000 times; the same batch is returned.
    """
    for item in da:
        for _ in range(3000):
            item.matrix = np.sqrt(item.matrix)
    return da
def test_map_docs_batch_multiprocessing():
    """Check that `map_docs_batch` with the process backend speeds up with 2 workers.

    Maps `cpu_intensive_batch` over sixteen 1000x1000 random matrices in
    batches of eight, once with one worker and once with two, and asserts
    the two-worker run is faster.  The test is skipped when fewer than two
    CPUs are available.
    """
    # Imported locally: the module-level import only brings in `time.time`,
    # which follows the (adjustable) system clock and is unsuitable for
    # measuring durations.
    from time import perf_counter

    # `os.cpu_count()` may return None when the count cannot be determined;
    # treat that as a single CPU instead of failing with a TypeError.
    if (os.cpu_count() or 1) < 2:
        pytest.skip('at least two CPUs are required to compare worker counts')

    def time_multiprocessing(num_workers: int) -> float:
        """Return the seconds needed to process a fresh DocArray in batches."""
        n_docs = 16
        # Fixed seed so that both timed runs operate on identical data.
        rng = np.random.RandomState(0)
        matrices = [rng.random(size=(1000, 1000)) for _ in range(n_docs)]
        da = DocArray[MyMatrix]([MyMatrix(matrix=m) for m in matrices])
        start_time = perf_counter()
        # list() exhausts the iterable returned by map_docs_batch so that
        # every batch has been processed before the clock is stopped.
        list(
            map_docs_batch(
                da=da,
                func=cpu_intensive_batch,
                batch_size=8,
                backend='process',
                num_worker=num_workers,
            )
        )
        return perf_counter() - start_time

    time_1_cpu = time_multiprocessing(num_workers=1)
    time_2_cpu = time_multiprocessing(num_workers=2)

    assert time_2_cpu < time_1_cpu, (
        f'2 workers took {time_2_cpu:.3f}s, 1 worker took {time_1_cpu:.3f}s'
    )
def io_intensive(img: ImageDoc) -> ImageDoc:
    """I/O-bound workload: load the image behind ``img.url`` into ``img.tensor``.

    The same (mutated) document is returned.
    """
    loaded = img.url.load()
    img.tensor = loaded
    return img
def test_map_docs_multithreading():
    """Check that `map_docs` with the thread backend speeds up with 2 workers.

    Maps `io_intensive` over 100 `ImageDoc`s that all point at the PNG test
    image, once with one worker thread and once with two, and asserts the
    two-worker run is faster.
    """
    # Imported locally: the module-level import only brings in `time.time`,
    # which follows the (adjustable) system clock and is unsuitable for
    # measuring durations.
    from time import perf_counter

    def time_multithreading(num_workers: int) -> float:
        """Return the seconds needed to load all images of a fresh DocArray."""
        n_docs = 100
        da = DocArray[ImageDoc](
            [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)]
        )
        start_time = perf_counter()
        # list() exhausts the iterable returned by map_docs so that every
        # document has been processed before the clock is stopped.
        list(
            map_docs(da=da, func=io_intensive, backend='thread', num_worker=num_workers)
        )
        return perf_counter() - start_time

    time_1_thread = time_multithreading(num_workers=1)
    time_2_thread = time_multithreading(num_workers=2)

    assert time_2_thread < time_1_thread, (
        f'2 workers took {time_2_thread:.3f}s, 1 worker took {time_1_thread:.3f}s'
    )
def io_intensive_batch(da: DocArray[ImageDoc]) -> DocArray[ImageDoc]:
    """I/O-bound batch workload: load every document's image into its tensor.

    For each document in ``da`` the image behind ``url`` is loaded and stored
    in ``tensor``; the same batch is returned.
    """
    for image_doc in da:
        loaded = image_doc.url.load()
        image_doc.tensor = loaded
    return da
def test_map_docs_batch_multithreading():
    """Check that `map_docs_batch` with the thread backend speeds up with 2 workers.

    Maps `io_intensive_batch` over 100 `ImageDoc`s that all point at the PNG
    test image in batches of ten, once with one worker thread and once with
    two, and asserts the two-worker run is faster.
    """
    # Imported locally: the module-level import only brings in `time.time`,
    # which follows the (adjustable) system clock and is unsuitable for
    # measuring durations.
    from time import perf_counter

    def time_multithreading_batch(num_workers: int) -> float:
        """Return the seconds needed to load all images batch by batch."""
        n_docs = 100
        da = DocArray[ImageDoc](
            [ImageDoc(url=IMAGE_PATHS['png']) for _ in range(n_docs)]
        )
        start_time = perf_counter()
        # list() exhausts the iterable returned by map_docs_batch so that
        # every batch has been processed before the clock is stopped.
        list(
            map_docs_batch(
                da=da,
                func=io_intensive_batch,
                backend='thread',
                num_worker=num_workers,
                batch_size=10,
            )
        )
        return perf_counter() - start_time

    time_1_thread = time_multithreading_batch(num_workers=1)
    time_2_thread = time_multithreading_batch(num_workers=2)

    assert time_2_thread < time_1_thread, (
        f'2 workers took {time_2_thread:.3f}s, 1 worker took {time_1_thread:.3f}s'
    )