|
1 | | -#include <iostream> |
2 | 1 | #include <algorithm> |
3 | 2 | #include <chrono> |
4 | | -#include <vector> |
| 3 | +#include <iostream> |
5 | 4 | #include <map> |
| 5 | +#include <vector> |
6 | 6 |
|
7 | 7 | #include "simdjson.h" |
8 | 8 |
|
9 | | -#define NB_ITERATION 5 |
10 | | -#define MIN_BATCH_SIZE 200000 |
| 9 | +#define NB_ITERATION 20 |
| 10 | +#define MIN_BATCH_SIZE 10000 |
11 | 11 | #define MAX_BATCH_SIZE 10000000 |
12 | 12 |
|
13 | 13 | bool test_baseline = false; |
14 | 14 | bool test_per_batch = true; |
15 | | -bool test_best_batch = true; |
| 15 | +bool test_best_batch = false; |
16 | 16 |
|
// Orders (batch_size, best_speed_GBps) pairs by DESCENDING speed, so that
// std::min_element over batch_size_res yields the fastest batch size.
// Takes the pairs by const reference to avoid copying them on every
// comparison (clang-tidy: performance-unnecessary-value-param).
bool compare(const std::pair<size_t, double> &i,
             const std::pair<size_t, double> &j) {
  return i.second > j.second;
}
20 | 20 |
|
21 | | -int main (int argc, char *argv[]){ |
22 | | - |
23 | | - if (argc <= 1) { |
24 | | - std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl; |
25 | | - exit(1); |
26 | | - } |
27 | | - const char *filename = argv[1]; |
28 | | - auto [p, err] = simdjson::padded_string::load(filename); |
29 | | - if (err) { |
30 | | - std::cerr << "Could not load the file " << filename << std::endl; |
| 21 | +int main(int argc, char *argv[]) { |
| 22 | + |
| 23 | + if (argc <= 1) { |
| 24 | + std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl; |
| 25 | + exit(1); |
| 26 | + } |
| 27 | + const char *filename = argv[1]; |
| 28 | + auto[p, err] = simdjson::padded_string::load(filename); |
| 29 | + if (err) { |
| 30 | + std::cerr << "Could not load the file " << filename << std::endl; |
| 31 | + return EXIT_FAILURE; |
| 32 | + } |
| 33 | + if (test_baseline) { |
| 34 | + std::wclog << "Baseline: Getline + normal parse... " << std::endl; |
| 35 | + std::cout << "Gigabytes/second\t" |
| 36 | + << "Nb of documents parsed" << std::endl; |
| 37 | + for (auto i = 0; i < 3; i++) { |
| 38 | + // Actual test |
| 39 | + simdjson::dom::parser parser; |
| 40 | + simdjson::error_code alloc_error = parser.allocate(p.size()); |
| 41 | + if (alloc_error) { |
| 42 | + std::cerr << alloc_error << std::endl; |
31 | 43 | return EXIT_FAILURE; |
| 44 | + } |
| 45 | + std::istringstream ss(std::string(p.data(), p.size())); |
| 46 | + |
| 47 | + auto start = std::chrono::steady_clock::now(); |
| 48 | + int count = 0; |
| 49 | + std::string line; |
| 50 | + int parse_res = simdjson::SUCCESS; |
| 51 | + while (getline(ss, line)) { |
| 52 | + // TODO we're likely triggering simdjson's padding reallocation here. Is |
| 53 | + // that intentional? |
| 54 | + parser.parse(line); |
| 55 | + count++; |
| 56 | + } |
| 57 | + |
| 58 | + auto end = std::chrono::steady_clock::now(); |
| 59 | + |
| 60 | + std::chrono::duration<double> secs = end - start; |
| 61 | + double speedinGBs = static_cast<double>(p.size()) / |
| 62 | + (static_cast<double>(secs.count()) * 1000000000.0); |
| 63 | + std::cout << speedinGBs << "\t\t\t\t" << count << std::endl; |
| 64 | + |
| 65 | + if (parse_res != simdjson::SUCCESS) { |
| 66 | + std::cerr << "Parsing failed" << std::endl; |
| 67 | + exit(1); |
| 68 | + } |
32 | 69 | } |
33 | | - if (test_baseline) { |
34 | | - std::wclog << "Baseline: Getline + normal parse... " << std::endl; |
35 | | - std::cout << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl; |
36 | | - for (auto i = 0; i < 3; i++) { |
37 | | - //Actual test |
38 | | - simdjson::dom::parser parser; |
39 | | - simdjson::error_code alloc_error = parser.allocate(p.size()); |
40 | | - if (alloc_error) { |
41 | | - std::cerr << alloc_error << std::endl; |
42 | | - return EXIT_FAILURE; |
43 | | - } |
44 | | - std::istringstream ss(std::string(p.data(), p.size())); |
45 | | - |
46 | | - auto start = std::chrono::steady_clock::now(); |
47 | | - int count = 0; |
48 | | - std::string line; |
49 | | - int parse_res = simdjson::SUCCESS; |
50 | | - while (getline(ss, line)) { |
51 | | - // TODO we're likely triggering simdjson's padding reallocation here. Is that intentional? |
52 | | - parser.parse(line); |
53 | | - count++; |
54 | | - } |
55 | | - |
56 | | - auto end = std::chrono::steady_clock::now(); |
57 | | - |
58 | | - std::chrono::duration<double> secs = end - start; |
59 | | - double speedinGBs = static_cast<double>(p.size()) / (static_cast<double>(secs.count()) * 1000000000.0); |
60 | | - std::cout << speedinGBs << "\t\t\t\t" << count << std::endl; |
61 | | - |
62 | | - if (parse_res != simdjson::SUCCESS) { |
63 | | - std::cerr << "Parsing failed" << std::endl; |
64 | | - exit(1); |
65 | | - } |
66 | | - } |
67 | | - } |
68 | | - |
69 | | - std::map<size_t, double> batch_size_res; |
70 | | - if(test_per_batch) { |
71 | | - std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE |
72 | | - << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl; |
73 | | - std::cout << "Batch Size\t" << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl; |
74 | | - for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE; i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 50) { |
75 | | - batch_size_res.insert(std::pair<size_t, double>(i, 0)); |
76 | | - int count; |
77 | | - for (size_t j = 0; j < 5; j++) { |
78 | | - //Actual test |
79 | | - simdjson::dom::parser parser; |
80 | | - simdjson::error_code error; |
81 | | - |
82 | | - auto start = std::chrono::steady_clock::now(); |
83 | | - count = 0; |
84 | | - for (auto result : parser.parse_many(p, 4000000)) { |
85 | | - error = result.error(); |
86 | | - count++; |
87 | | - } |
88 | | - auto end = std::chrono::steady_clock::now(); |
89 | | - |
90 | | - std::chrono::duration<double> secs = end - start; |
91 | | - double speedinGBs = static_cast<double>(p.size()) / (static_cast<double>(secs.count()) * 1000000000.0); |
92 | | - if (speedinGBs > batch_size_res.at(i)) |
93 | | - batch_size_res[i] = speedinGBs; |
94 | | - |
95 | | - if (error != simdjson::SUCCESS) { |
96 | | - std::wcerr << "Parsing failed with: " << error << std::endl; |
97 | | - exit(1); |
98 | | - } |
99 | | - } |
100 | | - std::cout << i << "\t\t" << std::fixed << std::setprecision(3) << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl; |
101 | | - |
| 70 | + } |
| 71 | + |
| 72 | + std::map<size_t, double> batch_size_res; |
| 73 | + if (test_per_batch) { |
| 74 | + std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE |
| 75 | + << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl; |
| 76 | + std::cout << "Batch Size\t" |
| 77 | + << "Gigabytes/second\t" |
| 78 | + << "Nb of documents parsed" << std::endl; |
| 79 | + for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE; |
| 80 | + i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 100) { |
| 81 | + batch_size_res.insert(std::pair<size_t, double>(i, 0)); |
| 82 | + int count; |
| 83 | + for (size_t j = 0; j < 5; j++) { |
| 84 | + // Actual test |
| 85 | + simdjson::dom::parser parser; |
| 86 | + simdjson::error_code error; |
| 87 | + |
| 88 | + auto start = std::chrono::steady_clock::now(); |
| 89 | + count = 0; |
| 90 | + for (auto result : parser.parse_many(p, i)) { |
| 91 | + error = result.error(); |
| 92 | + if (error != simdjson::SUCCESS) { |
| 93 | + std::wcerr << "Parsing failed with: " << error_message(error) << std::endl; |
| 94 | + exit(1); |
| 95 | + } |
| 96 | + count++; |
102 | 97 | } |
| 98 | + auto end = std::chrono::steady_clock::now(); |
| 99 | + |
| 100 | + std::chrono::duration<double> secs = end - start; |
| 101 | + double speedinGBs = static_cast<double>(p.size()) / |
| 102 | + (static_cast<double>(secs.count()) * 1000000000.0); |
| 103 | + if (speedinGBs > batch_size_res.at(i)) |
| 104 | + batch_size_res[i] = speedinGBs; |
| 105 | + } |
| 106 | + std::cout << i << "\t\t" << std::fixed << std::setprecision(3) |
| 107 | + << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl; |
103 | 108 | } |
104 | | - |
105 | | - if (test_best_batch) { |
106 | | - size_t optimal_batch_size; |
107 | | - if (test_per_batch) { |
108 | | - optimal_batch_size = (*min_element(batch_size_res.begin(), batch_size_res.end(), compare)).first; |
109 | | - } else { |
110 | | - optimal_batch_size = MIN_BATCH_SIZE; |
| 109 | + } |
| 110 | + size_t optimal_batch_size{}; |
| 111 | + double best_speed{}; |
| 112 | + if (test_per_batch) { |
| 113 | + std::pair<size_t, double> best_results; |
| 114 | + best_results = |
| 115 | + (*min_element(batch_size_res.begin(), batch_size_res.end(), compare)); |
| 116 | + optimal_batch_size = best_results.first; |
| 117 | + best_speed = best_results.second; |
| 118 | + } else { |
| 119 | + optimal_batch_size = MIN_BATCH_SIZE; |
| 120 | + } |
| 121 | + std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..." |
| 122 | + << std::endl; |
| 123 | + std::wclog << "Best speed: " << best_speed << "..." << std::endl; |
| 124 | + |
| 125 | + if (test_best_batch) { |
| 126 | + std::wclog << "Starting speed test... Best of " << NB_ITERATION |
| 127 | + << " iterations..." << std::endl; |
| 128 | + std::vector<double> res; |
| 129 | + for (int i = 0; i < NB_ITERATION; i++) { |
| 130 | + |
| 131 | + // Actual test |
| 132 | + simdjson::dom::parser parser; |
| 133 | + simdjson::error_code error; |
| 134 | + |
| 135 | + auto start = std::chrono::steady_clock::now(); |
| 136 | + // This includes allocation of the parser |
| 137 | + for (auto result : parser.parse_many(p, optimal_batch_size)) { |
| 138 | + error = result.error(); |
| 139 | + if (error != simdjson::SUCCESS) { |
| 140 | + std::wcerr << "Parsing failed with: " << error_message(error) << std::endl; |
| 141 | + exit(1); |
111 | 142 | } |
112 | | - std::wclog << "Starting speed test... Best of " << NB_ITERATION << " iterations..." << std::endl; |
113 | | - std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..." << std::endl; |
114 | | - std::vector<double> res; |
115 | | - for (int i = 0; i < NB_ITERATION; i++) { |
116 | | - |
117 | | - // Actual test |
118 | | - simdjson::dom::parser parser; |
119 | | - simdjson::error_code error; |
120 | | - |
121 | | - auto start = std::chrono::steady_clock::now(); |
122 | | - // TODO this includes allocation of the parser; is that intentional? |
123 | | - for (auto result : parser.parse_many(p, 4000000)) { |
124 | | - error = result.error(); |
125 | | - } |
126 | | - auto end = std::chrono::steady_clock::now(); |
127 | | - |
128 | | - std::chrono::duration<double> secs = end - start; |
129 | | - res.push_back(secs.count()); |
130 | | - |
131 | | - if (error != simdjson::SUCCESS) { |
132 | | - std::wcerr << "Parsing failed with: " << error << std::endl; |
133 | | - exit(1); |
134 | | - } |
135 | | - |
136 | | - } |
137 | | - |
138 | | - double min_result = *min_element(res.begin(), res.end()); |
139 | | - double speedinGBs = static_cast<double>(p.size()) / (min_result * 1000000000.0); |
140 | | - |
| 143 | + } |
| 144 | + auto end = std::chrono::steady_clock::now(); |
141 | 145 |
|
142 | | - std::cout << "Min: " << min_result << " bytes read: " << p.size() |
143 | | - << " Gigabytes/second: " << speedinGBs << std::endl; |
| 146 | + std::chrono::duration<double> secs = end - start; |
| 147 | + res.push_back(secs.count()); |
144 | 148 | } |
145 | 149 |
|
146 | | - return 0; |
| 150 | + double min_result = *min_element(res.begin(), res.end()); |
| 151 | + double speedinGBs = |
| 152 | + static_cast<double>(p.size()) / (min_result * 1000000000.0); |
| 153 | + |
| 154 | + std::cout << "Min: " << min_result << " bytes read: " << p.size() |
| 155 | + << " Gigabytes/second: " << speedinGBs << std::endl; |
| 156 | + } |
| 157 | +#ifdef SIMDJSON_THREADS_ENABLED |
| 158 | + // Multithreading probably does not help matters for small files (less than 10 |
| 159 | + // MB). |
| 160 | + if (p.size() < 10000000) { |
| 161 | + std::cout << std::endl; |
| 162 | + |
| 163 | + std::cout << "Warning: your file is small and the performance results are " |
| 164 | + "probably meaningless" |
| 165 | + << std::endl; |
| 166 | + std::cout << "as far as multithreaded performance goes." << std::endl; |
| 167 | + |
| 168 | + std::cout << std::endl; |
| 169 | + |
| 170 | + std::cout |
| 171 | + << "Try to concatenate the file with itself to generate a large one." |
| 172 | + << std::endl; |
| 173 | + std::cout << "In bash: " << std::endl; |
| 174 | + std::cout << "for i in {1..1000}; do cat '" << filename |
| 175 | + << "' >> bar.ndjson; done" << std::endl; |
| 176 | + std::cout << argv[0] << " bar.ndjson" << std::endl; |
| 177 | + } |
| 178 | +#endif |
| 179 | + |
| 180 | + return 0; |
147 | 181 | } |
0 commit comments