
Commit 4dfbf98

Authored by Daniel Lemire (lemire), with John Keiser (jkeiser)
Using a worker instead of a thread per batch (simdjson#920)
In the parse_many function, we have one thread doing stage 1 while the main thread does stage 2. So if stage 1 and stage 2 each took half the time, parse_many could run at twice the speed; it is unlikely to do so in practice, but we still see benefits of about 40% from threading. To achieve this interleaving, we load the data in batches (blocks) of some size.

In the current code (master), we create a new thread for each batch. Thread creation is expensive, so this approach only pays off over sizeable batches. This PR improves things and makes parse_many faster when using small batches. It also fixes our parse_stream benchmark, which was simply broken.

This replaces the one-thread-per-batch routine with a worker object that reuses the same thread. In benchmarks, this allows us to reach the same maximal speed, but with smaller processing blocks. It does not help much with larger blocks, where the cost of thread creation is amortized efficiently.

This PR makes parse_many beneficial over small datasets and makes us less dependent on thread-creation time. Unfortunately, it is difficult to say anything definitive in general: the cost of creating a thread varies widely depending on the OS. On some systems it might be cheap, on others very expensive. The new code should depend less drastically on the performance of the underlying system, since we create just one thread.

Co-authored-by: John Keiser <john@johnkeiser.com>
Co-authored-by: Daniel Lemire <lemire@gmai.com>
1 parent 1febf2e commit 4dfbf98

File tree

8 files changed: +336 −146 lines changed


benchmark/parse_stream.cpp

(file mode changed from 100755 to 100644)
Lines changed: 159 additions & 125 deletions
@@ -1,147 +1,181 @@
-#include <iostream>
 #include <algorithm>
 #include <chrono>
-#include <vector>
+#include <iostream>
 #include <map>
+#include <vector>
 
 #include "simdjson.h"
 
-#define NB_ITERATION 5
-#define MIN_BATCH_SIZE 200000
+#define NB_ITERATION 20
+#define MIN_BATCH_SIZE 10000
 #define MAX_BATCH_SIZE 10000000
 
 bool test_baseline = false;
 bool test_per_batch = true;
-bool test_best_batch = true;
+bool test_best_batch = false;
 
-bool compare(std::pair<size_t, double> i, std::pair<size_t, double> j){
-  return i.second > j.second;
+bool compare(std::pair<size_t, double> i, std::pair<size_t, double> j) {
+  return i.second > j.second;
 }
 
-int main (int argc, char *argv[]){
-
-  if (argc <= 1) {
-    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
-    exit(1);
-  }
-  const char *filename = argv[1];
-  auto [p, err] = simdjson::padded_string::load(filename);
-  if (err) {
-    std::cerr << "Could not load the file " << filename << std::endl;
+int main(int argc, char *argv[]) {
+
+  if (argc <= 1) {
+    std::cerr << "Usage: " << argv[0] << " <jsonfile>" << std::endl;
+    exit(1);
+  }
+  const char *filename = argv[1];
+  auto[p, err] = simdjson::padded_string::load(filename);
+  if (err) {
+    std::cerr << "Could not load the file " << filename << std::endl;
+    return EXIT_FAILURE;
+  }
+  if (test_baseline) {
+    std::wclog << "Baseline: Getline + normal parse... " << std::endl;
+    std::cout << "Gigabytes/second\t"
+              << "Nb of documents parsed" << std::endl;
+    for (auto i = 0; i < 3; i++) {
+      // Actual test
+      simdjson::dom::parser parser;
+      simdjson::error_code alloc_error = parser.allocate(p.size());
+      if (alloc_error) {
+        std::cerr << alloc_error << std::endl;
         return EXIT_FAILURE;
+      }
+      std::istringstream ss(std::string(p.data(), p.size()));
+
+      auto start = std::chrono::steady_clock::now();
+      int count = 0;
+      std::string line;
+      int parse_res = simdjson::SUCCESS;
+      while (getline(ss, line)) {
+        // TODO we're likely triggering simdjson's padding reallocation here. Is
+        // that intentional?
+        parser.parse(line);
+        count++;
+      }
+
+      auto end = std::chrono::steady_clock::now();
+
+      std::chrono::duration<double> secs = end - start;
+      double speedinGBs = static_cast<double>(p.size()) /
+                          (static_cast<double>(secs.count()) * 1000000000.0);
+      std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
+
+      if (parse_res != simdjson::SUCCESS) {
+        std::cerr << "Parsing failed" << std::endl;
+        exit(1);
+      }
     }
-  if (test_baseline) {
-    std::wclog << "Baseline: Getline + normal parse... " << std::endl;
-    std::cout << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
-    for (auto i = 0; i < 3; i++) {
-      //Actual test
-      simdjson::dom::parser parser;
-      simdjson::error_code alloc_error = parser.allocate(p.size());
-      if (alloc_error) {
-        std::cerr << alloc_error << std::endl;
-        return EXIT_FAILURE;
-      }
-      std::istringstream ss(std::string(p.data(), p.size()));
-
-      auto start = std::chrono::steady_clock::now();
-      int count = 0;
-      std::string line;
-      int parse_res = simdjson::SUCCESS;
-      while (getline(ss, line)) {
-        // TODO we're likely triggering simdjson's padding reallocation here. Is that intentional?
-        parser.parse(line);
-        count++;
-      }
-
-      auto end = std::chrono::steady_clock::now();
-
-      std::chrono::duration<double> secs = end - start;
-      double speedinGBs = static_cast<double>(p.size()) / (static_cast<double>(secs.count()) * 1000000000.0);
-      std::cout << speedinGBs << "\t\t\t\t" << count << std::endl;
-
-      if (parse_res != simdjson::SUCCESS) {
-        std::cerr << "Parsing failed" << std::endl;
-        exit(1);
-      }
-    }
-  }
-
-  std::map<size_t, double> batch_size_res;
-  if(test_per_batch) {
-    std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
-               << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
-    std::cout << "Batch Size\t" << "Gigabytes/second\t" << "Nb of documents parsed" << std::endl;
-    for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE; i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 50) {
-      batch_size_res.insert(std::pair<size_t, double>(i, 0));
-      int count;
-      for (size_t j = 0; j < 5; j++) {
-        //Actual test
-        simdjson::dom::parser parser;
-        simdjson::error_code error;
-
-        auto start = std::chrono::steady_clock::now();
-        count = 0;
-        for (auto result : parser.parse_many(p, 4000000)) {
-          error = result.error();
-          count++;
-        }
-        auto end = std::chrono::steady_clock::now();
-
-        std::chrono::duration<double> secs = end - start;
-        double speedinGBs = static_cast<double>(p.size()) / (static_cast<double>(secs.count()) * 1000000000.0);
-        if (speedinGBs > batch_size_res.at(i))
-          batch_size_res[i] = speedinGBs;
-
-        if (error != simdjson::SUCCESS) {
-          std::wcerr << "Parsing failed with: " << error << std::endl;
-          exit(1);
-        }
-      }
-      std::cout << i << "\t\t" << std::fixed << std::setprecision(3) << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
-
+  }
+
+  std::map<size_t, double> batch_size_res;
+  if (test_per_batch) {
+    std::wclog << "parse_many: Speed per batch_size... from " << MIN_BATCH_SIZE
+               << " bytes to " << MAX_BATCH_SIZE << " bytes..." << std::endl;
+    std::cout << "Batch Size\t"
+              << "Gigabytes/second\t"
+              << "Nb of documents parsed" << std::endl;
+    for (size_t i = MIN_BATCH_SIZE; i <= MAX_BATCH_SIZE;
+         i += (MAX_BATCH_SIZE - MIN_BATCH_SIZE) / 100) {
+      batch_size_res.insert(std::pair<size_t, double>(i, 0));
+      int count;
+      for (size_t j = 0; j < 5; j++) {
+        // Actual test
+        simdjson::dom::parser parser;
+        simdjson::error_code error;
+
+        auto start = std::chrono::steady_clock::now();
+        count = 0;
+        for (auto result : parser.parse_many(p, i)) {
+          error = result.error();
+          if (error != simdjson::SUCCESS) {
+            std::wcerr << "Parsing failed with: " << error_message(error) << std::endl;
+            exit(1);
+          }
+          count++;
         }
+        auto end = std::chrono::steady_clock::now();
+
+        std::chrono::duration<double> secs = end - start;
+        double speedinGBs = static_cast<double>(p.size()) /
+                            (static_cast<double>(secs.count()) * 1000000000.0);
+        if (speedinGBs > batch_size_res.at(i))
+          batch_size_res[i] = speedinGBs;
+      }
+      std::cout << i << "\t\t" << std::fixed << std::setprecision(3)
+                << batch_size_res.at(i) << "\t\t\t\t" << count << std::endl;
     }
-
-  if (test_best_batch) {
-    size_t optimal_batch_size;
-    if (test_per_batch) {
-      optimal_batch_size = (*min_element(batch_size_res.begin(), batch_size_res.end(), compare)).first;
-    } else {
-      optimal_batch_size = MIN_BATCH_SIZE;
+  }
+  size_t optimal_batch_size{};
+  double best_speed{};
+  if (test_per_batch) {
+    std::pair<size_t, double> best_results;
+    best_results =
+        (*min_element(batch_size_res.begin(), batch_size_res.end(), compare));
+    optimal_batch_size = best_results.first;
+    best_speed = best_results.second;
+  } else {
+    optimal_batch_size = MIN_BATCH_SIZE;
+  }
+  std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..."
+             << std::endl;
+  std::wclog << "Best speed: " << best_speed << "..." << std::endl;
+
+  if (test_best_batch) {
+    std::wclog << "Starting speed test... Best of " << NB_ITERATION
+               << " iterations..." << std::endl;
+    std::vector<double> res;
+    for (int i = 0; i < NB_ITERATION; i++) {
+
+      // Actual test
+      simdjson::dom::parser parser;
+      simdjson::error_code error;
+
+      auto start = std::chrono::steady_clock::now();
+      // This includes allocation of the parser
+      for (auto result : parser.parse_many(p, optimal_batch_size)) {
+        error = result.error();
+        if (error != simdjson::SUCCESS) {
+          std::wcerr << "Parsing failed with: " << error_message(error) << std::endl;
+          exit(1);
         }
-    std::wclog << "Starting speed test... Best of " << NB_ITERATION << " iterations..." << std::endl;
-    std::wclog << "Seemingly optimal batch_size: " << optimal_batch_size << "..." << std::endl;
-    std::vector<double> res;
-    for (int i = 0; i < NB_ITERATION; i++) {
-
-      // Actual test
-      simdjson::dom::parser parser;
-      simdjson::error_code error;
-
-      auto start = std::chrono::steady_clock::now();
-      // TODO this includes allocation of the parser; is that intentional?
-      for (auto result : parser.parse_many(p, 4000000)) {
-        error = result.error();
-      }
-      auto end = std::chrono::steady_clock::now();
-
-      std::chrono::duration<double> secs = end - start;
-      res.push_back(secs.count());
-
-      if (error != simdjson::SUCCESS) {
-        std::wcerr << "Parsing failed with: " << error << std::endl;
-        exit(1);
-      }
-
-    }
-
-    double min_result = *min_element(res.begin(), res.end());
-    double speedinGBs = static_cast<double>(p.size()) / (min_result * 1000000000.0);
-
+      }
+      auto end = std::chrono::steady_clock::now();
 
-    std::cout << "Min: " << min_result << " bytes read: " << p.size()
-              << " Gigabytes/second: " << speedinGBs << std::endl;
+      std::chrono::duration<double> secs = end - start;
+      res.push_back(secs.count());
     }
 
-  return 0;
+    double min_result = *min_element(res.begin(), res.end());
+    double speedinGBs =
+        static_cast<double>(p.size()) / (min_result * 1000000000.0);
+
+    std::cout << "Min: " << min_result << " bytes read: " << p.size()
+              << " Gigabytes/second: " << speedinGBs << std::endl;
+  }
+#ifdef SIMDJSON_THREADS_ENABLED
+  // Multithreading probably does not help matters for small files (less than 10
+  // MB).
+  if (p.size() < 10000000) {
+    std::cout << std::endl;
+
+    std::cout << "Warning: your file is small and the performance results are "
+                 "probably meaningless"
+              << std::endl;
+    std::cout << "as far as multithreaded performance goes." << std::endl;
+
+    std::cout << std::endl;
+
+    std::cout
+        << "Try to concatenate the file with itself to generate a large one."
+        << std::endl;
+    std::cout << "In bash: " << std::endl;
+    std::cout << "for i in {1..1000}; do cat '" << filename
+              << "' >> bar.ndjson; done" << std::endl;
+    std::cout << argv[0] << " bar.ndjson" << std::endl;
+  }
+#endif
+
+  return 0;
 }

doc/basics.md

Lines changed: 3 additions & 1 deletion
@@ -452,7 +452,7 @@ The simdjson library also support multithreaded JSON streaming through a large f
 smaller JSON documents in either [ndjson](http://ndjson.org) or [JSON lines](http://jsonlines.org)
 format. If your JSON documents all contain arrays or objects, we even support direct file
 concatenation without whitespace. The concatenated file has no size restrictions (including larger
-than 4GB), though each individual document must be less than 4GB.
+than 4GB), though each individual document must be no larger than 4 GB.
 
 Here is a simple example, given "x.json" with this content:
 
@@ -472,6 +472,8 @@ for (dom::element doc : parser.load_many(filename)) {
 
 In-memory ndjson strings can be parsed as well, with `parser.parse_many(string)`.
 
+Both `load_many` and `parse_many` take an optional parameter `size_t batch_size` which defines the window processing size. It is set by default to a large value (`1000000` corresponding to 1 MB). None of your JSON documents should exceed this window size, or else you will get the error `simdjson::CAPACITY`. You cannot set this window size larger than 4 GB: you will get the error `simdjson::CAPACITY`. The smaller the window size is, the less memory the function will use. Setting the window size too small (e.g., less than 100 kB) may also impact performance negatively. Leaving it to 1 MB is expected to be a good choice, unless you have some larger documents.
+
 See [parse_many.md](parse_many.md) for detailed information and design.
 
 Thread Safety

include/simdjson/dom/document_stream.h

Lines changed: 54 additions & 2 deletions
@@ -6,11 +6,63 @@
 #include "simdjson/error.h"
 #ifdef SIMDJSON_THREADS_ENABLED
 #include <thread>
+#include <mutex>
+#include <condition_variable>
 #endif
 
 namespace simdjson {
 namespace dom {
 
+
+#ifdef SIMDJSON_THREADS_ENABLED
+struct stage1_worker {
+  stage1_worker() noexcept = default;
+  stage1_worker(const stage1_worker&) = delete;
+  stage1_worker(stage1_worker&&) = delete;
+  stage1_worker operator=(const stage1_worker&) = delete;
+  ~stage1_worker();
+  /**
+   * We only start the thread when it is needed, not at object construction, this may throw.
+   * You should only call this once.
+   **/
+  void start_thread();
+  /**
+   * Start a stage 1 job. You should first call 'run', then 'finish'.
+   * You must call start_thread once before.
+   */
+  void run(document_stream * ds, dom::parser * stage1, size_t next_batch_start);
+  /** Wait for the run to finish (blocking). You should first call 'run', then 'finish'. **/
+  void finish();
+
+private:
+
+  /**
+   * Normally, we would never stop the thread. But we do in the destructor.
+   * This function is only safe assuming that you are not waiting for results. You
+   * should have called run, then finish, and be done.
+   **/
+  void stop_thread();
+
+  std::thread thread{};
+  /** These three variables define the work done by the thread. **/
+  dom::parser * stage1_thread_parser{};
+  size_t _next_batch_start{};
+  document_stream * owner{};
+  /**
+   * We have two state variables. This could be streamlined to one variable in the future but
+   * we use two for clarity.
+   */
+  bool has_work{false};
+  bool can_work{true};
+
+  /**
+   * We lock using a mutex.
+   */
+  std::mutex locking_mutex{};
+  std::condition_variable cond_var{};
+};
+#endif
+
 /**
  * A forward-only stream of documents.
  *
@@ -142,8 +194,8 @@ class document_stream {
   /** The error returned from the stage 1 thread. */
   error_code stage1_thread_error{UNINITIALIZED};
   /** The thread used to run stage 1 against the next batch in the background. */
-  std::thread stage1_thread{};
-
+  friend struct stage1_worker;
+  std::unique_ptr<stage1_worker> worker{new(std::nothrow) stage1_worker()};
   /**
    * The parser used to run stage 1 in the background. Will be swapped
    * with the regular parser when finished.
