Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
.idea
.gradle
build
profilers
68 changes: 67 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,67 @@
# simdjson-java
# simdjson-java

A Java version of [simdjson](https://github.com/simdjson/simdjson) - a JSON parser using SIMD instructions,
based on the paper [Parsing Gigabytes of JSON per Second](https://arxiv.org/abs/1902.08318)
by Geoff Langdale and Daniel Lemire.

This implementation is still missing several features available in simdjson. For example:

* Support for Unicode characters
* UTF-8 validation
* Full support for parsing floats
* Support for 512-bit vectors

## Code Sample

```java
byte[] json = loadTwitterJson();

SimdJsonParser parser = new SimdJsonParser();
JsonValue jsonValue = parser.parse(json, json.length);
Iterator<JsonValue> tweets = jsonValue.get("statuses").arrayIterator();
while (tweets.hasNext()) {
JsonValue tweet = tweets.next();
JsonValue user = tweet.get("user");
if (user.get("default_profile").asBoolean()) {
System.out.println(user.get("screen_name").asString());
}
}
```

## Benchmarks

To run the JMH benchmarks, execute the following command:

```./gradlew jmh```

## Tests

To run the tests, execute the following command:

```./gradlew test```

## Performance

This section presents a performance comparison of different JSON parsers available as Java libraries. The benchmark used
the [twitter.json](src/jmh/resources/twitter.json) dataset, and its goal was to measure the throughput (ops/s) of parsing
and finding all unique users with a default profile.

**Note that simdjson-java is still missing several features (mentioned in the introduction), so the following results
may not reflect its real performance.**

Environment:
* CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
* OS: Ubuntu 23.04, kernel 6.2.0-23-generic
* Java: OpenJDK 64-Bit Server VM Temurin-20.0.1+9

Library | Version | Throughput (ops/s)
---------------------------------------------------|---------|--------------------
simdjson-java | - | 1450.951
simdjson-java (padded) | - | 1505.227
[jackson](https://github.com/FasterXML/jackson) | 2.15.2 | 504.562
[fastjson2](https://github.com/alibaba/fastjson2) | 2.0.35 | 590.743
[jsoniter](https://github.com/json-iterator/java) | 0.9.23 | 384.664

To reproduce the benchmark results, execute the following command:

```./gradlew jmh -Pjmh.includes='.*ParseAndSelectBenchmark.*'```
60 changes: 59 additions & 1 deletion build.gradle
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import me.champeau.jmh.JmhBytecodeGeneratorTask
import org.gradle.internal.os.OperatingSystem

// Build plugins: the core Java plugin plus the me.champeau JMH plugin (0.7.1),
// whose JmhBytecodeGeneratorTask type is imported at the top of this script.
plugins {
id 'java'
id 'me.champeau.jmh' version '0.7.1'
}

group = 'com.github.piotrrzysko'
version = '1.0-SNAPSHOT'
version = '0.0.1-SNAPSHOT'

repositories {
mavenCentral()
Expand All @@ -21,6 +24,10 @@ ext {
}

dependencies {
jmhImplementation group: 'com.fasterxml.jackson.core', name: 'jackson-databind', version: '2.15.2'
jmhImplementation group: 'com.alibaba.fastjson2', name: 'fastjson2', version: '2.0.35'
jmhImplementation group: 'com.jsoniter', name: 'jsoniter', version: '0.9.23'

testImplementation group: 'org.assertj', name: 'assertj-core', version: '3.24.2'
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-api', version: junitVersion
testImplementation group: 'org.junit.jupiter', name: 'junit-jupiter-params', version: junitVersion
Expand All @@ -29,4 +36,55 @@ dependencies {

// Test task configuration: run tests on the JUnit Platform and pass extra JVM
// arguments to the test JVM -- the incubating jdk.incubator.vector module is
// added explicitly, and the maximum heap size is set to 2 GB.
test {
useJUnitPlatform()
jvmArgs += [
'--add-modules', 'jdk.incubator.vector',
'-Xmx2g'
]
}

// Every JMH bytecode-generator task gets the jdk.incubator.vector module added
// to its JVM arguments. Note that set(...) replaces the argument list rather
// than appending to it.
tasks.withType(JmhBytecodeGeneratorTask).configureEach {
jvmArgs.set(["--add-modules=jdk.incubator.vector"])
}

// Every JavaCompile task in the project is compiled with the incubating
// jdk.incubator.vector module added via --add-modules.
tasks.withType(JavaCompile).configureEach {
options.compilerArgs.add("--add-modules=jdk.incubator.vector")
}

// Adds the jdk.incubator.vector module to the compiler arguments used for the
// test sources.
// NOTE(review): the tasks.withType(JavaCompile) block earlier in this script
// appears to add the same flag to every JavaCompile task already -- confirm
// whether this block is redundant.
compileTestJava {
options.compilerArgs += [
'--add-modules', 'jdk.incubator.vector'
]
}

// JMH plugin configuration.
//   * One fork, 3 warmup iterations and 5 measurement iterations.
//   * The forked JVM gets --add-modules=jdk.incubator.vector prepended to its arguments.
//   * Profilers are attached only when the 'jmh.profilersEnabled' project property
//     resolves to true (see getBooleanProperty below): perf, perfasm and async on
//     Linux, async only on macOS. Nothing is configured for other operating systems.
//     The async profiler is pointed at ./profilers/async as its output directory.
//   * When the 'jmh.includes' project property is present, its value becomes the
//     single entry of 'includes' (e.g. -Pjmh.includes='.*ParseAndSelectBenchmark.*').
// NOTE(review): libPath receives the raw value of LD_LIBRARY_PATH / DYLD_LIBRARY_PATH
// (or java.library.path as a fallback). Confirm against the JMH async-profiler
// documentation whether libPath expects a library file path rather than a search path.
jmh {
fork = 1
warmupIterations = 3
iterations = 5
jvmArgsPrepend = [
'--add-modules=jdk.incubator.vector'
]
if (getBooleanProperty('jmh.profilersEnabled', false)) {
if (OperatingSystem.current().isLinux()) {
profilers = [
'perf',
'perfasm:intelSyntax=true',
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('LD_LIBRARY_PATH')
]
} else if (OperatingSystem.current().isMacOsX()) {
profilers = [
'async:verbose=true;output=flamegraph;event=cpu;dir=./profilers/async;libPath=' + getAsyncProfilerLibPath('DYLD_LIBRARY_PATH')
]
}
}
if (project.hasProperty('jmh.includes')) {
includes = [project.findProperty('jmh.includes')]
}
}

/**
 * Reads a project property and interprets it as a boolean.
 *
 * The property value is used when it is truthy by Groovy rules; a missing or
 * Groovy-falsy value (for example an empty string) falls back to defaultValue.
 * The chosen value is converted to a String and parsed with Boolean.valueOf,
 * so only "true" (case-insensitive) yields true.
 *
 * @param name         name of the project property to look up
 * @param defaultValue value used when the property is absent or falsy
 * @return the parsed boolean
 */
def getBooleanProperty(String name, boolean defaultValue) {
    def configured = project.findProperty(name)
    def effective = configured ? configured : defaultValue
    String asText = effective as String
    return Boolean.valueOf(asText)
}

/**
 * Resolves the value handed to the async profiler's libPath option.
 *
 * @param envVarName name of the environment variable to consult first
 * @return the environment variable's value when it is set and non-empty,
 *         otherwise the 'java.library.path' system property
 */
static def getAsyncProfilerLibPath(String envVarName) {
    def fromEnvironment = System.getenv(envVarName)
    if (fromEnvironment) {
        return fromEnvironment
    }
    return System.getProperty('java.library.path')
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
package com.github.piotrrzysko.simdjson;

import com.alibaba.fastjson2.JSON;
import com.alibaba.fastjson2.JSONObject;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jsoniter.JsonIterator;
import com.jsoniter.any.Any;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import static com.github.piotrrzysko.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark that parses the {@code /twitter.json} resource and counts the
 * distinct {@code screen_name} values of users whose {@code default_profile} flag is true.
 *
 * <p>The same query is implemented once per library (Jackson, fastjson2, jsoniter and
 * simdjson-java) so that the results are directly comparable. Each benchmark method returns
 * the number of unique users it found.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseAndSelectBenchmark {

    /** Classpath location of the dataset used by every benchmark in this class. */
    private static final String DATASET = "/twitter.json";

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();
    private final ObjectMapper objectMapper = new ObjectMapper();

    /** Raw bytes of the dataset. */
    private byte[] buffer;
    /** Copy of {@link #buffer} produced by {@code SimdJsonPaddingUtil.padded}. */
    private byte[] bufferPadded;

    /**
     * Loads the dataset once per trial and prepares its padded copy.
     *
     * @throws IOException if the resource is missing from the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        // Resolve the resource through this class rather than through an unrelated benchmark.
        try (InputStream is = ParseAndSelectBenchmark.class.getResourceAsStream(DATASET)) {
            if (is == null) {
                // getResourceAsStream signals "not found" with null; fail with a clear message
                // instead of a NullPointerException on the next line.
                throw new IOException("Benchmark resource not found on classpath: " + DATASET);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
    }

    /**
     * Runs the query using Jackson's tree model.
     *
     * @return number of unique users with a default profile
     * @throws IOException if Jackson fails to read the document
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jackson() throws IOException {
        JsonNode jacksonJsonNode = objectMapper.readTree(buffer);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonNode> tweets = jacksonJsonNode.get("statuses").elements();
        while (tweets.hasNext()) {
            JsonNode tweet = tweets.next();
            JsonNode user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").textValue());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query using fastjson2.
     *
     * @return number of unique users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_fastjson() {
        JSONObject jsonObject = (JSONObject) JSON.parse(buffer);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<Object> tweets = jsonObject.getJSONArray("statuses").iterator();
        while (tweets.hasNext()) {
            JSONObject tweet = (JSONObject) tweets.next();
            JSONObject user = (JSONObject) tweet.get("user");
            if (user.getBoolean("default_profile")) {
                defaultUsers.add(user.getString("screen_name"));
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query using jsoniter.
     *
     * @return number of unique users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_jsoniter() {
        Any json = JsonIterator.deserialize(buffer);
        Set<String> defaultUsers = new HashSet<>();
        for (Any tweet : json.get("statuses")) {
            Any user = tweet.get("user");
            if (user.get("default_profile").toBoolean()) {
                defaultUsers.add(user.get("screen_name").toString());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query using simdjson-java on the unpadded buffer.
     *
     * @return number of unique users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjson() {
        JsonValue simdJsonValue = simdJsonParser.parse(buffer, buffer.length);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonValue> tweets = simdJsonValue.get("statuses").arrayIterator();
        while (tweets.hasNext()) {
            JsonValue tweet = tweets.next();
            JsonValue user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").asString());
            }
        }
        return defaultUsers.size();
    }

    /**
     * Runs the query using simdjson-java on the padded buffer.
     *
     * @return number of unique users with a default profile
     */
    @Benchmark
    public int countUniqueUsersWithDefaultProfile_simdjsonPadded() {
        // The length passed is that of the original content, not of the padded array.
        JsonValue simdJsonValue = simdJsonParser.parse(bufferPadded, buffer.length);
        Set<String> defaultUsers = new HashSet<>();
        Iterator<JsonValue> tweets = simdJsonValue.get("statuses").arrayIterator();
        while (tweets.hasNext()) {
            JsonValue tweet = tweets.next();
            JsonValue user = tweet.get("user");
            if (user.get("default_profile").asBoolean()) {
                defaultUsers.add(user.get("screen_name").asString());
            }
        }
        return defaultUsers.size();
    }
}
49 changes: 49 additions & 0 deletions src/jmh/java/com/github/piotrrzysko/simdjson/ParseBenchmark.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
package com.github.piotrrzysko.simdjson;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;

import java.io.IOException;
import java.io.InputStream;
import java.util.concurrent.TimeUnit;

import static com.github.piotrrzysko.simdjson.SimdJsonPaddingUtil.padded;

/**
 * JMH throughput benchmark measuring raw simdjson-java parsing of each file listed in
 * {@link #fileName}, once on the unpadded bytes and once on a padded copy.
 */
@State(Scope.Benchmark)
@BenchmarkMode(Mode.Throughput)
@OutputTimeUnit(TimeUnit.SECONDS)
public class ParseBenchmark {

    /** Classpath resource to parse; JMH runs the benchmarks once per listed value. */
    @Param({"/twitter.json" /*, "/gsoc-2018.json - unicode is not supported yet"*/, "/github_events.json"})
    String fileName;

    private final SimdJsonParser simdJsonParser = new SimdJsonParser();

    /** Raw bytes of the selected resource. */
    private byte[] buffer;
    /** Copy of {@link #buffer} produced by {@code SimdJsonPaddingUtil.padded}. */
    private byte[] bufferPadded;

    /**
     * Loads the resource named by {@link #fileName} once per trial and prepares its padded copy.
     *
     * @throws IOException if the resource is missing from the classpath or cannot be read
     */
    @Setup(Level.Trial)
    public void setup() throws IOException {
        try (InputStream is = ParseBenchmark.class.getResourceAsStream(fileName)) {
            if (is == null) {
                // getResourceAsStream signals "not found" with null; name the offending
                // parameter value instead of failing with a NullPointerException below.
                throw new IOException("Benchmark resource not found on classpath: " + fileName);
            }
            buffer = is.readAllBytes();
            bufferPadded = padded(buffer);
        }
    }

    /**
     * Parses the unpadded buffer.
     *
     * @return the parsed document root
     */
    @Benchmark
    public JsonValue simdjson() {
        return simdJsonParser.parse(buffer, buffer.length);
    }

    /**
     * Parses the padded buffer, passing the length of the original content.
     *
     * @return the parsed document root
     */
    @Benchmark
    public JsonValue simdjsonPadded() {
        return simdJsonParser.parse(bufferPadded, buffer.length);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package com.github.piotrrzysko.simdjson;

/**
 * Static helper used by the benchmarks to build padded copies of JSON input buffers.
 */
final class SimdJsonPaddingUtil {

    /** Number of zero bytes appended after the source content. */
    private static final int PADDING_BYTES = 64;

    private SimdJsonPaddingUtil() {
        // Static utility class; not meant to be instantiated.
    }

    /**
     * Returns a new array holding the bytes of {@code src} followed by
     * {@value #PADDING_BYTES} zero bytes. The source array is not modified.
     *
     * @param src bytes to copy; must not be {@code null}
     * @return a fresh array of length {@code src.length + 64}
     * @throws NullPointerException if {@code src} is {@code null}
     */
    static byte[] padded(byte[] src) {
        // Arrays.copyOf zero-fills the extra tail. It is fully qualified because
        // this file has no import section to extend.
        return java.util.Arrays.copyOf(src, src.length + PADDING_BYTES);
    }
}
Loading