graphql-java · dondonz · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/autoresearch-execution-large/autoresearch.sh b/autoresearch-execution-large/autoresearch.sh
@@ -0,0 +1,166 @@
+#!/usr/bin/env bash
+# Autoresearch loop driver for graphql-java execution engine optimization.
+#
+# Usage:
+#   ./autoresearch-execution-large/autoresearch.sh [max_iterations]
+#
+# Default: 200 iterations (designed for overnight runs)
+#
+# Safety:
+#   The agent runs with --permission-mode plan and explicit --allowedTools.
+#   It can read files, edit source code, and run gradle for profiling.
+#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.
+
+set -euo pipefail
+
+MAX_ITERATIONS="${1:-200}"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+LOG_FILE="$SCRIPT_DIR/results.tsv"
+BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"
+
+cd "$PROJECT_DIR"
+
+# Verify claude CLI is available
+if ! command -v claude &>/dev/null; then
+    echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first." >&2
+    exit 1
+fi
+
+# Initialize log (TSV header is written only when the file does not exist yet, so reruns append)
+if [ ! -f "$LOG_FILE" ]; then
+    printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
+fi
+
+# Get baseline score
+# run_benchmark.sh exits non-zero on failure; map that to FAILED so set -e does not abort before the check below.
+echo "=== Getting baseline score ==="
+BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || BASELINE="FAILED"
+if [ "$BASELINE" = "FAILED" ]; then
+    echo "ERROR: Baseline benchmark failed. Fix issues before starting autoresearch." >&2
+    exit 1
+fi
+echo "Baseline: $BASELINE ops/s"
+echo "$BASELINE" > "$BEST_SCORE_FILE"
+
+BEST_SCORE="$BASELINE"
+
+# Main loop: one agent edit per iteration, then test -> benchmark -> commit (if faster) or revert.
+for i in $(seq 1 "$MAX_ITERATIONS"); do
+    printf '\n%s\n' "========================================"
+    echo "=== Iteration $i / $MAX_ITERATIONS ==="
+    echo "=== Best score: $BEST_SCORE ops/s ==="
+    echo "========================================"
+
+    # Build the prompt for this iteration
+    RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")
+
+    PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.
+
+Read autoresearch-execution-large/program.md for full context and strategy.
+
+Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)
+
+Previous optimization log (last 10 entries):
+$RECENT_LOG
+
+YOUR TASK: Make exactly ONE focused optimization to the execution engine code.
+- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
+  benchmark with async-profiler first to identify hotspots.
+- Pick the most promising strategy from program.md that has NOT already been tried (check the log above)
+- Make a minimal, targeted change to ONE or TWO files
+- Do NOT run tests or benchmarks — the outer harness handles that
+- Do NOT commit — the outer harness handles that
+- After editing, output a single-line summary of what you changed and why
+
+SCOPE: Only modify files under src/main/java/graphql/execution/, src/main/java/graphql/GraphQL.java,
+or the utility files listed in program.md (ImmutableKit.java, FpKit.java).
+
+Make the change now."
+
+    # Allowed tools: read-only exploration + code edits + safe bash commands
+    # The agent can profile (gradlew jmh), inspect files, and edit source code.
+    # Tests, benchmarks, git commits, and reverts are handled by this outer harness.
+    ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'
+
+    echo "--- Asking Claude to make an optimization ---"
+    CLAUDE_OUTPUT=$(claude \
+        --model sonnet \
+        --permission-mode plan \
+        --allowedTools "$ALLOWED_TOOLS" \
+        --max-turns 25 \
+        --verbose \
+        -p "$PROMPT" \
+        2>&1) || true
+
+    echo "$CLAUDE_OUTPUT" | tail -5
+
+    # Check if anything changed (git diff sees tracked files only; brand-new untracked files are not detected here)
+    if git diff --quiet src/main/java/; then
+        echo "No source changes in iteration $i, skipping"
+        printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Show what changed
+    echo "--- Changes made ---"
+    git diff --stat src/main/java/
+
+    # Run targeted tests locally (pipefail makes the pipeline report gradlew's failure despite the trailing tail)
+    echo "--- Running tests ---"
+    if ! ./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q 2>&1 | tail -10; then
+        echo "Tests FAILED — reverting changes"
+        git checkout -- src/
+        printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Run benchmark; a non-zero exit from run_benchmark.sh is mapped to FAILED so set -e cannot skip the revert below
+    echo "--- Running benchmark ---"
+    SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || SCORE="FAILED"
+    if [ "$SCORE" = "FAILED" ]; then
+        echo "Benchmark FAILED — reverting changes"
+        git checkout -- src/
+        printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
+        continue
+    fi
+
+    # Compare (using awk for floating point); only a strictly higher score counts as an improvement
+    IMPROVED=$(echo "$SCORE $BEST_SCORE" | awk '{print ($1 > $2) ? "yes" : "no"}')
+    DELTA=$(echo "$SCORE $BEST_SCORE" | awk '{printf "%.3f", $1 - $2}')
+
+    if [ "$IMPROVED" = "yes" ]; then
+        echo ""
+        echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
+        echo ""
+        BEST_SCORE="$SCORE"
+        echo "$BEST_SCORE" > "$BEST_SCORE_FILE"
+
+        DESCRIPTION=$(git diff --stat src/main/java/ | tail -1 | xargs)
+
+        git add src/main/java/
+        git commit -m "autoresearch: iteration $i [+$DELTA ops/s]
+
+$(git diff --cached --stat | head -5)"
+
+        COMMIT=$(git rev-parse --short HEAD)
+        printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
+    else
+        echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
+        git checkout -- src/
+        printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
+    fi
+done
+
+# Final report: baseline vs. best score, the absolute and relative gain, and where to find the results.
+printf '\n%s\n' "========================================"
+printf '%s\n' "=== Autoresearch complete ==="
+printf '%s\n' "=== Baseline:    $BASELINE ops/s ==="
+printf '%s\n' "=== Final best:  $BEST_SCORE ops/s ==="
+TOTAL_DELTA=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.3f", best - base}')
+TOTAL_PCT=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.1f", ((best - base) / base) * 100}')
+printf '%s\n' "=== Improvement: +$TOTAL_DELTA ops/s ($TOTAL_PCT%) ==="
+printf '%s\n\n' "========================================"
+printf '%s\n' "Results log: $LOG_FILE"
+printf '%s\n' "Review kept commits: git log --oneline --grep='autoresearch'"
diff --git a/autoresearch-execution-large/program.md b/autoresearch-execution-large/program.md
@@ -0,0 +1,96 @@
+# Autoresearch: Optimize Execution Engine Performance (Large In-Memory Query)
+
+## Goal
+
+Improve the throughput (ops/sec) of `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` by making
+targeted optimizations to the core execution engine. This benchmark executes a sync query returning 10M scalar
+values — it cleanly isolates the execution engine (field resolution, result assembly, ResultNodesInfo).
+
+Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.
+
+## Metric
+
+- **Primary**: `LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput` — higher is better (ops/sec)
+- Run with: `./gradlew jmh -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2`
+- A run takes ~3-5 minutes. Parse the score from JMH's output line containing `benchMarkSimpleQueriesThroughput`.
+- **Use async-profiler** to identify hotspots before optimizing: add `-PjmhProfilers=async` to the JMH command. Output goes to `performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput-Throughput/summary-cpu.txt`.
+
+## Scope — Files You May Modify
+
+Primary targets under `src/main/java/graphql/execution/`:
+
+- `ExecutionStrategy.java` (1141 lines) — the main execution strategy, field resolution
+- `AsyncExecutionStrategy.java` (97 lines) — async field execution
+- `Execution.java` (328 lines) — top-level execution orchestration
+- `FieldCollector.java` (182 lines) — collects fields from selection sets
+- `ResultNodesInfo.java` (55 lines) — tracks result node info during execution
+- `ExecutionStepInfoFactory.java` (92 lines) — creates step info per field
+- `FetchedValue.java` (82 lines) — wraps fetched values
+- `FieldValueInfo.java` (101 lines) — field value tracking
+- `MergedSelectionSet.java` (73 lines) — merged selections
+- `MergedField.java` — merged field representation
+
+Also consider:
+- `graphql/GraphQL.java` (624 lines) — top-level entry point
+- `graphql/collect/ImmutableKit.java` — collection utilities
+- `graphql/util/FpKit.java` — functional programming utilities
+- `graphql/execution/instrumentation/` — instrumentation overhead
+
+**Do NOT modify**: test files, benchmark files, schema files, build files.
+
+## Constraints
+
+1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` for fast iteration (~30 sec). Full suite runs on EC2.
+2. **No new dependencies**: This is a firm project policy.
+3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
+4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
+5. **Thread safety**: The execution engine is called concurrently. Don't introduce shared mutable state.
+6. **Use `graphql.Assert`** not `Objects.requireNonNull`.
+
+## Optimization Strategies to Explore (ordered by expected impact)
+
+### High Impact
+1. **Profile first**: Run async-profiler to identify actual CPU hotspots before making changes. The previous ENF autoresearch found that Guava ImmutableMap/ImmutableListMultimap builders were the dominant hotspot due to Object.hashCode() overhead — similar patterns may exist here.
+2. **Reduce object allocation in the execution hot loop**: The execution strategy creates many intermediate objects per field (ExecutionStepInfo, FetchedValue, FieldValueInfo). Consider whether allocations can be reduced.
+3. **Optimize ResultNodesInfo**: This is called for every field resolution. Any overhead here multiplies by the number of fields (10M in this benchmark).
+4. **Replace Guava immutable builders with mutable collections**: If ImmutableMap.Builder or ImmutableList.Builder are used in hot paths, replacing with LinkedHashMap/ArrayList (as was done in the ENF optimization) can yield 20%+ improvements.
+5. **Reduce instrumentation overhead**: Even "no-op" instrumentation has method call overhead per field.
+
+### Medium Impact
+6. **Optimize FieldCollector**: Field collection happens at each level. Caching or pre-computing merged selection sets could help.
+7. **Reduce ExecutionStepInfo creation overhead**: ExecutionStepInfo is created per-field. Consider lazy computation of expensive fields.
+8. **Avoid unnecessary wrapping/unwrapping**: FetchedValue wrapping, DataFetcherResult handling.
+9. **Replace stream operations with loops**: In hot paths, `.stream().collect()` has overhead.
+
+### Lower Impact (but easy wins)
+10. **Pre-size collections**: When field count is known, pre-size ArrayList/HashMap.
+11. **Cache repeated lookups**: Schema type lookups, field definition lookups.
+12. **Reduce string operations**: String concatenation in hot paths.
+
+## How to Iterate
+
+1. **Profile first** with async-profiler to identify actual hotspots
+2. Pick ONE strategy targeting the top hotspot
+3. Make a focused, minimal change
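+4. Run tests locally: `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` (note: when driven by `autoresearch.sh`, the outer harness performs steps 4–7 — do not run tests/benchmarks or commit yourself)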
+5. Run the benchmark — compare to previous best
+6. If improved: commit with message "autoresearch: <description> [+X.XX ops/s]"
+7. If not improved: revert with `git checkout -- src/`
+8. Re-profile to see updated hotspots, then pick next strategy
+
+## Lessons from Previous Autoresearch (ENF Optimization)
+
+These patterns delivered the biggest wins in the ENF autoresearch:
+
+- **ImmutableMap.Builder → LinkedHashMap**: Saved 20k ops/s. The `.build()` call hashes all keys, and Object.hashCode() on Apple Silicon triggers expensive `pthread_jit_write_protect_np`.
+- **ImmutableListMultimap → parallel ArrayList**: Saved 22k ops/s. Same hashCode issue. Replaced keyed multimap with index-aligned parallel lists.
+- **Avoid groupingBy when only checking group count**: Saved 13k ops/s. Replaced full map creation with a boolean flag.
+- **Short-circuit for empty/single-element cases**: Multiple small wins from fast-pathing the common case.
+- **Cache lambda captures**: Reusing a Supplier field instead of creating `() -> value` per call.
+
+## Important Notes
+
+- The benchmark queries 10M scalar fields — execution engine overhead per field is the bottleneck.
+- `GraphQL.execute()` is the entry point; it calls `Execution.execute()` → `ExecutionStrategy.execute()`.
+- The execution engine is inherently recursive (fields within fields).
+- Guava is an existing dependency — you can use Guava utilities but nothing else new.
diff --git a/autoresearch-execution-large/run_benchmark.sh b/autoresearch-execution-large/run_benchmark.sh
@@ -0,0 +1,34 @@
+#!/usr/bin/env bash
+# Runs the LargeInMemoryQuery throughput benchmark and extracts the score.
+# Usage: ./autoresearch-execution-large/run_benchmark.sh
+# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" on error.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+cd "$PROJECT_DIR"
+echo "=== Running LargeInMemoryQuery throughput benchmark ===" >&2
+# Record gradle's exit status instead of letting set -e abort here, so the FAILED sentinel below is always printed.
+GRADLE_STATUS=0
+BENCHMARK_OUTPUT=$(./gradlew jmh \
+    -PjmhInclude="performance.LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput" \
+    -PjmhFork=1 \
+    -PjmhIterations=3 \
+    -PjmhWarmupIterations=2 \
+    2>&1) || GRADLE_STATUS=$?
+
+# Extract score from JMH output line like (awk alone exits 0 on no match, unlike grep under pipefail; last match wins):
+# LargeInMemoryQueryPerformance.benchMarkSimpleQueriesThroughput  thrpt    3  XX.XXX ± Y.YYY  ops/s
+SCORE=$(printf '%s\n' "$BENCHMARK_OUTPUT" | awk '/benchMarkSimpleQueriesThroughput[[:space:]]+thrpt/ && NF >= 4 { score = $(NF-3) } END { if (score != "") print score }')
+if [ "$GRADLE_STATUS" -ne 0 ] || [ -z "$SCORE" ]; then
+    echo "FAILED: could not extract benchmark score (gradle exit status: $GRADLE_STATUS)" >&2
+    echo "Last 20 lines of output:" >&2
+    echo "$BENCHMARK_OUTPUT" | tail -20 >&2
+    echo "FAILED"
+    exit 1
+fi
+# Diagnostics go to stderr; stdout carries only the bare score so callers can capture it with $(...).
+echo "Score: $SCORE ops/s" >&2
+echo "$SCORE"