Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions autoresearch-execution-complex/autoresearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# Autoresearch loop driver for graphql-java execution engine optimization (complex query).
#
# Usage:
#   ./autoresearch-execution-complex/autoresearch.sh [max_iterations]
#
# Default: 200 iterations (designed for overnight runs)
#
# Safety:
#   The agent runs with --permission-mode plan and explicit --allowedTools.
#   It can read files, edit source code, and run gradle for profiling.
#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.

set -euo pipefail

MAX_ITERATIONS="${1:-200}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_FILE="$SCRIPT_DIR/results.tsv"          # TSV log of every iteration's outcome
BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"   # persisted best score, survives restarts

cd "$PROJECT_DIR"

if ! command -v claude &>/dev/null; then
  echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first." >&2
  exit 1
fi

# Initialize the results log with a TSV header on first run only,
# so restarting the harness appends to the existing history.
if [ ! -f "$LOG_FILE" ]; then
  printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
fi

echo "=== Getting baseline score ==="
# BUGFIX: run_benchmark.sh exits non-zero on failure. Without the '|| true'
# guard, set -e would kill this script on that failure *before* the FAILED
# check below could run (the error path was unreachable).
BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh" || true)
if [ -z "$BASELINE" ] || [ "$BASELINE" = "FAILED" ]; then
  echo "ERROR: Baseline benchmark failed." >&2
  exit 1
fi
echo "Baseline: $BASELINE ops/s"
echo "$BASELINE" > "$BEST_SCORE_FILE"

BEST_SCORE="$BASELINE"

# Main optimization loop: one agent-proposed change per iteration, gated by
# tests and the benchmark; only improvements are committed, everything else
# is reverted so each iteration starts from the current best state.
for i in $(seq 1 "$MAX_ITERATIONS"); do
  echo ""
  echo "========================================"
  echo "=== Iteration $i / $MAX_ITERATIONS ==="
  echo "=== Best score: $BEST_SCORE ops/s ==="
  echo "========================================"

  # Feed the agent the tail of the log so it avoids re-trying failed strategies.
  RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")

  PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.

Read autoresearch-execution-complex/program.md for full context and strategy.

Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)

Previous optimization log (last 10 entries):
$RECENT_LOG

YOUR TASK: Make exactly ONE focused optimization to the execution engine code.
- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
benchmark with async-profiler first to identify hotspots.
- Pick the most promising strategy from program.md that has NOT already been tried
- Make a minimal, targeted change to ONE or TWO files
- Do NOT run tests or benchmarks — the outer harness handles that
- Do NOT commit — the outer harness handles that
- After editing, output a single-line summary of what you changed and why

SCOPE: Only modify files under src/main/java/graphql/execution/, src/main/java/graphql/GraphQL.java,
or the utility files listed in program.md.

Make the change now."

  # Allowed tools: read-only exploration + code edits + safe bash commands
  ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'

  echo "--- Asking Claude to make an optimization ---"
  # '|| true': a non-zero exit from claude (max turns hit, API error) must not
  # abort the whole overnight run under set -e.
  # NOTE(review): --permission-mode plan normally blocks file edits, yet the
  # task requires Edit/Write — confirm this mode actually allows the listed
  # tools, otherwise every iteration will be a no-op.
  CLAUDE_OUTPUT=$(claude \
    --model sonnet \
    --permission-mode plan \
    --allowedTools "$ALLOWED_TOOLS" \
    --max-turns 25 \
    --verbose \
    -p "$PROMPT" \
    2>&1) || true

  echo "$CLAUDE_OUTPUT" | tail -5

  # NOTE(review): 'git diff' only sees modifications to *tracked* files; a
  # brand-new file created via the Write tool is invisible here, and the
  # 'git checkout -- src/' reverts below do not delete untracked files —
  # confirm whether the agent may create new files.
  if git diff --quiet src/main/java/; then
    echo "No source changes in iteration $i, skipping"
    printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Changes made ---"
  git diff --stat src/main/java/

  echo "--- Running tests ---"
  # pipefail lets the gradle exit status survive the 'tail'; the 'if !'
  # context keeps set -e from aborting on a test failure.
  if ! ./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q 2>&1 | tail -10; then
    echo "Tests FAILED — reverting changes"
    git checkout -- src/
    printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Running benchmark ---"
  # BUGFIX: run_benchmark.sh exits 1 on failure; without '|| true', set -e
  # killed the entire loop here instead of reverting and continuing — the
  # revert branch below was unreachable.
  SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh" || true)
  if [ -z "$SCORE" ] || [ "$SCORE" = "FAILED" ]; then
    echo "Benchmark FAILED — reverting changes"
    git checkout -- src/
    printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  # Scores are decimals — the shell can't compare floats, so delegate to awk.
  IMPROVED=$(echo "$SCORE $BEST_SCORE" | awk '{print ($1 > $2) ? "yes" : "no"}')
  DELTA=$(echo "$SCORE $BEST_SCORE" | awk '{printf "%.3f", $1 - $2}')

  if [ "$IMPROVED" = "yes" ]; then
    echo ""
    echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
    echo ""
    BEST_SCORE="$SCORE"
    echo "$BEST_SCORE" > "$BEST_SCORE_FILE"

    # One-line summary for the TSV log, e.g. "2 files changed, 10 insertions(+)";
    # xargs trims the surrounding whitespace.
    DESCRIPTION=$(git diff --stat src/main/java/ | tail -1 | xargs)

    git add src/main/java/
    git commit -m "autoresearch: iteration $i [+$DELTA ops/s]

$(git diff --cached --stat | head -5)"

    COMMIT=$(git rev-parse --short HEAD)
    printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
  else
    echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
    git checkout -- src/
    printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
  fi
done

# Final report: baseline vs. best score, absolute and relative improvement.
printf '\n'
printf '========================================\n'
printf '=== Autoresearch complete ===\n'
printf '=== Baseline: %s ops/s ===\n' "$BASELINE"
printf '=== Final best: %s ops/s ===\n' "$BEST_SCORE"
TOTAL_DELTA=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.3f", best - base}')
TOTAL_PCT=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.1f", ((best - base) / base) * 100}')
printf '=== Improvement: +%s ops/s (%s%%) ===\n' "$TOTAL_DELTA" "$TOTAL_PCT"
printf '========================================\n'
printf '\n'
printf 'Results log: %s\n' "$LOG_FILE"
printf "Review kept commits: git log --oneline --grep='autoresearch'\n"
95 changes: 95 additions & 0 deletions autoresearch-execution-complex/program.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Autoresearch: Optimize Execution Engine Performance (Complex Async Query)

## Goal

Improve the throughput (ops/sec) of `ComplexQueryPerformance.benchMarkSimpleQueriesThroughput` by making
targeted optimizations to the core execution engine. This benchmark executes a complex query with both
sync and async data fetchers, multiple threads, and nested object types (shops → departments → products,
each with 12 fields). It exercises the full execution pipeline including async completion handling.

Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.

## Metric

- **Primary**: `ComplexQueryPerformance.benchMarkSimpleQueriesThroughput` — higher is better (ops/sec)
- Run with: `./gradlew jmh -PjmhInclude="performance.ComplexQueryPerformance.benchMarkSimpleQueriesThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2` (note: the harness's `run_benchmark.sh` uses 2 measurement iterations for faster turnaround)
- Note: This benchmark has a `@Param({"5", "10", "20"})` for `howManyItems`. The default JMH run will test all three. For faster iteration during development, you can filter to just one param.
- **Use async-profiler** to identify hotspots: add `-PjmhProfilers=async` to the JMH command.

## Scope — Files You May Modify

Primary targets under `src/main/java/graphql/execution/`:

- `ExecutionStrategy.java` (1141 lines) — the main execution strategy, field resolution
- `AsyncExecutionStrategy.java` (97 lines) — async field execution
- `Execution.java` (328 lines) — top-level execution orchestration
- `FieldCollector.java` (182 lines) — collects fields from selection sets
- `ResultNodesInfo.java` (55 lines) — tracks result node info during execution
- `ExecutionStepInfoFactory.java` (92 lines) — creates step info per field
- `FetchedValue.java` (82 lines) — wraps fetched values
- `FieldValueInfo.java` (101 lines) — field value tracking
- `MergedSelectionSet.java` (73 lines) — merged selections
- `MergedField.java` — merged field representation
- `Async.java` — async execution utilities

Also consider:
- `graphql/GraphQL.java` (624 lines) — top-level entry point
- `graphql/collect/ImmutableKit.java` — collection utilities
- `graphql/util/FpKit.java` — functional programming utilities
- `graphql/execution/instrumentation/` — instrumentation overhead

**Do NOT modify**: test files, benchmark files, schema files, build files.

## Constraints

1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` for fast iteration. Full suite runs on EC2.
2. **No new dependencies**: This is a firm project policy.
3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
5. **Thread safety**: The execution engine is called concurrently. Don't introduce shared mutable state.
6. **Use `graphql.Assert`** not `Objects.requireNonNull`.

## Optimization Strategies to Explore (ordered by expected impact)

### High Impact
1. **Profile first**: Run async-profiler to identify actual CPU hotspots before making changes.
2. **Reduce CompletableFuture overhead**: The async path creates many CompletableFuture chains. Consider whether composition can be simplified.
3. **Optimize field resolution dispatch**: Each field goes through ExecutionStrategy which has overhead for instrumentation, error handling, etc. Batch processing or reducing per-field overhead helps.
4. **Replace Guava immutable builders with mutable collections in hot paths**: ImmutableMap.Builder and ImmutableList.Builder have expensive hashCode overhead during build().
5. **Reduce instrumentation overhead**: Even "no-op" instrumentation dispatches per field.

### Medium Impact
6. **Optimize FieldCollector for repeated patterns**: The query has repeated field patterns across shops/departments/products.
7. **Reduce ExecutionStepInfo creation overhead**: Created per-field, consider lazy computation.
8. **Optimize DataFetcherResult handling**: Unwrapping overhead per field.
9. **Reduce lock contention in async paths**: If multiple threads contend on shared state.

### Lower Impact (but easy wins)
10. **Pre-size collections**: When field count is known.
11. **Cache repeated type/field lookups**.
12. **Replace stream operations with loops** in hot paths.

## How to Iterate

1. **Profile first** with async-profiler to identify actual hotspots
2. Pick ONE strategy targeting the top hotspot
3. Make a focused, minimal change
4. Run tests locally: `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q`
5. Run the benchmark — compare to previous best
6. If improved: commit
7. If not improved: revert with `git checkout -- src/`
8. Re-profile, then pick next strategy

## Lessons from Previous Autoresearch (ENF Optimization)

- **ImmutableMap.Builder → LinkedHashMap**: Gained 20k ops/s by avoiding Object.hashCode() overhead.
- **ImmutableListMultimap → parallel ArrayList**: Gained 22k ops/s. Same hashCode issue.
- **Avoid groupingBy when only checking group count**: Replaced full map with boolean flag.
- **Short-circuit for empty/single-element cases**: Multiple small wins.
- **Cache lambda captures**: Reuse Supplier fields instead of per-call lambdas.

## Important Notes

- This benchmark involves **multiple threads** (10 query threads + 10 fetcher threads). Be careful with thread safety.
- The async data fetchers include `Thread.sleep()` to simulate real-world latency. Optimizations to the execution engine reduce the non-sleep overhead.
- Guava is an existing dependency — you can use Guava utilities but nothing else new.
34 changes: 34 additions & 0 deletions autoresearch-execution-complex/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Runs the ComplexQuery throughput benchmark and extracts the score.
# Usage: ./autoresearch-execution-complex/run_benchmark.sh
# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" on error.
#
# All progress/diagnostic text goes to stderr so that stdout carries only the
# result — callers capture it via command substitution.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

cd "$PROJECT_DIR"

echo "=== Running ComplexQuery throughput benchmark ===" >&2
# BUGFIX: capture the gradle exit status instead of letting set -e kill the
# script — previously a failing build exited here and the FAILED fallback
# below (which the harness relies on) could never run.
GRADLE_STATUS=0
BENCHMARK_OUTPUT=$(./gradlew jmh \
  -PjmhInclude="performance.ComplexQueryPerformance.benchMarkSimpleQueriesThroughput" \
  -PjmhFork=1 \
  -PjmhIterations=2 \
  -PjmhWarmupIterations=2 \
  2>&1) || GRADLE_STATUS=$?

# Extract score from JMH output — take the first (howManyItems=5) result for consistency:
#   ComplexQueryPerformance.benchMarkSimpleQueriesThroughput  5  thrpt  2  XX.XXX ± Y.YYY  ops/s
# BUGFIX: '|| true' keeps set -e/pipefail from aborting when grep finds no
# match (or head's early exit SIGPIPEs grep), so the empty-score check runs.
# [[:space:]] replaces \s, which is a GNU grep extension (breaks on BSD grep).
SCORE=$(echo "$BENCHMARK_OUTPUT" | grep -E "benchMarkSimpleQueriesThroughput[[:space:]]+" | head -1 | awk '{print $(NF-3)}' || true)

if [ "$GRADLE_STATUS" -ne 0 ] || [ -z "$SCORE" ]; then
  echo "FAILED: could not extract benchmark score (gradle exit status: $GRADLE_STATUS)" >&2
  echo "Last 20 lines of output:" >&2
  echo "$BENCHMARK_OUTPUT" | tail -20 >&2
  echo "FAILED"
  exit 1
fi

echo "Score: $SCORE ops/s" >&2
echo "$SCORE"