Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions autoresearch-execution-complex/autoresearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# Autoresearch loop driver for graphql-java execution engine optimization (complex query).
#
# Usage:
#   ./autoresearch-execution-complex/autoresearch.sh [max_iterations]
#
# Default: 200 iterations (designed for overnight runs)
#
# Safety:
#   The agent runs with --permission-mode plan and explicit --allowedTools.
#   It can read files, edit source code, and run gradle for profiling.
#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.

set -euo pipefail

MAX_ITERATIONS="${1:-200}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_FILE="$SCRIPT_DIR/results.tsv"          # TSV log of every iteration's outcome
BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"   # persisted best score, survives restarts

cd "$PROJECT_DIR"

if ! command -v claude &>/dev/null; then
  echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first." >&2
  exit 1
fi

# Initialize the results log with a TSV header on first run only,
# so restarting the harness appends to the existing history.
if [ ! -f "$LOG_FILE" ]; then
  printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
fi

echo "=== Getting baseline score ==="
# BUGFIX: run_benchmark.sh exits non-zero on failure. Without the '|| true'
# guard, set -e would kill this script on that failure *before* the FAILED
# check below could run (the error path was unreachable).
BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh" || true)
if [ -z "$BASELINE" ] || [ "$BASELINE" = "FAILED" ]; then
  echo "ERROR: Baseline benchmark failed." >&2
  exit 1
fi
echo "Baseline: $BASELINE ops/s"
echo "$BASELINE" > "$BEST_SCORE_FILE"

BEST_SCORE="$BASELINE"

# Main optimization loop: one agent-proposed change per iteration, gated by
# tests and the benchmark; only improvements are committed, everything else
# is reverted so each iteration starts from the current best state.
for i in $(seq 1 "$MAX_ITERATIONS"); do
  echo ""
  echo "========================================"
  echo "=== Iteration $i / $MAX_ITERATIONS ==="
  echo "=== Best score: $BEST_SCORE ops/s ==="
  echo "========================================"

  # Feed the agent the tail of the log so it avoids re-trying failed strategies.
  RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")

  PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.

Read autoresearch-execution-complex/program.md for full context and strategy.

Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)

Previous optimization log (last 10 entries):
$RECENT_LOG

YOUR TASK: Make exactly ONE focused optimization to the execution engine code.
- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
benchmark with async-profiler first to identify hotspots.
- Pick the most promising strategy from program.md that has NOT already been tried
- Make a minimal, targeted change to ONE or TWO files
- Do NOT run tests or benchmarks — the outer harness handles that
- Do NOT commit — the outer harness handles that
- After editing, output a single-line summary of what you changed and why

SCOPE: Only modify files under src/main/java/graphql/execution/, src/main/java/graphql/GraphQL.java,
or the utility files listed in program.md.

Make the change now."

  # Allowed tools: read-only exploration + code edits + safe bash commands
  ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'

  echo "--- Asking Claude to make an optimization ---"
  # '|| true': a non-zero exit from claude (max turns hit, API error) must not
  # abort the whole overnight run under set -e.
  # NOTE(review): --permission-mode plan normally blocks file edits, yet the
  # task requires Edit/Write — confirm this mode actually allows the listed
  # tools, otherwise every iteration will be a no-op.
  CLAUDE_OUTPUT=$(claude \
    --model sonnet \
    --permission-mode plan \
    --allowedTools "$ALLOWED_TOOLS" \
    --max-turns 25 \
    --verbose \
    -p "$PROMPT" \
    2>&1) || true

  echo "$CLAUDE_OUTPUT" | tail -5

  # NOTE(review): 'git diff' only sees modifications to *tracked* files; a
  # brand-new file created via the Write tool is invisible here, and the
  # 'git checkout -- src/' reverts below do not delete untracked files —
  # confirm whether the agent may create new files.
  if git diff --quiet src/main/java/; then
    echo "No source changes in iteration $i, skipping"
    printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Changes made ---"
  git diff --stat src/main/java/

  echo "--- Running tests ---"
  # pipefail lets the gradle exit status survive the 'tail'; the 'if !'
  # context keeps set -e from aborting on a test failure.
  if ! ./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q 2>&1 | tail -10; then
    echo "Tests FAILED — reverting changes"
    git checkout -- src/
    printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Running benchmark ---"
  # BUGFIX: run_benchmark.sh exits 1 on failure; without '|| true', set -e
  # killed the entire loop here instead of reverting and continuing — the
  # revert branch below was unreachable.
  SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh" || true)
  if [ -z "$SCORE" ] || [ "$SCORE" = "FAILED" ]; then
    echo "Benchmark FAILED — reverting changes"
    git checkout -- src/
    printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  # Scores are decimals — the shell can't compare floats, so delegate to awk.
  IMPROVED=$(echo "$SCORE $BEST_SCORE" | awk '{print ($1 > $2) ? "yes" : "no"}')
  DELTA=$(echo "$SCORE $BEST_SCORE" | awk '{printf "%.3f", $1 - $2}')

  if [ "$IMPROVED" = "yes" ]; then
    echo ""
    echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
    echo ""
    BEST_SCORE="$SCORE"
    echo "$BEST_SCORE" > "$BEST_SCORE_FILE"

    # One-line summary for the TSV log, e.g. "2 files changed, 10 insertions(+)";
    # xargs trims the surrounding whitespace.
    DESCRIPTION=$(git diff --stat src/main/java/ | tail -1 | xargs)

    git add src/main/java/
    git commit -m "autoresearch: iteration $i [+$DELTA ops/s]

$(git diff --cached --stat | head -5)"

    COMMIT=$(git rev-parse --short HEAD)
    printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
  else
    echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
    git checkout -- src/
    printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
  fi
done

# Final report: baseline vs. best score, absolute and relative improvement.
printf '\n'
printf '========================================\n'
printf '=== Autoresearch complete ===\n'
printf '=== Baseline: %s ops/s ===\n' "$BASELINE"
printf '=== Final best: %s ops/s ===\n' "$BEST_SCORE"
TOTAL_DELTA=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.3f", best - base}')
TOTAL_PCT=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" 'BEGIN {printf "%.1f", ((best - base) / base) * 100}')
printf '=== Improvement: +%s ops/s (%s%%) ===\n' "$TOTAL_DELTA" "$TOTAL_PCT"
printf '========================================\n'
printf '\n'
printf 'Results log: %s\n' "$LOG_FILE"
printf "Review kept commits: git log --oneline --grep='autoresearch'\n"
95 changes: 95 additions & 0 deletions autoresearch-execution-complex/program.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Autoresearch: Optimize Execution Engine Performance (Complex Async Query)

## Goal

Improve the throughput (ops/sec) of `ComplexQueryPerformance.benchMarkSimpleQueriesThroughput` by making
targeted optimizations to the core execution engine. This benchmark executes a complex query with both
sync and async data fetchers, multiple threads, and nested object types (shops → departments → products,
each with 12 fields). It exercises the full execution pipeline including async completion handling.

Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.

## Metric

- **Primary**: `ComplexQueryPerformance.benchMarkSimpleQueriesThroughput` — higher is better (ops/sec)
- Run with: `./gradlew jmh -PjmhInclude="performance.ComplexQueryPerformance.benchMarkSimpleQueriesThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2` (note: the harness's `run_benchmark.sh` uses 2 measurement iterations for faster turnaround)
- Note: This benchmark has a `@Param({"5", "10", "20"})` for `howManyItems`. The default JMH run will test all three. For faster iteration during development, you can filter to just one param.
- **Use async-profiler** to identify hotspots: add `-PjmhProfilers=async` to the JMH command.

## Scope — Files You May Modify

Primary targets under `src/main/java/graphql/execution/`:

- `ExecutionStrategy.java` (1141 lines) — the main execution strategy, field resolution
- `AsyncExecutionStrategy.java` (97 lines) — async field execution
- `Execution.java` (328 lines) — top-level execution orchestration
- `FieldCollector.java` (182 lines) — collects fields from selection sets
- `ResultNodesInfo.java` (55 lines) — tracks result node info during execution
- `ExecutionStepInfoFactory.java` (92 lines) — creates step info per field
- `FetchedValue.java` (82 lines) — wraps fetched values
- `FieldValueInfo.java` (101 lines) — field value tracking
- `MergedSelectionSet.java` (73 lines) — merged selections
- `MergedField.java` — merged field representation
- `Async.java` — async execution utilities

Also consider:
- `graphql/GraphQL.java` (624 lines) — top-level entry point
- `graphql/collect/ImmutableKit.java` — collection utilities
- `graphql/util/FpKit.java` — functional programming utilities
- `graphql/execution/instrumentation/` — instrumentation overhead

**Do NOT modify**: test files, benchmark files, schema files, build files.

## Constraints

1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q` for fast iteration. Full suite runs on EC2.
2. **No new dependencies**: This is a firm project policy.
3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
5. **Thread safety**: The execution engine is called concurrently. Don't introduce shared mutable state.
6. **Use `graphql.Assert`** not `Objects.requireNonNull`.

## Optimization Strategies to Explore (ordered by expected impact)

### High Impact
1. **Profile first**: Run async-profiler to identify actual CPU hotspots before making changes.
2. **Reduce CompletableFuture overhead**: The async path creates many CompletableFuture chains. Consider whether composition can be simplified.
3. **Optimize field resolution dispatch**: Each field goes through ExecutionStrategy which has overhead for instrumentation, error handling, etc. Batch processing or reducing per-field overhead helps.
4. **Replace Guava immutable builders with mutable collections in hot paths**: ImmutableMap.Builder and ImmutableList.Builder have expensive hashCode overhead during build().
5. **Reduce instrumentation overhead**: Even "no-op" instrumentation dispatches per field.

### Medium Impact
6. **Optimize FieldCollector for repeated patterns**: The query has repeated field patterns across shops/departments/products.
7. **Reduce ExecutionStepInfo creation overhead**: Created per-field, consider lazy computation.
8. **Optimize DataFetcherResult handling**: Unwrapping overhead per field.
9. **Reduce lock contention in async paths**: If multiple threads contend on shared state.

### Lower Impact (but easy wins)
10. **Pre-size collections**: When field count is known.
11. **Cache repeated type/field lookups**.
12. **Replace stream operations with loops** in hot paths.

## How to Iterate

1. **Profile first** with async-profiler to identify actual hotspots
2. Pick ONE strategy targeting the top hotspot
3. Make a focused, minimal change
4. Run tests locally: `./gradlew test --tests "graphql.execution.*" --tests "graphql.GraphQLTest" -q`
5. Run the benchmark — compare to previous best
6. If improved: commit
7. If not improved: revert with `git checkout -- src/`
8. Re-profile, then pick next strategy

## Lessons from Previous Autoresearch (ENF Optimization)

- **ImmutableMap.Builder → LinkedHashMap**: Gained 20k ops/s by avoiding Object.hashCode() overhead.
- **ImmutableListMultimap → parallel ArrayList**: Gained 22k ops/s. Same hashCode issue.
- **Avoid groupingBy when only checking group count**: Replaced full map with boolean flag.
- **Short-circuit for empty/single-element cases**: Multiple small wins.
- **Cache lambda captures**: Reuse Supplier fields instead of per-call lambdas.

## Important Notes

- This benchmark involves **multiple threads** (10 query threads + 10 fetcher threads). Be careful with thread safety.
- The async data fetchers include `Thread.sleep()` to simulate real-world latency. Optimizations to the execution engine reduce the non-sleep overhead.
- Guava is an existing dependency — you can use Guava utilities but nothing else new.
34 changes: 34 additions & 0 deletions autoresearch-execution-complex/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Runs the ComplexQuery throughput benchmark and extracts the score.
# Usage: ./autoresearch-execution-complex/run_benchmark.sh
# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" on error.
#
# All progress/diagnostic text goes to stderr so that stdout carries only the
# result — callers capture it via command substitution.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

cd "$PROJECT_DIR"

echo "=== Running ComplexQuery throughput benchmark ===" >&2
# BUGFIX: capture the gradle exit status instead of letting set -e kill the
# script — previously a failing build exited here and the FAILED fallback
# below (which the harness relies on) could never run.
GRADLE_STATUS=0
BENCHMARK_OUTPUT=$(./gradlew jmh \
  -PjmhInclude="performance.ComplexQueryPerformance.benchMarkSimpleQueriesThroughput" \
  -PjmhFork=1 \
  -PjmhIterations=2 \
  -PjmhWarmupIterations=2 \
  2>&1) || GRADLE_STATUS=$?

# Extract score from JMH output — take the first (howManyItems=5) result for consistency:
#   ComplexQueryPerformance.benchMarkSimpleQueriesThroughput  5  thrpt  2  XX.XXX ± Y.YYY  ops/s
# BUGFIX: '|| true' keeps set -e/pipefail from aborting when grep finds no
# match (or head's early exit SIGPIPEs grep), so the empty-score check runs.
# [[:space:]] replaces \s, which is a GNU grep extension (breaks on BSD grep).
SCORE=$(echo "$BENCHMARK_OUTPUT" | grep -E "benchMarkSimpleQueriesThroughput[[:space:]]+" | head -1 | awk '{print $(NF-3)}' || true)

if [ "$GRADLE_STATUS" -ne 0 ] || [ -z "$SCORE" ]; then
  echo "FAILED: could not extract benchmark score (gradle exit status: $GRADLE_STATUS)" >&2
  echo "Last 20 lines of output:" >&2
  echo "$BENCHMARK_OUTPUT" | tail -20 >&2
  echo "FAILED"
  exit 1
fi

echo "Score: $SCORE ops/s" >&2
echo "$SCORE"