Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 154 additions & 0 deletions autoresearch-validator/autoresearch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/usr/bin/env bash
# Autoresearch loop driver for graphql-java validator optimization.
#
# Usage:
#   ./autoresearch-validator/autoresearch.sh [max_iterations]
#
# Default: 200 iterations (designed for overnight runs)
#
# Safety:
#   The agent runs with --permission-mode plan and explicit --allowedTools.
#   It can read files, edit source code, and run gradle for profiling.
#   Tests, benchmarks, git commits, and reverts are handled by the outer harness.

set -euo pipefail

MAX_ITERATIONS="${1:-200}"
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_FILE="$SCRIPT_DIR/results.tsv"
BEST_SCORE_FILE="$SCRIPT_DIR/.best_score"

cd "$PROJECT_DIR"

if ! command -v claude &>/dev/null; then
  echo "ERROR: 'claude' CLI not found on PATH. Install Claude Code first." >&2
  exit 1
fi

# Create the TSV log with a header row on first run; later runs append.
if [ ! -f "$LOG_FILE" ]; then
  printf "iteration\tcommit\tscore\tdelta\tstatus\tdescription\n" > "$LOG_FILE"
fi

echo "=== Getting baseline score ==="
# run_benchmark.sh exits non-zero on failure. Without the '|| ...' guard,
# 'set -e' would kill this script inside the command substitution and the
# FAILED check below (and its error message) would never execute.
BASELINE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || BASELINE="FAILED"
if [ "$BASELINE" = "FAILED" ]; then
  echo "ERROR: Baseline benchmark failed." >&2
  exit 1
fi
echo "Baseline: $BASELINE ops/s"
echo "$BASELINE" > "$BEST_SCORE_FILE"

BEST_SCORE="$BASELINE"

# Revert the working tree to HEAD, including any brand-new files the agent
# may have created ('git checkout' alone leaves untracked files behind,
# which would pollute every subsequent iteration).
revert_changes() {
  git checkout -- src/
  git clean -fd -- src/main/java/
}

for i in $(seq 1 "$MAX_ITERATIONS"); do
  echo ""
  echo "========================================"
  echo "=== Iteration $i / $MAX_ITERATIONS ==="
  echo "=== Best score: $BEST_SCORE ops/s ==="
  echo "========================================"

  RECENT_LOG=$(tail -10 "$LOG_FILE" 2>/dev/null || echo "No previous iterations")

  PROMPT="You are running iteration $i of an autoresearch optimization loop for graphql-java.

Read autoresearch-validator/program.md for full context and strategy.

Current best benchmark score: $BEST_SCORE ops/s (baseline was: $BASELINE ops/s)

Previous optimization log (last 10 entries):
$RECENT_LOG

YOUR TASK: Make exactly ONE focused optimization to the validation code.
- Read the code files first. If this is iteration 1 or you haven't profiled yet, run the
benchmark with async-profiler first to identify hotspots.
- Pick the most promising strategy from program.md that has NOT already been tried
- Make a minimal, targeted change to ONE or TWO files
- Do NOT run tests or benchmarks — the outer harness handles that
- Do NOT commit — the outer harness handles that
- After editing, output a single-line summary of what you changed and why

SCOPE: Only modify files under src/main/java/graphql/validation/,
or the utility files listed in program.md (ImmutableKit.java, FpKit.java, AstComparator.java).

Make the change now."

  # Allowed tools: read-only exploration + code edits + safe bash commands
  ALLOWED_TOOLS='Read,Glob,Grep,Edit,Write,Bash(./gradlew:*),Bash(cat:*),Bash(wc:*),Bash(head:*),Bash(tail:*),Bash(find:*),Bash(ls:*),Bash(grep:*),Bash(git diff:*),Bash(git status:*),Bash(git log:*),Bash(git show:*),Bash(git checkout:*)'

  echo "--- Asking Claude to make an optimization ---"
  # '|| true': the harness must keep looping even if the agent CLI errors out.
  CLAUDE_OUTPUT=$(claude \
    --model sonnet \
    --permission-mode plan \
    --allowedTools "$ALLOWED_TOOLS" \
    --max-turns 25 \
    --verbose \
    -p "$PROMPT" \
    2>&1) || true

  echo "$CLAUDE_OUTPUT" | tail -5

  # Use 'git status --porcelain' rather than 'git diff --quiet': the latter
  # only sees modifications to tracked files, so newly created (untracked)
  # files written by the agent would be silently missed.
  if [ -z "$(git status --porcelain -- src/main/java/)" ]; then
    echo "No source changes in iteration $i, skipping"
    printf "%s\t-\t-\t-\tskipped\tno changes\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Changes made ---"
  git diff --stat src/main/java/

  echo "--- Running tests ---"
  # With 'pipefail' set, the pipeline's status reflects gradle, not 'tail'.
  if ! ./gradlew test --tests "graphql.validation.*" -q 2>&1 | tail -10; then
    echo "Tests FAILED — reverting changes"
    revert_changes
    printf "%s\t-\t-\t-\treverted\ttests failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  echo "--- Running benchmark ---"
  # Guard the command substitution: run_benchmark.sh exits non-zero on
  # failure, and without '|| ...' a single bad benchmark run would abort
  # the entire overnight loop via 'set -e' instead of reverting here.
  SCORE=$(bash "$SCRIPT_DIR/run_benchmark.sh") || SCORE="FAILED"
  if [ "$SCORE" = "FAILED" ]; then
    echo "Benchmark FAILED — reverting changes"
    revert_changes
    printf "%s\t-\t-\t-\treverted\tbenchmark failed\n" "$i" >> "$LOG_FILE"
    continue
  fi

  # awk does the floating-point comparison/subtraction that bash cannot.
  IMPROVED=$(echo "$SCORE $BEST_SCORE" | awk '{print ($1 > $2) ? "yes" : "no"}')
  DELTA=$(echo "$SCORE $BEST_SCORE" | awk '{printf "%.3f", $1 - $2}')

  if [ "$IMPROVED" = "yes" ]; then
    echo ""
    echo "*** IMPROVED! $BEST_SCORE -> $SCORE ops/s (+$DELTA) ***"
    echo ""
    BEST_SCORE="$SCORE"
    echo "$BEST_SCORE" > "$BEST_SCORE_FILE"

    # One-line summary, e.g. "2 files changed, 14 insertions(+), ..."
    # (xargs trims the leading/trailing whitespace from git's stat line).
    DESCRIPTION=$(git diff --stat src/main/java/ | tail -1 | xargs)

    git add src/main/java/
    git commit -m "autoresearch: iteration $i [+$DELTA ops/s]

$(git diff --cached --stat | head -5)"

    COMMIT=$(git rev-parse --short HEAD)
    printf "%s\t%s\t%s\t+%s\tkept\t%s\n" "$i" "$COMMIT" "$SCORE" "$DELTA" "$DESCRIPTION" >> "$LOG_FILE"
  else
    echo "No improvement: $SCORE vs $BEST_SCORE ops/s ($DELTA) — reverting"
    revert_changes
    printf "%s\t-\t%s\t%s\treverted\tno improvement\n" "$i" "$SCORE" "$DELTA" >> "$LOG_FILE"
  fi
done

# Final report: compute the absolute and percentage improvement over the
# baseline in a single awk pass, then print the banner as one heredoc.
TOTALS=$(awk -v best="$BEST_SCORE" -v base="$BASELINE" \
  'BEGIN { printf "%.3f %.1f", best - base, ((best - base) / base) * 100 }')
TOTAL_DELTA=${TOTALS% *}
TOTAL_PCT=${TOTALS#* }

cat <<EOF

========================================
=== Autoresearch complete ===
=== Baseline: $BASELINE ops/s ===
=== Final best: $BEST_SCORE ops/s ===
=== Improvement: +$TOTAL_DELTA ops/s ($TOTAL_PCT%) ===
========================================

Results log: $LOG_FILE
Review kept commits: git log --oneline --grep='autoresearch'
EOF
93 changes: 93 additions & 0 deletions autoresearch-validator/program.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
# Autoresearch: Optimize Validator Performance (Overlapping Field Validation)

## Goal

Improve the throughput (ops/sec) of `OverlappingFieldValidationPerformance.overlappingFieldValidationThroughput`
by making targeted optimizations to the validation engine. Validation runs on every query, so improvements here
have broad impact. The benchmark tests overlapping field validation with a large schema and query, plus several
generated scenarios (repeated fields, fragments, deep abstract/concrete types).

Every improvement must pass the relevant test suite locally. Final full-suite verification happens on a clean EC2 instance.

## Metric

- **Primary**: `OverlappingFieldValidationPerformance.overlappingFieldValidationThroughput` — higher is better (ops/sec)
- Run with: `./gradlew jmh -PjmhInclude="performance.OverlappingFieldValidationPerformance.overlappingFieldValidationThroughput" -PjmhFork=1 -PjmhIterations=3 -PjmhWarmupIterations=2`
- **Use async-profiler** to identify hotspots: add `-PjmhProfilers=async` to the JMH command.
- Additional benchmarks for cross-validation: `benchmarkRepeatedFields`, `benchmarkOverlapFrag`, `benchmarkDeepAbstractConcrete`, etc.

## Scope — Files You May Modify

Primary targets under `src/main/java/graphql/validation/`:

- `OperationValidator.java` (1785 lines) — the main validation logic, including overlapping fields check. **This is the primary target.**
- `Validator.java` (54 lines) — top-level validator entry point
- `LanguageTraversal.java` (44 lines) — AST traversal for validation
- `TraversalContext.java` — maintains type context during traversal
- `ValidationContext.java` — validation context
- `ValidationErrorCollector.java` — error collection
- `DocumentVisitor.java` — visitor interface

Also consider utility classes:
- `graphql/collect/ImmutableKit.java` — collection utilities
- `graphql/util/FpKit.java` — functional programming utilities
- `graphql/language/AstComparator.java` — AST comparison (used in field merging checks)

**Do NOT modify**: test files, benchmark files, schema files, build files.

## Constraints

1. **Relevant tests must pass locally**: Run `./gradlew test --tests "graphql.validation.*" -q` for fast iteration (~10 sec). Full suite runs on EC2.
2. **No new dependencies**: This is a firm project policy.
3. **No wildcard imports, no inner classes, no Optional**: Project coding standards.
4. **Preserve public API**: All `@PublicApi` method signatures must remain unchanged.
5. **Thread safety**: The validator may be called concurrently. Don't introduce shared mutable state.
6. **Use `graphql.Assert`** not `Objects.requireNonNull`.

## Optimization Strategies to Explore (ordered by expected impact)

### High Impact
1. **Profile first**: Run async-profiler to identify actual CPU hotspots. The overlapping field validation in OperationValidator is known to have O(n^2) or worse complexity in some cases.
2. **Reduce algorithmic complexity in overlapping field checks**: The `OVERLAPPING_FIELDS_CAN_BE_MERGED` rule compares pairs of fields. Memoization, caching of comparison results, or smarter traversal order can reduce redundant work.
3. **Replace Guava immutable builders with mutable collections in hot paths**: ImmutableMap.Builder and ImmutableList.Builder have expensive hashCode overhead during build(). This was the #1 finding in the ENF optimization.
4. **Cache field-pair comparison results**: If the same field pairs are compared repeatedly across different contexts, cache the results.

### Medium Impact
5. **Optimize AstComparator usage**: Field merging checks use AST comparison. If the same AST nodes are compared multiple times, caching helps.
6. **Reduce object allocation in validation traversal**: Each visited node may create validation state objects.
7. **Optimize type resolution during validation**: Type lookups for overlapping field checks.
8. **Early termination**: Skip validation checks that can't apply to the current node type.

### Lower Impact (but easy wins)
9. **Pre-size collections**: When the number of fields/fragments is known.
10. **Replace stream operations with loops** in hot paths.
11. **Cache schema type lookups** that are repeated during validation.
12. **Reduce string operations**: Error message construction in non-error paths.

## How to Iterate

1. **Profile first** with async-profiler to identify actual hotspots
2. Pick ONE strategy targeting the top hotspot
3. Make a focused, minimal change
4. Run tests locally: `./gradlew test --tests "graphql.validation.*" -q`
5. Run the benchmark — compare to previous best
6. If improved: commit with message "autoresearch: <description> [+X.XX ops/s]"
7. If not improved: revert with `git checkout -- src/`
8. Re-profile to see updated hotspots, then pick next strategy

## Lessons from Previous Autoresearch (ENF Optimization)

- **ImmutableMap.Builder → LinkedHashMap**: Saved 20k ops/s due to Object.hashCode() overhead.
- **ImmutableListMultimap → parallel ArrayList**: Saved 22k ops/s. Same hashCode issue.
- **Avoid groupingBy when only checking group count**: Replaced full map with boolean flag.
- **Short-circuit for empty/single-element cases**: Multiple small wins.
- **Cache lambda captures**: Reuse Supplier fields instead of per-call lambdas.
- **Profile-guided optimization**: The biggest wins came from profiling, not guessing.

## Important Notes

- `OperationValidator.java` at 1785 lines is the main target. It implements all validation rules.
- The overlapping fields check (`OVERLAPPING_FIELDS_CAN_BE_MERGED`) is the most expensive rule and is specifically what the benchmark tests.
- The benchmark uses `@Param({"100"})` for size, generating queries with 100 fields/fragments.
- The validation runs `LanguageTraversal.traverse(document, operationValidator)` which walks the AST.
- Guava is an existing dependency — you can use Guava utilities but nothing else new.
34 changes: 34 additions & 0 deletions autoresearch-validator/run_benchmark.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/usr/bin/env bash
# Runs the OverlappingFieldValidation throughput benchmark and extracts the score.
# Usage: ./autoresearch-validator/run_benchmark.sh
# Output: prints the benchmark score (ops/sec) to stdout, or "FAILED" on error.
# All progress/diagnostic messages go to stderr so stdout stays machine-readable.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"

cd "$PROJECT_DIR"

echo "=== Running OverlappingFieldValidation throughput benchmark ===" >&2
# '|| true': a gradle failure must fall through to the empty-score check
# below. Without it, 'set -e' kills the script inside the command
# substitution and the FAILED diagnostics never run.
BENCHMARK_OUTPUT=$(./gradlew jmh \
  -PjmhInclude="performance.OverlappingFieldValidationPerformance.overlappingFieldValidationThroughput" \
  -PjmhFork=1 \
  -PjmhIterations=3 \
  -PjmhWarmupIterations=2 \
  2>&1) || true

# Extract the score from a JMH result line like:
#   ...overlappingFieldValidationThroughput  100  thrpt  3  XX.XXX ± Y.YYY  ops/s
# Counting from the end: ops/s, error, ±, score -> score is $(NF-3).
# [[:space:]] instead of \s for portability ('\s' in ERE is a GNU extension).
# '|| true': grep exits 1 on no match, which would otherwise abort the
# pipeline under 'set -e'/'pipefail' before we can report FAILED.
SCORE=$(echo "$BENCHMARK_OUTPUT" \
  | grep -E "overlappingFieldValidationThroughput[[:space:]]+" \
  | awk '{print $(NF-3)}') || true

if [ -z "$SCORE" ]; then
  echo "FAILED: could not extract benchmark score" >&2
  echo "Last 20 lines of output:" >&2
  echo "$BENCHMARK_OUTPUT" | tail -20 >&2
  echo "FAILED"
  exit 1
fi

echo "Score: $SCORE ops/s" >&2
echo "$SCORE"