forked from garrytan/gstack
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark-cli.test.ts
More file actions
177 lines (161 loc) · 6.98 KB
/
benchmark-cli.test.ts
File metadata and controls
177 lines (161 loc) · 6.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
/**
* gstack-model-benchmark CLI tests (offline).
*
* Covers CLI wiring that unit tests against benchmark-runner.ts can't see:
* - --dry-run auth/provider-list resolution
* - unknown provider WARN path
* - provider default (claude) when --models omitted
* - prompt resolution (inline --prompt vs positional file path)
* - output format flag wiring via --dry-run (the CLI runs, but no provider is invoked)
*
* All tests use --dry-run so no API calls happen.
*/
import { describe, test, expect } from 'bun:test';
import { spawnSync } from 'child_process';
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
// Parent of the directory containing this test file (Bun exposes that directory
// as `import.meta.dir`). Used as the working directory for the spawned CLI.
const ROOT = path.resolve(import.meta.dir, '..');
// Path of the CLI script under test: <ROOT>/bin/gstack-model-benchmark.
const BIN = path.join(ROOT, 'bin', 'gstack-model-benchmark');
/**
 * Runs the benchmark CLI via `bun run` from ROOT and captures its result.
 *
 * @param args - Arguments appended after the CLI script path.
 * @param opts - Optional settings; entries in `opts.env` are layered on top of
 *   the current `process.env` (they override, they do not replace).
 * @returns The child's exit status (`null` when no exit code is available,
 *   e.g. the child was terminated by a signal) plus its captured stdout and
 *   stderr as UTF-8 strings.
 * @throws Error when the child could not be spawned or hit the 15 s timeout.
 *   Previously this surfaced only as `status: null` with empty output, which
 *   made the resulting assertion failure hard to diagnose.
 */
function run(args: string[], opts: { env?: Record<string, string> } = {}): { status: number | null; stdout: string; stderr: string } {
  const result = spawnSync('bun', ['run', BIN, ...args], {
    cwd: ROOT,
    env: { ...process.env, ...opts.env },
    encoding: 'utf-8',
    timeout: 15000,
  });

  // spawnSync reports spawn failures and timeouts through `error` rather than
  // by throwing; surface them instead of returning an empty result.
  if (result.error) {
    throw new Error(`failed to run gstack-model-benchmark ${args.join(' ')}: ${result.error.message}`);
  }

  // `encoding` is set, so stdout/stderr are already strings; the fallback only
  // guards against a missing stream.
  return {
    status: result.status,
    stdout: result.stdout ?? '',
    stderr: result.stderr ?? '',
  };
}
/**
 * --dry-run coverage: every case passes --dry-run and inspects only the exit
 * status and the text the CLI prints.
 */
describe('gstack-model-benchmark --dry-run', () => {
  test('prints provider availability report and exits 0', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(status).toBe(0);
    // The report must mention the command, each requested provider, and the
    // notice that nothing was sent.
    const expectedFragments = ['gstack-model-benchmark --dry-run', 'claude', 'gpt', 'gemini', 'no prompts sent'];
    for (const fragment of expectedFragments) {
      expect(stdout).toContain(fragment);
    }
  });

  test('reports default provider when --models omitted', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--dry-run']);
    expect(status).toBe(0);
    expect(stdout).toContain('providers: claude');
  });

  test('unknown provider in --models emits WARN and is dropped', () => {
    const { status, stdout, stderr } = run(['--prompt', 'hi', '--models', 'claude,gpt-42-fake', '--dry-run']);
    expect(status).toBe(0);
    // The warning goes to stderr and names the offending provider...
    expect(stderr).toContain('unknown provider');
    expect(stderr).toContain('gpt-42-fake');
    // ...while the report on stdout lists only the provider that survived.
    expect(stdout).toContain('providers: claude');
    expect(stdout).not.toContain('gpt-42-fake');
  });

  test('empty --models list falls back to claude default', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--models', '', '--dry-run']);
    expect(status).toBe(0);
    expect(stdout).toContain('providers: claude');
  });

  test('--timeout-ms and --workdir flags flow through to dry-run report', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--timeout-ms', '9999', '--workdir', '/tmp', '--dry-run']);
    expect(status).toBe(0);
    expect(stdout).toContain('timeout_ms: 9999');
    expect(stdout).toContain('workdir: /tmp');
  });

  test('--judge flag reported in dry-run output', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--judge', '--dry-run']);
    expect(status).toBe(0);
    expect(stdout).toContain('judge: on');
  });

  test('--output flag reported in dry-run', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--output', 'json', '--dry-run']);
    expect(status).toBe(0);
    expect(stdout).toContain('output: json');
  });

  test('each adapter reports either OK or NOT READY, never crashes', () => {
    const { status, stdout } = run(['--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run']);
    expect(status).toBe(0);
    // Pick out the indented per-provider lines; each one must contain OK or
    // NOT READY (the match is not anchored to the end of the line).
    const adapterLinePattern = /^\s+(claude|gpt|gemini):/;
    const adapterLines = stdout.split('\n').filter((candidate) => adapterLinePattern.test(candidate));
    expect(adapterLines.length).toBe(3);
    adapterLines.forEach((adapterLine) => {
      expect(adapterLine).toMatch(/(OK|NOT READY)/);
    });
  });

  test('NOT READY path fires when auth env vars are stripped', () => {
    // A fully configured dev machine reports OK for every provider, which
    // would leave the NOT READY branch untested. To force it, the child gets
    // a stripped-down environment (no auth variables) and a HOME that points
    // at an empty temp dir, so file-based credentials cannot be found either.
    // The goal is to catch regressions in the NOT READY branch itself: a
    // crash, a missing remediation hint, or a malformed message.
    //
    // Only gpt + gemini are asserted on. Per the original author's note, the
    // claude adapter's `os.homedir()` call is sometimes cached by Bun and does
    // not always see the HOME override, so its result is not reliable here.
    // Two adapters reaching NOT READY with full remediation text is treated
    // as sufficient coverage for the branch.
    const emptyHome = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-noauth-home-'));
    try {
      const minimalEnv: Record<string, string> = {
        PATH: process.env.PATH ?? '',
        TERM: process.env.TERM ?? 'xterm',
        HOME: emptyHome,
      };
      // Spawned directly rather than through run(): run() merges process.env
      // into the child environment, which would reintroduce the auth variables.
      const child = spawnSync('bun', ['run', BIN, '--prompt', 'hi', '--models', 'claude,gpt,gemini', '--dry-run'], {
        cwd: ROOT,
        env: minimalEnv,
        encoding: 'utf-8',
        timeout: 15000,
      });
      expect(child.status).toBe(0);
      const out = child.stdout?.toString() ?? '';

      // With HOME redirected, both of these adapters must fail their auth check.
      const mustBeNotReady = [/gpt:\s+NOT READY/, /gemini:\s+NOT READY/];
      for (const pattern of mustBeNotReady) {
        expect(out).toMatch(pattern);
      }

      // The regression that matters: every NOT READY line has to carry a
      // concrete remediation hint so the user knows how to fix the auth.
      const notReadyLines = out.split('\n').filter((candidate) => candidate.includes('NOT READY'));
      expect(notReadyLines.length).toBeGreaterThanOrEqual(2);
      notReadyLines.forEach((notReadyLine) => {
        expect(notReadyLine).toMatch(/(install|Install|login|export|Run|Log in)/);
      });
    } finally {
      fs.rmSync(emptyHome, { recursive: true, force: true });
    }
  });

  test('long prompt is truncated in dry-run display', () => {
    const { status, stdout } = run(['--prompt', 'x'.repeat(200), '--dry-run']);
    expect(status).toBe(0);
    // The summary shows the first 80 characters followed by an ellipsis.
    expect(stdout).toMatch(/prompt:\s+x{80}…/);
  });
});
/**
 * Prompt-resolution coverage: asserts how the CLI treats a positional argument
 * (existing file vs. plain text) and that omitting the prompt is an error.
 */
describe('gstack-model-benchmark prompt resolution', () => {
  test('positional file path is read and passed as prompt', () => {
    const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'bench-prompt-'));
    try {
      // The write happens inside the try so the temp dir is still removed if
      // writing the fixture fails.
      const promptFile = path.join(tmp, 'prompt.txt');
      fs.writeFileSync(promptFile, 'hello from file');

      const r = run([promptFile, '--dry-run']);
      expect(r.status).toBe(0);
      // The file's contents, not its path, must show up as the prompt.
      expect(r.stdout).toContain('hello from file');
    } finally {
      fs.rmSync(tmp, { recursive: true, force: true });
    }
  });

  test('positional non-file arg is treated as inline prompt', () => {
    const r = run(['treat-me-as-inline', '--dry-run']);
    expect(r.status).toBe(0);
    expect(r.stdout).toContain('treat-me-as-inline');
  });

  test('missing prompt exits non-zero', () => {
    const r = run(['--dry-run']);
    expect(r.status).not.toBe(0);
    expect(r.stderr).toContain('specify a prompt');
  });
});