Files
argument.es/bulk-test.ts
Theo Browne 593ad125ff Pause
2026-02-20 04:26:14 -08:00

204 lines
6.3 KiB
TypeScript

import { appendFileSync, mkdirSync } from "node:fs";
import { join } from "node:path";
import {
  MODELS,
  type Model,
  shuffle,
  withRetry,
  callGeneratePrompt,
  callGenerateAnswer,
  callVote,
  isRealString
} from "./game.ts";
// Abort immediately when no OpenRouter API key is configured (unset or empty).
if ((process.env.OPENROUTER_API_KEY ?? "") === "") {
  console.error("Error: Set OPENROUTER_API_KEY environment variable");
  process.exit(1);
}

/** Total number of rounds the bulk test hands out to workers. */
const TOTAL_ROUNDS = 1000;

/** Number of worker loops started in parallel by `main`. */
const CONCURRENCY = 100;

/** Start of the run in ms since the epoch; used for the reported duration. */
const startTime = Date.now();

/** Directory for the log file: a `logs` folder under `import.meta.dir`. */
const LOGS_DIR = join(import.meta.dir, "logs");

// ISO timestamp with every ":" and "." replaced by "-" before it is used in the file name.
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");

/** Full path of this run's log file. */
const LOG_FILE = join(LOGS_DIR, `bulk-test-${timestamp}.log`);
/** One voter's ballot; `votedFor` is `null` when that voter's vote call failed. */
type VoteRecord = {
  voter: Model;
  votedFor: "A" | "B" | null;
};

/** Everything recorded about a single round, whether it finished or failed. */
type RoundResult = {
  /** Round number passed to `runRound`. */
  roundNum: number;
  /** Model asked to generate the prompt. */
  prompter: Model;
  /** Generated prompt; empty string when prompt generation failed. */
  prompt: string;
  /** First contestant and its answer (empty string when the round failed). */
  contA: Model;
  answerA: string;
  /** Second contestant and its answer (empty string when the round failed). */
  contB: Model;
  answerB: string;
  /** One entry per voter; empty when the round failed before voting. */
  votes: VoteRecord[];
  /** Number of successful votes for contestant A / contestant B. */
  votesA: number;
  votesB: number;
  /** Contestant with more votes; `null` on a tie or a failed round. */
  winner: Model | null;
  /** Set only when prompt or answer generation failed. */
  error?: string;
};
// Every model starts with zero wins, keyed by model name.
const initialScoreEntries = MODELS.map((model): [string, number] => [model.name, 0]);

/** Win count per model name; incremented by `worker` for each round that has a winner. */
const scores: Record<string, number> = Object.fromEntries(initialScoreEntries);

/** Results of rounds that completed without an error (failed rounds are not stored). */
const results: RoundResult[] = [];

/** Rounds that finished without an error. */
let completedRounds = 0;

/** Rounds that returned an error or threw. */
let failedRounds = 0;

/** Number of rounds already handed out to workers. */
let currentTaskIndex = 0;
/**
 * Rewrites the single progress line on stdout.
 * The leading "\r" returns to the start of the line so each call overwrites the previous one.
 */
function updateProgress() {
  const finished = completedRounds + failedRounds;
  const line = `\rProgress: ${finished}/${TOTAL_ROUNDS} (Success: ${completedRounds}, Failed: ${failedRounds})`;
  process.stdout.write(line);
}
/** True once the log directory has been created (or confirmed to exist) in this run. */
let logDirReady = false;

/**
 * Appends `message` plus a trailing newline to the bulk-test log file.
 *
 * The log directory is created on first use; without this, the first write
 * fails with ENOENT whenever the directory does not already exist.
 *
 * @param message Text to append; a "\n" is added after it.
 */
function logToBulkLog(message: string) {
  if (!logDirReady) {
    // `recursive: true` makes this a no-op when the directory already exists.
    mkdirSync(LOGS_DIR, { recursive: true });
    logDirReady = true;
  }
  appendFileSync(LOG_FILE, message + "\n");
}
/**
 * Turns any thrown value into a printable message.
 * Uses `.message` when the value carries one, otherwise `String(value)`, so a
 * thrown string, `null` or `undefined` never produces "undefined" or a
 * secondary TypeError inside a catch block.
 */
function describeError(err: unknown): string {
  if (err instanceof Error) return err.message;
  if (typeof err === "object" && err !== null && "message" in err) {
    const { message } = err as { message: unknown };
    if (message !== undefined) return String(message);
  }
  return String(err);
}

/**
 * Plays one round: a prompter model generates a prompt, two contestant models
 * answer it in parallel, and every remaining model votes between the answers.
 *
 * Roles are assigned from a fresh shuffle of `MODELS`: index 0 is the
 * prompter, 1 and 2 are the contestants, the rest are voters.
 *
 * @param roundNum Round number, used in retry labels and stored in the result.
 * @returns The round result. A prompt or answer failure is reported through
 *   the `error` field instead of a rejection; a failed vote is recorded as
 *   `votedFor: null` and does not fail the round.
 */
async function runRound(roundNum: number): Promise<RoundResult> {
  const shuffled = shuffle([...MODELS]);
  const prompter = shuffled[0]!;
  const contA = shuffled[1]!;
  const contB = shuffled[2]!;
  const voters = shuffled.slice(3);

  // Result for a round that stopped before voting: no answers, no votes, no winner.
  const failedRound = (error: string, promptSoFar: string): RoundResult => ({
    roundNum,
    prompter,
    prompt: promptSoFar,
    contA,
    answerA: "",
    contB,
    answerB: "",
    votes: [],
    votesA: 0,
    votesB: 0,
    winner: null,
    error,
  });

  let prompt = "";
  try {
    prompt = await withRetry(
      () => callGeneratePrompt(prompter),
      (s) => isRealString(s, 10),
      3,
      `BulkR${roundNum}:prompt:${prompter.name}`
    );
  } catch (err: unknown) {
    return failedRound(`Prompt failed: ${describeError(err)}`, "");
  }

  let answerA = "";
  let answerB = "";
  try {
    // Both answers are requested concurrently; if either fails the round fails.
    const [ansA, ansB] = await Promise.all([
      withRetry(() => callGenerateAnswer(contA, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contA.name}`),
      withRetry(() => callGenerateAnswer(contB, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contB.name}`)
    ]);
    answerA = ansA;
    answerB = ansB;
  } catch (err: unknown) {
    return failedRound(`Answer failed: ${describeError(err)}`, prompt);
  }

  let votesA = 0;
  let votesB = 0;
  const roundVotes: { voter: Model, votedFor: "A" | "B" | null }[] = [];

  await Promise.all(voters.map(async (voter) => {
    try {
      // Each voter sees the two answers in a random order (presumably to
      // cancel out position bias); the vote is mapped back to the contestant below.
      const showAFirst = Math.random() > 0.5;
      const first = showAFirst ? { answer: answerA } : { answer: answerB };
      const second = showAFirst ? { answer: answerB } : { answer: answerA };
      const vote = await withRetry(
        () => callVote(voter, prompt, first, second),
        (v) => v === "A" || v === "B",
        3,
        `BulkR${roundNum}:vote:${voter.name}`
      );
      // "A" means "the first answer shown", which is contestant A only when A was shown first.
      const votedForA = showAFirst ? vote === "A" : vote === "B";
      if (votedForA) votesA++; else votesB++;
      roundVotes.push({ voter, votedFor: votedForA ? "A" : "B" });
    } catch {
      // Deliberate best-effort: a voter that keeps failing is recorded as an
      // abstention instead of failing the whole round.
      roundVotes.push({ voter, votedFor: null });
    }
  }));

  // Strict majority wins; equal counts (including zero successful votes) is a tie.
  let winner: Model | null = null;
  if (votesA > votesB) winner = contA;
  else if (votesB > votesA) winner = contB;

  return {
    roundNum,
    prompter,
    prompt,
    contA,
    answerA,
    contB,
    answerB,
    votes: roundVotes,
    votesA,
    votesB,
    winner
  };
}
/** Builds the human-readable log entry for one round that completed without an error. */
function formatRoundLog(roundNum: number, result: RoundResult): string {
  const nameFor = (choice: "A" | "B" | null): string => {
    switch (choice) {
      case "A":
        return result.contA.name;
      case "B":
        return result.contB.name;
      default:
        return "FAILED";
    }
  };
  const winnerLabel = result.winner === null ? "TIE" : result.winner.name;

  return [
    `\n=== ROUND ${roundNum} ===\n`,
    `Prompter (${result.prompter.name}): ${result.prompt}\n`,
    `Contestant A (${result.contA.name}): ${result.answerA} [Votes: ${result.votesA}]\n`,
    `Contestant B (${result.contB.name}): ${result.answerB} [Votes: ${result.votesB}]\n`,
    `\nVotes:\n`,
    ...result.votes.map((ballot) => ` - ${ballot.voter.name} voted for: ${nameFor(ballot.votedFor)}\n`),
    `\nWinner: ${winnerLabel}\n`,
  ].join("");
}

/**
 * Worker loop: keeps claiming the next round number until all rounds have been
 * handed out, runs it, updates the counters and scores, and writes the log entry.
 *
 * Claiming is a synchronous read-and-increment of `currentTaskIndex` with no
 * `await` in between, so concurrent workers never receive the same round number.
 */
async function worker() {
  while (currentTaskIndex < TOTAL_ROUNDS) {
    const roundNum = ++currentTaskIndex;
    try {
      const outcome = await runRound(roundNum);
      if (!outcome.error) {
        completedRounds += 1;
        const champion = outcome.winner;
        if (champion) {
          scores[champion.name] = (scores[champion.name] ?? 0) + 1;
        }
        logToBulkLog(formatRoundLog(roundNum, outcome));
        results.push(outcome);
      } else {
        failedRounds += 1;
        logToBulkLog(`\n=== ROUND ${roundNum} FAILED ===\nError: ${outcome.error}\n`);
      }
    } catch (err) {
      // Anything thrown while running or recording the round counts as a failed round.
      failedRounds += 1;
      logToBulkLog(`\n=== ROUND ${roundNum} UNHANDLED ERROR ===\nError: ${err}\n`);
    }
    updateProgress();
  }
}
/**
 * Runs the whole bulk test: prints and logs the start banner, starts
 * `CONCURRENCY` workers, waits for all of them, then writes the final summary
 * (counts, duration and models ranked by wins) to both the log file and stdout.
 */
async function main() {
  console.log(`Starting bulk test of ${TOTAL_ROUNDS} rounds with concurrency ${CONCURRENCY}...`);
  console.log(`Readable log with outputs and votes will be saved to: ${LOG_FILE}\n`);
  logToBulkLog(`BULK TEST STARTED AT ${new Date().toISOString()}`);
  logToBulkLog(`Total Rounds: ${TOTAL_ROUNDS}, Concurrency: ${CONCURRENCY}\n`);
  updateProgress();

  // Start every worker first, then wait for all of them together.
  const pending: Promise<void>[] = [];
  for (let slot = 0; slot < CONCURRENCY; slot++) {
    pending.push(worker());
  }
  await Promise.all(pending);

  const elapsedAtFinish = ((Date.now() - startTime) / 1000).toFixed(1);
  console.log(`\n\nBulk test complete! (${elapsedAtFinish}s)`);

  // Rank models by win count, highest first.
  const ranking = Object.entries(scores).sort(([, winsX], [, winsY]) => winsY - winsX);
  const summary = [
    `\n\n=== BULK TEST FINAL SUMMARY ===\n`,
    `Total Rounds: ${TOTAL_ROUNDS}\n`,
    `Completed: ${completedRounds}\n`,
    `Failed: ${failedRounds}\n`,
    `Duration: ${((Date.now() - startTime) / 1000).toFixed(1)}s\n\n`,
    `=== FINAL RANKS ===\n`,
    ...ranking.map(([name, score], position) => `${position + 1}. ${name}: ${score} wins\n`),
  ].join("");

  logToBulkLog(summary);
  console.log(summary);
  console.log(`Readable log with outputs and votes saved to: ${LOG_FILE}`);
}
// Entry point. A fatal error is printed and the exit code is set to 1 so that
// shells and CI see the failure instead of a successful (0) exit.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});