More cleanup and also bulk testing
This commit is contained in:
203
bulk-test.ts
Normal file
203
bulk-test.ts
Normal file
@@ -0,0 +1,203 @@
|
||||
import { appendFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import {
|
||||
MODELS,
|
||||
Model,
|
||||
shuffle,
|
||||
withRetry,
|
||||
callGeneratePrompt,
|
||||
callGenerateAnswer,
|
||||
callVote,
|
||||
isRealString
|
||||
} from "./game.ts";
|
||||
|
||||
if (!process.env.OPENROUTER_API_KEY) {
|
||||
console.error("Error: Set OPENROUTER_API_KEY environment variable");
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
const TOTAL_ROUNDS = 1000;
|
||||
const CONCURRENCY = 100;
|
||||
|
||||
const startTime = Date.now();
|
||||
const LOGS_DIR = join(import.meta.dir, "logs");
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, "-");
|
||||
const LOG_FILE = join(LOGS_DIR, `bulk-test-${timestamp}.log`);
|
||||
|
||||
type RoundResult = {
|
||||
roundNum: number;
|
||||
prompter: Model;
|
||||
prompt: string;
|
||||
contA: Model;
|
||||
answerA: string;
|
||||
contB: Model;
|
||||
answerB: string;
|
||||
votes: { voter: Model, votedFor: "A" | "B" | null }[];
|
||||
votesA: number;
|
||||
votesB: number;
|
||||
winner: Model | null;
|
||||
error?: string;
|
||||
};
|
||||
|
||||
const scores: Record<string, number> = Object.fromEntries(MODELS.map((m) => [m.name, 0]));
|
||||
const results: RoundResult[] = [];
|
||||
let completedRounds = 0;
|
||||
let failedRounds = 0;
|
||||
let currentTaskIndex = 0;
|
||||
|
||||
function updateProgress() {
|
||||
process.stdout.write(`\rProgress: ${completedRounds + failedRounds}/${TOTAL_ROUNDS} (Success: ${completedRounds}, Failed: ${failedRounds})`);
|
||||
}
|
||||
|
||||
function logToBulkLog(message: string) {
|
||||
appendFileSync(LOG_FILE, message + "\n");
|
||||
}
|
||||
|
||||
async function runRound(roundNum: number): Promise<RoundResult> {
|
||||
const shuffled = shuffle([...MODELS]);
|
||||
const prompter = shuffled[0]!;
|
||||
const contA = shuffled[1]!;
|
||||
const contB = shuffled[2]!;
|
||||
const voters = shuffled.slice(3);
|
||||
|
||||
let prompt = "";
|
||||
try {
|
||||
prompt = await withRetry(
|
||||
() => callGeneratePrompt(prompter),
|
||||
(s) => isRealString(s, 10),
|
||||
3,
|
||||
`BulkR${roundNum}:prompt:${prompter.name}`
|
||||
);
|
||||
} catch (err: any) {
|
||||
return { roundNum, prompter, prompt: "", contA, answerA: "", contB, answerB: "", votes: [], votesA: 0, votesB: 0, winner: null, error: `Prompt failed: ${err.message}` };
|
||||
}
|
||||
|
||||
let answerA = "", answerB = "";
|
||||
try {
|
||||
const [ansA, ansB] = await Promise.all([
|
||||
withRetry(() => callGenerateAnswer(contA, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contA.name}`),
|
||||
withRetry(() => callGenerateAnswer(contB, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contB.name}`)
|
||||
]);
|
||||
answerA = ansA;
|
||||
answerB = ansB;
|
||||
} catch (err: any) {
|
||||
return { roundNum, prompter, prompt, contA, answerA: "", contB, answerB: "", votes: [], votesA: 0, votesB: 0, winner: null, error: `Answer failed: ${err.message}` };
|
||||
}
|
||||
|
||||
let votesA = 0;
|
||||
let votesB = 0;
|
||||
const roundVotes: { voter: Model, votedFor: "A" | "B" | null }[] = [];
|
||||
|
||||
await Promise.all(voters.map(async (voter) => {
|
||||
try {
|
||||
const showAFirst = Math.random() > 0.5;
|
||||
const first = showAFirst ? { answer: answerA } : { answer: answerB };
|
||||
const second = showAFirst ? { answer: answerB } : { answer: answerA };
|
||||
const vote = await withRetry(
|
||||
() => callVote(voter, prompt, first, second),
|
||||
(v) => v === "A" || v === "B",
|
||||
3,
|
||||
`BulkR${roundNum}:vote:${voter.name}`
|
||||
);
|
||||
|
||||
const votedForA = showAFirst ? vote === "A" : vote === "B";
|
||||
if (votedForA) votesA++; else votesB++;
|
||||
roundVotes.push({ voter, votedFor: votedForA ? "A" : "B" });
|
||||
} catch (err) {
|
||||
roundVotes.push({ voter, votedFor: null });
|
||||
}
|
||||
}));
|
||||
|
||||
let winner: Model | null = null;
|
||||
if (votesA > votesB) winner = contA;
|
||||
else if (votesB > votesA) winner = contB;
|
||||
|
||||
return {
|
||||
roundNum,
|
||||
prompter,
|
||||
prompt,
|
||||
contA,
|
||||
answerA,
|
||||
contB,
|
||||
answerB,
|
||||
votes: roundVotes,
|
||||
votesA,
|
||||
votesB,
|
||||
winner
|
||||
};
|
||||
}
|
||||
|
||||
async function worker() {
|
||||
while (true) {
|
||||
const roundNum = currentTaskIndex + 1;
|
||||
if (roundNum > TOTAL_ROUNDS) break;
|
||||
currentTaskIndex++;
|
||||
|
||||
try {
|
||||
const result = await runRound(roundNum);
|
||||
if (result.error) {
|
||||
failedRounds++;
|
||||
logToBulkLog(`\n=== ROUND ${roundNum} FAILED ===\nError: ${result.error}\n`);
|
||||
} else {
|
||||
completedRounds++;
|
||||
if (result.winner) {
|
||||
scores[result.winner.name]++;
|
||||
}
|
||||
|
||||
let roundLog = `\n=== ROUND ${roundNum} ===\n`;
|
||||
roundLog += `Prompter (${result.prompter.name}): ${result.prompt}\n`;
|
||||
roundLog += `Contestant A (${result.contA.name}): ${result.answerA} [Votes: ${result.votesA}]\n`;
|
||||
roundLog += `Contestant B (${result.contB.name}): ${result.answerB} [Votes: ${result.votesB}]\n`;
|
||||
|
||||
roundLog += `\nVotes:\n`;
|
||||
for (const v of result.votes) {
|
||||
const votedName = v.votedFor === "A" ? result.contA.name : v.votedFor === "B" ? result.contB.name : "FAILED";
|
||||
roundLog += ` - ${v.voter.name} voted for: ${votedName}\n`;
|
||||
}
|
||||
|
||||
roundLog += `\nWinner: ${result.winner ? result.winner.name : "TIE"}\n`;
|
||||
logToBulkLog(roundLog);
|
||||
results.push(result);
|
||||
}
|
||||
} catch (err) {
|
||||
failedRounds++;
|
||||
logToBulkLog(`\n=== ROUND ${roundNum} UNHANDLED ERROR ===\nError: ${err}\n`);
|
||||
}
|
||||
updateProgress();
|
||||
}
|
||||
}
|
||||
|
||||
async function main() {
|
||||
console.log(`Starting bulk test of ${TOTAL_ROUNDS} rounds with concurrency ${CONCURRENCY}...`);
|
||||
console.log(`Readable log with outputs and votes will be saved to: ${LOG_FILE}\n`);
|
||||
|
||||
logToBulkLog(`BULK TEST STARTED AT ${new Date().toISOString()}`);
|
||||
logToBulkLog(`Total Rounds: ${TOTAL_ROUNDS}, Concurrency: ${CONCURRENCY}\n`);
|
||||
|
||||
updateProgress();
|
||||
const workers = Array.from({ length: CONCURRENCY }, () => worker());
|
||||
|
||||
await Promise.all(workers);
|
||||
|
||||
console.log(`\n\nBulk test complete! (${((Date.now() - startTime) / 1000).toFixed(1)}s)`);
|
||||
|
||||
// Generate summary
|
||||
const sortedScores = Object.entries(scores).sort((a, b) => b[1] - a[1]);
|
||||
|
||||
let summary = `\n\n=== BULK TEST FINAL SUMMARY ===\n`;
|
||||
summary += `Total Rounds: ${TOTAL_ROUNDS}\n`;
|
||||
summary += `Completed: ${completedRounds}\n`;
|
||||
summary += `Failed: ${failedRounds}\n`;
|
||||
summary += `Duration: ${((Date.now() - startTime) / 1000).toFixed(1)}s\n\n`;
|
||||
|
||||
summary += `=== FINAL RANKS ===\n`;
|
||||
sortedScores.forEach(([name, score], index) => {
|
||||
summary += `${index + 1}. ${name}: ${score} wins\n`;
|
||||
});
|
||||
|
||||
logToBulkLog(summary);
|
||||
console.log(summary);
|
||||
console.log(`Readable log with outputs and votes saved to: ${LOG_FILE}`);
|
||||
}
|
||||
|
||||
main().catch(console.error);
|
||||
2
game.ts
2
game.ts
@@ -15,7 +15,7 @@ export const MODELS = [
|
||||
{ id: "anthropic/claude-opus-4.6", name: "Opus 4.6" },
|
||||
{ id: "anthropic/claude-sonnet-4.6", name: "Sonnet 4.6" },
|
||||
{ id: "x-ai/grok-4.1-fast", name: "Grok 4.1" },
|
||||
{ id: "minimax/minimax-m2.5", name: "MiniMax 2.5" },
|
||||
// { id: "minimax/minimax-m2.5", name: "MiniMax 2.5" },
|
||||
] as const;
|
||||
|
||||
export type Model = (typeof MODELS)[number];
|
||||
|
||||
Reference in New Issue
Block a user