diff --git a/bulk-test.ts b/bulk-test.ts new file mode 100644 index 0000000..b222f59 --- /dev/null +++ b/bulk-test.ts @@ -0,0 +1,203 @@ +import { appendFileSync } from "node:fs"; +import { join } from "node:path"; +import { + MODELS, + Model, + shuffle, + withRetry, + callGeneratePrompt, + callGenerateAnswer, + callVote, + isRealString +} from "./game.ts"; + +if (!process.env.OPENROUTER_API_KEY) { + console.error("Error: Set OPENROUTER_API_KEY environment variable"); + process.exit(1); +} + +const TOTAL_ROUNDS = 1000; +const CONCURRENCY = 100; + +const startTime = Date.now(); +const LOGS_DIR = join(import.meta.dir, "logs"); +const timestamp = new Date().toISOString().replace(/[:.]/g, "-"); +const LOG_FILE = join(LOGS_DIR, `bulk-test-${timestamp}.log`); + +type RoundResult = { + roundNum: number; + prompter: Model; + prompt: string; + contA: Model; + answerA: string; + contB: Model; + answerB: string; + votes: { voter: Model, votedFor: "A" | "B" | null }[]; + votesA: number; + votesB: number; + winner: Model | null; + error?: string; +}; + +const scores: Record = Object.fromEntries(MODELS.map((m) => [m.name, 0])); +const results: RoundResult[] = []; +let completedRounds = 0; +let failedRounds = 0; +let currentTaskIndex = 0; + +function updateProgress() { + process.stdout.write(`\rProgress: ${completedRounds + failedRounds}/${TOTAL_ROUNDS} (Success: ${completedRounds}, Failed: ${failedRounds})`); +} + +function logToBulkLog(message: string) { + appendFileSync(LOG_FILE, message + "\n"); +} + +async function runRound(roundNum: number): Promise { + const shuffled = shuffle([...MODELS]); + const prompter = shuffled[0]!; + const contA = shuffled[1]!; + const contB = shuffled[2]!; + const voters = shuffled.slice(3); + + let prompt = ""; + try { + prompt = await withRetry( + () => callGeneratePrompt(prompter), + (s) => isRealString(s, 10), + 3, + `BulkR${roundNum}:prompt:${prompter.name}` + ); + } catch (err: any) { + return { roundNum, prompter, prompt: "", contA, answerA: "", contB, answerB: "", votes: [], votesA: 0, votesB: 0, winner: null, error: `Prompt failed: ${err.message}` }; + } + + let answerA = "", answerB = ""; + try { + const [ansA, ansB] = await Promise.all([ + withRetry(() => callGenerateAnswer(contA, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contA.name}`), + withRetry(() => callGenerateAnswer(contB, prompt), (s) => isRealString(s, 3), 3, `BulkR${roundNum}:answer:${contB.name}`) + ]); + answerA = ansA; + answerB = ansB; + } catch (err: any) { + return { roundNum, prompter, prompt, contA, answerA: "", contB, answerB: "", votes: [], votesA: 0, votesB: 0, winner: null, error: `Answer failed: ${err.message}` }; + } + + let votesA = 0; + let votesB = 0; + const roundVotes: { voter: Model, votedFor: "A" | "B" | null }[] = []; + + await Promise.all(voters.map(async (voter) => { + try { + const showAFirst = Math.random() > 0.5; + const first = showAFirst ? { answer: answerA } : { answer: answerB }; + const second = showAFirst ? { answer: answerB } : { answer: answerA }; + const vote = await withRetry( + () => callVote(voter, prompt, first, second), + (v) => v === "A" || v === "B", + 3, + `BulkR${roundNum}:vote:${voter.name}` + ); + + const votedForA = showAFirst ? vote === "A" : vote === "B"; + if (votedForA) votesA++; else votesB++; + roundVotes.push({ voter, votedFor: votedForA ? "A" : "B" }); + } catch (err) { + roundVotes.push({ voter, votedFor: null }); + } + })); + + let winner: Model | null = null; + if (votesA > votesB) winner = contA; + else if (votesB > votesA) winner = contB; + + return { + roundNum, + prompter, + prompt, + contA, + answerA, + contB, + answerB, + votes: roundVotes, + votesA, + votesB, + winner + }; +} + +async function worker() { + while (true) { + const roundNum = currentTaskIndex + 1; + if (roundNum > TOTAL_ROUNDS) break; + currentTaskIndex++; + + try { + const result = await runRound(roundNum); + if (result.error) { + failedRounds++; + logToBulkLog(`\n=== ROUND ${roundNum} FAILED ===\nError: ${result.error}\n`); + } else { + completedRounds++; + if (result.winner) { + scores[result.winner.name]++; + } + + let roundLog = `\n=== ROUND ${roundNum} ===\n`; + roundLog += `Prompter (${result.prompter.name}): ${result.prompt}\n`; + roundLog += `Contestant A (${result.contA.name}): ${result.answerA} [Votes: ${result.votesA}]\n`; + roundLog += `Contestant B (${result.contB.name}): ${result.answerB} [Votes: ${result.votesB}]\n`; + + roundLog += `\nVotes:\n`; + for (const v of result.votes) { + const votedName = v.votedFor === "A" ? result.contA.name : v.votedFor === "B" ? result.contB.name : "FAILED"; + roundLog += ` - ${v.voter.name} voted for: ${votedName}\n`; + } + + roundLog += `\nWinner: ${result.winner ? result.winner.name : "TIE"}\n`; + logToBulkLog(roundLog); + results.push(result); + } + } catch (err) { + failedRounds++; + logToBulkLog(`\n=== ROUND ${roundNum} UNHANDLED ERROR ===\nError: ${err}\n`); + } + updateProgress(); + } +} + +async function main() { + console.log(`Starting bulk test of ${TOTAL_ROUNDS} rounds with concurrency ${CONCURRENCY}...`); + console.log(`Readable log with outputs and votes will be saved to: ${LOG_FILE}\n`); + + logToBulkLog(`BULK TEST STARTED AT ${new Date().toISOString()}`); + logToBulkLog(`Total Rounds: ${TOTAL_ROUNDS}, Concurrency: ${CONCURRENCY}\n`); + + updateProgress(); + const workers = Array.from({ length: CONCURRENCY }, () => worker()); + + await Promise.all(workers); + + console.log(`\n\nBulk test complete! (${((Date.now() - startTime) / 1000).toFixed(1)}s)`); + + // Generate summary + const sortedScores = Object.entries(scores).sort((a, b) => b[1] - a[1]); + + let summary = `\n\n=== BULK TEST FINAL SUMMARY ===\n`; + summary += `Total Rounds: ${TOTAL_ROUNDS}\n`; + summary += `Completed: ${completedRounds}\n`; + summary += `Failed: ${failedRounds}\n`; + summary += `Duration: ${((Date.now() - startTime) / 1000).toFixed(1)}s\n\n`; + + summary += `=== FINAL RANKS ===\n`; + sortedScores.forEach(([name, score], index) => { + summary += `${index + 1}. ${name}: ${score} wins\n`; + }); + + logToBulkLog(summary); + console.log(summary); + console.log(`Readable log with outputs and votes saved to: ${LOG_FILE}`); +} + +main().catch(console.error); diff --git a/game.ts b/game.ts index 80f5222..f13bca0 100644 --- a/game.ts +++ b/game.ts @@ -15,7 +15,7 @@ export const MODELS = [ { id: "anthropic/claude-opus-4.6", name: "Opus 4.6" }, { id: "anthropic/claude-sonnet-4.6", name: "Sonnet 4.6" }, { id: "x-ai/grok-4.1-fast", name: "Grok 4.1" }, - { id: "minimax/minimax-m2.5", name: "MiniMax 2.5" }, + // { id: "minimax/minimax-m2.5", name: "MiniMax 2.5" }, ] as const; export type Model = (typeof MODELS)[number];