// Batch-translation helpers from utils/replicate-translate.ts.
// `replicateTranslate(text, targetLanguage)` is defined earlier in that file.

/** Marker placed between texts so that several of them can share one translation call. */
const SEPARATOR = "\n{{SEP}}\n";

/** Per-request character budget. JigsawStack limit is 5000 — leave headroom. */
const MAX_CHARS = 4800;

/** A group of `texts` indices handled together; `long` marks a lone oversized text. */
type TranslationBatch = { indices: number[]; long: boolean };

/** Returns the run of whitespace at the end of `s` ("" when there is none). */
function trailingWhitespace(s: string): string {
  let end = s.length;
  while (end > 0 && /\s/.test(s.charAt(end - 1))) end--;
  return s.slice(end);
}

/**
 * Cut `text` into pieces of at most MAX_CHARS characters.
 *
 * Split points are tried in this order: after a newline, after ". ", after a
 * space — each only if it lies past the midpoint of the budget, so chunks do
 * not become tiny. Otherwise the text is cut at MAX_CHARS. Every chunk keeps
 * its trailing delimiter; a whitespace-only tail is dropped.
 */
function splitLongText(text: string): string[] {
  const chunks: string[] = [];
  const minSplit = MAX_CHARS * 0.5;
  let remaining = text;

  while (remaining.length > MAX_CHARS) {
    // lastIndexOf's second argument is the highest index at which a match may
    // START, so search low enough that the delimiter still fits in the chunk.
    const para = remaining.lastIndexOf("\n", MAX_CHARS - 1);
    const sentence = remaining.lastIndexOf(". ", MAX_CHARS - 2);
    const space = remaining.lastIndexOf(" ", MAX_CHARS - 1);

    let splitAt = MAX_CHARS;
    if (para > minSplit) splitAt = para + 1;
    else if (sentence > minSplit) splitAt = sentence + 2;
    else if (space > minSplit) splitAt = space + 1;
    else {
      // Hard cut: never separate the two halves of a UTF-16 surrogate pair.
      const code = remaining.charCodeAt(splitAt - 1);
      if (code >= 0xd800 && code <= 0xdbff) splitAt -= 1;
    }

    chunks.push(remaining.slice(0, splitAt));
    remaining = remaining.slice(splitAt);
  }
  if (remaining.trim()) chunks.push(remaining);
  return chunks;
}

/**
 * Translate a single text that exceeds MAX_CHARS by translating it chunk by
 * chunk (chunks are requested concurrently) and stitching the results.
 *
 * Each chunk boundary is rebuilt from the whitespace the source had there, so
 * paragraph breaks survive even when the translator trims its output. A hard
 * cut with no whitespace at the boundary is joined with a single space.
 *
 * @param text - The oversized source text.
 * @param targetLanguage - Passed through to `replicateTranslate`.
 * @returns The stitched translation.
 */
async function translateLongText(text: string, targetLanguage: string): Promise<string> {
  const chunks = splitLongText(text);

  const pieces = await Promise.all(
    // Whitespace-only chunks carry nothing to translate; pass them through.
    chunks.map(chunk => (chunk.trim() ? replicateTranslate(chunk, targetLanguage) : chunk))
  );

  const lastIndex = chunks.length - 1;
  return chunks
    .map((chunk, i) => {
      const piece = pieces[i];
      if (i === lastIndex) return piece;
      const boundary = trailingWhitespace(chunk) || " ";
      const kept = piece.length - trailingWhitespace(piece).length;
      return piece.slice(0, kept) + boundary;
    })
    .join("");
}

/**
 * Build groups of texts that fit within MAX_CHARS when joined with SEPARATOR.
 * Texts that individually exceed MAX_CHARS are kept alone (`long: true`) for
 * chunked translation. Indices appear in input order, each exactly once.
 *
 * @param texts - The texts to group.
 * @returns The batches, in input order.
 */
function buildBatches(texts: readonly string[]): TranslationBatch[] {
  const batches: TranslationBatch[] = [];
  let current: number[] = [];
  let currentLen = 0;

  const flush = (): void => {
    if (current.length > 0) batches.push({ indices: current, long: false });
    current = [];
    currentLen = 0;
  };

  texts.forEach((t, i) => {
    if (t.length > MAX_CHARS) {
      // Close the open group first so output order follows input order.
      flush();
      batches.push({ indices: [i], long: true });
      return;
    }

    // Decide "is the group empty" by member count, not by accumulated length:
    // a group holding only empty strings has length 0 but still costs a
    // separator when another text is appended.
    const added = current.length === 0 ? t.length : currentLen + SEPARATOR.length + t.length;
    if (added > MAX_CHARS && current.length > 0) {
      flush();
      current = [i];
      currentLen = t.length;
    } else {
      current.push(i);
      currentLen = added;
    }
  });

  flush();
  return batches;
}

/**
 * Translate many texts while keeping the number of translation calls low.
 *
 * Texts are packed into SEPARATOR-joined groups of at most MAX_CHARS
 * characters; groups are awaited one after another. If the separator does not
 * survive translation (the part count no longer matches), that group's texts
 * are translated individually instead. Oversized texts are chunked.
 *
 * @param texts - Source texts.
 * @param targetLanguage - Passed through to `replicateTranslate`.
 * @returns Translations, index-aligned with `texts`.
 */
export async function replicateTranslateBatch(
  texts: string[],
  targetLanguage: string
): Promise<string[]> {
  if (texts.length === 0) return [];

  const results = new Array<string>(texts.length);

  // Process batches sequentially to avoid hammering the model
  for (const { indices, long } of buildBatches(texts)) {
    if (long) {
      // Single oversized text — chunk it
      results[indices[0]] = await translateLongText(texts[indices[0]], targetLanguage);
      continue;
    }

    if (indices.length === 1) {
      results[indices[0]] = await replicateTranslate(texts[indices[0]], targetLanguage);
      continue;
    }

    // Multi-text batch within limit
    const joined = indices.map(i => texts[i]).join(SEPARATOR);
    const translated = await replicateTranslate(joined, targetLanguage);
    let parts = translated.split(SEPARATOR);

    if (parts.length !== indices.length) {
      // Separator got translated — fall back to individual calls
      parts = await Promise.all(indices.map(i => replicateTranslate(texts[i], targetLanguage)));
    }
    const resolved = parts;
    indices.forEach((idx, i) => {
      results[idx] = resolved[i];
    });
  }

  return results;
}