fix: respect JigsawStack 5000-char limit with proper batching
- buildBatches: groups texts into chunks that fit within 4800 chars (a 200-char safety margin) when joined with the separator.
- translateLongText: splits individual cells/paragraphs that exceed the limit at paragraph/sentence boundaries, translates each chunk, then rejoins them, instead of hitting the API with oversized input.
- Batches are processed sequentially to avoid overloading the local model.
- The separator fallback still works: if the separator gets translated, that batch falls back to one call per text.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -136,24 +136,98 @@ export async function replicateTranslate(
|
|||||||
: translateViaCloud(text, targetLanguage);
|
: translateViaCloud(text, targetLanguage);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Batch translation: several texts are joined with SEPARATOR and sent as one
// request, then the translated result is split on SEPARATOR again. This keeps
// the number of API calls low.
|
|
||||||
const SEPARATOR = "\n{{SEP}}\n";
|
const SEPARATOR = "\n{{SEP}}\n";
|
||||||
|
// Largest payload, measured with string `.length`, that is sent in a single
// translation request.
const MAX_CHARS = 4800; // JigsawStack limit is 5000 — leave headroom
|
||||||
|
|
||||||
|
/**
 * Translate a single text that is too long to send in one request.
 *
 * The text is cut into pieces of at most MAX_CHARS characters. Within the
 * second half of each window the cut prefers, in order: a line break, a
 * sentence end (". "), then a space. Only when none of those exists is the
 * text cut at exactly MAX_CHARS.
 *
 * Each piece is translated with `replicateTranslate`, one request at a time.
 * The whitespace that surrounded a piece in the source is re-attached verbatim
 * around its translation, so paragraph breaks survive and no extra space is
 * inserted where a piece had to be cut mid-word.
 *
 * @param text - Source text of any length.
 * @param targetLanguage - Passed through unchanged to `replicateTranslate`.
 * @returns The translated pieces concatenated in their original order.
 */
async function translateLongText(text: string, targetLanguage: string): Promise<string> {
  const chunks: string[] = [];
  let remaining = text;

  while (remaining.length > MAX_CHARS) {
    const minSplit = MAX_CHARS * 0.5;

    // The search positions are chosen so that the delimiter itself still lands
    // inside the window, i.e. the resulting chunk never exceeds MAX_CHARS.
    const para = remaining.lastIndexOf("\n", MAX_CHARS - 1);
    const sentence = remaining.lastIndexOf(". ", MAX_CHARS - 2);
    const space = remaining.lastIndexOf(" ", MAX_CHARS - 1);

    let splitAt = MAX_CHARS;
    if (para > minSplit) {
      splitAt = para + 1;
    } else if (sentence > minSplit) {
      splitAt = sentence + 2;
    } else if (space > minSplit) {
      splitAt = space + 1;
    } else {
      // Hard cut: step back one unit rather than separating a surrogate pair.
      const lastUnit = remaining.charCodeAt(splitAt - 1);
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) splitAt -= 1;
    }

    chunks.push(remaining.slice(0, splitAt));
    remaining = remaining.slice(splitAt);
  }
  if (remaining.length > 0) chunks.push(remaining);

  // Translate sequentially, like the batch loop, so a long text does not fan
  // out into many simultaneous requests.
  let result = "";
  for (const chunk of chunks) {
    const core = chunk.trim();
    if (core.length === 0) {
      // Whitespace only: nothing to translate, keep it as-is.
      result += chunk;
      continue;
    }
    const leading = chunk.slice(0, chunk.length - chunk.trimStart().length);
    const trailing = chunk.slice(chunk.trimEnd().length);
    result += leading + (await replicateTranslate(core, targetLanguage)) + trailing;
  }
  return result;
}
|
||||||
|
|
||||||
|
/**
 * Partition `texts` into groups that can each be sent as one request.
 *
 * Consecutive texts are packed into the same group for as long as their
 * combined length, including one `separator` between each pair, stays within
 * `maxChars`. A text that is longer than `maxChars` on its own is emitted as a
 * single-index group flagged `long: true` so the caller can chunk it.
 * Input order is preserved: indices appear in ascending order across groups.
 *
 * @param texts - The texts to group; not modified.
 * @param maxChars - Maximum joined length of a group. Defaults to MAX_CHARS.
 * @param separator - String the caller will join a group with. Defaults to SEPARATOR.
 * @returns Groups of indices into `texts`; `long` marks an oversized single text.
 */
function buildBatches(
  texts: string[],
  maxChars: number = MAX_CHARS,
  separator: string = SEPARATOR
): { indices: number[]; long: boolean }[] {
  const batches: { indices: number[]; long: boolean }[] = [];
  let current: number[] = [];
  let currentLen = 0;

  // Close the group being built, if it has any members.
  const flush = (): void => {
    if (current.length === 0) return;
    batches.push({ indices: current, long: false });
    current = [];
    currentLen = 0;
  };

  for (const [i, t] of texts.entries()) {
    if (t.length > maxChars) {
      // Oversized text: close the open group first so output order is kept.
      flush();
      batches.push({ indices: [i], long: true });
      continue;
    }

    // Emptiness is decided by member count, not by accumulated length: a group
    // that so far holds only empty strings still needs a separator before the
    // next text.
    const added =
      current.length === 0 ? t.length : currentLen + separator.length + t.length;

    if (added > maxChars) {
      flush();
      current.push(i);
      currentLen = t.length;
    } else {
      current.push(i);
      currentLen = added;
    }
  }

  flush();
  return batches;
}
|
||||||
|
|
||||||
/**
 * Translate a list of texts, keeping every request within the size limit.
 *
 * The texts are grouped by `buildBatches` and the groups are handled one after
 * another:
 * - an oversized text goes through `translateLongText`;
 * - a group of several texts is joined with SEPARATOR, translated in a single
 *   call and split on SEPARATOR again;
 * - if the split does not yield one part per text (the separator did not come
 *   back intact), or the group holds a single text, each text is translated
 *   with its own call.
 *
 * All requests, including the per-text fallback, are awaited one at a time so
 * the model is never hit with parallel calls.
 *
 * @param texts - Texts to translate.
 * @param targetLanguage - Passed through unchanged to the translation helpers.
 * @returns Translations in the same order and count as `texts`.
 */
export async function replicateTranslateBatch(
  texts: string[],
  targetLanguage: string
): Promise<string[]> {
  if (texts.length === 0) return [];

  const results: string[] = new Array<string>(texts.length);

  for (const batch of buildBatches(texts)) {
    if (batch.long) {
      // Single oversized text: chunk it.
      const idx = batch.indices[0];
      results[idx] = await translateLongText(texts[idx], targetLanguage);
      continue;
    }

    if (batch.indices.length > 1) {
      const joined = batch.indices.map(i => texts[i]).join(SEPARATOR);
      const translated = await replicateTranslate(joined, targetLanguage);
      const parts = translated.split(SEPARATOR);

      if (parts.length === batch.indices.length) {
        batch.indices.forEach((idx, i) => {
          results[idx] = parts[i];
        });
        continue;
      }
      // Separator got translated or mangled: fall through to per-text calls.
    }

    // Single-text group, or fallback for a group whose separator was lost.
    for (const idx of batch.indices) {
      results[idx] = await replicateTranslate(texts[idx], targetLanguage);
    }
  }

  return results;
}
|
||||||
|
|||||||
Reference in New Issue
Block a user