107 lines
3.5 KiB
TypeScript
107 lines
3.5 KiB
TypeScript
|
|
import JSZip from "jszip";
|
||
|
|
import { replicateTranslateBatch } from "../replicate-translate";
|
||
|
|
|
||
|
|
/**
 * Escape the five XML special characters so `text` can be embedded safely in
 * element content or inside a quoted attribute value.
 *
 * @param text - Plain (unescaped) text.
 * @returns `text` with `&`, `<`, `>`, `"` and `'` replaced by their
 *   predefined XML entities.
 */
function escapeXml(text: string): string {
  // `&` must be handled first; otherwise the ampersands introduced by the
  // later replacements would themselves be escaped again.
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&apos;");
}
|
||
|
|
|
||
|
|
/**
 * Decode the predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) and numeric character references (`&#NN;`, `&#xHH;`) in `text`.
 *
 * Decoding happens in a single pass, so `&amp;lt;` becomes `&lt;`, not `<`.
 * References outside the Unicode range are left as written.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (entity: string, name: string) => {
      switch (name) {
        case "amp":
          return "&";
        case "lt":
          return "<";
        case "gt":
          return ">";
        case "quot":
          return '"';
        case "apos":
          return "'";
        default: {
          const codePoint =
            name.charAt(1) === "x"
              ? parseInt(name.slice(2), 16)
              : parseInt(name.slice(1), 10);
          return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        }
      }
    }
  );
}

/**
 * Extract the text of every non-empty paragraph from a `document.xml` string.
 *
 * @param xml - Raw WordprocessingML markup.
 * @returns One entry per paragraph that contains visible text, in document
 *   order, with non-overlapping spans:
 *   - `index`: ordinal of the paragraph among the top-level `<w:p>` elements
 *     scanned (empty and self-closing paragraphs are counted, paragraphs
 *     nested inside an already captured span are not);
 *   - `text`: concatenated, entity-decoded and trimmed content of the
 *     paragraph's `<w:t>` elements;
 *   - `start` / `end`: offsets into `xml` such that `xml.slice(start, end)`
 *     runs from `<w:p` up to and including the first following `</w:p>`.
 */
function extractParagraphs(xml: string): { index: number; text: string; start: number; end: number }[] {
  const paragraphs: { index: number; text: string; start: number; end: number }[] = [];
  // `<w:p` must be followed by whitespace or `>` so `<w:pPr>` etc. do not match.
  const pOpenRegex = /<w:p[\s>]/g;
  // `<w:t` must likewise be followed by whitespace or `>`; a bare `[^>]*`
  // would also accept `<w:tab/>`, `<w:tabs>`, `<w:tbl>` ... and drag markup
  // into the captured text. Self-closing `<w:t .../>` is rejected as well.
  const tRegex = /<w:t(?:\s(?:[^>]*[^>\/])?)?>([\s\S]*?)<\/w:t>/g;
  const pCloseTag = "</w:p>";
  let idx = 0;
  let match: RegExpExecArray | null;

  while ((match = pOpenRegex.exec(xml)) !== null) {
    const start = match.index;

    // A self-closing `<w:p .../>` has no body and no closing tag of its own;
    // pairing it with the next `</w:p>` would swallow the following paragraph.
    const openEnd = xml.indexOf(">", start);
    if (openEnd === -1) break;
    if (xml.charAt(openEnd - 1) === "/") {
      idx++;
      continue;
    }

    const closeAt = xml.indexOf(pCloseTag, start);
    if (closeAt === -1) break; // unterminated paragraph: nothing more to extract
    const end = closeAt + pCloseTag.length;
    const paraXml = xml.slice(start, end);

    // Concatenate the content of all text elements within this paragraph.
    let rawText = "";
    for (const tMatch of paraXml.matchAll(tRegex)) {
      rawText += tMatch[1] ?? "";
    }

    // The markup stores text entity-escaped; callers need the plain text.
    const text = decodeXmlEntities(rawText).trim();
    if (text) {
      paragraphs.push({ index: idx, text, start, end });
    }
    idx++;

    // Resume after this span so a paragraph nested inside it (e.g. in a text
    // box) is not reported again with an overlapping range.
    pOpenRegex.lastIndex = end;
  }
  return paragraphs;
}
|
||
|
|
|
||
|
|
/**
 * Replace the text of a paragraph while keeping the run (and therefore the
 * formatting) that holds its first non-blank text element.
 *
 * The first `<w:t>` whose content is not blank receives the whole translated
 * text; every `<w:t>` after it is emptied. Blank text elements that precede it
 * are left exactly as they were.
 *
 * @param paraXml - Markup of a single paragraph (`<w:p ...>...</w:p>`).
 * @param translatedText - Plain replacement text; it is XML-escaped here.
 * @returns The paragraph markup with its text replaced. If the paragraph has
 *   no non-blank `<w:t>`, the input is returned unchanged.
 */
function replaceParagraphText(paraXml: string, translatedText: string): string {
  let firstDone = false;
  // `<w:t` must be followed by whitespace or `>`: a bare `<w:t[^>]*>` would
  // also match `<w:tab/>`, `<w:tabs>`, `<w:tbl>` ... and the replacement would
  // then delete everything from that tag up to the next `</w:t>`, leaving
  // malformed XML. Self-closing `<w:t .../>` is not matched either.
  return paraXml.replace(
    /<w:t(?:\s(?:[^>]*[^>\/])?)?>([\s\S]*?)<\/w:t>/g,
    (run: string, content: string) => {
      if (firstDone) {
        return `<w:t></w:t>`;
      }
      if (!content.trim()) {
        return run; // preserve blank runs before the first text
      }
      firstDone = true;
      // xml:space="preserve" keeps leading/trailing spaces of the translation.
      return `<w:t xml:space="preserve">${escapeXml(translatedText)}</w:t>`;
    }
  );
}
|
||
|
|
|
||
|
|
/**
 * Translate the main body of a DOCX file, preserving its formatting.
 *
 * Only `word/document.xml` is read and rewritten; every other entry of the
 * archive is passed through as loaded. All paragraph texts are sent to
 * `replicateTranslateBatch` in a single call and the results are written back
 * paragraph by paragraph with `replaceParagraphText`.
 *
 * @param buffer - Contents of the source `.docx` file.
 * @param targetLanguage - Target language, forwarded to `replicateTranslateBatch`.
 * @returns A buffer holding the re-zipped (DEFLATE-compressed) document.
 * @throws Error if the archive has no `word/document.xml` entry.
 */
export async function translateDocx(
  buffer: Buffer,
  targetLanguage: string
): Promise<Buffer> {
  const zip = await JSZip.loadAsync(buffer);
  const docFile = zip.file("word/document.xml");
  if (!docFile) throw new Error("Invalid DOCX: missing word/document.xml");

  const xml = await docFile.async("string");
  const paragraphs = extractParagraphs(xml);

  if (paragraphs.length > 0) {
    // Translate all paragraphs in one batch.
    const translations = await replicateTranslateBatch(
      paragraphs.map(p => p.text),
      targetLanguage
    );

    // Pair each paragraph with its translation by position, then order by
    // offset so the document can be rebuilt in a single forward pass.
    const jobs = paragraphs
      .map((para, i) => ({ para, translated: translations[i] }))
      .sort((a, b) => a.para.start - b.para.start);

    const pieces: string[] = [];
    let cursor = 0;
    for (const { para, translated } of jobs) {
      // No translation available: keep the paragraph exactly as it is rather
      // than collapsing its runs around the untranslated text.
      if (translated == null) continue;
      // A span starting inside one already written overlaps it; rewriting
      // both would corrupt the markup, so the later one is skipped.
      if (para.start < cursor) continue;

      pieces.push(
        xml.slice(cursor, para.start),
        replaceParagraphText(xml.slice(para.start, para.end), translated)
      );
      cursor = para.end;
    }
    pieces.push(xml.slice(cursor));

    zip.file("word/document.xml", pieces.join(""));
  }

  // Same output settings whether or not anything was translated.
  const outBuffer = await zip.generateAsync({
    type: "nodebuffer",
    compression: "DEFLATE"
  });
  return Buffer.from(outBuffer);
}
|