utils/document-processors/docx.ts

import JSZip from "jszip";
import { replicateTranslateBatch } from "../replicate-translate";

/**
 * Escape the five characters that have predefined entities in XML
 * (`&`, `<`, `>`, `"`, `'`) so that `text` can be embedded in element
 * content or attribute values.
 *
 * @param text - Raw, unescaped text.
 * @returns The text with each special character replaced by its entity.
 */
function escapeXml(text: string): string {
    const entityFor: Record<string, string> = {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
        "'": "&apos;",
    };
    // One pass over the input: every special character is mapped exactly once,
    // so an ampersand produced by a replacement is never escaped again.
    return text.replace(/[&<>"']/g, ch => entityFor[ch] ?? ch);
}

/**
 * Decode the predefined XML entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`,
 * `&apos;`) and numeric character references (`&#233;`, `&#xE9;`).
 *
 * Decoding happens in a single pass, so `&amp;lt;` becomes the literal text
 * `&lt;` rather than `<`. Unknown or out-of-range references are left as-is.
 */
function decodeXmlEntities(text: string): string {
    return text.replace(
        /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
        (entity: string, name: string) => {
            switch (name) {
                case "amp":
                    return "&";
                case "lt":
                    return "<";
                case "gt":
                    return ">";
                case "quot":
                    return '"';
                case "apos":
                    return "'";
            }
            const codePoint =
                name[1] === "x" ? parseInt(name.slice(2), 16) : parseInt(name.slice(1), 10);
            // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
            return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
        }
    );
}

/**
 * Extract the translatable paragraphs from a document.xml string.
 *
 * Every `<w:p>` element (including self-closing ones) is counted to produce
 * the paragraph number. A paragraph is returned only when the concatenated,
 * trimmed content of its `<w:t>` runs is non-empty.
 *
 * Guarantees relied upon by offset-based replacement:
 * - returned ranges never overlap and are in ascending `start` order;
 * - self-closing paragraphs (`<w:p/>`, `<w:p w:rsidR="..."/>`) are never
 *   paired with a later paragraph's closing tag;
 * - a paragraph that contains nested paragraphs (e.g. a text box) is not
 *   returned itself; only its innermost paragraphs are.
 *
 * @param xml - Contents of `word/document.xml`.
 * @returns One entry per non-empty paragraph: `index` is the paragraph
 *   number, `text` is the entity-decoded plain text, and `[start, end)` is
 *   the span of the whole `<w:p>…</w:p>` element inside `xml`.
 */
function extractParagraphs(xml: string): { index: number; text: string; start: number; end: number }[] {
    const paragraphs: { index: number; text: string; start: number; end: number }[] = [];
    // Opening, self-closing and closing <w:p> tags only: the lookahead keeps
    // <w:pPr>, <w:pict>, <w:pStyle> etc. from being mistaken for paragraphs.
    const pTagRegex = /<(\/?)w:p(?=[\s>\/])[^>]*>/g;
    // Exactly <w:t> / <w:t attr="..."> (not self-closing): a bare "<w:t" prefix
    // would also match <w:tab/>, <w:tabs>, <w:tbl>, ... and capture raw markup.
    const tRunRegex = /<w:t(?:\s(?:[^>]*[^>\/])?)?>([\s\S]*?)<\/w:t>/g;
    const openStack: { index: number; start: number; hasNested: boolean }[] = [];
    let idx = 0;
    let match: RegExpExecArray | null;

    while ((match = pTagRegex.exec(xml)) !== null) {
        const tag = match[0];

        if (match[1] === "/") {
            const current = openStack.pop();
            // Ignore stray closing tags, and outer paragraphs whose span would
            // overlap the nested paragraphs already collected.
            if (!current || current.hasNested) continue;

            const end = match.index + tag.length;
            const paraXml = xml.slice(current.start, end);

            // Concatenate the text content of every run in this paragraph.
            const textParts: string[] = [];
            for (const run of paraXml.matchAll(tRunRegex)) {
                textParts.push(run[1] ?? "");
            }

            // Decode entities so callers receive real text; it is re-escaped
            // when written back into the document.
            const text = decodeXmlEntities(textParts.join("")).trim();
            if (text) {
                paragraphs.push({ index: current.index, text, start: current.start, end });
            }
            continue;
        }

        const index = idx++;
        // A self-closing paragraph has no content and no closing tag of its own.
        if (tag.endsWith("/>")) continue;

        const parent = openStack[openStack.length - 1];
        if (parent) parent.hasNested = true;
        openStack.push({ index, start: match.index, hasNested: false });
    }
    return paragraphs;
}

/**
 * Replace the text of a paragraph with `translatedText`.
 *
 * The whole translation is written into the first `<w:t>` element that has
 * non-whitespace content, so it inherits that run's formatting. Every later
 * `<w:t>` element is emptied, and whitespace-only `<w:t>` elements that
 * precede the first text are left untouched. The replaced element is always
 * written with `xml:space="preserve"`.
 *
 * @param paraXml - XML of a single `<w:p>…</w:p>` element.
 * @param translatedText - Unescaped replacement text; it is XML-escaped here.
 * @returns The paragraph XML with its text runs rewritten. Returned unchanged
 *   when no `<w:t>` element has non-whitespace content.
 */
function replaceParagraphText(paraXml: string, translatedText: string): string {
    let firstDone = false;
    // Match only real text elements (<w:t> or <w:t attr="...">, not
    // self-closing). A bare "<w:t" prefix would also match <w:tab/>, <w:tabs>
    // or <w:tbl> and the replacement would then destroy paragraph properties.
    return paraXml.replace(
        /<w:t(?:\s(?:[^>]*[^>\/])?)?>([\s\S]*?)<\/w:t>/g,
        (run: string, content: string) => {
            if (firstDone) {
                return `<w:t></w:t>`;
            }
            if (!content.trim()) {
                return run; // preserve empty runs before the first text
            }
            firstDone = true;
            return `<w:t xml:space="preserve">${escapeXml(translatedText)}</w:t>`;
        }
    );
}

/**
 * Translate the body text of a DOCX file while keeping its formatting.
 *
 * Only `word/document.xml` is rewritten; every other entry of the archive is
 * carried over as loaded. Each non-empty paragraph is sent to
 * `replicateTranslateBatch` and its text is replaced in place.
 *
 * @param buffer - Raw bytes of the source `.docx` file.
 * @param targetLanguage - Target language, passed through unchanged to
 *   `replicateTranslateBatch`.
 * @returns Bytes of the translated `.docx` file.
 * @throws Error if the archive has no `word/document.xml` entry.
 */
export async function translateDocx(
    buffer: Buffer,
    targetLanguage: string
): Promise<Buffer> {
    const zip = await JSZip.loadAsync(buffer);
    const docFile = zip.file("word/document.xml");
    if (!docFile) throw new Error("Invalid DOCX: missing word/document.xml");

    const xml = await docFile.async("string");
    const paragraphs = extractParagraphs(xml);

    // Nothing to translate: hand back the archive as it was loaded.
    if (paragraphs.length === 0) {
        return Buffer.from(await zip.generateAsync({ type: "nodebuffer" }));
    }

    // Translate all paragraphs in one batch. Results are matched to paragraphs
    // by position (assumes the batch preserves input order — confirm against
    // replicateTranslateBatch).
    const translations = await replicateTranslateBatch(
        paragraphs.map(p => p.text),
        targetLanguage
    );

    // Remember each paragraph's position in the batch before sorting, so no
    // per-paragraph lookup is needed afterwards.
    const ordered = paragraphs
        .map((para, translationIdx) => ({ para, translationIdx }))
        .sort((a, b) => a.para.start - b.para.start);

    // Rebuild the document in a single forward pass: untouched XML between
    // paragraphs is copied verbatim, so original offsets stay valid throughout.
    const pieces: string[] = [];
    let cursor = 0;
    for (const { para, translationIdx } of ordered) {
        // A range starting inside an already-emitted one cannot be spliced
        // safely; skip it rather than corrupt the XML.
        if (para.start < cursor) continue;

        const candidate = translations[translationIdx];
        // Keep the original text when the translation is missing or blank, so
        // a failed item never erases a paragraph.
        const translated =
            typeof candidate === "string" && candidate.trim() !== "" ? candidate : para.text;

        pieces.push(xml.slice(cursor, para.start));
        pieces.push(replaceParagraphText(xml.slice(para.start, para.end), translated));
        cursor = para.end;
    }
    pieces.push(xml.slice(cursor));

    zip.file("word/document.xml", pieces.join(""));
    const outBuffer = await zip.generateAsync({
        type: "nodebuffer",
        compression: "DEFLATE"
    });
    return Buffer.from(outBuffer);
}
feat: add admin panel, Replicate AI translation, and document translation - Admin panel (/admin) with JWT auth: configure Replicate API token, JigsawStack API key, model version, enable/disable AI translation, change admin password. Settings persisted in data/settings.json. - Replicate AI translation: POST /api/translate/replicate uses JigsawStack text-translate model via Replicate API. Main page switches to client-side AI translation when enabled. - Document translation tab: supports PDF, DOCX, XLSX, XLS, CSV. Excel/Word formatting fully preserved (SheetJS + JSZip XML manipulation). PDF uses pdf-parse extraction + pdf-lib reconstruction. Column selector UI for tabular data (per-sheet, All/None toggles). - Updated README with full implementation documentation. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-10 07:43:54 +01:00			`import JSZip from "jszip";`
			`import { replicateTranslateBatch } from "../replicate-translate";`

			`function escapeXml(text: string): string {`
			`return text`
			`.replace(/&/g, "&amp;")`
			`.replace(/</g, "&lt;")`
			`.replace(/>/g, "&gt;")`
			`.replace(/"/g, "&quot;")`
			`.replace(/'/g, "&apos;");`
			`}`

			`/**`
			`* Extract paragraph texts from document.xml string.`
			`* Returns array of {index, text} where index is the paragraph number.`
			`*/`
			`function extractParagraphs(xml: string): { index: number; text: string; start: number; end: number }[] {`
			`const paragraphs: { index: number; text: string; start: number; end: number }[] = [];`
			`const pRegex = /<w:p[ >]/g;`
			`const pCloseTag = "</w:p>";`
			`let idx = 0;`
			`let match: RegExpExecArray \| null;`

			`while ((match = pRegex.exec(xml)) !== null) {`
			`const start = match.index;`
			`const end = xml.indexOf(pCloseTag, start) + pCloseTag.length;`
			`if (end < pCloseTag.length) break;`

			`const paraXml = xml.slice(start, end);`

			`// Extract all text content within this paragraph`
			`const textParts: string[] = [];`
			`const tRegex = /<w:t[^>]*>([\s\S]*?)<\/w:t>/g;`
			`let tMatch: RegExpExecArray \| null;`
			`while ((tMatch = tRegex.exec(paraXml)) !== null) {`
			`textParts.push(tMatch[1]);`
			`}`

			`const text = textParts.join("").trim();`
			`if (text) {`
			`paragraphs.push({ index: idx, text, start, end });`
			`}`
			`idx++;`
			`}`
			`return paragraphs;`
			`}`

			`/**`
			`* Replace text within a paragraph XML while preserving formatting of first run.`
			`* Empties all other text runs.`
			`*/`
			`function replaceParagraphText(paraXml: string, translatedText: string): string {`
			`let firstDone = false;`
			`return paraXml.replace(/<w:t([^>]*)>([\s\S]*?)<\/w:t>/g, (_match, attrs, content) => {`
			`if (!firstDone && content.trim()) {`
			`firstDone = true;`
			return `<w:t xml:space="preserve">${escapeXml(translatedText)}</w:t>`;
			`}`
			`if (firstDone) {`
			return `<w:t></w:t>`;
			`}`
			`return _match; // preserve empty runs before the first text`
			`});`
			`}`

			`/**`
			`* Translate a DOCX file buffer, preserving formatting.`
			`*/`
			`export async function translateDocx(`
			`buffer: Buffer,`
			`targetLanguage: string`
			`): Promise<Buffer> {`
			`const zip = await JSZip.loadAsync(buffer);`
			`const docFile = zip.file("word/document.xml");`
			`if (!docFile) throw new Error("Invalid DOCX: missing word/document.xml");`

			`let xml = await docFile.async("string");`
			`const paragraphs = extractParagraphs(xml);`

			`if (paragraphs.length === 0) {`
			`return Buffer.from(await zip.generateAsync({ type: "nodebuffer" }));`
			`}`

			`// Translate all paragraphs`
			`const translations = await replicateTranslateBatch(`
			`paragraphs.map(p => p.text),`
			`targetLanguage`
			`);`

			`// Replace paragraphs from end to start to preserve offsets`
			`const sorted = [...paragraphs].sort((a, b) => b.start - a.start);`
			`for (const para of sorted) {`
			`const translationIdx = paragraphs.findIndex(p => p.start === para.start);`
			`const translated = translations[translationIdx] ?? para.text;`
			`const originalPara = xml.slice(para.start, para.end);`
			`const translatedPara = replaceParagraphText(originalPara, translated);`
			`xml = xml.slice(0, para.start) + translatedPara + xml.slice(para.end);`
			`}`

			`zip.file("word/document.xml", xml);`
			`const outBuffer = await zip.generateAsync({`
			`type: "nodebuffer",`
			`compression: "DEFLATE"`
			`});`
			`return Buffer.from(outBuffer);`
			`}`