Files
LingvAI/utils/document-processors/docx.ts
Malin 0799101da3 feat: add admin panel, Replicate AI translation, and document translation
- Admin panel (/admin) with JWT auth: configure Replicate API token,
  JigsawStack API key, model version, enable/disable AI translation,
  change admin password. Settings persisted in data/settings.json.

- Replicate AI translation: POST /api/translate/replicate uses
  JigsawStack text-translate model via Replicate API. Main page
  switches to client-side AI translation when enabled.

- Document translation tab: supports PDF, DOCX, XLSX, XLS, CSV.
  Excel/Word formatting fully preserved (SheetJS + JSZip XML manipulation).
  PDF uses pdf-parse extraction + pdf-lib reconstruction.
  Column selector UI for tabular data (per-sheet, All/None toggles).

- Updated README with full implementation documentation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 07:43:54 +01:00

107 lines
3.5 KiB
TypeScript

import JSZip from "jszip";
import { replicateTranslateBatch } from "../replicate-translate";
/**
 * Escape the five XML-reserved characters (`&`, `<`, `>`, `"`, `'`) so that
 * `text` can be embedded in XML element content or attribute values.
 */
function escapeXml(text: string): string {
  const entityFor: Record<string, string> = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };
  // Single pass over the input: each reserved character is swapped for its
  // entity, so ampersands introduced by the replacement are never re-escaped.
  return text.replace(/[&<>"']/g, (ch) => entityFor[ch] ?? ch);
}
/**
 * Decode the XML entities that may appear in `<w:t>` character data: the five
 * predefined entities plus decimal/hex character references. Anything else
 * (including out-of-range code points) is left untouched.
 */
function decodeXmlEntities(text: string): string {
  return text.replace(
    /&(#x[0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (entity: string, body: string): string => {
      switch (body) {
        case "amp":
          return "&";
        case "lt":
          return "<";
        case "gt":
          return ">";
        case "quot":
          return '"';
        case "apos":
          return "'";
      }
      // Numeric character reference: "#123" (decimal) or "#x7B" (hex).
      const codePoint =
        body[1] === "x" ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
    }
  );
}

/**
 * Extract the non-empty paragraphs of a `document.xml` string.
 *
 * Each entry holds:
 * - `index`: ordinal of the paragraph among all scanned paragraphs (empty
 *   paragraphs are counted but not returned),
 * - `text`: concatenated, entity-decoded and trimmed content of the
 *   paragraph's `<w:t>` elements,
 * - `start` / `end`: offsets of the paragraph in `xml` (`end` is exclusive and
 *   points just past `</w:p>`).
 *
 * Entries are returned in document order and their ranges never overlap.
 */
function extractParagraphs(xml: string): { index: number; text: string; start: number; end: number }[] {
  const paragraphs: { index: number; text: string; start: number; end: number }[] = [];
  // `[\s>]` keeps `<w:pPr>` and friends from being mistaken for a paragraph.
  const pRegex = /<w:p[\s>]/g;
  const pCloseTag = "</w:p>";
  let idx = 0;
  let match: RegExpExecArray | null;
  while ((match = pRegex.exec(xml)) !== null) {
    const start = match.index;
    const closeAt = xml.indexOf(pCloseTag, start);
    if (closeAt === -1) break; // unterminated paragraph: stop scanning
    const end = closeAt + pCloseTag.length;
    const paraXml = xml.slice(start, end);

    // Only real text elements: `<w:t>` or `<w:t attr=...>`. Requiring `>` or
    // whitespace right after the name excludes `<w:tab/>`, `<w:tabs>`,
    // `<w:tbl>`, etc., which would otherwise pull markup into the text.
    const tRegex = /<w:t(?:\s[^>]*)?>([\s\S]*?)<\/w:t>/g;
    const textParts: string[] = [];
    let tMatch: RegExpExecArray | null;
    while ((tMatch = tRegex.exec(paraXml)) !== null) {
      textParts.push(tMatch[1] ?? "");
    }

    // Decode entities so callers receive plain text ("&", not "&amp;");
    // the text is re-escaped when it is written back into the XML.
    const text = decodeXmlEntities(textParts.join("")).trim();
    if (text) {
      paragraphs.push({ index: idx, text, start, end });
    }
    idx++;

    // Resume after this paragraph so a nested `<w:p>` (e.g. inside a text
    // box) cannot produce a second, overlapping range.
    pRegex.lastIndex = end;
  }
  return paragraphs;
}
/**
 * Replace the text of a paragraph with `translatedText`.
 *
 * The first `<w:t>` element with non-blank content receives the whole
 * (XML-escaped) translation, written with `xml:space="preserve"`; every
 * `<w:t>` after it is emptied. Blank `<w:t>` elements that precede it are
 * left unchanged. All other markup (run and paragraph properties, tabs, ...)
 * is kept as-is, so the paragraph takes on the formatting of the run that
 * holds the first text.
 *
 * @param paraXml XML of a single `<w:p>...</w:p>` element.
 * @param translatedText Plain (unescaped) replacement text.
 * @returns The paragraph XML with its text replaced.
 */
function replaceParagraphText(paraXml: string, translatedText: string): string {
  let firstDone = false;
  // Match only real text elements: `<w:t>` or `<w:t attr=...>`. A looser
  // `<w:t[^>]*>` would also start a match at `<w:tab/>` or `<w:tabs>` and
  // swallow the markup up to the next `</w:t>`, corrupting the document.
  return paraXml.replace(
    /<w:t(?:\s[^>]*)?>([\s\S]*?)<\/w:t>/g,
    (original: string, content: string): string => {
      if (firstDone) {
        // The translation already lives in an earlier run.
        return `<w:t></w:t>`;
      }
      if (!content.trim()) {
        return original; // preserve empty runs before the first text
      }
      firstDone = true;
      return `<w:t xml:space="preserve">${escapeXml(translatedText)}</w:t>`;
    }
  );
}
/**
 * Translate a DOCX file buffer, preserving formatting.
 *
 * Only `word/document.xml` is read and rewritten; every other entry of the
 * archive is carried over untouched. Each non-empty paragraph is translated
 * as one unit and its text is written back into the paragraph's first text
 * run.
 *
 * @param buffer Contents of the `.docx` file.
 * @param targetLanguage Target language, passed through to
 *   `replicateTranslateBatch`.
 * @returns A new DOCX archive containing the translated document.
 * @throws Error if the archive has no `word/document.xml` entry.
 */
export async function translateDocx(
  buffer: Buffer,
  targetLanguage: string
): Promise<Buffer> {
  const zip = await JSZip.loadAsync(buffer);
  const docFile = zip.file("word/document.xml");
  if (!docFile) throw new Error("Invalid DOCX: missing word/document.xml");

  let xml = await docFile.async("string");
  const paragraphs = extractParagraphs(xml);
  if (paragraphs.length === 0) {
    // Nothing to translate: re-emit the archive as loaded.
    return Buffer.from(await zip.generateAsync({ type: "nodebuffer" }));
  }

  // Translate all paragraphs in one batch.
  // NOTE(review): assumes the result is index-aligned with the input array;
  // a missing entry falls back to the original text below.
  const translations = await replicateTranslateBatch(
    paragraphs.map((p) => p.text),
    targetLanguage
  );

  // Replace paragraphs from end to start so the offsets of paragraphs that
  // are still pending stay valid. Each paragraph carries its translation
  // index through the sort, avoiding a linear lookup per paragraph.
  const replacements = paragraphs
    .map((para, translationIdx) => ({ para, translationIdx }))
    .sort((a, b) => b.para.start - a.para.start);

  for (const { para, translationIdx } of replacements) {
    const translated = translations[translationIdx] ?? para.text;
    const originalPara = xml.slice(para.start, para.end);
    const translatedPara = replaceParagraphText(originalPara, translated);
    xml = xml.slice(0, para.start) + translatedPara + xml.slice(para.end);
  }

  zip.file("word/document.xml", xml);
  const outBuffer = await zip.generateAsync({
    type: "nodebuffer",
    compression: "DEFLATE"
  });
  return Buffer.from(outBuffer);
}