Files
LingvAI/utils/document-processors/pdf.ts
Malin 0799101da3 feat: add admin panel, Replicate AI translation, and document translation
- Admin panel (/admin) with JWT auth: configure Replicate API token,
  JigsawStack API key, model version, enable/disable AI translation,
  change admin password. Settings persisted in data/settings.json.

- Replicate AI translation: POST /api/translate/replicate uses
  JigsawStack text-translate model via Replicate API. Main page
  switches to client-side AI translation when enabled.

- Document translation tab: supports PDF, DOCX, XLSX, XLS, CSV.
  Excel/Word formatting fully preserved (SheetJS + JSZip XML manipulation).
  PDF uses pdf-parse extraction + pdf-lib reconstruction.
  Column selector UI for tabular data (per-sheet, All/None toggles).

- Updated README with full implementation documentation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-10 07:43:54 +01:00

116 lines
3.4 KiB
TypeScript

import { PDFDocument, rgb, StandardFonts } from "pdf-lib";
import { replicateTranslateBatch } from "../replicate-translate";
/**
 * Shape of the value that `parsePdf` resolves with.
 *
 * Only `text` is read in this module; the remaining fields mirror what the
 * `pdf-parse` library is expected to return (presumably per its README —
 * confirm against the installed version).
 */
interface PdfParseResult {
  /** Page count as reported by the parser (inferred from the field name). */
  numpages: number;
  /** Extracted plain text; `translatePdf` splits this into paragraphs. */
  text: string;
  /** Parser-supplied metadata whose value types are not constrained here. */
  info: Record<string, unknown>;
}
/**
 * Runs `pdf-parse` over an in-memory PDF and resolves with its result.
 *
 * @param buffer Raw bytes of the PDF to parse.
 * @returns Whatever the parser resolves with, typed as `PdfParseResult`.
 */
async function parsePdf(buffer: Buffer): Promise<PdfParseResult> {
  // The parser is loaded from the library's internal entry point at call time
  // to avoid Next.js issues with pdf-parse test file imports.
  // eslint-disable-next-line @typescript-eslint/no-var-requires
  const extract = require("pdf-parse/lib/pdf-parse.js");
  const result: PdfParseResult = await extract(buffer);
  return result;
}
/**
 * Greedily wraps `text` into lines of at most `maxCharsPerLine` characters.
 *
 * Words are the whitespace-separated runs of `text`; they are joined with a
 * single space, so the original whitespace (including newlines) is not kept.
 * A word that is longer than the limit on its own is hard-broken into chunks
 * so that no returned line runs past the limit.
 *
 * @param text Text to wrap. Leading and trailing whitespace is ignored.
 * @param maxCharsPerLine Maximum line length. Values below 1 (or NaN) are
 *   treated as 1 so that wrapping always makes progress.
 * @returns The wrapped lines, in order. Empty or whitespace-only input yields
 *   an empty array.
 */
function wrapText(text: string, maxCharsPerLine: number): string[] {
  // Clamp to a positive integer so the hard-break loop below always advances.
  const limit = Math.max(1, Math.floor(maxCharsPerLine) || 1);

  // `split(/\s+/)` yields empty tokens for leading/trailing whitespace; drop
  // them so they cannot be appended to a line as a stray trailing space.
  const words = text.split(/\s+/).filter(word => word.length > 0);

  const lines: string[] = [];
  let current = "";

  for (const word of words) {
    const candidate = current ? `${current} ${word}` : word;
    if (candidate.length <= limit) {
      current = candidate;
      continue;
    }

    // The word does not fit on the current line: flush what we have.
    if (current) lines.push(current);

    // Hard-break a word that is too long even for an empty line. Chunking by
    // code point (Array.from) avoids splitting a surrogate pair in half.
    const chars = Array.from(word);
    let start = 0;
    while (chars.length - start > limit) {
      lines.push(chars.slice(start, start + limit).join(""));
      start += limit;
    }
    current = chars.slice(start).join("");
  }

  if (current) lines.push(current);
  return lines;
}
/**
 * Translate a PDF buffer. Since PDFs don't support in-place text editing,
 * this extracts text, translates it, and creates a new formatted PDF.
 *
 * The output is a freshly generated A4-sized document (595 x 842 points)
 * containing a title line followed by the translated paragraphs; the layout,
 * images and fonts of the input are not carried over.
 *
 * @param buffer Raw bytes of the source PDF.
 * @param targetLanguage Language passed to `replicateTranslateBatch` and shown
 *   in the title line.
 * @param sourceLanguage Optional; only shown in the title line. It is not
 *   forwarded to the translation call.
 * @returns The bytes of the newly built PDF.
 * @throws Error with message "No extractable text found in PDF" when the
 *   extracted text contains no non-empty paragraph.
 */
export async function translatePdf(
  buffer: Buffer,
  targetLanguage: string,
  sourceLanguage?: string
): Promise<Buffer> {
  const parsed = await parsePdf(buffer);
  const rawText = parsed.text;

  // Split into paragraphs (separated by double newlines or page breaks)
  const paragraphs = rawText
    .split(/\n{2,}|\f/)
    .map(p => p.replace(/\n/g, " ").trim())
    .filter(p => p.length > 0);

  if (paragraphs.length === 0) {
    throw new Error("No extractable text found in PDF");
  }

  // Translate all paragraphs
  const translations = await replicateTranslateBatch(paragraphs, targetLanguage);

  // Build output PDF
  // NOTE(review): the standard Helvetica fonts only cover the WinAnsi
  // character set, and pdf-lib is documented to throw when asked to draw a
  // character the font cannot encode. Translations into non-Latin scripts
  // likely need an embedded custom font — confirm and follow up.
  const pdfDoc = await PDFDocument.create();
  const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
  const boldFont = await pdfDoc.embedFont(StandardFonts.HelveticaBold);

  const PAGE_WIDTH = 595;
  const PAGE_HEIGHT = 842;
  const MARGIN = 50;
  const FONT_SIZE = 11;
  const TITLE_SIZE = 13;
  const LINE_HEIGHT = 16;
  const MAX_LINE_CHARS = 80;
  // Vertical room between the top and bottom margins of a fresh page.
  const USABLE_HEIGHT = PAGE_HEIGHT - 2 * MARGIN;

  let page = pdfDoc.addPage([PAGE_WIDTH, PAGE_HEIGHT]);
  let y = PAGE_HEIGHT - MARGIN;

  // Starts a new page when fewer than `needed` points remain above the
  // bottom margin.
  function ensureSpace(needed: number) {
    if (y - needed < MARGIN) {
      page = pdfDoc.addPage([PAGE_WIDTH, PAGE_HEIGHT]);
      y = PAGE_HEIGHT - MARGIN;
    }
  }

  // Title
  const title = `Translation to: ${targetLanguage}${sourceLanguage ? ` (from: ${sourceLanguage})` : ""}`;
  ensureSpace(TITLE_SIZE + LINE_HEIGHT);
  page.drawText(title, {
    x: MARGIN,
    y,
    size: TITLE_SIZE,
    font: boldFont,
    color: rgb(0.2, 0.2, 0.7)
  });
  y -= TITLE_SIZE + LINE_HEIGHT;

  // Draw translated paragraphs
  for (const para of translations) {
    const lines = wrapText(para, MAX_LINE_CHARS);

    // Keep a paragraph together on one page when that is possible. A
    // paragraph taller than a whole page can never fit, so forcing a page
    // break for it would only strand the rest of the current page (or leave
    // a blank page when already at the top); let the per-line check below
    // flow it across pages instead.
    const blockHeight = lines.length * LINE_HEIGHT + LINE_HEIGHT;
    if (blockHeight <= USABLE_HEIGHT) {
      ensureSpace(blockHeight);
    }

    for (const line of lines) {
      ensureSpace(LINE_HEIGHT);
      page.drawText(line, {
        x: MARGIN,
        y,
        size: FONT_SIZE,
        font,
        color: rgb(0, 0, 0)
      });
      y -= LINE_HEIGHT;
    }
    y -= LINE_HEIGHT * 0.5; // paragraph gap
  }

  const pdfBytes = await pdfDoc.save();
  return Buffer.from(pdfBytes);
}