// LingvAI/utils/document-processors/pdf.ts

import { PDFDocument, rgb, StandardFonts } from "pdf-lib";
import { replicateTranslateBatch } from "../replicate-translate";

/**
 * Shape of the object that `parsePdf` resolves with.
 *
 * Only `text` is read in this file; the other fields are declared for
 * completeness.
 */
interface PdfParseResult {
    /** Presumably the page count reported by pdf-parse — not used in this file. */
    numpages: number;
    /** Extracted text of the document; split into paragraphs by `translatePdf`. */
    text: string;
    /** Free-form key/value data returned alongside the text — not used in this file. */
    info: Record<string, unknown>;
}

/**
 * Run pdf-parse over a PDF held in memory.
 *
 * The parser is loaded lazily from its internal entry point
 * (`pdf-parse/lib/pdf-parse.js`) rather than the package root, to avoid
 * Next.js issues with pdf-parse's test file imports.
 *
 * @param buffer - Raw bytes of the PDF to parse.
 * @returns Whatever the parser resolves with, typed as `PdfParseResult`.
 */
async function parsePdf(buffer: Buffer): Promise<PdfParseResult> {
    // Deep import on purpose — see the note above.
    // eslint-disable-next-line @typescript-eslint/no-var-requires
    const parse = require("pdf-parse/lib/pdf-parse.js");
    const result = await parse(buffer);
    return result;
}

/**
 * Cut a single whitespace-free token into pieces of at most `limit` UTF-16
 * code units. Iterates by code point so a surrogate pair is never split; a
 * single code point wider than `limit` is kept whole.
 */
function splitLongWord(word: string, limit: number): string[] {
    const pieces: string[] = [];
    let piece = "";
    for (const ch of word) {
        if (piece && piece.length + ch.length > limit) {
            pieces.push(piece);
            piece = "";
        }
        piece += ch;
    }
    if (piece) pieces.push(piece);
    return pieces;
}

/**
 * Greedily wrap `text` into lines of at most `maxCharsPerLine` characters.
 *
 * Words are separated on any run of whitespace and re-joined with single
 * spaces, so leading, trailing and repeated whitespace never reaches the
 * output. A word that is longer than the limit on its own (a long URL, or
 * text in a script that does not use spaces) is hard-split so that no
 * returned line exceeds the limit.
 *
 * @param text - Text to wrap; may be empty.
 * @param maxCharsPerLine - Maximum line length; values below 1 are treated as 1.
 * @returns The wrapped lines, or an empty array when `text` has no words.
 */
function wrapText(text: string, maxCharsPerLine: number): string[] {
    // Clamp so that hard-splitting always makes progress.
    const limit = Math.max(1, Math.floor(maxCharsPerLine));
    const lines: string[] = [];
    let current = "";

    for (const word of text.split(/\s+/)) {
        // split() yields "" for leading/trailing whitespace; skip those.
        if (!word) continue;

        const pieces = word.length > limit ? splitLongWord(word, limit) : [word];
        for (const piece of pieces) {
            if (!current) {
                current = piece;
            } else if (current.length + 1 + piece.length > limit) {
                lines.push(current);
                current = piece;
            } else {
                current += " " + piece;
            }
        }
    }

    if (current) lines.push(current);
    return lines;
}

/**
 * Translate the text of a PDF and return it as a newly generated PDF.
 *
 * PDFs don't support in-place text editing, so this extracts the text,
 * splits it into paragraphs, translates them in one batch and lays the
 * result out as plain wrapped text on fresh 595 x 842 pt pages. Only the
 * extracted text is used to build the output.
 *
 * NOTE(review): the output uses pdf-lib's built-in Helvetica fonts. Standard
 * fonts are believed to be limited to WinAnsi encoding, in which case
 * `drawText` would throw for translations in non-Latin scripts — confirm and
 * consider embedding a Unicode font.
 *
 * @param buffer - Raw bytes of the source PDF.
 * @param targetLanguage - Passed to the translator and shown in the title line.
 * @param sourceLanguage - Optional; only shown in the title line, it is not
 *   forwarded to the translator.
 * @returns The bytes of the generated PDF.
 * @throws Error if no text could be extracted from the input.
 */
export async function translatePdf(
    buffer: Buffer,
    targetLanguage: string,
    sourceLanguage?: string
): Promise<Buffer> {
    const parsed = await parsePdf(buffer);
    const rawText = parsed.text;

    // Paragraphs are separated by blank lines or form feeds; single newlines
    // inside a paragraph are treated as soft wraps and joined with spaces.
    const paragraphs = rawText
        .split(/\n{2,}|\f/)
        .map(p => p.replace(/\n/g, " ").trim())
        .filter(p => p.length > 0);

    if (paragraphs.length === 0) {
        throw new Error("No extractable text found in PDF");
    }

    // Translate all paragraphs in a single batch call
    const translations = await replicateTranslateBatch(paragraphs, targetLanguage);

    // Build output PDF
    const pdfDoc = await PDFDocument.create();
    const font = await pdfDoc.embedFont(StandardFonts.Helvetica);
    const boldFont = await pdfDoc.embedFont(StandardFonts.HelveticaBold);

    const PAGE_WIDTH = 595;
    const PAGE_HEIGHT = 842;
    const MARGIN = 50;
    const FONT_SIZE = 11;
    const TITLE_SIZE = 13;
    const LINE_HEIGHT = 16;
    const MAX_LINE_CHARS = 80;
    // Vertical room available on an empty page, between top and bottom margins.
    const USABLE_HEIGHT = PAGE_HEIGHT - 2 * MARGIN;

    let page = pdfDoc.addPage([PAGE_WIDTH, PAGE_HEIGHT]);
    // `y` is the baseline for the next line; it moves down the page as text is drawn.
    let y = PAGE_HEIGHT - MARGIN;

    // Start a new page when `needed` points would cross the bottom margin.
    function ensureSpace(needed: number) {
        if (y - needed < MARGIN) {
            page = pdfDoc.addPage([PAGE_WIDTH, PAGE_HEIGHT]);
            y = PAGE_HEIGHT - MARGIN;
        }
    }

    // Title
    const title = `Translation to: ${targetLanguage}${sourceLanguage ? ` (from: ${sourceLanguage})` : ""}`;
    ensureSpace(TITLE_SIZE + LINE_HEIGHT);
    page.drawText(title, {
        x: MARGIN,
        y,
        size: TITLE_SIZE,
        font: boldFont,
        color: rgb(0.2, 0.2, 0.7)
    });
    y -= TITLE_SIZE + LINE_HEIGHT;

    // Draw translated paragraphs
    for (const para of translations) {
        const lines = wrapText(para, MAX_LINE_CHARS);
        const paragraphHeight = lines.length * LINE_HEIGHT + LINE_HEIGHT;

        // Keep a paragraph together on one page when it can fit on one.
        // A paragraph taller than a whole page would gain nothing from a
        // forced break (it only leaves the current page mostly empty), so
        // let it flow from the current position instead.
        if (paragraphHeight <= USABLE_HEIGHT) {
            ensureSpace(paragraphHeight);
        }

        for (const line of lines) {
            ensureSpace(LINE_HEIGHT);
            page.drawText(line, {
                x: MARGIN,
                y,
                size: FONT_SIZE,
                font,
                color: rgb(0, 0, 0)
            });
            y -= LINE_HEIGHT;
        }
        y -= LINE_HEIGHT * 0.5; // paragraph gap
    }

    const pdfBytes = await pdfDoc.save();
    return Buffer.from(pdfBytes);
}