mono/packages/content/ref/pdf-to-images/dist/lib/pdf.js

70 lines
3.3 KiB
JavaScript

import * as mupdf from 'mupdf';
import { Logger } from 'tslog';
import { dirname } from 'node:path';
import { resolveVariables } from '@polymech/commons';
import { sync as mkdir } from '@polymech/fs/dir';
import { writeFileSync } from 'node:fs';
import { Buffer } from 'node:buffer';
/**
 * Converts encoded image bytes into a Node.js Buffer.
 *
 * Two input shapes are supported:
 *  - a `Uint8Array` (which includes `Buffer`): the bytes are copied in one call;
 *  - an index-keyed object such as `{ 0: 137, 1: 80 }`: every own enumerable key
 *    made only of decimal digits is treated as a byte offset.
 *
 * Keys that are not plain decimal indices (for example a `length` property) are
 * ignored. Offsets that are missing from a sparse object are zero-filled, so the
 * result never contains uninitialised memory.
 *
 * @param {Uint8Array|Object<string, number>} imageDataObject - Source bytes.
 * @returns {Buffer} A new Buffer that does not share memory with the input.
 * @throws {TypeError} If `imageDataObject` is `null` or `undefined`.
 */
function imageDataObjectToBuffer(imageDataObject) {
    // Fast path: typed-array input needs no per-key walk.
    if (imageDataObject instanceof Uint8Array) {
        return Buffer.from(imageDataObject);
    }
    // Object.keys throws a TypeError for null/undefined, as it always did here.
    const indexKeys = Object.keys(imageDataObject).filter((key) => /^\d+$/.test(key));
    // Find the highest offset with a loop; spreading into Math.max could overflow
    // the call stack for very large inputs.
    let highestIndex = -1;
    for (const key of indexKeys) {
        const index = Number(key);
        if (index > highestIndex) {
            highestIndex = index;
        }
    }
    // Buffer.alloc zero-fills, so gaps in a sparse object become 0 bytes.
    const buffer = Buffer.alloc(highestIndex + 1);
    for (const key of indexKeys) {
        buffer[Number(key)] = imageDataObject[key];
    }
    return buffer;
}
/**
 * Renders a range of pages from a PDF to image files on disk.
 *
 * For each page in the requested range the output path is produced by
 * resolving `options.outputPathTemplate` with `options.baseVariables` plus a
 * `PAGE` variable (the 1-based page number as a string). The page is rendered
 * with an identity transform in DeviceRGB, encoded as PNG when
 * `options.format === 'png'` and as JPEG otherwise, and written synchronously.
 * The parent directory of each output path is created first.
 *
 * @param {*} pdfData - PDF content passed straight to `mupdf.Document.openDocument`.
 * @param {object} options
 * @param {object} [options.logger] - Logger exposing `info` and `error`; a new tslog `Logger` is used when omitted.
 * @param {number} [options.startPage] - First page to convert, 1-based. Defaults to 1.
 * @param {number} [options.endPage] - Last page to convert, 1-based inclusive. Defaults to the page count.
 * @param {string} options.outputPathTemplate - Template handed to `resolveVariables`.
 * @param {object} [options.baseVariables] - Variables merged into every page's template variables.
 * @param {string} [options.format] - `'png'` for PNG output; any other value produces JPEG.
 * @returns {Promise<string[]>} Paths of the files written, in page order.
 * @throws {Error} If the page range is not made of integers within 1..pageCount, or if
 *   `startPage` is greater than `endPage`. Errors from rendering or writing are logged and rethrown.
 */
export async function convertPdfToImages(pdfData, options) {
    const logger = options.logger || new Logger();
    const outputFiles = [];
    let doc;
    try {
        doc = mupdf.Document.openDocument(pdfData, 'pdf');
        const pageCount = doc.countPages();
        // Effective 1-based bounds, so error messages never print "undefined"
        // when the defaults are in use.
        const startPage = options.startPage ?? 1;
        const endPage = options.endPage ?? pageCount;
        // Validate and determine page range (adjusting for 0-based index)
        const start = startPage - 1;
        const end = endPage - 1;
        // Number.isInteger also rejects NaN, which would otherwise slip past the
        // comparisons below and silently produce no output.
        if (!Number.isInteger(start) || start < 0 || start >= pageCount) {
            throw new Error(`startPage (${startPage}) is out of valid range (1-${pageCount})`);
        }
        if (!Number.isInteger(end) || end < 0 || end >= pageCount) {
            throw new Error(`endPage (${endPage}) is out of valid range (1-${pageCount})`);
        }
        if (start > end) {
            // This should also be caught by Zod schema, but good to double-check
            throw new Error(`startPage (${startPage}) cannot be greater than endPage (${endPage})`);
        }
        const numPagesToProcess = end - start + 1;
        logger.info(`Processing pages ${start + 1} to ${end + 1} (${numPagesToProcess} pages) of ${pageCount} total`);
        for (let i = start; i <= end; i++) {
            const pageNumber = i + 1; // User-facing page number (1-based)
            // Create page-specific variables
            const pageVariables = {
                ...options.baseVariables,
                PAGE: pageNumber.toString()
            };
            // Resolve the output path using the template and page-specific variables
            const outputPath = await resolveVariables(options.outputPathTemplate, false, pageVariables);
            let page;
            let pixmap;
            let imageBytes;
            try {
                page = doc.loadPage(i);
                pixmap = page.toPixmap([1, 0, 0, 1, 0, 0], mupdf.ColorSpace.DeviceRGB, false);
                const imageData = options.format === 'png'
                    ? pixmap.asPNG()
                    : pixmap.asJPEG(100, false);
                // Copy into a Buffer before the pixmap is released so the bytes
                // cannot depend on native memory that is about to be freed.
                imageBytes = imageDataObjectToBuffer(imageData);
            }
            finally {
                // Release native resources per page instead of waiting for GC;
                // optional calls tolerate builds without destroy().
                pixmap?.destroy?.();
                page?.destroy?.();
            }
            mkdir(dirname(outputPath));
            writeFileSync(outputPath, imageBytes);
            outputFiles.push(outputPath);
            logger.info(`Converted page ${pageNumber} to ${outputPath}`);
        }
        return outputFiles;
    }
    catch (error) {
        logger.error('Error converting PDF to images:', error);
        throw error;
    }
    finally {
        // Always release the document, on success and on failure alike.
        doc?.destroy?.();
    }
}