media:pdf:tests 2/3

packages/media/dist-in/cli.d.ts (1 changed line, vendored, Normal file)
@@ -0,0 +1 @@
export declare const cli: any;

packages/media/dist-in/cli.js (4 changed lines, Normal file)
@@ -0,0 +1,4 @@
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
export const cli = yargs(hideBin(process.argv));
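// NOTE (illustrative, not in the original file): command modules import this shared
// `cli` instance and register themselves against it via cli.command(...);
// see pdf2jpg.js below.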
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiY2xpLmpzIiwic291cmNlUm9vdCI6IiIsInNvdXJjZXMiOlsiLi4vc3JjL2NsaS50cyJdLCJuYW1lcyI6W10sIm1hcHBpbmdzIjoiQUFBQSxPQUFPLEtBQUssTUFBTSxPQUFPLENBQUE7QUFDekIsT0FBTyxFQUFFLE9BQU8sRUFBRSxNQUFNLGVBQWUsQ0FBQTtBQUV2QyxNQUFNLENBQUMsTUFBTSxHQUFHLEdBQUcsS0FBSyxDQUFDLE9BQU8sQ0FBQyxPQUFPLENBQUMsSUFBSSxDQUFDLENBQUMsQ0FBQSJ9

packages/media/dist-in/commands/pdf2jpg.d.ts (1 changed line, vendored)
@@ -3,4 +3,3 @@ export declare const command = "pdf2jpg";
export declare const desc = "Convert PDF to images";
export declare const builder: (yargs: CLI.Argv) => any;
export declare function handler(argv: CLI.Arguments): Promise<void>;
export declare const register: (cli: CLI.Argv) => any;
packages/media/dist-in/commands/pdf2jpg.js
@@ -3,6 +3,7 @@ import { existsSync } from 'node:fs';
import * as z from 'zod';
import { runConversion } from '../lib/pdf/convert.js';
import { ConvertCommandSchema } from '../lib/pdf/types.js';
+import { cli } from '../cli.js';
export const command = 'pdf2jpg';
export const desc = 'Convert PDF to images';
export const builder = (yargs) => {
@@ -65,7 +66,5 @@ export async function handler(argv) {
        process.exit(1);
    }
}
-export const register = (cli) => {
-    return cli.command(command, desc, builder, handler);
-};
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicGRmMmpwZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9jb21tYW5kcy9wZGYyanBnLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sRUFBRSxNQUFNLEVBQUUsTUFBTSxhQUFhLENBQUE7QUFDcEMsT0FBTyxFQUFFLFVBQVUsRUFBRSxNQUFNLFNBQVMsQ0FBQztBQUNyQyxPQUFPLEtBQUssQ0FBQyxNQUFNLEtBQUssQ0FBQztBQUN6QixPQUFPLEVBQUUsYUFBYSxFQUFFLE1BQU0sdUJBQXVCLENBQUM7QUFDdEQsT0FBTyxFQUF3QixvQkFBb0IsRUFBRSxNQUFNLHFCQUFxQixDQUFBO0FBRWhGLE1BQU0sQ0FBQyxNQUFNLE9BQU8sR0FBRyxTQUFTLENBQUM7QUFDakMsTUFBTSxDQUFDLE1BQU0sSUFBSSxHQUFHLHVCQUF1QixDQUFDO0FBRTVDLE1BQU0sQ0FBQyxNQUFNLE9BQU8sR0FBRyxDQUFDLEtBQWUsRUFBRSxFQUFFO0lBQ3ZDLE9BQU8sS0FBSztTQUNQLE1BQU0sQ0FBQyxPQUFPLEVBQUU7UUFDYixLQUFLLEVBQUUsR0FBRztRQUNWLFFBQVEsRUFBRSw0QkFBNEI7UUFDdEMsWUFBWSxFQUFFLElBQUk7UUFDbEIsSUFBSSxFQUFFLFFBQVE7S0FDakIsQ0FBQztTQUNELE1BQU0sQ0FBQyxRQUFRLEVBQUU7UUFDZCxLQUFLLEVBQUUsR0FBRztRQUNWLFFBQVEsRUFBRSxxREFBcUQ7UUFDL0QsSUFBSSxFQUFFLFFBQVE7S0FDakIsQ0FBQztTQUNELE1BQU0sQ0FBQyxLQUFLLEVBQUU7UUFDWCxRQUFRLEVBQUUsa0NBQWtDO1FBQzVDLE9BQU8sRUFBRSxHQUFHO1FBQ1osSUFBSSxFQUFFLFFBQVE7S0FDakIsQ0FBQztTQUNELE1BQU0sQ0FBQyxPQUFPLEVBQUU7UUFDYixRQUFRLEVBQUUsZ0VBQWdFO1FBQzFFLE9BQU8sRUFBRSxDQUFDO1FBQ1YsSUFBSSxFQUFFLFFBQVE7S0FDakIsQ0FBQztTQUNELE1BQU0sQ0FBQyxRQUFRLEVBQUU7UUFDZCxRQUFRLEVBQUUscUJBQXFCO1FBQy9CLE9BQU8sRUFBRSxDQUFDLEtBQUssRUFBRSxLQUFLLENBQUM7UUFDdkIsT0FBTyxFQUFFLEtBQUs7UUFDZCxJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDO1NBQ0QsTUFBTSxDQUFDLFdBQVcsRUFBRTtRQUNqQixRQUFRLEVBQUUsdUNBQXVDO1FBQ2pELElBQUksRUFBRSxRQUFRO0tBQ2pCLENBQUM7U0FDRCxNQUFNLENBQUMsU0FBUyxFQUFFO1FBQ2YsUUFBUSxFQUFFLHNDQUFzQztRQUNoRCxJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDLENBQUE7QUFDVixDQUFDLENBQUE7QUFFRCxNQUFNLENBQUMsS0FBSyxVQUFVLE9BQU8sQ0FBQyxJQUFtQjtJQUM3QyxJQUFJLENBQUM7UUFDRCxNQUFNLE1BQU0sR0FBRyxvQkFBb0IsQ0FBQyxLQUFLLENBQUMsSUFBSSxDQUF5QixDQUFDO1FBQ3hFLElBQUksQ0FBQyxVQUFVLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxFQUFFLENBQUM7WUFDNUIsTUFBTSxJQUFJLEtBQUssQ0FBQyxjQUFjLE1BQU0sQ0FBQyxLQUFLLGlCQUFpQixDQUFDLENBQUM7UUFDakUsQ0FBQztRQUNELE1BQU0sQ0FBQyxJQUFJLENBQUMsd0NBQXdDLENBQUMsQ0FBQztRQUN0RCxNQUFNLFdBQVcsR0FBRyxNQUFNLGFBQWEsQ0FBQyxNQUFNLEVBQUUsTUFBTSxDQUFDLENBQUM7UUFDeEQsTUFBTSxDQUFDLElBQUksQ0FBQyxtQ0FBbUMsQ0FBQyxDQUFDO1FBQ2pELE1BQU0sQ0FBQyxJQUFJLENBQUMsYUFBYSxXQUFXLENBQUMsTUFBTSxTQUFTLENBQUMsQ0FBQztJQUMxRCxDQUFDO0lBQUMsT0FBTyxLQUFLLEVBQUUsQ0FBQztRQUNiLElBQUksS0FBSyxZQUFZLENBQUMsQ0FBQyxRQUFRLEVBQUUsQ0FBQztZQUM5QixNQUFNLENBQUMsS0FBSyxDQUFDLG9CQUFvQixFQUFFLEtBQUssQ0FBQyxPQUFPLEVBQUUsQ0FBQyxDQUFDO1FBQ3hELENBQUM7YUFBTSxDQUFDO1lBQ0osTUFBTSxPQUFPLEdBQUcsS0FBSyxZQUFZLEtBQUssQ0FBQyxDQUFDLENBQUMsS0FBSyxDQUFDLE9BQU8sQ0FBQyxDQUFDLENBQUMsTUFBTSxDQUFDLEtBQUssQ0FBQyxDQUFDO1lBQ3ZFLE1BQU0sQ0FBQyxLQUFLLENBQUMsa0NBQWtDLEVBQUUsT0FBTyxFQUFFLEtBQUssQ0FBQyxDQUFDO1FBQ3JFLENBQUM7UUFDRCxPQUFPLENBQUMsSUFBSSxDQUFDLENBQUMsQ0FBQyxDQUFDO0lBQ3BCLENBQUM7QUFDTCxDQUFDO0FBRUQsTUFBTSxDQUFDLE1BQU0sUUFBUSxHQUFHLENBQUMsR0FBYSxFQUFFLEVBQUU7SUFDdEMsT0FBTyxHQUFHLENBQUMsT0FBTyxDQUFDLE9BQU8sRUFBRSxJQUFJLEVBQUUsT0FBTyxFQUFFLE9BQU8sQ0FBQyxDQUFBO0FBQ3ZELENBQUMsQ0FBQSJ9
+cli.command(command, desc, builder, handler);
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicGRmMmpwZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9jb21tYW5kcy9wZGYyanBnLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sRUFBRSxNQUFNLEVBQUUsTUFBTSxhQUFhLENBQUE7QUFDcEMsT0FBTyxFQUFFLFVBQVUsRUFBRSxNQUFNLFNBQVMsQ0FBQztBQUNyQyxPQUFPLEtBQUssQ0FBQyxNQUFNLEtBQUssQ0FBQztBQUN6QixPQUFPLEVBQUUsYUFBYSxFQUFFLE1BQU0sdUJBQXVCLENBQUM7QUFDdEQsT0FBTyxFQUF3QixvQkFBb0IsRUFBRSxNQUFNLHFCQUFxQixDQUFBO0FBQ2hGLE9BQU8sRUFBRSxHQUFHLEVBQUUsTUFBTSxXQUFXLENBQUM7QUFFaEMsTUFBTSxDQUFDLE1BQU0sT0FBTyxHQUFHLFNBQVMsQ0FBQztBQUNqQyxNQUFNLENBQUMsTUFBTSxJQUFJLEdBQUcsdUJBQXVCLENBQUM7QUFFNUMsTUFBTSxDQUFDLE1BQU0sT0FBTyxHQUFHLENBQUMsS0FBZSxFQUFFLEVBQUU7SUFDdkMsT0FBTyxLQUFLO1NBQ1AsTUFBTSxDQUFDLE9BQU8sRUFBRTtRQUNiLEtBQUssRUFBRSxHQUFHO1FBQ1YsUUFBUSxFQUFFLDRCQUE0QjtRQUN0QyxZQUFZLEVBQUUsSUFBSTtRQUNsQixJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDO1NBQ0QsTUFBTSxDQUFDLFFBQVEsRUFBRTtRQUNkLEtBQUssRUFBRSxHQUFHO1FBQ1YsUUFBUSxFQUFFLHFEQUFxRDtRQUMvRCxJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDO1NBQ0QsTUFBTSxDQUFDLEtBQUssRUFBRTtRQUNYLFFBQVEsRUFBRSxrQ0FBa0M7UUFDNUMsT0FBTyxFQUFFLEdBQUc7UUFDWixJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDO1NBQ0QsTUFBTSxDQUFDLE9BQU8sRUFBRTtRQUNiLFFBQVEsRUFBRSxnRUFBZ0U7UUFDMUUsT0FBTyxFQUFFLENBQUM7UUFDVixJQUFJLEVBQUUsUUFBUTtLQUNqQixDQUFDO1NBQ0QsTUFBTSxDQUFDLFFBQVEsRUFBRTtRQUNkLFFBQVEsRUFBRSxxQkFBcUI7UUFDL0IsT0FBTyxFQUFFLENBQUMsS0FBSyxFQUFFLEtBQUssQ0FBQztRQUN2QixPQUFPLEVBQUUsS0FBSztRQUNkLElBQUksRUFBRSxRQUFRO0tBQ2pCLENBQUM7U0FDRCxNQUFNLENBQUMsV0FBVyxFQUFFO1FBQ2pCLFFBQVEsRUFBRSx1Q0FBdUM7UUFDakQsSUFBSSxFQUFFLFFBQVE7S0FDakIsQ0FBQztTQUNELE1BQU0sQ0FBQyxTQUFTLEVBQUU7UUFDZixRQUFRLEVBQUUsc0NBQXNDO1FBQ2hELElBQUksRUFBRSxRQUFRO0tBQ2pCLENBQUMsQ0FBQTtBQUNWLENBQUMsQ0FBQTtBQUVELE1BQU0sQ0FBQyxLQUFLLFVBQVUsT0FBTyxDQUFDLElBQW1CO0lBQzdDLElBQUksQ0FBQztRQUNELE1BQU0sTUFBTSxHQUFHLG9CQUFvQixDQUFDLEtBQUssQ0FBQyxJQUFJLENBQXlCLENBQUM7UUFDeEUsSUFBSSxDQUFDLFVBQVUsQ0FBQyxNQUFNLENBQUMsS0FBSyxDQUFDLEVBQUUsQ0FBQztZQUM1QixNQUFNLElBQUksS0FBSyxDQUFDLGNBQWMsTUFBTSxDQUFDLEtBQUssaUJBQWlCLENBQUMsQ0FBQztRQUNqRSxDQUFDO1FBQ0QsTUFBTSxDQUFDLElBQUksQ0FBQyx3Q0FBd0MsQ0FBQyxDQUFDO1FBQ3RELE1BQU0sV0FBVyxHQUFHLE1BQU0sYUFBYSxDQUFDLE1BQU0sRUFBRSxNQUFNLENBQUMsQ0FBQztRQUN4RCxNQUFNLENBQUMsSUFBSSxDQUFDLG1DQUFtQyxDQUFDLENBQUM7UUFDakQsTUFBTSxDQUFDLElBQUksQ0FBQyxhQUFhLFdBQVcsQ0FBQyxNQUFNLFNBQVMsQ0FBQyxDQUFDO0lBQzFELENBQUM7SUFBQyxPQUFPLEtBQUssRUFBRSxDQUFDO1FBQ2IsSUFBSSxLQUFLLFlBQVksQ0FBQyxDQUFDLFFBQVEsRUFBRSxDQUFDO1lBQzlCLE1BQU0sQ0FBQyxLQUFLLENBQUMsb0JBQW9CLEVBQUUsS0FBSyxDQUFDLE9BQU8sRUFBRSxDQUFDLENBQUM7UUFDeEQsQ0FBQzthQUFNLENBQUM7WUFDSixNQUFNLE9BQU8sR0FBRyxLQUFLLFlBQVksS0FBSyxDQUFDLENBQUMsQ0FBQyxLQUFLLENBQUMsT0FBTyxDQUFDLENBQUMsQ0FBQyxNQUFNLENBQUMsS0FBSyxDQUFDLENBQUM7WUFDdkUsTUFBTSxDQUFDLEtBQUssQ0FBQyxrQ0FBa0MsRUFBRSxPQUFPLEVBQUUsS0FBSyxDQUFDLENBQUM7UUFDckUsQ0FBQztRQUNELE9BQU8sQ0FBQyxJQUFJLENBQUMsQ0FBQyxDQUFDLENBQUM7SUFDcEIsQ0FBQztBQUNMLENBQUM7QUFFRCxHQUFHLENBQUMsT0FBTyxDQUFDLE9BQU8sRUFBRSxJQUFJLEVBQUUsT0FBTyxFQUFFLE9BQU8sQ0FBQyxDQUFBIn0=
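For reference, a minimal sketch (not part of the diff) of the self-registration pattern this change switches to; the command and option names here are hypothetical:

```typescript
import { cli } from '../cli.js';

export const command = 'mycmd'; // hypothetical command name
export const desc = 'Example command';
export const builder = (yargs: any) =>
    yargs.option('input', { alias: 'i', type: 'string', demandOption: true });
export async function handler(argv: any) {
    // validate argv, then do the work
}

// Importing the module for its side effect registers the command on the shared instance:
cli.command(command, desc, builder, handler);
```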

packages/media/dist-in/commands/svg2jpg.js
@@ -2,7 +2,7 @@ import * as path from 'path';
import { Helper } from '../lib/process/index.js';
import * as bluebird from 'bluebird';
import { logger } from '../index.js';
-const fg = require('fast-glob');
+import { filesEx } from '@polymech/commons/glob';
const defaultOptions = (yargs) => {
    return yargs.option('input', {
        default: './',
@@ -32,7 +32,7 @@ export const register = (cli) => {
            return;
        }
        const src = path.resolve('' + argv.input);
-        const files = fg.sync('*.svg|*.SVG', { dot: true, cwd: src, absolute: true });
+        const files = filesEx(src, '*.svg|*.SVG');
        if (argv.debug) {
            logger.debug(`Begin convert SVG files at ${src} ${files}`);
        }
@@ -42,4 +42,4 @@ export const register = (cli) => {
        }
    });
};
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic3ZnMmpwZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9jb21tYW5kcy9zdmcyanBnLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sS0FBSyxJQUFJLE1BQU0sTUFBTSxDQUFDO0FBQzdCLE9BQU8sRUFBRSxNQUFNLEVBQUUsTUFBTSx5QkFBeUIsQ0FBQztBQUNqRCxPQUFPLEtBQUssUUFBUSxNQUFNLFVBQVUsQ0FBQztBQUNyQyxPQUFPLEVBQUUsTUFBTSxFQUFFLE1BQU0sYUFBYSxDQUFDO0FBQ3JDLE1BQU0sRUFBRSxHQUFHLE9BQU8sQ0FBQyxXQUFXLENBQUMsQ0FBQztBQUVoQyxNQUFNLGNBQWMsR0FBRyxDQUFDLEtBQWUsRUFBRSxFQUFFO0lBQ3ZDLE9BQU8sS0FBSyxDQUFDLE1BQU0sQ0FBQyxPQUFPLEVBQUU7UUFDekIsT0FBTyxFQUFFLElBQUk7UUFDYixRQUFRLEVBQUUsYUFBYTtLQUMxQixDQUFDLENBQUMsTUFBTSxDQUFDLE9BQU8sRUFBRTtRQUNmLE9BQU8sRUFBRSxPQUFPO1FBQ2hCLFFBQVEsRUFBRSwrQkFBK0I7S0FDNUMsQ0FBQyxDQUFBO0FBQ04sQ0FBQyxDQUFDO0FBRUYsSUFBSSxPQUFPLEdBQUcsQ0FBQyxLQUFlLEVBQUUsRUFBRSxDQUFDLGNBQWMsQ0FBQyxLQUFLLENBQUMsQ0FBQztBQUV6RCxLQUFLLFVBQVUsWUFBWSxDQUFDLEtBQUs7SUFDN0IsT0FBTyxRQUFRLENBQUMsU0FBUyxDQUFDLEtBQUssRUFBRSxDQUFDLElBQVksRUFBRSxFQUFFO1FBQzlDLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxLQUFLLENBQUMsSUFBSSxDQUFDLENBQUM7UUFDakMsTUFBTSxPQUFPLEdBQUcsTUFBTSxDQUFDLEdBQUcsQ0FBQyxPQUFPLENBQUMsR0FBRyxFQUFFLFNBQVMsRUFDakQ7WUFDSSxJQUFJLE9BQU8sQ0FBQyxJQUFJLEdBQUc7WUFDbkIsYUFBYTtZQUNiLGdCQUFnQjtZQUNoQixJQUFJLE9BQU8sQ0FBQyxJQUFJLE9BQU87U0FDMUIsQ0FBQyxDQUFDO1FBQ0gsT0FBTyxPQUFPLENBQUM7SUFDbkIsQ0FBQyxDQUFDLENBQUM7QUFDUCxDQUFDO0FBQ0Qsb0RBQW9EO0FBRXBELE1BQU0sQ0FBQyxNQUFNLFFBQVEsR0FBRyxDQUFDLEdBQWEsRUFBRSxFQUFFO0lBQ3RDLE9BQU8sR0FBRyxDQUFDLE9BQU8sQ0FBQyxTQUFTLEVBQUUsRUFBRSxFQUFFLE9BQU8sRUFBRSxLQUFLLEVBQUUsSUFBbUIsRUFBRSxFQUFFO1FBQ3JFLElBQUksSUFBSSxDQUFDLElBQUksRUFBRSxDQUFDO1lBQUMsT0FBTztRQUFDLENBQUM7UUFDMUIsTUFBTSxHQUFHLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxFQUFFLEdBQUcsSUFBSSxDQUFDLEtBQUssQ0FBQyxDQUFDO1FBQzFDLE1BQU0sS0FBSyxHQUFHLEVBQUUsQ0FBQyxJQUFJLENBQUMsYUFBYSxFQUFFLEVBQUUsR0FBRyxFQUFFLElBQUksRUFBRSxHQUFHLEVBQUUsR0FBRyxFQUFFLFFBQVEsRUFBRSxJQUFJLEVBQUUsQ0FBQyxDQUFDO1FBQzlFLElBQUksSUFBSSxDQUFDLEtBQUssRUFBRSxDQUFDO1lBQ2IsTUFBTSxDQUFDLEtBQUssQ0FBQyw4QkFBOEIsR0FBRyxJQUFJLEtBQUssRUFBRSxDQUFDLENBQUM7UUFDL0QsQ0FBQztRQUNELE1BQU0sWUFBWSxDQUFDLEtBQUssQ0FBQyxDQUFDO1FBQzFCLElBQUksSUFBSSxDQUFDLEtBQUssRUFBRSxDQUFDO1lBQ2IsTUFBTSxDQUFDLEtBQUssQ0FBQyxhQUFhLEtBQUssQ0FBQyxNQUFNLFFBQVEsQ0FBQyxDQUFDO1FBQ3BELENBQUM7SUFDTCxDQUFDLENBQUMsQ0FBQztBQUNQLENBQUMsQ0FBQyJ9
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoic3ZnMmpwZy5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uLy4uL3NyYy9jb21tYW5kcy9zdmcyanBnLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUNBLE9BQU8sS0FBSyxJQUFJLE1BQU0sTUFBTSxDQUFDO0FBQzdCLE9BQU8sRUFBRSxNQUFNLEVBQUUsTUFBTSx5QkFBeUIsQ0FBQztBQUNqRCxPQUFPLEtBQUssUUFBUSxNQUFNLFVBQVUsQ0FBQztBQUNyQyxPQUFPLEVBQUUsTUFBTSxFQUFFLE1BQU0sYUFBYSxDQUFDO0FBQ3JDLE9BQU8sRUFBRSxPQUFPLEVBQUUsTUFBTSx3QkFBd0IsQ0FBQztBQUVqRCxNQUFNLGNBQWMsR0FBRyxDQUFDLEtBQWUsRUFBRSxFQUFFO0lBQ3ZDLE9BQU8sS0FBSyxDQUFDLE1BQU0sQ0FBQyxPQUFPLEVBQUU7UUFDekIsT0FBTyxFQUFFLElBQUk7UUFDYixRQUFRLEVBQUUsYUFBYTtLQUMxQixDQUFDLENBQUMsTUFBTSxDQUFDLE9BQU8sRUFBRTtRQUNmLE9BQU8sRUFBRSxPQUFPO1FBQ2hCLFFBQVEsRUFBRSwrQkFBK0I7S0FDNUMsQ0FBQyxDQUFBO0FBQ04sQ0FBQyxDQUFDO0FBRUYsSUFBSSxPQUFPLEdBQUcsQ0FBQyxLQUFlLEVBQUUsRUFBRSxDQUFDLGNBQWMsQ0FBQyxLQUFLLENBQUMsQ0FBQztBQUV6RCxLQUFLLFVBQVUsWUFBWSxDQUFDLEtBQUs7SUFDN0IsT0FBTyxRQUFRLENBQUMsU0FBUyxDQUFDLEtBQUssRUFBRSxDQUFDLElBQVksRUFBRSxFQUFFO1FBQzlDLE1BQU0sT0FBTyxHQUFHLElBQUksQ0FBQyxLQUFLLENBQUMsSUFBSSxDQUFDLENBQUM7UUFDakMsTUFBTSxPQUFPLEdBQUcsTUFBTSxDQUFDLEdBQUcsQ0FBQyxPQUFPLENBQUMsR0FBRyxFQUFFLFNBQVMsRUFDakQ7WUFDSSxJQUFJLE9BQU8sQ0FBQyxJQUFJLEdBQUc7WUFDbkIsYUFBYTtZQUNiLGdCQUFnQjtZQUNoQixJQUFJLE9BQU8sQ0FBQyxJQUFJLE9BQU87U0FDMUIsQ0FBQyxDQUFDO1FBQ0gsT0FBTyxPQUFPLENBQUM7SUFDbkIsQ0FBQyxDQUFDLENBQUM7QUFDUCxDQUFDO0FBQ0Qsb0RBQW9EO0FBRXBELE1BQU0sQ0FBQyxNQUFNLFFBQVEsR0FBRyxDQUFDLEdBQWEsRUFBRSxFQUFFO0lBQ3RDLE9BQU8sR0FBRyxDQUFDLE9BQU8sQ0FBQyxTQUFTLEVBQUUsRUFBRSxFQUFFLE9BQU8sRUFBRSxLQUFLLEVBQUUsSUFBbUIsRUFBRSxFQUFFO1FBQ3JFLElBQUksSUFBSSxDQUFDLElBQUksRUFBRSxDQUFDO1lBQUMsT0FBTztRQUFDLENBQUM7UUFDMUIsTUFBTSxHQUFHLEdBQUcsSUFBSSxDQUFDLE9BQU8sQ0FBQyxFQUFFLEdBQUcsSUFBSSxDQUFDLEtBQUssQ0FBQyxDQUFDO1FBQzFDLE1BQU0sS0FBSyxHQUFHLE9BQU8sQ0FBQyxHQUFHLEVBQUUsYUFBYSxDQUFDLENBQUM7UUFDMUMsSUFBSSxJQUFJLENBQUMsS0FBSyxFQUFFLENBQUM7WUFDYixNQUFNLENBQUMsS0FBSyxDQUFDLDhCQUE4QixHQUFHLElBQUksS0FBSyxFQUFFLENBQUMsQ0FBQztRQUMvRCxDQUFDO1FBQ0QsTUFBTSxZQUFZLENBQUMsS0FBSyxDQUFDLENBQUM7UUFDMUIsSUFBSSxJQUFJLENBQUMsS0FBSyxFQUFFLENBQUM7WUFDYixNQUFNLENBQUMsS0FBSyxDQUFDLGFBQWEsS0FBSyxDQUFDLE1BQU0sUUFBUSxDQUFDLENBQUM7UUFDcEQsQ0FBQztJQUNMLENBQUMsQ0FBQyxDQUFDO0FBQ1AsQ0FBQyxDQUFDIn0=

@@ -5,7 +5,6 @@ import sharp from 'sharp';
import { IResizeOptions } from '../../../index.js';
export declare const GLOB_BASIC = "png|jpg|tiff|jpeg|webp";
export declare const GLOB_MIN = "*.{png,jpg,jpeg,PNG,JPG,JPEG}";
export declare const files: (dir: any, glob: any) => any;
export declare const getFormats: (src: any, folder: any) => {
    src: string;
    dist: string;

packages/media/dist-in/main.d.ts (3 changed lines, vendored)
@@ -1,2 +1,3 @@
#!/usr/bin/env node
-export {};
+import './commands/resize.js';
+import './commands/pdf2jpg.js';
packages/media/dist-in/main.js
@@ -1,18 +1,9 @@
#!/usr/bin/env node
import { defaults } from './_cli.js';
defaults();
-import * as cli from 'yargs';
-import { register as registerPDF2JPG } from './commands/pdf2jpg.js';
-registerPDF2JPG(cli);
-import { register as registerSVG2JPG } from './commands/svg2jpg.js';
-registerSVG2JPG(cli);
-/*
-import { register as registerResize } from './commands/resize.js'; registerResize(cli)
-import { register as registerConvert } from './commands/convert.js'; registerConvert(cli)
-import { register as registerAudio } from './commands/audio.js'; registerAudio(cli)
-import { register as registerVideo } from './commands/video.js'; registerVideo(cli)
-import { register as registerWatermark } from './commands/watermark.js'; registerWatermark(cli)
-*/
+import { cli } from './cli.js';
+import './commands/resize.js';
+import './commands/pdf2jpg.js';
const argv = cli.argv;
if (argv.h || argv.help) {
    cli.showHelp();
@@ -21,4 +12,4 @@ if (argv.h || argv.help) {
else if (argv.v || argv.version) {
    process.exit();
}
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoibWFpbi5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uL3NyYy9tYWluLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiI7QUFDQSxPQUFPLEVBQUUsUUFBUSxFQUFFLE1BQU0sV0FBVyxDQUFDO0FBQUMsUUFBUSxFQUFFLENBQUE7QUFFaEQsT0FBTyxLQUFLLEdBQUcsTUFBTSxPQUFPLENBQUE7QUFLNUIsT0FBTyxFQUFFLFFBQVEsSUFBSSxlQUFlLEVBQUUsTUFBTSx1QkFBdUIsQ0FBQztBQUFDLGVBQWUsQ0FBQyxHQUFHLENBQUMsQ0FBQTtBQUN6RixPQUFPLEVBQUUsUUFBUSxJQUFJLGVBQWUsRUFBRSxNQUFNLHVCQUF1QixDQUFDO0FBQUMsZUFBZSxDQUFDLEdBQUcsQ0FBQyxDQUFBO0FBQ3pGOzs7Ozs7RUFNRTtBQUNGLE1BQU0sSUFBSSxHQUFRLEdBQUcsQ0FBQyxJQUFJLENBQUM7QUFFM0IsSUFBSSxJQUFJLENBQUMsQ0FBQyxJQUFJLElBQUksQ0FBQyxJQUFJLEVBQUUsQ0FBQztJQUN0QixHQUFHLENBQUMsUUFBUSxFQUFFLENBQUM7SUFDZixPQUFPLENBQUMsSUFBSSxFQUFFLENBQUM7QUFDbkIsQ0FBQztLQUFNLElBQUksSUFBSSxDQUFDLENBQUMsSUFBSSxJQUFJLENBQUMsT0FBTyxFQUFFLENBQUM7SUFDaEMsT0FBTyxDQUFDLElBQUksRUFBRSxDQUFDO0FBQ25CLENBQUMifQ==
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoibWFpbi5qcyIsInNvdXJjZVJvb3QiOiIiLCJzb3VyY2VzIjpbIi4uL3NyYy9tYWluLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiI7QUFDQSxPQUFPLEVBQUUsUUFBUSxFQUFFLE1BQU0sV0FBVyxDQUFDO0FBQUMsUUFBUSxFQUFFLENBQUE7QUFFaEQsT0FBTyxFQUFFLEdBQUcsRUFBRSxNQUFNLFVBQVUsQ0FBQTtBQUM5QixPQUFPLHNCQUFzQixDQUFBO0FBQzdCLE9BQU8sdUJBQXVCLENBQUE7QUFFOUIsTUFBTSxJQUFJLEdBQVEsR0FBRyxDQUFDLElBQUksQ0FBQztBQUUzQixJQUFJLElBQUksQ0FBQyxDQUFDLElBQUksSUFBSSxDQUFDLElBQUksRUFBRSxDQUFDO0lBQ3RCLEdBQUcsQ0FBQyxRQUFRLEVBQUUsQ0FBQztJQUNmLE9BQU8sQ0FBQyxJQUFJLEVBQUUsQ0FBQztBQUNuQixDQUFDO0tBQU0sSUFBSSxJQUFJLENBQUMsQ0FBQyxJQUFJLElBQUksQ0FBQyxPQUFPLEVBQUUsQ0FBQztJQUNoQyxPQUFPLENBQUMsSUFBSSxFQUFFLENBQUM7QUFDbkIsQ0FBQyJ9

packages/media/fs_err (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/jpg_images/page_2.jpg (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-jpg-data

packages/media/jpg_images/page_3.jpg (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-jpg-data

packages/media/jpg_images/page_4.jpg (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-jpg-data

packages/media/log_test_1.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/log_test_2.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/log_test_3.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/log_test_4.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/log_test_5.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/output/image_1.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/output/image_2.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/output/image_3.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/output/image_4.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/output/image_5.png (1 changed line, Normal file)
@@ -0,0 +1 @@
mock-png-data

packages/media/package-lock.json (1144 changed lines, generated)
packages/media/package.json
@@ -21,7 +21,9 @@
"@types/fluent-ffmpeg": "^2.1.27",
"@types/node": "^24.0.10",
+"bluebird": "^3.7.2",
+"fast-glob": "^3.3.2",
"fluent-ffmpeg": "^2.1.3",
"glob": "^11.0.0",
"js-beautify": "^1.14.6",
"mupdf": "^1.3.3",
"novita-sdk": "^1.0.37",
@@ -34,7 +36,8 @@
},
"devDependencies": {
"@types/showdown": "^2.0.6",
-"vitest": "^3.1.1"
+"vitest": "^3.1.1",
+"@types/glob": "^8.1.0"
},
"scripts": {
"test": "tsc; mocha --full-trace mocha \"spec/**/*.spec.js\"",

packages/media/ref/pdf-to-images/.gitignore (1 changed line, vendored)
@@ -1 +0,0 @@
./tests/out

packages/media/ref/pdf-to-images/.vscode/launch.json
@@ -1,21 +0,0 @@
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "type": "node",
            "request": "launch",
            "name": "Launch Program",
            "skipFiles": [
                "<node_internals>/**"
            ],
            "program": "${workspaceFolder}\\dist\\index.js",
            "preLaunchTask": "tsc: build - tsconfig.json",
            "outFiles": [
                "${workspaceFolder}/dist/**/*.js"
            ]
        }
    ]
}

packages/media/ref/pdf-to-images/README.md
@@ -1,155 +0,0 @@
# @polymech/pdf

## Installation

1. **Clone the repository (optional):**

   ```bash
   git clone <repository-url> # Replace with your repository URL
   cd <directory-name>        # e.g., cd pdf
   ```

2. **Install dependencies:**

   ```bash
   npm install
   ```

3. **Build the project:**

   ```bash
   npm run build
   ```

## CLI Usage

When running from within the cloned project directory after building:

```bash
npm start -- convert [options]
# or directly execute the built script
node dist/index.js convert [options]
```

*(Note: If you publish this package and install it elsewhere, you might execute it differently, potentially using `npx @polymech/pdf convert ...` if a `bin` entry is added to `package.json`.)*

Available command: `convert` - Convert PDF to images

### Options for the `convert` command

* `-i, --input <string>`: Input PDF file (required)
* `-o, --output <string>`: Output directory prefix for images (required)
* `--dpi <number>`: DPI for output images (default: 300)
* `--format <string>`: Output image format (choices: 'png', 'jpg'; default: 'png')
* `-s, --startPage <number>`: First page to convert (1-based)
* `-e, --endPage <number>`: Last page to convert (1-based, inclusive)

Example:

```bash
node dist/index.js convert -i mydocument.pdf -o output/image
```

This generates images like `output/image_1.png`, `output/image_2.png`, etc.

Another example (using JPG format and 150 DPI):

```bash
node dist/index.js convert -i report.pdf -o images/report_page --format jpg --dpi 150
```

This generates `images/report_page_1.jpg`, `images/report_page_2.jpg`, etc.

Example specifying a page range (pages 3 to 5):

```bash
node dist/index.js convert -i long_doc.pdf -o pages/doc_pg --startPage 3 --endPage 5
```

This generates `pages/doc_pg_3.png`, `pages/doc_pg_4.png`, `pages/doc_pg_5.png`.

## API Usage

```typescript
import { convertPdfToImages, ImageFormat, PdfToImageOptions } from './dist/lib/pdf'; // Adjust path based on your project structure
import { readFile } from 'node:fs/promises';

async function example() {
    try {
        const pdfBuffer = await readFile('mydocument.pdf');

        const options: PdfToImageOptions = {
            outputPathPrefix: 'output/image',
            dpi: 300,
            format: 'png'
        };

        const outputFilePaths = await convertPdfToImages(pdfBuffer, options);
        console.log('Generated images:', outputFilePaths);
    } catch (error) {
        console.error('Error:', error);
    }
}

example();
```

Example using JPG format:

```typescript
import { convertPdfToImages, PdfToImageOptions } from './dist/lib/pdf'; // Adjust path
import { readFile } from 'node:fs/promises';
import { Logger } from 'tslog'; // Assuming you want logging

async function exampleJpg() {
    const logger = new Logger();
    try {
        const pdfBuffer = await readFile('report.pdf');
        const options: PdfToImageOptions = {
            outputPathPrefix: 'images/report_page',
            dpi: 150,
            format: 'jpg',
        };
        const outputFilePaths = await convertPdfToImages(pdfBuffer, options);
        logger.info('Generated JPG images:', outputFilePaths);
    } catch (error) {
        logger.error('Error generating JPGs:', error);
    }
}

exampleJpg();
```

Example with a specific page range:

```typescript
import { convertPdfToImages, PdfToImageOptions } from './dist/lib/pdf'; // Adjust path
import { readFile } from 'node:fs/promises';

async function examplePageRange() {
    try {
        const pdfBuffer = await readFile('long_doc.pdf');
        const options: PdfToImageOptions = {
            outputPathPrefix: 'pages/doc_pg',
            dpi: 200,
            format: 'png',
            startPage: 3,
            endPage: 5
        };
        const outputFilePaths = await convertPdfToImages(pdfBuffer, options);
        console.log('Generated specific pages:', outputFilePaths);
    } catch (error) {
        console.error('Error generating page range:', error);
    }
}

examplePageRange();
```

### Exports

* `convertPdfToImages(pdfData: Buffer, options: PdfToImageOptions): Promise<string[]>`: Converts a PDF buffer to images.
* `ImageFormat`: Type alias for `'png' | 'jpg'`.
* `PdfToImageOptions`: Interface for conversion options (`outputPathPrefix`, `dpi`, `format`, optional `startPage`, optional `endPage`, optional `logger`).
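
A minimal sketch of the shapes this export list implies (assumed field types; the actual definitions live in `dist/lib/pdf`):

```typescript
type ImageFormat = 'png' | 'jpg';

interface PdfToImageOptions {
    outputPathPrefix: string; // e.g. 'output/image' -> output/image_1.png, output/image_2.png, ...
    dpi: number;              // e.g. 300
    format: ImageFormat;
    startPage?: number;       // 1-based, inclusive
    endPage?: number;         // 1-based, inclusive
    logger?: unknown;         // optional logger instance (e.g. tslog's Logger)
}

declare function convertPdfToImages(pdfData: Buffer, options: PdfToImageOptions): Promise<string[]>;
```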

### References

- https://github.com/opendatalab/PDF-Extract-Kit/tree/main/project/pdf2markdown (using YOLO)

packages/media/ref/pdf-to-images/dist/commands/convert.js
@@ -1,30 +0,0 @@
import { Logger } from 'tslog';
import { ConvertCommandSchema } from '../types.js';
import { existsSync } from 'node:fs';
import * as z from 'zod';
import { runConversion } from '../lib/convert.js';
export const command = 'convert';
export const desc = 'Convert PDF to images';
export async function handler(argv) {
    const logger = new Logger();
    try {
        const config = ConvertCommandSchema.parse(argv);
        if (!existsSync(config.input)) {
            throw new Error(`Input file ${config.input} does not exist`);
        }
        logger.info("Calling conversion library function...");
        const outputFiles = await runConversion(config, logger);
        logger.info(`Conversion completed successfully`);
        logger.info(`Generated ${outputFiles.length} images`);
    }
    catch (error) {
        if (error instanceof z.ZodError) {
            logger.error('Invalid arguments:', error.flatten());
        }
        else {
            const message = error instanceof Error ? error.message : String(error);
            logger.error('Error during conversion command:', message, error);
        }
        process.exit(1);
    }
}

packages/media/ref/pdf-to-images/dist/constants.js
@@ -1,5 +0,0 @@
/**
 * Default output path template when no output is specified.
 * Variables: ${SRC_DIR}, ${SRC_NAME}, ${PAGE}, ${FORMAT}
 */
export const DEFAULT_OUTPUT_TEMPLATE = "${SRC_DIR}/${SRC_NAME}_${PAGE}.${FORMAT}";
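// Illustrative (not in the original source): for input /docs/report.pdf, page 3 and
// format "png", this template resolves to "/docs/report_3.png".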

packages/media/ref/pdf-to-images/dist/index.js (16 changed lines, vendored)
@@ -1,16 +0,0 @@
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
import * as convertCommand from './commands/convert.js';
import { ConvertCommandArgsSchema } from './types.js';
import { toYargs } from '@polymech/commons';
const commandModule = {
    command: convertCommand.command,
    describe: convertCommand.desc,
    builder: (yargs) => toYargs(yargs, ConvertCommandArgsSchema),
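    // NOTE (assumption, not in the original source): toYargs from @polymech/commons
    // presumably derives yargs option definitions from the zod schema's fields and
    // their .describe() texts.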
handler: convertCommand.handler
|
||||
};
|
||||
yargs(hideBin(process.argv))
|
||||
.command(commandModule)
|
||||
.demandCommand(1, 'You need to specify a command')
|
||||
.help()
|
||||
.parse();
|
||||

packages/media/ref/pdf-to-images/dist/lib/convert.js (118 changed lines, vendored)
@@ -1,118 +0,0 @@
import { statSync } from "node:fs";
import { sep, resolve as pathResolve, parse as pathParse, relative as pathRelative } from "node:path";
import { readFile } from "node:fs/promises";
import { DEFAULT_ROOTS, DEFAULT_VARS, pathInfoEx } from "@polymech/commons";
import { convertPdfToImages } from "./pdf.js";
import { DEFAULT_OUTPUT_TEMPLATE } from "../constants.js";
/**
 * Runs the PDF to images conversion process.
 * Generates variables, determines output path, reads PDF, and calls the conversion engine.
 * @param config - The conversion configuration options (inferred from Zod schema).
 * @param logger - The logger instance to use for logging.
 * @returns A promise that resolves with an array of generated image file paths.
 */
export async function runConversion(config, logger) {
    const inputPath = pathResolve(config.input);
    let srcInfo = {};
    try {
        srcInfo = pathInfoEx(inputPath);
        const parsed = pathParse(inputPath);
        srcInfo = {
            ...srcInfo,
            SRC_DIR: parsed.dir,
            SRC_NAME: parsed.name,
            SRC_EXT: parsed.ext,
        };
    }
    catch (e) {
        logger.warn("pathInfoEx not found or failed, using basic path.parse");
    }
    let baseVariables = {
        ...DEFAULT_ROOTS,
        ...DEFAULT_VARS({}),
        ...srcInfo,
        DPI: config.dpi,
        FORMAT: config.format,
    };
    if (baseVariables.ROOT && baseVariables.SRC_DIR) {
        baseVariables.SRC_REL = pathRelative(baseVariables.ROOT, baseVariables.SRC_DIR);
    }
    const srcName = baseVariables.SRC_NAME || '';
    const dashed = srcName.split('-');
    if (dashed.length > 1) {
        for (let i = 0; i < dashed.length; i++) {
            baseVariables[`SRC_NAME-${i}`] = dashed[i];
        }
    }
    const dotted = srcName.split('.');
    if (dotted.length > 1) {
        for (let i = 0; i < dotted.length; i++) {
            baseVariables[`SRC_NAME.${i}`] = dotted[i];
        }
    }
    const underscored = srcName.split('_');
    if (underscored.length > 1) {
        for (let i = 0; i < underscored.length; i++) {
            baseVariables[`SRC_NAME_${i}`] = underscored[i];
        }
    }
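    // Illustrative example (not in the original source): for srcName "e5dc_v2-final",
    // the loops above add SRC_NAME-0="e5dc_v2", SRC_NAME-1="final",
    // SRC_NAME_0="e5dc" and SRC_NAME_1="v2-final" as substitution variables.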
    // Process var-* arguments directly from config object passed in
    const cliVars = Object.keys(config).filter(k => k.startsWith('var-')).reduce((acc, k) => {
        acc[k.replace('var-', '').toUpperCase()] = config[k];
        return acc;
    }, {});
    // Uppercase base variable keys
    baseVariables = Object.keys(baseVariables).reduce((acc, key) => {
        acc[key.toUpperCase()] = baseVariables[key];
        return acc;
    }, {});
    baseVariables = { ...baseVariables, ...cliVars };
    let outputPathTemplate;
    let isExplicitDir = false;
    if (config.output) {
        const outputPath = pathResolve(config.output);
        try {
            const stats = statSync(outputPath);
            if (stats.isDirectory()) {
                isExplicitDir = true;
            }
        }
        catch (e) {
            if (config.output.endsWith(sep) || config.output.endsWith("/")) {
                isExplicitDir = true;
            }
            else {
                isExplicitDir = false;
            }
        }
        if (isExplicitDir) {
            baseVariables["OUT_DIR"] = outputPath;
            outputPathTemplate = "${OUT_DIR}/${SRC_NAME}_${PAGE}.${FORMAT}";
            logger.info(`Output directory specified: ${outputPath}`);
        }
        else {
            outputPathTemplate = config.output;
            logger.info(`Using output path pattern: ${outputPathTemplate}`);
        }
    }
    else {
        // Use default pattern directly from constant
        outputPathTemplate = DEFAULT_OUTPUT_TEMPLATE;
        logger.info(`Using default output path pattern: ${outputPathTemplate}`);
    }
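    // Illustrative (not in the original source): "--output out/" (an existing directory,
    // or any path with a trailing slash) selects "${OUT_DIR}/${SRC_NAME}_${PAGE}.${FORMAT}",
    // while "--output out/p_${PAGE}.png" is used verbatim as the template.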
    // --- Read PDF and Call Conversion (moved from commands/convert.ts) ---
    logger.info(`Reading PDF: ${config.input}`);
    const pdfData = await readFile(config.input);
    logger.info(`Starting conversion process...`);
    const outputFiles = await convertPdfToImages(pdfData, {
        baseVariables,
        outputPathTemplate,
        dpi: config.dpi,
        format: config.format,
        scale: config.scale,
        startPage: config.startPage,
        endPage: config.endPage,
        logger
    });
    return outputFiles;
}

packages/media/ref/pdf-to-images/dist/lib/pdf.js (77 changed lines, vendored)
@@ -1,77 +0,0 @@
import * as mupdf from 'mupdf';
import { Logger } from 'tslog';
import { dirname } from 'node:path';
import { resolveVariables } from '@polymech/commons';
import { sync as mkdir } from '@polymech/fs/dir';
import { writeFileSync } from 'node:fs';
import { Buffer } from 'node:buffer';
// Helper function to convert object-like image data to Buffer
function imageDataObjectToBuffer(imageDataObject) {
    const keys = Object.keys(imageDataObject).map(Number).sort((a, b) => a - b);
    const bufferLength = keys.length > 0 ? keys[keys.length - 1] + 1 : 0; // Determine length based on max index + 1
    const buffer = Buffer.allocUnsafe(bufferLength); // Use allocUnsafe for performance if overwriting all bytes
    for (const key in imageDataObject) {
        if (Object.prototype.hasOwnProperty.call(imageDataObject, key)) {
            const index = parseInt(key, 10);
            if (!isNaN(index) && index >= 0 && index < bufferLength) {
                buffer[index] = imageDataObject[key];
            }
        }
    }
    return buffer;
}
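// Illustrative (not in the original source): { 0: 137, 1: 80, 2: 78, 3: 71 }
// becomes a 4-byte Buffer <89 50 4e 47>, the PNG magic-number prefix.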
export async function convertPdfToImages(pdfData, options) {
    const logger = options.logger || new Logger();
    const outputFiles = [];
    try {
        const doc = mupdf.Document.openDocument(pdfData, 'pdf');
        const pageCount = doc.countPages();
        // Validate and determine page range (adjusting for 0-based index)
        const start = (options.startPage ?? 1) - 1;
        const end = (options.endPage ?? pageCount) - 1;
        if (start < 0 || start >= pageCount) {
            throw new Error(`startPage (${options.startPage}) is out of valid range (1-${pageCount})`);
        }
        if (end < 0 || end >= pageCount) {
            throw new Error(`endPage (${options.endPage}) is out of valid range (1-${pageCount})`);
        }
        if (start > end) {
            // This should also be caught by Zod schema, but good to double-check
            throw new Error(`startPage (${options.startPage}) cannot be greater than endPage (${options.endPage})`);
        }
        const numPagesToProcess = end - start + 1;
        logger.info(`Processing pages ${start + 1} to ${end + 1} (${numPagesToProcess} pages) of ${pageCount} total`);
        // Determine the scaling matrix
        const scaleValue = options.scale ?? 2;
        const matrix = scaleValue === 1 ? mupdf.Matrix.identity : mupdf.Matrix.scale(scaleValue, scaleValue);
        logger.info(`Using scale factor: ${scaleValue}`);
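        // Illustrative (not in the original source): PDF pages are laid out at 72 points
        // per inch, so scale 2 renders a 612x792 pt US Letter page as a 1224x1584 px
        // pixmap, equivalent to roughly 144 DPI.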
        for (let i = start; i <= end; i++) {
            const pageNumber = i + 1; // User-facing page number (1-based)
            // Create page-specific variables
            const pageVariables = {
                ...options.baseVariables,
                PAGE: pageNumber.toString()
            };
            // Resolve the output path using the template and page-specific variables
            const outputPath = await resolveVariables(options.outputPathTemplate, false, pageVariables);
            const page = doc.loadPage(i);
            // Use the scaling matrix here
            const pixmap = page.toPixmap(matrix, mupdf.ColorSpace.DeviceRGB, false);
            // Note: DPI is implicitly handled by the scaling factor now.
            // The pixmap dimensions will be scaled * scaleFactor.
            // We might want to remove the explicit DPI option later if it's confusing.
            const imageData = options.format === 'png'
                ? pixmap.asPNG()
                : pixmap.asJPEG(60, false);
            mkdir(dirname(outputPath));
            writeFileSync(outputPath, imageDataObjectToBuffer(imageData));
            outputFiles.push(outputPath);
            logger.info(`Converted page ${pageNumber} to ${outputPath}`);
        }
        return outputFiles;
    }
    catch (error) {
        logger.error('Error converting PDF to images:', error);
        throw error;
    }
}

packages/media/ref/pdf-to-images/dist/types.js (40 changed lines, vendored)
@@ -1,40 +0,0 @@
import { z } from 'zod';
// Define the base shape for arguments
export const ConvertCommandArgsSchema = z.object({
    input: z.string().describe('Path to the input PDF file'),
    output: z.string().describe('Output path template (e.g., output/page_{PAGE}.png)').optional(),
    dpi: z.number().int().positive().default(300).describe('Resolution for the output images'),
    scale: z.number().positive().default(2).describe('Scaling factor to apply before rendering (e.g., 2 for 2x size)').optional(),
    format: z.enum(['png', 'jpg']).default('png').describe('Output image format'),
    startPage: z.number().int().positive().describe('First page to convert (1-based index)').optional(),
    endPage: z.number().int().positive().describe('Last page to convert (1-based index)').optional()
});
// Add refinements, transformations, and catchall for final validation/parsing
export const ConvertCommandSchema = ConvertCommandArgsSchema
    .catchall(z.any()) // Allow var-* and other properties
    .transform((data) => {
        // Explicitly pick known fields + extras (var-*)
        const known = {
            input: data.input,
            output: data.output,
            dpi: data.dpi,
            format: data.format,
            startPage: data.startPage,
            endPage: data.endPage,
            scale: data.scale,
        };
        // Keep only extra properties (like var-*)
        const extras = Object.keys(data)
            .filter(key => !['input', 'output', 'dpi', 'format', 'startPage', 'endPage', 'scale', '_', '$0'].includes(key))
            .reduce((acc, key) => { acc[key] = data[key]; return acc; }, {});
        return { ...known, ...extras };
    })
    .refine((data) => {
        if (data.startPage !== undefined && data.endPage !== undefined) {
            return data.startPage <= data.endPage;
        }
        return true;
    }, {
        message: "startPage must be less than or equal to endPage",
        path: ["startPage"],
    });
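// Illustrative usage (not in the original source): yargs argv objects carry extra
// keys such as `_` and `$0`, which the transform above strips while keeping any
// `var-*` overrides and applying the schema defaults:
//   ConvertCommandSchema.parse({ input: 'a.pdf', 'var-client': 'acme', _: [], $0: 'cli' })
//   // -> { input: 'a.pdf', dpi: 300, format: 'png', 'var-client': 'acme', ... }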

packages/media/ref/pdf-to-images/package-lock.json (1823 changed lines, generated)
packages/media/ref/pdf-to-images/package.json
@@ -1,43 +0,0 @@
{
  "name": "@polymech/pdf",
  "version": "1.0.0",
  "description": "",
  "main": "dist/index.js",
  "bin": {
    "pdf-to-images": "dist/index.js"
  },
  "scripts": {
    "dev": "tsc -p . --watch",
    "build": "tsc",
    "start": "node dist/index.js",
    "test:pdf": "node dist/index.js convert --input tests/e5dc.pdf --output tests/out/e5dc/ --startPage 3 --endPage 5",
    "test:basic": "vitest run",
    "test:variables": "vitest run tests/cli/variables.test.ts"
  },
  "keywords": [
    "pdf",
    "images",
    "convert",
    "pdf-to-images"
  ],
  "author": "Polymech",
  "license": "ISC",
  "type": "module",
  "dependencies": {
    "@polymech/commons": "file:../../../commons",
    "@polymech/fs": "file:../../../fs",
    "@types/yargs": "^17.0.33",
    "init": "^0.1.2",
    "mupdf": "^1.3.3",
    "p-map": "^7.0.3",
    "tslog": "^4.9.3",
    "typescript": "^5.8.2",
    "vitest": "^3.1.1",
    "xlsx": "^0.18.5",
    "yargs": "^17.7.2",
    "zod": "^3.24.3"
  },
  "devDependencies": {
    "@types/node": "^22.13.10"
  }
}

@@ -1,97 +0,0 @@
# PDF to Markdown Integration

This directory contains the necessary setup and guidance for integrating the `pdf2markdown` tool from the [opendatalab/PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit/tree/main/project/pdf2markdown) repository.

## Setup Instructions

1. **Clone the Repository:** Clone the `PDF-Extract-Kit` repository into a suitable location (e.g., a `vendor` directory or similar within this project, or manage it as a git submodule).

   ```bash
   # Example: Cloning into a vendor directory
   git clone https://github.com/opendatalab/PDF-Extract-Kit.git ../../vendor/PDF-Extract-Kit
   # Or using a submodule
   # git submodule add https://github.com/opendatalab/PDF-Extract-Kit.git vendor/PDF-Extract-Kit
   ```

2. **Install Python Dependencies:** The `pdf2markdown` tool relies on several Python libraries, and you need a suitable Python installation (check the repository for version requirements, likely Python 3.x). Navigate to the cloned repository directory, set up a virtual environment, and install the required packages. The repository does not appear to ship a top-level `requirements.txt`, so you may need to piece the requirements together from the components used (YOLOv8, UniMERNet, StructEqTable, PaddleOCR) or look for setup instructions in the `PDF-Extract-Kit` documentation.

   ```bash
   # Navigate to the cloned repo (adjust path as needed)
   cd ../../vendor/PDF-Extract-Kit

   # Create a virtual environment (recommended)
   python -m venv venv
   source venv/bin/activate # On Windows use `venv\Scripts\activate`

   # Install common dependencies (this is a guess; refer to PDF-Extract-Kit docs for specifics)
   # You'll likely need libraries for YOLO, OCR (PaddleOCR), etc.
   # pip install -r requirements.txt # Look for requirements files in subdirectories if they exist

   # Example: Install PaddleOCR (check their docs for CPU/GPU versions)
   # pip install paddlepaddle paddleocr

   # You will need to research and install the specific dependencies for YOLOv8,
   # UniMERNet, and StructEqTable as used by this project.
   ```

3. **Configuration:** The tool uses a YAML configuration file (`project/pdf2markdown/configs/pdf2markdown.yaml`). You might need to adjust paths or settings within this file, especially if models need to be downloaded or paths to resources are specific to your environment.

## Usage from TypeScript CLI

You can execute the Python script from your TypeScript code using Node.js's `child_process` module.

```typescript
import { exec } from 'child_process';
import path from 'path';

async function convertPdfToMarkdown(pdfFilePath: string, outputMarkdownPath: string): Promise<void> {
    // Adjust these paths based on where you cloned the repo and the location of this script
    const repoRoot = path.resolve(__dirname, '../../vendor/PDF-Extract-Kit'); // Example path
    const scriptPath = path.join(repoRoot, 'project/pdf2markdown/scripts/run_project.py');
    const configPath = path.join(repoRoot, 'project/pdf2markdown/configs/pdf2markdown.yaml');
    const pythonExecutable = path.join(repoRoot, 'venv/bin/python'); // Or venv\Scripts\python.exe on Windows, or just 'python' if in PATH

    // Construct the command
    // IMPORTANT: You'll need to modify the run_project.py script or its config
    // to accept input PDF path and output MD path as arguments, or handle
    // input/output in a way that suits your CLI (e.g., reading config, environment variables).
    // The current script seems to rely solely on the config file.
    // For now, let's assume you modify the config file or the script handles it.
    // You might need to dynamically update the config file before running.

    // Placeholder command - needs refinement based on how run_project.py handles I/O
    const command = `${pythonExecutable} ${scriptPath} --config ${configPath} --input ${pdfFilePath} --output ${outputMarkdownPath}`; // Hypothetical arguments

    console.log(`Executing: ${command}`);

    return new Promise((resolve, reject) => {
        exec(command, (error, stdout, stderr) => {
            if (error) {
                console.error(`Error executing pdf2markdown: ${error.message}`);
                console.error(`Stderr: ${stderr}`);
                reject(error);
                return;
            }
            console.log(`Stdout: ${stdout}`);
            console.warn(`Stderr: ${stderr}`); // Log stderr even on success, as it might contain warnings
            resolve();
        });
    });
}

// Example usage in your CLI command:
// const inputPdf = 'path/to/your/input.pdf';
// const outputMd = 'path/to/your/output.md';
// convertPdfToMarkdown(inputPdf, outputMd)
//     .then(() => console.log('PDF converted to Markdown successfully.'))
//     .catch(err => console.error('Conversion failed:', err));
```

## Important Considerations

* **Dependency Management:** Managing Python dependencies within a TypeScript project can be complex. Consider using Docker to encapsulate the Python environment or ensuring clear setup steps for developers.
* **Script Modification:** The provided `run_project.py` script seems tailored to use its YAML config file directly. You will likely need to modify this Python script (or the way it's called) to accept input PDF file paths and desired output Markdown file paths as command-line arguments for seamless integration into your CLI.
* **Error Handling:** Robust error handling is crucial. The Python script might fail for various reasons (invalid PDF, missing dependencies, model errors). Ensure your TypeScript wrapper handles errors from the child process gracefully.
* **Performance:** Executing a Python process involves overhead. For high-throughput scenarios, explore potential optimizations or alternative libraries.
* **Model Downloads:** The underlying models (YOLO, etc.) might require downloading large files during the first run or setup. Account for this in your setup instructions and potentially during the first execution from your CLI.

@@ -1,353 +0,0 @@
import * as path from 'path'
import pMap from 'p-map'
import pkg from 'which';
const { sync: which } = pkg;
import { resolve, OSR_CACHE } from '@polymech/commons'
import { dirname, equalFiles, swProcMessage } from './sw-util.js'
import { reportCSV } from '../report/csv.js'
import { logger, substitute } from '../index.js'
import { removeEmpty } from '../lib/index.js'
import { SolidworkOptions } from '../types.js'
import { Helper } from '../lib/process/index.js'

import { sync as exists } from "@polymech/fs/exists"
import { sync as read } from "@polymech/fs/read"
import { sync as write } from "@polymech/fs/write"
import { sync as dir } from "@polymech/fs/dir"

import { sync as rm } from "@polymech/fs/remove"
import { deepClone as clone } from "@polymech/core/objects"

import { swRayTraceRenderQuality_e, IAssembly, IAssemblyData } from './sw-types.js'
import { get_cached, get_path_cached, get_cache_key, set_cached } from '@polymech/cache'

import {
    MODULE_NAME,
    MSG_FAILED_TO_LOAD
} from '../constants.js'

import { closeAppByName, fileAsBuffer, getSWBin, removeEmptyValues } from './sw-util.js'

export const convertFile = async (
    src,
    target,
    view: string,
    onNode: (data) => void = () => { },
    options: SolidworkOptions,
    configuration: string) => {
    configuration = options.configuration || configuration
    options.close && closeAppByName('SLDWORKS')
    const osr_cache = OSR_CACHE()
    let cache_key_obj: any = {
        sw: options.sw,
        src,
        target,
        configuration
    }
    if (target.endsWith('.jpg')) {
        cache_key_obj = {
            ...cache_key_obj,
            quality: options.quality,
            width: options.width,
            height: options.height,
            renderer: options.renderer
        }
    }
    if (target.endsWith('.xlsx')) {
        cache_key_obj = {
            ...cache_key_obj,
            "bom-config": options['bom-config'],
            "bom-detail": options['bom-detail'],
            "bom-template": options['bom-template'],
            "bom-type": options['bom-type'],
            "bom-images": options['bom-images'],
        }
    }
    const ca_options = JSON.parse(JSON.stringify(removeEmpty(cache_key_obj)))
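    // Illustrative note (not in the original source): removeEmpty presumably drops
    // null/undefined option values so that semantically identical invocations
    // produce the same serialized cache key.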
    let cached = await get_cached(src, ca_options, MODULE_NAME)
    const cachedPath = await get_path_cached(src, ca_options, MODULE_NAME)
    if (!exists(target)) {
        cached = null;
    }
    if (osr_cache && cached && cachedPath && options.cache == true) {
        if (!exists(target) || !equalFiles(target, cachedPath)) {
            write(target, Buffer.from(cached))
        }
        logger.debug(`[${MODULE_NAME}] Skipping conversion of ${src} to ${target}`)
        await onNode({ src, target, options })
        return Promise.resolve()
    }

    const parts = path.parse(target)
    const source_parts = path.parse(src)
    let exe = '' + options.script
    let cwd = getSWBin(options.sw)
    let _target = '' + target
    let onPost = null

    // SW Photoview won't render correctly in hidden mode
    if (parts.ext === '.jpg' && source_parts.ext.toLowerCase() === '.sldasm' && options.renderer.toLowerCase() === ' ') {
        logger.debug(`[${MODULE_NAME}] Converting ${src} to ${target} : - Photoview: - ` + options.hidden)
        options.hidden = "false"
    }
    let args = [
        `--source="${src}"`,
        `--target="${target}"`,
        `--configuration="${configuration}"`,
        `--view="*${view}"`,
        `--hidden=` + (options.hidden || "true"),
        `--width=` + options.width,
        `--height=` + options.height,
        `--swv=` + options.swv,
        `--renderer=` + (options.renderer.toLowerCase() || "solidworks"),
        `--quality=${options.quality || swRayTraceRenderQuality_e.swRenderQuality_Good}`
    ]

    if (options.save) args.push(`--save`)
    if (options.pack) args.push(`--pack`)
    if (options.rebuild) args.push(`--rebuild`)
    if (options.light) args.push(`--light`)
    if (options.write) args.push(`--write`)

    if (parts.ext === '.json' && source_parts.ext.toLowerCase() === '.sldasm') {
        exe = 'model-reader.exe'
        args = [
            `--source="${path.resolve(src)}"`,
            `--target="${_target}"`
        ]
        onPost = () => {
            try {
                let props = read(_target, 'json') as any[];
                if (!props) {
                    logger.error('Error reading model file ', src)
                    return false
                }
                props = props.map(removeEmpty)
                write(_target, props)
                return true
            } catch (e) {
                logger.error(`Error executing model-reader::onPost for ${src} to ${_target}`)
                write(_target, {})
                return false
            }
        }
    }
    if (parts.base.endsWith('-configs.json') && source_parts.ext.toLowerCase() === '.sldasm') {
        exe = 'getconfigs.exe'
        args = [
            `--source="${path.resolve(src)}"`,
            `--target="${path.resolve(_target)}"`
        ]
        onPost = () => {
            try {
                let props = read(_target, 'json') as any[];
                if (!props) {
                    logger.error('Error reading configurations file ', src)
                    return false
                }
                return true
            } catch (e) {
                logger.error(`Error executing get::onPost for ${src} to ${_target}`)
                write(_target, {})
                return false
            }
        }
    }
    if (parts.ext === '.html') {
        exe = 'ExportHTML.exe'
        if (!configuration || configuration === 'Default') {
            args = [
                `"${src}"`,
                `"${target}"`,
            ]
        } else if (configuration) {
            // The eDrawings control doesn't support configurations directly; we need a configuration-specific eDrawings file exported instead
            const eDrawingsFile = src.toLowerCase().replace('.sldasm', `-${configuration}.EASM`)
            if (!exists(eDrawingsFile)) {
                logger.error(`Configuration-specific eDrawings file ${eDrawingsFile} doesn't exist`)
                return Promise.resolve()
            }
            args = [
                `"${eDrawingsFile}"`,
                `"${target}"`,
                `${configuration}`
            ]
        }
    }
    if (parts.ext === '.xlsx') {
        exe = 'bom.exe';
        args = [
            `"${src}"`,
            `"${target}"`,
            `--configuration ${options['bom-config']}`,
            `--type ${options['bom-type']}`,
            `--detail ${options['bom-detail']}`
        ]

        options['bom-images'] && args.push('--images')
        options['bom-template'] && args.push(`--template ${options['bom-template']}`)

        if (!options.cache && exists(target)) {
            rm(target);
        }
    }
    if (source_parts.ext === '.drawio') {
        exe = 'draw.io.exe';
        try {
            cwd = path.parse(which(exe)).dir;
        } catch (e) {
            logger.error(`Can't find ${exe}`);
            return Promise.resolve();
        }
        args = [
            `"${src}"`,
            '-x',
            `-f ${parts.ext.replace('.', '')}`,
            `${options.args}`
        ]
    }
    const bin = path.resolve(`${cwd}/${exe}`)
    if (!exists(bin)) {
        logger.error(`${bin} doesn't exist in ${cwd}`)
        logger.error('__dirname:' + dirname())
        logger.error('options.sw ' + options.sw)
        return
    }
    const ret = await Helper.run(cwd, exe, args, options.debug)
    ret.messages = [...new Set(ret.messages)]
    const failed = !!ret.messages.find((m: string) => m.includes(MSG_FAILED_TO_LOAD))
    ret.messages = ret.messages.map((m: string) => swProcMessage(m)).filter(x => x != null).map(x => x.message)
    const info = {
        ...ret,
        src,
        target,
        failed: failed,
        options
    }

    await onNode(info)
    onPost && onPost()
    if (info.failed) {
        rm(_target)
        return ret
    }
    osr_cache && options.cache == true && await set_cached(src, ca_options, MODULE_NAME, fileAsBuffer(_target))
    options.close && closeAppByName('SLDWORKS')
    return ret
}
export async function convertFiles(file, targets: string[], view, onNode: (data: any) => void = () => { }, options: SolidworkOptions) {
    if (options.dry) {
        logger.info(`Dry run convert ${file} to `, targets.map((t) => `\n\t${t}`).join(',\n'))
        return Promise.resolve()
    }
    return pMap(targets, (target: any) => {
        return convertFile(file, target.target, view, onNode, options, target.configuration);
    }, { concurrency: 1 })
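    // Note (assumption, not in the original source): concurrency is capped at 1
    // because the SolidWorks automation executables are not safe to run in parallel.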
}
export const report = (data, dst: string) => {

    let report: any = null;
    if (dst.endsWith('.md')) {
        //report = reportMarkdown(data)
    }

    if (dst.endsWith('.csv')) {
        report = reportCSV(data)
    }

    logger.info(`Write report to ${dst}`)
    report = write(dst, data)

    return report;
}
export const targets = (file: string, options: SolidworkOptions) => {
    const srcParts = path.parse(file)
    const variables = clone(options.variables)
    const targets = []

    let configurations: any = { "Default": null }
    if (options.configuration && options.configuration !== 'Default') {
        configurations[options.configuration] = null
        delete configurations["Default"]
    }
    for (const conf in configurations) {
        if (options.dstInfo.IS_GLOB) {
            options.dstInfo.GLOB_EXTENSIONS.forEach((e) => {
                variables.SRC_NAME = srcParts.name
                variables.SRC_DIR = srcParts.dir
                variables.CONFIGURATION = conf
                let targetPath = substitute(options.variables.DST_PATH, options.alt, variables)
                targetPath = path.resolve(targetPath.replace(options.variables.DST_FILE_EXT, '') + e)
                const parts = path.parse(targetPath)
                if (srcParts.ext === parts.ext) {
                    return
                }
                if (!exists(parts.dir)) {
                    try {
                        dir(parts.dir)
                    } catch (e) {
                        if (options.debug) {
                            logger.error(`Error creating target path ${parts.dir} for ${targetPath}`);
                        }
                        return
                    }
                }
                targets.push({
                    target: targetPath,
                    configuration: conf
                })
            })
        } else {
            variables.SRC_NAME = srcParts.name
            variables.SRC_DIR = srcParts.dir
            variables.CONFIGURATION = conf
            let targetPath = substitute(options.variables.DST_PATH, options.alt, variables)
            if (!exists(targetPath)) {
                try {
                    dir(targetPath)
                } catch (e) {
                    if (options.debug) {
                        logger.error(`Error creating target path ${targetPath}`)
                    }
                    return
                }
            }
            targets.push({
                target: targetPath,
                configuration: conf
            })
        }
    }
    return targets
}
export async function convert(options: SolidworkOptions) {
    logger.setSettings({ minLevel: options.logLevel as any || 'warn' })
    let reports = []
    const onNode = options.onNode || ((data) => reports.push(data))
    if (options.srcInfo.FILES.length === 0) {
        logger.warn(`No files found to convert : `, options.src)
        return
    }
    // Skip orphan / temporary files
    options.srcInfo.FILES = options.srcInfo.FILES.filter((f) => {
        return f.includes('~$') === false
    })

    const ret = await pMap(options.srcInfo.FILES, async (f) => {
        const outputs = targets(f, options)
        logger.info(`Convert ${f} to ${outputs.map(t => t.target).join(',')}`)
        return convertFiles(f, outputs, options.view, onNode, options)
    }, { concurrency: 1 })

    if (options.report) {
        const reportOutFile: string = path.resolve(resolve(options.report, false, {
            dst: options.srcInfo.DIR,
            ...options.variables,
            CONFIGURATION: options.configuration || ''
        }))
        logger.debug(`Write report to ${reportOutFile}`)
        report(reports, reportOutFile)
    }
    return ret
}
@ -1,84 +0,0 @@
import * as CLI from 'yargs'
import * as path from 'path'
import { resolve, forward_slash, pathInfo } from "@polymech/commons"

export const sanitize = (argv: any): SolidworkOptions => {
    const src = forward_slash(path.resolve(resolve(argv.src)))
    const config: any = argv.config ? read(path.resolve('' + argv.config), 'json') : {}
    const extraVariables = {}
    for (const key in config) {
        if (Object.prototype.hasOwnProperty.call(config, key)) {
            const element = config[key];
            if (typeof element === 'string') {
                extraVariables[key] = element
            }
        }
    }
    const args: SolidworkOptions = {
        src: src,
        dst: '' + argv.dst as string,
        debug: argv.debug,
        verbose: argv.verbose,
        dry: argv.dry,
        onNode: argv.onNode,
        cache: argv.cache,
        hidden: argv.hidden || "true",
        renderer: argv.renderer || "solidworks",
        alt: argv.alt,
        quality: argv.quality,
        logLevel: argv.logLevel,
        close: argv.close,
        width: argv.width || "1024",
        height: argv.height || "1024",
        script: argv.script || 'convert.exe',
        sw: argv.sw || 2024,
        swv: argv.swv || 32,
        configuration: argv.configuration || 'Default',
        report: argv.report || DEFAULT_REPORT,
        pack: argv.pack,
        light: argv.light,
        rebuild: argv.rebuild,
        save: argv.save,
        write: argv.write,
        variables: { ...extraVariables },
        view: argv.view || 'Render',
        args: argv.args || '',
        "bom-config": argv['bom-config'],
        "bom-detail": argv['bom-detail'],
        "bom-template": argv['bom-template'],
        "bom-type": argv['bom-type'],
        "bom-images": argv['bom-images'],
    } as SolidworkOptions

    if (!args.src) {
        logger.error('Invalid source, abort')
        return process.exit()
    }

    args.srcInfo = pathInfo(src)

    if (!args.srcInfo.FILES) {
        logger.error(`Invalid source files, abort`, args.srcInfo)
        return process.exit()
    }

    for (const key in args.srcInfo) {
        if (Object.prototype.hasOwnProperty.call(args.srcInfo, key)) {
            args.variables['SRC_' + key] = args.srcInfo[key]
        }
    }

    if (argv.dst) {
        args.dst = path.resolve(substitute(args.dst, args.variables))
        args.dstInfo = pathInfo(args.dst as string)
        args.dstInfo.PATH = argv.dst as string
        for (const key in args.dstInfo) {
            if (Object.prototype.hasOwnProperty.call(args.dstInfo, key)) {
                args.variables['DST_' + key] = args.dstInfo[key]
            }
        }
    }
    (args as SolidworkOptions).view = argv.view as string || "Render"
    return args
}

@ -1,219 +0,0 @@
# **TypeScript Libraries and Tools for PDF Data Extraction to JSON**

**1. Introduction**

The prevalence of Portable Document Format (PDF) files in various domains necessitates efficient programmatic methods for accessing and processing their content. PDFs serve as a standard for document storage and exchange, frequently containing valuable data that applications need to extract and utilize.1 This demand for automated data retrieval has spurred the development of numerous tools and libraries capable of parsing and extracting information from these documents. However, the very nature of the PDF format, designed primarily for visual presentation, introduces significant hurdles for automated extraction processes.2 A single PDF can encompass a diverse range of content, including textual data formatted in intricate layouts, embedded images, and tabular data often represented through visual cues rather than semantic structures.2 Furthermore, the increasing prevalence of scanned documents adds another layer of complexity, as text within these PDFs exists as images, requiring Optical Character Recognition (OCR) to convert them into machine-readable text.3

Given the user's preference for TypeScript, this report will specifically investigate solutions built with or offering robust support for this language. TypeScript, as a statically typed superset of JavaScript, provides enhanced code maintainability and scalability, making it a suitable choice for developing reliable data extraction pipelines.5 This report aims to identify and analyze suitable TypeScript libraries for extracting text, images, and tables from PDF documents, explore available OCR options for handling image-based text, and investigate the potential integration of local Artificial Intelligence (AI) models for advanced structured data extraction, particularly for complex tables. The ultimate goal is to provide a comparative overview of these solutions, summarizing their key features, licensing, development status, and suitability for the task of converting PDF content into JSON format. The report follows a logical progression, starting with basic content extraction and advancing to more sophisticated techniques involving OCR and AI integration, culminating in a comparative analysis and recommendations.

**2. TypeScript Libraries for Basic PDF Content Extraction**

The initial step in processing PDF documents programmatically often involves extracting the fundamental content: text and images. Several TypeScript and JavaScript libraries offer functionalities to achieve this, each with its own strengths and limitations.1
* **2.1. Text Extraction:**
  * **pdf-parse:** This popular Node.js package is recognized for its straightforward approach to extracting text from PDF files.1 It allows developers to easily retrieve the textual content of a PDF document through a user-friendly interface (a minimal usage sketch follows this list).1 However, a notable limitation of pdf-parse is its inability to preserve the structural integrity of tables within the PDF.1 It tends to treat the content of tables as continuous lines of text, which is problematic when structured data is required.1 User reports in online forums also mention issues such as compilation errors in specific environments, suggesting possible environmental dependencies or library-specific bugs.15 Additionally, its capability to retain formatting details like margins, centered text, or page information is limited, as highlighted by user feedback seeking more layout-aware text extraction.16 This makes pdf-parse suitable for basic text retrieval when the document's layout and structure are not critical for subsequent processing.
  * **pdf2json:** This module focuses on transforming PDF files from their binary format into a JSON representation.1 By converting the PDF content into a JSON structure, it provides more granular information than pdf-parse, including the coordinates of text elements within the document.1 This coordinate information could theoretically be used to reconstruct some of the document's layout. However, a significant drawback of pdf2json is its lack of recent updates, with reports indicating that it hasn't been actively maintained for several years.16 This raises concerns regarding its compatibility with newer PDF standards and potential security vulnerabilities. While it offers more structural data than pdf-parse thanks to the coordinates, its outdated status makes it a less reliable choice for long-term projects.
  * **pdf-ts:** As a TypeScript library specifically designed for PDF text extraction, pdf-ts aims to provide a type-safe and well-integrated solution within TypeScript projects.5 Its primary focus is on extracting textual content from PDF documents. The library has garnered some community interest, indicated by its 36 stars and 1 fork on GitHub.5 The last recorded release was on August 7, 2023.5 Being written in TypeScript, it offers the advantages of static typing, which can lead to more robust and maintainable codebases compared to plain JavaScript libraries.
  * **js-extract:** This library is essentially a packaged version of examples demonstrating how to use the widely adopted pdf.js library within a Node.js environment.6 Its core functionality lies in reading a PDF file and exporting all pages along with the extracted text and their corresponding coordinates.6 The inclusion of coordinate data makes js-extract a potentially valuable tool when preserving or reconstructing the structure of the PDF, including the positioning of text elements, is important.6 This capability could be particularly useful for attempting to identify and extract tabular data. The library is licensed under the MIT license, and its last published version (0.2.1) was two years ago.6 The fact that it's built upon pdf.js is noteworthy, as pdf.js is a mature and actively developed library primarily used for rendering PDFs in web browsers, suggesting a solid foundation for its text extraction capabilities.
  * **PDF-LIB:** While the available material does not explicitly detail PDF-LIB's text extraction capabilities, it is mentioned as the underlying library for pdf-io, which focuses on image extraction.17 Additionally, a user in an online forum mentioned attempting to use PDF-LIB for parsing PDFs, suggesting it possesses broader PDF manipulation functionalities beyond image handling.15 PDF-LIB is a powerful library for creating and modifying PDF documents in JavaScript environments.18 Its API provides low-level access to the structure of PDF files, which could be leveraged for custom text extraction logic, although this requires a deeper understanding of the PDF format itself.
* **2.2. Image Extraction:**
  * **pdf-io:** This TypeScript library is specifically designed for extracting images from PDF files.17 It provides functionalities to parse a given PDF document, identify image objects within it, and save these images as PNG files.17 The library relies on the robust pdf-lib for PDF parsing and pngjs for encoding the extracted image data into the PNG format.17 It offers a straightforward API, with a constructor that accepts either a file path or a buffer containing the PDF data, and an extractImages() method to perform the extraction.17 The extracted images can either be saved to a specified output directory or returned as an array of Uint8Array or Buffer objects if the PDF was loaded from a buffer.17 pdf-io is licensed under the MIT license.17 While it directly addresses the need for image extraction, its current GitHub status, with 3 stars, 1 fork, and no recent releases, suggests a relatively small or less actively maintained project.17
  * **node-pdf-extract-image:** Similar to pdf-io, this library focuses on extracting images from PDF documents.7 It utilizes pdfjs-dist, the distribution of Mozilla's pdf.js library for Node.js environments, to read and process PDF files.7 The extracted images are then encoded as PNG files using the pngjs library.7 Note that this library only extracts images explicitly embedded within the PDF and returns an empty array if no images are found.7 It provides a simple asynchronous function, extractImagesFromPdf, which accepts either a buffer containing the PDF data or the path to the PDF file.7 The resulting images are returned as an array of buffers, which can then be written to disk or further processed.7 This library is also licensed under the MIT license.7 By leveraging the widely used pdfjs-dist, it benefits from the maturity and extensive capabilities of pdf.js in handling various PDF structures.
  * **pdf-img-convert:** Although not explicitly identified as a TypeScript library, pdf-img-convert is mentioned as a solution for extracting images from PDFs by converting each page into an image.19 It operates using pdf.js under the hood, suggesting a JavaScript-based implementation. This approach is particularly useful for PDFs whose content cannot be directly extracted as text or individual image objects, as it essentially renders each page as a raster image.
  * **Apryse WebViewer:** This is a commercial JavaScript SDK that offers a comprehensive suite of PDF functionalities, including the extraction of image content.20 It provides a detailed API that allows developers to traverse the PDF's display list, identify elements of type image, and export them in various formats such as PNG or TIFF.20 While the license is commercial, Apryse WebViewer likely offers robust features, performance, and dedicated support, making it a viable option for projects with budget for a commercial solution requiring advanced PDF processing capabilities.
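
To ground the text-extraction path, the following is a minimal sketch of pdf-parse reading a local file and returning plain text, based on its documented buffer-in, text-out interface; the file name is illustrative.

```ts
import { readFile } from 'node:fs/promises';
import pdf from 'pdf-parse'; // default export; assumes esModuleInterop in tsconfig

async function extractText(pdfPath: string): Promise<string> {
  const buffer = await readFile(pdfPath);   // pdf-parse consumes a binary buffer
  const result = await pdf(buffer);         // resolves with text plus metadata
  console.log(`Pages: ${result.numpages}`); // page count reported by the parser
  return result.text;                       // flat text; table structure is not preserved
}

extractText('./sample.pdf').then(console.log).catch(console.error);
```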

**3. Leveraging OCR for Image-Based Text**

When dealing with scanned PDF documents, or PDFs where text is embedded as images, Optical Character Recognition (OCR) technology becomes essential to convert these images into machine-readable text.3 Several JavaScript and TypeScript libraries are available for performing OCR, with tesseract.js being a prominent open-source option.11

* **3.1. TypeScript OCR Libraries:**
  * **tesseract.js:** This library stands out as a pure JavaScript port of the highly regarded Tesseract OCR engine, which boasts support for over 100 languages.11 It enables OCR to be performed directly within a web browser or on a server using Node.js.11 tesseract.js offers functionalities such as automatic text orientation and script detection, and it provides an interface to access bounding boxes for paragraphs, words, and characters.11 Demonstrations show its capability to accurately recognize text from images.22 Installation is straightforward using CDN, npm, or yarn.12 The library is licensed under the Apache-2.0 license and enjoys a high level of community engagement, evidenced by its substantial number of stars on GitHub.12 It operates locally, processing images directly without relying on external AI models for its core OCR functionality.12 This local processing aligns well with the user's preference for local solutions. While tesseract.js itself does not directly support PDF files as input, it can be effectively used in conjunction with libraries like PDF.js to first convert PDF pages into images (e.g., canvas elements) and then perform OCR on these images.21
  * **Other OCR Packages:** The npm ecosystem contains a variety of other packages related to OCR.24 Some of these, like @gzqyl/react-native-ocr and @gutenye/ocr-react-native, are tailored for specific environments like React Native and might offer local OCR capabilities.24 ollama-ocr indicates a potential integration with local visual AI models run by Ollama for OCR tasks.24 Exploring these packages further might reveal specialized features or integrations relevant to specific use cases.
  * **Considerations from Research:** Research comparing different OCR models suggests that the optimal choice depends on the characteristics of the input images.25 While cloud-based models might offer superior accuracy in some scenarios, local models like EasyOCR (mentioned in a research blog) can provide a cost-effective solution with competitive accuracy.25 Tesseract, the engine behind tesseract.js, is known for its wide language support but can struggle with documents that are not clean or machine-generated, such as scanned documents or those with unusual fonts.26 docTR, another open-source option, performs better on scanned documents but lacks handwriting support.26 The suitability of tesseract.js therefore depends on the quality and nature of the images extracted from the PDFs.
* **3.2. Integrating Image Extraction with OCR:**
  The process of extracting text from image-based PDFs typically involves a two-stage approach (sketched below).21 First, a library capable of extracting images from the PDF, such as pdf-io or node-pdf-extract-image, is used to obtain the image data, often in PNG format.7 Second, this image data or the path to the saved image file is provided as input to an OCR library like tesseract.js.12 tesseract.js processes the image and outputs the recognized text.12 Examples demonstrate the use of PDF.js to render PDF pages onto a canvas element, followed by tesseract.js performing OCR on the content of this canvas.21 This method effectively bridges the gap between PDF content and OCR processing in a JavaScript/TypeScript environment.
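
A minimal sketch of this two-stage pipeline, combining node-pdf-extract-image's documented extractImagesFromPdf function with tesseract.js; the named-import shape is an assumption, and English ('eng') is used as the recognition language.

```ts
import { extractImagesFromPdf } from 'node-pdf-extract-image'; // returns PNG buffers per its docs
import Tesseract from 'tesseract.js';

// Stage 1: extract embedded images; stage 2: OCR each one.
async function ocrPdfImages(pdfPath: string): Promise<string[]> {
  const images = await extractImagesFromPdf(pdfPath); // empty array if no embedded images
  const texts: string[] = [];
  for (const png of images) {
    const { data } = await Tesseract.recognize(png, 'eng'); // buffers are a supported input
    texts.push(data.text);
  }
  return texts;
}
```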

**4. Extracting Tables from PDFs in TypeScript**

Extracting tabular data from PDF documents presents a more complex challenge than simple text or image extraction.4 This is primarily because PDFs do not inherently define tables as semantic structures; instead, tables are usually rendered using lines and text elements positioned in a grid-like fashion.4 This lack of semantic information necessitates more sophisticated techniques to identify and reconstruct the table structure.

* **4.1. Rule-based Table Extraction Libraries:**
  * **pdf-tables-parser:** This JavaScript/TypeScript library is specifically designed to address the challenge of extracting text tables from PDF files.8 It aims to efficiently parse PDF documents and extract structured table data, even from multi-page PDFs and those with complex layouts.8 The library offers several configurable options, such as hasTitles to indicate if tables have title rows, threshold to adjust the sensitivity for grouping rows, maxStrLength for setting a maximum string length for cells, and ignoreTexts to specify text to be ignored during extraction.8 The extracted table data is provided as a straightforward 2D array, where each inner array represents a row and the elements within are the cell contents.8 While it's a TypeScript library, its GitHub status with only 1 star and 0 forks suggests a relatively new or less widely adopted project.8
  * **@mkas3/pdf-table-parser:** This library is a TypeScript-based rewrite of the JavaScript library pdf-table-extractor, with the added benefit of built-in type declarations.9 Its goal is to simplify the process of parsing tables from PDF files (a hedged usage sketch follows this list).9 It offers options like maxEdgesPerPage to control the number of edges processed on each page and a progressFunc callback to monitor the extraction process.9 The library returns a Promise that resolves to an array of page objects, where each page object contains an array of tables. Each table is represented by an array of rows, and each row contains an array of cell objects with their content.9 This structured JSON output format is convenient for further data processing. The library has been published more recently and has seen some weekly downloads, indicating a degree of current usage.9
  * **@kobataku/pdf-table-extractor:** This package is presented as a fork of the original pdf-table-extractor library, specifically created to provide a valid npm module for TypeScript development.10 It allows users to extract tables from PDF files and obtain the data as a 2D array.10 However, this package was published six years ago and has a very low number of weekly downloads, which suggests it is no longer actively maintained or has been superseded by more recent alternatives like @mkas3/pdf-table-parser.10 For detailed information on the extraction algorithm, users are referred to the original repository.10
  * **pdf2array:** This is described as a hobby project that aims to simplify the extraction of tabular data from PDF files using the pdf.js library.28 Being in its early stages of development, its reliability and capabilities may be limited compared to more established libraries. However, its existence indicates an ongoing interest within the TypeScript community in developing better solutions for PDF table extraction.
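
The snippets above document @mkas3/pdf-table-parser's options and its pages → tables → rows → cells result shape, but not its exported entry point, so the sketch below uses a hypothetical parsePdfTables import purely for illustration.

```ts
// `parsePdfTables` is a hypothetical name; only the options and the
// pages -> tables -> rows -> cells result shape come from the package description.
import { parsePdfTables } from '@mkas3/pdf-table-parser';

async function dumpTables(pdfPath: string): Promise<void> {
  const pages = await parsePdfTables(pdfPath, {
    maxEdgesPerPage: 10000,                                     // documented option
    progressFunc: (p: number) => console.log(`progress: ${p}`), // documented callback
  });
  for (const page of pages) {
    for (const table of page.tables) {
      // Each table is an array of rows; each row holds cell objects with `content`.
      const grid = table.map((row: any[]) => row.map((cell) => cell.content));
      console.log(grid);
    }
  }
}
```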
* **4.2. Potential of Local AI Models for Advanced Table Extraction:**
  Traditional rule-based approaches to table extraction often face significant limitations due to the inherent lack of semantic structure in PDF documents.1 Tables are visually interpreted by humans based on layout, lines, and the spatial arrangement of text, but translating this human intuition into robust code is challenging.4 Issues like merged cells, tables without clear borderlines, and inconsistent layouts can easily confuse rule-based algorithms.29
  The emergence of AI, particularly Large Language Models (LLMs) and vision models, offers promising avenues for more advanced and accurate table extraction.2 These models can leverage their understanding of language and visual patterns to identify and interpret table structures more effectively.32 For instance, LayoutPDFReader within the llamaindex framework employs intelligent chunking to preserve the context of tables, although it currently lacks OCR capabilities.2 Tools like AlgoDocs and Docsumo utilize AI to extract tables, even from scanned documents.3 The Unstructured library provides a hi_res strategy that leverages AI for improved table extraction.31 gmft is a specialized tool that uses Microsoft's TATR model for deep table recognition, focusing on alignment and speed by potentially reusing existing OCR output.30 PDF-Extract-Kit integrates state-of-the-art models for various document parsing tasks, including table recognition capable of outputting in formats like LaTeX, HTML, and Markdown.33 Converting the PDF to an image and then using tools like img2table is another approach that combines image processing with potential AI-driven table detection.29
  Integrating local AI models into a TypeScript-based workflow for table extraction is an area of growing interest.13 Libraries like instructor-js facilitate structured extraction using LLMs (including local ones through platforms like Ollama) by defining schemas using Zod; see the sketch after this section.13 Documind is an open-source tool that can extract structured data, including from tables, using both cloud-based (OpenAI) and local LLMs (Llava and Llama3.2).14 Unstructured also supports integration with local models via Ollama for various data extraction tasks, including tables.31 One potential approach involves using OCR (like Tesseract.js) to extract text and then feeding this text into a local LLM with specific instructions to identify and structure tabular data.42 While the direct TypeScript integration of local AI models specifically for PDF table extraction is still evolving, these initial explorations suggest a promising direction for overcoming the limitations of traditional methods.
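
As a concrete illustration, the following sketch uses instructor-js in JSON mode against Ollama's OpenAI-compatible endpoint to pull a table out of extracted PDF text; the model name and local endpoint are typical defaults and should be treated as assumptions.

```ts
import Instructor from '@instructor-ai/instructor';
import OpenAI from 'openai';
import { z } from 'zod';

// Ollama exposes an OpenAI-compatible API; localhost:11434 is its default port.
const oai = new OpenAI({ baseURL: 'http://localhost:11434/v1', apiKey: 'ollama' });
const client = Instructor({ client: oai, mode: 'JSON' });

// Schema describing the structure we want the model to return.
const TableSchema = z.object({
  headers: z.array(z.string()),
  rows: z.array(z.array(z.string())),
});

async function extractTable(pdfText: string) {
  return client.chat.completions.create({
    model: 'llama3.2', // assumes this model has been pulled locally
    messages: [{ role: 'user', content: `Extract the table from this text:\n${pdfText}` }],
    response_model: { schema: TableSchema, name: 'Table' },
  });
}
```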

**5. Converting Extracted Data to JSON Format**

Once the desired data (text, images, or tables) has been extracted from the PDF document, the final step often involves converting this information into JSON (JavaScript Object Notation) format.63 JSON is a lightweight data-interchange format that is easy for humans to read and write and easy for machines to parse and generate.66

The structure of the extracted data will vary depending on the library and the type of content. Extracted text might be a simple string, images could be represented as file paths or base64 encoded strings, and tables might be in the form of 2D arrays or more complex nested objects.8

TypeScript provides a built-in global object, JSON, which offers methods for working with JSON data, including the crucial stringify() method.63 This method takes a JavaScript value (an object or an array) and converts it into a JSON string.65

For example, if a table is extracted as a 2D array in TypeScript:

```ts
const tableData: string[][] = [
  ["Header 1", "Header 2"],
  ["Data 1", "Data 2"],
  ["Data 3", "Data 4"],
];
```

This can be easily converted to a JSON string using JSON.stringify():

```ts
const jsonString: string = JSON.stringify(tableData);
console.log(jsonString); // Output: [["Header 1","Header 2"],["Data 1","Data 2"],["Data 3","Data 4"]]
```

Similarly, if the extracted data is structured as an array of objects, as might be the case with @mkas3/pdf-table-parser:9

```ts
const pageTables = [
  {
    page: 1,
    tables: [
      {
        rows: [
          [{ content: "Header 1" }, { content: "Header 2" }],
          [{ content: "Data 1" }, { content: "Data 2" }],
        ],
      },
    ],
  },
];
```

This can also be converted to a JSON string:

```ts
const jsonOutput: string = JSON.stringify(pageTables);
console.log(jsonOutput);
/*
Output:
[{"page":1,"tables":[{"rows":[[{"content":"Header 1"},{"content":"Header 2"}],[{"content":"Data 1"},{"content":"Data 2"}]]}]}]
*/
```

Various techniques exist for transforming arrays of objects into a JSON object with specific key-value pairs if a different structure is desired.63 TypeScript's flexibility allows for structuring the extracted data in a way that best suits the application's needs before the final conversion to JSON using JSON.stringify(); for instance, a 2D table array can be re-keyed by its header row, as in the sketch below.
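
A simple illustration of such a transformation, in plain TypeScript with no external dependencies:

```ts
// Convert a 2D table (first row = headers) into an array of keyed row objects.
function tableToObjects(table: string[][]): Record<string, string>[] {
  const [headers, ...rows] = table;
  return rows.map((row) =>
    Object.fromEntries(headers.map((h, i) => [h, row[i] ?? ''])),
  );
}

const records = tableToObjects([
  ["Header 1", "Header 2"],
  ["Data 1", "Data 2"],
]);
console.log(JSON.stringify(records));
// [{"Header 1":"Data 1","Header 2":"Data 2"}]
```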

**6. Integrating Local AI Models for Structured Data Extraction**

Integrating local AI models for structured data extraction, including from PDFs, involves several considerations regarding setup, model selection, and interaction.13

* **General Approaches:**
  The primary ways to leverage local LLMs in a TypeScript environment are libraries that provide direct integration with local LLM inference servers, and direct API calls to these servers (a minimal HTTP sketch follows this list).13 Libraries like instructor-js offer integration with platforms like Ollama, which simplifies the process of running and interacting with local models.13 Similarly, tools like Documind and Unstructured are designed to work with local LLMs for document processing tasks.14 If a library doesn't offer built-in integration, developers can make HTTP requests to the API endpoints exposed by local LLM inference servers, such as those provided by Ollama.45
* **Considerations for Model Selection and Setup:**
  Choosing the right local LLM is crucial and depends on the specific data extraction task.25 Different models possess varying capabilities in understanding text, identifying entities, and structuring data. Models specifically fine-tuned for information extraction or document understanding are generally preferred.56 Running LLMs locally can be computationally intensive, requiring adequate hardware resources, including CPU, RAM, and potentially a dedicated GPU for optimal performance.50 Setting up the local LLM inference server (e.g., using Ollama or a similar framework) and downloading the desired model are necessary prerequisites.45 It's also important to consider the availability and licensing terms of the models.12 Effective communication with the LLM is achieved through careful prompt engineering, which involves crafting clear and specific instructions to guide the model in extracting and structuring the data according to the desired schema.36
* **Local AI for Table Extraction (Revisited):**
  As previously discussed, local AI models hold significant potential for enhancing table extraction from PDFs.14 By leveraging their semantic understanding, these models can often overcome the limitations of rule-based methods when dealing with complex table structures.14 Libraries like instructor-js with its JSON mode allow developers to define a schema representing the desired table structure and instruct the local LLM to extract the table data in that format.13 Tools like Documind and Unstructured also aim to facilitate this process by providing abstractions for interacting with local LLMs for document intelligence tasks, including table extraction.14 A common strategy involves first extracting the text content of the PDF (potentially using OCR if needed) and then providing this text along with a well-crafted prompt and schema to the local LLM to identify and structure the tabular data.42
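
For the direct-API route, a minimal sketch of a raw HTTP call to Ollama's /api/generate endpoint; it assumes a default local installation with the named model already pulled.

```ts
// Minimal sketch of calling a local Ollama server directly over HTTP.
async function askLocalModel(prompt: string): Promise<string> {
  const res = await fetch('http://localhost:11434/api/generate', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      model: 'llama3.2', // assumption: any locally pulled model works here
      prompt,
      stream: false,     // return one JSON object instead of a token stream
      format: 'json',    // ask Ollama to constrain output to valid JSON
    }),
  });
  const payload = await res.json();
  return payload.response; // the model's generated text
}
```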

**7. Comparative Analysis of Potential Solutions**

The following table summarizes the key findings for several TypeScript and JavaScript libraries and tools discussed in this report, highlighting their capabilities for PDF data extraction to JSON.

| Name | Description | Links | License | Status | Features | Local AI Model Integration Details |
| :---- | :---- | :---- | :---- | :---- | :---- | :---- |
| pdf-parse | Node.js library for basic text extraction from PDFs. | [https://www.npmjs.com/package/pdf-parse](https://www.npmjs.com/package/pdf-parse) | MIT | Popular, user-friendly interface | Text extraction. Limited table structure preservation. | No explicit local AI integration. |
| pdf2json | Transforms PDF to JSON format, including text coordinates. | [https://www.npmjs.com/package/pdf2json](https://www.npmjs.com/package/pdf2json) | Apache-2.0 | Outdated, not recently updated | Converts PDF to JSON, includes text coordinates. | No explicit local AI integration. |
| pdf-ts | TypeScript library for PDF text extraction. | [https://github.com/axflow/pdf-ts](https://github.com/axflow/pdf-ts) | MIT | Moderate community interest, last release Aug 2023 | Text extraction. | No explicit local AI integration. |
| js-extract | Extracts text with coordinates from PDFs using pdf.js. | [https://www.npmjs.com/package/pdf.js-extract](https://www.npmjs.com/package/pdf.js-extract) | MIT | Last published 2 years ago | Text extraction with coordinates. Potential for table reconstruction. | No explicit local AI integration. |
| pdf-io | TypeScript library for image extraction from PDFs. | [https://github.com/Sorvereign/pdf-io](https://github.com/Sorvereign/pdf-io) | MIT | Low activity, no recent releases | Image extraction to PNG. Built on pdf-lib and pngjs. | No explicit local AI integration. |
| node-pdf-extract-image | Extracts images from PDFs using pdfjs-dist and pngjs. | [https://github.com/bangbang93/node-pdf-extract-image](https://github.com/bangbang93/node-pdf-extract-image) | MIT | No explicit status provided | Image extraction to PNG. | No explicit local AI integration. |
| tesseract.js | Pure JavaScript port of Tesseract OCR engine. | [https://github.com/naptha/tesseract.js/](https://github.com/naptha/tesseract.js/) | Apache-2.0 | Highly active and popular | OCR for over 100 languages. Runs in browser and Node.js. | Local processing, not direct integration with external AI models for OCR. |
| pdf-tables-parser | JavaScript/TypeScript library for text table extraction. | [https://github.com/kanakkholwal/pdf-tables-parser](https://github.com/kanakkholwal/pdf-tables-parser) | ISC | Low adoption, possibly newer | Extracts tables as 2D arrays. Supports multi-page PDFs and complex layouts. Configurable options. | No explicit local AI integration. |
| @mkas3/pdf-table-parser | TypeScript rewrite of pdf-table-extractor for table parsing. | [https://www.npmjs.com/package/@mkas3/pdf-table-parser](https://www.npmjs.com/package/@mkas3/pdf-table-parser) | MIT | Some recent activity | Parses tables from PDFs. Typed. Offers structured JSON output. | No explicit local AI integration. |
| @kobataku/pdf-table-extractor | TypeScript fork of pdf-table-extractor for table extraction. | [https://www.npmjs.com/package/@kobataku/pdf-table-extractor](https://www.npmjs.com/package/@kobataku/pdf-table-extractor) | BSD License | Outdated, very low activity | Extracts tables as 2D arrays. | No explicit local AI integration. |
| instructor-js | TypeScript library for structured extraction using LLMs. | [https://github.com/instructor-ai/instructor-js](https://github.com/instructor-ai/instructor-js) | MIT | Actively developed | Facilitates structured data extraction using LLMs (including local via Ollama) and Zod schemas. Supports various output modes (JSON, TOOLS, FUNCTIONS). | Supports local models via integration with llm-polyglot and platforms like Ollama. Requires setting up the local LLM and defining schemas. |
| Documind | Open-source tool for turning documents into structured data. | [https://github.com/DocumindHQ/documind](https://github.com/DocumindHQ/documind) | Not specified | Actively developed | Extracts structured JSON from unstructured documents. Supports custom schemas. Works with OpenAI and local LLMs (Llava, Llama3.2). Converts documents to Markdown. | Supports local LLMs (Llava and Llama3.2). Requires installation of system dependencies (Ghostscript, GraphicsMagick) and setting up environment variables for API keys (if using cloud models). |
| Unstructured | Open-source library for pre-processing unstructured data. | [https://github.com/Unstructured-IO/unstructured](https://github.com/Unstructured-IO/unstructured) | Apache-2.0 | Actively developed | Extracts raw text, tables, and metadata from various document formats, including PDFs. Modular functions and connectors for data ingestion and pre-processing. | Supports local AI model processing via integration with Ollama. Requires installing Ollama and configuring the connection within Unstructured. |

**8. Recommendations and Implementation Considerations**

Based on the analysis, the selection of libraries and tools will depend on the specific requirements of the PDF data extraction task. For basic text extraction without the need for structural information, pdf-parse offers an easy-to-use solution.1 However, for scenarios requiring more structural awareness, including the potential reconstruction of tables, js-extract with its coordinate data might be a better starting point.6

For image extraction, both pdf-io and node-pdf-extract-image provide dedicated functionalities, with the latter leveraging the widely adopted pdfjs-dist.7 If dealing with scanned documents or PDFs with text embedded as images, tesseract.js stands out as a robust and actively maintained OCR library with extensive language support.11 A common implementation strategy would involve using an image extraction library to obtain images and then passing them to tesseract.js for OCR.21

Extracting tables accurately remains a complex challenge. For rule-based approaches in TypeScript, @mkas3/pdf-table-parser appears to be a more recently updated and actively used option compared to others like @kobataku/pdf-table-extractor.9 However, for more complex or poorly formatted tables, leveraging local AI models offers a promising direction. Libraries like instructor-js, Documind, and Unstructured provide mechanisms to integrate with local LLM inference servers (e.g., via Ollama) to perform structured data extraction, including from tables.13 This approach typically involves defining a schema for the desired output format and prompting the LLM to extract and structure the data accordingly.36

When implementing a PDF data extraction solution, it's crucial to consider potential performance bottlenecks, especially when processing large files or using local AI models, which can be resource-intensive.50 Optimization techniques might be necessary to ensure acceptable processing times. Robust error handling is also essential to manage potential issues with malformed PDFs or unexpected content.15 For integrating local AI models, the setup involves installing and configuring the LLM inference server (like Ollama), downloading the desired models, and potentially fine-tuning prompts to achieve the desired accuracy and output format.45

**9. Conclusion**

This report has explored a range of TypeScript libraries and tools capable of extracting various types of data from PDF documents and converting it into JSON format. While libraries like pdf-parse, pdf-ts, and js-extract offer solutions for text extraction, and pdf-io and node-pdf-extract-image facilitate image retrieval, the extraction of tabular data and text from images often requires more advanced techniques. tesseract.js provides a strong foundation for OCR in TypeScript environments.

The integration of local AI models represents a significant advancement in the field of PDF data extraction, particularly for handling the complexities of table extraction and other structured information retrieval tasks. Libraries like instructor-js, along with tools such as Documind and Unstructured, are paving the way for leveraging the power of local LLMs within TypeScript applications to achieve more accurate and nuanced data extraction from PDF documents. As this field continues to evolve, the ability to effectively combine traditional extraction methods with the semantic understanding of AI models will be crucial for unlocking the vast amounts of data contained within PDF files.

#### **Works cited**

1. Parsing PDFs in Node.js - LogRocket Blog, accessed on April 24, 2025, [https://blog.logrocket.com/parsing-pdfs-node-js/](https://blog.logrocket.com/parsing-pdfs-node-js/)
2. Mastering PDFs: Extracting Sections, Headings, Paragraphs, and Tables with Cutting-Edge Parser - LlamaIndex, accessed on April 24, 2025, [https://www.llamaindex.ai/blog/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125](https://www.llamaindex.ai/blog/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125)
3. PDF Image Extraction: A Comprehensive Guide To Extracting Image Data From Scanned Pdf Files In 2025 - AlgoDocs, accessed on April 24, 2025, [https://www.algodocs.com/pdf-image-extraction-comprehensive-guide-2025/](https://www.algodocs.com/pdf-image-extraction-comprehensive-guide-2025/)
4. How can I extract tables as structured data from PDF documents? - Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/17591426/how-can-i-extract-tables-as-structured-data-from-pdf-documents](https://stackoverflow.com/questions/17591426/how-can-i-extract-tables-as-structured-data-from-pdf-documents)
5. axflow/pdf-ts: PDF text extraction in TypeScript - GitHub, accessed on April 24, 2025, [https://github.com/axflow/pdf-ts](https://github.com/axflow/pdf-ts)
6. pdf.js-extract - npm, accessed on April 24, 2025, [https://www.npmjs.com/package/pdf.js-extract](https://www.npmjs.com/package/pdf.js-extract)
7. bangbang93/node-pdf-extract-image - GitHub, accessed on April 24, 2025, [https://github.com/bangbang93/node-pdf-extract-image](https://github.com/bangbang93/node-pdf-extract-image)
8. kanakkholwal/pdf-tables-parser: Library to extract text ... - GitHub, accessed on April 24, 2025, [https://github.com/kanakkholwal/pdf-tables-parser](https://github.com/kanakkholwal/pdf-tables-parser)
9. @mkas3/pdf-table-parser - npm, accessed on April 24, 2025, [https://www.npmjs.com/package/@mkas3/pdf-table-parser](https://www.npmjs.com/package/@mkas3/pdf-table-parser)
10. @kobataku/pdf-table-extractor - npm, accessed on April 24, 2025, [https://www.npmjs.com/package/@kobataku/pdf-table-extractor](https://www.npmjs.com/package/@kobataku/pdf-table-extractor)
11. Tesseract.js | Pure Javascript OCR for 100 Languages!, accessed on April 24, 2025, [https://tesseract.projectnaptha.com/](https://tesseract.projectnaptha.com/)
12. naptha/tesseract.js: Pure Javascript OCR for more than 100 Languages - GitHub, accessed on April 24, 2025, [https://github.com/naptha/tesseract.js/](https://github.com/naptha/tesseract.js/)
13. instructor-ai/instructor-js: structured extraction for llms - GitHub, accessed on April 24, 2025, [https://github.com/instructor-ai/instructor-js](https://github.com/instructor-ai/instructor-js)
14. DocumindHQ/documind: Open-source platform for extracting structured data from documents using AI. - GitHub, accessed on April 24, 2025, [https://github.com/DocumindHQ/documind](https://github.com/DocumindHQ/documind)
15. PDF Parsing with Typescript - Palantir Developer Community, accessed on April 24, 2025, [https://community.palantir.com/t/pdf-parsing-with-typescript/718](https://community.palantir.com/t/pdf-parsing-with-typescript/718)
16. Looking for a good pdf-parser to extract text. Any suggestions? : r/node - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/node/comments/186y7y0/looking_for_a_good_pdfparser_to_extract_text_any/](https://www.reddit.com/r/node/comments/186y7y0/looking_for_a_good_pdfparser_to_extract_text_any/)
17. Sorvereign/pdf-io: A TypeScript library that allows you to ... - GitHub, accessed on April 24, 2025, [https://github.com/Sorvereign/pdf-io](https://github.com/Sorvereign/pdf-io)
18. Javascript Extract Images From Pdf | Restackio, accessed on April 24, 2025, [https://www.restack.io/p/javascript-extract-images-from-pdf-answer-cat-ai](https://www.restack.io/p/javascript-extract-images-from-pdf-answer-cat-ai)
19. Extract images from PDF file with JavaScript - Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/18680261/extract-images-from-pdf-file-with-javascript](https://stackoverflow.com/questions/18680261/extract-images-from-pdf-file-with-javascript)
20. PDF Image Extraction Library for JavaScript - Apryse documentation, accessed on April 24, 2025, [https://docs.apryse.com/web/guides/extraction/image-extract](https://docs.apryse.com/web/guides/extraction/image-extract)
21. Using OCR in JavaScript to extract text - Dropbox Sign, accessed on April 24, 2025, [https://sign.dropbox.com/blog/using-ocr-in-javascript](https://sign.dropbox.com/blog/using-ocr-in-javascript)
22. JavaScript OCR Using Tesseract.js | Interesting JS Library Series | Episode 1 - YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=kHTasYqs4Tw](https://www.youtube.com/watch?v=kHTasYqs4Tw)
23. Running OCR against PDFs and images directly in your browser - Simon Willison's Weblog, accessed on April 24, 2025, [https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/](https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/)
24. ocr - npm search, accessed on April 24, 2025, [https://www.npmjs.com/search?q=ocr&page=2](https://www.npmjs.com/search?q=ocr&page=2)
25. Best OCR Models for Text Recognition in Images - Roboflow Blog, accessed on April 24, 2025, [https://blog.roboflow.com/best-ocr-models-text-recognition/](https://blog.roboflow.com/best-ocr-models-text-recognition/)
26. Our search for the best OCR tool in 2023, and what we found - Source - OpenNews, accessed on April 24, 2025, [https://source.opennews.org/articles/our-search-best-ocr-tool-2023/](https://source.opennews.org/articles/our-search-best-ocr-tool-2023/)
27. pomgui/pdf-tables-parser: Library to parse a pdf file and extract all the tables contained returning a json object. - GitHub, accessed on April 24, 2025, [https://github.com/pomgui/pdf-tables-parser](https://github.com/pomgui/pdf-tables-parser)
28. Extracting tabular data from PDF files : r/typescript - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/typescript/comments/ze3b8c/extracting_tabular_data_from_pdf_files/](https://www.reddit.com/r/typescript/comments/ze3b8c/extracting_tabular_data_from_pdf_files/)
29. Extract tables from PDF for RAG : r/LangChain - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1cn0z11/extract_tables_from_pdf_for_rag/](https://www.reddit.com/r/LangChain/comments/1cn0z11/extract_tables_from_pdf_for_rag/)
30. PDF Table Extraction, the Definitive Guide (+ gmft release!) : r/LangChain - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1dclawv/pdf_table_extraction_the_definitive_guide_gmft/](https://www.reddit.com/r/LangChain/comments/1dclawv/pdf_table_extraction_the_definitive_guide_gmft/)
31. Table extraction from PDF - Unstructured, accessed on April 24, 2025, [https://docs.unstructured.io/examplecode/codesamples/apioss/table-extraction-from-pdf](https://docs.unstructured.io/examplecode/codesamples/apioss/table-extraction-from-pdf)
32. Best table parsers of pdf? : r/LangChain - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1fwt2cn/best_table_parsers_of_pdf/](https://www.reddit.com/r/LangChain/comments/1fwt2cn/best_table_parsers_of_pdf/)
33. opendatalab/PDF-Extract-Kit: A Comprehensive Toolkit for High-Quality PDF Content Extraction - GitHub, accessed on April 24, 2025, [https://github.com/opendatalab/PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
34. Extract table from pdf and images online - Docsumo, accessed on April 24, 2025, [https://www.docsumo.com/free-tools/extract-tables-from-pdf-images](https://www.docsumo.com/free-tools/extract-tables-from-pdf-images)
35. LLM model for table data - Languages at Hugging Face, accessed on April 24, 2025, [https://discuss.huggingface.co/t/llm-model-for-table-data/44230](https://discuss.huggingface.co/t/llm-model-for-table-data/44230)
36. Structured Data Extraction | Phoenix - Arize AI, accessed on April 24, 2025, [https://docs.arize.com/phoenix/cookbook/structured-data-extraction](https://docs.arize.com/phoenix/cookbook/structured-data-extraction)
37. Building a Trend Detection System with AI in TypeScript: A Step-by-Step Guide - Firecrawl, accessed on April 24, 2025, [https://www.firecrawl.dev/blog/trend-finder-typescript](https://www.firecrawl.dev/blog/trend-finder-typescript)
38. Build a custom RAG AI agent in TypeScript and Jupyter - Deno, accessed on April 24, 2025, [https://deno.com/blog/build-custom-rag-ai-agent](https://deno.com/blog/build-custom-rag-ai-agent)
39. Building a Clone of OpenAI's Deep Research with TypeScript and Firecrawl, accessed on April 24, 2025, [https://www.firecrawl.dev/blog/open-deep-research-explainer](https://www.firecrawl.dev/blog/open-deep-research-explainer)
40. Structured Data Extraction - LlamaIndex, accessed on April 24, 2025, [https://docs.llamaindex.ai/en/stable/use_cases/extraction/](https://docs.llamaindex.ai/en/stable/use_cases/extraction/)
41. Possible to write TypeScript package to call into local LLM and generate consistent output? : r/LLMDevs - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LLMDevs/comments/1ixlj2w/possible_to_write_typescript_package_to_call_into/](https://www.reddit.com/r/LLMDevs/comments/1ixlj2w/possible_to_write_typescript_package_to_call_into/)
42. What model would you use to extract full pdf? : r/ollama - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/ollama/comments/1gc8je1/what_model_would_you_use_to_extract_full_pdf/](https://www.reddit.com/r/ollama/comments/1gc8je1/what_model_would_you_use_to_extract_full_pdf/)
43. instructor-ai/instructor: structured outputs for llms - GitHub, accessed on April 24, 2025, [https://github.com/instructor-ai/instructor](https://github.com/instructor-ai/instructor)
44. Instructor (JS): Welcome To Instructor, accessed on April 24, 2025, [https://js.useinstructor.com/](https://js.useinstructor.com/)
45. Build a Local AI Chatbot with Ollama and JavaScript | Full Guide - YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=qY2xYNJhB1A](https://www.youtube.com/watch?v=qY2xYNJhB1A)
46. Structured Output for Open Source and Local LLMS - Instructor (JS), accessed on April 24, 2025, [https://js.useinstructor.com/blog/2024/03/07/open-source-local-structured-output-zod-json-openai/](https://js.useinstructor.com/blog/2024/03/07/open-source-local-structured-output-zod-json-openai/)
47. Build a 3D AI Teacher w/ Next.js, ChatGPT & Azure - YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=_bi4Ol0QEL4](https://www.youtube.com/watch?v=_bi4Ol0QEL4)
48. Building LLM Agents in JavaScript: A Comprehensive Guide - Adyog, accessed on April 24, 2025, [https://blog.adyog.com/2024/09/11/building-llm-agents-in-javascript-a-comprehensive-guide/](https://blog.adyog.com/2024/09/11/building-llm-agents-in-javascript-a-comprehensive-guide/)
49. Harry-027/DocuMind: A document based RAG application - GitHub, accessed on April 24, 2025, [https://github.com/Harry-027/DocuMind](https://github.com/Harry-027/DocuMind)
50. Building a Local LLM Rig: Need Advice on Components and Setup! : r/LocalLLM - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LocalLLM/comments/1k5i84j/building_a_local_llm_rig_need_advice_on/](https://www.reddit.com/r/LocalLLM/comments/1k5i84j/building_a_local_llm_rig_need_advice_on/)
51. DocuMind (RAG app using Ollama) - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/ollama/comments/1jqajhl/documind_rag_app_using_ollama/](https://www.reddit.com/r/ollama/comments/1jqajhl/documind_rag_app_using_ollama/)
52. Documind - Chat with PDF AI, accessed on April 24, 2025, [https://www.documind.chat/](https://www.documind.chat/)
53. Show HN: Documind – Open-source AI tool to turn documents into structured data, accessed on April 24, 2025, [https://news.ycombinator.com/item?id=42171311](https://news.ycombinator.com/item?id=42171311)
54. LM Studio + AnythingLLM: Process Local Documents with RAG Like a Pro! - YouTube, accessed on April 24, 2025, [https://m.youtube.com/watch?v=UG8uftJXcNs](https://m.youtube.com/watch?v=UG8uftJXcNs)
55. Extract Entities Using Azure OpenAI Structured Outputs Mode | Microsoft Learn, accessed on April 24, 2025, [https://learn.microsoft.com/en-us/azure/developer/ai/how-to/extract-entities-using-structured-outputs](https://learn.microsoft.com/en-us/azure/developer/ai/how-to/extract-entities-using-structured-outputs)
56. Structured Data Extraction from Unstructured Text Python LLMs Ollama Pydantic Llama 3.2 Granite 3.2 - IBM TechXchange Community, accessed on April 24, 2025, [https://community.ibm.com/community/user/blogs/nickolus-plowden/2025/04/10/structured-data-extraction-from-unstructured-text](https://community.ibm.com/community/user/blogs/nickolus-plowden/2025/04/10/structured-data-extraction-from-unstructured-text)
57. Structured data extraction from unstructured content using LLM schemas, accessed on April 24, 2025, [https://simonwillison.net/2025/Feb/28/llm-schemas/](https://simonwillison.net/2025/Feb/28/llm-schemas/)
58. Open-Source Document Extraction: Unstract, DeepSeek & PostgreSQL, accessed on April 24, 2025, [https://unstract.com/blog/open-source-document-data-extraction-with-unstract-deepseek/](https://unstract.com/blog/open-source-document-data-extraction-with-unstract-deepseek/)
59. How to Convert Unstructured Data to Structured Data Using AI - Multimodal.dev, accessed on April 24, 2025, [https://www.multimodal.dev/post/how-to-convert-unstructured-data-to-structured-data](https://www.multimodal.dev/post/how-to-convert-unstructured-data-to-structured-data)
60. Extracting unstructured text and images into database tables with GPT-4 Turbo and Datasette Extract - YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=g3NtJatmQR0](https://www.youtube.com/watch?v=g3NtJatmQR0)
61. Unstructured-IO/unstructured: Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines. - GitHub, accessed on April 24, 2025, [https://github.com/Unstructured-IO/unstructured](https://github.com/Unstructured-IO/unstructured)
62. Text-to-Table: Extracting Unstructured Data from a Large Legal Text : r/LangChain - Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1gl3pl7/texttotable_extracting_unstructured_data_from_a/](https://www.reddit.com/r/LangChain/comments/1gl3pl7/texttotable_extracting_unstructured_data_from_a/)
63. Typescript convert an array to JSON - Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/48101176/typescript-convert-an-array-to-json](https://stackoverflow.com/questions/48101176/typescript-convert-an-array-to-json)
64. How to Convert an Object to a JSON String in Typescript - GeeksforGeeks, accessed on April 24, 2025, [https://www.geeksforgeeks.org/how-to-convert-an-object-to-a-json-string-in-typescript/](https://www.geeksforgeeks.org/how-to-convert-an-object-to-a-json-string-in-typescript/)
65. JSON.stringify() - JavaScript - MDN Web Docs, accessed on April 24, 2025, [https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify)
66. How do I convert JavaScript array to JSON? - ReqBin, accessed on April 24, 2025, [https://reqbin.com/code/javascript/n2ek7onb/javascript-array-to-json-example](https://reqbin.com/code/javascript/n2ek7onb/javascript-array-to-json-example)
67. Converting an Array to a JSON Object in JavaScript - Boot.dev Blog, accessed on April 24, 2025, [https://blog.boot.dev/javascript/converting-an-array-to-json-object-in-javascript/](https://blog.boot.dev/javascript/converting-an-array-to-json-object-in-javascript/)
68. How to convert array to JSON in typescript - javascript - Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/73301732/how-to-convert-array-to-json-in-typescript](https://stackoverflow.com/questions/73301732/how-to-convert-array-to-json-in-typescript)
69. How to convert an array to a JSON object - Codedamn, accessed on April 24, 2025, [https://codedamn.com/news/javascript/how-to-convert-an-array-to-a-json-object](https://codedamn.com/news/javascript/how-to-convert-an-array-to-a-json-object)
@ -1,137 +0,0 @@

# **Analysis of Open-Source Image to Markdown Conversion Tools with Table and AI Support**

**1. Introduction**

The conversion of visual information into text-based formats like Markdown is an increasingly important capability for a variety of applications. These include the creation of accessible documentation, efficient knowledge management systems, and the integration of content with large language models for further processing and analysis. The user's specific requirements for tools in this domain are multifaceted, focusing on open-source availability, seamless integration with TypeScript projects, the accurate conversion of images to Markdown with particular attention to tabular data, and the utilization of artificial intelligence for enhanced accuracy, provided that the AI is either freely accessible or can be hosted locally. This report aims to analyze the provided research material to identify and evaluate open-source tools that align with these stringent criteria. The investigation will delve into the features, capabilities, and limitations of each potential solution, ultimately providing a comprehensive overview to aid in the selection of the most suitable tool.

**2. Detailed Analysis of Potential Open-Source Tools**
* **2.1. LlamaOCR**
|
||||
* **Description:** LlamaOCR is presented as an open-source Optical Character Recognition (OCR) library built upon the Llama 3.2 Vision model, with the primary function of transforming images into Markdown formatted text.1 This tool aims to simplify workflows by enabling the direct conversion of visual content into an easily editable and structured format.
|
||||
* **TypeScript Integration:** A significant advantage of LlamaOCR is its availability as an npm package, installable via the command npm install llama-ocr.1 This method of distribution directly facilitates its integration into JavaScript and TypeScript-based projects, aligning with the user's preference for TypeScript-friendly solutions. The ease of installation and usage within existing TypeScript environments lowers the barrier to entry for developers in this ecosystem.
|
||||
* **AI Capabilities (Free with API Key):** LlamaOCR leverages the capabilities of the Llama 3.2 Vision model, which is accessible through a free endpoint provided by Together AI.1 To utilize this feature, users are required to register on the Together AI platform to obtain a free API key. While the core AI functionality is offered without direct cost, the dependency on an external service introduces a point of consideration. The "free" tier of Together AI, while allowing for initial use, may have limitations such as rate limits on the number of requests or the volume of data processed within a specific timeframe, as indicated by information regarding Together AI's pricing and rate limits.4 These restrictions could potentially impact the scalability and sustained use of LlamaOCR for users with high-volume processing needs.
|
||||
* **Table Support:** LlamaOCR is described as being proficient at extracting text from images, even those with complex layouts such as tables.1 A key feature highlighted is its "Markdown-First Design," which suggests that the tool aims to directly output the recognized text in Markdown format, preserving the original formatting and structure of the image, including attempting to represent tabular data using Markdown table syntax.1 This implies that LlamaOCR is designed to identify rows and columns within an image and translate them into the corresponding Markdown table structure using pipes (|) and hyphens (-). However, the research material lacks specific examples demonstrating the accuracy and handling of various table complexities, such as merged cells or multi-line content.
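
    For reference, the pipe-and-hyphen syntax such a conversion would target looks like the following. This is a generic illustration of Markdown table syntax, not actual LlamaOCR output:

    ```markdown
    | Item     | Qty | Price |
    | -------- | --- | ----- |
    | Widget A | 2   | $4.00 |
    | Widget B | 1   | $2.50 |
    ```
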
  * **Languages:** Given its distribution as an npm package, LlamaOCR primarily supports JavaScript and TypeScript environments.1 The underlying Llama 3.2 Vision model likely possesses multilingual OCR capabilities, but this is not explicitly detailed within the context of the tool's documentation in the provided snippets.
  * **Getting Started:** The initial setup for LlamaOCR involves a straightforward two-step process: installation via npm install llama-ocr and obtaining a free API key from Together AI.1 Subsequently, within a project, the ocr function can be imported to initiate image processing.1
  * **Example Usage:** A provided code snippet illustrates the basic usage of the ocr function, which takes the path to the image file and the Together AI API key as arguments. The function then asynchronously processes the image and returns the extracted text in Markdown format, which can be logged to the console.1
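
    Based on the snippet described above, usage looks roughly like this. This is a minimal sketch assuming the package's documented `ocr` export and a Together AI key stored in an environment variable:

    ```typescript
    import { ocr } from "llama-ocr";

    // Convert a local image to Markdown via the free Llama 3.2 Vision endpoint.
    // TOGETHER_API_KEY is assumed to hold the key obtained from Together AI.
    const markdown = await ocr({
      filePath: "./receipt.jpg",
      apiKey: process.env.TOGETHER_API_KEY,
    });
    console.log(markdown);
    ```
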
  * **Advanced Use Cases:** The documentation suggests potential for more advanced applications, including automating OCR tasks for multiple files through batch processing and integrating LlamaOCR into web applications to allow users to upload images and receive instant Markdown conversions.1
  * **Roadmap:** Future development plans for LlamaOCR include adding support for OCR on local and remote images, as well as for single and multi-page PDF documents. Additionally, the tool aims to provide output in JSON format alongside the current Markdown output, offering greater flexibility for data processing and integration.2 The current limitation to image file formats might be a constraint for users needing to process PDFs, but the planned expansion indicates ongoing development.
  * **Repository Link:** The primary GitHub repository for LlamaOCR appears to be [https://github.com/Nutlope/llama-ocr](https://github.com/Nutlope/llama-ocr).3 It is also worth noting the existence of other related repositories that utilize the Llama vision model for OCR, such as [https://github.com/MinimalDevops/llama-ocr](https://github.com/MinimalDevops/llama-ocr) 10, which offers a Python-based OCR assistant using Streamlit and Ollama, and [https://github.com/yYorky/LlamaOCR](https://github.com/yYorky/LlamaOCR) 11, which focuses on invoice processing and outputs data in CSV format, potentially indicating table extraction capabilities.
  * **Example Markdown Output:** While the documentation consistently mentions that LlamaOCR outputs in Markdown format 1, the provided research material does not include a specific example demonstrating the conversion of an image containing a table into a Markdown table. Snippets 1 and 2 offer general context about the tool, while 42 shows a Markdown table from a different project (llama\_parse) and 43 presents an example generated by ChatGPT. The absence of a direct LlamaOCR example for table conversion makes it challenging to definitively assess its effectiveness in this crucial aspect.
  * **Free Tier Limits (Together AI):** The research material provides information on the free tier of Together AI, the service that powers LlamaOCR's AI capabilities.4 This tier includes rate limits on requests per minute (RPM) and tokens per minute (TPM) for various models, including image models. Specifically, the free tier for image models has a limit of 60 images per minute, with a lower limit of 10 images per minute for the FLUX.1 \[schnell\] model.4 Users should be aware of these limitations, as exceeding them may require upgrading to a paid tier.
* **2.2. Marker**
  * **Description:** Marker is presented as a Python-based tool designed for the rapid and accurate conversion of a wide range of document formats, including both PDF files and images, into Markdown, JSON, and HTML.12 It is highlighted for its ability to handle complex formatting, including tables, and offers the option to enhance accuracy through the use of Large Language Models (LLMs).
  * **TypeScript Friendly:** Although Marker is primarily developed in Python, it can be integrated into TypeScript projects by executing it as a separate process and then consuming its output, which can be in Markdown or JSON format.12 The research material does not indicate the existence of direct TypeScript bindings or a dedicated TypeScript API for Marker. Therefore, while it can be used in conjunction with TypeScript projects, it does not offer the same level of direct integration as a native TypeScript library.
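
    One plausible integration pattern is to shell out to Marker's CLI and read back the Markdown it writes. The sketch below assumes `marker_single` is on the PATH (installed via pip install marker-pdf) and that output lands in `<output_dir>/<name>/<name>.md`; the exact layout may differ across Marker versions, so verify it against your installation:

    ```typescript
    import { execFile } from "node:child_process";
    import { promisify } from "node:util";
    import { readFile } from "node:fs/promises";
    import { basename, extname, join } from "node:path";

    const run = promisify(execFile);

    // Run Marker as a child process and return the generated Markdown.
    export async function convertWithMarker(inputPath: string, outDir: string): Promise<string> {
      await run("marker_single", [inputPath, "--output_dir", outDir, "--output_format", "markdown"]);
      const name = basename(inputPath, extname(inputPath));
      // Assumed output layout: <outDir>/<name>/<name>.md
      return readFile(join(outDir, name, `${name}.md`), "utf8");
    }
    ```
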
  * **AI Capabilities (Free/Local with Ollama):** A significant feature of Marker is its optional integration with LLMs to improve conversion accuracy, particularly for tasks such as table merging and form extraction.12 Notably, Marker supports the use of locally hosted LLMs through its integration with Ollama.12 This directly addresses the user's requirement for AI that can be hosted locally, offering a free and private alternative to cloud-based AI services, provided the user has the necessary computational resources to run Ollama. Marker also supports cloud-based LLM services like Gemini, which requires an API key, as well as other options like Google Vertex, Claude, and OpenAI.12
  * **Table Support:** Marker is described as being highly proficient in formatting tables extracted from various document formats, including images within PDFs.12 It features a dedicated TableConverter class specifically designed for extracting and converting tabular data.12 This converter can output tables in HTML format and, with the output\_format=json setting, can also provide cell bounding boxes, offering detailed structural information about the extracted tables.12 This specialized focus on table handling suggests that Marker may offer a higher degree of accuracy and flexibility in converting image-based tables to Markdown compared to more general OCR tools.
  * **Languages:** Marker is primarily written in Python.13
  * **Installation and Usage:** Marker can be easily installed using the Python package manager pip with the command pip install marker-pdf. For converting documents other than PDFs, additional dependencies may need to be installed.12 The tool offers a command-line interface for converting single files using the marker\_single command or multiple files within a folder using the marker command with options for specifying the number of parallel processes.12
  * **Configuration:** Marker provides a wide range of command-line flags to control the conversion process. These include options for specifying the output directory (--output\_dir), the output format (--output\_format), whether to use an LLM (--use\_llm), and settings for handling images and OCR (--disable\_image\_extraction, \--force\_ocr, \--strip\_existing\_ocr).12 Language settings can also be specified using the \--languages flag.12 For more advanced configuration, Marker supports the use of a JSON configuration file.12 When using the \--use\_llm flag with Ollama, users can configure the Ollama base URL (--ollama\_base\_url) and the specific model to be used (--ollama\_model).12
  * **LLM Services:** When the \--use\_llm flag is enabled, Marker supports a variety of LLM services, including Gemini (using the Gemini developer API), Google Vertex, Ollama (for local models), Claude (using the Anthropic API), and OpenAI (supporting any OpenAI-like endpoint).12 This provides users with a broad choice of AI models and services to enhance the accuracy of their document conversions.
  * **GPU Support:** Marker is capable of running on GPU, CPU, or MPS (Metal Performance Shaders for Apple silicon), which can significantly improve the speed of the conversion process, especially when utilizing LLMs or processing large volumes of data.12
  * **Repository Link:** The GitHub repository for Marker is located at [https://github.com/VikParuchuri/marker](https://github.com/VikParuchuri/marker).12
  * **Example Markdown Output:** The research material includes examples of Markdown output generated by Marker, which demonstrate its ability to handle tables and images effectively.18 Snippet 18 shows a Markdown file with a figure, and 19 illustrates JSON and Markdown output from processing a research paper, including the extraction of sections and handling of images. Additionally, 44 suggests that Marker is helpful for preserving tables when converting PDFs to Markdown. These examples provide evidence of Marker's capability to convert image-based tables into Markdown format.
  * **Ollama Integration Details:** Marker's integration with Ollama is well-documented in the research material.12 Users can configure the connection to their local Ollama server by specifying the base URL (typically http://localhost:11434) and the name of the desired model (e.g., gemma3:27b) through command-line arguments or within a configuration file.12 This integration allows Marker to leverage the power of locally run LLMs for tasks like improving table accuracy and formatting without requiring an internet connection or external API keys for the LLM itself.
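
    Building on the earlier child-process sketch (reusing its `run` helper), local-LLM mode is a matter of extra CLI flags. The flag names and the base URL come from the Marker documentation cited here; the model name is whatever has been pulled into the local Ollama instance:

    ```typescript
    // Flags for Marker's --use_llm mode backed by a local Ollama server.
    const llmFlags = [
      "--use_llm",
      "--ollama_base_url", "http://localhost:11434",
      "--ollama_model", "gemma3:27b", // any locally pulled model works here
    ];
    await run("marker_single", ["input.pdf", "--output_dir", "out", ...llmFlags]);
    ```
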
* **2.3. MarkItDown**
  * **Description:** MarkItDown is a lightweight Python utility developed by Microsoft for the purpose of converting various file formats to Markdown.13 Its primary focus is on preserving the important structure and content of documents, making it suitable for use with Large Language Models (LLMs) and related text analysis pipelines.
  * **TypeScript Friendly:** MarkItDown is primarily a Python-based tool, and the provided research material does not mention any direct TypeScript bindings or a dedicated TypeScript API.17 Similar to Marker, integration with TypeScript projects would likely involve running MarkItDown as a separate process and consuming its Markdown output.
  * **AI Capabilities (API-based):** MarkItDown supports the use of LLMs for tasks such as generating descriptions for images within the converted documents.17 The documentation provides examples of using OpenAI's API, specifically the gpt-4o model, by instantiating the MarkItDown class with an llm\_client and llm\_model.17 Additionally, it supports using Microsoft's Azure Document Intelligence service for document conversion, which requires providing an endpoint to an Azure Document Intelligence Resource.17 The research material does not indicate any direct integration with locally hosted LLM solutions like Ollama.
  * **Table Support:** MarkItDown is designed to preserve important document structure and content during the conversion to Markdown, explicitly including headings, lists, tables, and links.13 This suggests that the tool aims to accurately represent tabular data from various input formats, including images within documents, in Markdown table format.
  * **Languages:** MarkItDown is written in Python.17
  * **Installation and Usage:** MarkItDown can be installed using pip with the command pip install 'markitdown\[all\]', which installs all optional dependencies required for handling various file formats.17 The tool can be used from the command line by specifying the path to the file to be converted, with an option to specify the output file using the \-o flag or by piping the output.17 It also offers a Python API for programmatic use, allowing developers to integrate its conversion capabilities into their Python applications.17
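
    As with Marker, a TypeScript project would most likely shell out to the CLI. A minimal sketch, assuming `markitdown` is installed and on the PATH and using the \-o flag described above:

    ```typescript
    import { execFile } from "node:child_process";
    import { promisify } from "node:util";

    const run = promisify(execFile);

    // Convert a document to Markdown by invoking MarkItDown's CLI.
    await run("markitdown", ["report.pdf", "-o", "report.md"]);
    ```
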
  * **Supported Formats:** MarkItDown supports a wide range of input file formats, including PDF, PowerPoint, Word, Excel, Images (with EXIF metadata and OCR), Audio (with EXIF metadata and speech transcription), HTML, text-based formats like CSV, JSON, and XML, as well as ZIP files by iterating over their contents.13 This broad format support makes it a versatile tool for handling various types of documents that may contain images and tables.
  * **Plugins:** MarkItDown supports third-party plugins, which can be used to extend its functionality. Plugins are disabled by default but can be enabled using the \--use-plugins command-line option.17
  * **Repository Link:** The GitHub repository for MarkItDown is located at [https://github.com/microsoft/markitdown](https://github.com/microsoft/markitdown).17
  * **Example Markdown Output:** While the research material mentions MarkItDown's focus on preserving document structure, including tables 13, it does not provide a specific example of converting an image of a table directly to Markdown table syntax.
  * **Free or Local AI:** As previously mentioned, MarkItDown's AI capabilities rely on integration with external API services like OpenAI and Azure Document Intelligence.17 This means that for AI-powered features like image descriptions, users would need to have API keys for these services, and usage might incur costs depending on the volume of data processed. There is no indication of support for free or locally hosted AI solutions like Ollama within the provided snippets.
* **2.4. markdownify-mcp**
  * **Description:** markdownify-mcp is described as a Model Context Protocol (MCP) server built using TypeScript, with the purpose of converting various file types and web content into Markdown format.23
  * **TypeScript Friendly:** Being built entirely with TypeScript, markdownify-mcp is inherently friendly to TypeScript developers.23 This provides a significant advantage for users who prefer to work within the JavaScript/TypeScript ecosystem, potentially allowing for easier integration, understanding of the codebase, and customization if needed.
  * **AI Capabilities:** The features list for markdownify-mcp includes "Convert images to Markdown with metadata".23 However, the provided research material does not explicitly mention the use of artificial intelligence in this conversion process. It is possible that the tool relies on more traditional OCR techniques or that details about AI usage are not covered in the available snippets.
  * **Table Support:** While markdownify-mcp can convert images to Markdown, the research material does not provide specific information on how it handles tables within images.23 Snippet 23 suggests that more detailed information regarding the image-to-markdown tool's functionality, including table handling, might be available in the project's README file or by examining the source code in src/tools.ts.
  * **Languages:** The primary programming language for markdownify-mcp is TypeScript.23
  * **Installation and Usage:** To get started with markdownify-mcp, users need to clone the project repository, install dependencies using the pnpm install command, build the project using pnpm run build, and then start the server with pnpm start.23
  * **Tools:** markdownify-mcp provides a set of tools for converting different types of content to Markdown, including pdf-to-markdown, bing-search-to-markdown, webpage-to-markdown, image-to-markdown, and audio-to-markdown.23 The image-to-markdown tool is the most relevant to the user's query.
  * **Repository Link:** The GitHub repository for markdownify-mcp is located at [https://github.com/zcaceres/markdownify-mcp](https://github.com/zcaceres/markdownify-mcp).23
  * **Example Markdown Output:** The provided research material does not include an example of the Markdown output generated by markdownify-mcp, specifically for the image-to-markdown tool and its potential handling of tables. This lack of a concrete example makes it difficult to assess the quality and format of the output for the user's specific requirements.

**3\. Comparison of Tools**

The following table summarizes the key features of the analyzed open-source tools based on the research material:

| Tool/Library | Link | TypeScript Friendly | Table Support | AI Capabilities | Primary Languages |
| :---- | :---- | :---- | :---- | :---- | :---- |
| LlamaOCR | [https://github.com/Nutlope/llama-ocr](https://github.com/Nutlope/llama-ocr) | Yes | Claims support for complex layouts like tables | Free (via Together AI API key) | JavaScript, TypeScript |
| Marker | [https://github.com/VikParuchuri/marker](https://github.com/VikParuchuri/marker) | No | Strong support with dedicated TableConverter | Optional (Ollama for local, others via API) | Python |
| MarkItDown | [https://github.com/microsoft/markitdown](https://github.com/microsoft/markitdown) | No | Aims to preserve tables during conversion | API-based (OpenAI, Azure) | Python |
| markdownify-mcp | [https://github.com/zcaceres/markdownify-mcp](https://github.com/zcaceres/markdownify-mcp) | Yes | Details unclear in snippets | Not explicitly mentioned in snippets | TypeScript |

**4\. Other Potential Solutions and Considerations**

* **Mathpix Snip:** Mathpix Snip is a powerful OCR tool that offers specific features for converting images and PDFs to Markdown tables using AI.24 It is available as a web and mobile application, as well as a desktop snipping tool.24 While it boasts strong table conversion capabilities and supports a wide range of languages 27, it is not strictly open-source and requires a paid subscription for unlimited use beyond the free tier.26 However, for users prioritizing accuracy, especially with STEM content, it remains a noteworthy option for comparison.
* **Konbert:** Konbert is an online converter that utilizes AI-powered OCR to transform JPG and PNG images into Markdown tables.32 It offers a free service for files up to 5MB.32 While it directly addresses the image-to-table conversion requirement using AI, it is not an open-source library and relies on an external online service. Concerns about transparency and reliability have also been raised in reviews regarding a similar tool from the same domain.34
* **Aspose:** Aspose is a commercial library for .NET and Java that provides a wide range of document conversion capabilities, including JPG to Markdown.36 While it is known for high-quality conversion and supports various file formats 37, it does not meet the user's open-source requirement. Reviews suggest generally positive experiences, but potential issues with image quality during conversion and the importance of image resolution for OCR accuracy have been noted.38
* **General Challenges of OCR Accuracy:** It is important to acknowledge the inherent challenges in achieving perfect accuracy with OCR, especially when dealing with complex table layouts, low-resolution or distorted images, and handwritten text.1 Even AI-powered OCR is subject to limitations, and the quality of the output is heavily dependent on the quality of the input image. Users should set realistic expectations and be prepared for potential manual corrections regardless of the tool chosen.

**5\. Recommendations**

Based on the analysis, the following recommendations are provided:

* **LlamaOCR:** Due to its direct TypeScript integration and the availability of a free AI model via Together AI, LlamaOCR appears to be a strong candidate for users primarily working within the TypeScript ecosystem. However, thorough testing of its table conversion accuracy with the user's specific image types is crucial. Users should also be aware of the potential limitations of Together AI's free API tier regarding usage and rate limits.
* **Marker:** For users who require robust table support and prefer the option of locally hosted AI, Marker presents a compelling solution. Its integration with Ollama directly addresses this need. While it is a Python-based tool, the potential benefits of its strong table handling capabilities and local AI option might outweigh the integration efforts required for a TypeScript workflow. Exploring methods for integrating Python processes into TypeScript applications could be beneficial.
* **MarkItDown:** MarkItDown is a viable option for users who need to convert images within various document formats to Markdown and are comfortable with a Python-based tool. Its focus on preserving document structure, including tables, is relevant. However, its AI capabilities rely on cloud-based APIs and do not meet the "free or locally hosted" criterion.
* **markdownify-mcp:** For users who strictly require a TypeScript-based solution, markdownify-mcp is an option. However, given the lack of detailed information about its AI capabilities and table conversion effectiveness in the provided snippets, further investigation of its documentation and source code is recommended before making a decision.

Ultimately, the most suitable tool will depend on the user's specific priorities, technical environment, and the nature of the images and tables they need to convert. Practical testing of the most promising tools with representative samples is highly recommended to determine the best fit.

**6\. Conclusion**

The analysis of the provided research material reveals several open-source tools with the potential to convert images to Markdown, including handling tables and leveraging AI. LlamaOCR stands out for its direct TypeScript integration and free AI (via API), while Marker offers robust table support and the option for locally hosted AI through Ollama. MarkItDown provides broad format support and API-based AI, and markdownify-mcp offers a purely TypeScript-based solution. Each tool presents its own set of strengths and trade-offs concerning TypeScript compatibility, AI implementation, and table handling capabilities. The "perfect" solution will depend on the user's specific needs and priorities. Thorough testing and evaluation of the most promising options are crucial to ensure the selected tool meets the required accuracy and workflow integration demands.

#### **Works cited**

1. Transforming Images into Markdown: A Guide to LlamaOCR \- Cohorte Projects, accessed on April 23, 2025, [https://www.cohorte.co/blog/transforming-images-into-markdown-a-guide-to-llamaocr](https://www.cohorte.co/blog/transforming-images-into-markdown-a-guide-to-llamaocr)
2. Llama OCR: OCR library that converts images to Markdown in three lines of code using the free Llama 3.2 Vision interface \- Chief AI Sharing Circle \- 首席AI分享圈, accessed on April 23, 2025, [https://www.aisharenet.com/en/llama-ocr/](https://www.aisharenet.com/en/llama-ocr/)
3. Nutlope/llama-ocr: Document to Markdown OCR library with Llama 3.2 vision \- GitHub, accessed on April 23, 2025, [https://github.com/Nutlope/llama-ocr](https://github.com/Nutlope/llama-ocr)
4. Rate limits \- Introduction \- Together AI, accessed on April 23, 2025, [https://docs.together.ai/docs/rate-limits](https://docs.together.ai/docs/rate-limits)
5. Is the Together AI API key free? Exploring scalable AI solutions for developers and SMBs, accessed on April 23, 2025, [https://www.byteplus.com/en/topic/552569](https://www.byteplus.com/en/topic/552569)
6. How to use the Free Tier? \- AI/ML API Documentation, accessed on April 23, 2025, [https://docs.aimlapi.com/faq/free-tier](https://docs.aimlapi.com/faq/free-tier)
7. Is the Together API key free? Exploring scalable AI solutions for developers and SMBs, accessed on April 23, 2025, [https://www.byteplus.com/en/topic/554906](https://www.byteplus.com/en/topic/554906)
8. Together Pricing | The Most Powerful Tools at the Best Value, accessed on April 23, 2025, [https://www.together.ai/pricing](https://www.together.ai/pricing)
9. Try These Free, Unlimited AI API Keys for Cursor / Cline \- Hugging Face, accessed on April 23, 2025, [https://huggingface.co/blog/lynn-mikami/free-ai-apis](https://huggingface.co/blog/lynn-mikami/free-ai-apis)
10. MinimalDevops/llama-ocr: llama-ocr using python \- GitHub, accessed on April 23, 2025, [https://github.com/MinimalDevops/llama-ocr](https://github.com/MinimalDevops/llama-ocr)
11. yYorky/LlamaOCR: Effortlessly process invoices with AI\! This project uses the Llama3.2 Vision Model for OCR, converting invoice images into structured, machine-readable tables. Designed for accountants, it automates data extraction and outputs in tabular format ready for ERP integration, improving efficiency and accuracy in invoice management. \- GitHub, accessed on April 23, 2025, [https://github.com/yYorky/LlamaOCR](https://github.com/yYorky/LlamaOCR)
12. VikParuchuri/marker: Convert PDF to markdown \+ JSON quickly with high accuracy \- GitHub, accessed on April 23, 2025, [https://github.com/VikParuchuri/marker](https://github.com/VikParuchuri/marker)
13. Microsoft has released an open source Python tool for converting other document formats to markdown : r/ObsidianMD \- Reddit, accessed on April 23, 2025, [https://www.reddit.com/r/ObsidianMD/comments/1hioaov/microsoft\_has\_released\_an\_open\_source\_python\_tool/](https://www.reddit.com/r/ObsidianMD/comments/1hioaov/microsoft_has_released_an_open_source_python_tool/)
14. Optimal Hardware for Running Ollama Models with Marker for PDF to Markdown Conversion, accessed on April 23, 2025, [https://www.reddit.com/r/ollama/comments/1itbr79/optimal\_hardware\_for\_running\_ollama\_models\_with/](https://www.reddit.com/r/ollama/comments/1itbr79/optimal_hardware_for_running_ollama_models_with/)
15. Ollama Inference Failure and Broken Pipe Error · Issue \#621 · VikParuchuri/marker \- GitHub, accessed on April 23, 2025, [https://github.com/VikParuchuri/marker/issues/621](https://github.com/VikParuchuri/marker/issues/621)
16. Extract Table Info From PDF & Summarise It Using Llama3 via Ollama | LangChain, accessed on April 23, 2025, [https://www.youtube.com/watch?v=hQu8WN8NuVg](https://www.youtube.com/watch?v=hQu8WN8NuVg)
17. microsoft/markitdown: Python tool for converting files and office documents to Markdown. \- GitHub, accessed on April 23, 2025, [https://github.com/microsoft/markitdown](https://github.com/microsoft/markitdown)
18. Vision Parse with Ollama \- Parse PDF Documents into MarkDown Content \- YouTube, accessed on April 23, 2025, [https://www.youtube.com/watch?v=6ilFgwUyuWE](https://www.youtube.com/watch?v=6ilFgwUyuWE)
19. Marker: This Open-Source Tool will make your PDFs LLM Ready \- YouTube, accessed on April 23, 2025, [https://www.youtube.com/watch?v=mdLBr9IMmgI\&pp=0gcJCfcAhR29\_xXO](https://www.youtube.com/watch?v=mdLBr9IMmgI&pp=0gcJCfcAhR29_xXO)
20. Extract Table Info From SCANNED PDF & Summarise It Using Llama3.1 via Ollama, accessed on April 23, 2025, [https://www.youtube.com/watch?v=nkE65p42RgM](https://www.youtube.com/watch?v=nkE65p42RgM)
21. Microsoft open sources a markdown library to convert documents to markdown \- Community, accessed on April 23, 2025, [https://community.openai.com/t/microsoft-open-sources-a-markdown-library-to-convert-documents-to-markdown/1061731](https://community.openai.com/t/microsoft-open-sources-a-markdown-library-to-convert-documents-to-markdown/1061731)
22. MarkItDown: Python tool for converting files and office documents to Markdown | Hacker News, accessed on April 23, 2025, [https://news.ycombinator.com/item?id=42410803](https://news.ycombinator.com/item?id=42410803)
23. zcaceres/markdownify-mcp: A Model Context Protocol ... \- GitHub, accessed on April 23, 2025, [https://github.com/zcaceres/markdownify-mcp](https://github.com/zcaceres/markdownify-mcp)
24. OCR-powered Markdown Table Generator \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/blog/ocr-powered-markdown-table-generator](https://mathpix.com/blog/ocr-powered-markdown-table-generator)
25. Mathpix now supports basic table OCR, accessed on April 23, 2025, [https://mathpix.com/blog/v1-table-recognition](https://mathpix.com/blog/v1-table-recognition)
26. Snip Apps \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/snip](https://mathpix.com/snip)
27. All Supported Languages \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/language-support](https://mathpix.com/language-support)
28. Convert API User Guide: Supported Languages \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/docs/convert/supported\_languages](https://mathpix.com/docs/convert/supported_languages)
29. Snip web app now translated into 14 languages \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/blog/multi-language-interface](https://mathpix.com/blog/multi-language-interface)
30. The best OCR for Chinese and math \- Mathpix, accessed on April 23, 2025, [https://mathpix.com/blog/ocr-chinese-characters](https://mathpix.com/blog/ocr-chinese-characters)
31. Mathpix Pricing, accessed on April 23, 2025, [https://mathpix.com/pricing](https://mathpix.com/pricing)
32. Convert JPG to Markdown Table \- Konbert, accessed on April 23, 2025, [https://konbert.com/convert/jpeg/to/markdown](https://konbert.com/convert/jpeg/to/markdown)
33. Convert PNG to Markdown Table, accessed on April 23, 2025, [https://konbert.com/convert/png/to/markdown](https://konbert.com/convert/png/to/markdown)
34. tool to convert pdf to markdown and keep all the formatting, tables, images etc. \- Reddit, accessed on April 23, 2025, [https://www.reddit.com/r/ObsidianMD/comments/1jkmdx9/tool\_to\_convert\_pdf\_to\_markdown\_and\_keep\_all\_the/](https://www.reddit.com/r/ObsidianMD/comments/1jkmdx9/tool_to_convert_pdf_to_markdown_and_keep_all_the/)
35. Convert JPG to Markdown table \- table.studio | The AI Spreadsheet, accessed on April 23, 2025, [https://table.studio/convert/jpeg/to/markdown](https://table.studio/convert/jpeg/to/markdown)
36. Convert JPG To Markdown Online \- Aspose Products, accessed on April 23, 2025, [https://products.aspose.app/words/conversion/jpg-to-md](https://products.aspose.app/words/conversion/jpg-to-md)
37. Aspose.Cells-Cloud 25.3.0 \- NuGet Gallery, accessed on April 23, 2025, [https://www.nuget.org/packages/Aspose.Cells-Cloud](https://www.nuget.org/packages/Aspose.Cells-Cloud)
38. Aspose.OCR fails to read simple JPEG files \- Stack Overflow, accessed on April 23, 2025, [https://stackoverflow.com/questions/45921387/aspose-ocr-fails-to-read-simple-jpeg-files](https://stackoverflow.com/questions/45921387/aspose-ocr-fails-to-read-simple-jpeg-files)
39. Low image quality when converting Word to Markdown \- Free Support Forum \- aspose.com, accessed on April 23, 2025, [https://forum.aspose.com/t/low-image-quality-when-converting-word-to-markdown/269407](https://forum.aspose.com/t/low-image-quality-when-converting-word-to-markdown/269407)
40. Llama-OCR: Document to Markdown | Hacker News, accessed on April 23, 2025, [https://news.ycombinator.com/item?id=42154410](https://news.ycombinator.com/item?id=42154410)
41. LlamaOCR.com – Document to markdown, accessed on April 23, 2025, [https://llamaocr.com/](https://llamaocr.com/)
42. markdown output for table in pdf is incorrect · Issue \#167 · run-llama/llama\_cloud\_services, accessed on April 23, 2025, [https://github.com/run-llama/llama\_parse/issues/167](https://github.com/run-llama/llama_parse/issues/167)
43. From Screenshots to Markdown Tables with LLMs \- Shekhar Gulati, accessed on April 23, 2025, [https://shekhargulati.com/2024/07/22/from-screenshots-to-markdown-tables-with-llms/](https://shekhargulati.com/2024/07/22/from-screenshots-to-markdown-tables-with-llms/)
44. What model would you use to extract full pdf? : r/ollama \- Reddit, accessed on April 23, 2025, [https://www.reddit.com/r/ollama/comments/1gc8je1/what\_model\_would\_you\_use\_to\_extract\_full\_pdf/](https://www.reddit.com/r/ollama/comments/1gc8je1/what_model_would_you_use_to_extract_full_pdf/)
45. getomni-ai/zerox \- GitHub, [https://github.com/getomni-ai/zerox](https://github.com/getomni-ai/zerox?tab=readme-ov-file)

@ -1,32 +0,0 @@
import { Arguments } from 'yargs';
import { Logger } from 'tslog';
import { ConvertCommandSchema, ConvertCommandConfig } from '../types.js';
import { existsSync } from 'node:fs';
import * as z from 'zod';
import { runConversion } from '../lib/convert.js';

export const command = 'convert';
export const desc = 'Convert PDF to images';

export async function handler(argv: Arguments<ConvertCommandConfig>): Promise<void> {
    const logger = new Logger();
    try {
        const config = ConvertCommandSchema.parse(argv);
        if (!existsSync(config.input)) {
            throw new Error(`Input file ${config.input} does not exist`);
        }
        logger.info("Calling conversion library function...");
        const outputFiles = await runConversion(config, logger);
        logger.info(`Conversion completed successfully`);
        logger.info(`Generated ${outputFiles.length} images`);
    } catch (error) {
        if (error instanceof z.ZodError) {
            logger.error('Invalid arguments:', error.flatten());
        } else {
            const message = error instanceof Error ? error.message : String(error);
            logger.error('Error during conversion command:', message, error);
        }
        process.exit(1);
    }
}

@ -1,5 +0,0 @@
/**
 * Default output path template when no output is specified.
 * Variables: ${SRC_DIR}, ${SRC_NAME}, ${PAGE}, ${FORMAT}
 */
export const DEFAULT_OUTPUT_TEMPLATE = "${SRC_DIR}/${SRC_NAME}_${PAGE}.${FORMAT}";

@ -1,19 +0,0 @@
import yargs from 'yargs';
import { hideBin } from 'yargs/helpers';
import * as convertCommand from './commands/convert.js';
import type { CommandModule } from 'yargs';
import { ConvertCommandConfig, ConvertCommandArgsSchema } from './types.js';
import { toYargs } from '@polymech/commons';

const commandModule: CommandModule<{}, ConvertCommandConfig> = {
    command: convertCommand.command,
    describe: convertCommand.desc,
    builder: (yargs) => toYargs(yargs, ConvertCommandArgsSchema),
    handler: convertCommand.handler
};

yargs(hideBin(process.argv))
    .command(commandModule)
    .demandCommand(1, 'You need to specify a command')
    .help()
    .parse();

@ -1,129 +0,0 @@
import { Logger } from "tslog";
import { statSync } from "node:fs";
import { sep, resolve as pathResolve, parse as pathParse, relative as pathRelative } from "node:path";
import { readFile } from "node:fs/promises";
import { DEFAULT_ROOTS, DEFAULT_VARS, pathInfoEx } from "@polymech/commons";
import { convertPdfToImages } from "./pdf.js";
import { DEFAULT_OUTPUT_TEMPLATE } from "../constants.js";
import type { ConvertCommandConfig } from "../types.js";

/**
 * Runs the PDF to images conversion process.
 * Generates variables, determines output path, reads PDF, and calls the conversion engine.
 * @param config - The conversion configuration options (inferred from Zod schema).
 * @param logger - The logger instance to use for logging.
 * @returns A promise that resolves with an array of generated image file paths.
 */
export async function runConversion(config: ConvertCommandConfig, logger: Logger<any>): Promise<string[]> {
    const inputPath = pathResolve(config.input);
    // Basic path.parse info is always available; pathInfoEx adds extras when it works.
    const parsed = pathParse(inputPath);
    let srcInfo: any = {};
    try {
        srcInfo = pathInfoEx(inputPath);
    } catch (e) {
        logger.warn("pathInfoEx not found or failed, using basic path.parse");
    }
    srcInfo = {
        ...srcInfo,
        SRC_DIR: parsed.dir,
        SRC_NAME: parsed.name,
        SRC_EXT: parsed.ext,
    };

    let baseVariables: Record<string, any> = {
        ...DEFAULT_ROOTS,
        ...DEFAULT_VARS({}),
        ...srcInfo,
        DPI: config.dpi,
        FORMAT: config.format,
    };

    if (baseVariables.ROOT && baseVariables.SRC_DIR) {
        baseVariables.SRC_REL = pathRelative(baseVariables.ROOT, baseVariables.SRC_DIR);
    }

    const srcName = baseVariables.SRC_NAME || '';
    const dashed = srcName.split('-');
    if (dashed.length > 1) {
        for (let i = 0; i < dashed.length; i++) {
            baseVariables[`SRC_NAME-${i}`] = dashed[i];
        }
    }
    const dotted = srcName.split('.');
    if (dotted.length > 1) {
        for (let i = 0; i < dotted.length; i++) {
            baseVariables[`SRC_NAME.${i}`] = dotted[i];
        }
    }
    const underscored = srcName.split('_');
    if (underscored.length > 1) {
        for (let i = 0; i < underscored.length; i++) {
            baseVariables[`SRC_NAME_${i}`] = underscored[i];
        }
    }

    // Process var-* arguments directly from config object passed in
    const cliVars = Object.keys(config).filter(k => k.startsWith('var-')).reduce((acc, k) => {
        acc[k.replace('var-', '').toUpperCase()] = config[k];
        return acc;
    }, {} as Record<string, any>);

    // Uppercase base variable keys
    baseVariables = Object.keys(baseVariables).reduce((acc, key) => {
        acc[key.toUpperCase()] = baseVariables[key];
        return acc;
    }, {} as Record<string, any>);

    baseVariables = { ...baseVariables, ...cliVars };

    let outputPathTemplate: string;
    let isExplicitDir = false;

    if (config.output) {
        const outputPath = pathResolve(config.output);
        try {
            const stats = statSync(outputPath);
            if (stats.isDirectory()) {
                isExplicitDir = true;
            }
        } catch (e: any) {
            // Path does not exist yet: treat a trailing separator as "directory".
            isExplicitDir = config.output.endsWith(sep) || config.output.endsWith("/");
        }

        if (isExplicitDir) {
            baseVariables["OUT_DIR"] = outputPath;
            outputPathTemplate = "${OUT_DIR}/${SRC_NAME}_${PAGE}.${FORMAT}";
            logger.info(`Output directory specified: ${outputPath}`);
        } else {
            outputPathTemplate = config.output;
            logger.info(`Using output path pattern: ${outputPathTemplate}`);
        }
    } else {
        // Use default pattern directly from constant
        outputPathTemplate = DEFAULT_OUTPUT_TEMPLATE;
        logger.info(`Using default output path pattern: ${outputPathTemplate}`);
    }

    // --- Read PDF and Call Conversion (moved from commands/convert.ts) ---
    logger.info(`Reading PDF: ${config.input}`);
    const pdfData = await readFile(config.input);

    logger.info(`Starting conversion process...`);
    const outputFiles = await convertPdfToImages(pdfData, {
        baseVariables,
        outputPathTemplate,
        dpi: config.dpi,
        format: config.format,
        scale: config.scale,
        startPage: config.startPage,
        endPage: config.endPage,
        logger
    });

    return outputFiles;
}

@ -1,116 +0,0 @@
import * as mupdf from 'mupdf'
import { Logger } from 'tslog'
import { dirname } from 'node:path'
import { resolveVariables } from '@polymech/commons'
import { sync as mkdir } from '@polymech/fs/dir'
import { writeFileSync } from 'node:fs'
import { Buffer } from 'node:buffer'

// Helper function to convert object-like image data to Buffer
function imageDataObjectToBuffer(imageDataObject: Record<string, number>): Buffer {
    const keys = Object.keys(imageDataObject).map(Number).sort((a, b) => a - b);
    const bufferLength = keys.length > 0 ? keys[keys.length - 1] + 1 : 0; // Determine length based on max index + 1
    const buffer = Buffer.alloc(bufferLength); // Zero-filled, so any gaps in a sparse object stay zero

    for (const key in imageDataObject) {
        if (Object.prototype.hasOwnProperty.call(imageDataObject, key)) {
            const index = parseInt(key, 10);
            if (!isNaN(index) && index >= 0 && index < bufferLength) {
                buffer[index] = imageDataObject[key];
            }
        }
    }
    return buffer;
}

export type ImageFormat = 'png' | 'jpg';

export interface PdfToImageOptions {
    baseVariables: Record<string, any>;
    outputPathTemplate: string;
    dpi: number;
    scale?: number;
    format: ImageFormat;
    startPage?: number;
    endPage?: number;
    logger?: Logger<any>;
}

export async function convertPdfToImages(
    pdfData: Buffer,
    options: PdfToImageOptions
): Promise<string[]> {
    const logger = options.logger || new Logger<any>();
    const outputFiles: string[] = [];

    try {
        const doc = mupdf.Document.openDocument(pdfData, 'pdf');
        const pageCount = doc.countPages();

        // Validate and determine page range (adjusting for 0-based index)
        const start = (options.startPage ?? 1) - 1;
        const end = (options.endPage ?? pageCount) - 1;

        if (start < 0 || start >= pageCount) {
            throw new Error(`startPage (${options.startPage}) is out of valid range (1-${pageCount})`);
        }
        if (end < 0 || end >= pageCount) {
            throw new Error(`endPage (${options.endPage}) is out of valid range (1-${pageCount})`);
        }
        if (start > end) {
            // This should also be caught by Zod schema, but good to double-check
            throw new Error(`startPage (${options.startPage}) cannot be greater than endPage (${options.endPage})`);
        }

        const numPagesToProcess = end - start + 1;
        logger.info(`Processing pages ${start + 1} to ${end + 1} (${numPagesToProcess} pages) of ${pageCount} total`);

        // Determine the scaling matrix
        const scaleValue = options.scale ?? 2;
        const matrix = scaleValue === 1 ? mupdf.Matrix.identity : mupdf.Matrix.scale(scaleValue, scaleValue);

        logger.info(`Using scale factor: ${scaleValue}`);

        for (let i = start; i <= end; i++) {
            const pageNumber = i + 1; // User-facing page number (1-based)

            // Create page-specific variables
            const pageVariables: Record<string, string> = {
                ...options.baseVariables,
                PAGE: pageNumber.toString()
            };

            // Resolve the output path using the template and page-specific variables
            const outputPath = await resolveVariables(options.outputPathTemplate, false, pageVariables);

            const page = doc.loadPage(i);
            // Use the scaling matrix here
            const pixmap = page.toPixmap(
                matrix,
                mupdf.ColorSpace.DeviceRGB,
                false
            );

            // Note: DPI is implicitly handled by the scaling factor now.
            // The pixmap dimensions will be scaled * scaleFactor.
            // We might want to remove the explicit DPI option later if it's confusing.

            const imageData = options.format === 'png'
                ? pixmap.asPNG()
                : pixmap.asJPEG(60, false);

            mkdir(dirname(outputPath));
            writeFileSync(outputPath, imageDataObjectToBuffer(imageData as any))
            outputFiles.push(outputPath);
            logger.info(`Converted page ${pageNumber} to ${outputPath}`);
        }

        return outputFiles;
    } catch (error) {
        logger.error('Error converting PDF to images:', error);
        throw error;
    }
}

@ -1,46 +0,0 @@
import { z } from 'zod';

// Define the base shape for arguments
export const ConvertCommandArgsSchema = z.object({
    input: z.string().describe('Path to the input PDF file'),
    output: z.string().describe('Output path template (e.g., output/page_${PAGE}.png)').optional(),
    dpi: z.number().int().positive().default(300).describe('Resolution for the output images'),
    scale: z.number().positive().default(2).describe('Scaling factor to apply before rendering (e.g., 2 for 2x size)').optional(),
    format: z.enum(['png', 'jpg']).default('png').describe('Output image format'),
    startPage: z.number().int().positive().describe('First page to convert (1-based index)').optional(),
    endPage: z.number().int().positive().describe('Last page to convert (1-based index)').optional()
});

// Add refinements, transformations, and catchall for final validation/parsing
export const ConvertCommandSchema = ConvertCommandArgsSchema
    .catchall(z.any()) // Allow var-* and other properties
    .transform((data) => {
        // Explicitly pick known fields + extras (var-*)
        const known = {
            input: data.input,
            output: data.output,
            dpi: data.dpi,
            format: data.format,
            startPage: data.startPage,
            endPage: data.endPage,
            scale: data.scale,
        };
        // Keep only extra properties (like var-*)
        const extras = Object.keys(data)
            .filter(key => !['input', 'output', 'dpi', 'format', 'startPage', 'endPage', 'scale', '_', '$0'].includes(key))
            .reduce((acc, key) => { acc[key] = data[key]; return acc; }, {} as any);

        return { ...known, ...extras };
    })
    .refine((data) => {
        if (data.startPage !== undefined && data.endPage !== undefined) {
            return data.startPage <= data.endPage;
        }
        return true;
    }, {
        message: "startPage must be less than or equal to endPage",
        path: ["startPage"],
    });

export type ConvertCommandConfig = z.infer<typeof ConvertCommandSchema>;

@ -1,13 +0,0 @@
%PDF-1.4
1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj 2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj 3 0 obj<</Type/Page/MediaBox[0 0 3 3]>>endobj
xref
0 4
0000000000 65535 f
0000000010 00000 n
0000000053 00000 n
0000000102 00000 n

trailer<</Size 4/Root 1 0 R>>
startxref
149
%%EOF
@ -1,254 +0,0 @@
// Test suite for src/commands/convert.ts
import { describe, it, expect, vi, beforeEach, Mock, beforeAll } from 'vitest';
// Import types first
import type { ConvertCommandConfig } from '../../src/types.js';
import type { Arguments } from 'yargs';
// Remove Buffer import if readFile is no longer mocked directly
// import { Buffer } from 'node:buffer';
// Import path for constants only if needed, remove specific utils
import path from 'path';

// --- Define Mock Functions ---
// Keep only mocks needed for the simplified handler
const mockRunConversion = vi.fn(); // Mock the library function
const mockExistsSync = vi.fn();
const mockLoggerInfo = vi.fn();
const mockLoggerError = vi.fn();
const mockProcessExit = vi.spyOn(process, 'exit').mockImplementation((() => {}) as any);

// Remove mocks for functions no longer directly called by handler
// const mockConvertPdfToImagesFn = vi.fn();
// const mockStatSync = vi.fn();
// const mockReadFile = vi.fn();
// const mockMkdir = vi.fn();
// const mockDirname = vi.fn();
// const mockBasename = vi.fn();
// const mockExtname = vi.fn();
// const mockResolve = vi.fn();
// const mockParse = vi.fn();
// const mockRelative = vi.fn();
// const mockResolveVariables = vi.fn();
// const mockPathInfoEx = vi.fn();
// const mockDEFAULT_ROOTS = { CWD: '/test/cwd', SCRIPT_DIR: '/test/script' };
// const mockDEFAULT_VARS = vi.fn().mockReturnValue({ SOME_DEFAULT: 'value' });

// Use beforeAll for mocks
beforeAll(() => {
    // Mock dependencies using vi.doMock
    // Remove unused mocks
    // vi.doMock('../../src/lib/pdf.js', ...)
    vi.doMock('../../src/lib/convert.js', () => ({
        runConversion: mockRunConversion, // Mock the refactored library function
    }));
    vi.doMock('node:fs', () => ({
        existsSync: mockExistsSync,
        // statSync: mockStatSync,
    }));
    // vi.doMock('node:fs/promises', ...)
    // vi.doMock('node:path', ...)
    vi.doMock('tslog', () => ({
        Logger: vi.fn().mockImplementation(() => ({
            info: mockLoggerInfo,
            error: mockLoggerError,
        })),
    }));
    // vi.doMock('@polymech/commons', ...)
});

// --- Test Suite ---
describe('Convert Command CLI Handler', () => {
    let convertHandler: typeof import('../../src/commands/convert.js').handler;

    // Import the handler after mocks are set
    beforeAll(async () => {
        await vi.dynamicImportSettled(); // Ensure mocks are applied
        const commandModule = await import('../../src/commands/convert.js');
        convertHandler = commandModule.handler;
    });

    // --- Helper Function to Run Handler ---
    // Helper remains largely the same
    async function runHandlerHelper(args: Partial<ConvertCommandConfig & { _: (string | number)[], $0: string, output?: string }>) {
        // Add default values for required fields if not provided in args,
        // reflecting what yargs + schema default would do.
        const fullArgs = {
            _: ['convert'],
            $0: 'test',
            dpi: 300,
            format: 'png',
            ...args,
        } as Arguments<ConvertCommandConfig & { output?: string }>;

        // Need to simulate the full argv object including potential var-* args
        // Zod schema parsing happens inside the handler now.

        if (!convertHandler) throw new Error('Handler not loaded');
        await convertHandler(fullArgs);
    }

    beforeEach(() => {
        vi.clearAllMocks();
        // Reset only necessary mocks
        mockRunConversion.mockResolvedValue(['path/to/image1.png']); // Default success
        mockExistsSync.mockReturnValue(true);
        // Removed resets for unused mocks
        mockProcessExit.mockClear();
    });

    // --- Updated Test cases ---
    it('should call runConversion with config when output is omitted (uses default)', async () => {
        const args = {
            input: 'pdfs/document.pdf',
            // output is omitted
        };

        await runHandlerHelper(args);

        expect(mockExistsSync).toHaveBeenCalledWith(args.input);
        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        // Check the config object passed to runConversion
        expect(mockRunConversion).toHaveBeenCalledWith(
            expect.objectContaining({
                input: args.input,
                output: undefined, // Output should be undefined in the config passed from handler
                dpi: 300, // Default DPI
                format: 'png', // Default format
                // other args like startPage/endPage should be undefined
            }),
            expect.anything() // Logger instance
        );
        // Verify final success logs are called
        expect(mockLoggerInfo).toHaveBeenCalledWith('Conversion completed successfully');
        expect(mockLoggerInfo).toHaveBeenCalledWith(expect.stringContaining('Generated')); // Check for generated count message
        expect(mockProcessExit).not.toHaveBeenCalled();
    });

    it('should call runConversion with custom output path template when provided', async () => {
        const customPattern = 'images/custom_${SRC_NAME}_page${PAGE}.${FORMAT}';
        const args = {
            input: 'in.pdf',
            output: customPattern,
        };

        await runHandlerHelper(args);

        expect(mockExistsSync).toHaveBeenCalledWith(args.input);
        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        expect(mockRunConversion).toHaveBeenCalledWith(
            expect.objectContaining({
                input: args.input,
                output: customPattern, // Expect the custom pattern string
            }),
            expect.anything() // Logger instance
        );
        expect(mockProcessExit).not.toHaveBeenCalled();
    });

    it('should call runConversion with output path when it is a directory', async () => {
        const dirPath = 'output/images/';
        const args = {
            input: 'some/path/doc.pdf',
            output: dirPath,
        };

        await runHandlerHelper(args);

        expect(mockExistsSync).toHaveBeenCalledWith(args.input);
        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        expect(mockRunConversion).toHaveBeenCalledWith(
            expect.objectContaining({
                input: args.input,
                output: dirPath, // Expect the directory path string
            }),
            expect.anything()
        );
        expect(mockProcessExit).not.toHaveBeenCalled();
    });

    // Test for specific args being passed through
    it('should call runConversion with specific args', async () => {
        const args = {
            input: 'input.pdf',
            output: 'output/prefix',
            dpi: 150,
            format: 'jpg' as const,
            startPage: 2,
            endPage: 5,
        };

        await runHandlerHelper(args);

        expect(mockExistsSync).toHaveBeenCalledWith(args.input);
        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        expect(mockRunConversion).toHaveBeenCalledWith(
            expect.objectContaining({
                input: args.input,
                output: args.output,
                dpi: args.dpi,
                format: args.format,
                startPage: args.startPage,
                endPage: args.endPage,
            }),
            expect.anything()
        );
        expect(mockProcessExit).not.toHaveBeenCalled();
    });

    // Test for var-* args being passed through
    it('should pass var-* arguments to runConversion', async () => {
        const args = {
            input: 'input.pdf',
            'var-MY_VAR': 'myValue',
            'var-OTHER': 123
        };

        await runHandlerHelper(args);

        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        expect(mockRunConversion).toHaveBeenCalledWith(
            expect.objectContaining({
                input: args.input,
                'var-MY_VAR': 'myValue', // Zod schema with catchall should preserve these
                'var-OTHER': 123,
            }),
            expect.anything()
        );
        expect(mockProcessExit).not.toHaveBeenCalled();
    });

    // --- Error Handling Tests ---
    it('should handle missing input file', async () => {
        mockExistsSync.mockReturnValue(false);
        const args = { input: 'nonexistent.pdf' }; // Output is optional
        await runHandlerHelper(args);

        expect(mockRunConversion).not.toHaveBeenCalled(); // Should not be called
        // Check logger error message (updated)
        expect(mockLoggerError).toHaveBeenCalledWith(
            "Error during conversion command:", // Updated error context
            expect.stringContaining('Input file nonexistent.pdf does not exist'),
            expect.any(Error)
        );
        expect(mockProcessExit).toHaveBeenCalledWith(1);
    });

    it('should handle conversion error from runConversion', async () => {
        const conversionError = new Error('Conversion library failed');
        mockRunConversion.mockRejectedValue(conversionError); // Mock runConversion to throw
        const args = { input: 'in.pdf', output: 'out' };

        await runHandlerHelper(args);

        expect(mockRunConversion).toHaveBeenCalledTimes(1);
        // Check logger error message (updated)
        expect(mockLoggerError).toHaveBeenCalledWith(
            "Error during conversion command:", // Updated error context
            conversionError.message,
            conversionError
        );
        expect(mockProcessExit).toHaveBeenCalledWith(1);
    });

    // Remove tests checking internal logic that was moved (e.g., mkdir calls)
});

@ -1,61 +0,0 @@
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import { execSync } from 'node:child_process';
import { existsSync, rmSync, readdirSync } from 'node:fs';
import * as path from 'node:path';

const packageRoot = process.cwd(); // Assumes test runs from package root
const inputPdf = path.join('tests', 'RS485-780.pdf');
const outputDir = path.join(packageRoot, 'tests', 'out', 'RS485-780');
const outputPattern = '${SRC_DIR}/out/${SRC_NAME}/${SRC_NAME}-${PAGE}.jpg';

// Expected number of pages for RS485-780.pdf
const expectedPageCount = 29;
const expectedBaseName = 'RS485-780';
const expectedFormat = 'jpg'; // Default format

describe('CLI Integration Test - Variable Output Path', () => {
    beforeAll(() => {
        if (existsSync(outputDir)) {
            rmSync(outputDir, { recursive: true, force: true });
        }
    });

    afterAll(() => {
        if (existsSync(outputDir)) {
            // rmSync(outputDir, { recursive: true, force: true }); // Optional: clean up after tests
        }
    });

    it('should create images in the correct directory with the correct filenames using variable substitution', () => {
        // Construct the command
        // Ensure paths in the command are relative to the execution directory if needed,
        // but here inputPdf is relative, and outputPattern relies on internal resolution.
        // Quote the output pattern for safety in the shell.
        const command = `node dist/index.js convert --input "${inputPdf}" --output "${outputPattern}"`;

        // Execute the command
        let commandOutput = '';
        try {
            // Use { stdio: 'pipe' } to potentially suppress noisy output or capture errors
            commandOutput = execSync(command, { encoding: 'utf8', stdio: 'pipe' });
            console.log('Command execution output:', commandOutput);
        } catch (error: any) {
            // If the command fails, log the error and fail the test
            console.error('Command execution failed:', error.stderr || error.stdout || error.message);
            expect.fail(`Command execution failed: ${error.message}`);
        }

        // 1. Check if the output directory exists
        expect(existsSync(outputDir), `Output directory "${outputDir}" should exist`).toBe(true);

        // 2. Check the number of files created
        const files = readdirSync(outputDir);
        expect(files.length, `Should have created ${expectedPageCount} files`).toBe(expectedPageCount);

        // 3. Check filenames
        for (let i = 1; i <= expectedPageCount; i++) {
            const expectedFilename = `${expectedBaseName}-${i}.${expectedFormat}`;
            expect(files, `File list should include "${expectedFilename}"`).toContain(expectedFilename);
        }
    });
});
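For context, the expansion that outputPattern depends on can be sketched as follows. The real substitution happens inside the CLI (presumably via the substitute helper from @polymech/commons), so this standalone version is illustrative only:

import * as path from 'node:path';

// Expand ${VAR} placeholders from values derived from the input file and page number.
function expandPattern(pattern: string, inputPdf: string, page: number): string {
    const vars: Record<string, string> = {
        SRC_DIR: path.dirname(inputPdf),                            // 'tests'
        SRC_NAME: path.basename(inputPdf, path.extname(inputPdf)),  // 'RS485-780'
        PAGE: String(page),
    };
    return pattern.replace(/\$\{(\w+)\}/g, (match, name) => vars[name] ?? match);
}

// expandPattern('${SRC_DIR}/out/${SRC_NAME}/${SRC_NAME}-${PAGE}.jpg', 'tests/RS485-780.pdf', 1)
// yields 'tests/out/RS485-780/RS485-780-1.jpg', matching the assertions above.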
@ -1,188 +0,0 @@
// Test suite for src/lib/pdf.ts
import { describe, it, expect, vi, beforeEach, Mock, beforeAll } from 'vitest';
import { convertPdfToImages, PdfToImageOptions } from '../../src/lib/pdf';
import { Logger } from 'tslog';
import { Buffer } from 'node:buffer';

// --- Define Mock Functions FIRST ---
const mockWriteFile = vi.fn();
const mockAsPNG = vi.fn();
const mockAsJPEG = vi.fn();
const mockToPixmap = vi.fn();
const mockLoadPage = vi.fn();
const mockCountPages = vi.fn();
const mockOpenDocument = vi.fn();
const mockLoggerInfo = vi.fn();
const mockLoggerError = vi.fn();

// Use beforeAll to ensure mocks are set before tests run
beforeAll(() => {
    // Use vi.doMock for non-hoisted mocking
    vi.doMock('node:fs/promises', () => ({
        writeFile: mockWriteFile,
    }));
    vi.doMock('mupdf', () => ({
        Document: {
            openDocument: mockOpenDocument,
        },
        ColorSpace: {
            DeviceRGB: 'DeviceRGB'
        }
    }));
    vi.doMock('tslog', () => ({
        Logger: vi.fn().mockImplementation(() => ({
            info: mockLoggerInfo,
            error: mockLoggerError,
        })),
    }));
});

// --- Dynamic Import and Test Execution ---
async function runTests() {
    // Dynamically import the module *after* mocks are set up
    const { convertPdfToImages } = await import('../../src/lib/pdf');

    describe('convertPdfToImages Function', () => {
        let convertPdfToImages: typeof import('../../src/lib/pdf').convertPdfToImages;

        // Import the actual function *after* mocks are applied
        beforeAll(async () => {
            // Ensure mocks are ready before importing the module that uses them
            await vi.dynamicImportSettled();
            const pdfLib = await import('../../src/lib/pdf');
            convertPdfToImages = pdfLib.convertPdfToImages;
        });

        const mockPdfData = Buffer.from('mock-pdf-data');
        const baseOptions: Omit<PdfToImageOptions, 'outputPathPrefix'> = {
            dpi: 300,
            format: 'png',
        };

        // Mocks returned by other mocks need to be configured within beforeEach
        const mockPixmap = { asPNG: mockAsPNG, asJPEG: mockAsJPEG };
        const mockPage = { toPixmap: mockToPixmap };
        const mockDoc = { countPages: mockCountPages, loadPage: mockLoadPage };

        beforeEach(() => {
            vi.clearAllMocks();

            // Configure mock implementations/return values
            mockWriteFile.mockResolvedValue(undefined);
            mockAsPNG.mockReturnValue(Buffer.from('mock-png-data'));
            mockAsJPEG.mockReturnValue(Buffer.from('mock-jpg-data'));
            mockToPixmap.mockReturnValue(mockPixmap);
            mockLoadPage.mockReturnValue(mockPage);
            mockCountPages.mockReturnValue(5);
            mockOpenDocument.mockReturnValue(mockDoc);
        });

        it('should convert all pages to PNG by default', async () => {
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'output/image' };
            const result = await convertPdfToImages(mockPdfData, options);

            expect(mockOpenDocument).toHaveBeenCalledWith(mockPdfData, 'pdf');
            expect(mockCountPages).toHaveBeenCalled();
            expect(mockLoadPage).toHaveBeenCalledTimes(5);
            expect(mockToPixmap).toHaveBeenCalledTimes(5);
            expect(mockAsPNG).toHaveBeenCalledTimes(5);
            expect(mockAsJPEG).not.toHaveBeenCalled();
            expect(mockWriteFile).toHaveBeenCalledTimes(5);
            expect(mockWriteFile).toHaveBeenNthCalledWith(1, 'output/image_1.png', Buffer.from('mock-png-data'));
            expect(mockWriteFile).toHaveBeenNthCalledWith(5, 'output/image_5.png', Buffer.from('mock-png-data'));
            expect(result).toEqual([
                'output/image_1.png',
                'output/image_2.png',
                'output/image_3.png',
                'output/image_4.png',
                'output/image_5.png',
            ]);
        });

        it('should convert specified page range to JPG', async () => {
            const options: PdfToImageOptions = {
                ...baseOptions,
                outputPathPrefix: 'jpg_images/page',
                format: 'jpg',
                startPage: 2,
                endPage: 4,
            };
            const result = await convertPdfToImages(mockPdfData, options);

            expect(mockLoadPage).toHaveBeenCalledTimes(3); // Pages 2, 3, 4
            expect(mockLoadPage).toHaveBeenNthCalledWith(1, 1); // 0-based index for page 2
            expect(mockLoadPage).toHaveBeenNthCalledWith(3, 3); // 0-based index for page 4
            expect(mockToPixmap).toHaveBeenCalledTimes(3);
            expect(mockAsJPEG).toHaveBeenCalledTimes(3);
            expect(mockAsPNG).not.toHaveBeenCalled();
            expect(mockWriteFile).toHaveBeenCalledTimes(3);
            expect(mockWriteFile).toHaveBeenNthCalledWith(1, 'jpg_images/page_2.jpg', Buffer.from('mock-jpg-data'));
            expect(mockWriteFile).toHaveBeenNthCalledWith(3, 'jpg_images/page_4.jpg', Buffer.from('mock-jpg-data'));
            expect(result).toEqual([
                'jpg_images/page_2.jpg',
                'jpg_images/page_3.jpg',
                'jpg_images/page_4.jpg',
            ]);
        });

        it('should throw error for invalid startPage', async () => {
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'err', startPage: 0 };
            await expect(convertPdfToImages(mockPdfData, options))
                .rejects.toThrow('startPage (0) is out of valid range (1-5)');
        });

        it('should throw error for invalid endPage', async () => {
            mockCountPages.mockReturnValue(3); // Adjust page count for this test
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'err', endPage: 4 };
            await expect(convertPdfToImages(mockPdfData, options))
                .rejects.toThrow('endPage (4) is out of valid range (1-3)');
        });

        it('should throw error if startPage > endPage', async () => {
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'err', startPage: 4, endPage: 2 };
            await expect(convertPdfToImages(mockPdfData, options))
                .rejects.toThrow('startPage (4) cannot be greater than endPage (2)');
        });

        it('should propagate errors from mupdf loadPage', async () => {
            const mupdfError = new Error('mupdf loadPage failed');
            mockLoadPage.mockImplementation(() => {
                throw mupdfError;
            });
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'mupdf_err' };
            await expect(convertPdfToImages(mockPdfData, options))
                .rejects.toThrow('mupdf loadPage failed');
        });

        it('should propagate errors from writeFile', async () => {
            const fsError = new Error('fs failed');
            mockWriteFile.mockImplementation(async () => {
                throw fsError;
            });
            const options: PdfToImageOptions = { ...baseOptions, outputPathPrefix: 'fs_err' };
            await expect(convertPdfToImages(mockPdfData, options))
                .rejects.toThrow('fs failed');
        });

        it('should use provided logger', async () => {
            const customLogger = { info: vi.fn(), error: vi.fn() };
            const options: PdfToImageOptions = {
                ...baseOptions,
                outputPathPrefix: 'log_test',
                logger: customLogger as any
            };
            await convertPdfToImages(mockPdfData, options);
            expect(customLogger.info).toHaveBeenCalledWith(expect.stringContaining('Processing pages 1 to 5'));
            expect(customLogger.info).toHaveBeenCalledWith(expect.stringContaining('Converted page 1 to log_test_1.png'));
            expect(customLogger.info).toHaveBeenCalledTimes(6);
            expect(mockLoggerInfo).not.toHaveBeenCalled(); // Ensure default mock logger wasn't used
        });
    });
}

// Run the tests
runTests();

// Need to alias the mock for use within the test file scope
import * as fsPromises from 'node:fs/promises';
import * as mupdf from 'mupdf';
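The doMock-then-dynamic-import dance used throughout this (now deleted) suite is the standard Vitest pattern when mocks are registered at runtime rather than hoisted. Reduced to its essentials, as a minimal sketch:

import { vi, it, expect } from 'vitest';

it('applies a runtime-registered mock', async () => {
    // vi.doMock is not hoisted: it only affects modules imported after this call.
    vi.doMock('node:fs/promises', () => ({
        writeFile: vi.fn().mockResolvedValue(undefined),
    }));

    // A dynamic import picks up the mock; a static top-level import would not.
    const { writeFile } = await import('node:fs/promises');
    await writeFile('out.txt', 'data');
    expect(vi.mocked(writeFile)).toHaveBeenCalledWith('out.txt', 'data');
});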
[33 binary image files deleted, 34-88 KiB each]
@ -1,15 +0,0 @@
{
  "compilerOptions": {
    "target": "ES2022",
    "module": "NodeNext",
    "moduleResolution": "NodeNext",
    "outDir": "./dist",
    "rootDir": "./src",
    "strict": true,
    "esModuleInterop": true,
    "skipLibCheck": true,
    "forceConsistentCasingInFileNames": true
  },
  "include": ["src/**/*"],
  "exclude": ["node_modules", "dist"]
}
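Note the NodeNext pair in this removed config: under "moduleResolution": "NodeNext", relative ESM imports must carry an explicit .js extension, which is exactly what a later hunk fixes by changing '../types' to '../types.js'. A minimal illustration (file contents are hypothetical):

// types.ts
export interface IConvertVideoOptions { src: string; dst?: string; verb?: string }

// consumer.ts: the specifier names the compiled .js output, even when importing from a .ts file.
import type { IConvertVideoOptions } from './types.js';

const opts: IConvertVideoOptions = { src: 'in.mp4', verb: 'convert' };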
@ -1,21 +0,0 @@
const xlsx = require("xlsx");

const data = [
    { FunctionalCodeSet: "SKI780", Group: "Group P (readable)", Codes: ["P0","P1","P2","P3","P4","P5","P6","P7","P8","P9","PA","PB"] },
    { FunctionalCodeSet: "SKI780", Group: "Group A (readable)", Codes: ["A0","A1","A2","A5","A6","A7","A8","A9","AA","AB","AC"] }
];

const rows = data.flatMap(({ FunctionalCodeSet, Group, Codes }) =>
    Codes.map(code => ({ FunctionalCodeSet, Group, Code: code }))
);

const wb = xlsx.utils.book_new();
const ws = xlsx.utils.json_to_sheet(rows);
xlsx.utils.book_append_sheet(wb, ws, "SKI780 Codes");

ws['!freeze'] = { ySplit: 1 }; // freeze header row
ws['!autofilter'] = { ref: `A1:C${rows.length+1}` };
ws['!cols'] = [ { wch: 18 }, { wch: 22 }, { wch: 6 } ]; // or compute

xlsx.writeFile(wb, "SKI780_Functional_Codes.xlsx");
console.log("✓ SKI780_Functional_Codes.xlsx created with proper column widths!");
119
packages/media/ref/pdf-to-images/xlsx/package-lock.json
generated
@ -1,119 +0,0 @@
{
  "name": "xlsx",
  "version": "1.0.0",
  "lockfileVersion": 3,
  "requires": true,
  "packages": {
    "": {
      "name": "xlsx",
      "version": "1.0.0",
      "license": "ISC",
      "dependencies": {
        "xlsx": "^0.18.5"
      }
    },
    "node_modules/adler-32": {
      "version": "1.3.1",
      "resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz",
      "integrity": "sha512-ynZ4w/nUUv5rrsR8UUGoe1VC9hZj6V5hU9Qw1HlMDJGEJw5S7TfTErWTjMys6M7vr0YWcPqs3qAr4ss0nDfP+A==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/cfb": {
      "version": "1.2.2",
      "resolved": "https://registry.npmjs.org/cfb/-/cfb-1.2.2.tgz",
      "integrity": "sha512-KfdUZsSOw19/ObEWasvBP/Ac4reZvAGauZhs6S/gqNhXhI7cKwvlH7ulj+dOEYnca4bm4SGo8C1bTAQvnTjgQA==",
      "license": "Apache-2.0",
      "dependencies": {
        "adler-32": "~1.3.0",
        "crc-32": "~1.2.0"
      },
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/codepage": {
      "version": "1.15.0",
      "resolved": "https://registry.npmjs.org/codepage/-/codepage-1.15.0.tgz",
      "integrity": "sha512-3g6NUTPd/YtuuGrhMnOMRjFc+LJw/bnMp3+0r/Wcz3IXUuCosKRJvMphm5+Q+bvTVGcJJuRvVLuYba+WojaFaA==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/crc-32": {
      "version": "1.2.2",
      "resolved": "https://registry.npmjs.org/crc-32/-/crc-32-1.2.2.tgz",
      "integrity": "sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==",
      "license": "Apache-2.0",
      "bin": {
        "crc32": "bin/crc32.njs"
      },
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/frac": {
      "version": "1.1.2",
      "resolved": "https://registry.npmjs.org/frac/-/frac-1.1.2.tgz",
      "integrity": "sha512-w/XBfkibaTl3YDqASwfDUqkna4Z2p9cFSr1aHDt0WoMTECnRfBOv2WArlZILlqgWlmdIlALXGpM2AOhEk5W3IA==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/ssf": {
      "version": "0.11.2",
      "resolved": "https://registry.npmjs.org/ssf/-/ssf-0.11.2.tgz",
      "integrity": "sha512-+idbmIXoYET47hH+d7dfm2epdOMUDjqcB4648sTZ+t2JwoyBFL/insLfB/racrDmsKB3diwsDA696pZMieAC5g==",
      "license": "Apache-2.0",
      "dependencies": {
        "frac": "~1.1.2"
      },
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/wmf": {
      "version": "1.0.2",
      "resolved": "https://registry.npmjs.org/wmf/-/wmf-1.0.2.tgz",
      "integrity": "sha512-/p9K7bEh0Dj6WbXg4JG0xvLQmIadrner1bi45VMJTfnbVHsc7yIajZyoSoK60/dtVBs12Fm6WkUI5/3WAVsNMw==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/word": {
      "version": "0.3.0",
      "resolved": "https://registry.npmjs.org/word/-/word-0.3.0.tgz",
      "integrity": "sha512-OELeY0Q61OXpdUfTp+oweA/vtLVg5VDOXh+3he3PNzLGG/y0oylSOC1xRVj0+l4vQ3tj/bB1HVHv1ocXkQceFA==",
      "license": "Apache-2.0",
      "engines": {
        "node": ">=0.8"
      }
    },
    "node_modules/xlsx": {
      "version": "0.18.5",
      "resolved": "https://registry.npmjs.org/xlsx/-/xlsx-0.18.5.tgz",
      "integrity": "sha512-dmg3LCjBPHZnQp5/F/+nnTa+miPJxUXB6vtk42YjBBKayDNagxGEeIdWApkYPOf3Z3pm3k62Knjzp7lMeTEtFQ==",
      "license": "Apache-2.0",
      "dependencies": {
        "adler-32": "~1.3.0",
        "cfb": "~1.2.1",
        "codepage": "~1.15.0",
        "crc-32": "~1.2.1",
        "ssf": "~0.11.2",
        "wmf": "~1.0.1",
        "word": "~0.3.0"
      },
      "bin": {
        "xlsx": "bin/xlsx.njs"
      },
      "engines": {
        "node": ">=0.8"
      }
    }
  }
}
@ -1,16 +0,0 @@
{
  "name": "xlsx",
  "version": "1.0.0",
  "description": "",
  "main": "index.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "type": "commonjs",
  "dependencies": {
    "xlsx": "^0.18.5"
  }
}
@ -1,12 +1,9 @@
import { IConvertVideoOptions, IResizeOptions } from './types.js'
import { forward_slash, pathInfo, pathInfoEx, substitute } from "@polymech/commons"
import { isFile, resolve } from "@polymech/commons"
import { isFile, resolve, globBase } from "@polymech/commons"
import { sync as exists } from "@polymech/fs/exists"
import { GLOB_BASIC } from './lib/media/images/index.js'


const globBase = require('glob-base')

export const defaults = () => {
    const DefaultCommand = 'info';
    if (process.argv.length === 2) {
4
packages/media/src/cli.ts
Normal file
@ -0,0 +1,4 @@
import yargs from 'yargs'
import { hideBin } from 'yargs/helpers'

export const cli = yargs(hideBin(process.argv))
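With commands now registering themselves against this shared instance, the entry point presumably just imports each command module for its side effects and then triggers parsing. A sketch under that assumption; the file name and command list are illustrative:

// index.ts (illustrative)
import { cli } from './cli.js'

// Side-effect imports: each module calls cli.command(...) at load time.
import './commands/pdf2jpg.js'
import './commands/video.js'

cli.demandCommand(1).strict().help().parse()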
@ -1,11 +1,11 @@
import * as CLI from 'yargs'
import { logger } from '../index.js'
import { convert } from '../lib/media/audio/convert.js'

import { cli } from '../main.js'
import { defaults, sanitizeVideo } from '../_cli.js'
import { IConvertVideoOptions } from '../types'
import { IConvertVideoOptions } from '../types.js'

export const defaultOptions = (yargs: CLI.Argv) => {
export const defaultOptions = (yargs) => {
    return yargs.option('src', {
        describe: 'FILE|FOLDER|GLOB',
        demandOption: true
@ -22,15 +22,11 @@ export const defaultOptions = (yargs: CLI.Argv) => {
    })
}

let options = (yargs: CLI.Argv) => defaultOptions(yargs)

export const register = (cli: CLI.Argv) => {
    return cli.command('video <verb>', 'Convert video', options, (argv: CLI.Arguments) => {
        defaults()
        const options = sanitizeVideo(argv) as IConvertVideoOptions
        logger.info("options " + argv.dst, options)
        if (argv.verb = 'convert') {
            return convert(options)
        }
    })
}
cli.command('video <verb>', 'Convert video', defaultOptions, async (argv) => {
    defaults()
    const options = sanitizeVideo(argv) as IConvertVideoOptions
    logger.info("options " + argv.dst, options)
    if (options.verb == 'convert') {
        return convert(options)
    }
})
@ -4,6 +4,7 @@ import { existsSync } from 'node:fs';
import * as z from 'zod';
import { runConversion } from '../lib/pdf/convert.js';
import { ConvertCommandConfig, ConvertCommandSchema } from '../lib/pdf/types.js'
import { cli } from '../cli.js';

export const command = 'pdf2jpg';
export const desc = 'Convert PDF to images';
@ -68,6 +69,4 @@ export async function handler(argv: CLI.Arguments) {
    }
}

export const register = (cli: CLI.Argv) => {
    return cli.command(command, desc, builder, handler)
}
cli.command(command, desc, builder, handler)
@ -3,7 +3,7 @@ import { logger } from '../index.js'
import {
    resize
} from '../lib/media/images/resize.js'

import { cli } from '../cli.js'
import {
    sanitize,
    defaults