kbot tests :)

This commit is contained in:
lovebird 2025-04-24 18:59:47 +02:00
parent 741afaa301
commit f912ca24e0
10 changed files with 37249 additions and 32 deletions

View File

@ -7,6 +7,8 @@ export interface IKBotOptions {
output?: string | undefined;
/** Optional destination path for the result; ${MODEL_NAME} and ${ROUTER} will be substituted in the path. Used for "completion" mode */
dst?: string | undefined;
/** How to handle output if --dst file already exists: "concat" (append) or "merge" (try to merge structures if possible, otherwise append). Only used if --dst is specified. */
append?: ("concat" | "merge") | undefined;
/** Iterate over items, supported: GLOB | Path to JSON File | array of strings (comma separated). To test different models, use --each="gpt-3.5-turbo,gpt-4o", the actual string will be exposed as variable `ITEM`, eg: --dst="${ITEM}-output.md" */
each?: string | undefined;
/** Disable tools categories, eg: --disable=fs,git,interact,terminal,search,web,email,user */
@ -149,6 +151,7 @@ export interface IKBotOptions {
meta-llama/llama-3-70b-instruct | paid
meta-llama/llama-3-8b-instruct | paid
meta-llama/llama-3.1-405b | paid
meta-llama/llama-3.1-405b:free | free
meta-llama/llama-3.1-405b-instruct | paid
meta-llama/llama-3.1-70b-instruct | paid
meta-llama/llama-3.1-8b-instruct | paid
@ -305,7 +308,6 @@ export interface IKBotOptions {
sophosympatheia/rogue-rose-103b-v0.2:free | free
sao10k/l3-lunaris-8b | paid
sao10k/l3-euryale-70b | paid
sao10k/l3.1-70b-hanami-x1 | paid
sao10k/l3.1-euryale-70b | paid
sao10k/l3.3-euryale-70b | paid
shisa-ai/shisa-v2-llama3.3-70b:free | free
@ -327,7 +329,6 @@ export interface IKBotOptions {
x-ai/grok-3-mini-beta | paid
x-ai/grok-beta | paid
x-ai/grok-vision-beta | paid
xwin-lm/xwin-lm-70b | paid

 OpenAI models:


View File

@ -114,15 +114,15 @@ export declare const InvokeToolSchema: z.ZodObject<{
env_key: z.ZodOptional<z.ZodString>;
}, "strip", z.ZodTypeAny, {
function?: string;
tools?: string;
params?: string;
tools?: string;
target?: string;
output?: string;
env_key?: string;
}, {
function?: string;
tools?: string;
params?: string;
tools?: string;
target?: string;
output?: string;
env_key?: string;

View File

@ -23,7 +23,7 @@
"marked": "^15.0.4",
"mime-types": "^2.1.35",
"nodemailer": "^6.9.16",
"openai": "^4.87.4",
"openai": "^4.96.0",
"p-map": "^7.0.3",
"rotating-file-stream": "^3.2.6",
"screenshot-desktop": "^1.15.0",
@ -35,7 +35,7 @@
"type-fest": "^4.30.2",
"winston": "^3.17.0",
"yargs": "^17.7.2",
"zod": "^3.24.1",
"zod": "^3.24.3",
"zod-to-json-schema": "^3.24.1"
},
"bin": {
@ -1003,8 +1003,8 @@
"tslog": "^3.3.3",
"tsup": "^2.0.3",
"yargs": "^17.7.2",
"zod": "^3.24.2",
"zod-to-json-schema": "^3.24.1",
"zod": "^3.24.3",
"zod-to-json-schema": "^3.24.5",
"zod-to-ts": "^1.2.0"
},
"devDependencies": {
@ -5358,9 +5358,9 @@
}
},
"node_modules/openai": {
"version": "4.87.4",
"resolved": "https://registry.npmjs.org/openai/-/openai-4.87.4.tgz",
"integrity": "sha512-lsfM20jZY4A0lNexfoUAkfmrEXxaTXvv8OKYicpeAJUNHObpRgkvC7pxPgMnB6gc9ID8OCwzzhEhBpNy69UR7w==",
"version": "4.96.0",
"resolved": "https://registry.npmjs.org/openai/-/openai-4.96.0.tgz",
"integrity": "sha512-dKoW56i02Prv2XQolJ9Rl9Svqubqkzg3QpwEOBuSVZLk05Shelu7s+ErRTwFc1Bs3JZ2qBqBfVpXQiJhwOGG8A==",
"license": "Apache-2.0",
"dependencies": {
"@types/node": "^18.11.18",
@ -6629,7 +6629,9 @@
}
},
"node_modules/zod": {
"version": "3.24.2",
"version": "3.24.3",
"resolved": "https://registry.npmjs.org/zod/-/zod-3.24.3.tgz",
"integrity": "sha512-HhY1oqzWCQWuUqvBFnsyrtZRhyPeR7SUGv+C4+MsisMuVfSPx8HpwWqH8tRahSlt6M3PiFAcoeFhZAqIXTxoSg==",
"license": "MIT",
"funding": {
"url": "https://github.com/sponsors/colinhacks"

View File

@ -43,7 +43,7 @@
"marked": "^15.0.4",
"mime-types": "^2.1.35",
"nodemailer": "^6.9.16",
"openai": "^4.87.4",
"openai": "^4.96.0",
"p-map": "^7.0.3",
"rotating-file-stream": "^3.2.6",
"screenshot-desktop": "^1.15.0",
@ -55,7 +55,7 @@
"type-fest": "^4.30.2",
"winston": "^3.17.0",
"yargs": "^17.7.2",
"zod": "^3.24.1",
"zod": "^3.24.3",
"zod-to-json-schema": "^3.24.1"
},
"devDependencies": {

View File

@ -7,6 +7,8 @@ export interface IKBotOptions {
output?: string | undefined;
/** Optional destination path for the result; ${MODEL_NAME} and ${ROUTER} will be substituted in the path. Used for "completion" mode */
dst?: string | undefined;
/** How to handle output if --dst file already exists: "concat" (append) or "merge" (try to merge structures if possible, otherwise append). Only used if --dst is specified. */
append?: ("concat" | "merge") | undefined;
/** Iterate over items, supported: GLOB | Path to JSON File | array of strings (comma separated). To test different models, use --each="gpt-3.5-turbo,gpt-4o", the actual string will be exposed as variable `ITEM`, eg: --dst="${ITEM}-output.md" */
each?: string | undefined;
/** Disable tools categories, eg: --disable=fs,git,interact,terminal,search,web,email,user */
@ -149,6 +151,7 @@ export interface IKBotOptions {
meta-llama/llama-3-70b-instruct | paid
meta-llama/llama-3-8b-instruct | paid
meta-llama/llama-3.1-405b | paid
meta-llama/llama-3.1-405b:free | free
meta-llama/llama-3.1-405b-instruct | paid
meta-llama/llama-3.1-70b-instruct | paid
meta-llama/llama-3.1-8b-instruct | paid
@ -305,7 +308,6 @@ export interface IKBotOptions {
sophosympatheia/rogue-rose-103b-v0.2:free | free
sao10k/l3-lunaris-8b | paid
sao10k/l3-euryale-70b | paid
sao10k/l3.1-70b-hanami-x1 | paid
sao10k/l3.1-euryale-70b | paid
sao10k/l3.3-euryale-70b | paid
shisa-ai/shisa-v2-llama3.3-70b:free | free
@ -327,7 +329,6 @@ export interface IKBotOptions {
x-ai/grok-3-mini-beta | paid
x-ai/grok-beta | paid
x-ai/grok-vision-beta | paid
xwin-lm/xwin-lm-70b | paid

 OpenAI models:


View File

@ -114,11 +114,11 @@ See more in [./docs/Examples.md](./docs/Examples.md) and [./docs/Integration.md]
- [-] internal files
- [ ] outside root / component
- [ ] naming conventions
- [-] default configurations
- [ ] default configurations
- [ ] sw errors
- [ ] library compat
- [-] orphan files
- [-] equations
- [ ] orphan files
- [ ] equations
- [ ] tree
- [ ] md
- [x] json
@ -133,7 +133,7 @@ See more in [./docs/Examples.md](./docs/Examples.md) and [./docs/Integration.md]
- [ ] motion analysis
- [x] query(tree)
- [ ] sw: explode
- [ ] sw: timeouts
- [-] sw: timeouts
- [x] sw: osr-default props
- [-] sw: cache instance (node IPC | csharp JIT?)
- [x] sw: cache

View File

@ -1,12 +1,12 @@
import { z } from 'zod';
// Define the base shape for arguments
export const ConvertCommandArgsSchema = z.object({
input: z.string(),
output: z.string().optional(),
dpi: z.number().int().positive().default(300),
format: z.enum(['png', 'jpg']).default('png'),
startPage: z.number().int().positive().optional(),
endPage: z.number().int().positive().optional()
input: z.string().describe('Path to the input PDF file'),
output: z.string().describe('Output path template (e.g., output/page_{PAGE}.png)').optional(),
dpi: z.number().int().positive().default(300).describe('Resolution for the output images'),
format: z.enum(['png', 'jpg']).default('png').describe('Output image format'),
startPage: z.number().int().positive().describe('First page to convert (1-based index)').optional(),
endPage: z.number().int().positive().describe('Last page to convert (1-based index)').optional()
});
// Add refinements, transformations, and catchall for final validation/parsing
export const ConvertCommandSchema = ConvertCommandArgsSchema

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,219 @@
# **TypeScript Libraries and Tools for PDF Data Extraction to JSON**
**1. Introduction**
The prevalence of Portable Document Format (PDF) files in various domains necessitates efficient programmatic methods for accessing and processing their content. PDFs serve as a standard for document storage and exchange, frequently containing valuable data that applications need to extract and utilize.1 This demand for automated data retrieval has spurred the development of numerous tools and libraries capable of parsing and extracting information from these documents. However, the very nature of the PDF format, designed primarily for visual presentation, introduces significant hurdles for automated extraction processes.2 A single PDF can encompass a diverse range of content, including textual data formatted in intricate layouts, embedded images, and tabular data often represented through visual cues rather than semantic structures.2 Furthermore, the increasing prevalence of scanned documents adds another layer of complexity, as text within these PDFs exists as images, requiring Optical Character Recognition (OCR) to convert them into machine-readable text.3
Given the user's preference for TypeScript, this report will specifically investigate solutions built with or offering robust support for this language. TypeScript, as a statically-typed superset of JavaScript, provides enhanced code maintainability and scalability, making it a suitable choice for developing reliable data extraction pipelines.5 This report aims to identify and analyze suitable TypeScript libraries for extracting text, images, and tables from PDF documents, explore available OCR options for handling image-based text, and investigate the potential integration of local Artificial Intelligence (AI) models for advanced structured data extraction, particularly for complex tables. The ultimate goal is to provide a comparative overview of these solutions, summarizing their key features, licensing, development status, and suitability for the task of converting PDF content into JSON format. The structure of this report will follow a logical progression, starting with basic content extraction and advancing to more sophisticated techniques involving OCR and AI integration, culminating in a comparative analysis and recommendations.
**2. TypeScript Libraries for Basic PDF Content Extraction**
The initial step in processing PDF documents programmatically often involves extracting the fundamental content: text and images. Several TypeScript and JavaScript libraries offer functionalities to achieve this, each with its own strengths and limitations.1
* **2.1. Text Extraction:**
* **pdf-parse:** This popular Node.js package is recognized for its straightforward approach to extracting text from PDF files.1 It allows developers to easily retrieve the textual content of a PDF document through a user-friendly interface.1 However, a notable limitation of pdf-parse is its inability to preserve the structural integrity of tables within the PDF.1 It tends to treat the content of tables as continuous lines of text, which can be problematic when structured data is required.1 Furthermore, user experiences reported in online forums indicate potential issues such as compilation errors in specific environments, suggesting possible environmental dependencies or library-specific bugs.15 Additionally, its capability to retain formatting details like margins, centered text, or page information is limited, as highlighted by user feedback seeking more layout-aware text extraction.16 This makes pdf-parse suitable for basic text retrieval when the document's layout and structure are not critical for subsequent processing; a minimal usage sketch follows this list.
* **pdf2json:** This module focuses on transforming PDF files from their binary format into a JSON representation.1 By converting the PDF content into a JSON structure, it provides more granular information compared to pdf-parse, potentially including the coordinates of text elements within the document.1 This coordinate information could theoretically be used to reconstruct some of the document's layout. However, a significant drawback of pdf2json is its lack of recent updates, with reports indicating that it hasn't been actively maintained for several years.16 This raises concerns regarding its compatibility with newer PDF standards and potential security vulnerabilities. While it might offer more structural data than pdf-parse due to the inclusion of coordinates, its outdated status makes it a less reliable choice for long-term projects.
* **pdf-ts:** As a TypeScript library specifically designed for PDF text extraction, pdf-ts aims to provide a type-safe and well-integrated solution within TypeScript projects.5 Its primary focus is on extracting textual content from PDF documents. The library has garnered some community interest, indicated by its 36 stars and 1 fork on GitHub.5 The last recorded release was on August 7, 2023\.5 Being written in TypeScript, it offers the advantages of static typing, which can lead to more robust and maintainable codebases compared to plain JavaScript libraries.
* **js-extract:** This library is essentially a packaged version of examples demonstrating how to use the widely adopted pdf.js library within a Node.js environment.6 Its core functionality lies in reading a PDF file and exporting all pages along with the extracted text and their corresponding coordinates.6 The inclusion of coordinate data makes js-extract a potentially valuable tool for scenarios where preserving or reconstructing the structure of the PDF, including the positioning of text elements, is important.6 This capability could be particularly useful for attempting to identify and extract tabular data. The library is licensed under the MIT license, and its last published version (0.2.1) was two years ago.6 The fact that it's built upon pdf.js is noteworthy, as pdf.js is a mature and actively developed library primarily used for rendering PDFs in web browsers, suggesting a solid foundation for its text extraction capabilities.
* **PDF-LIB:** While the provided snippets do not explicitly detail PDF-LIB's text extraction capabilities, it is mentioned as the underlying library for pdf-io, which focuses on image extraction.17 Additionally, a user in an online forum mentioned attempting to use PDF-LIB for parsing PDFs, suggesting it possesses broader PDF manipulation functionalities beyond just image handling.15 PDF-LIB is a powerful library for creating and modifying PDF documents in JavaScript environments.18 Its API provides low-level access to the structure of PDF files, which could potentially be leveraged for custom text extraction logic, although this might require a deeper understanding of the PDF format itself.
* **2.2. Image Extraction:**
* **pdf-io:** This TypeScript library is specifically designed for the task of extracting images from PDF files.17 It provides functionalities to parse a given PDF document, identify image objects within it, and save these images as PNG files.17 The library relies on the robust pdf-lib for PDF parsing and pngjs for encoding the extracted image data into the PNG format.17 It offers a straightforward API, with a constructor that accepts either a file path or a buffer containing the PDF data, and an extractImages() method to perform the extraction.17 The extracted images can either be saved to a specified output directory or returned as an array of Uint8Array or Buffer objects if the PDF was loaded from a buffer.17 pdf-io is licensed under the MIT license.17 While it directly addresses the need for image extraction, its current GitHub status, with 3 stars and 1 fork and no recent releases, suggests it might be a relatively small or less actively maintained project.17
* **node-pdf-extract-image:** Similar to pdf-io, this library focuses on extracting images from PDF documents.7 It utilizes pdfjs-dist, the distribution of Mozilla's pdf.js library for Node.js environments, to read and process PDF files.7 The extracted images are then encoded as PNG files using the pngjs library.7 It's important to note that this library only extracts images that are explicitly embedded within the PDF and will return an empty array if no images are found.7 It provides a simple asynchronous function, extractImagesFromPdf, which accepts either a buffer containing the PDF data or the path to the PDF file.7 The resulting images are returned as an array of buffers, which can then be written to disk or further processed.7 This library is also licensed under the MIT license.7 By leveraging the widely used pdfjs-dist, it benefits from the maturity and extensive capabilities of pdf.js in handling various PDF structures.
* **pdf-img-convert:** Although not explicitly identified as a TypeScript library, pdf-img-convert is mentioned as a solution for extracting images from PDFs by converting each page into an image.19 It operates using pdf.js under the hood, suggesting a JavaScript-based implementation. This approach is particularly useful when dealing with PDFs where content might not be directly extractable as text or individual image objects, as it essentially renders each page as a raster image.
* **Apryse WebViewer:** This is a commercial JavaScript SDK that offers a comprehensive suite of PDF functionalities, including the extraction of image content.20 It provides a detailed API that allows developers to traverse the PDF's display list, identify elements of type image, and export them in various formats such as PNG or TIFF.20 While the license is commercial, Apryse WebViewer likely offers robust features, performance, and dedicated support, making it a viable option for projects with budget for a commercial solution requiring advanced PDF processing capabilities.
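The following is a minimal sketch of the pdf-parse workflow described in section 2.1, assuming a local `sample.pdf` and the library's default export (which resolves to an object containing `text` and `numpages`); it is an illustration, not a definitive implementation.

```typescript
import { readFile } from "node:fs/promises";
import pdf from "pdf-parse";

async function extractText(path: string): Promise<string> {
  const buffer = await readFile(path);
  // pdf-parse resolves to { numpages, info, metadata, text, ... }
  const data = await pdf(buffer);
  console.log(`Parsed ${data.numpages} page(s)`);
  // Plain text only: table layout is flattened into lines, as noted above.
  return data.text;
}

extractText("./sample.pdf").then(text => console.log(text.slice(0, 500)));
```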
**3. Leveraging OCR for Image-Based Text**
When dealing with scanned PDF documents or PDFs where text is embedded as images, Optical Character Recognition (OCR) technology becomes essential to convert these images into machine-readable text.3 Several JavaScript and TypeScript libraries are available for performing OCR, with tesseract.js being a prominent open-source option.11
* **3.1. TypeScript OCR Libraries:**
* **tesseract.js:** This library stands out as a pure JavaScript port of the highly regarded Tesseract OCR engine, which boasts support for over 100 languages.11 It enables OCR to be performed directly within a web browser or on a server using Node.js.11 tesseract.js offers functionalities such as automatic text orientation and script detection, and it provides an interface to access bounding boxes for paragraphs, words, and characters.11 Demonstrations show its capability to accurately recognize text from images.22 Installation is straightforward using CDN, npm, or yarn.12 The library is licensed under the Apache-2.0 license and enjoys a high level of community engagement, evidenced by its substantial number of stars on GitHub.12 It operates locally, processing images directly without relying on external AI models for its core OCR functionality.12 This local processing aligns well with the user's preference for local solutions. While tesseract.js itself does not directly support PDF files as input, it can be effectively used in conjunction with libraries like PDF.js to first convert PDF pages into images (e.g., canvas elements) and then perform OCR on these images.21
* **Other OCR Packages:** The npm ecosystem contains a variety of other packages related to OCR.24 Some of these, like @gzqyl/react-native-ocr and @gutenye/ocr-react-native, are tailored for specific environments like React Native and might offer local OCR capabilities.24 ollama-ocr indicates a potential integration with local visual AI models run by Ollama for OCR tasks.24 Exploring these packages further might reveal specialized features or integrations relevant to specific use cases.
* **Considerations from Research:** Research comparing different OCR models suggests that the optimal choice depends on the characteristics of the input images.25 While cloud-based models might offer superior accuracy in some scenarios, local models like EasyOCR (mentioned in a research blog) can provide a cost-effective solution with competitive accuracy.25 Tesseract, the engine behind tesseract.js, is known for its wide language support but can struggle with documents that are not clean or machine-generated, such as scanned documents or those with unusual fonts.26 docTR, another open-source option, performs better on scanned documents but lacks handwriting support.26 Therefore, the suitability of tesseract.js will depend on the quality and nature of the images extracted from the PDFs.
* **3.2. Integrating Image Extraction with OCR:**
The process of extracting text from image-based PDFs typically involves a two-stage approach.21 First, a library capable of extracting images from the PDF, such as pdf-io or node-pdf-extract-image, is used to obtain the image data, often in PNG format.7 Second, this image data or the path to the saved image file is then provided as input to an OCR library like tesseract.js.12 tesseract.js processes the image and outputs the recognized text.12 Examples demonstrate the use of PDF.js to render PDF pages onto a canvas element, followed by tesseract.js performing OCR on the content of this canvas.21 This method effectively bridges the gap between PDF content and OCR processing in a JavaScript/TypeScript environment.
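As a concrete illustration of this two-stage approach, the sketch below assumes the page images have already been extracted as PNG buffers (for example via the extractImagesFromPdf function described in section 2.2; the exact package and import name may differ) and then runs tesseract.js over each buffer. It is a sketch under those assumptions, not a drop-in pipeline.

```typescript
import { readFile } from "node:fs/promises";
import Tesseract from "tesseract.js";

// Stage 2: OCR a list of page images that stage 1 produced as PNG buffers.
async function ocrImages(imageBuffers: Buffer[]): Promise<string[]> {
  const texts: string[] = [];
  for (const image of imageBuffers) {
    // "eng" selects the English traineddata; 100+ languages are available.
    const { data } = await Tesseract.recognize(image, "eng");
    texts.push(data.text);
  }
  return texts;
}

// Example: OCR a single pre-extracted page image from disk.
readFile("./page-1.png")
  .then(buf => ocrImages([buf]))
  .then(([text]) => console.log(text));
```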
**4. Extracting Tables from PDFs in TypeScript**
Extracting tabular data from PDF documents presents a more complex challenge compared to simple text or image extraction.4 This is primarily because PDFs do not inherently define tables as semantic structures; instead, tables are usually rendered using lines and text elements positioned in a grid-like fashion.4 This lack of semantic information necessitates more sophisticated techniques to identify and reconstruct the table structure.
* **4.1. Rule-based Table Extraction Libraries:**
* **pdf-tables-parser:** This JavaScript/TypeScript library is specifically designed to address the challenge of extracting text tables from PDF files.8 It aims to efficiently parse PDF documents and extract structured table data, even from multi-page PDFs and those with complex layouts.8 The library offers several configurable options, such as hasTitles to indicate if tables have title rows, threshold to adjust the sensitivity for grouping rows, maxStrLength for setting a maximum string length for cells, and ignoreTexts to specify text to be ignored during extraction.8 The extracted table data is provided as a straightforward 2D array, where each inner array represents a row and the elements within are the cell contents.8 While it's a TypeScript library, its GitHub status with only 1 star and 0 forks suggests it might be a relatively new or less widely adopted project.8
* **@mkas3/pdf-table-parser:** This library is a TypeScript-based rewrite of the JavaScript library pdf-table-extractor, with the added benefit of built-in type declarations.9 Its goal is to simplify the process of parsing tables from PDF files.9 It offers options like maxEdgesPerPage to control the number of edges processed on each page and a progressFunc callback to monitor the extraction process.9 The library returns a Promise that resolves to an array of page objects, where each page object contains an array of tables. Each table is represented by an array of rows, and each row contains an array of cell objects with their content.9 This structured JSON output format can be convenient for further data processing. The library has been published more recently and has seen some weekly downloads, indicating a degree of current usage.9
* **@kobataku/pdf-table-extractor:** This package is presented as a fork of the original pdf-table-extractor library, specifically created to provide a valid npm module for TypeScript development.10 It allows users to extract tables from PDF files and obtain the data as a 2D array.10 However, this package was published six years ago and has a very low number of weekly downloads, which might suggest it is no longer actively maintained or has been superseded by more recent alternatives like @mkas3/pdf-table-parser.10 For detailed information on the extraction algorithm, users are referred to the original repository.10
* **pdf2array:** This is described as a hobby project that aims to simplify the extraction of tabular data from PDF files using the pdf.js library.28 Being in its early stages of development, its reliability and capabilities might be limited compared to more established libraries. However, its existence indicates an ongoing interest within the TypeScript community in developing better solutions for PDF table extraction.
* **4.2. Potential of Local AI Models for Advanced Table Extraction:**
Traditional rule-based approaches to table extraction often face significant limitations due to the inherent lack of semantic structure in PDF documents.1 Tables are visually interpreted by humans based on layout, lines, and the spatial arrangement of text, but translating this human intuition into robust code is challenging.4 Issues like merged cells, tables without clear borderlines, and inconsistent layouts can easily confuse rule-based algorithms.29
The emergence of AI, particularly Large Language Models (LLMs) and vision models, offers promising avenues for more advanced and accurate table extraction.2 These models can leverage their understanding of language and visual patterns to identify and interpret table structures more effectively.32 For instance, LayoutPDFReader within the llamaindex framework employs intelligent chunking to preserve the context of tables, although it currently lacks OCR capabilities.2 Tools like AlgoDocs and Docsumo utilize AI to extract tables, even from scanned documents.3 The Unstructured library provides a hi\_res strategy that leverages AI for improved table extraction.31 gmft is a specialized tool that uses Microsoft's TATR model for deep table recognition, focusing on alignment and speed by potentially reusing existing OCR output.30 PDF-Extract-Kit integrates state-of-the-art models for various document parsing tasks, including table recognition capable of outputting in formats like LaTeX, HTML, and Markdown.33 Converting the PDF to an image and then using tools like img2table is another approach that combines image processing with potential AI-driven table detection.29
Integrating local AI models into a TypeScript-based workflow for table extraction is an area of growing interest.13 Libraries like instructor-js facilitate structured extraction using LLMs (including local ones through platforms like Ollama) by defining schemas using Zod.13 Documind is an open-source tool that can extract structured data, including from tables, using both cloud-based (OpenAI) and local LLMs (Llava and Llama3.2).14 Unstructured also supports integration with local models via Ollama for various data extraction tasks, including tables.31 One potential approach involves using OCR (like Tesseract.js) to extract text and then feeding this text into a local LLM with specific instructions to identify and structure tabular data.42 While the direct TypeScript integration of local AI models specifically for PDF table extraction is still evolving, these initial explorations suggest a promising direction for overcoming the limitations of traditional methods.
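As a rough sketch of the last strategy, the example below sends OCR'd page text to a local model served by Ollama through its OpenAI-compatible endpoint (using the openai npm package) and validates the reply against a Zod table schema, in the spirit of instructor-js's schema-driven extraction. The endpoint URL, model name, and prompt are assumptions; local models may still return malformed JSON, which the schema check is meant to catch.

```typescript
import OpenAI from "openai";
import { z } from "zod";

const TableSchema = z.object({
  headers: z.array(z.string()),
  rows: z.array(z.array(z.string()))
});

// Assumption: Ollama is running locally and exposes its OpenAI-compatible API.
const client = new OpenAI({ baseURL: "http://localhost:11434/v1", apiKey: "ollama" });

async function extractTable(ocrText: string) {
  const completion = await client.chat.completions.create({
    model: "llama3.2", // illustrative; use whichever model you have pulled
    temperature: 0,
    messages: [
      {
        role: "system",
        content: 'Extract the table from the provided text. Reply with JSON only, shaped as {"headers": string[], "rows": string[][]}.'
      },
      { role: "user", content: ocrText }
    ]
  });

  const raw = completion.choices[0]?.message?.content ?? "";
  // JSON.parse throws if the model wrapped its answer in prose; safeParse
  // rejects replies that parse but do not match the expected table shape.
  const parsed = TableSchema.safeParse(JSON.parse(raw));
  if (!parsed.success) throw new Error(`Unexpected model output: ${parsed.error.message}`);
  return parsed.data;
}
```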
**5. Converting Extracted Data to JSON Format**
Once the desired data—whether text, images, or tables—has been extracted from the PDF document, the final step often involves converting this information into JSON (JavaScript Object Notation) format.63 JSON is a lightweight data-interchange format that is easy for humans to read and write and easy for machines to parse and generate.66
The structure of the extracted data will vary depending on the library and the type of content. Extracted text might be a simple string, images could be represented as file paths or base64 encoded strings, and tables might be in the form of 2D arrays or more complex nested objects.8
TypeScript programs can use JavaScript's built-in global JSON object, which offers methods for working with JSON data, including the stringify() method.63 This method takes a JavaScript value (such as an object or an array) and converts it into a JSON string.65
For example, if a table is extracted as a 2D array in TypeScript:

```typescript
const tableData: string[][] = [
  ["Header 1", "Header 2"],
  ["Data 1", "Data 2"],
  ["Data 3", "Data 4"]
];
```
This can be easily converted to a JSON string using JSON.stringify():

```typescript
const jsonString: string = JSON.stringify(tableData);
console.log(jsonString); // Output: [["Header 1","Header 2"],["Data 1","Data 2"],["Data 3","Data 4"]]
```
Similarly, if extracted data is structured as an array of objects, as might be the case with @mkas3/pdf-table-parser 9:
```typescript
const pageTables = [
  {
    page: 1,
    tables: [
      {
        rows: [
          [{ content: "Header 1" }, { content: "Header 2" }],
          [{ content: "Data 1" }, { content: "Data 2" }]
        ]
      }
    ]
  }
];
```
This can also be converted to a JSON string:
```typescript
const jsonOutput: string = JSON.stringify(pageTables);
console.log(jsonOutput);
/*
Output:
[{"page":1,"tables":[{"rows":[[{"content":"Header 1"},{"content":"Header 2"}],[{"content":"Data 1"},{"content":"Data 2"}]]}]}]
*/
```
Various techniques exist for transforming arrays of objects into a JSON object with specific key-value pairs if a different structure is desired.63 TypeScript's flexibility allows for structuring the extracted data in a way that best suits the application's needs before the final conversion to JSON using JSON.stringify().
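For instance, one common transformation pairs each data row with the header row to produce keyed records before stringifying; continuing the tableData example above (a sketch, assuming the first row holds the headers):

```typescript
// Reshape the 2D array into keyed records: first row = headers, rest = data rows.
const [headers, ...rows] = tableData;
const records = rows.map(row =>
  Object.fromEntries(row.map((cell, i) => [headers[i], cell]))
);
console.log(JSON.stringify(records, null, 2));
// [ { "Header 1": "Data 1", "Header 2": "Data 2" },
//   { "Header 1": "Data 3", "Header 2": "Data 4" } ]
```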
**6. Integrating Local AI Models for Structured Data Extraction**
Integrating local AI models for structured data extraction, including from PDFs, involves several considerations regarding setup, model selection, and interaction.13
* General Approaches:
The primary ways to leverage local LLMs in a TypeScript environment include using libraries that provide direct integration with local LLM inference servers or making direct API calls to these servers.13 Libraries like instructor-js offer integration with platforms like Ollama, which simplifies the process of running and interacting with local models.13 Similarly, tools like Documind and Unstructured are designed to work with local LLMs for document processing tasks.14 If a library doesn't offer built-in integration, developers might need to make HTTP requests to the API endpoints exposed by local LLM inference servers, such as those provided by Ollama.45
* Considerations for Model Selection and Setup:
Choosing the right local LLM is crucial and depends on the specific data extraction task.25 Different models possess varying capabilities in understanding text, identifying entities, and structuring data. Models specifically fine-tuned for information extraction or document understanding are generally preferred.56 Running LLMs locally can be computationally intensive, requiring adequate hardware resources, including CPU, RAM, and potentially a dedicated GPU for optimal performance.50 Setting up the local LLM inference server (e.g., using Ollama or a similar framework) and downloading the desired model are necessary prerequisites.45 It's also important to consider the availability and licensing terms of the models.12 Effective communication with the LLM is achieved through careful prompt engineering, which involves crafting clear and specific instructions to guide the model in extracting and structuring the data according to the desired schema.36
* Local AI for Table Extraction (Revisited):
As previously discussed, local AI models hold significant potential for enhancing table extraction from PDFs.14 By leveraging their semantic understanding, these models can often overcome the limitations of rule-based methods when dealing with complex table structures.14 Libraries like instructor-js with its JSON mode allow developers to define a schema representing the desired table structure and instruct the local LLM to extract the table data in that format.13 Tools like Documind and Unstructured also aim to facilitate this process by providing abstractions for interacting with local LLMs for document intelligence tasks, including table extraction.14 A common strategy involves first extracting the text content of the PDF (potentially using OCR if needed) and then providing this text along with a well-crafted prompt and schema to the local LLM to identify and structure the tabular data.42
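For completeness, the direct-HTTP route mentioned in the general approaches above can look like the following sketch: a single fetch call to a locally running Ollama server's /api/chat endpoint with format set to "json", here pulling a few illustrative fields from already-extracted PDF text. The model name and field names are assumptions, not part of any cited tool.

```typescript
// Sketch: ask a local Ollama server (default port 11434) for structured JSON.
async function extractInvoiceFields(pdfText: string) {
  const response = await fetch("http://localhost:11434/api/chat", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify({
      model: "llama3.2", // illustrative local model
      stream: false,
      format: "json",    // constrain the reply to valid JSON
      messages: [
        {
          role: "user",
          content:
            "Extract invoiceNumber, issueDate and totalAmount from the following text. " +
            "Reply with a single JSON object.\n\n" + pdfText
        }
      ]
    })
  });
  const body = await response.json();
  // Non-streaming /api/chat replies carry the text in message.content.
  return JSON.parse(body.message.content);
}
```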
**7. Comparative Analysis of Potential Solutions**
The following table summarizes the key findings for several TypeScript and JavaScript libraries and tools discussed in this report, highlighting their capabilities for PDF data extraction to JSON.
| Name | Description | Links | License | Status | Features | Local AI Model Integration Details |
| :---- | :---- | :---- | :---- | :---- | :---- | :---- |
| pdf-parse | Node.js library for basic text extraction from PDFs. | [https://www.npmjs.com/package/pdf-parse](https://www.npmjs.com/package/pdf-parse) | MIT | Popular, user-friendly interface | Text extraction. Limited table structure preservation. | No explicit local AI integration. |
| pdf2json | Transforms PDF to JSON format, including text coordinates. | [https://www.npmjs.com/package/pdf2json](https://www.npmjs.com/package/pdf2json) | Apache-2.0 | Outdated, not recently updated | Converts PDF to JSON, includes text coordinates. | No explicit local AI integration. |
| pdf-ts | TypeScript library for PDF text extraction. | [https://github.com/axflow/pdf-ts](https://github.com/axflow/pdf-ts) | MIT | Moderate community interest, last release Aug 2023 | Text extraction. | No explicit local AI integration. |
| js-extract | Extracts text with coordinates from PDFs using pdf.js. | [https://www.npmjs.com/package/pdf.js-extract](https://www.npmjs.com/package/pdf.js-extract) | MIT | Last published 2 years ago | Text extraction with coordinates. Potential for table reconstruction. | No explicit local AI integration. |
| pdf-io | TypeScript library for image extraction from PDFs. | [https://github.com/Sorvereign/pdf-io](https://github.com/Sorvereign/pdf-io) | MIT | Low activity, no recent releases | Image extraction to PNG. Built on pdf-lib and pngjs. | No explicit local AI integration. |
| node-pdf-extract-image | Extracts images from PDFs using pdfjs-dist and pngjs. | [https://github.com/bangbang93/node-pdf-extract-image](https://github.com/bangbang93/node-pdf-extract-image) | MIT | No explicit status provided | Image extraction to PNG. | No explicit local AI integration. |
| tesseract.js | Pure JavaScript port of Tesseract OCR engine. | [https://github.com/naptha/tesseract.js/](https://github.com/naptha/tesseract.js/) | Apache-2.0 | Highly active and popular | OCR for over 100 languages. Runs in browser and Node.js. | Local processing, not direct integration with external AI models for OCR. |
| pdf-tables-parser | JavaScript/TypeScript library for text table extraction. | [https://github.com/kanakkholwal/pdf-tables-parser](https://github.com/kanakkholwal/pdf-tables-parser) | ISC | Low adoption, possibly newer | Extracts tables as 2D arrays. Supports multi-page PDFs and complex layouts. Configurable options. | No explicit local AI integration. |
| @mkas3/pdf-table-parser | TypeScript rewrite of pdf-table-extractor for table parsing. | [https://www.npmjs.com/package/@mkas3/pdf-table-parser](https://www.npmjs.com/package/@mkas3/pdf-table-parser) | MIT | Some recent activity | Parses tables from PDFs. Typed. Offers structured JSON output. | No explicit local AI integration. |
| @kobataku/pdf-table-extractor | TypeScript fork of pdf-table-extractor for table extraction. | [https://www.npmjs.com/package/@kobataku/pdf-table-extractor](https://www.npmjs.com/package/@kobataku/pdf-table-extractor) | BSD License | Outdated, very low activity | Extracts tables as 2D arrays. | No explicit local AI integration. |
| instructor-js | TypeScript library for structured extraction using LLMs. | [https://github.com/instructor-ai/instructor-js](https://github.com/instructor-ai/instructor-js) | MIT | Actively developed | Facilitates structured data extraction using LLMs (including local via Ollama) and Zod schemas. Supports various output modes (JSON, TOOLS, FUNCTIONS). | Supports local models via integration with llm-polyglot and platforms like Ollama. Requires setting up the local LLM and defining schemas. |
| Documind | Open-source tool for turning documents into structured data. | [https://github.com/DocumindHQ/documind](https://github.com/DocumindHQ/documind) | Not specified | Actively developed | Extracts structured JSON from unstructured documents. Supports custom schemas. Works with OpenAI and local LLMs (Llava, Llama3.2). Converts documents to Markdown. | Supports local LLMs (Llava and Llama3.2). Requires installation of system dependencies (Ghostscript, GraphicsMagick) and setting up environment variables for API keys (if using cloud models). |
| Unstructured | Open-source library for pre-processing unstructured data. | [https://github.com/Unstructured-IO/unstructured](https://github.com/Unstructured-IO/unstructured) | Apache-2.0 | Actively developed | Extracts raw text, tables, and metadata from various document formats, including PDFs. Modular functions and connectors for data ingestion and pre-processing. | Supports local AI model processing via integration with Ollama. Requires installing Ollama and configuring the connection within Unstructured. |
**8. Recommendations and Implementation Considerations**
Based on the analysis, the selection of libraries and tools will depend on the specific requirements of the PDF data extraction task. For basic text extraction without the need for structural information, pdf-parse offers an easy-to-use solution.1 However, for scenarios requiring more structural awareness, including the potential reconstruction of tables, js-extract with its coordinate data might be a better starting point.6
For image extraction, both pdf-io and node-pdf-extract-image provide dedicated functionalities, with the latter leveraging the widely adopted pdfjs-dist.7 If dealing with scanned documents or PDFs with text embedded as images, tesseract.js stands out as a robust and actively maintained OCR library with extensive language support.11 A common implementation strategy would involve using an image extraction library to obtain images and then passing them to tesseract.js for OCR.21
Extracting tables accurately remains a complex challenge. For rule-based approaches in TypeScript, @mkas3/pdf-table-parser appears to be a more recently updated and actively used option compared to others like @kobataku/pdf-table-extractor.9 However, for more complex or poorly formatted tables, leveraging local AI models offers a promising direction. Libraries like instructor-js, Documind, and Unstructured provide mechanisms to integrate with local LLM inference servers (e.g., via Ollama) to perform structured data extraction, including from tables.13 This approach typically involves defining a schema for the desired output format and prompting the LLM to extract and structure the data accordingly.36
When implementing a PDF data extraction solution, it's crucial to consider potential performance bottlenecks, especially when processing large files or using local AI models, which can be resource-intensive.50 Optimization techniques might be necessary to ensure acceptable processing times. Robust error handling is also essential to manage potential issues with malformed PDFs or unexpected content.15 For integrating local AI models, the setup involves installing and configuring the LLM inference server (like Ollama), downloading the desired models, and potentially fine-tuning prompts to achieve the desired accuracy and output format.45
**9. Conclusion**
This report has explored a range of TypeScript libraries and tools capable of extracting various types of data from PDF documents and converting it into JSON format. While libraries like pdf-parse, pdf-ts, and js-extract offer solutions for text extraction, and pdf-io and node-pdf-extract-image facilitate image retrieval, the extraction of tabular data and text from images often requires more advanced techniques. tesseract.js provides a strong foundation for OCR in TypeScript environments.
The integration of local AI models represents a significant advancement in the field of PDF data extraction, particularly for handling the complexities of table extraction and other structured information retrieval tasks. Libraries like instructor-js, along with tools such as Documind and Unstructured, are paving the way for leveraging the power of local LLMs within TypeScript applications to achieve more accurate and nuanced data extraction from PDF documents. As this field continues to evolve, the ability to effectively combine traditional extraction methods with the semantic understanding of AI models will be crucial for unlocking the vast amounts of data contained within PDF files.
#### **Works cited**
1. Parsing PDFs in Node.js \- LogRocket Blog, accessed on April 24, 2025, [https://blog.logrocket.com/parsing-pdfs-node-js/](https://blog.logrocket.com/parsing-pdfs-node-js/)
2. Mastering PDFs: Extracting Sections, Headings, Paragraphs, and Tables with Cutting-Edge Parser \- LlamaIndex, accessed on April 24, 2025, [https://www.llamaindex.ai/blog/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125](https://www.llamaindex.ai/blog/mastering-pdfs-extracting-sections-headings-paragraphs-and-tables-with-cutting-edge-parser-faea18870125)
3. PDF Image Extraction: A Comprehensive Guide To Extracting Image Data From Scanned Pdf Files In 2025 \- AlgoDocs, accessed on April 24, 2025, [https://www.algodocs.com/pdf-image-extraction-comprehensive-guide-2025/](https://www.algodocs.com/pdf-image-extraction-comprehensive-guide-2025/)
4. How can I extract tables as structured data from PDF documents? \- Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/17591426/how-can-i-extract-tables-as-structured-data-from-pdf-documents](https://stackoverflow.com/questions/17591426/how-can-i-extract-tables-as-structured-data-from-pdf-documents)
5. axflow/pdf-ts: PDF text extraction in TypeScript \- GitHub, accessed on April 24, 2025, [https://github.com/axflow/pdf-ts](https://github.com/axflow/pdf-ts)
6. pdf.js-extract \- npm, accessed on April 24, 2025, [https://www.npmjs.com/package/pdf.js-extract](https://www.npmjs.com/package/pdf.js-extract)
7. bangbang93/node-pdf-extract-image \- GitHub, accessed on April 24, 2025, [https://github.com/bangbang93/node-pdf-extract-image](https://github.com/bangbang93/node-pdf-extract-image)
8. kanakkholwal/pdf-tables-parser: Library to extract text ... \- GitHub, accessed on April 24, 2025, [https://github.com/kanakkholwal/pdf-tables-parser](https://github.com/kanakkholwal/pdf-tables-parser)
9. @mkas3/pdf-table-parser \- npm, accessed on April 24, 2025, [https://www.npmjs.com/package/@mkas3/pdf-table-parser](https://www.npmjs.com/package/@mkas3/pdf-table-parser)
10. @kobataku/pdf-table-extractor \- npm, accessed on April 24, 2025, [https://www.npmjs.com/package/@kobataku/pdf-table-extractor](https://www.npmjs.com/package/@kobataku/pdf-table-extractor)
11. Tesseract.js | Pure Javascript OCR for 100 Languages\!, accessed on April 24, 2025, [https://tesseract.projectnaptha.com/](https://tesseract.projectnaptha.com/)
12. naptha/tesseract.js: Pure Javascript OCR for more than 100 Languages \- GitHub, accessed on April 24, 2025, [https://github.com/naptha/tesseract.js/](https://github.com/naptha/tesseract.js/)
13. instructor-ai/instructor-js: structured extraction for llms \- GitHub, accessed on April 24, 2025, [https://github.com/instructor-ai/instructor-js](https://github.com/instructor-ai/instructor-js)
14. DocumindHQ/documind: Open-source platform for extracting structured data from documents using AI. \- GitHub, accessed on April 24, 2025, [https://github.com/DocumindHQ/documind](https://github.com/DocumindHQ/documind)
15. PDF Parsing with Typescript \- Palantir Developer Community, accessed on April 24, 2025, [https://community.palantir.com/t/pdf-parsing-with-typescript/718](https://community.palantir.com/t/pdf-parsing-with-typescript/718)
16. Looking for a good pdf-parser to extract text. Any suggestions? : r/node \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/node/comments/186y7y0/looking\_for\_a\_good\_pdfparser\_to\_extract\_text\_any/](https://www.reddit.com/r/node/comments/186y7y0/looking_for_a_good_pdfparser_to_extract_text_any/)
17. Sorvereign/pdf-io: A TypeScript library that allows you to ... \- GitHub, accessed on April 24, 2025, [https://github.com/Sorvereign/pdf-io](https://github.com/Sorvereign/pdf-io)
18. Javascript Extract Images From Pdf | Restackio, accessed on April 24, 2025, [https://www.restack.io/p/javascript-extract-images-from-pdf-answer-cat-ai](https://www.restack.io/p/javascript-extract-images-from-pdf-answer-cat-ai)
19. Extract images from PDF file with JavaScript \- Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/18680261/extract-images-from-pdf-file-with-javascript](https://stackoverflow.com/questions/18680261/extract-images-from-pdf-file-with-javascript)
20. PDF Image Extraction Library for JavaScript \- Apryse documentation, accessed on April 24, 2025, [https://docs.apryse.com/web/guides/extraction/image-extract](https://docs.apryse.com/web/guides/extraction/image-extract)
21. Using OCR in JavaScript to extract text \- Dropbox Sign, accessed on April 24, 2025, [https://sign.dropbox.com/blog/using-ocr-in-javascript](https://sign.dropbox.com/blog/using-ocr-in-javascript)
22. JavaScript OCR Using Tesseract.js | Interesting JS Library Series | Episode 1 \- YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=kHTasYqs4Tw](https://www.youtube.com/watch?v=kHTasYqs4Tw)
23. Running OCR against PDFs and images directly in your browser \- Simon Willison's Weblog, accessed on April 24, 2025, [https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/](https://simonwillison.net/2024/Mar/30/ocr-pdfs-images/)
24. ocr \- npm search, accessed on April 24, 2025, [https://www.npmjs.com/search?q=ocr\&page=2](https://www.npmjs.com/search?q=ocr&page=2)
25. Best OCR Models for Text Recognition in Images \- Roboflow Blog, accessed on April 24, 2025, [https://blog.roboflow.com/best-ocr-models-text-recognition/](https://blog.roboflow.com/best-ocr-models-text-recognition/)
26. Our search for the best OCR tool in 2023, and what we found \- Source \- OpenNews, accessed on April 24, 2025, [https://source.opennews.org/articles/our-search-best-ocr-tool-2023/](https://source.opennews.org/articles/our-search-best-ocr-tool-2023/)
27. pomgui/pdf-tables-parser: Library to parse a pdf file and extract all the tables contained returning a json object. \- GitHub, accessed on April 24, 2025, [https://github.com/pomgui/pdf-tables-parser](https://github.com/pomgui/pdf-tables-parser)
28. Extracting tabular data from PDF files : r/typescript \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/typescript/comments/ze3b8c/extracting\_tabular\_data\_from\_pdf\_files/](https://www.reddit.com/r/typescript/comments/ze3b8c/extracting_tabular_data_from_pdf_files/)
29. Extract tables from PDF for RAG : r/LangChain \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1cn0z11/extract\_tables\_from\_pdf\_for\_rag/](https://www.reddit.com/r/LangChain/comments/1cn0z11/extract_tables_from_pdf_for_rag/)
30. PDF Table Extraction, the Definitive Guide (+ gmft release\!) : r/LangChain \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1dclawv/pdf\_table\_extraction\_the\_definitive\_guide\_gmft/](https://www.reddit.com/r/LangChain/comments/1dclawv/pdf_table_extraction_the_definitive_guide_gmft/)
31. Table extraction from PDF \- Unstructured, accessed on April 24, 2025, [https://docs.unstructured.io/examplecode/codesamples/apioss/table-extraction-from-pdf](https://docs.unstructured.io/examplecode/codesamples/apioss/table-extraction-from-pdf)
32. Best table parsers of pdf? : r/LangChain \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1fwt2cn/best\_table\_parsers\_of\_pdf/](https://www.reddit.com/r/LangChain/comments/1fwt2cn/best_table_parsers_of_pdf/)
33. opendatalab/PDF-Extract-Kit: A Comprehensive Toolkit for High-Quality PDF Content Extraction \- GitHub, accessed on April 24, 2025, [https://github.com/opendatalab/PDF-Extract-Kit](https://github.com/opendatalab/PDF-Extract-Kit)
34. Extract table from pdf and images online \- Docsumo, accessed on April 24, 2025, [https://www.docsumo.com/free-tools/extract-tables-from-pdf-images](https://www.docsumo.com/free-tools/extract-tables-from-pdf-images)
35. LLM model for table data \- Languages at Hugging Face, accessed on April 24, 2025, [https://discuss.huggingface.co/t/llm-model-for-table-data/44230](https://discuss.huggingface.co/t/llm-model-for-table-data/44230)
36. Structured Data Extraction | Phoenix \- Arize AI, accessed on April 24, 2025, [https://docs.arize.com/phoenix/cookbook/structured-data-extraction](https://docs.arize.com/phoenix/cookbook/structured-data-extraction)
37. Building a Trend Detection System with AI in TypeScript: A Step-by-Step Guide \- Firecrawl, accessed on April 24, 2025, [https://www.firecrawl.dev/blog/trend-finder-typescript](https://www.firecrawl.dev/blog/trend-finder-typescript)
38. Build a custom RAG AI agent in TypeScript and Jupyter \- Deno, accessed on April 24, 2025, [https://deno.com/blog/build-custom-rag-ai-agent](https://deno.com/blog/build-custom-rag-ai-agent)
39. Building a Clone of OpenAI's Deep Research with TypeScript and Firecrawl, accessed on April 24, 2025, [https://www.firecrawl.dev/blog/open-deep-research-explainer](https://www.firecrawl.dev/blog/open-deep-research-explainer)
40. Structured Data Extraction \- LlamaIndex, accessed on April 24, 2025, [https://docs.llamaindex.ai/en/stable/use\_cases/extraction/](https://docs.llamaindex.ai/en/stable/use_cases/extraction/)
41. Possible to write TypeScript package to call into local LLM and generate consistent output? : r/LLMDevs \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LLMDevs/comments/1ixlj2w/possible\_to\_write\_typescript\_package\_to\_call\_into/](https://www.reddit.com/r/LLMDevs/comments/1ixlj2w/possible_to_write_typescript_package_to_call_into/)
42. What model would you use to extract full pdf? : r/ollama \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/ollama/comments/1gc8je1/what\_model\_would\_you\_use\_to\_extract\_full\_pdf/](https://www.reddit.com/r/ollama/comments/1gc8je1/what_model_would_you_use_to_extract_full_pdf/)
43. instructor-ai/instructor: structured outputs for llms \- GitHub, accessed on April 24, 2025, [https://github.com/instructor-ai/instructor](https://github.com/instructor-ai/instructor)
44. Instructor (JS): Welcome To Instructor, accessed on April 24, 2025, [https://js.useinstructor.com/](https://js.useinstructor.com/)
45. Build a Local AI Chatbot with Ollama and JavaScript | Full Guide \- YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=qY2xYNJhB1A](https://www.youtube.com/watch?v=qY2xYNJhB1A)
46. Structured Output for Open Source and Local LLMS \- Instructor (JS), accessed on April 24, 2025, [https://js.useinstructor.com/blog/2024/03/07/open-source-local-structured-output-zod-json-openai/](https://js.useinstructor.com/blog/2024/03/07/open-source-local-structured-output-zod-json-openai/)
47. Build a 3D AI Teacher w/ Next.js, ChatGPT & Azure \- YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=\_bi4Ol0QEL4](https://www.youtube.com/watch?v=_bi4Ol0QEL4)
48. Building LLM Agents in JavaScript: A Comprehensive Guide \- Adyog, accessed on April 24, 2025, [https://blog.adyog.com/2024/09/11/building-llm-agents-in-javascript-a-comprehensive-guide/](https://blog.adyog.com/2024/09/11/building-llm-agents-in-javascript-a-comprehensive-guide/)
49. Harry-027/DocuMind: A document based RAG application \- GitHub, accessed on April 24, 2025, [https://github.com/Harry-027/DocuMind](https://github.com/Harry-027/DocuMind)
50. Building a Local LLM Rig: Need Advice on Components and Setup\! : r/LocalLLM \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LocalLLM/comments/1k5i84j/building\_a\_local\_llm\_rig\_need\_advice\_on/](https://www.reddit.com/r/LocalLLM/comments/1k5i84j/building_a_local_llm_rig_need_advice_on/)
51. DocuMind (RAG app using Ollama) \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/ollama/comments/1jqajhl/documind\_rag\_app\_using\_ollama/](https://www.reddit.com/r/ollama/comments/1jqajhl/documind_rag_app_using_ollama/)
52. Documind \- Chat with PDF AI, accessed on April 24, 2025, [https://www.documind.chat/](https://www.documind.chat/)
53. Show HN: Documind Open-source AI tool to turn documents into structured data, accessed on April 24, 2025, [https://news.ycombinator.com/item?id=42171311](https://news.ycombinator.com/item?id=42171311)
54. LM Studio \+ AnythingLLM: Process Local Documents with RAG Like a Pro\! \- YouTube, accessed on April 24, 2025, [https://m.youtube.com/watch?v=UG8uftJXcNs](https://m.youtube.com/watch?v=UG8uftJXcNs)
55. Extract Entities Using Azure OpenAI Structured Outputs Mode | Microsoft Learn, accessed on April 24, 2025, [https://learn.microsoft.com/en-us/azure/developer/ai/how-to/extract-entities-using-structured-outputs](https://learn.microsoft.com/en-us/azure/developer/ai/how-to/extract-entities-using-structured-outputs)
56. Structured Data Extraction from Unstructured Text Python LLMs Ollama Pydantic Llama 3.2 Granite 3.2 \- IBM TechXchange Community, accessed on April 24, 2025, [https://community.ibm.com/community/user/blogs/nickolus-plowden/2025/04/10/structured-data-extraction-from-unstructured-text](https://community.ibm.com/community/user/blogs/nickolus-plowden/2025/04/10/structured-data-extraction-from-unstructured-text)
57. Structured data extraction from unstructured content using LLM schemas, accessed on April 24, 2025, [https://simonwillison.net/2025/Feb/28/llm-schemas/](https://simonwillison.net/2025/Feb/28/llm-schemas/)
58. Open-Source Document Extraction: Unstract, DeepSeek & PostgreSQL, accessed on April 24, 2025, [https://unstract.com/blog/open-source-document-data-extraction-with-unstract-deepseek/](https://unstract.com/blog/open-source-document-data-extraction-with-unstract-deepseek/)
59. How to Convert Unstructured Data to Structured Data Using AI \- Multimodal.dev, accessed on April 24, 2025, [https://www.multimodal.dev/post/how-to-convert-unstructured-data-to-structured-data](https://www.multimodal.dev/post/how-to-convert-unstructured-data-to-structured-data)
60. Extracting unstructured text and images into database tables with GPT-4 Turbo and Datasette Extract \- YouTube, accessed on April 24, 2025, [https://www.youtube.com/watch?v=g3NtJatmQR0](https://www.youtube.com/watch?v=g3NtJatmQR0)
61. Unstructured-IO/unstructured: Open source libraries and APIs to build custom preprocessing pipelines for labeling, training, or production machine learning pipelines. \- GitHub, accessed on April 24, 2025, [https://github.com/Unstructured-IO/unstructured](https://github.com/Unstructured-IO/unstructured)
62. Text-to-Table: Extracting Unstructured Data from a Large Legal Text : r/LangChain \- Reddit, accessed on April 24, 2025, [https://www.reddit.com/r/LangChain/comments/1gl3pl7/texttotable\_extracting\_unstructured\_data\_from\_a/](https://www.reddit.com/r/LangChain/comments/1gl3pl7/texttotable_extracting_unstructured_data_from_a/)
63. Typescript convert an array to JSON \- Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/48101176/typescript-convert-an-array-to-json](https://stackoverflow.com/questions/48101176/typescript-convert-an-array-to-json)
64. How to Convert an Object to a JSON String in Typescript \- GeeksforGeeks, accessed on April 24, 2025, [https://www.geeksforgeeks.org/how-to-convert-an-object-to-a-json-string-in-typescript/](https://www.geeksforgeeks.org/how-to-convert-an-object-to-a-json-string-in-typescript/)
65. JSON.stringify() \- JavaScript \- MDN Web Docs, accessed on April 24, 2025, [https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global\_Objects/JSON/stringify](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/JSON/stringify)
66. How do I convert JavaScript array to JSON? \- ReqBin, accessed on April 24, 2025, [https://reqbin.com/code/javascript/n2ek7onb/javascript-array-to-json-example](https://reqbin.com/code/javascript/n2ek7onb/javascript-array-to-json-example)
67. Converting an Array to a JSON Object in JavaScript \- Boot.dev Blog, accessed on April 24, 2025, [https://blog.boot.dev/javascript/converting-an-array-to-json-object-in-javascript/](https://blog.boot.dev/javascript/converting-an-array-to-json-object-in-javascript/)
68. How to convert array to JSON in typescript \- javascript \- Stack Overflow, accessed on April 24, 2025, [https://stackoverflow.com/questions/73301732/how-to-convert-array-to-json-in-typescript](https://stackoverflow.com/questions/73301732/how-to-convert-array-to-json-in-typescript)
69. How to convert an array to a JSON object \- Codedamn, accessed on April 24, 2025, [https://codedamn.com/news/javascript/how-to-convert-an-array-to-a-json-object](https://codedamn.com/news/javascript/how-to-convert-an-array-to-a-json-object)

View File

@ -2,12 +2,12 @@ import { z } from 'zod';
// Define the base shape for arguments
export const ConvertCommandArgsSchema = z.object({
input: z.string(),
output: z.string().optional(),
dpi: z.number().int().positive().default(300),
format: z.enum(['png', 'jpg']).default('png'),
startPage: z.number().int().positive().optional(),
endPage: z.number().int().positive().optional()
input: z.string().describe('Path to the input PDF file'),
output: z.string().describe('Output path template (e.g., output/page_{PAGE}.png)').optional(),
dpi: z.number().int().positive().default(300).describe('Resolution for the output images'),
format: z.enum(['png', 'jpg']).default('png').describe('Output image format'),
startPage: z.number().int().positive().describe('First page to convert (1-based index)').optional(),
endPage: z.number().int().positive().describe('Last page to convert (1-based index)').optional()
});
// Add refinements, transformations, and catchall for final validation/parsing