email search - puppeteer

This commit is contained in:
babayaga 2026-01-21 16:44:04 +01:00
parent b0c7aeab32
commit c90809c857
8 changed files with 700 additions and 247 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,4 +1,4 @@
import type { launch, WaitForOptions, Page, Browser, PuppeteerLaunchOptions } from "puppeteer";
import type { WaitForOptions, Page, Browser } from "puppeteer";
import { Document } from "@langchain/core/documents";
import { BaseDocumentLoader, DocumentLoader } from "langchain/document_loaders/base";
export { Page, Browser };
@ -13,7 +13,7 @@ export type PuppeteerGotoOptions = WaitForOptions & {
*/
export type PuppeteerEvaluate = (page: Page, browser: Browser) => Promise<string>;
export type PuppeteerWebBaseLoaderOptions = {
launchOptions?: PuppeteerLaunchOptions;
launchOptions?: any;
gotoOptions?: PuppeteerGotoOptions;
evaluate?: PuppeteerEvaluate;
};
@ -54,12 +54,4 @@ export declare class PuppeteerWebBaseLoader extends BaseDocumentLoader implement
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
*/
screenshot(): Promise<Document>;
/**
* Static method that imports the necessary Puppeteer modules. It returns
* a Promise that resolves to an object containing the imported modules.
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
*/
static imports(): Promise<{
launch: typeof launch;
}>;
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@ -41,7 +41,7 @@
"p-limit": "^7.2.0",
"p-map": "^4.0.0",
"publish": "^0.6.0",
"puppeteer": "^19.11.1",
"puppeteer": "^24.35.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"serpapi": "^1.1.1",

View File

@ -0,0 +1,28 @@
import puppeteer from 'puppeteer';
/**
 * Smoke test for the Puppeteer installation.
 *
 * Launches a visible (non-headless) browser, opens https://example.com,
 * prints the page title and closes the browser again. Every step is logged
 * with a `[Test]` prefix. On failure the error is logged and the process
 * exit code is set to 1.
 */
(async () => {
  console.log('[Test] Launching browser...');
  // Declared outside the try so the finally block can close the browser
  // even when a later step (newPage / goto / title) throws.
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false
    });
    console.log('[Test] Browser launched successfully.');
    const page = await browser.newPage();
    console.log('[Test] New page created.');
    console.log('[Test] Navigating to example.com...');
    await page.goto('https://example.com');
    console.log('[Test] Navigation successful.');
    const title = await page.title();
    console.log(`[Test] Page title: ${title}`);
  } catch (error) {
    console.error('[Test] Error:', error);
    // Use exitCode rather than process.exit(1): exiting immediately would
    // skip the cleanup below and leave the launched browser running.
    process.exitCode = 1;
  } finally {
    if (browser) {
      try {
        await browser.close();
        console.log('[Test] Browser closed.');
      } catch (closeError) {
        // A failed close is still a failed test run; report it the same way.
        console.error('[Test] Error:', closeError);
        process.exitCode = 1;
      }
    }
  }
})();

View File

@ -195,11 +195,11 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
}).slice(0, maxPages)
await pMap(pagesToSearch, async (page: Page) => {
logger.debug(`[findEmailEach] Processing page: ${page.url}`);
if (opts.checkCancelled && await opts.checkCancelled()) {
// logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
return
}
if (emails.length >= abortAfter) {
return
}
@ -215,6 +215,7 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
emails.push(...pageEmails)
}
page.status = 'SEARCHED_EMAIL'
logger.debug(`[findEmailEach] Finished page: ${page.url}`);
} catch (error) {
if (error.message === 'CancelledByUser') {
throw error;
@ -225,6 +226,7 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
}
if (onProgress) {
logger.info(`[findEmailEach] Progress for ${location.title}`);
await onProgress(page)
}
}, { concurrency, stopOnError: false })

View File

@ -3,8 +3,8 @@ import type {
WaitForOptions,
Page,
Browser,
PuppeteerLaunchOptions,
} from "puppeteer"
import puppeteer from 'puppeteer';
import { Document } from "@langchain/core/documents"
import { BaseDocumentLoader, DocumentLoader } from "langchain/document_loaders/base"
@ -27,7 +27,7 @@ export type PuppeteerEvaluate = (
) => Promise<string>;
export type PuppeteerWebBaseLoaderOptions = {
launchOptions?: PuppeteerLaunchOptions;
launchOptions?: any;
gotoOptions?: PuppeteerGotoOptions;
evaluate?: PuppeteerEvaluate;
};
@ -49,6 +49,8 @@ export type PuppeteerWebBaseLoaderOptions = {
* const screenshot = await loader.screenshot();
* ```
*/
import { logger } from '../index.js'
// Singleton browser promise to prevent race conditions
let browserPromise: Promise<Browser> | null = null;
let idleTimer: NodeJS.Timeout | null = null;
@ -59,7 +61,7 @@ const resetIdleTimer = () => {
if (idleTimer) clearTimeout(idleTimer);
idleTimer = setTimeout(async () => {
if (browserPromise) {
console.log(`[Puppeteer] Browser idle timeout (${IDLE_TIMEOUT_SECONDS}s) reached, closing browser`);
logger.info(`[Puppeteer] Browser idle timeout (${IDLE_TIMEOUT_SECONDS}s) reached, closing browser`);
const browser = await browserPromise;
await browser.close();
browserPromise = null;
@ -71,15 +73,39 @@ const launchBrowser = async (options?: PuppeteerWebBaseLoaderOptions): Promise<B
resetIdleTimer();
if (browserPromise) return browserPromise;
logger.info('[Puppeteer] Launching browser...');
browserPromise = (async () => {
const { launch } = await PuppeteerWebBaseLoader.imports();
const b = await launch({
headless: process.env.EMAIL_SEARCH_HEADLESS === 'false' ? false : true,
try {
// Static import used above
logger.debug('[Puppeteer] Imports resolved. Starting launch...');
const { launch } = puppeteer;
const headlessEnv = process.env.EMAIL_SEARCH_HEADLESS;
// Use 'new' for headless:true to avoid deprecation warning. Respect 'false' for debugging.
const headlessMode = headlessEnv === 'false' ? false : 'new';
// Wrap launch in a timeout to prevent infinite hangs
const launchPromise = launch({
headless: headlessMode as any, // Use user config or default to 'new'
defaultViewport: null,
args: ['--no-sandbox', '--disable-setuid-sandbox'], // CRITICAL: Prevent OS-level hangs
ignoreDefaultArgs: ["--disable-extensions"],
...options?.launchOptions,
});
const timeoutPromise = new Promise<never>((_, reject) =>
setTimeout(() => reject(new Error('Browser Launch Timeout (30s)')), 30000)
);
const b = await Promise.race([launchPromise, timeoutPromise]);
logger.info('[Puppeteer] Browser launched successfully');
return b;
} catch (e) {
logger.error('[Puppeteer] Failed to launch browser', e);
browserPromise = null; // CRITICAL: Reset promise so next attempt can retry
throw e;
}
})();
return browserPromise;
@ -88,7 +114,9 @@ const launchBrowser = async (options?: PuppeteerWebBaseLoaderOptions): Promise<B
/**
 * Exposes the module-level singleton browser promise.
 *
 * @returns The current `browserPromise`, or `null` when no launch is in
 *          flight (it is reset to `null` after an idle-timeout close and
 *          after a failed launch).
 */
export const getBrowser = () => {
  return browserPromise;
};
/**
 * Opens a brand-new page on the given browser.
 *
 * A fresh page is created on every call so that concurrent loads never
 * share a page. Debug log lines are emitted before and after creation.
 *
 * @param browser Browser instance on which to open the page.
 * @returns The newly created page.
 */
export const getPage = async (browser: Browser) => {
  logger.debug('[Puppeteer] Creating new page');
  const freshPage = await browser.newPage();
  logger.debug('[Puppeteer] New page created');
  return freshPage;
};
@ -114,26 +142,29 @@ export class PuppeteerWebBaseLoader
// PuppeteerWebBaseLoader.browser = browser // Static property usage is deprecated/incorrect with this pattern
return limit(async () => {
console.log(`[Puppeteer] Entering limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
logger.debug(`[Puppeteer] Entering limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
try {
const page = await getPage(browser)
try {
logger.debug(`[Puppeteer] Navigating to ${url}`);
await page.goto(url, {
timeout: 5000,
waitUntil: "domcontentloaded",
...options?.gotoOptions,
});
logger.debug(`[Puppeteer] Navigated to ${url}, evaluating...`);
const bodyHTML = options?.evaluate
? await options?.evaluate(page, browser)
: await page.evaluate(() => document.body.innerHTML);
logger.debug(`[Puppeteer] Evaluated ${url}`);
return bodyHTML
} finally {
await page.close()
}
} finally {
console.log(`[Puppeteer] Exiting limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
logger.debug(`[Puppeteer] Exiting limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
resetIdleTimer();
}
})
@ -173,10 +204,10 @@ export class PuppeteerWebBaseLoader
url: string,
options?: PuppeteerWebBaseLoaderOptions
): Promise<Document> {
const { launch } = await PuppeteerWebBaseLoader.imports();
const { launch } = puppeteer;
const browser = await launch({
headless: true,
headless: "new",
defaultViewport: null,
args: ["--no-screenshot"],
ignoreDefaultArgs: ["--disable-extensions"],
@ -190,7 +221,7 @@ export class PuppeteerWebBaseLoader
...options?.gotoOptions,
});
const screenshot = await page.screenshot();
const base64 = screenshot.toString("base64");
const base64 = screenshot.toString();
const metadata = { source: url };
return new Document({ pageContent: base64, metadata });
}
@ -205,23 +236,5 @@ export class PuppeteerWebBaseLoader
return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);
}
/**
* Static method that imports the necessary Puppeteer modules. It returns
* a Promise that resolves to an object containing the imported modules.
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
*/
static async imports(): Promise<{
launch: typeof launch;
}> {
try {
// eslint-disable-next-line import/no-extraneous-dependencies
const { launch } = await import("puppeteer")
return { launch };
} catch (e) {
console.error(e);
throw new Error(
"Please install puppeteer as a dependency with, e.g. `yarn add puppeteer`"
);
}
}
}