email search - puppeteer
This commit is contained in:
parent
b0c7aeab32
commit
c90809c857
File diff suppressed because one or more lines are too long
12
packages/search/dist-in/lib/pupeteer.d.ts
vendored
12
packages/search/dist-in/lib/pupeteer.d.ts
vendored
@ -1,4 +1,4 @@
|
||||
import type { launch, WaitForOptions, Page, Browser, PuppeteerLaunchOptions } from "puppeteer";
|
||||
import type { WaitForOptions, Page, Browser } from "puppeteer";
|
||||
import { Document } from "@langchain/core/documents";
|
||||
import { BaseDocumentLoader, DocumentLoader } from "langchain/document_loaders/base";
|
||||
export { Page, Browser };
|
||||
@ -13,7 +13,7 @@ export type PuppeteerGotoOptions = WaitForOptions & {
|
||||
*/
|
||||
export type PuppeteerEvaluate = (page: Page, browser: Browser) => Promise<string>;
|
||||
export type PuppeteerWebBaseLoaderOptions = {
|
||||
launchOptions?: PuppeteerLaunchOptions;
|
||||
launchOptions?: any;
|
||||
gotoOptions?: PuppeteerGotoOptions;
|
||||
evaluate?: PuppeteerEvaluate;
|
||||
};
|
||||
@ -54,12 +54,4 @@ export declare class PuppeteerWebBaseLoader extends BaseDocumentLoader implement
|
||||
* @returns {Promise<Document>} A document object containing the screenshot of the page encoded in base64.
|
||||
*/
|
||||
screenshot(): Promise<Document>;
|
||||
/**
|
||||
* Static method that imports the necessary Puppeteer modules. It returns
|
||||
* a Promise that resolves to an object containing the imported modules.
|
||||
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
|
||||
*/
|
||||
static imports(): Promise<{
|
||||
launch: typeof launch;
|
||||
}>;
|
||||
}
|
||||
|
||||
File diff suppressed because one or more lines are too long
736
packages/search/package-lock.json
generated
736
packages/search/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -41,7 +41,7 @@
|
||||
"p-limit": "^7.2.0",
|
||||
"p-map": "^4.0.0",
|
||||
"publish": "^0.6.0",
|
||||
"puppeteer": "^19.11.1",
|
||||
"puppeteer": "^24.35.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"serpapi": "^1.1.1",
|
||||
|
||||
28
packages/search/scripts/test-puppeteer.js
Normal file
28
packages/search/scripts/test-puppeteer.js
Normal file
@ -0,0 +1,28 @@
|
||||
|
||||
import puppeteer from 'puppeteer';
|
||||
|
||||
/**
 * Smoke test for the local Puppeteer installation.
 *
 * Launches a headed browser, opens a new page, navigates to example.com and
 * logs the page title. Every step is logged with a `[Test]` prefix so it is
 * obvious which step failed.
 *
 * The browser is closed in `finally`, so a failure after a successful launch
 * (e.g. during navigation) does not leave a browser process behind. The
 * process exits with code 1 if any step, including closing the browser, fails.
 */
(async () => {
    console.log('[Test] Launching browser...');

    // Stays undefined when launch itself fails, so `finally` knows there is nothing to close.
    let browser;
    let failed = false;

    try {
        browser = await puppeteer.launch({
            headless: false
        });
        console.log('[Test] Browser launched successfully.');

        const page = await browser.newPage();
        console.log('[Test] New page created.');

        console.log('[Test] Navigating to example.com...');
        await page.goto('https://example.com');
        console.log('[Test] Navigation successful.');

        const title = await page.title();
        console.log(`[Test] Page title: ${title}`);
    } catch (error) {
        console.error('[Test] Error:', error);
        failed = true;
    } finally {
        if (browser) {
            try {
                await browser.close();
                console.log('[Test] Browser closed.');
            } catch (closeError) {
                // A failed shutdown is still a failed run; report it instead of hiding it.
                console.error('[Test] Error:', closeError);
                failed = true;
            }
        }
    }

    // Exit only after cleanup has been attempted, so the failure path cannot skip browser.close().
    if (failed) {
        process.exit(1);
    }
})();
|
||||
@ -195,11 +195,11 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
|
||||
}).slice(0, maxPages)
|
||||
|
||||
await pMap(pagesToSearch, async (page: Page) => {
|
||||
logger.debug(`[findEmailEach] Processing page: ${page.url}`);
|
||||
if (opts.checkCancelled && await opts.checkCancelled()) {
|
||||
// logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
|
||||
logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
|
||||
return
|
||||
}
|
||||
|
||||
if (emails.length >= abortAfter) {
|
||||
return
|
||||
}
|
||||
@ -215,6 +215,7 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
|
||||
emails.push(...pageEmails)
|
||||
}
|
||||
page.status = 'SEARCHED_EMAIL'
|
||||
logger.debug(`[findEmailEach] Finished page: ${page.url}`);
|
||||
} catch (error) {
|
||||
if (error.message === 'CancelledByUser') {
|
||||
throw error;
|
||||
@ -225,6 +226,7 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
|
||||
}
|
||||
|
||||
if (onProgress) {
|
||||
logger.info(`[findEmailEach] Progress for ${location.title}`);
|
||||
await onProgress(page)
|
||||
}
|
||||
}, { concurrency, stopOnError: false })
|
||||
|
||||
@ -3,8 +3,8 @@ import type {
|
||||
WaitForOptions,
|
||||
Page,
|
||||
Browser,
|
||||
PuppeteerLaunchOptions,
|
||||
} from "puppeteer"
|
||||
import puppeteer from 'puppeteer';
|
||||
|
||||
import { Document } from "@langchain/core/documents"
|
||||
import { BaseDocumentLoader, DocumentLoader } from "langchain/document_loaders/base"
|
||||
@ -27,7 +27,7 @@ export type PuppeteerEvaluate = (
|
||||
) => Promise<string>;
|
||||
|
||||
export type PuppeteerWebBaseLoaderOptions = {
|
||||
launchOptions?: PuppeteerLaunchOptions;
|
||||
launchOptions?: any;
|
||||
gotoOptions?: PuppeteerGotoOptions;
|
||||
evaluate?: PuppeteerEvaluate;
|
||||
};
|
||||
@ -49,6 +49,8 @@ export type PuppeteerWebBaseLoaderOptions = {
|
||||
* const screenshot = await loader.screenshot();
|
||||
* ```
|
||||
*/
|
||||
import { logger } from '../index.js'
|
||||
|
||||
// Singleton browser promise to prevent race conditions
|
||||
let browserPromise: Promise<Browser> | null = null;
|
||||
let idleTimer: NodeJS.Timeout | null = null;
|
||||
@ -59,7 +61,7 @@ const resetIdleTimer = () => {
|
||||
if (idleTimer) clearTimeout(idleTimer);
|
||||
idleTimer = setTimeout(async () => {
|
||||
if (browserPromise) {
|
||||
console.log(`[Puppeteer] Browser idle timeout (${IDLE_TIMEOUT_SECONDS}s) reached, closing browser`);
|
||||
logger.info(`[Puppeteer] Browser idle timeout (${IDLE_TIMEOUT_SECONDS}s) reached, closing browser`);
|
||||
const browser = await browserPromise;
|
||||
await browser.close();
|
||||
browserPromise = null;
|
||||
@ -71,15 +73,39 @@ const launchBrowser = async (options?: PuppeteerWebBaseLoaderOptions): Promise<B
|
||||
resetIdleTimer();
|
||||
if (browserPromise) return browserPromise;
|
||||
|
||||
logger.info('[Puppeteer] Launching browser...');
|
||||
browserPromise = (async () => {
|
||||
const { launch } = await PuppeteerWebBaseLoader.imports();
|
||||
const b = await launch({
|
||||
headless: process.env.EMAIL_SEARCH_HEADLESS === 'false' ? false : true,
|
||||
defaultViewport: null,
|
||||
ignoreDefaultArgs: ["--disable-extensions"],
|
||||
...options?.launchOptions,
|
||||
});
|
||||
return b;
|
||||
try {
|
||||
// Static import used above
|
||||
logger.debug('[Puppeteer] Imports resolved. Starting launch...');
|
||||
const { launch } = puppeteer;
|
||||
|
||||
const headlessEnv = process.env.EMAIL_SEARCH_HEADLESS;
|
||||
// Use 'new' for headless:true to avoid deprecation warning. Respect 'false' for debugging.
|
||||
const headlessMode = headlessEnv === 'false' ? false : 'new';
|
||||
|
||||
// Wrap launch in a timeout to prevent infinite hangs
|
||||
const launchPromise = launch({
|
||||
headless: headlessMode as any, // Use user config or default to 'new'
|
||||
defaultViewport: null,
|
||||
args: ['--no-sandbox', '--disable-setuid-sandbox'], // CRITICAL: Prevent OS-level hangs
|
||||
ignoreDefaultArgs: ["--disable-extensions"],
|
||||
...options?.launchOptions,
|
||||
});
|
||||
|
||||
const timeoutPromise = new Promise<never>((_, reject) =>
|
||||
setTimeout(() => reject(new Error('Browser Launch Timeout (30s)')), 30000)
|
||||
);
|
||||
|
||||
const b = await Promise.race([launchPromise, timeoutPromise]);
|
||||
|
||||
logger.info('[Puppeteer] Browser launched successfully');
|
||||
return b;
|
||||
} catch (e) {
|
||||
logger.error('[Puppeteer] Failed to launch browser', e);
|
||||
browserPromise = null; // CRITICAL: Reset promise so next attempt can retry
|
||||
throw e;
|
||||
}
|
||||
})();
|
||||
|
||||
return browserPromise;
|
||||
@ -88,7 +114,9 @@ const launchBrowser = async (options?: PuppeteerWebBaseLoaderOptions): Promise<B
|
||||
export const getBrowser = () => browserPromise;
|
||||
export const getPage = async (browser: Browser) => {
|
||||
// Always create a new page for concurrency
|
||||
logger.debug('[Puppeteer] Creating new page');
|
||||
const page = await browser.newPage()
|
||||
logger.debug('[Puppeteer] New page created');
|
||||
return page
|
||||
}
|
||||
|
||||
@ -114,26 +142,29 @@ export class PuppeteerWebBaseLoader
|
||||
// PuppeteerWebBaseLoader.browser = browser // Static property usage is deprecated/incorrect with this pattern
|
||||
|
||||
return limit(async () => {
|
||||
console.log(`[Puppeteer] Entering limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
|
||||
logger.debug(`[Puppeteer] Entering limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
|
||||
try {
|
||||
const page = await getPage(browser)
|
||||
try {
|
||||
logger.debug(`[Puppeteer] Navigating to ${url}`);
|
||||
await page.goto(url, {
|
||||
timeout: 5000,
|
||||
waitUntil: "domcontentloaded",
|
||||
...options?.gotoOptions,
|
||||
});
|
||||
logger.debug(`[Puppeteer] Navigated to ${url}, evaluating...`);
|
||||
|
||||
const bodyHTML = options?.evaluate
|
||||
? await options?.evaluate(page, browser)
|
||||
: await page.evaluate(() => document.body.innerHTML);
|
||||
|
||||
logger.debug(`[Puppeteer] Evaluated ${url}`);
|
||||
return bodyHTML
|
||||
} finally {
|
||||
await page.close()
|
||||
}
|
||||
} finally {
|
||||
console.log(`[Puppeteer] Exiting limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
|
||||
logger.debug(`[Puppeteer] Exiting limit (Active: ${limit.activeCount}, Pending: ${limit.pendingCount}) for ${url}`);
|
||||
resetIdleTimer();
|
||||
}
|
||||
})
|
||||
@ -173,10 +204,10 @@ export class PuppeteerWebBaseLoader
|
||||
url: string,
|
||||
options?: PuppeteerWebBaseLoaderOptions
|
||||
): Promise<Document> {
|
||||
const { launch } = await PuppeteerWebBaseLoader.imports();
|
||||
const { launch } = puppeteer;
|
||||
|
||||
const browser = await launch({
|
||||
headless: true,
|
||||
headless: "new",
|
||||
defaultViewport: null,
|
||||
args: ["--no-screenshot"],
|
||||
ignoreDefaultArgs: ["--disable-extensions"],
|
||||
@ -190,7 +221,7 @@ export class PuppeteerWebBaseLoader
|
||||
...options?.gotoOptions,
|
||||
});
|
||||
const screenshot = await page.screenshot();
|
||||
const base64 = screenshot.toString("base64");
|
||||
const base64 = screenshot.toString();
|
||||
const metadata = { source: url };
|
||||
return new Document({ pageContent: base64, metadata });
|
||||
}
|
||||
@ -205,23 +236,5 @@ export class PuppeteerWebBaseLoader
|
||||
return PuppeteerWebBaseLoader._screenshot(this.webPath, this.options);
|
||||
}
|
||||
|
||||
/**
|
||||
* Static method that imports the necessary Puppeteer modules. It returns
|
||||
* a Promise that resolves to an object containing the imported modules.
|
||||
* @returns Promise that resolves to an object containing the imported Puppeteer modules.
|
||||
*/
|
||||
static async imports(): Promise<{
|
||||
launch: typeof launch;
|
||||
}> {
|
||||
try {
|
||||
// eslint-disable-next-line import/no-extraneous-dependencies
|
||||
const { launch } = await import("puppeteer")
|
||||
return { launch };
|
||||
} catch (e) {
|
||||
console.error(e);
|
||||
throw new Error(
|
||||
"Please install puppeteer as a dependency with, e.g. `yarn add puppeteer`"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user