email cancel: add cancellation support (checkCancelled) to email scraping

This commit is contained in:
babayaga 2025-12-26 01:17:00 +01:00
parent 8ecd654eda
commit 16a541127c
7 changed files with 114 additions and 23 deletions

View File

@ -8,7 +8,7 @@ export declare class HtmlToTextTransformer extends MappingDocumentTransformer {
}>>;
}
export declare const cheerioLoader: (url: string) => Promise<unknown>;
export declare const puppeteerLoader: (url: string, headless: boolean, location: LocalResult) => Promise<unknown>;
export declare const puppeteerLoader: (url: string, headless: boolean, location: LocalResult, checkCancelled?: () => Promise<boolean>) => Promise<unknown>;
export declare const findEMail: (question: string, url: string, opts: {
headless?: boolean;
searchFrom?: string;
@ -18,5 +18,6 @@ export declare const findEmailEach: (location: LocalResult, opts: {
headless?: boolean;
searchFrom?: string;
abortAfter?: number;
checkCancelled?: () => Promise<boolean>;
[key: string]: any;
}, onProgress?: (page: Page) => Promise<void>) => Promise<string[]>;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -9,7 +9,7 @@ import { isValidUrl } from './html.js'
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
const mailtoRegex = /^mailto:([^\s@]+@[^\s@]+\.[^\s@]+)$/i
import { PuppeteerWebBaseLoader as loader } from './pupeteer.js'
import { PuppeteerWebBaseLoader as loader, getBrowser } from './pupeteer.js'
export class HtmlToTextTransformer extends MappingDocumentTransformer {
static lc_name() {
@ -44,10 +44,16 @@ export const cheerioLoader = async (url: string) => {
}
export const puppeteerLoader = async (url: string, headless: boolean, location: LocalResult) => {
export const puppeteerLoader = async (url: string, headless: boolean, location: LocalResult, checkCancelled?: () => Promise<boolean>) => {
if (isValidUrl(url) === false || url.indexOf('mailto') !== -1) {
return []
}
if (checkCancelled && await checkCancelled()) {
logger.info('Cancelled before loading ' + url);
return [];
}
let loaderWithOptions
try { // Function to detect a valid URL
loaderWithOptions = new loader(
@ -63,20 +69,51 @@ export const puppeteerLoader = async (url: string, headless: boolean, location:
waitUntil: "networkidle0",
},
async evaluate(page, browser) {
if (checkCancelled && await checkCancelled()) {
debugger
const pid = browser.process()?.pid;
logger.warn(`Killing browser process ${pid} due to cancellation`);
await browser.close();
throw new Error('CancelledByUser');
}
const result = await page.evaluate(() => document.body.innerHTML)
// await browser.close()
return result
}
}
)
const docs = await loaderWithOptions.load()
// Race load against cancellation
const loadPromise = loaderWithOptions.load();
const cancelPromise = new Promise<never>(async (_, reject) => {
if (!checkCancelled) return;
// Poll for cancellation
while (true) {
await new Promise(r => setTimeout(r, 1000));
if (await checkCancelled()) {
const browser = await getBrowser();
if (browser) {
const pid = browser.process()?.pid;
logger.info(`Killing browser process ${pid} due to cancellation`);
await browser.close();
}
reject(new Error('CancelledByUser'));
break;
}
}
});
const docs = await Promise.race([loadPromise, cancelPromise]);
const splitter = RecursiveCharacterTextSplitter.fromLanguage("html")
const transformer = new HtmlToTextTransformer()
const sequence = splitter.pipe(transformer as any)
const ret = await sequence.invoke(docs)
return ret
} catch (error) {
logger.warn('Error loading page: ' + url, error.message)
if (error instanceof Error && error.message === 'CancelledByUser') {
throw error;
}
logger.warn('Error loading page: ' + url, error instanceof Error ? error.message : String(error))
location.rejected = true
// loader.browser && loader.browser.close()
@ -111,7 +148,7 @@ export const findEMail = async (question: string, url: string, opts: { headless?
return false
}
let pageUrl = url
let docs = await puppeteerLoader(pageUrl, opts.headless, location) as any
let docs = await puppeteerLoader(pageUrl, opts.headless, location, opts.checkCancelled) as any
let emails: string[] = []
docs.forEach((d: any) => {
if (d.pageContent && d.pageContent.indexOf('@') !== -1) {
@ -131,7 +168,7 @@ export const findEMail = async (question: string, url: string, opts: { headless?
}
export const findEmailEach = async (location: LocalResult, opts: { headless?: boolean, searchFrom?: string, abortAfter?: number, [key: string]: any }, onProgress?: (page: Page) => Promise<void>) => {
export const findEmailEach = async (location: LocalResult, opts: { headless?: boolean, searchFrom?: string, abortAfter?: number, checkCancelled?: () => Promise<boolean>, [key: string]: any }, onProgress?: (page: Page) => Promise<void>) => {
if (!location.meta || !location.meta.pages) {
return []
}
@ -140,6 +177,12 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
const abortAfter = opts.abortAfter ?? 1
for (const page of location.meta.pages) {
if (opts.checkCancelled && await opts.checkCancelled()) {
debugger
logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
break;
}
if (emails.length >= abortAfter) {
break
}
@ -157,6 +200,10 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
}
page.status = 'SEARCHED_EMAIL'
} catch (error) {
if (error.message === 'CancelledByUser') {
debugger
throw error;
}
page.status = 'FAILED'
page.error = error.message
logger.error(`Error scraping email from ${page.url}:`, error)

View File

@ -33,7 +33,7 @@ export const zodSchemaBase = () =>
google_domain: z.string().default('google.com'),
headless: z.boolean().default(true).describe('Headless mode'),
language: z.string().default('en'),
limit: z.number().default(5),
limit: z.number().default(250),
logLevel: z.string().default('info'),
meta: z.boolean().default(false),
searchCache: z.boolean().default(false).describe('Use search cache'),
@ -43,7 +43,7 @@ export const zodSchemaBase = () =>
searchFrom: z.string().optional().default('barcelona, spain'),
source: z.union([z.string(), z.record(z.string(), z.array(z.string()))]).optional(),
type: z.string().optional().default('search'),
zoom: z.number().optional().default(13),
zoom: z.number().optional().default(12),
//index: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.json').describe('Index file'),
//store: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.db').describe('Index store'),
index: z.string().optional().describe('Index file'),

View File

@ -200,7 +200,7 @@ export const searchGoogleMap = async (
results = results.filter((r) => r.geo.city.toLowerCase() === opts.filterCity.toLowerCase())
}
if (opts.filterCountry) {
results = results.filter((r) => r.geo.countryName.toLowerCase() === opts.filterCountry.toLowerCase())
// results = results.filter((r) => r.geo.countryName.toLowerCase() === opts.filterCountry.toLowerCase())
}
if (opts.filterContinent) {
results = results.filter((r) => r.geo.continent.toLowerCase() === opts.filterContinent.toLowerCase())
@ -215,7 +215,7 @@ export const searchGoogleMap = async (
return index[r.title] == null || !index[r.title].geo || !index[r.title].meta
})
logger.info(
`found ${newResults.length} new items for "${query}" from "${params.searchFrom}" | ${beforeCached} total before cache filtering`,
`found ${newResults.length} new items for "${query}" (Zoom: ${opts.zoom} | Limit: ${opts.limit}) from "${params.searchFrom}" | ${beforeCached} total before cache filtering`,
)
const processedResults = newResults.slice(0, opts.limit)
await enrichResults(processedResults, index, opts)