email cancel
This commit is contained in:
parent
8ecd654eda
commit
16a541127c
3
packages/search/dist-in/lib/email.d.ts
vendored
3
packages/search/dist-in/lib/email.d.ts
vendored
@ -8,7 +8,7 @@ export declare class HtmlToTextTransformer extends MappingDocumentTransformer {
|
||||
}>>;
|
||||
}
|
||||
export declare const cheerioLoader: (url: string) => Promise<unknown>;
|
||||
export declare const puppeteerLoader: (url: string, headless: boolean, location: LocalResult) => Promise<unknown>;
|
||||
export declare const puppeteerLoader: (url: string, headless: boolean, location: LocalResult, checkCancelled?: () => Promise<boolean>) => Promise<unknown>;
|
||||
export declare const findEMail: (question: string, url: string, opts: {
|
||||
headless?: boolean;
|
||||
searchFrom?: string;
|
||||
@ -18,5 +18,6 @@ export declare const findEmailEach: (location: LocalResult, opts: {
|
||||
headless?: boolean;
|
||||
searchFrom?: string;
|
||||
abortAfter?: number;
|
||||
checkCancelled?: () => Promise<boolean>;
|
||||
[key: string]: any;
|
||||
}, onProgress?: (page: Page) => Promise<void>) => Promise<string[]>;
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -9,7 +9,7 @@ import { isValidUrl } from './html.js'
|
||||
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
|
||||
const mailtoRegex = /^mailto:([^\s@]+@[^\s@]+\.[^\s@]+)$/i
|
||||
|
||||
import { PuppeteerWebBaseLoader as loader } from './pupeteer.js'
|
||||
import { PuppeteerWebBaseLoader as loader, getBrowser } from './pupeteer.js'
|
||||
|
||||
export class HtmlToTextTransformer extends MappingDocumentTransformer {
|
||||
static lc_name() {
|
||||
@ -44,10 +44,16 @@ export const cheerioLoader = async (url: string) => {
|
||||
}
|
||||
|
||||
|
||||
export const puppeteerLoader = async (url: string, headless: boolean, location: LocalResult) => {
|
||||
export const puppeteerLoader = async (url: string, headless: boolean, location: LocalResult, checkCancelled?: () => Promise<boolean>) => {
|
||||
if (isValidUrl(url) === false || url.indexOf('mailto') !== -1) {
|
||||
return []
|
||||
}
|
||||
|
||||
if (checkCancelled && await checkCancelled()) {
|
||||
logger.info('Cancelled before loading ' + url);
|
||||
return [];
|
||||
}
|
||||
|
||||
let loaderWithOptions
|
||||
try { // Function to detect a valid URL
|
||||
loaderWithOptions = new loader(
|
||||
@ -63,20 +69,51 @@ export const puppeteerLoader = async (url: string, headless: boolean, location:
|
||||
waitUntil: "networkidle0",
|
||||
},
|
||||
async evaluate(page, browser) {
|
||||
if (checkCancelled && await checkCancelled()) {
|
||||
debugger
|
||||
const pid = browser.process()?.pid;
|
||||
logger.warn(`Killing browser process ${pid} due to cancellation`);
|
||||
await browser.close();
|
||||
throw new Error('CancelledByUser');
|
||||
}
|
||||
const result = await page.evaluate(() => document.body.innerHTML)
|
||||
// await browser.close()
|
||||
return result
|
||||
}
|
||||
}
|
||||
)
|
||||
const docs = await loaderWithOptions.load()
|
||||
// Race load against cancellation
|
||||
const loadPromise = loaderWithOptions.load();
|
||||
|
||||
const cancelPromise = new Promise<never>(async (_, reject) => {
|
||||
if (!checkCancelled) return;
|
||||
// Poll for cancellation
|
||||
while (true) {
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
if (await checkCancelled()) {
|
||||
const browser = await getBrowser();
|
||||
if (browser) {
|
||||
const pid = browser.process()?.pid;
|
||||
logger.info(`Killing browser process ${pid} due to cancellation`);
|
||||
await browser.close();
|
||||
}
|
||||
reject(new Error('CancelledByUser'));
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
const docs = await Promise.race([loadPromise, cancelPromise]);
|
||||
const splitter = RecursiveCharacterTextSplitter.fromLanguage("html")
|
||||
const transformer = new HtmlToTextTransformer()
|
||||
const sequence = splitter.pipe(transformer as any)
|
||||
const ret = await sequence.invoke(docs)
|
||||
return ret
|
||||
} catch (error) {
|
||||
logger.warn('Error loading page: ' + url, error.message)
|
||||
if (error instanceof Error && error.message === 'CancelledByUser') {
|
||||
throw error;
|
||||
}
|
||||
logger.warn('Error loading page: ' + url, error instanceof Error ? error.message : String(error))
|
||||
location.rejected = true
|
||||
// loader.browser && loader.browser.close()
|
||||
|
||||
@ -111,7 +148,7 @@ export const findEMail = async (question: string, url: string, opts: { headless?
|
||||
return false
|
||||
}
|
||||
let pageUrl = url
|
||||
let docs = await puppeteerLoader(pageUrl, opts.headless, location) as any
|
||||
let docs = await puppeteerLoader(pageUrl, opts.headless, location, opts.checkCancelled) as any
|
||||
let emails: string[] = []
|
||||
docs.forEach((d: any) => {
|
||||
if (d.pageContent && d.pageContent.indexOf('@') !== -1) {
|
||||
@ -131,7 +168,7 @@ export const findEMail = async (question: string, url: string, opts: { headless?
|
||||
}
|
||||
|
||||
|
||||
export const findEmailEach = async (location: LocalResult, opts: { headless?: boolean, searchFrom?: string, abortAfter?: number, [key: string]: any }, onProgress?: (page: Page) => Promise<void>) => {
|
||||
export const findEmailEach = async (location: LocalResult, opts: { headless?: boolean, searchFrom?: string, abortAfter?: number, checkCancelled?: () => Promise<boolean>, [key: string]: any }, onProgress?: (page: Page) => Promise<void>) => {
|
||||
if (!location.meta || !location.meta.pages) {
|
||||
return []
|
||||
}
|
||||
@ -140,6 +177,12 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
|
||||
const abortAfter = opts.abortAfter ?? 1
|
||||
|
||||
for (const page of location.meta.pages) {
|
||||
if (opts.checkCancelled && await opts.checkCancelled()) {
|
||||
debugger
|
||||
logger.info(`[findEmailEach] Cancellation requested for ${location.title}`);
|
||||
break;
|
||||
}
|
||||
|
||||
if (emails.length >= abortAfter) {
|
||||
break
|
||||
}
|
||||
@ -157,6 +200,10 @@ export const findEmailEach = async (location: LocalResult, opts: { headless?: bo
|
||||
}
|
||||
page.status = 'SEARCHED_EMAIL'
|
||||
} catch (error) {
|
||||
if (error.message === 'CancelledByUser') {
|
||||
debugger
|
||||
throw error;
|
||||
}
|
||||
page.status = 'FAILED'
|
||||
page.error = error.message
|
||||
logger.error(`Error scraping email from ${page.url}:`, error)
|
||||
|
||||
@ -33,7 +33,7 @@ export const zodSchemaBase = () =>
|
||||
google_domain: z.string().default('google.com'),
|
||||
headless: z.boolean().default(true).describe('Headless mode'),
|
||||
language: z.string().default('en'),
|
||||
limit: z.number().default(5),
|
||||
limit: z.number().default(250),
|
||||
logLevel: z.string().default('info'),
|
||||
meta: z.boolean().default(false),
|
||||
searchCache: z.boolean().default(false).describe('Use search cache'),
|
||||
@ -43,7 +43,7 @@ export const zodSchemaBase = () =>
|
||||
searchFrom: z.string().optional().default('barcelona, spain'),
|
||||
source: z.union([z.string(), z.record(z.string(), z.array(z.string()))]).optional(),
|
||||
type: z.string().optional().default('search'),
|
||||
zoom: z.number().optional().default(13),
|
||||
zoom: z.number().optional().default(12),
|
||||
//index: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.json').describe('Index file'),
|
||||
//store: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.db').describe('Index store'),
|
||||
index: z.string().optional().describe('Index file'),
|
||||
|
||||
@ -200,7 +200,7 @@ export const searchGoogleMap = async (
|
||||
results = results.filter((r) => r.geo.city.toLowerCase() === opts.filterCity.toLowerCase())
|
||||
}
|
||||
if (opts.filterCountry) {
|
||||
results = results.filter((r) => r.geo.countryName.toLowerCase() === opts.filterCountry.toLowerCase())
|
||||
// results = results.filter((r) => r.geo.countryName.toLowerCase() === opts.filterCountry.toLowerCase())
|
||||
}
|
||||
if (opts.filterContinent) {
|
||||
results = results.filter((r) => r.geo.continent.toLowerCase() === opts.filterContinent.toLowerCase())
|
||||
@ -215,7 +215,7 @@ export const searchGoogleMap = async (
|
||||
return index[r.title] == null || !index[r.title].geo || !index[r.title].meta
|
||||
})
|
||||
logger.info(
|
||||
`found ${newResults.length} new items for "${query}" from "${params.searchFrom}" | ${beforeCached} total before cache filtering`,
|
||||
`found ${newResults.length} new items for "${query}" (Zoom: ${opts.zoom} | Limit: ${opts.limit}) from "${params.searchFrom}" | ${beforeCached} total before cache filtering`,
|
||||
)
|
||||
const processedResults = newResults.slice(0, opts.limit)
|
||||
await enrichResults(processedResults, index, opts)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user