find email 1/2

This commit is contained in:
babayaga 2025-11-25 20:51:34 +01:00
parent 0d79173e5d
commit 910293aa5e
21 changed files with 833 additions and 370 deletions

View File

@ -1,5 +1,5 @@
import { MappingDocumentTransformer, Document } from "@langchain/core/documents";
import { LocalResult } from './map_types.js';
import { LocalResult, Page } from './map_types.js';
export declare class HtmlToTextTransformer extends MappingDocumentTransformer {
static lc_name(): string;
constructor(options?: {});
@ -14,3 +14,8 @@ export declare const findEMail: (question: string, url: string, opts: {
searchFrom?: string;
[key: string]: any;
}, location: LocalResult) => Promise<false | string[]>;
/**
 * Scrapes e-mail addresses from the pages listed in `location.meta.pages`.
 *
 * @param location   Result whose `meta.pages` entries are scanned.
 * @param opts       Scraper options (`headless`, `searchFrom`, plus arbitrary extra keys).
 * @param onProgress Optional async callback that receives a `Page`.
 * @returns Promise resolving to the list of e-mail addresses found.
 */
export declare const findEmailEach: (location: LocalResult, opts: {
headless?: boolean;
searchFrom?: string;
[key: string]: any;
}, onProgress?: (page: Page) => Promise<void>) => Promise<string[]>;

File diff suppressed because one or more lines are too long

View File

@ -33,8 +33,8 @@ export declare const zodSchemaBase: () => z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -66,8 +66,8 @@ export declare const zodSchemaBase: () => z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -99,8 +99,8 @@ export declare const zodSchemaBase: () => z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">>;
export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
@ -133,8 +133,8 @@ export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, "passthrough", z.ZodTypeAny, z.objectOutputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -166,8 +166,8 @@ export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -199,8 +199,8 @@ export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">>, z.objectOutputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -232,8 +232,8 @@ export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">, z.objectInputType<{
api_key: z.ZodOptional<z.ZodString>;
@ -265,8 +265,8 @@ export declare const zodSchema: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
}, z.ZodTypeAny, "passthrough">>;
export declare const zodSchemaEachExtras: () => z.ZodObject<{
@ -329,8 +329,8 @@ export declare const zodSchemaEach: () => z.ZodEffects<z.ZodObject<{
source: z.ZodOptional<z.ZodUnion<[z.ZodString, z.ZodRecord<z.ZodString, z.ZodArray<z.ZodString, "many">>]>>;
type: z.ZodDefault<z.ZodOptional<z.ZodString>>;
zoom: z.ZodDefault<z.ZodOptional<z.ZodNumber>>;
index: z.ZodDefault<z.ZodString>;
store: z.ZodDefault<z.ZodString>;
index: z.ZodOptional<z.ZodString>;
store: z.ZodOptional<z.ZodString>;
variables: z.ZodOptional<z.ZodAny>;
} & {
logLevel: z.ZodDefault<z.ZodString>;

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,5 @@
import { AxiosRequestConfig } from "axios";
import { Browser } from 'puppeteer';
import * as puppeteer from 'puppeteer';
import { Browser, Page } from 'puppeteer';
import { LocalResult, LocationSiteMeta } from './map_types.js';
export declare const STATS_SUFFIX = "_stats.json";
export declare const SESSION_EVENTS_SUFFIX = "_session.json";
@ -9,13 +8,13 @@ export declare var scope: Scope;
export declare const extractEmail: (input: string) => string | null;
export declare const meta: (loc: LocalResult, options: any) => Promise<LocationSiteMeta | void>;
export declare const isValidUrl: (url: string) => boolean;
export declare const parse: (url: string, config: AxiosRequestConfig | null, options: any) => Promise<LocationSiteMeta>;
export declare const parseHtml: (url: string, config: AxiosRequestConfig | null, options: any) => Promise<LocationSiteMeta>;
export declare const getScope: (cliArgs?: any) => Scope;
export declare function capture_responses(scope: Scope, page: puppeteer.Page): Promise<void>;
export declare function capture_responses(scope: Scope, page: Page): Promise<void>;
export declare class Scope {
browser: Browser;
context: any;
page: puppeteer.Page;
page: Page;
args: any;
requests: any[];
responses: any[];

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,7 @@
export * from './types.js';
export * from './googlemaps.js';
import { getJson as searchSerpAPI } from "serpapi";
export declare const SearchProviders: {
scaleserp: (params: any) => Promise<import("./types.js").IScaleserpResponse>;
serpApi: typeof searchSerpAPI;
};
export declare const cleanOptions: (opts: any) => any;
export * from './map_types.js';
export * from './email.js';
export * from './html.js';
export * from './providers.js';
export declare const types: () => void;

View File

@ -1,24 +1,13 @@
export * from './types.js';
export * from './googlemaps.js';
export * from './map_types.js';
export * from './email.js';
export * from './html.js';
export * from './providers.js';
import { generate_interfaces } from '@polymech/commons';
import { getJson as searchSerpAPI } from "serpapi";
import { search as searchScaleserp } from './scalesep.js';
import { zodSchema as zodSchemaGoogleMaps, zodSchemaEach } from './googlemaps.js';
export const SearchProviders = {
scaleserp: searchScaleserp,
serpApi: searchSerpAPI
};
export const cleanOptions = (opts) => {
return {
...opts,
openai: 'hidden',
bigdata: 'hidden',
api_key: 'hidden',
geocode_key: 'hidden'
};
};
/**
 * Builds both Google Maps zod schemas and passes them to `generate_interfaces`
 * together with the target path 'src/lib/types-googlemaps.ts'.
 * Returns whatever `generate_interfaces` returns.
 */
export const types = () => {
    const schemas = [zodSchemaGoogleMaps(), zodSchemaEach()];
    return generate_interfaces(schemas, 'src/lib/types-googlemaps.ts');
};
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvbGliL2luZGV4LnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUFBLGNBQWMsWUFBWSxDQUFBO0FBQzFCLGNBQWMsaUJBQWlCLENBQUE7QUFDL0IsT0FBTyxFQUFFLG1CQUFtQixFQUFFLE1BQU0sbUJBQW1CLENBQUE7QUFDdkQsT0FBTyxFQUFFLE9BQU8sSUFBSSxhQUFhLEVBQUUsTUFBTSxTQUFTLENBQUE7QUFDbEQsT0FBTyxFQUFFLE1BQU0sSUFBSSxlQUFlLEVBQUUsTUFBTSxlQUFlLENBQUE7QUFDekQsT0FBTyxFQUFFLFNBQVMsSUFBSSxtQkFBbUIsRUFBRSxhQUFhLEVBQUUsTUFBTSxpQkFBaUIsQ0FBQTtBQUVqRixNQUFNLENBQUMsTUFBTSxlQUFlLEdBQUc7SUFDM0IsU0FBUyxFQUFFLGVBQWU7SUFDMUIsT0FBTyxFQUFFLGFBQWE7Q0FDekIsQ0FBQTtBQUVELE1BQU0sQ0FBQyxNQUFNLFlBQVksR0FBRyxDQUFDLElBQVMsRUFBRSxFQUFFO0lBQ3RDLE9BQU87UUFDSCxHQUFHLElBQUk7UUFDUCxNQUFNLEVBQUUsUUFBUTtRQUNoQixPQUFPLEVBQUUsUUFBUTtRQUNqQixPQUFPLEVBQUUsUUFBUTtRQUNqQixXQUFXLEVBQUUsUUFBUTtLQUN4QixDQUFBO0FBQ0wsQ0FBQyxDQUFBO0FBRUQsTUFBTSxDQUFDLE1BQU0sS0FBSyxHQUFHLEdBQUcsRUFBRSxDQUFDLG1CQUFtQixDQUFDO0lBQzNDLG1CQUFtQixFQUFTO0lBQzVCLGFBQWEsRUFBUztDQUN6QixFQUFFLDZCQUE2QixDQUFDLENBQUEifQ==
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi9zcmMvbGliL2luZGV4LnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiJBQUFBLGNBQWMsWUFBWSxDQUFBO0FBQzFCLGNBQWMsaUJBQWlCLENBQUE7QUFDL0IsY0FBYyxnQkFBZ0IsQ0FBQTtBQUM5QixjQUFjLFlBQVksQ0FBQTtBQUMxQixjQUFjLFdBQVcsQ0FBQTtBQUN6QixjQUFjLGdCQUFnQixDQUFBO0FBQzlCLE9BQU8sRUFBRSxtQkFBbUIsRUFBRSxNQUFNLG1CQUFtQixDQUFBO0FBQ3ZELE9BQU8sRUFBRSxTQUFTLElBQUksbUJBQW1CLEVBQUUsYUFBYSxFQUFFLE1BQU0saUJBQWlCLENBQUE7QUFFakYsTUFBTSxDQUFDLE1BQU0sS0FBSyxHQUFHLEdBQUcsRUFBRSxDQUFDLG1CQUFtQixDQUFDO0lBQzNDLG1CQUFtQixFQUFTO0lBQzVCLGFBQWEsRUFBUztDQUN6QixFQUFFLDZCQUE2QixDQUFDLENBQUEifQ==

View File

@ -1,17 +1,4 @@
export interface SearchMetadata {
id: string;
status: string;
json_endpoint: string;
created_at: string;
processed_at: string;
google_maps_url: string;
raw_html_file: string;
total_time_taken: number;
}
export interface SearchInformation {
local_results_state: string;
query_displayed: string;
}
import { SearchMetadata, SearchInformation, SearchParameters } from './types.js';
export interface GpsCoordinates {
latitude: number;
longitude: number;
@ -54,14 +41,6 @@ export type LocalResult = {
email?: string;
[key: string]: any;
};
export interface SearchParameters {
engine: string;
type: string;
q: string;
ll: string;
google_domain: string;
hl: string;
}
export interface SearchResult {
search_metadata: SearchMetadata;
search_information: SearchInformation;
@ -69,18 +48,27 @@ export interface SearchResult {
search_parameters: SearchParameters;
}
export interface LocationSiteMeta {
og?: Og;
meta?: Meta;
links?: string[];
title?: string;
description?: string;
image?: string;
url?: string;
social?: Page[];
seo?: SeoData;
pages?: Page[];
externalLinks?: Page[];
images?: Image[];
allLinks?: string[];
}
/**
 * SEO-related data collected for a site. Every field is optional.
 */
export interface SeoData {
keywords?: string[];
// Social-network fields — NOTE(review): presumably profile URLs or handles; confirm against the producer.
instagram?: string;
facebook?: string;
linkedin?: string;
youtube?: string;
twitter?: string;
/** Structured-data entries (see `Structured`). */
structured?: Structured[];
/** Open Graph data (see `Og`). */
og?: Og;
/** Meta-tag data (see `Meta`). */
metaTags?: Meta;
}
/**
 * A page entry together with its e-mail scraping state.
 */
export interface Page {
/** Address of the page. */
url: string;
// NOTE(review): the meaning of `source` is not visible here — presumably where the URL was discovered; confirm.
source: string;
/** Scraping state of this page; exactly one of the four literal values. */
status: 'PENDING' | 'SEARCHING_EMAIL' | 'SEARCHED_EMAIL' | 'FAILED';
/** Optional error text for this page. */
error?: string;
}
export interface Og {
url?: string;

View File

@ -0,0 +1,6 @@
import { getJson as searchSerpAPI } from "serpapi";
export declare const SearchProviders: {
scaleserp: (params: any) => Promise<import("./types.js").IScaleserpResponse>;
serpApi: typeof searchSerpAPI;
};
export declare const cleanOptions: (opts: any) => any;

View File

@ -0,0 +1,16 @@
import { getJson as searchSerpAPI } from "serpapi";
import { search as searchScaleserp } from './scalesep.js';
// Lookup table of search back ends: `scaleserp` is the `search` function
// imported from './scalesep.js', `serpApi` is `getJson` imported from "serpapi".
export const SearchProviders = {
scaleserp: searchScaleserp,
serpApi: searchSerpAPI
};
/**
 * Produces a shallow copy of `opts` that is safe to print: the entries
 * `openai`, `bigdata`, `api_key` and `geocode_key` are always set to the
 * literal string 'hidden' (they are added even when absent from `opts`).
 * The input object itself is not modified.
 */
export const cleanOptions = (opts) => {
    const masked = { ...opts };
    for (const secretKey of ['openai', 'bigdata', 'api_key', 'geocode_key']) {
        masked[secretKey] = 'hidden';
    }
    return masked;
};
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoicHJvdmlkZXJzLmpzIiwic291cmNlUm9vdCI6IiIsInNvdXJjZXMiOlsiLi4vLi4vc3JjL2xpYi9wcm92aWRlcnMudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6IkFBQUEsT0FBTyxFQUFFLE9BQU8sSUFBSSxhQUFhLEVBQUUsTUFBTSxTQUFTLENBQUE7QUFDbEQsT0FBTyxFQUFFLE1BQU0sSUFBSSxlQUFlLEVBQUUsTUFBTSxlQUFlLENBQUE7QUFFekQsTUFBTSxDQUFDLE1BQU0sZUFBZSxHQUFHO0lBQzNCLFNBQVMsRUFBRSxlQUFlO0lBQzFCLE9BQU8sRUFBRSxhQUFhO0NBQ3pCLENBQUE7QUFFRCxNQUFNLENBQUMsTUFBTSxZQUFZLEdBQUcsQ0FBQyxJQUFTLEVBQUUsRUFBRTtJQUN0QyxPQUFPO1FBQ0gsR0FBRyxJQUFJO1FBQ1AsTUFBTSxFQUFFLFFBQVE7UUFDaEIsT0FBTyxFQUFFLFFBQVE7UUFDakIsT0FBTyxFQUFFLFFBQVE7UUFDakIsV0FBVyxFQUFFLFFBQVE7S0FDeEIsQ0FBQTtBQUNMLENBQUMsQ0FBQSJ9

View File

@ -35,6 +35,8 @@
"p-map": "^4.0.0",
"publish": "^0.6.0",
"puppeteer": "^19.11.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"serpapi": "^1.1.1",
"tslog": "^4.10.2",
"typescript": "^5.6.3",
@ -44,7 +46,7 @@
"zod-to-ts": "^1.2.0"
},
"bin": {
"polymech-search": "main.js"
"polymech-search": "dist-in/main.js"
},
"engines": {
"node": ">= 14.0.0"
@ -7922,6 +7924,21 @@
"@types/node": "*"
}
},
"node_modules/@types/debug": {
"version": "4.1.12",
"resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
"integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==",
"license": "MIT",
"dependencies": {
"@types/ms": "*"
}
},
"node_modules/@types/ms": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz",
"integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==",
"license": "MIT"
},
"node_modules/@types/node": {
"version": "22.19.1",
"license": "MIT",
@ -8041,6 +8058,15 @@
"version": "2.0.1",
"license": "Python-2.0"
},
"node_modules/arr-union": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz",
"integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/asynckit": {
"version": "0.4.0",
"license": "MIT"
@ -8358,6 +8384,22 @@
"url": "https://github.com/chalk/wrap-ansi?sponsor=1"
}
},
"node_modules/clone-deep": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz",
"integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==",
"license": "MIT",
"dependencies": {
"for-own": "^0.1.3",
"is-plain-object": "^2.0.1",
"kind-of": "^3.0.2",
"lazy-cache": "^1.0.3",
"shallow-clone": "^0.1.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/color-convert": {
"version": "2.0.1",
"license": "MIT",
@ -8401,8 +8443,7 @@
},
"node_modules/concat-map": {
"version": "0.0.1",
"license": "MIT",
"optional": true
"license": "MIT"
},
"node_modules/console-control-strings": {
"version": "1.1.0",
@ -8845,6 +8886,27 @@
}
}
},
"node_modules/for-in": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz",
"integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/for-own": {
"version": "0.1.5",
"resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz",
"integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==",
"license": "MIT",
"dependencies": {
"for-in": "^1.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/foreground-child": {
"version": "3.3.1",
"license": "ISC",
@ -8899,6 +8961,20 @@
"version": "1.0.0",
"license": "MIT"
},
"node_modules/fs-extra": {
"version": "10.1.0",
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz",
"integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==",
"license": "MIT",
"dependencies": {
"graceful-fs": "^4.2.0",
"jsonfile": "^6.0.1",
"universalify": "^2.0.0"
},
"engines": {
"node": ">=12"
}
},
"node_modules/fs-minipass": {
"version": "3.0.3",
"license": "ISC",
@ -8911,8 +8987,7 @@
},
"node_modules/fs.realpath": {
"version": "1.0.0",
"license": "ISC",
"optional": true
"license": "ISC"
},
"node_modules/function-bind": {
"version": "1.1.2",
@ -9019,8 +9094,7 @@
},
"node_modules/graceful-fs": {
"version": "4.2.11",
"license": "ISC",
"optional": true
"license": "ISC"
},
"node_modules/has-symbols": {
"version": "1.1.0",
@ -9224,7 +9298,6 @@
"node_modules/inflight": {
"version": "1.0.6",
"license": "ISC",
"optional": true,
"dependencies": {
"once": "^1.3.0",
"wrappy": "1"
@ -9258,6 +9331,15 @@
"version": "1.1.6",
"license": "MIT"
},
"node_modules/is-extendable": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz",
"integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/is-fullwidth-code-point": {
"version": "3.0.0",
"license": "MIT",
@ -9270,6 +9352,18 @@
"license": "MIT",
"optional": true
},
"node_modules/is-plain-object": {
"version": "2.0.4",
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
"license": "MIT",
"dependencies": {
"isobject": "^3.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/isarray": {
"version": "1.0.0",
"license": "MIT"
@ -9278,6 +9372,15 @@
"version": "2.0.0",
"license": "ISC"
},
"node_modules/isobject": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
"integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/jackspeak": {
"version": "3.4.3",
"license": "BlueOak-1.0.0",
@ -9324,6 +9427,18 @@
"version": "2.3.1",
"license": "MIT"
},
"node_modules/jsonfile": {
"version": "6.2.0",
"resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.2.0.tgz",
"integrity": "sha512-FGuPw30AdOIUTRMC2OMRtQV+jkVj2cfPqSeWXv1NEAJ1qZ5zb1X6z1mFhbfOB/iy3ssJCD+3KuZ8r8C3uVFlAg==",
"license": "MIT",
"dependencies": {
"universalify": "^2.0.0"
},
"optionalDependencies": {
"graceful-fs": "^4.1.6"
}
},
"node_modules/jsonpath-plus": {
"version": "9.0.0",
"license": "MIT",
@ -9355,6 +9470,18 @@
"@keyv/serialize": "^1.1.1"
}
},
"node_modules/kind-of": {
"version": "3.2.2",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz",
"integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==",
"license": "MIT",
"dependencies": {
"is-buffer": "^1.1.5"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/langchain": {
"version": "0.1.37",
"license": "MIT",
@ -9636,6 +9763,15 @@
"uuid": "dist/bin/uuid"
}
},
"node_modules/lazy-cache": {
"version": "1.0.4",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz",
"integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/leac": {
"version": "0.6.0",
"license": "MIT",
@ -9979,6 +10115,20 @@
"is-buffer": "~1.1.6"
}
},
"node_modules/merge-deep": {
"version": "3.0.3",
"resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz",
"integrity": "sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==",
"license": "MIT",
"dependencies": {
"arr-union": "^3.1.0",
"clone-deep": "^0.2.4",
"kind-of": "^3.0.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/mime-db": {
"version": "1.52.0",
"license": "MIT",
@ -10167,6 +10317,28 @@
"version": "3.0.0",
"license": "MIT"
},
"node_modules/mixin-object": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz",
"integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==",
"license": "MIT",
"dependencies": {
"for-in": "^0.1.3",
"is-extendable": "^0.1.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/mixin-object/node_modules/for-in": {
"version": "0.1.8",
"resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz",
"integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/mkdirp": {
"version": "1.0.4",
"license": "MIT",
@ -12522,7 +12694,6 @@
"node_modules/path-is-absolute": {
"version": "1.0.1",
"license": "MIT",
"optional": true,
"engines": {
"node": ">=0.10.0"
}
@ -12729,6 +12900,143 @@
}
}
},
"node_modules/puppeteer-extra": {
"version": "3.3.6",
"resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz",
"integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==",
"license": "MIT",
"peer": true,
"dependencies": {
"@types/debug": "^4.1.0",
"debug": "^4.1.1",
"deepmerge": "^4.2.2"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"@types/puppeteer": "*",
"puppeteer": "*",
"puppeteer-core": "*"
},
"peerDependenciesMeta": {
"@types/puppeteer": {
"optional": true
},
"puppeteer": {
"optional": true
},
"puppeteer-core": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin": {
"version": "3.2.3",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz",
"integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==",
"license": "MIT",
"dependencies": {
"@types/debug": "^4.1.0",
"debug": "^4.1.1",
"merge-deep": "^3.0.1"
},
"engines": {
"node": ">=9.11.2"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-stealth": {
"version": "2.11.2",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
"integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==",
"license": "MIT",
"dependencies": {
"debug": "^4.1.1",
"puppeteer-extra-plugin": "^3.2.3",
"puppeteer-extra-plugin-user-preferences": "^2.4.1"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-user-data-dir": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz",
"integrity": "sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==",
"license": "MIT",
"dependencies": {
"debug": "^4.1.1",
"fs-extra": "^10.0.0",
"puppeteer-extra-plugin": "^3.2.3",
"rimraf": "^3.0.2"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-user-preferences": {
"version": "2.4.1",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz",
"integrity": "sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==",
"license": "MIT",
"dependencies": {
"debug": "^4.1.1",
"deepmerge": "^4.2.2",
"puppeteer-extra-plugin": "^3.2.3",
"puppeteer-extra-plugin-user-data-dir": "^2.4.1"
},
"engines": {
"node": ">=8"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/rc": {
"version": "1.2.8",
"license": "(BSD-2-Clause OR MIT OR Apache-2.0)",
@ -12779,7 +13087,6 @@
"node_modules/rimraf": {
"version": "3.0.2",
"license": "ISC",
"optional": true,
"dependencies": {
"glob": "^7.1.3"
},
@ -12793,7 +13100,6 @@
"node_modules/rimraf/node_modules/brace-expansion": {
"version": "1.1.12",
"license": "MIT",
"optional": true,
"dependencies": {
"balanced-match": "^1.0.0",
"concat-map": "0.0.1"
@ -12802,7 +13108,6 @@
"node_modules/rimraf/node_modules/glob": {
"version": "7.2.3",
"license": "ISC",
"optional": true,
"dependencies": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
@ -12821,7 +13126,6 @@
"node_modules/rimraf/node_modules/minimatch": {
"version": "3.1.2",
"license": "ISC",
"optional": true,
"dependencies": {
"brace-expansion": "^1.1.7"
},
@ -12879,6 +13183,42 @@
"license": "ISC",
"optional": true
},
"node_modules/shallow-clone": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz",
"integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==",
"license": "MIT",
"dependencies": {
"is-extendable": "^0.1.1",
"kind-of": "^2.0.1",
"lazy-cache": "^0.2.3",
"mixin-object": "^2.0.1"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shallow-clone/node_modules/kind-of": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz",
"integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==",
"license": "MIT",
"dependencies": {
"is-buffer": "^1.0.2"
},
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shallow-clone/node_modules/lazy-cache": {
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz",
"integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==",
"license": "MIT",
"engines": {
"node": ">=0.10.0"
}
},
"node_modules/shebang-command": {
"version": "2.0.0",
"license": "MIT",
@ -13271,6 +13611,15 @@
"node": "^14.17.0 || ^16.13.0 || >=18.0.0"
}
},
"node_modules/universalify": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz",
"integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==",
"license": "MIT",
"engines": {
"node": ">= 10.0.0"
}
},
"node_modules/util-deprecate": {
"version": "1.0.2",
"license": "MIT"

View File

@ -41,6 +41,8 @@
"p-map": "^4.0.0",
"publish": "^0.6.0",
"puppeteer": "^19.11.1",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-stealth": "^2.11.2",
"serpapi": "^1.1.1",
"tslog": "^4.10.2",
"typescript": "^5.6.3",
@ -71,4 +73,4 @@
"keywords": [
"typescript"
]
}
}

View File

@ -3,7 +3,7 @@ import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio"
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"
import { htmlToText } from "html-to-text"
import { MappingDocumentTransformer, Document } from "@langchain/core/documents"
import { LocalResult } from './map_types.js'
import { LocalResult, Page } from './map_types.js'
import { isValidUrl } from './html.js'
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g
@ -59,7 +59,7 @@ export const puppeteerLoader = async (url: string, headless: boolean, location:
},
gotoOptions: {
timeout: 5000,
timeout: 15000,
waitUntil: "networkidle0",
},
async evaluate(page, browser) {
@ -86,10 +86,19 @@ export const puppeteerLoader = async (url: string, headless: boolean, location:
const extractEmailAddresses = (text: string): string[] => {
const lines = text.split(/\r?\n/)
const emailAddresses: string[] = []
const imageExtensions = ['.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico', '.tiff', '.avif'];
for (const line of lines) {
const matches = line.match(emailRegex)
if (matches) {
emailAddresses.push(...matches)
for (const match of matches) {
// Filter out image filenames often found in srcset (e.g. image@2x.png)
const lowerMatch = match.toLowerCase();
const isImage = imageExtensions.some(ext => lowerMatch.endsWith(ext));
if (!isImage) {
emailAddresses.push(match);
}
}
}
}
return emailAddresses
@ -102,9 +111,6 @@ export const findEMail = async (question: string, url: string, opts: { headless?
return false
}
let pageUrl = url
if (location.meta && location.meta.links && location.meta.links.length) {
pageUrl = location.meta.links[0]
}
let docs = await puppeteerLoader(pageUrl, opts.headless, location) as any
let emails: string[] = []
docs.forEach((d: any) => {
@ -123,3 +129,47 @@ export const findEMail = async (question: string, url: string, opts: { headless?
location.email && logger.debug(`Found email for ${url} / ${location.title} : ${location.type} : ${location.email} : ${opts.searchFrom}`)
return emails
}
/**
 * Scrapes e-mail addresses from every still-pending page of a location.
 *
 * Pages are processed one after another. Each `location.meta.pages` entry
 * whose status is 'PENDING' is switched to 'SEARCHING_EMAIL', handed to
 * `findEMail`, and then marked 'SEARCHED_EMAIL' — or 'FAILED' (with
 * `page.error` set) when the scrape throws. Entries in any other state are
 * left untouched, so a second call skips pages that were already handled.
 *
 * Side effects: mutates the `Page` entries and, when at least one address was
 * found, stores the de-duplicated union of the existing `location.emails` and
 * the new addresses in `location.emails`, and sets `location.email` to the
 * first entry of that list.
 *
 * @param location   Result whose `meta.pages` are scanned.
 * @param opts       Passed through unchanged to `findEMail`.
 * @param onProgress Optional callback awaited after each processed page,
 *                   whether it succeeded or failed. A rejection is not caught
 *                   here: it propagates and the remaining pages are skipped.
 * @returns The addresses collected during this call, in discovery order and
 *          possibly containing duplicates; `[]` when the location has no
 *          `meta.pages`.
 */
export const findEmailEach = async (
    location: LocalResult,
    opts: { headless?: boolean, searchFrom?: string, [key: string]: any },
    onProgress?: (page: Page) => Promise<void>
): Promise<string[]> => {
    const pages = location.meta?.pages
    if (!pages) {
        return []
    }

    const emails: string[] = []

    for (const page of pages) {
        if (page.status !== 'PENDING') {
            continue
        }

        page.status = 'SEARCHING_EMAIL'
        try {
            logger.info(`Scraping email from ${page.url}`);
            const pageEmails = await findEMail('find email', page.url, opts, location)
            // findEMail resolves to `false` when it has nothing to report
            if (Array.isArray(pageEmails)) {
                emails.push(...pageEmails)
            }
            page.status = 'SEARCHED_EMAIL'
        } catch (error: unknown) {
            page.status = 'FAILED'
            // Anything can be thrown, not only Error instances
            page.error = error instanceof Error ? error.message : String(error)
            logger.error(`Error scraping email from ${page.url}:`, error)
        }

        if (onProgress) {
            await onProgress(page)
        }
    }

    // Update location emails
    if (emails.length > 0) {
        // `location.emails` is untyped (index signature); only spread it when it really is an array
        const known: string[] = Array.isArray(location.emails) ? location.emails : []
        const uniqueEmails = [...new Set([...known, ...emails])]
        location.emails = uniqueEmails
        // Non-empty by construction: `emails` has at least one entry
        location.email = uniqueEmails[0]
    }

    return emails
}

View File

@ -7,26 +7,6 @@ export enum ResolveFlags {
PHOTOS = 'PHOTOS',
}
// Sample search options.
//
// SECURITY: earlier revisions of this object embedded literal API credentials
// (search provider, geocoding, OpenAI, bigdata). Secrets must never live in
// source control: every key that was committed here has to be treated as
// compromised and rotated, since it stays readable in the repository history.
// The values are now read from the environment and default to an empty string.
// NOTE(review): the environment variable names below are placeholders — align
// them with the project's configuration conventions.
const o = {
    query: "plastichub",
    engine: "google_maps",
    type: "search",
    q: "plastichub",
    ll: "@41.6911354,2.1652746,13z",
    google_domain: "google.es",
    hl: "en",
    searchFrom: "barcelona, spain",
    api_key: process.env.SEARCH_API_KEY ?? "",
    geocode_key: process.env.GEOCODE_API_KEY ?? "",
    openai: {
        key: process.env.OPENAI_API_KEY ?? "",
        "key-p": process.env.OPENAI_API_KEY_P ?? "",
    },
    headless: false,
    bigdata: {
        key: process.env.BIGDATA_API_KEY ?? "",
    },
}
// Base schema without transformation - allows merging
export const zodSchemaBase = () =>
@ -55,7 +35,7 @@ export const zodSchemaBase = () =>
language: z.string().default('en'),
limit: z.number().default(5),
logLevel: z.string().default('info'),
meta: z.boolean().default(true),
meta: z.boolean().default(false),
searchCache: z.boolean().default(false).describe('Use search cache'),
query: z.string().default('plastichub'),
resolve: z.array(z.nativeEnum(ResolveFlags)).default([ResolveFlags.PHOTOS]).optional(),
@ -64,8 +44,10 @@ export const zodSchemaBase = () =>
source: z.union([z.string(), z.record(z.string(), z.array(z.string()))]).optional(),
type: z.string().optional().default('search'),
zoom: z.number().optional().default(13),
index: z.string().default('${OSR_ROOT}/osr-directory/meta/index.json').describe('Index file'),
store: z.string().default('${OSR_ROOT}/osr-directory/meta/index.db').describe('Index store'),
//index: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.json').describe('Index file'),
//store: z.string().optional().default('${OSR_ROOT}/osr-directory/meta/index.db').describe('Index store'),
index: z.string().optional().describe('Index file'),
store: z.string().optional().describe('Index store'),
variables: z.any().optional(),
})
.passthrough()

View File

@ -32,7 +32,7 @@ import { get_cached_object, set_cached_object } from '@polymech/cache/lib'
import { OSR_CACHE } from '@polymech/commons'
import { logger } from '../index.js'
import { cleanOptions, SearchProviders } from './index.js'
import { cleanOptions, SearchProviders } from './providers.js'
import { IScaleserpSearch } from './types.js'
import { findEMail } from './email.js'
import { defaultEngine, defaultFromLocation, defaultGoogleDomain, defaultLanguage, PAGE_SIZE, SEARCH_AI_PROMPTS } from './constants.js'

View File

@ -5,11 +5,15 @@ import * as cheerio from "cheerio"
import * as path from 'path'
import { URL } from 'url'
import { Browser } from 'puppeteer'
import * as puppeteer from 'puppeteer'
import { Browser, Page } from 'puppeteer'
import puppeteerExtra from 'puppeteer-extra'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
const puppeteerExtraAny = puppeteerExtra as any
puppeteerExtraAny.use(StealthPlugin())
import { logger } from '../index.js'
import { LocalResult, LocationSiteMeta, Meta, Og, Image, Structured } from './map_types.js'
import { LocalResult, LocationSiteMeta, Meta, Og, Image, Structured, Page as PageType } from './map_types.js'
export const STATS_SUFFIX = '_stats.json'
export const SESSION_EVENTS_SUFFIX = '_session.json'
@ -25,13 +29,8 @@ const debugResponses = false
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
/**
 * Finds the first email-shaped substring inside `input`.
 *
 * @param input - Arbitrary text (for example a `mailto:` link or a URL).
 * @returns The first match of the email pattern, or `null` when nothing matches.
 */
export const extractEmail = (input: string): string | null => {
    // local-part @ domain . tld (tld is at least two letters)
    const emailPattern = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
    const hit = input.match(emailPattern);
    if (hit === null) {
        return null;
    }
    return hit[0];
}
@ -44,14 +43,16 @@ export const meta = async (loc: LocalResult, options: any): Promise<LocationSite
return
}
try {
const _meta: LocationSiteMeta = await parse(loc.website, null, options) || {}
const _meta: LocationSiteMeta = await parseHtml(loc.website, null, options) || {}
loc.meta = _meta
loc.instagram = _meta.instagram
loc.facebook = _meta.facebook
loc.youtube = _meta.youtube
loc.linkedin = _meta.linkedin
loc.twitter = _meta.twitter
loc.email = (_meta.allLinks || []).map((l) => extractEmail(l)).filter((e) => e !== null)[0]
if (_meta.social) {
loc.instagram = _meta.social.find(p => p.source === 'instagram')?.url
loc.facebook = _meta.social.find(p => p.source === 'facebook')?.url
loc.youtube = _meta.social.find(p => p.source === 'youtube')?.url
loc.linkedin = _meta.social.find(p => p.source === 'linkedin')?.url
loc.twitter = _meta.social.find(p => p.source === 'twitter')?.url
}
return _meta
} catch (error) {
logger.error('Error retrieving meta data : ' + loc.website, error.message)
@ -73,23 +74,47 @@ const readMetaTags = ($: cheerio.CheerioAPI, name: string) => {
return $(`meta[name="${name}"]`).attr('content') || $(`meta[property="${name}"]`).attr('content') || null;
};
export const parse = async (url: string, config: AxiosRequestConfig | null, options: any): Promise<LocationSiteMeta> => {
export const parseHtml = async (url: string, config: AxiosRequestConfig | null, options: any): Promise<LocationSiteMeta> => {
if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url)) return {} as LocationSiteMeta;
const { data } = await axios(url,
{
...config,
httpsAgent: new https.Agent({
rejectUnauthorized: false
}),
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
},
timeout: 10000
})
let content = '';
let currentUrl = url;
const $ = cheerio.load(data)
if (options && options.headless) {
try {
const browser = await puppeteerExtraAny.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });
const page = await browser.newPage();
await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36');
await page.goto(url, { waitUntil: 'networkidle2', timeout: options.timeout || 30000 });
content = await page.content();
currentUrl = page.url();
await browser.close();
} catch (e) {
logger.error(`Puppeteer failed for ${url}: ${e.message}`);
}
}
if (!content) {
try {
const { data } = await axios(url, {
...config,
httpsAgent: new https.Agent({ rejectUnauthorized: false }),
headers: {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
},
timeout: 10000
});
content = data;
} catch (e) {
logger.error(`Axios failed for ${url}: ${e.message}`);
return {} as LocationSiteMeta;
}
}
console.log(`[ParseHtml] Successfully fetched ${url}, content length: ${content.length}`);
const $ = cheerio.load(content)
const og: Og = {}
const meta: Meta = {}
const images: Image[] = []
@ -127,15 +152,11 @@ export const parse = async (url: string, config: AxiosRequestConfig | null, opti
}
})
// Array to store JSON-LD data
const jsonLdArray: Structured[] = [];
// Select all <script> tags with type "application/ld+json"
$('script[type="application/ld+json"]').each((_, element) => {
const jsonLdContent = $(element).html();
if (jsonLdContent) {
try {
// Parse the JSON-LD content and push it to the array
const jsonData = JSON.parse(jsonLdContent);
jsonLdArray.push(jsonData);
} catch (e) {
@ -144,35 +165,69 @@ export const parse = async (url: string, config: AxiosRequestConfig | null, opti
}
})
$('a').each((index, element) => {
const href = $(element).attr('href')
if (href && isValidUrl(href)) {
if (href.indexOf('contact') !== -1 && !links.includes(href)) {
links.push(href)
let href = $(element).attr('href')
if (href) {
try {
href = new URL(href, url).href;
if (isValidUrl(href)) {
if (href.indexOf('contact') !== -1 && !links.includes(href)) {
links.push(href)
}
allLinks.push(href)
}
} catch (e) {
// Ignore invalid URLs
}
allLinks.push(href)
}
})
allLinks = [...new Set(allLinks)]
const instagram = allLinks.find(link => link.includes('instagram.com'))
const facebook = allLinks.find(link => link.includes('facebook.com'))
const linkedin = allLinks.find(link => link.includes('linkedin.com'))
const youtube = allLinks.find(link => link.includes('youtube.com'))
const twitter = allLinks.find(link => link.includes('twitter.com'))
const socialLinks: PageType[] = []
const internalPages: PageType[] = []
const externalLinks: PageType[] = []
allLinks.forEach(link => {
if (link.includes('instagram.com')) socialLinks.push({ url: link, source: 'instagram', status: 'PENDING' })
else if (link.includes('facebook.com')) socialLinks.push({ url: link, source: 'facebook', status: 'PENDING' })
else if (link.includes('linkedin.com')) socialLinks.push({ url: link, source: 'linkedin', status: 'PENDING' })
else if (link.includes('youtube.com')) socialLinks.push({ url: link, source: 'youtube', status: 'PENDING' })
else if (link.includes('twitter.com')) socialLinks.push({ url: link, source: 'twitter', status: 'PENDING' })
else if (link.includes('mailto:')) { /* ignore mailto */ }
else {
try {
const baseUrl = new URL(url).hostname;
const linkUrl = new URL(link).hostname;
if (linkUrl === baseUrl || linkUrl.endsWith('.' + baseUrl)) {
internalPages.push({ url: link, source: 'site', status: 'PENDING' });
} else {
externalLinks.push({ url: link, source: 'external', status: 'PENDING' });
}
} catch (e) {
externalLinks.push({ url: link, source: 'external', status: 'PENDING' });
}
}
})
const ret: LocationSiteMeta = {
meta,
og,
images,
keywords:
($('meta[property="og:keywords"]').attr("content") ||
title: meta.title || og.title,
description: meta.description || og.description,
image: meta.image || og.image,
url: meta.url || og.url || url,
social: socialLinks,
seo: {
keywords: ($('meta[property="og:keywords"]').attr("content") ||
$('meta[name="keywords"]').attr("content") || "").split(',').map(s => s.trim()).filter(s => s),
links,
allLinks,
instagram,
facebook,
linkedin,
youtube,
twitter,
structured: jsonLdArray
structured: jsonLdArray,
og,
metaTags: meta
},
pages: internalPages,
externalLinks: externalLinks,
images
}
return ret
}
@ -184,19 +239,8 @@ export const getScope = (cliArgs?: any) => {
}
return instance;
}
/*
export async function capture_request(where: any[], request: Request) {
debugRequests && logger.debug('Request', { url: request.url(), data: request.postData() });
where.push({ url: request.url(), data: await request.postData(), request: request });
debugRequests && logger.debug('requests', where.map(r => r.url));
}
export async function capture_response(where: any[], response: Response) {
debugResponses && logger.debug('Response', { url: response.url(), data: await response.json() });
where.push(response);
}
*/
export async function capture_responses(scope: Scope, page: puppeteer.Page) {
export async function capture_responses(scope: Scope, page: Page) {
try {
// await page.setRequestInterception(true);
} catch (e) {
@ -227,7 +271,7 @@ export async function capture_responses(scope: Scope, page: puppeteer.Page) {
export class Scope {
browser!: Browser
context!: any
page!: puppeteer.Page
page!: Page
args!: any;
requests: any[] = []
responses: any[] = []
@ -249,13 +293,11 @@ export class Scope {
`--user-data-dir=${path.resolve('../chrome')}`
];
this.browser = await puppeteer.launch({
this.browser = await puppeteerExtraAny.launch({
... this.args,
args: args
});
// const context = await this.browser.createIncognitoBrowserContext();
this.page = await this.browser.newPage();
// this.page = await context.newPage();
this.page.on('console', msg => {
// error('Browser error:', msg);
@ -263,27 +305,11 @@ export class Scope {
this.page.on('error', msg => logger.error('Browser Error:', msg));
this.page.on('pageerror', msg => logger.error('Browser Page Error:', msg));
this.page.on('requestfailed', msg => logger.error('Browser Page Request Error:', msg));
//capture_requests(this, this.page);
//capture_responses(this, this.page);
// this.args.disableRequests !== 'true' && capture_requests(this, this.page);
// this.args.disableResponses !== 'true' && capture_requests(this, this.page);
// capture_responses(this, this.page);
const page2 = this.page as any;
//page2.setCacheEnabled(false);
/**
await page2._client.on('Security.certificateError', (event: any) => {
page2._client.send('Security.handleCertificateError', {
eventId: event.eventId,
action: 'continue' // ignore error and continue request
})
})
*/
}
}
export const body = async (url: string) => {
const options = {
headless: false,

View File

@ -1,25 +1,12 @@
export * from './types.js'
export * from './googlemaps.js'
export * from './map_types.js'
export * from './email.js'
export * from './html.js'
export * from './providers.js'
import { generate_interfaces } from '@polymech/commons'
import { getJson as searchSerpAPI } from "serpapi"
import { search as searchScaleserp } from './scalesep.js'
import { zodSchema as zodSchemaGoogleMaps, zodSchemaEach } from './googlemaps.js'
/**
 * Search backends keyed by provider name.
 * `scaleserp` is the `search` export of the local scalesep module and
 * `serpApi` is `getJson` from the serpapi package.
 */
export const SearchProviders = {
scaleserp: searchScaleserp,
serpApi: searchSerpAPI
}
/**
 * Returns a shallow copy of `opts` with the credential-bearing fields masked.
 *
 * `openai`, `bigdata`, `api_key` and `geocode_key` are always set to the
 * literal string 'hidden' on the copy, whether or not they were present in
 * `opts`. The input object is not modified.
 *
 * @param opts - Option bag to copy.
 * @returns The masked copy.
 */
export const cleanOptions = (opts: any) => {
    const masked = { ...opts }
    for (const secretField of ['openai', 'bigdata', 'api_key', 'geocode_key']) {
        masked[secretField] = 'hidden'
    }
    return masked
}
export const types = () => generate_interfaces([
zodSchemaGoogleMaps() as any,
zodSchemaEach() as any,

View File

@ -1,18 +1,4 @@
/**
 * Bookkeeping section of a search response; this is the type of
 * `SearchResult.search_metadata`.
 * NOTE(review): the field names look like a search-provider payload — confirm
 * the exact meaning and units (e.g. of `total_time_taken`) against the provider docs.
 */
export interface SearchMetadata {
id: string;
status: string;
json_endpoint: string;
created_at: string;
processed_at: string;
google_maps_url: string;
raw_html_file: string;
total_time_taken: number;
}
/**
 * Summary information about an executed search.
 * NOTE(review): presumably mirrors a provider response section — confirm
 * against the provider docs.
 */
export interface SearchInformation {
local_results_state: string;
query_displayed: string;
}
import { SearchMetadata, SearchInformation, SearchParameters } from './types.js'
export interface GpsCoordinates {
latitude: number;
@ -57,14 +43,6 @@ export type LocalResult = {
[key: string]: any
}
/**
 * Parameters describing a map search request.
 * NOTE(review): presumably `q` is the query text, `ll` a "@lat,lng,zoom"
 * string, `google_domain` the Google domain to query and `hl` the UI
 * language code — confirm against the search provider docs.
 */
export interface SearchParameters {
engine: string;
type: string;
q: string;
ll: string;
google_domain: string;
hl: string;
}
export interface SearchResult {
search_metadata: SearchMetadata;
@ -79,18 +57,34 @@ export interface SearchResult {
//
/////////////////////////////////////////////////
export interface LocationSiteMeta {
og?: Og
meta?: Meta
links?: string[]
images?: Image[]
allLinks?: string[]
keywords?: string[]
instagram?: string
facebook?: string
linkedin?: string
youtube?: string
twitter?: string
structured?: Structured[]
// Normalized Metadata
title?: string;
description?: string;
image?: string;
url?: string;
// Grouped Data
social?: Page[];
seo?: SeoData;
// Crawling & Content
pages?: Page[];
externalLinks?: Page[];
images?: Image[];
}
/**
 * SEO-related data grouped under `LocationSiteMeta.seo`.
 */
export interface SeoData {
// Keyword list for the page.
keywords?: string[];
// Structured-data entries for the page.
structured?: Structured[];
// Open Graph values for the page.
og?: Og;
// Plain meta-tag values for the page.
metaTags?: Meta;
}
/**
 * A single link tracked for a location's website, together with its
 * processing state. Used for the `social`, `pages` and `externalLinks`
 * lists of `LocationSiteMeta`.
 */
export interface Page {
// Address of the page or link.
url: string
// Label describing where the link belongs (free-form string).
source: string
// Processing state of this entry.
// NOTE(review): presumably moves PENDING -> SEARCHING_EMAIL -> SEARCHED_EMAIL | FAILED — confirm against the email crawler.
status: 'PENDING' | 'SEARCHING_EMAIL' | 'SEARCHED_EMAIL' | 'FAILED'
// NOTE(review): presumably the failure detail when status is 'FAILED' — confirm.
error?: string
}
export interface Og {

View File

@ -0,0 +1,17 @@
import { getJson as searchSerpAPI } from "serpapi"
import { search as searchScaleserp } from './scalesep.js'
/**
 * Search backends keyed by provider name.
 * `scaleserp` is the `search` export of the local scalesep module and
 * `serpApi` is `getJson` from the serpapi package.
 */
export const SearchProviders = {
scaleserp: searchScaleserp,
serpApi: searchSerpAPI
}
/**
 * Returns a shallow copy of `opts` with the credential-bearing fields masked.
 *
 * `openai`, `bigdata`, `api_key` and `geocode_key` are always set to the
 * literal string 'hidden' on the copy, whether or not they were present in
 * `opts`. The input object is not modified.
 *
 * @param opts - Option bag to copy.
 * @returns The masked copy.
 */
export const cleanOptions = (opts: any) => {
    const masked = { ...opts }
    for (const secretField of ['openai', 'bigdata', 'api_key', 'geocode_key']) {
        masked[secretField] = 'hidden'
    }
    return masked
}