mono/packages/search/dist-in/lib/html.js
2025-11-25 11:07:35 +01:00

320 lines
25 KiB
JavaScript

import https from 'https';
import axios from "axios";
import * as cheerio from "cheerio";
import * as path from 'path';
import { URL } from 'url';
import * as puppeteer from 'puppeteer';
import { logger } from '../index.js';
// Exported string constants; none of them is referenced elsewhere in this file.
// (presumably file-name suffixes for per-session output — confirm against callers)
export const STATS_SUFFIX = '_stats.json';
export const SESSION_EVENTS_SUFFIX = '_session.json';
export const TRACE_SUFFIX = '_trace.json';
// Module-level Scope holder; assigned lazily by body() on its first call.
export var scope;
// Not referenced anywhere else in this file.
const included_categories = ['devtools.timeline'];
// Returns the hostname part of a URL string; not referenced anywhere else in this file.
const _url_short = (url) => new URL(url).hostname;
// Cached Scope instance handed out by getScope().
let instance;
// Only referenced from the commented-out capture_request helper.
const debugRequests = true;
// When true, capture_responses() logs non-200 responses and handler failures.
const debugResponses = false;
// Import-time, process-wide side effect: Node.js stops verifying TLS certificates
// for every connection made by this process once this variable is '0'.
// NOTE(review): this affects all modules in the process, not just this one — confirm it is intended.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
/**
 * Extracts the first email-like substring from a piece of text.
 *
 * @param input Text to scan (for example a `mailto:` href).
 * @returns The first matching address, or `null` when nothing matches or
 *          `input` is not a string.
 */
export const extractEmail = (input) => {
    // Non-string input (null, undefined, numbers, ...) has no usable `.match`;
    // report "no email" instead of throwing a TypeError.
    if (typeof input !== 'string') {
        return null;
    }
    // Regular expression to match a typical email format
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
    const match = input.match(emailRegex);
    // Return the matched email, or null if none is found
    return match ? match[0] : null;
};
/**
 * Fetches the website of `loc` via parse() and copies the interesting results
 * onto `loc` (mutating it).
 *
 * Does nothing when `loc.website` is missing (a warning is logged) or when
 * `loc.meta` is already set.
 *
 * On success `loc.meta`, `loc.instagram`, `loc.facebook`, `loc.youtube`,
 * `loc.linkedin`, `loc.twitter` and `loc.email` (first email found among the
 * page links) are assigned.
 *
 * On failure the error is logged and, when an HTTP status is available,
 * `loc.rejected` is set to whether that status is 404, 403, 999 or 503.
 *
 * @param loc     Location record; read for `website`/`title`/`meta`, written as described above.
 * @param options Forwarded to parse() as its third argument.
 * @returns The parsed meta object on success, otherwise `undefined`.
 */
export const meta = async (loc, options) => {
    if (!loc.website) {
        logger.warn(`No website to retrieve meta data : ${loc.title}`);
        return;
    }
    if (loc.meta) {
        return;
    }
    try {
        const _meta = await parse(loc.website, null, options) || {};
        loc.meta = _meta;
        loc.instagram = _meta.instagram;
        loc.facebook = _meta.facebook;
        loc.youtube = _meta.youtube;
        loc.linkedin = _meta.linkedin;
        loc.twitter = _meta.twitter;
        loc.email = (_meta.allLinks || []).map((l) => extractEmail(l)).filter((e) => e !== null)[0];
        return _meta;
    }
    catch (error) {
        logger.error('Error retrieving meta data : ' + loc.website, error?.message);
        // Axios reports the HTTP status of a failed request on `error.response.status`;
        // `error.status` is kept as the first choice for errors that carry it directly.
        const status = error?.status ?? error?.response?.status;
        if (status) {
            loc.rejected = status === 404 || status === 403 || status === 999 || status === 503;
        }
    }
};
/**
 * Tells whether `url` can be parsed by the WHATWG `URL` constructor without
 * a base, i.e. whether it is an absolute URL.
 *
 * @param url Candidate URL string.
 * @returns `true` when construction succeeds, `false` when it throws.
 */
export const isValidUrl = (url) => {
    let parsable = true;
    try {
        // Constructed only for its validation side effect.
        void new URL(url);
    }
    catch {
        parsable = false;
    }
    return parsable;
};
const readMetaTags = ($, name) => {
return $(`meta[name="${name}"]`).attr('content') || $(`meta[property="${name}"]`).attr('content') || null;
};
/**
 * Downloads a page with axios and extracts metadata from its HTML.
 *
 * Request notes: `config` is spread first, so its `httpsAgent`, `headers` and
 * `timeout` entries are overridden by the values set here (certificate
 * verification disabled, fixed User-Agent, 10 s timeout).
 *
 * @param url     Page to fetch; anything not matching the http(s) pattern yields `{}`.
 * @param config  Extra axios request options (may be `null`).
 * @param options Currently unused by this function.
 * @returns `{}` for a non-http(s) `url`, otherwise an object with:
 *   - `meta`: `<title>`, canonical `url`, and `title`/`description`/`image` meta tags
 *   - `og`: Open Graph values keyed without the `og:` prefix
 *   - `images`: `{ src }` entries, resolved against `url`
 *   - `keywords`: trimmed, non-empty comma-separated keywords
 *   - `links`: unique links whose href contains `contact`
 *   - `allLinks`: unique anchor links (absolute hrefs verbatim, relative ones resolved against `url`)
 *   - `instagram`/`facebook`/`linkedin`/`youtube`/`twitter`: first link containing the respective domain
 *   - `structured`: parsed JSON-LD blocks
 * @throws Whatever axios throws for network/HTTP failures.
 */
export const parse = async (url, config, options) => {
    if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url))
        return {};
    const { data } = await axios(url, {
        ...config,
        httpsAgent: new https.Agent({
            rejectUnauthorized: false
        }),
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        },
        timeout: 10000
    });
    const $ = cheerio.load(data);
    const og = {};
    const meta = {};
    const images = [];
    const links = [];
    let allLinks = [];
    const title = $('title').text();
    if (title)
        meta.title = title;
    const canonical = $('link[rel=canonical]').attr('href');
    if (canonical) {
        meta.url = canonical;
    }
    // A <meta name="title"> tag, when present, replaces the <title> text read above.
    ['title', 'description', 'image'].forEach(s => {
        const val = readMetaTags($, s);
        if (val)
            meta[s] = val;
    });
    ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type'].forEach(s => {
        const val = readMetaTags($, s);
        if (val)
            og[s.split(':')[1]] = val;
    });
    $('img').each((i, el) => {
        let src = $(el).attr('src');
        if (src) {
            try {
                src = new URL(src, url).href;
                images.push({ src });
            }
            catch (e) {
                // ignore invalid urls
            }
        }
    });
    // Array to store JSON-LD data
    const jsonLdArray = [];
    // Select all <script> tags with type "application/ld+json"
    $('script[type="application/ld+json"]').each((_, element) => {
        const jsonLdContent = $(element).html();
        if (jsonLdContent) {
            try {
                // Parse the JSON-LD content and push it to the array
                const jsonData = JSON.parse(jsonLdContent);
                jsonLdArray.push(jsonData);
            }
            catch (e) {
                logger.error(`Error parsing JSON-LD: ${e.message} @ ${url}`);
            }
        }
    });
    $('a').each((index, element) => {
        const href = $(element).attr('href');
        // Skip missing hrefs and pure in-page anchors.
        if (!href || href.startsWith('#')) {
            return;
        }
        let link;
        if (isValidUrl(href)) {
            // Already absolute: keep the author's spelling untouched.
            link = href;
        }
        else {
            // Relative href (e.g. "/contact"): resolve it against the page URL,
            // the same way image sources are resolved above.
            try {
                link = new URL(href, url).href;
            }
            catch (e) {
                // ignore hrefs that cannot be resolved
                return;
            }
        }
        // Test the raw href so the site's own host name cannot make every
        // resolved relative link look like a contact link.
        if (href.includes('contact') && !links.includes(link)) {
            links.push(link);
        }
        allLinks.push(link);
    });
    allLinks = [...new Set(allLinks)];
    const instagram = allLinks.find(link => link.includes('instagram.com'));
    const facebook = allLinks.find(link => link.includes('facebook.com'));
    const linkedin = allLinks.find(link => link.includes('linkedin.com'));
    const youtube = allLinks.find(link => link.includes('youtube.com'));
    const twitter = allLinks.find(link => link.includes('twitter.com'));
    const ret = {
        meta,
        og,
        images,
        keywords: ($('meta[property="og:keywords"]').attr("content") ||
            $('meta[name="keywords"]').attr("content") || "").split(',').map(s => s.trim()).filter(s => s),
        links,
        allLinks,
        instagram,
        facebook,
        linkedin,
        youtube,
        twitter,
        structured: jsonLdArray
    };
    return ret;
};
/**
 * Returns the module-wide Scope instance, creating it on first use.
 *
 * @param cliArgs Stored as `args` on the instance, but only when the instance
 *                is created; later calls ignore this argument.
 * @returns The cached Scope instance.
 */
export const getScope = (cliArgs) => {
    if (instance) {
        return instance;
    }
    instance = new Scope();
    instance.args = cliArgs;
    return instance;
};
/*
export async function capture_request(where: any[], request: Request) {
debugRequests && logger.debug('Request', { url: request.url(), data: request.postData() });
where.push({ url: request.url(), data: await request.postData(), request: request });
debugRequests && logger.debug('requests', where.map(r => r.url));
}
export async function capture_response(where: any[], response: Response) {
debugResponses && logger.debug('Response', { url: response.url(), data: await response.json() });
where.push(response);
}
*/
/**
 * Subscribes to the page's 'response' event and forwards every response with
 * status 200 to `scope.onResponse(response, scope)` when that hook is set.
 *
 * `scope.responses` is reset to an empty array; nothing in this function
 * appends to it. Request interception is not enabled here.
 *
 * @param scope Object carrying the optional `onResponse` hook.
 * @param page  Event source exposing `on('response', handler)`.
 */
export async function capture_responses(scope, page) {
    scope.responses = [];
    const handleResponse = (response) => {
        try {
            const contentType = response.headers()['content-type'] || '';
            // Only the commented-out capture logic ever consumed this flag; it is
            // still evaluated so that a malformed header fails inside this try.
            const isJson = contentType.startsWith('application/json;') === true;
            const url = response.url();
            if (response.status() !== 200) {
                debugResponses && logger.error(`Error loading ${url} : ${response.status()}`);
                return;
            }
            if (scope.onResponse) {
                scope.onResponse(response, scope);
            }
        }
        catch (e) {
            debugResponses && logger.error('Error parsing response');
        }
    };
    page.on('response', handleResponse);
}
/**
 * Holds one Puppeteer browser and page together with per-session bookkeeping.
 */
export class Scope {
    // Browser returned by puppeteer.launch(); set by init().
    browser;
    // Never assigned in this file.
    context;
    // Page opened by init().
    page;
    // Launch options; assigned by getScope() and spread into puppeteer.launch().
    args;
    requests = [];
    // Reset to an empty array by capture_responses().
    responses = [];
    eventBeacons = [];
    mutationBeacons = [];
    // Set by init() to ' - ' followed by the current epoch milliseconds.
    sessionSuffix = '';
    // Optional hook; capture_responses() calls it as onResponse(response, scope) for status-200 responses.
    onResponse;
    // Not invoked anywhere in this file.
    onRequest;
    /**
     * Launches the browser, opens a page and wires error logging.
     *
     * `this.args` is spread into the launch options first, so the `args` array
     * built here replaces any `args` entry supplied through `this.args`.
     */
    async init() {
        this.sessionSuffix = ` - ${Date.now()}`;
        const args = [
            '--no-sandbox',
            '--disable-setuid-sandbox',
            '--disable-infobars',
            '--window-position=0,0',
            // Spelling fixed ("certifcate" -> "certificate"): Chromium ignores
            // switches it does not recognise, so the misspelt ones had no effect.
            '--ignore-certificate-errors',
            '--ignore-certificate-errors-spki-list',
            // NOTE(review): the double quotes look like they become part of the
            // user-agent value because no shell strips them — confirm.
            '--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"',
            `--user-data-dir=${path.resolve('../chrome')}`
        ];
        this.browser = await puppeteer.launch({
            ...this.args,
            args: args
        });
        this.page = await this.browser.newPage();
        this.page.on('error', msg => logger.error('Browser Error:', msg));
        this.page.on('pageerror', msg => logger.error('Browser Page Error:', msg));
        this.page.on('requestfailed', msg => logger.error('Browser Page Request Error:', msg));
    }
}
/**
 * Loads `url` in the shared browser page and returns its rendered content.
 *
 * The shared Scope is created and initialised on the first call (non-headless
 * launch options); later calls reuse the same page.
 *
 * @param url Page to open.
 * @returns `{ html, text, links }` where `links` are the anchor hrefs
 *          containing `contact`; `null` when the page cannot be loaded or has
 *          no content; `undefined` when the browser scope cannot be initialised.
 */
export const body = async (url) => {
    const options = {
        headless: false,
        url: url
    };
    const timeout = 8000;
    if (!scope) {
        try {
            scope = await getScope(options);
            await scope.init();
        }
        catch (e) {
            logger.error("Invalid scope - abort", e);
            // Do not keep a half-initialised scope around: the next call retries init().
            scope = undefined;
            return;
        }
        scope.page.on("pageerror", function (err) {
            logger.error('page-error!', err);
        });
        scope.page.on("error", function (err) {
            logger.error('brower-error!', err);
        });
        scope.page.on('console', msg => {
            // Prefer the public ConsoleMessage.type() accessor; fall back to the
            // private field for objects that do not provide it.
            const type = typeof msg.type === 'function' ? msg.type() : msg._type;
            if (type === 'error') {
                logger.error('Browser error:', msg);
                scope.page.isError = true;
            }
        });
    }
    let html;
    try {
        await scope.page.goto(url, {
            timeout: timeout,
            waitUntil: 'networkidle0'
        });
        html = await scope.page.content();
    }
    catch (e) {
        // Settle with null instead of leaving the caller waiting forever.
        logger.error('error loading page', e);
        return null;
    }
    if (!html) {
        logger.error('error user page ', url);
        return null;
    }
    const $ = cheerio.load(html, {
        xmlMode: true
    });
    const links = [];
    $('a').each((index, element) => {
        const href = $(element).attr('href');
        if (href && href.indexOf('contact') !== -1) {
            links.push(href);
        }
    });
    return { html, text: $('body').text(), links };
};
//# sourceMappingURL=data:application/json;base64,