mono/packages/search/lib/html.js
2025-03-11 11:28:14 +01:00

335 lines
13 KiB
JavaScript

"use strict";
/**
 * Runs a generator function as an asynchronous routine: every yielded value is
 * awaited and its settled result is fed back into the generator.
 * Reuses `this.__awaiter` when one is already defined on the enclosing `this`.
 * NOTE(review): this looks like the helper a TypeScript build emits for
 * async/await — confirm before editing by hand.
 *
 * @param thisArg - `this` value the generator function is applied with.
 * @param _arguments - Arguments forwarded to the generator function (an empty list when falsy).
 * @param P - Promise constructor to use; replaced by the global Promise when falsy.
 * @param generator - Generator function to drive.
 * @returns A promise resolved with the generator's return value, or rejected
 *          with an error thrown while advancing it.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap anything that is not already a P instance so `.then` can be called on it.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with a fulfilled value; an error thrown while resuming rejects the outer promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw a rejection reason into the generator so its own try/catch blocks can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait for the yielded value to settle.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// `generator` is reassigned from the generator function to the iterator it creates, then advanced once.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.contactUrl = exports.body = exports.Scope = exports.getScope = exports.parse = exports.isValidUrl = exports.meta = exports.extractEmail = exports.scope = exports.TRACE_SUFFIX = exports.SESSION_EVENTS_SUFFIX = exports.STATS_SUFFIX = void 0;
exports.capture_responses = capture_responses;
const https = require('https');
const axios_1 = require("axios");
const node_html_parser_1 = require("node-html-parser");
const cheerio = require("cheerio");
const path = require("path");
const url_1 = require("url");
const puppeteer = require("puppeteer");
const index_1 = require("../index");
const html_1 = require("./html");
// Exported file-name suffixes. Nothing in the visible code reads them;
// presumably callers append them to output file names — TODO confirm.
exports.STATS_SUFFIX = '_stats.json';
exports.SESSION_EVENTS_SUFFIX = '_session.json';
exports.TRACE_SUFFIX = '_trace.json';
// Not referenced in the visible code; presumably a list of tracing categories — TODO confirm.
const included_categories = ['devtools.timeline'];
// Returns the hostname part of a URL string; the URL constructor throws when the string cannot be parsed.
const _url_short = (url) => new url_1.URL(url).hostname;
// Shared Scope object, created on first use by getScope().
let instance;
// Debug switches: debugResponses gates the diagnostic logging in capture_responses();
// debugRequests is only mentioned in commented-out code.
const debugRequests = true;
const debugResponses = false;
// NOTE(security): setting this to '0' turns off TLS certificate verification for
// the whole Node.js process, and it happens as a side effect of loading this module.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
/**
 * Extracts the first email-like substring from a piece of text.
 *
 * @param {*} input - Text to scan (for example a `mailto:` link). Anything that
 *        is not a string is treated as containing no email instead of throwing.
 * @returns {string|null} The first matched address, or null when none is found.
 */
const extractEmail = (input) => {
    // Guard first: calling .match() on null/undefined/non-strings would throw a TypeError.
    if (typeof input !== 'string') {
        return null;
    }
    // Typical address shape: local part, "@", domain labels, and a TLD of two or more letters.
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
    const match = input.match(emailRegex);
    return match ? match[0] : null;
};
exports.extractEmail = extractEmail;
/**
 * Fetches metadata for a location's website and stores it on the location
 * object in place (`meta`, social links and the first email found in the
 * page's links).
 *
 * Does nothing when the location has no `website` (a warning is logged) or
 * already carries `meta`. Fetch/parse failures are logged, never rethrown.
 *
 * @param loc - Location record; read: `website`, `title`, `meta`; written:
 *        `meta`, `instagram`, `facebook`, `youtube`, `linkedin`, `twitter`,
 *        `email`, and `rejected` when the failure carries an HTTP status.
 * @param options - Passed through unchanged as the third argument of `parse`.
 * @returns {Promise<void>}
 */
const meta = (loc, options) => __awaiter(void 0, void 0, void 0, function* () {
    if (!loc.website) {
        index_1.logger.warn(`No website to retrieve meta data : ${loc.title}`);
        return;
    }
    if (loc.meta) {
        return;
    }
    try {
        const _meta = (yield (0, html_1.parse)(loc.website, null, options)) || {};
        loc.meta = _meta;
        loc.instagram = _meta.instagram;
        loc.facebook = _meta.facebook;
        loc.youtube = _meta.youtube;
        loc.linkedin = _meta.linkedin;
        loc.twitter = _meta.twitter;
        // First link that contains something shaped like an email address (undefined when none does).
        loc.email = (_meta.allLinks || []).map((l) => (0, exports.extractEmail)(l)).filter((e) => e !== null)[0];
    }
    catch (error) {
        index_1.logger.error('Error retrieving meta data : ' + loc.website, error.message);
        // HTTP client errors expose the status on `error.response.status`; not every
        // client version mirrors it onto the error itself, so check both places.
        const status = error.status || (error.response && error.response.status);
        if (status) {
            loc.rejected = [403, 404, 503, 999].includes(status);
        }
    }
});
exports.meta = meta;
/**
 * Reports whether a value can be parsed as a URL on its own (no base URL is
 * supplied, so relative paths such as "/contact" are not valid).
 *
 * @param url - Value to check.
 * @returns {boolean} true when the URL constructor accepts the value, false when it throws.
 */
const isValidUrl = (url) => {
    let parsable = true;
    try {
        // Constructed only for its validation side effect: the constructor throws on unparsable input.
        void new url_1.URL(url);
    }
    catch (_parseError) {
        parsable = false;
    }
    return parsable;
};
exports.isValidUrl = isValidUrl;
/**
 * Reads the `content` attribute of a <meta> element when the element is the
 * tag called `name`.
 *
 * The tag's identity is its `name` attribute or, when that is missing/empty,
 * its `property` attribute.
 *
 * @param el - Element exposing `getAttribute(attributeName)`.
 * @param {string} name - Tag name to match, e.g. "description" or "og:title".
 * @returns The element's `content` attribute when the tag matches (whatever
 *          `getAttribute` yields for it), otherwise null.
 */
const readMetaTags = (el, name) => {
    const prop = el.getAttribute('name') || el.getAttribute('property');
    // Strict comparison: no type coercion between the attribute and the requested name.
    return prop === name ? el.getAttribute('content') : null;
};
/**
 * Collects the document title, canonical URL and the plain / Open Graph meta
 * tags from a parsed document. Later <meta> tags overwrite earlier values.
 */
function collectMetaTags(root) {
    const meta = {};
    const og = {};
    const title = root.querySelector('title');
    if (title) {
        meta.title = title.text;
    }
    const canonical = root.querySelector('link[rel=canonical]');
    if (canonical) {
        meta.url = canonical.getAttribute('href');
    }
    const plainNames = ['title', 'description', 'image'];
    const ogNames = ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type'];
    for (const el of root.querySelectorAll('meta')) {
        for (const name of plainNames) {
            const val = readMetaTags(el, name);
            if (val) {
                meta[name] = val;
            }
        }
        for (const name of ogNames) {
            const val = readMetaTags(el, name);
            if (val) {
                // Stored without the "og:" prefix, e.g. "og:site_name" -> "site_name".
                og[name.split(':')[1]] = val;
            }
        }
    }
    return { meta, og };
}
/**
 * Lists every <img> source resolved against the page URL. A source that cannot
 * be resolved is skipped with a warning instead of failing the whole page.
 */
function collectImages(root, baseUrl) {
    const images = [];
    root.querySelectorAll('img').forEach((el) => {
        const src = el.getAttribute('src');
        if (!src) {
            return;
        }
        try {
            images.push({ src: new url_1.URL(src, baseUrl).href });
        }
        catch (e) {
            index_1.logger.warn(`Skipping invalid image URL: ${src} @ ${baseUrl}`);
        }
    });
    return images;
}
/**
 * Parses every <script type="application/ld+json"> payload; blocks that are
 * not valid JSON are logged and skipped.
 */
function collectJsonLd($, url) {
    const jsonLdArray = [];
    $('script[type="application/ld+json"]').each((_, element) => {
        const jsonLdContent = $(element).html();
        if (!jsonLdContent) {
            return;
        }
        try {
            jsonLdArray.push(JSON.parse(jsonLdContent));
        }
        catch (e) {
            index_1.logger.error(`Error parsing JSON-LD: ${e.message} @ ${url}`);
        }
    });
    return jsonLdArray;
}
/**
 * Gathers the anchors whose href is a standalone-valid URL: `allLinks` holds
 * all of them (de-duplicated), `links` only those containing "contact".
 */
function collectLinks($) {
    const links = [];
    const allLinks = [];
    $('a').each((_, element) => {
        const href = $(element).attr('href');
        // Validate once per anchor; relative hrefs are not valid on their own and are skipped.
        if (!href || !(0, exports.isValidUrl)(href)) {
            return;
        }
        if (href.includes('contact') && !links.includes(href)) {
            links.push(href);
        }
        allLinks.push(href);
    });
    return { links, allLinks: [...new Set(allLinks)] };
}
/**
 * Downloads a page and extracts metadata from its HTML.
 *
 * @param {string} url - Page address; anything not starting with http:// or
 *        https:// yields an empty object without a request being made.
 * @param config - Extra request options merged into the HTTP request; the
 *        `httpsAgent`, `headers` and `timeout` set here take precedence.
 * @param options - Currently unused.
 * @returns {Promise<object>} `{}` for a non-http(s) url, otherwise an object
 *          with `meta`, `og`, `images`, `keywords`, `links` (contact links),
 *          `allLinks`, `instagram`, `facebook`, `linkedin`, `youtube`,
 *          `twitter` and `structured` (parsed JSON-LD blocks).
 *          Request errors propagate to the caller as a rejected promise.
 */
const parse = (url, config, options) => __awaiter(void 0, void 0, void 0, function* () {
    if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url))
        return {};
    const { data } = yield (0, axios_1.default)(url, Object.assign(Object.assign({}, config), {
        // NOTE(security): certificate verification is switched off for this request.
        httpsAgent: new https.Agent({
            rejectUnauthorized: false
        }),
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        },
        timeout: 10000
    }));
    // The response is parsed twice: node-html-parser for meta tags and images, cheerio for scripts and links.
    const root = (0, node_html_parser_1.parse)(data);
    const { meta, og } = collectMetaTags(root);
    const images = collectImages(root, url);
    const _$ = cheerio.load(data);
    const structured = collectJsonLd(_$, url);
    const { links, allLinks } = collectLinks(_$);
    const firstLinkTo = (host) => allLinks.find((link) => link.includes(host));
    return {
        meta,
        og,
        images,
        // NOTE(review): a string when a keywords tag exists, an empty array otherwise.
        keywords: _$('meta[property="og:keywords"]').attr("content") ||
            _$('meta[name="keywords"]').attr("content") || [],
        links,
        allLinks,
        instagram: firstLinkTo('instagram.com'),
        facebook: firstLinkTo('facebook.com'),
        linkedin: firstLinkTo('linkedin.com'),
        youtube: firstLinkTo('youtube.com'),
        twitter: firstLinkTo('twitter.com'),
        structured
    };
});
exports.parse = parse;
/**
 * Returns the shared Scope, creating it on the first call.
 *
 * @param cliArgs - Stored as `args` on the Scope when it is created; ignored
 *        on later calls because the existing instance is returned untouched.
 * @returns The shared Scope instance.
 */
const getScope = (cliArgs) => {
    if (instance) {
        return instance;
    }
    instance = new Scope();
    instance.args = cliArgs;
    return instance;
};
exports.getScope = getScope;
/*
export async function capture_request(where: any[], request: Request) {
debugRequests && logger.debug('Request', { url: request.url(), data: request.postData() });
where.push({ url: request.url(), data: await request.postData(), request: request });
debugRequests && logger.debug('requests', where.map(r => r.url));
}
export async function capture_response(where: any[], response: Response) {
debugResponses && logger.debug('Response', { url: response.url(), data: await response.json() });
where.push(response);
}
*/
/**
 * Resets `scope.responses` and subscribes to the page's 'response' event.
 *
 * For every response with status 200, `scope.onResponse(response, scope)` is
 * called when that hook is set. Other statuses, and errors thrown while
 * handling a response, are only logged when `debugResponses` is enabled; they
 * never escape the event handler.
 *
 * @param scope - Object whose `responses` array is reset and whose optional
 *        `onResponse` hook is invoked.
 * @param page - Event source exposing `on(eventName, handler)`.
 * @returns {Promise<void>} Resolved once the listener is registered.
 */
function capture_responses(scope, page) {
    return __awaiter(this, void 0, void 0, function* () {
        scope.responses = [];
        page.on('response', (response) => {
            try {
                if (response.status() !== 200) {
                    debugResponses && index_1.logger.error(`Error loading ${response.url()} : ${response.status()}`);
                    return;
                }
                if (scope.onResponse) {
                    scope.onResponse(response, scope);
                }
            }
            catch (e) {
                // Best effort: a failing hook must not break the page's event dispatch.
                debugResponses && index_1.logger.error('Error parsing response', e);
            }
        });
    });
}
/**
 * Holds one browser session: the launched browser, its page and the
 * collections filled while the page is in use.
 *
 * `args` is assigned from outside (see getScope) and is spread into the
 * browser launch options by init().
 */
class Scope {
    constructor() {
        this.requests = [];
        this.responses = [];
        this.eventBeacons = [];
        this.mutationBeacons = [];
        this.sessionSuffix = '';
    }
    /**
     * Launches the browser, opens a page and wires error logging.
     *
     * Sets `sessionSuffix`, `browser` and `page` on this instance.
     *
     * @returns {Promise<void>} Rejects when the browser cannot be launched or
     *          the page cannot be opened.
     */
    init() {
        return __awaiter(this, void 0, void 0, function* () {
            this.sessionSuffix = ' - ' + new Date().getTime();
            const args = [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-infobars',
                '--window-position=0,0',
                // NOTE(security): certificate errors are deliberately ignored.
                // The switch names were misspelled ("certifcate") and therefore had no effect.
                '--ignore-certificate-errors',
                '--ignore-certificate-errors-spki-list',
                // No quotes around the value: arguments are handed to the browser
                // without a shell, so quotes would become part of the user agent.
                '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36',
                `--user-data-dir=${path.resolve('../chrome')}`
            ];
            // Caller-supplied launch options first; the argument list above always wins.
            this.browser = yield puppeteer.launch(Object.assign(Object.assign({}, this.args), { args: args }));
            this.page = yield this.browser.newPage();
            // Console output of the page is intentionally ignored here.
            this.page.on('console', msg => {
            });
            this.page.on('error', msg => index_1.logger.error('Browser Error:', msg));
            this.page.on('pageerror', msg => index_1.logger.error('Browser Page Error:', msg));
            this.page.on('requestfailed', msg => index_1.logger.error('Browser Page Request Error:', msg));
            // Request/response capturing (capture_responses) is currently not wired in.
        });
    }
}
exports.Scope = Scope;
/**
 * Loads a page in the shared browser scope and returns its rendered content.
 *
 * The scope is created and initialised on first use. If that fails, the error
 * is logged and `exports.scope` is cleared so a later call can try again.
 *
 * @param {string} url - Page to open.
 * @returns {Promise<{html: string, text: string, links: string[]}|null|undefined>}
 *          The page HTML, the text of its body and every href containing
 *          "contact"; null when the page cannot be loaded or has no content;
 *          undefined when the browser scope cannot be set up.
 */
const body = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const options = {
        headless: false,
        url: url
    };
    const timeout = 8000;
    if (!exports.scope) {
        try {
            exports.scope = yield (0, exports.getScope)(options);
            yield exports.scope.init();
        }
        catch (e) {
            index_1.logger.error("Invalid scope - abort", e);
            // Do not keep a half-initialised scope around: later calls would skip
            // initialisation and fail on the missing page.
            exports.scope = undefined;
            return;
        }
        exports.scope.page.on("pageerror", function (err) {
            index_1.logger.error('page-error!', err);
        });
        exports.scope.page.on("error", function (err) {
            index_1.logger.error('brower-error!', err);
        });
        exports.scope.page.on('console', msg => {
            // Prefer the public accessor; fall back to the private field used before.
            const type = typeof msg.type === 'function' ? msg.type() : msg._type;
            if (type === 'error') {
                index_1.logger.error('Browser error:', msg);
                exports.scope.page.isError = true;
            }
        });
    }
    let html;
    try {
        yield exports.scope.page.goto(url, {
            timeout: timeout,
            waitUntil: 'networkidle0'
        });
        html = yield exports.scope.page.content();
    }
    catch (e) {
        // Settle with null instead of leaving the caller waiting forever.
        index_1.logger.error('error loading page', e);
        return null;
    }
    if (!html) {
        index_1.logger.error('error user page ', url);
        return null;
    }
    const $ = cheerio.load(html, {
        xmlMode: true
    });
    const links = [];
    $('a').each((index, element) => {
        const href = $(element).attr('href');
        if (href && href.indexOf('contact') !== -1) {
            links.push(href);
        }
    });
    return { html: html, text: $('body').text(), links };
});
exports.body = body;
/**
 * Placeholder with an empty body: `url` is ignored and the returned promise
 * always resolves to undefined.
 * TODO: implement or remove — nothing in the visible code calls it.
 */
const contactUrl = (url) => __awaiter(void 0, void 0, void 0, function* () {
});
exports.contactUrl = contactUrl;
//# sourceMappingURL=html.js.map