// 335 lines · 13 KiB · JavaScript
"use strict";
|
|
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
|
|
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
|
|
return new (P || (P = Promise))(function (resolve, reject) {
|
|
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
|
|
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
|
|
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
|
|
step((generator = generator.apply(thisArg, _arguments || [])).next());
|
|
});
|
|
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.contactUrl = exports.body = exports.Scope = exports.getScope = exports.parse = exports.isValidUrl = exports.meta = exports.extractEmail = exports.scope = exports.TRACE_SUFFIX = exports.SESSION_EVENTS_SUFFIX = exports.STATS_SUFFIX = void 0;
|
|
exports.capture_responses = capture_responses;
|
|
const https = require('https');
|
|
const axios_1 = require("axios");
|
|
const node_html_parser_1 = require("node-html-parser");
|
|
const cheerio = require("cheerio");
|
|
const path = require("path");
|
|
const url_1 = require("url");
|
|
const puppeteer = require("puppeteer");
|
|
const index_1 = require("../index");
|
|
const html_1 = require("./html");
|
|
// Exported file-name suffix constants; nothing else in this file reads them.
Object.assign(exports, {
    STATS_SUFFIX: '_stats.json',
    SESSION_EVENTS_SUFFIX: '_session.json',
    TRACE_SUFFIX: '_trace.json'
});

// Not referenced anywhere else in this file.
const included_categories = ['devtools.timeline'];

// Returns the hostname part of an absolute URL; throws if `url` cannot be parsed.
const _url_short = (url) => {
    const parsed = new url_1.URL(url);
    return parsed.hostname;
};

// Lazily created Scope singleton, populated by getScope().
let instance = undefined;

// Debug switches. Only `debugResponses` is read by live code (capture_responses);
// `debugRequests` is referenced solely from commented-out code.
const debugRequests = true;
const debugResponses = false;

// NOTE(review): this disables TLS certificate verification for every HTTPS request
// made by the whole Node process, not only this module — confirm this is intended.
process.env['NODE_TLS_REJECT_UNAUTHORIZED'] = '0';
|
|
/**
 * Finds the first e-mail-shaped substring in `input`.
 *
 * @param input Text to scan, e.g. an `href` value such as `mailto:someone@example.com`.
 * @returns The first matching address, or `null` when nothing matches or when
 *          `input` is not a string.
 */
const extractEmail = (input) => {
    // Callers map this over scraped link lists; a null/undefined/non-string entry
    // previously crashed on `.match`, so treat it as "no e-mail found".
    if (typeof input !== 'string') {
        return null;
    }
    // Typical address shape: local part, "@", domain labels, and a TLD of 2+ letters.
    const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
    const match = emailRegex.exec(input);
    return match ? match[0] : null;
};
|
|
exports.extractEmail = extractEmail;
|
|
/**
 * Fetches the page at `loc.website` and copies the extracted metadata onto `loc`.
 *
 * On success it sets `loc.meta` plus the `instagram`, `facebook`, `youtube`,
 * `linkedin`, `twitter` and `email` shortcuts. On failure it logs the error and,
 * when an HTTP status is available, sets `loc.rejected` to whether that status is
 * one of 403, 404, 503 or 999.
 *
 * @param loc     Record to enrich; must carry `website`. Skipped when `loc.meta` is already set.
 * @param options Passed through to the page parser.
 * @returns Promise resolving to `undefined`; errors are logged, not rethrown.
 */
const meta = (loc, options) => __awaiter(void 0, void 0, void 0, function* () {
    if (!loc.website) {
        index_1.logger.warn(`No website to retrieve meta data : ${loc.title}`);
        return;
    }
    if (loc.meta) {
        // Already enriched; do not fetch again.
        return;
    }
    try {
        const _meta = (yield (0, html_1.parse)(loc.website, null, options)) || {};
        loc.meta = _meta;
        loc.instagram = _meta.instagram;
        loc.facebook = _meta.facebook;
        loc.youtube = _meta.youtube;
        loc.linkedin = _meta.linkedin;
        loc.twitter = _meta.twitter;
        // First link that contains an e-mail address (undefined when there is none).
        loc.email = (_meta.allLinks || []).map((l) => (0, exports.extractEmail)(l)).filter((e) => e !== null)[0];
    }
    catch (error) {
        index_1.logger.error('Error retrieving meta data : ' + loc.website, error && error.message);
        // The status may sit on the error itself or on the attached HTTP response,
        // depending on the HTTP client version; check both.
        const status = (error && error.status) || (error && error.response && error.response.status);
        if (status)
            loc.rejected = [404, 403, 999, 503].includes(status);
    }
});
|
|
exports.meta = meta;
|
|
/**
 * Reports whether `url` can be parsed by the WHATWG `URL` constructor without a base,
 * i.e. whether it is an absolute URL.
 *
 * @param url Candidate value.
 * @returns `true` when parsing succeeds, `false` when the constructor throws.
 */
const isValidUrl = (url) => {
    let parsed = null;
    try {
        parsed = new url_1.URL(url);
    }
    catch (_err) {
        parsed = null;
    }
    return parsed !== null;
};
|
|
exports.isValidUrl = isValidUrl;
|
|
const readMetaTags = (el, name) => {
|
|
var prop = el.getAttribute('name') || el.getAttribute('property');
|
|
return prop == name ? el.getAttribute('content') : null;
|
|
};
|
|
/**
 * Downloads `url` and extracts metadata from the returned HTML.
 *
 * @param url     Absolute http(s) URL. Anything else resolves to `{}` without a request.
 * @param config  Extra request configuration merged into the HTTP call; the
 *                `httpsAgent`, `headers` and `timeout` set here take precedence.
 * @param options Accepted for interface compatibility; not used by this function.
 * @returns Promise of an object with:
 *          `meta` (title/url/description/image), `og` (Open Graph values keyed
 *          without the `og:` prefix), `images` (`{ src }` with absolute URLs),
 *          `keywords`, `links` (absolute hrefs containing "contact"), `allLinks`
 *          (de-duplicated absolute hrefs), the first `instagram` / `facebook` /
 *          `linkedin` / `youtube` / `twitter` link, and `structured` (parsed JSON-LD).
 *          Rejects when the HTTP request fails.
 */
const parse = (url, config, options) => __awaiter(void 0, void 0, void 0, function* () {
    if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url))
        return {};
    const { data } = yield (0, axios_1.default)(url, Object.assign(Object.assign({}, config), {
        // NOTE(review): certificate validation is switched off for this request.
        httpsAgent: new https.Agent({
            rejectUnauthorized: false
        }),
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
        },
        timeout: 10000
    }));
    const $ = (0, node_html_parser_1.parse)(data);
    const og = {};
    const meta = {};
    const images = [];
    const links = [];
    let allLinks = [];

    // <title> and canonical link.
    const title = $.querySelector('title');
    if (title)
        meta.title = title.text;
    const canonical = $.querySelector('link[rel=canonical]');
    if (canonical) {
        meta.url = canonical.getAttribute('href');
    }

    // Plain and Open Graph <meta> tags; a later tag overrides an earlier one.
    const metas = $.querySelectorAll('meta');
    for (let i = 0; i < metas.length; i++) {
        const el = metas[i];
        ['title', 'description', 'image'].forEach(s => {
            const val = readMetaTags(el, s);
            if (val)
                meta[s] = val;
        });
        ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type'].forEach(s => {
            const val = readMetaTags(el, s);
            if (val)
                og[s.split(':')[1]] = val;
        });
    }

    // Images, resolved against the page URL.
    $.querySelectorAll('img').forEach(el => {
        const src = el.getAttribute('src');
        if (!src) {
            return;
        }
        try {
            images.push({ src: new url_1.URL(src, url).href });
        }
        catch (e) {
            // A single malformed src must not abort the whole page parse.
            index_1.logger.warn(`Skipping invalid image src "${src}" @ ${url}`);
        }
    });

    const _$ = cheerio.load(data);

    // JSON-LD blocks: every <script type="application/ld+json"> that parses.
    const jsonLdArray = [];
    _$('script[type="application/ld+json"]').each((_, element) => {
        const jsonLdContent = _$(element).html();
        if (jsonLdContent) {
            try {
                const jsonData = JSON.parse(jsonLdContent);
                jsonLdArray.push(jsonData);
            }
            catch (e) {
                index_1.logger.error(`Error parsing JSON-LD: ${e.message} @ ${url}`);
            }
        }
    });

    // Anchors: keep absolute URLs only; "contact" links are tracked separately.
    _$('a').each((index, element) => {
        const href = _$(element).attr('href');
        if (href && (0, exports.isValidUrl)(href) && href.indexOf('contact') !== -1 && !links.includes(href)) {
            links.push(href);
        }
        (0, exports.isValidUrl)(href) && allLinks.push(href);
    });
    allLinks = [...new Set(allLinks)];

    // First link pointing at each social network, if any.
    const firstLinkWith = (needle) => allLinks.find(link => link.includes(needle));
    const instagram = firstLinkWith('instagram.com');
    const facebook = firstLinkWith('facebook.com');
    const linkedin = firstLinkWith('linkedin.com');
    const youtube = firstLinkWith('youtube.com');
    const twitter = firstLinkWith('twitter.com');

    const ret = {
        meta,
        og,
        images,
        // A string when a keywords tag exists, otherwise an empty array.
        keywords: _$('meta[property="og:keywords"]').attr("content") ||
            _$('meta[name="keywords"]').attr("content") || [],
        links,
        allLinks,
        instagram,
        facebook,
        linkedin,
        youtube,
        twitter,
        structured: jsonLdArray
    };
    return ret;
});
|
|
exports.parse = parse;
|
|
/**
 * Returns the module-wide Scope singleton, creating it on first use.
 *
 * @param cliArgs Stored on the new instance as `args`; ignored once the singleton exists.
 * @returns The shared Scope instance.
 */
const getScope = (cliArgs) => {
    if (instance) {
        return instance;
    }
    const created = new Scope();
    created.args = cliArgs;
    instance = created;
    return instance;
};
|
|
exports.getScope = getScope;
|
|
/*
|
|
export async function capture_request(where: any[], request: Request) {
|
|
debugRequests && logger.debug('Request', { url: request.url(), data: request.postData() });
|
|
where.push({ url: request.url(), data: await request.postData(), request: request });
|
|
debugRequests && logger.debug('requests', where.map(r => r.url));
|
|
}
|
|
export async function capture_response(where: any[], response: Response) {
|
|
debugResponses && logger.debug('Response', { url: response.url(), data: await response.json() });
|
|
where.push(response);
|
|
}
|
|
*/
|
|
/**
 * Resets `scope.responses` and subscribes to the page's 'response' event.
 *
 * For each response with status 200, `scope.onResponse(response, scope)` is called
 * when that hook is defined. Other statuses, and any exception thrown while
 * handling a response, are logged only when `debugResponses` is enabled.
 *
 * @param scope Object receiving `responses` and optionally providing `onResponse`.
 * @param page  Event emitter exposing `on('response', handler)`.
 * @returns Promise resolving to `undefined` once the listener is registered.
 */
function capture_responses(scope, page) {
    return __awaiter(this, void 0, void 0, function* () {
        // Request interception (page.setRequestInterception) is intentionally not enabled here.
        scope.responses = [];
        const handleResponse = (response) => {
            try {
                const contentType = response.headers()['content-type'] || '';
                const isJson = contentType.startsWith('application/json;') === true;
                const url = response.url();
                if (response.status() !== 200) {
                    debugResponses && index_1.logger.error(`Error loading ${url} : ${response.status()}`);
                    return;
                }
                if (isJson) {
                    // JSON capture is currently disabled (capture_response is commented out).
                }
                if (scope.onResponse) {
                    scope.onResponse(response, scope);
                }
            }
            catch (e) {
                debugResponses && index_1.logger.error('Error parsing response');
            }
        };
        page.on('response', handleResponse);
    });
}
|
|
class Scope {
|
|
constructor() {
|
|
this.requests = [];
|
|
this.responses = [];
|
|
this.eventBeacons = [];
|
|
this.mutationBeacons = [];
|
|
this.sessionSuffix = '';
|
|
}
|
|
init() {
|
|
return __awaiter(this, void 0, void 0, function* () {
|
|
this.sessionSuffix = ' - ' + new Date().getTime();
|
|
const args = [
|
|
'--no-sandbox',
|
|
'--disable-setuid-sandbox',
|
|
'--disable-infobars',
|
|
'--window-position=0,0',
|
|
'--ignore-certifcate-errors',
|
|
'--ignore-certifcate-errors-spki-list',
|
|
'--user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36"',
|
|
`--user-data-dir=${path.resolve('../chrome')}`
|
|
];
|
|
this.browser = yield puppeteer.launch(Object.assign(Object.assign({}, this.args), { args: args }));
|
|
// const context = await this.browser.createIncognitoBrowserContext();
|
|
this.page = yield this.browser.newPage();
|
|
// this.page = await context.newPage();
|
|
this.page.on('console', msg => {
|
|
// error('Browser error:', msg);
|
|
});
|
|
this.page.on('error', msg => index_1.logger.error('Browser Error:', msg));
|
|
this.page.on('pageerror', msg => index_1.logger.error('Browser Page Error:', msg));
|
|
this.page.on('requestfailed', msg => index_1.logger.error('Browser Page Request Error:', msg));
|
|
//capture_requests(this, this.page);
|
|
//capture_responses(this, this.page);
|
|
// this.args.disableRequests !== 'true' && capture_requests(this, this.page);
|
|
// this.args.disableResponses !== 'true' && capture_requests(this, this.page);
|
|
// capture_responses(this, this.page);
|
|
const page2 = this.page;
|
|
//page2.setCacheEnabled(false);
|
|
/**
|
|
await page2._client.on('Security.certificateError', (event: any) => {
|
|
page2._client.send('Security.handleCertificateError', {
|
|
eventId: event.eventId,
|
|
action: 'continue' // ignore error and continue request
|
|
})
|
|
})
|
|
*/
|
|
});
|
|
}
|
|
}
|
|
exports.Scope = Scope;
|
|
/**
 * Loads `url` in the shared browser page and returns its rendered content.
 *
 * The browser scope is created and initialised on first use. If initialisation
 * fails, the cached scope is cleared so that a later call can retry.
 *
 * @param url Page to open.
 * @returns Promise of `{ html, text, links }`, where `links` are the hrefs
 *          containing "contact". Resolves to `null` when the page cannot be loaded
 *          or read, and to `undefined` when the browser scope could not be initialised.
 */
const body = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const options = {
        headless: false,
        url: url
    };
    const timeout = 8000;
    if (!exports.scope) {
        try {
            exports.scope = yield (0, exports.getScope)(options);
            yield exports.scope.init();
        }
        catch (e) {
            index_1.logger.error("Invalid scope - abort", e);
            // Do not keep a half-initialised scope around: the next call would skip
            // init() and fail on the missing page.
            exports.scope = undefined;
            return;
        }
        exports.scope.page.on("pageerror", function (err) {
            index_1.logger.error('page-error!', err);
        });
        exports.scope.page.on("error", function (err) {
            index_1.logger.error('brower-error!', err);
        });
        exports.scope.page.on('console', msg => {
            // Prefer the public accessor; fall back to the private field used previously.
            const kind = typeof msg.type === 'function' ? msg.type() : msg._type;
            if (kind === 'error') {
                index_1.logger.error('Browser error:', msg);
                exports.scope.page.isError = true;
            }
        });
    }
    const page = exports.scope.page;
    try {
        yield page.goto(url, {
            timeout: timeout,
            waitUntil: 'networkidle0'
        });
    }
    catch (e) {
        // Previously the returned promise never settled on a navigation failure.
        index_1.logger.error('error loading page', e);
        return null;
    }
    try {
        const c = yield page.content();
        if (!c) {
            index_1.logger.error('error user page ', url);
            return null;
        }
        const $ = cheerio.load(c, {
            xmlMode: true
        });
        const links = [];
        $('a').each((index, element) => {
            const href = $(element).attr('href');
            if (href && href.indexOf('contact') !== -1) {
                links.push(href);
            }
        });
        return { html: c, text: $('body').text(), links };
    }
    catch (e) {
        // Reading or parsing the page failed; report it instead of hanging the caller.
        index_1.logger.error('error user page ', url, e);
        return null;
    }
});
|
|
exports.body = body;
|
|
/**
 * Placeholder: no logic is implemented yet.
 *
 * @param url Currently unused.
 * @returns Promise resolving to `undefined`.
 */
const contactUrl = (url) => __awaiter(void 0, void 0, void 0, function* () {
    return undefined;
});
|
|
exports.contactUrl = contactUrl;
|
|
//# sourceMappingURL=html.js.map
|