// File viewer header (not part of the program): 350 lines · 29 KiB · JavaScript
import https from 'https';
|
|
import axios from "axios";
|
|
import * as cheerio from "cheerio";
|
|
import * as path from 'path';
|
|
import { URL } from 'url';
|
|
import puppeteerExtra from 'puppeteer-extra';
|
|
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
|
|
const puppeteerExtraAny = puppeteerExtra;
|
|
puppeteerExtraAny.use(StealthPlugin());
|
|
import { logger } from '../index.js';
|
|
// Suffix strings exported for other modules; nothing in this file reads them.
export const STATS_SUFFIX = '_stats.json';
export const SESSION_EVENTS_SUFFIX = '_session.json';
export const TRACE_SUFFIX = '_trace.json';

// Shared Scope instance; body() assigns it lazily on first use.
// Declared with `let` instead of `var`: it must stay reassignable, but needs no function-scoped hoisting.
export let scope;

// Not referenced anywhere in this file — presumably Chrome tracing categories; confirm before removing.
const included_categories = ['devtools.timeline'];

// Returns the hostname of an absolute URL (throws if `new URL` rejects the input). Unused in this file.
const _url_short = (url) => new URL(url).hostname;

// Singleton holder managed by getScope().
let instance;

// Not read anywhere in this file.
const debugRequests = true;

// When true, capture_responses() logs non-200 responses and handler errors.
const debugResponses = false;

// SECURITY NOTE(review): this disables TLS certificate verification for every
// HTTPS request made by the whole Node process, not only the requests in this
// module. Kept because it is an existing module-level side effect callers may
// rely on; consider scoping it to the individual requests instead.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
|
|
/**
 * Extracts the first e-mail address found in a piece of text.
 *
 * @param {string} input - Text to scan.
 * @returns {string|null} The first matching address, or null when there is
 *   none or when `input` is not a string.
 */
export const extractEmail = (input) => {
  // Non-string input (null, undefined, numbers, ...) has no `.match`; treat it
  // as "no e-mail found" instead of throwing a TypeError.
  if (typeof input !== 'string') {
    return null;
  }
  const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;
  const match = input.match(emailRegex);
  return match ? match[0] : null;
};
|
|
// Social sources copied from the parsed metadata onto the location object,
// each under a property of the same name.
const META_SOCIAL_SOURCES = ['instagram', 'facebook', 'youtube', 'linkedin', 'twitter'];

// HTTP status codes that mark a location's website as rejected.
const META_REJECT_STATUSES = [404, 403, 999, 503];

/**
 * Fetches and attaches website metadata to a location record.
 *
 * Mutates `loc`: sets `loc.meta` and, when social links were found, the
 * `instagram` / `facebook` / `youtube` / `linkedin` / `twitter` properties
 * (undefined when no link for that source exists). Does nothing when the
 * location has no website or already carries a truthy `loc.meta`.
 *
 * @param {object} loc - Location record; reads `website`, `title`, `meta`.
 * @param {object} [options] - Passed through to parseHtml().
 * @returns {Promise<object|undefined>} The parsed metadata, or undefined when
 *   skipped or when retrieval threw (the error is logged, not rethrown).
 */
export const meta = async (loc, options) => {
  if (!loc.website) {
    logger.warn(`No website to retrieve meta data : ${loc.title}`);
    return;
  }
  if (loc.meta) {
    return;
  }
  try {
    const _meta = (await parseHtml(loc.website, null, options)) || {};
    loc.meta = _meta;
    if (_meta.social) {
      for (const source of META_SOCIAL_SOURCES) {
        loc[source] = _meta.social.find((p) => p.source === source)?.url;
      }
    }
    return _meta;
  } catch (error) {
    logger.error('Error retrieving meta data : ' + loc.website, error.message);
    // Axios reports the HTTP status on `error.response.status`; also accept a
    // status placed directly on the error object.
    const status = error.status ?? error.response?.status;
    if (status) {
      loc.rejected = META_REJECT_STATUSES.includes(status);
    }
  }
};
|
|
/**
 * Reports whether a value can be parsed by the `URL` constructor on its own
 * (i.e. without a base URL).
 *
 * @param {string} url - Candidate URL.
 * @returns {boolean} true when `new URL(url)` succeeds, false when it throws.
 */
export const isValidUrl = (url) => {
  let parsed;
  try {
    parsed = new URL(url);
  } catch (parseError) {
    parsed = undefined;
  }
  return parsed !== undefined;
};
|
|
/**
 * Reads the `content` attribute of a <meta> tag, looking it up first by
 * `name` and then by `property`.
 *
 * @param {Function} $ - Selector function (a loaded cheerio document).
 * @param {string} name - Value of the `name` / `property` attribute to match.
 * @returns {string|null} The first truthy content value, or null.
 */
const readMetaTags = ($, name) => {
  const byName = $(`meta[name="${name}"]`).attr('content');
  if (byName) {
    return byName;
  }
  const byProperty = $(`meta[property="${name}"]`).attr('content');
  return byProperty || null;
};
|
|
// Desktop Chrome user-agent sent by both fetch strategies used by parseHtml().
const PARSE_HTML_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

// [substring, source label] pairs used to classify social links; first match wins.
const PARSE_HTML_SOCIAL_SITES = [
  ['instagram.com', 'instagram'],
  ['facebook.com', 'facebook'],
  ['linkedin.com', 'linkedin'],
  ['youtube.com', 'youtube'],
  ['twitter.com', 'twitter'],
];

// Loads `url` in headless Chrome and returns the rendered HTML, or '' on any
// failure. The browser is closed in `finally` so a failed navigation cannot
// leave a Chrome process behind.
const renderWithBrowser = async (url, options) => {
  let browser;
  try {
    browser = await puppeteerExtraAny.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });
    const page = await browser.newPage();
    await page.setUserAgent(PARSE_HTML_USER_AGENT);
    await page.goto(url, { waitUntil: 'networkidle2', timeout: options.timeout || 30000 });
    return await page.content();
  } catch (e) {
    logger.error(`Puppeteer failed for ${url}: ${e.message}`);
    return '';
  } finally {
    if (browser) {
      try {
        await browser.close();
      } catch (e) {
        logger.error(`Puppeteer failed for ${url}: ${e.message}`);
      }
    }
  }
};

// Downloads `url` with axios and returns the response body. Caller-supplied
// `config` is applied first; agent, headers and timeout below override it.
const downloadWithAxios = async (url, config) => {
  const { data } = await axios(url, {
    ...config,
    httpsAgent: new https.Agent({ rejectUnauthorized: false }),
    headers: {
      'User-Agent': PARSE_HTML_USER_AGENT
    },
    timeout: 10000
  });
  return data;
};

// Splits absolute links into social profiles, same-site pages and external links.
const classifyLinks = (links, url) => {
  const social = [];
  const pages = [];
  const external = [];

  // Resolved once instead of per link; null means the page URL itself is unparseable.
  let baseHost = null;
  try {
    baseHost = new URL(url).hostname;
  } catch (e) {
    baseHost = null;
  }

  for (const link of links) {
    // A mailto: address at a social domain (e.g. mailto:x@facebook.com) is not a profile link.
    if (link.startsWith('mailto:')) {
      continue;
    }
    const site = PARSE_HTML_SOCIAL_SITES.find(([needle]) => link.includes(needle));
    if (site) {
      social.push({ url: link, source: site[1], status: 'PENDING' });
      continue;
    }
    if (link.includes('mailto:')) {
      continue;
    }
    let isInternal = false;
    if (baseHost !== null) {
      try {
        const linkHost = new URL(link).hostname;
        isInternal = linkHost === baseHost || linkHost.endsWith('.' + baseHost);
      } catch (e) {
        isInternal = false;
      }
    }
    if (isInternal) {
      pages.push({ url: link, source: 'site', status: 'PENDING' });
    } else {
      external.push({ url: link, source: 'external', status: 'PENDING' });
    }
  }
  return { social, pages, external };
};

/**
 * Fetches a web page and extracts metadata, images and links from its HTML.
 *
 * When `options.headless` is truthy the page is first rendered with Puppeteer;
 * if that yields no content (or headless is off) the page is downloaded with
 * axios instead. Relative `src` / `href` values are resolved against `url`.
 * NOTE(review): the final URL after redirects is not used as the base.
 *
 * @param {string} url - Page address; must start with http:// or https://.
 * @param {object|null} [config] - Extra axios request config (agent, headers
 *   and timeout are always overridden).
 * @param {object} [options] - `headless` enables Puppeteer rendering,
 *   `timeout` is its navigation timeout in ms (default 30000).
 * @returns {Promise<object>} `{}` when the URL is rejected or the download
 *   fails; otherwise `{ title, description, image, url, social, seo: {
 *   keywords, structured, og, metaTags }, pages, externalLinks, images }`.
 */
export const parseHtml = async (url, config, options) => {
  if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url)) {
    return {};
  }

  let content = '';
  if (options && options.headless) {
    content = await renderWithBrowser(url, options);
  }
  if (!content) {
    try {
      content = await downloadWithAxios(url, config);
    } catch (e) {
      logger.error(`Axios failed for ${url}: ${e.message}`);
      return {};
    }
  }

  const $ = cheerio.load(content);

  // Plain <title>, canonical link and name/property meta tags.
  const meta = {};
  const title = $('title').text();
  if (title) {
    meta.title = title;
  }
  const canonical = $('link[rel=canonical]').attr('href');
  if (canonical) {
    meta.url = canonical;
  }
  for (const key of ['title', 'description', 'image']) {
    const val = readMetaTags($, key);
    if (val) {
      meta[key] = val;
    }
  }

  // Open Graph tags, stored without the "og:" prefix.
  const og = {};
  for (const key of ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type']) {
    const val = readMetaTags($, key);
    if (val) {
      og[key.split(':')[1]] = val;
    }
  }

  const images = [];
  $('img').each((_, el) => {
    const src = $(el).attr('src');
    if (!src) {
      return;
    }
    try {
      images.push({ src: new URL(src, url).href });
    } catch (e) {
      // Unresolvable image URL: skip it.
    }
  });

  const jsonLdArray = [];
  $('script[type="application/ld+json"]').each((_, element) => {
    const jsonLdContent = $(element).html();
    if (!jsonLdContent) {
      return;
    }
    try {
      jsonLdArray.push(JSON.parse(jsonLdContent));
    } catch (e) {
      logger.error(`Error parsing JSON-LD: ${e.message} @ ${url}`);
    }
  });

  // A Set de-duplicates while keeping first-seen order.
  const uniqueLinks = new Set();
  $('a').each((_, element) => {
    const href = $(element).attr('href');
    if (!href) {
      return;
    }
    try {
      uniqueLinks.add(new URL(href, url).href);
    } catch (e) {
      // Unresolvable link: skip it.
    }
  });

  const { social, pages, external } = classifyLinks(uniqueLinks, url);

  const rawKeywords = $('meta[property="og:keywords"]').attr("content") ||
    $('meta[name="keywords"]').attr("content") || "";

  return {
    title: meta.title || og.title,
    description: meta.description || og.description,
    image: meta.image || og.image,
    url: meta.url || og.url || url,
    social,
    seo: {
      keywords: rawKeywords.split(',').map(s => s.trim()).filter(s => s),
      structured: jsonLdArray,
      og,
      metaTags: meta
    },
    pages,
    externalLinks: external,
    images
  };
};
|
|
/**
 * Returns the module-wide Scope singleton, creating it on first call.
 *
 * @param {object} cliArgs - Stored on the new instance as `args`. Ignored on
 *   every call after the first, because the existing instance is returned as-is.
 * @returns {Scope} The shared Scope instance.
 */
export const getScope = (cliArgs) => {
  if (instance) {
    return instance;
  }
  instance = new Scope();
  instance.args = cliArgs;
  return instance;
};
|
|
/**
 * Resets `scope.responses` and forwards every HTTP 200 response seen on the
 * page to `scope.onResponse(response, scope)` when that hook is a function.
 *
 * Non-200 responses and errors thrown by the hook are never propagated; they
 * are logged only when the module-level `debugResponses` flag is on.
 *
 * @param {object} scope - Receives a fresh `responses` array; `onResponse` is
 *   read at the time each response arrives.
 * @param {object} page - Event emitter exposing `on('response', handler)`
 *   (a Puppeteer page).
 * @returns {Promise<void>}
 */
export async function capture_responses(scope, page) {
  scope.responses = [];
  page.on('response', (response) => {
    try {
      if (response.status() !== 200) {
        if (debugResponses) {
          logger.error(`Error loading ${response.url()} : ${response.status()}`);
        }
        return;
      }
      if (typeof scope.onResponse === 'function') {
        scope.onResponse(response, scope);
      }
    } catch (e) {
      // Best effort: a failing hook must not break the page's event stream.
      if (debugResponses) {
        logger.error('Error parsing response', e);
      }
    }
  });
}
|
|
/**
 * Holds one Puppeteer browser/page pair plus the per-session collections and
 * hooks used by the functions in this module.
 */
export class Scope {
  browser;
  context;
  page;
  // Launch options spread into puppeteer.launch() by init(); set by getScope().
  args;
  requests = [];
  responses = [];
  eventBeacons = [];
  mutationBeacons = [];
  sessionSuffix = '';
  // Optional hook invoked by capture_responses() for each HTTP 200 response.
  onResponse;
  onRequest;

  /**
   * Launches the browser, opens a page and attaches error logging to it.
   * Also stamps `sessionSuffix` with the current time in milliseconds.
   *
   * @returns {Promise<void>} Rejects if the browser or page cannot be created.
   */
  async init() {
    this.sessionSuffix = ` - ${Date.now()}`;
    const args = [
      '--no-sandbox',
      '--disable-setuid-sandbox',
      '--disable-infobars',
      '--window-position=0,0',
      // Spelling fixed: the flags were written "certifcate", which Chrome does not recognise.
      '--ignore-certificate-errors',
      // NOTE(review): this switch normally takes a list of SPKI hashes as its value — confirm it is still wanted.
      '--ignore-certificate-errors-spki-list',
      // No quotes around the value: arguments are passed without a shell, so
      // quotes would become part of the user-agent string itself.
      '--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3312.0 Safari/537.36',
      `--user-data-dir=${path.resolve('../chrome')}`
    ];
    this.browser = await puppeteerExtraAny.launch({
      ...this.args,
      // Always wins over any `args` key present in this.args.
      args
    });
    this.page = await this.browser.newPage();
    this.page.on('error', msg => logger.error('Browser Error:', msg));
    this.page.on('pageerror', msg => logger.error('Browser Page Error:', msg));
    this.page.on('requestfailed', msg => logger.error('Browser Page Request Error:', msg));
  }
}
|
|
/**
 * Loads a page in the shared browser scope and returns its HTML, body text
 * and contact links.
 *
 * On first use the shared `scope` is created via getScope() and initialised;
 * page-level error/console listeners are attached once at that point.
 *
 * @param {string} url - Address to navigate to.
 * @returns {Promise<{html: string, text: string, links: string[]}|null|undefined>}
 *   The page data; null when navigation or reading the content fails (the
 *   promise now always settles instead of hanging on a failed navigation);
 *   undefined when the browser scope could not be initialised.
 */
export const body = async (url) => {
  const options = {
    headless: false,
    url
  };
  const timeout = 8000;

  if (!scope) {
    try {
      scope = await getScope(options);
      await scope.init();
    } catch (e) {
      logger.error("Invalid scope - abort", e);
      // Forget the half-initialised scope so the next call retries init()
      // instead of dereferencing a missing page.
      scope = undefined;
      return;
    }
    scope.page.on("pageerror", (err) => {
      logger.error('page-error!', err);
    });
    scope.page.on("error", (err) => {
      logger.error('brower-error!', err);
    });
    scope.page.on('console', (msg) => {
      // Prefer the public ConsoleMessage.type(); fall back to the private
      // field this code previously read.
      const type = typeof msg.type === 'function' ? msg.type() : msg._type;
      if (type === 'error') {
        logger.error('Browser error:', msg);
        scope.page.isError = true;
      }
    });
  }

  try {
    await scope.page.goto(url, {
      timeout,
      waitUntil: 'networkidle0'
    });
  } catch (e) {
    logger.error('error loading page', e);
    return null;
  }

  let html;
  try {
    html = await scope.page.content();
  } catch (e) {
    logger.error('error user page ', url);
    return null;
  }
  if (!html) {
    logger.error('error user page ', url);
    return null;
  }

  const $ = cheerio.load(html, {
    xmlMode: true
  });
  // Raw href values (not resolved to absolute URLs) that mention "contact".
  const links = [];
  $('a').each((index, element) => {
    const href = $(element).attr('href');
    if (href && href.includes('contact')) {
      links.push(href);
    }
  });
  return { html, text: $('body').text(), links };
};
|
|
//# sourceMappingURL=data:application/json;base64,
|