"use strict"; var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } return new (P || (P = Promise))(function (resolve, reject) { function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } step((generator = generator.apply(thisArg, _arguments || [])).next()); }); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.contactUrl = exports.body = exports.Scope = exports.getScope = exports.parse = exports.isValidUrl = exports.meta = exports.extractEmail = exports.scope = exports.TRACE_SUFFIX = exports.SESSION_EVENTS_SUFFIX = exports.STATS_SUFFIX = void 0; exports.capture_responses = capture_responses; const https = require('https'); const axios_1 = require("axios"); const node_html_parser_1 = require("node-html-parser"); const cheerio = require("cheerio"); const path = require("path"); const url_1 = require("url"); const puppeteer = require("puppeteer"); const index_1 = require("../index"); const html_1 = require("./html"); exports.STATS_SUFFIX = '_stats.json'; exports.SESSION_EVENTS_SUFFIX = '_session.json'; exports.TRACE_SUFFIX = '_trace.json'; const included_categories = ['devtools.timeline']; const _url_short = (url) => new url_1.URL(url).hostname; let instance; const debugRequests = true; const debugResponses = false; process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0'; const extractEmail = (input) => { // Regular expression to match a typical email format const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/; // Use the regex to search for an email in the input string const match = input.match(emailRegex); // Return the matched email, or null if none is found return match ? match[0] : null; }; exports.extractEmail = extractEmail; const meta = (loc, options) => __awaiter(void 0, void 0, void 0, function* () { if (!loc.website) { index_1.logger.warn(`No website to retrieve meta data : ${loc.title}`); return; } if (loc.meta) { return; } try { const _meta = (yield (0, html_1.parse)(loc.website, null, options)) || {}; loc.meta = _meta; loc.instagram = _meta.instagram; loc.facebook = _meta.facebook; loc.youtube = _meta.youtube; loc.linkedin = _meta.linkedin; loc.twitter = _meta.twitter; loc.email = (_meta.allLinks || []).map((l) => (0, exports.extractEmail)(l)).filter((e) => e !== null)[0]; } catch (error) { index_1.logger.error('Error retrieving meta data : ' + loc.website, error.message); if (error.status) loc.rejected = error.status === 404 || error.status === 403 || error.status === 999 || error.status === 503; } }); exports.meta = meta; const isValidUrl = (url) => { try { new url_1.URL(url); return true; } catch (error) { return false; } }; exports.isValidUrl = isValidUrl; const readMetaTags = (el, name) => { var prop = el.getAttribute('name') || el.getAttribute('property'); return prop == name ? el.getAttribute('content') : null; }; const parse = (url, config, options) => __awaiter(void 0, void 0, void 0, function* () { if (!/(^http(s?):\/\/[^\s$.?#].[^\s]*)/i.test(url)) return {}; const { data } = yield (0, axios_1.default)(url, Object.assign(Object.assign({}, config), { httpsAgent: new https.Agent({ rejectUnauthorized: false }), headers: { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36' }, timeout: 10000 })); const $ = (0, node_html_parser_1.parse)(data); const og = {}; const meta = {}; const images = []; const links = []; let allLinks = []; const title = $.querySelector('title'); if (title) meta.title = title.text; const canonical = $.querySelector('link[rel=canonical]'); if (canonical) { meta.url = canonical.getAttribute('href'); } const metas = $.querySelectorAll('meta'); for (let i = 0; i < metas.length; i++) { const el = metas[i]; // const prop = el.getAttribute('property') || el.getAttribute('name'); ['title', 'description', 'image'].forEach(s => { const val = readMetaTags(el, s); if (val) meta[s] = val; }); ['og:title', 'og:description', 'og:image', 'og:url', 'og:site_name', 'og:type'].forEach(s => { const val = readMetaTags(el, s); if (val) og[s.split(':')[1]] = val; }); } $.querySelectorAll('img').forEach(el => { let src = el.getAttribute('src'); if (src) { src = new url_1.URL(src, url).href; images.push({ src }); } }); const _$ = cheerio.load(data); // Array to store JSON-LD data const jsonLdArray = []; // Select all