mono/packages/search/lib/email.js
2025-03-11 11:28:14 +01:00

134 lines
5.5 KiB
JavaScript

"use strict";
// Compiler-emitted helper (looks like TypeScript's downlevel async/await shim) that
// drives a generator function to completion and exposes the outcome as a promise.
// An already-defined `this.__awaiter` is reused when present.
//   thisArg    - `this` value the generator function is applied with
//   _arguments - argument list for the generator function (defaults to [])
//   P          - promise constructor to use; falls back to the global Promise
//   generator  - generator function; each yielded value is awaited before resuming
// Returns a promise that resolves with the generator's return value and rejects with
// any error thrown while stepping the generator.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
// Wrap a yielded value in P unless it already is an instance of P.
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
// Resume the generator with the settled value of the awaited promise.
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
// Throw the rejection reason into the generator so its own try/catch can handle it.
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
// Finish when the generator is done, otherwise wait for the yielded value and continue.
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
// Instantiate the generator and run it up to its first yield.
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.findEMail = exports.puppeteerLoader = exports.cheerioLoader = exports.HtmlToTextTransformer = void 0;
const index_1 = require("../index");
const cheerio_1 = require("langchain/document_loaders/web/cheerio");
const text_splitter_1 = require("langchain/text_splitter");
const html_to_text_1 = require("html-to-text");
const documents_1 = require("@langchain/core/documents");
const html_1 = require("./html");
// Matches e-mail-like tokens anywhere in a string. The `g` flag makes
// String.prototype.match return every occurrence instead of only the first.
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Matches a string that is, in its entirety, a `mailto:` link (case-insensitive);
// capture group 1 holds the address part.
const mailtoRegex = /^mailto:([^\s@]+@[^\s@]+\.[^\s@]+)$/i;
const pupeteer_1 = require("./pupeteer");
/**
 * Document transformer that converts each document's HTML `pageContent` to
 * plain text with `html-to-text`, carrying the metadata over as a shallow copy.
 */
class HtmlToTextTransformer extends documents_1.MappingDocumentTransformer {
    /** Name returned by the static `lc_name` hook. */
    static lc_name() {
        return "HtmlToTextTransformer";
    }
    /**
     * @param {object} [options={}] Handed to the base constructor and later
     *   forwarded unchanged to `htmlToText` for every transformed document.
     */
    constructor(options = {}) {
        super(options);
        // Defined explicitly as an own, enumerable, writable, configurable property.
        const descriptor = {
            value: options,
            writable: true,
            configurable: true,
            enumerable: true
        };
        Object.defineProperty(this, "options", descriptor);
    }
    /**
     * Converts one document.
     * @param document Object with `pageContent` (HTML string) and `metadata`.
     * @returns Promise of a new `Document` holding the extracted text and a
     *   shallow copy of the input metadata.
     */
    _transformDocument(document) {
        return __awaiter(this, void 0, void 0, function* () {
            const { pageContent: html, metadata } = document;
            const plainText = (0, html_to_text_1.htmlToText)(html, this.options);
            return new documents_1.Document({
                pageContent: plainText,
                metadata: Object.assign({}, metadata),
            });
        });
    }
}
exports.HtmlToTextTransformer = HtmlToTextTransformer;
/**
 * Loads `url` with the Cheerio web loader, then runs the loaded documents
 * through an HTML-aware text splitter piped into `HtmlToTextTransformer`.
 * @param {string} url Address handed to `CheerioWebBaseLoader`.
 * @returns Promise of whatever the splitter/transformer sequence produces.
 */
const cheerioLoader = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const rawDocs = yield new cheerio_1.CheerioWebBaseLoader(url).load();
    const pipeline = text_splitter_1.RecursiveCharacterTextSplitter
        .fromLanguage("html")
        .pipe(new HtmlToTextTransformer());
    return yield pipeline.invoke(rawDocs);
});
exports.cheerioLoader = cheerioLoader;
/**
 * Renders `url` through the Puppeteer-based loader, then splits the page HTML
 * and converts it to plain-text documents.
 *
 * This is best-effort: any failure while loading or transforming is logged as
 * a warning, flagged on `location` (when one is given) and reported as an
 * empty result instead of a rejection.
 *
 * @param {string} url Page to load; invalid URLs and anything containing
 *   `mailto` yield `[]` without launching a browser.
 * @param headless Forwarded as `launchOptions.headless`.
 * @param {object} [location] Record that receives `rejected = true` when the
 *   page could not be loaded. May be omitted.
 * @returns Promise of the transformed documents, or `[]` on skip/failure.
 */
const puppeteerLoader = (url, headless, location) => __awaiter(void 0, void 0, void 0, function* () {
    if ((0, html_1.isValidUrl)(url) === false || url.indexOf('mailto') !== -1) {
        return [];
    }
    try {
        const loader = new pupeteer_1.PuppeteerWebBaseLoader(url, {
            launchOptions: {
                headless,
                ignoreHTTPSErrors: true
            },
            gotoOptions: {
                timeout: 5000,
                waitUntil: "networkidle0",
            },
            // `browser` is part of the callback signature but is not used here.
            // NOTE(review): the browser is never closed in this file — confirm
            // that PuppeteerWebBaseLoader takes care of it.
            evaluate(page, browser) {
                return __awaiter(this, void 0, void 0, function* () {
                    return yield page.evaluate(() => document.body.innerHTML);
                });
            }
        });
        const docs = yield loader.load();
        const pipeline = text_splitter_1.RecursiveCharacterTextSplitter
            .fromLanguage("html")
            .pipe(new HtmlToTextTransformer());
        return yield pipeline.invoke(docs);
    }
    catch (error) {
        // A rejection value is not guaranteed to be an Error (it may even be
        // null), so never dereference it blindly inside the handler.
        const reason = error != null && error.message !== undefined ? error.message : String(error);
        index_1.logger.warn(`Error loading page: ${url}`, reason);
        // `location` is optional for direct callers; a missing one must not
        // turn a handled load failure into a TypeError.
        if (location) {
            location.rejected = true;
        }
        return [];
    }
});
exports.puppeteerLoader = puppeteerLoader;
/**
 * Collects every e-mail-like token found in `text`.
 * The text is scanned line by line (LF or CRLF separated); matches are
 * returned in order of appearance and duplicates are kept.
 * @param {string} text Text to scan.
 * @returns {string[]} All matches of `emailRegex`, possibly empty.
 */
const extractEmailAddresses = (text) => text
    .split(/\r?\n/)
    .reduce((found, row) => {
        const hits = row.match(emailRegex);
        return hits === null ? found : found.concat(hits);
    }, []);
/**
 * Looks for e-mail addresses on the page belonging to `location`.
 *
 * The page loaded is the first entry of `location.meta.links` when that list
 * is non-empty, otherwise `url`. Results are written back onto `location`
 * (`emails`, and `email` when at least one address was found).
 *
 * @param question Not used by this function.
 * @param {string} url Candidate page URL; when it already contains an e-mail
 *   address or `mailto`, nothing is loaded.
 * @param {object} opts Reads `headless` (forwarded to the loader) and
 *   `searchFrom` (logging only).
 * @param {object} location Record that is read (`meta`, `title`, `type`) and
 *   updated in place.
 * @returns Promise of `false` when `url` itself is an address, otherwise the
 *   de-duplicated list of addresses (same array as `location.emails`).
 */
const findEMail = (question, url, opts, location) => __awaiter(void 0, void 0, void 0, function* () {
    // Some records carry an address where a page URL is expected; skip those.
    const urlIsAddress = url.match(emailRegex) || url.match(mailtoRegex) || url.indexOf('mailto') !== -1;
    if (urlIsAddress) {
        index_1.logger.warn('Email found in URL', url);
        return false;
    }
    const links = location.meta && location.meta.links;
    const pageUrl = links && links.length ? links[0] : url;
    const docs = yield (0, exports.puppeteerLoader)(pageUrl, opts.headless, location);
    const collected = [];
    for (const doc of docs) {
        const body = doc.pageContent;
        // Cheap pre-filter: only run the regex on text that contains an '@'.
        if (!body || body.indexOf('@') === -1) {
            continue;
        }
        collected.push(...extractEmailAddresses(body));
    }
    const unique = [...new Set(collected)];
    location.emails = unique;
    if (unique.length) {
        location.email = unique[0];
    }
    if (location.email) {
        index_1.logger.debug(`Found email for ${url} / ${location.title} : ${location.type} : ${location.email} : ${opts.searchFrom}`);
    }
    return unique;
});
exports.findEMail = findEMail;
//# sourceMappingURL=email.js.map