// File viewer header (not part of the program): 134 lines, 5.5 KiB, JavaScript
"use strict";
|
|
/**
 * Compiler-emitted helper that runs a generator function as an asynchronous
 * routine: every value the generator yields is awaited, and the settled value
 * (or rejection reason) is fed back into the generator.
 *
 * An already-installed `this.__awaiter` is reused when present.
 *
 * @param {*} thisArg - `this` binding used when invoking `generator`.
 * @param {Array|IArguments} [_arguments] - Arguments passed to `generator` (defaults to `[]`).
 * @param {Function} [P] - Promise constructor to use; falls back to the global `Promise`.
 * @param {GeneratorFunction} generator - Generator function whose `yield`s act as awaits.
 * @returns {Promise<*>} Resolves with the generator's return value; rejects with
 *   any error the generator throws or leaves uncaught.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Wrap non-P values so that `.then` can always be called on the result.
    function adopt(value) {
        return value instanceof P ? value : new P(function (resolve) { resolve(value); });
    }
    return new (P || (P = Promise))(function (resolve, reject) {
        // Resume the generator with a fulfilled value.
        function fulfilled(value) {
            try { step(generator.next(value)); } catch (e) { reject(e); }
        }
        // Resume the generator by throwing the rejection reason into it.
        function rejected(value) {
            try { step(generator["throw"](value)); } catch (e) { reject(e); }
        }
        // Finish when the generator is done, otherwise wait for the yielded value.
        function step(result) {
            result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected);
        }
        // `generator` is rebound from the generator function to its iterator here.
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
|
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
exports.findEMail = exports.puppeteerLoader = exports.cheerioLoader = exports.HtmlToTextTransformer = void 0;
|
|
const index_1 = require("../index");
|
|
const cheerio_1 = require("langchain/document_loaders/web/cheerio");
|
|
const text_splitter_1 = require("langchain/text_splitter");
|
|
const html_to_text_1 = require("html-to-text");
|
|
const documents_1 = require("@langchain/core/documents");
|
|
const html_1 = require("./html");
|
|
// Matches every email-like token in a string (global flag): local part, "@",
// domain, and a final dot-separated label of at least two ASCII letters.
const emailRegex = /[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/g;
// Matches a string that is exactly a `mailto:` link (case-insensitive);
// capture group 1 is the address after the scheme.
const mailtoRegex = /^mailto:([^\s@]+@[^\s@]+\.[^\s@]+)$/i;
|
|
const pupeteer_1 = require("./pupeteer");
|
|
/**
 * Document transformer that replaces each document's HTML `pageContent` with
 * the plain text produced by html-to-text's `htmlToText`.
 */
class HtmlToTextTransformer extends documents_1.MappingDocumentTransformer {
    /** @returns {string} The fixed name "HtmlToTextTransformer". */
    static lc_name() {
        return "HtmlToTextTransformer";
    }
    /**
     * @param {object} [options={}] - Passed to the superclass constructor and
     *   later handed unchanged to `htmlToText` as its options argument.
     */
    constructor(options = {}) {
        super(options);
        // Compiler-emitted form of a class field: defines `options` as a plain
        // enumerable, configurable, writable own property.
        Object.defineProperty(this, "options", {
            enumerable: true,
            configurable: true,
            writable: true,
            value: options
        });
    }
    /**
     * Converts one document.
     *
     * @param {object} document - Object with `pageContent` (HTML) and `metadata`.
     * @returns {Promise<object>} A new `Document` holding the extracted text and
     *   a shallow copy of the input's metadata.
     */
    _transformDocument(document) {
        return __awaiter(this, void 0, void 0, function* () {
            const extractedContent = (0, html_to_text_1.htmlToText)(document.pageContent, this.options);
            return new documents_1.Document({
                pageContent: extractedContent,
                // Shallow copy so the new document does not share the metadata object.
                metadata: Object.assign({}, document.metadata),
            });
        });
    }
}
|
|
exports.HtmlToTextTransformer = HtmlToTextTransformer;
|
|
/**
 * Loads a page with `CheerioWebBaseLoader`, splits it with the HTML-aware
 * recursive character splitter, and converts each chunk to plain text.
 *
 * @param {string} url - Address handed to `CheerioWebBaseLoader`.
 * @returns {Promise<Array>} Result of invoking the splitter/transformer
 *   sequence on the loaded documents. Errors from loading or transforming
 *   are not caught here and reject the returned promise.
 */
const cheerioLoader = (url) => __awaiter(void 0, void 0, void 0, function* () {
    const loader = new cheerio_1.CheerioWebBaseLoader(url);
    const docs = yield loader.load();
    // Split first, then strip the markup from every chunk.
    const pipeline = text_splitter_1.RecursiveCharacterTextSplitter
        .fromLanguage("html")
        .pipe(new HtmlToTextTransformer());
    return yield pipeline.invoke(docs);
});
|
|
exports.cheerioLoader = cheerioLoader;
|
|
/**
 * Loads a page through the project's `PuppeteerWebBaseLoader`, splits the
 * rendered body HTML with the HTML-aware recursive character splitter, and
 * converts each chunk to plain text.
 *
 * This is best-effort: any error while loading or transforming is logged as a
 * warning and reported as an empty result instead of a rejection.
 *
 * @param {string} url - Page address. Returns `[]` without loading when
 *   `isValidUrl(url)` is `false` or the string contains "mailto".
 * @param {*} headless - Forwarded as `launchOptions.headless`.
 * @param {object} [location] - When provided, `location.rejected` is set to
 *   `true` if loading or transforming fails.
 * @returns {Promise<Array>} The transformed documents, or `[]` on a skipped
 *   URL or on failure.
 */
const puppeteerLoader = (url, headless, location) => __awaiter(void 0, void 0, void 0, function* () {
    if ((0, html_1.isValidUrl)(url) === false || url.indexOf('mailto') !== -1) {
        return [];
    }
    try {
        const loader = new pupeteer_1.PuppeteerWebBaseLoader(url, {
            launchOptions: {
                headless,
                ignoreHTTPSErrors: true
            },
            gotoOptions: {
                timeout: 5000,
                waitUntil: "networkidle0",
            },
            // Return the rendered body markup; the browser is not closed here.
            evaluate(page, browser) {
                return __awaiter(this, void 0, void 0, function* () {
                    const result = yield page.evaluate(() => document.body.innerHTML);
                    return result;
                });
            }
        });
        const docs = yield loader.load();
        // Split first, then strip the markup from every chunk.
        const pipeline = text_splitter_1.RecursiveCharacterTextSplitter
            .fromLanguage("html")
            .pipe(new HtmlToTextTransformer());
        return yield pipeline.invoke(docs);
    }
    catch (error) {
        // Thrown values are not guaranteed to be Error instances.
        const reason = (error && error.message) || String(error);
        index_1.logger.warn('Error loading page: ' + url, reason);
        // Guarded so a missing `location` cannot turn this handled failure
        // into a rejection.
        if (location) {
            location.rejected = true;
        }
        return [];
    }
});
|
|
exports.puppeteerLoader = puppeteerLoader;
|
|
/**
 * Collects every substring of `text` that matches `emailRegex`, in order of
 * appearance. Duplicates are kept.
 *
 * @param {string} text - Text to scan; may span multiple lines.
 * @returns {string[]} The matched addresses, or `[]` when there are none or
 *   when `text` is not a string.
 */
const extractEmailAddresses = (text) => {
    if (typeof text !== "string") {
        return [];
    }
    // The pattern's character classes exclude line breaks, so one global match
    // over the whole text finds the same addresses as matching line by line.
    return text.match(emailRegex) || [];
};
|
|
/**
 * Scrapes a page for email addresses and records them on `location`.
 *
 * The page loaded is `location.meta.links[0]` when that list is non-empty,
 * otherwise `url`.
 *
 * @param {*} question - Not used by this function.
 * @param {string} url - Fallback page address; also used in log messages.
 * @param {object} opts - `opts.headless` is forwarded to the loader and
 *   `opts.searchFrom` appears in the debug log.
 * @param {object} location - Mutated in place: `emails` is always assigned,
 *   `email` is assigned the first address when at least one is found, and the
 *   loader may set `rejected`.
 * @returns {Promise<false|string[]>} `false` when `url` itself is or contains
 *   an email address or `mailto`, otherwise the unique addresses found in
 *   first-seen order.
 */
const findEMail = (question, url, opts, location) => __awaiter(void 0, void 0, void 0, function* () {
    // Some inputs carry an email address or mailto: link where a page URL is
    // expected; there is nothing to load for those.
    if (url.match(emailRegex) || url.match(mailtoRegex) || url.indexOf('mailto') !== -1) {
        index_1.logger.warn('Email found in URL', url);
        return false;
    }
    const links = location.meta && location.meta.links;
    const pageUrl = links && links.length ? links[0] : url;
    const docs = yield (0, exports.puppeteerLoader)(pageUrl, opts.headless, location);
    // A Set keeps first-seen order while dropping duplicates.
    const unique = new Set();
    for (const doc of docs) {
        const content = doc.pageContent;
        // Cheap pre-check before running the regex.
        if (content && content.indexOf('@') !== -1) {
            for (const address of extractEmailAddresses(content)) {
                unique.add(address);
            }
        }
    }
    const emails = [...unique];
    location.emails = emails;
    if (emails.length) {
        location.email = emails[0];
    }
    if (location.email) {
        index_1.logger.debug(`Found email for ${url} / ${location.title} : ${location.type} : ${location.email} : ${opts.searchFrom}`);
    }
    return emails;
});
|
|
exports.findEMail = findEMail;
|
|
//# sourceMappingURL=email.js.map
|