mono/packages/search/dist-in/lib/googlemaps.js
2026-01-22 16:49:47 +01:00

495 lines
44 KiB
JavaScript

import * as path from 'path';
import { CONFIG_DEFAULT, DEFAULT_ROOTS, pathInfo, filesEx } from '@polymech/commons';
import { cleanObjectStrings } from './googlemaps-utils.js';
import { parse as parseProfile } from '@polymech/commons/profile';
import { isFile, resolve, substitute } from '@polymech/commons';
import { zodSchema, zodSchemaEach, yargsOptions, yargsOptionsEach, ResolveFlags, } from './googlemaps-zod.js';
import { parseCustomUrl, resolvePath } from './googlemaps-utils.js';
export { zodSchema, zodSchemaEach, yargsOptions, yargsOptionsEach, ResolveFlags };
import { clone } from '../options.js';
import { sync as write } from '@polymech/fs/write';
import { sync as read } from '@polymech/fs/read';
import { sync as exists } from '@polymech/fs/exists';
import { isArray, isObject } from '@polymech/core/primitives';
import pMap from 'p-map';
import { get_cached_object, set_cached_object } from '@polymech/cache/lib';
import { OSR_CACHE } from '@polymech/commons';
import { logger } from '../index.js';
import { cleanOptions, SearchProviders } from './providers.js';
import { findEMail } from './email.js';
import { defaultEngine, defaultFromLocation, defaultGoogleDomain, defaultLanguage, PAGE_SIZE, SEARCH_AI_PROMPTS } from './constants.js';
import { meta } from './html.js';
import { reverse, REVERSE_DEFAULT } from './geo.js';
import { writeReport } from '../lib/report_map.js';
import { geocode_forward } from './geo.js';
import { store as getStore } from '@polymech/registry';
import axios from 'axios';
// Namespace handed to the object cache (get_cached_object / set_cached_object) for search results.
const MODULE_NAME = 'osr-search';
// Suffix appended to every query string before it is sent to a search provider; currently empty.
const queryExtras = '';
// Predefined Spanish search queries, exposed as a string-keyed lookup object.
export var SearchQueriesES;
(function (queries) {
    // Key -> query text sent to the search provider.
    queries["INJECTION"] = "inyecci\u00F3n de plastico";
})(SearchQueriesES || (SearchQueriesES = {}));
/**
 * Formats coordinates and a zoom level as a location token of the form
 * `@<coords>,<zoom>z`.
 *
 * @param coords - coordinate part, interpolated as-is (e.g. "41.69,2.16").
 * @param zoom - zoom level, defaults to 13.
 * @returns the formatted location string.
 */
export const locationString = (coords, zoom = 13) => {
    const prefix = `@${coords}`;
    return `${prefix},${zoom}z`;
};
/**
 * Obtains the registry store for `storePath` under the namespace `ns`.
 *
 * @param storePath - location passed through to the registry's `store` factory.
 * @param ns - namespace, defaults to 'osr-search'.
 * @returns a promise resolving to whatever the registry factory returns.
 */
export const store = async (storePath, ns = 'osr-search') => {
    return getStore(storePath, ns);
};
/**
 * Looks up `title` in the registry store for `storePath` / `ns`.
 *
 * @param title - key passed to the store's `get`.
 * @param storePath - location passed through to the registry's `store` factory.
 * @param ns - namespace, defaults to 'osr-search'.
 * @returns a promise resolving to the value returned by the store's `get`.
 */
export const getStored = async (title, storePath, ns = 'osr-search') => {
    const registry = getStore(storePath, ns);
    return registry.get(title);
};
/**
 * Runs a web search for a vendor name through the ScaleSERP provider and
 * returns the links of all organic results whose hostname is not blacklisted.
 *
 * @param name - search term; `queryExtras` is appended before the request.
 * @param dst - optional output path; when truthy, the link list is written there.
 * @param opts - options; `api_key` is forwarded to the provider, `blacklist`
 *               is an optional array of hostnames to exclude.
 * @returns array of result links.
 */
export const searchVendor = async (name, dst, opts) => {
    const response = await SearchProviders.scaleserp({
        api_key: opts.api_key,
        q: name + queryExtras,
    });
    // A response without organic results, or options without a blacklist,
    // must not crash the search.
    const organic = response?.organic_results || [];
    const blacklist = opts.blacklist || [];
    const urls = organic
        .filter((result) => {
            try {
                return !blacklist.includes(new URL(result.link).hostname);
            }
            catch (e) {
                // `new URL` throws on a missing or malformed link; such an entry
                // is unusable and must not abort the remaining results.
                return false;
            }
        })
        .map((result) => result.link);
    dst && write(dst, urls);
    return urls;
};
/**
 * Builds the default parameter set for a Google web search.
 * Both `hl` and `gl` are filled from `defaultLanguage`.
 *
 * @param query - search term, stored as `q`.
 * @param mixin - extra parameters; they override the defaults, including `q`.
 * @returns a new parameter object.
 */
export const defaultParamsGoogleES = (query, mixin) => {
    const defaults = {
        location: defaultFromLocation,
        hl: defaultLanguage,
        gl: defaultLanguage,
        google_domain: defaultGoogleDomain,
        q: query,
    };
    return { ...defaults, ...mixin };
};
/**
 * Builds the default parameter set for a maps search of type 'search',
 * located at the fixed coordinates 41.6911354,2.1652746.
 *
 * @param query - search term, stored as `q`.
 * @param zoom - zoom level forwarded to `locationString` (its default applies when undefined).
 * @param mixin - extra parameters; they override the defaults.
 * @returns a new parameter object.
 */
export const defaultSearchParamsMapsES = (query, zoom, mixin = {}) => {
    const ll = locationString('41.6911354,2.1652746', zoom);
    const defaults = {
        engine: defaultEngine,
        type: 'search',
        q: query,
        ll,
        google_domain: defaultGoogleDomain,
        hl: defaultLanguage,
    };
    return { ...defaults, ...mixin };
};
/**
 * Runs a Google web search through the SerpApi provider.
 *
 * @param query - search term; `queryExtras` is appended. Always wins over `opts.q`.
 * @param location - value for the `location` parameter.
 * @param key - API key, sent as `api_key`.
 * @param opts - extra parameters; they override the English / US / google.com defaults.
 * @returns the provider's response.
 */
export const searchVendorSA = async (query, location, key, opts) => {
    const defaults = {
        api_key: key,
        location,
        hl: "en",
        gl: "us",
        google_domain: "google.com",
    };
    // Precedence: defaults < caller options < the actual query.
    const googleParams = { ...defaults, ...opts, q: query + queryExtras };
    return await SearchProviders.serpApi("google", googleParams);
};
/**
 * Searches Google Maps through the SerpApi provider, collects the results of
 * all pages, enriches them and applies the filters configured in `opts`.
 *
 * Flow:
 *  1. Build the request parameters from `opts` plus `api_key`, `q` and `ll`.
 *  2. Reuse the cached result list when `opts.searchCache` and `OSR_CACHE()`
 *     allow it; otherwise page through `local_results` / `place_results`
 *     and store the collected list in the cache.
 *  3. Run a first enrichment pass (geo + meta, no e-mail or photo lookups)
 *     whose output the filters rely on.
 *  4. Filter by city / country / continent / type / excluded types and drop
 *     entries without `gps_coordinates`.
 *  5. Fully enrich, with the caller's options, up to `opts.limit` entries that
 *     are absent from the index or lack `geo` / `meta` there.
 *
 * @param query - search term; `queryExtras` is appended.
 * @param key - provider API key.
 * @param opts - search options (engine, zoom, limit, filters, index path, cache flags, ...).
 * @returns the filtered result list; entries are enriched in place.
 */
export const searchGoogleMap = async (query, key, opts) => {
    const googleParams = {
        ...opts,
        api_key: key,
        q: query + queryExtras,
        ll: opts.searchCoord
    };
    const params = googleParams;
    const index = opts.index ? read(opts.index, 'json') || {} : {};
    const cache_key = {
        engine: params.engine,
        type: params.type,
        q: params.q,
        google_domain: params.google_domain,
        hl: params.hl,
        zoom: params.zoom,
        searchFrom: params.searchFrom,
        limit: params.limit
    };
    const useCache = Boolean(opts.searchCache && OSR_CACHE());
    const cached = useCache ? await get_cached_object(cache_key, MODULE_NAME) : undefined;
    let results = [];
    if (isArray(cached) && cached.length > 0) {
        // The cache holds the flattened result list written below, not a provider
        // page, so it must be used directly instead of being paged through.
        results = [...cached];
    }
    else {
        // An empty cached list is treated as a miss so that it cannot suppress a
        // real search. A non-array cached value is still consumed as a page.
        let pageIdx = 0;
        let page = (cached && !isArray(cached))
            ? cached
            : await SearchProviders.serpApi(googleParams.engine, { ...googleParams });
        while (page && page.local_results) {
            page.local_results.forEach((r) => {
                r.page = pageIdx;
            });
            results.push(...page.local_results);
            if (results.length >= opts.limit)
                break;
            pageIdx++;
            page = await page.next?.();
        }
        while (page && page.place_results) {
            // A single place may be delivered as a bare object on any page.
            const places = isArray(page.place_results) ? page.place_results : [page.place_results];
            places.forEach((r) => {
                r.page = pageIdx;
            });
            results.push(...places);
            if (results.length >= opts.limit)
                break;
            pageIdx++;
            page = await page.next?.();
        }
        if (useCache && results.length > 0) {
            try {
                await set_cached_object(cache_key, MODULE_NAME, results);
            }
            catch (e) {
                // Caching is best effort; a failed write must not fail the search.
                logger.warn(`Could not cache results for "${query}"`, e);
            }
        }
    }
    // Optimization: defer expensive enrichment (photos, e-mail) until AFTER filtering.
    // NOTE(review): `meta: true` means website metadata is still fetched in this
    // pass for every unfiltered result - confirm this is intended.
    const preFilterOpts = { ...opts, resolve: [], meta: true, findEMail: false };
    await enrichResults(results, index, preFilterOpts);
    logger.debug(`search ${query} with ${params.ll} / ${params.searchFrom} @ ${opts.zoom} | ${results.length} results before filters`);
    // Case-insensitive comparison that tolerates missing geo data instead of throwing.
    const sameText = (value, expected) => typeof value === 'string' && value.toLowerCase() === expected.toLowerCase();
    if (opts.filterCity) {
        results = results.filter((r) => sameText(r.geo?.city, opts.filterCity));
    }
    if (opts.filterCountry) {
        results = results.filter((r) => sameText(r.geo?.countryName, opts.filterCountry));
    }
    if (opts.filterContinent) {
        results = results.filter((r) => sameText(r.geo?.continent, opts.filterContinent));
    }
    if (opts.filterType) {
        results = results.filter((r) => r.type === opts.filterType);
    }
    if (opts.excludedTypes && opts.excludedTypes.length > 0) {
        results = results.filter((r) => {
            // If result has no types, keep it (conservative)
            if (!r.types || r.types.length === 0)
                return true;
            // If ANY of result types matches ANY of excluded types, filter it OUT
            return !r.types.some(t => opts.excludedTypes.includes(t));
        });
    }
    results = results.filter((r) => r.gps_coordinates);
    const beforeCached = results.length;
    // "New" means: not in the index yet, or indexed without geo / meta data.
    const newResults = results.filter((r) => {
        return index[r.title] == null || !index[r.title].geo || !index[r.title].meta;
    });
    logger.info(`found ${newResults.length} new items for "${query}" (Zoom: ${opts.zoom} | Limit: ${opts.limit}) from "${params.searchFrom}" | ${beforeCached} total before cache filtering`);
    const processedResults = newResults.slice(0, opts.limit);
    // Entries are mutated in place, so the full enrichment is visible in `results`.
    await enrichResults(processedResults, index, opts);
    return results;
};
/**
 * Enriches search results in place. Each phase prefers data already present in
 * `index` (keyed by `entry.title`) over a fresh lookup:
 *  - always: `position` and reverse-geocoded `geo`;
 *  - `opts.meta`: website metadata for entries with a `website` that are not `rejected`;
 *  - `opts.findEMail` (requires `opts.meta`): e-mail lookup for entries with `meta` and `website`;
 *  - `opts.resolve` containing `ResolveFlags.PHOTOS`: photo data fetched from `photos_link`.
 *
 * A failure for a single entry is logged and does not abort the remaining entries.
 *
 * @param results - result entries, mutated in place.
 * @param index - previously stored entries keyed by title.
 * @param opts - options (`concurrency`, `meta`, `findEMail`, `resolve`, `api_key`, ...).
 * @returns the same `results` array.
 */
const enrichResults = async (results, index, opts) => {
    let idx = 0;
    await pMap(results, async (entry) => {
        idx++;
        // NOTE(review): `idx` is a running counter over the whole list, not the
        // offset within a page - confirm the intended meaning of `position`.
        entry.position = entry.page * PAGE_SIZE + idx;
        try {
            if (index[entry.title] && index[entry.title].geo) {
                entry.geo = index[entry.title].geo;
                return;
            }
            // `await` is required here: returning the pending promise would let a
            // rejection bypass the catch block and reject the whole batch.
            return await reverse(entry, opts);
        }
        catch (e) {
            logger.error(`Error reverse geocoding ${entry.title}`);
            entry.geo = REVERSE_DEFAULT;
        }
    }, { concurrency: opts.concurrency });
    if (opts.meta) {
        await pMap(results, async (entry) => {
            if (entry.meta || !entry.website || entry.rejected) {
                return;
            }
            try {
                if (index[entry.title] && index[entry.title].meta) {
                    entry.meta = index[entry.title].meta;
                    return;
                }
                return await meta(entry, opts);
            }
            catch (e) {
                // Leave `entry.meta` unset, but do not hide the failure.
                logger.warn(`Error retrieving meta data for ${entry.title}`, e);
            }
        }, { concurrency: 1 });
    }
    if (opts.findEMail && opts.meta) {
        await pMap(results, async (entry) => {
            if (index[entry.title] && index[entry.title].email) {
                entry.email = index[entry.title].email;
                return;
            }
            if (entry.meta && entry.website && !entry.email) {
                try {
                    return await findEMail(SEARCH_AI_PROMPTS.GET_EMAIL, entry.website, opts, entry);
                }
                catch (e) {
                    logger.error(`Error retrieving EMail data ${entry.title}`);
                }
            }
        }, { concurrency: 1 });
    }
    if (opts.resolve?.includes(ResolveFlags.PHOTOS)) {
        await pMap(results, async (entry) => {
            if (entry.google_media || !entry.data_id) {
                return;
            }
            if (index[entry.title] && index[entry.title].google_media) {
                entry.google_media = index[entry.title].google_media;
                return;
            }
            try {
                if (!entry.photos_link)
                    return;
                // `photos_link` is extended with `&`, so it is assumed to already
                // carry a query string.
                const url = `${entry.photos_link}&api_key=${opts.api_key}`;
                const response = await axios.get(url);
                const photos = response.data;
                if (photos) {
                    entry.google_media = photos;
                }
            }
            catch (e) {
                logger.error(`Error retrieving photo data for ${entry.title}`, e);
            }
        }, { concurrency: 10 });
    }
    return results;
};
/**
 * Normalises and validates command options for a maps search.
 *
 * Merges the maps defaults, the caller's `argv` and keys from the loaded
 * configuration, validates the result with `zodSchema()` and resolves the
 * `index` and `store` paths to absolute paths.
 *
 * Side effect: sets the logger's minimum level from `argv.logLevel` (default 2).
 *
 * @param argv - raw options (CLI arguments or a programmatic options object).
 * @returns the parsed options, or `undefined` when no configuration is found,
 *          no API key is available from options or configuration, or no query is given.
 *          Validation errors raised by the schema propagate to the caller.
 */
export const parse = (argv) => {
    const args = argv;
    logger.settings.minLevel = args.logLevel || 2;
    const config = CONFIG_DEFAULT(args.env_key);
    if (!config) {
        logger.warn('No config found!');
        return;
    }
    // A key passed via options is sufficient; only give up when neither the
    // options nor the configuration provide one.
    if (!argv.api_key && !config.serpapi?.key) {
        logger.warn('No serpapi key found in config!');
        return;
    }
    let opts = {
        query: argv.query,
        ...defaultSearchParamsMapsES(argv.query, argv.zoom),
        ...argv,
        // Optional chaining: a missing config section must not throw.
        api_key: argv.api_key || config.serpapi?.key,
        geocode_key: argv.geocode_key || config.geocoder?.key,
        openai: config.openai,
        headless: Boolean(argv.headless),
        bigdata: { key: config.bigdata?.key }
    };
    opts = zodSchema().parse(opts);
    /*
    Disabled: loading `source` from a JSON file.
    opts.source && isString(opts.source) && (opts.source = path.resolve(resolve(args.source, false)))
    if (opts.source && isString(opts.source)) {
    if (exists(opts.source)) {
    opts.source = read(opts.source, 'json')
    } else {
    logger.error(`Source file ${args.source} not found : ${opts.source}`)
    return
    }
    }
    */
    if (!opts.source && !opts.query) {
        logger.warn(`Invalid source and query`);
        return;
    }
    if (opts.index) {
        opts.index = path.resolve(resolve(opts.index, false));
    }
    if (opts.store) {
        opts.store = path.resolve(resolve(opts.store, false));
    }
    // Re-checked after validation, since the schema produces a new options object.
    if (!opts.api_key) {
        logger.error('No Serpapi key found in config or options!');
        return;
    }
    if (!opts.query) {
        logger.error('No query specified');
        return;
    }
    return opts;
};
/**
 * Runs the maps search for every term in `opts.source` and persists the results.
 *
 * Steps:
 *  1. Validate the options with `parse`; log and return `undefined` when invalid.
 *  2. If a `searchFrom` place and a geocode key are given but no `searchCoord`,
 *     forward-geocode the place and derive `searchCoord` from it.
 *  3. For each category key of `opts.source` and each term in it (strictly
 *     sequential), run `search`: reuse `<dst>.json` when present and caching is
 *     not disabled, otherwise call `searchGoogleMap` and write the result, a
 *     report and an `_options.json` file.
 *  4. When `opts.dst` is set, merge all results with an existing aggregate file,
 *     deduplicate by `place_id`, clean strings, unwrap `/url?q=` website links
 *     and write the aggregate plus its report.
 *  5. When `opts.index` is set, add entries with titles not yet indexed and
 *     write the index back.
 *
 * @param opts - raw options, passed through `parse`.
 * @returns the flattened list of results of all searches, or `undefined` on invalid options.
 */
export const googleMaps = async (opts) => {
opts = parse(opts);
if (!opts) {
logger.error('Invalid options', opts);
return;
}
// Geocoding is best effort: failures are logged and the search continues without `searchCoord`.
try {
const searchFrom = substitute(false, opts.searchFrom, opts.variables);
if (searchFrom && opts.geocode_key && !opts.searchCoord) {
const coords = await geocode_forward(searchFrom, opts.geocode_key);
if (coords) {
opts.searchCoord = locationString(coords, opts.zoom);
}
else {
logger.error(`Error geocoding "${searchFrom}"`);
}
}
}
catch (error) {
logger.error(`Error geocoding "${opts.searchFrom}"`, error, error.stack);
}
// Accumulates the results of every `search` call; appended to from inside the closure below.
let ret = [];
// Runs one search term of one category; works on a clone so `dst` changes stay local.
const search = async (query, category, opts) => {
opts = clone(opts);
if (opts.dst) {
opts.dst = resolvePath(path.join(opts.cwd || '', opts.dst || ''), query, category, opts);
logger.debug(`output destination --dst "${opts.dst}"`);
}
// Reuse a previously written result file unless caching is explicitly disabled.
if (opts.dst && opts.cache !== false && exists(opts.dst + '.json')) {
const cachedPath = opts.dst + '.json';
const cached = read(cachedPath, 'json') || [];
// NOTE(review): logs `opts.query`, not the `query` term being searched - confirm.
logger.debug(`Searching ${opts.query} with ${opts.searchFrom} :: returning cached ${cached.length}`);
ret = [...ret, ...cached];
return cached;
}
try {
const sr = await searchGoogleMap(query, opts.api_key, { ...opts });
// Only non-empty results are persisted: results, report and the options used.
if (sr && sr.length && opts.dst) {
write(opts.dst + '.json', sr);
writeReport(sr, opts.dst, opts);
const parts = path.parse(opts.dst);
write(path.join(parts.dir, parts.name + '_options.json'), cleanOptions(opts));
}
ret = [...ret, ...sr];
return sr;
}
catch (error) {
// A failed search yields an empty list so the remaining terms still run.
logger.error('Error searching GoogleMaps : ' + error.message, error, error.stack);
return [];
}
};
// TODO: retry, ...
// Categories and their terms are processed one at a time (concurrency 1).
const all = (await pMap(Object.keys(opts.source), (k) => {
return pMap(opts.source[k], (t) => {
return search(t, k, opts);
}, {
concurrency: 1,
});
}, {
concurrency: 1,
})).flat(2);
if (opts.dst) {
// Aggregate file over all categories and terms.
opts.dst = resolvePath(opts.dst, 'all', 'all', opts);
let existingResults = [];
if (exists(opts.dst + '.json')) {
existingResults = read(opts.dst + '.json', 'json') || [];
}
// Combine, deduplicate, clean, and process URLs in a single chain.
// Entries without `place_id` are dropped; for duplicates the later entry wins.
const finalResults = Array.from([...existingResults, ...ret].reduce((map, obj) => {
if (obj.place_id) {
map.set(obj.place_id, obj);
}
return map;
}, new Map()).values()).map(cleanObjectStrings)
.map((r) => {
// Unwrap redirect-style links: keep the decoded target up to the first '&'.
if (r.website && typeof r.website === 'string' && r.website.startsWith('/url?q=')) {
try {
const urlString = r.website.substring('/url?q='.length);
const decodedUrl = decodeURIComponent(urlString);
const urlParts = decodedUrl.split('&');
r.website = urlParts[0];
}
catch (e) {
logger.warn(`Could not parse website URL: ${r.website}`);
}
}
return r;
});
write(opts.dst + '.json', finalResults);
writeReport(finalResults, opts.dst, opts);
}
if (opts.index) {
let index = read(opts.index, 'json') || {};
// NOTE(review): existing index entries are never updated, even when the new
// result carries additional data (geo / meta) - confirm this is intended.
ret.forEach((r) => {
if (!index[r.title]) {
index[r.title] = r;
}
});
write(opts.index, index);
}
return all;
};
/**
 * Migration entry point for moving index entries into the registry store.
 * The actual write is currently disabled, so this only loads the index,
 * requests the store and maps every entry to `undefined`.
 *
 * @param opts - options; requires `store` and `index` paths.
 * @returns one `undefined` per index entry, or `undefined` when `store` or `index` is missing.
 */
export const migrate = async (opts) => {
    if (!opts.store) {
        logger.error('No store provided');
        return;
    }
    if (!opts.index) {
        logger.error('No index provided');
        return;
    }
    const loaded = read(opts.index, 'json') || {};
    // An index stored as a title -> entry object is flattened to its entries.
    const entries = (!isArray(loaded) && isObject(loaded)) ? Object.values(loaded) : loaded;
    // Requested but not yet used (and not awaited), exactly as before.
    const _store = store(opts.store, 'osr-search');
    return pMap(entries, async (entry) => {
        // return _store.set(entry.title, entry)
    });
};
/**
 * Runs `googleMaps` once per item of a list, substituting the item into the
 * query, destination and search origin.
 *
 * The list comes from the first source that applies: an array produced by
 * `parseCustomUrl`, a `.json` file at the resolved list path, or a glob
 * evaluated against `opts.cwd`.
 *
 * Side effects: sets the logger's minimum level, writes the item list to
 * `list.json` in the working directory and, when `opts.log` is set, writes all
 * results there.
 *
 * @param opts - raw options (`list`, `profile`, `area`, `country`, `query`, `dst`, ...).
 * @returns one result per item, the result of `migrate` when `opts.migrate` is set,
 *          or `undefined` on invalid options or an empty list.
 */
export const each = async (opts) => {
    logger.settings.minLevel = opts.logLevel || 2;
    if (!opts.list) {
        logger.error('No list provided for each command');
        return;
    }
    // Resolved from the raw options, before `parse` replaces `opts`.
    const listPath = path.resolve(resolve(opts.list));
    const profileVariables = {
        AREA: opts.area,
        COUNTRY: opts.country,
        ...DEFAULT_ROOTS
    };
    const profile = parseProfile(opts.profile, { variables: profileVariables, includes: [], env: {} }, { env: opts.env });
    opts = parse(opts);
    if (!opts) {
        logger.error('Invalid options', opts);
        return;
    }
    if (opts.migrate) {
        return migrate(opts);
    }
    const list = await parseCustomUrl(substitute(false, opts.list, profile.variables));
    const isJsonFile = () => exists(listPath) && isFile(listPath) && path.parse(listPath).ext === '.json';
    let items = [];
    if (isArray(list)) {
        items = list;
    }
    else if (isJsonFile()) {
        items = read(listPath, 'json') || [];
    }
    else if (pathInfo(opts.list).IS_GLOB) {
        items = filesEx(path.resolve(resolve(opts.cwd)) || './', opts.list) || [];
    }
    if (!items || items.length === 0) {
        logger.error('osr-cli::each: invalid list or empty list');
        return;
    }
    items = items.filter(Boolean);
    logger.debug(`${items.length} items`);
    const listDump = path.resolve(resolve(opts.cwd), 'list.json');
    write(path.join(listDump), items);
    // Items are processed one at a time.
    const all = await pMap(items, (item) => {
        const variables = {
            KEY: item,
            TOWN: item,
            ...profile.variables
        };
        return googleMaps({
            ...opts,
            query: substitute(false, opts.query, variables),
            dst: substitute(false, opts.dst, variables),
            searchFrom: substitute(false, opts.searchFrom, variables),
            variables
        });
    }, { concurrency: 1 });
    if (opts.log) {
        write(path.resolve(resolve(opts.log)), all);
    }
    return all;
};
//# sourceMappingURL=data:application/json;base64,