Clean up kbot C++ sources :)

This commit is contained in:
lovebird 2026-03-30 11:21:35 +02:00
parent 2f7c18adb5
commit b0916df0f5
28 changed files with 97 additions and 3305 deletions

View File

@ -90,12 +90,6 @@ add_subdirectory(packages/http)
add_subdirectory(packages/json)
add_subdirectory(packages/polymech)
add_subdirectory(packages/ipc)
add_subdirectory(packages/geo)
add_subdirectory(packages/gadm_reader)
add_subdirectory(packages/grid)
add_subdirectory(packages/search)
add_subdirectory(packages/enrichers)
add_subdirectory(packages/liboai/liboai)
add_subdirectory(packages/kbot)
@ -103,19 +97,15 @@ add_subdirectory(packages/kbot)
# Sources
add_executable(${PROJECT_NAME}
src/main.cpp
src/cmd_gridsearch.cpp
src/cmd_gridsearch-filters.cpp
src/cmd_gridsearch-uds.cpp
src/cmd_gridsearch-postgres.cpp
src/cmd_kbot.cpp
src/gridsearch_serialize.cpp
src/cmd_kbot_uds.cpp
src/sys_metrics.cpp
)
# Output file name is kbot.exe / kbot (not kbot-cli)
set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "kbot")
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers kbot)
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc search kbot)
target_include_directories(${PROJECT_NAME} PRIVATE
${asio_SOURCE_DIR}/asio/include

View File

@ -21,11 +21,6 @@
"kbot:ai": ".\\dist\\kbot.exe kbot ai --prompt \"hi\"",
"kbot:run": ".\\dist\\kbot.exe kbot run --list",
"test:ipc": "node orchestrator/test-ipc.mjs",
"test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs",
"test:gridsearch-filter-ipc": "cmake --build build/release --target test_gridsearch_ipc && .\\dist\\test_gridsearch_ipc.exe",
"test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs",
"test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs",
"test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs",
"test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe"
},
"repository": {

View File

@ -1,4 +0,0 @@
# Static library of site-enrichment helpers (meta + email scraping).
add_library(enrichers STATIC src/enrichers.cpp)
target_include_directories(enrichers PUBLIC include)
# PUBLIC: the enrichers headers/implementation pull in these packages.
target_link_libraries(enrichers PUBLIC http html json logger)

View File

@ -1,162 +0,0 @@
#pragma once
#include <map>
#include <string>
#include <vector>
namespace enrichers {

// ── Status codes ────────────────────────────────────────────────────────────
/// Final outcome of enrich_location() for a single location.
enum class EnrichStatus {
  OK,            ///< At least one email address was found.
  NO_EMAIL,      ///< Pages were fetched but yielded no email.
  META_TIMEOUT,  ///< The homepage meta scrape exhausted its time budget.
  EMAIL_TIMEOUT, ///< The contact-page email scrape exhausted its time budget.
  FETCH_ERROR,   ///< Homepage fetch failed (or the location has no website).
  NO_PAGES,      ///< Homepage fetched, but no contact-like pages were found.
  ERROR,         ///< Unspecified failure.
};

/// Canonical uppercase name for a status code (e.g. "OK", "NO_EMAIL").
const char *status_string(EnrichStatus s);

// ── Data types ──────────────────────────────────────────────────────────────

/// Per-page record of one email-scrape attempt (success or failure).
struct PageError {
  std::string url;
  std::string status; // "SEARCHED_EMAIL", "FAILED", ...
  std::string method; // "GET", "SCRAPELESS", ...
  std::string error;
  int http_status = 0;
  std::vector<std::string> emails; // emails found on this page (may be empty)
};

/// A social-network profile link discovered on a page.
struct SocialLink {
  std::string platform; // "instagram", "facebook", "linkedin", ...
  std::string url;
};

/// Metadata extracted from a single fetched web page.
struct SiteMeta {
  std::string title;
  std::string description;
  std::string og_image;
  std::string canonical;
  std::vector<SocialLink> socials;
  std::vector<std::string> internal_pages; // discovered internal hrefs
  std::vector<std::string> emails;
  std::string body_text;
  std::string body_html;
  std::map<std::string, std::string> sites; // url -> body_md
  int http_status = 0;
  std::string fetch_error; // non-empty when the fetch itself failed
  std::vector<std::string> json_ld;
};

/// Aggregated result of the enrichment pipeline for one location.
struct EnrichedNode {
  int idx = 0;
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  int pages_found = 0;   // candidate contact pages discovered
  int pages_scraped = 0; // contact pages actually fetched
  std::vector<std::string> emails;
  std::vector<SocialLink> socials;
  int meta_ms = 0;  // phase 1 (meta scrape) duration
  int email_ms = 0; // phase 2 (email scrape) duration
  int total_ms = 0; // end-to-end duration
  EnrichStatus status = EnrichStatus::NO_EMAIL;
  std::string error;
  std::map<std::string, std::string> pages; // "home" → body text
  std::vector<std::string> meta_pages;
  std::vector<PageError> page_errors;
  std::string enricher_hash;
  std::string geo_json;
  std::map<std::string, std::string> sites; // url -> body_md
};

// ── Configuration ───────────────────────────────────────────────────────────

/// Tunables for the enrichment pipeline.
struct EnrichConfig {
  bool enable_homepage_md = true; // convert homepage HTML to markdown
  int meta_timeout_ms = 10000;
  int email_timeout_ms = 15000;
  int email_page_timeout_ms = 10000;
  int email_max_pages = 8;
  int email_abort_after = 1;
  /// Scrapeless API key — if set, pages that yield no emails via plain
  /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping
  /// API (JS rendering). Leave empty to disable the fallback.
  std::string scrapeless_key;
  std::string bigdata_key;
  // URL substrings that mark an internal page as a likely contact page.
  std::vector<std::string> contact_patterns = {
      "contact", "kontakt", "contacto", "contacta", "impression",
      "about", "impress", "impressum", "datenschutz", "privacy",
      "legal", "team", "nosotros", "empresa", "sobre",
  };
  std::vector<std::string> probe_paths = {
      "/contact", "/contacto", "/kontakt", "/contacta",
      "/about", "/about-us", "/impressum",
  };
  std::string meta_scraper; // "SCRAPELESS" routes meta scraping through the API
  int meta_concurrency = 5;
  int meta_idle_timeout = 60;
};

// ── Location input ──────────────────────────────────────────────────────────

/// Input record describing one location to enrich.
struct LocationInput {
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  double lat = 0;
  double lng = 0;
};

// ── Core API ────────────────────────────────────────────────────────────────

/// Check if a candidate string looks like a real email address.
bool is_likely_email(const std::string &candidate);

/// Extract all email addresses from a text body.
std::vector<std::string> extract_emails(const std::string &text);

/// Scrape metadata from a website URL (static HTML via libcurl + lexbor).
SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000);

/// Scrape emails from a single page URL.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms = 10000);

/// Overload that also reports the HTTP status code of the page fetch.
/// This matches the definition in enrichers.cpp; the two-parameter form
/// above is kept for source compatibility.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code);

/// Fetch a page via Scrapeless Universal Scraping API (JS rendering),
/// then extract emails from the rendered HTML. Returns empty if key is
/// blank or the API call fails.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms = 15000);

/// Scrape metadata from a website URL via Scrapeless Universal API (JS
/// rendering).
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key,
                                int timeout_ms = 15000);

/// Full enrichment pipeline for a single location: meta → email.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg = {});

/// Resolve a URL relative to a base URL.
std::string resolve_url(const std::string &base, const std::string &href);

} // namespace enrichers

View File

@ -1,800 +0,0 @@
#include "enrichers/enrichers.h"

#include "html/html.h"
#include "http/http.h"
#include "json/json.h"
#include "logger/logger.h"

#include <algorithm>
#include <cctype>
#include <chrono>
#include <future>
#include <regex>
#include <set>
#include <sstream>
#include <thread>
namespace enrichers {
// ── Status string ───────────────────────────────────────────────────────────
/// Map an EnrichStatus to its canonical uppercase name; "UNKNOWN" for any
/// value outside the enum.
const char *status_string(EnrichStatus s) {
  struct Entry {
    EnrichStatus st;
    const char *name;
  };
  static constexpr Entry kNames[] = {
      {EnrichStatus::OK, "OK"},
      {EnrichStatus::NO_EMAIL, "NO_EMAIL"},
      {EnrichStatus::META_TIMEOUT, "META_TIMEOUT"},
      {EnrichStatus::EMAIL_TIMEOUT, "EMAIL_TIMEOUT"},
      {EnrichStatus::FETCH_ERROR, "FETCH_ERROR"},
      {EnrichStatus::NO_PAGES, "NO_PAGES"},
      {EnrichStatus::ERROR, "ERROR"},
  };
  for (const auto &e : kNames) {
    if (e.st == s)
      return e.name;
  }
  return "UNKNOWN";
}
// ── Timing helper ───────────────────────────────────────────────────────────
/// Milliseconds elapsed since `t0`, truncated to int.
static int elapsed_ms(std::chrono::steady_clock::time_point t0) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  const auto delta = std::chrono::steady_clock::now() - t0;
  return static_cast<int>(duration_cast<milliseconds>(delta).count());
}
// ── Email extraction ────────────────────────────────────────────────────────
// (A static EMAIL_RE std::regex used to live here; it was never referenced —
// extraction is done by hand in extract_emails — so the dead regex, and its
// static-initialization cost, has been removed.)

// Asset extensions that disqualify an email-like string (e.g. "logo@2x.png")
static const std::vector<std::string> ASSET_EXTS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    ".avif", ".css", ".js", ".woff", ".woff2", ".ttf",
    ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico",
};

/// ASCII lower-case copy of `s`.
static std::string to_lower(const std::string &s) {
  std::string out = s;
  std::transform(out.begin(), out.end(), out.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  return out;
}

/// Heuristic filter: does `candidate` look like a real, human-owned email
/// address? Rejects malformed strings, asset filenames, common
/// placeholder/tracking addresses, hex-hash local parts, and numeric-only
/// domains.
bool is_likely_email(const std::string &candidate) {
  // 254 is the usual RFC 5321 ceiling for a full address.
  if (candidate.size() < 5 || candidate.size() > 254)
    return false;
  if (candidate.find("..") != std::string::npos)
    return false;
  auto at_pos = candidate.find('@');
  if (at_pos == std::string::npos || at_pos == 0 ||
      at_pos == candidate.size() - 1)
    return false;
  auto lower = to_lower(candidate);
  // Reject asset-like extensions
  for (auto &ext : ASSET_EXTS) {
    if (lower.size() >= ext.size() &&
        lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) {
      return false;
    }
  }
  // Reject common placeholders (note: "test" also drops e.g. "contest@…" —
  // a deliberate trade-off in favour of precision).
  if (lower.find("example") != std::string::npos)
    return false;
  if (lower.find("sentry") != std::string::npos)
    return false;
  if (lower.find("test") != std::string::npos)
    return false;
  if (lower.find("placeholder") != std::string::npos)
    return false;
  if (lower.find("wixpress.com") != std::string::npos)
    return false;
  // Reject if local part is pure hex hash (8+ hex chars) — typical of
  // auto-generated tracking addresses.
  if (at_pos >= 8) {
    auto local = lower.substr(0, at_pos);
    bool all_hex = std::all_of(local.begin(), local.end(), [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
    });
    if (all_hex)
      return false;
  }
  // Reject if domain part looks numeric-only (e.g. 1234@5678)
  auto domain = lower.substr(at_pos + 1);
  auto dot_pos = domain.find('.');
  if (dot_pos == std::string::npos)
    return false;
  if (domain.length() - dot_pos <= 2)
    return false; // Minimum 2 chars for TLD
  auto domPart = domain.substr(0, dot_pos);
  bool all_digits =
      !domPart.empty() &&
      std::all_of(domPart.begin(), domPart.end(),
                  [](unsigned char c) { return std::isdigit(c); });
  if (all_digits)
    return false;
  return true;
}
/// Characters permitted in the local/domain text we scan around an '@'.
static bool is_valid_email_char(char c) {
  switch (c) {
  case '.':
  case '_':
  case '%':
  case '+':
  case '-':
    return true;
  default:
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
  }
}
/// Extract all plausible email addresses from `text`, lower-cased and
/// de-duplicated, preserving first-seen order. Regex-free: for each '@' it
/// scans outward over valid email characters, trims stray leading/trailing
/// '.'/'-', then filters candidates through is_likely_email().
std::vector<std::string> extract_emails(const std::string &text) {
  std::vector<std::string> results;
  if (text.empty())
    return results;
  std::set<std::string> seen;
  size_t pos = 0;
  while ((pos = text.find('@', pos)) != std::string::npos) {
    // An '@' at the very start/end cannot be part of an address.
    if (pos == 0 || pos == text.length() - 1) {
      pos++;
      continue;
    }
    // Scan backwards
    size_t start = pos;
    while (start > 0 && is_valid_email_char(text[start - 1])) {
      start--;
    }
    // Scan forwards
    size_t end = pos;
    while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) {
      end++;
    }
    // Require at least one character on each side of the '@'.
    if (start < pos && end > pos) {
      std::string candidate = text.substr(start, end - start + 1);
      // Strip trailing dots/hyphens eagerly grabbed
      while (!candidate.empty() &&
             (candidate.back() == '.' || candidate.back() == '-')) {
        candidate.pop_back();
        end--; // keep `end` in sync so the resume position below stays right
      }
      // Strip leading dots/hyphens
      size_t local_start = 0;
      while (local_start < candidate.length() &&
             (candidate[local_start] == '.' || candidate[local_start] == '-')) {
        local_start++;
      }
      if (local_start > 0) {
        candidate = candidate.substr(local_start);
      }
      std::string lower = to_lower(candidate);
      if (is_likely_email(lower)) {
        if (seen.insert(lower).second) {
          results.push_back(lower);
        }
      }
    }
    // Resume after the scanned span so the same run is not re-parsed.
    pos = end + 1;
  }
  return results;
}
// ── URL resolution ──────────────────────────────────────────────────────────
/// Resolve `href` against `base`: absolute http(s) URLs pass through,
/// "//host/..." inherits base's scheme (https by default), mailto:/tel:/
/// javascript:/#fragment yield "", and relative paths are joined against
/// the base origin or directory. Returns "" when resolution is impossible.
std::string resolve_url(const std::string &base, const std::string &href) {
  if (href.empty())
    return {};
  auto starts_with = [](const std::string &s, const char *prefix) {
    return s.rfind(prefix, 0) == 0;
  };
  // Absolute URL: nothing to do.
  if (starts_with(href, "http://") || starts_with(href, "https://"))
    return href;
  // Protocol-relative: borrow the scheme from `base`.
  if (starts_with(href, "//")) {
    const auto scheme_end = base.find("//");
    if (scheme_end != std::string::npos)
      return base.substr(0, scheme_end) + href;
    return "https:" + href;
  }
  // Non-navigational schemes and bare fragments resolve to nothing.
  if (starts_with(href, "mailto:") || starts_with(href, "tel:") ||
      starts_with(href, "javascript:") || href[0] == '#')
    return {};
  // Everything below needs the base origin, e.g. "https://example.com".
  const auto proto = base.find("://");
  if (proto == std::string::npos)
    return {};
  const auto origin_end = base.find('/', proto + 3);
  const std::string origin =
      (origin_end == std::string::npos) ? base : base.substr(0, origin_end);
  if (href[0] == '/')
    return origin + href;
  // Relative without a leading slash: join with the base's directory.
  if (origin_end != std::string::npos) {
    const auto last_slash = base.rfind('/');
    if (last_slash > proto + 2)
      return base.substr(0, last_slash + 1) + href;
  }
  return origin + "/" + href;
}
// ── Social link classification ──────────────────────────────────────────────
/// Return the social-network slug ("instagram", "facebook", ...) for a URL,
/// or "" when the URL is not a recognised social profile link.
///
/// Fix: the old code matched the bare substring "x.com", which classified
/// any host merely *containing* it (e.g. "linux.com", "wikifix.com") as
/// Twitter/X. X is now matched against the URL's host only.
static std::string classify_social(const std::string &url) {
  std::string lower = url;
  std::transform(lower.begin(), lower.end(), lower.begin(),
                 [](unsigned char c) { return std::tolower(c); });
  if (lower.find("instagram.com") != std::string::npos)
    return "instagram";
  if (lower.find("facebook.com") != std::string::npos)
    return "facebook";
  if (lower.find("linkedin.com") != std::string::npos)
    return "linkedin";
  if (lower.find("twitter.com") != std::string::npos)
    return "twitter";
  // x.com must match the host (or a subdomain of it), not any substring.
  auto host_of = [](const std::string &u) -> std::string {
    const auto proto = u.find("://");
    const size_t start = (proto == std::string::npos) ? 0 : proto + 3;
    const auto end = u.find_first_of("/?#", start);
    std::string host = (end == std::string::npos)
                           ? u.substr(start)
                           : u.substr(start, end - start);
    const auto at = host.rfind('@'); // strip userinfo if present
    if (at != std::string::npos)
      host = host.substr(at + 1);
    const auto colon = host.find(':'); // strip port
    if (colon != std::string::npos)
      host = host.substr(0, colon);
    return host;
  };
  const std::string host = host_of(lower);
  if (host == "x.com" ||
      (host.size() > 6 && host.compare(host.size() - 6, 6, ".x.com") == 0))
    return "twitter";
  if (lower.find("youtube.com") != std::string::npos)
    return "youtube";
  if (lower.find("tiktok.com") != std::string::npos)
    return "tiktok";
  if (lower.find("pinterest.com") != std::string::npos)
    return "pinterest";
  if (lower.find("github.com") != std::string::npos)
    return "github";
  return {};
}
// ── Same-origin check ───────────────────────────────────────────────────────
/// "scheme://host[:port]" prefix of `url`, or "" when no scheme is present.
static std::string get_origin(const std::string &url) {
  const auto scheme = url.find("://");
  if (scheme == std::string::npos)
    return {};
  const auto path_start = url.find('/', scheme + 3);
  if (path_start == std::string::npos)
    return url;
  return url.substr(0, path_start);
}
static bool is_same_origin(const std::string &base_url,
const std::string &href) {
auto bo = to_lower(get_origin(base_url));
auto ho = to_lower(get_origin(href));
if (bo.empty() || ho.empty())
return false;
// Strip www. for comparison
auto strip_www = [](std::string &s) {
auto pos = s.find("://www.");
if (pos != std::string::npos) {
s = s.substr(0, pos + 3) + s.substr(pos + 7);
}
};
strip_www(bo);
strip_www(ho);
return bo == ho;
}
// ── Contact page matching ───────────────────────────────────────────────────
static bool matches_contact_pattern(const std::string &url,
const std::vector<std::string> &patterns) {
auto lower = to_lower(url);
for (auto &pat : patterns) {
if (lower.find(to_lower(pat)) != std::string::npos)
return true;
}
return false;
}
// ── Shared HTML parsing logic for Meta ──────────────────────────────────────
// Build a SiteMeta from an already-fetched HTML document: extracts title /
// meta tags / canonical / JSON-LD, pulls emails out of the body text and
// mailto: links, and classifies outgoing links into socials vs same-origin
// internal pages. When `fetch_error` is non-empty, only http_status and
// fetch_error are populated and the body is ignored.
static SiteMeta parse_meta_html(const std::string &url, int http_status,
                                const std::string &html_body,
                                const std::string &fetch_error) {
  SiteMeta meta;
  meta.http_status = http_status;
  if (!fetch_error.empty()) {
    meta.fetch_error = fetch_error;
    return meta;
  }
  meta.body_html = html_body;
  // Parse with lexbor helpers
  meta.title = html::get_title(html_body);
  meta.description = html::get_meta(html_body, "description");
  meta.og_image = html::get_meta(html_body, "og:image");
  meta.canonical = html::get_canonical(html_body);
  meta.body_text = html::get_body_text(html_body);
  meta.json_ld = html::get_json_ld(html_body);
  // OG fallbacks
  if (meta.description.empty())
    meta.description = html::get_meta(html_body, "og:description");
  if (meta.title.empty())
    meta.title = html::get_meta(html_body, "og:title");
  // Links — classify into social / internal / mailto
  auto links = html::get_links(html_body);
  std::set<std::string> seen_pages;
  // Extract emails from body text (much smaller than raw HTML)
  meta.emails = extract_emails(meta.body_text);
  for (auto &lk : links) {
    // mailto: links get their address harvested directly.
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string email = lk.href.substr(7);
      // Strip anything after ? (like ?subject=...)
      auto q = email.find('?');
      if (q != std::string::npos)
        email = email.substr(0, q);
      // Clean it
      email = to_lower(email);
      if (is_likely_email(email)) {
        // De-dupe against emails already found in the body text.
        if (std::find(meta.emails.begin(), meta.emails.end(), email) ==
            meta.emails.end()) {
          meta.emails.push_back(email);
        }
      }
      continue;
    }
    auto resolved = resolve_url(url, lk.href);
    if (resolved.empty())
      continue;
    auto social = classify_social(resolved);
    if (!social.empty()) {
      meta.socials.push_back({social, resolved});
      continue;
    }
    // Only same-origin links count as crawlable internal pages.
    if (is_same_origin(url, resolved)) {
      // Strip fragment (#) from URL
      auto hash_pos = resolved.find('#');
      if (hash_pos != std::string::npos) {
        resolved = resolved.substr(0, hash_pos);
      }
      if (!resolved.empty() && seen_pages.insert(resolved).second) {
        meta.internal_pages.push_back(resolved);
      }
    }
  }
  return meta;
}
// ── scrape_meta ─────────────────────────────────────────────────────────────
// Fetch `url` with a desktop-Chrome user agent and parse the response into
// a SiteMeta. For status_code < 0 (presumably a transport-level failure —
// confirm against the http package) or HTTP >= 400, the response body is
// forwarded as the fetch error text.
SiteMeta scrape_meta(const std::string &url, int timeout_ms) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";
  auto resp = http::get(url, opts);
  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
  }
  return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                         fetch_err);
}
// ── scrape_emails_from_page ─────────────────────────────────────────────────
// Fetch one page and return every likely email found in its body text or
// mailto: links (lower-cased, de-duplicated). `out_status_code` receives
// the HTTP status of the fetch; fetch failures return an empty list.
// NOTE(review): the public header declares a two-parameter overload of this
// function (defaulted timeout) with no visible definition — confirm that
// external callers link against this three-parameter form.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";
  auto resp = http::get(url, opts);
  out_status_code = static_cast<int>(resp.status_code);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    return {};
  }
  // Extract body text then find emails
  auto text = html::get_body_text(resp.body);
  auto from_text = extract_emails(text);
  // Extract mailto: links from HTML directly without regexing the huge string
  auto links = html::get_links(resp.body);
  std::set<std::string> seen(from_text.begin(), from_text.end());
  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      // Drop any "?subject=..." suffix before validating.
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }
  return from_text;
}
// Pull the rendered-HTML payload out of a Scrapeless JSON response; the
// HTML is expected under the top-level "data" key. Falls back to returning
// the raw response body when the key is absent or empty.
static std::string extract_scrapeless_html(const std::string &json_body) {
  std::string data = json::get_string(json_body, "data");
  if (data.empty()) {
    return json_body; // Fallback to raw response if not found
  }
  return data;
}
/// Scrape site metadata through the Scrapeless Universal Scraping API (JS
/// rendering). Returns a SiteMeta whose fetch_error is set on failure; a
/// blank `api_key` short-circuits with a "missing api key" error result.
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key, int timeout_ms) {
  if (api_key.empty())
    return parse_meta_html(url, 0, "", "missing api key");
  // Escape quotes/backslashes before splicing the URL into the JSON body —
  // an unescaped character would corrupt (or inject into) the payload.
  auto json_escape = [](const std::string &s) {
    std::string out;
    out.reserve(s.size());
    for (char c : s) {
      if (c == '"' || c == '\\')
        out.push_back('\\');
      out.push_back(c);
    }
    return out;
  };
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        json_escape(url) +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
    logger::error("[meta:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  fetch_err);
    return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                           fetch_err);
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  return parse_meta_html(url, static_cast<int>(resp.status_code), rendered_html,
                         "");
}
/// Fetch `url` through the Scrapeless Universal Scraping API (JS rendering)
/// and extract emails from the rendered HTML. Returns {} when the key is
/// blank or the API call fails (silent fallback).
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms) {
  if (api_key.empty())
    return {};
  // Escape quotes/backslashes before splicing the URL into the JSON body —
  // an unescaped character would corrupt (or inject into) the payload.
  auto json_escape = [](const std::string &s) {
    std::string out;
    out.reserve(s.size());
    for (char c : s) {
      if (c == '"' || c == '\\')
        out.push_back('\\');
      out.push_back(c);
    }
    return out;
  };
  // Build the Scrapeless Universal Scraping API request body.
  // We ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        json_escape(url) +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    logger::error("[email:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  resp.body);
    return {}; // API error — silent fallback
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  // Parse and extract emails from the rendered HTML
  auto text = html::get_body_text(rendered_html);
  auto from_text = extract_emails(text);
  // Fast mailto extraction instead of HTML regex
  auto links = html::get_links(rendered_html);
  std::set<std::string> seen(from_text.begin(), from_text.end());
  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }
  return from_text;
}
// ── enrich_location ─────────────────────────────────────────────────────────
// Full enrichment pipeline for one location.
//
// Phase 1 scrapes the homepage (plain HTTP, or Scrapeless when
// cfg.meta_scraper == "SCRAPELESS" and a key is set) for metadata, social
// links, internal pages and any emails already visible; if emails are found
// the function returns early with status OK. Otherwise phase 2 selects
// internal pages matching cfg.contact_patterns and fetches up to
// cfg.email_max_pages of them in parallel (one thread per page), with an
// optional per-page Scrapeless fallback. Per-phase timings, per-page errors
// and the final EnrichStatus are recorded on the returned node.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg) {
  auto t0 = std::chrono::steady_clock::now();
  EnrichedNode node;
  node.title = loc.title;
  node.place_id = loc.place_id;
  node.website = loc.website;
  node.address = loc.address;
  node.type = loc.type;
  node.grid_area = loc.grid_area;
  node.grid_gid = loc.grid_gid;
  node.status = EnrichStatus::NO_EMAIL;
  // A location without a website cannot be enriched at all.
  if (loc.website.empty()) {
    node.status = EnrichStatus::FETCH_ERROR;
    node.error = "no website";
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // ── Phase 1: Meta scrape ────────────────────────────────────────────────
  auto meta_t0 = std::chrono::steady_clock::now();
  SiteMeta meta;
  bool meta_timed_out = false;
  try {
    if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) {
      logger::debug("[meta:scrapeless] Fetching " + loc.website);
      meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key,
                                    cfg.meta_timeout_ms);
    } else {
      logger::debug("[meta:http] Fetching " + loc.website);
      meta = scrape_meta(loc.website, cfg.meta_timeout_ms);
    }
  } catch (...) {
    meta.fetch_error = "exception during meta scrape";
    meta_timed_out = true;
  }
  node.meta_ms = elapsed_ms(meta_t0);
  // Check if meta took too long (within threshold of timeout)
  if (node.meta_ms >= cfg.meta_timeout_ms - 1000) {
    meta_timed_out = true;
  }
  // logger::info("[" + std::string(loc.title.empty() ? loc.website : loc.title)
  // + "] Meta fetch took " + std::to_string(node.meta_ms) + "ms. Links found: "
  // + std::to_string(meta.internal_pages.size()));
  if (!meta.body_text.empty())
    node.pages["home"] = meta.body_text;
  if (cfg.enable_homepage_md && !meta.body_html.empty()) {
    // Cap HTML body at 512 KB to prevent stack overflow in recursive html2md
    // parser
    static constexpr size_t MAX_HTML_BYTES = 512 * 1024;
    if (meta.body_html.size() > MAX_HTML_BYTES) {
      logger::warn("[" + loc.title + "] body_html too large (" +
                   std::to_string(meta.body_html.size() / 1024) +
                   " KB), skipping markdown conversion");
    } else {
      // Markdown conversion is best-effort: failures are logged, never fatal.
      try {
        node.sites[loc.website] = html::to_markdown(meta.body_html);
      } catch (const std::exception &e) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown failed: " + e.what());
      } catch (...) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown crashed (unknown exception)");
      }
    }
  }
  node.meta_pages = meta.internal_pages;
  node.pages_found = static_cast<int>(meta.internal_pages.size());
  node.socials = meta.socials;
  if (!meta.fetch_error.empty()) {
    node.error = meta.fetch_error;
    node.status = EnrichStatus::FETCH_ERROR;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // If meta already found emails, we're done (early exit like TS)
  if (!meta.emails.empty()) {
    node.emails = meta.emails;
    node.status = EnrichStatus::OK;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // ── Build contact page list ─────────────────────────────────────────────
  std::vector<std::string> contact_pages;
  std::set<std::string> seen_urls;
  for (auto &page_url : meta.internal_pages) {
    if (matches_contact_pattern(page_url, cfg.contact_patterns)) {
      if (seen_urls.insert(page_url).second) {
        contact_pages.push_back(page_url);
      }
    }
  }
  // No more probe paths. If we found 0 contact pages, we just give up or time
  // out.
  node.pages_found = static_cast<int>(contact_pages.size());
  if (contact_pages.empty()) {
    logger::debug("[" +
                  std::string(loc.title.empty() ? loc.website : loc.title) +
                  "] No contact pages found.");
    node.status =
        meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) +
                "] Contact pages to scrape: " +
                std::to_string(contact_pages.size()) + " (parallel)");
  // ── Phase 2: Email scrape per contact page ──────────────────────────────
  // One worker thread per page; each thread writes only its own slot of
  // contact_results, so no extra synchronization is needed.
  struct AsyncResult {
    std::string url;
    std::vector<PageError> errors;
    std::vector<std::string> emails;
    int ms;
  };
  int pages_to_scrape =
      std::min(static_cast<int>(contact_pages.size()), cfg.email_max_pages);
  std::vector<std::thread> contact_threads;
  std::vector<AsyncResult> contact_results(pages_to_scrape);
  auto email_t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < pages_to_scrape; ++i) {
    auto page_url = contact_pages[i];
    // cfg/loc/page_url are captured by value so the worker never dangles.
    contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() {
      auto start = std::chrono::steady_clock::now();
      AsyncResult res;
      res.url = page_url;
      PageError pe1;
      pe1.url = page_url;
      pe1.method = "GET";
      int http_status = 0;
      try {
        // logger::debug("[email:http] Fetching " + page_url);
        auto page_emails = scrape_emails_from_page(
            page_url, cfg.email_page_timeout_ms, http_status);
        pe1.emails = page_emails;
        logger::debug("[" +
                      std::string(loc.title.empty() ? loc.website : loc.title) +
                      "] HTTP fetch finished code " +
                      std::to_string(http_status) + " for " + page_url);
        if (page_emails.empty()) {
          if (http_status == 404 || http_status == 400 || http_status == 500) {
            pe1.status = "NOT_FOUND";
            pe1.error = "HTTP " + std::to_string(http_status);
          } else {
            pe1.status = "AXIOS_NO_EMAIL";
            res.errors.push_back(pe1); // pushed before scrapeless
            // Optional per-page Scrapeless fallback (JS-rendered fetch).
            if (cfg.meta_scraper == "SCRAPELESS" &&
                !cfg.scrapeless_key.empty()) {
              PageError pe2;
              pe2.url = page_url;
              pe2.method = "SCRAPELESS";
              try {
                logger::debug("[email:scrapeless] Fallback scraping " +
                              page_url);
                auto s_emails =
                    scrape_emails_scrapeless(page_url, cfg.scrapeless_key,
                                             cfg.email_page_timeout_ms + 5000);
                pe2.emails = s_emails;
                pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL";
                if (!s_emails.empty())
                  res.emails = s_emails;
                logger::debug(
                    "[" +
                    std::string(loc.title.empty() ? loc.website : loc.title) +
                    "] Scrapeless fallback finished for " + page_url);
              } catch (...) {
                pe2.status = "FAILED";
                pe2.error = "scrapeless exception";
              }
              res.errors.push_back(pe2);
            }
            // Early exit for this worker: pe1 (and maybe pe2) already pushed.
            res.ms = elapsed_ms(start);
            contact_results[i] = res;
            return;
          }
        } else {
          pe1.status = "SEARCHED_EMAIL";
          res.emails = page_emails;
        }
      } catch (...) {
        pe1.status = "AXIOS_FAILED";
        pe1.error = "exception";
      }
      // Only insert pe1 if we didn't already push it during fallback
      if (res.errors.empty() || res.errors[0].method != "GET") {
        res.errors.insert(res.errors.begin(), pe1);
      }
      res.ms = elapsed_ms(start);
      contact_results[i] = res;
    });
  }
  for (auto &t : contact_threads) {
    if (t.joinable())
      t.join();
  }
  // Collect results: flatten per-page errors, de-dupe emails via a set.
  std::set<std::string> all_emails;
  int pages_scraped = 0;
  for (auto &res : contact_results) {
    pages_scraped++;
    for (auto &pe : res.errors) {
      node.page_errors.push_back(std::move(pe));
    }
    for (auto &e : res.emails) {
      all_emails.insert(e);
    }
  }
  node.email_ms = elapsed_ms(email_t0);
  node.pages_scraped = pages_scraped;
  // Merge emails
  node.emails.assign(all_emails.begin(), all_emails.end());
  // Final status
  bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000;
  if (!node.emails.empty()) {
    node.status = EnrichStatus::OK;
  } else if (email_timed_out) {
    node.status = EnrichStatus::EMAIL_TIMEOUT;
  } else if (meta_timed_out) {
    node.status = EnrichStatus::META_TIMEOUT;
  } else {
    node.status = EnrichStatus::NO_EMAIL;
  }
  node.total_ms = elapsed_ms(t0);
  return node;
}
} // namespace enrichers

View File

@ -1,6 +0,0 @@
# Static library that loads cached GADM boundary GeoJSON into gadm::Feature.
add_library(gadm_reader STATIC src/gadm_reader.cpp)
target_include_directories(gadm_reader PUBLIC include)
# Depends on geo (for Coord type) and json (for RapidJSON)
target_link_libraries(gadm_reader PUBLIC geo json)

View File

@ -1,75 +0,0 @@
#pragma once
#include "geo/geo.h"
#include <array>
#include <string>
#include <vector>
namespace gadm {

// ── Feature (mirrors TS GridFeature) ────────────────────────────────────────
// One administrative unit parsed from a cached GADM GeoJSON file.
struct Feature {
  std::string gid;  // e.g. "ABW", "AFG.1.1_1"
  std::string name; // e.g. "Aruba", "Baharak"
  int level = 0;    // GADM admin level
  // Outer ring + holes (MultiPolygon flattened to rings)
  std::vector<std::vector<geo::Coord>> rings;
  // Bounding box (computed from rings)
  geo::BBox bbox;
  // GHS enrichment (parsed from cached JSON)
  double ghsPopulation = 0;
  double ghsBuiltWeight = 0;
  double ghsPopMaxDensity = 0;
  double ghsBuiltMax = 0;
  geo::Coord ghsPopCenter;
  geo::Coord ghsBuiltCenter;
  // Weighted centers: [lon, lat, weight]
  std::vector<std::array<double, 3>> ghsPopCenters;
  std::vector<std::array<double, 3>> ghsBuiltCenters;
  // Computed from geometry
  double areaSqKm = 0;
  bool isOuter = true;
};

// ── Result ──────────────────────────────────────────────────────────────────
// Carrier for the load functions: `features` is populated on success,
// `error` holds the failure reason otherwise.
struct BoundaryResult {
  std::vector<Feature> features;
  std::string error; // empty on success
};

// ── API ─────────────────────────────────────────────────────────────────────
/// Load a pre-cached GADM boundary file.
///
/// Tries these file paths in order:
///   1. cacheDir/boundary_{gid}_{targetLevel}.json
///   2. cacheDir/boundary_{countryCode}_{targetLevel}.json (fallback for country-level)
///
/// Returns a BoundaryResult with parsed features or an error string.
BoundaryResult load_boundary(
    const std::string& gid,
    int targetLevel,
    const std::string& cacheDir = "cache/gadm"
);

/// Load a boundary file directly by path.
BoundaryResult load_boundary_file(const std::string& filepath);

/// Extract the ISO country code from a GID (e.g. "AFG.1.1_1" → "AFG").
std::string country_code(const std::string& gid);

/// Infer the GADM level from a GID string.
/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2, etc.
int infer_level(const std::string& gid);

} // namespace gadm

View File

@ -1,231 +0,0 @@
#include "gadm_reader/gadm_reader.h"
#include <algorithm>
#include <fstream>
#include <sstream>
#include <rapidjson/document.h>
namespace gadm {
// ── Helpers ─────────────────────────────────────────────────────────────────
/// ISO country-code prefix of a GID: "AFG.1.1_1" → "AFG"; "ABW" → "ABW".
std::string country_code(const std::string& gid) {
  const auto first_dot = gid.find('.');
  if (first_dot == std::string::npos) return gid;
  return gid.substr(0, first_dot);
}
/// GADM admin level inferred from dot count: "ABW" → 0, "AFG.1_1" → 1,
/// "AFG.1.1_1" → 2.
int infer_level(const std::string& gid) {
  return static_cast<int>(std::count(gid.begin(), gid.end(), '.'));
}
/// Slurp a whole file into a string; returns "" when it can't be opened.
static std::string read_file(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  if (!in.is_open()) return "";
  std::ostringstream buf;
  buf << in.rdbuf();
  return buf.str();
}
/// Parse a coord array [lon, lat] → geo::Coord
/// Returns a default-constructed Coord when `arr` is not an array of at
/// least two elements. NOTE(review): assumes both elements are numeric —
/// confirm inputs, since rapidjson's GetDouble() has that precondition.
static geo::Coord parse_coord(const rapidjson::Value& arr) {
  if (arr.IsArray() && arr.Size() >= 2) {
    return {arr[0].GetDouble(), arr[1].GetDouble()};
  }
  return {};
}
/// Parse a ring array [[lon,lat], [lon,lat], ...] → vector<Coord>
/// Non-array input yields an empty ring; malformed elements become
/// default-constructed Coords (see parse_coord).
static std::vector<geo::Coord> parse_ring(const rapidjson::Value& arr) {
  std::vector<geo::Coord> ring;
  if (!arr.IsArray()) return ring;
  ring.reserve(arr.Size());
  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    ring.push_back(parse_coord(arr[i]));
  }
  return ring;
}
/// Parse weighted centers [[lon, lat, weight], ...]
/// Entries that are not arrays of at least three elements are skipped
/// silently.
static std::vector<std::array<double, 3>> parse_weighted_centers(
    const rapidjson::Value& arr) {
  std::vector<std::array<double, 3>> centers;
  if (!arr.IsArray()) return centers;
  centers.reserve(arr.Size());
  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    const auto& c = arr[i];
    if (c.IsArray() && c.Size() >= 3) {
      centers.push_back({c[0].GetDouble(), c[1].GetDouble(), c[2].GetDouble()});
    }
  }
  return centers;
}
/// Read a numeric property, returning `fallback` when the key is absent
/// or the value is not a number.
static double get_double(const rapidjson::Value& props, const char* key,
                         double fallback = 0.0) {
  const auto it = props.FindMember(key);
  if (it != props.MemberEnd() && it->value.IsNumber()) {
    return it->value.GetDouble();
  }
  return fallback;
}
/// Read a boolean property, returning `fallback` (true by default) when
/// the key is absent or the value is not a bool.
static bool get_bool(const rapidjson::Value& props, const char* key,
                     bool fallback = true) {
  const auto it = props.FindMember(key);
  if (it != props.MemberEnd() && it->value.IsBool()) {
    return it->value.GetBool();
  }
  return fallback;
}
/// Return the most specific GID_<n> string property present, scanning
/// from GID_5 down to GID_0; "" when none is set.
static std::string get_gid(const rapidjson::Value& props) {
  for (int lvl = 5; lvl >= 0; --lvl) {
    const std::string key = "GID_" + std::to_string(lvl);
    const auto it = props.FindMember(key.c_str());
    if (it != props.MemberEnd() && it->value.IsString()) {
      return it->value.GetString();
    }
  }
  return "";
}
/// Return the most specific NAME_<n> string property present, scanning
/// from NAME_5 down to NAME_0; "" when none is set.
static std::string get_name(const rapidjson::Value& props) {
  for (int lvl = 5; lvl >= 0; --lvl) {
    const std::string key = "NAME_" + std::to_string(lvl);
    const auto it = props.FindMember(key.c_str());
    if (it != props.MemberEnd() && it->value.IsString()) {
      return it->value.GetString();
    }
  }
  return "";
}
/// Parse a single GeoJSON Feature object into a gadm::Feature.
/// Reads GADM identity (GID/NAME), precomputed GHS statistics and center
/// points from "properties", flattens the geometry into a list of rings,
/// and derives bbox/area from the first ring.
static Feature parse_feature(const rapidjson::Value& feat) {
  Feature f;
  // Properties
  if (feat.HasMember("properties") && feat["properties"].IsObject()) {
    const auto& props = feat["properties"];
    f.gid = get_gid(props);        // most specific GID_<n> present
    f.name = get_name(props);      // most specific NAME_<n> present
    f.level = infer_level(f.gid);  // level = number of '.' in the GID
    f.ghsPopulation = get_double(props, "ghsPopulation");
    f.ghsBuiltWeight = get_double(props, "ghsBuiltWeight");
    f.ghsPopMaxDensity = get_double(props, "ghsPopMaxDensity");
    f.ghsBuiltMax = get_double(props, "ghsBuiltMax");
    f.isOuter = get_bool(props, "isOuter");  // defaults to true when absent
    if (props.HasMember("ghsPopCenter") && props["ghsPopCenter"].IsArray()) {
      f.ghsPopCenter = parse_coord(props["ghsPopCenter"]);
    }
    if (props.HasMember("ghsBuiltCenter") && props["ghsBuiltCenter"].IsArray()) {
      f.ghsBuiltCenter = parse_coord(props["ghsBuiltCenter"]);
    }
    if (props.HasMember("ghsPopCenters") && props["ghsPopCenters"].IsArray()) {
      f.ghsPopCenters = parse_weighted_centers(props["ghsPopCenters"]);
    }
    if (props.HasMember("ghsBuiltCenters") && props["ghsBuiltCenters"].IsArray()) {
      f.ghsBuiltCenters = parse_weighted_centers(props["ghsBuiltCenters"]);
    }
  }
  // Geometry: Polygon and MultiPolygon are flattened into one ring list;
  // outer rings and holes are NOT distinguished after this point.
  if (feat.HasMember("geometry") && feat["geometry"].IsObject()) {
    const auto& geom = feat["geometry"];
    std::string gtype;
    if (geom.HasMember("type") && geom["type"].IsString()) {
      gtype = geom["type"].GetString();
    }
    if (geom.HasMember("coordinates") && geom["coordinates"].IsArray()) {
      const auto& coords = geom["coordinates"];
      if (gtype == "Polygon") {
        // coordinates: [ [ring], [hole], ... ]
        for (rapidjson::SizeType r = 0; r < coords.Size(); ++r) {
          f.rings.push_back(parse_ring(coords[r]));
        }
      } else if (gtype == "MultiPolygon") {
        // coordinates: [ [ [ring], [hole] ], [ [ring] ], ... ]
        for (rapidjson::SizeType p = 0; p < coords.Size(); ++p) {
          if (coords[p].IsArray()) {
            for (rapidjson::SizeType r = 0; r < coords[p].Size(); ++r) {
              f.rings.push_back(parse_ring(coords[p][r]));
            }
          }
        }
      }
      // Other geometry types (Point, LineString, ...) are ignored.
    }
  }
  // Compute bbox and area from first ring (outer boundary)
  // NOTE(review): for MultiPolygon this covers only the first polygon's
  // outer ring — confirm that is intended for multi-part areas.
  if (!f.rings.empty() && !f.rings[0].empty()) {
    f.bbox = geo::bbox(f.rings[0]);
    f.areaSqKm = geo::area_sq_km(f.rings[0]);
  }
  return f;
}
// ── Public API ──────────────────────────────────────────────────────────────
/// Load and parse one boundary GeoJSON file (expected: a FeatureCollection).
/// On any failure `result.error` is set and `result.features` stays empty.
BoundaryResult load_boundary_file(const std::string& filepath) {
  BoundaryResult result;
  const std::string body = read_file(filepath);
  if (body.empty()) {
    result.error = "Failed to read file: " + filepath;
    return result;
  }
  rapidjson::Document doc;
  doc.Parse(body.c_str());
  if (doc.HasParseError()) {
    result.error = "JSON parse error in: " + filepath;
    return result;
  }
  // A FeatureCollection must carry a top-level "features" array.
  if (!doc.HasMember("features") || !doc["features"].IsArray()) {
    result.error = "Missing 'features' array in: " + filepath;
    return result;
  }
  const auto& feats = doc["features"];
  result.features.reserve(feats.Size());
  for (const auto& entry : feats.GetArray()) {
    result.features.push_back(parse_feature(entry));
  }
  return result;
}
/// Locate and load the cached boundary file for a GID at a target level.
/// Tries cacheDir/{CC}/boundary_{gid}_{level}.json first, then the flat
/// layout cacheDir/boundary_{gid}_{level}.json; sets an error when neither
/// candidate loads cleanly.
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
                             const std::string& cacheDir) {
  const std::string cc = country_code(gid);
  const std::string filename =
      "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";
  // Candidate paths, most specific layout first.
  const std::string candidates[] = {
      cacheDir + "/" + cc + "/" + filename,  // per-country subdirectory
      cacheDir + "/" + filename,             // flat fallback
  };
  BoundaryResult result;
  for (const auto& path : candidates) {
    result = load_boundary_file(path);
    if (result.error.empty()) {
      return result;
    }
  }
  // Both candidates failed.
  result.error = "No boundary file found for gid=" + gid + " level=" +
                 std::to_string(targetLevel) + " in " + cacheDir;
  return result;
}
} // namespace gadm

View File

@ -1,5 +0,0 @@
# geo: standalone static library of geographic math helpers.
add_library(geo STATIC src/geo.cpp)
target_include_directories(geo PUBLIC include)
# No external dependencies — pure math

View File

@ -1,100 +0,0 @@
#pragma once
#include <array>
#include <cmath>
#include <vector>
namespace geo {
// ── Constants ───────────────────────────────────────────────────────────────
constexpr double EARTH_RADIUS_KM = 6371.0;
constexpr double PI = 3.14159265358979323846;
constexpr double DEG2RAD = PI / 180.0;
constexpr double RAD2DEG = 180.0 / PI;
// ── Core types ──────────────────────────────────────────────────────────────
/// A WGS84 position: longitude/latitude in decimal degrees.
struct Coord {
  double lon = 0;
  double lat = 0;
};

/// Axis-aligned geographic bounding box in decimal degrees.
struct BBox {
  double minLon = 0;
  double minLat = 0;
  double maxLon = 0;
  double maxLat = 0;

  /// Midpoint of the box (arithmetic mean of the two corners).
  Coord center() const {
    return {0.5 * (minLon + maxLon), 0.5 * (minLat + maxLat)};
  }
  /// Extent along the longitude axis, in degrees.
  double width_deg() const { return maxLon - minLon; }
  /// Extent along the latitude axis, in degrees.
  double height_deg() const { return maxLat - minLat; }
};
// ── Distance ────────────────────────────────────────────────────────────────
/// Haversine distance between two WGS84 points, in kilometers.
double distance_km(Coord a, Coord b);
/// Haversine distance in meters (thin convenience wrapper over distance_km).
inline double distance_m(Coord a, Coord b) { return distance_km(a, b) * 1000.0; }
// ── Bounding box ────────────────────────────────────────────────────────────
/// Compute the bounding box of a polygon ring.
BBox bbox(const std::vector<Coord>& ring);
/// Compute the bounding box that covers all features' rings.
BBox bbox_union(const std::vector<BBox>& boxes);
// ── Centroid ────────────────────────────────────────────────────────────────
/// Geometric centroid of a polygon ring (simple average method).
Coord centroid(const std::vector<Coord>& ring);
// ── Area ────────────────────────────────────────────────────────────────────
/// Approximate area of a polygon ring in square meters.
/// Uses the Shoelace formula with latitude cosine correction.
double area_sq_m(const std::vector<Coord>& ring);
/// Area in square kilometers (1 km² = 1e6 m²); see area_sq_m for the method.
inline double area_sq_km(const std::vector<Coord>& ring) {
  return area_sq_m(ring) / 1e6;
}
// ── Point-in-polygon ────────────────────────────────────────────────────────
/// Ray-casting point-in-polygon test.
/// Same algorithm as gadm/cpp pip.h but using Coord structs.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);
// ── Bearing & destination ───────────────────────────────────────────────────
/// Initial bearing from a to b, in degrees (0 = north, 90 = east).
double bearing_deg(Coord from, Coord to);
/// Compute the destination point given start, bearing (degrees), and distance (km).
Coord destination(Coord from, double bearing_deg, double distance_km);
// ── Grid tessellation ───────────────────────────────────────────────────────
/// Generate a flat square grid of cell centers over a bbox.
/// cellSizeKm defines the side length of each square cell.
/// Returns center coordinates of each cell.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm);
/// Generate a flat hex grid of cell centers over a bbox.
/// cellSizeKm defines the distance between hex centers.
/// Returns center coordinates of each cell.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm);
// ── Viewport estimation (matches TS estimateViewportAreaSqKm) ──────────────
/// Estimate the km² visible in a viewport at a given lat/zoom.
double estimate_viewport_sq_km(double lat, int zoom,
int widthPx = 1024, int heightPx = 768);
} // namespace geo

View File

@ -1,204 +0,0 @@
#include "geo/geo.h"
#include <algorithm>
#include <cmath>
namespace geo {
// ── Distance (Haversine) ────────────────────────────────────────────────────
/// Great-circle (haversine) distance between two WGS84 points, in km.
double distance_km(Coord a, Coord b) {
  const double phi1 = a.lat * DEG2RAD;
  const double phi2 = b.lat * DEG2RAD;
  const double dPhi = (b.lat - a.lat) * DEG2RAD;
  const double dLam = (b.lon - a.lon) * DEG2RAD;
  const double sp = std::sin(dPhi / 2.0);
  const double sl = std::sin(dLam / 2.0);
  // Haversine term: sin²(Δφ/2) + cosφ1·cosφ2·sin²(Δλ/2)
  const double h = sp * sp + std::cos(phi1) * std::cos(phi2) * sl * sl;
  return 2.0 * EARTH_RADIUS_KM * std::asin(std::sqrt(h));
}
// ── Bounding box ────────────────────────────────────────────────────────────
/// Tight bounding box of a polygon ring; an empty ring yields a zeroed box.
BBox bbox(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  BBox box{ring.front().lon, ring.front().lat,
           ring.front().lon, ring.front().lat};
  for (const auto& c : ring) {
    box.minLon = std::min(box.minLon, c.lon);
    box.minLat = std::min(box.minLat, c.lat);
    box.maxLon = std::max(box.maxLon, c.lon);
    box.maxLat = std::max(box.maxLat, c.lat);
  }
  return box;
}
/// Smallest box covering every input box; empty input yields a zeroed box.
BBox bbox_union(const std::vector<BBox>& boxes) {
  if (boxes.empty()) return {};
  BBox acc = boxes.front();
  for (const auto& b : boxes) {
    acc.minLon = std::min(acc.minLon, b.minLon);
    acc.minLat = std::min(acc.minLat, b.minLat);
    acc.maxLon = std::max(acc.maxLon, b.maxLon);
    acc.maxLat = std::max(acc.maxLat, b.maxLat);
  }
  return acc;
}
// ── Centroid ────────────────────────────────────────────────────────────────
/// Simple average-of-vertices centroid of a ring. A closing vertex equal
/// to the first one is ignored so it is not counted twice.
Coord centroid(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  size_t count = ring.size();
  const bool closed = count > 1 &&
                      ring.front().lon == ring.back().lon &&
                      ring.front().lat == ring.back().lat;
  if (closed) {
    --count;
  }
  double lonSum = 0, latSum = 0;
  for (size_t i = 0; i < count; ++i) {
    lonSum += ring[i].lon;
    latSum += ring[i].lat;
  }
  const double n = static_cast<double>(count);
  return {lonSum / n, latSum / n};
}
// ── Area (Shoelace + latitude cosine correction) ────────────────────────────
/// Approximate polygon area in m²: Shoelace formula on a local
/// equirectangular projection with a per-edge latitude-cosine correction.
/// NOTE(review): each edge is projected with its own average-latitude
/// cosine, so edges use slightly different projections — fine for small
/// polygons; confirm accuracy is acceptable for very large ones.
double area_sq_m(const std::vector<Coord>& ring) {
  if (ring.size() < 3) return 0.0;  // fewer than 3 vertices is not a polygon
  // Shoelace formula in projected coordinates.
  // Each degree of longitude = cos(lat) * 111320 meters at that latitude.
  // Each degree of latitude = 110540 meters (approximate).
  double sum = 0.0;
  size_t n = ring.size();
  for (size_t i = 0; i < n; ++i) {
    size_t j = (i + 1) % n;  // wrap around to close the ring
    // Convert coordinates to approximate meters using the average latitude
    double avgLat = (ring[i].lat + ring[j].lat) / 2.0;
    double cosLat = std::cos(avgLat * DEG2RAD);
    double x_i = ring[i].lon * cosLat * 111320.0;
    double y_i = ring[i].lat * 110540.0;
    double x_j = ring[j].lon * cosLat * 111320.0;
    double y_j = ring[j].lat * 110540.0;
    sum += x_i * y_j - x_j * y_i;
  }
  // abs() makes the result independent of ring winding order.
  return std::abs(sum) / 2.0;
}
// ── Point-in-polygon (ray casting) ──────────────────────────────────────────
/// Ray-casting point-in-polygon test (even–odd rule).
/// Points exactly on an edge may be classified either way; an empty ring
/// returns false (the loop body never executes, so the n-1 wrap on a
/// size_t is harmless).
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring) {
  bool inside = false;
  size_t n = ring.size();
  // Walk each edge (j = previous vertex, i = current vertex) and toggle
  // `inside` each time an eastward ray from pt crosses the edge.
  for (size_t i = 0, j = n - 1; i < n; j = i++) {
    double xi = ring[i].lon, yi = ring[i].lat;
    double xj = ring[j].lon, yj = ring[j].lat;
    // Edge straddles pt's latitude AND the edge's longitude at that
    // latitude lies east of pt.
    if (((yi > pt.lat) != (yj > pt.lat)) &&
        (pt.lon < (xj - xi) * (pt.lat - yi) / (yj - yi) + xi)) {
      inside = !inside;
    }
  }
  return inside;
}
// ── Bearing ─────────────────────────────────────────────────────────────────
/// Initial great-circle bearing from `from` to `to`, in degrees within
/// [0, 360): 0 = north, 90 = east.
double bearing_deg(Coord from, Coord to) {
  const double dLam = (to.lon - from.lon) * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double phi2 = to.lat * DEG2RAD;
  const double y = std::sin(dLam) * std::cos(phi2);
  const double x = std::cos(phi1) * std::sin(phi2) -
                   std::sin(phi1) * std::cos(phi2) * std::cos(dLam);
  // atan2 gives (-180, 180]; shift into [0, 360).
  return std::fmod(std::atan2(y, x) * RAD2DEG + 360.0, 360.0);
}
// ── Destination point ───────────────────────────────────────────────────────
/// Destination point reached from `from` after travelling `dist_km` along
/// the initial bearing `brng_deg` (degrees) on a spherical Earth.
Coord destination(Coord from, double brng_deg, double dist_km) {
  const double theta = brng_deg * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double lam1 = from.lon * DEG2RAD;
  const double delta = dist_km / EARTH_RADIUS_KM;  // angular distance
  const double phi2 =
      std::asin(std::sin(phi1) * std::cos(delta) +
                std::cos(phi1) * std::sin(delta) * std::cos(theta));
  const double lam2 =
      lam1 + std::atan2(std::sin(theta) * std::sin(delta) * std::cos(phi1),
                        std::cos(delta) - std::sin(phi1) * std::sin(phi2));
  return {lam2 * RAD2DEG, phi2 * RAD2DEG};
}
// ── Square grid ─────────────────────────────────────────────────────────────
/// Centers of a flat square grid tiling `extent`, with cellSizeKm per side.
/// Cell degree sizes are fixed at the extent's mid-latitude.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> out;
  if (cellSizeKm <= 0) return out;
  // Convert the cell size to degrees at the center latitude.
  const double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;  // clamp near the poles
  const double stepLat = cellSizeKm / 110.574;            // ~110.574 km per degree lat
  const double stepLon = cellSizeKm / (111.320 * cosLat); // longitude shrinks with latitude
  for (double lat = extent.minLat + stepLat / 2.0;
       lat < extent.maxLat; lat += stepLat) {
    for (double lon = extent.minLon + stepLon / 2.0;
         lon < extent.maxLon; lon += stepLon) {
      out.push_back({lon, lat});
    }
  }
  return out;
}
// ── Hex grid ────────────────────────────────────────────────────────────────
/// Centers of a flat hex grid covering `extent`; cellSizeKm is the
/// horizontal distance between neighboring centers. Odd rows are shifted
/// by half a cell to produce the hexagonal packing.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;
  double centerLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(centerLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;  // guard against division by zero at the poles
  // Hex spacing: horizontal = cellSize, vertical = cellSize * sqrt(3)/2
  double cellLatDeg = cellSizeKm / 110.574;            // ~km per degree latitude
  double cellLonDeg = cellSizeKm / (111.320 * cosLat); // longitude correction
  double rowHeight = cellLatDeg * std::sqrt(3.0) / 2.0;
  int row = 0;
  for (double lat = extent.minLat + rowHeight / 2.0;
       lat < extent.maxLat; lat += rowHeight) {
    // Offset every other row by half the cell width
    double lonOffset = (row % 2 == 1) ? cellLonDeg / 2.0 : 0.0;
    for (double lon = extent.minLon + cellLonDeg / 2.0 + lonOffset;
         lon < extent.maxLon; lon += cellLonDeg) {
      centers.push_back({lon, lat});
    }
    row++;
  }
  return centers;
}
// ── Viewport estimation ─────────────────────────────────────────────────────
/// Estimate the km² visible in a web-mercator viewport at a latitude/zoom.
/// 156543.03392 m/px is the mercator ground resolution at zoom 0.
double estimate_viewport_sq_km(double lat, int zoom, int widthPx, int heightPx) {
  const double mPerPx =
      (156543.03392 * std::cos(lat * DEG2RAD)) / std::pow(2.0, zoom);
  const double widthKm = (widthPx * mPerPx) / 1000.0;
  const double heightKm = (heightPx * mPerPx) / 1000.0;
  return widthKm * heightKm;
}
} // namespace geo

View File

@ -1,6 +0,0 @@
# grid: waypoint/grid generation built on the geo math helpers.
add_library(grid STATIC src/grid.cpp)
target_include_directories(grid PUBLIC include)
# Depends on geo (math) and gadm_reader (Feature type)
target_link_libraries(grid PUBLIC geo gadm_reader)

View File

@ -1,56 +0,0 @@
#pragma once
#include "geo/geo.h"
#include "gadm_reader/gadm_reader.h"
#include <functional>
#include <string>
#include <vector>
namespace grid {
// ── Types (mirror TS GridSearchHop) ─────────────────────────────────────────
/// One search hop: a circle (center + radius) tagged with its admin area.
struct Waypoint {
  int step = 0;          // 1-based position in the final ordered path
  double lng = 0;        // center longitude, degrees
  double lat = 0;        // center latitude, degrees
  double radius_km = 0;  // covering radius of this hop
  std::string area_gid;  // GADM GID of the containing admin area
  std::string area_name; // human-readable area name
};

/// Tuning knobs for grid generation (defaults mirror the TS implementation).
struct GridOptions {
  std::string gridMode = "hex"; // "hex", "square", "admin", "centers"
  double cellSize = 5.0; // km
  double cellOverlap = 0.0;
  double centroidOverlap = 0.5;     // fraction of cellSize centers may overlap ("centers" mode)
  int maxCellsLimit = 15000;        // hard cap on the estimated cell count
  double maxElevation = 0;
  double minDensity = 0;
  double minGhsPop = 0;             // 0 disables the GHS population filter
  double minGhsBuilt = 0;           // 0 disables the GHS built-up filter
  std::string ghsFilterMode = "AND"; // "AND" | "OR"
  bool allowMissingGhs = false;     // treat missing (0) GHS values as passing
  bool bypassFilters = false;       // skip all cell filters
  std::string pathOrder = "snake"; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
  bool groupByRegion = true;        // order waypoints per admin region first
};

/// Output of generate(): ordered waypoints plus accept/skip counters.
struct GridResult {
  std::vector<Waypoint> waypoints;
  int validCells = 0;   // cells accepted by the filters
  int skippedCells = 0; // cells rejected by filters/overlap checks
  std::string error;    // non-empty when generation failed
};
// ── API ─────────────────────────────────────────────────────────────────────
/// Generate grid waypoints from GADM features + options.
/// This is the main entry point — equivalent to generateGridSearchCells() in TS.
GridResult generate(
const std::vector<gadm::Feature>& features,
const GridOptions& opts
);
} // namespace grid

View File

@ -1,393 +0,0 @@
#include "grid/grid.h"
#include <algorithm>
#include <cmath>
#include <map>
#include <unordered_map>
namespace grid {
// ── Internal types ──────────────────────────────────────────────────────────
// Per-cell bookkeeping for grid generation.
// NOTE(review): not referenced by any function visible in this file —
// possibly a leftover; confirm before removing.
struct CellInfo {
  geo::Coord center;   // cell center (lon/lat)
  double radius_km;    // covering radius of the cell
  int region_idx;      // index into the input feature list
  bool allowed;        // whether the cell passed the filters
  std::string reason;  // skip reason when !allowed
};
// ── Filter logic (mirrors checkCellFilters in TS) ───────────────────────────
/// Decide whether a feature passes the configured GHS filters; on rejection
/// `reason` is set to a human-readable explanation.
/// `areaSqKm` is accepted for interface stability but currently unused.
static bool check_filters(const gadm::Feature& feat, const GridOptions& opts,
                          double areaSqKm, std::string& reason) {
  if (opts.bypassFilters) return true;
  const bool checkPop = opts.minGhsPop > 0;
  const bool checkBuilt = opts.minGhsBuilt > 0;
  if (!checkPop && !checkBuilt) return true;  // no GHS filter configured
  // A zero metric counts as "missing" and passes when allowMissingGhs is set.
  const double pop = feat.ghsPopulation;
  const double built = feat.ghsBuiltWeight;
  const bool popPass =
      checkPop && ((pop == 0 && opts.allowMissingGhs) || pop >= opts.minGhsPop);
  const bool builtPass =
      checkBuilt &&
      ((built == 0 && opts.allowMissingGhs) || built >= opts.minGhsBuilt);
  if (opts.ghsFilterMode == "OR") {
    // OR mode: reject only when both active filters fail; with a single
    // active filter, that filter alone decides.
    if (checkPop && checkBuilt && !popPass && !builtPass) {
      reason = "GHS (OR) below thresholds";
      return false;
    }
    if (checkPop && !checkBuilt && !popPass) {
      reason = "GHS Pop below threshold";
      return false;
    }
    if (checkBuilt && !checkPop && !builtPass) {
      reason = "GHS Built below threshold";
      return false;
    }
    return true;
  }
  // AND mode: every active filter must pass.
  if (checkPop && !popPass) {
    reason = "GHS Pop below threshold";
    return false;
  }
  if (checkBuilt && !builtPass) {
    reason = "GHS Built below threshold";
    return false;
  }
  return true;
}
// ── Sorting ─────────────────────────────────────────────────────────────────
/// Reorder waypoints in place according to pathOrder.
/// Rows are bucketed using a latitude tolerance derived from the cell size
/// (capped at 0.5°) so slightly staggered hex rows still group together.
/// Unknown pathOrder values leave the input order unchanged.
static void sort_waypoints(std::vector<Waypoint>& wps, const std::string& pathOrder,
                           double cellSize) {
  if (wps.size() <= 1) return;
  double rowTolerance = std::min((cellSize / 111.32) * 0.5, 0.5);
  if (pathOrder == "zigzag" || pathOrder == "snake") {
    // Sort top-to-bottom, left-to-right within row tolerance
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      if (std::abs(a.lat - b.lat) > rowTolerance) {
        return b.lat < a.lat; // higher lat first (north to south)
      }
      return a.lng < b.lng; // left to right
    });
    if (pathOrder == "snake") {
      // Group into rows, reverse every other row
      std::vector<std::vector<Waypoint>> rows;
      std::vector<Waypoint> currentRow;
      double lastY = wps[0].lat;
      for (auto& wp : wps) {
        if (std::abs(wp.lat - lastY) > rowTolerance) {
          // A new row starts: flush the accumulated one.
          rows.push_back(std::move(currentRow));
          currentRow.clear();
          lastY = wp.lat;
        }
        currentRow.push_back(wp);
      }
      if (!currentRow.empty()) rows.push_back(std::move(currentRow));
      // Rebuild the list row by row, alternating direction.
      wps.clear();
      for (size_t i = 0; i < rows.size(); ++i) {
        if (i % 2 == 1) std::reverse(rows[i].begin(), rows[i].end());
        for (auto& wp : rows[i]) wps.push_back(std::move(wp));
      }
    }
  } else if (pathOrder == "spiral-out" || pathOrder == "spiral-in") {
    // Sort by distance from center of all waypoints
    double cLon = 0, cLat = 0;
    for (const auto& wp : wps) { cLon += wp.lng; cLat += wp.lat; }
    cLon /= wps.size();
    cLat /= wps.size();
    geo::Coord center{cLon, cLat};
    // spiral-out: nearest-to-center first; spiral-in: farthest first.
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      double dA = geo::distance_km(center, {a.lng, a.lat});
      double dB = geo::distance_km(center, {b.lng, b.lat});
      return (pathOrder == "spiral-out") ? (dA < dB) : (dA > dB);
    });
  } else if (pathOrder == "shortest") {
    // Greedy nearest-neighbor tour starting from the first waypoint.
    // Distances are compared in squared degrees — sufficient for ranking
    // nearby candidates without a sqrt or great-circle call.
    std::vector<Waypoint> sorted;
    sorted.reserve(wps.size());
    std::vector<bool> used(wps.size(), false);
    sorted.push_back(wps[0]);
    used[0] = true;
    for (size_t step = 1; step < wps.size(); ++step) {
      const auto& cur = sorted.back();
      double bestDist = 1e18;
      size_t bestIdx = 0;
      for (size_t i = 0; i < wps.size(); ++i) {
        if (used[i]) continue;
        double dx = wps[i].lng - cur.lng;
        double dy = wps[i].lat - cur.lat;
        double distSq = dx * dx + dy * dy;
        if (distSq < bestDist) {
          bestDist = distSq;
          bestIdx = i;
        }
      }
      sorted.push_back(wps[bestIdx]);
      used[bestIdx] = true;
    }
    wps = std::move(sorted);
  }
}
// ── Admin mode ──────────────────────────────────────────────────────────────
/// "admin" mode: one waypoint per admin feature, placed at the centroid of
/// its first ring with a radius reaching the bbox's max corner.
/// Features with no usable ring are silently dropped (neither counted as
/// valid nor skipped, matching the filter-skip accounting).
static GridResult generate_admin(const std::vector<gadm::Feature>& features,
                                 const GridOptions& opts) {
  GridResult res;
  for (const auto& f : features) {
    if (f.rings.empty() || f.rings[0].empty()) continue;
    std::string reason;
    if (!check_filters(f, opts, f.areaSqKm, reason)) {
      res.skippedCells++;
      continue;
    }
    const geo::Coord center = geo::centroid(f.rings[0]);
    // Radius = distance from centroid to the bbox max corner.
    const double radiusKm =
        geo::distance_km(center, {f.bbox.maxLon, f.bbox.maxLat});
    Waypoint wp;
    wp.step = static_cast<int>(res.waypoints.size() + 1);
    wp.lng = std::round(center.lon * 1e6) / 1e6;   // 6-decimal precision
    wp.lat = std::round(center.lat * 1e6) / 1e6;
    wp.radius_km = std::round(radiusKm * 100.0) / 100.0;  // 2-decimal precision
    wp.area_gid = f.gid;
    wp.area_name = f.name;
    res.waypoints.push_back(std::move(wp));
    res.validCells++;
  }
  return res;
}
// ── Centers mode ────────────────────────────────────────────────────────────
/// "centers" mode: emit waypoints at each feature's precomputed GHS
/// population/built-up center points, de-duplicated per feature and spaced
/// globally by centroidOverlap. Each waypoint gets a radius of half a cell.
static GridResult generate_centers(const std::vector<gadm::Feature>& features,
                                   const GridOptions& opts) {
  GridResult res;
  struct AcceptedCenter {
    geo::Coord coord;
  };
  // All centers accepted so far, across every feature — used for the
  // minimum-spacing check below.
  std::vector<AcceptedCenter> accepted;
  // Minimum allowed distance between centers; centroidOverlap = 1 disables it.
  double minAllowedDist = opts.cellSize * (1.0 - opts.centroidOverlap);
  for (size_t i = 0; i < features.size(); ++i) {
    const auto& f = features[i];
    // Collect unique centers by rounding to 5 decimal places
    std::map<std::string, std::array<double, 3>> centersMap; // key → [lon, lat, weight]
    auto addCenter = [&](double lon, double lat, double weight) {
      char key[32];
      snprintf(key, sizeof(key), "%.5f,%.5f", lon, lat);
      std::string k(key);
      // First writer wins: duplicates at ~1m precision are dropped.
      if (centersMap.find(k) == centersMap.end()) {
        centersMap[k] = {lon, lat, weight};
      }
    };
    // Single pop/built centers (a (0,0) center is treated as "absent")
    if (f.ghsPopCenter.lon != 0 || f.ghsPopCenter.lat != 0) {
      addCenter(f.ghsPopCenter.lon, f.ghsPopCenter.lat, f.ghsPopulation);
    }
    if (f.ghsBuiltCenter.lon != 0 || f.ghsBuiltCenter.lat != 0) {
      addCenter(f.ghsBuiltCenter.lon, f.ghsBuiltCenter.lat, f.ghsBuiltWeight);
    }
    // Weighted center arrays
    for (const auto& c : f.ghsPopCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& c : f.ghsBuiltCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& [key, val] : centersMap) {
      geo::Coord pt{val[0], val[1]};
      std::string reason;
      // For centers, use the feature's overall filters
      bool allowed = check_filters(f, opts, f.areaSqKm, reason);
      // Check overlap with already-accepted centers
      if (allowed && !accepted.empty()) {
        for (const auto& ac : accepted) {
          double dist = geo::distance_km(pt, ac.coord);
          if (dist < minAllowedDist) {
            allowed = false;
            reason = "overlaps another centroid";
            break;
          }
        }
      }
      if (allowed) {
        accepted.push_back({pt});
        res.waypoints.push_back({
            static_cast<int>(res.waypoints.size() + 1),
            std::round(pt.lon * 1e6) / 1e6,
            std::round(pt.lat * 1e6) / 1e6,
            std::round((opts.cellSize / 2.0) * 100.0) / 100.0,  // radius = half a cell
            f.gid,
            f.name
        });
        res.validCells++;
      } else {
        res.skippedCells++;
      }
    }
  }
  return res;
}
// ── Polygon grid mode (hex / square) ────────────────────────────────────────
/// "hex"/"square" mode: tessellate the union bbox of all features and keep
/// each cell whose center lies inside some feature's first ring.
/// NOTE(review): only rings[0] is tested, so holes and additional polygon
/// parts are ignored — confirm this matches the TS behavior.
static GridResult generate_polygon_grid(const std::vector<gadm::Feature>& features,
                                        const GridOptions& opts) {
  GridResult res;
  // Compute union bbox of all features
  std::vector<geo::BBox> boxes;
  for (const auto& f : features) {
    if (!f.rings.empty()) boxes.push_back(f.bbox);
  }
  if (boxes.empty()) return res;
  geo::BBox extent = geo::bbox_union(boxes);
  // Estimate cell count to prevent runaway
  double widthKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.maxLon, extent.minLat});
  double heightKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.minLon, extent.maxLat});
  // NOTE(review): 2.6 looks like a per-cell area fudge factor — confirm.
  double approxCellArea = opts.cellSize * opts.cellSize * 2.6;
  int approxCells = static_cast<int>(std::ceil((widthKm * heightKm) / approxCellArea));
  if (approxCells > opts.maxCellsLimit) {
    res.error = "Grid too massive (~" + std::to_string(approxCells) + " cells). Increase cell size or select smaller region.";
    return res;
  }
  // Generate grid centers
  std::vector<geo::Coord> gridCenters;
  if (opts.gridMode == "square") {
    gridCenters = geo::square_grid(extent, opts.cellSize);
  } else {
    gridCenters = geo::hex_grid(extent, opts.cellSize);
  }
  // For each grid center, check if it intersects any feature polygon
  for (const auto& gc : gridCenters) {
    bool intersects = false;
    int regionIdx = -1;
    for (size_t i = 0; i < features.size(); ++i) {
      if (features[i].rings.empty()) continue;
      // First matching feature claims the cell.
      if (geo::point_in_polygon(gc, features[i].rings[0])) {
        intersects = true;
        regionIdx = static_cast<int>(i);
        break;
      }
    }
    if (!intersects) continue;  // cell falls outside every feature
    const auto& regionFeat = features[regionIdx];
    std::string reason;
    bool allowed = check_filters(regionFeat, opts, regionFeat.areaSqKm, reason);
    // Compute cell radius (half diagonal of cell)
    double cellRadiusKm = opts.cellSize * std::sqrt(2.0) / 2.0;
    if (allowed) {
      res.waypoints.push_back({
          static_cast<int>(res.waypoints.size() + 1),
          std::round(gc.lon * 1e6) / 1e6,
          std::round(gc.lat * 1e6) / 1e6,
          std::round(cellRadiusKm * 100.0) / 100.0,
          regionFeat.gid,
          regionFeat.name
      });
      res.validCells++;
    } else {
      res.skippedCells++;
    }
  }
  return res;
}
// ── Main entry point ────────────────────────────────────────────────────────
/// Main entry point — equivalent to generateGridSearchCells() in TS.
/// Dispatches on gridMode, then orders the waypoints (optionally grouped
/// per admin region) and renumbers their steps 1..N.
GridResult generate(const std::vector<gadm::Feature>& features,
                    const GridOptions& opts) {
  GridResult result;
  if (features.empty()) {
    result.error = "No features provided";
    return result;
  }
  if (opts.gridMode == "admin") {
    result = generate_admin(features, opts);
  } else if (opts.gridMode == "centers") {
    result = generate_centers(features, opts);
  } else {
    // "hex", "square", and any unrecognized mode use the polygon grid.
    result = generate_polygon_grid(features, opts);
  }
  if (!result.error.empty()) return result;
  // Sort waypoints
  if (result.waypoints.size() > 1) {
    if (opts.groupByRegion && features.size() > 1) {
      // Stable sort groups waypoints by region while keeping their
      // intra-region order; each contiguous run is then ordered on its own.
      std::stable_sort(result.waypoints.begin(), result.waypoints.end(),
                       [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; });
      auto start = result.waypoints.begin();
      while (start != result.waypoints.end()) {
        auto end = start;
        while (end != result.waypoints.end() && end->area_gid == start->area_gid) {
          ++end;
        }
        // Sort the group out-of-place, then copy it back over its run.
        std::vector<Waypoint> group(start, end);
        sort_waypoints(group, opts.pathOrder, opts.cellSize);
        std::copy(group.begin(), group.end(), start);
        start = end;
      }
    } else {
      sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize);
    }
  }
  // Re-number steps after sorting
  for (size_t i = 0; i < result.waypoints.size(); ++i) {
    result.waypoints[i].step = static_cast<int>(i + 1);
  }
  return result;
}

View File

@ -1,68 +1,105 @@
#include "kbot.h"
#include <taskflow/taskflow.hpp>
#include <iostream>
#include "logger/logger.h"
#include "llm_client.h"
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
namespace polymech {
namespace kbot {
int run_kbot_ai_pipeline(const KBotOptions& opts, const KBotCallbacks& cb) {
logger::debug("Starting kbot ai pipeline");
if (opts.dry_run) {
logger::info("Dry run triggered for kbot ai");
}
namespace {
// Scaffolding multithreaded AI tasks
tf::Executor executor(4);
tf::Taskflow taskflow;
taskflow.emplace([opts, cb](){
logger::debug("Executing kbot ai completion via LLMClient...");
LLMClient client(opts);
std::string target_prompt = opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'" : opts.prompt;
LLMResponse res = client.execute_chat(target_prompt);
if (res.success) {
std::cout << res.text << "\n";
if (cb.onEvent) {
cb.onEvent("ai_progress", "{\"message\":\"Task completion received\"}");
}
} else {
logger::error("AI Task Failed: " + res.error);
if (cb.onEvent) {
cb.onEvent("ai_error", "{\"error\":\"Task failed\"}");
}
}
});
executor.run(taskflow).wait();
if (cb.onEvent) {
cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"ai\"}");
}
return 0;
// Build the "job_result" payload for the ai pipeline as a JSON object:
// {"status": "success"|"error", "mode": "ai"} plus a "text" field on
// success (when is_text is set) or an "error" field on failure.
std::string json_job_result_ai(bool success, const std::string &text_or_error, bool is_text) {
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> w(buf);
  w.StartObject();
  w.Key("status");
  w.String(success ? "success" : "error");
  w.Key("mode");
  w.String("ai");
  // Length-explicit String() keeps embedded NULs intact.
  const auto emit_payload = [&](const char *key) {
    w.Key(key);
    w.String(text_or_error.c_str(),
             static_cast<rapidjson::SizeType>(text_or_error.size()));
  };
  if (success) {
    if (is_text) emit_payload("text");
  } else {
    emit_payload("error");
  }
  w.EndObject();
  return std::string(buf.GetString(), buf.GetSize());
}
int run_kbot_run_pipeline(const KBotRunOptions& opts, const KBotCallbacks& cb) {
logger::info("Starting kbot run pipeline (stub) for config: " + opts.config);
if (opts.dry) {
logger::info("Dry run triggered for kbot run");
}
if (opts.list) {
logger::info("List configs mode enabled");
}
// Stub std::system call execution (simulating child_process.execFileSync from TypeScript)
if (!opts.dry && !opts.list) {
logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config);
}
} // namespace
int run_kbot_ai_pipeline(const KBotOptions &opts, const KBotCallbacks &cb) {
logger::debug("Starting kbot ai pipeline");
if (opts.dry_run) {
logger::info("Dry run triggered for kbot ai");
if (cb.onEvent) {
cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}");
cb.onEvent("job_result", json_job_result_ai(true, "[dry-run] no LLM call", true));
}
return 0;
}
LLMClient client(opts);
const std::string target_prompt =
opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'"
: opts.prompt;
logger::debug("Executing kbot ai completion via LLMClient...");
LLMResponse res = client.execute_chat(target_prompt);
if (res.success) {
std::cout << res.text << "\n";
if (cb.onEvent) {
cb.onEvent("ai_progress",
"{\"message\":\"Task completion received\",\"has_text\":true}");
}
} else {
logger::error("AI Task Failed: " + res.error);
if (cb.onEvent) {
rapidjson::StringBuffer ebuf;
rapidjson::Writer<rapidjson::StringBuffer> ew(ebuf);
ew.StartObject();
ew.Key("error");
ew.String(res.error.c_str(),
static_cast<rapidjson::SizeType>(res.error.size()));
ew.EndObject();
cb.onEvent("ai_error",
std::string(ebuf.GetString(), ebuf.GetSize()));
}
}
if (cb.onEvent) {
if (res.success)
cb.onEvent("job_result", json_job_result_ai(true, res.text, true));
else
cb.onEvent("job_result", json_job_result_ai(false, res.error, false));
}
return res.success ? 0 : 1;
}
/// Stubbed "kbot run" pipeline: logs what it would do and emits a synthetic
/// job_result event. No child process is actually launched yet.
/// Always returns 0.
int run_kbot_run_pipeline(const KBotRunOptions &opts, const KBotCallbacks &cb) {
  logger::info("Starting kbot run pipeline (stub) for config: " + opts.config);
  if (opts.dry) {
    logger::info("Dry run triggered for kbot run");
  }
  if (opts.list) {
    logger::info("List configs mode enabled");
  }
  // The real launch is only simulated; dry and list modes skip even that.
  if (!opts.dry && !opts.list) {
    logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config);
  }
  if (cb.onEvent) {
    cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}");
  }
  return 0;
}
} // namespace kbot

View File

@ -1,7 +0,0 @@
# search: maps-search / geocoding client library.
add_library(search STATIC src/search.cpp)
target_include_directories(search PUBLIC include)
# Depends on http (curl) and json (RapidJSON wrapper)
target_link_libraries(search PUBLIC http json)
# toml++ is an implementation detail (config parsing only), so keep it PRIVATE.
target_link_libraries(search PRIVATE tomlplusplus::tomlplusplus)

View File

@ -1,93 +0,0 @@
#pragma once
#include <string>
#include <vector>
namespace search {
// ── Result types ────────────────────────────────────────────────────────────
/// A WGS84 point as returned by the maps API.
struct GpsCoordinates {
  double lat = 0;
  double lng = 0;
};

/// One place result (subset of the search-provider payload).
struct MapResult {
  std::string title;
  std::string place_id;
  std::string data_id;
  std::string address;
  std::string phone;
  std::string website;
  std::string type;                // primary category
  std::vector<std::string> types;  // all categories
  double rating = 0;               // 0 when unrated
  int reviews = 0;
  GpsCoordinates gps;
  std::string thumbnail;           // presumably a thumbnail URL — confirm
  std::string raw_json;            // presumably the provider's raw JSON for this result — confirm
  std::string geo_json;            // presumably filled by resolve_geo_batch — confirm
};

/// A full search outcome: results plus upstream API-call accounting.
struct SearchResult {
  std::vector<MapResult> results;
  int apiCalls = 0;     // requests consumed (pagination can need several)
  std::string error;    // non-empty on failure
};
// ── Config ──────────────────────────────────────────────────────────────────
struct SystemTuningOptions {
int executor_threads = 0; // 0 = hardware concurrency
int max_concurrent_jobs_per_user = 10;
int http_concurrency_throttle = 50;
int queue_depth_max = 10000;
int bulk_dequeue_size = 1;
int ipc_timeout_ms = 300000;
int max_ipc_connections = 100;
int buffer_size_max = 50 * 1024 * 1024;
};
struct Config {
SystemTuningOptions system;
std::string serpapi_key;
std::string geocoder_key;
std::string bigdata_key;
std::string scrapeless_key;
std::string postgres_url;
std::string supabase_url;
std::string supabase_service_key;
// [enricher]
std::string enricher_meta_scraper;
int enricher_meta_concurrency = 5;
int enricher_meta_idle_timeout = 60;
int enricher_location_concurrency = 1;
};
/// Load config from a TOML file (e.g. config/postgres.toml)
Config load_config(const std::string &path = "config/postgres.toml");
// ── Search API ──────────────────────────────────────────────────────────────
struct SearchOptions {
std::string query;
double lat = 0;
double lng = 0;
int zoom = 13;
int limit = 20;
std::string engine = "google_maps";
std::string hl = "en";
std::string google_domain = "google.com";
};
/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts);
/// Resolve geo coordinate to place info
std::string resolve_geo(double lat, double lng, const std::string &key,
int timeout_ms = 3000);
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
int concurrency = 10, int timeout_ms = 3000);
} // namespace search

View File

@ -1,311 +0,0 @@
#include "search/search.h"
#include "http/http.h"
#include <rapidjson/document.h>
#include <toml++/toml.hpp>
#include <atomic>
#include <cstdio>
#include <iostream>
#include <mutex>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <sstream>
#include <thread>
namespace search {
// ── URL encoding (minimal) ──────────────────────────────────────────────────
// Percent-encode every byte except the RFC 3986 "unreserved" set
// (ALPHA / DIGIT / "-" / "_" / "." / "~"). Hex digits are uppercase.
static std::string url_encode(const std::string &val) {
  static const char kHex[] = "0123456789ABCDEF";
  std::string out;
  out.reserve(val.size() * 2);
  for (unsigned char c : val) {
    const bool unreserved =
        isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~';
    if (unreserved) {
      out.push_back(static_cast<char>(c));
    } else {
      out.push_back('%');
      out.push_back(kHex[c >> 4]);
      out.push_back(kHex[c & 0x0F]);
    }
  }
  return out;
}
// ── Config loading ──────────────────────────────────────────────────────────
/// Load runtime configuration from a TOML file.
///
/// Every key is optional: missing tables/keys are skipped, so the
/// corresponding Config fields keep their in-class defaults. On a TOML
/// parse error the error is printed to stderr and a default-constructed
/// Config is returned — no exception escapes this function.
///
/// @param path  Path to the TOML file (e.g. "config/postgres.toml").
/// @return      Populated Config; defaults for anything not present.
Config load_config(const std::string &path) {
  Config cfg;
  try {
    auto tbl = toml::parse_file(path);
    // [postgres]
    if (auto v = tbl["postgres"]["url"].value<std::string>())
      cfg.postgres_url = *v;
    // [supabase]
    if (auto v = tbl["supabase"]["url"].value<std::string>())
      cfg.supabase_url = *v;
    if (auto v = tbl["supabase"]["service_key"].value<std::string>())
      cfg.supabase_service_key = *v;
    // [services]
    if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>())
      cfg.serpapi_key = *v;
    if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>())
      cfg.geocoder_key = *v;
    if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>())
      cfg.bigdata_key = *v;
    if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>())
      cfg.scrapeless_key = *v;
    // [enricher]
    if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>())
      cfg.enricher_meta_scraper = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>())
      cfg.enricher_meta_concurrency = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>())
      cfg.enricher_meta_idle_timeout = *v;
    if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>())
      cfg.enricher_location_concurrency = *v;
    // [system]
    if (auto v = tbl["system"]["executor_threads"].value<int>())
      cfg.system.executor_threads = *v;
    if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>())
      cfg.system.max_concurrent_jobs_per_user = *v;
    if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>())
      cfg.system.http_concurrency_throttle = *v;
    if (auto v = tbl["system"]["queue_depth_max"].value<int>())
      cfg.system.queue_depth_max = *v;
    if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>())
      cfg.system.bulk_dequeue_size = *v;
    if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>())
      cfg.system.ipc_timeout_ms = *v;
    if (auto v = tbl["system"]["max_ipc_connections"].value<int>())
      cfg.system.max_ipc_connections = *v;
    if (auto v = tbl["system"]["buffer_size_max"].value<int>())
      cfg.system.buffer_size_max = *v;
  } catch (const toml::parse_error &err) {
    // Non-fatal by design: callers receive defaults and the key-empty
    // checks downstream (e.g. serpapi_key) surface the misconfiguration.
    std::cerr << "[config] TOML parse error in " << path << ": " << err.what()
              << "\n";
  }
  return cfg;
}
// ── SerpAPI URL builder ─────────────────────────────────────────────────────
// Compose the serpapi.com search URL for one results page. `start` is the
// zero-based result offset and is only appended when paging past page one.
// A non-zero lat/lng pair adds the "@lat,lng,zoomz" ll parameter.
static std::string build_serpapi_url(const Config &cfg,
                                     const SearchOptions &opts, int start) {
  std::string out = "https://serpapi.com/search.json";
  out += "?engine=" + url_encode(opts.engine);
  out += "&q=" + url_encode(opts.query);
  out += "&api_key=" + url_encode(cfg.serpapi_key);
  out += "&hl=" + url_encode(opts.hl);
  out += "&google_domain=" + url_encode(opts.google_domain);
  if (opts.lat != 0 || opts.lng != 0) {
    char llBuf[128];
    snprintf(llBuf, sizeof(llBuf), "@%.7f,%.7f,%dz", opts.lat, opts.lng,
             opts.zoom);
    out += "&ll=" + url_encode(std::string(llBuf));
  }
  if (start > 0)
    out += "&start=" + std::to_string(start);
  return out;
}
// ── JSON result parser ──────────────────────────────────────────────────────
// Append every object in a SerpAPI JSON result array to `out` as a MapResult.
// Non-object entries are skipped; missing or mistyped fields keep their
// defaults. Each entry's source object is preserved verbatim in raw_json.
static void parse_results(const rapidjson::Value &arr,
                          std::vector<MapResult> &out) {
  if (!arr.IsArray())
    return;
  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    const auto &obj = arr[i];
    if (!obj.IsObject())
      continue;
    MapResult r;
    // Keep the untouched source object for downstream persistence/debugging.
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    obj.Accept(writer);
    r.raw_json.assign(buf.GetString(), buf.GetSize());
    // Copy a string member into `dst` only when present and actually a string.
    auto str_field = [&obj](const char *key, std::string &dst) {
      if (obj.HasMember(key) && obj[key].IsString())
        dst = obj[key].GetString();
    };
    str_field("title", r.title);
    str_field("place_id", r.place_id);
    str_field("data_id", r.data_id);
    str_field("address", r.address);
    str_field("phone", r.phone);
    str_field("website", r.website);
    str_field("type", r.type);
    str_field("thumbnail", r.thumbnail);
    if (obj.HasMember("rating") && obj["rating"].IsNumber())
      r.rating = obj["rating"].GetDouble();
    if (obj.HasMember("reviews") && obj["reviews"].IsInt())
      r.reviews = obj["reviews"].GetInt();
    if (obj.HasMember("gps_coordinates") && obj["gps_coordinates"].IsObject()) {
      const auto &gps = obj["gps_coordinates"];
      if (gps.HasMember("latitude") && gps["latitude"].IsNumber())
        r.gps.lat = gps["latitude"].GetDouble();
      if (gps.HasMember("longitude") && gps["longitude"].IsNumber())
        r.gps.lng = gps["longitude"].GetDouble();
    }
    if (obj.HasMember("types") && obj["types"].IsArray()) {
      const auto &types = obj["types"];
      for (rapidjson::SizeType j = 0; j < types.Size(); ++j) {
        if (types[j].IsString())
          r.types.emplace_back(types[j].GetString());
      }
    }
    out.push_back(std::move(r));
  }
}
// ── Main search function ────────────────────────────────────────────────────
/// Run a SerpAPI Google Maps search, paging until opts.limit results are
/// collected or the API stops returning full pages.
///
/// Pagination: SerpAPI serves up to 20 results per call; a page contributing
/// fewer than 20 new results is treated as the last one. On an HTTP or JSON
/// failure the error is recorded in result.error and whatever was collected
/// so far is returned (partial results are still usable).
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts) {
  SearchResult result;
  if (cfg.serpapi_key.empty()) {
    result.error = "No SerpAPI key configured";
    return result;
  }
  if (opts.query.empty()) {
    result.error = "Empty search query";
    return result;
  }
  const int PAGE_SIZE = 20;
  int start = 0;
  while (static_cast<int>(result.results.size()) < opts.limit) {
    std::string url = build_serpapi_url(cfg, opts, start);
    auto resp = http::get(url);
    result.apiCalls++;
    if (resp.status_code != 200) {
      result.error = "SerpAPI HTTP " + std::to_string(resp.status_code);
      break;
    }
    rapidjson::Document doc;
    doc.Parse(resp.body.c_str());
    if (doc.HasParseError()) {
      result.error = "Failed to parse SerpAPI response";
      break;
    }
    size_t beforeCount = result.results.size();
    // local_results (main listing)
    if (doc.HasMember("local_results") && doc["local_results"].IsArray()) {
      parse_results(doc["local_results"], result.results);
    }
    // place_results (single result or array)
    if (doc.HasMember("place_results")) {
      if (doc["place_results"].IsArray()) {
        parse_results(doc["place_results"], result.results);
      } else if (doc["place_results"].IsObject()) {
        // Deep-copy the lone object into a one-element array so
        // parse_results can consume it through the same code path.
        rapidjson::Document arr;
        arr.SetArray();
        arr.PushBack(rapidjson::Value(doc["place_results"], arr.GetAllocator()),
                     arr.GetAllocator());
        parse_results(arr, result.results);
      }
    }
    size_t pageCount = result.results.size() - beforeCount;
    if (pageCount == 0)
      break; // No more results
    if (static_cast<int>(pageCount) < PAGE_SIZE)
      break; // Last page (partial)
    start += PAGE_SIZE;
  }
  // Trim to limit — the last full page may have pushed us past it.
  if (static_cast<int>(result.results.size()) > opts.limit) {
    result.results.resize(opts.limit);
  }
  return result;
}
// ── Geo enrichment ──────────────────────────────────────────────────────────
/// Reverse-geocode a coordinate via the BigDataCloud API.
///
/// @param lat, lng    WGS84 coordinate to resolve.
/// @param key         BigDataCloud API key; empty key short-circuits to "{}".
/// @param timeout_ms  HTTP timeout in milliseconds.
/// @return The raw JSON response body on HTTP 200, otherwise "{}".
std::string resolve_geo(double lat, double lng, const std::string &key,
                        int timeout_ms) {
  if (key.empty())
    return "{}";
  // Build the URL in a std::string: the previous fixed 512-byte snprintf
  // buffer silently truncated the URL when the API key was long, producing
  // a malformed request. Coordinates still use snprintf for %.7f formatting;
  // 128 bytes is ample for any real-world lat/lng.
  // NOTE(review): `key` is inserted verbatim — assumed URL-safe; confirm.
  char coords[128];
  snprintf(coords, sizeof(coords), "latitude=%.7f&longitude=%.7f", lat, lng);
  std::string url = "https://api.bigdatacloud.net/data/reverse-geocode?";
  url += coords;
  url += "&localityLanguage=en&key=";
  url += key;
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  auto resp = http::get(url, opts);
  if (resp.status_code == 200 && !resp.body.empty()) {
    return resp.body;
  }
  return "{}";
}
// Reverse-geocode every result with non-zero coordinates, writing the JSON
// payload into MapResult::geo_json in place. Runs up to `concurrency`
// worker threads; all workers are joined before returning.
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
                       int concurrency, int timeout_ms) {
  if (key.empty() || results.empty())
    return;
  // Workers pull indices from a shared atomic cursor until it runs past
  // the end — simple dynamic load balancing without a queue.
  std::atomic<size_t> next{0};
  const int workers =
      std::min<int>(concurrency, static_cast<int>(results.size()));
  auto worker_fn = [&]() {
    for (size_t idx = next.fetch_add(1); idx < results.size();
         idx = next.fetch_add(1)) {
      auto &r = results[idx];
      // (0, 0) coordinates mean "unset" — skip those entries.
      if (r.gps.lat != 0 || r.gps.lng != 0)
        r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms);
    }
  };
  std::vector<std::thread> pool;
  pool.reserve(static_cast<size_t>(workers));
  for (int i = 0; i < workers; ++i)
    pool.emplace_back(worker_fn);
  for (auto &t : pool)
    t.join();
}
} // namespace search

View File

@ -1,65 +0,0 @@
#pragma once
#include "search/search.h"
#include "gadm_reader/gadm_reader.h"
#include "geo/geo.h"
#include <functional>
#include <map>
#include <string>
#include <vector>
namespace polymech {
// ── Filter context ──────────────────────────────────────────────────────────
// All runtime data a filter predicate may need. Passed by const-ref so filters
// are pure read-only functions with no side-effects.
/// Geometry of the waypoint whose search results are being filtered.
struct WaypointCtx {
  double lat;
  double lng;
  double radius_km;
  std::string area_gid; // e.g. "ESP.6.1.10.2_1"
};
/// Bundle of references consulted by filters. Holds references only — every
/// referenced object must outlive the FilterContext that points at it.
struct FilterContext {
  const WaypointCtx& waypoint;
  const std::vector<std::string>& filter_types; // must-match list
  const std::vector<std::string>& exclude_types; // deny list
  const std::map<std::string, std::vector<gadm::Feature>>& country_boundaries;
};
// ── Predicate type ──────────────────────────────────────────────────────────
// Returns true → KEEP the result.
// Returns false → DISCARD the result.
using LocationFilter = std::function<bool(const search::MapResult&, const FilterContext&)>;
// ── Individual filters ──────────────────────────────────────────────────────
/// Discard results that have no website (non-actionable leads).
bool filter_requires_website(const search::MapResult& r, const FilterContext& ctx);
/// Discard results whose type matches any entry in ctx.exclude_types.
bool filter_exclude_types(const search::MapResult& r, const FilterContext& ctx);
/// If ctx.filter_types is non-empty, keep only results that match ≥1 type.
bool filter_match_types(const search::MapResult& r, const FilterContext& ctx);
/// Keep only results inside the country-level boundary polygon (L0) of the
/// waypoint's country. Falls back to radius-based overlap (1.5 × radius_km)
/// to gracefully handle legitimate border-proximity results.
bool filter_country_boundary(const search::MapResult& r, const FilterContext& ctx);
// ── Filter set builder ──────────────────────────────────────────────────────
/// Return the ordered list of default filters applied to every SerpAPI batch.
/// Filters are evaluated left-to-right; the first false short-circuits.
std::vector<LocationFilter> default_location_filters();
/// Run `filters` against `result`. Returns true (keep) only if every
/// filter passes.
bool apply_filters(const search::MapResult& result,
                   const FilterContext& ctx,
                   const std::vector<LocationFilter>& filters);
} // namespace polymech

View File

@ -1,28 +0,0 @@
#pragma once
#include "cmd_gridsearch.h"
#include "search/search.h"
#include "enrichers/enrichers.h"
#include <set>
namespace polymech {
/// Persistence facade for a gridsearch run backed by Postgres/Supabase.
/// NOTE(review): method semantics below are inferred from names/signatures
/// only — the implementations live in the corresponding .cpp; confirm there.
struct PostgresStateStore {
  std::string run_id;    // identifier of the current run
  std::string user_id;   // owner of the run
  std::string parent_id; // optional: parent run ID for expand jobs
  bool enabled = false;  // presumably gates all persistence — verify in impl
  void init_run(const PipelineOptions &opts);
  void update_status(const std::string &status);
  void complete_run(const std::string &result_json);
  void fail_run(const std::string &error_msg);
  void upsert_places(const std::vector<search::MapResult> &places);
  void update_place_enrichment(const enrichers::EnrichedNode &enode);
  /// Query places table in chunks to find place_ids that already have meta (enriched).
  /// Returns set of place_ids that should be skipped during enrichment.
  std::set<std::string> filter_already_enriched(const std::vector<std::string> &place_ids);
};
} // namespace polymech

View File

@ -1,88 +0,0 @@
#pragma once
#include <CLI/CLI.hpp>
#include <functional>
#include <string>
#include <memory>
#include <atomic>
#include "search/search.h"
#include "grid/grid.h"
#include <vector>
namespace polymech {
/// Escape a string for safe embedding inside a JSON string literal.
/// (Was declared twice in this header; the duplicate has been removed.)
std::string json_escape(const std::string &s);
/// One GADM area selected for the run.
struct AreaDef {
  std::string gid;  // GADM identifier
  std::string name;
  int level;        // GADM administrative level
};
/// A search result together with the grid cell it was found in.
struct AccumulatedResult {
  search::MapResult result;
  std::string grid_area;
  std::string grid_gid;
};
/// Full configuration of one gridsearch pipeline run (CLI- or IPC-supplied).
struct PipelineOptions {
  std::vector<AreaDef> areas;
  grid::GridOptions grid_opts;
  std::string search_query;
  std::string search_domain = "google.com";
  std::string search_language = "en";
  std::string search_country;
  int search_limit = 20;
  int search_zoom = 13;
  bool dry_run = false;
  bool enrich = false;
  std::string config_path = "config/postgres.toml";
  std::string cache_dir = "cache/gadm";
  bool persistence_postgres = false;
  bool daemon_mode = false;
  std::string job_id;
  std::string default_user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
  search::SystemTuningOptions tuning;
  // Cooperative cancellation flag shared with the job dispatcher.
  std::shared_ptr<std::atomic<bool>> cancel_token;
  std::vector<std::string> filter_types; // if non-empty, only locations matching ≥1 type pass
  std::vector<std::string> exclude_types; // if non-empty, drop locations matching any
  bool no_cache = false; // skip pre-enrich dedup — force re-enrichment
  std::string parent_id; // if set, this run is an "expand" child of another run
};
/// Optional callbacks for streaming progress events (used in IPC mode).
/// When nullptr / empty, the pipeline runs silently (CLI mode).
struct GridsearchCallbacks {
  /// Emit a progress event. `type` is one of:
  ///   grid-ready, waypoint-start, area, location,
  ///   enrich-start, node, node-error, nodePage
  /// `json` is the raw JSON payload string.
  std::function<void(const std::string& type, const std::string& json)> onEvent;
};
CLI::App* setup_cmd_gridsearch(CLI::App& app);
/// CLI entry point (standalone mode — reads static vars set by CLI11).
int run_cmd_gridsearch();
/// IPC entry point — parse `payload` JSON, run the pipeline, emit events via `cb`.
/// Returns 0 on success.
int run_cmd_gridsearch_ipc(const std::string& payload,
                           const std::string& jobId,
                           const GridsearchCallbacks& cb,
                           bool daemon_mode = false,
                           const std::string& daemon_uid = "");
/// Core Pipeline
int run_pipeline(const PipelineOptions &opts, std::ostream *file_out,
                 const GridsearchCallbacks &cb);
/// UDS entry point — starts a persistent AF_UNIX / Named Pipe server that processes
/// concurrent jobs using Moodycamel ConcurrentQueue and Taskflow executor.
int run_cmd_gridsearch_uds(const std::string& pipe_path,
                           bool daemon_mode,
                           const std::string& daemon_uid);
} // namespace polymech

View File

@ -100,6 +100,8 @@ int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const
if (doc.HasMember("prompt") && doc["prompt"].IsString()) opts.prompt = doc["prompt"].GetString();
if (doc.HasMember("dry_run") && doc["dry_run"].IsBool()) opts.dry_run = doc["dry_run"].GetBool();
if (doc.HasMember("api_key") && doc["api_key"].IsString()) opts.api_key = doc["api_key"].GetString();
if (doc.HasMember("router") && doc["router"].IsString()) opts.router = doc["router"].GetString();
if (doc.HasMember("model") && doc["model"].IsString()) opts.model = doc["model"].GetString();
}
if (opts.api_key.empty()) {

View File

@ -18,6 +18,9 @@ int run_cmd_kbot_run();
int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb);
int run_kbot_run_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb);
/// Standalone UDS/TCP server for KBot (orchestrator tests, LLM worker).
int run_cmd_kbot_uds(const std::string& pipe_path);
/// Helper to check parsed state
bool is_kbot_ai_parsed();
bool is_kbot_run_parsed();

View File

@ -1,351 +0,0 @@
#include "gridsearch_serialize.h"
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include "cmd_gridsearch.h"
namespace polymech::serialize {
// ── grid-ready ──────────────────────────────────────────────────────────────
// Serialize the full waypoint list announced once at pipeline start:
// {"areas":[{...}, ...], "total": N}.
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints) {
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
  writer.StartObject();
  writer.Key("areas");
  writer.StartArray();
  int index = 0;
  for (const auto& wp : waypoints) {
    // Waypoints carry no display name of their own — synthesize one from step.
    const std::string name = "Waypoint " + std::to_string(wp.step);
    const std::string gid = "wp-" + std::to_string(wp.step);
    writer.StartObject();
    writer.Key("name");      writer.String(name.c_str());
    writer.Key("gid");       writer.String(gid.c_str());
    writer.Key("lat");       writer.Double(wp.lat);
    writer.Key("lon");       writer.Double(wp.lng);
    writer.Key("radius_km"); writer.Double(wp.radius_km);
    writer.Key("area_gid");  writer.String(wp.area_gid.c_str());
    writer.Key("area_name"); writer.String(wp.area_name.c_str());
    writer.Key("index");     writer.Int(index++);
    writer.EndObject();
  }
  writer.EndArray();
  writer.Key("total");
  writer.Int(static_cast<int>(waypoints.size()));
  writer.EndObject();
  return buf.GetString();
}
// ── waypoint-start ──────────────────────────────────────────────────────────
// Payload emitted right before a waypoint's search begins; mirrors one entry
// of grid_ready plus the running index/total.
std::string waypoint_start(const grid::Waypoint& wp, int index, int total) {
  const std::string name = "Waypoint " + std::to_string(wp.step);
  const std::string gid = "wp-" + std::to_string(wp.step);
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
  writer.StartObject();
  writer.Key("name");      writer.String(name.c_str());
  writer.Key("gid");       writer.String(gid.c_str());
  writer.Key("lat");       writer.Double(wp.lat);
  writer.Key("lon");       writer.Double(wp.lng);
  writer.Key("radius_km"); writer.Double(wp.radius_km);
  writer.Key("area_gid");  writer.String(wp.area_gid.c_str());
  writer.Key("area_name"); writer.String(wp.area_name.c_str());
  writer.Key("index");     writer.Int(index);
  writer.Key("total");     writer.Int(total);
  writer.EndObject();
  return buf.GetString();
}
// ── location (per search result) ────────────────────────────────────────────
/// Per-result event payload: the result's core fields nested under
/// "location", plus a synthetic waypoint name under "areaName".
std::string location(const search::MapResult& r, int step) {
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);
  w.StartObject();
  w.Key("location"); w.StartObject();
  w.Key("title"); w.String(r.title.c_str());
  w.Key("place_id"); w.String(r.place_id.c_str());
  w.Key("address"); w.String(r.address.c_str());
  w.Key("website"); w.String(r.website.c_str());
  w.Key("type"); w.String(r.type.c_str());
  w.Key("phone"); w.String(r.phone.c_str());
  w.Key("rating"); w.Double(r.rating);
  w.Key("reviews"); w.Int(r.reviews);
  w.Key("lat"); w.Double(r.gps.lat);
  w.Key("lng"); w.Double(r.gps.lng);
  w.Key("types"); w.StartArray();
  for (const auto& t : r.types) w.String(t.c_str());
  w.EndArray();
  w.EndObject();
  // Waypoint display name is synthesized from the step number, matching
  // the names emitted by grid_ready / waypoint_start.
  w.Key("areaName"); w.String(("Waypoint " + std::to_string(step)).c_str());
  w.EndObject();
  return sb.GetString();
}
// ── area_start ──────────────────────────────────────────────────────────────
std::string area_start(const std::string& area_gid, const std::string& area_name) {
rapidjson::StringBuffer sb;
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
w.StartObject();
w.Key("gid"); w.String(area_gid.c_str());
w.Key("name"); w.String(area_name.c_str());
w.EndObject();
return sb.GetString();
}
// ── area_finish ─────────────────────────────────────────────────────────────
std::string area_finish(const std::string& area_gid) {
rapidjson::StringBuffer sb;
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
w.StartObject();
w.Key("gid"); w.String(area_gid.c_str());
w.EndObject();
return sb.GetString();
}
// ── waypoint_finish ─────────────────────────────────────────────────────────
// Summary payload for a completed waypoint: result and API-call counters.
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls) {
  const std::string name = "Waypoint " + std::to_string(wp.step);
  const std::string gid = "wp-" + std::to_string(wp.step);
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
  writer.StartObject();
  writer.Key("name");     writer.String(name.c_str());
  writer.Key("gid");      writer.String(gid.c_str());
  writer.Key("results");  writer.Int(results);
  writer.Key("apiCalls"); writer.Int(apiCalls);
  writer.EndObject();
  return buf.GetString();
}
// ── enrich-start ────────────────────────────────────────────────────────────
std::string enrich_start(int locationCount) {
rapidjson::StringBuffer sb;
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
w.StartObject();
w.Key("locationCount"); w.Int(locationCount);
w.EndObject();
return sb.GetString();
}
// ── nodePage (per page error) ───────────────────────────────────────────────
// Payload describing one failed page fetch during enrichment of `placeId`.
std::string node_page(const enrichers::PageError& pe, const std::string& placeId) {
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
  writer.StartObject();
  writer.Key("location");
  writer.String(placeId.c_str());
  writer.Key("url");
  writer.String(pe.url.c_str());
  writer.Key("status");
  writer.String(pe.status.c_str());
  writer.Key("error");
  writer.String(pe.error.c_str());
  writer.EndObject();
  return buf.GetString();
}
// ── node-error ──────────────────────────────────────────────────────────────
// Payload for a location whose enrichment failed: a nested "node" identity
// object plus the error text.
std::string node_error(const enrichers::EnrichedNode& node) {
  rapidjson::StringBuffer buf;
  rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
  writer.StartObject();
  writer.Key("node");
  writer.StartObject();
  writer.Key("title");
  writer.String(node.title.c_str());
  writer.Key("placeId");
  writer.String(node.place_id.c_str());
  writer.EndObject();
  writer.Key("error");
  writer.String(node.error.c_str());
  writer.EndObject();
  return buf.GetString();
}
// ── node (enriched location) ────────────────────────────────────────────────
/// Full payload for one enriched location: identity fields, extracted
/// emails/socials, scraped sites, page counters, per-phase timings (ms)
/// and the grid cell the location originated from.
std::string node(const enrichers::EnrichedNode& n) {
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);
  w.StartObject();
  w.Key("idx"); w.Int(n.idx);
  w.Key("title"); w.String(n.title.c_str());
  w.Key("placeId"); w.String(n.place_id.c_str());
  w.Key("website"); w.String(n.website.c_str());
  w.Key("address"); w.String(n.address.c_str());
  w.Key("type"); w.String(n.type.c_str());
  w.Key("status"); w.String(enrichers::status_string(n.status));
  w.Key("emails"); w.StartArray();
  for (const auto& e : n.emails) w.String(e.c_str());
  w.EndArray();
  w.Key("social"); w.StartArray();
  for (const auto& s : n.socials) {
    w.StartObject();
    w.Key("url"); w.String(s.url.c_str());
    w.Key("platform"); w.String(s.platform.c_str());
    w.EndObject();
  }
  w.EndArray();
  w.Key("sites"); w.StartArray();
  // sites pairs are (url, content); "name" is emitted as the fixed literal
  // "home" for every entry.
  for (const auto& s : n.sites) {
    w.StartObject();
    w.Key("url"); w.String(s.first.c_str());
    w.Key("name"); w.String("home");
    w.Key("content"); w.String(s.second.c_str());
    w.EndObject();
  }
  w.EndArray();
  w.Key("pagesFound"); w.Int(n.pages_found);
  w.Key("pagesScraped"); w.Int(n.pages_scraped);
  w.Key("metaMs"); w.Int(n.meta_ms);
  w.Key("emailMs"); w.Int(n.email_ms);
  w.Key("totalMs"); w.Int(n.total_ms);
  w.Key("gridArea"); w.String(n.grid_area.c_str());
  w.Key("gridGid"); w.String(n.grid_gid.c_str());
  w.EndObject();
  return sb.GetString();
}
// ── write_options helper ────────────────────────────────────────────────────
// Serialize the effective PipelineOptions under an "options" key so every
// job_result payload echoes the configuration the run actually used
// (top-level settings, the full grid configuration, and the area list).
// Expects `w` to be positioned inside an open JSON object.
static void write_options(rapidjson::Writer<rapidjson::StringBuffer>& w, const polymech::PipelineOptions& opts) {
  w.Key("options");
  w.StartObject();
  w.Key("jobId"); w.String(opts.job_id.c_str());
  w.Key("searchQuery"); w.String(opts.search_query.c_str());
  w.Key("searchDomain"); w.String(opts.search_domain.c_str());
  w.Key("searchLanguage"); w.String(opts.search_language.c_str());
  w.Key("searchCountry"); w.String(opts.search_country.c_str());
  w.Key("searchLimit"); w.Int(opts.search_limit);
  w.Key("searchZoom"); w.Int(opts.search_zoom);
  w.Key("dryRun"); w.Bool(opts.dry_run);
  w.Key("enrich"); w.Bool(opts.enrich);
  w.Key("grid");
  w.StartObject();
  w.Key("gridMode"); w.String(opts.grid_opts.gridMode.c_str());
  w.Key("cellSize"); w.Double(opts.grid_opts.cellSize);
  w.Key("cellOverlap"); w.Double(opts.grid_opts.cellOverlap);
  w.Key("centroidOverlap"); w.Double(opts.grid_opts.centroidOverlap);
  w.Key("maxCellsLimit"); w.Int(opts.grid_opts.maxCellsLimit);
  w.Key("maxElevation"); w.Double(opts.grid_opts.maxElevation);
  w.Key("minDensity"); w.Double(opts.grid_opts.minDensity);
  w.Key("minGhsPop"); w.Double(opts.grid_opts.minGhsPop);
  w.Key("minGhsBuilt"); w.Double(opts.grid_opts.minGhsBuilt);
  w.Key("ghsFilterMode"); w.String(opts.grid_opts.ghsFilterMode.c_str());
  w.Key("allowMissingGhs"); w.Bool(opts.grid_opts.allowMissingGhs);
  w.Key("bypassFilters"); w.Bool(opts.grid_opts.bypassFilters);
  w.Key("pathOrder"); w.String(opts.grid_opts.pathOrder.c_str());
  w.Key("groupByRegion"); w.Bool(opts.grid_opts.groupByRegion);
  w.EndObject();
  w.Key("areas");
  w.StartArray();
  for (const auto& a : opts.areas) {
    w.StartObject();
    w.Key("gid"); w.String(a.gid.c_str());
    w.Key("name"); w.String(a.name.c_str());
    w.Key("level"); w.Int(a.level);
    w.EndObject();
  }
  w.EndArray();
  w.EndObject();
}
// ── job_result (with enrichment) ────────────────────────────────────────────
/// Final summary payload for a run that included enrichment: echoed options,
/// per-phase timings (ms), grid/search statistics, and enrichment totals.
/// `enrichResults` entries are emitted verbatim (presumably enriched place
/// IDs — confirm with the pipeline caller).
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
                       int totalEmails, int totalPagesScraped, int freshApiCalls,
                       int waypointCount, int validCells, int skippedCells,
                       int totalResults, const std::vector<std::string>& enrichResults,
                       double totalScannedSqKm, double totalPopulation) {
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);
  w.StartObject();
  write_options(w, opts);
  w.Key("enumMs"); w.Int64(enumMs);
  w.Key("searchMs"); w.Int64(searchMs);
  w.Key("enrichMs"); w.Int64(enrichMs);
  w.Key("totalMs"); w.Int64(totalMs);
  w.Key("gridStats");
  w.StartObject();
  w.Key("validCells"); w.Int(validCells);
  w.Key("skippedCells"); w.Int(skippedCells);
  w.Key("totalWaypoints"); w.Int(waypointCount);
  w.EndObject();
  w.Key("searchStats");
  w.StartObject();
  w.Key("apiCalls"); w.Int(freshApiCalls);
  w.Key("filtered"); w.Int(0); // placeholder if needed
  w.Key("areaCount"); w.Int(waypointCount);
  w.Key("totalResults"); w.Int(totalResults);
  w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm);
  w.Key("totalPopulation"); w.Double(totalPopulation);
  w.EndObject();
  w.Key("totalEmails"); w.Int(totalEmails);
  w.Key("enrichResults");
  w.StartArray();
  for (const auto& id : enrichResults) {
    w.String(id.c_str());
  }
  w.EndArray();
  // Duplicated at top level for consumers that don't read the nested stats.
  w.Key("freshApiCalls"); w.Int(freshApiCalls);
  w.Key("waypointCount"); w.Int(waypointCount);
  w.Key("totalPagesScraped"); w.Int(totalPagesScraped);
  w.EndObject();
  return sb.GetString();
}
// ── job_result (search only) ────────────────────────────────────────────────
/// Final summary payload for a search-only run. Same shape as job_result so
/// consumers can parse both uniformly; all enrichment-related fields
/// (enrichMs, totalEmails, totalPagesScraped) are emitted as zero.
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
                                   int freshApiCalls, int waypointCount, int validCells,
                                   int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
                                   double totalScannedSqKm, double totalPopulation) {
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);
  w.StartObject();
  write_options(w, opts);
  w.Key("enumMs"); w.Int64(enumMs);
  w.Key("searchMs"); w.Int64(searchMs);
  w.Key("enrichMs"); w.Int64(0);
  w.Key("totalMs"); w.Int64(totalMs);
  w.Key("gridStats");
  w.StartObject();
  w.Key("validCells"); w.Int(validCells);
  w.Key("skippedCells"); w.Int(skippedCells);
  w.Key("totalWaypoints"); w.Int(waypointCount);
  w.EndObject();
  w.Key("searchStats");
  w.StartObject();
  w.Key("apiCalls"); w.Int(freshApiCalls);
  w.Key("filtered"); w.Int(0);
  w.Key("areaCount"); w.Int(waypointCount);
  w.Key("totalResults"); w.Int(totalResults);
  w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm);
  w.Key("totalPopulation"); w.Double(totalPopulation);
  w.EndObject();
  w.Key("totalEmails"); w.Int(0);
  w.Key("enrichResults");
  w.StartArray();
  for (const auto& id : enrichResults) {
    w.String(id.c_str());
  }
  w.EndArray();
  // Duplicated at top level for consumers that don't read the nested stats.
  w.Key("freshApiCalls"); w.Int(freshApiCalls);
  w.Key("waypointCount"); w.Int(waypointCount);
  w.Key("totalPagesScraped"); w.Int(0);
  w.EndObject();
  return sb.GetString();
}
} // namespace polymech::serialize

View File

@ -1,60 +0,0 @@
#pragma once
// JSON payload builders for gridsearch progress events and final results.
// Each function returns a self-contained JSON object as a std::string,
// ready to be passed to GridsearchCallbacks::onEvent.
#include "enrichers/enrichers.h"
#include "grid/grid.h"
#include "search/search.h"
#include <cstdint>
#include <string>
#include <vector>
namespace polymech {
// Forward declaration — full definition lives in cmd_gridsearch.h; this
// header only needs it by reference.
struct PipelineOptions;
}
namespace polymech::serialize {
/// grid-ready event payload
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints);
/// waypoint-start event payload
std::string waypoint_start(const grid::Waypoint& wp, int index, int total);
/// location event payload (per search result)
std::string location(const search::MapResult& r, int step);
/// waypoint-finish event payload (waypoint done)
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls);
/// area-start event payload
std::string area_start(const std::string& area_gid, const std::string& area_name);
/// area-finish event payload
std::string area_finish(const std::string& area_gid);
/// enrich-start event payload
std::string enrich_start(int locationCount);
/// nodePage event payload (per page error)
std::string node_page(const enrichers::PageError& pe, const std::string& placeId);
/// node-error event payload
std::string node_error(const enrichers::EnrichedNode& node);
/// node event payload (enriched location)
std::string node(const enrichers::EnrichedNode& node);
/// job_result summary (with enrichment)
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
                       int totalEmails, int totalPagesScraped, int freshApiCalls,
                       int waypointCount, int validCells, int skippedCells,
                       int totalResults, const std::vector<std::string>& enrichResults,
                       double totalScannedSqKm, double totalPopulation);
/// job_result summary (search only, no enrichment)
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
                                   int freshApiCalls, int waypointCount, int validCells,
                                   int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
                                   double totalScannedSqKm, double totalPopulation);
} // namespace polymech::serialize

View File

@ -17,11 +17,6 @@
#include "logger/logger.h"
#include "postgres/postgres.h"
#include "json/json.h"
#include "gadm_reader/gadm_reader.h"
#include "grid/grid.h"
#include "search/search.h"
#include "enrichers/enrichers.h"
#include "cmd_gridsearch.h"
#include "cmd_kbot.h"
#ifndef PROJECT_VERSION
@ -77,17 +72,12 @@ int main(int argc, char *argv[]) {
db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);
// Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
bool daemon_mode = false;
std::string daemon_uid;
std::string worker_config = "config/postgres.toml";
std::string uds_path;
auto *worker_cmd = app.add_subcommand(
"worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)");
worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml");
worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)");
worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path");
worker_cmd->add_option("--uds", uds_path,
"Listen on TCP port (Windows) or Unix socket path");
// Subcommand: kbot — AI workflows & task configurations
auto* kbot_cmd = polymech::setup_cmd_kbot(app);
@ -109,19 +99,10 @@ int main(int argc, char *argv[]) {
// ── worker mode ─────────────────────────────────────────────────────────
if (worker_cmd->parsed()) {
logger::info("Worker mode: listening on stdin");
if (daemon_mode) {
logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid));
auto cfg = search::load_config(worker_config);
postgres::Config pcfg;
pcfg.supabase_url = cfg.supabase_url;
pcfg.supabase_key = cfg.supabase_service_key;
postgres::init(pcfg);
}
if (!uds_path.empty()) {
logger::info("Worker mode: UDS Server active on " + uds_path);
int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid);
int rc = polymech::run_cmd_kbot_uds(uds_path);
return rc;
}
@ -140,27 +121,6 @@ int main(int argc, char *argv[]) {
if (req.type == "ping") {
ipc::write_message({req.id, "pong", "{}"});
} else if (req.type == "gridsearch") {
logger::info("Worker: gridsearch job received");
// Build callbacks that emit IPC events.
// Progress events use id "0" (unmatched → event for orchestrator).
// The final job_result uses the original req.id so the promise resolves.
std::string req_id = req.id;
polymech::GridsearchCallbacks cb;
cb.onEvent = [&req_id](const std::string& type, const std::string& json) {
if (type == "job_result") {
ipc::write_message({req_id, "job_result", json});
} else {
ipc::write_message({"0", type, json});
}
};
int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid);
if (rc != 0) {
ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"});
}
} else if (req.type == "job") {
// Stub: echo the payload back as job_result
ipc::write_message({req.id, "job_result", req.payload});

View File

@ -53,12 +53,6 @@ add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp)
target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json Threads::Threads)
catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_gridsearch_ipc e2e/test_gridsearch_ipc.cpp ../src/cmd_gridsearch.cpp ../src/cmd_gridsearch-filters.cpp ../src/cmd_gridsearch-uds.cpp ../src/cmd_gridsearch-postgres.cpp ../src/gridsearch_serialize.cpp ../src/sys_metrics.cpp)
target_link_libraries(test_gridsearch_ipc PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers Threads::Threads)
target_include_directories(test_gridsearch_ipc PRIVATE ${CMAKE_SOURCE_DIR}/src ${asio_SOURCE_DIR}/asio/include ${taskflow_SOURCE_DIR} ${concurrentqueue_SOURCE_DIR})
target_compile_definitions(test_gridsearch_ipc PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
catch_discover_tests(test_gridsearch_ipc WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_ipc unit/test_ipc.cpp)
target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc Threads::Threads)
catch_discover_tests(test_ipc)

View File

@ -1,144 +0,0 @@
#include <catch2/catch_test_macros.hpp>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include "../../src/cmd_gridsearch.h"
#include "logger/logger.h"
// ── Helpers ──────────────────────────────────────────────────────────────────
/// Slurp the entire file at `path` into a string.
/// Returns "" when the file cannot be opened.
static std::string read_file_contents(const std::string &path) {
    std::ifstream in(path);
    if (!in.is_open()) {
        return "";
    }
    std::ostringstream buffer;
    buffer << in.rdbuf();  // stream the whole file in one shot
    return buffer.str();
}
/// Read a JSON config file and inject test-safe overrides:
/// - configPath = "config/postgres.toml"
/// - enrich = false (no live HTTP / thread-pool in tests)
/// - persistencePostgres = false
static std::string load_test_payload(const std::string &config_path) {
std::string raw = read_file_contents(config_path);
if (raw.empty())
return "";
rapidjson::Document doc;
doc.Parse(raw.c_str());
if (doc.HasParseError())
return "";
auto &alloc = doc.GetAllocator();
// Remove-then-add ensures no double-add assertion from rapidjson
auto inject_bool = [&](const char *key, bool val) {
if (doc.HasMember(key))
doc.RemoveMember(key);
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val), alloc);
};
auto inject_str = [&](const char *key, const char *val) {
if (doc.HasMember(key))
doc.RemoveMember(key);
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val, alloc),
alloc);
};
inject_str("configPath", "config/postgres.toml");
inject_str("cacheDir", "../../packages/gadm/cache/gadm"); // server/cache/gadm
inject_bool("enrich", false); // no live enrichment in tests
inject_bool("persistencePostgres", false);
rapidjson::StringBuffer buf;
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
doc.Accept(writer);
return buf.GetString();
}
// ── Tests ────────────────────────────────────────────────────────────────────
TEST_CASE("E2E: Gridsearch Country Boundary Filter (Lamu/KEN)",
          "[e2e][gridsearch][boundary]") {
    REQUIRE_NOTHROW(logger::init("test-gridsearch"));

    // Lamu, Kenya — SerpAPI often returns US results for obscure African
    // regions; boundary_KEN_0.json should filter them out.
    const std::string payload =
        load_test_payload("config/gridsearch-lamu.json");
    REQUIRE(!payload.empty());

    std::vector<std::string> kept_locations;
    int errors_seen = 0;
    polymech::GridsearchCallbacks callbacks;
    callbacks.onEvent = [&](const std::string &type, const std::string &json) {
        if (type == "location") {
            kept_locations.push_back(json);
        } else if (type == "error") {
            errors_seen++;
            std::cout << "[ERROR EVENT]: " << json << "\n";
        }
    };

    const int rc = polymech::run_cmd_gridsearch_ipc(payload, "test-lamu-job",
                                                    callbacks, false, "");
    REQUIRE(rc == 0);
    REQUIRE(errors_seen == 0);

    // All returned locations must be within Kenya (no USA coords):
    // Kenya longitude range is ~34..42 while the USA is roughly -130..-60,
    // so anything with lng outside [20, 55] counts as a foreign leak.
    int outside_kenya = 0;
    for (const auto &entry : kept_locations) {
        rapidjson::Document loc;
        loc.Parse(entry.c_str());
        if (loc.HasParseError()) {
            continue;
        }
        if (!loc.HasMember("gps") || !loc["gps"].IsObject()) {
            continue;
        }
        const double lng =
            loc["gps"].HasMember("lng") ? loc["gps"]["lng"].GetDouble() : 0;
        if (lng < 20.0 || lng > 55.0) {
            outside_kenya++;
        }
    }
    CHECK(outside_kenya == 0);
    std::cout << "Lamu boundary test: " << kept_locations.size()
              << " locations kept, " << outside_kenya << " outside Kenya.\n";
}
TEST_CASE("E2E: Gridsearch Type Filter (Sample/ABW)",
          "[e2e][gridsearch][filter]") {
    const std::string payload =
        load_test_payload("config/gridsearch-sample.json");
    REQUIRE(!payload.empty());

    // Collect location events and count errors emitted by the pipeline.
    std::vector<std::string> kept_locations;
    int errors_seen = 0;
    polymech::GridsearchCallbacks callbacks;
    callbacks.onEvent = [&](const std::string &type, const std::string &json) {
        if (type == "location") {
            kept_locations.push_back(json);
        } else if (type == "error") {
            errors_seen++;
        }
    };

    const int rc = polymech::run_cmd_gridsearch_ipc(payload, "test-sample-job",
                                                    callbacks, false, "");
    REQUIRE(rc == 0);
    REQUIRE(errors_seen == 0);
    std::cout << "Sample (ABW) type filter test: " << kept_locations.size()
              << " locations.\n";
}