cleanup kbot cpp :)
This commit is contained in:
parent
2f7c18adb5
commit
b0916df0f5
@ -90,12 +90,6 @@ add_subdirectory(packages/http)
|
||||
add_subdirectory(packages/json)
|
||||
add_subdirectory(packages/polymech)
|
||||
add_subdirectory(packages/ipc)
|
||||
add_subdirectory(packages/geo)
|
||||
add_subdirectory(packages/gadm_reader)
|
||||
add_subdirectory(packages/grid)
|
||||
add_subdirectory(packages/search)
|
||||
add_subdirectory(packages/enrichers)
|
||||
|
||||
add_subdirectory(packages/liboai/liboai)
|
||||
|
||||
add_subdirectory(packages/kbot)
|
||||
@ -103,19 +97,15 @@ add_subdirectory(packages/kbot)
|
||||
# ── Sources ──────────────────────────────────────────────────────────────────
|
||||
add_executable(${PROJECT_NAME}
|
||||
src/main.cpp
|
||||
src/cmd_gridsearch.cpp
|
||||
src/cmd_gridsearch-filters.cpp
|
||||
src/cmd_gridsearch-uds.cpp
|
||||
src/cmd_gridsearch-postgres.cpp
|
||||
src/cmd_kbot.cpp
|
||||
src/gridsearch_serialize.cpp
|
||||
src/cmd_kbot_uds.cpp
|
||||
src/sys_metrics.cpp
|
||||
)
|
||||
|
||||
# Output file name is kbot.exe / kbot (not kbot-cli)
|
||||
set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "kbot")
|
||||
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers kbot)
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc search kbot)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} PRIVATE
|
||||
${asio_SOURCE_DIR}/asio/include
|
||||
|
||||
@ -21,11 +21,6 @@
|
||||
"kbot:ai": ".\\dist\\kbot.exe kbot ai --prompt \"hi\"",
|
||||
"kbot:run": ".\\dist\\kbot.exe kbot run --list",
|
||||
"test:ipc": "node orchestrator/test-ipc.mjs",
|
||||
"test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs",
|
||||
"test:gridsearch-filter-ipc": "cmake --build build/release --target test_gridsearch_ipc && .\\dist\\test_gridsearch_ipc.exe",
|
||||
"test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs",
|
||||
"test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs",
|
||||
"test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs",
|
||||
"test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe"
|
||||
},
|
||||
"repository": {
|
||||
|
||||
@ -1,4 +0,0 @@
|
||||
# Static library for the enrichment pipeline (site metadata + email scraping).
add_library(enrichers STATIC src/enrichers.cpp)

# Public headers live under include/ (consumers inherit the include path).
target_include_directories(enrichers PUBLIC include)
# PUBLIC so dependents also link the packages used by the enrichers API.
target_link_libraries(enrichers PUBLIC http html json logger)
|
||||
@ -1,162 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace enrichers {
|
||||
|
||||
// ── Status codes ────────────────────────────────────────────────────────────
|
||||
|
||||
// Outcome of enriching a single location.
enum class EnrichStatus {
  OK,            // at least one email was found
  NO_EMAIL,      // default: pipeline produced no email
  META_TIMEOUT,  // metadata fetch hit its time budget
  EMAIL_TIMEOUT, // NOTE(review): not set in visible code — presumably the email phase timed out
  FETCH_ERROR,   // missing website, or the HTTP fetch failed
  NO_PAGES,      // no candidate contact pages were discovered
  ERROR,         // NOTE(review): not set in visible code — unclassified failure
};

// Stable string name for a status (e.g. "OK"); "UNKNOWN" for out-of-range.
const char *status_string(EnrichStatus s);
|
||||
|
||||
// ── Data types ──────────────────────────────────────────────────────────────
|
||||
|
||||
// Per-page diagnostic record produced while scraping a contact page.
struct PageError {
  std::string url;
  std::string status; // "SEARCHED_EMAIL", "FAILED", ...
  std::string method; // "GET", "SCRAPELESS", ...
  std::string error;  // human-readable failure detail, if any
  int http_status = 0; // 0 when no HTTP response was obtained
  std::vector<std::string> emails; // emails found on this page (may be empty)
};
|
||||
|
||||
// A social-media profile link discovered on a page.
struct SocialLink {
  std::string platform; // "instagram", "facebook", "linkedin", ...
  std::string url;      // resolved absolute URL
};
|
||||
|
||||
// Metadata extracted from one fetched page.
struct SiteMeta {
  std::string title;       // <title>, falling back to og:title
  std::string description; // meta description, falling back to og:description
  std::string og_image;
  std::string canonical;
  std::vector<SocialLink> socials;
  std::vector<std::string> internal_pages; // discovered internal hrefs
  std::vector<std::string> emails; // from body text and mailto: links
  std::string body_text;
  std::string body_html;
  std::map<std::string, std::string> sites; // url -> body_md
  int http_status = 0;     // status of the fetch (0 when none)
  std::string fetch_error; // non-empty when the fetch failed
  std::vector<std::string> json_ld; // JSON-LD script payloads
};
|
||||
|
||||
// Result of running the enrichment pipeline on one location.
struct EnrichedNode {
  int idx = 0;
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  int pages_found = 0;   // candidate contact pages discovered
  int pages_scraped = 0;
  std::vector<std::string> emails;
  std::vector<SocialLink> socials;
  int meta_ms = 0;  // duration of the metadata phase
  int email_ms = 0; // duration of the email phase
  int total_ms = 0; // total pipeline duration
  EnrichStatus status = EnrichStatus::NO_EMAIL;
  std::string error; // failure detail for error statuses
  std::map<std::string, std::string> pages; // "home" → body text
  std::vector<std::string> meta_pages; // internal pages found during meta scrape
  std::vector<PageError> page_errors;  // per-page diagnostics
  std::string enricher_hash;
  std::string geo_json;
  std::map<std::string, std::string> sites; // url -> body_md
};
|
||||
|
||||
// ── Configuration ───────────────────────────────────────────────────────────
|
||||
|
||||
// Tunables for the enrichment pipeline.
struct EnrichConfig {
  bool enable_homepage_md = true; // convert homepage HTML to markdown
  int meta_timeout_ms = 10000;    // budget for the metadata fetch
  int email_timeout_ms = 15000;
  int email_page_timeout_ms = 10000; // per-page budget in the email phase
  int email_max_pages = 8;           // cap on contact pages scraped
  int email_abort_after = 1;

  /// Scrapeless API key — if set, pages that yield no emails via plain
  /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping
  /// API (JS rendering). Leave empty to disable the fallback.
  std::string scrapeless_key;

  std::string bigdata_key;

  // Case-insensitive substrings that mark an internal link as a contact page.
  std::vector<std::string> contact_patterns = {
      "contact", "kontakt", "contacto", "contacta", "impression",
      "about", "impress", "impressum", "datenschutz", "privacy",
      "legal", "team", "nosotros", "empresa", "sobre",
  };
  // NOTE(review): the current implementation no longer probes these paths
  // ("No more probe paths" in enrich_location) — kept for compatibility?
  std::vector<std::string> probe_paths = {
      "/contact", "/contacto", "/kontakt", "/contacta",
      "/about", "/about-us", "/impressum",
  };

  std::string meta_scraper; // "SCRAPELESS" selects the JS-rendering scraper
  int meta_concurrency = 5;
  int meta_idle_timeout = 60;
};
|
||||
|
||||
// ── Location input ──────────────────────────────────────────────────────────
|
||||
|
||||
// Input record describing one location to enrich.
struct LocationInput {
  std::string title;
  std::string place_id;
  std::string website; // empty ⇒ enrichment fails fast with FETCH_ERROR
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  double lat = 0;
  double lng = 0;
};
|
||||
|
||||
// ── Core API ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Check if a candidate string looks like a real email address.
/// Structural checks plus heuristics rejecting asset filenames, placeholder
/// addresses, hex-hash local parts and numeric-only domains.
bool is_likely_email(const std::string &candidate);

/// Extract all email addresses from a text body.
/// Results are lower-cased, validated with is_likely_email() and
/// de-duplicated in order of first appearance.
std::vector<std::string> extract_emails(const std::string &text);

/// Scrape metadata from a website URL (static HTML via libcurl + lexbor).
SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000);

/// Scrape emails from a single page URL.
/// NOTE(review): the implementation defines a three-parameter overload
/// (url, timeout_ms, int &out_status_code) — this two-parameter signature
/// has no visible definition; confirm it exists or align the two.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms = 10000);

/// Fetch a page via Scrapeless Universal Scraping API (JS rendering),
/// then extract emails from the rendered HTML. Returns empty if key is
/// blank or the API call fails.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms = 15000);

/// Scrape metadata from a website URL via Scrapeless Universal API (JS
/// rendering).
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key,
                                int timeout_ms = 15000);

/// Full enrichment pipeline for a single location: meta → email.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg = {});

/// Resolve a URL relative to a base URL.
/// Returns "" for mailto:/tel:/javascript: links and bare fragments.
std::string resolve_url(const std::string &base, const std::string &href);
|
||||
|
||||
} // namespace enrichers
|
||||
@ -1,800 +0,0 @@
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "html/html.h"
|
||||
#include "http/http.h"
|
||||
#include "logger/logger.h"
|
||||
#include "json/json.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
|
||||
namespace enrichers {
|
||||
|
||||
// ── Status string ───────────────────────────────────────────────────────────
|
||||
|
||||
// Translate an EnrichStatus into its stable log/wire name.
const char *status_string(EnrichStatus s) {
  switch (s) {
  case EnrichStatus::OK:            return "OK";
  case EnrichStatus::NO_EMAIL:      return "NO_EMAIL";
  case EnrichStatus::META_TIMEOUT:  return "META_TIMEOUT";
  case EnrichStatus::EMAIL_TIMEOUT: return "EMAIL_TIMEOUT";
  case EnrichStatus::FETCH_ERROR:   return "FETCH_ERROR";
  case EnrichStatus::NO_PAGES:      return "NO_PAGES";
  case EnrichStatus::ERROR:         return "ERROR";
  }
  // Defensive fallback for out-of-range values cast into the enum.
  return "UNKNOWN";
}
|
||||
|
||||
// ── Timing helper ───────────────────────────────────────────────────────────
|
||||
|
||||
// Milliseconds elapsed since `t0`, truncated to int.
static int elapsed_ms(std::chrono::steady_clock::time_point t0) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  const auto delta = std::chrono::steady_clock::now() - t0;
  return static_cast<int>(duration_cast<milliseconds>(delta).count());
}
|
||||
|
||||
// ── Email extraction ────────────────────────────────────────────────────────
|
||||
|
||||
// Pre-compiled email pattern.
// NOTE(review): unused by the visible code — extract_emails() scans
// manually instead; confirm a later portion of this file still uses it
// before removing.
static const std::regex
    EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
             std::regex::optimize);

// Asset extensions that disqualify an email-like string
// (e.g. retina image names such as "logo@2x.png").
static const std::vector<std::string> ASSET_EXTS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    ".avif", ".css", ".js", ".woff", ".woff2", ".ttf",
    ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico",
};
|
||||
|
||||
// Return a byte-wise lower-cased copy of `s` (C locale).
static std::string to_lower(const std::string &s) {
  std::string lowered;
  lowered.reserve(s.size());
  for (unsigned char c : s) {
    lowered.push_back(static_cast<char>(std::tolower(c)));
  }
  return lowered;
}
|
||||
|
||||
bool is_likely_email(const std::string &candidate) {
|
||||
if (candidate.size() < 5 || candidate.size() > 254)
|
||||
return false;
|
||||
if (candidate.find("..") != std::string::npos)
|
||||
return false;
|
||||
auto at_pos = candidate.find('@');
|
||||
if (at_pos == std::string::npos || at_pos == 0 ||
|
||||
at_pos == candidate.size() - 1)
|
||||
return false;
|
||||
|
||||
auto lower = to_lower(candidate);
|
||||
|
||||
// Reject asset-like extensions
|
||||
for (auto &ext : ASSET_EXTS) {
|
||||
if (lower.size() >= ext.size() &&
|
||||
lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject common placeholders
|
||||
if (lower.find("example") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("sentry") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("test") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("placeholder") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("wixpress.com") != std::string::npos)
|
||||
return false;
|
||||
|
||||
// Reject if local part is pure hex hash (8+ hex chars)
|
||||
if (at_pos >= 8) {
|
||||
auto local = lower.substr(0, at_pos);
|
||||
bool all_hex = std::all_of(local.begin(), local.end(), [](char c) {
|
||||
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
|
||||
});
|
||||
if (all_hex)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reject if domain part looks numeric-only (e.g. 1234@5678)
|
||||
auto domain = lower.substr(at_pos + 1);
|
||||
auto dot_pos = domain.find('.');
|
||||
if (dot_pos == std::string::npos)
|
||||
return false;
|
||||
if (domain.length() - dot_pos <= 2)
|
||||
return false; // Minimum 2 chars for TLD
|
||||
|
||||
auto domPart = domain.substr(0, dot_pos);
|
||||
bool all_digits =
|
||||
!domPart.empty() &&
|
||||
std::all_of(domPart.begin(), domPart.end(),
|
||||
[](unsigned char c) { return std::isdigit(c); });
|
||||
if (all_digits)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// True for characters allowed inside the local/domain parts we scan for.
static bool is_valid_email_char(char c) {
  switch (c) {
  case '.':
  case '_':
  case '%':
  case '+':
  case '-':
    return true;
  default:
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
  }
}
|
||||
|
||||
/// Extract all email addresses from `text`.
/// Scans for '@' characters, expands outwards over valid email characters,
/// trims stray punctuation, then validates each candidate with
/// is_likely_email(). Returns lower-cased results, de-duplicated in order
/// of first appearance.
std::vector<std::string> extract_emails(const std::string &text) {
  std::vector<std::string> results;
  if (text.empty())
    return results;

  std::set<std::string> seen;
  size_t pos = 0;

  while ((pos = text.find('@', pos)) != std::string::npos) {
    // '@' at the very edge of the text cannot be part of an email.
    if (pos == 0 || pos == text.length() - 1) {
      pos++;
      continue;
    }

    // Scan backwards
    size_t start = pos;
    while (start > 0 && is_valid_email_char(text[start - 1])) {
      start--;
    }

    // Scan forwards
    size_t end = pos;
    while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) {
      end++;
    }

    // Require at least one character on each side of the '@'.
    if (start < pos && end > pos) {
      std::string candidate = text.substr(start, end - start + 1);

      // Strip trailing dots/hyphens eagerly grabbed
      // (also pull `end` back so the resume position stays consistent).
      while (!candidate.empty() &&
             (candidate.back() == '.' || candidate.back() == '-')) {
        candidate.pop_back();
        end--;
      }

      // Strip leading dots/hyphens
      size_t local_start = 0;
      while (local_start < candidate.length() &&
             (candidate[local_start] == '.' || candidate[local_start] == '-')) {
        local_start++;
      }
      if (local_start > 0) {
        candidate = candidate.substr(local_start);
      }

      // Normalize, validate, de-duplicate.
      std::string lower = to_lower(candidate);
      if (is_likely_email(lower)) {
        if (seen.insert(lower).second) {
          results.push_back(lower);
        }
      }
    }
    pos = end + 1; // resume scanning after this candidate
  }

  return results;
}
|
||||
|
||||
// ── URL resolution ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Resolve `href` against `base`, mirroring basic browser link semantics.
/// Returns an empty string for non-navigational schemes (mailto:, tel:,
/// javascript:), bare fragments, and bases with no recognizable scheme.
std::string resolve_url(const std::string &base, const std::string &href) {
  if (href.empty())
    return {};

  // Absolute URLs pass through untouched.
  if (href.rfind("http://", 0) == 0 || href.rfind("https://", 0) == 0)
    return href;

  // Protocol-relative: inherit the base's scheme (default to https).
  if (href.rfind("//", 0) == 0) {
    const auto slashes = base.find("//");
    return (slashes != std::string::npos) ? base.substr(0, slashes) + href
                                          : "https:" + href;
  }

  // Non-navigational schemes and fragments resolve to nothing.
  if (href[0] == '#' || href.rfind("mailto:", 0) == 0 ||
      href.rfind("tel:", 0) == 0 || href.rfind("javascript:", 0) == 0)
    return {};

  // Derive the origin (scheme://host) from the base URL.
  const auto scheme = base.find("://");
  if (scheme == std::string::npos)
    return {};
  const auto path_start = base.find('/', scheme + 3);
  const std::string origin =
      (path_start == std::string::npos) ? base : base.substr(0, path_start);

  // Root-relative path.
  if (href[0] == '/')
    return origin + href;

  // Path-relative: append after the base's last path segment.
  if (path_start != std::string::npos) {
    const auto last_slash = base.rfind('/');
    if (last_slash > scheme + 2)
      return base.substr(0, last_slash + 1) + href;
  }
  return origin + "/" + href;
}
|
||||
|
||||
// ── Social link classification ──────────────────────────────────────────────
|
||||
|
||||
static std::string classify_social(const std::string &url) {
|
||||
auto lower = to_lower(url);
|
||||
if (lower.find("instagram.com") != std::string::npos)
|
||||
return "instagram";
|
||||
if (lower.find("facebook.com") != std::string::npos)
|
||||
return "facebook";
|
||||
if (lower.find("linkedin.com") != std::string::npos)
|
||||
return "linkedin";
|
||||
if (lower.find("twitter.com") != std::string::npos ||
|
||||
lower.find("x.com") != std::string::npos)
|
||||
return "twitter";
|
||||
if (lower.find("youtube.com") != std::string::npos)
|
||||
return "youtube";
|
||||
if (lower.find("tiktok.com") != std::string::npos)
|
||||
return "tiktok";
|
||||
if (lower.find("pinterest.com") != std::string::npos)
|
||||
return "pinterest";
|
||||
if (lower.find("github.com") != std::string::npos)
|
||||
return "github";
|
||||
return {};
|
||||
}
|
||||
|
||||
// ── Same-origin check ───────────────────────────────────────────────────────
|
||||
|
||||
// Return "scheme://host" for `url`, or "" when it has no "://" scheme.
static std::string get_origin(const std::string &url) {
  const auto scheme = url.find("://");
  if (scheme == std::string::npos)
    return {};
  const auto path = url.find('/', scheme + 3);
  if (path == std::string::npos)
    return url;
  return url.substr(0, path);
}
|
||||
|
||||
static bool is_same_origin(const std::string &base_url,
|
||||
const std::string &href) {
|
||||
auto bo = to_lower(get_origin(base_url));
|
||||
auto ho = to_lower(get_origin(href));
|
||||
if (bo.empty() || ho.empty())
|
||||
return false;
|
||||
// Strip www. for comparison
|
||||
auto strip_www = [](std::string &s) {
|
||||
auto pos = s.find("://www.");
|
||||
if (pos != std::string::npos) {
|
||||
s = s.substr(0, pos + 3) + s.substr(pos + 7);
|
||||
}
|
||||
};
|
||||
strip_www(bo);
|
||||
strip_www(ho);
|
||||
return bo == ho;
|
||||
}
|
||||
|
||||
// ── Contact page matching ───────────────────────────────────────────────────
|
||||
|
||||
static bool matches_contact_pattern(const std::string &url,
|
||||
const std::vector<std::string> &patterns) {
|
||||
auto lower = to_lower(url);
|
||||
for (auto &pat : patterns) {
|
||||
if (lower.find(to_lower(pat)) != std::string::npos)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ── Shared HTML parsing logic for Meta ──────────────────────────────────────
|
||||
|
||||
// Build a SiteMeta from an already-fetched HTML body.
// On a non-empty `fetch_error`, returns immediately with only http_status
// and fetch_error set. Otherwise parses title/meta/canonical/body/JSON-LD
// via the html:: helpers and classifies every link into mailto: emails,
// social profiles, or same-origin internal pages.
static SiteMeta parse_meta_html(const std::string &url, int http_status,
                                const std::string &html_body,
                                const std::string &fetch_error) {
  SiteMeta meta;
  meta.http_status = http_status;

  if (!fetch_error.empty()) {
    meta.fetch_error = fetch_error;
    return meta;
  }

  meta.body_html = html_body;

  // Parse with lexbor helpers
  meta.title = html::get_title(html_body);
  meta.description = html::get_meta(html_body, "description");
  meta.og_image = html::get_meta(html_body, "og:image");
  meta.canonical = html::get_canonical(html_body);
  meta.body_text = html::get_body_text(html_body);
  meta.json_ld = html::get_json_ld(html_body);

  // OG fallbacks
  if (meta.description.empty())
    meta.description = html::get_meta(html_body, "og:description");
  if (meta.title.empty())
    meta.title = html::get_meta(html_body, "og:title");

  // Links — classify into social / internal / mailto
  auto links = html::get_links(html_body);
  std::set<std::string> seen_pages;

  // Extract emails from body text (much smaller than raw HTML)
  meta.emails = extract_emails(meta.body_text);

  for (auto &lk : links) {
    // mailto: links — strip the scheme and any query, then validate.
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string email = lk.href.substr(7);
      // Strip anything after ? (like ?subject=...)
      auto q = email.find('?');
      if (q != std::string::npos)
        email = email.substr(0, q);
      // Clean it
      email = to_lower(email);
      if (is_likely_email(email)) {
        // Linear de-dup against body-text emails (list is small).
        if (std::find(meta.emails.begin(), meta.emails.end(), email) ==
            meta.emails.end()) {
          meta.emails.push_back(email);
        }
      }
      continue;
    }

    auto resolved = resolve_url(url, lk.href);
    if (resolved.empty())
      continue;

    auto social = classify_social(resolved);
    if (!social.empty()) {
      meta.socials.push_back({social, resolved});
      continue;
    }

    // Same-origin links become candidate internal pages.
    if (is_same_origin(url, resolved)) {
      // Strip fragment (#) from URL
      auto hash_pos = resolved.find('#');
      if (hash_pos != std::string::npos) {
        resolved = resolved.substr(0, hash_pos);
      }
      if (!resolved.empty() && seen_pages.insert(resolved).second) {
        meta.internal_pages.push_back(resolved);
      }
    }
  }

  return meta;
}
|
||||
|
||||
// ── scrape_meta ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Fetch `url` over plain HTTP with a desktop-browser User-Agent and parse
// the response into a SiteMeta. Transport errors (status < 0) and HTTP
// 4xx/5xx are reported via SiteMeta::fetch_error (carrying the body).
SiteMeta scrape_meta(const std::string &url, int timeout_ms) {
  http::GetOptions options;
  options.timeout_ms = timeout_ms;
  options.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36";

  const auto response = http::get(url, options);
  const bool failed = response.status_code < 0 || response.status_code >= 400;
  return parse_meta_html(url, static_cast<int>(response.status_code),
                         response.body,
                         failed ? response.body : std::string{});
}
|
||||
|
||||
// ── scrape_emails_from_page ─────────────────────────────────────────────────
|
||||
|
||||
// Fetch `url` and return lower-cased, de-duplicated emails found in its
// body text and mailto: links. `out_status_code` receives the HTTP status
// (status < 0 or >= 400 is treated as a failed fetch → empty result).
// NOTE(review): the header declares a two-parameter overload with a default
// timeout; this three-parameter definition does not match it — verify.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";

  auto resp = http::get(url, opts);
  out_status_code = static_cast<int>(resp.status_code);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    return {};
  }

  // Extract body text then find emails
  auto text = html::get_body_text(resp.body);
  auto from_text = extract_emails(text);

  // Extract mailto: links from HTML directly without regexing the huge string
  auto links = html::get_links(resp.body);
  std::set<std::string> seen(from_text.begin(), from_text.end());

  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      // Drop any ?subject=... query suffix before validating.
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }

  return from_text;
}
|
||||
|
||||
static std::string extract_scrapeless_html(const std::string &json_body) {
|
||||
std::string data = json::get_string(json_body, "data");
|
||||
if (data.empty()) {
|
||||
return json_body; // Fallback to raw response if not found
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// Scrape metadata through the Scrapeless Universal Scraping API (JS
// rendering). With an empty key, returns a SiteMeta whose fetch_error is
// "missing api key". On API failure the raw error body is forwarded as the
// fetch_error; on success the rendered HTML is parsed like a normal fetch.
// NOTE(review): `url` is spliced into the JSON payload without escaping —
// a '"' or '\' in the URL corrupts the request body; consider escaping.
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key, int timeout_ms) {
  if (api_key.empty())
    return parse_meta_html(url, 0, "", "missing api key");

  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        url +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";

  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout

  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);

  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
    logger::error("[meta:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  fetch_err);
    return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                           fetch_err);
  }

  std::string rendered_html = extract_scrapeless_html(resp.body);
  return parse_meta_html(url, static_cast<int>(resp.status_code), rendered_html,
                         "");
}
|
||||
|
||||
// Fetch `url` through the Scrapeless Universal Scraping API (JS rendering)
// and extract emails from the rendered HTML. Returns empty when the key is
// blank or the API call fails (silent fallback).
// NOTE(review): `url` is spliced into the JSON payload without escaping —
// a '"' or '\' in the URL corrupts the request body; consider escaping.
// NOTE(review): the mailto-extraction loop below duplicates the one in
// scrape_emails_from_page — candidate for a shared helper.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms) {
  if (api_key.empty())
    return {};

  // Build the Scrapeless Universal Scraping API request body.
  // We ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        url +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";

  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout

  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);

  if (resp.status_code < 0 || resp.status_code >= 400) {
    logger::error("[email:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  resp.body);
    return {}; // API error — silent fallback
  }

  std::string rendered_html = extract_scrapeless_html(resp.body);

  // Parse and extract emails from the rendered HTML
  auto text = html::get_body_text(rendered_html);
  auto from_text = extract_emails(text);

  // Fast mailto extraction instead of HTML regex
  auto links = html::get_links(rendered_html);
  std::set<std::string> seen(from_text.begin(), from_text.end());

  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      // Drop any ?subject=... query suffix before validating.
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }

  return from_text;
}
|
||||
|
||||
// ── enrich_location ─────────────────────────────────────────────────────────
|
||||
|
||||
EnrichedNode enrich_location(const LocationInput &loc,
|
||||
const EnrichConfig &cfg) {
|
||||
auto t0 = std::chrono::steady_clock::now();
|
||||
|
||||
EnrichedNode node;
|
||||
node.title = loc.title;
|
||||
node.place_id = loc.place_id;
|
||||
node.website = loc.website;
|
||||
node.address = loc.address;
|
||||
node.type = loc.type;
|
||||
node.grid_area = loc.grid_area;
|
||||
node.grid_gid = loc.grid_gid;
|
||||
node.status = EnrichStatus::NO_EMAIL;
|
||||
|
||||
if (loc.website.empty()) {
|
||||
node.status = EnrichStatus::FETCH_ERROR;
|
||||
node.error = "no website";
|
||||
node.total_ms = elapsed_ms(t0);
|
||||
return node;
|
||||
}
|
||||
|
||||
// ── Phase 1: Meta scrape ────────────────────────────────────────────────
|
||||
|
||||
auto meta_t0 = std::chrono::steady_clock::now();
|
||||
SiteMeta meta;
|
||||
bool meta_timed_out = false;
|
||||
|
||||
try {
|
||||
if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) {
|
||||
logger::debug("[meta:scrapeless] Fetching " + loc.website);
|
||||
meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key,
|
||||
cfg.meta_timeout_ms);
|
||||
} else {
|
||||
logger::debug("[meta:http] Fetching " + loc.website);
|
||||
meta = scrape_meta(loc.website, cfg.meta_timeout_ms);
|
||||
}
|
||||
} catch (...) {
|
||||
meta.fetch_error = "exception during meta scrape";
|
||||
meta_timed_out = true;
|
||||
}
|
||||
node.meta_ms = elapsed_ms(meta_t0);
|
||||
|
||||
// Check if meta took too long (within threshold of timeout)
|
||||
if (node.meta_ms >= cfg.meta_timeout_ms - 1000) {
|
||||
meta_timed_out = true;
|
||||
}
|
||||
|
||||
// logger::info("[" + std::string(loc.title.empty() ? loc.website : loc.title)
|
||||
// + "] Meta fetch took " + std::to_string(node.meta_ms) + "ms. Links found: "
|
||||
// + std::to_string(meta.internal_pages.size()));
|
||||
|
||||
if (!meta.body_text.empty())
|
||||
node.pages["home"] = meta.body_text;
|
||||
if (cfg.enable_homepage_md && !meta.body_html.empty()) {
|
||||
// Cap HTML body at 512 KB to prevent stack overflow in recursive html2md
|
||||
// parser
|
||||
static constexpr size_t MAX_HTML_BYTES = 512 * 1024;
|
||||
if (meta.body_html.size() > MAX_HTML_BYTES) {
|
||||
logger::warn("[" + loc.title + "] body_html too large (" +
|
||||
std::to_string(meta.body_html.size() / 1024) +
|
||||
" KB), skipping markdown conversion");
|
||||
} else {
|
||||
try {
|
||||
node.sites[loc.website] = html::to_markdown(meta.body_html);
|
||||
} catch (const std::exception &e) {
|
||||
logger::warn("[" + loc.title +
|
||||
"] html::to_markdown failed: " + e.what());
|
||||
} catch (...) {
|
||||
logger::warn("[" + loc.title +
|
||||
"] html::to_markdown crashed (unknown exception)");
|
||||
}
|
||||
}
|
||||
}
|
||||
node.meta_pages = meta.internal_pages;
|
||||
node.pages_found = static_cast<int>(meta.internal_pages.size());
|
||||
node.socials = meta.socials;
|
||||
|
||||
if (!meta.fetch_error.empty()) {
|
||||
node.error = meta.fetch_error;
|
||||
node.status = EnrichStatus::FETCH_ERROR;
|
||||
node.total_ms = elapsed_ms(t0);
|
||||
return node;
|
||||
}
|
||||
|
||||
// If meta already found emails, we're done (early exit like TS)
|
||||
if (!meta.emails.empty()) {
|
||||
node.emails = meta.emails;
|
||||
node.status = EnrichStatus::OK;
|
||||
node.total_ms = elapsed_ms(t0);
|
||||
return node;
|
||||
}
|
||||
|
||||
// ── Build contact page list ─────────────────────────────────────────────
|
||||
|
||||
std::vector<std::string> contact_pages;
|
||||
std::set<std::string> seen_urls;
|
||||
|
||||
for (auto &page_url : meta.internal_pages) {
|
||||
if (matches_contact_pattern(page_url, cfg.contact_patterns)) {
|
||||
if (seen_urls.insert(page_url).second) {
|
||||
contact_pages.push_back(page_url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No more probe paths. If we found 0 contact pages, we just give up or time
|
||||
// out.
|
||||
|
||||
node.pages_found = static_cast<int>(contact_pages.size());
|
||||
|
||||
if (contact_pages.empty()) {
|
||||
logger::debug("[" +
|
||||
std::string(loc.title.empty() ? loc.website : loc.title) +
|
||||
"] No contact pages found.");
|
||||
node.status =
|
||||
meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES;
|
||||
node.total_ms = elapsed_ms(t0);
|
||||
return node;
|
||||
}
|
||||
|
||||
logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) +
|
||||
"] Contact pages to scrape: " +
|
||||
std::to_string(contact_pages.size()) + " (parallel)");
|
||||
|
||||
// ── Phase 2: Email scrape per contact page ──────────────────────────────
|
||||
|
||||
struct AsyncResult {
|
||||
std::string url;
|
||||
std::vector<PageError> errors;
|
||||
std::vector<std::string> emails;
|
||||
int ms;
|
||||
};
|
||||
|
||||
int pages_to_scrape =
|
||||
std::min(static_cast<int>(contact_pages.size()), cfg.email_max_pages);
|
||||
|
||||
std::vector<std::thread> contact_threads;
|
||||
std::vector<AsyncResult> contact_results(pages_to_scrape);
|
||||
|
||||
auto email_t0 = std::chrono::steady_clock::now();
|
||||
|
||||
for (int i = 0; i < pages_to_scrape; ++i) {
|
||||
auto page_url = contact_pages[i];
|
||||
|
||||
contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() {
|
||||
auto start = std::chrono::steady_clock::now();
|
||||
AsyncResult res;
|
||||
res.url = page_url;
|
||||
|
||||
PageError pe1;
|
||||
pe1.url = page_url;
|
||||
pe1.method = "GET";
|
||||
|
||||
int http_status = 0;
|
||||
try {
|
||||
// logger::debug("[email:http] Fetching " + page_url);
|
||||
auto page_emails = scrape_emails_from_page(
|
||||
page_url, cfg.email_page_timeout_ms, http_status);
|
||||
pe1.emails = page_emails;
|
||||
logger::debug("[" +
|
||||
std::string(loc.title.empty() ? loc.website : loc.title) +
|
||||
"] HTTP fetch finished code " +
|
||||
std::to_string(http_status) + " for " + page_url);
|
||||
|
||||
if (page_emails.empty()) {
|
||||
if (http_status == 404 || http_status == 400 || http_status == 500) {
|
||||
pe1.status = "NOT_FOUND";
|
||||
pe1.error = "HTTP " + std::to_string(http_status);
|
||||
} else {
|
||||
pe1.status = "AXIOS_NO_EMAIL";
|
||||
res.errors.push_back(pe1); // pushed before scrapeless
|
||||
|
||||
if (cfg.meta_scraper == "SCRAPELESS" &&
|
||||
!cfg.scrapeless_key.empty()) {
|
||||
PageError pe2;
|
||||
pe2.url = page_url;
|
||||
pe2.method = "SCRAPELESS";
|
||||
try {
|
||||
logger::debug("[email:scrapeless] Fallback scraping " +
|
||||
page_url);
|
||||
auto s_emails =
|
||||
scrape_emails_scrapeless(page_url, cfg.scrapeless_key,
|
||||
cfg.email_page_timeout_ms + 5000);
|
||||
pe2.emails = s_emails;
|
||||
pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL";
|
||||
if (!s_emails.empty())
|
||||
res.emails = s_emails;
|
||||
logger::debug(
|
||||
"[" +
|
||||
std::string(loc.title.empty() ? loc.website : loc.title) +
|
||||
"] Scrapeless fallback finished for " + page_url);
|
||||
} catch (...) {
|
||||
pe2.status = "FAILED";
|
||||
pe2.error = "scrapeless exception";
|
||||
}
|
||||
res.errors.push_back(pe2);
|
||||
}
|
||||
res.ms = elapsed_ms(start);
|
||||
contact_results[i] = res;
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
pe1.status = "SEARCHED_EMAIL";
|
||||
res.emails = page_emails;
|
||||
}
|
||||
} catch (...) {
|
||||
pe1.status = "AXIOS_FAILED";
|
||||
pe1.error = "exception";
|
||||
}
|
||||
// Only insert pe1 if we didn't already push it during fallback
|
||||
if (res.errors.empty() || res.errors[0].method != "GET") {
|
||||
res.errors.insert(res.errors.begin(), pe1);
|
||||
}
|
||||
res.ms = elapsed_ms(start);
|
||||
contact_results[i] = res;
|
||||
});
|
||||
}
|
||||
|
||||
for (auto &t : contact_threads) {
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
}
|
||||
|
||||
std::set<std::string> all_emails;
|
||||
int pages_scraped = 0;
|
||||
|
||||
for (auto &res : contact_results) {
|
||||
pages_scraped++;
|
||||
for (auto &pe : res.errors) {
|
||||
node.page_errors.push_back(std::move(pe));
|
||||
}
|
||||
for (auto &e : res.emails) {
|
||||
all_emails.insert(e);
|
||||
}
|
||||
}
|
||||
|
||||
node.email_ms = elapsed_ms(email_t0);
|
||||
node.pages_scraped = pages_scraped;
|
||||
|
||||
// Merge emails
|
||||
node.emails.assign(all_emails.begin(), all_emails.end());
|
||||
|
||||
// Final status
|
||||
bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000;
|
||||
if (!node.emails.empty()) {
|
||||
node.status = EnrichStatus::OK;
|
||||
} else if (email_timed_out) {
|
||||
node.status = EnrichStatus::EMAIL_TIMEOUT;
|
||||
} else if (meta_timed_out) {
|
||||
node.status = EnrichStatus::META_TIMEOUT;
|
||||
} else {
|
||||
node.status = EnrichStatus::NO_EMAIL;
|
||||
}
|
||||
|
||||
node.total_ms = elapsed_ms(t0);
|
||||
return node;
|
||||
}
|
||||
|
||||
} // namespace enrichers
|
||||
@ -1,6 +0,0 @@
|
||||
# Static library for reading pre-cached GADM boundary GeoJSON files.
add_library(gadm_reader STATIC src/gadm_reader.cpp)

target_include_directories(gadm_reader PUBLIC include)

# Depends on geo (for Coord type) and json (for RapidJSON)
target_link_libraries(gadm_reader PUBLIC geo json)
|
||||
@ -1,75 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace gadm {
|
||||
|
||||
// ── Feature (mirrors TS GridFeature) ────────────────────────────────────────
|
||||
|
||||
// One GADM administrative-area polygon plus its cached GHS enrichment values.
// Populated by parse_feature() from a cached boundary GeoJSON file.
struct Feature {
  std::string gid;   // e.g. "ABW", "AFG.1.1_1"
  std::string name;  // e.g. "Aruba", "Baharak"
  int level = 0;     // GADM admin level (inferred: number of '.' in gid)

  // Outer ring + holes (MultiPolygon flattened to rings)
  std::vector<std::vector<geo::Coord>> rings;

  // Bounding box (computed from rings[0], the outer ring)
  geo::BBox bbox;

  // GHS enrichment (parsed from cached JSON; 0 when the property is absent)
  double ghsPopulation = 0;
  double ghsBuiltWeight = 0;
  double ghsPopMaxDensity = 0;
  double ghsBuiltMax = 0;

  geo::Coord ghsPopCenter;    // from "ghsPopCenter" property, [lon, lat]
  geo::Coord ghsBuiltCenter;  // from "ghsBuiltCenter" property, [lon, lat]

  // Weighted centers: [lon, lat, weight]
  std::vector<std::array<double, 3>> ghsPopCenters;
  std::vector<std::array<double, 3>> ghsBuiltCenters;

  // Computed from geometry (outer ring only)
  double areaSqKm = 0;

  bool isOuter = true;  // mirrors the "isOuter" property; defaults to true when absent
};
|
||||
|
||||
// ── Result ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// Result of loading one boundary file: parsed features on success,
// otherwise a non-empty error string (features left empty).
struct BoundaryResult {
  std::vector<Feature> features;
  std::string error; // empty on success
};
|
||||
|
||||
// ── API ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Load a pre-cached GADM boundary file.
|
||||
///
|
||||
/// Tries these file paths in order:
/// 1. cacheDir/{countryCode}/boundary_{gid}_{targetLevel}.json (nested layout)
/// 2. cacheDir/boundary_{gid}_{targetLevel}.json (flat-layout fallback)
|
||||
///
|
||||
/// Returns a BoundaryResult with parsed features or an error string.
|
||||
BoundaryResult load_boundary(
|
||||
const std::string& gid,
|
||||
int targetLevel,
|
||||
const std::string& cacheDir = "cache/gadm"
|
||||
);
|
||||
|
||||
/// Load a boundary file directly by path.
|
||||
BoundaryResult load_boundary_file(const std::string& filepath);
|
||||
|
||||
/// Extract the ISO country code from a GID (e.g. "AFG.1.1_1" → "AFG").
|
||||
std::string country_code(const std::string& gid);
|
||||
|
||||
/// Infer the GADM level from a GID string.
|
||||
/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2, etc.
|
||||
int infer_level(const std::string& gid);
|
||||
|
||||
} // namespace gadm
|
||||
@ -1,231 +0,0 @@
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
|
||||
namespace gadm {
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Extract the ISO country code prefix from a GID ("AFG.1.1_1" -> "AFG").
/// A GID with no '.' separator is already a bare country code.
std::string country_code(const std::string& gid) {
  const auto first_dot = gid.find('.');
  if (first_dot == std::string::npos) {
    return gid;
  }
  return gid.substr(0, first_dot);
}
|
||||
|
||||
/// Infer the GADM admin level from a GID string.
/// The level equals the number of '.' separators:
/// "ABW" -> 0, "AFG.1_1" -> 1, "AFG.1.1_1" -> 2
int infer_level(const std::string& gid) {
  return static_cast<int>(std::count(gid.begin(), gid.end(), '.'));
}
|
||||
|
||||
/// Slurp an entire file into a string (binary mode, no newline translation).
/// Returns "" when the file cannot be opened.
static std::string read_file(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  if (!in.is_open()) {
    return "";
  }
  std::ostringstream buffer;
  buffer << in.rdbuf();
  return buffer.str();
}
|
||||
|
||||
/// Parse a coord array [lon, lat] → geo::Coord
|
||||
static geo::Coord parse_coord(const rapidjson::Value& arr) {
|
||||
if (arr.IsArray() && arr.Size() >= 2) {
|
||||
return {arr[0].GetDouble(), arr[1].GetDouble()};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Parse a ring array [[lon,lat], [lon,lat], ...] → vector<Coord>
|
||||
static std::vector<geo::Coord> parse_ring(const rapidjson::Value& arr) {
|
||||
std::vector<geo::Coord> ring;
|
||||
if (!arr.IsArray()) return ring;
|
||||
ring.reserve(arr.Size());
|
||||
for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
|
||||
ring.push_back(parse_coord(arr[i]));
|
||||
}
|
||||
return ring;
|
||||
}
|
||||
|
||||
/// Parse weighted centers [[lon, lat, weight], ...]
|
||||
static std::vector<std::array<double, 3>> parse_weighted_centers(
|
||||
const rapidjson::Value& arr) {
|
||||
std::vector<std::array<double, 3>> centers;
|
||||
if (!arr.IsArray()) return centers;
|
||||
centers.reserve(arr.Size());
|
||||
for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
|
||||
const auto& c = arr[i];
|
||||
if (c.IsArray() && c.Size() >= 3) {
|
||||
centers.push_back({c[0].GetDouble(), c[1].GetDouble(), c[2].GetDouble()});
|
||||
}
|
||||
}
|
||||
return centers;
|
||||
}
|
||||
|
||||
/// Get a double from properties, with fallback
|
||||
static double get_double(const rapidjson::Value& props, const char* key,
|
||||
double fallback = 0.0) {
|
||||
if (props.HasMember(key) && props[key].IsNumber()) {
|
||||
return props[key].GetDouble();
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/// Get a bool from properties, with fallback
|
||||
static bool get_bool(const rapidjson::Value& props, const char* key,
|
||||
bool fallback = true) {
|
||||
if (props.HasMember(key) && props[key].IsBool()) {
|
||||
return props[key].GetBool();
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/// Get a string from properties, checking GID_0, GID_1, GID_2, etc.
|
||||
static std::string get_gid(const rapidjson::Value& props) {
|
||||
// Try GID_5 down to GID_0, return the most specific one found
|
||||
for (int lvl = 5; lvl >= 0; --lvl) {
|
||||
std::string key = "GID_" + std::to_string(lvl);
|
||||
if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) {
|
||||
return props[key.c_str()].GetString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/// Get the name (NAME_0, NAME_1, ... NAME_5)
|
||||
static std::string get_name(const rapidjson::Value& props) {
|
||||
for (int lvl = 5; lvl >= 0; --lvl) {
|
||||
std::string key = "NAME_" + std::to_string(lvl);
|
||||
if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) {
|
||||
return props[key.c_str()].GetString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/// Parse a single GeoJSON Feature object into a gadm::Feature.
///
/// Reads the "properties" object (GID/NAME, GHS enrichment values) and the
/// "geometry" object (Polygon or MultiPolygon rings). Missing or malformed
/// sections are skipped, leaving the corresponding fields at their defaults.
static Feature parse_feature(const rapidjson::Value& feat) {
  Feature f;

  // Properties
  if (feat.HasMember("properties") && feat["properties"].IsObject()) {
    const auto& props = feat["properties"];
    f.gid = get_gid(props);
    f.name = get_name(props);
    // Level is derived from the GID's dot count, not stored in the file.
    f.level = infer_level(f.gid);
    f.ghsPopulation = get_double(props, "ghsPopulation");
    f.ghsBuiltWeight = get_double(props, "ghsBuiltWeight");
    f.ghsPopMaxDensity = get_double(props, "ghsPopMaxDensity");
    f.ghsBuiltMax = get_double(props, "ghsBuiltMax");
    f.isOuter = get_bool(props, "isOuter");

    // Optional enrichment arrays — each parsed only when present.
    if (props.HasMember("ghsPopCenter") && props["ghsPopCenter"].IsArray()) {
      f.ghsPopCenter = parse_coord(props["ghsPopCenter"]);
    }
    if (props.HasMember("ghsBuiltCenter") && props["ghsBuiltCenter"].IsArray()) {
      f.ghsBuiltCenter = parse_coord(props["ghsBuiltCenter"]);
    }
    if (props.HasMember("ghsPopCenters") && props["ghsPopCenters"].IsArray()) {
      f.ghsPopCenters = parse_weighted_centers(props["ghsPopCenters"]);
    }
    if (props.HasMember("ghsBuiltCenters") && props["ghsBuiltCenters"].IsArray()) {
      f.ghsBuiltCenters = parse_weighted_centers(props["ghsBuiltCenters"]);
    }
  }

  // Geometry
  if (feat.HasMember("geometry") && feat["geometry"].IsObject()) {
    const auto& geom = feat["geometry"];
    std::string gtype;
    if (geom.HasMember("type") && geom["type"].IsString()) {
      gtype = geom["type"].GetString();
    }

    if (geom.HasMember("coordinates") && geom["coordinates"].IsArray()) {
      const auto& coords = geom["coordinates"];

      if (gtype == "Polygon") {
        // coordinates: [ [ring], [hole], ... ]
        for (rapidjson::SizeType r = 0; r < coords.Size(); ++r) {
          f.rings.push_back(parse_ring(coords[r]));
        }
      } else if (gtype == "MultiPolygon") {
        // coordinates: [ [ [ring], [hole] ], [ [ring] ], ... ]
        // All polygons' rings are flattened into f.rings.
        for (rapidjson::SizeType p = 0; p < coords.Size(); ++p) {
          if (coords[p].IsArray()) {
            for (rapidjson::SizeType r = 0; r < coords[p].Size(); ++r) {
              f.rings.push_back(parse_ring(coords[p][r]));
            }
          }
        }
      }
      // Other geometry types (Point, LineString, ...) are ignored.
    }
  }

  // Compute bbox and area from first ring (outer boundary)
  if (!f.rings.empty() && !f.rings[0].empty()) {
    f.bbox = geo::bbox(f.rings[0]);
    f.areaSqKm = geo::area_sq_km(f.rings[0]);
  }

  return f;
}
|
||||
|
||||
// ── Public API ──────────────────────────────────────────────────────────────
|
||||
|
||||
/// Parse a GeoJSON FeatureCollection file into a BoundaryResult.
/// On any failure (unreadable file, invalid JSON, missing "features" array)
/// `result.error` is set and `result.features` stays empty.
BoundaryResult load_boundary_file(const std::string& filepath) {
  BoundaryResult result;

  const std::string json = read_file(filepath);
  if (json.empty()) {
    result.error = "Failed to read file: " + filepath;
    return result;
  }

  rapidjson::Document doc;
  doc.Parse(json.c_str());
  if (doc.HasParseError()) {
    result.error = "JSON parse error in: " + filepath;
    return result;
  }

  // The file must be a FeatureCollection with a "features" array.
  if (!doc.HasMember("features") || !doc["features"].IsArray()) {
    result.error = "Missing 'features' array in: " + filepath;
    return result;
  }

  const auto& feats = doc["features"];
  result.features.reserve(feats.Size());
  for (rapidjson::SizeType idx = 0; idx < feats.Size(); ++idx) {
    result.features.push_back(parse_feature(feats[idx]));
  }
  return result;
}
|
||||
|
||||
/// Load a pre-cached boundary for `gid` at `targetLevel`, trying the nested
/// per-country layout first and then the flat layout.
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
                             const std::string& cacheDir) {
  const std::string cc = country_code(gid);
  const std::string filename =
      "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";

  // Primary: nested layout cacheDir/{countryCode}/boundary_{gid}_{level}.json
  auto result = load_boundary_file(cacheDir + "/" + cc + "/" + filename);
  if (result.error.empty()) return result;

  // Fallback: flat layout cacheDir/boundary_{gid}_{level}.json
  result = load_boundary_file(cacheDir + "/" + filename);
  if (result.error.empty()) return result;

  // Both layouts failed — report one combined error.
  result.error = "No boundary file found for gid=" + gid + " level=" + std::to_string(targetLevel) + " in " + cacheDir;
  return result;
}
|
||||
|
||||
} // namespace gadm
|
||||
@ -1,5 +0,0 @@
|
||||
# Pure-math geographic primitives (distance, bbox, centroid, grids).
add_library(geo STATIC src/geo.cpp)

target_include_directories(geo PUBLIC include)

# No external dependencies — pure math
|
||||
@ -1,100 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
namespace geo {
|
||||
|
||||
// ── Constants ───────────────────────────────────────────────────────────────
constexpr double EARTH_RADIUS_KM = 6371.0;  // mean Earth radius, km
constexpr double PI = 3.14159265358979323846;
constexpr double DEG2RAD = PI / 180.0;  // degrees -> radians
constexpr double RAD2DEG = 180.0 / PI;  // radians -> degrees
|
||||
|
||||
// ── Core types ──────────────────────────────────────────────────────────────
|
||||
|
||||
// WGS84 position in degrees. Field order is longitude-first (GeoJSON
// convention); aggregate initialization is {lon, lat}.
struct Coord {
  double lon = 0;  // degrees, east positive
  double lat = 0;  // degrees, north positive
};
|
||||
|
||||
// Axis-aligned lon/lat bounding box, in degrees.
struct BBox {
  double minLon = 0;
  double minLat = 0;
  double maxLon = 0;
  double maxLat = 0;

  // Arithmetic midpoint of the box (not antimeridian-aware).
  Coord center() const {
    return {(minLon + maxLon) / 2.0, (minLat + maxLat) / 2.0};
  }

  // Extents in degrees (not kilometers).
  double width_deg() const { return maxLon - minLon; }
  double height_deg() const { return maxLat - minLat; }
};
|
||||
|
||||
// ── Distance ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Haversine distance between two WGS84 points, in kilometers.
double distance_km(Coord a, Coord b);

/// Haversine distance in meters (thin wrapper over distance_km).
inline double distance_m(Coord a, Coord b) { return distance_km(a, b) * 1000.0; }
|
||||
|
||||
// ── Bounding box ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Compute the bounding box of a polygon ring.
|
||||
BBox bbox(const std::vector<Coord>& ring);
|
||||
|
||||
/// Compute the bounding box that covers all features' rings.
|
||||
BBox bbox_union(const std::vector<BBox>& boxes);
|
||||
|
||||
// ── Centroid ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Geometric centroid of a polygon ring (simple average method).
|
||||
Coord centroid(const std::vector<Coord>& ring);
|
||||
|
||||
// ── Area ────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Approximate area of a polygon ring in square meters.
/// Uses the Shoelace formula with latitude cosine correction.
double area_sq_m(const std::vector<Coord>& ring);

/// Area in square kilometers (1 km² = 1e6 m²).
inline double area_sq_km(const std::vector<Coord>& ring) {
  return area_sq_m(ring) / 1e6;
}
|
||||
|
||||
// ── Point-in-polygon ────────────────────────────────────────────────────────
|
||||
|
||||
/// Ray-casting point-in-polygon test.
|
||||
/// Same algorithm as gadm/cpp pip.h but using Coord structs.
|
||||
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);
|
||||
|
||||
// ── Bearing & destination ───────────────────────────────────────────────────
|
||||
|
||||
/// Initial bearing from a to b, in degrees (0 = north, 90 = east).
|
||||
double bearing_deg(Coord from, Coord to);
|
||||
|
||||
/// Compute the destination point given start, bearing (degrees), and distance (km).
|
||||
Coord destination(Coord from, double bearing_deg, double distance_km);
|
||||
|
||||
// ── Grid tessellation ───────────────────────────────────────────────────────
|
||||
|
||||
/// Generate a flat square grid of cell centers over a bbox.
|
||||
/// cellSizeKm defines the side length of each square cell.
|
||||
/// Returns center coordinates of each cell.
|
||||
std::vector<Coord> square_grid(BBox extent, double cellSizeKm);
|
||||
|
||||
/// Generate a flat hex grid of cell centers over a bbox.
|
||||
/// cellSizeKm defines the distance between hex centers.
|
||||
/// Returns center coordinates of each cell.
|
||||
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm);
|
||||
|
||||
// ── Viewport estimation (matches TS estimateViewportAreaSqKm) ──────────────
|
||||
|
||||
/// Estimate the km² visible in a viewport at a given lat/zoom.
|
||||
double estimate_viewport_sq_km(double lat, int zoom,
|
||||
int widthPx = 1024, int heightPx = 768);
|
||||
|
||||
} // namespace geo
|
||||
@ -1,204 +0,0 @@
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
|
||||
namespace geo {
|
||||
|
||||
// ── Distance (Haversine) ────────────────────────────────────────────────────
|
||||
|
||||
/// Great-circle distance between two WGS84 points (Haversine), kilometers.
double distance_km(Coord a, Coord b) {
  const double phi1 = a.lat * DEG2RAD;
  const double phi2 = b.lat * DEG2RAD;
  const double dPhi = (b.lat - a.lat) * DEG2RAD;
  const double dLambda = (b.lon - a.lon) * DEG2RAD;

  const double sp = std::sin(dPhi / 2.0);
  const double sl = std::sin(dLambda / 2.0);
  const double h = sp * sp + std::cos(phi1) * std::cos(phi2) * sl * sl;
  return 2.0 * EARTH_RADIUS_KM * std::asin(std::sqrt(h));
}
|
||||
|
||||
// ── Bounding box ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Bounding box of a polygon ring; an empty ring yields a zero-initialized box.
BBox bbox(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};

  // Seed with the first vertex, then widen over every vertex (revisiting the
  // first is a harmless no-op).
  BBox box{ring.front().lon, ring.front().lat, ring.front().lon, ring.front().lat};
  for (const Coord& c : ring) {
    box.minLon = std::min(box.minLon, c.lon);
    box.minLat = std::min(box.minLat, c.lat);
    box.maxLon = std::max(box.maxLon, c.lon);
    box.maxLat = std::max(box.maxLat, c.lat);
  }
  return box;
}
|
||||
|
||||
/// Smallest box covering every input box; empty input yields a zeroed box.
BBox bbox_union(const std::vector<BBox>& boxes) {
  if (boxes.empty()) return {};

  BBox out = boxes.front();
  for (const BBox& b : boxes) {
    out.minLon = std::min(out.minLon, b.minLon);
    out.minLat = std::min(out.minLat, b.minLat);
    out.maxLon = std::max(out.maxLon, b.maxLon);
    out.maxLat = std::max(out.maxLat, b.maxLat);
  }
  return out;
}
|
||||
|
||||
// ── Centroid ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Vertex-average centroid of a ring. When the ring is explicitly closed
/// (last vertex equals the first) the duplicate is excluded so it isn't
/// double-weighted. Empty ring yields {0, 0}.
Coord centroid(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};

  std::size_t count = ring.size();
  const bool closed = count > 1 &&
                      ring.front().lon == ring.back().lon &&
                      ring.front().lat == ring.back().lat;
  if (closed) --count;

  double lonSum = 0, latSum = 0;
  for (std::size_t k = 0; k < count; ++k) {
    lonSum += ring[k].lon;
    latSum += ring[k].lat;
  }
  return {lonSum / static_cast<double>(count), latSum / static_cast<double>(count)};
}
|
||||
|
||||
// ── Area (Shoelace + latitude cosine correction) ────────────────────────────
|
||||
|
||||
double area_sq_m(const std::vector<Coord>& ring) {
|
||||
if (ring.size() < 3) return 0.0;
|
||||
|
||||
// Shoelace formula in projected coordinates.
|
||||
// Each degree of longitude = cos(lat) * 111320 meters at that latitude.
|
||||
// Each degree of latitude = 110540 meters (approximate).
|
||||
double sum = 0.0;
|
||||
size_t n = ring.size();
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
size_t j = (i + 1) % n;
|
||||
// Convert coordinates to approximate meters using the average latitude
|
||||
double avgLat = (ring[i].lat + ring[j].lat) / 2.0;
|
||||
double cosLat = std::cos(avgLat * DEG2RAD);
|
||||
|
||||
double x_i = ring[i].lon * cosLat * 111320.0;
|
||||
double y_i = ring[i].lat * 110540.0;
|
||||
double x_j = ring[j].lon * cosLat * 111320.0;
|
||||
double y_j = ring[j].lat * 110540.0;
|
||||
|
||||
sum += x_i * y_j - x_j * y_i;
|
||||
}
|
||||
return std::abs(sum) / 2.0;
|
||||
}
|
||||
|
||||
// ── Point-in-polygon (ray casting) ──────────────────────────────────────────
|
||||
|
||||
/// Ray-casting (crossing-number) point-in-polygon test: `inside` toggles each
/// time a horizontal ray from `pt` toward +lon crosses an edge; an odd number
/// of crossings means the point is inside. Works for open or closed rings
/// (the i/j wrap supplies the closing edge). Empty ring returns false.
/// Points exactly on an edge may resolve to either side.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring) {
  bool inside = false;
  size_t n = ring.size();
  // Edge (j -> i), with j trailing one vertex behind i.
  for (size_t i = 0, j = n - 1; i < n; j = i++) {
    double xi = ring[i].lon, yi = ring[i].lat;
    double xj = ring[j].lon, yj = ring[j].lat;

    // First clause: the edge straddles pt's latitude (also guarantees
    // yj != yi, so the division below is safe). Second clause: the edge's
    // lon at pt's latitude lies to the right of pt.
    if (((yi > pt.lat) != (yj > pt.lat)) &&
        (pt.lon < (xj - xi) * (pt.lat - yi) / (yj - yi) + xi)) {
      inside = !inside;
    }
  }
  return inside;
}
|
||||
|
||||
// ── Bearing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Initial great-circle bearing from `from` to `to`, degrees in [0, 360)
/// (0 = north, 90 = east).
double bearing_deg(Coord from, Coord to) {
  const double dLambda = (to.lon - from.lon) * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double phi2 = to.lat * DEG2RAD;

  const double y = std::sin(dLambda) * std::cos(phi2);
  const double x = std::cos(phi1) * std::sin(phi2) -
                   std::sin(phi1) * std::cos(phi2) * std::cos(dLambda);
  // atan2 yields (-180, 180]; shift into [0, 360).
  return std::fmod(std::atan2(y, x) * RAD2DEG + 360.0, 360.0);
}
|
||||
|
||||
// ── Destination point ───────────────────────────────────────────────────────
|
||||
|
||||
/// Destination point reached from `from` along initial bearing `brng_deg`
/// (degrees) after traveling `dist_km` kilometers on a great circle.
///
/// Fix: the resulting longitude is now normalized into [-180, 180). The
/// previous version returned lon1 + atan2(...) directly, so a path crossing
/// the antimeridian could yield longitudes outside the valid range.
Coord destination(Coord from, double brng_deg, double dist_km) {
  const double theta = brng_deg * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double lambda1 = from.lon * DEG2RAD;
  const double delta = dist_km / EARTH_RADIUS_KM;  // angular distance, radians

  const double phi2 = std::asin(std::sin(phi1) * std::cos(delta) +
                                std::cos(phi1) * std::sin(delta) * std::cos(theta));
  const double lambda2 = lambda1 + std::atan2(
      std::sin(theta) * std::sin(delta) * std::cos(phi1),
      std::cos(delta) - std::sin(phi1) * std::sin(phi2));

  // Normalize longitude to [-180, 180). lambda2 is within ±2π, so the
  // +540 shift keeps the fmod argument non-negative.
  const double lon = std::fmod(lambda2 * RAD2DEG + 540.0, 360.0) - 180.0;
  return {lon, phi2 * RAD2DEG};
}
|
||||
|
||||
// ── Square grid ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Centers of a flat square grid covering `extent`; cellSizeKm is the side
/// length of each cell. Non-positive cell sizes yield an empty result.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;

  // Degree steps sized at the extent's mid latitude.
  const double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;  // avoid division by zero near poles

  const double stepLat = cellSizeKm / 110.574;             // ~110.574 km per degree lat
  const double stepLon = cellSizeKm / (111.320 * cosLat);  // longitude correction

  for (double lat = extent.minLat + stepLat / 2.0; lat < extent.maxLat;
       lat += stepLat) {
    for (double lon = extent.minLon + stepLon / 2.0; lon < extent.maxLon;
         lon += stepLon) {
      centers.push_back({lon, lat});
    }
  }
  return centers;
}
|
||||
|
||||
// ── Hex grid ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Centers of a flat hex grid covering `extent`; cellSizeKm is the distance
/// between horizontally adjacent centers. Rows are cellSize*sqrt(3)/2 apart
/// and every other row is offset by half a cell.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;

  const double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;  // guard near the poles

  const double stepLat = cellSizeKm / 110.574;
  const double stepLon = cellSizeKm / (111.320 * cosLat);
  const double rowStep = stepLat * std::sqrt(3.0) / 2.0;

  int row = 0;
  for (double lat = extent.minLat + rowStep / 2.0; lat < extent.maxLat;
       lat += rowStep, ++row) {
    // Odd rows shift east by half a cell to form the hex packing.
    const double shift = (row % 2 == 1) ? stepLon / 2.0 : 0.0;
    for (double lon = extent.minLon + stepLon / 2.0 + shift;
         lon < extent.maxLon; lon += stepLon) {
      centers.push_back({lon, lat});
    }
  }
  return centers;
}
|
||||
|
||||
// ── Viewport estimation ─────────────────────────────────────────────────────
|
||||
|
||||
/// Rough km² visible in a web-mercator viewport centered at `lat` at the
/// given integer `zoom`, for a widthPx x heightPx screen.
double estimate_viewport_sq_km(double lat, int zoom, int widthPx, int heightPx) {
  // 156543.03392 m/px at zoom 0 on the equator (256px tiles); halves per
  // zoom level and shrinks with cos(lat) under the mercator projection.
  constexpr double deg2rad = 3.14159265358979323846 / 180.0;
  const double metersPerPx =
      (156543.03392 * std::cos(lat * deg2rad)) / std::pow(2.0, zoom);
  const double widthKm = widthPx * metersPerPx / 1000.0;
  const double heightKm = heightPx * metersPerPx / 1000.0;
  return widthKm * heightKm;
}
|
||||
|
||||
} // namespace geo
|
||||
@ -1,6 +0,0 @@
|
||||
# Grid/waypoint generation over GADM boundary features.
add_library(grid STATIC src/grid.cpp)

target_include_directories(grid PUBLIC include)

# Depends on geo (math) and gadm_reader (Feature type)
target_link_libraries(grid PUBLIC geo gadm_reader)
|
||||
@ -1,56 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "geo/geo.h"
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace grid {
|
||||
|
||||
// ── Types (mirror TS GridSearchHop) ─────────────────────────────────────────
|
||||
|
||||
// One search hop: a circle (center + covering radius) tagged with the
// admin area it belongs to. Mirrors the TS GridSearchHop shape.
struct Waypoint {
  int step = 0;           // ordinal position in the generated path (set by the generator)
  double lng = 0;         // center longitude, degrees
  double lat = 0;         // center latitude, degrees
  double radius_km = 0;   // search radius covering the cell
  std::string area_gid;   // GADM id of the containing feature
  std::string area_name;  // display name of that feature
};
|
||||
|
||||
// Tuning knobs for grid generation (mirrors the TS options object).
struct GridOptions {
  std::string gridMode = "hex"; // "hex", "square", "admin", "centers"
  double cellSize = 5.0;        // km (cell side / hex center spacing)
  double cellOverlap = 0.0;
  double centroidOverlap = 0.5;
  int maxCellsLimit = 15000;    // hard cap on generated cells
  double maxElevation = 0;      // elevation/density filters — consumers not
  double minDensity = 0;        //   visible in this file; TODO confirm usage
  double minGhsPop = 0;         // GHS population threshold (0 = check disabled)
  double minGhsBuilt = 0;       // GHS built-up threshold (0 = check disabled)
  std::string ghsFilterMode = "AND"; // "AND" | "OR" — how the two GHS checks combine
  bool allowMissingGhs = false; // treat missing (zero) GHS values as passing
  bool bypassFilters = false;   // skip all cell filters entirely
  std::string pathOrder = "snake"; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
  bool groupByRegion = true;
};
|
||||
|
||||
// Output of generate(): the ordered waypoint path plus filter bookkeeping.
struct GridResult {
  std::vector<Waypoint> waypoints;
  int validCells = 0;    // cells kept (passed filters)
  int skippedCells = 0;  // cells rejected by a filter
  std::string error;     // empty on success
};
|
||||
|
||||
// ── API ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Generate grid waypoints from GADM features + options.
|
||||
/// This is the main entry point — equivalent to generateGridSearchCells() in TS.
|
||||
GridResult generate(
|
||||
const std::vector<gadm::Feature>& features,
|
||||
const GridOptions& opts
|
||||
);
|
||||
|
||||
} // namespace grid
|
||||
@ -1,393 +0,0 @@
|
||||
#include "grid/grid.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
|
||||
namespace grid {
|
||||
|
||||
// ── Internal types ──────────────────────────────────────────────────────────
|
||||
|
||||
// Per-cell scratch record used while generating and filtering cells.
struct CellInfo {
  geo::Coord center;     // cell center
  double radius_km;      // covering radius for the cell
  int region_idx;        // index into the source feature list
  bool allowed;          // result of check_filters()
  std::string reason;    // rejection reason when !allowed
};
|
||||
|
||||
// ── Filter logic (mirrors checkCellFilters in TS) ───────────────────────────
|
||||
|
||||
/// Decide whether a feature's cell passes the configured filters.
/// Returns true when allowed; otherwise fills `reason` and returns false.
/// Mirrors checkCellFilters in the TS implementation.
/// Note: `areaSqKm` is currently unused (no area-based filter implemented).
static bool check_filters(const gadm::Feature& feat, const GridOptions& opts,
                          double areaSqKm, std::string& reason) {
  // Escape hatch: accept everything.
  if (opts.bypassFilters) return true;

  // GHS filter — a threshold of 0 disables that individual check.
  bool checkPop = opts.minGhsPop > 0;
  bool checkBuilt = opts.minGhsBuilt > 0;

  if (checkPop || checkBuilt) {
    double ghsPop = feat.ghsPopulation;
    double ghsBuilt = feat.ghsBuiltWeight;
    // A zero GHS value means "no data"; allowMissingGhs lets those pass.
    // Each *Pass flag is only meaningful when its check is enabled.
    bool popPass = checkPop && ((ghsPop == 0 && opts.allowMissingGhs) || ghsPop >= opts.minGhsPop);
    bool builtPass = checkBuilt && ((ghsBuilt == 0 && opts.allowMissingGhs) || ghsBuilt >= opts.minGhsBuilt);

    if (opts.ghsFilterMode == "OR") {
      // OR: reject only when every *enabled* check fails.
      if (checkPop && checkBuilt && !popPass && !builtPass) {
        reason = "GHS (OR) below thresholds";
        return false;
      } else if (checkPop && !checkBuilt && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      } else if (checkBuilt && !checkPop && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    } else {
      // AND (default): every enabled check must pass.
      if (checkPop && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      }
      if (checkBuilt && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    }
  }

  return true;
}
|
||||
|
||||
// ── Sorting ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Reorder `wps` in place according to `pathOrder`:
//   "zigzag"/"snake"          row-major sweep, north→south then west→east;
//                             "snake" additionally reverses every other row.
//   "spiral-out"/"spiral-in"  ordered by distance from the points' mean center.
//   "shortest"                greedy nearest-neighbor tour starting at wps[0].
// Any other value leaves the order untouched.
// `cellSize` is in km; 111.32 km/degree converts it into a latitude tolerance
// used to decide whether two waypoints belong to the same "row" (capped at 0.5°).
static void sort_waypoints(std::vector<Waypoint>& wps, const std::string& pathOrder,
                           double cellSize) {
  if (wps.size() <= 1) return;

  // Half a cell height, expressed in degrees of latitude.
  double rowTolerance = std::min((cellSize / 111.32) * 0.5, 0.5);

  if (pathOrder == "zigzag" || pathOrder == "snake") {
    // Sort top-to-bottom, left-to-right within row tolerance
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      if (std::abs(a.lat - b.lat) > rowTolerance) {
        return b.lat < a.lat; // higher lat first (north to south)
      }
      return a.lng < b.lng; // left to right
    });

    if (pathOrder == "snake") {
      // Group into rows, reverse every other row
      std::vector<std::vector<Waypoint>> rows;
      std::vector<Waypoint> currentRow;
      double lastY = wps[0].lat;

      for (auto& wp : wps) {
        // A latitude jump beyond the tolerance starts a new row.
        if (std::abs(wp.lat - lastY) > rowTolerance) {
          rows.push_back(std::move(currentRow));
          currentRow.clear();
          lastY = wp.lat;
        }
        currentRow.push_back(wp);
      }
      if (!currentRow.empty()) rows.push_back(std::move(currentRow));

      // Rebuild the list, reversing odd rows so travel snakes back and forth.
      wps.clear();
      for (size_t i = 0; i < rows.size(); ++i) {
        if (i % 2 == 1) std::reverse(rows[i].begin(), rows[i].end());
        for (auto& wp : rows[i]) wps.push_back(std::move(wp));
      }
    }

  } else if (pathOrder == "spiral-out" || pathOrder == "spiral-in") {
    // Sort by distance from center of all waypoints
    double cLon = 0, cLat = 0;
    for (const auto& wp : wps) { cLon += wp.lng; cLat += wp.lat; }
    cLon /= wps.size();
    cLat /= wps.size();
    geo::Coord center{cLon, cLat};

    // spiral-out: nearest first; spiral-in: farthest first.
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      double dA = geo::distance_km(center, {a.lng, a.lat});
      double dB = geo::distance_km(center, {b.lng, b.lat});
      return (pathOrder == "spiral-out") ? (dA < dB) : (dA > dB);
    });

  } else if (pathOrder == "shortest") {
    // Greedy nearest-neighbor
    std::vector<Waypoint> sorted;
    sorted.reserve(wps.size());
    std::vector<bool> used(wps.size(), false);

    sorted.push_back(wps[0]);
    used[0] = true;

    for (size_t step = 1; step < wps.size(); ++step) {
      const auto& cur = sorted.back();
      double bestDist = 1e18;
      size_t bestIdx = 0;

      // Pick the closest unused waypoint. Squared planar lat/lng distance is
      // compared — monotone with distance, so no sqrt is needed.
      for (size_t i = 0; i < wps.size(); ++i) {
        if (used[i]) continue;
        double dx = wps[i].lng - cur.lng;
        double dy = wps[i].lat - cur.lat;
        double distSq = dx * dx + dy * dy;
        if (distSq < bestDist) {
          bestDist = distSq;
          bestIdx = i;
        }
      }

      sorted.push_back(wps[bestIdx]);
      used[bestIdx] = true;
    }

    wps = std::move(sorted);
  }
}
|
||||
|
||||
// ── Admin mode ──────────────────────────────────────────────────────────────
|
||||
|
||||
// Admin mode: emit one waypoint per administrative feature, placed at the
// centroid of the feature's outer ring with a radius spanning from the
// centroid to the bbox max corner. Features failing the filters are counted
// as skipped; features without an outer ring are ignored entirely.
static GridResult generate_admin(const std::vector<gadm::Feature>& features,
                                 const GridOptions& opts) {
  GridResult res;

  for (const auto& feat : features) {
    // A feature without an outer ring cannot be placed.
    if (feat.rings.empty() || feat.rings[0].empty()) continue;

    std::string reason;
    const bool passes = check_filters(feat, opts, feat.areaSqKm, reason);

    const geo::Coord center = geo::centroid(feat.rings[0]);
    // Radius = distance from centroid to bbox corner.
    const double radiusKm =
        geo::distance_km(center, {feat.bbox.maxLon, feat.bbox.maxLat});

    if (!passes) {
      res.skippedCells++;
      continue;
    }

    res.waypoints.push_back({
        static_cast<int>(res.waypoints.size() + 1),
        std::round(center.lon * 1e6) / 1e6,   // 6-decimal coordinate precision
        std::round(center.lat * 1e6) / 1e6,
        std::round(radiusKm * 100.0) / 100.0, // 2-decimal km precision
        feat.gid,
        feat.name
    });
    res.validCells++;
  }

  return res;
}
|
||||
|
||||
// ── Centers mode ────────────────────────────────────────────────────────────
|
||||
|
||||
// Centers mode: emit waypoints at GHS population / built-up centers of each
// feature, deduplicated to 5 decimal places and thinned so no two accepted
// centers are closer than cellSize*(1-centroidOverlap) km. Each waypoint's
// radius is half a cell. Rejected candidates increment skippedCells.
static GridResult generate_centers(const std::vector<gadm::Feature>& features,
                                   const GridOptions& opts) {
  GridResult res;

  struct AcceptedCenter {
    geo::Coord coord;
  };
  // Accepted centers across ALL features — overlap is checked globally.
  std::vector<AcceptedCenter> accepted;

  double minAllowedDist = opts.cellSize * (1.0 - opts.centroidOverlap);

  for (size_t i = 0; i < features.size(); ++i) {
    const auto& f = features[i];

    // Collect unique centers by rounding to 5 decimal places
    std::map<std::string, std::array<double, 3>> centersMap; // key → [lon, lat, weight]

    // First writer wins: a center already present under the same rounded key
    // is not overwritten.
    auto addCenter = [&](double lon, double lat, double weight) {
      char key[32];
      snprintf(key, sizeof(key), "%.5f,%.5f", lon, lat);
      std::string k(key);
      if (centersMap.find(k) == centersMap.end()) {
        centersMap[k] = {lon, lat, weight};
      }
    };

    // Single pop/built centers ((0,0) doubles as "unset" and is skipped)
    if (f.ghsPopCenter.lon != 0 || f.ghsPopCenter.lat != 0) {
      addCenter(f.ghsPopCenter.lon, f.ghsPopCenter.lat, f.ghsPopulation);
    }
    if (f.ghsBuiltCenter.lon != 0 || f.ghsBuiltCenter.lat != 0) {
      addCenter(f.ghsBuiltCenter.lon, f.ghsBuiltCenter.lat, f.ghsBuiltWeight);
    }

    // Weighted center arrays ([lon, lat, weight] triplets)
    for (const auto& c : f.ghsPopCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& c : f.ghsBuiltCenters) {
      addCenter(c[0], c[1], c[2]);
    }

    // NOTE: std::map iterates in key (string) order, which determines which
    // of two mutually-overlapping centers is accepted first.
    for (const auto& [key, val] : centersMap) {
      geo::Coord pt{val[0], val[1]};

      std::string reason;
      // For centers, use the feature's overall filters
      bool allowed = check_filters(f, opts, f.areaSqKm, reason);

      // Check overlap with already-accepted centers
      if (allowed && !accepted.empty()) {
        for (const auto& ac : accepted) {
          double dist = geo::distance_km(pt, ac.coord);
          if (dist < minAllowedDist) {
            allowed = false;
            reason = "overlaps another centroid";
            break;
          }
        }
      }

      if (allowed) {
        accepted.push_back({pt});
        res.waypoints.push_back({
          static_cast<int>(res.waypoints.size() + 1),
          std::round(pt.lon * 1e6) / 1e6,
          std::round(pt.lat * 1e6) / 1e6,
          std::round((opts.cellSize / 2.0) * 100.0) / 100.0,
          f.gid,
          f.name
        });
        res.validCells++;
      } else {
        res.skippedCells++;
      }
    }
  }

  return res;
}
|
||||
|
||||
// ── Polygon grid mode (hex / square) ────────────────────────────────────────
|
||||
|
||||
// Polygon grid mode: tile the union bounding box of all features with a
// square or hex lattice of cell centers, then keep each center that falls
// inside a feature's outer ring AND passes that feature's filters.
// Returns res.error (and no waypoints) if the estimated cell count exceeds
// opts.maxCellsLimit.
static GridResult generate_polygon_grid(const std::vector<gadm::Feature>& features,
                                        const GridOptions& opts) {
  GridResult res;

  // Compute union bbox of all features (ring-less features contribute nothing).
  std::vector<geo::BBox> boxes;
  for (const auto& f : features) {
    if (!f.rings.empty()) boxes.push_back(f.bbox);
  }
  if (boxes.empty()) return res;

  geo::BBox extent = geo::bbox_union(boxes);

  // Estimate cell count up-front to prevent runaway grids.
  double widthKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.maxLon, extent.minLat});
  double heightKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.minLon, extent.maxLat});
  // 2.6 scales cellSize² into an approximate per-cell footprint — presumably
  // tuned for the hex lattice; TODO confirm against geo::hex_grid spacing.
  double approxCellArea = opts.cellSize * opts.cellSize * 2.6;
  int approxCells = static_cast<int>(std::ceil((widthKm * heightKm) / approxCellArea));

  if (approxCells > opts.maxCellsLimit) {
    res.error = "Grid too massive (~" + std::to_string(approxCells) + " cells). Increase cell size or select smaller region.";
    return res;
  }

  // Generate grid centers ("square" mode explicit, hex is the fallback).
  std::vector<geo::Coord> gridCenters;
  if (opts.gridMode == "square") {
    gridCenters = geo::square_grid(extent, opts.cellSize);
  } else {
    gridCenters = geo::hex_grid(extent, opts.cellSize);
  }

  // Cell radius (half the diagonal of a square cell). Loop-invariant, so
  // compute it once rather than per grid center (was recomputed inside the
  // loop below).
  const double cellRadiusKm = opts.cellSize * std::sqrt(2.0) / 2.0;

  // For each grid center, check if it intersects any feature polygon.
  for (const auto& gc : gridCenters) {
    bool intersects = false;
    int regionIdx = -1;

    // First containing feature wins.
    for (size_t i = 0; i < features.size(); ++i) {
      if (features[i].rings.empty()) continue;
      if (geo::point_in_polygon(gc, features[i].rings[0])) {
        intersects = true;
        regionIdx = static_cast<int>(i);
        break;
      }
    }

    if (!intersects) continue;

    const auto& regionFeat = features[regionIdx];
    std::string reason;
    bool allowed = check_filters(regionFeat, opts, regionFeat.areaSqKm, reason);

    if (allowed) {
      res.waypoints.push_back({
          static_cast<int>(res.waypoints.size() + 1),
          std::round(gc.lon * 1e6) / 1e6,
          std::round(gc.lat * 1e6) / 1e6,
          std::round(cellRadiusKm * 100.0) / 100.0,
          regionFeat.gid,
          regionFeat.name
      });
      res.validCells++;
    } else {
      res.skippedCells++;
    }
  }

  return res;
}
|
||||
|
||||
// ── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
// Main entry point: dispatch to the requested grid mode, order the resulting
// waypoints along the requested path, then renumber their 1-based steps.
// When groupByRegion is set (and there are multiple features), waypoints are
// first clustered by area_gid and each cluster is path-ordered independently.
GridResult generate(const std::vector<gadm::Feature>& features,
                    const GridOptions& opts) {
  GridResult result;

  if (features.empty()) {
    result.error = "No features provided";
    return result;
  }

  // Mode dispatch — anything other than "admin"/"centers" is a polygon grid.
  if (opts.gridMode == "admin") {
    result = generate_admin(features, opts);
  } else if (opts.gridMode == "centers") {
    result = generate_centers(features, opts);
  } else {
    result = generate_polygon_grid(features, opts);
  }
  if (!result.error.empty()) return result;

  auto& wps = result.waypoints;
  if (wps.size() > 1) {
    if (opts.groupByRegion && features.size() > 1) {
      // Make waypoints of the same region adjacent (stable to preserve the
      // generator's order within a region), then path-order each run.
      std::stable_sort(wps.begin(), wps.end(),
                       [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; });

      size_t lo = 0;
      while (lo < wps.size()) {
        size_t hi = lo + 1;
        while (hi < wps.size() && wps[hi].area_gid == wps[lo].area_gid) {
          ++hi;
        }

        // Sort the [lo, hi) run in isolation and write it back in place.
        std::vector<Waypoint> group(wps.begin() + lo, wps.begin() + hi);
        sort_waypoints(group, opts.pathOrder, opts.cellSize);
        std::copy(group.begin(), group.end(), wps.begin() + lo);

        lo = hi;
      }
    } else {
      sort_waypoints(wps, opts.pathOrder, opts.cellSize);
    }
  }

  // Steps are 1-based and must reflect the final traversal order.
  for (size_t i = 0; i < wps.size(); ++i) {
    wps[i].step = static_cast<int>(i + 1);
  }

  return result;
}
|
||||
|
||||
} // namespace grid
|
||||
@ -1,68 +1,105 @@
|
||||
#include "kbot.h"
|
||||
#include <taskflow/taskflow.hpp>
|
||||
#include <iostream>
|
||||
#include "logger/logger.h"
|
||||
#include "llm_client.h"
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
|
||||
namespace polymech {
|
||||
namespace kbot {
|
||||
|
||||
int run_kbot_ai_pipeline(const KBotOptions& opts, const KBotCallbacks& cb) {
|
||||
logger::debug("Starting kbot ai pipeline");
|
||||
if (opts.dry_run) {
|
||||
logger::info("Dry run triggered for kbot ai");
|
||||
}
|
||||
namespace {
|
||||
|
||||
// Scaffolding multithreaded AI tasks
|
||||
tf::Executor executor(4);
|
||||
tf::Taskflow taskflow;
|
||||
|
||||
taskflow.emplace([opts, cb](){
|
||||
logger::debug("Executing kbot ai completion via LLMClient...");
|
||||
LLMClient client(opts);
|
||||
|
||||
std::string target_prompt = opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'" : opts.prompt;
|
||||
LLMResponse res = client.execute_chat(target_prompt);
|
||||
|
||||
if (res.success) {
|
||||
std::cout << res.text << "\n";
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("ai_progress", "{\"message\":\"Task completion received\"}");
|
||||
}
|
||||
} else {
|
||||
logger::error("AI Task Failed: " + res.error);
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("ai_error", "{\"error\":\"Task failed\"}");
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
executor.run(taskflow).wait();
|
||||
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"ai\"}");
|
||||
}
|
||||
return 0;
|
||||
std::string json_job_result_ai(bool success, const std::string &text_or_error, bool is_text) {
|
||||
rapidjson::StringBuffer buf;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(buf);
|
||||
w.StartObject();
|
||||
w.Key("status");
|
||||
w.String(success ? "success" : "error");
|
||||
w.Key("mode");
|
||||
w.String("ai");
|
||||
if (success && is_text) {
|
||||
w.Key("text");
|
||||
w.String(text_or_error.c_str(),
|
||||
static_cast<rapidjson::SizeType>(text_or_error.size()));
|
||||
} else if (!success) {
|
||||
w.Key("error");
|
||||
w.String(text_or_error.c_str(),
|
||||
static_cast<rapidjson::SizeType>(text_or_error.size()));
|
||||
}
|
||||
w.EndObject();
|
||||
return std::string(buf.GetString(), buf.GetSize());
|
||||
}
|
||||
|
||||
int run_kbot_run_pipeline(const KBotRunOptions& opts, const KBotCallbacks& cb) {
|
||||
logger::info("Starting kbot run pipeline (stub) for config: " + opts.config);
|
||||
if (opts.dry) {
|
||||
logger::info("Dry run triggered for kbot run");
|
||||
}
|
||||
if (opts.list) {
|
||||
logger::info("List configs mode enabled");
|
||||
}
|
||||
|
||||
// Stub std::system call execution (simulating child_process.execFileSync from TypeScript)
|
||||
if (!opts.dry && !opts.list) {
|
||||
logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config);
|
||||
}
|
||||
} // namespace
|
||||
|
||||
int run_kbot_ai_pipeline(const KBotOptions &opts, const KBotCallbacks &cb) {
|
||||
logger::debug("Starting kbot ai pipeline");
|
||||
|
||||
if (opts.dry_run) {
|
||||
logger::info("Dry run triggered for kbot ai");
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}");
|
||||
cb.onEvent("job_result", json_job_result_ai(true, "[dry-run] no LLM call", true));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
LLMClient client(opts);
|
||||
const std::string target_prompt =
|
||||
opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'"
|
||||
: opts.prompt;
|
||||
|
||||
logger::debug("Executing kbot ai completion via LLMClient...");
|
||||
LLMResponse res = client.execute_chat(target_prompt);
|
||||
|
||||
if (res.success) {
|
||||
std::cout << res.text << "\n";
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("ai_progress",
|
||||
"{\"message\":\"Task completion received\",\"has_text\":true}");
|
||||
}
|
||||
} else {
|
||||
logger::error("AI Task Failed: " + res.error);
|
||||
if (cb.onEvent) {
|
||||
rapidjson::StringBuffer ebuf;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> ew(ebuf);
|
||||
ew.StartObject();
|
||||
ew.Key("error");
|
||||
ew.String(res.error.c_str(),
|
||||
static_cast<rapidjson::SizeType>(res.error.size()));
|
||||
ew.EndObject();
|
||||
cb.onEvent("ai_error",
|
||||
std::string(ebuf.GetString(), ebuf.GetSize()));
|
||||
}
|
||||
}
|
||||
|
||||
if (cb.onEvent) {
|
||||
if (res.success)
|
||||
cb.onEvent("job_result", json_job_result_ai(true, res.text, true));
|
||||
else
|
||||
cb.onEvent("job_result", json_job_result_ai(false, res.error, false));
|
||||
}
|
||||
|
||||
return res.success ? 0 : 1;
|
||||
}
|
||||
|
||||
int run_kbot_run_pipeline(const KBotRunOptions &opts, const KBotCallbacks &cb) {
|
||||
logger::info("Starting kbot run pipeline (stub) for config: " + opts.config);
|
||||
if (opts.dry) {
|
||||
logger::info("Dry run triggered for kbot run");
|
||||
}
|
||||
if (opts.list) {
|
||||
logger::info("List configs mode enabled");
|
||||
}
|
||||
|
||||
if (!opts.dry && !opts.list) {
|
||||
logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config);
|
||||
}
|
||||
|
||||
if (cb.onEvent) {
|
||||
cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
} // namespace kbot
|
||||
|
||||
@ -1,7 +0,0 @@
|
||||
add_library(search STATIC src/search.cpp)
|
||||
|
||||
target_include_directories(search PUBLIC include)
|
||||
|
||||
# Depends on http (curl) and json (RapidJSON wrapper)
|
||||
target_link_libraries(search PUBLIC http json)
|
||||
target_link_libraries(search PRIVATE tomlplusplus::tomlplusplus)
|
||||
@ -1,93 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace search {
|
||||
|
||||
// ── Result types ────────────────────────────────────────────────────────────
|
||||
|
||||
struct GpsCoordinates {
|
||||
double lat = 0;
|
||||
double lng = 0;
|
||||
};
|
||||
|
||||
struct MapResult {
|
||||
std::string title;
|
||||
std::string place_id;
|
||||
std::string data_id;
|
||||
std::string address;
|
||||
std::string phone;
|
||||
std::string website;
|
||||
std::string type;
|
||||
std::vector<std::string> types;
|
||||
double rating = 0;
|
||||
int reviews = 0;
|
||||
GpsCoordinates gps;
|
||||
std::string thumbnail;
|
||||
std::string raw_json;
|
||||
std::string geo_json;
|
||||
};
|
||||
|
||||
struct SearchResult {
|
||||
std::vector<MapResult> results;
|
||||
int apiCalls = 0;
|
||||
std::string error;
|
||||
};
|
||||
|
||||
// ── Config ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// Tuning knobs populated from the [system] table of the TOML config (see
// load_config). Field names mirror their TOML keys; defaults apply when a
// key is absent.
struct SystemTuningOptions {
  int executor_threads = 0; // 0 = hardware concurrency
  int max_concurrent_jobs_per_user = 10;
  int http_concurrency_throttle = 50;
  int queue_depth_max = 10000;
  int bulk_dequeue_size = 1;
  int ipc_timeout_ms = 300000;            // milliseconds per the name (5 min)
  int max_ipc_connections = 100;
  int buffer_size_max = 50 * 1024 * 1024; // 50 MiB
};
|
||||
|
||||
struct Config {
|
||||
SystemTuningOptions system;
|
||||
std::string serpapi_key;
|
||||
std::string geocoder_key;
|
||||
std::string bigdata_key;
|
||||
std::string scrapeless_key;
|
||||
std::string postgres_url;
|
||||
std::string supabase_url;
|
||||
std::string supabase_service_key;
|
||||
// [enricher]
|
||||
std::string enricher_meta_scraper;
|
||||
int enricher_meta_concurrency = 5;
|
||||
int enricher_meta_idle_timeout = 60;
|
||||
int enricher_location_concurrency = 1;
|
||||
};
|
||||
|
||||
/// Load config from a TOML file (e.g. config/postgres.toml)
|
||||
Config load_config(const std::string &path = "config/postgres.toml");
|
||||
|
||||
// ── Search API ──────────────────────────────────────────────────────────────
|
||||
|
||||
struct SearchOptions {
|
||||
std::string query;
|
||||
double lat = 0;
|
||||
double lng = 0;
|
||||
int zoom = 13;
|
||||
int limit = 20;
|
||||
std::string engine = "google_maps";
|
||||
std::string hl = "en";
|
||||
std::string google_domain = "google.com";
|
||||
};
|
||||
|
||||
/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit.
|
||||
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts);
|
||||
|
||||
/// Resolve geo coordinate to place info
|
||||
std::string resolve_geo(double lat, double lng, const std::string &key,
|
||||
int timeout_ms = 3000);
|
||||
|
||||
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
|
||||
int concurrency = 10, int timeout_ms = 3000);
|
||||
|
||||
} // namespace search
|
||||
@ -1,311 +0,0 @@
|
||||
#include "search/search.h"
|
||||
#include "http/http.h"
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
#include <toml++/toml.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
|
||||
namespace search {
|
||||
|
||||
// ── URL encoding (minimal) ──────────────────────────────────────────────────
|
||||
|
||||
// Percent-encode `val` for use in a URL query string. Unreserved characters
// (ALPHA / DIGIT / "-" / "_" / "." / "~") pass through unchanged; every other
// byte becomes %XX with uppercase hex digits.
static std::string url_encode(const std::string &val) {
  static const char *kHex = "0123456789ABCDEF";

  std::string out;
  out.reserve(val.size() * 2);

  for (unsigned char c : val) {
    const bool unreserved =
        isalnum(c) != 0 || c == '-' || c == '_' || c == '.' || c == '~';
    if (unreserved) {
      out.push_back(static_cast<char>(c));
    } else {
      // Emit the two hex nibbles directly instead of going through snprintf.
      out.push_back('%');
      out.push_back(kHex[c >> 4]);
      out.push_back(kHex[c & 0x0F]);
    }
  }
  return out;
}
|
||||
|
||||
// ── Config loading ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Load runtime configuration from a TOML file.
/// Missing keys keep their struct defaults; a parse failure is logged to
/// stderr and the default-constructed Config is returned (never throws out).
Config load_config(const std::string &path) {
  Config cfg;
  try {
    auto tbl = toml::parse_file(path);

    // [postgres]
    if (auto v = tbl["postgres"]["url"].value<std::string>())
      cfg.postgres_url = *v;

    // [supabase]
    if (auto v = tbl["supabase"]["url"].value<std::string>())
      cfg.supabase_url = *v;
    if (auto v = tbl["supabase"]["service_key"].value<std::string>())
      cfg.supabase_service_key = *v;

    // [services] — third-party API keys
    if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>())
      cfg.serpapi_key = *v;
    if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>())
      cfg.geocoder_key = *v;
    if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>())
      cfg.bigdata_key = *v;
    if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>())
      cfg.scrapeless_key = *v;

    // [enricher]
    if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>())
      cfg.enricher_meta_scraper = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>())
      cfg.enricher_meta_concurrency = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>())
      cfg.enricher_meta_idle_timeout = *v;
    if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>())
      cfg.enricher_location_concurrency = *v;

    // [system] — see SystemTuningOptions defaults in the header
    if (auto v = tbl["system"]["executor_threads"].value<int>())
      cfg.system.executor_threads = *v;
    if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>())
      cfg.system.max_concurrent_jobs_per_user = *v;
    if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>())
      cfg.system.http_concurrency_throttle = *v;
    if (auto v = tbl["system"]["queue_depth_max"].value<int>())
      cfg.system.queue_depth_max = *v;
    if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>())
      cfg.system.bulk_dequeue_size = *v;
    if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>())
      cfg.system.ipc_timeout_ms = *v;
    if (auto v = tbl["system"]["max_ipc_connections"].value<int>())
      cfg.system.max_ipc_connections = *v;
    if (auto v = tbl["system"]["buffer_size_max"].value<int>())
      cfg.system.buffer_size_max = *v;

  } catch (const toml::parse_error &err) {
    // Best-effort: report and fall through with defaults.
    std::cerr << "[config] TOML parse error in " << path << ": " << err.what()
              << "\n";
  }
  return cfg;
}
|
||||
|
||||
// ── SerpAPI URL builder ─────────────────────────────────────────────────────
|
||||
|
||||
static std::string build_serpapi_url(const Config &cfg,
|
||||
const SearchOptions &opts, int start) {
|
||||
std::ostringstream url;
|
||||
url << "https://serpapi.com/search.json"
|
||||
<< "?engine=" << url_encode(opts.engine)
|
||||
<< "&q=" << url_encode(opts.query)
|
||||
<< "&api_key=" << url_encode(cfg.serpapi_key)
|
||||
<< "&hl=" << url_encode(opts.hl)
|
||||
<< "&google_domain=" << url_encode(opts.google_domain);
|
||||
|
||||
if (opts.lat != 0 || opts.lng != 0) {
|
||||
char llBuf[128];
|
||||
snprintf(llBuf, sizeof(llBuf), "@%.7f,%.7f,%dz", opts.lat, opts.lng,
|
||||
opts.zoom);
|
||||
url << "&ll=" << url_encode(std::string(llBuf));
|
||||
}
|
||||
|
||||
if (start > 0) {
|
||||
url << "&start=" << start;
|
||||
}
|
||||
|
||||
return url.str();
|
||||
}
|
||||
|
||||
// ── JSON result parser ──────────────────────────────────────────────────────
|
||||
|
||||
// Convert a SerpAPI JSON array of place objects into MapResult entries,
// appending to `out`. Fields that are absent or of an unexpected type are
// simply left at their defaults; non-object array elements are skipped.
static void parse_results(const rapidjson::Value &arr,
                          std::vector<MapResult> &out) {
  if (!arr.IsArray())
    return;

  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    const auto &obj = arr[i];
    if (!obj.IsObject())
      continue;

    MapResult r;

    // Preserve the full source object for downstream storage/debugging.
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    obj.Accept(writer);
    r.raw_json = std::string(buf.GetString(), buf.GetSize());

    // Copy a string member into dst when present and actually a string.
    auto str_field = [&obj](const char *name, std::string &dst) {
      if (obj.HasMember(name) && obj[name].IsString())
        dst = obj[name].GetString();
    };

    str_field("title", r.title);
    str_field("place_id", r.place_id);
    str_field("data_id", r.data_id);
    str_field("address", r.address);
    str_field("phone", r.phone);
    str_field("website", r.website);
    str_field("type", r.type);
    str_field("thumbnail", r.thumbnail);

    if (obj.HasMember("rating") && obj["rating"].IsNumber())
      r.rating = obj["rating"].GetDouble();
    if (obj.HasMember("reviews") && obj["reviews"].IsInt())
      r.reviews = obj["reviews"].GetInt();

    if (obj.HasMember("gps_coordinates") && obj["gps_coordinates"].IsObject()) {
      const auto &gps = obj["gps_coordinates"];
      if (gps.HasMember("latitude") && gps["latitude"].IsNumber())
        r.gps.lat = gps["latitude"].GetDouble();
      if (gps.HasMember("longitude") && gps["longitude"].IsNumber())
        r.gps.lng = gps["longitude"].GetDouble();
    }

    if (obj.HasMember("types") && obj["types"].IsArray()) {
      const auto &types = obj["types"];
      for (rapidjson::SizeType j = 0; j < types.Size(); ++j) {
        if (types[j].IsString())
          r.types.push_back(types[j].GetString());
      }
    }

    out.push_back(std::move(r));
  }
}
|
||||
|
||||
// ── Main search function ────────────────────────────────────────────────────
|
||||
|
||||
/// Execute a SerpAPI Google Maps search, paginating in PAGE_SIZE steps until
/// opts.limit results are collected, a page comes back short or empty, or an
/// HTTP/parse error occurs. On error, partial results are kept and
/// result.error is set. The final list is trimmed to opts.limit.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts) {
  SearchResult result;

  if (cfg.serpapi_key.empty()) {
    result.error = "No SerpAPI key configured";
    return result;
  }

  if (opts.query.empty()) {
    result.error = "Empty search query";
    return result;
  }

  // Pagination step used both as the "start" offset increment and as the
  // full-page heuristic below.
  const int PAGE_SIZE = 20;
  int start = 0;

  while (static_cast<int>(result.results.size()) < opts.limit) {
    std::string url = build_serpapi_url(cfg, opts, start);
    auto resp = http::get(url);
    result.apiCalls++;

    if (resp.status_code != 200) {
      result.error = "SerpAPI HTTP " + std::to_string(resp.status_code);
      break;
    }

    rapidjson::Document doc;
    doc.Parse(resp.body.c_str());
    if (doc.HasParseError()) {
      result.error = "Failed to parse SerpAPI response";
      break;
    }

    // Snapshot so we can tell how many results this page contributed.
    size_t beforeCount = result.results.size();

    // local_results (main listing)
    if (doc.HasMember("local_results") && doc["local_results"].IsArray()) {
      parse_results(doc["local_results"], result.results);
    }

    // place_results (single result or array)
    if (doc.HasMember("place_results")) {
      if (doc["place_results"].IsArray()) {
        parse_results(doc["place_results"], result.results);
      } else if (doc["place_results"].IsObject()) {
        // Wrap the lone object in a temporary array so parse_results can
        // handle both shapes uniformly (deep-copied via arr's allocator).
        rapidjson::Document arr;
        arr.SetArray();
        arr.PushBack(rapidjson::Value(doc["place_results"], arr.GetAllocator()),
                     arr.GetAllocator());
        parse_results(arr, result.results);
      }
    }

    size_t pageCount = result.results.size() - beforeCount;

    if (pageCount == 0)
      break; // No more results
    if (static_cast<int>(pageCount) < PAGE_SIZE)
      break; // Last page (partial)

    start += PAGE_SIZE;
  }

  // Trim to limit (a full last page may overshoot).
  if (static_cast<int>(result.results.size()) > opts.limit) {
    result.results.resize(opts.limit);
  }

  return result;
}
|
||||
|
||||
// ── Geo enrichment ──────────────────────────────────────────────────────────
|
||||
|
||||
std::string resolve_geo(double lat, double lng, const std::string &key,
|
||||
int timeout_ms) {
|
||||
if (key.empty())
|
||||
return "{}";
|
||||
char url[512];
|
||||
snprintf(
|
||||
url, sizeof(url),
|
||||
"https://api.bigdatacloud.net/data/"
|
||||
"reverse-geocode?latitude=%.7f&longitude=%.7f&localityLanguage=en&key=%s",
|
||||
lat, lng, key.c_str());
|
||||
|
||||
http::GetOptions opts;
|
||||
opts.timeout_ms = timeout_ms;
|
||||
auto resp = http::get(url, opts);
|
||||
if (resp.status_code == 200 && !resp.body.empty()) {
|
||||
return resp.body;
|
||||
}
|
||||
return "{}";
|
||||
}
|
||||
|
||||
/// Reverse-geocode every result that has non-zero GPS coordinates, writing
/// the response JSON into MapResult::geo_json. Work is distributed across up
/// to `concurrency` threads via a shared atomic index: each thread claims a
/// distinct index with fetch_add, so no two threads ever touch the same
/// element and no locking is needed.
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
                       int concurrency, int timeout_ms) {
  if (key.empty() || results.empty())
    return;

  std::atomic<size_t> current_idx{0};
  std::vector<std::thread> threads;

  // Never spawn more threads than there are items.
  int num_threads =
      std::min<int>(concurrency, static_cast<int>(results.size()));

  for (int i = 0; i < num_threads; ++i) {
    threads.emplace_back([&]() {
      while (true) {
        // Claim the next unprocessed index; stop once the range is exhausted.
        size_t idx = current_idx.fetch_add(1);
        if (idx >= results.size())
          break;

        auto &r = results[idx];
        // (0,0) doubles as "no coordinates" — skip those entries.
        if (r.gps.lat != 0 || r.gps.lng != 0) {
          r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms);
        }
      }
    });
  }

  // Workers capture locals by reference, so they must be joined before return.
  for (auto &t : threads) {
    if (t.joinable())
      t.join();
  }
}
|
||||
|
||||
} // namespace search
|
||||
@ -1,65 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "search/search.h"
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace polymech {
|
||||
|
||||
// ── Filter context ──────────────────────────────────────────────────────────
|
||||
// All runtime data a filter predicate may need. Passed by const-ref so filters
|
||||
// are pure read-only functions with no side-effects.
|
||||
|
||||
struct WaypointCtx {
|
||||
double lat;
|
||||
double lng;
|
||||
double radius_km;
|
||||
std::string area_gid; // e.g. "ESP.6.1.10.2_1"
|
||||
};
|
||||
|
||||
struct FilterContext {
|
||||
const WaypointCtx& waypoint;
|
||||
const std::vector<std::string>& filter_types; // must-match list
|
||||
const std::vector<std::string>& exclude_types; // deny list
|
||||
const std::map<std::string, std::vector<gadm::Feature>>& country_boundaries;
|
||||
};
|
||||
|
||||
// ── Predicate type ──────────────────────────────────────────────────────────
|
||||
// Returns true → KEEP the result.
|
||||
// Returns false → DISCARD the result.
|
||||
using LocationFilter = std::function<bool(const search::MapResult&, const FilterContext&)>;
|
||||
|
||||
// ── Individual filters ──────────────────────────────────────────────────────
|
||||
|
||||
/// Discard results that have no website (non-actionable leads).
|
||||
bool filter_requires_website(const search::MapResult& r, const FilterContext& ctx);
|
||||
|
||||
/// Discard results whose type matches any entry in ctx.exclude_types.
|
||||
bool filter_exclude_types(const search::MapResult& r, const FilterContext& ctx);
|
||||
|
||||
/// If ctx.filter_types is non-empty, keep only results that match ≥1 type.
|
||||
bool filter_match_types(const search::MapResult& r, const FilterContext& ctx);
|
||||
|
||||
/// Keep only results inside the country-level boundary polygon (L0) of the
|
||||
/// waypoint's country. Falls back to radius-based overlap (1.5 × radius_km)
|
||||
/// to gracefully handle legitimate border-proximity results.
|
||||
bool filter_country_boundary(const search::MapResult& r, const FilterContext& ctx);
|
||||
|
||||
// ── Filter set builder ──────────────────────────────────────────────────────
|
||||
|
||||
/// Return the ordered list of default filters applied to every SerpAPI batch.
|
||||
/// Filters are evaluated left-to-right; the first false short-circuits.
|
||||
std::vector<LocationFilter> default_location_filters();
|
||||
|
||||
/// Run `filters` against `result`. Returns true (keep) only if every
|
||||
/// filter passes.
|
||||
bool apply_filters(const search::MapResult& result,
|
||||
const FilterContext& ctx,
|
||||
const std::vector<LocationFilter>& filters);
|
||||
|
||||
} // namespace polymech
|
||||
@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "cmd_gridsearch.h"
|
||||
#include "search/search.h"
|
||||
#include "enrichers/enrichers.h"
|
||||
#include <set>
|
||||
|
||||
namespace polymech {
|
||||
|
||||
struct PostgresStateStore {
|
||||
std::string run_id;
|
||||
std::string user_id;
|
||||
std::string parent_id; // optional: parent run ID for expand jobs
|
||||
bool enabled = false;
|
||||
|
||||
void init_run(const PipelineOptions &opts);
|
||||
void update_status(const std::string &status);
|
||||
void complete_run(const std::string &result_json);
|
||||
void fail_run(const std::string &error_msg);
|
||||
void upsert_places(const std::vector<search::MapResult> &places);
|
||||
void update_place_enrichment(const enrichers::EnrichedNode &enode);
|
||||
|
||||
/// Query places table in chunks to find place_ids that already have meta (enriched).
|
||||
/// Returns set of place_ids that should be skipped during enrichment.
|
||||
std::set<std::string> filter_already_enriched(const std::vector<std::string> &place_ids);
|
||||
};
|
||||
|
||||
} // namespace polymech
|
||||
@ -1,88 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <CLI/CLI.hpp>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <atomic>
|
||||
#include "search/search.h"
|
||||
#include "grid/grid.h"
|
||||
#include <vector>
|
||||
|
||||
namespace polymech {
|
||||
|
||||
std::string json_escape(const std::string &s);
|
||||
|
||||
struct AreaDef {
|
||||
std::string gid;
|
||||
std::string name;
|
||||
int level;
|
||||
};
|
||||
|
||||
struct AccumulatedResult {
|
||||
search::MapResult result;
|
||||
std::string grid_area;
|
||||
std::string grid_gid;
|
||||
};
|
||||
|
||||
struct PipelineOptions {
|
||||
std::vector<AreaDef> areas;
|
||||
grid::GridOptions grid_opts;
|
||||
std::string search_query;
|
||||
std::string search_domain = "google.com";
|
||||
std::string search_language = "en";
|
||||
std::string search_country;
|
||||
int search_limit = 20;
|
||||
int search_zoom = 13;
|
||||
bool dry_run = false;
|
||||
bool enrich = false;
|
||||
std::string config_path = "config/postgres.toml";
|
||||
std::string cache_dir = "cache/gadm";
|
||||
bool persistence_postgres = false;
|
||||
bool daemon_mode = false;
|
||||
std::string job_id;
|
||||
std::string default_user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
|
||||
search::SystemTuningOptions tuning;
|
||||
std::shared_ptr<std::atomic<bool>> cancel_token;
|
||||
std::vector<std::string> filter_types; // if non-empty, only locations matching ≥1 type pass
|
||||
std::vector<std::string> exclude_types; // if non-empty, drop locations matching any
|
||||
bool no_cache = false; // skip pre-enrich dedup — force re-enrichment
|
||||
std::string parent_id; // if set, this run is an "expand" child of another run
|
||||
};
|
||||
|
||||
std::string json_escape(const std::string &s);
|
||||
|
||||
/// Optional callbacks for streaming progress events (used in IPC mode).
|
||||
/// When nullptr / empty, the pipeline runs silently (CLI mode).
|
||||
struct GridsearchCallbacks {
|
||||
/// Emit a progress event. `type` is one of:
|
||||
/// grid-ready, waypoint-start, area, location,
|
||||
/// enrich-start, node, node-error, nodePage
|
||||
/// `json` is the raw JSON payload string.
|
||||
std::function<void(const std::string& type, const std::string& json)> onEvent;
|
||||
};
|
||||
|
||||
CLI::App* setup_cmd_gridsearch(CLI::App& app);
|
||||
|
||||
/// CLI entry point (standalone mode — reads static vars set by CLI11).
|
||||
int run_cmd_gridsearch();
|
||||
|
||||
/// IPC entry point — parse `payload` JSON, run the pipeline, emit events via `cb`.
|
||||
/// Returns 0 on success.
|
||||
int run_cmd_gridsearch_ipc(const std::string& payload,
|
||||
const std::string& jobId,
|
||||
const GridsearchCallbacks& cb,
|
||||
bool daemon_mode = false,
|
||||
const std::string& daemon_uid = "");
|
||||
|
||||
/// Core Pipeline
|
||||
int run_pipeline(const PipelineOptions &opts, std::ostream *file_out,
|
||||
const GridsearchCallbacks &cb);
|
||||
|
||||
/// UDS entry point — starts a persistent AF_UNIX / Named Pipe server that processes
|
||||
/// concurrent jobs using Moodycamel ConcurrentQueue and Taskflow executor.
|
||||
int run_cmd_gridsearch_uds(const std::string& pipe_path,
|
||||
bool daemon_mode,
|
||||
const std::string& daemon_uid);
|
||||
|
||||
} // namespace polymech
|
||||
@ -100,6 +100,8 @@ int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const
|
||||
if (doc.HasMember("prompt") && doc["prompt"].IsString()) opts.prompt = doc["prompt"].GetString();
|
||||
if (doc.HasMember("dry_run") && doc["dry_run"].IsBool()) opts.dry_run = doc["dry_run"].GetBool();
|
||||
if (doc.HasMember("api_key") && doc["api_key"].IsString()) opts.api_key = doc["api_key"].GetString();
|
||||
if (doc.HasMember("router") && doc["router"].IsString()) opts.router = doc["router"].GetString();
|
||||
if (doc.HasMember("model") && doc["model"].IsString()) opts.model = doc["model"].GetString();
|
||||
}
|
||||
|
||||
if (opts.api_key.empty()) {
|
||||
|
||||
@ -18,6 +18,9 @@ int run_cmd_kbot_run();
|
||||
int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb);
|
||||
int run_kbot_run_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb);
|
||||
|
||||
/// Standalone UDS/TCP server for KBot (orchestrator tests, LLM worker).
|
||||
int run_cmd_kbot_uds(const std::string& pipe_path);
|
||||
|
||||
/// Helper to check parsed state
|
||||
bool is_kbot_ai_parsed();
|
||||
bool is_kbot_run_parsed();
|
||||
|
||||
@ -1,351 +0,0 @@
|
||||
#include "gridsearch_serialize.h"
|
||||
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
#include "cmd_gridsearch.h"
|
||||
|
||||
namespace polymech::serialize {
|
||||
|
||||
// ── grid-ready ──────────────────────────────────────────────────────────────
|
||||
|
||||
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("areas"); w.StartArray();
|
||||
for (size_t i = 0; i < waypoints.size(); ++i) {
|
||||
const auto& wp = waypoints[i];
|
||||
w.StartObject();
|
||||
w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str());
|
||||
w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str());
|
||||
w.Key("lat"); w.Double(wp.lat);
|
||||
w.Key("lon"); w.Double(wp.lng);
|
||||
w.Key("radius_km"); w.Double(wp.radius_km);
|
||||
w.Key("area_gid"); w.String(wp.area_gid.c_str());
|
||||
w.Key("area_name"); w.String(wp.area_name.c_str());
|
||||
w.Key("index"); w.Int(static_cast<int>(i));
|
||||
w.EndObject();
|
||||
}
|
||||
w.EndArray();
|
||||
w.Key("total"); w.Int(static_cast<int>(waypoints.size()));
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── waypoint-start ──────────────────────────────────────────────────────────
|
||||
|
||||
std::string waypoint_start(const grid::Waypoint& wp, int index, int total) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str());
|
||||
w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str());
|
||||
w.Key("lat"); w.Double(wp.lat);
|
||||
w.Key("lon"); w.Double(wp.lng);
|
||||
w.Key("radius_km"); w.Double(wp.radius_km);
|
||||
w.Key("area_gid"); w.String(wp.area_gid.c_str());
|
||||
w.Key("area_name"); w.String(wp.area_name.c_str());
|
||||
w.Key("index"); w.Int(index);
|
||||
w.Key("total"); w.Int(total);
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── location (per search result) ────────────────────────────────────────────
|
||||
|
||||
std::string location(const search::MapResult& r, int step) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("location"); w.StartObject();
|
||||
w.Key("title"); w.String(r.title.c_str());
|
||||
w.Key("place_id"); w.String(r.place_id.c_str());
|
||||
w.Key("address"); w.String(r.address.c_str());
|
||||
w.Key("website"); w.String(r.website.c_str());
|
||||
w.Key("type"); w.String(r.type.c_str());
|
||||
w.Key("phone"); w.String(r.phone.c_str());
|
||||
w.Key("rating"); w.Double(r.rating);
|
||||
w.Key("reviews"); w.Int(r.reviews);
|
||||
w.Key("lat"); w.Double(r.gps.lat);
|
||||
w.Key("lng"); w.Double(r.gps.lng);
|
||||
w.Key("types"); w.StartArray();
|
||||
for (const auto& t : r.types) w.String(t.c_str());
|
||||
w.EndArray();
|
||||
w.EndObject();
|
||||
w.Key("areaName"); w.String(("Waypoint " + std::to_string(step)).c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── area_start ──────────────────────────────────────────────────────────────
|
||||
|
||||
std::string area_start(const std::string& area_gid, const std::string& area_name) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("gid"); w.String(area_gid.c_str());
|
||||
w.Key("name"); w.String(area_name.c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── area_finish ─────────────────────────────────────────────────────────────
|
||||
|
||||
std::string area_finish(const std::string& area_gid) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("gid"); w.String(area_gid.c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── waypoint_finish ─────────────────────────────────────────────────────────
|
||||
|
||||
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str());
|
||||
w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str());
|
||||
w.Key("results"); w.Int(results);
|
||||
w.Key("apiCalls"); w.Int(apiCalls);
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── enrich-start ────────────────────────────────────────────────────────────
|
||||
|
||||
std::string enrich_start(int locationCount) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("locationCount"); w.Int(locationCount);
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── nodePage (per page error) ───────────────────────────────────────────────
|
||||
|
||||
std::string node_page(const enrichers::PageError& pe, const std::string& placeId) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("location"); w.String(placeId.c_str());
|
||||
w.Key("url"); w.String(pe.url.c_str());
|
||||
w.Key("status"); w.String(pe.status.c_str());
|
||||
w.Key("error"); w.String(pe.error.c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── node-error ──────────────────────────────────────────────────────────────
|
||||
|
||||
std::string node_error(const enrichers::EnrichedNode& node) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("node"); w.StartObject();
|
||||
w.Key("title"); w.String(node.title.c_str());
|
||||
w.Key("placeId"); w.String(node.place_id.c_str());
|
||||
w.EndObject();
|
||||
w.Key("error"); w.String(node.error.c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── node (enriched location) ────────────────────────────────────────────────
|
||||
|
||||
std::string node(const enrichers::EnrichedNode& n) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
w.Key("idx"); w.Int(n.idx);
|
||||
w.Key("title"); w.String(n.title.c_str());
|
||||
w.Key("placeId"); w.String(n.place_id.c_str());
|
||||
w.Key("website"); w.String(n.website.c_str());
|
||||
w.Key("address"); w.String(n.address.c_str());
|
||||
w.Key("type"); w.String(n.type.c_str());
|
||||
w.Key("status"); w.String(enrichers::status_string(n.status));
|
||||
w.Key("emails"); w.StartArray();
|
||||
for (const auto& e : n.emails) w.String(e.c_str());
|
||||
w.EndArray();
|
||||
|
||||
w.Key("social"); w.StartArray();
|
||||
for (const auto& s : n.socials) {
|
||||
w.StartObject();
|
||||
w.Key("url"); w.String(s.url.c_str());
|
||||
w.Key("platform"); w.String(s.platform.c_str());
|
||||
w.EndObject();
|
||||
}
|
||||
w.EndArray();
|
||||
|
||||
w.Key("sites"); w.StartArray();
|
||||
for (const auto& s : n.sites) {
|
||||
w.StartObject();
|
||||
w.Key("url"); w.String(s.first.c_str());
|
||||
w.Key("name"); w.String("home");
|
||||
w.Key("content"); w.String(s.second.c_str());
|
||||
w.EndObject();
|
||||
}
|
||||
w.EndArray();
|
||||
|
||||
w.Key("pagesFound"); w.Int(n.pages_found);
|
||||
w.Key("pagesScraped"); w.Int(n.pages_scraped);
|
||||
w.Key("metaMs"); w.Int(n.meta_ms);
|
||||
w.Key("emailMs"); w.Int(n.email_ms);
|
||||
w.Key("totalMs"); w.Int(n.total_ms);
|
||||
w.Key("gridArea"); w.String(n.grid_area.c_str());
|
||||
w.Key("gridGid"); w.String(n.grid_gid.c_str());
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── write_options helper ────────────────────────────────────────────────────
|
||||
|
||||
static void write_options(rapidjson::Writer<rapidjson::StringBuffer>& w, const polymech::PipelineOptions& opts) {
|
||||
w.Key("options");
|
||||
w.StartObject();
|
||||
w.Key("jobId"); w.String(opts.job_id.c_str());
|
||||
w.Key("searchQuery"); w.String(opts.search_query.c_str());
|
||||
w.Key("searchDomain"); w.String(opts.search_domain.c_str());
|
||||
w.Key("searchLanguage"); w.String(opts.search_language.c_str());
|
||||
w.Key("searchCountry"); w.String(opts.search_country.c_str());
|
||||
w.Key("searchLimit"); w.Int(opts.search_limit);
|
||||
w.Key("searchZoom"); w.Int(opts.search_zoom);
|
||||
w.Key("dryRun"); w.Bool(opts.dry_run);
|
||||
w.Key("enrich"); w.Bool(opts.enrich);
|
||||
|
||||
w.Key("grid");
|
||||
w.StartObject();
|
||||
w.Key("gridMode"); w.String(opts.grid_opts.gridMode.c_str());
|
||||
w.Key("cellSize"); w.Double(opts.grid_opts.cellSize);
|
||||
w.Key("cellOverlap"); w.Double(opts.grid_opts.cellOverlap);
|
||||
w.Key("centroidOverlap"); w.Double(opts.grid_opts.centroidOverlap);
|
||||
w.Key("maxCellsLimit"); w.Int(opts.grid_opts.maxCellsLimit);
|
||||
w.Key("maxElevation"); w.Double(opts.grid_opts.maxElevation);
|
||||
w.Key("minDensity"); w.Double(opts.grid_opts.minDensity);
|
||||
w.Key("minGhsPop"); w.Double(opts.grid_opts.minGhsPop);
|
||||
w.Key("minGhsBuilt"); w.Double(opts.grid_opts.minGhsBuilt);
|
||||
w.Key("ghsFilterMode"); w.String(opts.grid_opts.ghsFilterMode.c_str());
|
||||
w.Key("allowMissingGhs"); w.Bool(opts.grid_opts.allowMissingGhs);
|
||||
w.Key("bypassFilters"); w.Bool(opts.grid_opts.bypassFilters);
|
||||
w.Key("pathOrder"); w.String(opts.grid_opts.pathOrder.c_str());
|
||||
w.Key("groupByRegion"); w.Bool(opts.grid_opts.groupByRegion);
|
||||
w.EndObject();
|
||||
|
||||
w.Key("areas");
|
||||
w.StartArray();
|
||||
for (const auto& a : opts.areas) {
|
||||
w.StartObject();
|
||||
w.Key("gid"); w.String(a.gid.c_str());
|
||||
w.Key("name"); w.String(a.name.c_str());
|
||||
w.Key("level"); w.Int(a.level);
|
||||
w.EndObject();
|
||||
}
|
||||
w.EndArray();
|
||||
w.EndObject();
|
||||
}
|
||||
|
||||
// ── job_result (with enrichment) ────────────────────────────────────────────
|
||||
|
||||
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
|
||||
int totalEmails, int totalPagesScraped, int freshApiCalls,
|
||||
int waypointCount, int validCells, int skippedCells,
|
||||
int totalResults, const std::vector<std::string>& enrichResults,
|
||||
double totalScannedSqKm, double totalPopulation) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
write_options(w, opts);
|
||||
|
||||
w.Key("enumMs"); w.Int64(enumMs);
|
||||
w.Key("searchMs"); w.Int64(searchMs);
|
||||
w.Key("enrichMs"); w.Int64(enrichMs);
|
||||
w.Key("totalMs"); w.Int64(totalMs);
|
||||
|
||||
w.Key("gridStats");
|
||||
w.StartObject();
|
||||
w.Key("validCells"); w.Int(validCells);
|
||||
w.Key("skippedCells"); w.Int(skippedCells);
|
||||
w.Key("totalWaypoints"); w.Int(waypointCount);
|
||||
w.EndObject();
|
||||
|
||||
w.Key("searchStats");
|
||||
w.StartObject();
|
||||
w.Key("apiCalls"); w.Int(freshApiCalls);
|
||||
w.Key("filtered"); w.Int(0); // placeholder if needed
|
||||
w.Key("areaCount"); w.Int(waypointCount);
|
||||
w.Key("totalResults"); w.Int(totalResults);
|
||||
w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm);
|
||||
w.Key("totalPopulation"); w.Double(totalPopulation);
|
||||
w.EndObject();
|
||||
|
||||
w.Key("totalEmails"); w.Int(totalEmails);
|
||||
|
||||
w.Key("enrichResults");
|
||||
w.StartArray();
|
||||
for (const auto& id : enrichResults) {
|
||||
w.String(id.c_str());
|
||||
}
|
||||
w.EndArray();
|
||||
|
||||
w.Key("freshApiCalls"); w.Int(freshApiCalls);
|
||||
w.Key("waypointCount"); w.Int(waypointCount);
|
||||
w.Key("totalPagesScraped"); w.Int(totalPagesScraped);
|
||||
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
// ── job_result (search only) ────────────────────────────────────────────────
|
||||
|
||||
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
|
||||
int freshApiCalls, int waypointCount, int validCells,
|
||||
int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
|
||||
double totalScannedSqKm, double totalPopulation) {
|
||||
rapidjson::StringBuffer sb;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> w(sb);
|
||||
w.StartObject();
|
||||
write_options(w, opts);
|
||||
|
||||
w.Key("enumMs"); w.Int64(enumMs);
|
||||
w.Key("searchMs"); w.Int64(searchMs);
|
||||
w.Key("enrichMs"); w.Int64(0);
|
||||
w.Key("totalMs"); w.Int64(totalMs);
|
||||
|
||||
w.Key("gridStats");
|
||||
w.StartObject();
|
||||
w.Key("validCells"); w.Int(validCells);
|
||||
w.Key("skippedCells"); w.Int(skippedCells);
|
||||
w.Key("totalWaypoints"); w.Int(waypointCount);
|
||||
w.EndObject();
|
||||
|
||||
w.Key("searchStats");
|
||||
w.StartObject();
|
||||
w.Key("apiCalls"); w.Int(freshApiCalls);
|
||||
w.Key("filtered"); w.Int(0);
|
||||
w.Key("areaCount"); w.Int(waypointCount);
|
||||
w.Key("totalResults"); w.Int(totalResults);
|
||||
w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm);
|
||||
w.Key("totalPopulation"); w.Double(totalPopulation);
|
||||
w.EndObject();
|
||||
|
||||
w.Key("totalEmails"); w.Int(0);
|
||||
|
||||
w.Key("enrichResults");
|
||||
w.StartArray();
|
||||
for (const auto& id : enrichResults) {
|
||||
w.String(id.c_str());
|
||||
}
|
||||
w.EndArray();
|
||||
|
||||
w.Key("freshApiCalls"); w.Int(freshApiCalls);
|
||||
w.Key("waypointCount"); w.Int(waypointCount);
|
||||
w.Key("totalPagesScraped"); w.Int(0);
|
||||
|
||||
w.EndObject();
|
||||
return sb.GetString();
|
||||
}
|
||||
|
||||
} // namespace polymech::serialize
|
||||
@ -1,60 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "grid/grid.h"
|
||||
#include "search/search.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace polymech {
|
||||
struct PipelineOptions;
|
||||
}
|
||||
|
||||
namespace polymech::serialize {
|
||||
|
||||
/// grid-ready event payload
|
||||
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints);
|
||||
|
||||
/// waypoint-start event payload
|
||||
std::string waypoint_start(const grid::Waypoint& wp, int index, int total);
|
||||
|
||||
/// location event payload (per search result)
|
||||
std::string location(const search::MapResult& r, int step);
|
||||
|
||||
/// waypoint-finish event payload (waypoint done)
|
||||
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls);
|
||||
|
||||
/// area-start event payload
|
||||
std::string area_start(const std::string& area_gid, const std::string& area_name);
|
||||
|
||||
/// area-finish event payload
|
||||
std::string area_finish(const std::string& area_gid);
|
||||
|
||||
/// enrich-start event payload
|
||||
std::string enrich_start(int locationCount);
|
||||
|
||||
/// nodePage event payload (per page error)
|
||||
std::string node_page(const enrichers::PageError& pe, const std::string& placeId);
|
||||
|
||||
/// node-error event payload
|
||||
std::string node_error(const enrichers::EnrichedNode& node);
|
||||
|
||||
/// node event payload (enriched location)
|
||||
std::string node(const enrichers::EnrichedNode& node);
|
||||
|
||||
/// job_result summary (with enrichment)
|
||||
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
|
||||
int totalEmails, int totalPagesScraped, int freshApiCalls,
|
||||
int waypointCount, int validCells, int skippedCells,
|
||||
int totalResults, const std::vector<std::string>& enrichResults,
|
||||
double totalScannedSqKm, double totalPopulation);
|
||||
|
||||
/// job_result summary (search only, no enrichment)
|
||||
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
|
||||
int freshApiCalls, int waypointCount, int validCells,
|
||||
int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
|
||||
double totalScannedSqKm, double totalPopulation);
|
||||
|
||||
} // namespace polymech::serialize
|
||||
@ -17,11 +17,6 @@
|
||||
#include "logger/logger.h"
|
||||
#include "postgres/postgres.h"
|
||||
#include "json/json.h"
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
#include "grid/grid.h"
|
||||
#include "search/search.h"
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "cmd_gridsearch.h"
|
||||
#include "cmd_kbot.h"
|
||||
|
||||
#ifndef PROJECT_VERSION
|
||||
@ -77,17 +72,12 @@ int main(int argc, char *argv[]) {
|
||||
db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);
|
||||
|
||||
// Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
|
||||
bool daemon_mode = false;
|
||||
std::string daemon_uid;
|
||||
std::string worker_config = "config/postgres.toml";
|
||||
std::string uds_path;
|
||||
|
||||
auto *worker_cmd = app.add_subcommand(
|
||||
"worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
|
||||
worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)");
|
||||
worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml");
|
||||
worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)");
|
||||
worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path");
|
||||
worker_cmd->add_option("--uds", uds_path,
|
||||
"Listen on TCP port (Windows) or Unix socket path");
|
||||
|
||||
// Subcommand: kbot — AI workflows & task configurations
|
||||
auto* kbot_cmd = polymech::setup_cmd_kbot(app);
|
||||
@ -109,19 +99,10 @@ int main(int argc, char *argv[]) {
|
||||
// ── worker mode ─────────────────────────────────────────────────────────
|
||||
if (worker_cmd->parsed()) {
|
||||
logger::info("Worker mode: listening on stdin");
|
||||
|
||||
if (daemon_mode) {
|
||||
logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid));
|
||||
auto cfg = search::load_config(worker_config);
|
||||
postgres::Config pcfg;
|
||||
pcfg.supabase_url = cfg.supabase_url;
|
||||
pcfg.supabase_key = cfg.supabase_service_key;
|
||||
postgres::init(pcfg);
|
||||
}
|
||||
|
||||
if (!uds_path.empty()) {
|
||||
logger::info("Worker mode: UDS Server active on " + uds_path);
|
||||
int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid);
|
||||
int rc = polymech::run_cmd_kbot_uds(uds_path);
|
||||
return rc;
|
||||
}
|
||||
|
||||
@ -140,27 +121,6 @@ int main(int argc, char *argv[]) {
|
||||
if (req.type == "ping") {
|
||||
ipc::write_message({req.id, "pong", "{}"});
|
||||
|
||||
} else if (req.type == "gridsearch") {
|
||||
logger::info("Worker: gridsearch job received");
|
||||
|
||||
// Build callbacks that emit IPC events.
|
||||
// Progress events use id "0" (unmatched → event for orchestrator).
|
||||
// The final job_result uses the original req.id so the promise resolves.
|
||||
std::string req_id = req.id;
|
||||
polymech::GridsearchCallbacks cb;
|
||||
cb.onEvent = [&req_id](const std::string& type, const std::string& json) {
|
||||
if (type == "job_result") {
|
||||
ipc::write_message({req_id, "job_result", json});
|
||||
} else {
|
||||
ipc::write_message({"0", type, json});
|
||||
}
|
||||
};
|
||||
|
||||
int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid);
|
||||
if (rc != 0) {
|
||||
ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"});
|
||||
}
|
||||
|
||||
} else if (req.type == "job") {
|
||||
// Stub: echo the payload back as job_result
|
||||
ipc::write_message({req.id, "job_result", req.payload});
|
||||
|
||||
@ -53,12 +53,6 @@ add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp)
|
||||
target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json Threads::Threads)
|
||||
catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
|
||||
|
||||
add_executable(test_gridsearch_ipc e2e/test_gridsearch_ipc.cpp ../src/cmd_gridsearch.cpp ../src/cmd_gridsearch-filters.cpp ../src/cmd_gridsearch-uds.cpp ../src/cmd_gridsearch-postgres.cpp ../src/gridsearch_serialize.cpp ../src/sys_metrics.cpp)
|
||||
target_link_libraries(test_gridsearch_ipc PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers Threads::Threads)
|
||||
target_include_directories(test_gridsearch_ipc PRIVATE ${CMAKE_SOURCE_DIR}/src ${asio_SOURCE_DIR}/asio/include ${taskflow_SOURCE_DIR} ${concurrentqueue_SOURCE_DIR})
|
||||
target_compile_definitions(test_gridsearch_ipc PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
|
||||
catch_discover_tests(test_gridsearch_ipc WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
|
||||
|
||||
add_executable(test_ipc unit/test_ipc.cpp)
|
||||
target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc Threads::Threads)
|
||||
catch_discover_tests(test_ipc)
|
||||
|
||||
@ -1,144 +0,0 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
|
||||
#include "../../src/cmd_gridsearch.h"
|
||||
#include "logger/logger.h"
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
static std::string read_file_contents(const std::string &path) {
|
||||
std::ifstream f(path);
|
||||
if (!f.is_open())
|
||||
return "";
|
||||
std::stringstream ss;
|
||||
ss << f.rdbuf();
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
/// Read a JSON config file and inject test-safe overrides:
|
||||
/// - configPath = "config/postgres.toml"
|
||||
/// - enrich = false (no live HTTP / thread-pool in tests)
|
||||
/// - persistencePostgres = false
|
||||
static std::string load_test_payload(const std::string &config_path) {
|
||||
std::string raw = read_file_contents(config_path);
|
||||
if (raw.empty())
|
||||
return "";
|
||||
|
||||
rapidjson::Document doc;
|
||||
doc.Parse(raw.c_str());
|
||||
if (doc.HasParseError())
|
||||
return "";
|
||||
|
||||
auto &alloc = doc.GetAllocator();
|
||||
|
||||
// Remove-then-add ensures no double-add assertion from rapidjson
|
||||
auto inject_bool = [&](const char *key, bool val) {
|
||||
if (doc.HasMember(key))
|
||||
doc.RemoveMember(key);
|
||||
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val), alloc);
|
||||
};
|
||||
auto inject_str = [&](const char *key, const char *val) {
|
||||
if (doc.HasMember(key))
|
||||
doc.RemoveMember(key);
|
||||
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val, alloc),
|
||||
alloc);
|
||||
};
|
||||
|
||||
inject_str("configPath", "config/postgres.toml");
|
||||
inject_str("cacheDir", "../../packages/gadm/cache/gadm"); // server/cache/gadm
|
||||
inject_bool("enrich", false); // no live enrichment in tests
|
||||
inject_bool("persistencePostgres", false);
|
||||
|
||||
rapidjson::StringBuffer buf;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
|
||||
doc.Accept(writer);
|
||||
return buf.GetString();
|
||||
}
|
||||
|
||||
// ── Tests
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("E2E: Gridsearch Country Boundary Filter (Lamu/KEN)",
|
||||
"[e2e][gridsearch][boundary]") {
|
||||
REQUIRE_NOTHROW(logger::init("test-gridsearch"));
|
||||
|
||||
// Lamu, Kenya — SerpAPI often returns US results for obscure African regions.
|
||||
// boundary_KEN_0.json should filter them out.
|
||||
std::string payload = load_test_payload("config/gridsearch-lamu.json");
|
||||
REQUIRE(!payload.empty());
|
||||
|
||||
std::vector<std::string> location_events;
|
||||
int error_count = 0;
|
||||
|
||||
polymech::GridsearchCallbacks cb;
|
||||
cb.onEvent = [&](const std::string &type, const std::string &json) {
|
||||
if (type == "location") {
|
||||
location_events.push_back(json);
|
||||
} else if (type == "error") {
|
||||
error_count++;
|
||||
std::cout << "[ERROR EVENT]: " << json << "\n";
|
||||
}
|
||||
};
|
||||
|
||||
int result =
|
||||
polymech::run_cmd_gridsearch_ipc(payload, "test-lamu-job", cb, false, "");
|
||||
|
||||
REQUIRE(result == 0);
|
||||
REQUIRE(error_count == 0);
|
||||
|
||||
// All returned locations must be within Kenya (no USA coords).
|
||||
// Verify: no location has lng < -30 (Americas) or lng > 60 (not Africa/Asia)
|
||||
// and lat outside [-5, 5] for Lamu county bounds.
|
||||
int outside_kenya = 0;
|
||||
for (const auto &loc_json : location_events) {
|
||||
rapidjson::Document loc;
|
||||
loc.Parse(loc_json.c_str());
|
||||
if (loc.HasParseError())
|
||||
continue;
|
||||
if (loc.HasMember("gps") && loc["gps"].IsObject()) {
|
||||
double lng =
|
||||
loc["gps"].HasMember("lng") ? loc["gps"]["lng"].GetDouble() : 0;
|
||||
// Kenya longitude range: ~34..42; USA is roughly -130..-60
|
||||
if (lng < 20.0 || lng > 55.0)
|
||||
outside_kenya++;
|
||||
}
|
||||
}
|
||||
|
||||
CHECK(outside_kenya == 0);
|
||||
std::cout << "Lamu boundary test: " << location_events.size()
|
||||
<< " locations kept, " << outside_kenya << " outside Kenya.\n";
|
||||
}
|
||||
|
||||
TEST_CASE("E2E: Gridsearch Type Filter (Sample/ABW)",
|
||||
"[e2e][gridsearch][filter]") {
|
||||
std::string payload = load_test_payload("config/gridsearch-sample.json");
|
||||
REQUIRE(!payload.empty());
|
||||
|
||||
std::vector<std::string> location_events;
|
||||
int error_count = 0;
|
||||
|
||||
polymech::GridsearchCallbacks cb;
|
||||
cb.onEvent = [&](const std::string &type, const std::string &json) {
|
||||
if (type == "location")
|
||||
location_events.push_back(json);
|
||||
else if (type == "error")
|
||||
error_count++;
|
||||
};
|
||||
|
||||
int result = polymech::run_cmd_gridsearch_ipc(payload, "test-sample-job", cb,
|
||||
false, "");
|
||||
|
||||
REQUIRE(result == 0);
|
||||
REQUIRE(error_count == 0);
|
||||
|
||||
std::cout << "Sample (ABW) type filter test: " << location_events.size()
|
||||
<< " locations.\n";
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user