diff --git a/packages/kbot/cpp/CMakeLists.txt b/packages/kbot/cpp/CMakeLists.txt index 20f04f7e..b85d7d76 100644 --- a/packages/kbot/cpp/CMakeLists.txt +++ b/packages/kbot/cpp/CMakeLists.txt @@ -90,12 +90,6 @@ add_subdirectory(packages/http) add_subdirectory(packages/json) add_subdirectory(packages/polymech) add_subdirectory(packages/ipc) -add_subdirectory(packages/geo) -add_subdirectory(packages/gadm_reader) -add_subdirectory(packages/grid) -add_subdirectory(packages/search) -add_subdirectory(packages/enrichers) - add_subdirectory(packages/liboai/liboai) add_subdirectory(packages/kbot) @@ -103,19 +97,15 @@ add_subdirectory(packages/kbot) # ── Sources ────────────────────────────────────────────────────────────────── add_executable(${PROJECT_NAME} src/main.cpp - src/cmd_gridsearch.cpp - src/cmd_gridsearch-filters.cpp - src/cmd_gridsearch-uds.cpp - src/cmd_gridsearch-postgres.cpp src/cmd_kbot.cpp - src/gridsearch_serialize.cpp + src/cmd_kbot_uds.cpp src/sys_metrics.cpp ) # Output file name is kbot.exe / kbot (not kbot-cli) set_target_properties(${PROJECT_NAME} PROPERTIES OUTPUT_NAME "kbot") -target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers kbot) +target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc search kbot) target_include_directories(${PROJECT_NAME} PRIVATE ${asio_SOURCE_DIR}/asio/include diff --git a/packages/kbot/cpp/package.json b/packages/kbot/cpp/package.json index 8fcfa978..a681e7af 100644 --- a/packages/kbot/cpp/package.json +++ b/packages/kbot/cpp/package.json @@ -21,11 +21,6 @@ "kbot:ai": ".\\dist\\kbot.exe kbot ai --prompt \"hi\"", "kbot:run": ".\\dist\\kbot.exe kbot run --list", "test:ipc": "node orchestrator/test-ipc.mjs", - "test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs", - "test:gridsearch-filter-ipc": "cmake --build build/release --target test_gridsearch_ipc && .\\dist\\test_gridsearch_ipc.exe", - "test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs", - "test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs", - "test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs", "test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe" }, "repository": { diff --git a/packages/kbot/cpp/packages/enrichers/CMakeLists.txt b/packages/kbot/cpp/packages/enrichers/CMakeLists.txt deleted file mode 100644 index 5d9338d2..00000000 --- a/packages/kbot/cpp/packages/enrichers/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_library(enrichers STATIC src/enrichers.cpp) - -target_include_directories(enrichers PUBLIC include) -target_link_libraries(enrichers PUBLIC http html json logger) diff --git a/packages/kbot/cpp/packages/enrichers/include/enrichers/enrichers.h b/packages/kbot/cpp/packages/enrichers/include/enrichers/enrichers.h deleted file mode 100644 index d053bab7..00000000 --- a/packages/kbot/cpp/packages/enrichers/include/enrichers/enrichers.h +++ /dev/null @@ -1,162 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace enrichers { - -// ── Status codes ──────────────────────────────────────────────────────────── - -enum class EnrichStatus { - OK, - NO_EMAIL, - META_TIMEOUT, - EMAIL_TIMEOUT, - FETCH_ERROR, - NO_PAGES, - ERROR, -}; - -const char *status_string(EnrichStatus s); - -// ── Data types ────────────────────────────────────────────────────────────── - -struct PageError { - std::string url; - std::string status; // "SEARCHED_EMAIL", "FAILED", ... - std::string method; // "GET", "SCRAPELESS", ... - std::string error; - int http_status = 0; - std::vector emails; -}; - -struct SocialLink { - std::string platform; // "instagram", "facebook", "linkedin", ... - std::string url; -}; - -struct SiteMeta { - std::string title; - std::string description; - std::string og_image; - std::string canonical; - std::vector socials; - std::vector internal_pages; // discovered internal hrefs - std::vector emails; - std::string body_text; - std::string body_html; - std::map sites; // url -> body_md - int http_status = 0; - std::string fetch_error; - std::vector json_ld; -}; - -struct EnrichedNode { - int idx = 0; - std::string title; - std::string place_id; - std::string website; - std::string address; - std::string type; - std::string grid_area; - std::string grid_gid; - int pages_found = 0; - int pages_scraped = 0; - std::vector emails; - std::vector socials; - int meta_ms = 0; - int email_ms = 0; - int total_ms = 0; - EnrichStatus status = EnrichStatus::NO_EMAIL; - std::string error; - std::map pages; // "home" → body text - std::vector meta_pages; - std::vector page_errors; - std::string enricher_hash; - std::string geo_json; - std::map sites; // url -> body_md -}; - -// ── Configuration ─────────────────────────────────────────────────────────── - -struct EnrichConfig { - bool enable_homepage_md = true; - int meta_timeout_ms = 10000; - int email_timeout_ms = 15000; - int email_page_timeout_ms = 10000; - int email_max_pages = 8; - int email_abort_after = 1; - - /// Scrapeless API key — if set, pages that yield no emails via plain - /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping - /// API (JS rendering). Leave empty to disable the fallback. - std::string scrapeless_key; - - std::string bigdata_key; - - std::vector contact_patterns = { - "contact", "kontakt", "contacto", "contacta", "impression", - "about", "impress", "impressum", "datenschutz", "privacy", - "legal", "team", "nosotros", "empresa", "sobre", - }; - std::vector probe_paths = { - "/contact", "/contacto", "/kontakt", "/contacta", - "/about", "/about-us", "/impressum", - }; - - std::string meta_scraper; - int meta_concurrency = 5; - int meta_idle_timeout = 60; -}; - -// ── Location input ────────────────────────────────────────────────────────── - -struct LocationInput { - std::string title; - std::string place_id; - std::string website; - std::string address; - std::string type; - std::string grid_area; - std::string grid_gid; - double lat = 0; - double lng = 0; -}; - -// ── Core API ──────────────────────────────────────────────────────────────── - -/// Check if a candidate string looks like a real email address. -bool is_likely_email(const std::string &candidate); - -/// Extract all email addresses from a text body. -std::vector extract_emails(const std::string &text); - -/// Scrape metadata from a website URL (static HTML via libcurl + lexbor). -SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000); - -/// Scrape emails from a single page URL. -std::vector scrape_emails_from_page(const std::string &url, - int timeout_ms = 10000); - -/// Fetch a page via Scrapeless Universal Scraping API (JS rendering), -/// then extract emails from the rendered HTML. Returns empty if key is -/// blank or the API call fails. -std::vector scrape_emails_scrapeless(const std::string &url, - const std::string &api_key, - int timeout_ms = 15000); - -/// Scrape metadata from a website URL via Scrapeless Universal API (JS -/// rendering). -SiteMeta scrape_meta_scrapeless(const std::string &url, - const std::string &api_key, - int timeout_ms = 15000); - -/// Full enrichment pipeline for a single location: meta → email. -EnrichedNode enrich_location(const LocationInput &loc, - const EnrichConfig &cfg = {}); - -/// Resolve a URL relative to a base URL. -std::string resolve_url(const std::string &base, const std::string &href); - -} // namespace enrichers diff --git a/packages/kbot/cpp/packages/enrichers/src/enrichers.cpp b/packages/kbot/cpp/packages/enrichers/src/enrichers.cpp deleted file mode 100644 index 3497edcc..00000000 --- a/packages/kbot/cpp/packages/enrichers/src/enrichers.cpp +++ /dev/null @@ -1,800 +0,0 @@ -#include "enrichers/enrichers.h" -#include "html/html.h" -#include "http/http.h" -#include "logger/logger.h" -#include "json/json.h" - -#include -#include -#include -#include -#include -#include - -namespace enrichers { - -// ── Status string ─────────────────────────────────────────────────────────── - -const char *status_string(EnrichStatus s) { - switch (s) { - case EnrichStatus::OK: - return "OK"; - case EnrichStatus::NO_EMAIL: - return "NO_EMAIL"; - case EnrichStatus::META_TIMEOUT: - return "META_TIMEOUT"; - case EnrichStatus::EMAIL_TIMEOUT: - return "EMAIL_TIMEOUT"; - case EnrichStatus::FETCH_ERROR: - return "FETCH_ERROR"; - case EnrichStatus::NO_PAGES: - return "NO_PAGES"; - case EnrichStatus::ERROR: - return "ERROR"; - } - return "UNKNOWN"; -} - -// ── Timing helper ─────────────────────────────────────────────────────────── - -static int elapsed_ms(std::chrono::steady_clock::time_point t0) { - auto now = std::chrono::steady_clock::now(); - return static_cast( - std::chrono::duration_cast(now - t0).count()); -} - -// ── Email extraction ──────────────────────────────────────────────────────── - -static const std::regex - EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})", - std::regex::optimize); - -// Asset extensions that disqualify an email-like string -static const std::vector ASSET_EXTS = { - ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp", - ".avif", ".css", ".js", ".woff", ".woff2", ".ttf", - ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico", -}; - -static std::string to_lower(const std::string &s) { - std::string out = s; - std::transform(out.begin(), out.end(), out.begin(), - [](unsigned char c) { return std::tolower(c); }); - return out; -} - -bool is_likely_email(const std::string &candidate) { - if (candidate.size() < 5 || candidate.size() > 254) - return false; - if (candidate.find("..") != std::string::npos) - return false; - auto at_pos = candidate.find('@'); - if (at_pos == std::string::npos || at_pos == 0 || - at_pos == candidate.size() - 1) - return false; - - auto lower = to_lower(candidate); - - // Reject asset-like extensions - for (auto &ext : ASSET_EXTS) { - if (lower.size() >= ext.size() && - lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) { - return false; - } - } - - // Reject common placeholders - if (lower.find("example") != std::string::npos) - return false; - if (lower.find("sentry") != std::string::npos) - return false; - if (lower.find("test") != std::string::npos) - return false; - if (lower.find("placeholder") != std::string::npos) - return false; - if (lower.find("wixpress.com") != std::string::npos) - return false; - - // Reject if local part is pure hex hash (8+ hex chars) - if (at_pos >= 8) { - auto local = lower.substr(0, at_pos); - bool all_hex = std::all_of(local.begin(), local.end(), [](char c) { - return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f'); - }); - if (all_hex) - return false; - } - - // Reject if domain part looks numeric-only (e.g. 1234@5678) - auto domain = lower.substr(at_pos + 1); - auto dot_pos = domain.find('.'); - if (dot_pos == std::string::npos) - return false; - if (domain.length() - dot_pos <= 2) - return false; // Minimum 2 chars for TLD - - auto domPart = domain.substr(0, dot_pos); - bool all_digits = - !domPart.empty() && - std::all_of(domPart.begin(), domPart.end(), - [](unsigned char c) { return std::isdigit(c); }); - if (all_digits) - return false; - - return true; -} - -static bool is_valid_email_char(char c) { - return std::isalnum(static_cast(c)) || c == '.' || c == '_' || - c == '%' || c == '+' || c == '-'; -} - -std::vector extract_emails(const std::string &text) { - std::vector results; - if (text.empty()) - return results; - - std::set seen; - size_t pos = 0; - - while ((pos = text.find('@', pos)) != std::string::npos) { - if (pos == 0 || pos == text.length() - 1) { - pos++; - continue; - } - - // Scan backwards - size_t start = pos; - while (start > 0 && is_valid_email_char(text[start - 1])) { - start--; - } - - // Scan forwards - size_t end = pos; - while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) { - end++; - } - - if (start < pos && end > pos) { - std::string candidate = text.substr(start, end - start + 1); - - // Strip trailing dots/hyphens eagerly grabbed - while (!candidate.empty() && - (candidate.back() == '.' || candidate.back() == '-')) { - candidate.pop_back(); - end--; - } - - // Strip leading dots/hyphens - size_t local_start = 0; - while (local_start < candidate.length() && - (candidate[local_start] == '.' || candidate[local_start] == '-')) { - local_start++; - } - if (local_start > 0) { - candidate = candidate.substr(local_start); - } - - std::string lower = to_lower(candidate); - if (is_likely_email(lower)) { - if (seen.insert(lower).second) { - results.push_back(lower); - } - } - } - pos = end + 1; - } - - return results; -} - -// ── URL resolution ────────────────────────────────────────────────────────── - -std::string resolve_url(const std::string &base, const std::string &href) { - if (href.empty()) - return {}; - - // Already absolute - if (href.find("http://") == 0 || href.find("https://") == 0) - return href; - - // Protocol-relative - if (href.find("//") == 0) { - auto proto_end = base.find("//"); - if (proto_end != std::string::npos) { - return base.substr(0, proto_end) + href; - } - return "https:" + href; - } - - // Skip non-HTTP - if (href.find("mailto:") == 0 || href.find("tel:") == 0 || - href.find("javascript:") == 0 || href[0] == '#') { - return {}; - } - - // Relative path - // Find base origin: https://example.com - auto proto = base.find("://"); - if (proto == std::string::npos) - return {}; - auto origin_end = base.find('/', proto + 3); - std::string origin = - (origin_end != std::string::npos) ? base.substr(0, origin_end) : base; - - if (href[0] == '/') { - return origin + href; - } - - // Relative without leading slash - if (origin_end != std::string::npos) { - auto last_slash = base.rfind('/'); - if (last_slash > proto + 2) { - return base.substr(0, last_slash + 1) + href; - } - } - return origin + "/" + href; -} - -// ── Social link classification ────────────────────────────────────────────── - -static std::string classify_social(const std::string &url) { - auto lower = to_lower(url); - if (lower.find("instagram.com") != std::string::npos) - return "instagram"; - if (lower.find("facebook.com") != std::string::npos) - return "facebook"; - if (lower.find("linkedin.com") != std::string::npos) - return "linkedin"; - if (lower.find("twitter.com") != std::string::npos || - lower.find("x.com") != std::string::npos) - return "twitter"; - if (lower.find("youtube.com") != std::string::npos) - return "youtube"; - if (lower.find("tiktok.com") != std::string::npos) - return "tiktok"; - if (lower.find("pinterest.com") != std::string::npos) - return "pinterest"; - if (lower.find("github.com") != std::string::npos) - return "github"; - return {}; -} - -// ── Same-origin check ─────────────────────────────────────────────────────── - -static std::string get_origin(const std::string &url) { - auto proto = url.find("://"); - if (proto == std::string::npos) - return {}; - auto origin_end = url.find('/', proto + 3); - return (origin_end != std::string::npos) ? url.substr(0, origin_end) : url; -} - -static bool is_same_origin(const std::string &base_url, - const std::string &href) { - auto bo = to_lower(get_origin(base_url)); - auto ho = to_lower(get_origin(href)); - if (bo.empty() || ho.empty()) - return false; - // Strip www. for comparison - auto strip_www = [](std::string &s) { - auto pos = s.find("://www."); - if (pos != std::string::npos) { - s = s.substr(0, pos + 3) + s.substr(pos + 7); - } - }; - strip_www(bo); - strip_www(ho); - return bo == ho; -} - -// ── Contact page matching ─────────────────────────────────────────────────── - -static bool matches_contact_pattern(const std::string &url, - const std::vector &patterns) { - auto lower = to_lower(url); - for (auto &pat : patterns) { - if (lower.find(to_lower(pat)) != std::string::npos) - return true; - } - return false; -} - -// ── Shared HTML parsing logic for Meta ────────────────────────────────────── - -static SiteMeta parse_meta_html(const std::string &url, int http_status, - const std::string &html_body, - const std::string &fetch_error) { - SiteMeta meta; - meta.http_status = http_status; - - if (!fetch_error.empty()) { - meta.fetch_error = fetch_error; - return meta; - } - - meta.body_html = html_body; - - // Parse with lexbor helpers - meta.title = html::get_title(html_body); - meta.description = html::get_meta(html_body, "description"); - meta.og_image = html::get_meta(html_body, "og:image"); - meta.canonical = html::get_canonical(html_body); - meta.body_text = html::get_body_text(html_body); - meta.json_ld = html::get_json_ld(html_body); - - // OG fallbacks - if (meta.description.empty()) - meta.description = html::get_meta(html_body, "og:description"); - if (meta.title.empty()) - meta.title = html::get_meta(html_body, "og:title"); - - // Links — classify into social / internal / mailto - auto links = html::get_links(html_body); - std::set seen_pages; - - // Extract emails from body text (much smaller than raw HTML) - meta.emails = extract_emails(meta.body_text); - - for (auto &lk : links) { - if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { - std::string email = lk.href.substr(7); - // Strip anything after ? (like ?subject=...) - auto q = email.find('?'); - if (q != std::string::npos) - email = email.substr(0, q); - // Clean it - email = to_lower(email); - if (is_likely_email(email)) { - if (std::find(meta.emails.begin(), meta.emails.end(), email) == - meta.emails.end()) { - meta.emails.push_back(email); - } - } - continue; - } - - auto resolved = resolve_url(url, lk.href); - if (resolved.empty()) - continue; - - auto social = classify_social(resolved); - if (!social.empty()) { - meta.socials.push_back({social, resolved}); - continue; - } - - if (is_same_origin(url, resolved)) { - // Strip fragment (#) from URL - auto hash_pos = resolved.find('#'); - if (hash_pos != std::string::npos) { - resolved = resolved.substr(0, hash_pos); - } - if (!resolved.empty() && seen_pages.insert(resolved).second) { - meta.internal_pages.push_back(resolved); - } - } - } - - return meta; -} - -// ── scrape_meta ───────────────────────────────────────────────────────────── - -SiteMeta scrape_meta(const std::string &url, int timeout_ms) { - http::GetOptions opts; - opts.timeout_ms = timeout_ms; - opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36"; - - auto resp = http::get(url, opts); - std::string fetch_err; - if (resp.status_code < 0 || resp.status_code >= 400) { - fetch_err = resp.body; - } - return parse_meta_html(url, static_cast(resp.status_code), resp.body, - fetch_err); -} - -// ── scrape_emails_from_page ───────────────────────────────────────────────── - -std::vector scrape_emails_from_page(const std::string &url, - int timeout_ms, - int &out_status_code) { - http::GetOptions opts; - opts.timeout_ms = timeout_ms; - opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " - "AppleWebKit/537.36 (KHTML, like Gecko) " - "Chrome/120.0.0.0 Safari/537.36"; - - auto resp = http::get(url, opts); - out_status_code = static_cast(resp.status_code); - if (resp.status_code < 0 || resp.status_code >= 400) { - return {}; - } - - // Extract body text then find emails - auto text = html::get_body_text(resp.body); - auto from_text = extract_emails(text); - - // Extract mailto: links from HTML directly without regexing the huge string - auto links = html::get_links(resp.body); - std::set seen(from_text.begin(), from_text.end()); - - for (auto &lk : links) { - if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { - std::string m = lk.href.substr(7); - auto q = m.find('?'); - if (q != std::string::npos) - m = m.substr(0, q); - m = to_lower(m); - if (is_likely_email(m)) { - if (seen.insert(m).second) { - from_text.push_back(m); - } - } - } - } - - return from_text; -} - -static std::string extract_scrapeless_html(const std::string &json_body) { - std::string data = json::get_string(json_body, "data"); - if (data.empty()) { - return json_body; // Fallback to raw response if not found - } - return data; -} - -SiteMeta scrape_meta_scrapeless(const std::string &url, - const std::string &api_key, int timeout_ms) { - if (api_key.empty()) - return parse_meta_html(url, 0, "", "missing api key"); - - std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" + - url + - R"(","jsRender":{"enabled":true,"headless":true}}})"; - - http::PostOptions opts; - opts.content_type = "application/json"; - opts.bearer_token = api_key; - opts.timeout_ms = - std::max(timeout_ms, 45000); // Scrapeless needs generous timeout - - auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request", - payload, opts); - - std::string fetch_err; - if (resp.status_code < 0 || resp.status_code >= 400) { - fetch_err = resp.body; - logger::error("[meta:scrapeless] API Error HTTP " + - std::to_string(resp.status_code) + " for " + url + " : " + - fetch_err); - return parse_meta_html(url, static_cast(resp.status_code), resp.body, - fetch_err); - } - - std::string rendered_html = extract_scrapeless_html(resp.body); - return parse_meta_html(url, static_cast(resp.status_code), rendered_html, - ""); -} - -std::vector scrape_emails_scrapeless(const std::string &url, - const std::string &api_key, - int timeout_ms) { - if (api_key.empty()) - return {}; - - // Build the Scrapeless Universal Scraping API request body. - // We ask for the fully-rendered HTML of the target URL. - std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" + - url + - R"(","jsRender":{"enabled":true,"headless":true}}})"; - - http::PostOptions opts; - opts.content_type = "application/json"; - opts.bearer_token = api_key; - opts.timeout_ms = - std::max(timeout_ms, 45000); // Scrapeless needs generous timeout - - auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request", - payload, opts); - - if (resp.status_code < 0 || resp.status_code >= 400) { - logger::error("[email:scrapeless] API Error HTTP " + - std::to_string(resp.status_code) + " for " + url + " : " + - resp.body); - return {}; // API error — silent fallback - } - - std::string rendered_html = extract_scrapeless_html(resp.body); - - // Parse and extract emails from the rendered HTML - auto text = html::get_body_text(rendered_html); - auto from_text = extract_emails(text); - - // Fast mailto extraction instead of HTML regex - auto links = html::get_links(rendered_html); - std::set seen(from_text.begin(), from_text.end()); - - for (auto &lk : links) { - if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) { - std::string m = lk.href.substr(7); - auto q = m.find('?'); - if (q != std::string::npos) - m = m.substr(0, q); - m = to_lower(m); - if (is_likely_email(m)) { - if (seen.insert(m).second) { - from_text.push_back(m); - } - } - } - } - - return from_text; -} - -// ── enrich_location ───────────────────────────────────────────────────────── - -EnrichedNode enrich_location(const LocationInput &loc, - const EnrichConfig &cfg) { - auto t0 = std::chrono::steady_clock::now(); - - EnrichedNode node; - node.title = loc.title; - node.place_id = loc.place_id; - node.website = loc.website; - node.address = loc.address; - node.type = loc.type; - node.grid_area = loc.grid_area; - node.grid_gid = loc.grid_gid; - node.status = EnrichStatus::NO_EMAIL; - - if (loc.website.empty()) { - node.status = EnrichStatus::FETCH_ERROR; - node.error = "no website"; - node.total_ms = elapsed_ms(t0); - return node; - } - - // ── Phase 1: Meta scrape ──────────────────────────────────────────────── - - auto meta_t0 = std::chrono::steady_clock::now(); - SiteMeta meta; - bool meta_timed_out = false; - - try { - if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) { - logger::debug("[meta:scrapeless] Fetching " + loc.website); - meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key, - cfg.meta_timeout_ms); - } else { - logger::debug("[meta:http] Fetching " + loc.website); - meta = scrape_meta(loc.website, cfg.meta_timeout_ms); - } - } catch (...) { - meta.fetch_error = "exception during meta scrape"; - meta_timed_out = true; - } - node.meta_ms = elapsed_ms(meta_t0); - - // Check if meta took too long (within threshold of timeout) - if (node.meta_ms >= cfg.meta_timeout_ms - 1000) { - meta_timed_out = true; - } - - // logger::info("[" + std::string(loc.title.empty() ? loc.website : loc.title) - // + "] Meta fetch took " + std::to_string(node.meta_ms) + "ms. Links found: " - // + std::to_string(meta.internal_pages.size())); - - if (!meta.body_text.empty()) - node.pages["home"] = meta.body_text; - if (cfg.enable_homepage_md && !meta.body_html.empty()) { - // Cap HTML body at 512 KB to prevent stack overflow in recursive html2md - // parser - static constexpr size_t MAX_HTML_BYTES = 512 * 1024; - if (meta.body_html.size() > MAX_HTML_BYTES) { - logger::warn("[" + loc.title + "] body_html too large (" + - std::to_string(meta.body_html.size() / 1024) + - " KB), skipping markdown conversion"); - } else { - try { - node.sites[loc.website] = html::to_markdown(meta.body_html); - } catch (const std::exception &e) { - logger::warn("[" + loc.title + - "] html::to_markdown failed: " + e.what()); - } catch (...) { - logger::warn("[" + loc.title + - "] html::to_markdown crashed (unknown exception)"); - } - } - } - node.meta_pages = meta.internal_pages; - node.pages_found = static_cast(meta.internal_pages.size()); - node.socials = meta.socials; - - if (!meta.fetch_error.empty()) { - node.error = meta.fetch_error; - node.status = EnrichStatus::FETCH_ERROR; - node.total_ms = elapsed_ms(t0); - return node; - } - - // If meta already found emails, we're done (early exit like TS) - if (!meta.emails.empty()) { - node.emails = meta.emails; - node.status = EnrichStatus::OK; - node.total_ms = elapsed_ms(t0); - return node; - } - - // ── Build contact page list ───────────────────────────────────────────── - - std::vector contact_pages; - std::set seen_urls; - - for (auto &page_url : meta.internal_pages) { - if (matches_contact_pattern(page_url, cfg.contact_patterns)) { - if (seen_urls.insert(page_url).second) { - contact_pages.push_back(page_url); - } - } - } - - // No more probe paths. If we found 0 contact pages, we just give up or time - // out. - - node.pages_found = static_cast(contact_pages.size()); - - if (contact_pages.empty()) { - logger::debug("[" + - std::string(loc.title.empty() ? loc.website : loc.title) + - "] No contact pages found."); - node.status = - meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES; - node.total_ms = elapsed_ms(t0); - return node; - } - - logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) + - "] Contact pages to scrape: " + - std::to_string(contact_pages.size()) + " (parallel)"); - - // ── Phase 2: Email scrape per contact page ────────────────────────────── - - struct AsyncResult { - std::string url; - std::vector errors; - std::vector emails; - int ms; - }; - - int pages_to_scrape = - std::min(static_cast(contact_pages.size()), cfg.email_max_pages); - - std::vector contact_threads; - std::vector contact_results(pages_to_scrape); - - auto email_t0 = std::chrono::steady_clock::now(); - - for (int i = 0; i < pages_to_scrape; ++i) { - auto page_url = contact_pages[i]; - - contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() { - auto start = std::chrono::steady_clock::now(); - AsyncResult res; - res.url = page_url; - - PageError pe1; - pe1.url = page_url; - pe1.method = "GET"; - - int http_status = 0; - try { - // logger::debug("[email:http] Fetching " + page_url); - auto page_emails = scrape_emails_from_page( - page_url, cfg.email_page_timeout_ms, http_status); - pe1.emails = page_emails; - logger::debug("[" + - std::string(loc.title.empty() ? loc.website : loc.title) + - "] HTTP fetch finished code " + - std::to_string(http_status) + " for " + page_url); - - if (page_emails.empty()) { - if (http_status == 404 || http_status == 400 || http_status == 500) { - pe1.status = "NOT_FOUND"; - pe1.error = "HTTP " + std::to_string(http_status); - } else { - pe1.status = "AXIOS_NO_EMAIL"; - res.errors.push_back(pe1); // pushed before scrapeless - - if (cfg.meta_scraper == "SCRAPELESS" && - !cfg.scrapeless_key.empty()) { - PageError pe2; - pe2.url = page_url; - pe2.method = "SCRAPELESS"; - try { - logger::debug("[email:scrapeless] Fallback scraping " + - page_url); - auto s_emails = - scrape_emails_scrapeless(page_url, cfg.scrapeless_key, - cfg.email_page_timeout_ms + 5000); - pe2.emails = s_emails; - pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL"; - if (!s_emails.empty()) - res.emails = s_emails; - logger::debug( - "[" + - std::string(loc.title.empty() ? loc.website : loc.title) + - "] Scrapeless fallback finished for " + page_url); - } catch (...) { - pe2.status = "FAILED"; - pe2.error = "scrapeless exception"; - } - res.errors.push_back(pe2); - } - res.ms = elapsed_ms(start); - contact_results[i] = res; - return; - } - } else { - pe1.status = "SEARCHED_EMAIL"; - res.emails = page_emails; - } - } catch (...) { - pe1.status = "AXIOS_FAILED"; - pe1.error = "exception"; - } - // Only insert pe1 if we didn't already push it during fallback - if (res.errors.empty() || res.errors[0].method != "GET") { - res.errors.insert(res.errors.begin(), pe1); - } - res.ms = elapsed_ms(start); - contact_results[i] = res; - }); - } - - for (auto &t : contact_threads) { - if (t.joinable()) - t.join(); - } - - std::set all_emails; - int pages_scraped = 0; - - for (auto &res : contact_results) { - pages_scraped++; - for (auto &pe : res.errors) { - node.page_errors.push_back(std::move(pe)); - } - for (auto &e : res.emails) { - all_emails.insert(e); - } - } - - node.email_ms = elapsed_ms(email_t0); - node.pages_scraped = pages_scraped; - - // Merge emails - node.emails.assign(all_emails.begin(), all_emails.end()); - - // Final status - bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000; - if (!node.emails.empty()) { - node.status = EnrichStatus::OK; - } else if (email_timed_out) { - node.status = EnrichStatus::EMAIL_TIMEOUT; - } else if (meta_timed_out) { - node.status = EnrichStatus::META_TIMEOUT; - } else { - node.status = EnrichStatus::NO_EMAIL; - } - - node.total_ms = elapsed_ms(t0); - return node; -} - -} // namespace enrichers diff --git a/packages/kbot/cpp/packages/gadm_reader/CMakeLists.txt b/packages/kbot/cpp/packages/gadm_reader/CMakeLists.txt deleted file mode 100644 index 7f860097..00000000 --- a/packages/kbot/cpp/packages/gadm_reader/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_library(gadm_reader STATIC src/gadm_reader.cpp) - -target_include_directories(gadm_reader PUBLIC include) - -# Depends on geo (for Coord type) and json (for RapidJSON) -target_link_libraries(gadm_reader PUBLIC geo json) diff --git a/packages/kbot/cpp/packages/gadm_reader/include/gadm_reader/gadm_reader.h b/packages/kbot/cpp/packages/gadm_reader/include/gadm_reader/gadm_reader.h deleted file mode 100644 index 8dfaaa6a..00000000 --- a/packages/kbot/cpp/packages/gadm_reader/include/gadm_reader/gadm_reader.h +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -#include "geo/geo.h" - -#include -#include -#include - -namespace gadm { - -// ── Feature (mirrors TS GridFeature) ──────────────────────────────────────── - -struct Feature { - std::string gid; // e.g. "ABW", "AFG.1.1_1" - std::string name; // e.g. "Aruba", "Baharak" - int level = 0; // GADM admin level - - // Outer ring + holes (MultiPolygon flattened to rings) - std::vector> rings; - - // Bounding box (computed from rings) - geo::BBox bbox; - - // GHS enrichment (parsed from cached JSON) - double ghsPopulation = 0; - double ghsBuiltWeight = 0; - double ghsPopMaxDensity = 0; - double ghsBuiltMax = 0; - - geo::Coord ghsPopCenter; - geo::Coord ghsBuiltCenter; - - // Weighted centers: [lon, lat, weight] - std::vector> ghsPopCenters; - std::vector> ghsBuiltCenters; - - // Computed from geometry - double areaSqKm = 0; - - bool isOuter = true; -}; - -// ── Result ────────────────────────────────────────────────────────────────── - -struct BoundaryResult { - std::vector features; - std::string error; // empty on success -}; - -// ── API ───────────────────────────────────────────────────────────────────── - -/// Load a pre-cached GADM boundary file. -/// -/// Tries these file paths in order: -/// 1. cacheDir/boundary_{gid}_{targetLevel}.json -/// 2. cacheDir/boundary_{countryCode}_{targetLevel}.json (fallback for country-level) -/// -/// Returns a BoundaryResult with parsed features or an error string. -BoundaryResult load_boundary( - const std::string& gid, - int targetLevel, - const std::string& cacheDir = "cache/gadm" -); - -/// Load a boundary file directly by path. -BoundaryResult load_boundary_file(const std::string& filepath); - -/// Extract the ISO country code from a GID (e.g. "AFG.1.1_1" → "AFG"). -std::string country_code(const std::string& gid); - -/// Infer the GADM level from a GID string. -/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2, etc. -int infer_level(const std::string& gid); - -} // namespace gadm diff --git a/packages/kbot/cpp/packages/gadm_reader/src/gadm_reader.cpp b/packages/kbot/cpp/packages/gadm_reader/src/gadm_reader.cpp deleted file mode 100644 index 7ea96fa7..00000000 --- a/packages/kbot/cpp/packages/gadm_reader/src/gadm_reader.cpp +++ /dev/null @@ -1,231 +0,0 @@ -#include "gadm_reader/gadm_reader.h" - -#include -#include -#include - -#include - -namespace gadm { - -// ── Helpers ───────────────────────────────────────────────────────────────── - -std::string country_code(const std::string& gid) { - auto dot = gid.find('.'); - return (dot != std::string::npos) ? gid.substr(0, dot) : gid; -} - -int infer_level(const std::string& gid) { - // Count dots: "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2 - int dots = 0; - for (char c : gid) { - if (c == '.') dots++; - } - return dots; -} - -static std::string read_file(const std::string& path) { - std::ifstream ifs(path, std::ios::binary); - if (!ifs.is_open()) return ""; - std::ostringstream oss; - oss << ifs.rdbuf(); - return oss.str(); -} - -/// Parse a coord array [lon, lat] → geo::Coord -static geo::Coord parse_coord(const rapidjson::Value& arr) { - if (arr.IsArray() && arr.Size() >= 2) { - return {arr[0].GetDouble(), arr[1].GetDouble()}; - } - return {}; -} - -/// Parse a ring array [[lon,lat], [lon,lat], ...] → vector -static std::vector parse_ring(const rapidjson::Value& arr) { - std::vector ring; - if (!arr.IsArray()) return ring; - ring.reserve(arr.Size()); - for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) { - ring.push_back(parse_coord(arr[i])); - } - return ring; -} - -/// Parse weighted centers [[lon, lat, weight], ...] -static std::vector> parse_weighted_centers( - const rapidjson::Value& arr) { - std::vector> centers; - if (!arr.IsArray()) return centers; - centers.reserve(arr.Size()); - for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) { - const auto& c = arr[i]; - if (c.IsArray() && c.Size() >= 3) { - centers.push_back({c[0].GetDouble(), c[1].GetDouble(), c[2].GetDouble()}); - } - } - return centers; -} - -/// Get a double from properties, with fallback -static double get_double(const rapidjson::Value& props, const char* key, - double fallback = 0.0) { - if (props.HasMember(key) && props[key].IsNumber()) { - return props[key].GetDouble(); - } - return fallback; -} - -/// Get a bool from properties, with fallback -static bool get_bool(const rapidjson::Value& props, const char* key, - bool fallback = true) { - if (props.HasMember(key) && props[key].IsBool()) { - return props[key].GetBool(); - } - return fallback; -} - -/// Get a string from properties, checking GID_0, GID_1, GID_2, etc. -static std::string get_gid(const rapidjson::Value& props) { - // Try GID_5 down to GID_0, return the most specific one found - for (int lvl = 5; lvl >= 0; --lvl) { - std::string key = "GID_" + std::to_string(lvl); - if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) { - return props[key.c_str()].GetString(); - } - } - return ""; -} - -/// Get the name (NAME_0, NAME_1, ... NAME_5) -static std::string get_name(const rapidjson::Value& props) { - for (int lvl = 5; lvl >= 0; --lvl) { - std::string key = "NAME_" + std::to_string(lvl); - if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) { - return props[key.c_str()].GetString(); - } - } - return ""; -} - -/// Parse a single GeoJSON Feature object into a gadm::Feature -static Feature parse_feature(const rapidjson::Value& feat) { - Feature f; - - // Properties - if (feat.HasMember("properties") && feat["properties"].IsObject()) { - const auto& props = feat["properties"]; - f.gid = get_gid(props); - f.name = get_name(props); - f.level = infer_level(f.gid); - f.ghsPopulation = get_double(props, "ghsPopulation"); - f.ghsBuiltWeight = get_double(props, "ghsBuiltWeight"); - f.ghsPopMaxDensity = get_double(props, "ghsPopMaxDensity"); - f.ghsBuiltMax = get_double(props, "ghsBuiltMax"); - f.isOuter = get_bool(props, "isOuter"); - - if (props.HasMember("ghsPopCenter") && props["ghsPopCenter"].IsArray()) { - f.ghsPopCenter = parse_coord(props["ghsPopCenter"]); - } - if (props.HasMember("ghsBuiltCenter") && props["ghsBuiltCenter"].IsArray()) { - f.ghsBuiltCenter = parse_coord(props["ghsBuiltCenter"]); - } - if (props.HasMember("ghsPopCenters") && props["ghsPopCenters"].IsArray()) { - f.ghsPopCenters = parse_weighted_centers(props["ghsPopCenters"]); - } - if (props.HasMember("ghsBuiltCenters") && props["ghsBuiltCenters"].IsArray()) { - f.ghsBuiltCenters = parse_weighted_centers(props["ghsBuiltCenters"]); - } - } - - // Geometry - if (feat.HasMember("geometry") && feat["geometry"].IsObject()) { - const auto& geom = feat["geometry"]; - std::string gtype; - if (geom.HasMember("type") && geom["type"].IsString()) { - gtype = geom["type"].GetString(); - } - - if (geom.HasMember("coordinates") && geom["coordinates"].IsArray()) { - const auto& coords = geom["coordinates"]; - - if (gtype == "Polygon") { - // coordinates: [ [ring], [hole], ... ] - for (rapidjson::SizeType r = 0; r < coords.Size(); ++r) { - f.rings.push_back(parse_ring(coords[r])); - } - } else if (gtype == "MultiPolygon") { - // coordinates: [ [ [ring], [hole] ], [ [ring] ], ... ] - for (rapidjson::SizeType p = 0; p < coords.Size(); ++p) { - if (coords[p].IsArray()) { - for (rapidjson::SizeType r = 0; r < coords[p].Size(); ++r) { - f.rings.push_back(parse_ring(coords[p][r])); - } - } - } - } - } - } - - // Compute bbox and area from first ring (outer boundary) - if (!f.rings.empty() && !f.rings[0].empty()) { - f.bbox = geo::bbox(f.rings[0]); - f.areaSqKm = geo::area_sq_km(f.rings[0]); - } - - return f; -} - -// ── Public API ────────────────────────────────────────────────────────────── - -BoundaryResult load_boundary_file(const std::string& filepath) { - BoundaryResult result; - - std::string json = read_file(filepath); - if (json.empty()) { - result.error = "Failed to read file: " + filepath; - return result; - } - - rapidjson::Document doc; - doc.Parse(json.c_str()); - if (doc.HasParseError()) { - result.error = "JSON parse error in: " + filepath; - return result; - } - - // Expect a FeatureCollection - if (!doc.HasMember("features") || !doc["features"].IsArray()) { - result.error = "Missing 'features' array in: " + filepath; - return result; - } - - const auto& features = doc["features"]; - result.features.reserve(features.Size()); - for (rapidjson::SizeType i = 0; i < features.Size(); ++i) { - result.features.push_back(parse_feature(features[i])); - } - - return result; -} - -BoundaryResult load_boundary(const std::string& gid, int targetLevel, - const std::string& cacheDir) { - std::string cc = country_code(gid); - std::string filename = "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json"; - - // Primary: cacheDir/{countryCode}/boundary_{gid}_{level}.json - std::string path = cacheDir + "/" + cc + "/" + filename; - auto result = load_boundary_file(path); - if (result.error.empty()) return result; - - // Fallback (flat): cacheDir/boundary_{gid}_{level}.json - path = cacheDir + "/" + filename; - result = load_boundary_file(path); - if (result.error.empty()) return result; - - // Both failed - result.error = "No boundary file found for gid=" + gid + " level=" + std::to_string(targetLevel) + " in " + cacheDir; - return result; -} - -} // namespace gadm diff --git a/packages/kbot/cpp/packages/geo/CMakeLists.txt b/packages/kbot/cpp/packages/geo/CMakeLists.txt deleted file mode 100644 index caf09da6..00000000 --- a/packages/kbot/cpp/packages/geo/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -add_library(geo STATIC src/geo.cpp) - -target_include_directories(geo PUBLIC include) - -# No external dependencies — pure math diff --git a/packages/kbot/cpp/packages/geo/include/geo/geo.h b/packages/kbot/cpp/packages/geo/include/geo/geo.h deleted file mode 100644 index e5909961..00000000 --- a/packages/kbot/cpp/packages/geo/include/geo/geo.h +++ /dev/null @@ -1,100 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace geo { - -// ── Constants ─────────────────────────────────────────────────────────────── -constexpr double EARTH_RADIUS_KM = 6371.0; -constexpr double PI = 3.14159265358979323846; -constexpr double DEG2RAD = PI / 180.0; -constexpr double RAD2DEG = 180.0 / PI; - -// ── Core types ────────────────────────────────────────────────────────────── - -struct Coord { - double lon = 0; - double lat = 0; -}; - -struct BBox { - double minLon = 0; - double minLat = 0; - double maxLon = 0; - double maxLat = 0; - - Coord center() const { - return {(minLon + maxLon) / 2.0, (minLat + maxLat) / 2.0}; - } - - double width_deg() const { return maxLon - minLon; } - double height_deg() const { return maxLat - minLat; } -}; - -// ── Distance ──────────────────────────────────────────────────────────────── - -/// Haversine distance between two WGS84 points, in kilometers. -double distance_km(Coord a, Coord b); - -/// Haversine distance in meters. -inline double distance_m(Coord a, Coord b) { return distance_km(a, b) * 1000.0; } - -// ── Bounding box ──────────────────────────────────────────────────────────── - -/// Compute the bounding box of a polygon ring. -BBox bbox(const std::vector& ring); - -/// Compute the bounding box that covers all features' rings. -BBox bbox_union(const std::vector& boxes); - -// ── Centroid ──────────────────────────────────────────────────────────────── - -/// Geometric centroid of a polygon ring (simple average method). -Coord centroid(const std::vector& ring); - -// ── Area ──────────────────────────────────────────────────────────────────── - -/// Approximate area of a polygon ring in square meters. -/// Uses the Shoelace formula with latitude cosine correction. -double area_sq_m(const std::vector& ring); - -/// Area in square kilometers. -inline double area_sq_km(const std::vector& ring) { - return area_sq_m(ring) / 1e6; -} - -// ── Point-in-polygon ──────────────────────────────────────────────────────── - -/// Ray-casting point-in-polygon test. -/// Same algorithm as gadm/cpp pip.h but using Coord structs. -bool point_in_polygon(Coord pt, const std::vector& ring); - -// ── Bearing & destination ─────────────────────────────────────────────────── - -/// Initial bearing from a to b, in degrees (0 = north, 90 = east). -double bearing_deg(Coord from, Coord to); - -/// Compute the destination point given start, bearing (degrees), and distance (km). -Coord destination(Coord from, double bearing_deg, double distance_km); - -// ── Grid tessellation ─────────────────────────────────────────────────────── - -/// Generate a flat square grid of cell centers over a bbox. -/// cellSizeKm defines the side length of each square cell. -/// Returns center coordinates of each cell. -std::vector square_grid(BBox extent, double cellSizeKm); - -/// Generate a flat hex grid of cell centers over a bbox. -/// cellSizeKm defines the distance between hex centers. -/// Returns center coordinates of each cell. -std::vector hex_grid(BBox extent, double cellSizeKm); - -// ── Viewport estimation (matches TS estimateViewportAreaSqKm) ────────────── - -/// Estimate the km² visible in a viewport at a given lat/zoom. -double estimate_viewport_sq_km(double lat, int zoom, - int widthPx = 1024, int heightPx = 768); - -} // namespace geo diff --git a/packages/kbot/cpp/packages/geo/src/geo.cpp b/packages/kbot/cpp/packages/geo/src/geo.cpp deleted file mode 100644 index cebaf1c3..00000000 --- a/packages/kbot/cpp/packages/geo/src/geo.cpp +++ /dev/null @@ -1,204 +0,0 @@ -#include "geo/geo.h" - -#include -#include - - -namespace geo { - -// ── Distance (Haversine) ──────────────────────────────────────────────────── - -double distance_km(Coord a, Coord b) { - double dLat = (b.lat - a.lat) * DEG2RAD; - double dLon = (b.lon - a.lon) * DEG2RAD; - double lat1 = a.lat * DEG2RAD; - double lat2 = b.lat * DEG2RAD; - - double sinDLat = std::sin(dLat / 2.0); - double sinDLon = std::sin(dLon / 2.0); - double h = sinDLat * sinDLat + std::cos(lat1) * std::cos(lat2) * sinDLon * sinDLon; - return 2.0 * EARTH_RADIUS_KM * std::asin(std::sqrt(h)); -} - -// ── Bounding box ──────────────────────────────────────────────────────────── - -BBox bbox(const std::vector& ring) { - if (ring.empty()) return {}; - BBox b{ring[0].lon, ring[0].lat, ring[0].lon, ring[0].lat}; - for (size_t i = 1; i < ring.size(); ++i) { - b.minLon = std::min(b.minLon, ring[i].lon); - b.minLat = std::min(b.minLat, ring[i].lat); - b.maxLon = std::max(b.maxLon, ring[i].lon); - b.maxLat = std::max(b.maxLat, ring[i].lat); - } - return b; -} - -BBox bbox_union(const std::vector& boxes) { - if (boxes.empty()) return {}; - BBox u = boxes[0]; - for (size_t i = 1; i < boxes.size(); ++i) { - u.minLon = std::min(u.minLon, boxes[i].minLon); - u.minLat = std::min(u.minLat, boxes[i].minLat); - u.maxLon = std::max(u.maxLon, boxes[i].maxLon); - u.maxLat = std::max(u.maxLat, boxes[i].maxLat); - } - return u; -} - -// ── Centroid ──────────────────────────────────────────────────────────────── - -Coord centroid(const std::vector& ring) { - if (ring.empty()) return {}; - double sumLon = 0, sumLat = 0; - // Exclude last point if it's the same as first (closed ring) - size_t n = ring.size(); - if (n > 1 && ring[0].lon == ring[n - 1].lon && ring[0].lat == ring[n - 1].lat) { - n--; - } - for (size_t i = 0; i < n; ++i) { - sumLon += ring[i].lon; - sumLat += ring[i].lat; - } - return {sumLon / static_cast(n), sumLat / static_cast(n)}; -} - -// ── Area (Shoelace + latitude cosine correction) ──────────────────────────── - -double area_sq_m(const std::vector& ring) { - if (ring.size() < 3) return 0.0; - - // Shoelace formula in projected coordinates. - // Each degree of longitude = cos(lat) * 111320 meters at that latitude. - // Each degree of latitude = 110540 meters (approximate). - double sum = 0.0; - size_t n = ring.size(); - - for (size_t i = 0; i < n; ++i) { - size_t j = (i + 1) % n; - // Convert coordinates to approximate meters using the average latitude - double avgLat = (ring[i].lat + ring[j].lat) / 2.0; - double cosLat = std::cos(avgLat * DEG2RAD); - - double x_i = ring[i].lon * cosLat * 111320.0; - double y_i = ring[i].lat * 110540.0; - double x_j = ring[j].lon * cosLat * 111320.0; - double y_j = ring[j].lat * 110540.0; - - sum += x_i * y_j - x_j * y_i; - } - return std::abs(sum) / 2.0; -} - -// ── Point-in-polygon (ray casting) ────────────────────────────────────────── - -bool point_in_polygon(Coord pt, const std::vector& ring) { - bool inside = false; - size_t n = ring.size(); - for (size_t i = 0, j = n - 1; i < n; j = i++) { - double xi = ring[i].lon, yi = ring[i].lat; - double xj = ring[j].lon, yj = ring[j].lat; - - if (((yi > pt.lat) != (yj > pt.lat)) && - (pt.lon < (xj - xi) * (pt.lat - yi) / (yj - yi) + xi)) { - inside = !inside; - } - } - return inside; -} - -// ── Bearing ───────────────────────────────────────────────────────────────── - -double bearing_deg(Coord from, Coord to) { - double dLon = (to.lon - from.lon) * DEG2RAD; - double lat1 = from.lat * DEG2RAD; - double lat2 = to.lat * DEG2RAD; - - double y = std::sin(dLon) * std::cos(lat2); - double x = std::cos(lat1) * std::sin(lat2) - - std::sin(lat1) * std::cos(lat2) * std::cos(dLon); - double brng = std::atan2(y, x) * RAD2DEG; - return std::fmod(brng + 360.0, 360.0); -} - -// ── Destination point ─────────────────────────────────────────────────────── - -Coord destination(Coord from, double brng_deg, double dist_km) { - double brng = brng_deg * DEG2RAD; - double lat1 = from.lat * DEG2RAD; - double lon1 = from.lon * DEG2RAD; - double d = dist_km / EARTH_RADIUS_KM; - - double lat2 = std::asin(std::sin(lat1) * std::cos(d) + - std::cos(lat1) * std::sin(d) * std::cos(brng)); - double lon2 = lon1 + std::atan2( - std::sin(brng) * std::sin(d) * std::cos(lat1), - std::cos(d) - std::sin(lat1) * std::sin(lat2)); - - return {lon2 * RAD2DEG, lat2 * RAD2DEG}; -} - -// ── Square grid ───────────────────────────────────────────────────────────── - -std::vector square_grid(BBox extent, double cellSizeKm) { - std::vector centers; - if (cellSizeKm <= 0) return centers; - - // Convert cell size to degrees at the center latitude - double centerLat = (extent.minLat + extent.maxLat) / 2.0; - double cosLat = std::cos(centerLat * DEG2RAD); - if (cosLat < 1e-10) cosLat = 1e-10; // Avoid division by zero near poles - - double cellLatDeg = cellSizeKm / 110.574; // ~110.574 km per degree lat - double cellLonDeg = cellSizeKm / (111.320 * cosLat); // longitude correction - - for (double lat = extent.minLat + cellLatDeg / 2.0; - lat < extent.maxLat; lat += cellLatDeg) { - for (double lon = extent.minLon + cellLonDeg / 2.0; - lon < extent.maxLon; lon += cellLonDeg) { - centers.push_back({lon, lat}); - } - } - return centers; -} - -// ── Hex grid ──────────────────────────────────────────────────────────────── - -std::vector hex_grid(BBox extent, double cellSizeKm) { - std::vector centers; - if (cellSizeKm <= 0) return centers; - - double centerLat = (extent.minLat + extent.maxLat) / 2.0; - double cosLat = std::cos(centerLat * DEG2RAD); - if (cosLat < 1e-10) cosLat = 1e-10; - - // Hex spacing: horizontal = cellSize, vertical = cellSize * sqrt(3)/2 - double cellLatDeg = cellSizeKm / 110.574; - double cellLonDeg = cellSizeKm / (111.320 * cosLat); - double rowHeight = cellLatDeg * std::sqrt(3.0) / 2.0; - - int row = 0; - for (double lat = extent.minLat + rowHeight / 2.0; - lat < extent.maxLat; lat += rowHeight) { - // Offset every other row by half the cell width - double lonOffset = (row % 2 == 1) ? cellLonDeg / 2.0 : 0.0; - for (double lon = extent.minLon + cellLonDeg / 2.0 + lonOffset; - lon < extent.maxLon; lon += cellLonDeg) { - centers.push_back({lon, lat}); - } - row++; - } - return centers; -} - -// ── Viewport estimation ───────────────────────────────────────────────────── - -double estimate_viewport_sq_km(double lat, int zoom, int widthPx, int heightPx) { - double metersPerPx = - (156543.03392 * std::cos(lat * DEG2RAD)) / std::pow(2.0, zoom); - double widthKm = (widthPx * metersPerPx) / 1000.0; - double heightKm = (heightPx * metersPerPx) / 1000.0; - return widthKm * heightKm; -} - -} // namespace geo diff --git a/packages/kbot/cpp/packages/grid/CMakeLists.txt b/packages/kbot/cpp/packages/grid/CMakeLists.txt deleted file mode 100644 index 51da9474..00000000 --- a/packages/kbot/cpp/packages/grid/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -add_library(grid STATIC src/grid.cpp) - -target_include_directories(grid PUBLIC include) - -# Depends on geo (math) and gadm_reader (Feature type) -target_link_libraries(grid PUBLIC geo gadm_reader) diff --git a/packages/kbot/cpp/packages/grid/include/grid/grid.h b/packages/kbot/cpp/packages/grid/include/grid/grid.h deleted file mode 100644 index ca719bc3..00000000 --- a/packages/kbot/cpp/packages/grid/include/grid/grid.h +++ /dev/null @@ -1,56 +0,0 @@ -#pragma once - -#include "geo/geo.h" -#include "gadm_reader/gadm_reader.h" - -#include -#include -#include - -namespace grid { - -// ── Types (mirror TS GridSearchHop) ───────────────────────────────────────── - -struct Waypoint { - int step = 0; - double lng = 0; - double lat = 0; - double radius_km = 0; - std::string area_gid; - std::string area_name; -}; - -struct GridOptions { - std::string gridMode = "hex"; // "hex", "square", "admin", "centers" - double cellSize = 5.0; // km - double cellOverlap = 0.0; - double centroidOverlap = 0.5; - int maxCellsLimit = 15000; - double maxElevation = 0; - double minDensity = 0; - double minGhsPop = 0; - double minGhsBuilt = 0; - std::string ghsFilterMode = "AND"; // "AND" | "OR" - bool allowMissingGhs = false; - bool bypassFilters = false; - std::string pathOrder = "snake"; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest" - bool groupByRegion = true; -}; - -struct GridResult { - std::vector waypoints; - int validCells = 0; - int skippedCells = 0; - std::string error; -}; - -// ── API ───────────────────────────────────────────────────────────────────── - -/// Generate grid waypoints from GADM features + options. -/// This is the main entry point — equivalent to generateGridSearchCells() in TS. -GridResult generate( - const std::vector& features, - const GridOptions& opts -); - -} // namespace grid diff --git a/packages/kbot/cpp/packages/grid/src/grid.cpp b/packages/kbot/cpp/packages/grid/src/grid.cpp deleted file mode 100644 index 3a1ed4b1..00000000 --- a/packages/kbot/cpp/packages/grid/src/grid.cpp +++ /dev/null @@ -1,393 +0,0 @@ -#include "grid/grid.h" - -#include -#include -#include -#include - -namespace grid { - -// ── Internal types ────────────────────────────────────────────────────────── - -struct CellInfo { - geo::Coord center; - double radius_km; - int region_idx; - bool allowed; - std::string reason; -}; - -// ── Filter logic (mirrors checkCellFilters in TS) ─────────────────────────── - -static bool check_filters(const gadm::Feature& feat, const GridOptions& opts, - double areaSqKm, std::string& reason) { - if (opts.bypassFilters) return true; - - // GHS filter - bool checkPop = opts.minGhsPop > 0; - bool checkBuilt = opts.minGhsBuilt > 0; - - if (checkPop || checkBuilt) { - double ghsPop = feat.ghsPopulation; - double ghsBuilt = feat.ghsBuiltWeight; - bool popPass = checkPop && ((ghsPop == 0 && opts.allowMissingGhs) || ghsPop >= opts.minGhsPop); - bool builtPass = checkBuilt && ((ghsBuilt == 0 && opts.allowMissingGhs) || ghsBuilt >= opts.minGhsBuilt); - - if (opts.ghsFilterMode == "OR") { - if (checkPop && checkBuilt && !popPass && !builtPass) { - reason = "GHS (OR) below thresholds"; - return false; - } else if (checkPop && !checkBuilt && !popPass) { - reason = "GHS Pop below threshold"; - return false; - } else if (checkBuilt && !checkPop && !builtPass) { - reason = "GHS Built below threshold"; - return false; - } - } else { - if (checkPop && !popPass) { - reason = "GHS Pop below threshold"; - return false; - } - if (checkBuilt && !builtPass) { - reason = "GHS Built below threshold"; - return false; - } - } - } - - return true; -} - -// ── Sorting ───────────────────────────────────────────────────────────────── - -static void sort_waypoints(std::vector& wps, const std::string& pathOrder, - double cellSize) { - if (wps.size() <= 1) return; - - double rowTolerance = std::min((cellSize / 111.32) * 0.5, 0.5); - - if (pathOrder == "zigzag" || pathOrder == "snake") { - // Sort top-to-bottom, left-to-right within row tolerance - std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) { - if (std::abs(a.lat - b.lat) > rowTolerance) { - return b.lat < a.lat; // higher lat first (north to south) - } - return a.lng < b.lng; // left to right - }); - - if (pathOrder == "snake") { - // Group into rows, reverse every other row - std::vector> rows; - std::vector currentRow; - double lastY = wps[0].lat; - - for (auto& wp : wps) { - if (std::abs(wp.lat - lastY) > rowTolerance) { - rows.push_back(std::move(currentRow)); - currentRow.clear(); - lastY = wp.lat; - } - currentRow.push_back(wp); - } - if (!currentRow.empty()) rows.push_back(std::move(currentRow)); - - wps.clear(); - for (size_t i = 0; i < rows.size(); ++i) { - if (i % 2 == 1) std::reverse(rows[i].begin(), rows[i].end()); - for (auto& wp : rows[i]) wps.push_back(std::move(wp)); - } - } - - } else if (pathOrder == "spiral-out" || pathOrder == "spiral-in") { - // Sort by distance from center of all waypoints - double cLon = 0, cLat = 0; - for (const auto& wp : wps) { cLon += wp.lng; cLat += wp.lat; } - cLon /= wps.size(); - cLat /= wps.size(); - geo::Coord center{cLon, cLat}; - - std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) { - double dA = geo::distance_km(center, {a.lng, a.lat}); - double dB = geo::distance_km(center, {b.lng, b.lat}); - return (pathOrder == "spiral-out") ? (dA < dB) : (dA > dB); - }); - - } else if (pathOrder == "shortest") { - // Greedy nearest-neighbor - std::vector sorted; - sorted.reserve(wps.size()); - std::vector used(wps.size(), false); - - sorted.push_back(wps[0]); - used[0] = true; - - for (size_t step = 1; step < wps.size(); ++step) { - const auto& cur = sorted.back(); - double bestDist = 1e18; - size_t bestIdx = 0; - - for (size_t i = 0; i < wps.size(); ++i) { - if (used[i]) continue; - double dx = wps[i].lng - cur.lng; - double dy = wps[i].lat - cur.lat; - double distSq = dx * dx + dy * dy; - if (distSq < bestDist) { - bestDist = distSq; - bestIdx = i; - } - } - - sorted.push_back(wps[bestIdx]); - used[bestIdx] = true; - } - - wps = std::move(sorted); - } -} - -// ── Admin mode ────────────────────────────────────────────────────────────── - -static GridResult generate_admin(const std::vector& features, - const GridOptions& opts) { - GridResult res; - - for (size_t i = 0; i < features.size(); ++i) { - const auto& f = features[i]; - if (f.rings.empty() || f.rings[0].empty()) continue; - - std::string reason; - bool allowed = check_filters(f, opts, f.areaSqKm, reason); - - geo::Coord center = geo::centroid(f.rings[0]); - // Radius = distance from centroid to bbox corner - double radiusKm = geo::distance_km(center, {f.bbox.maxLon, f.bbox.maxLat}); - - if (allowed) { - res.waypoints.push_back({ - static_cast(res.waypoints.size() + 1), - std::round(center.lon * 1e6) / 1e6, - std::round(center.lat * 1e6) / 1e6, - std::round(radiusKm * 100.0) / 100.0, - f.gid, - f.name - }); - res.validCells++; - } else { - res.skippedCells++; - } - } - - return res; -} - -// ── Centers mode ──────────────────────────────────────────────────────────── - -static GridResult generate_centers(const std::vector& features, - const GridOptions& opts) { - GridResult res; - - struct AcceptedCenter { - geo::Coord coord; - }; - std::vector accepted; - - double minAllowedDist = opts.cellSize * (1.0 - opts.centroidOverlap); - - for (size_t i = 0; i < features.size(); ++i) { - const auto& f = features[i]; - - // Collect unique centers by rounding to 5 decimal places - std::map> centersMap; // key → [lon, lat, weight] - - auto addCenter = [&](double lon, double lat, double weight) { - char key[32]; - snprintf(key, sizeof(key), "%.5f,%.5f", lon, lat); - std::string k(key); - if (centersMap.find(k) == centersMap.end()) { - centersMap[k] = {lon, lat, weight}; - } - }; - - // Single pop/built centers - if (f.ghsPopCenter.lon != 0 || f.ghsPopCenter.lat != 0) { - addCenter(f.ghsPopCenter.lon, f.ghsPopCenter.lat, f.ghsPopulation); - } - if (f.ghsBuiltCenter.lon != 0 || f.ghsBuiltCenter.lat != 0) { - addCenter(f.ghsBuiltCenter.lon, f.ghsBuiltCenter.lat, f.ghsBuiltWeight); - } - - // Weighted center arrays - for (const auto& c : f.ghsPopCenters) { - addCenter(c[0], c[1], c[2]); - } - for (const auto& c : f.ghsBuiltCenters) { - addCenter(c[0], c[1], c[2]); - } - - for (const auto& [key, val] : centersMap) { - geo::Coord pt{val[0], val[1]}; - - std::string reason; - // For centers, use the feature's overall filters - bool allowed = check_filters(f, opts, f.areaSqKm, reason); - - // Check overlap with already-accepted centers - if (allowed && !accepted.empty()) { - for (const auto& ac : accepted) { - double dist = geo::distance_km(pt, ac.coord); - if (dist < minAllowedDist) { - allowed = false; - reason = "overlaps another centroid"; - break; - } - } - } - - if (allowed) { - accepted.push_back({pt}); - res.waypoints.push_back({ - static_cast(res.waypoints.size() + 1), - std::round(pt.lon * 1e6) / 1e6, - std::round(pt.lat * 1e6) / 1e6, - std::round((opts.cellSize / 2.0) * 100.0) / 100.0, - f.gid, - f.name - }); - res.validCells++; - } else { - res.skippedCells++; - } - } - } - - return res; -} - -// ── Polygon grid mode (hex / square) ──────────────────────────────────────── - -static GridResult generate_polygon_grid(const std::vector& features, - const GridOptions& opts) { - GridResult res; - - // Compute union bbox of all features - std::vector boxes; - for (const auto& f : features) { - if (!f.rings.empty()) boxes.push_back(f.bbox); - } - if (boxes.empty()) return res; - - geo::BBox extent = geo::bbox_union(boxes); - - // Estimate cell count to prevent runaway - double widthKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.maxLon, extent.minLat}); - double heightKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.minLon, extent.maxLat}); - double approxCellArea = opts.cellSize * opts.cellSize * 2.6; - int approxCells = static_cast(std::ceil((widthKm * heightKm) / approxCellArea)); - - if (approxCells > opts.maxCellsLimit) { - res.error = "Grid too massive (~" + std::to_string(approxCells) + " cells). Increase cell size or select smaller region."; - return res; - } - - // Generate grid centers - std::vector gridCenters; - if (opts.gridMode == "square") { - gridCenters = geo::square_grid(extent, opts.cellSize); - } else { - gridCenters = geo::hex_grid(extent, opts.cellSize); - } - - // For each grid center, check if it intersects any feature polygon - for (const auto& gc : gridCenters) { - bool intersects = false; - int regionIdx = -1; - - for (size_t i = 0; i < features.size(); ++i) { - if (features[i].rings.empty()) continue; - if (geo::point_in_polygon(gc, features[i].rings[0])) { - intersects = true; - regionIdx = static_cast(i); - break; - } - } - - if (!intersects) continue; - - const auto& regionFeat = features[regionIdx]; - std::string reason; - bool allowed = check_filters(regionFeat, opts, regionFeat.areaSqKm, reason); - - // Compute cell radius (half diagonal of cell) - double cellRadiusKm = opts.cellSize * std::sqrt(2.0) / 2.0; - - if (allowed) { - res.waypoints.push_back({ - static_cast(res.waypoints.size() + 1), - std::round(gc.lon * 1e6) / 1e6, - std::round(gc.lat * 1e6) / 1e6, - std::round(cellRadiusKm * 100.0) / 100.0, - regionFeat.gid, - regionFeat.name - }); - res.validCells++; - } else { - res.skippedCells++; - } - } - - return res; -} - -// ── Main entry point ──────────────────────────────────────────────────────── - -GridResult generate(const std::vector& features, - const GridOptions& opts) { - GridResult result; - - if (features.empty()) { - result.error = "No features provided"; - return result; - } - - if (opts.gridMode == "admin") { - result = generate_admin(features, opts); - } else if (opts.gridMode == "centers") { - result = generate_centers(features, opts); - } else { - result = generate_polygon_grid(features, opts); - } - - if (!result.error.empty()) return result; - - // Sort waypoints - if (result.waypoints.size() > 1) { - if (opts.groupByRegion && features.size() > 1) { - std::stable_sort(result.waypoints.begin(), result.waypoints.end(), - [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; }); - - auto start = result.waypoints.begin(); - while (start != result.waypoints.end()) { - auto end = start; - while (end != result.waypoints.end() && end->area_gid == start->area_gid) { - ++end; - } - std::vector group(start, end); - sort_waypoints(group, opts.pathOrder, opts.cellSize); - std::copy(group.begin(), group.end(), start); - start = end; - } - } else { - sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize); - } - } - - // Re-number steps after sorting - for (size_t i = 0; i < result.waypoints.size(); ++i) { - result.waypoints[i].step = static_cast(i + 1); - } - - return result; -} - -} // namespace grid diff --git a/packages/kbot/cpp/packages/kbot/kbot.cpp b/packages/kbot/cpp/packages/kbot/kbot.cpp index 512f32d3..bfe1c74a 100644 --- a/packages/kbot/cpp/packages/kbot/kbot.cpp +++ b/packages/kbot/cpp/packages/kbot/kbot.cpp @@ -1,68 +1,105 @@ #include "kbot.h" -#include #include #include "logger/logger.h" #include "llm_client.h" +#include +#include namespace polymech { namespace kbot { -int run_kbot_ai_pipeline(const KBotOptions& opts, const KBotCallbacks& cb) { - logger::debug("Starting kbot ai pipeline"); - if (opts.dry_run) { - logger::info("Dry run triggered for kbot ai"); - } +namespace { - // Scaffolding multithreaded AI tasks - tf::Executor executor(4); - tf::Taskflow taskflow; - - taskflow.emplace([opts, cb](){ - logger::debug("Executing kbot ai completion via LLMClient..."); - LLMClient client(opts); - - std::string target_prompt = opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'" : opts.prompt; - LLMResponse res = client.execute_chat(target_prompt); - - if (res.success) { - std::cout << res.text << "\n"; - if (cb.onEvent) { - cb.onEvent("ai_progress", "{\"message\":\"Task completion received\"}"); - } - } else { - logger::error("AI Task Failed: " + res.error); - if (cb.onEvent) { - cb.onEvent("ai_error", "{\"error\":\"Task failed\"}"); - } - } - }); - - executor.run(taskflow).wait(); - - if (cb.onEvent) { - cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"ai\"}"); - } - return 0; +std::string json_job_result_ai(bool success, const std::string &text_or_error, bool is_text) { + rapidjson::StringBuffer buf; + rapidjson::Writer w(buf); + w.StartObject(); + w.Key("status"); + w.String(success ? "success" : "error"); + w.Key("mode"); + w.String("ai"); + if (success && is_text) { + w.Key("text"); + w.String(text_or_error.c_str(), + static_cast(text_or_error.size())); + } else if (!success) { + w.Key("error"); + w.String(text_or_error.c_str(), + static_cast(text_or_error.size())); + } + w.EndObject(); + return std::string(buf.GetString(), buf.GetSize()); } -int run_kbot_run_pipeline(const KBotRunOptions& opts, const KBotCallbacks& cb) { - logger::info("Starting kbot run pipeline (stub) for config: " + opts.config); - if (opts.dry) { - logger::info("Dry run triggered for kbot run"); - } - if (opts.list) { - logger::info("List configs mode enabled"); - } - - // Stub std::system call execution (simulating child_process.execFileSync from TypeScript) - if (!opts.dry && !opts.list) { - logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config); - } +} // namespace +int run_kbot_ai_pipeline(const KBotOptions &opts, const KBotCallbacks &cb) { + logger::debug("Starting kbot ai pipeline"); + + if (opts.dry_run) { + logger::info("Dry run triggered for kbot ai"); if (cb.onEvent) { - cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}"); + cb.onEvent("job_result", json_job_result_ai(true, "[dry-run] no LLM call", true)); } return 0; + } + + LLMClient client(opts); + const std::string target_prompt = + opts.prompt.empty() ? "Respond with 'Hello from KBot C++ AI Pipeline!'" + : opts.prompt; + + logger::debug("Executing kbot ai completion via LLMClient..."); + LLMResponse res = client.execute_chat(target_prompt); + + if (res.success) { + std::cout << res.text << "\n"; + if (cb.onEvent) { + cb.onEvent("ai_progress", + "{\"message\":\"Task completion received\",\"has_text\":true}"); + } + } else { + logger::error("AI Task Failed: " + res.error); + if (cb.onEvent) { + rapidjson::StringBuffer ebuf; + rapidjson::Writer ew(ebuf); + ew.StartObject(); + ew.Key("error"); + ew.String(res.error.c_str(), + static_cast(res.error.size())); + ew.EndObject(); + cb.onEvent("ai_error", + std::string(ebuf.GetString(), ebuf.GetSize())); + } + } + + if (cb.onEvent) { + if (res.success) + cb.onEvent("job_result", json_job_result_ai(true, res.text, true)); + else + cb.onEvent("job_result", json_job_result_ai(false, res.error, false)); + } + + return res.success ? 0 : 1; +} + +int run_kbot_run_pipeline(const KBotRunOptions &opts, const KBotCallbacks &cb) { + logger::info("Starting kbot run pipeline (stub) for config: " + opts.config); + if (opts.dry) { + logger::info("Dry run triggered for kbot run"); + } + if (opts.list) { + logger::info("List configs mode enabled"); + } + + if (!opts.dry && !opts.list) { + logger::info("Simulating launching: .vscode/launch.json targeting " + opts.config); + } + + if (cb.onEvent) { + cb.onEvent("job_result", "{\"status\":\"success\",\"mode\":\"run\"}"); + } + return 0; } } // namespace kbot diff --git a/packages/kbot/cpp/packages/search/CMakeLists.txt b/packages/kbot/cpp/packages/search/CMakeLists.txt deleted file mode 100644 index 976f3be1..00000000 --- a/packages/kbot/cpp/packages/search/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -add_library(search STATIC src/search.cpp) - -target_include_directories(search PUBLIC include) - -# Depends on http (curl) and json (RapidJSON wrapper) -target_link_libraries(search PUBLIC http json) -target_link_libraries(search PRIVATE tomlplusplus::tomlplusplus) diff --git a/packages/kbot/cpp/packages/search/include/search/search.h b/packages/kbot/cpp/packages/search/include/search/search.h deleted file mode 100644 index 3a0e136d..00000000 --- a/packages/kbot/cpp/packages/search/include/search/search.h +++ /dev/null @@ -1,93 +0,0 @@ -#pragma once - -#include -#include - -namespace search { - -// ── Result types ──────────────────────────────────────────────────────────── - -struct GpsCoordinates { - double lat = 0; - double lng = 0; -}; - -struct MapResult { - std::string title; - std::string place_id; - std::string data_id; - std::string address; - std::string phone; - std::string website; - std::string type; - std::vector types; - double rating = 0; - int reviews = 0; - GpsCoordinates gps; - std::string thumbnail; - std::string raw_json; - std::string geo_json; -}; - -struct SearchResult { - std::vector results; - int apiCalls = 0; - std::string error; -}; - -// ── Config ────────────────────────────────────────────────────────────────── - -struct SystemTuningOptions { - int executor_threads = 0; // 0 = hardware concurrency - int max_concurrent_jobs_per_user = 10; - int http_concurrency_throttle = 50; - int queue_depth_max = 10000; - int bulk_dequeue_size = 1; - int ipc_timeout_ms = 300000; - int max_ipc_connections = 100; - int buffer_size_max = 50 * 1024 * 1024; -}; - -struct Config { - SystemTuningOptions system; - std::string serpapi_key; - std::string geocoder_key; - std::string bigdata_key; - std::string scrapeless_key; - std::string postgres_url; - std::string supabase_url; - std::string supabase_service_key; - // [enricher] - std::string enricher_meta_scraper; - int enricher_meta_concurrency = 5; - int enricher_meta_idle_timeout = 60; - int enricher_location_concurrency = 1; -}; - -/// Load config from a TOML file (e.g. config/postgres.toml) -Config load_config(const std::string &path = "config/postgres.toml"); - -// ── Search API ────────────────────────────────────────────────────────────── - -struct SearchOptions { - std::string query; - double lat = 0; - double lng = 0; - int zoom = 13; - int limit = 20; - std::string engine = "google_maps"; - std::string hl = "en"; - std::string google_domain = "google.com"; -}; - -/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit. -SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts); - -/// Resolve geo coordinate to place info -std::string resolve_geo(double lat, double lng, const std::string &key, - int timeout_ms = 3000); - -void resolve_geo_batch(std::vector &results, const std::string &key, - int concurrency = 10, int timeout_ms = 3000); - -} // namespace search diff --git a/packages/kbot/cpp/packages/search/src/search.cpp b/packages/kbot/cpp/packages/search/src/search.cpp deleted file mode 100644 index ce0c9100..00000000 --- a/packages/kbot/cpp/packages/search/src/search.cpp +++ /dev/null @@ -1,311 +0,0 @@ -#include "search/search.h" -#include "http/http.h" - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace search { - -// ── URL encoding (minimal) ────────────────────────────────────────────────── - -static std::string url_encode(const std::string &val) { - std::string result; - result.reserve(val.size() * 2); - for (unsigned char c : val) { - if (isalnum(static_cast(c)) || c == '-' || c == '_' || - c == '.' || c == '~') { - result += static_cast(c); - } else { - char buf[4]; - snprintf(buf, sizeof(buf), "%%%02X", c); - result += buf; - } - } - return result; -} - -// ── Config loading ────────────────────────────────────────────────────────── - -Config load_config(const std::string &path) { - Config cfg; - try { - auto tbl = toml::parse_file(path); - - // [postgres] - if (auto v = tbl["postgres"]["url"].value()) - cfg.postgres_url = *v; - - // [supabase] - if (auto v = tbl["supabase"]["url"].value()) - cfg.supabase_url = *v; - if (auto v = tbl["supabase"]["service_key"].value()) - cfg.supabase_service_key = *v; - - // [services] - if (auto v = tbl["services"]["SERPAPI_KEY"].value()) - cfg.serpapi_key = *v; - if (auto v = tbl["services"]["GEO_CODER_KEY"].value()) - cfg.geocoder_key = *v; - if (auto v = tbl["services"]["BIG_DATA_KEY"].value()) - cfg.bigdata_key = *v; - if (auto v = tbl["services"]["SCRAPELESS_KEY"].value()) - cfg.scrapeless_key = *v; - - // [enricher] - if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value()) - cfg.enricher_meta_scraper = *v; - if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value()) - cfg.enricher_meta_concurrency = *v; - if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value()) - cfg.enricher_meta_idle_timeout = *v; - if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value()) - cfg.enricher_location_concurrency = *v; - - // [system] - if (auto v = tbl["system"]["executor_threads"].value()) - cfg.system.executor_threads = *v; - if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value()) - cfg.system.max_concurrent_jobs_per_user = *v; - if (auto v = tbl["system"]["http_concurrency_throttle"].value()) - cfg.system.http_concurrency_throttle = *v; - if (auto v = tbl["system"]["queue_depth_max"].value()) - cfg.system.queue_depth_max = *v; - if (auto v = tbl["system"]["bulk_dequeue_size"].value()) - cfg.system.bulk_dequeue_size = *v; - if (auto v = tbl["system"]["ipc_timeout_ms"].value()) - cfg.system.ipc_timeout_ms = *v; - if (auto v = tbl["system"]["max_ipc_connections"].value()) - cfg.system.max_ipc_connections = *v; - if (auto v = tbl["system"]["buffer_size_max"].value()) - cfg.system.buffer_size_max = *v; - - } catch (const toml::parse_error &err) { - std::cerr << "[config] TOML parse error in " << path << ": " << err.what() - << "\n"; - } - return cfg; -} - -// ── SerpAPI URL builder ───────────────────────────────────────────────────── - -static std::string build_serpapi_url(const Config &cfg, - const SearchOptions &opts, int start) { - std::ostringstream url; - url << "https://serpapi.com/search.json" - << "?engine=" << url_encode(opts.engine) - << "&q=" << url_encode(opts.query) - << "&api_key=" << url_encode(cfg.serpapi_key) - << "&hl=" << url_encode(opts.hl) - << "&google_domain=" << url_encode(opts.google_domain); - - if (opts.lat != 0 || opts.lng != 0) { - char llBuf[128]; - snprintf(llBuf, sizeof(llBuf), "@%.7f,%.7f,%dz", opts.lat, opts.lng, - opts.zoom); - url << "&ll=" << url_encode(std::string(llBuf)); - } - - if (start > 0) { - url << "&start=" << start; - } - - return url.str(); -} - -// ── JSON result parser ────────────────────────────────────────────────────── - -static void parse_results(const rapidjson::Value &arr, - std::vector &out) { - if (!arr.IsArray()) - return; - - for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) { - const auto &obj = arr[i]; - if (!obj.IsObject()) - continue; - - MapResult r; - - // Capture raw JSON string - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - obj.Accept(writer); - r.raw_json = std::string(buf.GetString(), buf.GetSize()); - - if (obj.HasMember("title") && obj["title"].IsString()) - r.title = obj["title"].GetString(); - if (obj.HasMember("place_id") && obj["place_id"].IsString()) - r.place_id = obj["place_id"].GetString(); - if (obj.HasMember("data_id") && obj["data_id"].IsString()) - r.data_id = obj["data_id"].GetString(); - if (obj.HasMember("address") && obj["address"].IsString()) - r.address = obj["address"].GetString(); - if (obj.HasMember("phone") && obj["phone"].IsString()) - r.phone = obj["phone"].GetString(); - if (obj.HasMember("website") && obj["website"].IsString()) - r.website = obj["website"].GetString(); - if (obj.HasMember("type") && obj["type"].IsString()) - r.type = obj["type"].GetString(); - if (obj.HasMember("rating") && obj["rating"].IsNumber()) - r.rating = obj["rating"].GetDouble(); - if (obj.HasMember("reviews") && obj["reviews"].IsInt()) - r.reviews = obj["reviews"].GetInt(); - if (obj.HasMember("thumbnail") && obj["thumbnail"].IsString()) - r.thumbnail = obj["thumbnail"].GetString(); - - if (obj.HasMember("gps_coordinates") && obj["gps_coordinates"].IsObject()) { - const auto &gps = obj["gps_coordinates"]; - if (gps.HasMember("latitude") && gps["latitude"].IsNumber()) - r.gps.lat = gps["latitude"].GetDouble(); - if (gps.HasMember("longitude") && gps["longitude"].IsNumber()) - r.gps.lng = gps["longitude"].GetDouble(); - } - - if (obj.HasMember("types") && obj["types"].IsArray()) { - for (rapidjson::SizeType j = 0; j < obj["types"].Size(); ++j) { - if (obj["types"][j].IsString()) - r.types.push_back(obj["types"][j].GetString()); - } - } - - out.push_back(std::move(r)); - } -} - -// ── Main search function ──────────────────────────────────────────────────── - -SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts) { - SearchResult result; - - if (cfg.serpapi_key.empty()) { - result.error = "No SerpAPI key configured"; - return result; - } - - if (opts.query.empty()) { - result.error = "Empty search query"; - return result; - } - - const int PAGE_SIZE = 20; - int start = 0; - - while (static_cast(result.results.size()) < opts.limit) { - std::string url = build_serpapi_url(cfg, opts, start); - auto resp = http::get(url); - result.apiCalls++; - - if (resp.status_code != 200) { - result.error = "SerpAPI HTTP " + std::to_string(resp.status_code); - break; - } - - rapidjson::Document doc; - doc.Parse(resp.body.c_str()); - if (doc.HasParseError()) { - result.error = "Failed to parse SerpAPI response"; - break; - } - - size_t beforeCount = result.results.size(); - - // local_results (main listing) - if (doc.HasMember("local_results") && doc["local_results"].IsArray()) { - parse_results(doc["local_results"], result.results); - } - - // place_results (single result or array) - if (doc.HasMember("place_results")) { - if (doc["place_results"].IsArray()) { - parse_results(doc["place_results"], result.results); - } else if (doc["place_results"].IsObject()) { - rapidjson::Document arr; - arr.SetArray(); - arr.PushBack(rapidjson::Value(doc["place_results"], arr.GetAllocator()), - arr.GetAllocator()); - parse_results(arr, result.results); - } - } - - size_t pageCount = result.results.size() - beforeCount; - - if (pageCount == 0) - break; // No more results - if (static_cast(pageCount) < PAGE_SIZE) - break; // Last page (partial) - - start += PAGE_SIZE; - } - - // Trim to limit - if (static_cast(result.results.size()) > opts.limit) { - result.results.resize(opts.limit); - } - - return result; -} - -// ── Geo enrichment ────────────────────────────────────────────────────────── - -std::string resolve_geo(double lat, double lng, const std::string &key, - int timeout_ms) { - if (key.empty()) - return "{}"; - char url[512]; - snprintf( - url, sizeof(url), - "https://api.bigdatacloud.net/data/" - "reverse-geocode?latitude=%.7f&longitude=%.7f&localityLanguage=en&key=%s", - lat, lng, key.c_str()); - - http::GetOptions opts; - opts.timeout_ms = timeout_ms; - auto resp = http::get(url, opts); - if (resp.status_code == 200 && !resp.body.empty()) { - return resp.body; - } - return "{}"; -} - -void resolve_geo_batch(std::vector &results, const std::string &key, - int concurrency, int timeout_ms) { - if (key.empty() || results.empty()) - return; - - std::atomic current_idx{0}; - std::vector threads; - - int num_threads = - std::min(concurrency, static_cast(results.size())); - - for (int i = 0; i < num_threads; ++i) { - threads.emplace_back([&]() { - while (true) { - size_t idx = current_idx.fetch_add(1); - if (idx >= results.size()) - break; - - auto &r = results[idx]; - if (r.gps.lat != 0 || r.gps.lng != 0) { - r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms); - } - } - }); - } - - for (auto &t : threads) { - if (t.joinable()) - t.join(); - } -} - -} // namespace search diff --git a/packages/kbot/cpp/src/cmd_gridsearch-filters.h b/packages/kbot/cpp/src/cmd_gridsearch-filters.h deleted file mode 100644 index 874f005c..00000000 --- a/packages/kbot/cpp/src/cmd_gridsearch-filters.h +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include "search/search.h" -#include "gadm_reader/gadm_reader.h" -#include "geo/geo.h" - -#include -#include -#include -#include - -namespace polymech { - -// ── Filter context ────────────────────────────────────────────────────────── -// All runtime data a filter predicate may need. Passed by const-ref so filters -// are pure read-only functions with no side-effects. - -struct WaypointCtx { - double lat; - double lng; - double radius_km; - std::string area_gid; // e.g. "ESP.6.1.10.2_1" -}; - -struct FilterContext { - const WaypointCtx& waypoint; - const std::vector& filter_types; // must-match list - const std::vector& exclude_types; // deny list - const std::map>& country_boundaries; -}; - -// ── Predicate type ────────────────────────────────────────────────────────── -// Returns true → KEEP the result. -// Returns false → DISCARD the result. -using LocationFilter = std::function; - -// ── Individual filters ────────────────────────────────────────────────────── - -/// Discard results that have no website (non-actionable leads). -bool filter_requires_website(const search::MapResult& r, const FilterContext& ctx); - -/// Discard results whose type matches any entry in ctx.exclude_types. -bool filter_exclude_types(const search::MapResult& r, const FilterContext& ctx); - -/// If ctx.filter_types is non-empty, keep only results that match ≥1 type. -bool filter_match_types(const search::MapResult& r, const FilterContext& ctx); - -/// Keep only results inside the country-level boundary polygon (L0) of the -/// waypoint's country. Falls back to radius-based overlap (1.5 × radius_km) -/// to gracefully handle legitimate border-proximity results. -bool filter_country_boundary(const search::MapResult& r, const FilterContext& ctx); - -// ── Filter set builder ────────────────────────────────────────────────────── - -/// Return the ordered list of default filters applied to every SerpAPI batch. -/// Filters are evaluated left-to-right; the first false short-circuits. -std::vector default_location_filters(); - -/// Run `filters` against `result`. Returns true (keep) only if every -/// filter passes. -bool apply_filters(const search::MapResult& result, - const FilterContext& ctx, - const std::vector& filters); - -} // namespace polymech diff --git a/packages/kbot/cpp/src/cmd_gridsearch-postgres.h b/packages/kbot/cpp/src/cmd_gridsearch-postgres.h deleted file mode 100644 index 2b2fbc80..00000000 --- a/packages/kbot/cpp/src/cmd_gridsearch-postgres.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once - -#include "cmd_gridsearch.h" -#include "search/search.h" -#include "enrichers/enrichers.h" -#include - -namespace polymech { - -struct PostgresStateStore { - std::string run_id; - std::string user_id; - std::string parent_id; // optional: parent run ID for expand jobs - bool enabled = false; - - void init_run(const PipelineOptions &opts); - void update_status(const std::string &status); - void complete_run(const std::string &result_json); - void fail_run(const std::string &error_msg); - void upsert_places(const std::vector &places); - void update_place_enrichment(const enrichers::EnrichedNode &enode); - - /// Query places table in chunks to find place_ids that already have meta (enriched). - /// Returns set of place_ids that should be skipped during enrichment. - std::set filter_already_enriched(const std::vector &place_ids); -}; - -} // namespace polymech diff --git a/packages/kbot/cpp/src/cmd_gridsearch.h b/packages/kbot/cpp/src/cmd_gridsearch.h deleted file mode 100644 index 6835f123..00000000 --- a/packages/kbot/cpp/src/cmd_gridsearch.h +++ /dev/null @@ -1,88 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "search/search.h" -#include "grid/grid.h" -#include - -namespace polymech { - -std::string json_escape(const std::string &s); - -struct AreaDef { - std::string gid; - std::string name; - int level; -}; - -struct AccumulatedResult { - search::MapResult result; - std::string grid_area; - std::string grid_gid; -}; - -struct PipelineOptions { - std::vector areas; - grid::GridOptions grid_opts; - std::string search_query; - std::string search_domain = "google.com"; - std::string search_language = "en"; - std::string search_country; - int search_limit = 20; - int search_zoom = 13; - bool dry_run = false; - bool enrich = false; - std::string config_path = "config/postgres.toml"; - std::string cache_dir = "cache/gadm"; - bool persistence_postgres = false; - bool daemon_mode = false; - std::string job_id; - std::string default_user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421"; - search::SystemTuningOptions tuning; - std::shared_ptr> cancel_token; - std::vector filter_types; // if non-empty, only locations matching ≥1 type pass - std::vector exclude_types; // if non-empty, drop locations matching any - bool no_cache = false; // skip pre-enrich dedup — force re-enrichment - std::string parent_id; // if set, this run is an "expand" child of another run -}; - -std::string json_escape(const std::string &s); - -/// Optional callbacks for streaming progress events (used in IPC mode). -/// When nullptr / empty, the pipeline runs silently (CLI mode). -struct GridsearchCallbacks { - /// Emit a progress event. `type` is one of: - /// grid-ready, waypoint-start, area, location, - /// enrich-start, node, node-error, nodePage - /// `json` is the raw JSON payload string. - std::function onEvent; -}; - -CLI::App* setup_cmd_gridsearch(CLI::App& app); - -/// CLI entry point (standalone mode — reads static vars set by CLI11). -int run_cmd_gridsearch(); - -/// IPC entry point — parse `payload` JSON, run the pipeline, emit events via `cb`. -/// Returns 0 on success. -int run_cmd_gridsearch_ipc(const std::string& payload, - const std::string& jobId, - const GridsearchCallbacks& cb, - bool daemon_mode = false, - const std::string& daemon_uid = ""); - -/// Core Pipeline -int run_pipeline(const PipelineOptions &opts, std::ostream *file_out, - const GridsearchCallbacks &cb); - -/// UDS entry point — starts a persistent AF_UNIX / Named Pipe server that processes -/// concurrent jobs using Moodycamel ConcurrentQueue and Taskflow executor. -int run_cmd_gridsearch_uds(const std::string& pipe_path, - bool daemon_mode, - const std::string& daemon_uid); - -} // namespace polymech diff --git a/packages/kbot/cpp/src/cmd_kbot.cpp b/packages/kbot/cpp/src/cmd_kbot.cpp index e6637fb6..7d6d73c6 100644 --- a/packages/kbot/cpp/src/cmd_kbot.cpp +++ b/packages/kbot/cpp/src/cmd_kbot.cpp @@ -100,6 +100,8 @@ int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const if (doc.HasMember("prompt") && doc["prompt"].IsString()) opts.prompt = doc["prompt"].GetString(); if (doc.HasMember("dry_run") && doc["dry_run"].IsBool()) opts.dry_run = doc["dry_run"].GetBool(); if (doc.HasMember("api_key") && doc["api_key"].IsString()) opts.api_key = doc["api_key"].GetString(); + if (doc.HasMember("router") && doc["router"].IsString()) opts.router = doc["router"].GetString(); + if (doc.HasMember("model") && doc["model"].IsString()) opts.model = doc["model"].GetString(); } if (opts.api_key.empty()) { diff --git a/packages/kbot/cpp/src/cmd_kbot.h b/packages/kbot/cpp/src/cmd_kbot.h index 3c611369..ef8d8f39 100644 --- a/packages/kbot/cpp/src/cmd_kbot.h +++ b/packages/kbot/cpp/src/cmd_kbot.h @@ -18,6 +18,9 @@ int run_cmd_kbot_run(); int run_kbot_ai_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb); int run_kbot_run_ipc(const std::string& payload, const std::string& jobId, const kbot::KBotCallbacks& cb); +/// Standalone UDS/TCP server for KBot (orchestrator tests, LLM worker). +int run_cmd_kbot_uds(const std::string& pipe_path); + /// Helper to check parsed state bool is_kbot_ai_parsed(); bool is_kbot_run_parsed(); diff --git a/packages/kbot/cpp/src/gridsearch_serialize.cpp b/packages/kbot/cpp/src/gridsearch_serialize.cpp deleted file mode 100644 index e8fe1c90..00000000 --- a/packages/kbot/cpp/src/gridsearch_serialize.cpp +++ /dev/null @@ -1,351 +0,0 @@ -#include "gridsearch_serialize.h" - -#include -#include -#include "cmd_gridsearch.h" - -namespace polymech::serialize { - -// ── grid-ready ────────────────────────────────────────────────────────────── - -std::string grid_ready(const std::vector& waypoints) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("areas"); w.StartArray(); - for (size_t i = 0; i < waypoints.size(); ++i) { - const auto& wp = waypoints[i]; - w.StartObject(); - w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str()); - w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str()); - w.Key("lat"); w.Double(wp.lat); - w.Key("lon"); w.Double(wp.lng); - w.Key("radius_km"); w.Double(wp.radius_km); - w.Key("area_gid"); w.String(wp.area_gid.c_str()); - w.Key("area_name"); w.String(wp.area_name.c_str()); - w.Key("index"); w.Int(static_cast(i)); - w.EndObject(); - } - w.EndArray(); - w.Key("total"); w.Int(static_cast(waypoints.size())); - w.EndObject(); - return sb.GetString(); -} - -// ── waypoint-start ────────────────────────────────────────────────────────── - -std::string waypoint_start(const grid::Waypoint& wp, int index, int total) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str()); - w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str()); - w.Key("lat"); w.Double(wp.lat); - w.Key("lon"); w.Double(wp.lng); - w.Key("radius_km"); w.Double(wp.radius_km); - w.Key("area_gid"); w.String(wp.area_gid.c_str()); - w.Key("area_name"); w.String(wp.area_name.c_str()); - w.Key("index"); w.Int(index); - w.Key("total"); w.Int(total); - w.EndObject(); - return sb.GetString(); -} - -// ── location (per search result) ──────────────────────────────────────────── - -std::string location(const search::MapResult& r, int step) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("location"); w.StartObject(); - w.Key("title"); w.String(r.title.c_str()); - w.Key("place_id"); w.String(r.place_id.c_str()); - w.Key("address"); w.String(r.address.c_str()); - w.Key("website"); w.String(r.website.c_str()); - w.Key("type"); w.String(r.type.c_str()); - w.Key("phone"); w.String(r.phone.c_str()); - w.Key("rating"); w.Double(r.rating); - w.Key("reviews"); w.Int(r.reviews); - w.Key("lat"); w.Double(r.gps.lat); - w.Key("lng"); w.Double(r.gps.lng); - w.Key("types"); w.StartArray(); - for (const auto& t : r.types) w.String(t.c_str()); - w.EndArray(); - w.EndObject(); - w.Key("areaName"); w.String(("Waypoint " + std::to_string(step)).c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── area_start ────────────────────────────────────────────────────────────── - -std::string area_start(const std::string& area_gid, const std::string& area_name) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("gid"); w.String(area_gid.c_str()); - w.Key("name"); w.String(area_name.c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── area_finish ───────────────────────────────────────────────────────────── - -std::string area_finish(const std::string& area_gid) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("gid"); w.String(area_gid.c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── waypoint_finish ───────────────────────────────────────────────────────── - -std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("name"); w.String(("Waypoint " + std::to_string(wp.step)).c_str()); - w.Key("gid"); w.String(("wp-" + std::to_string(wp.step)).c_str()); - w.Key("results"); w.Int(results); - w.Key("apiCalls"); w.Int(apiCalls); - w.EndObject(); - return sb.GetString(); -} - -// ── enrich-start ──────────────────────────────────────────────────────────── - -std::string enrich_start(int locationCount) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("locationCount"); w.Int(locationCount); - w.EndObject(); - return sb.GetString(); -} - -// ── nodePage (per page error) ─────────────────────────────────────────────── - -std::string node_page(const enrichers::PageError& pe, const std::string& placeId) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("location"); w.String(placeId.c_str()); - w.Key("url"); w.String(pe.url.c_str()); - w.Key("status"); w.String(pe.status.c_str()); - w.Key("error"); w.String(pe.error.c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── node-error ────────────────────────────────────────────────────────────── - -std::string node_error(const enrichers::EnrichedNode& node) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("node"); w.StartObject(); - w.Key("title"); w.String(node.title.c_str()); - w.Key("placeId"); w.String(node.place_id.c_str()); - w.EndObject(); - w.Key("error"); w.String(node.error.c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── node (enriched location) ──────────────────────────────────────────────── - -std::string node(const enrichers::EnrichedNode& n) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - w.Key("idx"); w.Int(n.idx); - w.Key("title"); w.String(n.title.c_str()); - w.Key("placeId"); w.String(n.place_id.c_str()); - w.Key("website"); w.String(n.website.c_str()); - w.Key("address"); w.String(n.address.c_str()); - w.Key("type"); w.String(n.type.c_str()); - w.Key("status"); w.String(enrichers::status_string(n.status)); - w.Key("emails"); w.StartArray(); - for (const auto& e : n.emails) w.String(e.c_str()); - w.EndArray(); - - w.Key("social"); w.StartArray(); - for (const auto& s : n.socials) { - w.StartObject(); - w.Key("url"); w.String(s.url.c_str()); - w.Key("platform"); w.String(s.platform.c_str()); - w.EndObject(); - } - w.EndArray(); - - w.Key("sites"); w.StartArray(); - for (const auto& s : n.sites) { - w.StartObject(); - w.Key("url"); w.String(s.first.c_str()); - w.Key("name"); w.String("home"); - w.Key("content"); w.String(s.second.c_str()); - w.EndObject(); - } - w.EndArray(); - - w.Key("pagesFound"); w.Int(n.pages_found); - w.Key("pagesScraped"); w.Int(n.pages_scraped); - w.Key("metaMs"); w.Int(n.meta_ms); - w.Key("emailMs"); w.Int(n.email_ms); - w.Key("totalMs"); w.Int(n.total_ms); - w.Key("gridArea"); w.String(n.grid_area.c_str()); - w.Key("gridGid"); w.String(n.grid_gid.c_str()); - w.EndObject(); - return sb.GetString(); -} - -// ── write_options helper ──────────────────────────────────────────────────── - -static void write_options(rapidjson::Writer& w, const polymech::PipelineOptions& opts) { - w.Key("options"); - w.StartObject(); - w.Key("jobId"); w.String(opts.job_id.c_str()); - w.Key("searchQuery"); w.String(opts.search_query.c_str()); - w.Key("searchDomain"); w.String(opts.search_domain.c_str()); - w.Key("searchLanguage"); w.String(opts.search_language.c_str()); - w.Key("searchCountry"); w.String(opts.search_country.c_str()); - w.Key("searchLimit"); w.Int(opts.search_limit); - w.Key("searchZoom"); w.Int(opts.search_zoom); - w.Key("dryRun"); w.Bool(opts.dry_run); - w.Key("enrich"); w.Bool(opts.enrich); - - w.Key("grid"); - w.StartObject(); - w.Key("gridMode"); w.String(opts.grid_opts.gridMode.c_str()); - w.Key("cellSize"); w.Double(opts.grid_opts.cellSize); - w.Key("cellOverlap"); w.Double(opts.grid_opts.cellOverlap); - w.Key("centroidOverlap"); w.Double(opts.grid_opts.centroidOverlap); - w.Key("maxCellsLimit"); w.Int(opts.grid_opts.maxCellsLimit); - w.Key("maxElevation"); w.Double(opts.grid_opts.maxElevation); - w.Key("minDensity"); w.Double(opts.grid_opts.minDensity); - w.Key("minGhsPop"); w.Double(opts.grid_opts.minGhsPop); - w.Key("minGhsBuilt"); w.Double(opts.grid_opts.minGhsBuilt); - w.Key("ghsFilterMode"); w.String(opts.grid_opts.ghsFilterMode.c_str()); - w.Key("allowMissingGhs"); w.Bool(opts.grid_opts.allowMissingGhs); - w.Key("bypassFilters"); w.Bool(opts.grid_opts.bypassFilters); - w.Key("pathOrder"); w.String(opts.grid_opts.pathOrder.c_str()); - w.Key("groupByRegion"); w.Bool(opts.grid_opts.groupByRegion); - w.EndObject(); - - w.Key("areas"); - w.StartArray(); - for (const auto& a : opts.areas) { - w.StartObject(); - w.Key("gid"); w.String(a.gid.c_str()); - w.Key("name"); w.String(a.name.c_str()); - w.Key("level"); w.Int(a.level); - w.EndObject(); - } - w.EndArray(); - w.EndObject(); -} - -// ── job_result (with enrichment) ──────────────────────────────────────────── - -std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs, - int totalEmails, int totalPagesScraped, int freshApiCalls, - int waypointCount, int validCells, int skippedCells, - int totalResults, const std::vector& enrichResults, - double totalScannedSqKm, double totalPopulation) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - write_options(w, opts); - - w.Key("enumMs"); w.Int64(enumMs); - w.Key("searchMs"); w.Int64(searchMs); - w.Key("enrichMs"); w.Int64(enrichMs); - w.Key("totalMs"); w.Int64(totalMs); - - w.Key("gridStats"); - w.StartObject(); - w.Key("validCells"); w.Int(validCells); - w.Key("skippedCells"); w.Int(skippedCells); - w.Key("totalWaypoints"); w.Int(waypointCount); - w.EndObject(); - - w.Key("searchStats"); - w.StartObject(); - w.Key("apiCalls"); w.Int(freshApiCalls); - w.Key("filtered"); w.Int(0); // placeholder if needed - w.Key("areaCount"); w.Int(waypointCount); - w.Key("totalResults"); w.Int(totalResults); - w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm); - w.Key("totalPopulation"); w.Double(totalPopulation); - w.EndObject(); - - w.Key("totalEmails"); w.Int(totalEmails); - - w.Key("enrichResults"); - w.StartArray(); - for (const auto& id : enrichResults) { - w.String(id.c_str()); - } - w.EndArray(); - - w.Key("freshApiCalls"); w.Int(freshApiCalls); - w.Key("waypointCount"); w.Int(waypointCount); - w.Key("totalPagesScraped"); w.Int(totalPagesScraped); - - w.EndObject(); - return sb.GetString(); -} - -// ── job_result (search only) ──────────────────────────────────────────────── - -std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs, - int freshApiCalls, int waypointCount, int validCells, - int skippedCells, int totalResults, const std::vector& enrichResults, - double totalScannedSqKm, double totalPopulation) { - rapidjson::StringBuffer sb; - rapidjson::Writer w(sb); - w.StartObject(); - write_options(w, opts); - - w.Key("enumMs"); w.Int64(enumMs); - w.Key("searchMs"); w.Int64(searchMs); - w.Key("enrichMs"); w.Int64(0); - w.Key("totalMs"); w.Int64(totalMs); - - w.Key("gridStats"); - w.StartObject(); - w.Key("validCells"); w.Int(validCells); - w.Key("skippedCells"); w.Int(skippedCells); - w.Key("totalWaypoints"); w.Int(waypointCount); - w.EndObject(); - - w.Key("searchStats"); - w.StartObject(); - w.Key("apiCalls"); w.Int(freshApiCalls); - w.Key("filtered"); w.Int(0); - w.Key("areaCount"); w.Int(waypointCount); - w.Key("totalResults"); w.Int(totalResults); - w.Key("totalScannedSqKm"); w.Double(totalScannedSqKm); - w.Key("totalPopulation"); w.Double(totalPopulation); - w.EndObject(); - - w.Key("totalEmails"); w.Int(0); - - w.Key("enrichResults"); - w.StartArray(); - for (const auto& id : enrichResults) { - w.String(id.c_str()); - } - w.EndArray(); - - w.Key("freshApiCalls"); w.Int(freshApiCalls); - w.Key("waypointCount"); w.Int(waypointCount); - w.Key("totalPagesScraped"); w.Int(0); - - w.EndObject(); - return sb.GetString(); -} - -} // namespace polymech::serialize diff --git a/packages/kbot/cpp/src/gridsearch_serialize.h b/packages/kbot/cpp/src/gridsearch_serialize.h deleted file mode 100644 index 1323f6b7..00000000 --- a/packages/kbot/cpp/src/gridsearch_serialize.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include "enrichers/enrichers.h" -#include "grid/grid.h" -#include "search/search.h" - -#include -#include -#include - -namespace polymech { -struct PipelineOptions; -} - -namespace polymech::serialize { - -/// grid-ready event payload -std::string grid_ready(const std::vector& waypoints); - -/// waypoint-start event payload -std::string waypoint_start(const grid::Waypoint& wp, int index, int total); - -/// location event payload (per search result) -std::string location(const search::MapResult& r, int step); - -/// waypoint-finish event payload (waypoint done) -std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls); - -/// area-start event payload -std::string area_start(const std::string& area_gid, const std::string& area_name); - -/// area-finish event payload -std::string area_finish(const std::string& area_gid); - -/// enrich-start event payload -std::string enrich_start(int locationCount); - -/// nodePage event payload (per page error) -std::string node_page(const enrichers::PageError& pe, const std::string& placeId); - -/// node-error event payload -std::string node_error(const enrichers::EnrichedNode& node); - -/// node event payload (enriched location) -std::string node(const enrichers::EnrichedNode& node); - -/// job_result summary (with enrichment) -std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs, - int totalEmails, int totalPagesScraped, int freshApiCalls, - int waypointCount, int validCells, int skippedCells, - int totalResults, const std::vector& enrichResults, - double totalScannedSqKm, double totalPopulation); - -/// job_result summary (search only, no enrichment) -std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs, - int freshApiCalls, int waypointCount, int validCells, - int skippedCells, int totalResults, const std::vector& enrichResults, - double totalScannedSqKm, double totalPopulation); - -} // namespace polymech::serialize diff --git a/packages/kbot/cpp/src/main.cpp b/packages/kbot/cpp/src/main.cpp index 99d87998..592b6002 100644 --- a/packages/kbot/cpp/src/main.cpp +++ b/packages/kbot/cpp/src/main.cpp @@ -17,11 +17,6 @@ #include "logger/logger.h" #include "postgres/postgres.h" #include "json/json.h" -#include "gadm_reader/gadm_reader.h" -#include "grid/grid.h" -#include "search/search.h" -#include "enrichers/enrichers.h" -#include "cmd_gridsearch.h" #include "cmd_kbot.h" #ifndef PROJECT_VERSION @@ -77,17 +72,12 @@ int main(int argc, char *argv[]) { db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10); // Subcommand: worker — IPC mode (spawned by Node.js orchestrator) - bool daemon_mode = false; - std::string daemon_uid; - std::string worker_config = "config/postgres.toml"; std::string uds_path; auto *worker_cmd = app.add_subcommand( "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)"); - worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)"); - worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml"); - worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)"); - worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path"); + worker_cmd->add_option("--uds", uds_path, + "Listen on TCP port (Windows) or Unix socket path"); // Subcommand: kbot — AI workflows & task configurations auto* kbot_cmd = polymech::setup_cmd_kbot(app); @@ -109,19 +99,10 @@ int main(int argc, char *argv[]) { // ── worker mode ───────────────────────────────────────────────────────── if (worker_cmd->parsed()) { logger::info("Worker mode: listening on stdin"); - - if (daemon_mode) { - logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid)); - auto cfg = search::load_config(worker_config); - postgres::Config pcfg; - pcfg.supabase_url = cfg.supabase_url; - pcfg.supabase_key = cfg.supabase_service_key; - postgres::init(pcfg); - } if (!uds_path.empty()) { logger::info("Worker mode: UDS Server active on " + uds_path); - int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid); + int rc = polymech::run_cmd_kbot_uds(uds_path); return rc; } @@ -140,27 +121,6 @@ int main(int argc, char *argv[]) { if (req.type == "ping") { ipc::write_message({req.id, "pong", "{}"}); - } else if (req.type == "gridsearch") { - logger::info("Worker: gridsearch job received"); - - // Build callbacks that emit IPC events. - // Progress events use id "0" (unmatched → event for orchestrator). - // The final job_result uses the original req.id so the promise resolves. - std::string req_id = req.id; - polymech::GridsearchCallbacks cb; - cb.onEvent = [&req_id](const std::string& type, const std::string& json) { - if (type == "job_result") { - ipc::write_message({req_id, "job_result", json}); - } else { - ipc::write_message({"0", type, json}); - } - }; - - int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid); - if (rc != 0) { - ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"}); - } - } else if (req.type == "job") { // Stub: echo the payload back as job_result ipc::write_message({req.id, "job_result", req.payload}); diff --git a/packages/kbot/cpp/tests/CMakeLists.txt b/packages/kbot/cpp/tests/CMakeLists.txt index 32d8c54c..e7c1a848 100644 --- a/packages/kbot/cpp/tests/CMakeLists.txt +++ b/packages/kbot/cpp/tests/CMakeLists.txt @@ -53,12 +53,6 @@ add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp) target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json Threads::Threads) catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) -add_executable(test_gridsearch_ipc e2e/test_gridsearch_ipc.cpp ../src/cmd_gridsearch.cpp ../src/cmd_gridsearch-filters.cpp ../src/cmd_gridsearch-uds.cpp ../src/cmd_gridsearch-postgres.cpp ../src/gridsearch_serialize.cpp ../src/sys_metrics.cpp) -target_link_libraries(test_gridsearch_ipc PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers Threads::Threads) -target_include_directories(test_gridsearch_ipc PRIVATE ${CMAKE_SOURCE_DIR}/src ${asio_SOURCE_DIR}/asio/include ${taskflow_SOURCE_DIR} ${concurrentqueue_SOURCE_DIR}) -target_compile_definitions(test_gridsearch_ipc PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1) -catch_discover_tests(test_gridsearch_ipc WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}) - add_executable(test_ipc unit/test_ipc.cpp) target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc Threads::Threads) catch_discover_tests(test_ipc) diff --git a/packages/kbot/cpp/tests/e2e/test_gridsearch_ipc.cpp b/packages/kbot/cpp/tests/e2e/test_gridsearch_ipc.cpp deleted file mode 100644 index 6cae5f32..00000000 --- a/packages/kbot/cpp/tests/e2e/test_gridsearch_ipc.cpp +++ /dev/null @@ -1,144 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "../../src/cmd_gridsearch.h" -#include "logger/logger.h" - -// ── Helpers ────────────────────────────────────────────────────────────────── - -static std::string read_file_contents(const std::string &path) { - std::ifstream f(path); - if (!f.is_open()) - return ""; - std::stringstream ss; - ss << f.rdbuf(); - return ss.str(); -} - -/// Read a JSON config file and inject test-safe overrides: -/// - configPath = "config/postgres.toml" -/// - enrich = false (no live HTTP / thread-pool in tests) -/// - persistencePostgres = false -static std::string load_test_payload(const std::string &config_path) { - std::string raw = read_file_contents(config_path); - if (raw.empty()) - return ""; - - rapidjson::Document doc; - doc.Parse(raw.c_str()); - if (doc.HasParseError()) - return ""; - - auto &alloc = doc.GetAllocator(); - - // Remove-then-add ensures no double-add assertion from rapidjson - auto inject_bool = [&](const char *key, bool val) { - if (doc.HasMember(key)) - doc.RemoveMember(key); - doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val), alloc); - }; - auto inject_str = [&](const char *key, const char *val) { - if (doc.HasMember(key)) - doc.RemoveMember(key); - doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val, alloc), - alloc); - }; - - inject_str("configPath", "config/postgres.toml"); - inject_str("cacheDir", "../../packages/gadm/cache/gadm"); // server/cache/gadm - inject_bool("enrich", false); // no live enrichment in tests - inject_bool("persistencePostgres", false); - - rapidjson::StringBuffer buf; - rapidjson::Writer writer(buf); - doc.Accept(writer); - return buf.GetString(); -} - -// ── Tests -// ───────────────────────────────────────────────────────────────────── - -TEST_CASE("E2E: Gridsearch Country Boundary Filter (Lamu/KEN)", - "[e2e][gridsearch][boundary]") { - REQUIRE_NOTHROW(logger::init("test-gridsearch")); - - // Lamu, Kenya — SerpAPI often returns US results for obscure African regions. - // boundary_KEN_0.json should filter them out. - std::string payload = load_test_payload("config/gridsearch-lamu.json"); - REQUIRE(!payload.empty()); - - std::vector location_events; - int error_count = 0; - - polymech::GridsearchCallbacks cb; - cb.onEvent = [&](const std::string &type, const std::string &json) { - if (type == "location") { - location_events.push_back(json); - } else if (type == "error") { - error_count++; - std::cout << "[ERROR EVENT]: " << json << "\n"; - } - }; - - int result = - polymech::run_cmd_gridsearch_ipc(payload, "test-lamu-job", cb, false, ""); - - REQUIRE(result == 0); - REQUIRE(error_count == 0); - - // All returned locations must be within Kenya (no USA coords). - // Verify: no location has lng < -30 (Americas) or lng > 60 (not Africa/Asia) - // and lat outside [-5, 5] for Lamu county bounds. - int outside_kenya = 0; - for (const auto &loc_json : location_events) { - rapidjson::Document loc; - loc.Parse(loc_json.c_str()); - if (loc.HasParseError()) - continue; - if (loc.HasMember("gps") && loc["gps"].IsObject()) { - double lng = - loc["gps"].HasMember("lng") ? loc["gps"]["lng"].GetDouble() : 0; - // Kenya longitude range: ~34..42; USA is roughly -130..-60 - if (lng < 20.0 || lng > 55.0) - outside_kenya++; - } - } - - CHECK(outside_kenya == 0); - std::cout << "Lamu boundary test: " << location_events.size() - << " locations kept, " << outside_kenya << " outside Kenya.\n"; -} - -TEST_CASE("E2E: Gridsearch Type Filter (Sample/ABW)", - "[e2e][gridsearch][filter]") { - std::string payload = load_test_payload("config/gridsearch-sample.json"); - REQUIRE(!payload.empty()); - - std::vector location_events; - int error_count = 0; - - polymech::GridsearchCallbacks cb; - cb.onEvent = [&](const std::string &type, const std::string &json) { - if (type == "location") - location_events.push_back(json); - else if (type == "error") - error_count++; - }; - - int result = polymech::run_cmd_gridsearch_ipc(payload, "test-sample-job", cb, - false, ""); - - REQUIRE(result == 0); - REQUIRE(error_count == 0); - - std::cout << "Sample (ABW) type filter test: " << location_events.size() - << " locations.\n"; -}