kbot meets cpp - the beginnings :)

This commit is contained in:
lovebird 2026-03-29 18:38:21 +02:00
parent 708a756a07
commit 7b07f1a55a
88 changed files with 10690 additions and 0 deletions

31
packages/kbot/cpp/.gitignore vendored Normal file
View File

@ -0,0 +1,31 @@
# Build output
/build/
# Compiled objects
*.o
*.obj
*.exe
*.out
*.app
# CMake generated
CMakeCache.txt
CMakeFiles/
cmake_install.cmake
Makefile
# IDE / Editor
.vscode/
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Logs
*.log
cache/
config/postgres.toml
dist

View File

@ -0,0 +1,120 @@
cmake_minimum_required(VERSION 3.20)

project(polymech-cli
    VERSION 0.1.0
    DESCRIPTION "Polymech C++ CLI"
    LANGUAGES CXX C
)

# Place all executables in <project>/dist for every configuration.
# PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps the path correct if this
# project is ever built as a subproject of a larger CMake tree.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/dist")
foreach(config DEBUG RELEASE RELWITHDEBINFO MINSIZEREL)
    # Multi-config generators (VS, Xcode) would otherwise append a
    # per-config subdirectory; pin each configuration to dist/ explicitly.
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_${config} "${PROJECT_SOURCE_DIR}/dist")
endforeach()

# C++ standard: strict ISO C++17, no vendor extensions (-std=c++17, not gnu++17).
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# ── Third-party dependencies (pinned tags, fetched at configure time) ────────
include(FetchContent)

FetchContent_Declare(
    cli11
    GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
    GIT_TAG v2.4.2
    GIT_SHALLOW TRUE
)
FetchContent_Declare(
    tomlplusplus
    GIT_REPOSITORY https://github.com/marzer/tomlplusplus.git
    GIT_TAG v3.4.0
    GIT_SHALLOW TRUE
)
FetchContent_Declare(
    Catch2
    GIT_REPOSITORY https://github.com/catchorg/Catch2.git
    GIT_TAG v3.7.1
    GIT_SHALLOW TRUE
)
FetchContent_Declare(
    asio
    GIT_REPOSITORY https://github.com/chriskohlhoff/asio.git
    GIT_TAG asio-1-28-0
    GIT_SHALLOW TRUE
)
FetchContent_Declare(
    concurrentqueue
    GIT_REPOSITORY https://github.com/cameron314/concurrentqueue.git
    GIT_TAG v1.0.4
    GIT_SHALLOW TRUE
)
FetchContent_Declare(
    taskflow
    GIT_REPOSITORY https://github.com/taskflow/taskflow.git
    GIT_TAG v3.6.0
    GIT_SHALLOW TRUE
)

# Taskflow builds its own tests/examples by default; disable them before
# MakeAvailable so they never enter this build tree.
set(TF_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(TF_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(cli11 tomlplusplus Catch2 asio concurrentqueue taskflow)

# ── In-tree packages ─────────────────────────────────────────────────────────
add_subdirectory(packages/logger)
add_subdirectory(packages/html)
add_subdirectory(packages/postgres)
add_subdirectory(packages/http)
add_subdirectory(packages/json)
add_subdirectory(packages/polymech)
add_subdirectory(packages/ipc)
add_subdirectory(packages/geo)
add_subdirectory(packages/gadm_reader)
add_subdirectory(packages/grid)
add_subdirectory(packages/search)
add_subdirectory(packages/enrichers)

# ── CLI executable ───────────────────────────────────────────────────────────
add_executable(${PROJECT_NAME}
    src/main.cpp
    src/cmd_gridsearch.cpp
    src/cmd_gridsearch-filters.cpp
    src/cmd_gridsearch-uds.cpp
    src/cmd_gridsearch-postgres.cpp
    src/gridsearch_serialize.cpp
    src/sys_metrics.cpp
)

target_link_libraries(${PROJECT_NAME} PRIVATE
    CLI11::CLI11
    tomlplusplus::tomlplusplus
    logger
    html
    postgres
    http
    json
    polymech
    ipc
    geo
    gadm_reader
    grid
    search
    enrichers
)

# asio / taskflow / concurrentqueue are header-only: add their include roots.
target_include_directories(${PROJECT_NAME} PRIVATE
    ${asio_SOURCE_DIR}/asio/include
    ${taskflow_SOURCE_DIR}
    ${concurrentqueue_SOURCE_DIR}
)

# Define standalone ASIO (since it's not boost), with deprecated APIs disabled.
target_compile_definitions(${PROJECT_NAME} PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)

# Compiler warnings (note: MSVC is also true for clang-cl).
if(MSVC)
    target_compile_options(${PROJECT_NAME} PRIVATE /W4 /permissive-)
else()
    target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic)
endif()

# Install
install(TARGETS ${PROJECT_NAME}
    RUNTIME DESTINATION bin
)

# Tests
enable_testing()
add_subdirectory(tests)

View File

@ -0,0 +1,36 @@
{
"version": 6,
"cmakeMinimumRequired": {
"major": 3,
"minor": 20,
"patch": 0
},
"configurePresets": [
{
"name": "dev",
"displayName": "Dev (Debug)",
"binaryDir": "${sourceDir}/build/dev",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Debug"
}
},
{
"name": "release",
"displayName": "Release",
"binaryDir": "${sourceDir}/build/release",
"cacheVariables": {
"CMAKE_BUILD_TYPE": "Release"
}
}
],
"buildPresets": [
{
"name": "dev",
"configurePreset": "dev"
},
{
"name": "release",
"configurePreset": "release"
}
]
}

View File

@ -0,0 +1,9 @@
Copyright (c) <year> <owner> All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

111
packages/kbot/cpp/README.md Normal file
View File

@ -0,0 +1,111 @@
# polymech-cli
Cross-platform C++ CLI built with CMake.
## Prerequisites
| Tool | Version |
|------|---------|
| CMake | ≥ 3.20 |
| C++ compiler | C++17 (MSVC, GCC, or Clang) |
## Build
```bash
# Debug
cmake --preset dev
cmake --build --preset dev
# Release
cmake --preset release
cmake --build --preset release
```
## Usage
```bash
polymech-cli --help
polymech-cli --version
```
## Worker Mode & Gridsearch
The `worker` subcommand is designed to be spawned by the Node.js frontend orchestrator (`GridSearchUdsManager`) for background gridsearch execution. It accepts length-prefixed JSON frames over a Unix Domain Socket (UDS) or a local TCP port on Windows.
```bash
polymech-cli worker --uds <path_or_port> --daemon --user-uid <id> --config <path>
```
### IPC Resiliency and Logging
The C++ worker pipeline incorporates extensive feedback and retry instrumentation:
1. **Watchdog Heartbeats (`ping` / `pong`)**
- The Node orchestrator sweeps the active worker pool every 15 seconds. It explicitly logs when a ping is sent and when a `pong` (or another active event such as `log`, `job_progress`, or `ack`) is received.
- If a C++ worker stops responding to IPC events for 60 seconds (hanging thread or deadlock), it is automatically killed (`SIGKILL`) and evicted from the pool.
2. **Socket Traceability**
- The UDS socket actively traps unexpected closures and TCP faults (like `ECONNRESET`). If the pipe breaks mid-job, explicit socket `error` event handlers in the Node orchestrator will instantly fail the job and log the stack trace, preventing indefinite client-side UI hangs, especially during heavy re-runs.
3. **Persistent Crash Logging (`logs/uds.json`)**
- The C++ worker initializes a multi-sink logger (`logger::init_uds`). It pumps standard logs to `stderr` while simultaneously persisting an append-only file trace to `server/logs/uds.json`.
- The file sink guarantees synchronization to disk aggressively (every 1 second, and immediately on `info` severity). If the worker process vanishes or crashes, `uds.json` acts as the black-box flight recorder for post-mortem debugging.
4. **Job Specification Transparency**
- Gridsearch payloads (including `retry` and `expand` endpoints) aggressively log their input shape (`guided` bounds flag, `enrichers` subset) within the Node console before passing work to the C++ orchestrator. This allows for clear traceability from UI action -> Node submission -> C++ execution.
5. **Thread Safety & Frame Synchronization (Mutexes)**
- The UDS socket handles dual-direction asynchronous streams. The background execution graph (powered by Taskflow) emits high-frequency events (`location`, `waypoint-start`) via `GridsearchCallbacks`. Concurrently, the orchestrator Node.js process sends periodic commands (`ping`, `cancel`) that the C++ socket loop must instantly acknowledge.
- To prevent overlapping payload frames (which corrupt the critical 4-byte `len` header), a global `g_uds_socket_mutex` is strictly enforced. It guarantees that direct UI acknowledgments (`pong`, `cancel_ack`) and background logging (`uds_sink` / Taskflow events) never interleave their `asio::write` bursts onto the pipe.
### IPC Framing & Payload Protocol
Communication runs strictly via length-prefixed JSON frames. Because TCP is a byte stream with no inherent message boundaries, the explicit length prefix preserves frame boundaries even when heavy event streams are fragmented or coalesced in transit.
**Binary Frame Format:**
`[4-byte Unsigned Little-Endian Integer (Payload Length)] [UTF-8 JSON Object]`
#### Control Commands (Node → C++)
If the JSON object contains an `"action"` field, it is handled synchronously on the socket thread:
- **Health Check:** `{"action": "ping"}`
*Replies:* `{"type": "pong", "data": {"memoryMb": 120, "cpuTimeMs": 4500}}`
- **Cancellation:** `{"action": "cancel", "jobId": "job_123"}`
→ Worker sets the atomic cancellation token to safely halt the target `taskflow`, instantly replying `{"type": "cancel_ack", "data": "job_123"}`
- **Daemon Teardown:** `{"action": "stop"}`
→ Flushes all streams and exits cleanly.
#### Gridsearch Payload (Node → C++)
If no `"action"` field exists, the message is treated as a gridsearch spec and pushed into a lock-free `ConcurrentQueue` for the background execution graph:
```json
{
"jobId": "run_9a8bc7",
"configPath": "config/postgres.toml",
"cacheDir": "../packages/gadm/cache",
"enrich": true,
"guided": {
"areas": [{ "gid": "ESP.6_1", "level": 1 }],
"settings": { "gridMode": "hex", "cellSize": 5.0 }
},
"search": {
"types": ["restaurant"],
"limitPerArea": 500
}
}
```
#### Event Streaming (C++ → Node)
As the gridsearch pipeline executes, the `GridsearchCallbacks` emit standard length-prefixed events directly back to the active UDS socket:
- **`ack`**: Acknowledges job was successfully dequeued (`{"type": "ack", "data": {"jobId": "..."}}`).
- **`log`**: Passthrough of all internal C++ `spdlog` messages using the custom `uds_sink` adapter.
- **`location` / `node`**: Raw geolocation geometries and enriched contact details streamed incrementally.
- **`job_progress`**: Phase updates (Grid Generation → Search → Enrichment).
- **`job_result`**: The final statistical and timer summary (EnumMs, SearchMs, Total Emails, etc).
- **`error`**: Unrecoverable boundary parsing or database initialization faults.
## License
BSD-3-Clause
## Requirements
- [https://github.com/taskflow/taskflow](https://github.com/taskflow/taskflow)
- [https://github.com/cameron314/concurrentqueue](https://github.com/cameron314/concurrentqueue)
- [https://github.com/chriskohlhoff/asio](https://github.com/chriskohlhoff/asio)

View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Configure and build polymech-cli (Release) in an out-of-source tree under
# /tmp, preferring the snap-provided cmake (Ubuntu 20.04 apt only has 3.16,
# while the project requires >= 3.20).
#
# Fail fast: without -e a failed configure step would still run the build.
set -euo pipefail

#rm -rf /tmp/polymech-build   # uncomment to force a clean re-configure
mkdir -p /tmp/polymech-build

# Put the snap cmake ahead of any older system cmake.
export PATH="/snap/bin:$PATH"

cmake -S ./ -B /tmp/polymech-build -DCMAKE_BUILD_TYPE=Release
cmake --build /tmp/polymech-build

View File

@ -0,0 +1,12 @@
[project]
name = "polymech"
version = "0.1.0"
description = "Polymech C++ CLI"
[database]
host = "localhost"
port = 5432
name = "polymech"
[logging]
level = "debug"

View File

@ -0,0 +1,43 @@
{
"guided": {
"areas": [
{
"gid": "ESP.6.1.10.14_1",
"name": "Sabadell",
"level": 4,
"raw": {
"level": 3,
"gadmName": "Sabadell",
"gid": "ESP.6.1.10.14_1"
}
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": false,
"cellSize": 5,
"cellOverlap": 0,
"centroidOverlap": 0,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 0,
"minGhsPop": 0,
"minGhsBuilt": 0,
"allowMissingGhs": false,
"bypassFilters": false
}
},
"search": {
"types": [
"university"
],
"filterCountry": "",
"googleDomain": "google.com",
"limitPerArea": 20,
"zoom": 15,
"language": "en"
},
"filterTypes": []
}

View File

@ -0,0 +1,49 @@
{
"guided": {
"areas": [
{
"gid": "KEN.21_1",
"name": "Lamu",
"level": 1,
"raw": {
"gid": "KEN.21_1",
"gadmName": "Lamu",
"level": 1
}
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": true,
"cellSize": 5,
"cellOverlap": 0,
"centroidOverlap": 50,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 10,
"minGhsPop": 26,
"minGhsBuilt": 154,
"enableElevation": false,
"enableDensity": false,
"enableGhsPop": false,
"enableGhsBuilt": false,
"allowMissingGhs": false,
"bypassFilters": true
}
},
"search": {
"types": [
"plastic"
],
"filterCountry": "",
"googleDomain": "google.com",
"limitPerArea": 20,
"zoom": 15,
"language": "en"
},
"filterTypes": [
"Recycling center"
]
}

View File

@ -0,0 +1,40 @@
{
"guided": {
"areas": [
{
"gid": "ABW",
"name": "Aruba",
"level": 0
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": false,
"cellSize": 5,
"cellOverlap": 0,
"centroidOverlap": 0,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 0,
"minGhsPop": 0,
"minGhsBuilt": 0,
"allowMissingGhs": false,
"bypassFilters": false
}
},
"search": {
"types": [
"recycling"
],
"filterCountry": "",
"googleDomain": "google.com",
"limitPerArea": 20,
"zoom": 15,
"language": "en"
},
"filterTypes": [
"Recycling center"
]
}

View File

@ -0,0 +1,45 @@
{
"guided": {
"areas": [
{
"gid": "ESP.6.1_1",
"name": "Barcelona",
"level": 3,
"raw": {
"level": 2,
"gadmName": "Barcelona",
"gid": "ESP.6.1_1"
}
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": true,
"cellSize": 5,
"cellOverlap": 0,
"centroidOverlap": 0,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 10,
"minGhsPop": 26,
"minGhsBuilt": 154,
"enableElevation": false,
"enableDensity": false,
"enableGhsPop": false,
"enableGhsBuilt": false,
"allowMissingGhs": false,
"bypassFilters": true
}
},
"search": {
"types": [
"marketing"
],
"filterCountry": "Spain",
"googleDomain": "google.es",
"limitPerArea": 10,
"useCache": true
}
}

View File

@ -0,0 +1,85 @@
{
"guided": {
"areas": [
{
"gid": "ESP.6.1.10.2_1",
"name": "Barberà del Vallès",
"level": 4,
"raw": {
"level": 4,
"gadmName": "Barberà del Vallès",
"gid": "ESP.6.1.10.2_1"
}
},
{
"gid": "ESP.6.1.10.14_1",
"name": "Sabadell",
"level": 4,
"raw": {
"level": 4,
"gadmName": "Sabadell",
"gid": "ESP.6.1.10.14_1"
}
},
{
"gid": "ESP.6.1.10.11_1",
"name": "Polinyà",
"level": 4,
"raw": {
"level": 4,
"gadmName": "Polinyà",
"gid": "ESP.6.1.10.11_1"
}
},
{
"gid": "ESP.6.1.10.4_1",
"name": "Castellar del Vallès",
"level": 4,
"raw": {
"level": 4,
"gadmName": "Castellar del Vallès",
"gid": "ESP.6.1.10.4_1"
}
},
{
"gid": "ESP.6.1.10.19_1",
"name": "Sentmenat",
"level": 4,
"raw": {
"level": 4,
"gadmName": "Sentmenat",
"gid": "ESP.6.1.10.19_1"
}
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": true,
"cellSize": 10,
"cellOverlap": 0,
"centroidOverlap": 0,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 10,
"minGhsPop": 26,
"minGhsBuilt": 154,
"enableElevation": false,
"enableDensity": false,
"enableGhsPop": false,
"enableGhsBuilt": false,
"allowMissingGhs": false,
"bypassFilters": true
}
},
"search": {
"types": [
"mecanizado cnc"
],
"filterCountry": "Spain",
"googleDomain": "google.es",
"limitPerArea": 10,
"useCache": true
}
}

View File

@ -0,0 +1,37 @@
{
"guided": {
"areas": [
{
"gid": "ABW",
"name": "Aruba",
"level": 0
}
],
"settings": {
"gridMode": "centers",
"pathOrder": "snake",
"groupByRegion": false,
"cellSize": 5,
"cellOverlap": 0,
"centroidOverlap": 0,
"ghsFilterMode": "OR",
"maxCellsLimit": 50000,
"maxElevation": 1000,
"minDensity": 0,
"minGhsPop": 0,
"minGhsBuilt": 0,
"allowMissingGhs": false,
"bypassFilters": false
}
},
"search": {
"types": [
"recycling"
],
"filterCountry": "",
"googleDomain": "google.com",
"limitPerArea": 1,
"zoom": 15,
"language": "en"
}
}

View File

@ -0,0 +1,60 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# install-lnx.sh Install build dependencies for polymech-cli on Linux
#
# Tested on: Ubuntu 20.04+ / Debian 11+
# Usage: sudo bash install-lnx.sh
# ─────────────────────────────────────────────────────────────────────────────
# Strict mode: abort on any command failure, unset variable, or pipe failure.
set -euo pipefail
echo "── polymech-cli Linux dependency installer ──"
# ── 1. System packages (apt) ─────────────────────────────────────────────────
echo ""
echo "[1/3] Installing system packages via apt …"
apt-get update -qq
apt-get install -y --no-install-recommends \
  build-essential \
  gcc \
  g++ \
  git \
  libssl-dev \
  pkg-config \
  snapd
# ── 2. CMake ≥ 3.20 via snap ────────────────────────────────────────────────
# The project requires cmake_minimum_required(VERSION 3.20).
# Ubuntu 20.04 ships cmake 3.16, so we use the snap package instead.
echo ""
echo "[2/3] Installing CMake via snap (≥ 3.20 required) …"
# `command -v` accepts an absolute path, so this probes the snap binary
# directly instead of whatever cmake happens to be first on $PATH.
if command -v /snap/bin/cmake &>/dev/null; then
  echo " cmake snap already installed: $(/snap/bin/cmake --version | head -1)"
else
  snap install cmake --classic
  echo " Installed: $(/snap/bin/cmake --version | head -1)"
fi
# ── 3. Node.js (for npm run build:linux) ──────────────────────────────────────
echo ""
echo "[3/3] Checking for Node.js / npm …"
# Node is only checked, never installed automatically: the preferred install
# method (nvm vs. nodesource) is left to the user.
if command -v node &>/dev/null; then
  echo " node $(node --version) already installed"
else
  echo " Node.js not found. Install via nvm or nodesource, e.g.:"
  echo " curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -"
  echo " sudo apt-get install -y nodejs"
fi
# ── Summary ──────────────────────────────────────────────────────────────────
echo ""
echo "── Done! ──"
echo ""
# NOTE(review): curl, lexbor and rapidjson are mentioned below but are not
# declared in the top-level CMakeLists shown — confirm a sub-package fetches them.
echo "All C++ dependencies (CLI11, tomlplusplus, Catch2, asio, concurrentqueue,"
echo "taskflow, curl, lexbor, rapidjson) are fetched automatically by CMake"
echo "FetchContent at build time — no manual installation needed."
echo ""
echo "To build:"
echo " cd $(dirname "$0")"
echo " npm run build:linux"
echo ""
echo "The binary will be placed in: dist/polymech-cli"

View File

@ -0,0 +1,159 @@
/**
* orchestrator/spawn.mjs
*
* Spawn a C++ worker as a child process, send/receive length-prefixed
* JSON messages over stdin/stdout.
*
* Usage:
* import { spawnWorker } from './spawn.mjs';
* const w = await spawnWorker('./dist/polymech-cli.exe');
* console.log(res); // { id: '...', type: 'pong', payload: {} }
* await w.shutdown();
*/
import { spawn } from 'node:child_process';
import { randomUUID } from 'node:crypto';
// ── frame helpers ────────────────────────────────────────────────────────────
/**
 * Serialize `msg` as JSON and push it onto `stream` as a single
 * length-prefixed frame: 4-byte unsigned little-endian byte count,
 * followed by the UTF-8 JSON body.
 */
function writeFrame(stream, msg) {
  const payload = Buffer.from(JSON.stringify(msg), 'utf8');
  const header = Buffer.alloc(4);
  header.writeUInt32LE(payload.length, 0);
  stream.write(Buffer.concat([header, payload]));
}
/**
 * Build an incremental parser for length-prefixed JSON frames.
 * Feed the returned function arbitrary byte chunks; `onMessage(parsed)`
 * fires once per complete frame. Partial frames are buffered until
 * more data arrives; malformed JSON is logged and skipped.
 */
function createFrameReader(onMessage) {
  let pending = Buffer.alloc(0);
  return (chunk) => {
    pending = Buffer.concat([pending, chunk]);
    for (;;) {
      // A frame is at least the 4-byte header plus `bodyLen` body bytes.
      if (pending.length < 4) return;
      const bodyLen = pending.readUInt32LE(0);
      if (pending.length < 4 + bodyLen) return; // need more data
      const body = pending.subarray(4, 4 + bodyLen);
      pending = pending.subarray(4 + bodyLen);
      try {
        onMessage(JSON.parse(body.toString('utf8')));
      } catch (e) {
        console.error('[orchestrator] failed to parse frame:', e.message);
      }
    }
  };
}
// ── spawnWorker ──────────────────────────────────────────────────────────────
/**
* Spawn the C++ binary in `worker` mode.
* Returns: { send, request, shutdown, kill, process, ready }
*
* `ready` is a Promise that resolves when the worker sends `{ type: 'ready' }`.
*/
export function spawnWorker(exePath, args = ['worker']) {
const proc = spawn(exePath, args, {
stdio: ['pipe', 'pipe', 'pipe'],
});
// Pending request map: id → { resolve, reject, timer }
const pending = new Map();
// Event handler for unmatched messages (progress events, etc.)
let eventHandler = null;
let readyResolve;
const ready = new Promise((resolve) => { readyResolve = resolve; });
// stderr → console (worker logs via spdlog go to stderr)
proc.stderr.on('data', (chunk) => {
const text = chunk.toString().trim();
if (text) console.error(`[worker:stderr] ${text}`);
});
// stdout → frame parser → route by id / type
const feedData = createFrameReader((msg) => {
// Handle the initial "ready" signal
if (msg.type === 'ready') {
readyResolve(msg);
return;
}
// Route response to pending request
if (msg.id && pending.has(msg.id)) {
const { resolve, timer } = pending.get(msg.id);
clearTimeout(timer);
pending.delete(msg.id);
resolve(msg);
return;
}
// Unmatched message (progress event, broadcast, etc.)
if (eventHandler) {
eventHandler(msg);
} else {
console.log('[orchestrator] unmatched message:', msg);
}
});
proc.stdout.on('data', feedData);
// ── public API ──────────────────────────────────────────────────────────
/** Fire-and-forget send. */
function send(msg) {
if (!msg.id) msg.id = randomUUID();
writeFrame(proc.stdin, msg);
}
/** Send a message and wait for the response with matching `id`. */
function request(msg, timeoutMs = 5000) {
return new Promise((resolve, reject) => {
const id = msg.id || randomUUID();
msg.id = id;
const timer = setTimeout(() => {
pending.delete(id);
reject(new Error(`IPC request timed out after ${timeoutMs}ms (id=${id}, type=${msg.type})`));
}, timeoutMs);
pending.set(id, { resolve, reject, timer });
writeFrame(proc.stdin, msg);
});
}
/** Graceful shutdown: send shutdown message & wait for process exit. */
async function shutdown(timeoutMs = 3000) {
const res = await request({ type: 'shutdown' }, timeoutMs);
// Wait for process to exit
await new Promise((resolve) => {
const timer = setTimeout(() => {
proc.kill();
resolve();
}, timeoutMs);
proc.on('exit', () => { clearTimeout(timer); resolve(); });
});
return res;
}
return {
send,
request,
shutdown,
kill: () => proc.kill(),
process: proc,
ready,
onEvent: (handler) => { eventHandler = handler; },
};
}

View File

@ -0,0 +1,204 @@
/**
* orchestrator/test-gridsearch-ipc.mjs
*
* E2E test: spawn the C++ worker, send a gridsearch request
* matching `npm run gridsearch:enrich` defaults, collect IPC events,
* and verify the full event sequence.
*
* Run: node orchestrator/test-gridsearch-ipc.mjs
* Needs: npm run build-debug (or npm run build)
*/
import { spawnWorker } from './spawn.mjs';
import { resolve, dirname } from 'node:path';
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// ESM modules have no __dirname; reconstruct it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
// The built binary is expected one level up, in dist/ (the CMake runtime
// output directory).
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Fail fast with a clear hint when the binary has not been built yet.
if (!fs.existsSync(EXE)) {
  console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
  process.exit(1);
}
console.log(`Binary: ${EXE}\n`);
// Load the sample settings (same as gridsearch:enrich)
const sampleConfig = JSON.parse(
  readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);
// Running totals for the final summary printed at the end of the test.
let passed = 0;
let failed = 0;

/** Record one labelled check: print the label and bump the pass/fail counters. */
function assert(condition, label) {
  if (!condition) {
    console.error(`${label}`);
    failed++;
    return;
  }
  console.log(`${label}`);
  passed++;
}
// ── Event collector ─────────────────────────────────────────────────────────
const EXPECTED_EVENTS = [
'grid-ready',
'waypoint-start',
'area',
'location',
'enrich-start',
'node',
'nodePage',
// 'node-error' — may or may not occur, depends on network
];
/**
 * Collect IPC events bucketed by `type` for later assertions.
 * Known event types are pre-seeded with empty arrays; unknown types
 * get an array on first sight. Selected event types also print a
 * one-line live progress indicator to stdout.
 */
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = {};
  for (const type of KNOWN_TYPES) events[type] = [];
  const handler = (msg) => {
    const type = msg.type;
    (events[type] ??= []).push(msg);
    // Live progress indicator
    const data = msg.payload ?? {};
    switch (type) {
      case 'waypoint-start':
        process.stdout.write(`\r 🔍 Searching waypoint ${(data.index ?? 0) + 1}/${data.total ?? '?'}...`);
        break;
      case 'node':
        process.stdout.write(`\r 📧 Enriched: ${data.title?.substring(0, 40) ?? ''} `);
        break;
      case 'node-error':
        process.stdout.write(`\r ⚠️ Error: ${data.node?.title?.substring(0, 40) ?? ''} `);
        break;
    }
  };
  return { events, handler };
}
// ── Main test ───────────────────────────────────────────────────────────────
/**
 * End-to-end driver: spawn the worker in daemon mode, submit one gridsearch
 * request (Aruba / recycling, enrichment on), verify the final job_result
 * summary and the streamed event sequence, then shut the worker down
 * gracefully and exit the process with a 0/1 pass status.
 *
 * NOTE(review): integration test — appears to require network access and a
 * working backend configuration; confirm before wiring into CI.
 */
async function run() {
  console.log('🧪 Gridsearch IPC E2E Test\n');
  // ── 1. Spawn worker ───────────────────────────────────────────────────
  console.log('1. Spawn worker in daemon mode');
  const worker = spawnWorker(EXE, ['worker', '--daemon', '--user-uid', '3bb4cfbf-318b-44d3-a9d3-35680e738421']);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready signal');
  // ── 2. Register event collector ───────────────────────────────────────
  const collector = createCollector();
  worker.onEvent(collector.handler);
  // ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
  console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
  const t0 = Date.now();
  // Very long timeout — enrichment can take minutes
  const result = await worker.request(
    {
      type: 'gridsearch',
      payload: {
        ...sampleConfig,
        enrich: true,
      },
    },
    5 * 60 * 1000 // 5 min timeout
  );
  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
  console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);
  // ── 4. Verify final result ────────────────────────────────────────────
  console.log('3. Verify job_result');
  assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);
  const summary = result.payload ?? null;
  assert(summary !== null, 'job_result payload is present');
  if (summary) {
    assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
    assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
    assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
    assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
    assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
    assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
    assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
    assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
    assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
  }
  // ── 5. Verify event sequence ──────────────────────────────────────────
  console.log('4. Verify event stream');
  const e = collector.events;
  assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
  assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
  assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
  // Each waypoint search is expected to emit exactly one paired area event.
  assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
  assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);
  // node-error is counted alongside node: enrichment failures still consume a node.
  const totalNodes = e['node'].length + e['node-error'].length;
  assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);
  // Validate grid-ready payload
  if (e['grid-ready'].length > 0) {
    const gr = e['grid-ready'][0].payload ?? {};
    assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
    assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
  }
  // Validate location events have required fields
  if (e['location'].length > 0) {
    const loc = e['location'][0].payload ?? {};
    assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
    assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
    assert(typeof loc.areaName === 'string', 'location event has areaName');
  }
  assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);
  // Validate node payloads
  if (e['node'].length > 0) {
    const nd = e['node'][0].payload ?? {};
    assert(typeof nd.placeId === 'string', 'node event has placeId');
    assert(typeof nd.title === 'string', 'node event has title');
    assert(Array.isArray(nd.emails), 'node event has emails array');
    assert(typeof nd.status === 'string', 'node event has status');
  }
  // ── 6. Print event summary ────────────────────────────────────────────
  console.log('\n5. Event summary');
  for (const [type, arr] of Object.entries(e)) {
    if (arr.length > 0) console.log(`  ${type}: ${arr.length}`);
  }
  // ── 7. Shutdown ───────────────────────────────────────────────────────
  console.log('\n6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');
  // Brief grace period so the exitCode is populated before the check below.
  await new Promise(r => setTimeout(r, 500));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);
  // ── Summary ───────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);
  process.exit(failed > 0 ? 1 : 0);
}
run().catch((err) => {
console.error('Test runner error:', err);
process.exit(1);
});

View File

@ -0,0 +1,218 @@
/**
* orchestrator/test-gridsearch-ipc-uds-meta.mjs
*
* E2E test for Unix Domain Sockets / Windows Named Pipes (Meta Enrichment)!
* Spawns the worker in `--uds` mode and tests direct high-throughput
* lock-free JSON binary framing over a net.Socket.
*/
import { spawn } from 'node:child_process';
import { resolve, dirname, join } from 'node:path';
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import net from 'node:net';
import { tmpdir } from 'node:os';
// ESM modules have no __dirname; reconstruct it from import.meta.url.
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// NOTE(review): toggle for a cancellation-path variant — unused in the
// visible portion of this file; confirm before removing.
const TEST_CANCEL = false;
// Fail fast when the binary has not been built.
if (!existsSync(EXE)) {
  console.error(`❌ Binary not found at ${EXE}`);
  process.exit(1);
}
const PIPE_NAME = 'polymech-test-uds-meta';
// Windows has no Unix Domain Sockets here: fall back to a local TCP port;
// on Unix, use a socket file in the system temp dir.
const CPP_UDS_ARG = IS_WIN ? '4001' : join(tmpdir(), `${PIPE_NAME}.sock`);
// Remove a stale socket file left over from a previous run, or bind() fails.
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
  unlinkSync(CPP_UDS_ARG);
}
console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
// ── Event collector ─────────────────────────────────────────────────────────
/**
 * Event collector for the UDS test: buckets incoming messages by `type`,
 * prints a live progress line for selected types, and invokes
 * `this.onComplete(msg)` when the terminal `job_result` event arrives.
 * `handler` must be invoked as a method (collector.handler(msg)) so that
 * `this` resolves to the collector object.
 */
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage', 'job_result'];
  const events = {};
  KNOWN_TYPES.forEach((type) => { events[type] = []; });
  return {
    events,
    onComplete: null,
    handler(msg) {
      const type = msg.type;
      (events[type] ??= []).push(msg);
      const data = msg.data ?? {};
      if (type === 'waypoint-start') {
        process.stdout.write(`\r 🔍 Searching waypoint ${(data.index ?? 0) + 1}/${data.total ?? '?'}...`);
      } else if (type === 'node') {
        process.stdout.write(`\r 📧 Enriched: ${data.title?.substring(0, 40) ?? ''} `);
      } else if (type === 'node-error') {
        process.stdout.write(`\r ⚠️ Error: ${data.node?.title?.substring(0, 40) ?? ''} `);
      } else if (type === 'job_result') {
        console.log(`\n 🏁 Pipeline complete!`);
        if (this.onComplete) this.onComplete(msg);
      }
    },
  };
}
// Counters feeding the end-of-run summary.
let passed = 0;
let failed = 0;

/** Labelled check: print the label and increment the shared pass/fail counters. */
function assert(condition, label) {
  if (!condition) {
    console.error(`${label}`);
    failed++;
    return;
  }
  console.log(`${label}`);
  passed++;
}
/**
 * E2E driver: spawn the C++ worker as a UDS/TCP daemon, connect a net.Socket
 * with retries, stream a gridsearch job using length-prefixed JSON frames,
 * wait for job_result (or the 300 s safety timeout), assert on the collected
 * event stream, then clean up.
 */
async function run() {
  console.log('🧪 Gridsearch UDS Meta E2E Test\n');
  // 1. Spawn worker in UDS mode
  console.log('1. Spawning remote C++ Taskflow Daemon');
  const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });
  // Give the daemon a moment to boot
  console.log('2. Connecting net.Socket with retries...');
  let socket;
  // Up to 15 attempts, 500 ms apart (~7.5 s) for the daemon to start listening.
  for (let i = 0; i < 15; i++) {
    try {
      await new Promise((resolve, reject) => {
        if (IS_WIN) {
          socket = net.connect({ port: 4001, host: '127.0.0.1' });
        } else {
          socket = net.connect(CPP_UDS_ARG);
        }
        socket.once('connect', resolve);
        socket.once('error', reject);
      });
      console.log(' ✅ Socket Connected to UDS!');
      break;
    } catch (e) {
      if (i === 14) throw e;
      await new Promise(r => setTimeout(r, 500));
    }
  }
  const collector = createCollector();
  let buffer = Buffer.alloc(0);
  // Buffer framing logic (length-prefixed streaming)
  // Wire format: [u32 LE payload length][UTF-8 JSON payload], repeated.
  socket.on('data', (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    while (buffer.length >= 4) {
      const len = buffer.readUInt32LE(0);
      if (buffer.length >= 4 + len) {
        const payload = buffer.toString('utf8', 4, 4 + len);
        buffer = buffer.subarray(4 + len);
        try {
          const msg = JSON.parse(payload);
          collector.handler(msg);
        } catch (e) {
          console.error("JSON PARSE ERROR:", e, payload);
        }
      } else {
        break; // Wait for more chunks
      }
    }
  });
  // 3. Send Gridsearch payload
  // USE gridsearch-sample.json instead of gridsearch-bcn-universities.json
  const sampleConfig = JSON.parse(
    readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
  );
  sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
  sampleConfig.jobId = 'uds-meta-test-abc';
  sampleConfig.noCache = true; // force re-enrichment even if cached
  console.log('3. Writing serialized IPC Payload over pipe...');
  const jsonStr = JSON.stringify(sampleConfig);
  const lenBuf = Buffer.alloc(4);
  lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
  socket.write(lenBuf);
  socket.write(jsonStr);
  // 4. Wait for pipeline completion (job_result event) or timeout
  console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');
  await new Promise((resolve) => {
    collector.onComplete = () => {
      // Send stop command to gracefully shut down the daemon
      console.log(' 📤 Sending stop command to daemon...');
      const stopPayload = JSON.stringify({ action: 'stop' });
      const stopLen = Buffer.alloc(4);
      stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
      socket.write(stopLen);
      socket.write(stopPayload);
      setTimeout(resolve, 1000); // Give daemon a moment to ack
    };
    // Safety timeout
    setTimeout(() => {
      console.log('\n ⏰ Timeout reached (300s) — forcing shutdown.');
      resolve();
    }, 300000); // Wait up to 5 minutes
  });
  console.log('\n\n5. Event summary');
  for (const [k, v] of Object.entries(collector.events)) {
    console.log(` ${k}: ${v.length}`);
  }
  // Assertions
  const ev = collector.events;
  assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
  assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
  assert(ev['location'].length > 0, 'location events received');
  assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
  assert(ev['job_result'].length === 1, 'job_result emitted once');
  // Verify social profiles and md body
  const nodes = ev['node'];
  let foundSocial = false;
  let foundSiteMd = false;
  for (const n of nodes) {
    const d = n.data;
    if (!d) continue;
    if (d.socials && d.socials.length > 0) {
      foundSocial = true;
    }
    if (d.sites && Array.isArray(d.sites) && d.sites.length > 0) {
      foundSiteMd = true;
    }
  }
  // Social discovery depends on live site content, so it is a soft assertion.
  if (foundSocial) {
    assert(foundSocial, 'At least one enriched node has social media profiles discovered');
  } else {
    console.log(' ⚠️ No social media profiles discovered in this run (data-dependent), but pipeline completed.');
  }
  assert(foundSiteMd, 'At least one enriched node has markdown sites mapped');
  console.log('6. Cleanup');
  socket.destroy();
  worker.kill('SIGTERM');
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────`);
  process.exit(failed > 0 ? 1 : 0);
}
run().catch(e => {
  console.error(e);
  process.exit(1);
});

View File

@ -0,0 +1,255 @@
/**
 * orchestrator/test-gridsearch-ipc-uds.mjs
 *
 * E2E test for Unix Domain Sockets / Windows Named Pipes!
 * Spawns the worker in `--uds` mode and tests direct high-throughput
 * length-prefixed JSON framing over a net.Socket.
 * (On Windows the transport is actually a loopback TCP socket on port 4000.)
 */
import { spawn } from 'node:child_process';
import { resolve, dirname, join } from 'node:path';
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import net from 'node:net';
import { tmpdir } from 'node:os';
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Toggle to exercise the dynamic cancel path (sends a cancel frame after 5 s).
const TEST_CANCEL = false;
if (!existsSync(EXE)) {
  console.error(`❌ Binary not found at ${EXE}`);
  process.exit(1);
}
const PIPE_NAME = 'polymech-test-uds';
// On Windows the worker listens on loopback TCP port 4000 instead of a UDS path.
const CPP_UDS_ARG = IS_WIN ? '4000' : join(tmpdir(), `${PIPE_NAME}.sock`);
// Remove a stale socket file from a previous run so the daemon can bind again.
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
  unlinkSync(CPP_UDS_ARG);
}
console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
// ── Event collector ─────────────────────────────────────────────────────────
// Buckets IPC messages by type, renders a one-line live progress indicator,
// and invokes `onComplete` once the job_result frame arrives.
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage', 'job_result'];
  const events = Object.fromEntries(KNOWN_TYPES.map((t) => [t, []]));
  return {
    events,
    onComplete: null,
    handler(msg) {
      const t = msg.type;
      // Unknown types still get a bucket, so nothing is dropped.
      (events[t] ??= []).push(msg);
      const d = msg.data ?? {};
      switch (t) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
        case 'job_result':
          console.log(`\n 🏁 Pipeline complete!`);
          if (this.onComplete) this.onComplete(msg);
          break;
      }
    },
  };
}
// Global pass/fail counters used by the final summary and the exit code.
let passed = 0;
let failed = 0;
// Log one test outcome and bump the matching counter.
function assert(condition, label) {
  if (condition) { console.log(`${label}`); passed++; }
  else { console.error(`${label}`); failed++; }
}
/**
 * E2E driver: spawn the C++ worker as a UDS/TCP daemon, connect with retries,
 * send a gridsearch job over length-prefixed JSON frames (optionally a cancel
 * frame when TEST_CANCEL is set), wait for job_result or the 120 s timeout,
 * then assert on the collected event stream and clean up.
 */
async function run() {
  console.log('🧪 Gridsearch UDS / Named Pipe E2E Test\n');
  // 1. Spawn worker in UDS mode
  console.log('1. Spawning remote C++ Taskflow Daemon');
  const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });
  // Give the daemon a moment to boot
  console.log('2. Connecting net.Socket with retries...');
  let socket;
  // Up to 15 attempts, 500 ms apart (~7.5 s) for the daemon to start listening.
  for (let i = 0; i < 15; i++) {
    try {
      await new Promise((resolve, reject) => {
        if (IS_WIN) {
          socket = net.connect({ port: 4000, host: '127.0.0.1' });
        } else {
          socket = net.connect(CPP_UDS_ARG);
        }
        socket.once('connect', resolve);
        socket.once('error', reject);
      });
      console.log(' ✅ Socket Connected to UDS!');
      break;
    } catch (e) {
      if (i === 14) throw e;
      await new Promise(r => setTimeout(r, 500));
    }
  }
  const collector = createCollector();
  let buffer = Buffer.alloc(0);
  // Buffer framing logic (length-prefixed streaming)
  // Wire format: [u32 LE payload length][UTF-8 JSON payload], repeated.
  socket.on('data', (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    while (buffer.length >= 4) {
      const len = buffer.readUInt32LE(0);
      if (buffer.length >= 4 + len) {
        const payload = buffer.toString('utf8', 4, 4 + len);
        buffer = buffer.subarray(4 + len);
        try {
          const msg = JSON.parse(payload);
          collector.handler(msg);
        } catch (e) {
          console.error("JSON PARSE ERROR:", e, payload);
        }
      } else {
        break; // Wait for more chunks
      }
    }
  });
  // 3. Send Gridsearch payload
  const sampleConfig = JSON.parse(
    readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-bcn-universities.json'), 'utf8')
  );
  sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
  sampleConfig.jobId = 'uds-test-cancel-abc';
  console.log('3. Writing serialized IPC Payload over pipe...');
  const jsonStr = JSON.stringify(sampleConfig);
  const lenBuf = Buffer.alloc(4);
  lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
  socket.write(lenBuf);
  socket.write(jsonStr);
  // Send cancellation after 5 seconds
  if (TEST_CANCEL) {
    setTimeout(() => {
      console.log('\n\n--> Testing Dynamic Cancellation (Sending cancel event for uds-test-cancel-abc)...');
      const cancelPayload = JSON.stringify({ action: "cancel", jobId: "uds-test-cancel-abc" });
      const cancelLenBuf = Buffer.alloc(4);
      cancelLenBuf.writeUInt32LE(Buffer.byteLength(cancelPayload));
      socket.write(cancelLenBuf);
      socket.write(cancelPayload);
    }, 5000);
  }
  // 4. Wait for pipeline completion (job_result event) or timeout
  console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');
  await new Promise((resolve) => {
    collector.onComplete = () => {
      // Send stop command to gracefully shut down the daemon
      console.log(' 📤 Sending stop command to daemon...');
      const stopPayload = JSON.stringify({ action: 'stop' });
      const stopLen = Buffer.alloc(4);
      stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
      socket.write(stopLen);
      socket.write(stopPayload);
      setTimeout(resolve, 1000); // Give daemon a moment to ack
    };
    // Safety timeout
    setTimeout(() => {
      console.log('\n ⏰ Timeout reached (120s) — forcing shutdown.');
      resolve();
    }, 120000);
  });
  console.log('\n\n5. Event summary');
  for (const [k, v] of Object.entries(collector.events)) {
    console.log(` ${k}: ${v.length}`);
  }
  // Assertions
  const ev = collector.events;
  assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
  assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
  assert(ev['location'].length > 0, 'location events received');
  assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
  assert(ev['job_result'].length === 1, 'job_result emitted once');
  // Check enrichment skip log (if present in log events)
  // 'log' is not pre-registered in the collector; the handler still buckets
  // unknown types, hence the `?? []` guard for runs that emit no log frames.
  const logEvents = ev['log'] ?? [];
  const skipLog = logEvents.find(l =>
    typeof l.data === 'string' && l.data.includes('already enriched')
  );
  const nodeCount = ev['node'].length + ev['node-error'].length;
  if (skipLog) {
    console.log(` Pre-enrich skip detected: ${skipLog.data}`);
    assert(nodeCount === 0, 'no enrichment needed (all skipped)');
  } else {
    console.log(' No pre-enrich skips (all locations are new or unenriched)');
    assert(nodeCount > 0, 'enrichment node events received');
  }
  // Check filterTypes assertions: all locations must have website + matching type
  const FILTER_TYPE = 'Recycling center';
  const locations = ev['location'];
  const badWebsite = locations.filter(l => {
    const loc = l.data?.location;
    return !loc?.website;
  });
  assert(badWebsite.length === 0, `all locations have website (${badWebsite.length} missing)`);
  // A location matches if FILTER_TYPE appears in its `types` list OR equals
  // its primary `type` string.
  const badType = locations.filter(l => {
    const loc = l.data?.location;
    const types = loc?.types ?? [];
    const type = loc?.type ?? '';
    return !types.includes(FILTER_TYPE) && type !== FILTER_TYPE;
  });
  if (badType.length > 0) {
    console.log(` ❌ Mismatched locations:`);
    badType.slice(0, 3).forEach(l => console.log(JSON.stringify(l.data?.location, null, 2)));
  }
  assert(badType.length === 0, `all locations match type "${FILTER_TYPE}" (${badType.length} mismatched)`);
  const filterLog = logEvents.find(l =>
    typeof l.data === 'string' && l.data.includes('locations removed')
  );
  if (filterLog) {
    console.log(` Filter applied: ${filterLog.data}`);
  }
  const filterTypesLog = logEvents.filter(l =>
    typeof l.data === 'string' && (l.data.includes('filterTypes:') || l.data.includes(' - '))
  );
  if (filterTypesLog.length > 0) {
    console.log(` Parsed filterTypes in C++:`);
    filterTypesLog.forEach(l => console.log(` ${l.data}`));
  }
  console.log(` Locations after filter: ${locations.length}`);
  console.log('6. Cleanup');
  socket.destroy();
  worker.kill('SIGTERM');
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────`);
  process.exit(failed > 0 ? 1 : 0);
}
run().catch(e => {
  console.error(e);
  process.exit(1);
});

View File

@ -0,0 +1,204 @@
/**
 * orchestrator/test-gridsearch-ipc.mjs
 *
 * E2E test: spawn the C++ worker, send a gridsearch request
 * matching `npm run gridsearch:enrich` defaults, collect IPC events,
 * and verify the full event sequence.
 *
 * Run: node orchestrator/test-gridsearch-ipc.mjs
 * Needs: npm run build-debug (or npm run build)
 */
import { spawnWorker } from './spawn.mjs';
import { resolve, dirname } from 'node:path';
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
// NOTE(review): node:fs is imported twice (named + default); one form
// could be dropped.
import fs from 'node:fs';
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
if (!fs.existsSync(EXE)) {
  console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
  process.exit(1);
}
console.log(`Binary: ${EXE}\n`);
// Load the sample settings (same as gridsearch:enrich)
const sampleConfig = JSON.parse(
  readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);
// Global pass/fail counters used by the final summary and the exit code.
let passed = 0;
let failed = 0;
// Log one test outcome and bump the matching counter.
function assert(condition, label) {
  if (condition) {
    console.log(`${label}`);
    passed++;
  } else {
    console.error(`${label}`);
    failed++;
  }
}
// ── Event collector ─────────────────────────────────────────────────────────
// NOTE(review): EXPECTED_EVENTS is not referenced anywhere in this file —
// createCollector() keeps its own list. Either use it there or remove it.
const EXPECTED_EVENTS = [
  'grid-ready',
  'waypoint-start',
  'area',
  'location',
  'enrich-start',
  'node',
  'nodePage',
  // 'node-error' — may or may not occur, depends on network
];
// Buckets IPC messages by type (keyed on msg.type, payload in msg.payload)
// and renders a one-line live progress indicator for the chattiest kinds.
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = Object.fromEntries(KNOWN_TYPES.map((t) => [t, []]));
  return {
    events,
    handler(msg) {
      const t = msg.type;
      // Unknown types still get a bucket, so nothing is dropped.
      (events[t] ??= []).push(msg);
      // Live progress indicator
      const d = msg.payload ?? {};
      switch (t) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
      }
    },
  };
}
// ── Main test ───────────────────────────────────────────────────────────────
// Drives the worker over stdio IPC (via spawnWorker) rather than a socket:
// spawn → send gridsearch request → verify job_result summary and event
// stream → graceful shutdown.
async function run() {
  console.log('🧪 Gridsearch IPC E2E Test\n');
  // ── 1. Spawn worker ───────────────────────────────────────────────────
  console.log('1. Spawn worker');
  const worker = spawnWorker(EXE);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready signal');
  // ── 2. Register event collector ───────────────────────────────────────
  const collector = createCollector();
  worker.onEvent(collector.handler);
  // ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
  console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
  const t0 = Date.now();
  // Very long timeout — enrichment can take minutes
  const result = await worker.request(
    {
      type: 'gridsearch',
      payload: {
        ...sampleConfig,
        enrich: true,
      },
    },
    5 * 60 * 1000 // 5 min timeout
  );
  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
  console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);
  // ── 4. Verify final result ────────────────────────────────────────────
  console.log('3. Verify job_result');
  assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);
  const summary = result.payload ?? null;
  assert(summary !== null, 'job_result payload is present');
  if (summary) {
    assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
    assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
    assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
    assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
    assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
    assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
    assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
    assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
    assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
  }
  // ── 5. Verify event sequence ──────────────────────────────────────────
  console.log('4. Verify event stream');
  const e = collector.events;
  assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
  assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
  assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
  assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
  assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);
  const totalNodes = e['node'].length + e['node-error'].length;
  assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);
  // Validate grid-ready payload
  if (e['grid-ready'].length > 0) {
    const gr = e['grid-ready'][0].payload ?? {};
    assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
    assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
  }
  // Validate location events have required fields
  if (e['location'].length > 0) {
    const loc = e['location'][0].payload ?? {};
    assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
    assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
    assert(typeof loc.areaName === 'string', 'location event has areaName');
  }
  assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);
  // Validate node payloads
  if (e['node'].length > 0) {
    const nd = e['node'][0].payload ?? {};
    assert(typeof nd.placeId === 'string', 'node event has placeId');
    assert(typeof nd.title === 'string', 'node event has title');
    assert(Array.isArray(nd.emails), 'node event has emails array');
    assert(typeof nd.status === 'string', 'node event has status');
  }
  // ── 6. Print event summary ────────────────────────────────────────────
  console.log('\n5. Event summary');
  for (const [type, arr] of Object.entries(e)) {
    if (arr.length > 0) console.log(` ${type}: ${arr.length}`);
  }
  // ── 7. Shutdown ───────────────────────────────────────────────────────
  console.log('\n6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');
  await new Promise(r => setTimeout(r, 500));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);
  // ── Summary ───────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);
  process.exit(failed > 0 ? 1 : 0);
}
run().catch((err) => {
  console.error('Test runner error:', err);
  process.exit(1);
});

View File

@ -0,0 +1,90 @@
/**
 * orchestrator/test-ipc.mjs
 *
 * Integration test: spawn the C++ worker, exchange messages, verify responses.
 *
 * Run: node orchestrator/test-ipc.mjs
 * Needs: npm run build (to compile the C++ binary first)
 */
import { spawnWorker } from './spawn.mjs';
import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = dirname(fileURLToPath(import.meta.url));
// Fix: pick the binary name per platform. It was hardcoded to the Windows
// `polymech-cli.exe`, which made this test unable to locate the binary on
// Linux/macOS — the other orchestrator tests already branch on the platform.
const EXE_NAME = process.platform === 'win32' ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Global pass/fail counters used by the final summary and the exit code.
let passed = 0;
let failed = 0;
// Log one test outcome and bump the matching counter.
function assert(condition, label) {
  if (condition) {
    console.log(`${label}`);
    passed++;
  } else {
    console.error(`${label}`);
    failed++;
  }
}
// Exercises the basic stdio IPC contract: ready handshake, ping/pong,
// job echo, error path for unknown types, concurrent requests, and a
// graceful shutdown with exit-code check.
async function run() {
  console.log('\n🔧 IPC Integration Tests\n');
  // ── 1. Spawn & ready ────────────────────────────────────────────────────
  console.log('1. Spawn worker and wait for ready signal');
  const worker = spawnWorker(EXE);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready message on startup');
  // ── 2. Ping / Pong ─────────────────────────────────────────────────────
  console.log('2. Ping → Pong');
  const pong = await worker.request({ type: 'ping' });
  assert(pong.type === 'pong', `Response type is "pong" (got "${pong.type}")`);
  // ── 3. Job echo ─────────────────────────────────────────────────────────
  console.log('3. Job → Job Result (echo payload)');
  const payload = { action: 'resize', width: 1024, format: 'webp' };
  const jobResult = await worker.request({ type: 'job', payload });
  assert(jobResult.type === 'job_result', `Response type is "job_result" (got "${jobResult.type}")`);
  assert(
    jobResult.payload?.action === 'resize' && jobResult.payload?.width === 1024,
    'Payload echoed back correctly'
  );
  // ── 4. Unknown type → error ─────────────────────────────────────────────
  console.log('4. Unknown type → error response');
  const errResp = await worker.request({ type: 'nonsense' });
  assert(errResp.type === 'error', `Response type is "error" (got "${errResp.type}")`);
  // ── 5. Multiple rapid requests ──────────────────────────────────────────
  console.log('5. Multiple concurrent requests');
  const promises = [];
  for (let i = 0; i < 10; i++) {
    promises.push(worker.request({ type: 'ping', payload: { seq: i } }));
  }
  const results = await Promise.all(promises);
  assert(results.length === 10, `All 10 responses received`);
  assert(results.every(r => r.type === 'pong'), 'All responses are pong');
  // ── 6. Graceful shutdown ────────────────────────────────────────────────
  console.log('6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', `Shutdown acknowledged (got "${shutdownRes.type}")`);
  // Wait a beat for process exit
  await new Promise(r => setTimeout(r, 200));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);
  // ── Summary ─────────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);
  process.exit(failed > 0 ? 1 : 0);
}
run().catch((err) => {
  console.error('Test runner error:', err);
  process.exit(1);
});

6
packages/kbot/cpp/package-lock.json generated Normal file
View File

@ -0,0 +1,6 @@
{
"name": "mono-cpp",
"lockfileVersion": 3,
"requires": true,
"packages": {}
}

View File

@ -0,0 +1,40 @@
{
"name": "mono-cpp",
"version": "1.0.0",
"description": "Cross-platform C++ CLI built with CMake.",
"directories": {
"test": "tests"
},
"scripts": {
"config": "cmake --preset dev",
"config:release": "cmake --preset release",
"build": "cmake --preset dev && cmake --build --preset dev",
"build:release": "cmake --preset release && cmake --build --preset release",
"build:linux": "bash build-linux.sh",
"test": "ctest --test-dir build/dev -C Debug --output-on-failure",
"test:release": "ctest --test-dir build/release -C Release --output-on-failure",
"clean": "cmake -E rm -rf build dist",
"rebuild": "npm run clean && npm run build",
"run": ".\\dist\\polymech-cli.exe --help",
"worker": ".\\dist\\polymech-cli.exe worker",
"test:ipc": "node orchestrator/test-gridsearch-ipc.mjs",
"gridsearch": ".\\dist\\polymech-cli.exe gridsearch ABW recycling --dry-run",
"gridsearch:settings": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --dry-run",
"gridsearch:settings:live": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json",
"gridsearch:enrich": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --enrich",
"gridsearch:enrich-test": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-test-bcn.json --enrich --persistence-postgres",
"test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs",
"test:gridsearch-filter-ipc": "cmake --build build/release --target test_gridsearch_ipc && .\\dist\\test_gridsearch_ipc.exe",
"test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs",
"test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs",
"test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs",
"test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe"
},
"repository": {
"type": "git",
"url": "https://git.polymech.info/polymech/mono-cpp.git"
},
"keywords": [],
"author": "",
"license": "ISC"
}

View File

@ -0,0 +1,4 @@
# enrichers: static library implementing the meta/email enrichment pipeline.
add_library(enrichers STATIC src/enrichers.cpp)
# Consumers compile against the headers under include/ (public usage requirement).
target_include_directories(enrichers PUBLIC include)
# NOTE(review): enrichers.h only includes standard headers, so http/html/json/
# logger look like implementation details — PRIVATE may suffice; confirm no
# consumer relies on the transitive usage requirements before narrowing.
target_link_libraries(enrichers PUBLIC http html json logger)

View File

@ -0,0 +1,162 @@
#pragma once
#include <map>
#include <string>
#include <vector>
namespace enrichers {
// ── Status codes ────────────────────────────────────────────────────────────
// Outcome of enriching a single location (see enrich_location()).
enum class EnrichStatus {
  OK,
  NO_EMAIL, // also the default for a freshly constructed EnrichedNode
  META_TIMEOUT,
  EMAIL_TIMEOUT,
  FETCH_ERROR,
  NO_PAGES,
  ERROR,
};
// Stable upper-case name for a status value (e.g. "OK", "NO_EMAIL");
// returns "UNKNOWN" for out-of-range values.
const char *status_string(EnrichStatus s);
// ── Data types ──────────────────────────────────────────────────────────────
// Per-page diagnostic record accumulated while scraping a site for emails.
struct PageError {
  std::string url;
  std::string status; // "SEARCHED_EMAIL", "FAILED", ...
  std::string method; // "GET", "SCRAPELESS", ...
  std::string error;
  int http_status = 0;
  std::vector<std::string> emails;
};
// One discovered social-media profile link.
struct SocialLink {
  std::string platform; // "instagram", "facebook", "linkedin", ...
  std::string url;
};
// Metadata extracted from one fetched website (see scrape_meta()).
struct SiteMeta {
  std::string title;
  std::string description;
  std::string og_image;
  std::string canonical;
  std::vector<SocialLink> socials;
  std::vector<std::string> internal_pages; // discovered internal hrefs
  std::vector<std::string> emails;
  std::string body_text;
  std::string body_html;
  std::map<std::string, std::string> sites; // url -> body_md
  int http_status = 0;
  std::string fetch_error; // fetch failure message, if any
  std::vector<std::string> json_ld;
};
// Aggregated enrichment result for one location (returned by enrich_location()).
struct EnrichedNode {
  int idx = 0;
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  int pages_found = 0;
  int pages_scraped = 0;
  std::vector<std::string> emails;
  std::vector<SocialLink> socials;
  int meta_ms = 0;
  int email_ms = 0;
  int total_ms = 0;
  EnrichStatus status = EnrichStatus::NO_EMAIL;
  std::string error;
  std::map<std::string, std::string> pages; // "home" → body text
  std::vector<std::string> meta_pages;
  std::vector<PageError> page_errors;
  std::string enricher_hash;
  std::string geo_json;
  std::map<std::string, std::string> sites; // url -> body_md
};
// ── Configuration ───────────────────────────────────────────────────────────
// Tunables for the enrichment pipeline; every field has a working default.
struct EnrichConfig {
  bool enable_homepage_md = true;
  int meta_timeout_ms = 10000;
  int email_timeout_ms = 15000;
  int email_page_timeout_ms = 10000;
  int email_max_pages = 8;
  int email_abort_after = 1;
  /// Scrapeless API key — if set, pages that yield no emails via plain
  /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping
  /// API (JS rendering). Leave empty to disable the fallback.
  std::string scrapeless_key;
  std::string bigdata_key;
  // Multilingual substrings used when selecting candidate contact/about
  // pages — see the implementation in enrichers.cpp.
  std::vector<std::string> contact_patterns = {
      "contact", "kontakt", "contacto", "contacta", "impression",
      "about", "impress", "impressum", "datenschutz", "privacy",
      "legal", "team", "nosotros", "empresa", "sobre",
  };
  // Well-known paths probed directly off the site root.
  std::vector<std::string> probe_paths = {
      "/contact", "/contacto", "/kontakt", "/contacta",
      "/about", "/about-us", "/impressum",
  };
  std::string meta_scraper;
  int meta_concurrency = 5;
  int meta_idle_timeout = 60;
};
// ── Location input ──────────────────────────────────────────────────────────
// Minimal description of a location to enrich (typically a gridsearch hit).
struct LocationInput {
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  double lat = 0;
  double lng = 0;
};
// ── Core API ────────────────────────────────────────────────────────────────
/// Check if a candidate string looks like a real email address.
bool is_likely_email(const std::string &candidate);
/// Extract all email addresses from a text body.
std::vector<std::string> extract_emails(const std::string &text);
/// Scrape metadata from a website URL (static HTML via libcurl + lexbor).
SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000);
/// Scrape emails from a single page URL.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms = 10000);
/// Fetch a page via Scrapeless Universal Scraping API (JS rendering),
/// then extract emails from the rendered HTML. Returns empty if key is
/// blank or the API call fails.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms = 15000);
/// Scrape metadata from a website URL via Scrapeless Universal API (JS
/// rendering).
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key,
                                int timeout_ms = 15000);
/// Full enrichment pipeline for a single location: meta → email.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg = {});
/// Resolve a URL relative to a base URL.
std::string resolve_url(const std::string &base, const std::string &href);
} // namespace enrichers

View File

@ -0,0 +1,800 @@
#include "enrichers/enrichers.h"
#include "html/html.h"
#include "http/http.h"
#include "logger/logger.h"
#include "json/json.h"
#include <algorithm>
#include <chrono>
#include <future>
#include <regex>
#include <set>
#include <sstream>
namespace enrichers {
// ── Status string ───────────────────────────────────────────────────────────
// Human-readable name for an EnrichStatus value (used in logs/IPC payloads).
const char *status_string(EnrichStatus s) {
  switch (s) {
  case EnrichStatus::OK:            return "OK";
  case EnrichStatus::NO_EMAIL:      return "NO_EMAIL";
  case EnrichStatus::META_TIMEOUT:  return "META_TIMEOUT";
  case EnrichStatus::EMAIL_TIMEOUT: return "EMAIL_TIMEOUT";
  case EnrichStatus::FETCH_ERROR:   return "FETCH_ERROR";
  case EnrichStatus::NO_PAGES:      return "NO_PAGES";
  case EnrichStatus::ERROR:         return "ERROR";
  }
  // Defensive fallback for out-of-range values cast into the enum.
  return "UNKNOWN";
}
// ── Timing helper ───────────────────────────────────────────────────────────
// Whole milliseconds elapsed since `t0` on the monotonic clock.
static int elapsed_ms(std::chrono::steady_clock::time_point t0) {
  using namespace std::chrono;
  const auto delta = steady_clock::now() - t0;
  return static_cast<int>(duration_cast<milliseconds>(delta).count());
}
// ── Email extraction ────────────────────────────────────────────────────────
// NOTE(review): EMAIL_RE is not referenced by the extraction code visible in
// this region (extract_emails() uses a manual scan around '@'); confirm
// whether the regex is still used further down or can be removed.
static const std::regex
    EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
             std::regex::optimize);
// Asset extensions that disqualify an email-like string
static const std::vector<std::string> ASSET_EXTS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    ".avif", ".css", ".js", ".woff", ".woff2", ".ttf",
    ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico",
};
// Return an ASCII-lower-cased copy of `s`.
static std::string to_lower(const std::string &s) {
  std::string result(s);
  for (auto &ch : result)
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  return result;
}
// Heuristic filter: accept strings that plausibly are real email addresses,
// rejecting filenames, placeholders, build-hash locals, and numeric hosts.
bool is_likely_email(const std::string &candidate) {
  const size_t n = candidate.size();
  if (n < 5 || n > 254)
    return false;
  if (candidate.find("..") != std::string::npos)
    return false;
  const auto at = candidate.find('@');
  if (at == std::string::npos || at == 0 || at + 1 == n)
    return false;
  const std::string lower = to_lower(candidate);
  // A suffix like ".png"/".css" means this is a filename, not an address.
  for (const auto &ext : ASSET_EXTS) {
    if (lower.size() >= ext.size() &&
        lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0)
      return false;
  }
  // Common placeholder / tracking substrings.
  for (const char *bad :
       {"example", "sentry", "test", "placeholder", "wixpress.com"}) {
    if (lower.find(bad) != std::string::npos)
      return false;
  }
  // A local part of 8+ pure-hex characters is almost certainly a hash.
  if (at >= 8) {
    const std::string local = lower.substr(0, at);
    const bool hex_only = std::all_of(local.begin(), local.end(), [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
    });
    if (hex_only)
      return false;
  }
  // Domain must have a dot and a TLD of at least 2 characters.
  const std::string domain = lower.substr(at + 1);
  const auto dot = domain.find('.');
  if (dot == std::string::npos)
    return false;
  if (domain.length() - dot <= 2)
    return false; // Minimum 2 chars for TLD
  // Reject purely numeric host labels (e.g. 1234@5678.com).
  const std::string host = domain.substr(0, dot);
  const bool digits_only =
      !host.empty() &&
      std::all_of(host.begin(), host.end(),
                  [](unsigned char c) { return std::isdigit(c) != 0; });
  return !digits_only;
}
// True for characters allowed inside an email token: alphanumerics plus
// the literal set . _ % + -
static bool is_valid_email_char(char c) {
  switch (c) {
  case '.':
  case '_':
  case '%':
  case '+':
  case '-':
    return true;
  default:
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
  }
}
// Scan `text` for email-like tokens and return the plausible ones,
// lowercased and de-duplicated, in first-seen order.
//
// Strategy: locate each '@', expand left and right over valid email
// characters, trim stray leading/trailing '.'/'-' that the greedy expansion
// grabbed, then filter the candidate through is_likely_email().
std::vector<std::string> extract_emails(const std::string &text) {
  std::vector<std::string> results;
  if (text.empty())
    return results;
  std::set<std::string> seen;
  size_t pos = 0;
  while ((pos = text.find('@', pos)) != std::string::npos) {
    // An '@' at either end of the text cannot have both a local part
    // and a domain.
    if (pos == 0 || pos == text.length() - 1) {
      pos++;
      continue;
    }
    // Scan backwards over the local part
    size_t start = pos;
    while (start > 0 && is_valid_email_char(text[start - 1])) {
      start--;
    }
    // Scan forwards over the domain part
    size_t end = pos;
    while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) {
      end++;
    }
    // Require at least one character on each side of the '@'.
    if (start < pos && end > pos) {
      std::string candidate = text.substr(start, end - start + 1);
      // Strip trailing dots/hyphens eagerly grabbed
      while (!candidate.empty() &&
             (candidate.back() == '.' || candidate.back() == '-')) {
        candidate.pop_back();
        end--; // keep `end` in sync so the outer search resumes correctly
      }
      // Strip leading dots/hyphens
      size_t local_start = 0;
      while (local_start < candidate.length() &&
             (candidate[local_start] == '.' || candidate[local_start] == '-')) {
        local_start++;
      }
      if (local_start > 0) {
        candidate = candidate.substr(local_start);
      }
      std::string lower = to_lower(candidate);
      if (is_likely_email(lower)) {
        if (seen.insert(lower).second) {
          results.push_back(lower);
        }
      }
    }
    // Resume after the scanned candidate (or just past a bare '@').
    pos = end + 1;
  }
  return results;
}
// ── URL resolution ──────────────────────────────────────────────────────────
// Resolve `href` against `base`, returning an absolute http(s) URL, or ""
// for links we do not follow (mailto:, tel:, javascript:, fragments) and
// for bases without a parseable scheme.
std::string resolve_url(const std::string &base, const std::string &href) {
  if (href.empty())
    return {};
  auto starts_with = [](const std::string &s, const char *prefix) {
    return s.rfind(prefix, 0) == 0;
  };
  // Absolute URLs pass straight through.
  if (starts_with(href, "http://") || starts_with(href, "https://"))
    return href;
  // Protocol-relative: inherit the base scheme, defaulting to https.
  if (starts_with(href, "//")) {
    const auto scheme_end = base.find("//");
    if (scheme_end == std::string::npos)
      return "https:" + href;
    return base.substr(0, scheme_end) + href;
  }
  // Non-navigable schemes and in-page anchors.
  if (starts_with(href, "mailto:") || starts_with(href, "tel:") ||
      starts_with(href, "javascript:") || href[0] == '#')
    return {};
  // We need a parseable origin (scheme://host) from the base.
  const auto proto = base.find("://");
  if (proto == std::string::npos)
    return {};
  const auto origin_end = base.find('/', proto + 3);
  const std::string origin =
      (origin_end == std::string::npos) ? base : base.substr(0, origin_end);
  // Root-relative path.
  if (href[0] == '/')
    return origin + href;
  // Relative to the base document's directory, when the base has a path.
  if (origin_end != std::string::npos) {
    const auto last_slash = base.rfind('/');
    if (last_slash > proto + 2)
      return base.substr(0, last_slash + 1) + href;
  }
  return origin + "/" + href;
}
// ── Social link classification ──────────────────────────────────────────────
// Classify a URL as a known social network ("instagram", "facebook", ...),
// or return "" for anything else. Matching is case-insensitive substring
// search, except "x.com" which must be anchored to a host boundary:
// FIX: a bare substring match on "x.com" also hit linux.com, netflix.com,
// wix.com, xbox.com, etc., mislabeling them as Twitter links.
static std::string classify_social(const std::string &url) {
  std::string lower(url);
  std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) {
    return static_cast<char>(std::tolower(c));
  });
  auto has = [&lower](const char *needle) {
    return lower.find(needle) != std::string::npos;
  };
  if (has("instagram.com"))
    return "instagram";
  if (has("facebook.com"))
    return "facebook";
  if (has("linkedin.com"))
    return "linkedin";
  // "x.com" must follow "//" (scheme) or "." (subdomain), or start the string.
  if (has("twitter.com") || has("//x.com") || has(".x.com") ||
      lower.rfind("x.com", 0) == 0)
    return "twitter";
  if (has("youtube.com"))
    return "youtube";
  if (has("tiktok.com"))
    return "tiktok";
  if (has("pinterest.com"))
    return "pinterest";
  if (has("github.com"))
    return "github";
  return {};
}
// ── Same-origin check ───────────────────────────────────────────────────────
// Extract "scheme://host" from a URL; "" when no "://" is present.
static std::string get_origin(const std::string &url) {
  const auto scheme = url.find("://");
  if (scheme == std::string::npos)
    return {};
  const auto path_start = url.find('/', scheme + 3);
  if (path_start == std::string::npos)
    return url;
  return url.substr(0, path_start);
}
// True when both URLs share an origin, compared lowercase and with a
// leading "www." treated as insignificant.
static bool is_same_origin(const std::string &base_url,
                           const std::string &href) {
  auto normalized = [](const std::string &u) {
    std::string o = to_lower(get_origin(u));
    const auto www = o.find("://www.");
    if (www != std::string::npos) {
      o = o.substr(0, www + 3) + o.substr(www + 7);
    }
    return o;
  };
  const auto lhs = normalized(base_url);
  const auto rhs = normalized(href);
  if (lhs.empty() || rhs.empty())
    return false;
  return lhs == rhs;
}
// ── Contact page matching ───────────────────────────────────────────────────
static bool matches_contact_pattern(const std::string &url,
const std::vector<std::string> &patterns) {
auto lower = to_lower(url);
for (auto &pat : patterns) {
if (lower.find(to_lower(pat)) != std::string::npos)
return true;
}
return false;
}
// ── Shared HTML parsing logic for Meta ──────────────────────────────────────
// Build a SiteMeta from already-fetched HTML: title/description/OG tags,
// canonical URL, body text, JSON-LD, emails (from body text plus mailto:
// links), social profile links, and de-duplicated same-origin internal pages
// with fragments stripped. If `fetch_error` is non-empty, only http_status
// and fetch_error are populated.
static SiteMeta parse_meta_html(const std::string &url, int http_status,
                                const std::string &html_body,
                                const std::string &fetch_error) {
  SiteMeta meta;
  meta.http_status = http_status;
  if (!fetch_error.empty()) {
    meta.fetch_error = fetch_error;
    return meta;
  }
  meta.body_html = html_body;
  // Parse with lexbor helpers
  meta.title = html::get_title(html_body);
  meta.description = html::get_meta(html_body, "description");
  meta.og_image = html::get_meta(html_body, "og:image");
  meta.canonical = html::get_canonical(html_body);
  meta.body_text = html::get_body_text(html_body);
  meta.json_ld = html::get_json_ld(html_body);
  // OG fallbacks when the plain meta tags are absent
  if (meta.description.empty())
    meta.description = html::get_meta(html_body, "og:description");
  if (meta.title.empty())
    meta.title = html::get_meta(html_body, "og:title");
  // Links — classify into social / internal / mailto
  auto links = html::get_links(html_body);
  std::set<std::string> seen_pages;
  // Extract emails from body text (much smaller than raw HTML)
  meta.emails = extract_emails(meta.body_text);
  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string email = lk.href.substr(7);
      // Strip anything after ? (like ?subject=...)
      auto q = email.find('?');
      if (q != std::string::npos)
        email = email.substr(0, q);
      // Normalize before validating
      email = to_lower(email);
      if (is_likely_email(email)) {
        // Linear find keeps first-seen order; email lists are tiny.
        if (std::find(meta.emails.begin(), meta.emails.end(), email) ==
            meta.emails.end()) {
          meta.emails.push_back(email);
        }
      }
      continue;
    }
    auto resolved = resolve_url(url, lk.href);
    if (resolved.empty())
      continue;
    auto social = classify_social(resolved);
    if (!social.empty()) {
      meta.socials.push_back({social, resolved});
      continue;
    }
    if (is_same_origin(url, resolved)) {
      // Strip fragment (#) from URL so anchors don't duplicate pages
      auto hash_pos = resolved.find('#');
      if (hash_pos != std::string::npos) {
        resolved = resolved.substr(0, hash_pos);
      }
      if (!resolved.empty() && seen_pages.insert(resolved).second) {
        meta.internal_pages.push_back(resolved);
      }
    }
  }
  return meta;
}
// ── scrape_meta ─────────────────────────────────────────────────────────────
// Fetch `url` with a plain HTTP GET (browser-like user agent) and parse the
// response into a SiteMeta. Transport failures and HTTP >= 400 are surfaced
// through SiteMeta::fetch_error (set to the response body).
SiteMeta scrape_meta(const std::string &url, int timeout_ms) {
  http::GetOptions options;
  options.timeout_ms = timeout_ms;
  options.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/120.0.0.0 Safari/537.36";
  const auto response = http::get(url, options);
  const bool failed = response.status_code < 0 || response.status_code >= 400;
  return parse_meta_html(url, static_cast<int>(response.status_code),
                         response.body,
                         failed ? response.body : std::string{});
}
// ── scrape_emails_from_page ─────────────────────────────────────────────────
std::vector<std::string> scrape_emails_from_page(const std::string &url,
int timeout_ms,
int &out_status_code) {
http::GetOptions opts;
opts.timeout_ms = timeout_ms;
opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36";
auto resp = http::get(url, opts);
out_status_code = static_cast<int>(resp.status_code);
if (resp.status_code < 0 || resp.status_code >= 400) {
return {};
}
// Extract body text then find emails
auto text = html::get_body_text(resp.body);
auto from_text = extract_emails(text);
// Extract mailto: links from HTML directly without regexing the huge string
auto links = html::get_links(resp.body);
std::set<std::string> seen(from_text.begin(), from_text.end());
for (auto &lk : links) {
if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
std::string m = lk.href.substr(7);
auto q = m.find('?');
if (q != std::string::npos)
m = m.substr(0, q);
m = to_lower(m);
if (is_likely_email(m)) {
if (seen.insert(m).second) {
from_text.push_back(m);
}
}
}
}
return from_text;
}
// Pull the rendered HTML out of a Scrapeless JSON response ("data" field);
// fall back to the raw body when the field is absent or empty.
static std::string extract_scrapeless_html(const std::string &json_body) {
  auto html_payload = json::get_string(json_body, "data");
  return html_payload.empty() ? json_body : html_payload;
}
// Fetch `url` through the Scrapeless web-unlocker API (JS-rendered HTML)
// and parse the result into a SiteMeta. Falls back to an error SiteMeta
// when the API key is missing or the API call fails.
// FIX: the URL is now JSON-escaped before being spliced into the hand-built
// payload — a quote/backslash in the URL previously produced malformed JSON.
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key, int timeout_ms) {
  if (api_key.empty())
    return parse_meta_html(url, 0, "", "missing api key");
  // Minimal JSON string escaping: quote, backslash, and control chars.
  std::string escaped;
  escaped.reserve(url.size());
  for (char c : url) {
    if (c == '"' || c == '\\') {
      escaped += '\\';
      escaped += c;
    } else if (static_cast<unsigned char>(c) < 0x20) {
      static const char *kHex = "0123456789abcdef";
      escaped += "\\u00";
      escaped += kHex[(static_cast<unsigned char>(c) >> 4) & 0xF];
      escaped += kHex[static_cast<unsigned char>(c) & 0xF];
    } else {
      escaped += c;
    }
  }
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        escaped +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
    logger::error("[meta:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  fetch_err);
    return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                           fetch_err);
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  return parse_meta_html(url, static_cast<int>(resp.status_code), rendered_html,
                         "");
}
// Fetch `url` through the Scrapeless web-unlocker API (JS-rendered HTML)
// and return every plausible email found in the rendered page.
// Returns {} when the API key is missing or the API call fails.
// FIX: the URL is now JSON-escaped before being spliced into the hand-built
// payload — a quote/backslash in the URL previously produced malformed JSON.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms) {
  if (api_key.empty())
    return {};
  // Minimal JSON string escaping: quote, backslash, and control chars.
  std::string escaped;
  escaped.reserve(url.size());
  for (char c : url) {
    if (c == '"' || c == '\\') {
      escaped += '\\';
      escaped += c;
    } else if (static_cast<unsigned char>(c) < 0x20) {
      static const char *kHex = "0123456789abcdef";
      escaped += "\\u00";
      escaped += kHex[(static_cast<unsigned char>(c) >> 4) & 0xF];
      escaped += kHex[static_cast<unsigned char>(c) & 0xF];
    } else {
      escaped += c;
    }
  }
  // Build the Scrapeless Universal Scraping API request body.
  // We ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        escaped +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    logger::error("[email:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  resp.body);
    return {}; // API error — silent fallback
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  // Parse and extract emails from the rendered HTML
  auto text = html::get_body_text(rendered_html);
  auto from_text = extract_emails(text);
  // Fast mailto extraction instead of HTML regex
  auto links = html::get_links(rendered_html);
  std::set<std::string> seen(from_text.begin(), from_text.end());
  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }
  return from_text;
}
// ── enrich_location ─────────────────────────────────────────────────────────
// Enrich a single location: fetch its website's metadata (phase 1), and if
// no email was found there, fetch contact-looking internal pages in parallel
// and extract emails from them (phase 2). Phase 2 tries plain HTTP first and
// falls back to Scrapeless when configured. Timing (meta_ms / email_ms /
// total_ms) and per-page diagnostics are recorded on the returned node.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg) {
  auto t0 = std::chrono::steady_clock::now();
  EnrichedNode node;
  node.title = loc.title;
  node.place_id = loc.place_id;
  node.website = loc.website;
  node.address = loc.address;
  node.type = loc.type;
  node.grid_area = loc.grid_area;
  node.grid_gid = loc.grid_gid;
  node.status = EnrichStatus::NO_EMAIL;
  // Without a website there is nothing to scrape.
  if (loc.website.empty()) {
    node.status = EnrichStatus::FETCH_ERROR;
    node.error = "no website";
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // ── Phase 1: Meta scrape ────────────────────────────────────────────────
  auto meta_t0 = std::chrono::steady_clock::now();
  SiteMeta meta;
  bool meta_timed_out = false;
  try {
    if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) {
      logger::debug("[meta:scrapeless] Fetching " + loc.website);
      meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key,
                                    cfg.meta_timeout_ms);
    } else {
      logger::debug("[meta:http] Fetching " + loc.website);
      meta = scrape_meta(loc.website, cfg.meta_timeout_ms);
    }
  } catch (...) {
    meta.fetch_error = "exception during meta scrape";
    meta_timed_out = true;
  }
  node.meta_ms = elapsed_ms(meta_t0);
  // Heuristic: treat a fetch that ran within 1s of the timeout as timed out.
  if (node.meta_ms >= cfg.meta_timeout_ms - 1000) {
    meta_timed_out = true;
  }
  if (!meta.body_text.empty())
    node.pages["home"] = meta.body_text;
  if (cfg.enable_homepage_md && !meta.body_html.empty()) {
    // Cap HTML body at 512 KB to prevent stack overflow in recursive html2md
    // parser
    static constexpr size_t MAX_HTML_BYTES = 512 * 1024;
    if (meta.body_html.size() > MAX_HTML_BYTES) {
      logger::warn("[" + loc.title + "] body_html too large (" +
                   std::to_string(meta.body_html.size() / 1024) +
                   " KB), skipping markdown conversion");
    } else {
      try {
        node.sites[loc.website] = html::to_markdown(meta.body_html);
      } catch (const std::exception &e) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown failed: " + e.what());
      } catch (...) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown crashed (unknown exception)");
      }
    }
  }
  node.meta_pages = meta.internal_pages;
  node.pages_found = static_cast<int>(meta.internal_pages.size());
  node.socials = meta.socials;
  if (!meta.fetch_error.empty()) {
    node.error = meta.fetch_error;
    node.status = EnrichStatus::FETCH_ERROR;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // If meta already found emails, we're done (early exit like TS)
  if (!meta.emails.empty()) {
    node.emails = meta.emails;
    node.status = EnrichStatus::OK;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  // ── Build contact page list ─────────────────────────────────────────────
  // Keep only internal pages whose URL matches a configured contact pattern.
  std::vector<std::string> contact_pages;
  std::set<std::string> seen_urls;
  for (auto &page_url : meta.internal_pages) {
    if (matches_contact_pattern(page_url, cfg.contact_patterns)) {
      if (seen_urls.insert(page_url).second) {
        contact_pages.push_back(page_url);
      }
    }
  }
  // No more probe paths. If we found 0 contact pages, we just give up or time
  // out.
  node.pages_found = static_cast<int>(contact_pages.size());
  if (contact_pages.empty()) {
    logger::debug("[" +
                  std::string(loc.title.empty() ? loc.website : loc.title) +
                  "] No contact pages found.");
    node.status =
        meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES;
    node.total_ms = elapsed_ms(t0);
    return node;
  }
  logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) +
                "] Contact pages to scrape: " +
                std::to_string(contact_pages.size()) + " (parallel)");
  // ── Phase 2: Email scrape per contact page ──────────────────────────────
  // Per-thread result slot; threads write disjoint indices of the
  // preallocated vector, so no synchronization is needed.
  struct AsyncResult {
    std::string url;
    std::vector<PageError> errors;
    std::vector<std::string> emails;
    int ms;
  };
  int pages_to_scrape =
      std::min(static_cast<int>(contact_pages.size()), cfg.email_max_pages);
  std::vector<std::thread> contact_threads;
  std::vector<AsyncResult> contact_results(pages_to_scrape);
  auto email_t0 = std::chrono::steady_clock::now();
  for (int i = 0; i < pages_to_scrape; ++i) {
    auto page_url = contact_pages[i];
    // cfg and loc are captured by value: each thread works on its own copy.
    contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() {
      auto start = std::chrono::steady_clock::now();
      AsyncResult res;
      res.url = page_url;
      PageError pe1;
      pe1.url = page_url;
      pe1.method = "GET";
      int http_status = 0;
      try {
        auto page_emails = scrape_emails_from_page(
            page_url, cfg.email_page_timeout_ms, http_status);
        pe1.emails = page_emails;
        logger::debug("[" +
                      std::string(loc.title.empty() ? loc.website : loc.title) +
                      "] HTTP fetch finished code " +
                      std::to_string(http_status) + " for " + page_url);
        if (page_emails.empty()) {
          if (http_status == 404 || http_status == 400 || http_status == 500) {
            pe1.status = "NOT_FOUND";
            pe1.error = "HTTP " + std::to_string(http_status);
          } else {
            // Plain GET yielded no emails — optionally retry via Scrapeless.
            pe1.status = "AXIOS_NO_EMAIL";
            res.errors.push_back(pe1); // pushed before scrapeless
            if (cfg.meta_scraper == "SCRAPELESS" &&
                !cfg.scrapeless_key.empty()) {
              PageError pe2;
              pe2.url = page_url;
              pe2.method = "SCRAPELESS";
              try {
                logger::debug("[email:scrapeless] Fallback scraping " +
                              page_url);
                auto s_emails =
                    scrape_emails_scrapeless(page_url, cfg.scrapeless_key,
                                             cfg.email_page_timeout_ms + 5000);
                pe2.emails = s_emails;
                pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL";
                if (!s_emails.empty())
                  res.emails = s_emails;
                logger::debug(
                    "[" +
                    std::string(loc.title.empty() ? loc.website : loc.title) +
                    "] Scrapeless fallback finished for " + page_url);
              } catch (...) {
                pe2.status = "FAILED";
                pe2.error = "scrapeless exception";
              }
              res.errors.push_back(pe2);
            }
            // Early return: pe1 was already recorded on this path.
            res.ms = elapsed_ms(start);
            contact_results[i] = res;
            return;
          }
        } else {
          pe1.status = "SEARCHED_EMAIL";
          res.emails = page_emails;
        }
      } catch (...) {
        pe1.status = "AXIOS_FAILED";
        pe1.error = "exception";
      }
      // Only insert pe1 if we didn't already push it during fallback
      // (defensive: the fallback path returns early above).
      if (res.errors.empty() || res.errors[0].method != "GET") {
        res.errors.insert(res.errors.begin(), pe1);
      }
      res.ms = elapsed_ms(start);
      contact_results[i] = res;
    });
  }
  for (auto &t : contact_threads) {
    if (t.joinable())
      t.join();
  }
  // Merge per-page results into the node (all threads joined at this point).
  std::set<std::string> all_emails;
  int pages_scraped = 0;
  for (auto &res : contact_results) {
    pages_scraped++;
    for (auto &pe : res.errors) {
      node.page_errors.push_back(std::move(pe));
    }
    for (auto &e : res.emails) {
      all_emails.insert(e);
    }
  }
  node.email_ms = elapsed_ms(email_t0);
  node.pages_scraped = pages_scraped;
  // Merge emails (set iteration yields them sorted and de-duplicated)
  node.emails.assign(all_emails.begin(), all_emails.end());
  // Final status — same 1s-before-timeout heuristic as the meta phase.
  bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000;
  if (!node.emails.empty()) {
    node.status = EnrichStatus::OK;
  } else if (email_timed_out) {
    node.status = EnrichStatus::EMAIL_TIMEOUT;
  } else if (meta_timed_out) {
    node.status = EnrichStatus::META_TIMEOUT;
  } else {
    node.status = EnrichStatus::NO_EMAIL;
  }
  node.total_ms = elapsed_ms(t0);
  return node;
}
} // namespace enrichers

View File

@ -0,0 +1,6 @@
# Static library that parses pre-cached GADM boundary GeoJSON files.
add_library(gadm_reader STATIC src/gadm_reader.cpp)
target_include_directories(gadm_reader PUBLIC include)
# Depends on geo (for Coord type) and json (for RapidJSON)
target_link_libraries(gadm_reader PUBLIC geo json)

View File

@ -0,0 +1,75 @@
#pragma once
#include "geo/geo.h"
#include <array>
#include <string>
#include <vector>
namespace gadm {
// ── Feature (mirrors TS GridFeature) ────────────────────────────────────────
// One parsed GADM administrative unit plus optional GHS enrichment.
struct Feature {
  std::string gid;  // e.g. "ABW", "AFG.1.1_1"
  std::string name; // e.g. "Aruba", "Baharak"
  int level = 0;    // GADM admin level (inferred from dot count in gid)
  // Outer ring + holes (MultiPolygon flattened to rings)
  std::vector<std::vector<geo::Coord>> rings;
  // Bounding box (computed from the first ring only — see parse_feature)
  geo::BBox bbox;
  // GHS enrichment (parsed from cached JSON)
  double ghsPopulation = 0;
  double ghsBuiltWeight = 0;
  double ghsPopMaxDensity = 0;
  double ghsBuiltMax = 0;
  geo::Coord ghsPopCenter;
  geo::Coord ghsBuiltCenter;
  // Weighted centers: [lon, lat, weight]
  std::vector<std::array<double, 3>> ghsPopCenters;
  std::vector<std::array<double, 3>> ghsBuiltCenters;
  // Computed from geometry (first ring only)
  double areaSqKm = 0;
  bool isOuter = true;
};
// ── Result ──────────────────────────────────────────────────────────────────
struct BoundaryResult {
  std::vector<Feature> features;
  std::string error; // empty on success
};
// ── API ─────────────────────────────────────────────────────────────────────
/// Load a pre-cached GADM boundary file.
///
/// Tries these file paths in order (matches the implementation):
///   1. cacheDir/{countryCode}/boundary_{gid}_{targetLevel}.json
///   2. cacheDir/boundary_{gid}_{targetLevel}.json   (flat legacy layout)
///
/// Returns a BoundaryResult with parsed features or an error string.
BoundaryResult load_boundary(
    const std::string& gid,
    int targetLevel,
    const std::string& cacheDir = "cache/gadm"
);
/// Load a boundary file directly by path.
BoundaryResult load_boundary_file(const std::string& filepath);
/// Extract the ISO country code from a GID (e.g. "AFG.1.1_1" → "AFG").
std::string country_code(const std::string& gid);
/// Infer the GADM level from a GID string (number of dots).
/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2, etc.
int infer_level(const std::string& gid);
} // namespace gadm

View File

@ -0,0 +1,231 @@
#include "gadm_reader/gadm_reader.h"
#include <algorithm>
#include <fstream>
#include <sstream>
#include <rapidjson/document.h>
namespace gadm {
// ── Helpers ─────────────────────────────────────────────────────────────────
// "AFG.1.1_1" → "AFG": everything before the first dot, or the whole GID
// when there is no dot (country-level GIDs).
std::string country_code(const std::string& gid) {
  const auto first_dot = gid.find('.');
  if (first_dot == std::string::npos) return gid;
  return gid.substr(0, first_dot);
}
// GADM level equals the number of dots in the GID:
// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2.
int infer_level(const std::string& gid) {
  return static_cast<int>(std::count(gid.begin(), gid.end(), '.'));
}
// Slurp an entire file (binary mode) into a string; "" when it cannot be
// opened. Note: an empty file and an unreadable file are indistinguishable.
static std::string read_file(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  if (!in.is_open()) return "";
  std::ostringstream buffer;
  buffer << in.rdbuf();
  return buffer.str();
}
/// Convert a JSON [lon, lat] array into a geo::Coord; default-constructed
/// Coord on malformed input.
static geo::Coord parse_coord(const rapidjson::Value& arr) {
  if (!arr.IsArray() || arr.Size() < 2) return {};
  return {arr[0].GetDouble(), arr[1].GetDouble()};
}
/// Convert a JSON ring [[lon,lat], [lon,lat], ...] into a vector of Coords;
/// empty vector when the value is not an array.
static std::vector<geo::Coord> parse_ring(const rapidjson::Value& arr) {
  std::vector<geo::Coord> out;
  if (arr.IsArray()) {
    out.reserve(arr.Size());
    for (auto it = arr.Begin(); it != arr.End(); ++it) {
      out.push_back(parse_coord(*it));
    }
  }
  return out;
}
/// Convert [[lon, lat, weight], ...] into a list of 3-element arrays;
/// malformed entries are silently skipped.
static std::vector<std::array<double, 3>> parse_weighted_centers(
    const rapidjson::Value& arr) {
  std::vector<std::array<double, 3>> out;
  if (!arr.IsArray()) return out;
  out.reserve(arr.Size());
  for (auto it = arr.Begin(); it != arr.End(); ++it) {
    const auto& entry = *it;
    if (!entry.IsArray() || entry.Size() < 3) continue;
    out.push_back(
        {entry[0].GetDouble(), entry[1].GetDouble(), entry[2].GetDouble()});
  }
  return out;
}
/// Numeric property lookup: returns props[key] when present and numeric,
/// otherwise `fallback`.
static double get_double(const rapidjson::Value& props, const char* key,
                         double fallback = 0.0) {
  const bool present = props.HasMember(key) && props[key].IsNumber();
  return present ? props[key].GetDouble() : fallback;
}
/// Boolean property lookup: returns props[key] when present and boolean,
/// otherwise `fallback` (defaults to true).
static bool get_bool(const rapidjson::Value& props, const char* key,
                     bool fallback = true) {
  const bool present = props.HasMember(key) && props[key].IsBool();
  return present ? props[key].GetBool() : fallback;
}
/// Return the most specific GID available, trying GID_5 down to GID_0;
/// "" when none is present.
static std::string get_gid(const rapidjson::Value& props) {
  for (int lvl = 5; lvl >= 0; --lvl) {
    const std::string key = "GID_" + std::to_string(lvl);
    const char* k = key.c_str();
    if (props.HasMember(k) && props[k].IsString()) {
      return props[k].GetString();
    }
  }
  return "";
}
/// Return the most specific name available, trying NAME_5 down to NAME_0;
/// "" when none is present.
static std::string get_name(const rapidjson::Value& props) {
  for (int lvl = 5; lvl >= 0; --lvl) {
    const std::string key = "NAME_" + std::to_string(lvl);
    const char* k = key.c_str();
    if (props.HasMember(k) && props[k].IsString()) {
      return props[k].GetString();
    }
  }
  return "";
}
/// Parse a single GeoJSON Feature object into a gadm::Feature.
/// Reads GID/name/GHS stats from "properties", flattens Polygon or
/// MultiPolygon rings, and computes bbox/area from the FIRST ring only
/// (holes and additional polygons do not affect bbox or areaSqKm).
static Feature parse_feature(const rapidjson::Value& feat) {
  Feature f;
  // Properties
  if (feat.HasMember("properties") && feat["properties"].IsObject()) {
    const auto& props = feat["properties"];
    f.gid = get_gid(props);
    f.name = get_name(props);
    f.level = infer_level(f.gid);
    f.ghsPopulation = get_double(props, "ghsPopulation");
    f.ghsBuiltWeight = get_double(props, "ghsBuiltWeight");
    f.ghsPopMaxDensity = get_double(props, "ghsPopMaxDensity");
    f.ghsBuiltMax = get_double(props, "ghsBuiltMax");
    f.isOuter = get_bool(props, "isOuter");
    if (props.HasMember("ghsPopCenter") && props["ghsPopCenter"].IsArray()) {
      f.ghsPopCenter = parse_coord(props["ghsPopCenter"]);
    }
    if (props.HasMember("ghsBuiltCenter") && props["ghsBuiltCenter"].IsArray()) {
      f.ghsBuiltCenter = parse_coord(props["ghsBuiltCenter"]);
    }
    if (props.HasMember("ghsPopCenters") && props["ghsPopCenters"].IsArray()) {
      f.ghsPopCenters = parse_weighted_centers(props["ghsPopCenters"]);
    }
    if (props.HasMember("ghsBuiltCenters") && props["ghsBuiltCenters"].IsArray()) {
      f.ghsBuiltCenters = parse_weighted_centers(props["ghsBuiltCenters"]);
    }
  }
  // Geometry — both Polygon and MultiPolygon are flattened into f.rings.
  if (feat.HasMember("geometry") && feat["geometry"].IsObject()) {
    const auto& geom = feat["geometry"];
    std::string gtype;
    if (geom.HasMember("type") && geom["type"].IsString()) {
      gtype = geom["type"].GetString();
    }
    if (geom.HasMember("coordinates") && geom["coordinates"].IsArray()) {
      const auto& coords = geom["coordinates"];
      if (gtype == "Polygon") {
        // coordinates: [ [ring], [hole], ... ]
        for (rapidjson::SizeType r = 0; r < coords.Size(); ++r) {
          f.rings.push_back(parse_ring(coords[r]));
        }
      } else if (gtype == "MultiPolygon") {
        // coordinates: [ [ [ring], [hole] ], [ [ring] ], ... ]
        for (rapidjson::SizeType p = 0; p < coords.Size(); ++p) {
          if (coords[p].IsArray()) {
            for (rapidjson::SizeType r = 0; r < coords[p].Size(); ++r) {
              f.rings.push_back(parse_ring(coords[p][r]));
            }
          }
        }
      }
    }
  }
  // Compute bbox and area from first ring (outer boundary)
  if (!f.rings.empty() && !f.rings[0].empty()) {
    f.bbox = geo::bbox(f.rings[0]);
    f.areaSqKm = geo::area_sq_km(f.rings[0]);
  }
  return f;
}
// ── Public API ──────────────────────────────────────────────────────────────
// Load and parse a boundary GeoJSON file: read → parse → validate the
// "features" array → convert each entry. On any failure, `error` is set and
// `features` stays empty.
BoundaryResult load_boundary_file(const std::string& filepath) {
  BoundaryResult out;
  const std::string raw = read_file(filepath);
  if (raw.empty()) {
    out.error = "Failed to read file: " + filepath;
    return out;
  }
  rapidjson::Document doc;
  doc.Parse(raw.c_str());
  if (doc.HasParseError()) {
    out.error = "JSON parse error in: " + filepath;
    return out;
  }
  // Expect a FeatureCollection
  if (!doc.HasMember("features") || !doc["features"].IsArray()) {
    out.error = "Missing 'features' array in: " + filepath;
    return out;
  }
  const auto& feats = doc["features"];
  out.features.reserve(feats.Size());
  for (auto it = feats.Begin(); it != feats.End(); ++it) {
    out.features.push_back(parse_feature(*it));
  }
  return out;
}
// Resolve a boundary file for `gid` at `targetLevel`: first the per-country
// subdirectory layout, then the flat legacy layout; a combined error when
// neither exists.
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
                             const std::string& cacheDir) {
  const std::string cc = country_code(gid);
  const std::string filename =
      "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";
  const std::string candidates[] = {
      cacheDir + "/" + cc + "/" + filename, // cacheDir/{cc}/boundary_...
      cacheDir + "/" + filename,            // flat fallback
  };
  for (const auto& path : candidates) {
    auto result = load_boundary_file(path);
    if (result.error.empty()) return result;
  }
  BoundaryResult result;
  result.error = "No boundary file found for gid=" + gid +
                 " level=" + std::to_string(targetLevel) + " in " + cacheDir;
  return result;
}
} // namespace gadm

View File

@ -0,0 +1,5 @@
# Pure-math geometry helpers (distance, bbox, centroid, grids).
add_library(geo STATIC src/geo.cpp)
target_include_directories(geo PUBLIC include)
# No external dependencies — pure math (standard library only)

View File

@ -0,0 +1,100 @@
#pragma once
#include <array>
#include <cmath>
#include <vector>
namespace geo {
// ── Constants ───────────────────────────────────────────────────────────────
constexpr double EARTH_RADIUS_KM = 6371.0; // mean Earth radius
constexpr double PI = 3.14159265358979323846;
constexpr double DEG2RAD = PI / 180.0;
constexpr double RAD2DEG = 180.0 / PI;
// ── Core types ──────────────────────────────────────────────────────────────
// WGS84 point, degrees. Note the (lon, lat) member order — matches GeoJSON.
struct Coord {
  double lon = 0;
  double lat = 0;
};
// Axis-aligned box in degrees.
// NOTE(review): no antimeridian handling — a ring crossing ±180° produces a
// world-spanning box; confirm inputs avoid this.
struct BBox {
  double minLon = 0;
  double minLat = 0;
  double maxLon = 0;
  double maxLat = 0;
  Coord center() const {
    return {(minLon + maxLon) / 2.0, (minLat + maxLat) / 2.0};
  }
  double width_deg() const { return maxLon - minLon; }
  double height_deg() const { return maxLat - minLat; }
};
// ── Distance ────────────────────────────────────────────────────────────────
/// Haversine distance between two WGS84 points, in kilometers.
double distance_km(Coord a, Coord b);
/// Haversine distance in meters.
inline double distance_m(Coord a, Coord b) { return distance_km(a, b) * 1000.0; }
// ── Bounding box ────────────────────────────────────────────────────────────
/// Compute the bounding box of a polygon ring.
BBox bbox(const std::vector<Coord>& ring);
/// Compute the bounding box that covers all features' rings.
BBox bbox_union(const std::vector<BBox>& boxes);
// ── Centroid ────────────────────────────────────────────────────────────────
/// Geometric centroid of a polygon ring (simple vertex-average method,
/// not the area-weighted centroid).
Coord centroid(const std::vector<Coord>& ring);
// ── Area ────────────────────────────────────────────────────────────────────
/// Approximate area of a polygon ring in square meters.
/// Uses the Shoelace formula with latitude cosine correction.
double area_sq_m(const std::vector<Coord>& ring);
/// Area in square kilometers.
inline double area_sq_km(const std::vector<Coord>& ring) {
  return area_sq_m(ring) / 1e6;
}
// ── Point-in-polygon ────────────────────────────────────────────────────────
/// Ray-casting point-in-polygon test.
/// Same algorithm as gadm/cpp pip.h but using Coord structs.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);
// ── Bearing & destination ───────────────────────────────────────────────────
/// Initial bearing from a to b, in degrees (0 = north, 90 = east).
double bearing_deg(Coord from, Coord to);
/// Compute the destination point given start, bearing (degrees), and distance (km).
Coord destination(Coord from, double bearing_deg, double distance_km);
// ── Grid tessellation ───────────────────────────────────────────────────────
/// Generate a flat square grid of cell centers over a bbox.
/// cellSizeKm defines the side length of each square cell.
/// Returns center coordinates of each cell.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm);
/// Generate a flat hex grid of cell centers over a bbox.
/// cellSizeKm defines the distance between hex centers.
/// Returns center coordinates of each cell.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm);
// ── Viewport estimation (matches TS estimateViewportAreaSqKm) ──────────────
/// Estimate the km² visible in a viewport at a given lat/zoom.
double estimate_viewport_sq_km(double lat, int zoom,
                               int widthPx = 1024, int heightPx = 768);
} // namespace geo

View File

@ -0,0 +1,204 @@
#include "geo/geo.h"
#include <algorithm>
#include <cmath>
namespace geo {
// ── Distance (Haversine) ────────────────────────────────────────────────────
// ── Distance (Haversine) ────────────────────────────────────────────────────
// Great-circle distance between two WGS84 points in kilometers.
double distance_km(Coord a, Coord b) {
  const double phi1 = a.lat * DEG2RAD;
  const double phi2 = b.lat * DEG2RAD;
  const double half_dphi = (b.lat - a.lat) * DEG2RAD / 2.0;
  const double half_dlmb = (b.lon - a.lon) * DEG2RAD / 2.0;
  const double s_phi = std::sin(half_dphi);
  const double s_lmb = std::sin(half_dlmb);
  const double h =
      s_phi * s_phi + std::cos(phi1) * std::cos(phi2) * s_lmb * s_lmb;
  return 2.0 * EARTH_RADIUS_KM * std::asin(std::sqrt(h));
}
// ── Bounding box ────────────────────────────────────────────────────────────
/// Axis-aligned bounding box of a ring of coordinates.
/// Returns a default-constructed BBox for an empty ring.
BBox bbox(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  const Coord& head = ring.front();
  BBox box{head.lon, head.lat, head.lon, head.lat};
  for (const Coord& c : ring) {
    if (c.lon < box.minLon) box.minLon = c.lon;
    if (c.lat < box.minLat) box.minLat = c.lat;
    if (c.lon > box.maxLon) box.maxLon = c.lon;
    if (c.lat > box.maxLat) box.maxLat = c.lat;
  }
  return box;
}
/// Smallest bounding box enclosing every box in `boxes`.
/// Returns a default-constructed BBox for an empty input.
BBox bbox_union(const std::vector<BBox>& boxes) {
  if (boxes.empty()) return {};
  BBox merged = boxes.front();
  for (const BBox& b : boxes) {
    merged.minLon = std::min(merged.minLon, b.minLon);
    merged.minLat = std::min(merged.minLat, b.minLat);
    merged.maxLon = std::max(merged.maxLon, b.maxLon);
    merged.maxLat = std::max(merged.maxLat, b.maxLat);
  }
  return merged;
}
// ── Centroid ────────────────────────────────────────────────────────────────
/// Arithmetic mean of the ring's vertices (simple centroid).
/// If the ring is closed (first vertex repeated at the end), the duplicate
/// closing vertex is excluded so it is not double-counted.
Coord centroid(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  size_t count = ring.size();
  const bool closed = count > 1 &&
                      ring.front().lon == ring.back().lon &&
                      ring.front().lat == ring.back().lat;
  if (closed) --count;
  double lonSum = 0.0, latSum = 0.0;
  for (size_t k = 0; k < count; ++k) {
    lonSum += ring[k].lon;
    latSum += ring[k].lat;
  }
  const double denom = static_cast<double>(count);
  return {lonSum / denom, latSum / denom};
}
// ── Area (Shoelace + latitude cosine correction) ────────────────────────────
double area_sq_m(const std::vector<Coord>& ring) {
if (ring.size() < 3) return 0.0;
// Shoelace formula in projected coordinates.
// Each degree of longitude = cos(lat) * 111320 meters at that latitude.
// Each degree of latitude = 110540 meters (approximate).
double sum = 0.0;
size_t n = ring.size();
for (size_t i = 0; i < n; ++i) {
size_t j = (i + 1) % n;
// Convert coordinates to approximate meters using the average latitude
double avgLat = (ring[i].lat + ring[j].lat) / 2.0;
double cosLat = std::cos(avgLat * DEG2RAD);
double x_i = ring[i].lon * cosLat * 111320.0;
double y_i = ring[i].lat * 110540.0;
double x_j = ring[j].lon * cosLat * 111320.0;
double y_j = ring[j].lat * 110540.0;
sum += x_i * y_j - x_j * y_i;
}
return std::abs(sum) / 2.0;
}
// ── Point-in-polygon (ray casting) ──────────────────────────────────────────
/// Ray-casting point-in-polygon test: cast a horizontal ray from `pt` and
/// toggle the parity at every edge crossing. Points lying exactly on an
/// edge may be classified on either side.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring) {
  const size_t n = ring.size();
  bool odd = false;
  size_t prev = n - 1;
  for (size_t cur = 0; cur < n; prev = cur++) {
    const double xi = ring[cur].lon, yi = ring[cur].lat;
    const double xj = ring[prev].lon, yj = ring[prev].lat;
    // Edge straddles the ray's latitude, and the crossing is right of pt.
    if (((yi > pt.lat) != (yj > pt.lat)) &&
        (pt.lon < (xj - xi) * (pt.lat - yi) / (yj - yi) + xi)) {
      odd = !odd;
    }
  }
  return odd;
}
// ── Bearing ─────────────────────────────────────────────────────────────────
/// Initial great-circle bearing from `from` to `to`, degrees in [0, 360).
double bearing_deg(Coord from, Coord to) {
  const double dLon = (to.lon - from.lon) * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double phi2 = to.lat * DEG2RAD;
  const double y = std::sin(dLon) * std::cos(phi2);
  const double x = std::cos(phi1) * std::sin(phi2) -
                   std::sin(phi1) * std::cos(phi2) * std::cos(dLon);
  // atan2 yields (-180, 180]; shift into [0, 360).
  return std::fmod(std::atan2(y, x) * RAD2DEG + 360.0, 360.0);
}
// ── Destination point ───────────────────────────────────────────────────────
/// Destination point reached from `from` along bearing `brng_deg` (degrees)
/// after travelling `dist_km` kilometres on a spherical Earth.
Coord destination(Coord from, double brng_deg, double dist_km) {
  const double theta = brng_deg * DEG2RAD;         // bearing, radians
  const double phi1 = from.lat * DEG2RAD;
  const double lam1 = from.lon * DEG2RAD;
  const double delta = dist_km / EARTH_RADIUS_KM;  // angular distance
  const double phi2 = std::asin(std::sin(phi1) * std::cos(delta) +
                                std::cos(phi1) * std::sin(delta) * std::cos(theta));
  const double lam2 = lam1 + std::atan2(
      std::sin(theta) * std::sin(delta) * std::cos(phi1),
      std::cos(delta) - std::sin(phi1) * std::sin(phi2));
  return {lam2 * RAD2DEG, phi2 * RAD2DEG};
}
// ── Square grid ─────────────────────────────────────────────────────────────
/// Centers of a flat square grid covering `extent`; cellSizeKm is the side
/// length of each cell. Returns an empty vector for non-positive sizes.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;
  // Degree spans for one cell, corrected for longitude shrink at the
  // extent's mid latitude (clamped near the poles to avoid dividing by ~0).
  const double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;
  const double stepLat = cellSizeKm / 110.574;            // ~110.574 km per degree lat
  const double stepLon = cellSizeKm / (111.320 * cosLat); // longitude correction
  double lat = extent.minLat + stepLat / 2.0;
  while (lat < extent.maxLat) {
    double lon = extent.minLon + stepLon / 2.0;
    while (lon < extent.maxLon) {
      centers.push_back({lon, lat});
      lon += stepLon;
    }
    lat += stepLat;
  }
  return centers;
}
// ── Hex grid ────────────────────────────────────────────────────────────────
/// Centers of a flat hex grid covering `extent`; cellSizeKm is the spacing
/// between hex centers. Odd rows shift half a cell to interleave; rows are
/// cellSize·√3/2 apart. Returns an empty vector for non-positive sizes.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;
  const double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10;
  const double stepLat = cellSizeKm / 110.574;
  const double stepLon = cellSizeKm / (111.320 * cosLat);
  const double rowStep = stepLat * std::sqrt(3.0) / 2.0;
  int rowIdx = 0;
  double lat = extent.minLat + rowStep / 2.0;
  while (lat < extent.maxLat) {
    const double shift = (rowIdx % 2 == 1) ? stepLon / 2.0 : 0.0;
    double lon = extent.minLon + stepLon / 2.0 + shift;
    while (lon < extent.maxLon) {
      centers.push_back({lon, lat});
      lon += stepLon;
    }
    ++rowIdx;
    lat += rowStep;
  }
  return centers;
}
// ── Viewport estimation ─────────────────────────────────────────────────────
/// Estimate visible km² for a viewport of widthPx × heightPx at a given
/// lat/zoom (matches TS estimateViewportAreaSqKm).
double estimate_viewport_sq_km(double lat, int zoom, int widthPx, int heightPx) {
  // 156543.03392 m/px is the web-mercator ground resolution at zoom 0.
  const double mPerPx =
      (156543.03392 * std::cos(lat * DEG2RAD)) / std::pow(2.0, zoom);
  const double wKm = (widthPx * mPerPx) / 1000.0;
  const double hKm = (heightPx * mPerPx) / 1000.0;
  return wKm * hKm;
}
} // namespace geo

View File

@ -0,0 +1,6 @@
# Grid waypoint generation library.
add_library(grid STATIC src/grid.cpp)
# Anchor the include path to this directory so the target stays correct when
# this file is pulled in from another directory scope (consistent with the
# html package, which uses ${CMAKE_CURRENT_SOURCE_DIR}/include).
target_include_directories(grid PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
# Depends on geo (math) and gadm_reader (Feature type)
target_link_libraries(grid PUBLIC geo gadm_reader)

View File

@ -0,0 +1,56 @@
#pragma once
#include "geo/geo.h"
#include "gadm_reader/gadm_reader.h"
#include <functional>
#include <string>
#include <vector>
namespace grid {
// ── Types (mirror TS GridSearchHop) ─────────────────────────────────────────
/// One search hop: an ordered cell center plus its covering radius.
struct Waypoint {
  int step = 0;          // 1-based position in the final path (renumbered after sorting)
  double lng = 0;        // longitude, degrees (generators round to 6 decimals)
  double lat = 0;        // latitude, degrees (generators round to 6 decimals)
  double radius_km = 0;  // covering radius, km (generators round to 2 decimals)
  std::string area_gid;  // GID of the GADM feature this hop belongs to
  std::string area_name; // display name of that feature
};
/// Knobs controlling grid generation (mirrors the TS options object).
struct GridOptions {
  std::string gridMode = "hex"; // "hex", "square", "admin", "centers"
  double cellSize = 5.0; // km; cell side ("square") / center spacing ("hex")
  double cellOverlap = 0.0; // NOTE(review): not read by the grid.cpp code visible here — confirm
  double centroidOverlap = 0.5; // "centers" mode: fraction of cellSize centers may overlap
  int maxCellsLimit = 15000; // abort when the estimated cell count exceeds this
  double maxElevation = 0; // NOTE(review): not read by the grid.cpp code visible here — confirm
  double minDensity = 0; // NOTE(review): not read by the grid.cpp code visible here — confirm
  double minGhsPop = 0; // GHS population threshold; 0 disables the check
  double minGhsBuilt = 0; // GHS built-up threshold; 0 disables the check
  std::string ghsFilterMode = "AND"; // "AND" | "OR" — how the two GHS checks combine
  bool allowMissingGhs = false; // let cells with a zero (missing) GHS value pass
  bool bypassFilters = false; // skip all filtering
  std::string pathOrder = "snake"; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
  bool groupByRegion = true; // sort each region's waypoints independently
};
/// Output of generate(): the ordered waypoint path plus bookkeeping counters.
struct GridResult {
  std::vector<Waypoint> waypoints; // final sorted, renumbered path
  int validCells = 0;   // cells that passed the filters
  int skippedCells = 0; // cells rejected by filters / overlap checks
  std::string error;    // non-empty when generation aborted (e.g. grid too large)
};
// ── API ─────────────────────────────────────────────────────────────────────
/// Generate grid waypoints from GADM features + options.
/// This is the main entry point — equivalent to generateGridSearchCells() in TS.
/// On failure the returned GridResult carries a non-empty `error` string.
GridResult generate(
  const std::vector<gadm::Feature>& features,
  const GridOptions& opts
);
} // namespace grid

View File

@ -0,0 +1,393 @@
#include "grid/grid.h"

#include <algorithm>
#include <array>    // std::array — used by generate_centers' centersMap values
#include <cmath>
#include <cstdio>   // snprintf — used to build rounded center keys
#include <map>
#include <unordered_map>
namespace grid {
// ── Internal types ──────────────────────────────────────────────────────────
/// Per-cell scratch record.
/// NOTE(review): not referenced by any function visible in this file —
/// possibly left over from an earlier design; confirm before removing.
struct CellInfo {
  geo::Coord center;  // cell center (lon/lat)
  double radius_km;   // covering radius, km
  int region_idx;     // index into the input features vector
  bool allowed;       // passed the filters?
  std::string reason; // skip reason when !allowed
};
// ── Filter logic (mirrors checkCellFilters in TS) ───────────────────────────
/// Decide whether a feature's cells pass the GHS population / built-up
/// filters (mirrors checkCellFilters in TS). Returns true on pass; on
/// failure writes a human-readable skip reason into `reason` and returns
/// false. A check is "active" only when its threshold is > 0.
/// NOTE(review): `areaSqKm` and opts.maxElevation/minDensity are not used
/// here — presumably handled elsewhere or not yet ported; confirm.
static bool check_filters(const gadm::Feature& feat, const GridOptions& opts,
                          double areaSqKm, std::string& reason) {
  if (opts.bypassFilters) return true;
  // GHS filter
  bool checkPop = opts.minGhsPop > 0;
  bool checkBuilt = opts.minGhsBuilt > 0;
  if (checkPop || checkBuilt) {
    double ghsPop = feat.ghsPopulation;
    double ghsBuilt = feat.ghsBuiltWeight;
    // A zero GHS value is treated as "missing" and passes when allowMissingGhs.
    bool popPass = checkPop && ((ghsPop == 0 && opts.allowMissingGhs) || ghsPop >= opts.minGhsPop);
    bool builtPass = checkBuilt && ((ghsBuilt == 0 && opts.allowMissingGhs) || ghsBuilt >= opts.minGhsBuilt);
    if (opts.ghsFilterMode == "OR") {
      // OR mode: fail only when every active check fails.
      if (checkPop && checkBuilt && !popPass && !builtPass) {
        reason = "GHS (OR) below thresholds";
        return false;
      } else if (checkPop && !checkBuilt && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      } else if (checkBuilt && !checkPop && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    } else {
      // AND mode (default): every active check must pass.
      if (checkPop && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      }
      if (checkBuilt && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    }
  }
  return true;
}
// ── Sorting ─────────────────────────────────────────────────────────────────
/// Order waypoints in-place according to `pathOrder`:
/// - "zigzag"/"snake": row-major north→south, west→east; "snake" additionally
///   reverses every other row to avoid long return legs.
/// - "spiral-out"/"spiral-in": ranked by distance from the waypoints' mean point.
/// - "shortest": greedy nearest-neighbour tour starting at the first waypoint.
/// `cellSize` (km) sets the latitude tolerance used to bucket rows.
static void sort_waypoints(std::vector<Waypoint>& wps, const std::string& pathOrder,
                           double cellSize) {
  if (wps.size() <= 1) return;
  // Half a cell height in degrees, capped at 0.5°; waypoints within this
  // latitude band count as the same row.
  double rowTolerance = std::min((cellSize / 111.32) * 0.5, 0.5);
  if (pathOrder == "zigzag" || pathOrder == "snake") {
    // Sort top-to-bottom, left-to-right within row tolerance
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      if (std::abs(a.lat - b.lat) > rowTolerance) {
        return b.lat < a.lat; // higher lat first (north to south)
      }
      return a.lng < b.lng; // left to right
    });
    if (pathOrder == "snake") {
      // Group into rows, reverse every other row
      std::vector<std::vector<Waypoint>> rows;
      std::vector<Waypoint> currentRow;
      double lastY = wps[0].lat;
      for (auto& wp : wps) {
        if (std::abs(wp.lat - lastY) > rowTolerance) {
          // A new row starts: flush the accumulated one.
          rows.push_back(std::move(currentRow));
          currentRow.clear();
          lastY = wp.lat;
        }
        currentRow.push_back(wp);
      }
      if (!currentRow.empty()) rows.push_back(std::move(currentRow));
      // Rebuild wps row by row, reversing odd rows.
      wps.clear();
      for (size_t i = 0; i < rows.size(); ++i) {
        if (i % 2 == 1) std::reverse(rows[i].begin(), rows[i].end());
        for (auto& wp : rows[i]) wps.push_back(std::move(wp));
      }
    }
  } else if (pathOrder == "spiral-out" || pathOrder == "spiral-in") {
    // Sort by distance from center of all waypoints
    double cLon = 0, cLat = 0;
    for (const auto& wp : wps) { cLon += wp.lng; cLat += wp.lat; }
    cLon /= wps.size();
    cLat /= wps.size();
    geo::Coord center{cLon, cLat};
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      double dA = geo::distance_km(center, {a.lng, a.lat});
      double dB = geo::distance_km(center, {b.lng, b.lat});
      return (pathOrder == "spiral-out") ? (dA < dB) : (dA > dB);
    });
  } else if (pathOrder == "shortest") {
    // Greedy nearest-neighbor
    std::vector<Waypoint> sorted;
    sorted.reserve(wps.size());
    std::vector<bool> used(wps.size(), false);
    sorted.push_back(wps[0]);
    used[0] = true;
    for (size_t step = 1; step < wps.size(); ++step) {
      const auto& cur = sorted.back();
      double bestDist = 1e18;
      size_t bestIdx = 0;
      for (size_t i = 0; i < wps.size(); ++i) {
        if (used[i]) continue;
        // Squared degree-space distance is enough for ranking neighbours.
        double dx = wps[i].lng - cur.lng;
        double dy = wps[i].lat - cur.lat;
        double distSq = dx * dx + dy * dy;
        if (distSq < bestDist) {
          bestDist = distSq;
          bestIdx = i;
        }
      }
      sorted.push_back(wps[bestIdx]);
      used[bestIdx] = true;
    }
    wps = std::move(sorted);
  }
}
// ── Admin mode ──────────────────────────────────────────────────────────────
/// One waypoint per admin feature: its first-ring centroid, with a radius
/// reaching the feature's bbox corner. Features with no rings are ignored;
/// features failing the filters are counted as skipped.
static GridResult generate_admin(const std::vector<gadm::Feature>& features,
                                 const GridOptions& opts) {
  GridResult out;
  for (const auto& feat : features) {
    if (feat.rings.empty() || feat.rings[0].empty()) continue;
    std::string why;
    if (!check_filters(feat, opts, feat.areaSqKm, why)) {
      out.skippedCells++;
      continue;
    }
    const geo::Coord c = geo::centroid(feat.rings[0]);
    // Radius = centroid-to-bbox-corner distance so the circle covers the bbox.
    const double rKm = geo::distance_km(c, {feat.bbox.maxLon, feat.bbox.maxLat});
    Waypoint wp;
    wp.step = static_cast<int>(out.waypoints.size() + 1);
    wp.lng = std::round(c.lon * 1e6) / 1e6;          // 6-decimal rounding
    wp.lat = std::round(c.lat * 1e6) / 1e6;
    wp.radius_km = std::round(rKm * 100.0) / 100.0;  // 2-decimal rounding
    wp.area_gid = feat.gid;
    wp.area_name = feat.name;
    out.waypoints.push_back(std::move(wp));
    out.validCells++;
  }
  return out;
}
// ── Centers mode ────────────────────────────────────────────────────────────
/// One waypoint per distinct GHS center (population / built-up) across all
/// features. Centers are deduplicated by a 5-decimal lon/lat key, filtered
/// through check_filters, and rejected when closer than
/// cellSize * (1 - centroidOverlap) km to an already-accepted center.
static GridResult generate_centers(const std::vector<gadm::Feature>& features,
                                   const GridOptions& opts) {
  GridResult res;
  struct AcceptedCenter {
    geo::Coord coord;
  };
  std::vector<AcceptedCenter> accepted;
  // Minimum spacing between accepted centers, km.
  double minAllowedDist = opts.cellSize * (1.0 - opts.centroidOverlap);
  for (size_t i = 0; i < features.size(); ++i) {
    const auto& f = features[i];
    // Collect unique centers by rounding to 5 decimal places
    std::map<std::string, std::array<double, 3>> centersMap; // key → [lon, lat, weight]
    auto addCenter = [&](double lon, double lat, double weight) {
      char key[32];
      snprintf(key, sizeof(key), "%.5f,%.5f", lon, lat);
      std::string k(key);
      // First writer wins: later duplicates of the same rounded key are dropped.
      if (centersMap.find(k) == centersMap.end()) {
        centersMap[k] = {lon, lat, weight};
      }
    };
    // Single pop/built centers — (0, 0) is treated as "absent".
    if (f.ghsPopCenter.lon != 0 || f.ghsPopCenter.lat != 0) {
      addCenter(f.ghsPopCenter.lon, f.ghsPopCenter.lat, f.ghsPopulation);
    }
    if (f.ghsBuiltCenter.lon != 0 || f.ghsBuiltCenter.lat != 0) {
      addCenter(f.ghsBuiltCenter.lon, f.ghsBuiltCenter.lat, f.ghsBuiltWeight);
    }
    // Weighted center arrays ([lon, lat, weight] triples).
    for (const auto& c : f.ghsPopCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& c : f.ghsBuiltCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& [key, val] : centersMap) {
      geo::Coord pt{val[0], val[1]};
      std::string reason;
      // For centers, use the feature's overall filters
      bool allowed = check_filters(f, opts, f.areaSqKm, reason);
      // Check overlap with already-accepted centers
      if (allowed && !accepted.empty()) {
        for (const auto& ac : accepted) {
          double dist = geo::distance_km(pt, ac.coord);
          if (dist < minAllowedDist) {
            allowed = false;
            reason = "overlaps another centroid";
            break;
          }
        }
      }
      if (allowed) {
        accepted.push_back({pt});
        // Waypoint radius = half a cell; coordinates rounded to 6 decimals.
        res.waypoints.push_back({
          static_cast<int>(res.waypoints.size() + 1),
          std::round(pt.lon * 1e6) / 1e6,
          std::round(pt.lat * 1e6) / 1e6,
          std::round((opts.cellSize / 2.0) * 100.0) / 100.0,
          f.gid,
          f.name
        });
        res.validCells++;
      } else {
        res.skippedCells++;
      }
    }
  }
  return res;
}
// ── Polygon grid mode (hex / square) ────────────────────────────────────────
/// Tessellate the union bbox of all features with a hex or square grid
/// ("square" selects square; anything else falls back to hex), keep only
/// cell centers that fall inside some feature, then filter per feature.
/// Aborts with res.error when the estimated cell count exceeds maxCellsLimit.
/// NOTE(review): only rings[0] of each feature is tested — additional rings
/// (holes / extra polygons) are ignored here; confirm against the TS source.
static GridResult generate_polygon_grid(const std::vector<gadm::Feature>& features,
                                        const GridOptions& opts) {
  GridResult res;
  // Compute union bbox of all features
  std::vector<geo::BBox> boxes;
  for (const auto& f : features) {
    if (!f.rings.empty()) boxes.push_back(f.bbox);
  }
  if (boxes.empty()) return res;
  geo::BBox extent = geo::bbox_union(boxes);
  // Estimate cell count to prevent runaway
  double widthKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.maxLon, extent.minLat});
  double heightKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.minLon, extent.maxLat});
  // NOTE(review): the 2.6 factor inflates the per-cell area for the estimate;
  // presumably tuned to match the TS implementation — confirm.
  double approxCellArea = opts.cellSize * opts.cellSize * 2.6;
  int approxCells = static_cast<int>(std::ceil((widthKm * heightKm) / approxCellArea));
  if (approxCells > opts.maxCellsLimit) {
    res.error = "Grid too massive (~" + std::to_string(approxCells) + " cells). Increase cell size or select smaller region.";
    return res;
  }
  // Generate grid centers
  std::vector<geo::Coord> gridCenters;
  if (opts.gridMode == "square") {
    gridCenters = geo::square_grid(extent, opts.cellSize);
  } else {
    gridCenters = geo::hex_grid(extent, opts.cellSize);
  }
  // For each grid center, check if it intersects any feature polygon
  for (const auto& gc : gridCenters) {
    bool intersects = false;
    int regionIdx = -1;
    for (size_t i = 0; i < features.size(); ++i) {
      if (features[i].rings.empty()) continue;
      // First containing feature claims the cell.
      if (geo::point_in_polygon(gc, features[i].rings[0])) {
        intersects = true;
        regionIdx = static_cast<int>(i);
        break;
      }
    }
    if (!intersects) continue;
    const auto& regionFeat = features[regionIdx];
    std::string reason;
    bool allowed = check_filters(regionFeat, opts, regionFeat.areaSqKm, reason);
    // Compute cell radius (half diagonal of cell)
    double cellRadiusKm = opts.cellSize * std::sqrt(2.0) / 2.0;
    if (allowed) {
      // Coordinates rounded to 6 decimals, radius to 2.
      res.waypoints.push_back({
        static_cast<int>(res.waypoints.size() + 1),
        std::round(gc.lon * 1e6) / 1e6,
        std::round(gc.lat * 1e6) / 1e6,
        std::round(cellRadiusKm * 100.0) / 100.0,
        regionFeat.gid,
        regionFeat.name
      });
      res.validCells++;
    } else {
      res.skippedCells++;
    }
  }
  return res;
}
// ── Main entry point ────────────────────────────────────────────────────────
/// Main entry point (equivalent to generateGridSearchCells() in TS).
/// Dispatches on opts.gridMode, then orders the resulting waypoints:
/// with groupByRegion and multiple features, waypoints are first grouped by
/// area_gid (stable sort preserves generation order inside a group) and each
/// group is path-ordered independently; otherwise the whole set is ordered.
/// Finally `step` is renumbered 1..N to reflect the sorted order.
GridResult generate(const std::vector<gadm::Feature>& features,
                    const GridOptions& opts) {
  GridResult result;
  if (features.empty()) {
    result.error = "No features provided";
    return result;
  }
  if (opts.gridMode == "admin") {
    result = generate_admin(features, opts);
  } else if (opts.gridMode == "centers") {
    result = generate_centers(features, opts);
  } else {
    result = generate_polygon_grid(features, opts);
  }
  if (!result.error.empty()) return result;
  // Sort waypoints
  if (result.waypoints.size() > 1) {
    if (opts.groupByRegion && features.size() > 1) {
      std::stable_sort(result.waypoints.begin(), result.waypoints.end(),
        [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; });
      // Walk each run of identical area_gid and path-order it in place.
      auto start = result.waypoints.begin();
      while (start != result.waypoints.end()) {
        auto end = start;
        while (end != result.waypoints.end() && end->area_gid == start->area_gid) {
          ++end;
        }
        std::vector<Waypoint> group(start, end);
        sort_waypoints(group, opts.pathOrder, opts.cellSize);
        std::copy(group.begin(), group.end(), start);
        start = end;
      }
    } else {
      sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize);
    }
  }
  // Re-number steps after sorting
  for (size_t i = 0; i < result.waypoints.size(); ++i) {
    result.waypoints[i].step = static_cast<int>(i + 1);
  }
  return result;
}
} // namespace grid

View File

@ -0,0 +1,33 @@
# HTML parsing / HTML→Markdown library, built on the lexbor HTML engine.
include(FetchContent)
FetchContent_Declare(
  lexbor
  GIT_REPOSITORY https://github.com/lexbor/lexbor.git
  GIT_TAG v2.4.0
  GIT_SHALLOW TRUE
)
# Build lexbor as static
set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(lexbor)

add_library(html STATIC
  src/html.cpp
  src/html2md.cpp
  src/table.cpp
)

# MSVC: treat source and execution charset as UTF-8
# (fixes \u200b zero-width-space mismatch in html2md tests)
if(MSVC)
  target_compile_options(html PRIVATE /utf-8)
endif()

target_include_directories(html
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# lexbor_static is the target name exported by lexbor's static build.
target_link_libraries(html
  PUBLIC lexbor_static
)

View File

@ -0,0 +1,55 @@
#pragma once
#include <string>
#include <vector>
namespace html {
/// Parsed element — tag name + text content.
struct Element {
  std::string tag;  // element tag name
  std::string text; // text content of the element
};
/// Link with href and optional attributes.
struct Link {
  std::string href; // target URL as written in the HTML
  std::string rel;  // e.g. "canonical", "stylesheet"
  std::string text; // anchor text (for <a> tags)
};
/// Parse an HTML string and return all elements with their text content.
std::vector<Element> parse(const std::string &html_str);
/// Extract the text content of all elements matching a CSS selector.
std::vector<std::string> select(const std::string &html_str,
                                const std::string &selector);
// ── Enricher extraction helpers ─────────────────────────────────────────────
// NOTE(review): return values for "not found" cases (empty string vs. empty
// vector) are presumed — confirm against the implementations in src/.
/// Extract the <title> text.
std::string get_title(const std::string &html_str);
/// Extract a <meta name="X"> or <meta property="X"> content attribute.
std::string get_meta(const std::string &html_str, const std::string &name);
/// Extract <link rel="canonical"> href.
std::string get_canonical(const std::string &html_str);
/// Extract all <a href="..."> values (resolved links as-is from the HTML).
std::vector<Link> get_links(const std::string &html_str);
/// Extract visible body text, stripping script/style/noscript/svg/iframe.
std::string get_body_text(const std::string &html_str);
/// Extract raw JSON strings from <script type="application/ld+json">.
std::vector<std::string> get_json_ld(const std::string &html_str);
/// Extract an attribute value from the first element matching a CSS selector.
std::string get_attr(const std::string &html_str, const std::string &selector,
                     const std::string &attr_name);
/// Convert HTML content to Markdown.
std::string to_markdown(const std::string &html_str);
} // namespace html

View File

@ -0,0 +1,690 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT
#ifndef HTML2MD_H
#define HTML2MD_H
#include <memory>
#include <string>
#include <unordered_map>
#include <cstdint>
/*!
* \brief html2md namespace
*
* The html2md namespace provides:
* 1. The Converter class
* 2. Static wrapper around Converter class
*
* \note Do NOT try to convert HTML that contains a list in an ordered list or a
* `blockquote` in a list!\n This will be a **total** mess!
*/
namespace html2md {
/*!
* \brief Options for the conversion from HTML to Markdown
* \warning Make sure to pass valid options; otherwise, the output will be
* invalid!
*
* Example from `tests/main.cpp`:
*
* ```cpp
* auto *options = new html2md::Options();
* options->splitLines = false;
*
* html2md::Converter c(html, options);
* auto md = c.convert();
* ```
*/
struct Options {
  /*!
   * \brief Add new line when a certain number of characters is reached
   *
   * \see softBreak
   * \see hardBreak
   */
  bool splitLines = true;

  /*!
   * \brief softBreak Wrap after ... characters when the next space is reached
   * and as long as it's not in a list, table, image or anchor (link).
   */
  int softBreak = 80;

  /*!
   * \brief hardBreak Force a break after ... characters in a line
   */
  int hardBreak = 100;

  /*!
   * \brief The char used for unordered lists
   *
   * Valid:
   * - `-`
   * - `+`
   * - `*`
   *
   * Example:
   *
   * ```markdown
   * - List
   * + Also a list
   * * And this too
   * ```
   */
  char unorderedList = '-';

  /*!
   * \brief The char used after the number of the item
   *
   * Valid:
   * - `.`
   * - `)`
   *
   * Example:
   *
   * ```markdown
   * 1. Hello
   * 2) World!
   * ```
   */
  char orderedList = '.';

  /*!
   * \brief Whether title is added as h1 heading at the very beginning of the
   * markdown
   *
   * Whether title is added as h1 heading at the very beginning of the markdown.
   * Default is true.
   */
  bool includeTitle = true;

  /*!
   * \brief Whether to format Markdown tables
   *
   * Whether to format Markdown tables.
   * Default is true.
   */
  bool formatTable = true;

  /*!
   * \brief Whether to force left trim of lines in the final Markdown output
   *
   * Whether to force left trim of lines in the final Markdown output.
   * Default is false.
   */
  bool forceLeftTrim = false;

  /*!
   * \brief Whether to compress whitespace (tabs, multiple spaces) into a single
   * space
   *
   * Whether to compress whitespace (tabs, multiple spaces) into a single space.
   * Default is false.
   */
  bool compressWhitespace = false;

  /*!
   * \brief Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent them
   * from being interpreted as lists in Markdown.
   *
   * Whether to escape numbered lists.
   * Default is true.
   */
  bool escapeNumberedList = true;

  /*!
   * \brief Whether to keep HTML entities (e.g. `&nbsp;`) in the output
   *
   * If true, the converter will not replace HTML entities configured in the
   * internal conversion map. Default is false (current behaviour).
   */
  bool keepHtmlEntities = false;

  // Compare every option field. Takes the argument by const reference —
  // the previous by-value signature copied the whole struct per comparison.
  inline bool operator==(const Options &o) const {
    return splitLines == o.splitLines && unorderedList == o.unorderedList &&
           orderedList == o.orderedList && includeTitle == o.includeTitle &&
           softBreak == o.softBreak && hardBreak == o.hardBreak &&
           formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
           compressWhitespace == o.compressWhitespace &&
           escapeNumberedList == o.escapeNumberedList &&
           keepHtmlEntities == o.keepHtmlEntities;
  }
};
/*!
* \brief Class for converting HTML to Markdown
*
* This class converts HTML to Markdown.
* There is also a static wrapper for this class (see html2md::Convert).
*
* ## Usage example
*
* Option 1: Use the class:
*
* ```cpp
* std::string html = "<h1>example</h1>";
* html2md::Converter c(html);
* auto md = c.convert();
*
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
* std::cout << md; // # example
* ```
*
* Option 2: Use the static wrapper:
*
* ```cpp
* std::string html = "<h1>example</h1>";
*
* auto md = html2md::Convert(html);
* std::cout << md;
* ```
*
* Advanced: use Options:
*
* ```cpp
* std::string html = "<h1>example</h1>";
*
* auto *options = new html2md::Options();
* options->splitLines = false;
* options->unorderedList = '*';
*
* html2md::Converter c(html, options);
* auto md = c.convert();
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
* std::cout << md; // # example
* ```
*/
class Converter {
public:
/*!
* \brief Standard initializer, takes HTML as parameter. Also prepares
* everything. \param html The HTML as std::string. \param options Options for
* the Conversation. See html2md::Options() for more.
*
* \note Don't pass anything else than HTML, otherwise the output will be a
* **mess**!
*
* This is the default initializer.<br>
* You can use appendToMd() to append something to the beginning of the
* generated output.
*/
explicit inline Converter(const std::string &html,
struct Options *options = nullptr) {
*this = Converter(&html, options);
}
/*!
* \brief Convert HTML into Markdown.
* \return Returns the converted Markdown.
*
* This function actually converts the HTML into Markdown.
* It also cleans up the Markdown so you don't have to do anything.
*/
[[nodiscard]] std::string convert();
/*!
* \brief Append a char to the Markdown.
* \param ch The char to append.
* \return Returns a copy of the instance with the char appended.
*/
Converter *appendToMd(char ch);
/*!
* \brief Append a char* to the Markdown.
* \param str The char* to append.
* \return Returns a copy of the instance with the char* appended.
*/
Converter *appendToMd(const char *str);
/*!
* \brief Append a string to the Markdown.
* \param s The string to append.
* \return Returns a copy of the instance with the string appended.
*/
inline Converter *appendToMd(const std::string &s) {
return appendToMd(s.c_str());
}
/*!
* \brief Appends a ' ' in certain cases.
* \return Copy of the instance with(maybe) the appended space.
*
* This function appends ' ' if:
* - md does not end with `*`
* - md does not end with `\n` aka newline
*/
Converter *appendBlank();
/*!
* \brief Add an HTML symbol conversion
* \param htmlSymbol The HTML symbol to convert
* \param replacement The replacement string
* \note This is useful for converting HTML entities to their Markdown
* equivalents. For example, you can add a conversion for "&nbsp;" to
* " " (space) or "&lt;" to "<" (less than).
* \note This is not a standard feature of the Converter class, but it can
* be added to the class to allow for more flexibility in the conversion
* process. You can use this feature to add custom conversions for any HTML
* symbol that you want to convert to a specific Markdown representation.
*/
void addHtmlSymbolConversion(const std::string &htmlSymbol,
const std::string &replacement) {
htmlSymbolConversions_[htmlSymbol] = replacement;
}
/*!
* \brief Remove an HTML symbol conversion
* \param htmlSymbol The HTML symbol to remove
* \note This is useful for removing custom conversions that you have added
* previously.
*/
void removeHtmlSymbolConversion(const std::string &htmlSymbol) {
htmlSymbolConversions_.erase(htmlSymbol);
}
/*!
* \brief Clear all HTML symbol conversions
* \note This is useful for clearing the conversion map (it's empty afterwards).
*/
void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); }
/*!
* \brief Checks if everything was closed properly(in the HTML).
* \return Returns false if there is a unclosed tag.
* \note As long as you have not called convert(), it always returns true.
*/
[[nodiscard]] bool ok() const;
/*!
* \brief Reset the generated Markdown
*/
void reset();
/*!
* \brief Checks if the HTML matches and the options are the same.
* \param The Converter object to compare with
* \return true if the HTML and options matches otherwise false
*/
inline bool operator==(const Converter *c) const { return *this == *c; }
inline bool operator==(const Converter &c) const {
return html_ == c.html_ && option == c.option;
}
/*!
* \brief Returns ok().
*/
inline explicit operator bool() const { return ok(); };
private:
// Attributes
static constexpr const char *kAttributeHref = "href";
static constexpr const char *kAttributeAlt = "alt";
static constexpr const char *kAttributeTitle = "title";
static constexpr const char *kAttributeClass = "class";
static constexpr const char *kAttributeSrc = "src";
static constexpr const char *kAttrinuteAlign = "align";
static constexpr const char *kTagAnchor = "a";
static constexpr const char *kTagBreak = "br";
static constexpr const char *kTagCode = "code";
static constexpr const char *kTagDiv = "div";
static constexpr const char *kTagHead = "head";
static constexpr const char *kTagLink = "link";
static constexpr const char *kTagListItem = "li";
static constexpr const char *kTagMeta = "meta";
static constexpr const char *kTagNav = "nav";
static constexpr const char *kTagNoScript = "noscript";
static constexpr const char *kTagOption = "option";
static constexpr const char *kTagOrderedList = "ol";
static constexpr const char *kTagParagraph = "p";
static constexpr const char *kTagPre = "pre";
static constexpr const char *kTagScript = "script";
static constexpr const char *kTagSpan = "span";
static constexpr const char *kTagStyle = "style";
static constexpr const char *kTagTemplate = "template";
static constexpr const char *kTagTitle = "title";
static constexpr const char *kTagUnorderedList = "ul";
static constexpr const char *kTagImg = "img";
static constexpr const char *kTagSeperator = "hr";
// Text format
static constexpr const char *kTagBold = "b";
static constexpr const char *kTagStrong = "strong";
static constexpr const char *kTagItalic = "em";
static constexpr const char *kTagItalic2 = "i";
static constexpr const char *kTagCitation = "cite";
static constexpr const char *kTagDefinition = "dfn";
static constexpr const char *kTagUnderline = "u";
static constexpr const char *kTagStrighthrought = "del";
static constexpr const char *kTagStrighthrought2 = "s";
static constexpr const char *kTagBlockquote = "blockquote";
// Header
static constexpr const char *kTagHeader1 = "h1";
static constexpr const char *kTagHeader2 = "h2";
static constexpr const char *kTagHeader3 = "h3";
static constexpr const char *kTagHeader4 = "h4";
static constexpr const char *kTagHeader5 = "h5";
static constexpr const char *kTagHeader6 = "h6";
// Table
static constexpr const char *kTagTable = "table";
static constexpr const char *kTagTableRow = "tr";
static constexpr const char *kTagTableHeader = "th";
static constexpr const char *kTagTableData = "td";
size_t index_ch_in_html_ = 0;
bool is_closing_tag_ = false;
bool is_in_attribute_value_ = false;
bool is_in_code_ = false;
bool is_in_list_ = false;
bool is_in_p_ = false;
bool is_in_pre_ = false;
bool is_in_table_ = false;
bool is_in_table_row_ = false;
bool is_in_tag_ = false;
bool is_self_closing_tag_ = false;
bool skipping_leading_whitespace_ = true;
// relevant for <li> only, false = is in unordered list
bool is_in_ordered_list_ = false;
uint8_t index_ol = 0;
// store the table start
size_t table_start = 0;
// number of lists
uint8_t index_li = 0;
uint8_t index_blockquote = 0;
char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
char prev_ch_in_html_ = 'x';
std::string html_;
uint16_t offset_lt_ = 0;
std::string current_tag_;
std::string prev_tag_;
// Line which separates header from data
std::string tableLine;
size_t chars_in_curr_line_ = 0;
std::string md_;
Options option;
std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
{"&quot;", "\""}, {"&lt;", "<"}, {"&gt;", ">"},
{"&amp;", "&"}, {"&nbsp;", " "}, {"&rarr;", ""}};
// Tag: base class for tag types
// A handler is invoked when the parser finishes consuming an opening or a
// closing tag of its kind; it receives the Converter to emit output through.
// NOTE(review): no virtual destructor. Instances are held as
// std::shared_ptr<Tag> in tags_ below — shared_ptr's type-erased deleter
// makes this safe *if* each handler is constructed as shared_ptr<Derived>;
// confirm the construction site, or add a virtual ~Tag() for safety.
struct Tag {
virtual void OnHasLeftOpeningTag(Converter *c) = 0;
virtual void OnHasLeftClosingTag(Converter *c) = 0;
};
// Tag types
// tags that are not printed (nav, script, noscript, ...)
struct TagIgnored : Tag {
void OnHasLeftOpeningTag(Converter *c) override {};
void OnHasLeftClosingTag(Converter *c) override {};
};
// <a>: carries the href/title captured from the currently open anchor tag.
struct TagAnchor : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
std::string current_href_;
std::string current_title_;
};
// ── inline formatting handlers ──
struct TagBold : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagItalic : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagUnderline : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagStrikethrought : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagBreak : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagDiv : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
// ── header handlers (h1..h6) ──
struct TagHeader1 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagHeader2 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagHeader3 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagHeader4 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagHeader5 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagHeader6 : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
// ── block / list / misc handlers ──
struct TagListItem : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagOption : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagOrderedList : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagParagraph : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagPre : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagCode : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagSpan : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagTitle : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagUnorderedList : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagImage : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
// <hr> ("Seperator" is a pre-existing misspelling of "Separator").
struct TagSeperator : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
// ── table handlers ──
struct TagTable : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagTableRow : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagTableHeader : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagTableData : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
struct TagBlockquote : Tag {
void OnHasLeftOpeningTag(Converter *c) override;
void OnHasLeftClosingTag(Converter *c) override;
};
// Registry mapping HTML tag name -> handler instance.
std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;
// Pointer-based constructor; implementation not visible in this chunk.
explicit Converter(const std::string *html, struct Options *options);
// Post-processing pass over the generated markdown.
void CleanUpMarkdown();
// Trim from start (in place)
static void LTrim(std::string *s);
// Trim from end (in place); returns `this` for call chaining.
Converter *RTrim(std::string *s, bool trim_only_blank = false);
// Trim from both ends (in place); returns `this` for call chaining.
Converter *Trim(std::string *s);
// 1. trim all lines
// 2. reduce consecutive newlines to maximum 3
void TidyAllLines(std::string *str);
// Value of attribute `attr` from the tag text left of the current position.
std::string ExtractAttributeFromTagLeftOf(const std::string &attr);
void TurnLineIntoHeader1();
void TurnLineIntoHeader2();
// Current char: '<'
void OnHasEnteredTag();
Converter *UpdatePrevChFromMd();
/**
 * Handle next char within <...> tag
 *
 * @param ch current character
 * @return continue surrounding iteration?
 */
bool ParseCharInTag(char ch);
// Current char: '>'
bool OnHasLeftTag();
// True when the tag's raw text carries a marker that hides the element:
// aria="hidden", display:none, visibility:hidden, opacity:0, or the
// "Details-content--hidden-not-important" class.
inline static bool TagContainsAttributesToHide(std::string *tag) {
  static constexpr const char *kHiddenMarkers[] = {
      " aria=\"hidden\"",
      "display:none",
      "visibility:hidden",
      "opacity:0",
      "Details-content--hidden-not-important",
  };
  for (const char *marker : kHiddenMarkers) {
    if (tag->find(marker) != std::string::npos)
      return true;
  }
  return false;
}
// Drop the trailing `chars` characters from the generated markdown
// (presumably md_ — used by shortIfPrevCh below); returns `this`.
Converter *ShortenMarkdown(size_t chars = 1);
// If the most recently emitted markdown char equals `prev`, remove it and
// report success; otherwise leave the output untouched.
inline bool shortIfPrevCh(char prev) {
  if (prev_ch_in_md_ != prev)
    return false;
  ShortenMarkdown();
  return true;
};
/**
 * Handle next char of a tag's text content.
 *
 * @param ch current character
 * @return continue iteration surrounding this method's invocation?
 */
bool ParseCharInTagContent(char ch);
// Replace previous space (if any) in current markdown line by newline
bool ReplacePreviousSpaceInLineByNewline();
// True for tags whose subtree produces no markdown output: names starting
// with '-' plus template/style/script/noscript/nav.
// meta: not ignored to tolerate if closing is omitted
static inline bool IsIgnoredTag(const std::string &tag) {
  if (!tag.empty() && tag[0] == '-')
    return true;
  return tag == kTagTemplate || tag == kTagStyle || tag == kTagScript ||
         tag == kTagNoScript || tag == kTagNav;
}
// Whether parsing is currently inside an ignored tag (see IsIgnoredTag).
[[nodiscard]] bool IsInIgnoredTag() const;
}; // Converter
/*!
* \brief Static wrapper around the Converter class
* \param html The HTML passed to Converter
* \param ok Optional: Pass a reference to a local bool to store the output of
* Converter::ok() \return Returns the by Converter generated Markdown
*/
inline std::string Convert(const std::string &html, bool *ok = nullptr) {
Converter c(html);
auto md = c.convert();
if (ok != nullptr)
*ok = c.ok();
return md;
}
#ifndef PYTHON_BINDINGS
// Rvalue overload so callers can pass temporaries; forwards to the lvalue
// overload above. Presumably excluded from the Python-bindings build because
// the binding layer only needs the lvalue form — TODO confirm.
inline std::string Convert(const std::string &&html, bool *ok = nullptr) {
  return Convert(html, ok);
}
#endif
} // namespace html2md
#endif // HTML2MD_H

View File

@ -0,0 +1,11 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT
#ifndef TABLE_H
#define TABLE_H
#include <string>
// Re-align a pipe-delimited markdown table: pads every cell to its column's
// maximum width and rebuilds the header separator row (implemented in
// table.cpp). Returns "" for input with no non-empty cells.
[[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable);
#endif // TABLE_H

View File

@ -0,0 +1,101 @@
# Scraper Request
## OpenAPI Specification
```yaml
openapi: 3.0.1
info:
title: ''
description: ''
version: 1.0.0
paths:
/api/v1/scraper/request:
post:
summary: Scraper Request
deprecated: false
description: ''
tags:
- Scraping API
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
actor:
type: string
input:
type: object
properties:
url:
type: string
required:
- url
x-apidog-orders:
- url
proxy:
type: object
properties:
country:
type: string
required:
- country
x-apidog-orders:
- country
async:
type: boolean
description: |-
If true, the task will be executed asynchronously.
If false, the task will be executed synchronously.
required:
- actor
- input
- proxy
x-apidog-orders:
- actor
- input
- proxy
- async
example:
actor: scraper.xxx
input:
url: >-
https://www.***.com/shop/us/products/stmicroelectronics/tda7265a-3074457345625542393/
proxy:
country: US
async: false
responses:
'200':
description: ''
content:
application/json:
schema:
type: object
properties: {}
x-apidog-orders: []
headers: {}
x-apidog-name: Success
security:
- apikey-header-x-api-token: []
x-apidog-folder: Scraping API
x-apidog-status: released
x-run-in-apidog: https://app.apidog.com/web/project/745098/apis/api-11949852-run
components:
schemas: {}
securitySchemes:
bearer:
type: http
scheme: bearer
description: Bearer token authentication using your Scrapeless API key
apikey-header-x-api-token:
type: apiKey
in: header
name: x-api-token
servers:
- url: https://api.scrapeless.com
description: Prod Env
security:
- apikey-header-x-api-token: []
```

View File

@ -0,0 +1,403 @@
#include "html/html.h"
#include <lexbor/css/css.h>
#include <lexbor/html/html.h>
#include <lexbor/selectors/selectors.h>
#include <html/html2md.h>
#include <algorithm>
#include <cstring>
namespace html {
// ── helpers ─────────────────────────────────────────────────────────────────
// Serialize a node's text content into a std::string.
// Frees the lexbor-allocated buffer before returning; "" when unavailable.
static std::string node_text(lxb_dom_node_t *node) {
  size_t len = 0;
  lxb_char_t *raw = lxb_dom_node_text_content(node, &len);
  if (raw == nullptr)
    return {};
  std::string text(reinterpret_cast<const char *>(raw), len);
  lxb_dom_document_destroy_text(node->owner_document, raw);
  return text;
}
// Qualified tag name of an element (e.g. "div"), or "" when unavailable.
static std::string tag_name(lxb_dom_element_t *el) {
  size_t len = 0;
  const lxb_char_t *qname = lxb_dom_element_qualified_name(el, &len);
  return qname ? std::string(reinterpret_cast<const char *>(qname), len)
               : std::string{};
}
// Value of attribute `attr` on `el`, or "" when the attribute is absent.
static std::string get_element_attr(lxb_dom_element_t *el, const char *attr) {
  size_t len = 0;
  const lxb_char_t *value = lxb_dom_element_get_attribute(
      el, reinterpret_cast<const lxb_char_t *>(attr), strlen(attr), &len);
  return value ? std::string(reinterpret_cast<const char *>(value), len)
               : std::string{};
}
// Parse an HTML string into a lexbor document.
// Returns nullptr on allocation or parse failure; on success the caller owns
// the document and must release it with lxb_html_document_destroy().
static lxb_html_document_t *parse_doc(const std::string &html_str) {
  lxb_html_document_t *doc = lxb_html_document_create();
  if (doc == nullptr)
    return nullptr;
  const auto status = lxb_html_document_parse(
      doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()),
      html_str.size());
  if (status == LXB_STATUS_OK)
    return doc;
  lxb_html_document_destroy(doc);
  return nullptr;
}
// ── Helper: check if a tag name matches a noise element ─────────────────────
static bool is_noise_tag(const std::string &name) {
  static const char *const kNoise[] = {"script", "style", "noscript", "svg",
                                       "iframe"};
  for (const char *tag : kNoise) {
    if (name == tag)
      return true;
  }
  return false;
}
// ── walk tree recursively ───────────────────────────────────────────────────
// Depth-first: for every element whose text content is non-empty, append a
// {tag, text} pair to `out`, then recurse into its children.
static void walk(lxb_dom_node_t *node, std::vector<Element> &out) {
  if (node == nullptr)
    return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *element = lxb_dom_interface_element(node);
    std::string text = node_text(node);
    if (!text.empty())
      out.push_back({tag_name(element), text});
  }
  for (auto *child = node->first_child; child != nullptr; child = child->next)
    walk(child, out);
}
// ── Walk for visible text only (skip noise tags) ────────────────────────────
// Appends trimmed text-node content to `out`, separating chunks with a single
// space; subtrees under noise tags (script/style/...) are skipped entirely.
static void walk_text(lxb_dom_node_t *node, std::string &out) {
  if (!node) return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto name = tag_name(el);
    if (is_noise_tag(name)) return; // Skip noise subtrees entirely
  }
  if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
    size_t len = 0;
    lxb_char_t *data = lxb_dom_node_text_content(node, &len);
    if (data && len > 0) {
      std::string chunk(reinterpret_cast<const char *>(data), len);
      // Collapse whitespace
      bool needSpace = !out.empty() && out.back() != ' ' && out.back() != '\n';
      // Trim leading/trailing whitespace from chunk
      size_t start = chunk.find_first_not_of(" \t\n\r");
      size_t end = chunk.find_last_not_of(" \t\n\r");
      if (start != std::string::npos) {
        if (needSpace) out += ' ';
        out += chunk.substr(start, end - start + 1);
      }
    }
    // FIX: lxb_dom_node_text_content allocates a buffer that was previously
    // leaked here — node_text() above shows the required destroy call.
    if (data)
      lxb_dom_document_destroy_text(node->owner_document, data);
  }
  auto *child = node->first_child;
  while (child) {
    walk_text(child, out);
    child = child->next;
  }
}
// ── Walk <head> for meta/title/link ─────────────────────────────────────────
// Aggregated head metadata collected by walk_head().
struct HeadData {
  std::string title;     // <title> text
  std::string canonical; // href of <link rel="canonical">
  std::vector<std::pair<std::string, std::string>> metas; // name/property → content
  std::vector<std::string> json_ld; // raw <script type="application/ld+json"> bodies
};
// Recursively collect title / meta / canonical-link / JSON-LD data from
// `node` downwards into `data`.
static void walk_head(lxb_dom_node_t *node, HeadData &data) {
  if (node == nullptr)
    return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    const std::string name = tag_name(el);
    if (name == "title") {
      data.title = node_text(node);
    } else if (name == "meta") {
      // A <meta> may carry name= and/or property=; record one entry per key.
      const std::string content = get_element_attr(el, "content");
      if (!content.empty()) {
        const std::string name_attr = get_element_attr(el, "name");
        const std::string prop_attr = get_element_attr(el, "property");
        if (!name_attr.empty()) data.metas.emplace_back(name_attr, content);
        if (!prop_attr.empty()) data.metas.emplace_back(prop_attr, content);
      }
    } else if (name == "link") {
      if (get_element_attr(el, "rel") == "canonical")
        data.canonical = get_element_attr(el, "href");
    } else if (name == "script") {
      if (get_element_attr(el, "type") == "application/ld+json") {
        std::string body = node_text(node);
        if (!body.empty()) data.json_ld.push_back(std::move(body));
      }
    }
  }
  for (auto *child = node->first_child; child != nullptr; child = child->next)
    walk_head(child, data);
}
// ── Walk <body> for <a> links ───────────────────────────────────────────────
// Recursively collect every <a> with a non-empty href beneath `node`.
static void walk_links(lxb_dom_node_t *node, std::vector<Link> &out) {
  if (node == nullptr)
    return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    if (tag_name(el) == "a") {
      std::string href = get_element_attr(el, "href");
      if (!href.empty()) {
        Link link;
        link.href = std::move(href);
        link.rel = get_element_attr(el, "rel");
        link.text = node_text(node);
        out.push_back(std::move(link));
      }
    }
  }
  for (auto *child = node->first_child; child != nullptr; child = child->next)
    walk_links(child, out);
}
// ── public API ──────────────────────────────────────────────────────────────
// Parse `html_str` and return one {tag, text} Element per element with
// non-empty text content, in document order (empty on parse failure).
std::vector<Element> parse(const std::string &html_str) {
  std::vector<Element> elements;
  if (auto *doc = parse_doc(html_str)) {
    walk(lxb_dom_interface_node(lxb_html_document_body_element(doc)),
         elements);
    lxb_html_document_destroy(doc);
  }
  return elements;
}
// ── CSS selector callback ───────────────────────────────────────────────────
// Context handed to select_cb: destination vector for matched nodes' text.
struct SelectCtx {
  std::vector<std::string> *out;
};
// lxb_selectors_find callback: record the non-empty text content of each
// matched node into the SelectCtx's output vector.
static lxb_status_t select_cb(lxb_dom_node_t *node,
                              lxb_css_selector_specificity_t spec, void *ctx) {
  (void)spec;
  std::string text = node_text(node);
  if (!text.empty())
    static_cast<SelectCtx *>(ctx)->out->push_back(std::move(text));
  return LXB_STATUS_OK;
}
// Text content of every node in `html_str`'s body matching the CSS
// `selector`; empty on parse failure or an invalid selector.
// NOTE(review): lxb_css_parser_create/lxb_selectors_create results are not
// null-checked before the init calls — confirm lexbor tolerates nullptr.
std::vector<std::string> select(const std::string &html_str,
const std::string &selector) {
std::vector<std::string> result;
auto *doc = parse_doc(html_str);
if (!doc) return result;
auto *css_parser = lxb_css_parser_create();
lxb_css_parser_init(css_parser, nullptr);
auto *selectors = lxb_selectors_create();
lxb_selectors_init(selectors);
auto *list = lxb_css_selectors_parse(
css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()),
selector.size());
// A null list means the selector failed to parse; fall through so the
// engines and document below are still released.
if (list) {
SelectCtx ctx{&result};
lxb_selectors_find(
selectors, lxb_dom_interface_node(lxb_html_document_body_element(doc)),
list, select_cb, &ctx);
lxb_css_selector_list_destroy_memory(list);
}
// Teardown: selector list (above) before engine and parser, document last.
lxb_selectors_destroy(selectors, true);
lxb_css_parser_destroy(css_parser, true);
lxb_html_document_destroy(doc);
return result;
}
// ── Enricher extraction helpers ─────────────────────────────────────────────
std::string get_title(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
return data.title;
}
std::string get_meta(const std::string &html_str, const std::string &name) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
for (auto &[key, val] : data.metas) {
if (key == name) return val;
}
return {};
}
std::string get_canonical(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
return data.canonical;
}
// Every <a href=...> in the document body, in document order.
std::vector<Link> get_links(const std::string &html_str) {
  std::vector<Link> links;
  if (auto *doc = parse_doc(html_str)) {
    walk_links(lxb_dom_interface_node(lxb_html_document_body_element(doc)),
               links);
    lxb_html_document_destroy(doc);
  }
  return links;
}
// Whitespace-collapsed visible text of the body (noise tags skipped).
std::string get_body_text(const std::string &html_str) {
  std::string text;
  if (auto *doc = parse_doc(html_str)) {
    walk_text(lxb_dom_interface_node(lxb_html_document_body_element(doc)),
              text);
    lxb_html_document_destroy(doc);
  }
  return text;
}
std::vector<std::string> get_json_ld(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
// JSON-LD can be in head or body — walk entire document
auto *root = lxb_dom_interface_node(
lxb_dom_document_element(&doc->dom_document));
walk_head(root, data);
lxb_html_document_destroy(doc);
return data.json_ld;
}
// ── get_attr via CSS selector ───────────────────────────────────────────────
// Context for attr_cb: which attribute to read and the first value found.
struct AttrCtx {
  std::string attr_name; // attribute to extract from matched elements
  std::string result;    // first non-empty value found
  // FIX: default-initialized so a stack-declared AttrCtx no longer starts
  // with an indeterminate flag (aggregate-init callers are unaffected).
  bool found = false;
};
// Selector callback: capture the first non-empty value of ctx->attr_name
// among matched elements; later matches are ignored once found.
static lxb_status_t attr_cb(lxb_dom_node_t *node,
                            lxb_css_selector_specificity_t spec, void *ctx) {
  (void)spec;
  auto *actx = static_cast<AttrCtx *>(ctx);
  if (actx->found || node->type != LXB_DOM_NODE_TYPE_ELEMENT)
    return LXB_STATUS_OK;
  std::string value = get_element_attr(lxb_dom_interface_element(node),
                                       actx->attr_name.c_str());
  if (!value.empty()) {
    actx->result = std::move(value);
    actx->found = true;
  }
  return LXB_STATUS_OK;
}
// First non-empty value of `attr_name` among elements matching `selector`,
// searched over the whole document; "" on parse failure, invalid selector,
// or no match.
std::string get_attr(const std::string &html_str, const std::string &selector,
const std::string &attr_name) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
auto *css_parser = lxb_css_parser_create();
lxb_css_parser_init(css_parser, nullptr);
auto *selectors = lxb_selectors_create();
lxb_selectors_init(selectors);
auto *list = lxb_css_selectors_parse(
css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()),
selector.size());
std::string result;
// Null list = selector parse failure; skip the search but still clean up.
if (list) {
AttrCtx ctx{attr_name, {}, false};
auto *root = lxb_dom_interface_node(
lxb_dom_document_element(&doc->dom_document));
lxb_selectors_find(selectors, root, list, attr_cb, &ctx);
result = ctx.result;
lxb_css_selector_list_destroy_memory(list);
}
// Teardown mirrors select(): list first, then engine, parser, document.
lxb_selectors_destroy(selectors, true);
lxb_css_parser_destroy(css_parser, true);
lxb_html_document_destroy(doc);
return result;
}
// Convert HTML to markdown via html2md, refusing oversized inputs.
std::string to_markdown(const std::string &html_str) {
  // Defense-in-depth: hard cap at 2 MB even if the caller forgets.
  // The enricher pipeline already caps at 512 KB, but future callers
  // may not — prevent OOM / multi-second hangs from html2md.
  static constexpr size_t MAX_HTML2MD_INPUT = 2 * 1024 * 1024;
  if (html_str.size() <= MAX_HTML2MD_INPUT)
    return html2md::Convert(html_str);
  return "*[Content truncated: HTML too large for markdown conversion (" +
         std::to_string(html_str.size() / 1024) + " KB)]*\n";
}
} // namespace html

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,106 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT
#include "html/table.h"
#include <iomanip>
#include <iostream>
#include <sstream>
#include <vector>
using std::string;
using std::vector;
const size_t MIN_LINE_LENGTH = 3; // Minimum length of a header separator line

// In-place trim of leading/trailing ASCII spaces; an all-space (or empty)
// string is cleared entirely.
void removeLeadingTrailingSpaces(std::string &str) {
  const size_t first = str.find_first_not_of(' ');
  if (first == std::string::npos) {
    str.clear(); // Entire string is spaces
    return;
  }
  const size_t last = str.find_last_not_of(' ');
  str.erase(last + 1); // drop trailing spaces first (indices stay valid)
  str.erase(0, first); // then drop leading spaces
}

// Rebuild a header separator cell ("---", ":--", "--:", ":-:") at `length`
// characters, preserving the colon alignment markers.
// Returns "" for empty input or a length below MIN_LINE_LENGTH.
std::string enlargeTableHeaderLine(const std::string &str, size_t length) {
  if (str.empty() || length < MIN_LINE_LENGTH)
    return "";
  size_t first = str.find_first_of(':');
  size_t last = str.find_last_of(':');
  // A single colon at position 0 marks left alignment only.
  if (first == 0 && first == last)
    last = std::string::npos;
  std::string line(length, '-');
  if (first == 0)
    line.front() = ':';
  if (last == str.length() - 1)
    line.back() = ':';
  return line;
}
string formatMarkdownTable(const string &inputTable) {
std::istringstream iss(inputTable);
string line;
vector<vector<string>> tableData;
// Parse the input table into a 2D vector
while (std::getline(iss, line)) {
std::istringstream lineStream(line);
string cell;
vector<string> rowData;
while (std::getline(lineStream, cell, '|')) {
removeLeadingTrailingSpaces(cell); // Trim first
if (!cell.empty()) { // Then check if empty
rowData.push_back(cell);
}
}
if (!rowData.empty()) {
tableData.push_back(std::move(rowData)); // Move rowData to avoid copying
}
}
if (tableData.empty()) {
return "";
}
// Determine maximum width of each column
vector<size_t> columnWidths(tableData[0].size(), 0);
for (const auto &row : tableData) {
if (columnWidths.size() < row.size()) {
columnWidths.resize(row.size(), 0);
}
for (size_t i = 0; i < row.size(); ++i) {
columnWidths[i] = std::max(columnWidths[i], row[i].size());
}
}
// Build the formatted table
std::ostringstream formattedTable;
for (size_t rowNumber = 0; rowNumber < tableData.size(); ++rowNumber) {
const auto &row = tableData[rowNumber];
formattedTable << "|";
for (size_t i = 0; i < row.size(); ++i) {
if (rowNumber == 1) {
formattedTable << enlargeTableHeaderLine(row[i], columnWidths[i] + 2)
<< "|";
continue;
}
formattedTable << " " << std::setw(columnWidths[i]) << std::left << row[i]
<< " |";
}
formattedTable << "\n";
}
return formattedTable.str();
}

View File

@ -0,0 +1,48 @@
include(FetchContent)
# Work around curl's old cmake_minimum_required for CMake 4.x
# (CMake 4 dropped compatibility with <3.5 minimums; this cache override lets
# curl's own CMakeLists configure. Remove once curl raises its minimum.)
set(CMAKE_POLICY_VERSION_MINIMUM 3.5 CACHE STRING "" FORCE)
FetchContent_Declare(
CURL
URL https://github.com/curl/curl/releases/download/curl-8_12_1/curl-8.12.1.tar.xz
DOWNLOAD_EXTRACT_TIMESTAMP TRUE
)
# Minimal curl build static, SChannel TLS, no optional deps
# NOTE(review): the CACHE ... FORCE writes below stomp user-provided cache
# values. This is the conventional way to configure a FetchContent sub-build,
# but BUILD_SHARED_LIBS / BUILD_TESTING are global switches — consider scoping
# them or documenting the intent per option.
set(BUILD_CURL_EXE OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
# TLS backend: platform-appropriate
# Windows uses the OS-native SChannel (no OpenSSL dependency to ship);
# everywhere else uses OpenSSL.
if(WIN32)
set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE)
set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE)
else()
set(CURL_USE_SCHANNEL OFF CACHE BOOL "" FORCE)
set(CURL_USE_OPENSSL ON CACHE BOOL "" FORCE)
endif()
# Disable optional compression/protocol deps
set(CURL_ZLIB OFF CACHE BOOL "" FORCE)
set(CURL_BROTLI OFF CACHE BOOL "" FORCE)
set(CURL_ZSTD OFF CACHE BOOL "" FORCE)
set(USE_NGHTTP2 OFF CACHE BOOL "" FORCE)
set(CURL_USE_LIBSSH2 OFF CACHE BOOL "" FORCE)
set(CURL_USE_LIBPSL OFF CACHE BOOL "" FORCE)
set(CURL_DISABLE_LDAP ON CACHE BOOL "" FORCE)
set(CURL_DISABLE_LDAPS ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(CURL)
# Static HTTP wrapper library over libcurl (see src/http.cpp).
add_library(http STATIC
src/http.cpp
)
# Public headers under include/ are propagated to consumers.
target_include_directories(http
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# PUBLIC so consumers inherit curl's usage requirements.
# NOTE(review): curl types do not appear in include/http/http.h — linkage
# could likely be PRIVATE; confirm and tighten.
target_link_libraries(http
PUBLIC CURL::libcurl
)

View File

@ -0,0 +1,40 @@
#pragma once
#include <string>
namespace http {
/// Result of an HTTP request.
/// On transport failure (DNS, connect, timeout, ...) status_code is -1 and
/// `body` holds the curl error string; otherwise the HTTP status and body.
struct Response {
long status_code;
std::string body;
};
/// Options for customisable HTTP GET requests.
struct GetOptions {
std::string user_agent = "Mozilla/5.0 (compatible; PolymechBot/1.0)";
int timeout_ms = 10000;       // overall transfer timeout; <= 0 disables it
bool follow_redirects = true;
};
/// Perform an HTTP GET request. Returns the response body and status code.
/// Equivalent to get(url, GetOptions{}).
Response get(const std::string &url);
/// Perform an HTTP GET request with custom options.
Response get(const std::string &url, const GetOptions &opts);
/// Perform an HTTP POST request with a body. Returns the response and status.
Response post(const std::string &url, const std::string &body,
const std::string &content_type = "application/json");
/// Options for customisable HTTP POST requests.
struct PostOptions {
std::string content_type = "application/json";
std::string bearer_token; // Authorization: Bearer <token>; also sent as an
                          // "x-api-token" header (see http.cpp)
int timeout_ms = 30000;   // overall transfer timeout; <= 0 disables it
};
/// Perform an HTTP POST request with custom options.
Response post(const std::string &url, const std::string &body,
const PostOptions &opts);
} // namespace http

View File

@ -0,0 +1,216 @@
#include "http/http.h"
#include <curl/curl.h>
#include <mutex>
#include <chrono>
namespace http {
// Guard for one-time libcurl global setup.
static std::once_flag curl_init_flag;
// Process-wide libcurl initialization; safe to call from any thread.
static void ensure_curl_init() {
  std::call_once(curl_init_flag, [] { curl_global_init(CURL_GLOBAL_ALL); });
}
// RAII owner of one CURL easy handle per thread: created on first use in a
// thread, cleaned up at thread exit. get() resets the handle so options set
// by a previous request do not leak into the next one.
// NOTE(review): curl_global_cleanup() is never called — acceptable for a
// process-lifetime singleton, but confirm it is intentional.
struct ThreadLocalCurl {
  CURL *handle;
  ThreadLocalCurl() {
    ensure_curl_init();
    handle = curl_easy_init();
  }
  ~ThreadLocalCurl() {
    if (handle) curl_easy_cleanup(handle);
  }
  // Handle reset to default options, or nullptr if curl_easy_init failed.
  CURL *get() {
    if (handle) curl_easy_reset(handle);
    return handle;
  }
};
// One cached easy handle per thread; avoids per-request init/cleanup cost.
thread_local ThreadLocalCurl tl_curl;
// Wall-clock deadline state consumed by the transfer progress callback.
struct ProgressData {
  std::chrono::steady_clock::time_point start_time; // when the request began
  int timeout_ms; // abort once elapsed exceeds this; <= 0 disables the check
};
// CURLOPT_XFERINFOFUNCTION callback enforcing a wall-clock deadline on the
// whole transfer; a non-zero return makes libcurl abort the request.
static int progress_cb(void *clientp, curl_off_t dltotal, curl_off_t dlnow,
                       curl_off_t ultotal, curl_off_t ulnow) {
  (void)dltotal;
  (void)dlnow;
  (void)ultotal;
  (void)ulnow;
  const auto *pd = static_cast<ProgressData *>(clientp);
  if (pd->timeout_ms <= 0)
    return 0; // deadline disabled
  const auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
                           std::chrono::steady_clock::now() - pd->start_time)
                           .count();
  return elapsed > pd->timeout_ms ? 1 : 0;
}
// CURLOPT_WRITEFUNCTION callback: append the received bytes to the
// std::string pointed to by `userp`. Returning the full byte count tells
// libcurl the chunk was consumed.
static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t total = size * nmemb;
  static_cast<std::string *>(userp)->append(static_cast<char *>(contents),
                                            total);
  return total;
}
// GET with default options (10 s timeout, redirects followed, default UA —
// see GetOptions in http.h).
Response get(const std::string &url) {
  return get(url, GetOptions{});
}
// GET `url` per `opts`. On transport failure returns status_code == -1 with
// the curl error string in `body`; otherwise the HTTP status and body.
// Timeouts are layered: CURLOPT_TIMEOUT_MS (libcurl's own), a wall-clock
// progress-callback deadline 1 s beyond it (backstop), a 5 s connect
// timeout, and a low-speed abort (<1 B/s sustained for 10 s).
Response get(const std::string &url, const GetOptions &opts) {
Response resp{};
CURL *curl = tl_curl.get();
if (!curl) {
resp.status_code = -1;
resp.body = "curl_easy_init (thread_local) failed";
return resp;
}
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, opts.follow_redirects ? 1L : 0L);
// prog_data must outlive curl_easy_perform: libcurl holds a raw pointer.
ProgressData prog_data;
if (opts.timeout_ms > 0) {
curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
prog_data.start_time = std::chrono::steady_clock::now();
prog_data.timeout_ms = opts.timeout_ms + 1000;
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
}
// Fail fast on dead sites (TCP SYN timeout)
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT_MS, 5000L);
// Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
// Prevent signal handlers from breaking in multithreaded environments
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
if (!opts.user_agent.empty()) {
curl_easy_setopt(curl, CURLOPT_USERAGENT, opts.user_agent.c_str());
}
// Accept-Encoding for compressed responses
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
resp.status_code = -1;
resp.body = curl_easy_strerror(res);
} else {
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
}
return resp;
}
// POST `body` to `url` with the given Content-Type (default set in http.h).
// Follows redirects; 10 s overall timeout with an 11 s wall-clock backstop
// and a low-speed abort (<1 B/s for 10 s). On transport failure returns
// status_code == -1 with the curl error string in `body`.
Response post(const std::string &url, const std::string &body,
              const std::string &content_type) {
  Response resp{};
  CURL *curl = tl_curl.get();
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init failed";
    return resp;
  }
  struct curl_slist *headers = nullptr;
  headers =
      curl_slist_append(headers, ("Content-Type: " + content_type).c_str());
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
  // FIX: pass the size explicitly — without CURLOPT_POSTFIELDSIZE libcurl
  // strlen()s the buffer, truncating bodies containing embedded NUL bytes.
  curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, static_cast<long>(body.size()));
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
  // Wall-clock backstop 1 s beyond the libcurl timeout; must outlive perform.
  ProgressData prog_data;
  prog_data.start_time = std::chrono::steady_clock::now();
  prog_data.timeout_ms = 11000;
  curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
  curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
  curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
  // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
  curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }
  curl_slist_free_all(headers);
  return resp;
}
// POST `body` to `url` per `opts`. Sends Content-Type plus, when a bearer
// token is set, both "Authorization: Bearer ..." and "x-api-token: ..."
// headers. On transport failure returns status_code == -1 with the curl
// error string in `body`.
Response post(const std::string &url, const std::string &body,
              const PostOptions &opts) {
  Response resp{};
  CURL *curl = tl_curl.get();
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init failed";
    return resp;
  }
  struct curl_slist *headers = nullptr;
  headers =
      curl_slist_append(headers, ("Content-Type: " + opts.content_type).c_str());
  if (!opts.bearer_token.empty()) {
    headers = curl_slist_append(
        headers, ("Authorization: Bearer " + opts.bearer_token).c_str());
    headers = curl_slist_append(
        headers, ("x-api-token: " + opts.bearer_token).c_str());
  }
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
  // FIX: pass the size explicitly — without CURLOPT_POSTFIELDSIZE libcurl
  // strlen()s the buffer, truncating bodies containing embedded NUL bytes.
  curl_easy_setopt(curl, CURLOPT_POSTFIELDSIZE, static_cast<long>(body.size()));
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  // prog_data must outlive curl_easy_perform: libcurl holds a raw pointer.
  ProgressData prog_data;
  if (opts.timeout_ms > 0) {
    curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
    prog_data.start_time = std::chrono::steady_clock::now();
    prog_data.timeout_ms = opts.timeout_ms + 1000;
    curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
    curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
  }
  // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
  curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }
  curl_slist_free_all(headers);
  return resp;
}
} // namespace http

View File

@ -0,0 +1,11 @@
# Static IPC framing library: length-prefixed JSON messages over stdio
# (see src/ipc.cpp and include/ipc/ipc.h).
add_library(ipc STATIC
src/ipc.cpp
)
# Public headers under include/ are propagated to consumers.
target_include_directories(ipc
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# json / logger are project targets used by the implementation.
# NOTE(review): neither appears in the public header ipc.h — linkage could
# likely be PRIVATE; confirm and tighten.
target_link_libraries(ipc
PUBLIC json logger
)

View File

@ -0,0 +1,34 @@
#pragma once
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>
namespace ipc {
/// A single IPC message: { id, type, payload (raw JSON string) }.
struct Message {
std::string id;
std::string type;
std::string payload; // opaque JSON string (can be "{}" or any object)
};
/// Encode a Message into a length-prefixed binary frame.
/// Layout: [4-byte LE uint32 length][JSON bytes]
std::vector<uint8_t> encode(const Message &msg);
/// Decode a binary frame (without the 4-byte length prefix) into a Message.
/// Returns false if the JSON is invalid or missing required fields.
bool decode(const uint8_t *data, size_t len, Message &out);
/// Vector overload — presumably the same contract (frame excludes the
/// length prefix); confirm against the implementation in ipc.cpp.
bool decode(const std::vector<uint8_t> &frame, Message &out);
/// Blocking: read exactly one length-prefixed message from a FILE*.
/// Returns false on EOF or read error.
bool read_message(Message &out, FILE *in = stdin);
/// Write one length-prefixed message to a FILE*. Flushes after write.
/// Returns false on write error.
bool write_message(const Message &msg, FILE *out = stdout);
} // namespace ipc

View File

@ -0,0 +1,158 @@
#include "ipc/ipc.h"
#include <cstring>
#include "json/json.h"
#include "logger/logger.h"
// We use RapidJSON directly for structured serialization
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#ifdef _WIN32
#include <fcntl.h>
#include <io.h>
#endif
namespace ipc {
// ── helpers ──────────────────────────────────────────────────────────────────
// Store `val` into dst[0..3] in little-endian byte order.
static void write_u32_le(uint8_t *dst, uint32_t val) {
  for (int i = 0; i < 4; ++i)
    dst[i] = static_cast<uint8_t>((val >> (8 * i)) & 0xFF);
}
/// Reassemble a little-endian 32-bit value from 4 bytes.
static uint32_t read_u32_le(const uint8_t *src) {
  uint32_t value = 0;
  for (int byte = 3; byte >= 0; --byte) {
    value = (value << 8) | static_cast<uint32_t>(src[byte]);
  }
  return value;
}
/// Keep calling fread until exactly n bytes are consumed. A zero-byte read
/// means EOF or a stream error, so fail immediately in that case.
static bool read_exact(FILE *f, uint8_t *buf, size_t n) {
  for (size_t done = 0; done < n;) {
    const size_t chunk = std::fread(buf + done, 1, n - done, f);
    if (chunk == 0) {
      return false; // EOF or error
    }
    done += chunk;
  }
  return true;
}
// ── encode ───────────────────────────────────────────────────────────────────
/// Serialize `msg` as {"id":...,"type":...,"payload":...} and prepend a
/// 4-byte little-endian length header (the frame layout read_message expects).
/// If msg.payload parses as JSON it is embedded verbatim as a JSON value;
/// otherwise (empty or malformed) it is embedded as a plain JSON string.
std::vector<uint8_t> encode(const Message &msg) {
  // Build JSON: { "id": "...", "type": "...", "payload": ... }
  // payload is stored as a raw JSON string, so we parse it first
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);
  w.StartObject();
  w.Key("id");
  w.String(msg.id.c_str(), static_cast<rapidjson::SizeType>(msg.id.size()));
  w.Key("type");
  w.String(msg.type.c_str(),
           static_cast<rapidjson::SizeType>(msg.type.size()));
  w.Key("payload");
  // If payload is valid JSON, embed it as-is; otherwise embed as string
  rapidjson::Document pd;
  if (!msg.payload.empty() &&
      !pd.Parse(msg.payload.c_str()).HasParseError()) {
    pd.Accept(w);
  } else {
    w.String(msg.payload.c_str(),
             static_cast<rapidjson::SizeType>(msg.payload.size()));
  }
  w.EndObject();
  const char *json_str = sb.GetString();
  uint32_t json_len = static_cast<uint32_t>(sb.GetSize());
  // Frame = 4-byte length prefix followed by the JSON bytes.
  std::vector<uint8_t> frame(4 + json_len);
  write_u32_le(frame.data(), json_len);
  std::memcpy(frame.data() + 4, json_str, json_len);
  return frame;
}
// ── decode ───────────────────────────────────────────────────────────────────
/// Parse a JSON frame body (no length prefix) into `out`. "id" and "type"
/// must be present as strings; "payload" is optional and defaults to "{}".
bool decode(const uint8_t *data, size_t len, Message &out) {
  rapidjson::Document doc;
  doc.Parse(reinterpret_cast<const char *>(data), len);
  if (doc.HasParseError() || !doc.IsObject()) return false;
  if (!doc.HasMember("id") || !doc["id"].IsString()) return false;
  if (!doc.HasMember("type") || !doc["type"].IsString()) return false;
  out.id = doc["id"].GetString();
  out.type = doc["type"].GetString();
  if (doc.HasMember("payload")) {
    if (doc["payload"].IsString()) {
      // String payloads are kept verbatim (mirrors encode()'s fallback path).
      out.payload = doc["payload"].GetString();
    } else {
      // Re-serialize non-string payload back to JSON string
      rapidjson::StringBuffer sb;
      rapidjson::Writer<rapidjson::StringBuffer> w(sb);
      doc["payload"].Accept(w);
      out.payload = sb.GetString();
    }
  } else {
    out.payload = "{}";
  }
  return true;
}
/// Convenience overload: decode a whole in-memory frame body.
bool decode(const std::vector<uint8_t> &frame, Message &out) {
  return decode(frame.data(), frame.size(), out);
}
// ── read_message ─────────────────────────────────────────────────────────────
/// Blocking read of one [4-byte LE length][JSON] frame from `in`.
/// Returns false on EOF, short read, absurd length, or malformed JSON.
bool read_message(Message &out, FILE *in) {
#ifdef _WIN32
  // Ensure binary mode on Windows to prevent \r\n translation
  // NOTE(review): _setmode is re-applied on every call — harmless, but could
  // be hoisted to one-time setup.
  _setmode(_fileno(in), _O_BINARY);
#endif
  uint8_t len_buf[4];
  if (!read_exact(in, len_buf, 4)) return false; // clean EOF lands here
  uint32_t msg_len = read_u32_le(len_buf);
  // Reject absurd lengths so a corrupt or hostile peer cannot make us
  // allocate unbounded memory.
  if (msg_len == 0 || msg_len > 10 * 1024 * 1024) { // sanity: max 10 MB
    logger::error("ipc::read_message: invalid length " +
                  std::to_string(msg_len));
    return false;
  }
  std::vector<uint8_t> buf(msg_len);
  if (!read_exact(in, buf.data(), msg_len)) return false; // truncated frame
  return decode(buf, out);
}
// ── write_message ────────────────────────────────────────────────────────────
/// Serialize and emit one length-prefixed message to `out`, then flush.
/// Returns false if either the write or the flush fails.
bool write_message(const Message &msg, FILE *out) {
#ifdef _WIN32
  // Binary mode: stop Windows from expanding \n to \r\n inside the frame.
  _setmode(_fileno(out), _O_BINARY);
#endif
  auto frame = encode(msg);
  if (std::fwrite(frame.data(), 1, frame.size(), out) != frame.size()) {
    return false; // partial write — the peer can no longer resync the stream
  }
  // Fix: the fflush() result was previously ignored, so a buffered-write
  // failure was reported as success despite the documented contract
  // ("Returns false on write error").
  return std::fflush(out) == 0;
}
} // namespace ipc

View File

@ -0,0 +1,28 @@
include(FetchContent)

# RapidJSON — pinned to master because the last tagged release (v1.1.0, 2016)
# does not configure under modern CMake.
# NOTE(review): an unpinned branch is not reproducible; consider pinning a
# specific commit hash instead.
FetchContent_Declare(
  rapidjson
  GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
  GIT_TAG master
  GIT_SHALLOW TRUE
)
# RapidJSON is consumed header-only below (no add_subdirectory), so its
# docs/examples/tests never build; these switches keep its CMake quiet if it
# is ever added as a subproject.
set(RAPIDJSON_BUILD_DOC OFF CACHE BOOL "" FORCE)
set(RAPIDJSON_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(RAPIDJSON_BUILD_TESTS OFF CACHE BOOL "" FORCE)
FetchContent_GetProperties(rapidjson)
if(NOT rapidjson_POPULATED)
  # Download the sources only — just the headers are used.
  FetchContent_Populate(rapidjson)
endif()

add_library(json STATIC
  src/json.cpp
)
target_include_directories(json
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# SYSTEM suppresses compiler warnings originating inside the third-party
# RapidJSON headers for this target and all consumers.
target_include_directories(json SYSTEM
  PUBLIC ${rapidjson_SOURCE_DIR}/include
)

View File

@ -0,0 +1,23 @@
#pragma once
#include <string>
#include <vector>
// Small string-in/string-out convenience layer over RapidJSON.
// All helpers re-parse their input on every call; they return empty/zero
// defaults on malformed JSON rather than throwing.
namespace json {
/// Parse a JSON string and return a pretty-printed version.
/// Returns "" if the input is not valid JSON.
std::string prettify(const std::string &json_str);
/// Extract a string value by key from a JSON object (top-level only).
/// Returns "" if the key is absent or not a string.
std::string get_string(const std::string &json_str, const std::string &key);
/// Extract an int value by key from a JSON object (top-level only).
/// Returns 0 if the key is absent or not an int (indistinguishable from a
/// stored 0 — callers needing that distinction must parse themselves).
int get_int(const std::string &json_str, const std::string &key);
/// Check if a JSON string is valid.
bool is_valid(const std::string &json_str);
/// Get all top-level keys from a JSON object.
std::vector<std::string> keys(const std::string &json_str);
} // namespace json

View File

@ -0,0 +1,62 @@
#include "json/json.h"
#include <rapidjson/document.h>
#include <rapidjson/prettywriter.h>
#include <rapidjson/stringbuffer.h>
namespace json {
std::string prettify(const std::string &json_str) {
rapidjson::Document doc;
doc.Parse(json_str.c_str());
if (doc.HasParseError()) {
return {};
}
rapidjson::StringBuffer buffer;
rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer);
doc.Accept(writer);
return std::string(buffer.GetString(), buffer.GetSize());
}
std::string get_string(const std::string &json_str, const std::string &key) {
rapidjson::Document doc;
doc.Parse(json_str.c_str());
if (doc.HasParseError() || !doc.IsObject())
return {};
auto it = doc.FindMember(key.c_str());
if (it == doc.MemberEnd() || !it->value.IsString())
return {};
return std::string(it->value.GetString(), it->value.GetStringLength());
}
int get_int(const std::string &json_str, const std::string &key) {
rapidjson::Document doc;
doc.Parse(json_str.c_str());
if (doc.HasParseError() || !doc.IsObject())
return 0;
auto it = doc.FindMember(key.c_str());
if (it == doc.MemberEnd() || !it->value.IsInt())
return 0;
return it->value.GetInt();
}
bool is_valid(const std::string &json_str) {
rapidjson::Document doc;
doc.Parse(json_str.c_str());
return !doc.HasParseError();
}
std::vector<std::string> keys(const std::string &json_str) {
std::vector<std::string> result;
rapidjson::Document doc;
doc.Parse(json_str.c_str());
if (doc.HasParseError() || !doc.IsObject())
return result;
for (auto it = doc.MemberBegin(); it != doc.MemberEnd(); ++it) {
result.emplace_back(it->name.GetString(), it->name.GetStringLength());
}
return result;
}
} // namespace json

View File

@ -0,0 +1,21 @@
include(FetchContent)
# spdlog — built from source, pinned to a release tag for reproducibility.
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG v1.15.1
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(spdlog)
# logger — thin facade over spdlog (see src/logger.cpp).
add_library(logger STATIC
  src/logger.cpp
)
target_include_directories(logger
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# NOTE(review): logger's own header only uses <string>; spdlog could likely
# be a PRIVATE dependency — confirm no consumer includes spdlog transitively.
target_link_libraries(logger
  PUBLIC spdlog::spdlog
)

View File

@ -0,0 +1,22 @@
#pragma once
#include <string>
// Global logging facade. One of the init* functions configures the
// process-wide default logger; the level helpers below then write to it.
namespace logger {
/// Initialize the default logger (call once at startup).
/// NOTE(review): the implementation registers a logger under app_name;
/// calling any init* twice with the same name may fail — confirm callers.
void init(const std::string &app_name = "polymech", const std::string &log_level = "info");
/// Initialize logger with stderr sink (use in worker/IPC mode, where stdout
/// carries the IPC frames).
void init_stderr(const std::string &app_name = "polymech-worker", const std::string &log_level = "info");
/// Initialize logger with stderr and file sink (use in UDS worker mode).
void init_uds(const std::string &app_name = "polymech-worker", const std::string &log_level = "info", const std::string &log_file = "logs/uds.json");
/// Log at various levels (no-ops below the configured level).
void info(const std::string &msg);
void warn(const std::string &msg);
void error(const std::string &msg);
void debug(const std::string &msg);
} // namespace logger

View File

@ -0,0 +1,57 @@
#include "logger/logger.h"
#include <spdlog/sinks/stdout_color_sinks.h>
#include <spdlog/sinks/basic_file_sink.h>
#include <spdlog/spdlog.h>
#include <filesystem>
namespace logger {
static void apply_log_level(const std::string& level) {
if (level == "debug") spdlog::set_level(spdlog::level::debug);
else if (level == "warn") spdlog::set_level(spdlog::level::warn);
else if (level == "error") spdlog::set_level(spdlog::level::err);
else spdlog::set_level(spdlog::level::info);
}
/// Install a colored stdout logger as the process-wide default.
/// NOTE(review): stdout_color_mt registers the logger by name — a second
/// call with the same app_name will likely fail; confirm single-call usage.
void init(const std::string &app_name, const std::string &log_level) {
  auto console = spdlog::stdout_color_mt(app_name);
  spdlog::set_default_logger(console);
  apply_log_level(log_level);
  spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}
/// Same as init() but logs to stderr — used when stdout is reserved for the
/// IPC frame stream.
void init_stderr(const std::string &app_name, const std::string &log_level) {
  auto console = spdlog::stderr_color_mt(app_name);
  spdlog::set_default_logger(console);
  apply_log_level(log_level);
  spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}
/// Dual-sink logger: colored stderr plus an append-mode file.
/// The file's parent directory is created if missing (errors ignored via ec;
/// the file sink will then fail on open instead).
void init_uds(const std::string &app_name, const std::string &log_level, const std::string &log_file) {
  auto console_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();
  std::filesystem::path log_path(log_file);
  std::error_code ec;
  std::filesystem::create_directories(log_path.parent_path(), ec);
  auto file_sink = std::make_shared<spdlog::sinks::basic_file_sink_mt>(log_file, false); // false = append
  std::vector<spdlog::sink_ptr> sinks {console_sink, file_sink};
  auto multi_logger = std::make_shared<spdlog::logger>(app_name, sinks.begin(), sinks.end());
  spdlog::set_default_logger(multi_logger);
  apply_log_level(log_level);
  // Timestamped pattern (with date + ms) since this log is meant for files.
  spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
  // Ensure logs are flushed immediately to file
  spdlog::flush_every(std::chrono::seconds(1));
  spdlog::flush_on(spdlog::level::info);
}
// Level helpers: forward straight to the process-wide default logger.
void info(const std::string &msg) { spdlog::info(msg); }
void warn(const std::string &msg) { spdlog::warn(msg); }
void error(const std::string &msg) { spdlog::error(msg); }
void debug(const std::string &msg) { spdlog::debug(msg); }
} // namespace logger

View File

@ -0,0 +1,9 @@
# polymech — domain helpers layered on top of the postgres client.
add_library(polymech STATIC
  src/polymech.cpp
)
target_include_directories(polymech
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# postgres: Supabase queries; logger: tracing.
target_link_libraries(polymech PUBLIC postgres logger)

View File

@ -0,0 +1,16 @@
#pragma once
#include <string>
#include <vector>
// Domain-level accessors over the postgres (Supabase PostgREST) client.
namespace polymech {
/// Fetch all rows from the "pages" table.
/// Returns raw JSON array string from Supabase.
std::string fetch_pages();
/// Fetch pages with a specific select clause and optional filter.
/// @param select PostgREST select list (e.g. "id,title") or "*"
/// @param filter PostgREST filter string, "" for none
/// @param limit  Max rows; 0 means no limit
std::string fetch_pages(const std::string &select,
                        const std::string &filter = "", int limit = 0);
} // namespace polymech

View File

@ -0,0 +1,17 @@
#include "polymech/polymech.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
namespace polymech {
/// Convenience overload: all columns, no filter, no limit.
std::string fetch_pages() { return fetch_pages("*"); }
/// Trace the effective query parameters, then delegate to the postgres layer.
std::string fetch_pages(const std::string &select, const std::string &filter,
                        int limit) {
  const std::string trace = "polymech::fetch_pages → select=" + select +
                            " filter=" + filter +
                            " limit=" + std::to_string(limit);
  logger::debug(trace);
  return postgres::query("pages", select, filter, limit);
}
} // namespace polymech

View File

@ -0,0 +1,11 @@
# postgres — Supabase PostgREST client (query/insert/upsert/update/del).
add_library(postgres STATIC
  src/postgres.cpp
)
target_include_directories(postgres
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
# http: Response type + libcurl; json: response helpers; logger: tracing.
target_link_libraries(postgres
  PUBLIC logger http json
)

View File

@ -0,0 +1,46 @@
#pragma once
#include <string>
#include <vector>
// Thin Supabase/PostgREST client. Configure once with init(); every other
// function throws std::runtime_error if called before that.
namespace postgres {
/// Supabase connection configuration.
struct Config {
  // Project base URL. NOTE(review): the implementation appends "/rest/v1/…",
  // so this should not end with '/' — confirm config values.
  std::string supabase_url;
  // API key; sent as both the `apikey` header and the Bearer token.
  std::string supabase_key;
};
/// Initialize the Supabase client with URL and API key.
void init(const Config &config);
/// Ping the Supabase REST API. Returns "ok" on success, error message on
/// failure.
std::string ping();
/// Query a table via the PostgREST API.
/// Returns the raw JSON response body (also on HTTP errors — callers should
/// check for PostgREST error objects).
/// @param table Table name (e.g. "profiles")
/// @param select Comma-separated columns (e.g. "id,username"), or "*"
/// @param filter PostgREST filter (e.g. "id=eq.abc"), or "" for no filter
/// @param limit Max rows (0 = no limit)
std::string query(const std::string &table, const std::string &select = "*",
                  const std::string &filter = "", int limit = 0);
/// Insert a row into a table. Body is a JSON object string.
/// Returns the created row as JSON.
std::string insert(const std::string &table, const std::string &json_body);
/// Upsert a row into a table. Body is a JSON array or object string.
/// Returns the upserted array as JSON.
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict = "");
/// Update rows in a table. Body is a JSON object string.
/// Returns the updated rows as JSON.
std::string update(const std::string &table, const std::string &json_body, const std::string &filter);
/// Delete rows from a table.
/// Returns the deleted rows as JSON.
std::string del(const std::string &table, const std::string &filter);
} // namespace postgres

View File

@ -0,0 +1,236 @@
#include "postgres/postgres.h"
#include "http/http.h"
#include "logger/logger.h"
#include "json/json.h"
#include <curl/curl.h>
#include <stdexcept>
namespace postgres {
// Process-wide client state: written once by init(), read by every request.
// NOTE(review): not synchronized — assumes init() completes before any
// worker thread issues queries; confirm against startup order.
static Config s_config;
static bool s_initialized = false;
/// Store the connection settings and mark the module usable.
void init(const Config &config) {
  s_config = config;
  s_initialized = true;
  logger::debug("postgres::init → " + config.supabase_url);
}
/// Fail fast with a clear message when the client is used before init().
static void ensure_init() {
  if (!s_initialized) {
    throw std::runtime_error("postgres::init() must be called first");
  }
}
/// Build the REST URL for a table query.
/// Shape: {base}/rest/v1/{table}?select=...[&{filter}][&limit=N]
/// NOTE(review): no component is URL-encoded here — callers must supply
/// PostgREST-safe strings; confirm upstream sanitization.
static std::string build_url(const std::string &table,
                             const std::string &select,
                             const std::string &filter, int limit) {
  std::string endpoint = s_config.supabase_url;
  endpoint.append("/rest/v1/").append(table);
  endpoint.append("?select=").append(select);
  if (!filter.empty()) {
    endpoint.append("&").append(filter);
  }
  if (limit > 0) {
    endpoint.append("&limit=").append(std::to_string(limit));
  }
  return endpoint;
}
/// Make an authenticated GET request to the Supabase REST API.
/// On transport failure status_code is -1 and body holds the curl error
/// string; otherwise body holds the HTTP response body.
static http::Response supabase_get(const std::string &url) {
  // We need custom headers, so we use curl directly
  CURL *curl = curl_easy_init();
  http::Response resp{};
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init failed";
    return resp;
  }
  // Supabase expects both the apikey header and a Bearer token (same key).
  struct curl_slist *headers = nullptr;
  headers =
      curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str());
  headers = curl_slist_append(
      headers, ("Authorization: Bearer " + s_config.supabase_key).c_str());
  // Captureless lambda: decays to the plain function pointer libcurl needs.
  auto write_cb = [](void *contents, size_t size, size_t nmemb, void *userp) {
    auto *out = static_cast<std::string *>(userp);
    out->append(static_cast<char *>(contents), size * nmemb);
    return size * nmemb;
  };
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  curl_easy_setopt(
      curl, CURLOPT_WRITEFUNCTION,
      static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb));
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L); // hard 10 s request timeout
  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }
  // Free headers and the handle in both success and failure paths.
  curl_slist_free_all(headers);
  curl_easy_cleanup(curl);
  return resp;
}
/// Make an authenticated request with a JSON body (POST, PATCH, DELETE).
/// @param method        HTTP verb passed to CURLOPT_CUSTOMREQUEST
/// @param body          JSON payload; "" sends no body (and no Content-Type)
/// @param prefer_header PostgREST "Prefer:" value, "" to omit
/// Same error convention as supabase_get: status_code -1 + curl error string.
static http::Response supabase_request(const std::string &method,
                                       const std::string &url,
                                       const std::string &body,
                                       const std::string &prefer_header) {
  CURL *curl = curl_easy_init();
  http::Response resp{};
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init failed";
    return resp;
  }
  struct curl_slist *headers = nullptr;
  if (!body.empty()) {
    headers = curl_slist_append(headers, "Content-Type: application/json");
  }
  if (!prefer_header.empty()) {
    headers = curl_slist_append(headers, ("Prefer: " + prefer_header).c_str());
  }
  // Same auth pair as supabase_get.
  headers =
      curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str());
  headers = curl_slist_append(
      headers, ("Authorization: Bearer " + s_config.supabase_key).c_str());
  auto write_cb = [](void *contents, size_t size, size_t nmemb, void *userp) {
    auto *out = static_cast<std::string *>(userp);
    out->append(static_cast<char *>(contents), size * nmemb);
    return size * nmemb;
  };
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  if (!body.empty()) {
    // POSTFIELDS is not copied by default; body outlives the perform call.
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
  }
  curl_easy_setopt(
      curl, CURLOPT_WRITEFUNCTION,
      static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb));
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }
  curl_slist_free_all(headers);
  curl_easy_cleanup(curl);
  return resp;
}
/// Connectivity probe: returns "ok" on any 2xx, otherwise an error string
/// with the HTTP status (details go to the log).
std::string ping() {
  ensure_init();
  // Lightweight check: query profiles with limit=0 to verify connectivity
  const std::string probe_url =
      s_config.supabase_url + "/rest/v1/profiles?select=id&limit=0";
  const auto resp = supabase_get(probe_url);
  const bool ok = resp.status_code >= 200 && resp.status_code < 300;
  if (!ok) {
    logger::error("postgres::ping → HTTP " + std::to_string(resp.status_code) +
                  ": " + resp.body);
    return "error: HTTP " + std::to_string(resp.status_code);
  }
  logger::info("postgres::ping → ok (HTTP " +
               std::to_string(resp.status_code) + ")");
  return "ok";
}
std::string query(const std::string &table, const std::string &select,
const std::string &filter, int limit) {
ensure_init();
auto url = build_url(table, select, filter, limit);
logger::debug("postgres::query → " + url);
auto resp = supabase_get(url);
if (resp.status_code >= 200 && resp.status_code < 300) {
return resp.body;
}
logger::error("postgres::query → HTTP " + std::to_string(resp.status_code) +
": " + resp.body);
return resp.body;
}
std::string insert(const std::string &table, const std::string &json_body) {
ensure_init();
auto url = s_config.supabase_url + "/rest/v1/" + table;
logger::debug("postgres::insert → " + url);
auto resp = supabase_request("POST", url, json_body, "return=representation");
if (resp.status_code >= 200 && resp.status_code < 300) {
return resp.body;
}
logger::error("postgres::insert → HTTP " + std::to_string(resp.status_code) +
": " + resp.body);
return resp.body;
}
/// POST with merge-duplicates resolution (PostgREST upsert).
/// FIX: the Prefer header previously requested "return=minimal", which makes
/// PostgREST reply with an empty body — contradicting the declared contract
/// ("Returns the upserted array as JSON"). Request the representation so
/// callers actually receive the rows.
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict) {
  ensure_init();
  auto url = s_config.supabase_url + "/rest/v1/" + table;
  if (!on_conflict.empty()) {
    // Restrict conflict detection to the given column(s).
    url += "?on_conflict=" + on_conflict;
  }
  logger::debug("postgres::upsert → " + url);
  auto resp = supabase_request(
      "POST", url, json_body,
      "return=representation, resolution=merge-duplicates");
  if (resp.status_code >= 200 && resp.status_code < 300) {
    return resp.body;
  }
  logger::error("postgres::upsert → HTTP " + std::to_string(resp.status_code) +
                ": " + resp.body);
  return resp.body;
}
std::string update(const std::string &table, const std::string &json_body, const std::string &filter) {
ensure_init();
auto url = s_config.supabase_url + "/rest/v1/" + table;
if (!filter.empty()) {
url += "?" + filter;
}
logger::debug("postgres::update → " + url);
auto resp = supabase_request("PATCH", url, json_body, "return=representation");
if (resp.status_code >= 200 && resp.status_code < 300) {
return resp.body;
}
logger::error("postgres::update → HTTP " + std::to_string(resp.status_code) +
": " + resp.body);
return resp.body;
}
std::string del(const std::string &table, const std::string &filter) {
ensure_init();
auto url = s_config.supabase_url + "/rest/v1/" + table;
if (!filter.empty()) {
url += "?" + filter;
}
logger::debug("postgres::del → " + url);
auto resp = supabase_request("DELETE", url, "", "return=representation");
if (resp.status_code >= 200 && resp.status_code < 300) {
return resp.body;
}
logger::error("postgres::del → HTTP " + std::to_string(resp.status_code) +
": " + resp.body);
return resp.body;
}
} // namespace postgres

View File

@ -0,0 +1,7 @@
add_library(search STATIC src/search.cpp)
# Absolute path matches the sibling packages' convention and keeps the usage
# requirement valid regardless of where this directory is included from.
target_include_directories(search PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
# Depends on http (curl) and json (RapidJSON wrapper)
target_link_libraries(search PUBLIC http json)
# toml++ is used only inside src/search.cpp (config loading), hence PRIVATE.
target_link_libraries(search PRIVATE tomlplusplus::tomlplusplus)

View File

@ -0,0 +1,93 @@
#pragma once
#include <string>
#include <vector>
// SerpAPI Google Maps search + geo enrichment, plus the worker's TOML config.
namespace search {
// ── Result types ────────────────────────────────────────────────────────────
struct GpsCoordinates {
  double lat = 0;
  double lng = 0;
};
/// One place result parsed from a SerpAPI response. Fields default to
/// empty/0 when the response omits them.
struct MapResult {
  std::string title;
  std::string place_id;
  std::string data_id;
  std::string address;
  std::string phone;
  std::string website;
  std::string type;
  std::vector<std::string> types;
  double rating = 0;
  int reviews = 0;
  GpsCoordinates gps;
  std::string thumbnail;
  std::string raw_json; // the full SerpAPI result object, re-serialized
  std::string geo_json; // reverse-geocode payload filled by resolve_geo_batch
};
struct SearchResult {
  std::vector<MapResult> results;
  int apiCalls = 0;    // number of SerpAPI HTTP requests made
  std::string error;   // empty on success
};
// ── Config ──────────────────────────────────────────────────────────────────
/// Runtime tuning knobs loaded from the [system] TOML table.
struct SystemTuningOptions {
  int executor_threads = 0; // 0 = hardware concurrency
  int max_concurrent_jobs_per_user = 10;
  int http_concurrency_throttle = 50;
  int queue_depth_max = 10000;
  int bulk_dequeue_size = 1;
  int ipc_timeout_ms = 300000;
  int max_ipc_connections = 100;
  int buffer_size_max = 50 * 1024 * 1024;
};
/// Aggregated worker configuration (see load_config for the TOML mapping).
struct Config {
  SystemTuningOptions system;
  std::string serpapi_key;
  std::string geocoder_key;
  std::string bigdata_key;
  std::string scrapeless_key;
  std::string postgres_url;
  std::string supabase_url;
  std::string supabase_service_key;
  // [enricher]
  std::string enricher_meta_scraper;
  int enricher_meta_concurrency = 5;
  int enricher_meta_idle_timeout = 60;
  int enricher_location_concurrency = 1;
};
/// Load config from a TOML file (e.g. config/postgres.toml).
/// Parse failures are reported on stderr and yield default values.
Config load_config(const std::string &path = "config/postgres.toml");
// ── Search API ──────────────────────────────────────────────────────────────
struct SearchOptions {
  std::string query;
  double lat = 0;  // 0/0 means "no ll parameter" in the request
  double lng = 0;
  int zoom = 13;
  int limit = 20;
  std::string engine = "google_maps";
  std::string hl = "en";
  std::string google_domain = "google.com";
};
/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts);
/// Resolve geo coordinate to place info (raw JSON; "{}" on failure).
std::string resolve_geo(double lat, double lng, const std::string &key,
                        int timeout_ms = 3000);
/// Fill geo_json for every result with a non-zero GPS fix, using up to
/// `concurrency` worker threads.
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
                       int concurrency = 10, int timeout_ms = 3000);
} // namespace search

View File

@ -0,0 +1,311 @@
#include "search/search.h"
#include "http/http.h"
#include <rapidjson/document.h>
#include <toml++/toml.hpp>
#include <atomic>
#include <cstdio>
#include <iostream>
#include <mutex>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <sstream>
#include <thread>
namespace search {
// ── URL encoding (minimal) ──────────────────────────────────────────────────
/// Percent-encode everything except RFC 3986 "unreserved" characters
/// (ALPHA / DIGIT / "-" / "_" / "." / "~"). Hex digits are uppercase.
static std::string url_encode(const std::string &val) {
  static const char *kHex = "0123456789ABCDEF";
  std::string encoded;
  encoded.reserve(val.size() * 2);
  for (unsigned char ch : val) {
    const bool unreserved = isalnum(ch) || ch == '-' || ch == '_' ||
                            ch == '.' || ch == '~';
    if (unreserved) {
      encoded.push_back(static_cast<char>(ch));
    } else {
      encoded.push_back('%');
      encoded.push_back(kHex[ch >> 4]);
      encoded.push_back(kHex[ch & 0x0F]);
    }
  }
  return encoded;
}
// ── Config loading ──────────────────────────────────────────────────────────
/// Map the TOML file at `path` onto a Config. Every key is optional; absent
/// keys keep the struct defaults. On parse failure an error is printed to
/// stderr and the defaults are returned.
/// NOTE(review): a missing/unreadable file is presumably also reported via
/// toml::parse_error — confirm toml++'s behavior for I/O failures.
Config load_config(const std::string &path) {
  Config cfg;
  try {
    auto tbl = toml::parse_file(path);
    // [postgres]
    if (auto v = tbl["postgres"]["url"].value<std::string>())
      cfg.postgres_url = *v;
    // [supabase]
    if (auto v = tbl["supabase"]["url"].value<std::string>())
      cfg.supabase_url = *v;
    if (auto v = tbl["supabase"]["service_key"].value<std::string>())
      cfg.supabase_service_key = *v;
    // [services] — third-party API keys
    if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>())
      cfg.serpapi_key = *v;
    if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>())
      cfg.geocoder_key = *v;
    if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>())
      cfg.bigdata_key = *v;
    if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>())
      cfg.scrapeless_key = *v;
    // [enricher]
    if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>())
      cfg.enricher_meta_scraper = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>())
      cfg.enricher_meta_concurrency = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>())
      cfg.enricher_meta_idle_timeout = *v;
    if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>())
      cfg.enricher_location_concurrency = *v;
    // [system] — runtime tuning knobs (see SystemTuningOptions)
    if (auto v = tbl["system"]["executor_threads"].value<int>())
      cfg.system.executor_threads = *v;
    if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>())
      cfg.system.max_concurrent_jobs_per_user = *v;
    if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>())
      cfg.system.http_concurrency_throttle = *v;
    if (auto v = tbl["system"]["queue_depth_max"].value<int>())
      cfg.system.queue_depth_max = *v;
    if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>())
      cfg.system.bulk_dequeue_size = *v;
    if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>())
      cfg.system.ipc_timeout_ms = *v;
    if (auto v = tbl["system"]["max_ipc_connections"].value<int>())
      cfg.system.max_ipc_connections = *v;
    if (auto v = tbl["system"]["buffer_size_max"].value<int>())
      cfg.system.buffer_size_max = *v;
  } catch (const toml::parse_error &err) {
    std::cerr << "[config] TOML parse error in " << path << ": " << err.what()
              << "\n";
  }
  return cfg;
}
// ── SerpAPI URL builder ─────────────────────────────────────────────────────
/// Compose the search.json request URL. A non-zero lat/lng pair adds the
/// "ll=@lat,lng,{zoom}z" map-viewport parameter; `start` > 0 adds the
/// pagination offset.
static std::string build_serpapi_url(const Config &cfg,
                                     const SearchOptions &opts, int start) {
  std::ostringstream url;
  url << "https://serpapi.com/search.json"
      << "?engine=" << url_encode(opts.engine)
      << "&q=" << url_encode(opts.query)
      << "&api_key=" << url_encode(cfg.serpapi_key)
      << "&hl=" << url_encode(opts.hl)
      << "&google_domain=" << url_encode(opts.google_domain);
  if (opts.lat != 0 || opts.lng != 0) {
    // 7 decimal places ≈ centimeter precision; fits easily in 128 bytes.
    char llBuf[128];
    snprintf(llBuf, sizeof(llBuf), "@%.7f,%.7f,%dz", opts.lat, opts.lng,
             opts.zoom);
    url << "&ll=" << url_encode(std::string(llBuf));
  }
  if (start > 0) {
    url << "&start=" << start;
  }
  return url.str();
}
// ── JSON result parser ──────────────────────────────────────────────────────
/// Append a MapResult to `out` for each object in `arr`. Fields absent or of
/// an unexpected type are simply left at their defaults; each source object
/// is also re-serialized into raw_json for downstream consumers.
static void parse_results(const rapidjson::Value &arr,
                          std::vector<MapResult> &out) {
  if (!arr.IsArray())
    return;
  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    const auto &obj = arr[i];
    if (!obj.IsObject())
      continue;
    MapResult r;
    // Capture raw JSON string
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    obj.Accept(writer);
    r.raw_json = std::string(buf.GetString(), buf.GetSize());
    // Scalar fields — each guarded by presence + type check.
    if (obj.HasMember("title") && obj["title"].IsString())
      r.title = obj["title"].GetString();
    if (obj.HasMember("place_id") && obj["place_id"].IsString())
      r.place_id = obj["place_id"].GetString();
    if (obj.HasMember("data_id") && obj["data_id"].IsString())
      r.data_id = obj["data_id"].GetString();
    if (obj.HasMember("address") && obj["address"].IsString())
      r.address = obj["address"].GetString();
    if (obj.HasMember("phone") && obj["phone"].IsString())
      r.phone = obj["phone"].GetString();
    if (obj.HasMember("website") && obj["website"].IsString())
      r.website = obj["website"].GetString();
    if (obj.HasMember("type") && obj["type"].IsString())
      r.type = obj["type"].GetString();
    if (obj.HasMember("rating") && obj["rating"].IsNumber())
      r.rating = obj["rating"].GetDouble();
    if (obj.HasMember("reviews") && obj["reviews"].IsInt())
      r.reviews = obj["reviews"].GetInt();
    if (obj.HasMember("thumbnail") && obj["thumbnail"].IsString())
      r.thumbnail = obj["thumbnail"].GetString();
    // Nested gps_coordinates object → flat lat/lng.
    if (obj.HasMember("gps_coordinates") && obj["gps_coordinates"].IsObject()) {
      const auto &gps = obj["gps_coordinates"];
      if (gps.HasMember("latitude") && gps["latitude"].IsNumber())
        r.gps.lat = gps["latitude"].GetDouble();
      if (gps.HasMember("longitude") && gps["longitude"].IsNumber())
        r.gps.lng = gps["longitude"].GetDouble();
    }
    // String array of category tags; non-string entries are skipped.
    if (obj.HasMember("types") && obj["types"].IsArray()) {
      for (rapidjson::SizeType j = 0; j < obj["types"].Size(); ++j) {
        if (obj["types"][j].IsString())
          r.types.push_back(obj["types"][j].GetString());
      }
    }
    out.push_back(std::move(r));
  }
}
// ── Main search function ────────────────────────────────────────────────────
/// Paginated SerpAPI Google Maps search. Stops when opts.limit results are
/// collected, a page yields nothing new, a partial page signals the end, or
/// an HTTP/parse error occurs (partial results are kept in that case, with
/// result.error set). With opts.limit <= 0 no request is made at all.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts) {
  SearchResult result;
  if (cfg.serpapi_key.empty()) {
    result.error = "No SerpAPI key configured";
    return result;
  }
  if (opts.query.empty()) {
    result.error = "Empty search query";
    return result;
  }
  const int PAGE_SIZE = 20; // SerpAPI Google Maps page size
  int start = 0;
  while (static_cast<int>(result.results.size()) < opts.limit) {
    std::string url = build_serpapi_url(cfg, opts, start);
    auto resp = http::get(url);
    result.apiCalls++;
    if (resp.status_code != 200) {
      result.error = "SerpAPI HTTP " + std::to_string(resp.status_code);
      break;
    }
    rapidjson::Document doc;
    doc.Parse(resp.body.c_str());
    if (doc.HasParseError()) {
      result.error = "Failed to parse SerpAPI response";
      break;
    }
    size_t beforeCount = result.results.size();
    // local_results (main listing)
    if (doc.HasMember("local_results") && doc["local_results"].IsArray()) {
      parse_results(doc["local_results"], result.results);
    }
    // place_results (single result or array)
    if (doc.HasMember("place_results")) {
      if (doc["place_results"].IsArray()) {
        parse_results(doc["place_results"], result.results);
      } else if (doc["place_results"].IsObject()) {
        // Wrap the lone object in a one-element array so parse_results
        // can handle both shapes uniformly.
        rapidjson::Document arr;
        arr.SetArray();
        arr.PushBack(rapidjson::Value(doc["place_results"], arr.GetAllocator()),
                     arr.GetAllocator());
        parse_results(arr, result.results);
      }
    }
    size_t pageCount = result.results.size() - beforeCount;
    if (pageCount == 0)
      break; // No more results
    if (static_cast<int>(pageCount) < PAGE_SIZE)
      break; // Last page (partial)
    start += PAGE_SIZE;
  }
  // Trim to limit
  if (static_cast<int>(result.results.size()) > opts.limit) {
    result.results.resize(opts.limit);
  }
  return result;
}
// ── Geo enrichment ──────────────────────────────────────────────────────────
/// Reverse-geocode via BigDataCloud. Returns the raw JSON body on HTTP 200,
/// otherwise "{}", so callers can always treat the result as JSON.
/// FIX: the URL was previously formatted into a fixed 512-byte buffer, so a
/// long API key silently truncated the URL into a malformed request. The
/// buffer is now sized by a snprintf pre-pass.
std::string resolve_geo(double lat, double lng, const std::string &key,
                        int timeout_ms) {
  if (key.empty())
    return "{}";
  static const char *kFmt =
      "https://api.bigdatacloud.net/data/"
      "reverse-geocode?latitude=%.7f&longitude=%.7f&localityLanguage=en&key=%s";
  // First pass: measure; second pass: render into an exactly-sized string.
  const int needed = snprintf(nullptr, 0, kFmt, lat, lng, key.c_str());
  if (needed < 0)
    return "{}"; // encoding error — treat like a failed lookup
  std::string url(static_cast<size_t>(needed) + 1, '\0');
  snprintf(&url[0], url.size(), kFmt, lat, lng, key.c_str());
  url.pop_back(); // drop the trailing NUL written by snprintf
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  auto resp = http::get(url.c_str(), opts);
  if (resp.status_code == 200 && !resp.body.empty()) {
    return resp.body;
  }
  return "{}";
}
/// Enrich each result's geo_json in parallel. A shared atomic counter hands
/// out indices, so every element is claimed by exactly one thread and no
/// mutex is needed — each thread writes only the results[idx] it claimed.
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
                       int concurrency, int timeout_ms) {
  if (key.empty() || results.empty())
    return;
  std::atomic<size_t> current_idx{0};
  std::vector<std::thread> threads;
  // Never spawn more threads than there are work items.
  int num_threads =
      std::min<int>(concurrency, static_cast<int>(results.size()));
  for (int i = 0; i < num_threads; ++i) {
    threads.emplace_back([&]() {
      while (true) {
        size_t idx = current_idx.fetch_add(1);
        if (idx >= results.size())
          break;
        auto &r = results[idx];
        // Skip entries with no GPS fix ((0,0) is treated as "unset").
        if (r.gps.lat != 0 || r.gps.lng != 0) {
          r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms);
        }
      }
    });
  }
  for (auto &t : threads) {
    if (t.joinable())
      t.join();
  }
}
} // namespace search

View File

@ -0,0 +1,320 @@
# Polymech C++ Gridsearch Worker — Design
## Goal
Port the [gridsearch-worker.ts](../src/products/locations/gridsearch-worker.ts) pipeline to native C++, running as a **CLI subcommand** (`polymech-cli gridsearch`) while keeping all logic in internal libraries under `packages/`. The worker communicates progress via the [IPC framing protocol](./packages/ipc/) and writes results to Supabase via the existing [postgres](./packages/postgres/) package.
---
## Status
| Package | Status | Tests | Assertions |
|---------|--------|-------|------------|
| `geo` | ✅ Done | 23 | 77 |
| `gadm_reader` | ✅ Done | 18 | 53 |
| `grid` | ✅ Done | 13 | 105 |
| `search` | ✅ Done | 8 | 13 |
| CLI `gridsearch` | ✅ Done | — | dry-run verified (3ms) |
| IPC `gridsearch` | ✅ Done | 1 | 30 |
| **Total** | | **63** | **278** |
---
## Existing C++ Inventory
| Package | Provides |
|---------|----------|
| `ipc` | Length-prefixed JSON over stdio |
| `postgres` | Supabase PostgREST: `query`, `insert` |
| `http` | libcurl `GET`/`POST` |
| `json` | RapidJSON validate/prettify |
| `logger` | spdlog (stdout or **stderr** in worker mode) |
| `html` | HTML parser |
---
## TypeScript Pipeline (Reference)
```
GADM Resolve → Grid Generate → SerpAPI Search → Enrich → Supabase Upsert
```
| Phase | Input | Output | Heavy work |
|-------|-------|--------|------------|
| **1. GADM Resolve** | GID list + target level | `GridFeature[]` (GeoJSON polygons with GHS props) | Read pre-cached JSON files from `cache/gadm/boundary_{GID}_{LEVEL}.json` |
| **2. Grid Generate** | `GridFeature[]` + settings | `GridSearchHop[]` (waypoints: lat/lng/radius) | Centroid, bbox, distance, area, point-in-polygon, cell sorting |
| **3. Search** | Waypoints + query + SerpAPI key | Place results (JSON) | HTTP calls to `serpapi.com`, per-waypoint caching |
| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping |
| **5. Persist** | Enriched places | Supabase `places` + `grid_search_runs` | PostgREST upsert |
---
## Implemented Packages
### 1. `packages/geo` — Geometry primitives ✅
Header + `.cpp`, no external deps. Implements the **turf.js subset** used by the grid generator.
```cpp
namespace geo {
struct Coord { double lon, lat; };
struct BBox { double minLon, minLat, maxLon, maxLat; };
BBox bbox(const std::vector<Coord>& ring);
Coord centroid(const std::vector<Coord>& ring);
double area_sq_m(const std::vector<Coord>& ring);
double distance_km(Coord a, Coord b);
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);
std::vector<BBox> square_grid(BBox extent, double cellSizeKm);
std::vector<BBox> hex_grid(BBox extent, double cellSizeKm);
std::vector<Coord> buffer_circle(Coord center, double radiusKm, int steps = 6);
} // namespace geo
```
**Rationale**: ~200 lines avoids pulling GEOS/Boost.Geometry. Adopts `pip.h` ray-casting pattern from `packages/gadm/cpp/` without the GDAL/GEOS/PROJ dependency (~700MB).
---
### 2. `packages/gadm_reader` — Boundary resolver ✅
Reads pre-cached GADM boundary JSON from disk. No network calls.
```cpp
namespace gadm {
struct Feature {
std::string gid, name;
int level;
std::vector<std::vector<geo::Coord>> rings;
double ghsPopulation, ghsBuiltWeight;
geo::Coord ghsPopCenter, ghsBuiltCenter;
std::vector<std::array<double, 3>> ghsPopCenters; // [lon, lat, weight]
std::vector<std::array<double, 3>> ghsBuiltCenters;
double areaSqKm;
};
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
const std::string& cacheDir = "cache/gadm");
} // namespace gadm
```
Handles `Polygon`/`MultiPolygon`, GHS enrichment fields, fallback resolution by country code prefix.
---
### 3. `packages/grid` — Grid generator ✅
Direct port of [grid-generator.ts](../../shared/src/products/places/grid-generator.ts).
```cpp
namespace grid {
struct Waypoint { int step; double lng, lat, radius_km; };
struct GridOptions {
std::string gridMode; // "hex", "square", "admin", "centers"
double cellSize; // km
double cellOverlap, centroidOverlap;
int maxCellsLimit;
double maxElevation, minDensity, minGhsPop, minGhsBuilt;
std::string ghsFilterMode; // "AND" | "OR"
bool allowMissingGhs, bypassFilters;
std::string pathOrder; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
bool groupByRegion;
};
struct GridResult { std::vector<Waypoint> waypoints; int validCells, skippedCells; std::string error; };
GridResult generate(const std::vector<gadm::Feature>& features, const GridOptions& opts);
} // namespace grid
```
**4 modes**: `admin` (centroid + radius), `centers` (GHS deduplicated), `hex`, `square` (tessellation + PIP)
**5 sort algorithms**: `zigzag`, `snake`, `spiral-out`, `spiral-in`, `shortest` (greedy NN)
---
### 4. `packages/search` — SerpAPI client + config ✅
```cpp
namespace search {
struct Config {
std::string serpapi_key, geocoder_key, bigdata_key;
std::string postgres_url, supabase_url, supabase_service_key;
};
Config load_config(const std::string& path = "config/postgres.toml");
struct SearchOptions {
std::string query;
double lat, lng;
int zoom = 13, limit = 20;
std::string engine = "google_maps", hl = "en", google_domain = "google.com";
};
struct MapResult {
std::string title, place_id, data_id, address, phone, website, type;
std::vector<std::string> types;
double rating; int reviews;
GpsCoordinates gps;
};
SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts);
} // namespace search
```
Reads `[services].SERPAPI_KEY`, `GEO_CODER_KEY`, `BIG_DATA_KEY` from `config/postgres.toml`. HTTP pagination via `http::get()`, JSON parsing with RapidJSON.
---
## CLI Subcommands ✅
### 1. `gridsearch` (One-shot execution)
```
polymech-cli gridsearch <GID> <QUERY> [OPTIONS]
Positionals:
GID GADM GID (e.g. ESP.1.1_1) — ignored when --settings is used
QUERY Search query — ignored when --settings is used
Options:
-l, --level INT Target GADM level (default: 0)
-m, --mode TEXT Grid mode: hex|square|admin|centers (default: hex)
-s, --cell-size FLOAT Cell size in km (default: 5.0)
--limit INT Max results per area (default: 20)
-z, --zoom INT Google Maps zoom (default: 13)
--sort TEXT Path order: snake|zigzag|spiral-out|spiral-in|shortest
-c, --config TEXT TOML config path (default: config/postgres.toml)
--cache-dir TEXT GADM cache directory (default: cache/gadm)
--settings TEXT JSON settings file (matches TypeScript GuidedPreset shape)
--enrich Run enrichment pipeline (meta + email) after search
--persistence-postgres Persist run data natively via Postgres
-o, --output TEXT Output JSON file (default: gridsearch-HH-MM.json in cwd)
--dry-run Generate grid only, skip SerpAPI search
```
### 2. `worker` (IPC Daemon execution)
```
polymech-cli worker [OPTIONS]
Options:
--daemon Run persistent daemon pool (tier-based)
-c, --config TEXT TOML config path (default: config/postgres.toml)
--user-uid TEXT User ID to bind this daemon to (needed for place owner)
--uds TEXT Run over Unix Domain Socket / Named Pipe (TCP on Windows) at the given path
```
### Execution flow
```
1. load_config(configPath) → Config (TOML)
2. gadm::load_boundary(gid, level) → features[]
3. grid::generate(features, opts) → waypoints[]
4. --dry-run → output JSON array and exit
5. For each waypoint → search::search_google_maps(cfg, sopts)
6. Stream JSON summary to stdout
```
### Example
```bash
polymech-cli gridsearch ABW "recycling" --dry-run
# → [{"step":1,"lat":12.588582,"lng":-70.040465,"radius_km":3.540}, ...]
# [info] Dry-run complete in 3ms
```
### IPC worker mode
The `worker` subcommand routes multiplexed asynchronous `gridsearch` payloads. When launched via `--uds <path>`, it provisions an Asio streaming server (AF_UNIX sockets on POSIX, TCP sockets on Windows). Event frames (`grid-ready`, `waypoint-start`, `location`, `node`, etc.) are emitted bi-directionally over the IPC framing protocol using a lock-free design, so no thread ever blocks on a shared lock.
---
## Exposed Configuration / Tuning Parameters
As we integrate deeper with the core business logic, the Node orchestrator and internal services should configure and enforce limits on the underlying C++ concurrent engine. Relevant configuration surfaces we need to expose for the primary ecosystem libraries include:
### 1. Taskflow (`https://github.com/taskflow/taskflow`)
- **`executor_threads` (`num_workers`)**: The size of the `tf::Executor` thread pool. As Gridsearch is heavily I/O network bound (HTTP calls for search/enrichment), setting this significantly higher than `std::thread::hardware_concurrency()` may aggressively improve HTTP ingestion throughput globally.
- **`max_concurrent_jobs_per_user`**: A structural limit dictating how many concurrent gridsearch invocation graphs a single tenant/user can enqueue and run actively to prevent monopolization.
- **`http_concurrency_throttle`**: Task limits enforced upon node scraping or SerpAPI requests per-pipeline graph to avoid widespread `429 Too Many Requests` bans.
### 2. Moodycamel ConcurrentQueue (`https://github.com/cameron314/concurrentqueue`)
- **`queue_depth_max` / `backpressure`**: Since Moodycamel queue memory allocates dynamically and lock-free to any capacity, we must mandate a hard software ceiling/backpressure limit over the Node-to-C++ IPC layer. If Node blindly streams jobs faster than Taskflow can execute them, the daemon will eventually OOM.
- **`bulk_dequeue_size`**: A tuning parameter for the dispatch thread controlling how many queued IPC tasks are dequeued in a single bulk operation.
### 3. Boost.Asio (`https://github.com/chriskohlhoff/asio`)
- **`ipc_timeout_ms` (Read/Write)**: Mandatory timeouts for the IPC socket layer. If the orchestrator stalls, crashes, or goes silent, Asio must reap the connection and automatically GC the in-flight tasks to prevent Zombie worker processes.
- **`max_ipc_connections`**: Absolute limit on simultaneous orchestration pipelines dialing into a single Worker Pod.
- **`buffer_size_max`**: A soft constraint on async payload allocations so that a malformed 200 MB JSON frame from Node.js cannot cause an abrupt memory spike inside the `asio::read` operations.
---
## Build Integration
### Dependency graph
```
┌──────────┐
│ polymech │ (the lib)
│ -cli │ (the binary)
└────┬─────┘
┌────────────┼────────────────┐
▼ ▼ ▼
┌──────────┐ ┌──────────┐ ┌──────────┐
│ search │ │ grid │ │ ipc │
└────┬─────┘ └────┬─────┘ └──────────┘
│ │
▼ ▼
┌──────────┐ ┌───────────────┐
│ http │ │ gadm_reader │
└──────────┘ └────┬──────────┘
┌──────────┐
│ geo │ ← no deps (math only)
└──────────┘
┌──────────┐
│ json │ ← RapidJSON
└──────────┘
```
All packages depend on `logger` and `json` implicitly.
---
## Testing
### Unit tests (Catch2) — 62 tests, 248 assertions ✅
| Test file | Tests | Assertions | Validates |
|-----------|-------|------------|-----------|
| `test_geo.cpp` | 23 | 77 | Haversine, area, centroid, PIP, hex/square grid |
| `test_gadm_reader.cpp` | 18 | 53 | JSON parsing, GHS props, fallback resolution |
| `test_grid.cpp` | 13 | 105 | All 4 modes × 5 sorts, GHS filtering, PIP clipping |
| `test_search.cpp` | 8 | 13 | Config loading, key validation, error handling |
### Integration test (Node.js)
- Existing `orchestrator/test-ipc.mjs` validates spawn/lifecycle/ping/job
- `orchestrator/test-gridsearch-ipc.mjs` validates full pipeline via IPC (8 event types + job result)
- `orchestrator/test-gridsearch-ipc-uds.mjs` validates high-throughput Unix Domain Sockets mapping, backpressure boundaries, and soft cancellation injections utilizing `action: cancel` frames mid-flight.
---
## IPC Cancellation & Dynamic Job Tuning
The high-performance UDS daemon now natively tracks and intercepts JSON `action: cancel` frames referencing specific `jobId`s to gracefully exit Taskflow jobs mid-flight.
Dynamic tuning limits, such as memory buffering boundaries or threading capacities, are inherently validated and bound by hard ceilings established inside the `[system]` constraint block of `config/postgres.toml`.
---
## Deferred (Phase 2)
| Item | Reason |
|------|--------|
| SerpAPI response caching | State store managed by orchestrator for now |
| Protobuf framing | JSON IPC sufficient for current throughput |
| Multi-threaded search | Sequential is fine for SerpAPI rate limits |
| GEOS integration | Custom geo is sufficient for grid math |

View File

@ -0,0 +1,65 @@
#pragma once
#include "search/search.h"
#include "gadm_reader/gadm_reader.h"
#include "geo/geo.h"
#include <functional>
#include <map>
#include <string>
#include <vector>
namespace polymech {
// ── Filter context ──────────────────────────────────────────────────────────
// All runtime data a filter predicate may need. Passed by const-ref so filters
// are pure read-only functions with no side-effects.
struct WaypointCtx {
  double lat;       // waypoint latitude (degrees)
  double lng;       // waypoint longitude (degrees)
  double radius_km; // search radius around this waypoint, in km
  std::string area_gid; // e.g. "ESP.6.1.10.2_1"
};
// Holds only references — the context must not outlive the containers it
// points at; callers keep the backing vectors/map alive for the filter run.
struct FilterContext {
  const WaypointCtx& waypoint;
  const std::vector<std::string>& filter_types; // must-match list
  const std::vector<std::string>& exclude_types; // deny list
  // Country code → boundary features; consumed by filter_country_boundary.
  const std::map<std::string, std::vector<gadm::Feature>>& country_boundaries;
};
// ── Predicate type ──────────────────────────────────────────────────────────
// Returns true → KEEP the result.
// Returns false → DISCARD the result.
using LocationFilter = std::function<bool(const search::MapResult&, const FilterContext&)>;
// ── Individual filters ──────────────────────────────────────────────────────
/// Discard results that have no website (non-actionable leads).
bool filter_requires_website(const search::MapResult& r, const FilterContext& ctx);
/// Discard results whose type matches any entry in ctx.exclude_types.
bool filter_exclude_types(const search::MapResult& r, const FilterContext& ctx);
/// If ctx.filter_types is non-empty, keep only results that match ≥1 type.
bool filter_match_types(const search::MapResult& r, const FilterContext& ctx);
/// Keep only results inside the country-level boundary polygon (L0) of the
/// waypoint's country. Falls back to radius-based overlap (1.5 × radius_km)
/// to gracefully handle legitimate border-proximity results.
bool filter_country_boundary(const search::MapResult& r, const FilterContext& ctx);
// ── Filter set builder ──────────────────────────────────────────────────────
/// Return the ordered list of default filters applied to every SerpAPI batch.
/// Filters are evaluated left-to-right; the first false short-circuits.
std::vector<LocationFilter> default_location_filters();
/// Run `filters` against `result`. Returns true (keep) only if every
/// filter passes.
bool apply_filters(const search::MapResult& result,
                   const FilterContext& ctx,
                   const std::vector<LocationFilter>& filters);
} // namespace polymech

View File

@ -0,0 +1,28 @@
#pragma once
#include "cmd_gridsearch.h"
#include "search/search.h"
#include "enrichers/enrichers.h"
#include <set>
namespace polymech {
/// Persists gridsearch run state and results to Postgres (Supabase).
/// NOTE(review): presumably the methods short-circuit when `enabled` is
/// false — confirm against the .cpp implementation.
struct PostgresStateStore {
  std::string run_id;    // primary key of the grid_search_runs row for this run
  std::string user_id;   // owner of the run / places written
  std::string parent_id; // optional: parent run ID for expand jobs
  bool enabled = false;  // master switch for all persistence calls
  /// Create the run row from the pipeline options.
  void init_run(const PipelineOptions &opts);
  /// Update the run's status column (e.g. running/complete/failed).
  void update_status(const std::string &status);
  /// Mark the run complete and store its result JSON.
  void complete_run(const std::string &result_json);
  /// Mark the run failed and store the error message.
  void fail_run(const std::string &error_msg);
  /// Upsert a batch of search results into the places table.
  void upsert_places(const std::vector<search::MapResult> &places);
  /// Write enrichment data (emails, pages) for one enriched place.
  void update_place_enrichment(const enrichers::EnrichedNode &enode);
  /// Query places table in chunks to find place_ids that already have meta (enriched).
  /// Returns set of place_ids that should be skipped during enrichment.
  std::set<std::string> filter_already_enriched(const std::vector<std::string> &place_ids);
};
} // namespace polymech

View File

@ -0,0 +1,88 @@
#pragma once
#include <CLI/CLI.hpp>
#include <functional>
#include <string>
#include <memory>
#include <atomic>
#include <iosfwd> // std::ostream is used by value-less reference in run_pipeline
#include "search/search.h"
#include "grid/grid.h"
#include <vector>
namespace polymech {
/// Escape a string for safe embedding inside a JSON string literal.
std::string json_escape(const std::string &s);
/// One GADM area selected for the run.
struct AreaDef {
  std::string gid;  // GADM GID, e.g. "ESP.1.1_1"
  std::string name; // human-readable area name
  int level;        // GADM administrative level
};
/// A search result tagged with the grid area it was found in.
struct AccumulatedResult {
  search::MapResult result;
  std::string grid_area;
  std::string grid_gid;
};
/// Full configuration for one gridsearch pipeline invocation (CLI or IPC).
struct PipelineOptions {
  std::vector<AreaDef> areas;
  grid::GridOptions grid_opts;
  std::string search_query;
  std::string search_domain = "google.com";
  std::string search_language = "en";
  std::string search_country;
  int search_limit = 20;
  int search_zoom = 13;
  bool dry_run = false;    // generate grid only, skip SerpAPI search
  bool enrich = false;     // run enrichment pipeline after search
  std::string config_path = "config/postgres.toml";
  std::string cache_dir = "cache/gadm";
  bool persistence_postgres = false; // persist run data via Postgres
  bool daemon_mode = false;
  std::string job_id;
  std::string default_user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
  search::SystemTuningOptions tuning;
  // Cooperative cancellation flag, shared with the IPC/UDS layer.
  std::shared_ptr<std::atomic<bool>> cancel_token;
  std::vector<std::string> filter_types; // if non-empty, only locations matching ≥1 type pass
  std::vector<std::string> exclude_types; // if non-empty, drop locations matching any
  bool no_cache = false; // skip pre-enrich dedup — force re-enrichment
  std::string parent_id; // if set, this run is an "expand" child of another run
};
/// Optional callbacks for streaming progress events (used in IPC mode).
/// When nullptr / empty, the pipeline runs silently (CLI mode).
struct GridsearchCallbacks {
  /// Emit a progress event. `type` is one of:
  ///   grid-ready, waypoint-start, area, location,
  ///   enrich-start, node, node-error, nodePage
  /// `json` is the raw JSON payload string.
  std::function<void(const std::string& type, const std::string& json)> onEvent;
};
/// Register the `gridsearch` subcommand on `app`; returns the subcommand.
CLI::App* setup_cmd_gridsearch(CLI::App& app);
/// CLI entry point (standalone mode — reads static vars set by CLI11).
int run_cmd_gridsearch();
/// IPC entry point — parse `payload` JSON, run the pipeline, emit events via `cb`.
/// Returns 0 on success.
int run_cmd_gridsearch_ipc(const std::string& payload,
                           const std::string& jobId,
                           const GridsearchCallbacks& cb,
                           bool daemon_mode = false,
                           const std::string& daemon_uid = "");
/// Core Pipeline
int run_pipeline(const PipelineOptions &opts, std::ostream *file_out,
                 const GridsearchCallbacks &cb);
/// UDS entry point — starts a persistent AF_UNIX / Named Pipe server that processes
/// concurrent jobs using Moodycamel ConcurrentQueue and Taskflow executor.
int run_cmd_gridsearch_uds(const std::string& pipe_path,
                           bool daemon_mode,
                           const std::string& daemon_uid);
} // namespace polymech

View File

@ -0,0 +1,60 @@
#pragma once
#include "enrichers/enrichers.h"
#include "grid/grid.h"
#include "search/search.h"
#include <cstdint>
#include <string>
#include <vector>
namespace polymech {
struct PipelineOptions;
}
// JSON payload builders for gridsearch IPC events. Each helper returns the
// raw JSON string for one event frame; names mirror the IPC event types.
namespace polymech::serialize {
/// grid-ready event payload
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints);
/// waypoint-start event payload
std::string waypoint_start(const grid::Waypoint& wp, int index, int total);
/// location event payload (per search result)
std::string location(const search::MapResult& r, int step);
/// waypoint-finish event payload (waypoint done)
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls);
/// area-start event payload
std::string area_start(const std::string& area_gid, const std::string& area_name);
/// area-finish event payload
std::string area_finish(const std::string& area_gid);
/// enrich-start event payload
std::string enrich_start(int locationCount);
/// nodePage event payload (per page error)
std::string node_page(const enrichers::PageError& pe, const std::string& placeId);
/// node-error event payload
std::string node_error(const enrichers::EnrichedNode& node);
/// node event payload (enriched location)
std::string node(const enrichers::EnrichedNode& node);
/// job_result summary (with enrichment)
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
                       int totalEmails, int totalPagesScraped, int freshApiCalls,
                       int waypointCount, int validCells, int skippedCells,
                       int totalResults, const std::vector<std::string>& enrichResults,
                       double totalScannedSqKm, double totalPopulation);
/// job_result summary (search only, no enrichment)
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
                                   int freshApiCalls, int waypointCount, int validCells,
                                   int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
                                   double totalScannedSqKm, double totalPopulation);
} // namespace polymech::serialize

View File

@ -0,0 +1,269 @@
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <set>
#include <ctime>
#include <iomanip>
#include <sstream>
#include <rapidjson/document.h>
#include <CLI/CLI.hpp>
#include <toml++/toml.hpp>
#include "html/html.h"
#include "http/http.h"
#include "ipc/ipc.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
#include "json/json.h"
#include "gadm_reader/gadm_reader.h"
#include "grid/grid.h"
#include "search/search.h"
#include "enrichers/enrichers.h"
#include "cmd_gridsearch.h"
#ifndef PROJECT_VERSION
#define PROJECT_VERSION "0.1.0"
#endif
// CLI entry point. Declares every subcommand up front, parses argv once,
// initializes logging (stderr in worker mode so stdout stays clean for IPC
// frames), then dispatches to exactly one subcommand handler.
int main(int argc, char *argv[]) {
  CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"};
  app.set_version_flag("-v,--version", PROJECT_VERSION);
  std::string log_level = "info";
  app.add_option("--log-level", log_level, "Set log level (debug/info/warn/error)")->default_val("info");
  // Subcommand: parse HTML
  std::string html_input;
  auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements");
  parse_cmd->add_option("html", html_input, "HTML string to parse")->required();
  // Subcommand: select from HTML
  std::string select_input;
  std::string selector;
  auto *select_cmd =
      app.add_subcommand("select", "CSS-select elements from HTML");
  select_cmd->add_option("html", select_input, "HTML string")->required();
  select_cmd->add_option("selector", selector, "CSS selector")->required();
  // Subcommand: config — read a TOML file
  std::string config_path;
  auto *config_cmd =
      app.add_subcommand("config", "Read and display a TOML config file");
  config_cmd->add_option("file", config_path, "Path to TOML file")->required();
  // Subcommand: fetch — HTTP GET a URL
  std::string fetch_url;
  auto *fetch_cmd =
      app.add_subcommand("fetch", "HTTP GET a URL and print the response");
  fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required();
  // Subcommand: json — prettify JSON
  std::string json_input;
  auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string");
  json_cmd->add_option("input", json_input, "JSON string")->required();
  // Subcommand: db — connect to Supabase and query
  std::string db_config_path = "config/postgres.toml";
  std::string db_table;
  int db_limit = 10;
  auto *db_cmd =
      app.add_subcommand("db", "Connect to Supabase and query a table");
  db_cmd->add_option("-c,--config", db_config_path, "TOML config path")
      ->default_val("config/postgres.toml");
  db_cmd->add_option("table", db_table, "Table to query (optional)");
  db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);
  // Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
  bool daemon_mode = false;
  std::string daemon_uid;
  std::string worker_config = "config/postgres.toml";
  std::string uds_path;
  auto *worker_cmd = app.add_subcommand(
      "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
  worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)");
  worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml");
  worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)");
  worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path");
  // Subcommand: gridsearch — Run a full gridsearch pipeline
  auto* gs_cmd = polymech::setup_cmd_gridsearch(app);
  CLI11_PARSE(app, argc, argv);
  // Worker mode uses stderr for logs to keep stdout clean for IPC frames
  if (worker_cmd->parsed()) {
    if (!uds_path.empty()) {
      logger::init_uds("polymech-uds", log_level, "../logs/uds.json");
    } else {
      logger::init_stderr("polymech-worker", log_level);
    }
  } else {
    logger::init("polymech-cli", log_level);
  }
  // ── worker mode ─────────────────────────────────────────────────────────
  if (worker_cmd->parsed()) {
    logger::info("Worker mode: listening on stdin");
    if (daemon_mode) {
      // Pre-warm the Postgres connection so the first job doesn't pay
      // the connection cost.
      logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid));
      auto cfg = search::load_config(worker_config);
      postgres::Config pcfg;
      pcfg.supabase_url = cfg.supabase_url;
      pcfg.supabase_key = cfg.supabase_service_key;
      postgres::init(pcfg);
    }
    if (!uds_path.empty()) {
      // UDS mode runs its own server loop and never returns to the
      // stdin-based loop below.
      logger::info("Worker mode: UDS Server active on " + uds_path);
      int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid);
      return rc;
    }
    // Send a "ready" message so the orchestrator knows we're alive
    ipc::write_message({"0", "ready", "{}"});
    // Blocking stdin read loop: one length-prefixed JSON message per request.
    while (true) {
      ipc::Message req;
      if (!ipc::read_message(req)) {
        logger::info("Worker: stdin closed, exiting");
        break;
      }
      logger::debug("Worker recv: type=" + req.type + " id=" + req.id);
      if (req.type == "ping") {
        ipc::write_message({req.id, "pong", "{}"});
      } else if (req.type == "gridsearch") {
        logger::info("Worker: gridsearch job received");
        // Build callbacks that emit IPC events.
        // Progress events use id "0" (unmatched → event for orchestrator).
        // The final job_result uses the original req.id so the promise resolves.
        std::string req_id = req.id;
        polymech::GridsearchCallbacks cb;
        cb.onEvent = [&req_id](const std::string& type, const std::string& json) {
          if (type == "job_result") {
            ipc::write_message({req_id, "job_result", json});
          } else {
            ipc::write_message({"0", type, json});
          }
        };
        int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid);
        if (rc != 0) {
          ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"});
        }
      } else if (req.type == "job") {
        // Stub: echo the payload back as job_result
        ipc::write_message({req.id, "job_result", req.payload});
      } else if (req.type == "shutdown") {
        ipc::write_message({req.id, "shutdown_ack", "{}"});
        logger::info("Worker: shutdown requested, exiting");
        break;
      } else {
        // Unknown type — respond with error
        ipc::write_message(
            {req.id, "error",
             "{\"message\":\"unknown type: " + req.type + "\"}"});
      }
    }
    return 0;
  }
  // ── existing subcommands ────────────────────────────────────────────────
  if (parse_cmd->parsed()) {
    auto elements = html::parse(html_input);
    logger::info("Parsed " + std::to_string(elements.size()) + " elements");
    for (const auto &el : elements) {
      std::cout << "<" << el.tag << "> " << el.text << "\n";
    }
    return 0;
  }
  if (select_cmd->parsed()) {
    auto matches = html::select(select_input, selector);
    logger::info("Matched " + std::to_string(matches.size()) + " elements");
    for (const auto &m : matches) {
      std::cout << m << "\n";
    }
    return 0;
  }
  if (config_cmd->parsed()) {
    try {
      auto tbl = toml::parse_file(config_path);
      logger::info("Loaded config: " + config_path);
      std::cout << tbl << "\n";
    } catch (const toml::parse_error &err) {
      logger::error("TOML parse error: " + std::string(err.what()));
      return 1;
    }
    return 0;
  }
  if (fetch_cmd->parsed()) {
    auto resp = http::get(fetch_url);
    logger::info("HTTP " + std::to_string(resp.status_code) + " from " +
                 fetch_url);
    // Pretty-print only when the body is valid JSON; otherwise dump raw.
    if (json::is_valid(resp.body)) {
      std::cout << json::prettify(resp.body) << "\n";
    } else {
      std::cout << resp.body << "\n";
    }
    return 0;
  }
  if (json_cmd->parsed()) {
    if (!json::is_valid(json_input)) {
      logger::error("Invalid JSON input");
      return 1;
    }
    std::cout << json::prettify(json_input) << "\n";
    return 0;
  }
  if (db_cmd->parsed()) {
    try {
      auto cfg = toml::parse_file(db_config_path);
      postgres::Config pg_cfg;
      pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
      pg_cfg.supabase_key =
          cfg["supabase"]["publishable_key"].value_or(std::string(""));
      postgres::init(pg_cfg);
      auto status = postgres::ping();
      logger::info("Supabase: " + status);
      if (!db_table.empty()) {
        auto result = postgres::query(db_table, "*", "", db_limit);
        if (json::is_valid(result)) {
          std::cout << json::prettify(result) << "\n";
        } else {
          std::cout << result << "\n";
        }
      }
    } catch (const std::exception &e) {
      logger::error(std::string("db error: ") + e.what());
      return 1;
    }
    return 0;
  }
  // ── gridsearch subcommand ──────────────────────────────────────────────
  if (gs_cmd->parsed()) {
    return polymech::run_cmd_gridsearch();
  }
  // No subcommand — show help
  std::cout << app.help() << "\n";
  return 0;
}

View File

@ -0,0 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>
namespace polymech {
// Current resident set size (RSS) of this process, in megabytes.
size_t get_current_rss_mb();
// CPU time consumed by this process, in milliseconds.
// NOTE(review): user-vs-system composition is implementation-defined —
// confirm in sys_metrics.cpp.
uint64_t get_cpu_time_ms();
}

View File

@ -0,0 +1,80 @@
# Test targets
include(CTest)
include(Catch)
# pthread is required on Linux for Catch2 tests
find_package(Threads REQUIRED)
# Unit tests one per package
add_executable(test_logger unit/test_logger.cpp)
target_link_libraries(test_logger PRIVATE Catch2::Catch2WithMain logger Threads::Threads)
catch_discover_tests(test_logger)
add_executable(test_html unit/test_html.cpp)
target_link_libraries(test_html PRIVATE Catch2::Catch2WithMain html Threads::Threads)
catch_discover_tests(test_html)
add_executable(test_postgres unit/test_postgres.cpp)
target_link_libraries(test_postgres PRIVATE Catch2::Catch2WithMain postgres Threads::Threads)
catch_discover_tests(test_postgres)
add_executable(test_json unit/test_json.cpp)
target_link_libraries(test_json PRIVATE Catch2::Catch2WithMain json Threads::Threads)
catch_discover_tests(test_json)
add_executable(test_http unit/test_http.cpp)
target_link_libraries(test_http PRIVATE Catch2::Catch2WithMain http Threads::Threads)
catch_discover_tests(test_http)
# Functional test end-to-end CLI
add_executable(test_functional functional/test_cli.cpp)
target_link_libraries(test_functional PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json Threads::Threads)
catch_discover_tests(test_functional)
# E2E test real Supabase connection (requires config/postgres.toml + network)
add_executable(test_supabase e2e/test_supabase.cpp)
target_link_libraries(test_supabase PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres json Threads::Threads)
catch_discover_tests(test_supabase WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_postgres_live functional/test_postgres_live.cpp)
target_link_libraries(test_postgres_live PRIVATE Catch2::Catch2WithMain postgres search json logger tomlplusplus::tomlplusplus Threads::Threads)
catch_discover_tests(test_postgres_live WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_polymech unit/test_polymech.cpp)
target_link_libraries(test_polymech PRIVATE Catch2::Catch2WithMain polymech postgres Threads::Threads)
catch_discover_tests(test_polymech)
# E2E test polymech fetch_pages from live Supabase
add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp)
target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json Threads::Threads)
catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_gridsearch_ipc e2e/test_gridsearch_ipc.cpp ../src/cmd_gridsearch.cpp ../src/cmd_gridsearch-filters.cpp ../src/cmd_gridsearch-uds.cpp ../src/cmd_gridsearch-postgres.cpp ../src/gridsearch_serialize.cpp ../src/sys_metrics.cpp)
target_link_libraries(test_gridsearch_ipc PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers Threads::Threads)
target_include_directories(test_gridsearch_ipc PRIVATE ${CMAKE_SOURCE_DIR}/src ${asio_SOURCE_DIR}/asio/include ${taskflow_SOURCE_DIR} ${concurrentqueue_SOURCE_DIR})
target_compile_definitions(test_gridsearch_ipc PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
catch_discover_tests(test_gridsearch_ipc WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_ipc unit/test_ipc.cpp)
target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc Threads::Threads)
catch_discover_tests(test_ipc)
add_executable(test_geo unit/test_geo.cpp)
target_link_libraries(test_geo PRIVATE Catch2::Catch2WithMain geo Threads::Threads)
catch_discover_tests(test_geo)
add_executable(test_gadm_reader unit/test_gadm_reader.cpp)
target_link_libraries(test_gadm_reader PRIVATE Catch2::Catch2WithMain gadm_reader Threads::Threads)
catch_discover_tests(test_gadm_reader WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_grid unit/test_grid.cpp)
target_link_libraries(test_grid PRIVATE Catch2::Catch2WithMain grid Threads::Threads)
catch_discover_tests(test_grid WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_search unit/test_search.cpp)
target_link_libraries(test_search PRIVATE Catch2::Catch2WithMain search Threads::Threads)
catch_discover_tests(test_search WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_enrichers unit/test_enrichers.cpp)
target_link_libraries(test_enrichers PRIVATE Catch2::Catch2WithMain enrichers Threads::Threads)
catch_discover_tests(test_enrichers)

View File

@ -0,0 +1,144 @@
#include <catch2/catch_test_macros.hpp>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
#include <rapidjson/document.h>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include "../../src/cmd_gridsearch.h"
#include "logger/logger.h"
// ── Helpers ──────────────────────────────────────────────────────────────────
/// Slurp an entire file into a string; returns "" when the file cannot be
/// opened.
static std::string read_file_contents(const std::string &path) {
  std::ifstream in(path);
  if (!in.is_open())
    return "";
  std::ostringstream buffer;
  buffer << in.rdbuf();
  return buffer.str();
}
/// Read a JSON config file and inject test-safe overrides:
/// - configPath = "config/postgres.toml"
/// - enrich = false (no live HTTP / thread-pool in tests)
/// - persistencePostgres = false
static std::string load_test_payload(const std::string &config_path) {
std::string raw = read_file_contents(config_path);
if (raw.empty())
return "";
rapidjson::Document doc;
doc.Parse(raw.c_str());
if (doc.HasParseError())
return "";
auto &alloc = doc.GetAllocator();
// Remove-then-add ensures no double-add assertion from rapidjson
auto inject_bool = [&](const char *key, bool val) {
if (doc.HasMember(key))
doc.RemoveMember(key);
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val), alloc);
};
auto inject_str = [&](const char *key, const char *val) {
if (doc.HasMember(key))
doc.RemoveMember(key);
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val, alloc),
alloc);
};
inject_str("configPath", "config/postgres.toml");
inject_str("cacheDir", "../../packages/gadm/cache/gadm"); // server/cache/gadm
inject_bool("enrich", false); // no live enrichment in tests
inject_bool("persistencePostgres", false);
rapidjson::StringBuffer buf;
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
doc.Accept(writer);
return buf.GetString();
}
// ── Tests
// ─────────────────────────────────────────────────────────────────────
TEST_CASE("E2E: Gridsearch Country Boundary Filter (Lamu/KEN)",
          "[e2e][gridsearch][boundary]") {
  REQUIRE_NOTHROW(logger::init("test-gridsearch"));

  // Lamu, Kenya — SerpAPI often returns US results for obscure African
  // regions; boundary_KEN_0.json should filter those out.
  const std::string payload = load_test_payload("config/gridsearch-lamu.json");
  REQUIRE(!payload.empty());

  std::vector<std::string> locations;
  int errors = 0;
  polymech::GridsearchCallbacks callbacks;
  callbacks.onEvent = [&](const std::string &type, const std::string &json) {
    if (type == "location") {
      locations.push_back(json);
    } else if (type == "error") {
      ++errors;
      std::cout << "[ERROR EVENT]: " << json << "\n";
    }
  };

  const int rc =
      polymech::run_cmd_gridsearch_ipc(payload, "test-lamu-job", callbacks,
                                       false, "");
  REQUIRE(rc == 0);
  REQUIRE(errors == 0);

  // Sanity-check longitudes only: Kenya sits roughly at 34..42°E while the
  // USA sits around -130..-60°E, so anything outside [20, 55] is counted as
  // leaking past the boundary filter.
  int outside_kenya = 0;
  for (const auto &loc_json : locations) {
    rapidjson::Document loc;
    loc.Parse(loc_json.c_str());
    if (loc.HasParseError())
      continue;
    if (!loc.HasMember("gps") || !loc["gps"].IsObject())
      continue;
    const double lng =
        loc["gps"].HasMember("lng") ? loc["gps"]["lng"].GetDouble() : 0;
    if (lng < 20.0 || lng > 55.0)
      ++outside_kenya;
  }
  CHECK(outside_kenya == 0);
  std::cout << "Lamu boundary test: " << locations.size()
            << " locations kept, " << outside_kenya << " outside Kenya.\n";
}
TEST_CASE("E2E: Gridsearch Type Filter (Sample/ABW)",
          "[e2e][gridsearch][filter]") {
  const std::string payload =
      load_test_payload("config/gridsearch-sample.json");
  REQUIRE(!payload.empty());

  int errors = 0;
  std::vector<std::string> locations;
  polymech::GridsearchCallbacks callbacks;
  callbacks.onEvent = [&](const std::string &type, const std::string &json) {
    if (type == "location")
      locations.push_back(json);
    else if (type == "error")
      ++errors;
  };

  const int rc = polymech::run_cmd_gridsearch_ipc(payload, "test-sample-job",
                                                  callbacks, false, "");
  REQUIRE(rc == 0);
  REQUIRE(errors == 0);
  std::cout << "Sample (ABW) type filter test: " << locations.size()
            << " locations.\n";
}

View File

@ -0,0 +1,34 @@
#include <catch2/catch_test_macros.hpp>
#include <toml++/toml.hpp>
#include "logger/logger.h"
#include "polymech/polymech.h"
#include "postgres/postgres.h"
#include "json/json.h"
// ── E2E: fetch pages from live Supabase ─────────────────────────────────
/// Fetches all pages through polymech::fetch_pages() against the live
/// Supabase instance configured in config/postgres.toml and verifies the
/// result is a JSON array.
TEST_CASE("E2E: fetch all pages", "[e2e]") {
  logger::init("e2e-polymech");

  // Load config — path is relative to the CTest working directory.
  auto cfg = toml::parse_file("config/postgres.toml");
  postgres::Config pg_cfg;
  pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
  pg_cfg.supabase_key =
      cfg["supabase"]["publishable_key"].value_or(std::string(""));
  REQUIRE(!pg_cfg.supabase_url.empty());
  REQUIRE(!pg_cfg.supabase_key.empty());
  postgres::init(pg_cfg);

  auto result = polymech::fetch_pages();

  // Should return valid JSON. Guard against an empty string before the
  // front()/back() checks: std::string::front()/back() on an empty string
  // is undefined behavior.
  REQUIRE(json::is_valid(result));
  REQUIRE(!result.empty());

  // Should be an array (even if empty)
  REQUIRE(result.front() == '[');
  REQUIRE(result.back() == ']');
}

View File

@ -0,0 +1,50 @@
#include <catch2/catch_test_macros.hpp>
#include <fstream>
#include <sstream>
#include <toml++/toml.hpp>
#include "logger/logger.h"
#include "postgres/postgres.h"
#include "json/json.h"
// ── E2E: Supabase connect via config/postgres.toml ──────────────────────────
TEST_CASE("E2E: connect to Supabase and ping", "[e2e][postgres]") {
  logger::init("e2e-test");

  // Read credentials from config/postgres.toml; the path is relative to the
  // CTest working directory (project root).
  auto table = toml::parse_file("config/postgres.toml");
  postgres::Config conn;
  conn.supabase_url = table["supabase"]["url"].value_or(std::string(""));
  conn.supabase_key =
      table["supabase"]["publishable_key"].value_or(std::string(""));
  REQUIRE(!conn.supabase_url.empty());
  REQUIRE(!conn.supabase_key.empty());

  postgres::init(conn);
  const auto status = postgres::ping();
  logger::info("E2E ping result: " + status);
  CHECK(status == "ok");
}
/// Queries the `profiles` table (3 rows max) and checks the response is
/// valid JSON.
TEST_CASE("E2E: query profiles table", "[e2e][postgres]") {
  logger::init("e2e-test");
  auto cfg = toml::parse_file("config/postgres.toml");
  postgres::Config pg_cfg;
  pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
  pg_cfg.supabase_key =
      cfg["supabase"]["publishable_key"].value_or(std::string(""));
  // Fail fast with a clear assertion when credentials are missing (the ping
  // test does this too) instead of issuing a doomed HTTP request.
  REQUIRE(!pg_cfg.supabase_url.empty());
  REQUIRE(!pg_cfg.supabase_key.empty());
  postgres::init(pg_cfg);
  auto result = postgres::query("profiles", "id,username", "", 3);
  logger::info("E2E query result: " + result);
  // Should be valid JSON array
  CHECK(json::is_valid(result));
}

View File

@ -0,0 +1,74 @@
#include <catch2/catch_test_macros.hpp>
#include <fstream>
#include <sstream>
#include <toml++/toml.hpp>
#include "html/html.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
// ── Functional: full pipeline tests ─────────────────────────────────────────
TEST_CASE("Full pipeline: parse HTML and select", "[functional]") {
  const std::string input =
      "<html><body>"
      "<h1>Title</h1>"
      "<ul><li class=\"item\">A</li><li class=\"item\">B</li></ul>"
      "</body></html>";

  // Parsing must yield at least one element.
  REQUIRE(!html::parse(input).empty());

  // Selecting by class must find both list items, in document order.
  const auto items = html::select(input, ".item");
  REQUIRE(items.size() == 2);
  CHECK(items[0] == "A");
  CHECK(items[1] == "B");
}
/// Writes a temporary TOML file, parses it back, serializes it again, and
/// guarantees the temp file is removed even if parsing throws or a check
/// aborts the test.
TEST_CASE("Full pipeline: TOML config round-trip", "[functional]") {
  const std::string toml_content = "[server]\n"
                                   "host = \"localhost\"\n"
                                   "port = 8080\n"
                                   "\n"
                                   "[database]\n"
                                   "name = \"test_db\"\n";
  const std::string tmp_path = "test_config_tmp.toml";

  // RAII cleanup: the original only removed the file at the end of the test
  // body, leaking it whenever toml::parse_file threw or a REQUIRE failed.
  struct FileGuard {
    const std::string &path;
    ~FileGuard() { std::remove(path.c_str()); }
  } guard{tmp_path};

  {
    std::ofstream out(tmp_path);
    REQUIRE(out.is_open());
    out << toml_content;
  }

  // Parse it
  auto tbl = toml::parse_file(tmp_path);
  CHECK(tbl["server"]["host"].value_or("") == std::string("localhost"));
  CHECK(tbl["server"]["port"].value_or(0) == 8080);
  CHECK(tbl["database"]["name"].value_or("") == std::string("test_db"));

  // Serialize back and spot-check the output.
  std::ostringstream ss;
  ss << tbl;
  CHECK(ss.str().find("localhost") != std::string::npos);
}
TEST_CASE("Full pipeline: logger + postgres integration", "[functional]") {
  REQUIRE_NOTHROW(logger::init("functional-test"));

  // Initialize postgres with a dummy config; no request is issued here, so
  // init must succeed without a live connection.
  postgres::Config dummy;
  dummy.supabase_url = "https://example.supabase.co";
  dummy.supabase_key = "test-key";
  REQUIRE_NOTHROW(postgres::init(dummy));
  REQUIRE_NOTHROW(logger::info("Functional test: postgres init ok"));
}

View File

@ -0,0 +1,81 @@
#include <catch2/catch_test_macros.hpp>
#include "postgres/postgres.h"
#include "search/search.h"
#include "json/json.h"
#include "logger/logger.h"
#include <toml++/toml.h>
// Note: This test requires a valid config/postgres.toml pointing to a Supabase instance.
// We test against an arbitrary table 'test_items' or standard table.
// In this case we'll test against `grid_search_runs` since we know it exists,
// using a dummy uuid for testing.
// DO NOT RUN UNLESS CONFIGURED.
/// Live CRUD round-trip (insert → query → update → upsert → delete) against a
/// real Supabase instance via the postgres wrapper.
/// Self-skips (early return with WARN) when config/postgres.toml is absent or
/// the [supabase] url / service_key entries are empty.
/// NOTE(review): writes to the real `grid_search_runs` table with a fixed
/// dummy UUID; the row is deleted both up-front and at the end, but confirm
/// no DB triggers fire on these writes before pointing this at production.
TEST_CASE("Postgres Live Operations", "[postgres_live]") {
// Load config
std::string supabase_url;
std::string supabase_key;
try {
auto config = toml::parse_file("config/postgres.toml");
supabase_url = config["supabase"]["url"].value_or("");
// service_key (not the publishable key) — writes require elevated access.
supabase_key = config["supabase"]["service_key"].value_or("");
} catch (const std::exception &e) {
WARN("Skipping postgres live tests. Config missing or invalid: " << e.what());
return;
}
if (supabase_url.empty() || supabase_key.empty()) {
WARN("Skipping postgres live tests. Supabase credentials missing.");
return;
}
postgres::Config pg_cfg;
pg_cfg.supabase_url = supabase_url;
pg_cfg.supabase_key = supabase_key;
postgres::init(pg_cfg);
REQUIRE(postgres::ping() == "ok");
// Fixed IDs so reruns touch the same row; the section cleans up after itself.
std::string test_id = "00000000-0000-0000-0000-0000000000cc";
std::string user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
SECTION("Insert, Query, Update, Upsert, Delete") {
// 1. Clean up first just in case
postgres::del("grid_search_runs", "id=eq." + test_id);
// 2. Insert
std::string insert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "test_run", "status": "searching", "request": {}})";
std::string res1 = postgres::insert("grid_search_runs", insert_body);
// 3. Query
std::string res2 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
WARN("Insert Result: " << res1);
WARN("Query Result: " << res2);
REQUIRE(json::is_valid(res2));
REQUIRE(res2.find("test_run") != std::string::npos);
// 4. Update
std::string update_body = R"({"status": "enriching"})";
std::string res3 = postgres::update("grid_search_runs", update_body, "id=eq." + test_id);
REQUIRE(json::is_valid(res3));
REQUIRE(res3.find("error") == std::string::npos);
// 5. Upsert
std::string upsert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "upsert_run", "status": "complete", "request": {}})";
std::string res4 = postgres::upsert("grid_search_runs", upsert_body, "id");
REQUIRE(res4.find("error") == std::string::npos);
// Query again to verify upsert
std::string res5 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
REQUIRE(res5.find("upsert_run") != std::string::npos);
// 6. Delete
std::string res6 = postgres::del("grid_search_runs", "id=eq." + test_id);
REQUIRE(json::is_valid(res6));
// Verify deleted
std::string res7 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
REQUIRE(res7 == "[]");
}
}

View File

@ -0,0 +1,115 @@
#include <catch2/catch_test_macros.hpp>
#include "enrichers/enrichers.h"
using namespace enrichers;
// ── is_likely_email ─────────────────────────────────────────────────────────
TEST_CASE("is_likely_email: valid emails", "[enrichers]") {
  for (const char *addr : {"info@example.com", "john.doe@company.co.uk",
                           "contact@recycling-firm.de", "hello@my-domain.org"}) {
    CHECK(is_likely_email(addr));
  }
}

TEST_CASE("is_likely_email: rejects non-emails", "[enrichers]") {
  for (const char *addr : {"", "not-an-email", "@no-user.com", "user@"}) {
    CHECK_FALSE(is_likely_email(addr));
  }
}

TEST_CASE("is_likely_email: rejects asset extensions", "[enrichers]") {
  // "Emails" whose domain part is really an asset filename must be rejected.
  for (const char *addr : {"logo@site.png", "icon@site.svg", "style@site.css",
                           "script@site.js", "photo@site.jpg",
                           "photo@site.webp"}) {
    CHECK_FALSE(is_likely_email(addr));
  }
}

TEST_CASE("is_likely_email: rejects placeholder/hash patterns", "[enrichers]") {
  for (const char *addr : {"user@example.com", "test@test.com",
                           "a3f2b@hash.com", "your@email.com",
                           "email@email.com", "name@domain.com"}) {
    CHECK_FALSE(is_likely_email(addr));
  }
}
// ── extract_emails ──────────────────────────────────────────────────────────
/// extract_emails must find every address in free text.
TEST_CASE("extract_emails: finds emails in text", "[enrichers]") {
  auto emails = extract_emails("Contact us at info@example.org or sales@company.com");
  CHECK(emails.size() >= 2);
  bool found_info = false, found_sales = false;
  for (auto& e : emails) {
    if (e == "info@example.org") found_info = true;
    if (e == "sales@company.com") found_sales = true;
  }
  CHECK(found_info);
  CHECK(found_sales);
}

/// Repeated addresses collapse to a single entry.
TEST_CASE("extract_emails: deduplicates", "[enrichers]") {
  auto emails = extract_emails("info@acme.org info@acme.org info@acme.org");
  CHECK(emails.size() == 1);
}

TEST_CASE("extract_emails: empty text returns empty", "[enrichers]") {
  auto emails = extract_emails("");
  CHECK(emails.empty());
}

/// Asset-style matches (logo@site.png) are filtered; real addresses survive.
TEST_CASE("extract_emails: filters out asset emails", "[enrichers]") {
  auto emails = extract_emails("logo@site.png info@real-company.de");
  // Must be REQUIRE, not CHECK: CHECK continues on failure, and indexing
  // emails[0] on an empty vector below would be undefined behavior.
  REQUIRE(emails.size() == 1);
  CHECK(emails[0] == "info@real-company.de");
}
// ── resolve_url ─────────────────────────────────────────────────────────────
TEST_CASE("resolve_url: absolute stays absolute", "[enrichers]") {
  CHECK(resolve_url("https://example.com", "https://other.com/page") ==
        "https://other.com/page");
}

TEST_CASE("resolve_url: relative path", "[enrichers]") {
  // Root-relative paths replace everything after the host.
  CHECK(resolve_url("https://example.com/page", "/contact") ==
        "https://example.com/contact");
}

TEST_CASE("resolve_url: protocol-relative", "[enrichers]") {
  CHECK(resolve_url("https://example.com", "//other.com/foo") ==
        "https://other.com/foo");
}

TEST_CASE("resolve_url: relative without slash", "[enrichers]") {
  // Bare filenames resolve relative to the base URL's directory.
  CHECK(resolve_url("https://example.com/dir/page", "about.html") ==
        "https://example.com/dir/about.html");
}

// ── status_string ───────────────────────────────────────────────────────────
TEST_CASE("status_string: covers all statuses", "[enrichers]") {
  const auto as_str = [](EnrichStatus st) {
    return std::string(status_string(st));
  };
  CHECK(as_str(EnrichStatus::OK) == "OK");
  CHECK(as_str(EnrichStatus::NO_EMAIL) == "NO_EMAIL");
  CHECK(as_str(EnrichStatus::META_TIMEOUT) == "META_TIMEOUT");
  CHECK(as_str(EnrichStatus::EMAIL_TIMEOUT) == "EMAIL_TIMEOUT");
  CHECK(as_str(EnrichStatus::FETCH_ERROR) == "FETCH_ERROR");
  CHECK(as_str(EnrichStatus::NO_PAGES) == "NO_PAGES");
  CHECK(as_str(EnrichStatus::ERROR) == "ERROR");
}

// ── EnrichConfig defaults ───────────────────────────────────────────────────
TEST_CASE("EnrichConfig: default values", "[enrichers]") {
  const EnrichConfig defaults;
  CHECK(defaults.meta_timeout_ms == 20000);
  CHECK(defaults.email_timeout_ms == 30000);
  CHECK(defaults.email_page_timeout_ms == 10000);
  CHECK(defaults.email_max_pages == 8);
  CHECK(defaults.email_abort_after == 1);
  CHECK_FALSE(defaults.contact_patterns.empty());
  CHECK_FALSE(defaults.probe_paths.empty());
}

View File

@ -0,0 +1,163 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "gadm_reader/gadm_reader.h"
#include <cmath>
using namespace gadm;
using Catch::Matchers::WithinAbs;
using Catch::Matchers::WithinRel;
// ── Helper: fixtures path ───────────────────────────────────────────────────
// Tests are run with WORKING_DIRECTORY = CMAKE_SOURCE_DIR (server/cpp)
static const std::string CACHE_DIR = "cache/gadm";
// ── country_code ────────────────────────────────────────────────────────────
TEST_CASE("country_code: simple ISO3", "[gadm][util]") {
  REQUIRE(country_code("ABW") == "ABW");
}

TEST_CASE("country_code: dotted GID", "[gadm][util]") {
  // Everything before the first '.' is the ISO3 country code.
  REQUIRE(country_code("AFG.1.1_1") == "AFG");
  REQUIRE(country_code("ESP.6.1_1") == "ESP");
}

// ── infer_level ─────────────────────────────────────────────────────────────
TEST_CASE("infer_level: level 0 (country)", "[gadm][util]") {
  REQUIRE(infer_level("ABW") == 0);
  REQUIRE(infer_level("AFG") == 0);
}

TEST_CASE("infer_level: level 1", "[gadm][util]") {
  REQUIRE(infer_level("AFG.1_1") == 1);
}

TEST_CASE("infer_level: level 2", "[gadm][util]") {
  REQUIRE(infer_level("AFG.1.1_1") == 2);
}

TEST_CASE("infer_level: level 3", "[gadm][util]") {
  REQUIRE(infer_level("ESP.6.1.4_1") == 3);
}
// ── load_boundary_file: ABW level 0 ────────────────────────────────────────
TEST_CASE("Load ABW level 0: basic structure", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
REQUIRE(res.features.size() == 1);
const auto& f = res.features[0];
REQUIRE(f.gid == "ABW");
REQUIRE(f.name == "Aruba");
REQUIRE(f.level == 0);
REQUIRE(f.isOuter == true);
}
TEST_CASE("Load ABW level 0: has rings", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
const auto& f = res.features[0];
REQUIRE(f.rings.size() >= 1);
REQUIRE(f.rings[0].size() > 10); // ABW has ~55 coords
}
TEST_CASE("Load ABW level 0: GHS population data", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
const auto& f = res.features[0];
REQUIRE_THAT(f.ghsPopulation, WithinRel(104847.0, 0.01));
REQUIRE(f.ghsPopCenters.size() == 5);
// First pop center: [-70.04183, 12.53341, 104.0]
REQUIRE_THAT(f.ghsPopCenters[0][0], WithinAbs(-70.04183, 0.0001));
REQUIRE_THAT(f.ghsPopCenters[0][1], WithinAbs(12.53341, 0.0001));
REQUIRE_THAT(f.ghsPopCenters[0][2], WithinAbs(104.0, 0.1));
}
TEST_CASE("Load ABW level 0: GHS built data", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
const auto& f = res.features[0];
REQUIRE_THAT(f.ghsBuiltWeight, WithinRel(22900682.0, 0.01));
REQUIRE(f.ghsBuiltCenters.size() == 5);
REQUIRE_THAT(f.ghsBuiltCenter.lon, WithinAbs(-69.99304, 0.001));
REQUIRE_THAT(f.ghsBuiltCenter.lat, WithinAbs(12.51234, 0.001));
}
TEST_CASE("Load ABW level 0: computed bbox", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
const auto& f = res.features[0];
// ABW bbox should be roughly in the Caribbean
REQUIRE(f.bbox.minLon < -69.8);
REQUIRE(f.bbox.maxLon > -70.1);
REQUIRE(f.bbox.minLat > 12.4);
REQUIRE(f.bbox.maxLat < 12.7);
}
TEST_CASE("Load ABW level 0: computed area", "[gadm][file]") {
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
REQUIRE(res.error.empty());
const auto& f = res.features[0];
// Aruba is ~180 km²
REQUIRE_THAT(f.areaSqKm, WithinRel(180.0, 0.15)); // 15% tolerance
}
// ── load_boundary_file: AFG level 2 ────────────────────────────────────────
/// Checks the AFG.1.1_1 (Baharak, level 2) fixture loads with the expected
/// identity fields.
TEST_CASE("Load AFG.1.1_1 level 2: basic structure", "[gadm][file]") {
  auto res = load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
  REQUIRE(res.error.empty());
  REQUIRE(res.features.size() == 1);
  const auto& f = res.features[0];
  REQUIRE(f.gid == "AFG.1.1_1");
  REQUIRE(f.name == "Baharak");
  REQUIRE(f.level == 2);
}

/// Checks the fixture carries GHS population/built-up data.
TEST_CASE("Load AFG.1.1_1 level 2: has GHS data", "[gadm][file]") {
  auto res = load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
  REQUIRE(res.error.empty());
  // Guard before indexing: res.features[0] on an empty vector is UB. The
  // original only checked the error string before dereferencing.
  REQUIRE(!res.features.empty());
  const auto& f = res.features[0];
  REQUIRE(f.ghsPopCenters.size() == 5);
  REQUIRE(f.ghsBuiltCenters.size() == 5);
  REQUIRE(f.ghsPopulation > 0);
}
// ── load_boundary: path resolution ──────────────────────────────────────────
/// load_boundary resolves a plain ISO3 GID straight to its boundary file.
TEST_CASE("load_boundary: direct GID match", "[gadm][resolve]") {
  auto res = load_boundary("ABW", 0, CACHE_DIR);
  REQUIRE(res.error.empty());
  REQUIRE(res.features.size() == 1);
  REQUIRE(res.features[0].gid == "ABW");
}

/// Dotted sub-region GIDs resolve at the requested level.
TEST_CASE("load_boundary: sub-region GID", "[gadm][resolve]") {
  auto res = load_boundary("AFG.1.1_1", 2, CACHE_DIR);
  REQUIRE(res.error.empty());
  // Guard before indexing: res.features[0] on an empty vector is UB. The
  // original dereferenced without checking the vector size.
  REQUIRE(!res.features.empty());
  REQUIRE(res.features[0].gid == "AFG.1.1_1");
}

/// Unknown GIDs yield an error and no features.
TEST_CASE("load_boundary: missing file returns error", "[gadm][resolve]") {
  auto res = load_boundary("DOESNOTEXIST", 0, CACHE_DIR);
  REQUIRE(!res.error.empty());
  REQUIRE(res.features.empty());
}

// ── Error handling ──────────────────────────────────────────────────────────
TEST_CASE("load_boundary_file: nonexistent file", "[gadm][error]") {
  auto res = load_boundary_file("nonexistent.json");
  REQUIRE(!res.error.empty());
  REQUIRE(res.features.empty());
}

View File

@ -0,0 +1,209 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "geo/geo.h"
#include <cmath>
using namespace geo;
using Catch::Matchers::WithinAbs;
using Catch::Matchers::WithinRel;
// ── Distance ────────────────────────────────────────────────────────────────
TEST_CASE("Haversine distance: known reference values", "[geo][distance]") {
  // London → Paris is ~343 km great-circle.
  const Coord london{-0.1278, 51.5074};
  const Coord paris{2.3522, 48.8566};
  REQUIRE_THAT(distance_km(london, paris), WithinRel(343.5, 0.02)); // 2% tolerance

  // A point is at zero distance from itself.
  REQUIRE_THAT(distance_km(london, london), WithinAbs(0.0, 1e-10));

  // One degree of longitude at the equator is ~111.32 km.
  REQUIRE_THAT(distance_km(Coord{0, 0}, Coord{1, 0}), WithinRel(111.32, 0.01));
}

TEST_CASE("Haversine distance: antipodal points", "[geo][distance]") {
  // Pole to pole is half the circumference: ~20015 km.
  REQUIRE_THAT(distance_km(Coord{0, 90}, Coord{0, -90}),
               WithinRel(20015.0, 0.01));
}
}
// ── BBox ────────────────────────────────────────────────────────────────────
TEST_CASE("BBox of a simple triangle", "[geo][bbox]") {
  const std::vector<Coord> tri = {{0, 0}, {10, 5}, {5, 10}};
  const BBox box = bbox(tri);
  REQUIRE(box.minLon == 0.0);
  REQUIRE(box.minLat == 0.0);
  REQUIRE(box.maxLon == 10.0);
  REQUIRE(box.maxLat == 10.0);
}

TEST_CASE("BBox center", "[geo][bbox]") {
  BBox box{-10, -20, 10, 20};
  const Coord mid = box.center();
  REQUIRE(mid.lon == 0.0);
  REQUIRE(mid.lat == 0.0);
}

TEST_CASE("BBox union", "[geo][bbox]") {
  const std::vector<BBox> parts = {{0, 0, 5, 5}, {3, 3, 10, 10}};
  const BBox merged = bbox_union(parts);
  REQUIRE(merged.minLon == 0.0);
  REQUIRE(merged.minLat == 0.0);
  REQUIRE(merged.maxLon == 10.0);
  REQUIRE(merged.maxLat == 10.0);
}

TEST_CASE("BBox of empty ring returns zeros", "[geo][bbox]") {
  const std::vector<Coord> none;
  const BBox box = bbox(none);
  REQUIRE(box.minLon == 0.0);
  REQUIRE(box.maxLon == 0.0);
}
// ── Centroid ────────────────────────────────────────────────────────────────
TEST_CASE("Centroid of a square", "[geo][centroid]") {
  const std::vector<Coord> ring = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
  const Coord c = centroid(ring);
  REQUIRE_THAT(c.lon, WithinAbs(5.0, 1e-10));
  REQUIRE_THAT(c.lat, WithinAbs(5.0, 1e-10));
}

TEST_CASE("Centroid handles closed ring (duplicate first/last)", "[geo][centroid]") {
  // First and last vertex coincide; the duplicate must not skew the result.
  const std::vector<Coord> ring = {{0, 0}, {6, 0}, {3, 6}, {0, 0}};
  const Coord c = centroid(ring);
  // Mean of the 3 unique vertices: lon (0+6+3)/3 = 3, lat (0+0+6)/3 = 2.
  REQUIRE_THAT(c.lon, WithinAbs(3.0, 1e-10));
  REQUIRE_THAT(c.lat, WithinAbs(2.0, 1e-10));
}

// ── Area ────────────────────────────────────────────────────────────────────
TEST_CASE("Area of an equatorial 1x1 degree square", "[geo][area]") {
  // ~111.32 km × ~110.57 km ≈ 12,308 km².
  const std::vector<Coord> ring = {{0, 0}, {1, 0}, {1, 1}, {0, 1}, {0, 0}};
  REQUIRE_THAT(area_sq_km(ring), WithinRel(12308.0, 0.05)); // 5% tolerance
}

TEST_CASE("Area of a zero-size polygon is zero", "[geo][area]") {
  const std::vector<Coord> single_point = {{5, 5}};
  REQUIRE(area_sq_km(single_point) == 0.0);
}
// ── Point-in-polygon ────────────────────────────────────────────────────────
TEST_CASE("PIP: point inside a square", "[geo][pip]") {
  const std::vector<Coord> square = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
  REQUIRE(point_in_polygon({5, 5}, square) == true);
  REQUIRE(point_in_polygon({1, 1}, square) == true);
}

TEST_CASE("PIP: point outside a square", "[geo][pip]") {
  const std::vector<Coord> square = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
  REQUIRE(point_in_polygon({-1, 5}, square) == false);
  REQUIRE(point_in_polygon({15, 5}, square) == false);
}

TEST_CASE("PIP: point on edge is indeterminate but consistent", "[geo][pip]") {
  const std::vector<Coord> square = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
  // Edge behavior is implementation-defined; we only require no crash.
  (void)point_in_polygon({0, 5}, square);
  (void)point_in_polygon({5, 0}, square);
}

// ── Bearing ─────────────────────────────────────────────────────────────────
TEST_CASE("Bearing: due north", "[geo][bearing]") {
  REQUIRE_THAT(bearing_deg(Coord{0, 0}, Coord{0, 10}), WithinAbs(0.0, 0.1));
}

TEST_CASE("Bearing: due east", "[geo][bearing]") {
  REQUIRE_THAT(bearing_deg(Coord{0, 0}, Coord{10, 0}), WithinAbs(90.0, 0.5));
}

// ── Destination ─────────────────────────────────────────────────────────────
TEST_CASE("Destination: 100km north from equator", "[geo][destination]") {
  const Coord origin{0, 0};
  const Coord dest = destination(origin, 0.0, 100.0); // bearing 0° = due north
  REQUIRE_THAT(dest.lat, WithinRel(0.899, 0.02)); // ~0.9° of latitude
  REQUIRE_THAT(dest.lon, WithinAbs(0.0, 0.01));
}

TEST_CASE("Destination roundtrip: go 100km then measure distance", "[geo][destination]") {
  const Coord paris{2.3522, 48.8566};
  const Coord dest = destination(paris, 45.0, 100.0); // 100 km northeast
  REQUIRE_THAT(distance_km(paris, dest), WithinRel(100.0, 0.01));
}
// ── Square grid ─────────────────────────────────────────────────────────────
TEST_CASE("Square grid: generates cells within bbox", "[geo][grid]") {
  const BBox extent{0, 0, 1, 1}; // ~111 km × ~110 km
  const auto cells = square_grid(extent, 50.0); // 50 km cells → ~4 cells
  REQUIRE(cells.size() >= 4);
  // Every generated cell center must stay inside the extent.
  for (const auto &cell : cells) {
    REQUIRE(cell.lon >= extent.minLon);
    REQUIRE(cell.lon <= extent.maxLon);
    REQUIRE(cell.lat >= extent.minLat);
    REQUIRE(cell.lat <= extent.maxLat);
  }
}

TEST_CASE("Square grid: zero cell size returns empty", "[geo][grid]") {
  const BBox extent{0, 0, 10, 10};
  REQUIRE(square_grid(extent, 0.0).empty());
}

// ── Hex grid ────────────────────────────────────────────────────────────────
TEST_CASE("Hex grid: generates cells within bbox", "[geo][grid]") {
  const BBox extent{0, 0, 1, 1};
  const auto cells = hex_grid(extent, 50.0);
  REQUIRE(cells.size() >= 4);
  for (const auto &cell : cells) {
    REQUIRE(cell.lon >= extent.minLon);
    REQUIRE(cell.lon <= extent.maxLon);
    REQUIRE(cell.lat >= extent.minLat);
    REQUIRE(cell.lat <= extent.maxLat);
  }
}

TEST_CASE("Hex grid: has offset rows", "[geo][grid]") {
  // Extent large enough for multiple hex rows.
  const BBox extent{0, 0, 2, 2};
  const auto cells = hex_grid(extent, 30.0);
  // The exact hex layout is hard to validate; just require several cells.
  if (cells.size() >= 3) {
    REQUIRE(cells.size() > 2);
  }
}

// ── Viewport estimation ─────────────────────────────────────────────────────
TEST_CASE("Viewport estimation at equator zoom 14", "[geo][viewport]") {
  // At zoom 14 on the equator a map viewport covers roughly 72 km².
  REQUIRE_THAT(estimate_viewport_sq_km(0.0, 14),
               WithinRel(71.7, 0.15)); // 15% tolerance
}

TEST_CASE("Viewport estimation: higher zoom = smaller area", "[geo][viewport]") {
  // Zooming in must shrink the viewport footprint.
  REQUIRE(estimate_viewport_sq_km(40.0, 16) < estimate_viewport_sq_km(40.0, 14));
}

View File

@ -0,0 +1,235 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "grid/grid.h"
#include "gadm_reader/gadm_reader.h"
#include <cmath>
#include <set>
using Catch::Matchers::WithinAbs;
using Catch::Matchers::WithinRel;
static const std::string CACHE_DIR = "cache/gadm";
// ── Helper: load ABW boundary ───────────────────────────────────────────────
// Load the single-feature ABW level-0 fixture; fails the running test on any
// load error.
static gadm::Feature load_abw() {
  auto loaded = gadm::load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
  REQUIRE(loaded.error.empty());
  REQUIRE(loaded.features.size() == 1);
  return loaded.features[0];
}

// Load the single-feature AFG.1.1_1 level-2 fixture; fails the running test
// on any load error.
static gadm::Feature load_afg() {
  auto loaded =
      gadm::load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
  REQUIRE(loaded.error.empty());
  REQUIRE(loaded.features.size() == 1);
  return loaded.features[0];
}
// ── Admin mode ──────────────────────────────────────────────────────────────
TEST_CASE("Grid admin: single feature → one waypoint", "[grid][admin]") {
  grid::GridOptions opts;
  opts.gridMode = "admin";
  opts.pathOrder = "zigzag";

  const auto result = grid::generate({load_abw()}, opts);
  REQUIRE(result.error.empty());
  REQUIRE(result.validCells == 1);
  REQUIRE(result.waypoints.size() == 1);

  const auto &wp = result.waypoints[0];
  REQUIRE(wp.step == 1);
  REQUIRE(wp.radius_km > 0);
  // ABW centroid should land near (-70.0, 12.5).
  REQUIRE_THAT(wp.lng, WithinAbs(-70.0, 0.1));
  REQUIRE_THAT(wp.lat, WithinAbs(12.5, 0.1));
}

TEST_CASE("Grid admin: multiple features", "[grid][admin]") {
  grid::GridOptions opts;
  opts.gridMode = "admin";

  const auto result = grid::generate({load_abw(), load_afg()}, opts);
  REQUIRE(result.error.empty());
  REQUIRE(result.validCells == 2);
  REQUIRE(result.waypoints.size() == 2);
  // Waypoints are numbered sequentially from 1.
  REQUIRE(result.waypoints[0].step == 1);
  REQUIRE(result.waypoints[1].step == 2);
}

TEST_CASE("Grid admin: empty features → error", "[grid][admin]") {
  grid::GridOptions opts;
  opts.gridMode = "admin";
  REQUIRE(!grid::generate({}, opts).error.empty());
}
// ── Centers mode ────────────────────────────────────────────────────────────
TEST_CASE("Grid centers: ABW generates waypoints from GHS centers", "[grid][centers]") {
  grid::GridOptions opts;
  opts.gridMode = "centers";
  opts.cellSize = 5.0;
  opts.centroidOverlap = 0.5;

  const auto result = grid::generate({load_abw()}, opts);
  REQUIRE(result.error.empty());
  REQUIRE(result.validCells > 0);
  REQUIRE(result.waypoints.size() == static_cast<size_t>(result.validCells));

  // Every waypoint must stay within Aruba's rough bounding box.
  for (const auto &wp : result.waypoints) {
    REQUIRE(wp.lng > -70.2);
    REQUIRE(wp.lng < -69.8);
    REQUIRE(wp.lat > 12.4);
    REQUIRE(wp.lat < 12.7);
  }
}

TEST_CASE("Grid centers: centroid overlap filters nearby centers", "[grid][centers]") {
  const auto feature = load_abw();
  grid::GridOptions opts;
  opts.gridMode = "centers";
  opts.cellSize = 20.0; // big cells

  opts.centroidOverlap = 0.0; // no overlap allowed → aggressive dedup
  const auto strict = grid::generate({feature}, opts);

  opts.centroidOverlap = 0.9; // near-full overlap allowed → more centers pass
  const auto relaxed = grid::generate({feature}, opts);

  REQUIRE(relaxed.validCells >= strict.validCells);
}
// ── Hex grid mode ───────────────────────────────────────────────────────────
TEST_CASE("Grid hex: ABW at 3km cells", "[grid][hex]") {
  grid::GridOptions opts;
  opts.gridMode = "hex";
  opts.cellSize = 3.0;

  const auto result = grid::generate({load_abw()}, opts);
  REQUIRE(result.error.empty());
  REQUIRE(result.validCells > 0);
  // Aruba is ~30×10 km, so 3 km cells should give on the order of 20–60.
  REQUIRE(result.validCells > 5);
  REQUIRE(result.validCells < 200);
}

TEST_CASE("Grid square: ABW at 5km cells", "[grid][square]") {
  grid::GridOptions opts;
  opts.gridMode = "square";
  opts.cellSize = 5.0;

  const auto result = grid::generate({load_abw()}, opts);
  REQUIRE(result.error.empty());
  REQUIRE(result.validCells > 0);
  REQUIRE(result.validCells < 50); // island is small
}

TEST_CASE("Grid hex: too many cells returns error", "[grid][hex]") {
  grid::GridOptions opts;
  opts.gridMode = "hex";
  opts.cellSize = 0.01; // tiny cells → enormous grid
  opts.maxCellsLimit = 100;
  REQUIRE(!grid::generate({load_abw()}, opts).error.empty());
}
// ── Sorting ─────────────────────────────────────────────────────────────────
TEST_CASE("Grid sort: snake vs zigzag differ for multi-row grid", "[grid][sort]") {
    // Snake reverses every other row relative to zigzag, so for a grid with
    // more than one row the waypoint order must differ between the two modes.
    auto feat = load_abw();
    grid::GridOptions opts;
    opts.gridMode = "hex";
    opts.cellSize = 3.0;
    opts.pathOrder = "zigzag";
    auto r1 = grid::generate({feat}, opts);
    opts.pathOrder = "snake";
    auto r2 = grid::generate({feat}, opts);
    REQUIRE(r1.validCells == r2.validCells);
    // Guard before the lock-step indexing below: without this, a size
    // mismatch between the two waypoint vectors would be an out-of-bounds
    // read on r2 rather than a clean test failure.
    REQUIRE(r1.waypoints.size() == r2.waypoints.size());
    if (r1.validCells > 5) {
        bool anyDiff = false;
        for (size_t i = 0; i < r1.waypoints.size(); ++i) {
            // Compare both coordinates — a reordering can differ only in lat
            // (e.g. hex rows offset vertically), which lng alone would miss.
            if (std::abs(r1.waypoints[i].lng - r2.waypoints[i].lng) > 1e-6 ||
                std::abs(r1.waypoints[i].lat - r2.waypoints[i].lat) > 1e-6) {
                anyDiff = true;
                break;
            }
        }
        REQUIRE(anyDiff);
    }
}
TEST_CASE("Grid sort: spiral-out starts near center", "[grid][sort]") {
    // spiral-out ordering should begin near the centroid of all waypoints
    // and end further away from it.
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;
    options.pathOrder = "spiral-out";
    auto res = grid::generate({feature}, options);
    REQUIRE(res.validCells > 3);
    // Mean of all waypoint coordinates = spiral center.
    double meanLng = 0.0;
    double meanLat = 0.0;
    for (const auto& wp : res.waypoints) {
        meanLng += wp.lng;
        meanLat += wp.lat;
    }
    meanLng /= res.waypoints.size();
    meanLat /= res.waypoints.size();
    const auto& first = res.waypoints.front();
    const auto& last = res.waypoints.back();
    double dFirst = std::hypot(first.lng - meanLng, first.lat - meanLat);
    double dLast = std::hypot(last.lng - meanLng, last.lat - meanLat);
    REQUIRE(dFirst < dLast);
}
TEST_CASE("Grid sort: steps are sequential after sorting", "[grid][sort]") {
    // After path ordering, waypoint step numbers must be 1..N in order.
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;
    options.pathOrder = "shortest";
    auto res = grid::generate({feature}, options);
    int expected = 1;
    for (const auto& wp : res.waypoints) {
        REQUIRE(wp.step == expected);
        ++expected;
    }
}
// ── GHS Filtering ───────────────────────────────────────────────────────────
TEST_CASE("Grid admin: GHS pop filter skips low-pop features", "[grid][filter]") {
    // An unreachable population threshold must skip the only feature.
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "admin";
    options.minGhsPop = 999999999; // impossibly high
    auto res = grid::generate({feature}, options);
    REQUIRE(res.validCells == 0);
    REQUIRE(res.skippedCells == 1);
}
TEST_CASE("Grid admin: bypass filters passes everything", "[grid][filter]") {
    // bypassFilters must override even an impossible population threshold.
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "admin";
    options.minGhsPop = 999999999;
    options.bypassFilters = true;
    auto res = grid::generate({feature}, options);
    REQUIRE(res.validCells == 1);
}

View File

@ -0,0 +1,452 @@
#include <catch2/catch_test_macros.hpp>
#include <string>
#include <thread>
#include <vector>
#include "html/html.h"
#include "html/html2md.h"
// ═══════════════════════════════════════════════════════
// html::parse / html::select (existing)
// ═══════════════════════════════════════════════════════
TEST_CASE("html::parse returns elements from valid HTML", "[html]") {
    // Both the h1 and the p must be present with their text extracted.
    auto elements =
        html::parse("<html><body><h1>Title</h1><p>Body</p></body></html>");
    REQUIRE(elements.size() >= 2);
    bool has_h1 = false;
    bool has_p = false;
    for (const auto &element : elements) {
        has_h1 = has_h1 || (element.tag == "h1" && element.text == "Title");
        has_p = has_p || (element.tag == "p" && element.text == "Body");
    }
    CHECK(has_h1);
    CHECK(has_p);
}
TEST_CASE("html::parse returns empty for empty input", "[html]") {
    // Empty input yields no elements.
    REQUIRE(html::parse("").empty());
}
TEST_CASE("html::parse handles nested elements", "[html]") {
    // The inner span must be surfaced with its own text.
    bool has_span = false;
    for (const auto &element : html::parse("<div><span>Nested</span></div>")) {
        if (element.tag == "span" && element.text == "Nested") {
            has_span = true;
            break;
        }
    }
    CHECK(has_span);
}
TEST_CASE("html::select finds elements by CSS selector", "[html][select]") {
    // Tag selector returns every matching element's text, in document order.
    auto hits = html::select("<ul><li>A</li><li>B</li><li>C</li></ul>", "li");
    REQUIRE(hits.size() == 3);
    CHECK(hits.at(0) == "A");
    CHECK(hits.at(1) == "B");
    CHECK(hits.at(2) == "C");
}
TEST_CASE("html::select returns empty for no matches", "[html][select]") {
    // A selector with no matching elements yields an empty result.
    REQUIRE(html::select("<p>Hello</p>", "h1").empty());
}
TEST_CASE("html::select works with class selector", "[html][select]") {
    // ".a" must match exactly the span carrying class="a".
    auto hits = html::select(
        R"(<div><span class="a">X</span><span class="b">Y</span></div>)", ".a");
    REQUIRE(hits.size() == 1);
    CHECK(hits.front() == "X");
}
// ═══════════════════════════════════════════════════════
// html2md — conversion & large-chunk robustness
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md basic conversion", "[html2md]") {
    // Both text nodes survive the conversion to markdown.
    auto markdown = html2md::Convert("<h1>Hello</h1><p>World</p>");
    CHECK(markdown.find("Hello") != std::string::npos);
    CHECK(markdown.find("World") != std::string::npos);
}
TEST_CASE("html2md empty input", "[html2md]") {
    // Empty HTML converts to empty markdown.
    CHECK(html2md::Convert("").empty());
}
TEST_CASE("html2md whitespace-only input", "[html2md]") {
    // Whitespace-only input must not crash; output stays tiny
    // (empty or whitespace).
    auto markdown = html2md::Convert(" \n\t ");
    CHECK(markdown.size() < 20);
}
// ---------- large payload stress tests ----------
// Build `count` numbered <p> elements, one per line, for stress tests.
static std::string make_paragraphs(size_t count) {
    std::string out;
    out.reserve(count * 40);
    for (size_t n = 0; n < count; ++n) {
        out.append("<p>Paragraph number ")
           .append(std::to_string(n))
           .append(" with some filler text.</p>\n");
    }
    return out;
}
// Build an HTML document of at least `target_bytes` by repeating one row.
static std::string make_large_html(size_t target_bytes) {
    const std::string row = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.</p>\n";
    std::string doc;
    doc.reserve(target_bytes + 256);
    doc.append("<html><body>");
    // Keep appending whole rows until the requested size is reached.
    while (doc.size() < target_bytes) {
        doc.append(row);
    }
    doc.append("</body></html>");
    return doc;
}
TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") {
    // 64KB of repeated paragraphs converts without loss of the body text.
    auto input = make_large_html(64 * 1024);
    REQUIRE(input.size() >= 64 * 1024);
    auto markdown = html2md::Convert(input);
    CHECK_FALSE(markdown.empty());
    CHECK(markdown.find("Lorem ipsum") != std::string::npos);
}
TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") {
    // Half a megabyte of HTML converts to non-empty markdown.
    CHECK_FALSE(html2md::Convert(make_large_html(512 * 1024)).empty());
}
TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") {
    // One megabyte of HTML converts to non-empty markdown.
    CHECK_FALSE(html2md::Convert(make_large_html(1024 * 1024)).empty());
}
TEST_CASE("html2md 10K paragraphs", "[html2md][large]") {
    // The very last paragraph must still appear in the output.
    auto markdown = html2md::Convert(make_paragraphs(10000));
    CHECK_FALSE(markdown.empty());
    CHECK(markdown.find("Paragraph number 9999") != std::string::npos);
}
// ---------- deeply nested HTML ----------
TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") {
    // 500 nested divs must neither crash nor drop the innermost text.
    constexpr int depth = 500;
    std::string input;
    for (int level = 0; level < depth; ++level) input += "<div>";
    input += "deep content";
    for (int level = 0; level < depth; ++level) input += "</div>";
    CHECK(html2md::Convert(input).find("deep content") != std::string::npos);
}
// ---------- wide table ----------
TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") {
    // A single row with 200 cells keeps its first and last cell.
    std::string input = "<table><tr>";
    for (int col = 0; col < 200; ++col) {
        input += "<td>C";
        input += std::to_string(col);
        input += "</td>";
    }
    input += "</tr></table>";
    auto markdown = html2md::Convert(input);
    CHECK_FALSE(markdown.empty());
    CHECK(markdown.find("C0") != std::string::npos);
    CHECK(markdown.find("C199") != std::string::npos);
}
// ---------- concurrent conversion ----------
TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") {
    // Eight threads each convert the same 32KB document; every result must
    // be a complete, independent conversion.
    constexpr int kThreads = 8;
    const std::string input = make_large_html(32 * 1024);
    std::vector<std::string> outputs(kThreads);
    std::vector<std::thread> workers;
    workers.reserve(kThreads);
    for (int t = 0; t < kThreads; ++t) {
        workers.emplace_back([&outputs, &input, t]() {
            outputs[t] = html2md::Convert(input);
        });
    }
    for (auto &worker : workers) worker.join();
    for (const auto &out : outputs) {
        CHECK_FALSE(out.empty());
        CHECK(out.find("Lorem ipsum") != std::string::npos);
    }
}
// ═══════════════════════════════════════════════════════
// html2md — malformed / faulty HTML robustness
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md unclosed tags", "[html2md][faulty]") {
    // Text inside never-closed tags must still be emitted.
    auto markdown = html2md::Convert("<p>Hello <b>bold <i>italic");
    CHECK(markdown.find("Hello") != std::string::npos);
    CHECK(markdown.find("bold") != std::string::npos);
}
TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") {
    // Overlapping bold/italic must not lose the text.
    auto markdown = html2md::Convert("<b>bold <i>both</b> italic</i>");
    CHECK(markdown.find("bold") != std::string::npos);
}
TEST_CASE("html2md broken attributes", "[html2md][faulty]") {
    // Unterminated attribute quoting must not crash; output is unspecified.
    auto markdown =
        html2md::Convert(R"(<a href="http://example.com class="bad>Link</a>)");
    (void)markdown;
}
TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") {
    // Plain text passes through untouched.
    auto markdown = html2md::Convert("Just plain text, no HTML at all.");
    CHECK(markdown.find("Just plain text") != std::string::npos);
}
TEST_CASE("html2md random binary noise", "[html2md][faulty]") {
    // Full 0-255 byte range — previously crashed on MSVC debug builds due to
    // signed char passed to isspace() without unsigned cast. Fixed in html2md.cpp.
    // Deterministic pseudo-random bytes covering the whole range.
    std::string noise(4096, '\0');
    for (size_t pos = 0; pos < noise.size(); ++pos) {
        noise[pos] = static_cast<char>((pos * 131 + 17) % 256);
    }
    // Survival is the only requirement; no assertion on the output.
    auto markdown = html2md::Convert(noise);
    (void)markdown;
}
TEST_CASE("html2md truncated document", "[html2md][faulty]") {
    // A document cut off mid-table must still keep the completed cell.
    const std::string input = "<html><body><table><tr><td>Cell1</td><td>Cell2";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Cell1") != std::string::npos);
}
// script/style elements must be dropped entirely while the surrounding
// paragraphs are kept. NOTE: the HTML is a multi-line raw string, so its
// internal whitespace is part of the input under test — do not re-indent.
TEST_CASE("html2md script and style tags", "[html2md][faulty]") {
std::string html = R"(
<p>Before</p>
<script>alert('xss');</script>
<style>.foo { color: red; }</style>
<p>After</p>
)";
std::string md = html2md::Convert(html);
CHECK(md.find("Before") != std::string::npos);
CHECK(md.find("After") != std::string::npos);
// script/style content should be stripped
CHECK(md.find("alert") == std::string::npos);
}
TEST_CASE("html2md null bytes in input", "[html2md][faulty]") {
    // An embedded NUL must not crash; the converter may stop at it or not.
    std::string input = "<p>Hello";
    input.push_back('\0');
    input.append("World</p>");
    auto markdown = html2md::Convert(input);
    (void)markdown;
}
// ═══════════════════════════════════════════════════════
// html2md — web scraper real-world edge cases
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") {
    // Mixed-script multibyte content must survive conversion.
    const std::string input =
        "<h1>日本語テスト</h1>"
        "<p>مرحبا بالعالم</p>"
        "<p>Ñoño señor über straße</p>"
        "<p>Emoji: 🚀🔥💀👻 and 中文混合English</p>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Emoji") != std::string::npos);
}
TEST_CASE("html2md BOM prefix", "[html2md][scraper]") {
    // A UTF-8 BOM (EF BB BF), common on Windows-origin pages, must not
    // prevent conversion of the content that follows it.
    const std::string input =
        "\xEF\xBB\xBF<html><body><p>Content after BOM</p></body></html>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Content after BOM") != std::string::npos);
}
TEST_CASE("html2md entity soup", "[html2md][scraper]") {
    // Valid, numeric, and malformed entities must all be tolerated.
    const std::string input =
        "<p>Price: &euro;10 &amp; &lt;20&gt; items</p>"
        "<p>&nbsp;&nbsp;&nbsp;indented &mdash; dashes &ndash; more</p>"
        "<p>Bad entity: &notreal; and &#999999; and &#xZZZZ;</p>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Price") != std::string::npos);
}
TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") {
    // Comments (including multi-line and ones wrapping scripts) and CDATA
    // must not swallow the surrounding paragraphs.
    const std::string input =
        "<p>Before</p>"
        "<!-- <script>alert('xss')</script> -->"
        "<![CDATA[This is raw <data> & stuff]]>"
        "<!-- multi\nline\ncomment -->"
        "<p>After</p>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Before") != std::string::npos);
    CHECK(markdown.find("After") != std::string::npos);
}
TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") {
    // WYSIWYG editors produce absurdly nested inline markup; 100 layers of
    // emphasis emit piles of markers — only survival and non-empty output
    // are asserted.
    std::string input = "<p>";
    for (int layer = 0; layer < 100; ++layer)
        input += "<span><b><i><em><strong>";
    input += "deep text";
    for (int layer = 0; layer < 100; ++layer)
        input += "</strong></em></i></b></span>";
    input += "</p>";
    CHECK_FALSE(html2md::Convert(input).empty());
}
TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") {
    // Minified HTML: ~200KB on one line; first and last items must survive.
    std::string input;
    input.reserve(200 * 1024);
    input += "<html><body>";
    for (int n = 0; n < 5000; ++n) {
        const std::string id = std::to_string(n);
        input += "<div><span class=\"c" + id + "\">item" + id + "</span></div>";
    }
    input += "</body></html>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("item0") != std::string::npos);
    CHECK(markdown.find("item4999") != std::string::npos);
}
TEST_CASE("html2md data URI in img src", "[html2md][scraper]") {
    // A base64 data: URI in an img src must not disturb adjacent paragraphs.
    const std::string input =
        "<p>Before image</p>"
        "<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSU"
        "hEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwAD"
        "hgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"pixel\">"
        "<p>After image</p>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Before image") != std::string::npos);
    CHECK(markdown.find("After image") != std::string::npos);
}
TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") {
    // Latin-1 bytes (0x80-0xFF) that are not valid UTF-8 — typical of pages
    // served with a wrong charset declaration — must not crash conversion.
    const std::string input = "<p>caf\xe9 na\xefve r\xe9sum\xe9</p>"; // café naïve résumé in Latin-1
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("caf") != std::string::npos);
}
TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") {
    // Raw HTTP response headers sometimes leak into scraper output; the
    // real body must still be converted.
    const std::string input =
        "HTTP/1.1 200 OK\r\n"
        "Content-Type: text/html; charset=utf-8\r\n"
        "Content-Length: 42\r\n"
        "\r\n"
        "<html><body><p>Real content</p></body></html>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.find("Real content") != std::string::npos);
}
// Simplified Google Places result markup: data attributes, inline styles,
// aria labels, embedded ld+json script, deep nesting. The visible business
// name and street must survive conversion. NOTE: the HTML is a multi-line
// raw string, so its internal whitespace/newlines are part of the input
// under test — do not re-indent.
TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") {
// Simplified version of real Google Places HTML with data attributes,
// inline styles, aria labels, and deeply nested structure
std::string html = R"(
<div class="section-result" data-result-index="0" jsaction="pane.resultSection.click">
<div class="section-result-title">
<span><span>Müller's Büro & Café</span></span>
</div>
<div class="section-result-details">
<span class="section-result-location">Königstraße 42, München</span>
<span class="section-result-rating">
<span aria-label="4.5 stars"></span>
<span>(1,234)</span>
</span>
</div>
<div style="display:none" aria-hidden="true">
<script type="application/ld+json">{"@type":"LocalBusiness","name":"test"}</script>
</div>
</div>
)";
std::string md = html2md::Convert(html);
CHECK(md.find("Café") != std::string::npos);
CHECK(md.find("München") != std::string::npos);
}
// ═══════════════════════════════════════════════════════
// html2md — output amplification & pathological input
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") {
    // Each blockquote level adds a ">" prefix per markdown line; 50 levels
    // must grow the output linearly (~100 prefix chars), not exponentially.
    std::string input;
    for (int level = 0; level < 50; ++level) input += "<blockquote>";
    input += "<p>deep quote</p>";
    for (int level = 0; level < 50; ++level) input += "</blockquote>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.size() < 4096);
    CHECK_FALSE(markdown.empty());
}
TEST_CASE("html2md very long attribute value", "[html2md][amplification]") {
    // A 1 MB href stresses attribute extraction; only survival is required.
    const std::string huge_url(1024 * 1024, 'A');
    auto markdown = html2md::Convert("<a href=\"" + huge_url + "\">Click</a>");
    CHECK_FALSE(markdown.empty());
}
TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") {
    // 10K unclosed <p> each emit paragraph breaks; output grows but must
    // stay bounded and retain the text.
    std::string input;
    input.reserve(50000);
    for (int n = 0; n < 10000; ++n) input += "<p>text";
    auto markdown = html2md::Convert(input);
    CHECK_FALSE(markdown.empty());
    CHECK(markdown.find("text") != std::string::npos);
}
TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") {
    // Markdown strips tags and attributes, so for representative HTML the
    // output must be strictly smaller than the input, yet non-empty.
    std::string input;
    input.reserve(100 * 1024);
    input += "<html><body>";
    for (int n = 0; n < 1000; ++n) {
        input += "<div class=\"wrapper\"><p class=\"content\">Paragraph " +
                 std::to_string(n) + " with some text.</p></div>\n";
    }
    input += "</body></html>";
    auto markdown = html2md::Convert(input);
    CHECK(markdown.size() < input.size());
    CHECK(markdown.size() > 0);
}
TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") {
    // 8K bare '<' with no '>' stresses the tag scanner; must terminate.
    const std::string input(8192, '<');
    auto markdown = html2md::Convert(input);
    (void)markdown;
}

View File

@ -0,0 +1,17 @@
#include <catch2/catch_test_macros.hpp>
#include "http/http.h"
TEST_CASE("http::get returns a response", "[http]") {
    // Network-free check: an unreachable URL must fail gracefully with
    // status -1 and a non-empty (error) body.
    auto response = http::get("http://0.0.0.0:1/nonexistent");
    CHECK(response.status_code == -1);
    CHECK_FALSE(response.body.empty());
}
TEST_CASE("http::post returns a response", "[http]") {
    // Same unreachable endpoint: POST must also fail gracefully.
    auto response = http::post("http://0.0.0.0:1/nonexistent", R"({"test": true})");
    CHECK(response.status_code == -1);
    CHECK_FALSE(response.body.empty());
}

View File

@ -0,0 +1,89 @@
#include <catch2/catch_test_macros.hpp>
#include "ipc/ipc.h"
#include <cstring>
TEST_CASE("ipc::encode produces a 4-byte LE length prefix", "[ipc]") {
    // The frame starts with a little-endian uint32 giving the body length.
    auto frame = ipc::encode(ipc::Message{"1", "ping", "{}"});
    REQUIRE(frame.size() > 4);
    uint32_t body_len = 0;
    for (int byte = 0; byte < 4; ++byte) {
        body_len |= static_cast<uint32_t>(frame[byte]) << (8 * byte);
    }
    REQUIRE(body_len == frame.size() - 4);
}
TEST_CASE("ipc::encode → decode round-trip", "[ipc]") {
    // encode then decode (minus the 4-byte prefix) must reproduce id, type,
    // and the payload content (possibly re-compacted).
    const ipc::Message original{"42", "job", R"({"action":"resize","width":800})"};
    auto frame = ipc::encode(original);
    ipc::Message decoded;
    REQUIRE(ipc::decode(frame.data() + 4, frame.size() - 4, decoded));
    REQUIRE(decoded.id == "42");
    REQUIRE(decoded.type == "job");
    REQUIRE(decoded.payload.find("resize") != std::string::npos);
    REQUIRE(decoded.payload.find("800") != std::string::npos);
}
TEST_CASE("ipc::decode rejects invalid JSON", "[ipc]") {
    // Non-JSON bytes must be rejected.
    const std::string garbage = "this is not json";
    ipc::Message decoded;
    REQUIRE_FALSE(ipc::decode(
        reinterpret_cast<const uint8_t *>(garbage.data()), garbage.size(),
        decoded));
}
TEST_CASE("ipc::decode rejects JSON missing required fields", "[ipc]") {
    // Syntactically valid JSON without "id"/"type" must be rejected.
    const std::string json = R"({"foo":"bar"})";
    ipc::Message decoded;
    REQUIRE_FALSE(ipc::decode(
        reinterpret_cast<const uint8_t *>(json.data()), json.size(), decoded));
}
TEST_CASE("ipc::decode handles missing payload gracefully", "[ipc]") {
    // An absent payload defaults to "{}" rather than failing.
    const std::string json = R"({"id":"1","type":"ping"})";
    ipc::Message decoded;
    REQUIRE(ipc::decode(
        reinterpret_cast<const uint8_t *>(json.data()), json.size(), decoded));
    REQUIRE(decoded.id == "1");
    REQUIRE(decoded.type == "ping");
    REQUIRE(decoded.payload == "{}");
}
TEST_CASE("ipc::encode with empty payload", "[ipc]") {
    // An empty payload still round-trips id and type.
    auto frame = ipc::encode(ipc::Message{"0", "ready", ""});
    ipc::Message decoded;
    REQUIRE(ipc::decode(frame.data() + 4, frame.size() - 4, decoded));
    REQUIRE(decoded.id == "0");
    REQUIRE(decoded.type == "ready");
}
TEST_CASE("ipc::decode with vector overload", "[ipc]") {
    // The std::vector<uint8_t> overload must behave like the pointer form.
    const std::string json = R"({"id":"99","type":"shutdown","payload":{}})";
    const std::vector<uint8_t> bytes(json.begin(), json.end());
    ipc::Message decoded;
    REQUIRE(ipc::decode(bytes, decoded));
    REQUIRE(decoded.id == "99");
    REQUIRE(decoded.type == "shutdown");
    REQUIRE(decoded.payload == "{}");
}

View File

@ -0,0 +1,46 @@
#include <catch2/catch_test_macros.hpp>
#include "json/json.h"
TEST_CASE("json::is_valid accepts valid JSON", "[json]") {
    // Objects, arrays, numbers, and bare strings are all valid JSON values.
    CHECK(json::is_valid(R"({"key": "value"})"));
    CHECK(json::is_valid("[]"));
    CHECK(json::is_valid("123"));
    CHECK(json::is_valid("\"hello\""));
}
TEST_CASE("json::is_valid rejects invalid JSON", "[json]") {
    // Unquoted keys/values are not valid JSON.
    CHECK_FALSE(json::is_valid("{invalid}"));
    CHECK_FALSE(json::is_valid("{key: value}"));
}
TEST_CASE("json::get_string extracts string values", "[json]") {
    // Lookup by key returns the associated string value.
    CHECK(json::get_string(R"({"name": "polymech", "version": "1.0"})",
                           "name") == "polymech");
}
TEST_CASE("json::get_string returns empty for missing key", "[json]") {
    // An absent key yields an empty string.
    CHECK(json::get_string(R"({"name": "polymech"})", "missing").empty());
}
TEST_CASE("json::get_int extracts int values", "[json]") {
    // Integer fields are returned as ints.
    CHECK(json::get_int(R"({"port": 8080, "name": "test"})", "port") == 8080);
}
TEST_CASE("json::keys lists top-level keys", "[json]") {
    // Top-level keys come back in document order.
    auto key_list = json::keys(R"({"a": 1, "b": 2, "c": 3})");
    REQUIRE(key_list.size() == 3);
    CHECK(key_list.at(0) == "a");
    CHECK(key_list.at(1) == "b");
    CHECK(key_list.at(2) == "c");
}
TEST_CASE("json::prettify formats JSON", "[json]") {
    // Prettified output is non-empty and multi-line.
    auto formatted = json::prettify(R"({"a":1})");
    REQUIRE_FALSE(formatted.empty());
    CHECK(formatted.find('\n') != std::string::npos);
}

View File

@ -0,0 +1,22 @@
#include <catch2/catch_test_macros.hpp>
#include "logger/logger.h"
TEST_CASE("logger::init does not throw", "[logger]") {
    // Initialisation must be exception-free.
    REQUIRE_NOTHROW(logger::init("test"));
}
TEST_CASE("logger functions do not throw after init", "[logger]") {
    // Every severity level must log without throwing once initialised.
    logger::init("test");
    REQUIRE_NOTHROW(logger::info("info message"));
    REQUIRE_NOTHROW(logger::warn("warn message"));
    REQUIRE_NOTHROW(logger::error("error message"));
    REQUIRE_NOTHROW(logger::debug("debug message"));
}
TEST_CASE("logger::init can be called multiple times", "[logger]") {
    // Re-initialisation must be safe and logging must work afterwards.
    REQUIRE_NOTHROW(logger::init("first"));
    REQUIRE_NOTHROW(logger::init("second"));
    REQUIRE_NOTHROW(logger::info("after re-init"));
}

View File

@ -0,0 +1,10 @@
#include "polymech/polymech.h"
#include "postgres/postgres.h"
#include <catch2/catch_test_macros.hpp>
// Unit test — no network required
TEST_CASE("polymech::fetch_pages throws without init", "[polymech]") {
    // postgres::init was never called, so fetching must throw.
    REQUIRE_THROWS(polymech::fetch_pages());
}

View File

@ -0,0 +1,9 @@
#include <catch2/catch_test_macros.hpp>
#include "postgres/postgres.h"
// Unit tests use a no-op init — no network required
TEST_CASE("postgres::ping throws without init", "[postgres]") {
    // ping before init must throw rather than silently succeed.
    CHECK_THROWS(postgres::ping());
}

View File

@ -0,0 +1,60 @@
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "search/search.h"
// ── Config loading ──────────────────────────────────────────────────────────
TEST_CASE("Config: loads SERPAPI_KEY from postgres.toml", "[search][config]") {
    // The key must be present and plausibly long (token-like).
    auto config = search::load_config("config/postgres.toml");
    REQUIRE_FALSE(config.serpapi_key.empty());
    REQUIRE(config.serpapi_key.size() > 20);
}
TEST_CASE("Config: loads GEO_CODER_KEY from postgres.toml", "[search][config]") {
    // The geocoder key must be present.
    REQUIRE_FALSE(search::load_config("config/postgres.toml").geocoder_key.empty());
}
TEST_CASE("Config: loads BIG_DATA_KEY from postgres.toml", "[search][config]") {
    // The BigData key must be present.
    REQUIRE_FALSE(search::load_config("config/postgres.toml").bigdata_key.empty());
}
TEST_CASE("Config: loads postgres URL", "[search][config]") {
    // The database URL must point at the Supabase host.
    auto config = search::load_config("config/postgres.toml");
    REQUIRE(config.postgres_url.find("supabase.com") != std::string::npos);
}
TEST_CASE("Config: loads supabase URL and service key", "[search][config]") {
    // Both the REST URL and the service key must be present.
    auto config = search::load_config("config/postgres.toml");
    REQUIRE(config.supabase_url.find("supabase.co") != std::string::npos);
    REQUIRE_FALSE(config.supabase_service_key.empty());
}
TEST_CASE("Config: missing file returns empty config", "[search][config]") {
    // A nonexistent path yields a default-empty config, not an error.
    auto config = search::load_config("nonexistent.toml");
    REQUIRE(config.serpapi_key.empty());
    REQUIRE(config.postgres_url.empty());
}
// ── Search validation (no network) ──────────────────────────────────────────
TEST_CASE("Search: empty key returns error", "[search][validate]") {
    // With no API key configured, the search must fail fast with an error
    // message mentioning the key — no network call.
    search::Config config; // all fields empty
    search::SearchOptions options;
    options.query = "plumbers";
    auto result = search::search_google_maps(config, options);
    REQUIRE_FALSE(result.error.empty());
    REQUIRE(result.error.find("key") != std::string::npos);
}
TEST_CASE("Search: empty query returns error", "[search][validate]") {
search::Config cfg;
cfg.serpapi_key = "test_key";
search::SearchOptions opts; // empty query
auto res = search::search_google_maps(cfg, opts);
REQUIRE(!res.error.empty());
REQUIRE(res.error.find("query") != std::string::npos);
}