kbot meets cpp - the beginnings :)
This commit is contained in:
parent
708a756a07
commit
7b07f1a55a
31
packages/kbot/cpp/.gitignore
vendored
Normal file
31
packages/kbot/cpp/.gitignore
vendored
Normal file
@ -0,0 +1,31 @@
|
||||
# Build output
|
||||
/build/
|
||||
|
||||
# Compiled objects
|
||||
*.o
|
||||
*.obj
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
# CMake generated
|
||||
CMakeCache.txt
|
||||
CMakeFiles/
|
||||
cmake_install.cmake
|
||||
Makefile
|
||||
|
||||
# IDE / Editor
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
*~
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
cache/
|
||||
config/postgres.toml
|
||||
dist
|
||||
120
packages/kbot/cpp/CMakeLists.txt
Normal file
120
packages/kbot/cpp/CMakeLists.txt
Normal file
@ -0,0 +1,120 @@
|
||||
cmake_minimum_required(VERSION 3.20)
|
||||
|
||||
project(polymech-cli
|
||||
VERSION 0.1.0
|
||||
DESCRIPTION "Polymech C++ CLI"
|
||||
LANGUAGES CXX C
|
||||
)
|
||||
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/dist")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_SOURCE_DIR}/dist")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_SOURCE_DIR}/dist")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${CMAKE_SOURCE_DIR}/dist")
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${CMAKE_SOURCE_DIR}/dist")
|
||||
|
||||
# ── C++ standard ─────────────────────────────────────────────────────────────
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
|
||||
# ── Dependencies ─────────────────────────────────────────────────────────────
|
||||
include(FetchContent)
|
||||
|
||||
FetchContent_Declare(
|
||||
cli11
|
||||
GIT_REPOSITORY https://github.com/CLIUtils/CLI11.git
|
||||
GIT_TAG v2.4.2
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_Declare(
|
||||
tomlplusplus
|
||||
GIT_REPOSITORY https://github.com/marzer/tomlplusplus.git
|
||||
GIT_TAG v3.4.0
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_Declare(
|
||||
Catch2
|
||||
GIT_REPOSITORY https://github.com/catchorg/Catch2.git
|
||||
GIT_TAG v3.7.1
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_Declare(
|
||||
asio
|
||||
GIT_REPOSITORY https://github.com/chriskohlhoff/asio.git
|
||||
GIT_TAG asio-1-28-0
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_Declare(
|
||||
concurrentqueue
|
||||
GIT_REPOSITORY https://github.com/cameron314/concurrentqueue.git
|
||||
GIT_TAG v1.0.4
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
FetchContent_Declare(
|
||||
taskflow
|
||||
GIT_REPOSITORY https://github.com/taskflow/taskflow.git
|
||||
GIT_TAG v3.6.0
|
||||
GIT_SHALLOW TRUE
|
||||
)
|
||||
|
||||
set(TF_BUILD_TESTS OFF CACHE BOOL "" FORCE)
|
||||
set(TF_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
|
||||
FetchContent_MakeAvailable(cli11 tomlplusplus Catch2 asio concurrentqueue taskflow)
|
||||
|
||||
# ── Packages ─────────────────────────────────────────────────────────────────
|
||||
add_subdirectory(packages/logger)
|
||||
add_subdirectory(packages/html)
|
||||
add_subdirectory(packages/postgres)
|
||||
add_subdirectory(packages/http)
|
||||
add_subdirectory(packages/json)
|
||||
add_subdirectory(packages/polymech)
|
||||
add_subdirectory(packages/ipc)
|
||||
add_subdirectory(packages/geo)
|
||||
add_subdirectory(packages/gadm_reader)
|
||||
add_subdirectory(packages/grid)
|
||||
add_subdirectory(packages/search)
|
||||
add_subdirectory(packages/enrichers)
|
||||
|
||||
# ── Sources ──────────────────────────────────────────────────────────────────
|
||||
add_executable(${PROJECT_NAME}
|
||||
src/main.cpp
|
||||
src/cmd_gridsearch.cpp
|
||||
src/cmd_gridsearch-filters.cpp
|
||||
src/cmd_gridsearch-uds.cpp
|
||||
src/cmd_gridsearch-postgres.cpp
|
||||
src/gridsearch_serialize.cpp
|
||||
src/sys_metrics.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers)
|
||||
|
||||
target_include_directories(${PROJECT_NAME} PRIVATE
|
||||
${asio_SOURCE_DIR}/asio/include
|
||||
${taskflow_SOURCE_DIR}
|
||||
${concurrentqueue_SOURCE_DIR}
|
||||
)
|
||||
|
||||
# Define standalone ASIO (since it's not boost)
|
||||
target_compile_definitions(${PROJECT_NAME} PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
|
||||
|
||||
|
||||
# ── Compiler warnings ───────────────────────────────────────────────────────
|
||||
if(MSVC)
|
||||
target_compile_options(${PROJECT_NAME} PRIVATE /W4 /permissive-)
|
||||
else()
|
||||
target_compile_options(${PROJECT_NAME} PRIVATE -Wall -Wextra -Wpedantic)
|
||||
endif()
|
||||
|
||||
# ── Install ──────────────────────────────────────────────────────────────────
|
||||
install(TARGETS ${PROJECT_NAME}
|
||||
RUNTIME DESTINATION bin
|
||||
)
|
||||
|
||||
# ── Tests ────────────────────────────────────────────────────────────────────
|
||||
enable_testing()
|
||||
add_subdirectory(tests)
|
||||
36
packages/kbot/cpp/CMakePresets.json
Normal file
36
packages/kbot/cpp/CMakePresets.json
Normal file
@ -0,0 +1,36 @@
|
||||
{
|
||||
"version": 6,
|
||||
"cmakeMinimumRequired": {
|
||||
"major": 3,
|
||||
"minor": 20,
|
||||
"patch": 0
|
||||
},
|
||||
"configurePresets": [
|
||||
{
|
||||
"name": "dev",
|
||||
"displayName": "Dev (Debug)",
|
||||
"binaryDir": "${sourceDir}/build/dev",
|
||||
"cacheVariables": {
|
||||
"CMAKE_BUILD_TYPE": "Debug"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "release",
|
||||
"displayName": "Release",
|
||||
"binaryDir": "${sourceDir}/build/release",
|
||||
"cacheVariables": {
|
||||
"CMAKE_BUILD_TYPE": "Release"
|
||||
}
|
||||
}
|
||||
],
|
||||
"buildPresets": [
|
||||
{
|
||||
"name": "dev",
|
||||
"configurePreset": "dev"
|
||||
},
|
||||
{
|
||||
"name": "release",
|
||||
"configurePreset": "release"
|
||||
}
|
||||
]
|
||||
}
|
||||
9
packages/kbot/cpp/LICENSE
Normal file
9
packages/kbot/cpp/LICENSE
Normal file
@ -0,0 +1,9 @@
|
||||
Copyright (c) <year> <owner> All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
111
packages/kbot/cpp/README.md
Normal file
111
packages/kbot/cpp/README.md
Normal file
@ -0,0 +1,111 @@
|
||||
# polymech-cli
|
||||
|
||||
Cross-platform C++ CLI built with CMake.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
| Tool | Version |
|
||||
|------|---------|
|
||||
| CMake | ≥ 3.20 |
|
||||
| C++ compiler | C++17 (MSVC, GCC, or Clang) |
|
||||
|
||||
## Build
|
||||
|
||||
```bash
|
||||
# Debug
|
||||
cmake --preset dev
|
||||
cmake --build --preset dev
|
||||
|
||||
# Release
|
||||
cmake --preset release
|
||||
cmake --build --preset release
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
polymech-cli --help
|
||||
polymech-cli --version
|
||||
```
|
||||
|
||||
## Worker Mode & Gridsearch
|
||||
|
||||
The `worker` subcommand is designed to be spawned by the Node.js frontend orchestrator (`GridSearchUdsManager`) for background gridsearch execution. It accepts length-prefixed JSON frames over a Unix Domain Socket (UDS) or a local TCP port on Windows.
|
||||
|
||||
```bash
|
||||
polymech-cli worker --uds <path_or_port> --daemon --user-uid <id> --config <path>
|
||||
```
|
||||
|
||||
### IPC Resiliency and Logging
|
||||
The C++ worker pipeline incorporates extensive feedback and retry instrumentation:
|
||||
|
||||
1. **Watchdog Heartbeats (`ping` / `pong`)**
|
||||
- The Node orchestrator sweeps the active worker pool every 15 seconds. It explicitly logs when a ping is sent and when a `pong` (or other active events like `log`, `job_progress`, or `ack`) are received.
|
||||
- If a C++ worker stops responding to IPC events for 60 seconds (hanging thread or deadlock), it is automatically killed (`SIGKILL`) and evicted from the pool.
|
||||
|
||||
2. **Socket Traceability**
|
||||
- The UDS socket actively traps unexpected closures and TCP faults (like `ECONNRESET`). If the pipe breaks mid-job, explicit socket `error` event handlers in the Node orchestrator will instantly fail the job and log the stack trace, preventing indefinite client-side UI hangs, especially during heavy re-runs.
|
||||
|
||||
3. **Persistent Crash Logging (`logs/uds.json`)**
|
||||
- The C++ worker initializes a multi-sink logger (`logger::init_uds`). It pumps standard logs to `stderr` while simultaneously persisting an append-only file trace to `server/logs/uds.json`.
|
||||
- The file sink guarantees synchronization to disk aggressively (every 1 second, and immediately on `info` severity). If the worker process vanishes or crashes, `uds.json` acts as the black-box flight recorder for post-mortem debugging.
|
||||
|
||||
4. **Job Specification Transparency**
|
||||
- Gridsearch payloads (including `retry` and `expand` endpoints) aggressively log their input shape (`guided` bounds flag, `enrichers` subset) within the Node console before passing work to the C++ orchestrator. This allows for clear traceability from UI action -> Node submission -> C++ execution.
|
||||
|
||||
5. **Thread Safety & Frame Synchronization (Mutexes)**
|
||||
- The UDS socket handles dual-direction asynchronous streams. The background execution graph (powered by Taskflow) emits high-frequency events (`location`, `waypoint-start`) via `GridsearchCallbacks`. Concurrently, the orchestrator Node.js process sends periodic commands (`ping`, `cancel`) that the C++ socket loop must instantly acknowledge.
|
||||
- To prevent overlapping payload frames (which corrupt the critical 4-byte `len` header), a global `g_uds_socket_mutex` is strictly enforced. It guarantees that direct UI acknowledgments (`pong`, `cancel_ack`) and background logging (`uds_sink` / Taskflow events) never interleave their `asio::write` bursts onto the pipe.
|
||||
|
||||
### IPC Framing & Payload Protocol
|
||||
Communication runs strictly via length-prefixed JSON frames. This safeguards against TCP fragmentation during heavy event streams.
|
||||
|
||||
**Binary Frame Format:**
|
||||
`[4-byte Unsigned Little-Endian Integer (Payload Length)] [UTF-8 JSON Object]`
|
||||
|
||||
#### Control Commands (Node → C++)
|
||||
If the JSON object contains an `"action"` field, it is handled synchronously on the socket thread:
|
||||
- **Health Check:** `{"action": "ping"}`
|
||||
→ *Replies:* `{"type": "pong", "data": {"memoryMb": 120, "cpuTimeMs": 4500}}`
|
||||
- **Cancellation:** `{"action": "cancel", "jobId": "job_123"}`
|
||||
→ Worker sets the atomic cancellation token to safely halt the target `taskflow`, instantly replying `{"type": "cancel_ack", "data": "job_123"}`
|
||||
- **Daemon Teardown:** `{"action": "stop"}`
|
||||
→ Flushes all streams and exits cleanly.
|
||||
|
||||
#### Gridsearch Payload (Node → C++)
|
||||
If no `"action"` field exists, the message is treated as a gridsearch spec and pushed into a lock-free `ConcurrentQueue` for the background execution graph:
|
||||
```json
|
||||
{
|
||||
"jobId": "run_9a8bc7",
|
||||
"configPath": "config/postgres.toml",
|
||||
"cacheDir": "../packages/gadm/cache",
|
||||
"enrich": true,
|
||||
"guided": {
|
||||
"areas": [{ "gid": "ESP.6_1", "level": 1 }],
|
||||
"settings": { "gridMode": "hex", "cellSize": 5.0 }
|
||||
},
|
||||
"search": {
|
||||
"types": ["restaurant"],
|
||||
"limitPerArea": 500
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
#### Event Streaming (C++ → Node)
|
||||
As the gridsearch pipeline executes, the `GridsearchCallbacks` emit standard length-prefixed events directly back to the active UDS socket:
|
||||
- **`ack`**: Acknowledges job was successfully dequeued (`{"type": "ack", "data": {"jobId": "..."}}`).
|
||||
- **`log`**: Passthrough of all internal C++ `spdlog` messages using the custom `uds_sink` adapter.
|
||||
- **`location` / `node`**: Raw geolocation geometries and enriched contact details streamed incrementally.
|
||||
- **`job_progress`**: Phase updates (Grid Generation → Search → Enrichment).
|
||||
- **`job_result`**: The final statistical and timer summary (EnumMs, SearchMs, Total Emails, etc).
|
||||
- **`error`**: Unrecoverable boundary parsing or database initialization faults.
|
||||
|
||||
## License
|
||||
|
||||
BSD-3-Clause
|
||||
|
||||
## Requirements
|
||||
|
||||
- [https://github.com/taskflow/taskflow](https://github.com/taskflow/taskflow)
|
||||
- [https://github.com/cameron314/concurrentqueue](https://github.com/cameron314/concurrentqueue)
|
||||
- [https://github.com/chriskohlhoff/asio](https://github.com/chriskohlhoff/asio)
|
||||
6
packages/kbot/cpp/build-linux.sh
Normal file
6
packages/kbot/cpp/build-linux.sh
Normal file
@ -0,0 +1,6 @@
|
||||
#!/usr/bin/env bash
|
||||
#rm -rf /tmp/polymech-build
|
||||
mkdir -p /tmp/polymech-build
|
||||
export PATH="/snap/bin:$PATH"
|
||||
cmake -S ./ -B /tmp/polymech-build -DCMAKE_BUILD_TYPE=Release
|
||||
cmake --build /tmp/polymech-build
|
||||
12
packages/kbot/cpp/config.toml
Normal file
12
packages/kbot/cpp/config.toml
Normal file
@ -0,0 +1,12 @@
|
||||
[project]
|
||||
name = "polymech"
|
||||
version = "0.1.0"
|
||||
description = "Polymech C++ CLI"
|
||||
|
||||
[database]
|
||||
host = "localhost"
|
||||
port = 5432
|
||||
name = "polymech"
|
||||
|
||||
[logging]
|
||||
level = "debug"
|
||||
43
packages/kbot/cpp/config/gridsearch-bcn-universities.json
Normal file
43
packages/kbot/cpp/config/gridsearch-bcn-universities.json
Normal file
@ -0,0 +1,43 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "ESP.6.1.10.14_1",
|
||||
"name": "Sabadell",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 3,
|
||||
"gadmName": "Sabadell",
|
||||
"gid": "ESP.6.1.10.14_1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": false,
|
||||
"cellSize": 5,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 0,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 0,
|
||||
"minGhsPop": 0,
|
||||
"minGhsBuilt": 0,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": false
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"university"
|
||||
],
|
||||
"filterCountry": "",
|
||||
"googleDomain": "google.com",
|
||||
"limitPerArea": 20,
|
||||
"zoom": 15,
|
||||
"language": "en"
|
||||
},
|
||||
"filterTypes": []
|
||||
}
|
||||
49
packages/kbot/cpp/config/gridsearch-lamu.json
Normal file
49
packages/kbot/cpp/config/gridsearch-lamu.json
Normal file
@ -0,0 +1,49 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "KEN.21_1",
|
||||
"name": "Lamu",
|
||||
"level": 1,
|
||||
"raw": {
|
||||
"gid": "KEN.21_1",
|
||||
"gadmName": "Lamu",
|
||||
"level": 1
|
||||
}
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": true,
|
||||
"cellSize": 5,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 50,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 10,
|
||||
"minGhsPop": 26,
|
||||
"minGhsBuilt": 154,
|
||||
"enableElevation": false,
|
||||
"enableDensity": false,
|
||||
"enableGhsPop": false,
|
||||
"enableGhsBuilt": false,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": true
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"plastic"
|
||||
],
|
||||
"filterCountry": "",
|
||||
"googleDomain": "google.com",
|
||||
"limitPerArea": 20,
|
||||
"zoom": 15,
|
||||
"language": "en"
|
||||
},
|
||||
"filterTypes": [
|
||||
"Recycling center"
|
||||
]
|
||||
}
|
||||
40
packages/kbot/cpp/config/gridsearch-sample.json
Normal file
40
packages/kbot/cpp/config/gridsearch-sample.json
Normal file
@ -0,0 +1,40 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "ABW",
|
||||
"name": "Aruba",
|
||||
"level": 0
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": false,
|
||||
"cellSize": 5,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 0,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 0,
|
||||
"minGhsPop": 0,
|
||||
"minGhsBuilt": 0,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": false
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"recycling"
|
||||
],
|
||||
"filterCountry": "",
|
||||
"googleDomain": "google.com",
|
||||
"limitPerArea": 20,
|
||||
"zoom": 15,
|
||||
"language": "en"
|
||||
},
|
||||
"filterTypes": [
|
||||
"Recycling center"
|
||||
]
|
||||
}
|
||||
45
packages/kbot/cpp/config/gridsearch-test-bcn-large.json
Normal file
45
packages/kbot/cpp/config/gridsearch-test-bcn-large.json
Normal file
@ -0,0 +1,45 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "ESP.6.1_1",
|
||||
"name": "Barcelona",
|
||||
"level": 3,
|
||||
"raw": {
|
||||
"level": 2,
|
||||
"gadmName": "Barcelona",
|
||||
"gid": "ESP.6.1_1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": true,
|
||||
"cellSize": 5,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 0,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 10,
|
||||
"minGhsPop": 26,
|
||||
"minGhsBuilt": 154,
|
||||
"enableElevation": false,
|
||||
"enableDensity": false,
|
||||
"enableGhsPop": false,
|
||||
"enableGhsBuilt": false,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": true
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"marketing"
|
||||
],
|
||||
"filterCountry": "Spain",
|
||||
"googleDomain": "google.es",
|
||||
"limitPerArea": 10,
|
||||
"useCache": true
|
||||
}
|
||||
}
|
||||
85
packages/kbot/cpp/config/gridsearch-test-bcn.json
Normal file
85
packages/kbot/cpp/config/gridsearch-test-bcn.json
Normal file
@ -0,0 +1,85 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "ESP.6.1.10.2_1",
|
||||
"name": "Barberà del Vallès",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 4,
|
||||
"gadmName": "Barberà del Vallès",
|
||||
"gid": "ESP.6.1.10.2_1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"gid": "ESP.6.1.10.14_1",
|
||||
"name": "Sabadell",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 4,
|
||||
"gadmName": "Sabadell",
|
||||
"gid": "ESP.6.1.10.14_1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"gid": "ESP.6.1.10.11_1",
|
||||
"name": "Polinyà",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 4,
|
||||
"gadmName": "Polinyà",
|
||||
"gid": "ESP.6.1.10.11_1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"gid": "ESP.6.1.10.4_1",
|
||||
"name": "Castellar del Vallès",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 4,
|
||||
"gadmName": "Castellar del Vallès",
|
||||
"gid": "ESP.6.1.10.4_1"
|
||||
}
|
||||
},
|
||||
{
|
||||
"gid": "ESP.6.1.10.19_1",
|
||||
"name": "Sentmenat",
|
||||
"level": 4,
|
||||
"raw": {
|
||||
"level": 4,
|
||||
"gadmName": "Sentmenat",
|
||||
"gid": "ESP.6.1.10.19_1"
|
||||
}
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": true,
|
||||
"cellSize": 10,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 0,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 10,
|
||||
"minGhsPop": 26,
|
||||
"minGhsBuilt": 154,
|
||||
"enableElevation": false,
|
||||
"enableDensity": false,
|
||||
"enableGhsPop": false,
|
||||
"enableGhsBuilt": false,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": true
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"mecanizado cnc"
|
||||
],
|
||||
"filterCountry": "Spain",
|
||||
"googleDomain": "google.es",
|
||||
"limitPerArea": 10,
|
||||
"useCache": true
|
||||
}
|
||||
}
|
||||
37
packages/kbot/cpp/config/gridsearch-test.json
Normal file
37
packages/kbot/cpp/config/gridsearch-test.json
Normal file
@ -0,0 +1,37 @@
|
||||
{
|
||||
"guided": {
|
||||
"areas": [
|
||||
{
|
||||
"gid": "ABW",
|
||||
"name": "Aruba",
|
||||
"level": 0
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"gridMode": "centers",
|
||||
"pathOrder": "snake",
|
||||
"groupByRegion": false,
|
||||
"cellSize": 5,
|
||||
"cellOverlap": 0,
|
||||
"centroidOverlap": 0,
|
||||
"ghsFilterMode": "OR",
|
||||
"maxCellsLimit": 50000,
|
||||
"maxElevation": 1000,
|
||||
"minDensity": 0,
|
||||
"minGhsPop": 0,
|
||||
"minGhsBuilt": 0,
|
||||
"allowMissingGhs": false,
|
||||
"bypassFilters": false
|
||||
}
|
||||
},
|
||||
"search": {
|
||||
"types": [
|
||||
"recycling"
|
||||
],
|
||||
"filterCountry": "",
|
||||
"googleDomain": "google.com",
|
||||
"limitPerArea": 1,
|
||||
"zoom": 15,
|
||||
"language": "en"
|
||||
}
|
||||
}
|
||||
60
packages/kbot/cpp/install-lnx.sh
Normal file
60
packages/kbot/cpp/install-lnx.sh
Normal file
@ -0,0 +1,60 @@
|
||||
#!/usr/bin/env bash
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
# install-lnx.sh – Install build dependencies for polymech-cli on Linux
|
||||
#
|
||||
# Tested on: Ubuntu 20.04+ / Debian 11+
|
||||
# Usage: sudo bash install-lnx.sh
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
set -euo pipefail
|
||||
|
||||
echo "── polymech-cli Linux dependency installer ──"
|
||||
|
||||
# ── 1. System packages (apt) ─────────────────────────────────────────────────
|
||||
echo ""
|
||||
echo "[1/3] Installing system packages via apt …"
|
||||
apt-get update -qq
|
||||
apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
gcc \
|
||||
g++ \
|
||||
git \
|
||||
libssl-dev \
|
||||
pkg-config \
|
||||
snapd
|
||||
|
||||
# ── 2. CMake ≥ 3.20 via snap ────────────────────────────────────────────────
|
||||
# The project requires cmake_minimum_required(VERSION 3.20).
|
||||
# Ubuntu 20.04 ships cmake 3.16, so we use the snap package instead.
|
||||
echo ""
|
||||
echo "[2/3] Installing CMake via snap (≥ 3.20 required) …"
|
||||
if command -v /snap/bin/cmake &>/dev/null; then
|
||||
echo " cmake snap already installed: $(/snap/bin/cmake --version | head -1)"
|
||||
else
|
||||
snap install cmake --classic
|
||||
echo " Installed: $(/snap/bin/cmake --version | head -1)"
|
||||
fi
|
||||
|
||||
# ── 3. Node.js (for npm run build:linux) ──────────────────────────────────────
|
||||
echo ""
|
||||
echo "[3/3] Checking for Node.js / npm …"
|
||||
if command -v node &>/dev/null; then
|
||||
echo " node $(node --version) already installed"
|
||||
else
|
||||
echo " Node.js not found. Install via nvm or nodesource, e.g.:"
|
||||
echo " curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -"
|
||||
echo " sudo apt-get install -y nodejs"
|
||||
fi
|
||||
|
||||
# ── Summary ──────────────────────────────────────────────────────────────────
|
||||
echo ""
|
||||
echo "── Done! ──"
|
||||
echo ""
|
||||
echo "All C++ dependencies (CLI11, tomlplusplus, Catch2, asio, concurrentqueue,"
|
||||
echo "taskflow, curl, lexbor, rapidjson) are fetched automatically by CMake"
|
||||
echo "FetchContent at build time — no manual installation needed."
|
||||
echo ""
|
||||
echo "To build:"
|
||||
echo " cd $(dirname "$0")"
|
||||
echo " npm run build:linux"
|
||||
echo ""
|
||||
echo "The binary will be placed in: dist/polymech-cli"
|
||||
159
packages/kbot/cpp/orchestrator/spawn.mjs
Normal file
159
packages/kbot/cpp/orchestrator/spawn.mjs
Normal file
@ -0,0 +1,159 @@
|
||||
/**
|
||||
* orchestrator/spawn.mjs
|
||||
*
|
||||
* Spawn a C++ worker as a child process, send/receive length-prefixed
|
||||
* JSON messages over stdin/stdout.
|
||||
*
|
||||
* Usage:
|
||||
* import { spawnWorker } from './spawn.mjs';
|
||||
 * const w = await spawnWorker('./dist/polymech-cli.exe');
|
||||
 * const res = await w.request({ type: 'ping' });
 * console.log(res); // { id: '...', type: 'pong', payload: {} }
|
||||
* await w.shutdown();
|
||||
*/
|
||||
|
||||
import { spawn } from 'node:child_process';
|
||||
import { randomUUID } from 'node:crypto';
|
||||
|
||||
// ── frame helpers ────────────────────────────────────────────────────────────
|
||||
|
||||
/** Write a 4-byte LE length + JSON body to a writable stream. */
|
||||
function writeFrame(stream, msg) {
|
||||
const body = JSON.stringify(msg);
|
||||
const bodyBuf = Buffer.from(body, 'utf8');
|
||||
const lenBuf = Buffer.alloc(4);
|
||||
lenBuf.writeUInt32LE(bodyBuf.length, 0);
|
||||
stream.write(Buffer.concat([lenBuf, bodyBuf]));
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a streaming frame parser.
|
||||
* Calls `onMessage(parsed)` for each complete frame.
|
||||
*/
|
||||
function createFrameReader(onMessage) {
|
||||
let buffer = Buffer.alloc(0);
|
||||
|
||||
return (chunk) => {
|
||||
buffer = Buffer.concat([buffer, chunk]);
|
||||
|
||||
while (buffer.length >= 4) {
|
||||
const bodyLen = buffer.readUInt32LE(0);
|
||||
const totalLen = 4 + bodyLen;
|
||||
|
||||
if (buffer.length < totalLen) break; // need more data
|
||||
|
||||
const bodyBuf = buffer.subarray(4, totalLen);
|
||||
buffer = buffer.subarray(totalLen);
|
||||
|
||||
try {
|
||||
const msg = JSON.parse(bodyBuf.toString('utf8'));
|
||||
onMessage(msg);
|
||||
} catch (e) {
|
||||
console.error('[orchestrator] failed to parse frame:', e.message);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
// ── spawnWorker ──────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Spawn the C++ binary in `worker` mode.
|
||||
* Returns: { send, request, shutdown, kill, process, ready }
|
||||
*
|
||||
* `ready` is a Promise that resolves when the worker sends `{ type: 'ready' }`.
|
||||
*/
|
||||
export function spawnWorker(exePath, args = ['worker']) {
|
||||
const proc = spawn(exePath, args, {
|
||||
stdio: ['pipe', 'pipe', 'pipe'],
|
||||
});
|
||||
|
||||
// Pending request map: id → { resolve, reject, timer }
|
||||
const pending = new Map();
|
||||
|
||||
// Event handler for unmatched messages (progress events, etc.)
|
||||
let eventHandler = null;
|
||||
|
||||
let readyResolve;
|
||||
const ready = new Promise((resolve) => { readyResolve = resolve; });
|
||||
|
||||
// stderr → console (worker logs via spdlog go to stderr)
|
||||
proc.stderr.on('data', (chunk) => {
|
||||
const text = chunk.toString().trim();
|
||||
if (text) console.error(`[worker:stderr] ${text}`);
|
||||
});
|
||||
|
||||
// stdout → frame parser → route by id / type
|
||||
const feedData = createFrameReader((msg) => {
|
||||
// Handle the initial "ready" signal
|
||||
if (msg.type === 'ready') {
|
||||
readyResolve(msg);
|
||||
return;
|
||||
}
|
||||
|
||||
// Route response to pending request
|
||||
if (msg.id && pending.has(msg.id)) {
|
||||
const { resolve, timer } = pending.get(msg.id);
|
||||
clearTimeout(timer);
|
||||
pending.delete(msg.id);
|
||||
resolve(msg);
|
||||
return;
|
||||
}
|
||||
|
||||
// Unmatched message (progress event, broadcast, etc.)
|
||||
if (eventHandler) {
|
||||
eventHandler(msg);
|
||||
} else {
|
||||
console.log('[orchestrator] unmatched message:', msg);
|
||||
}
|
||||
});
|
||||
|
||||
proc.stdout.on('data', feedData);
|
||||
|
||||
// ── public API ──────────────────────────────────────────────────────────
|
||||
|
||||
/** Fire-and-forget send. */
|
||||
function send(msg) {
|
||||
if (!msg.id) msg.id = randomUUID();
|
||||
writeFrame(proc.stdin, msg);
|
||||
}
|
||||
|
||||
/** Send a message and wait for the response with matching `id`. */
|
||||
function request(msg, timeoutMs = 5000) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const id = msg.id || randomUUID();
|
||||
msg.id = id;
|
||||
|
||||
const timer = setTimeout(() => {
|
||||
pending.delete(id);
|
||||
reject(new Error(`IPC request timed out after ${timeoutMs}ms (id=${id}, type=${msg.type})`));
|
||||
}, timeoutMs);
|
||||
|
||||
pending.set(id, { resolve, reject, timer });
|
||||
writeFrame(proc.stdin, msg);
|
||||
});
|
||||
}
|
||||
|
||||
/** Graceful shutdown: send shutdown message & wait for process exit. */
|
||||
async function shutdown(timeoutMs = 3000) {
|
||||
const res = await request({ type: 'shutdown' }, timeoutMs);
|
||||
// Wait for process to exit
|
||||
await new Promise((resolve) => {
|
||||
const timer = setTimeout(() => {
|
||||
proc.kill();
|
||||
resolve();
|
||||
}, timeoutMs);
|
||||
proc.on('exit', () => { clearTimeout(timer); resolve(); });
|
||||
});
|
||||
return res;
|
||||
}
|
||||
|
||||
return {
|
||||
send,
|
||||
request,
|
||||
shutdown,
|
||||
kill: () => proc.kill(),
|
||||
process: proc,
|
||||
ready,
|
||||
onEvent: (handler) => { eventHandler = handler; },
|
||||
};
|
||||
}
|
||||
204
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-daemon.mjs
Normal file
204
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-daemon.mjs
Normal file
@ -0,0 +1,204 @@
|
||||
/**
|
||||
 * orchestrator/test-gridsearch-ipc-daemon.mjs
|
||||
*
|
||||
* E2E test: spawn the C++ worker, send a gridsearch request
|
||||
* matching `npm run gridsearch:enrich` defaults, collect IPC events,
|
||||
* and verify the full event sequence.
|
||||
*
|
||||
 * Run:   node orchestrator/test-gridsearch-ipc-daemon.mjs
|
||||
* Needs: npm run build-debug (or npm run build)
|
||||
*/
|
||||
|
||||
import { spawnWorker } from './spawn.mjs';
|
||||
import { resolve, dirname } from 'node:path';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import fs from 'node:fs';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const IS_WIN = process.platform === 'win32';
|
||||
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
|
||||
|
||||
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
|
||||
if (!fs.existsSync(EXE)) {
|
||||
console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
|
||||
process.exit(1);
|
||||
}
|
||||
console.log(`Binary: ${EXE}\n`);
|
||||
|
||||
// Load the sample settings (same as gridsearch:enrich)
|
||||
const sampleConfig = JSON.parse(
|
||||
readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
|
||||
);
|
||||
|
||||
let passed = 0;
|
||||
let failed = 0;
|
||||
|
||||
function assert(condition, label) {
|
||||
if (condition) {
|
||||
console.log(` ✅ ${label}`);
|
||||
passed++;
|
||||
} else {
|
||||
console.error(` ❌ ${label}`);
|
||||
failed++;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Event collector ─────────────────────────────────────────────────────────
|
||||
|
||||
const EXPECTED_EVENTS = [
|
||||
'grid-ready',
|
||||
'waypoint-start',
|
||||
'area',
|
||||
'location',
|
||||
'enrich-start',
|
||||
'node',
|
||||
'nodePage',
|
||||
// 'node-error' — may or may not occur, depends on network
|
||||
];
|
||||
|
||||
/**
 * Build a collector that buckets incoming IPC messages by `msg.type`
 * and writes a one-line live progress indicator for a few event kinds.
 * Unknown event types get a bucket created on the fly.
 */
function createCollector() {
  const knownTypes = ['grid-ready', 'waypoint-start', 'area', 'location',
                      'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = {};
  for (const type of knownTypes) {
    events[type] = [];
  }

  return {
    events,
    handler(msg) {
      const type = msg.type;
      (events[type] ??= []).push(msg);

      // Live progress indicator
      const d = msg.payload ?? {};
      switch (type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
      }
    },
  };
}
|
||||
|
||||
// ── Main test ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * E2E flow: spawn the worker (daemon mode, stdio IPC), submit a gridsearch
 * job with enrichment enabled, then verify the final job_result payload and
 * the intermediate event stream before shutting the worker down.
 */
async function run() {
  console.log('🧪 Gridsearch IPC E2E Test\n');

  // ── 1. Spawn worker ───────────────────────────────────────────────────
  console.log('1. Spawn worker in daemon mode');
  const worker = spawnWorker(EXE, ['worker', '--daemon', '--user-uid', '3bb4cfbf-318b-44d3-a9d3-35680e738421']);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready signal');

  // ── 2. Register event collector ───────────────────────────────────────
  const collector = createCollector();
  worker.onEvent(collector.handler);

  // ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
  console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
  const t0 = Date.now();

  // Very long timeout — enrichment can take minutes
  const result = await worker.request(
    {
      type: 'gridsearch',
      payload: {
        ...sampleConfig,
        enrich: true,
      },
    },
    5 * 60 * 1000 // 5 min timeout
  );

  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
  console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);

  // ── 4. Verify final result ────────────────────────────────────────────
  console.log('3. Verify job_result');
  assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);

  const summary = result.payload ?? null;
  assert(summary !== null, 'job_result payload is present');

  // Shape-check every numeric field the orchestrator relies on.
  if (summary) {
    assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
    assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
    assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
    assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
    assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
    assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
    assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
    assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
    assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
  }

  // ── 5. Verify event sequence ──────────────────────────────────────────
  console.log('4. Verify event stream');
  const e = collector.events;

  assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
  assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
  assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
  assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
  assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);

  // node + node-error together cover every enrichment attempt.
  const totalNodes = e['node'].length + e['node-error'].length;
  assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);

  // Validate grid-ready payload
  if (e['grid-ready'].length > 0) {
    const gr = e['grid-ready'][0].payload ?? {};
    assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
    assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
  }

  // Validate location events have required fields
  if (e['location'].length > 0) {
    const loc = e['location'][0].payload ?? {};
    assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
    assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
    assert(typeof loc.areaName === 'string', 'location event has areaName');
  }
  assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);

  // Validate node payloads
  if (e['node'].length > 0) {
    const nd = e['node'][0].payload ?? {};
    assert(typeof nd.placeId === 'string', 'node event has placeId');
    assert(typeof nd.title === 'string', 'node event has title');
    assert(Array.isArray(nd.emails), 'node event has emails array');
    assert(typeof nd.status === 'string', 'node event has status');
  }

  // ── 6. Print event summary ────────────────────────────────────────────
  console.log('\n5. Event summary');
  for (const [type, arr] of Object.entries(e)) {
    if (arr.length > 0) console.log(` ${type}: ${arr.length}`);
  }

  // ── 7. Shutdown ───────────────────────────────────────────────────────
  console.log('\n6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');

  // NOTE(review): 500 ms may be tight on slow CI for the process to fully
  // exit; if this assertion flakes, poll for the 'exit' event instead.
  await new Promise(r => setTimeout(r, 500));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);

  // ── Summary ───────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);

  process.exit(failed > 0 ? 1 : 0);
}
|
||||
|
||||
// Entry point: run the E2E flow; convert unexpected failures into exit code 1.
run().catch((err) => {
  console.error('Test runner error:', err);
  process.exit(1);
});
|
||||
218
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-uds-meta.mjs
Normal file
218
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-uds-meta.mjs
Normal file
@ -0,0 +1,218 @@
|
||||
/**
|
||||
* orchestrator/test-gridsearch-ipc-uds-meta.mjs
|
||||
*
|
||||
* E2E test for Unix Domain Sockets / Windows Named Pipes (Meta Enrichment)!
|
||||
* Spawns the worker in `--uds` mode and tests direct high-throughput
|
||||
* lock-free JSON binary framing over a net.Socket.
|
||||
*/
|
||||
|
||||
import { spawn } from 'node:child_process';
|
||||
import { resolve, dirname, join } from 'node:path';
|
||||
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import net from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
|
||||
// Resolve the platform-specific worker binary under ../dist.
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// NOTE(review): TEST_CANCEL is never read in this file (the cancel path lives
// in test-gridsearch-ipc-uds.mjs) — confirm and remove if unneeded.
const TEST_CANCEL = false;

// Fail fast when the C++ binary has not been built yet.
if (!existsSync(EXE)) {
  console.error(`❌ Binary not found at ${EXE}`);
  process.exit(1);
}

// On Windows the worker listens on a local TCP port; on POSIX it binds a
// Unix domain socket file in the temp directory.
const PIPE_NAME = 'polymech-test-uds-meta';
const CPP_UDS_ARG = IS_WIN ? '4001' : join(tmpdir(), `${PIPE_NAME}.sock`);

// Remove a stale socket file from a previous run so the daemon can bind.
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
  unlinkSync(CPP_UDS_ARG);
}

console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
|
||||
|
||||
// ── Event collector ─────────────────────────────────────────────────────────
|
||||
/**
 * Build an event collector for the UDS framing test: buckets messages by
 * `msg.type`, writes a live progress line for a few event kinds, and fires
 * `onComplete` (assigned by the caller) when a `job_result` arrives.
 */
function createCollector() {
  const knownTypes = ['grid-ready', 'waypoint-start', 'area', 'location',
                      'enrich-start', 'node', 'node-error', 'nodePage', 'job_result'];
  const events = {};
  for (const type of knownTypes) {
    events[type] = [];
  }

  return {
    events,
    onComplete: null, // assigned by the caller; invoked once the pipeline finishes
    handler(msg) {
      const type = msg.type;
      (events[type] ??= []).push(msg);

      const d = msg.data ?? {};
      switch (type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
        case 'job_result':
          console.log(`\n 🏁 Pipeline complete!`);
          if (this.onComplete) this.onComplete(msg);
          break;
      }
    },
  };
}
|
||||
|
||||
// Pass/fail tallies for the final summary.
let passed = 0;
let failed = 0;

// Record one expectation: print a ✅/❌ line and bump the matching counter.
function assert(condition, label) {
  if (!condition) {
    console.error(`  ❌ ${label}`);
    failed++;
    return;
  }
  console.log(`  ✅ ${label}`);
  passed++;
}
|
||||
|
||||
/**
 * E2E flow: spawn the daemon in UDS mode, connect a net.Socket with retries,
 * stream a gridsearch job over length-prefixed JSON frames, wait for the
 * pipeline to finish, then verify the event stream — including the meta
 * enrichment outputs (social profiles and markdown site bodies).
 */
async function run() {
  console.log('🧪 Gridsearch UDS Meta E2E Test\n');

  // 1. Spawn worker in UDS mode
  console.log('1. Spawning remote C++ Taskflow Daemon');
  const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });

  // 2. Connect with retries — the daemon needs a moment to create the socket.
  console.log('2. Connecting net.Socket with retries...');

  let socket;
  for (let i = 0; i < 15; i++) {
    try {
      await new Promise((resolve, reject) => {
        if (IS_WIN) {
          socket = net.connect({ port: 4001, host: '127.0.0.1' });
        } else {
          socket = net.connect(CPP_UDS_ARG);
        }
        socket.once('connect', resolve);
        socket.once('error', reject);
      });
      console.log('  ✅ Socket Connected to UDS!');
      break;
    } catch (e) {
      // FIX: destroy the failed socket before retrying (or rethrowing);
      // previously every failed attempt leaked a half-open handle.
      if (socket) socket.destroy();
      if (i === 14) throw e;
      await new Promise(r => setTimeout(r, 500));
    }
  }

  const collector = createCollector();
  let buffer = Buffer.alloc(0);

  // Buffer framing logic (length-prefixed streaming): each frame is a
  // 4-byte little-endian length followed by a UTF-8 JSON payload.
  socket.on('data', (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    while (buffer.length >= 4) {
      const len = buffer.readUInt32LE(0);
      if (buffer.length >= 4 + len) {
        const payload = buffer.toString('utf8', 4, 4 + len);
        buffer = buffer.subarray(4 + len);
        try {
          const msg = JSON.parse(payload);
          collector.handler(msg);
        } catch (e) {
          console.error("JSON PARSE ERROR:", e, payload);
        }
      } else {
        break; // Wait for more chunks
      }
    }
  });

  // 3. Send Gridsearch payload
  // USE gridsearch-sample.json instead of gridsearch-bcn-universities.json
  const sampleConfig = JSON.parse(
    readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
  );

  sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
  sampleConfig.jobId = 'uds-meta-test-abc';
  sampleConfig.noCache = true; // force re-enrichment even if cached

  console.log('3. Writing serialized IPC Payload over pipe...');
  const jsonStr = JSON.stringify(sampleConfig);
  const lenBuf = Buffer.alloc(4);
  lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
  socket.write(lenBuf);
  socket.write(jsonStr);

  // 4. Wait for pipeline completion (job_result event) or timeout
  console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');

  await new Promise((resolve) => {
    collector.onComplete = () => {
      // Send stop command to gracefully shut down the daemon
      console.log('  📤 Sending stop command to daemon...');
      const stopPayload = JSON.stringify({ action: 'stop' });
      const stopLen = Buffer.alloc(4);
      stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
      socket.write(stopLen);
      socket.write(stopPayload);
      setTimeout(resolve, 1000); // Give daemon a moment to ack
    };

    // Safety timeout (resolving twice is a harmless no-op if onComplete won).
    setTimeout(() => {
      console.log('\n  ⏰ Timeout reached (300s) — forcing shutdown.');
      resolve();
    }, 300000); // Wait up to 5 minutes
  });

  console.log('\n\n5. Event summary');
  for (const [k, v] of Object.entries(collector.events)) {
    console.log(`  ${k}: ${v.length}`);
  }

  // Assertions
  const ev = collector.events;
  assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
  assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
  assert(ev['location'].length > 0, 'location events received');
  assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
  assert(ev['job_result'].length === 1, 'job_result emitted once');

  // Verify social profiles and md body
  const nodes = ev['node'];
  let foundSocial = false;
  let foundSiteMd = false;

  for (const n of nodes) {
    const d = n.data;
    if (!d) continue;

    if (d.socials && d.socials.length > 0) {
      foundSocial = true;
    }

    if (d.sites && Array.isArray(d.sites) && d.sites.length > 0) {
      foundSiteMd = true;
    }
  }

  // Socials are data-dependent, so only assert when at least one was found.
  if (foundSocial) {
    assert(foundSocial, 'At least one enriched node has social media profiles discovered');
  } else {
    console.log('  ⚠️ No social media profiles discovered in this run (data-dependent), but pipeline completed.');
  }

  assert(foundSiteMd, 'At least one enriched node has markdown sites mapped');

  console.log('6. Cleanup');
  socket.destroy();
  worker.kill('SIGTERM');

  console.log(`\n────────────────────────────────`);
  console.log(`  Passed: ${passed}  Failed: ${failed}`);
  console.log(`────────────────────────────────`);
  process.exit(failed > 0 ? 1 : 0);
}
|
||||
|
||||
// Entry point: any uncaught error fails the run with exit code 1.
run().catch(e => {
  console.error(e);
  process.exit(1);
});
|
||||
255
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-uds.mjs
Normal file
255
packages/kbot/cpp/orchestrator/test-gridsearch-ipc-uds.mjs
Normal file
@ -0,0 +1,255 @@
|
||||
/**
|
||||
* orchestrator/test-gridsearch-ipc-uds.mjs
|
||||
*
|
||||
* E2E test for Unix Domain Sockets / Windows Named Pipes!
|
||||
* Spawns the worker in `--uds` mode and tests direct high-throughput
|
||||
* lock-free JSON binary framing over a net.Socket.
|
||||
*/
|
||||
|
||||
import { spawn } from 'node:child_process';
|
||||
import { resolve, dirname, join } from 'node:path';
|
||||
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import net from 'node:net';
|
||||
import { tmpdir } from 'node:os';
|
||||
|
||||
// Resolve the platform-specific worker binary under ../dist.
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Toggle for the dynamic-cancellation branch inside run().
const TEST_CANCEL = false;

// Fail fast when the C++ binary has not been built yet.
if (!existsSync(EXE)) {
  console.error(`❌ Binary not found at ${EXE}`);
  process.exit(1);
}

// On Windows the worker listens on a local TCP port; on POSIX it binds a
// Unix domain socket file in the temp directory.
const PIPE_NAME = 'polymech-test-uds';
const CPP_UDS_ARG = IS_WIN ? '4000' : join(tmpdir(), `${PIPE_NAME}.sock`);

// Remove a stale socket file from a previous run so the daemon can bind.
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
  unlinkSync(CPP_UDS_ARG);
}

console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
|
||||
|
||||
// ── Event collector ─────────────────────────────────────────────────────────
|
||||
/**
 * Build an event collector for the UDS framing test: buckets messages by
 * `msg.type`, writes a live progress line for a few event kinds, and fires
 * `onComplete` (assigned by the caller) when a `job_result` arrives.
 */
function createCollector() {
  const knownTypes = ['grid-ready', 'waypoint-start', 'area', 'location',
                      'enrich-start', 'node', 'node-error', 'nodePage', 'job_result'];
  const events = {};
  for (const type of knownTypes) {
    events[type] = [];
  }

  return {
    events,
    onComplete: null, // assigned by the caller; invoked once the pipeline finishes
    handler(msg) {
      const type = msg.type;
      (events[type] ??= []).push(msg);

      const d = msg.data ?? {};
      switch (type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
        case 'job_result':
          console.log(`\n 🏁 Pipeline complete!`);
          if (this.onComplete) this.onComplete(msg);
          break;
      }
    },
  };
}
|
||||
|
||||
// Pass/fail tallies for the final summary.
let passed = 0;
let failed = 0;

// Record one expectation: print a ✅/❌ line and bump the matching counter.
function assert(condition, label) {
  if (!condition) {
    console.error(`  ❌ ${label}`);
    failed++;
    return;
  }
  console.log(`  ✅ ${label}`);
  passed++;
}
|
||||
|
||||
/**
 * E2E flow: spawn the daemon in UDS / named-pipe mode, connect a net.Socket
 * with retries, stream a gridsearch job over length-prefixed JSON frames,
 * optionally exercise dynamic cancellation, then verify the event stream
 * and the filterTypes / website post-filters applied by the C++ side.
 */
async function run() {
  console.log('🧪 Gridsearch UDS / Named Pipe E2E Test\n');

  // 1. Spawn worker in UDS mode
  console.log('1. Spawning remote C++ Taskflow Daemon');
  const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });

  // 2. Connect with retries — the daemon needs a moment to create the socket.
  console.log('2. Connecting net.Socket with retries...');

  let socket;
  for (let i = 0; i < 15; i++) {
    try {
      await new Promise((resolve, reject) => {
        if (IS_WIN) {
          socket = net.connect({ port: 4000, host: '127.0.0.1' });
        } else {
          socket = net.connect(CPP_UDS_ARG);
        }
        socket.once('connect', resolve);
        socket.once('error', reject);
      });
      console.log('  ✅ Socket Connected to UDS!');
      break;
    } catch (e) {
      // FIX: destroy the failed socket before retrying (or rethrowing);
      // previously every failed attempt leaked a half-open handle.
      if (socket) socket.destroy();
      if (i === 14) throw e;
      await new Promise(r => setTimeout(r, 500));
    }
  }

  const collector = createCollector();
  let buffer = Buffer.alloc(0);

  // Buffer framing logic (length-prefixed streaming): each frame is a
  // 4-byte little-endian length followed by a UTF-8 JSON payload.
  socket.on('data', (chunk) => {
    buffer = Buffer.concat([buffer, chunk]);
    while (buffer.length >= 4) {
      const len = buffer.readUInt32LE(0);
      if (buffer.length >= 4 + len) {
        const payload = buffer.toString('utf8', 4, 4 + len);
        buffer = buffer.subarray(4 + len);
        try {
          const msg = JSON.parse(payload);
          collector.handler(msg);
        } catch (e) {
          console.error("JSON PARSE ERROR:", e, payload);
        }
      } else {
        break; // Wait for more chunks
      }
    }
  });

  // 3. Send Gridsearch payload
  const sampleConfig = JSON.parse(
    readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-bcn-universities.json'), 'utf8')
  );

  sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
  sampleConfig.jobId = 'uds-test-cancel-abc';

  console.log('3. Writing serialized IPC Payload over pipe...');
  const jsonStr = JSON.stringify(sampleConfig);
  const lenBuf = Buffer.alloc(4);
  lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
  socket.write(lenBuf);
  socket.write(jsonStr);

  // Send cancellation after 5 seconds
  if (TEST_CANCEL) {
    setTimeout(() => {
      console.log('\n\n--> Testing Dynamic Cancellation (Sending cancel event for uds-test-cancel-abc)...');
      const cancelPayload = JSON.stringify({ action: "cancel", jobId: "uds-test-cancel-abc" });
      const cancelLenBuf = Buffer.alloc(4);
      cancelLenBuf.writeUInt32LE(Buffer.byteLength(cancelPayload));
      socket.write(cancelLenBuf);
      socket.write(cancelPayload);
    }, 5000);
  }

  // 4. Wait for pipeline completion (job_result event) or timeout
  console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');

  await new Promise((resolve) => {
    collector.onComplete = () => {
      // Send stop command to gracefully shut down the daemon
      console.log('  📤 Sending stop command to daemon...');
      const stopPayload = JSON.stringify({ action: 'stop' });
      const stopLen = Buffer.alloc(4);
      stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
      socket.write(stopLen);
      socket.write(stopPayload);
      setTimeout(resolve, 1000); // Give daemon a moment to ack
    };

    // Safety timeout (resolving twice is a harmless no-op if onComplete won).
    setTimeout(() => {
      console.log('\n  ⏰ Timeout reached (120s) — forcing shutdown.');
      resolve();
    }, 120000);
  });

  console.log('\n\n5. Event summary');
  for (const [k, v] of Object.entries(collector.events)) {
    console.log(`  ${k}: ${v.length}`);
  }

  // Assertions
  const ev = collector.events;
  assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
  assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
  assert(ev['location'].length > 0, 'location events received');
  assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
  assert(ev['job_result'].length === 1, 'job_result emitted once');

  // Check enrichment skip log (if present in log events)
  // NOTE(review): 'log' is not pre-seeded in the collector, so this bucket
  // only exists if the worker actually emitted log events — hence the `?? []`.
  const logEvents = ev['log'] ?? [];
  const skipLog = logEvents.find(l =>
    typeof l.data === 'string' && l.data.includes('already enriched')
  );
  const nodeCount = ev['node'].length + ev['node-error'].length;
  if (skipLog) {
    console.log(`  ℹ️ Pre-enrich skip detected: ${skipLog.data}`);
    assert(nodeCount === 0, 'no enrichment needed (all skipped)');
  } else {
    console.log('  ℹ️ No pre-enrich skips (all locations are new or unenriched)');
    assert(nodeCount > 0, 'enrichment node events received');
  }

  // Check filterTypes assertions: all locations must have website + matching type
  const FILTER_TYPE = 'Recycling center';
  const locations = ev['location'];
  const badWebsite = locations.filter(l => {
    const loc = l.data?.location;
    return !loc?.website;
  });

  assert(badWebsite.length === 0, `all locations have website (${badWebsite.length} missing)`);

  const badType = locations.filter(l => {
    const loc = l.data?.location;
    const types = loc?.types ?? [];
    const type = loc?.type ?? '';
    return !types.includes(FILTER_TYPE) && type !== FILTER_TYPE;
  });
  if (badType.length > 0) {
    console.log(`  ❌ Mismatched locations:`);
    badType.slice(0, 3).forEach(l => console.log(JSON.stringify(l.data?.location, null, 2)));
  }
  assert(badType.length === 0, `all locations match type "${FILTER_TYPE}" (${badType.length} mismatched)`);

  const filterLog = logEvents.find(l =>
    typeof l.data === 'string' && l.data.includes('locations removed')
  );
  if (filterLog) {
    console.log(`  ℹ️ Filter applied: ${filterLog.data}`);
  }

  const filterTypesLog = logEvents.filter(l =>
    typeof l.data === 'string' && (l.data.includes('filterTypes:') || l.data.includes(' - '))
  );
  if (filterTypesLog.length > 0) {
    console.log(`  ℹ️ Parsed filterTypes in C++:`);
    filterTypesLog.forEach(l => console.log(`    ${l.data}`));
  }

  console.log(`  ℹ️ Locations after filter: ${locations.length}`);

  console.log('6. Cleanup');
  socket.destroy();
  worker.kill('SIGTERM');

  console.log(`\n────────────────────────────────`);
  console.log(`  Passed: ${passed}  Failed: ${failed}`);
  console.log(`────────────────────────────────`);
  process.exit(failed > 0 ? 1 : 0);
}
|
||||
|
||||
// Entry point: any uncaught error fails the run with exit code 1.
run().catch(e => {
  console.error(e);
  process.exit(1);
});
|
||||
204
packages/kbot/cpp/orchestrator/test-gridsearch-ipc.mjs
Normal file
204
packages/kbot/cpp/orchestrator/test-gridsearch-ipc.mjs
Normal file
@ -0,0 +1,204 @@
|
||||
/**
|
||||
* orchestrator/test-gridsearch-ipc.mjs
|
||||
*
|
||||
* E2E test: spawn the C++ worker, send a gridsearch request
|
||||
* matching `npm run gridsearch:enrich` defaults, collect IPC events,
|
||||
* and verify the full event sequence.
|
||||
*
|
||||
* Run: node orchestrator/test-gridsearch-ipc.mjs
|
||||
* Needs: npm run build-debug (or npm run build)
|
||||
*/
|
||||
|
||||
import { spawnWorker } from './spawn.mjs';
|
||||
import { resolve, dirname } from 'node:path';
|
||||
import { readFileSync } from 'node:fs';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
import fs from 'node:fs';
|
||||
|
||||
// Resolve the platform-specific worker binary under ../dist.
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';

const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Fail fast when the C++ binary has not been built yet.
if (!fs.existsSync(EXE)) {
  console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
  process.exit(1);
}
console.log(`Binary: ${EXE}\n`);

// Load the sample settings (same as gridsearch:enrich)
const sampleConfig = JSON.parse(
  readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);

// Global pass/fail counters, updated by assert() below and printed at the end.
let passed = 0;
let failed = 0;
|
||||
|
||||
// Record one expectation: print a ✅/❌ line and bump the matching counter.
function assert(condition, label) {
  if (!condition) {
    console.error(`  ❌ ${label}`);
    failed++;
    return;
  }
  console.log(`  ✅ ${label}`);
  passed++;
}
|
||||
|
||||
// ── Event collector ─────────────────────────────────────────────────────────
|
||||
|
||||
// Event types the pipeline is expected to emit, in rough emission order.
// NOTE(review): this constant does not appear to be referenced anywhere in
// this file (the checks in run() enumerate types inline) — confirm and
// either use it or remove it.
const EXPECTED_EVENTS = [
  'grid-ready',
  'waypoint-start',
  'area',
  'location',
  'enrich-start',
  'node',
  'nodePage',
  // 'node-error' — may or may not occur, depends on network
];
|
||||
|
||||
/**
 * Build a collector that buckets incoming IPC messages by `msg.type`
 * and writes a one-line live progress indicator for a few event kinds.
 * Unknown event types get a bucket created on the fly.
 */
function createCollector() {
  const knownTypes = ['grid-ready', 'waypoint-start', 'area', 'location',
                      'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = {};
  for (const type of knownTypes) {
    events[type] = [];
  }

  return {
    events,
    handler(msg) {
      const type = msg.type;
      (events[type] ??= []).push(msg);

      // Live progress indicator
      const d = msg.payload ?? {};
      switch (type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
      }
    },
  };
}
|
||||
|
||||
// ── Main test ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * E2E flow: spawn the worker over stdio IPC, submit a gridsearch job with
 * enrichment enabled, then verify the final job_result payload and the
 * intermediate event stream before shutting the worker down.
 */
async function run() {
  console.log('🧪 Gridsearch IPC E2E Test\n');

  // ── 1. Spawn worker ───────────────────────────────────────────────────
  console.log('1. Spawn worker');
  const worker = spawnWorker(EXE);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready signal');

  // ── 2. Register event collector ───────────────────────────────────────
  const collector = createCollector();
  worker.onEvent(collector.handler);

  // ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
  console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
  const t0 = Date.now();

  // Very long timeout — enrichment can take minutes
  const result = await worker.request(
    {
      type: 'gridsearch',
      payload: {
        ...sampleConfig,
        enrich: true,
      },
    },
    5 * 60 * 1000 // 5 min timeout
  );

  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
  console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);

  // ── 4. Verify final result ────────────────────────────────────────────
  console.log('3. Verify job_result');
  assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);

  const summary = result.payload ?? null;
  assert(summary !== null, 'job_result payload is present');

  // Shape-check every numeric field the orchestrator relies on.
  if (summary) {
    assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
    assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
    assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
    assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
    assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
    assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
    assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
    assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
    assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
  }

  // ── 5. Verify event sequence ──────────────────────────────────────────
  console.log('4. Verify event stream');
  const e = collector.events;

  assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
  assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
  assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
  assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
  assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);

  // node + node-error together cover every enrichment attempt.
  const totalNodes = e['node'].length + e['node-error'].length;
  assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);

  // Validate grid-ready payload
  if (e['grid-ready'].length > 0) {
    const gr = e['grid-ready'][0].payload ?? {};
    assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
    assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
  }

  // Validate location events have required fields
  if (e['location'].length > 0) {
    const loc = e['location'][0].payload ?? {};
    assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
    assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
    assert(typeof loc.areaName === 'string', 'location event has areaName');
  }
  assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);

  // Validate node payloads
  if (e['node'].length > 0) {
    const nd = e['node'][0].payload ?? {};
    assert(typeof nd.placeId === 'string', 'node event has placeId');
    assert(typeof nd.title === 'string', 'node event has title');
    assert(Array.isArray(nd.emails), 'node event has emails array');
    assert(typeof nd.status === 'string', 'node event has status');
  }

  // ── 6. Print event summary ────────────────────────────────────────────
  console.log('\n5. Event summary');
  for (const [type, arr] of Object.entries(e)) {
    if (arr.length > 0) console.log(` ${type}: ${arr.length}`);
  }

  // ── 7. Shutdown ───────────────────────────────────────────────────────
  console.log('\n6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');

  // NOTE(review): 500 ms may be tight on slow CI for the process to fully
  // exit; if this assertion flakes, poll for the 'exit' event instead.
  await new Promise(r => setTimeout(r, 500));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);

  // ── Summary ───────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);

  process.exit(failed > 0 ? 1 : 0);
}
|
||||
|
||||
// Surface any unexpected rejection from the test run and fail the process.
run().catch(function (err) {
  console.error('Test runner error:', err);
  process.exit(1);
});
90
packages/kbot/cpp/orchestrator/test-ipc.mjs
Normal file
90
packages/kbot/cpp/orchestrator/test-ipc.mjs
Normal file
@ -0,0 +1,90 @@
|
||||
/**
|
||||
* orchestrator/test-ipc.mjs
|
||||
*
|
||||
* Integration test: spawn the C++ worker, exchange messages, verify responses.
|
||||
*
|
||||
* Run: node orchestrator/test-ipc.mjs
|
||||
* Needs: npm run build (to compile the C++ binary first)
|
||||
*/
|
||||
|
||||
import { spawnWorker } from './spawn.mjs';
|
||||
import { resolve, dirname } from 'node:path';
|
||||
import { fileURLToPath } from 'node:url';
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const EXE = resolve(__dirname, '..', 'dist', 'polymech-cli.exe');
|
||||
|
||||
let passed = 0;
let failed = 0;

// Record one check: print a tick/cross line and bump the matching counter.
function assert(condition, label) {
  const ok = Boolean(condition);
  if (ok) {
    passed++;
    console.log(` ✅ ${label}`);
  } else {
    failed++;
    console.error(` ❌ ${label}`);
  }
}
|
||||
/**
 * End-to-end IPC exercise against the C++ worker binary:
 * spawn → ready → ping/pong → job echo → unknown-type error →
 * 10 concurrent pings → graceful shutdown → exit-code check.
 * Every step records pass/fail through assert(); the final exit code
 * is 1 if any check failed, 0 otherwise.
 */
async function run() {
  console.log('\n🔧 IPC Integration Tests\n');

  // ── 1. Spawn & ready ────────────────────────────────────────────────────
  console.log('1. Spawn worker and wait for ready signal');
  const worker = spawnWorker(EXE);

  // worker.ready resolves with the first message the binary emits.
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready message on startup');

  // ── 2. Ping / Pong ─────────────────────────────────────────────────────
  console.log('2. Ping → Pong');
  const pong = await worker.request({ type: 'ping' });
  assert(pong.type === 'pong', `Response type is "pong" (got "${pong.type}")`);

  // ── 3. Job echo ─────────────────────────────────────────────────────────
  console.log('3. Job → Job Result (echo payload)');
  const payload = { action: 'resize', width: 1024, format: 'webp' };
  const jobResult = await worker.request({ type: 'job', payload });
  assert(jobResult.type === 'job_result', `Response type is "job_result" (got "${jobResult.type}")`);
  assert(
    jobResult.payload?.action === 'resize' && jobResult.payload?.width === 1024,
    'Payload echoed back correctly'
  );

  // ── 4. Unknown type → error ─────────────────────────────────────────────
  console.log('4. Unknown type → error response');
  const errResp = await worker.request({ type: 'nonsense' });
  assert(errResp.type === 'error', `Response type is "error" (got "${errResp.type}")`);

  // ── 5. Multiple rapid requests ──────────────────────────────────────────
  // Fires 10 pings without awaiting in between to exercise request
  // correlation under concurrency.
  console.log('5. Multiple concurrent requests');
  const promises = [];
  for (let i = 0; i < 10; i++) {
    promises.push(worker.request({ type: 'ping', payload: { seq: i } }));
  }
  const results = await Promise.all(promises);
  assert(results.length === 10, `All 10 responses received`);
  assert(results.every(r => r.type === 'pong'), 'All responses are pong');

  // ── 6. Graceful shutdown ────────────────────────────────────────────────
  console.log('6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', `Shutdown acknowledged (got "${shutdownRes.type}")`);

  // Wait a beat for process exit
  await new Promise(r => setTimeout(r, 200));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);

  // ── Summary ─────────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);

  process.exit(failed > 0 ? 1 : 0);
}
|
||||
// Catch any rejection escaping run() so the process never exits 0 on error.
run().catch(function (err) {
  console.error('Test runner error:', err);
  process.exit(1);
});
6
packages/kbot/cpp/package-lock.json
generated
Normal file
6
packages/kbot/cpp/package-lock.json
generated
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"name": "mono-cpp",
|
||||
"lockfileVersion": 3,
|
||||
"requires": true,
|
||||
"packages": {}
|
||||
}
|
||||
40
packages/kbot/cpp/package.json
Normal file
40
packages/kbot/cpp/package.json
Normal file
@ -0,0 +1,40 @@
|
||||
{
|
||||
"name": "mono-cpp",
|
||||
"version": "1.0.0",
|
||||
"description": "Cross-platform C++ CLI built with CMake.",
|
||||
"directories": {
|
||||
"test": "tests"
|
||||
},
|
||||
"scripts": {
|
||||
"config": "cmake --preset dev",
|
||||
"config:release": "cmake --preset release",
|
||||
"build": "cmake --preset dev && cmake --build --preset dev",
|
||||
"build:release": "cmake --preset release && cmake --build --preset release",
|
||||
"build:linux": "bash build-linux.sh",
|
||||
"test": "ctest --test-dir build/dev -C Debug --output-on-failure",
|
||||
"test:release": "ctest --test-dir build/release -C Release --output-on-failure",
|
||||
"clean": "cmake -E rm -rf build dist",
|
||||
"rebuild": "npm run clean && npm run build",
|
||||
"run": ".\\dist\\polymech-cli.exe --help",
|
||||
"worker": ".\\dist\\polymech-cli.exe worker",
|
||||
"test:ipc": "node orchestrator/test-gridsearch-ipc.mjs",
|
||||
"gridsearch": ".\\dist\\polymech-cli.exe gridsearch ABW recycling --dry-run",
|
||||
"gridsearch:settings": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --dry-run",
|
||||
"gridsearch:settings:live": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json",
|
||||
"gridsearch:enrich": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --enrich",
|
||||
"gridsearch:enrich-test": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-test-bcn.json --enrich --persistence-postgres",
|
||||
"test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs",
|
||||
"test:gridsearch-filter-ipc": "cmake --build build/release --target test_gridsearch_ipc && .\\dist\\test_gridsearch_ipc.exe",
|
||||
"test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs",
|
||||
"test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs",
|
||||
"test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs",
|
||||
"test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://git.polymech.info/polymech/mono-cpp.git"
|
||||
},
|
||||
"keywords": [],
|
||||
"author": "",
|
||||
"license": "ISC"
|
||||
}
|
||||
4
packages/kbot/cpp/packages/enrichers/CMakeLists.txt
Normal file
4
packages/kbot/cpp/packages/enrichers/CMakeLists.txt
Normal file
@ -0,0 +1,4 @@
|
||||
# ── enrichers ────────────────────────────────────────────────────────────────
# Static library implementing the location-enrichment pipeline
# (site metadata scraping + email extraction).
add_library(enrichers STATIC src/enrichers.cpp)

# Anchor the public include path to this directory explicitly so the target
# stays correct even if this file is included from another directory scope.
target_include_directories(enrichers PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

# NOTE(review): the public header only includes std headers; these link
# dependencies could likely be PRIVATE — kept PUBLIC to avoid breaking
# consumers that rely on the transitive usage requirements.
target_link_libraries(enrichers PUBLIC http html json logger)
@ -0,0 +1,162 @@
|
||||
#pragma once

#include <map>
#include <string>
#include <vector>

namespace enrichers {

// ── Status codes ────────────────────────────────────────────────────────────

// Outcome of the enrichment pipeline for one location.
enum class EnrichStatus {
  OK,
  NO_EMAIL,       // default/initial status in enrich_location()
  META_TIMEOUT,
  EMAIL_TIMEOUT,
  FETCH_ERROR,    // also used when a location has no website at all
  NO_PAGES,
  ERROR,
};

// Human-readable name for a status value (e.g. "OK", "NO_EMAIL").
const char *status_string(EnrichStatus s);

// ── Data types ──────────────────────────────────────────────────────────────

// Per-page diagnostic record collected while scanning for emails.
struct PageError {
  std::string url;
  std::string status; // "SEARCHED_EMAIL", "FAILED", ...
  std::string method; // "GET", "SCRAPELESS", ...
  std::string error;
  int http_status = 0;
  std::vector<std::string> emails;
};

// A classified social-media link found on a page.
struct SocialLink {
  std::string platform; // "instagram", "facebook", "linkedin", ...
  std::string url;
};

// Everything extracted from one page's HTML by scrape_meta*().
struct SiteMeta {
  std::string title;
  std::string description;
  std::string og_image;
  std::string canonical;
  std::vector<SocialLink> socials;
  std::vector<std::string> internal_pages; // discovered internal hrefs
  std::vector<std::string> emails;
  std::string body_text;
  std::string body_html;
  std::map<std::string, std::string> sites; // url -> body_md
  int http_status = 0;
  std::string fetch_error;   // non-empty when the fetch itself failed
  std::vector<std::string> json_ld;
};

// Final enrichment result for one location (meta phase + email phase).
struct EnrichedNode {
  int idx = 0;
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  int pages_found = 0;
  int pages_scraped = 0;
  std::vector<std::string> emails;
  std::vector<SocialLink> socials;
  int meta_ms = 0;   // time spent in the meta phase
  int email_ms = 0;  // time spent in the email phase
  int total_ms = 0;  // end-to-end wall time
  EnrichStatus status = EnrichStatus::NO_EMAIL;
  std::string error;
  std::map<std::string, std::string> pages; // "home" → body text
  std::vector<std::string> meta_pages;
  std::vector<PageError> page_errors;
  std::string enricher_hash;
  std::string geo_json;
  std::map<std::string, std::string> sites; // url -> body_md
};

// ── Configuration ───────────────────────────────────────────────────────────

// Tunables for the enrichment pipeline; defaults are usable as-is.
struct EnrichConfig {
  bool enable_homepage_md = true;
  int meta_timeout_ms = 10000;
  int email_timeout_ms = 15000;
  int email_page_timeout_ms = 10000;
  int email_max_pages = 8;       // max candidate pages scanned per site
  int email_abort_after = 1;     // stop after this many emails found

  /// Scrapeless API key — if set, pages that yield no emails via plain
  /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping
  /// API (JS rendering). Leave empty to disable the fallback.
  std::string scrapeless_key;

  std::string bigdata_key;

  // Substrings that mark a URL as a likely contact/about page.
  std::vector<std::string> contact_patterns = {
      "contact", "kontakt", "contacto", "contacta", "impression",
      "about", "impress", "impressum", "datenschutz", "privacy",
      "legal", "team", "nosotros", "empresa", "sobre",
  };
  // Paths probed directly even if not discovered as links.
  std::vector<std::string> probe_paths = {
      "/contact", "/contacto", "/kontakt", "/contacta",
      "/about", "/about-us", "/impressum",
  };

  std::string meta_scraper;     // "SCRAPELESS" selects the JS-rendering path
  int meta_concurrency = 5;
  int meta_idle_timeout = 60;
};

// ── Location input ──────────────────────────────────────────────────────────

// Minimal description of a location to enrich.
struct LocationInput {
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  double lat = 0;
  double lng = 0;
};

// ── Core API ────────────────────────────────────────────────────────────────

/// Check if a candidate string looks like a real email address.
bool is_likely_email(const std::string &candidate);

/// Extract all email addresses from a text body.
std::vector<std::string> extract_emails(const std::string &text);

/// Scrape metadata from a website URL (static HTML via libcurl + lexbor).
SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000);

/// Scrape emails from a single page URL.
/// NOTE(review): the definition in enrichers.cpp takes an additional
/// `int &out_status_code` parameter; this two-argument declaration does not
/// match it — confirm which signature callers use and reconcile.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms = 10000);

/// Fetch a page via Scrapeless Universal Scraping API (JS rendering),
/// then extract emails from the rendered HTML. Returns empty if key is
/// blank or the API call fails.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms = 15000);

/// Scrape metadata from a website URL via Scrapeless Universal API (JS
/// rendering).
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key,
                                int timeout_ms = 15000);

/// Full enrichment pipeline for a single location: meta → email.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg = {});

/// Resolve a URL relative to a base URL.
std::string resolve_url(const std::string &base, const std::string &href);

} // namespace enrichers
||||
800
packages/kbot/cpp/packages/enrichers/src/enrichers.cpp
Normal file
800
packages/kbot/cpp/packages/enrichers/src/enrichers.cpp
Normal file
@ -0,0 +1,800 @@
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "html/html.h"
|
||||
#include "http/http.h"
|
||||
#include "logger/logger.h"
|
||||
#include "json/json.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <chrono>
|
||||
#include <future>
|
||||
#include <regex>
|
||||
#include <set>
|
||||
#include <sstream>
|
||||
|
||||
namespace enrichers {
|
||||
|
||||
// ── Status string ───────────────────────────────────────────────────────────
|
||||
|
||||
// Human-readable name for an EnrichStatus; "UNKNOWN" for out-of-range values.
const char *status_string(EnrichStatus s) {
  switch (s) {
  case EnrichStatus::OK:            return "OK";
  case EnrichStatus::NO_EMAIL:      return "NO_EMAIL";
  case EnrichStatus::META_TIMEOUT:  return "META_TIMEOUT";
  case EnrichStatus::EMAIL_TIMEOUT: return "EMAIL_TIMEOUT";
  case EnrichStatus::FETCH_ERROR:   return "FETCH_ERROR";
  case EnrichStatus::NO_PAGES:      return "NO_PAGES";
  case EnrichStatus::ERROR:         return "ERROR";
  }
  return "UNKNOWN";
}
|
||||
// ── Timing helper ───────────────────────────────────────────────────────────
|
||||
|
||||
// Milliseconds elapsed since `t0` on the steady (monotonic) clock.
static int elapsed_ms(std::chrono::steady_clock::time_point t0) {
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  const auto delta = std::chrono::steady_clock::now() - t0;
  return static_cast<int>(duration_cast<milliseconds>(delta).count());
}
|
||||
// ── Email extraction ────────────────────────────────────────────────────────
|
||||
|
||||
// Permissive email pattern (local@domain.tld, TLD >= 2 chars).
// NOTE(review): EMAIL_RE appears unused in the visible portion of this file —
// extract_emails() scans manually around '@' instead. Confirm a user exists
// before removing.
static const std::regex
    EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
             std::regex::optimize);

// Asset extensions that disqualify an email-like string
// (filenames like "logo@2x.png" otherwise match the email shape).
static const std::vector<std::string> ASSET_EXTS = {
    ".png",  ".jpg", ".jpeg", ".gif", ".svg",  ".webp",
    ".avif", ".css", ".js",   ".woff", ".woff2", ".ttf",
    ".eot",  ".mp4", ".mp3",  ".pdf", ".zip",  ".ico",
};
|
||||
// ASCII-lowercase a copy of `s` (unsigned-char cast avoids UB on negative chars).
static std::string to_lower(const std::string &s) {
  std::string result;
  result.reserve(s.size());
  for (unsigned char c : s)
    result.push_back(static_cast<char>(std::tolower(c)));
  return result;
}
|
||||
bool is_likely_email(const std::string &candidate) {
|
||||
if (candidate.size() < 5 || candidate.size() > 254)
|
||||
return false;
|
||||
if (candidate.find("..") != std::string::npos)
|
||||
return false;
|
||||
auto at_pos = candidate.find('@');
|
||||
if (at_pos == std::string::npos || at_pos == 0 ||
|
||||
at_pos == candidate.size() - 1)
|
||||
return false;
|
||||
|
||||
auto lower = to_lower(candidate);
|
||||
|
||||
// Reject asset-like extensions
|
||||
for (auto &ext : ASSET_EXTS) {
|
||||
if (lower.size() >= ext.size() &&
|
||||
lower.compare(lower.size() - ext.size(), ext.size(), ext) == 0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Reject common placeholders
|
||||
if (lower.find("example") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("sentry") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("test") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("placeholder") != std::string::npos)
|
||||
return false;
|
||||
if (lower.find("wixpress.com") != std::string::npos)
|
||||
return false;
|
||||
|
||||
// Reject if local part is pure hex hash (8+ hex chars)
|
||||
if (at_pos >= 8) {
|
||||
auto local = lower.substr(0, at_pos);
|
||||
bool all_hex = std::all_of(local.begin(), local.end(), [](char c) {
|
||||
return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
|
||||
});
|
||||
if (all_hex)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Reject if domain part looks numeric-only (e.g. 1234@5678)
|
||||
auto domain = lower.substr(at_pos + 1);
|
||||
auto dot_pos = domain.find('.');
|
||||
if (dot_pos == std::string::npos)
|
||||
return false;
|
||||
if (domain.length() - dot_pos <= 2)
|
||||
return false; // Minimum 2 chars for TLD
|
||||
|
||||
auto domPart = domain.substr(0, dot_pos);
|
||||
bool all_digits =
|
||||
!domPart.empty() &&
|
||||
std::all_of(domPart.begin(), domPart.end(),
|
||||
[](unsigned char c) { return std::isdigit(c); });
|
||||
if (all_digits)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Characters allowed in the local/domain parts scanned around an '@'.
static bool is_valid_email_char(char c) {
  if (std::isalnum(static_cast<unsigned char>(c)))
    return true;
  switch (c) {
  case '.':
  case '_':
  case '%':
  case '+':
  case '-':
    return true;
  default:
    return false;
  }
}
|
||||
// Find all distinct, plausible email addresses in `text`, lowercased, in
// order of first appearance. Scans around each '@' instead of regexing the
// whole (potentially huge) string.
std::vector<std::string> extract_emails(const std::string &text) {
  std::vector<std::string> results;
  if (text.empty())
    return results;

  std::set<std::string> seen; // lowercase dedup set
  size_t pos = 0;

  while ((pos = text.find('@', pos)) != std::string::npos) {
    // '@' at either end of the text cannot be part of an address.
    if (pos == 0 || pos == text.length() - 1) {
      pos++;
      continue;
    }

    // Scan backwards
    size_t start = pos;
    while (start > 0 && is_valid_email_char(text[start - 1])) {
      start--;
    }

    // Scan forwards
    size_t end = pos;
    while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) {
      end++;
    }

    // Require at least one char on each side of the '@'.
    if (start < pos && end > pos) {
      std::string candidate = text.substr(start, end - start + 1);

      // Strip trailing dots/hyphens eagerly grabbed
      // (also rewinds `end` so the next search resumes correctly).
      while (!candidate.empty() &&
             (candidate.back() == '.' || candidate.back() == '-')) {
        candidate.pop_back();
        end--;
      }

      // Strip leading dots/hyphens
      size_t local_start = 0;
      while (local_start < candidate.length() &&
             (candidate[local_start] == '.' || candidate[local_start] == '-')) {
        local_start++;
      }
      if (local_start > 0) {
        candidate = candidate.substr(local_start);
      }

      // Normalize, validate, and dedup before recording.
      std::string lower = to_lower(candidate);
      if (is_likely_email(lower)) {
        if (seen.insert(lower).second) {
          results.push_back(lower);
        }
      }
    }
    // Resume just past the end of this candidate.
    pos = end + 1;
  }

  return results;
}
|
||||
// ── URL resolution ──────────────────────────────────────────────────────────
|
||||
|
||||
// Resolve `href` against `base`, returning an absolute URL, or an empty
// string for non-navigable links (mailto:, tel:, javascript:, "#...") and
// for bases that carry no scheme.
std::string resolve_url(const std::string &base, const std::string &href) {
  if (href.empty())
    return {};

  auto starts_with = [](const std::string &s, const char *prefix) {
    return s.rfind(prefix, 0) == 0;
  };

  // Already absolute.
  if (starts_with(href, "http://") || starts_with(href, "https://"))
    return href;

  // Protocol-relative: inherit the base's scheme (default https).
  if (starts_with(href, "//")) {
    auto scheme_sep = base.find("//");
    if (scheme_sep != std::string::npos)
      return base.substr(0, scheme_sep) + href;
    return "https:" + href;
  }

  // Non-HTTP pseudo-links resolve to nothing.
  if (starts_with(href, "mailto:") || starts_with(href, "tel:") ||
      starts_with(href, "javascript:") || href[0] == '#')
    return {};

  // A relative href needs a scheme in the base to build on.
  auto scheme_pos = base.find("://");
  if (scheme_pos == std::string::npos)
    return {};
  auto path_start = base.find('/', scheme_pos + 3);
  std::string origin =
      (path_start == std::string::npos) ? base : base.substr(0, path_start);

  // Root-relative path.
  if (href[0] == '/')
    return origin + href;

  // Relative to the base's directory, when the base has a path component.
  if (path_start != std::string::npos) {
    auto dir_end = base.rfind('/');
    if (dir_end > scheme_pos + 2)
      return base.substr(0, dir_end + 1) + href;
  }
  return origin + "/" + href;
}
|
||||
// ── Social link classification ──────────────────────────────────────────────
|
||||
|
||||
static std::string classify_social(const std::string &url) {
|
||||
auto lower = to_lower(url);
|
||||
if (lower.find("instagram.com") != std::string::npos)
|
||||
return "instagram";
|
||||
if (lower.find("facebook.com") != std::string::npos)
|
||||
return "facebook";
|
||||
if (lower.find("linkedin.com") != std::string::npos)
|
||||
return "linkedin";
|
||||
if (lower.find("twitter.com") != std::string::npos ||
|
||||
lower.find("x.com") != std::string::npos)
|
||||
return "twitter";
|
||||
if (lower.find("youtube.com") != std::string::npos)
|
||||
return "youtube";
|
||||
if (lower.find("tiktok.com") != std::string::npos)
|
||||
return "tiktok";
|
||||
if (lower.find("pinterest.com") != std::string::npos)
|
||||
return "pinterest";
|
||||
if (lower.find("github.com") != std::string::npos)
|
||||
return "github";
|
||||
return {};
|
||||
}
|
||||
|
||||
// ── Same-origin check ───────────────────────────────────────────────────────
|
||||
|
||||
// Scheme + authority of `url` (everything before the first path slash);
// empty when the URL has no "://".
static std::string get_origin(const std::string &url) {
  auto scheme_pos = url.find("://");
  if (scheme_pos == std::string::npos)
    return {};
  auto first_slash = url.find('/', scheme_pos + 3);
  if (first_slash == std::string::npos)
    return url;
  return url.substr(0, first_slash);
}
|
||||
static bool is_same_origin(const std::string &base_url,
|
||||
const std::string &href) {
|
||||
auto bo = to_lower(get_origin(base_url));
|
||||
auto ho = to_lower(get_origin(href));
|
||||
if (bo.empty() || ho.empty())
|
||||
return false;
|
||||
// Strip www. for comparison
|
||||
auto strip_www = [](std::string &s) {
|
||||
auto pos = s.find("://www.");
|
||||
if (pos != std::string::npos) {
|
||||
s = s.substr(0, pos + 3) + s.substr(pos + 7);
|
||||
}
|
||||
};
|
||||
strip_www(bo);
|
||||
strip_www(ho);
|
||||
return bo == ho;
|
||||
}
|
||||
|
||||
// ── Contact page matching ───────────────────────────────────────────────────
|
||||
|
||||
static bool matches_contact_pattern(const std::string &url,
|
||||
const std::vector<std::string> &patterns) {
|
||||
auto lower = to_lower(url);
|
||||
for (auto &pat : patterns) {
|
||||
if (lower.find(to_lower(pat)) != std::string::npos)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// ── Shared HTML parsing logic for Meta ──────────────────────────────────────
|
||||
|
||||
// Build a SiteMeta from one page's HTML: title/description/canonical,
// JSON-LD blocks, body text, emails (body text + mailto: links), social
// links, and same-origin internal pages. A non-empty `fetch_error` short-
// circuits parsing and is recorded on the result.
static SiteMeta parse_meta_html(const std::string &url, int http_status,
                                const std::string &html_body,
                                const std::string &fetch_error) {
  SiteMeta meta;
  meta.http_status = http_status;

  if (!fetch_error.empty()) {
    meta.fetch_error = fetch_error;
    return meta;
  }

  meta.body_html = html_body;

  // Parse with lexbor helpers
  meta.title = html::get_title(html_body);
  meta.description = html::get_meta(html_body, "description");
  meta.og_image = html::get_meta(html_body, "og:image");
  meta.canonical = html::get_canonical(html_body);
  meta.body_text = html::get_body_text(html_body);
  meta.json_ld = html::get_json_ld(html_body);

  // OG fallbacks
  if (meta.description.empty())
    meta.description = html::get_meta(html_body, "og:description");
  if (meta.title.empty())
    meta.title = html::get_meta(html_body, "og:title");

  // Links — classify into social / internal / mailto
  auto links = html::get_links(html_body);
  std::set<std::string> seen_pages; // dedup for internal_pages

  // Extract emails from body text (much smaller than raw HTML)
  meta.emails = extract_emails(meta.body_text);

  for (auto &lk : links) {
    // mailto: links contribute additional emails (after validation + dedup).
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string email = lk.href.substr(7);
      // Strip anything after ? (like ?subject=...)
      auto q = email.find('?');
      if (q != std::string::npos)
        email = email.substr(0, q);
      // Clean it
      email = to_lower(email);
      if (is_likely_email(email)) {
        if (std::find(meta.emails.begin(), meta.emails.end(), email) ==
            meta.emails.end()) {
          meta.emails.push_back(email);
        }
      }
      continue;
    }

    auto resolved = resolve_url(url, lk.href);
    if (resolved.empty())
      continue;

    // Social links win over internal-page classification.
    auto social = classify_social(resolved);
    if (!social.empty()) {
      meta.socials.push_back({social, resolved});
      continue;
    }

    if (is_same_origin(url, resolved)) {
      // Strip fragment (#) from URL
      auto hash_pos = resolved.find('#');
      if (hash_pos != std::string::npos) {
        resolved = resolved.substr(0, hash_pos);
      }
      if (!resolved.empty() && seen_pages.insert(resolved).second) {
        meta.internal_pages.push_back(resolved);
      }
    }
  }

  return meta;
}
|
||||
// ── scrape_meta ─────────────────────────────────────────────────────────────
|
||||
|
||||
// Fetch `url` with a desktop-Chrome user agent and parse metadata from the
// static HTML. Network failures (negative status) and HTTP >= 400 are
// recorded as a fetch error; the body is still passed along to the parser.
SiteMeta scrape_meta(const std::string &url, int timeout_ms) {
  http::GetOptions opts;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";
  opts.timeout_ms = timeout_ms;

  const auto resp = http::get(url, opts);
  const bool failed = resp.status_code < 0 || resp.status_code >= 400;
  return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                         failed ? resp.body : std::string{});
}
|
||||
// ── scrape_emails_from_page ─────────────────────────────────────────────────
|
||||
|
||||
// Fetch one page and return the distinct emails found in its body text plus
// its mailto: links. The raw HTTP status is reported via `out_status_code`;
// failures (negative status or >= 400) yield an empty result.
// NOTE(review): the declaration in enrichers.h has only (url, timeout_ms) —
// it does not match this three-parameter definition; reconcile the two.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";

  auto resp = http::get(url, opts);
  out_status_code = static_cast<int>(resp.status_code);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    return {};
  }

  // Extract body text then find emails
  auto text = html::get_body_text(resp.body);
  auto from_text = extract_emails(text);

  // Extract mailto: links from HTML directly without regexing the huge string
  auto links = html::get_links(resp.body);
  std::set<std::string> seen(from_text.begin(), from_text.end());

  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      // Drop any "?subject=..." query suffix.
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }

  return from_text;
}
|
||||
static std::string extract_scrapeless_html(const std::string &json_body) {
|
||||
std::string data = json::get_string(json_body, "data");
|
||||
if (data.empty()) {
|
||||
return json_body; // Fallback to raw response if not found
|
||||
}
|
||||
return data;
|
||||
}
|
||||
|
||||
// Like scrape_meta(), but fetches the page through the Scrapeless Universal
// Scraping API with JS rendering enabled. Requires `api_key`; without one a
// SiteMeta carrying fetch_error = "missing api key" is returned.
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key, int timeout_ms) {
  if (api_key.empty())
    return parse_meta_html(url, 0, "", "missing api key");

  // Request body for the Universal Scraping API (headless JS rendering).
  // NOTE(review): `url` is spliced into the JSON unescaped — a URL containing
  // a quote would break the payload; confirm inputs are pre-sanitized.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        url +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";

  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  // Floor of 45s regardless of the caller's timeout.
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout

  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);

  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
    logger::error("[meta:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  fetch_err);
    return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                           fetch_err);
  }

  // Unwrap the rendered HTML and parse it as a normal page.
  std::string rendered_html = extract_scrapeless_html(resp.body);
  return parse_meta_html(url, static_cast<int>(resp.status_code), rendered_html,
                         "");
}
|
||||
// Fetch `url` through the Scrapeless Universal Scraping API (JS rendering)
// and extract emails from the rendered HTML (body text + mailto: links).
// Returns empty on a blank key or API failure — callers treat this as a
// silent fallback, not an error.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms) {
  if (api_key.empty())
    return {};

  // Build the Scrapeless Universal Scraping API request body.
  // We ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        url +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";

  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  // Floor of 45s regardless of the caller's timeout.
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout

  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);

  if (resp.status_code < 0 || resp.status_code >= 400) {
    logger::error("[email:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  resp.body);
    return {}; // API error — silent fallback
  }

  std::string rendered_html = extract_scrapeless_html(resp.body);

  // Parse and extract emails from the rendered HTML
  auto text = html::get_body_text(rendered_html);
  auto from_text = extract_emails(text);

  // Fast mailto extraction instead of HTML regex
  auto links = html::get_links(rendered_html);
  std::set<std::string> seen(from_text.begin(), from_text.end());

  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      // Drop any "?subject=..." query suffix.
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }

  return from_text;
}
|
||||
// ── enrich_location ─────────────────────────────────────────────────────────
|
||||
|
||||
/// Enrich a single location: fetch its website's homepage metadata, then —
/// if no email was found there — scrape matching contact pages in parallel.
/// Per-phase timings are recorded on the returned node.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg) {
  auto t0 = std::chrono::steady_clock::now();

  // Carry the input's identity fields over to the result node.
  EnrichedNode node;
  node.title = loc.title;
  node.place_id = loc.place_id;
  node.website = loc.website;
  node.address = loc.address;
  node.type = loc.type;
  node.grid_area = loc.grid_area;
  node.grid_gid = loc.grid_gid;
  node.status = EnrichStatus::NO_EMAIL; // pessimistic default

  // Nothing to scrape without a website.
  if (loc.website.empty()) {
    node.status = EnrichStatus::FETCH_ERROR;
    node.error = "no website";
    node.total_ms = elapsed_ms(t0);
    return node;
  }

  // ── Phase 1: Meta scrape ────────────────────────────────────────────────
  // Fetch the homepage (via Scrapeless when configured, plain HTTP
  // otherwise) and collect body text, internal links, socials, and any
  // emails already visible on it.

  auto meta_t0 = std::chrono::steady_clock::now();
  SiteMeta meta;
  bool meta_timed_out = false;

  try {
    if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) {
      logger::debug("[meta:scrapeless] Fetching " + loc.website);
      meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key,
                                    cfg.meta_timeout_ms);
    } else {
      logger::debug("[meta:http] Fetching " + loc.website);
      meta = scrape_meta(loc.website, cfg.meta_timeout_ms);
    }
  } catch (...) {
    // Any scraper exception counts as both a fetch error and a timeout hint.
    meta.fetch_error = "exception during meta scrape";
    meta_timed_out = true;
  }
  node.meta_ms = elapsed_ms(meta_t0);

  // Heuristic: if meta ran within 1 s of its timeout budget, flag it as
  // timed out so the final status can reflect that.
  if (node.meta_ms >= cfg.meta_timeout_ms - 1000) {
    meta_timed_out = true;
  }

  if (!meta.body_text.empty())
    node.pages["home"] = meta.body_text;
  if (cfg.enable_homepage_md && !meta.body_html.empty()) {
    // Cap HTML body at 512 KB to prevent stack overflow in recursive html2md
    // parser.
    static constexpr size_t MAX_HTML_BYTES = 512 * 1024;
    if (meta.body_html.size() > MAX_HTML_BYTES) {
      logger::warn("[" + loc.title + "] body_html too large (" +
                   std::to_string(meta.body_html.size() / 1024) +
                   " KB), skipping markdown conversion");
    } else {
      // Markdown conversion is best-effort; never let it kill enrichment.
      try {
        node.sites[loc.website] = html::to_markdown(meta.body_html);
      } catch (const std::exception &e) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown failed: " + e.what());
      } catch (...) {
        logger::warn("[" + loc.title +
                     "] html::to_markdown crashed (unknown exception)");
      }
    }
  }
  node.meta_pages = meta.internal_pages;
  node.pages_found = static_cast<int>(meta.internal_pages.size());
  node.socials = meta.socials;

  // A fetch error aborts enrichment but keeps whatever meta we collected.
  if (!meta.fetch_error.empty()) {
    node.error = meta.fetch_error;
    node.status = EnrichStatus::FETCH_ERROR;
    node.total_ms = elapsed_ms(t0);
    return node;
  }

  // If meta already found emails, we're done (early exit like TS).
  if (!meta.emails.empty()) {
    node.emails = meta.emails;
    node.status = EnrichStatus::OK;
    node.total_ms = elapsed_ms(t0);
    return node;
  }

  // ── Build contact page list ─────────────────────────────────────────────
  // Keep only internal pages matching a configured contact pattern,
  // de-duplicated in first-seen order.

  std::vector<std::string> contact_pages;
  std::set<std::string> seen_urls;

  for (auto &page_url : meta.internal_pages) {
    if (matches_contact_pattern(page_url, cfg.contact_patterns)) {
      if (seen_urls.insert(page_url).second) {
        contact_pages.push_back(page_url);
      }
    }
  }

  // No more probe paths. If we found 0 contact pages, we just give up or
  // time out.
  // NOTE(review): pages_found is overwritten here — from this point it means
  // "contact pages matched", not "internal pages discovered" as set above.
  node.pages_found = static_cast<int>(contact_pages.size());

  if (contact_pages.empty()) {
    logger::debug("[" +
                  std::string(loc.title.empty() ? loc.website : loc.title) +
                  "] No contact pages found.");
    node.status =
        meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES;
    node.total_ms = elapsed_ms(t0);
    return node;
  }

  logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) +
                "] Contact pages to scrape: " +
                std::to_string(contact_pages.size()) + " (parallel)");

  // ── Phase 2: Email scrape per contact page ──────────────────────────────
  // One thread per page (capped at email_max_pages); each writes only its
  // own slot of contact_results, so no locking is needed.

  struct AsyncResult {
    std::string url;                  // page this result belongs to
    std::vector<PageError> errors;    // per-attempt status records
    std::vector<std::string> emails;  // addresses found on this page
    int ms;                           // wall time; set on every exit path
  };

  int pages_to_scrape =
      std::min(static_cast<int>(contact_pages.size()), cfg.email_max_pages);

  std::vector<std::thread> contact_threads;
  std::vector<AsyncResult> contact_results(pages_to_scrape);

  auto email_t0 = std::chrono::steady_clock::now();

  for (int i = 0; i < pages_to_scrape; ++i) {
    auto page_url = contact_pages[i];

    // cfg/loc captured by value so each thread is self-contained.
    contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() {
      auto start = std::chrono::steady_clock::now();
      AsyncResult res;
      res.url = page_url;

      // pe1 records the plain-HTTP attempt.
      PageError pe1;
      pe1.url = page_url;
      pe1.method = "GET";

      int http_status = 0;
      try {
        auto page_emails = scrape_emails_from_page(
            page_url, cfg.email_page_timeout_ms, http_status);
        pe1.emails = page_emails;
        logger::debug("[" +
                      std::string(loc.title.empty() ? loc.website : loc.title) +
                      "] HTTP fetch finished code " +
                      std::to_string(http_status) + " for " + page_url);

        if (page_emails.empty()) {
          if (http_status == 404 || http_status == 400 || http_status == 500) {
            // Hard HTTP failure: record it, no fallback attempted.
            pe1.status = "NOT_FOUND";
            pe1.error = "HTTP " + std::to_string(http_status);
          } else {
            // Page fetched but no emails: try the Scrapeless fallback.
            pe1.status = "AXIOS_NO_EMAIL";
            res.errors.push_back(pe1); // pushed before scrapeless

            if (cfg.meta_scraper == "SCRAPELESS" &&
                !cfg.scrapeless_key.empty()) {
              // pe2 records the Scrapeless fallback attempt.
              PageError pe2;
              pe2.url = page_url;
              pe2.method = "SCRAPELESS";
              try {
                logger::debug("[email:scrapeless] Fallback scraping " +
                              page_url);
                auto s_emails =
                    scrape_emails_scrapeless(page_url, cfg.scrapeless_key,
                                             cfg.email_page_timeout_ms + 5000);
                pe2.emails = s_emails;
                pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL";
                if (!s_emails.empty())
                  res.emails = s_emails;
                logger::debug(
                    "[" +
                    std::string(loc.title.empty() ? loc.website : loc.title) +
                    "] Scrapeless fallback finished for " + page_url);
              } catch (...) {
                pe2.status = "FAILED";
                pe2.error = "scrapeless exception";
              }
              res.errors.push_back(pe2);
            }
            // Early exit: pe1 (and pe2, if any) are already recorded.
            res.ms = elapsed_ms(start);
            contact_results[i] = res;
            return;
          }
        } else {
          pe1.status = "SEARCHED_EMAIL";
          res.emails = page_emails;
        }
      } catch (...) {
        pe1.status = "AXIOS_FAILED";
        pe1.error = "exception";
      }
      // Only insert pe1 if we didn't already push it during fallback.
      if (res.errors.empty() || res.errors[0].method != "GET") {
        res.errors.insert(res.errors.begin(), pe1);
      }
      res.ms = elapsed_ms(start);
      contact_results[i] = res;
    });
  }

  // Wait for all page scrapes before aggregating.
  for (auto &t : contact_threads) {
    if (t.joinable())
      t.join();
  }

  // Aggregate: union of emails across pages, all page errors collected.
  std::set<std::string> all_emails;
  int pages_scraped = 0;

  for (auto &res : contact_results) {
    pages_scraped++;
    for (auto &pe : res.errors) {
      node.page_errors.push_back(std::move(pe));
    }
    for (auto &e : res.emails) {
      all_emails.insert(e);
    }
  }

  node.email_ms = elapsed_ms(email_t0);
  node.pages_scraped = pages_scraped;

  // Merge emails (set iteration yields them sorted & de-duplicated).
  node.emails.assign(all_emails.begin(), all_emails.end());

  // Final status: emails win; otherwise report the most relevant timeout.
  bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000;
  if (!node.emails.empty()) {
    node.status = EnrichStatus::OK;
  } else if (email_timed_out) {
    node.status = EnrichStatus::EMAIL_TIMEOUT;
  } else if (meta_timed_out) {
    node.status = EnrichStatus::META_TIMEOUT;
  } else {
    node.status = EnrichStatus::NO_EMAIL;
  }

  node.total_ms = elapsed_ms(t0);
  return node;
}
|
||||
|
||||
} // namespace enrichers
|
||||
6
packages/kbot/cpp/packages/gadm_reader/CMakeLists.txt
Normal file
6
packages/kbot/cpp/packages/gadm_reader/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
||||
# Static library that parses cached GADM boundary GeoJSON into C++ structs.
add_library(gadm_reader STATIC src/gadm_reader.cpp)

# Public headers live under include/gadm_reader/.
target_include_directories(gadm_reader PUBLIC include)

# Depends on geo (geo::Coord / geo::BBox appear in the public header) and
# json (provides the RapidJSON headers used by the implementation).
target_link_libraries(gadm_reader PUBLIC geo json)
|
||||
@ -0,0 +1,75 @@
|
||||
#pragma once
|
||||
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace gadm {

// ── Feature (mirrors TS GridFeature) ────────────────────────────────────────

/// One GADM administrative unit parsed from a cached GeoJSON feature.
struct Feature {
  std::string gid;   // GADM id, e.g. "ABW", "AFG.1.1_1"
  std::string name;  // display name, e.g. "Aruba", "Baharak"
  int level = 0;     // GADM admin level (inferred from the gid's dot count)

  // Outer ring + holes (MultiPolygon flattened to a single list of rings)
  std::vector<std::vector<geo::Coord>> rings;

  // Bounding box (computed from the first ring only)
  geo::BBox bbox;

  // GHS enrichment (parsed from cached JSON; zero / origin when absent)
  double ghsPopulation = 0;
  double ghsBuiltWeight = 0;
  double ghsPopMaxDensity = 0;
  double ghsBuiltMax = 0;

  geo::Coord ghsPopCenter;
  geo::Coord ghsBuiltCenter;

  // Weighted centers: each entry is [lon, lat, weight]
  std::vector<std::array<double, 3>> ghsPopCenters;
  std::vector<std::array<double, 3>> ghsBuiltCenters;

  // Computed from geometry (first ring only)
  double areaSqKm = 0;

  bool isOuter = true; // from the "isOuter" property; defaults to true
};

// ── Result ──────────────────────────────────────────────────────────────────

/// Parse outcome: features on success, non-empty error string on failure.
struct BoundaryResult {
  std::vector<Feature> features;
  std::string error; // empty on success
};

// ── API ─────────────────────────────────────────────────────────────────────

/// Load a pre-cached GADM boundary file.
///
/// Tries these file paths in order:
///  1. cacheDir/{countryCode}/boundary_{gid}_{targetLevel}.json  (nested)
///  2. cacheDir/boundary_{gid}_{targetLevel}.json                (flat)
///
/// Returns a BoundaryResult with parsed features or an error string.
BoundaryResult load_boundary(
    const std::string& gid,
    int targetLevel,
    const std::string& cacheDir = "cache/gadm"
);

/// Load a boundary file directly by path.
BoundaryResult load_boundary_file(const std::string& filepath);

/// Extract the ISO country code from a GID (e.g. "AFG.1.1_1" → "AFG").
std::string country_code(const std::string& gid);

/// Infer the GADM level from a GID string.
/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2, etc.
int infer_level(const std::string& gid);

} // namespace gadm
|
||||
231
packages/kbot/cpp/packages/gadm_reader/src/gadm_reader.cpp
Normal file
231
packages/kbot/cpp/packages/gadm_reader/src/gadm_reader.cpp
Normal file
@ -0,0 +1,231 @@
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
|
||||
namespace gadm {
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Everything before the first '.' in a GID is the ISO country code;
/// a dotless GID (level 0) is already the country code.
std::string country_code(const std::string& gid) {
  const auto first_dot = gid.find('.');
  if (first_dot == std::string::npos) {
    return gid;
  }
  return gid.substr(0, first_dot);
}
|
||||
|
||||
/// The GADM level equals the number of dots in the GID:
/// "ABW" → 0, "AFG.1_1" → 1, "AFG.1.1_1" → 2.
int infer_level(const std::string& gid) {
  return static_cast<int>(std::count(gid.begin(), gid.end(), '.'));
}
|
||||
|
||||
/// Slurp a whole file in binary mode; an unreadable or missing file is
/// reported as empty content (callers treat "" as failure).
static std::string read_file(const std::string& path) {
  std::ifstream in(path, std::ios::binary);
  if (!in.is_open()) {
    return "";
  }
  return std::string(std::istreambuf_iterator<char>(in),
                     std::istreambuf_iterator<char>());
}
|
||||
|
||||
/// Parse a coord array [lon, lat] → geo::Coord
|
||||
static geo::Coord parse_coord(const rapidjson::Value& arr) {
|
||||
if (arr.IsArray() && arr.Size() >= 2) {
|
||||
return {arr[0].GetDouble(), arr[1].GetDouble()};
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/// Parse a ring array [[lon,lat], [lon,lat], ...] → vector<Coord>
|
||||
static std::vector<geo::Coord> parse_ring(const rapidjson::Value& arr) {
|
||||
std::vector<geo::Coord> ring;
|
||||
if (!arr.IsArray()) return ring;
|
||||
ring.reserve(arr.Size());
|
||||
for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
|
||||
ring.push_back(parse_coord(arr[i]));
|
||||
}
|
||||
return ring;
|
||||
}
|
||||
|
||||
/// Parse weighted centers [[lon, lat, weight], ...]
|
||||
static std::vector<std::array<double, 3>> parse_weighted_centers(
|
||||
const rapidjson::Value& arr) {
|
||||
std::vector<std::array<double, 3>> centers;
|
||||
if (!arr.IsArray()) return centers;
|
||||
centers.reserve(arr.Size());
|
||||
for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
|
||||
const auto& c = arr[i];
|
||||
if (c.IsArray() && c.Size() >= 3) {
|
||||
centers.push_back({c[0].GetDouble(), c[1].GetDouble(), c[2].GetDouble()});
|
||||
}
|
||||
}
|
||||
return centers;
|
||||
}
|
||||
|
||||
/// Get a double from properties, with fallback
|
||||
static double get_double(const rapidjson::Value& props, const char* key,
|
||||
double fallback = 0.0) {
|
||||
if (props.HasMember(key) && props[key].IsNumber()) {
|
||||
return props[key].GetDouble();
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/// Get a bool from properties, with fallback
|
||||
static bool get_bool(const rapidjson::Value& props, const char* key,
|
||||
bool fallback = true) {
|
||||
if (props.HasMember(key) && props[key].IsBool()) {
|
||||
return props[key].GetBool();
|
||||
}
|
||||
return fallback;
|
||||
}
|
||||
|
||||
/// Get a string from properties, checking GID_0, GID_1, GID_2, etc.
|
||||
static std::string get_gid(const rapidjson::Value& props) {
|
||||
// Try GID_5 down to GID_0, return the most specific one found
|
||||
for (int lvl = 5; lvl >= 0; --lvl) {
|
||||
std::string key = "GID_" + std::to_string(lvl);
|
||||
if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) {
|
||||
return props[key.c_str()].GetString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/// Get the name (NAME_0, NAME_1, ... NAME_5)
|
||||
static std::string get_name(const rapidjson::Value& props) {
|
||||
for (int lvl = 5; lvl >= 0; --lvl) {
|
||||
std::string key = "NAME_" + std::to_string(lvl);
|
||||
if (props.HasMember(key.c_str()) && props[key.c_str()].IsString()) {
|
||||
return props[key.c_str()].GetString();
|
||||
}
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
/// Parse a single GeoJSON Feature object into a gadm::Feature.
/// Missing "properties" or "geometry" members are tolerated — the
/// corresponding fields keep their defaults.
static Feature parse_feature(const rapidjson::Value& feat) {
  Feature f;

  // Properties: ids, names, and GHS enrichment values.
  if (feat.HasMember("properties") && feat["properties"].IsObject()) {
    const auto& props = feat["properties"];
    f.gid = get_gid(props);
    f.name = get_name(props);
    // Level is derived from the GID's dot count, not stored in the file.
    f.level = infer_level(f.gid);
    f.ghsPopulation = get_double(props, "ghsPopulation");
    f.ghsBuiltWeight = get_double(props, "ghsBuiltWeight");
    f.ghsPopMaxDensity = get_double(props, "ghsPopMaxDensity");
    f.ghsBuiltMax = get_double(props, "ghsBuiltMax");
    f.isOuter = get_bool(props, "isOuter");

    // Optional GHS center points and weighted center lists.
    if (props.HasMember("ghsPopCenter") && props["ghsPopCenter"].IsArray()) {
      f.ghsPopCenter = parse_coord(props["ghsPopCenter"]);
    }
    if (props.HasMember("ghsBuiltCenter") && props["ghsBuiltCenter"].IsArray()) {
      f.ghsBuiltCenter = parse_coord(props["ghsBuiltCenter"]);
    }
    if (props.HasMember("ghsPopCenters") && props["ghsPopCenters"].IsArray()) {
      f.ghsPopCenters = parse_weighted_centers(props["ghsPopCenters"]);
    }
    if (props.HasMember("ghsBuiltCenters") && props["ghsBuiltCenters"].IsArray()) {
      f.ghsBuiltCenters = parse_weighted_centers(props["ghsBuiltCenters"]);
    }
  }

  // Geometry: Polygon and MultiPolygon are flattened into f.rings;
  // any other geometry type leaves rings empty.
  if (feat.HasMember("geometry") && feat["geometry"].IsObject()) {
    const auto& geom = feat["geometry"];
    std::string gtype;
    if (geom.HasMember("type") && geom["type"].IsString()) {
      gtype = geom["type"].GetString();
    }

    if (geom.HasMember("coordinates") && geom["coordinates"].IsArray()) {
      const auto& coords = geom["coordinates"];

      if (gtype == "Polygon") {
        // coordinates: [ [ring], [hole], ... ]
        for (rapidjson::SizeType r = 0; r < coords.Size(); ++r) {
          f.rings.push_back(parse_ring(coords[r]));
        }
      } else if (gtype == "MultiPolygon") {
        // coordinates: [ [ [ring], [hole] ], [ [ring] ], ... ] — polygon
        // grouping is discarded; all rings land in one flat list.
        for (rapidjson::SizeType p = 0; p < coords.Size(); ++p) {
          if (coords[p].IsArray()) {
            for (rapidjson::SizeType r = 0; r < coords[p].Size(); ++r) {
              f.rings.push_back(parse_ring(coords[p][r]));
            }
          }
        }
      }
    }
  }

  // Compute bbox and area from first ring (outer boundary).
  // NOTE(review): for a MultiPolygon this covers only the first polygon's
  // outer ring — later polygons don't contribute; confirm that's intended.
  if (!f.rings.empty() && !f.rings[0].empty()) {
    f.bbox = geo::bbox(f.rings[0]);
    f.areaSqKm = geo::area_sq_km(f.rings[0]);
  }

  return f;
}
|
||||
|
||||
// ── Public API ──────────────────────────────────────────────────────────────
|
||||
|
||||
/// Load and parse one cached GeoJSON FeatureCollection from disk.
/// On any failure `error` is set and `features` stays empty.
BoundaryResult load_boundary_file(const std::string& filepath) {
  BoundaryResult out;

  const std::string raw = read_file(filepath);
  if (raw.empty()) {
    out.error = "Failed to read file: " + filepath;
    return out;
  }

  rapidjson::Document doc;
  doc.Parse(raw.c_str());
  if (doc.HasParseError()) {
    out.error = "JSON parse error in: " + filepath;
    return out;
  }

  // A FeatureCollection must carry a "features" array.
  if (!doc.HasMember("features") || !doc["features"].IsArray()) {
    out.error = "Missing 'features' array in: " + filepath;
    return out;
  }

  const auto& feats = doc["features"];
  out.features.reserve(feats.Size());
  for (rapidjson::SizeType idx = 0; idx < feats.Size(); ++idx) {
    out.features.push_back(parse_feature(feats[idx]));
  }
  return out;
}
|
||||
|
||||
/// Load a cached boundary for `gid`, trying the nested per-country layout
/// first and the flat layout second; sets a combined error when both fail.
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
                             const std::string& cacheDir) {
  const std::string filename =
      "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";
  const std::string candidates[] = {
      cacheDir + "/" + country_code(gid) + "/" + filename,  // nested layout
      cacheDir + "/" + filename,                            // flat layout
  };

  BoundaryResult result;
  for (const auto& path : candidates) {
    result = load_boundary_file(path);
    if (result.error.empty()) return result;
  }

  // Both candidates failed.
  result.error = "No boundary file found for gid=" + gid + " level=" + std::to_string(targetLevel) + " in " + cacheDir;
  return result;
}
|
||||
|
||||
} // namespace gadm
|
||||
5
packages/kbot/cpp/packages/geo/CMakeLists.txt
Normal file
5
packages/kbot/cpp/packages/geo/CMakeLists.txt
Normal file
@ -0,0 +1,5 @@
|
||||
# Pure-math geography helpers (haversine, bboxes, grids); see include/geo/geo.h.
add_library(geo STATIC src/geo.cpp)

# Public headers live under include/geo/.
target_include_directories(geo PUBLIC include)

# No external dependencies — pure math (C++ standard library only)
|
||||
100
packages/kbot/cpp/packages/geo/include/geo/geo.h
Normal file
100
packages/kbot/cpp/packages/geo/include/geo/geo.h
Normal file
@ -0,0 +1,100 @@
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
|
||||
namespace geo {

// ── Constants ───────────────────────────────────────────────────────────────
constexpr double EARTH_RADIUS_KM = 6371.0; // mean Earth radius
constexpr double PI = 3.14159265358979323846;
constexpr double DEG2RAD = PI / 180.0;
constexpr double RAD2DEG = 180.0 / PI;

// ── Core types ──────────────────────────────────────────────────────────────

// WGS84 point; lon/lat in decimal degrees.
struct Coord {
  double lon = 0;
  double lat = 0;
};

// Axis-aligned lon/lat bounding box, in degrees.
struct BBox {
  double minLon = 0;
  double minLat = 0;
  double maxLon = 0;
  double maxLat = 0;

  // Midpoint of the box (naive average; not antimeridian-aware).
  Coord center() const {
    return {(minLon + maxLon) / 2.0, (minLat + maxLat) / 2.0};
  }

  double width_deg() const { return maxLon - minLon; }
  double height_deg() const { return maxLat - minLat; }
};

// ── Distance ────────────────────────────────────────────────────────────────

/// Haversine distance between two WGS84 points, in kilometers.
double distance_km(Coord a, Coord b);

/// Haversine distance in meters.
inline double distance_m(Coord a, Coord b) { return distance_km(a, b) * 1000.0; }

// ── Bounding box ────────────────────────────────────────────────────────────

/// Compute the bounding box of a polygon ring ({} for an empty ring).
BBox bbox(const std::vector<Coord>& ring);

/// Compute the bounding box that covers all the given boxes.
BBox bbox_union(const std::vector<BBox>& boxes);

// ── Centroid ────────────────────────────────────────────────────────────────

/// Geometric centroid of a polygon ring (simple vertex average; a closing
/// duplicate vertex is ignored).
Coord centroid(const std::vector<Coord>& ring);

// ── Area ────────────────────────────────────────────────────────────────────

/// Approximate area of a polygon ring in square meters.
/// Uses the Shoelace formula with latitude cosine correction.
double area_sq_m(const std::vector<Coord>& ring);

/// Area in square kilometers.
inline double area_sq_km(const std::vector<Coord>& ring) {
  return area_sq_m(ring) / 1e6;
}

// ── Point-in-polygon ────────────────────────────────────────────────────────

/// Ray-casting point-in-polygon test.
/// Same algorithm as gadm/cpp pip.h but using Coord structs.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);

// ── Bearing & destination ───────────────────────────────────────────────────

/// Initial bearing from a to b, in degrees (0 = north, 90 = east).
double bearing_deg(Coord from, Coord to);

/// Compute the destination point given start, bearing (degrees), and distance (km).
Coord destination(Coord from, double bearing_deg, double distance_km);

// ── Grid tessellation ───────────────────────────────────────────────────────

/// Generate a flat square grid of cell centers over a bbox.
/// cellSizeKm defines the side length of each square cell.
/// Returns center coordinates of each cell.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm);

/// Generate a flat hex grid of cell centers over a bbox.
/// cellSizeKm defines the distance between hex centers.
/// Returns center coordinates of each cell.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm);

// ── Viewport estimation (matches TS estimateViewportAreaSqKm) ──────────────

/// Estimate the km² visible in a Web-Mercator viewport at a given lat/zoom.
double estimate_viewport_sq_km(double lat, int zoom,
                               int widthPx = 1024, int heightPx = 768);

} // namespace geo
|
||||
204
packages/kbot/cpp/packages/geo/src/geo.cpp
Normal file
204
packages/kbot/cpp/packages/geo/src/geo.cpp
Normal file
@ -0,0 +1,204 @@
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
|
||||
namespace geo {
|
||||
|
||||
// ── Distance (Haversine) ────────────────────────────────────────────────────
|
||||
|
||||
/// Great-circle (Haversine) distance between two WGS84 points, in km.
double distance_km(Coord a, Coord b) {
  const double lat1 = a.lat * DEG2RAD;
  const double lat2 = b.lat * DEG2RAD;
  const double dLat = (b.lat - a.lat) * DEG2RAD;
  const double dLon = (b.lon - a.lon) * DEG2RAD;

  const double sinHalfLat = std::sin(dLat / 2.0);
  const double sinHalfLon = std::sin(dLon / 2.0);
  const double h = sinHalfLat * sinHalfLat +
                   std::cos(lat1) * std::cos(lat2) * sinHalfLon * sinHalfLon;
  // 2R·asin(√h) inverts the haversine.
  return 2.0 * EARTH_RADIUS_KM * std::asin(std::sqrt(h));
}
|
||||
|
||||
// ── Bounding box ────────────────────────────────────────────────────────────
|
||||
|
||||
/// Axis-aligned bounding box of a polygon ring; {} for an empty ring.
BBox bbox(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  const Coord& first = ring.front();
  BBox box{first.lon, first.lat, first.lon, first.lat};
  // Including the seed vertex again is harmless: min(x, x) == x.
  for (const Coord& c : ring) {
    box.minLon = std::min(box.minLon, c.lon);
    box.minLat = std::min(box.minLat, c.lat);
    box.maxLon = std::max(box.maxLon, c.lon);
    box.maxLat = std::max(box.maxLat, c.lat);
  }
  return box;
}
|
||||
|
||||
/// Smallest box covering every input box; {} for an empty list.
BBox bbox_union(const std::vector<BBox>& boxes) {
  if (boxes.empty()) return {};
  BBox combined = boxes.front();
  // Re-visiting the seed box is harmless: min(x, x) == x.
  for (const BBox& b : boxes) {
    combined.minLon = std::min(combined.minLon, b.minLon);
    combined.minLat = std::min(combined.minLat, b.minLat);
    combined.maxLon = std::max(combined.maxLon, b.maxLon);
    combined.maxLat = std::max(combined.maxLat, b.maxLat);
  }
  return combined;
}
|
||||
|
||||
// ── Centroid ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Simple-average centroid of a ring; {} for an empty ring.
Coord centroid(const std::vector<Coord>& ring) {
  if (ring.empty()) return {};
  // A closed ring repeats its first vertex at the end; drop the duplicate
  // so it isn't counted twice in the average.
  size_t count = ring.size();
  if (count > 1 && ring.front().lon == ring.back().lon &&
      ring.front().lat == ring.back().lat) {
    --count;
  }
  double lonSum = 0, latSum = 0;
  for (size_t i = 0; i < count; ++i) {
    lonSum += ring[i].lon;
    latSum += ring[i].lat;
  }
  const double denom = static_cast<double>(count);
  return {lonSum / denom, latSum / denom};
}
|
||||
|
||||
// ── Area (Shoelace + latitude cosine correction) ────────────────────────────
|
||||
|
||||
double area_sq_m(const std::vector<Coord>& ring) {
|
||||
if (ring.size() < 3) return 0.0;
|
||||
|
||||
// Shoelace formula in projected coordinates.
|
||||
// Each degree of longitude = cos(lat) * 111320 meters at that latitude.
|
||||
// Each degree of latitude = 110540 meters (approximate).
|
||||
double sum = 0.0;
|
||||
size_t n = ring.size();
|
||||
|
||||
for (size_t i = 0; i < n; ++i) {
|
||||
size_t j = (i + 1) % n;
|
||||
// Convert coordinates to approximate meters using the average latitude
|
||||
double avgLat = (ring[i].lat + ring[j].lat) / 2.0;
|
||||
double cosLat = std::cos(avgLat * DEG2RAD);
|
||||
|
||||
double x_i = ring[i].lon * cosLat * 111320.0;
|
||||
double y_i = ring[i].lat * 110540.0;
|
||||
double x_j = ring[j].lon * cosLat * 111320.0;
|
||||
double y_j = ring[j].lat * 110540.0;
|
||||
|
||||
sum += x_i * y_j - x_j * y_i;
|
||||
}
|
||||
return std::abs(sum) / 2.0;
|
||||
}
|
||||
|
||||
// ── Point-in-polygon (ray casting) ──────────────────────────────────────────
|
||||
|
||||
/// Ray-casting point-in-polygon test: cast a horizontal ray from `pt` and
/// toggle `inside` at each edge crossing; an odd crossing count means the
/// point is inside. An empty ring yields false (the loop never runs).
/// Points exactly on an edge may land on either side.
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring) {
  bool inside = false;
  size_t n = ring.size();
  // j trails i by one vertex (wrapping), so (j, i) walks every edge once.
  for (size_t i = 0, j = n - 1; i < n; j = i++) {
    double xi = ring[i].lon, yi = ring[i].lat;
    double xj = ring[j].lon, yj = ring[j].lat;

    // Edge straddles the ray's latitude AND its crossing lies east of pt.
    if (((yi > pt.lat) != (yj > pt.lat)) &&
        (pt.lon < (xj - xi) * (pt.lat - yi) / (yj - yi) + xi)) {
      inside = !inside;
    }
  }
  return inside;
}
|
||||
|
||||
// ── Bearing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Initial great-circle bearing from `from` to `to`, degrees in [0, 360),
/// with 0 = north and 90 = east.
double bearing_deg(Coord from, Coord to) {
  const double dLon = (to.lon - from.lon) * DEG2RAD;
  const double phi1 = from.lat * DEG2RAD;
  const double phi2 = to.lat * DEG2RAD;

  const double y = std::sin(dLon) * std::cos(phi2);
  const double x = std::cos(phi1) * std::sin(phi2) -
                   std::sin(phi1) * std::cos(phi2) * std::cos(dLon);
  // atan2 yields (-180, 180]; shift into [0, 360).
  return std::fmod(std::atan2(y, x) * RAD2DEG + 360.0, 360.0);
}
|
||||
|
||||
// ── Destination point ───────────────────────────────────────────────────────
|
||||
|
||||
/// Destination point reached from `from` after `dist_km` along initial
/// bearing `brng_deg` (spherical-Earth forward formula).
Coord destination(Coord from, double brng_deg, double dist_km) {
  const double theta = brng_deg * DEG2RAD;        // bearing, radians
  const double phi1 = from.lat * DEG2RAD;         // start latitude, radians
  const double lambda1 = from.lon * DEG2RAD;      // start longitude, radians
  const double delta = dist_km / EARTH_RADIUS_KM; // angular distance

  const double phi2 =
      std::asin(std::sin(phi1) * std::cos(delta) +
                std::cos(phi1) * std::sin(delta) * std::cos(theta));
  const double lambda2 =
      lambda1 + std::atan2(std::sin(theta) * std::sin(delta) * std::cos(phi1),
                           std::cos(delta) - std::sin(phi1) * std::sin(phi2));

  // NOTE(review): lambda2 is not normalized to [-180, 180] — paths crossing
  // the antimeridian can yield |lon| > 180; confirm callers tolerate that.
  return {lambda2 * RAD2DEG, phi2 * RAD2DEG};
}
|
||||
|
||||
// ── Square grid ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Centers of a flat square grid covering `extent`, one point per cell of
/// side `cellSizeKm`. Non-positive cell sizes yield an empty grid.
std::vector<Coord> square_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;

  // Convert the cell size to degrees at the bbox's center latitude.
  double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10; // guard against division by ~0 at poles

  double stepLat = cellSizeKm / 110.574;           // ~110.574 km per ° lat
  double stepLon = cellSizeKm / (111.320 * cosLat); // lon shrinks with lat

  // Start half a cell in so each emitted point is a cell center.
  for (double lat = extent.minLat + stepLat / 2.0; lat < extent.maxLat;
       lat += stepLat) {
    for (double lon = extent.minLon + stepLon / 2.0; lon < extent.maxLon;
         lon += stepLon) {
      centers.push_back({lon, lat});
    }
  }
  return centers;
}
|
||||
|
||||
// ── Hex grid ────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Centers of a flat hex grid covering `extent`: rows are sqrt(3)/2 of the
/// cell size apart and every other row is shifted by half a cell.
/// Non-positive cell sizes yield an empty grid.
std::vector<Coord> hex_grid(BBox extent, double cellSizeKm) {
  std::vector<Coord> centers;
  if (cellSizeKm <= 0) return centers;

  double midLat = (extent.minLat + extent.maxLat) / 2.0;
  double cosLat = std::cos(midLat * DEG2RAD);
  if (cosLat < 1e-10) cosLat = 1e-10; // guard against division by ~0 at poles

  double stepLat = cellSizeKm / 110.574;
  double stepLon = cellSizeKm / (111.320 * cosLat);
  double rowStep = stepLat * std::sqrt(3.0) / 2.0; // hex row spacing

  int row = 0;
  for (double lat = extent.minLat + rowStep / 2.0; lat < extent.maxLat;
       lat += rowStep) {
    // Odd rows shift east by half a cell to interlock the hexes.
    const double shift = (row % 2 == 1) ? stepLon / 2.0 : 0.0;
    for (double lon = extent.minLon + stepLon / 2.0 + shift;
         lon < extent.maxLon; lon += stepLon) {
      centers.push_back({lon, lat});
    }
    row++;
  }
  return centers;
}
|
||||
|
||||
// ── Viewport estimation ─────────────────────────────────────────────────────
|
||||
|
||||
double estimate_viewport_sq_km(double lat, int zoom, int widthPx, int heightPx) {
|
||||
double metersPerPx =
|
||||
(156543.03392 * std::cos(lat * DEG2RAD)) / std::pow(2.0, zoom);
|
||||
double widthKm = (widthPx * metersPerPx) / 1000.0;
|
||||
double heightKm = (heightPx * metersPerPx) / 1000.0;
|
||||
return widthKm * heightKm;
|
||||
}
|
||||
|
||||
} // namespace geo
|
||||
6
packages/kbot/cpp/packages/grid/CMakeLists.txt
Normal file
6
packages/kbot/cpp/packages/grid/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
||||
# Grid generation library: turns GADM features + options into waypoint grids.
add_library(grid STATIC src/grid.cpp)

# Anchor the include dir to this directory (bare relative paths resolve
# against each including scope); matches the style used by the html package.
target_include_directories(grid PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)

# Depends on geo (math) and gadm_reader (Feature type)
target_link_libraries(grid PUBLIC geo gadm_reader)
|
||||
56
packages/kbot/cpp/packages/grid/include/grid/grid.h
Normal file
56
packages/kbot/cpp/packages/grid/include/grid/grid.h
Normal file
@ -0,0 +1,56 @@
|
||||
#pragma once

#include "geo/geo.h"
#include "gadm_reader/gadm_reader.h"

#include <functional>
#include <string>
#include <vector>

namespace grid {

// ── Types (mirror TS GridSearchHop) ─────────────────────────────────────────

// One stop on the generated search path.
struct Waypoint {
  int step = 0;          // 1-based position in the final ordered path
  double lng = 0;        // longitude in degrees (generators round to 1e-6)
  double lat = 0;        // latitude in degrees (generators round to 1e-6)
  double radius_km = 0;  // search radius covering this cell, km
  std::string area_gid;  // GADM id of the region the waypoint falls in
  std::string area_name; // human-readable region name
};

// Tuning knobs for grid generation.
// NOTE(review): cellOverlap, maxElevation and minDensity are never read by
// the current C++ implementation in grid.cpp — presumably kept for TS
// parity; confirm before relying on them.
struct GridOptions {
  std::string gridMode = "hex"; // "hex", "square", "admin", "centers"
  double cellSize = 5.0; // km
  double cellOverlap = 0.0;
  double centroidOverlap = 0.5;  // centers mode: allowed overlap fraction between centroids
  int maxCellsLimit = 15000;     // hard cap on estimated cell count (hex/square modes)
  double maxElevation = 0;
  double minDensity = 0;
  double minGhsPop = 0;          // minimum GHS population to keep a cell (0 = check disabled)
  double minGhsBuilt = 0;        // minimum GHS built-up weight to keep a cell (0 = check disabled)
  std::string ghsFilterMode = "AND"; // "AND" | "OR"
  bool allowMissingGhs = false;  // treat missing (0) GHS values as passing
  bool bypassFilters = false;    // skip all filtering
  std::string pathOrder = "snake"; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
  bool groupByRegion = true;     // keep each region's waypoints contiguous in the path
};

// Output of generate(): the ordered path plus bookkeeping counters.
struct GridResult {
  std::vector<Waypoint> waypoints;
  int validCells = 0;    // cells that passed the filters
  int skippedCells = 0;  // cells rejected by filters / overlap checks
  std::string error;     // non-empty on failure (e.g. grid too large)
};

// ── API ─────────────────────────────────────────────────────────────────────

/// Generate grid waypoints from GADM features + options.
/// This is the main entry point — equivalent to generateGridSearchCells() in TS.
GridResult generate(
    const std::vector<gadm::Feature>& features,
    const GridOptions& opts
);

} // namespace grid
|
||||
393
packages/kbot/cpp/packages/grid/src/grid.cpp
Normal file
393
packages/kbot/cpp/packages/grid/src/grid.cpp
Normal file
@ -0,0 +1,393 @@
|
||||
#include "grid/grid.h"

#include <algorithm>
#include <array>    // std::array (centersMap values in generate_centers)
#include <cmath>
#include <cstdio>   // std::snprintf (center de-duplication keys)
#include <map>
#include <unordered_map>
|
||||
|
||||
namespace grid {
|
||||
|
||||
// ── Internal types ──────────────────────────────────────────────────────────
|
||||
|
||||
// Per-cell bookkeeping record.
// NOTE(review): not referenced by any function in this file — possibly a
// leftover of the TS port or reserved for future use; confirm before removing.
struct CellInfo {
  geo::Coord center;   // cell center
  double radius_km;    // covering radius of the cell
  int region_idx;      // index into the input features vector
  bool allowed;        // did the cell pass the filters?
  std::string reason;  // rejection reason when !allowed
};
|
||||
|
||||
// ── Filter logic (mirrors checkCellFilters in TS) ───────────────────────────
|
||||
|
||||
// Decide whether a feature/cell passes the configured filters
// (mirrors checkCellFilters in TS). Returns true to keep the cell; on
// rejection sets `reason` and returns false.
//
// Only the GHS population / built-up thresholds are evaluated here; the
// `areaSqKm` parameter is currently unused, and maxElevation/minDensity
// options are not applied in this C++ port.
static bool check_filters(const gadm::Feature& feat, const GridOptions& opts,
                          double areaSqKm, std::string& reason) {
  if (opts.bypassFilters) return true;

  // GHS filter — a threshold of 0 disables the corresponding check.
  bool checkPop = opts.minGhsPop > 0;
  bool checkBuilt = opts.minGhsBuilt > 0;

  if (checkPop || checkBuilt) {
    double ghsPop = feat.ghsPopulation;
    double ghsBuilt = feat.ghsBuiltWeight;
    // A value of exactly 0 is treated as "missing" and passes only when
    // allowMissingGhs is set.
    bool popPass = checkPop && ((ghsPop == 0 && opts.allowMissingGhs) || ghsPop >= opts.minGhsPop);
    bool builtPass = checkBuilt && ((ghsBuilt == 0 && opts.allowMissingGhs) || ghsBuilt >= opts.minGhsBuilt);

    if (opts.ghsFilterMode == "OR") {
      // OR mode: reject only when every *active* check fails.
      if (checkPop && checkBuilt && !popPass && !builtPass) {
        reason = "GHS (OR) below thresholds";
        return false;
      } else if (checkPop && !checkBuilt && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      } else if (checkBuilt && !checkPop && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    } else {
      // AND mode: every active check must pass.
      if (checkPop && !popPass) {
        reason = "GHS Pop below threshold";
        return false;
      }
      if (checkBuilt && !builtPass) {
        reason = "GHS Built below threshold";
        return false;
      }
    }
  }

  return true;
}
|
||||
|
||||
// ── Sorting ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Order waypoints in place according to `pathOrder`:
//   "zigzag"     – rows north→south, each row scanned west→east
//   "snake"      – zigzag with every other row reversed (boustrophedon)
//   "spiral-out" – ascending distance from the waypoints' mean point
//   "spiral-in"  – descending distance from the waypoints' mean point
//   "shortest"   – greedy nearest-neighbour tour seeded at wps[0], O(n^2)
// Any other value leaves the order unchanged. `cellSize` (km) only affects
// the row tolerance used by zigzag/snake.
static void sort_waypoints(std::vector<Waypoint>& wps, const std::string& pathOrder,
                           double cellSize) {
  if (wps.size() <= 1) return;

  // Latitude delta (degrees) under which two waypoints count as the same
  // row: half a cell height, capped at 0.5°.
  double rowTolerance = std::min((cellSize / 111.32) * 0.5, 0.5);

  if (pathOrder == "zigzag" || pathOrder == "snake") {
    // Sort top-to-bottom, left-to-right within row tolerance.
    // NOTE(review): a tolerance-based comparator is not a strict weak
    // ordering in general (a≈b and b≈c do not imply a≈c), which is
    // technically UB for std::sort — confirm rows stay well separated.
    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      if (std::abs(a.lat - b.lat) > rowTolerance) {
        return b.lat < a.lat; // higher lat first (north to south)
      }
      return a.lng < b.lng; // left to right
    });

    if (pathOrder == "snake") {
      // Group into rows, reverse every other row
      std::vector<std::vector<Waypoint>> rows;
      std::vector<Waypoint> currentRow;
      // lastY anchors the current row; it only moves when a new row starts.
      double lastY = wps[0].lat;

      for (auto& wp : wps) {
        if (std::abs(wp.lat - lastY) > rowTolerance) {
          rows.push_back(std::move(currentRow));
          currentRow.clear(); // moved-from vector: clear() restores a known state
          lastY = wp.lat;
        }
        currentRow.push_back(wp);
      }
      if (!currentRow.empty()) rows.push_back(std::move(currentRow));

      // Rebuild wps row by row, reversing the odd rows.
      wps.clear();
      for (size_t i = 0; i < rows.size(); ++i) {
        if (i % 2 == 1) std::reverse(rows[i].begin(), rows[i].end());
        for (auto& wp : rows[i]) wps.push_back(std::move(wp));
      }
    }

  } else if (pathOrder == "spiral-out" || pathOrder == "spiral-in") {
    // Sort by distance from the mean point of all waypoints
    double cLon = 0, cLat = 0;
    for (const auto& wp : wps) { cLon += wp.lng; cLat += wp.lat; }
    cLon /= wps.size();
    cLat /= wps.size();
    geo::Coord center{cLon, cLat};

    std::sort(wps.begin(), wps.end(), [&](const Waypoint& a, const Waypoint& b) {
      double dA = geo::distance_km(center, {a.lng, a.lat});
      double dB = geo::distance_km(center, {b.lng, b.lat});
      return (pathOrder == "spiral-out") ? (dA < dB) : (dA > dB);
    });

  } else if (pathOrder == "shortest") {
    // Greedy nearest-neighbor: repeatedly hop to the closest unused waypoint.
    std::vector<Waypoint> sorted;
    sorted.reserve(wps.size());
    std::vector<bool> used(wps.size(), false);

    sorted.push_back(wps[0]);
    used[0] = true;

    for (size_t step = 1; step < wps.size(); ++step) {
      const auto& cur = sorted.back();
      double bestDist = 1e18; // holds a *squared* degree distance (ranking only)
      size_t bestIdx = 0;

      for (size_t i = 0; i < wps.size(); ++i) {
        if (used[i]) continue;
        // Squared planar distance in degrees — sufficient for ranking.
        double dx = wps[i].lng - cur.lng;
        double dy = wps[i].lat - cur.lat;
        double distSq = dx * dx + dy * dy;
        if (distSq < bestDist) {
          bestDist = distSq;
          bestIdx = i;
        }
      }

      sorted.push_back(wps[bestIdx]);
      used[bestIdx] = true;
    }

    wps = std::move(sorted);
  }
}
|
||||
|
||||
// ── Admin mode ──────────────────────────────────────────────────────────────
|
||||
|
||||
// One waypoint per administrative feature: the centroid of the outer ring
// with a radius reaching the feature's bbox north-east corner. Features with
// no geometry are ignored; filtered-out features are counted as skipped.
static GridResult generate_admin(const std::vector<gadm::Feature>& features,
                                 const GridOptions& opts) {
  GridResult out;

  for (const auto& feat : features) {
    if (feat.rings.empty() || feat.rings[0].empty()) continue;

    std::string skipReason;
    const bool passes = check_filters(feat, opts, feat.areaSqKm, skipReason);

    const geo::Coord c = geo::centroid(feat.rings[0]);
    // Covering radius: centroid to the bbox's max corner.
    const double rKm = geo::distance_km(c, {feat.bbox.maxLon, feat.bbox.maxLat});

    if (!passes) {
      out.skippedCells++;
      continue;
    }

    Waypoint wp;
    wp.step = static_cast<int>(out.waypoints.size() + 1);
    wp.lng = std::round(c.lon * 1e6) / 1e6;      // 6 decimal places (~0.1 m)
    wp.lat = std::round(c.lat * 1e6) / 1e6;
    wp.radius_km = std::round(rKm * 100.0) / 100.0;  // 2 decimal places
    wp.area_gid = feat.gid;
    wp.area_name = feat.name;
    out.waypoints.push_back(wp);
    out.validCells++;
  }

  return out;
}
|
||||
|
||||
// ── Centers mode ────────────────────────────────────────────────────────────
|
||||
|
||||
// "centers" mode: emit one waypoint per unique GHS population / built-up
// center of each feature, rejecting centers that overlap an already-accepted
// one by more than opts.centroidOverlap. Radius is half the configured
// cell size.
//
// Fix over the original: uses std::snprintf (qualified) — the unqualified
// snprintf is not guaranteed to be declared by <cstdio> in C++, it only
// happened to compile via transitive includes.
static GridResult generate_centers(const std::vector<gadm::Feature>& features,
                                   const GridOptions& opts) {
  GridResult res;

  // Locations already emitted, used for the mutual-overlap check below.
  struct AcceptedCenter {
    geo::Coord coord;
  };
  std::vector<AcceptedCenter> accepted;

  // Two centers closer than this many km are considered overlapping.
  double minAllowedDist = opts.cellSize * (1.0 - opts.centroidOverlap);

  for (size_t i = 0; i < features.size(); ++i) {
    const auto& f = features[i];

    // Collect unique centers by rounding to 5 decimal places
    std::map<std::string, std::array<double, 3>> centersMap; // key → [lon, lat, weight]

    auto addCenter = [&](double lon, double lat, double weight) {
      char key[32];
      std::snprintf(key, sizeof(key), "%.5f,%.5f", lon, lat);
      std::string k(key);
      if (centersMap.find(k) == centersMap.end()) {
        centersMap[k] = {lon, lat, weight};
      }
    };

    // Single pop/built centers ((0,0) is treated as "no center")
    if (f.ghsPopCenter.lon != 0 || f.ghsPopCenter.lat != 0) {
      addCenter(f.ghsPopCenter.lon, f.ghsPopCenter.lat, f.ghsPopulation);
    }
    if (f.ghsBuiltCenter.lon != 0 || f.ghsBuiltCenter.lat != 0) {
      addCenter(f.ghsBuiltCenter.lon, f.ghsBuiltCenter.lat, f.ghsBuiltWeight);
    }

    // Weighted center arrays ([lon, lat, weight] triples)
    for (const auto& c : f.ghsPopCenters) {
      addCenter(c[0], c[1], c[2]);
    }
    for (const auto& c : f.ghsBuiltCenters) {
      addCenter(c[0], c[1], c[2]);
    }

    for (const auto& [key, val] : centersMap) {
      geo::Coord pt{val[0], val[1]};

      std::string reason;
      // For centers, use the feature's overall filters
      bool allowed = check_filters(f, opts, f.areaSqKm, reason);

      // Check overlap with already-accepted centers (O(n²) over accepted)
      if (allowed && !accepted.empty()) {
        for (const auto& ac : accepted) {
          double dist = geo::distance_km(pt, ac.coord);
          if (dist < minAllowedDist) {
            allowed = false;
            reason = "overlaps another centroid";
            break;
          }
        }
      }

      if (allowed) {
        accepted.push_back({pt});
        res.waypoints.push_back({
            static_cast<int>(res.waypoints.size() + 1),
            std::round(pt.lon * 1e6) / 1e6,
            std::round(pt.lat * 1e6) / 1e6,
            std::round((opts.cellSize / 2.0) * 100.0) / 100.0,
            f.gid,
            f.name
        });
        res.validCells++;
      } else {
        res.skippedCells++;
      }
    }
  }

  return res;
}
|
||||
|
||||
// ── Polygon grid mode (hex / square) ────────────────────────────────────────
|
||||
|
||||
// "hex" / "square" mode: lay a grid over the union bbox of all features and
// keep only the cells whose center falls inside some feature's outer ring.
static GridResult generate_polygon_grid(const std::vector<gadm::Feature>& features,
                                        const GridOptions& opts) {
  GridResult res;

  // Compute union bbox of all features
  std::vector<geo::BBox> boxes;
  for (const auto& f : features) {
    if (!f.rings.empty()) boxes.push_back(f.bbox);
  }
  if (boxes.empty()) return res;

  geo::BBox extent = geo::bbox_union(boxes);

  // Estimate cell count to prevent runaway grids before generating anything.
  double widthKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.maxLon, extent.minLat});
  double heightKm = geo::distance_km({extent.minLon, extent.minLat}, {extent.minLon, extent.maxLat});
  // 2.6 appears to be a fudge factor inflating per-cell area so the estimate
  // errs low — TODO confirm against the TS implementation.
  double approxCellArea = opts.cellSize * opts.cellSize * 2.6;
  int approxCells = static_cast<int>(std::ceil((widthKm * heightKm) / approxCellArea));

  if (approxCells > opts.maxCellsLimit) {
    res.error = "Grid too massive (~" + std::to_string(approxCells) + " cells). Increase cell size or select smaller region.";
    return res;
  }

  // Generate grid centers
  std::vector<geo::Coord> gridCenters;
  if (opts.gridMode == "square") {
    gridCenters = geo::square_grid(extent, opts.cellSize);
  } else {
    gridCenters = geo::hex_grid(extent, opts.cellSize);
  }

  // For each grid center, check if it intersects any feature polygon.
  // NOTE(review): only rings[0] (the outer ring) is tested — holes and
  // secondary polygon parts are ignored; confirm this matches the TS port.
  for (const auto& gc : gridCenters) {
    bool intersects = false;
    int regionIdx = -1;

    for (size_t i = 0; i < features.size(); ++i) {
      if (features[i].rings.empty()) continue;
      if (geo::point_in_polygon(gc, features[i].rings[0])) {
        intersects = true;
        regionIdx = static_cast<int>(i);
        break; // first matching region wins
      }
    }

    if (!intersects) continue;

    // regionIdx is guaranteed valid here (intersects implies an assignment).
    const auto& regionFeat = features[regionIdx];
    std::string reason;
    bool allowed = check_filters(regionFeat, opts, regionFeat.areaSqKm, reason);

    // Compute cell radius (half diagonal of a square cell; also used for hex)
    double cellRadiusKm = opts.cellSize * std::sqrt(2.0) / 2.0;

    if (allowed) {
      res.waypoints.push_back({
          static_cast<int>(res.waypoints.size() + 1),
          std::round(gc.lon * 1e6) / 1e6,
          std::round(gc.lat * 1e6) / 1e6,
          std::round(cellRadiusKm * 100.0) / 100.0,
          regionFeat.gid,
          regionFeat.name
      });
      res.validCells++;
    } else {
      res.skippedCells++;
    }
  }

  return res;
}
|
||||
|
||||
// ── Main entry point ────────────────────────────────────────────────────────
|
||||
|
||||
// Dispatch to the mode-specific generator, order the resulting waypoints and
// renumber their steps. Equivalent to generateGridSearchCells() in TS.
GridResult generate(const std::vector<gadm::Feature>& features,
                    const GridOptions& opts) {
  GridResult out;

  if (features.empty()) {
    out.error = "No features provided";
    return out;
  }

  // 1) Produce raw waypoints for the requested mode.
  if (opts.gridMode == "admin") {
    out = generate_admin(features, opts);
  } else if (opts.gridMode == "centers") {
    out = generate_centers(features, opts);
  } else {
    out = generate_polygon_grid(features, opts); // "hex" / "square"
  }

  if (!out.error.empty()) return out;

  // 2) Order the path.
  auto& wps = out.waypoints;
  if (wps.size() > 1) {
    if (opts.groupByRegion && features.size() > 1) {
      // Keep each region's waypoints contiguous, then order every region's
      // run independently with the requested path order.
      std::stable_sort(wps.begin(), wps.end(),
                       [](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; });

      size_t lo = 0;
      while (lo < wps.size()) {
        size_t hi = lo + 1;
        while (hi < wps.size() && wps[hi].area_gid == wps[lo].area_gid) {
          ++hi;
        }
        std::vector<Waypoint> group(wps.begin() + lo, wps.begin() + hi);
        sort_waypoints(group, opts.pathOrder, opts.cellSize);
        std::copy(group.begin(), group.end(), wps.begin() + lo);
        lo = hi;
      }
    } else {
      sort_waypoints(wps, opts.pathOrder, opts.cellSize);
    }
  }

  // 3) Re-number steps after sorting.
  for (size_t i = 0; i < wps.size(); ++i) {
    wps[i].step = static_cast<int>(i + 1);
  }

  return out;
}
|
||||
|
||||
} // namespace grid
|
||||
33
packages/kbot/cpp/packages/html/CMakeLists.txt
Normal file
33
packages/kbot/cpp/packages/html/CMakeLists.txt
Normal file
@ -0,0 +1,33 @@
|
||||
include(FetchContent)

# Vendored HTML parser dependency (lexbor), pinned to a tagged release.
FetchContent_Declare(
  lexbor
  GIT_REPOSITORY https://github.com/lexbor/lexbor.git
  GIT_TAG v2.4.0
  GIT_SHALLOW TRUE
)

# Build lexbor as static
# (FORCE is intentional here: we pin the vendored dependency's build mode so a
# stale cache entry cannot re-enable the shared build.)
set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(lexbor)

add_library(html STATIC
  src/html.cpp
  src/html2md.cpp
  src/table.cpp
)

# MSVC: treat source and execution charset as UTF-8
# (fixes \u200b zero-width-space mismatch in html2md tests)
if(MSVC)
  target_compile_options(html PRIVATE /utf-8)
endif()

target_include_directories(html
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# lexbor_static is the static-library target provided by the lexbor build.
target_link_libraries(html
  PUBLIC lexbor_static
)
|
||||
55
packages/kbot/cpp/packages/html/include/html/html.h
Normal file
55
packages/kbot/cpp/packages/html/include/html/html.h
Normal file
@ -0,0 +1,55 @@
|
||||
#pragma once

#include <string>
#include <vector>

// HTML parsing / extraction helpers (backed by lexbor — see this package's
// CMakeLists). All functions take the raw HTML string by const reference;
// presumably each call re-parses the document — verify before calling in
// tight loops.
namespace html {

/// Parsed element — tag name + text content.
struct Element {
  std::string tag;
  std::string text;
};

/// Link with href and optional attributes.
struct Link {
  std::string href;
  std::string rel;  // e.g. "canonical", "stylesheet"
  std::string text; // anchor text (for <a> tags)
};

/// Parse an HTML string and return all elements with their text content.
std::vector<Element> parse(const std::string &html_str);

/// Extract the text content of all elements matching a CSS selector.
std::vector<std::string> select(const std::string &html_str,
                                const std::string &selector);

// ── Enricher extraction helpers ─────────────────────────────────────────────

/// Extract the <title> text.
std::string get_title(const std::string &html_str);

/// Extract a <meta name="X"> or <meta property="X"> content attribute.
std::string get_meta(const std::string &html_str, const std::string &name);

/// Extract <link rel="canonical"> href.
std::string get_canonical(const std::string &html_str);

/// Extract all <a href="..."> values (resolved links as-is from the HTML).
std::vector<Link> get_links(const std::string &html_str);

/// Extract visible body text, stripping script/style/noscript/svg/iframe.
std::string get_body_text(const std::string &html_str);

/// Extract raw JSON strings from <script type="application/ld+json">.
std::vector<std::string> get_json_ld(const std::string &html_str);

/// Extract an attribute value from the first element matching a CSS selector.
std::string get_attr(const std::string &html_str, const std::string &selector,
                     const std::string &attr_name);

/// Convert HTML content to Markdown.
std::string to_markdown(const std::string &html_str);

} // namespace html
|
||||
690
packages/kbot/cpp/packages/html/include/html/html2md.h
Normal file
690
packages/kbot/cpp/packages/html/include/html/html2md.h
Normal file
@ -0,0 +1,690 @@
|
||||
// Copyright (c) Tim Gromeyer
|
||||
// Licensed under the MIT License - https://opensource.org/licenses/MIT
|
||||
|
||||
#ifndef HTML2MD_H
|
||||
#define HTML2MD_H
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <cstdint>
|
||||
|
||||
/*!
|
||||
* \brief html2md namespace
|
||||
*
|
||||
* The html2md namespace provides:
|
||||
* 1. The Converter class
|
||||
* 2. Static wrapper around Converter class
|
||||
*
|
||||
* \note Do NOT try to convert HTML that contains a list in an ordered list or a
|
||||
* `blockquote` in a list!\n This will be a **total** mess!
|
||||
*/
|
||||
namespace html2md {
|
||||
|
||||
/*!
|
||||
* \brief Options for the conversion from HTML to Markdown
|
||||
* \warning Make sure to pass valid options; otherwise, the output will be
|
||||
* invalid!
|
||||
*
|
||||
* Example from `tests/main.cpp`:
|
||||
*
|
||||
* ```cpp
|
||||
* auto *options = new html2md::Options();
|
||||
* options->splitLines = false;
|
||||
*
|
||||
* html2md::Converter c(html, options);
|
||||
* auto md = c.convert();
|
||||
* ```
|
||||
*/
|
||||
struct Options {
  /// Wrap lines once a length threshold is reached.
  /// \see softBreak
  /// \see hardBreak
  bool splitLines = true;

  /// Soft wrap: break at the next space after this many characters, as long
  /// as it's not in a list, table, image or anchor (link).
  int softBreak = 80;

  /// Hard wrap: force a break after this many characters in a line.
  int hardBreak = 100;

  /// The char used for unordered lists. Valid: `-`, `+`, `*`.
  char unorderedList = '-';

  /// The char used after the number of an ordered-list item. Valid: `.`, `)`.
  char orderedList = '.';

  /// Whether the title is added as an h1 heading at the very beginning of
  /// the markdown. Default is true.
  bool includeTitle = true;

  /// Whether to format Markdown tables. Default is true.
  bool formatTable = true;

  /// Whether to force left trim of lines in the final Markdown output.
  /// Default is false.
  bool forceLeftTrim = false;

  /// Whether to compress whitespace (tabs, multiple spaces) into a single
  /// space. Default is false.
  bool compressWhitespace = false;

  /// Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent them
  /// from being interpreted as lists in Markdown. Default is true.
  bool escapeNumberedList = true;

  /// Whether to keep HTML entities (e.g. `&nbsp;`) in the output instead of
  /// replacing them via the internal conversion map. Default is false.
  bool keepHtmlEntities = false;

  /// Field-by-field equality.
  /// Fix over the original: takes a const reference instead of copying the
  /// whole struct by value; stray trailing semicolon removed.
  inline bool operator==(const Options &o) const {
    return splitLines == o.splitLines && unorderedList == o.unorderedList &&
           orderedList == o.orderedList && includeTitle == o.includeTitle &&
           softBreak == o.softBreak && hardBreak == o.hardBreak &&
           formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
           compressWhitespace == o.compressWhitespace &&
           escapeNumberedList == o.escapeNumberedList &&
           keepHtmlEntities == o.keepHtmlEntities;
  }
};
|
||||
|
||||
/*!
|
||||
* \brief Class for converting HTML to Markdown
|
||||
*
|
||||
* This class converts HTML to Markdown.
|
||||
* There is also a static wrapper for this class (see html2md::Convert).
|
||||
*
|
||||
* ## Usage example
|
||||
*
|
||||
* Option 1: Use the class:
|
||||
*
|
||||
* ```cpp
|
||||
* std::string html = "<h1>example</h1>";
|
||||
* html2md::Converter c(html);
|
||||
* auto md = c.convert();
|
||||
*
|
||||
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
|
||||
* std::cout << md; // # example
|
||||
* ```
|
||||
*
|
||||
* Option 2: Use the static wrapper:
|
||||
*
|
||||
* ```cpp
|
||||
* std::string html = "<h1>example</h1>";
|
||||
*
|
||||
* auto md = html2md::Convert(html);
|
||||
* std::cout << md;
|
||||
* ```
|
||||
*
|
||||
* Advanced: use Options:
|
||||
*
|
||||
* ```cpp
|
||||
* std::string html = "<h1>example</h1>";
|
||||
*
|
||||
* auto *options = new html2md::Options();
|
||||
* options->splitLines = false;
|
||||
* options->unorderedList = '*';
|
||||
*
|
||||
* html2md::Converter c(html, options);
|
||||
* auto md = c.convert();
|
||||
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
|
||||
* std::cout << md; // # example
|
||||
* ```
|
||||
*/
|
||||
class Converter {
|
||||
public:
|
||||
/*!
|
||||
* \brief Standard initializer, takes HTML as parameter. Also prepares
|
||||
* everything. \param html The HTML as std::string. \param options Options for
|
||||
* the Conversation. See html2md::Options() for more.
|
||||
*
|
||||
* \note Don't pass anything else than HTML, otherwise the output will be a
|
||||
* **mess**!
|
||||
*
|
||||
* This is the default initializer.<br>
|
||||
* You can use appendToMd() to append something to the beginning of the
|
||||
* generated output.
|
||||
*/
|
||||
explicit inline Converter(const std::string &html,
|
||||
struct Options *options = nullptr) {
|
||||
*this = Converter(&html, options);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Convert HTML into Markdown.
|
||||
* \return Returns the converted Markdown.
|
||||
*
|
||||
* This function actually converts the HTML into Markdown.
|
||||
* It also cleans up the Markdown so you don't have to do anything.
|
||||
*/
|
||||
[[nodiscard]] std::string convert();
|
||||
|
||||
/*!
|
||||
* \brief Append a char to the Markdown.
|
||||
* \param ch The char to append.
|
||||
* \return Returns a copy of the instance with the char appended.
|
||||
*/
|
||||
Converter *appendToMd(char ch);
|
||||
|
||||
/*!
|
||||
* \brief Append a char* to the Markdown.
|
||||
* \param str The char* to append.
|
||||
* \return Returns a copy of the instance with the char* appended.
|
||||
*/
|
||||
Converter *appendToMd(const char *str);
|
||||
|
||||
/*!
|
||||
* \brief Append a string to the Markdown.
|
||||
* \param s The string to append.
|
||||
* \return Returns a copy of the instance with the string appended.
|
||||
*/
|
||||
inline Converter *appendToMd(const std::string &s) {
|
||||
return appendToMd(s.c_str());
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Appends a ' ' in certain cases.
|
||||
* \return Copy of the instance with(maybe) the appended space.
|
||||
*
|
||||
* This function appends ' ' if:
|
||||
* - md does not end with `*`
|
||||
* - md does not end with `\n` aka newline
|
||||
*/
|
||||
Converter *appendBlank();
|
||||
|
||||
/*!
|
||||
* \brief Add an HTML symbol conversion
|
||||
* \param htmlSymbol The HTML symbol to convert
|
||||
* \param replacement The replacement string
|
||||
* \note This is useful for converting HTML entities to their Markdown
|
||||
* equivalents. For example, you can add a conversion for " " to
|
||||
* " " (space) or "<" to "<" (less than).
|
||||
* \note This is not a standard feature of the Converter class, but it can
|
||||
* be added to the class to allow for more flexibility in the conversion
|
||||
* process. You can use this feature to add custom conversions for any HTML
|
||||
* symbol that you want to convert to a specific Markdown representation.
|
||||
*/
|
||||
void addHtmlSymbolConversion(const std::string &htmlSymbol,
|
||||
const std::string &replacement) {
|
||||
htmlSymbolConversions_[htmlSymbol] = replacement;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Remove an HTML symbol conversion
|
||||
* \param htmlSymbol The HTML symbol to remove
|
||||
* \note This is useful for removing custom conversions that you have added
|
||||
* previously.
|
||||
*/
|
||||
void removeHtmlSymbolConversion(const std::string &htmlSymbol) {
|
||||
htmlSymbolConversions_.erase(htmlSymbol);
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Clear all HTML symbol conversions
|
||||
* \note This is useful for clearing the conversion map (it's empty afterwards).
|
||||
*/
|
||||
void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); }
|
||||
|
||||
/*!
|
||||
* \brief Checks if everything was closed properly(in the HTML).
|
||||
* \return Returns false if there is a unclosed tag.
|
||||
* \note As long as you have not called convert(), it always returns true.
|
||||
*/
|
||||
[[nodiscard]] bool ok() const;
|
||||
|
||||
/*!
|
||||
* \brief Reset the generated Markdown
|
||||
*/
|
||||
void reset();
|
||||
|
||||
/*!
|
||||
* \brief Checks if the HTML matches and the options are the same.
|
||||
* \param The Converter object to compare with
|
||||
* \return true if the HTML and options matches otherwise false
|
||||
*/
|
||||
inline bool operator==(const Converter *c) const { return *this == *c; }
|
||||
|
||||
inline bool operator==(const Converter &c) const {
|
||||
return html_ == c.html_ && option == c.option;
|
||||
}
|
||||
|
||||
/*!
|
||||
* \brief Returns ok().
|
||||
*/
|
||||
inline explicit operator bool() const { return ok(); };
|
||||
|
||||
 private:
  // Attribute names read while parsing tags.
  static constexpr const char *kAttributeHref = "href";
  static constexpr const char *kAttributeAlt = "alt";
  static constexpr const char *kAttributeTitle = "title";
  static constexpr const char *kAttributeClass = "class";
  static constexpr const char *kAttributeSrc = "src";
  // NOTE(review): misspelled ("Attrinute") but kept — the implementation
  // references this exact spelling; renaming here would break it.
  static constexpr const char *kAttrinuteAlign = "align";

  // Structural tag names.
  static constexpr const char *kTagAnchor = "a";
  static constexpr const char *kTagBreak = "br";
  static constexpr const char *kTagCode = "code";
  static constexpr const char *kTagDiv = "div";
  static constexpr const char *kTagHead = "head";
  static constexpr const char *kTagLink = "link";
  static constexpr const char *kTagListItem = "li";
  static constexpr const char *kTagMeta = "meta";
  static constexpr const char *kTagNav = "nav";
  static constexpr const char *kTagNoScript = "noscript";
  static constexpr const char *kTagOption = "option";
  static constexpr const char *kTagOrderedList = "ol";
  static constexpr const char *kTagParagraph = "p";
  static constexpr const char *kTagPre = "pre";
  static constexpr const char *kTagScript = "script";
  static constexpr const char *kTagSpan = "span";
  static constexpr const char *kTagStyle = "style";
  static constexpr const char *kTagTemplate = "template";
  static constexpr const char *kTagTitle = "title";
  static constexpr const char *kTagUnorderedList = "ul";
  static constexpr const char *kTagImg = "img";
  // NOTE(review): misspelled ("Seperator") but kept for the same reason.
  static constexpr const char *kTagSeperator = "hr";

  // Text format
  static constexpr const char *kTagBold = "b";
  static constexpr const char *kTagStrong = "strong";
  static constexpr const char *kTagItalic = "em";
  static constexpr const char *kTagItalic2 = "i";
  static constexpr const char *kTagCitation = "cite";
  static constexpr const char *kTagDefinition = "dfn";
  static constexpr const char *kTagUnderline = "u";
  // NOTE(review): misspelled ("Strighthrought" → strikethrough) but kept.
  static constexpr const char *kTagStrighthrought = "del";
  static constexpr const char *kTagStrighthrought2 = "s";

  static constexpr const char *kTagBlockquote = "blockquote";

  // Header
  static constexpr const char *kTagHeader1 = "h1";
  static constexpr const char *kTagHeader2 = "h2";
  static constexpr const char *kTagHeader3 = "h3";
  static constexpr const char *kTagHeader4 = "h4";
  static constexpr const char *kTagHeader5 = "h5";
  static constexpr const char *kTagHeader6 = "h6";

  // Table
  static constexpr const char *kTagTable = "table";
  static constexpr const char *kTagTableRow = "tr";
  static constexpr const char *kTagTableHeader = "th";
  static constexpr const char *kTagTableData = "td";

  // Index of the character currently being consumed from html_.
  size_t index_ch_in_html_ = 0;

  // Parser state flags. Semantics follow their names; they are toggled by the
  // tag handlers implemented out of line (confirm details in html2md.cpp).
  bool is_closing_tag_ = false;
  bool is_in_attribute_value_ = false;
  bool is_in_code_ = false;
  bool is_in_list_ = false;
  bool is_in_p_ = false;
  bool is_in_pre_ = false;
  bool is_in_table_ = false;
  bool is_in_table_row_ = false;
  bool is_in_tag_ = false;
  bool is_self_closing_tag_ = false;
  bool skipping_leading_whitespace_ = true;

  // relevant for <li> only, false = is in unordered list
  bool is_in_ordered_list_ = false;
  uint8_t index_ol = 0;  // presumably the current ordered-list item number — confirm

  // store the table start
  size_t table_start = 0;

  // number of lists
  uint8_t index_li = 0;

  uint8_t index_blockquote = 0;  // presumably blockquote nesting depth — confirm

  // Lookbehind over the markdown emitted so far.
  char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
  char prev_ch_in_html_ = 'x';

  std::string html_;  // source HTML being converted

  uint16_t offset_lt_ = 0;  // NOTE(review): presumably offset of the last '<' — confirm
  std::string current_tag_;
  std::string prev_tag_;

  // Line which separates header from data
  std::string tableLine;

  size_t chars_in_curr_line_ = 0;  // width of the markdown line in progress

  std::string md_;  // markdown output accumulated so far

  Options option;  // conversion options for this run
|
||||
|
||||
std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
|
||||
{""", "\""}, {"<", "<"}, {">", ">"},
|
||||
{"&", "&"}, {" ", " "}, {"→", "→"}};
|
||||
|
||||
  // Tag: base class for tag types. One hook fires when the opening tag has
  // been fully consumed, the other when the closing tag has.
  struct Tag {
    virtual void OnHasLeftOpeningTag(Converter *c) = 0;
    virtual void OnHasLeftClosingTag(Converter *c) = 0;
  };
|
||||
|
||||
  // Tag types — one handler per HTML tag (named after the tag they serve).
  // The hook implementations live out of line; wiring of tag name → handler
  // happens in the implementation file.

  // tags that are not printed (nav, script, noscript, ...)
  struct TagIgnored : Tag {
    void OnHasLeftOpeningTag(Converter *c) override {};
    void OnHasLeftClosingTag(Converter *c) override {};
  };

  // Anchor (<a>) handler; keeps the attributes of the anchor currently open.
  struct TagAnchor : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;

    std::string current_href_;   // href attribute of the open anchor
    std::string current_title_;  // title attribute of the open anchor
  };

  struct TagBold : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagItalic : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagUnderline : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  // NOTE(review): name misspelled ("Strikethrought") but kept — the
  // out-of-line definitions use this spelling.
  struct TagStrikethrought : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagBreak : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagDiv : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  // Headers <h1>..<h6>.
  struct TagHeader1 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader2 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader3 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader4 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader5 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader6 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagListItem : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagOption : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagOrderedList : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagParagraph : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagPre : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagCode : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagSpan : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTitle : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagUnorderedList : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagImage : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  // Horizontal rule (<hr>); name misspelled ("Seperator") but kept.
  struct TagSeperator : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  // Table elements: <table>, <tr>, <th>, <td>.
  struct TagTable : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableRow : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableHeader : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableData : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagBlockquote : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };
|
||||
|
||||
  // Registry mapping tag name → shared handler instance.
  std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;

  // Private ctor (implemented out of line).
  explicit Converter(const std::string *html, struct Options *options);

  // Post-processing pass over the generated markdown.
  void CleanUpMarkdown();

  // Trim from start (in place)
  static void LTrim(std::string *s);

  // Trim from end (in place); returns the converter for call chaining.
  Converter *RTrim(std::string *s, bool trim_only_blank = false);

  // Trim from both ends (in place); returns the converter for call chaining.
  Converter *Trim(std::string *s);

  // 1. trim all lines
  // 2. reduce consecutive newlines to maximum 3
  void TidyAllLines(std::string *str);

  // Extract the value of |attr| from the tag left of the parse position.
  std::string ExtractAttributeFromTagLeftOf(const std::string &attr);

  // Turn the markdown line in progress into an H1 header.
  void TurnLineIntoHeader1();

  // Turn the markdown line in progress into an H2 header.
  void TurnLineIntoHeader2();

  // Current char: '<'
  void OnHasEnteredTag();

  // Refresh prev_ch_in_md_ / prev_prev_ch_in_md_ from the markdown buffer.
  Converter *UpdatePrevChFromMd();

  /**
   * Handle next char within <...> tag
   *
   * @param ch current character
   * @return continue surrounding iteration?
   */
  bool ParseCharInTag(char ch);

  // Current char: '>'
  bool OnHasLeftTag();
|
||||
|
||||
inline static bool TagContainsAttributesToHide(std::string *tag) {
|
||||
using std::string;
|
||||
|
||||
return (*tag).find(" aria=\"hidden\"") != string::npos ||
|
||||
(*tag).find("display:none") != string::npos ||
|
||||
(*tag).find("visibility:hidden") != string::npos ||
|
||||
(*tag).find("opacity:0") != string::npos ||
|
||||
(*tag).find("Details-content--hidden-not-important") != string::npos;
|
||||
}
|
||||
|
||||
  // Shorten the generated markdown by |chars| characters (implemented out of
  // line); returns the converter for call chaining.
  Converter *ShortenMarkdown(size_t chars = 1);
|
||||
inline bool shortIfPrevCh(char prev) {
|
||||
if (prev_ch_in_md_ == prev) {
|
||||
ShortenMarkdown();
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
|
||||
  /**
   * Handle the next char of text content (outside any <...> tag).
   *
   * @param ch current character
   * @return continue iteration surrounding this method's invocation?
   */
  bool ParseCharInTagContent(char ch);

  // Replace previous space (if any) in current markdown line by newline
  bool ReplacePreviousSpaceInLineByNewline();
|
||||
|
||||
static inline bool IsIgnoredTag(const std::string &tag) {
|
||||
return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
|
||||
kTagScript == tag || kTagNoScript == tag || kTagNav == tag);
|
||||
|
||||
// meta: not ignored to tolerate if closing is omitted
|
||||
}
|
||||
|
||||
  // True while the parser is inside a tag classified by IsIgnoredTag().
  [[nodiscard]] bool IsInIgnoredTag() const;
|
||||
}; // Converter
|
||||
|
||||
/*!
|
||||
* \brief Static wrapper around the Converter class
|
||||
* \param html The HTML passed to Converter
|
||||
* \param ok Optional: Pass a reference to a local bool to store the output of
|
||||
* Converter::ok() \return Returns the by Converter generated Markdown
|
||||
*/
|
||||
inline std::string Convert(const std::string &html, bool *ok = nullptr) {
|
||||
Converter c(html);
|
||||
auto md = c.convert();
|
||||
if (ok != nullptr)
|
||||
*ok = c.ok();
|
||||
return md;
|
||||
}
|
||||
|
||||
#ifndef PYTHON_BINDINGS
// Rvalue overload; forwards to the lvalue version above.
// NOTE(review): `const std::string &&` binds only const rvalues and cannot be
// moved from — a plain const-lvalue-ref overload would behave the same;
// confirm whether this overload is still needed.
inline std::string Convert(const std::string &&html, bool *ok = nullptr) {
  return Convert(html, ok);
}
#endif
|
||||
|
||||
} // namespace html2md
|
||||
|
||||
#endif // HTML2MD_H
|
||||
11
packages/kbot/cpp/packages/html/include/html/table.h
Normal file
11
packages/kbot/cpp/packages/html/include/html/table.h
Normal file
@ -0,0 +1,11 @@
|
||||
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT

#ifndef TABLE_H
#define TABLE_H

#include <string>

// Re-align a markdown table: every column is padded to its widest cell and
// the header separator row is stretched to match (see table.cpp).
[[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable);

#endif // TABLE_H
|
||||
101
packages/kbot/cpp/packages/html/readme.md
Normal file
101
packages/kbot/cpp/packages/html/readme.md
Normal file
@ -0,0 +1,101 @@
|
||||
# Scraper Request
|
||||
|
||||
## OpenAPI Specification
|
||||
|
||||
```yaml
|
||||
openapi: 3.0.1
|
||||
info:
|
||||
title: ''
|
||||
description: ''
|
||||
version: 1.0.0
|
||||
paths:
|
||||
/api/v1/scraper/request:
|
||||
post:
|
||||
summary: Scraper Request
|
||||
deprecated: false
|
||||
description: ''
|
||||
tags:
|
||||
- Scraping API
|
||||
parameters: []
|
||||
requestBody:
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
properties:
|
||||
actor:
|
||||
type: string
|
||||
input:
|
||||
type: object
|
||||
properties:
|
||||
url:
|
||||
type: string
|
||||
required:
|
||||
- url
|
||||
x-apidog-orders:
|
||||
- url
|
||||
proxy:
|
||||
type: object
|
||||
properties:
|
||||
country:
|
||||
type: string
|
||||
required:
|
||||
- country
|
||||
x-apidog-orders:
|
||||
- country
|
||||
async:
|
||||
type: boolean
|
||||
description: |-
|
||||
If true, the task will be executed asynchronously.
|
||||
If false, the task will be executed synchronously.
|
||||
required:
|
||||
- actor
|
||||
- input
|
||||
- proxy
|
||||
x-apidog-orders:
|
||||
- actor
|
||||
- input
|
||||
- proxy
|
||||
- async
|
||||
example:
|
||||
actor: scraper.xxx
|
||||
input:
|
||||
url: >-
|
||||
https://www.***.com/shop/us/products/stmicroelectronics/tda7265a-3074457345625542393/
|
||||
proxy:
|
||||
country: US
|
||||
async: false
|
||||
responses:
|
||||
'200':
|
||||
description: ''
|
||||
content:
|
||||
application/json:
|
||||
schema:
|
||||
type: object
|
||||
properties: {}
|
||||
x-apidog-orders: []
|
||||
headers: {}
|
||||
x-apidog-name: Success
|
||||
security:
|
||||
- apikey-header-x-api-token: []
|
||||
x-apidog-folder: Scraping API
|
||||
x-apidog-status: released
|
||||
x-run-in-apidog: https://app.apidog.com/web/project/745098/apis/api-11949852-run
|
||||
components:
|
||||
schemas: {}
|
||||
securitySchemes:
|
||||
bearer:
|
||||
      type: http
|
||||
scheme: bearer
|
||||
description: Bearer token authentication using your Scrapeless API key
|
||||
apikey-header-x-api-token:
|
||||
type: apiKey
|
||||
in: header
|
||||
name: x-api-token
|
||||
servers:
|
||||
- url: https://api.scrapeless.com
|
||||
description: Prod Env
|
||||
security:
|
||||
- apikey-header-x-api-token: []
|
||||
|
||||
```
|
||||
403
packages/kbot/cpp/packages/html/src/html.cpp
Normal file
403
packages/kbot/cpp/packages/html/src/html.cpp
Normal file
@ -0,0 +1,403 @@
|
||||
#include "html/html.h"
|
||||
|
||||
#include <lexbor/css/css.h>
|
||||
#include <lexbor/html/html.h>
|
||||
#include <lexbor/selectors/selectors.h>
|
||||
#include <html/html2md.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstring>
|
||||
|
||||
namespace html {
|
||||
|
||||
// ── helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Full text content of |node| as a std::string ("" when lexbor yields none).
// lxb_dom_node_text_content allocates; the buffer is released with
// lxb_dom_document_destroy_text before returning.
static std::string node_text(lxb_dom_node_t *node) {
  size_t len = 0;
  lxb_char_t *text = lxb_dom_node_text_content(node, &len);
  if (!text)
    return {};
  std::string result(reinterpret_cast<const char *>(text), len);
  lxb_dom_document_destroy_text(node->owner_document, text);
  return result;
}
|
||||
|
||||
// Qualified tag name of |el| (e.g. "div"); "" when lexbor returns none.
static std::string tag_name(lxb_dom_element_t *el) {
  size_t len = 0;
  const lxb_char_t *name = lxb_dom_element_qualified_name(el, &len);
  return name ? std::string(reinterpret_cast<const char *>(name), len)
              : std::string{};
}
|
||||
|
||||
// Value of attribute |attr| on |el|; "" when the attribute is absent.
static std::string get_element_attr(lxb_dom_element_t *el, const char *attr) {
  size_t len = 0;
  const lxb_char_t *val = lxb_dom_element_get_attribute(
      el, reinterpret_cast<const lxb_char_t *>(attr), strlen(attr), &len);
  return val ? std::string(reinterpret_cast<const char *>(val), len)
             : std::string{};
}
|
||||
|
||||
// Parse |html_str| into a fresh lexbor document.
// Returns nullptr on allocation or parse failure; on success the caller owns
// the document and must free it with lxb_html_document_destroy.
static lxb_html_document_t *parse_doc(const std::string &html_str) {
  auto *doc = lxb_html_document_create();
  if (!doc) return nullptr;
  auto status = lxb_html_document_parse(
      doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()),
      html_str.size());
  if (status != LXB_STATUS_OK) {
    lxb_html_document_destroy(doc);
    return nullptr;
  }
  return doc;
}
|
||||
|
||||
// ── Helper: check if a tag name matches a noise element ─────────────────────
|
||||
|
||||
// True for elements whose subtree carries no useful page text.
static bool is_noise_tag(const std::string &name) {
  static constexpr const char *kNoise[] = {"script", "style", "noscript",
                                           "svg", "iframe"};
  for (const char *tag : kNoise) {
    if (name == tag)
      return true;
  }
  return false;
}
|
||||
|
||||
// ── walk tree recursively ───────────────────────────────────────────────────
|
||||
|
||||
// Depth-first walk: for every element whose text content is non-empty,
// append {tag_name, text} to |out|, then recurse into its children.
// NOTE(review): node_text presumably includes descendant text, so a parent
// and its children each contribute overlapping entries — confirm intended.
static void walk(lxb_dom_node_t *node, std::vector<Element> &out) {
  if (!node)
    return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto txt = node_text(node);
    if (!txt.empty()) {
      out.push_back({tag_name(el), txt});
    }
  }
  auto *child = node->first_child;
  while (child) {
    walk(child, out);
    child = child->next;
  }
}
|
||||
|
||||
// ── Walk for visible text only (skip noise tags) ────────────────────────────
|
||||
|
||||
// Append the visible text beneath |node| to |out|, skipping noise subtrees
// (see is_noise_tag) and collapsing whitespace so that trimmed chunks are
// separated by a single space.
static void walk_text(lxb_dom_node_t *node, std::string &out) {
  if (!node) return;

  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto name = tag_name(el);
    if (is_noise_tag(name)) return; // Skip noise subtrees entirely
  }

  if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
    size_t len = 0;
    lxb_char_t *data = lxb_dom_node_text_content(node, &len);
    if (data) {
      if (len > 0) {
        std::string chunk(reinterpret_cast<const char *>(data), len);
        // Collapse whitespace: only add a separator when the previous output
        // character is not already a space/newline.
        bool needSpace = !out.empty() && out.back() != ' ' && out.back() != '\n';
        // Trim leading/trailing whitespace from chunk
        size_t start = chunk.find_first_not_of(" \t\n\r");
        size_t end = chunk.find_last_not_of(" \t\n\r");
        if (start != std::string::npos) {
          if (needSpace) out += ' ';
          out += chunk.substr(start, end - start + 1);
        }
      }
      // BUGFIX(review): the lexbor-allocated text buffer was leaked here —
      // node_text() frees the same allocation with
      // lxb_dom_document_destroy_text; do the same.
      lxb_dom_document_destroy_text(node->owner_document, data);
    }
  }

  auto *child = node->first_child;
  while (child) {
    walk_text(child, out);
    child = child->next;
  }
}
|
||||
|
||||
// ── Walk <head> for meta/title/link ─────────────────────────────────────────
|
||||
|
||||
// Aggregated <head> information gathered by walk_head().
struct HeadData {
  std::string title;      // text of <title>
  std::string canonical;  // href of <link rel="canonical">
  std::vector<std::pair<std::string, std::string>> metas; // name/property → content
  std::vector<std::string> json_ld; // bodies of <script type="application/ld+json">
};
|
||||
|
||||
// Recursively collect title / meta / canonical link / JSON-LD data from the
// subtree rooted at |node| into |data|.
static void walk_head(lxb_dom_node_t *node, HeadData &data) {
  if (!node) return;

  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto name = tag_name(el);

    if (name == "title") {
      data.title = node_text(node);
    } else if (name == "meta") {
      // A meta may carry name= and/or property=; the content is recorded
      // under each key that is present.
      auto nameAttr = get_element_attr(el, "name");
      auto propAttr = get_element_attr(el, "property");
      auto content = get_element_attr(el, "content");
      if (!content.empty()) {
        if (!nameAttr.empty()) data.metas.emplace_back(nameAttr, content);
        if (!propAttr.empty()) data.metas.emplace_back(propAttr, content);
      }
    } else if (name == "link") {
      auto rel = get_element_attr(el, "rel");
      if (rel == "canonical") {
        data.canonical = get_element_attr(el, "href");
      }
    } else if (name == "script") {
      // Structured-data blocks.
      auto type = get_element_attr(el, "type");
      if (type == "application/ld+json") {
        auto text = node_text(node);
        if (!text.empty()) data.json_ld.push_back(text);
      }
    }
  }

  auto *child = node->first_child;
  while (child) {
    walk_head(child, data);
    child = child->next;
  }
}
|
||||
|
||||
// ── Walk <body> for <a> links ───────────────────────────────────────────────
|
||||
|
||||
// Recursively collect every <a> with a non-empty href beneath |node|.
// Anchors without an href are skipped entirely.
static void walk_links(lxb_dom_node_t *node, std::vector<Link> &out) {
  if (!node) return;

  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto name = tag_name(el);

    if (name == "a") {
      auto href = get_element_attr(el, "href");
      if (!href.empty()) {
        Link lk;
        lk.href = href;
        lk.rel = get_element_attr(el, "rel");
        lk.text = node_text(node);
        out.push_back(std::move(lk));
      }
    }
  }

  auto *child = node->first_child;
  while (child) {
    walk_links(child, out);
    child = child->next;
  }
}
|
||||
|
||||
// ── public API ──────────────────────────────────────────────────────────────
|
||||
|
||||
// Public API: parse |html_str| and return one Element per body element with
// non-empty text content (see walk()). Empty vector on parse failure.
std::vector<Element> parse(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  std::vector<Element> result;
  auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
  walk(body, result);

  lxb_html_document_destroy(doc);
  return result;
}
|
||||
|
||||
// ── CSS selector callback ───────────────────────────────────────────────────
|
||||
|
||||
// Context threaded through lexbor's selector callback.
struct SelectCtx {
  std::vector<std::string> *out; // collected text of matched nodes
};

// Callback for lxb_selectors_find: append each matched node's non-empty text
// content to the context's output vector.
static lxb_status_t select_cb(lxb_dom_node_t *node,
                              lxb_css_selector_specificity_t spec, void *ctx) {
  (void)spec;
  auto *sctx = static_cast<SelectCtx *>(ctx);
  auto txt = node_text(node);
  if (!txt.empty()) {
    sctx->out->push_back(txt);
  }
  return LXB_STATUS_OK;
}
|
||||
|
||||
// Public API: text content of every node under <body> matching the CSS
// |selector|. Empty vector on parse failure or an unparsable selector.
std::vector<std::string> select(const std::string &html_str,
                                const std::string &selector) {
  std::vector<std::string> result;

  auto *doc = parse_doc(html_str);
  if (!doc) return result;

  auto *css_parser = lxb_css_parser_create();
  lxb_css_parser_init(css_parser, nullptr);

  auto *selectors = lxb_selectors_create();
  lxb_selectors_init(selectors);

  auto *list = lxb_css_selectors_parse(
      css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()),
      selector.size());

  // A null list means the selector failed to parse; fall through to cleanup.
  if (list) {
    SelectCtx ctx{&result};
    lxb_selectors_find(
        selectors, lxb_dom_interface_node(lxb_html_document_body_element(doc)),
        list, select_cb, &ctx);
    lxb_css_selector_list_destroy_memory(list);
  }

  // Tear down in reverse order of construction.
  lxb_selectors_destroy(selectors, true);
  lxb_css_parser_destroy(css_parser, true);
  lxb_html_document_destroy(doc);

  return result;
}
|
||||
|
||||
// ── Enricher extraction helpers ─────────────────────────────────────────────
|
||||
|
||||
std::string get_title(const std::string &html_str) {
|
||||
auto *doc = parse_doc(html_str);
|
||||
if (!doc) return {};
|
||||
|
||||
HeadData data;
|
||||
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
|
||||
walk_head(head, data);
|
||||
|
||||
lxb_html_document_destroy(doc);
|
||||
return data.title;
|
||||
}
|
||||
|
||||
std::string get_meta(const std::string &html_str, const std::string &name) {
|
||||
auto *doc = parse_doc(html_str);
|
||||
if (!doc) return {};
|
||||
|
||||
HeadData data;
|
||||
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
|
||||
walk_head(head, data);
|
||||
|
||||
lxb_html_document_destroy(doc);
|
||||
|
||||
for (auto &[key, val] : data.metas) {
|
||||
if (key == name) return val;
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string get_canonical(const std::string &html_str) {
|
||||
auto *doc = parse_doc(html_str);
|
||||
if (!doc) return {};
|
||||
|
||||
HeadData data;
|
||||
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
|
||||
walk_head(head, data);
|
||||
|
||||
lxb_html_document_destroy(doc);
|
||||
return data.canonical;
|
||||
}
|
||||
|
||||
// Public API: every <a href> under <body>. Empty vector on parse failure.
std::vector<Link> get_links(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  std::vector<Link> links;
  auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
  walk_links(body, links);

  lxb_html_document_destroy(doc);
  return links;
}
|
||||
|
||||
// Public API: whitespace-collapsed visible text of <body> with noise
// subtrees (script/style/noscript/svg/iframe) removed. "" on parse failure.
std::string get_body_text(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  std::string text;
  auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
  walk_text(body, text);

  lxb_html_document_destroy(doc);
  return text;
}
|
||||
|
||||
// Public API: bodies of every <script type="application/ld+json"> in the
// document. Empty vector on parse failure.
std::vector<std::string> get_json_ld(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  HeadData data;
  // JSON-LD can be in head or body — walk entire document
  auto *root = lxb_dom_interface_node(
      lxb_dom_document_element(&doc->dom_document));
  walk_head(root, data);

  lxb_html_document_destroy(doc);
  return data.json_ld;
}
|
||||
|
||||
// ── get_attr via CSS selector ───────────────────────────────────────────────
|
||||
|
||||
// Context for get_attr(): records the first non-empty attribute value seen.
struct AttrCtx {
  std::string attr_name; // attribute to read on matched elements
  std::string result;    // first non-empty value found
  bool found;            // once true, later matches are ignored
};

// Callback for lxb_selectors_find: capture |attr_name| from the first
// matched element that carries a non-empty value for it.
static lxb_status_t attr_cb(lxb_dom_node_t *node,
                            lxb_css_selector_specificity_t spec, void *ctx) {
  (void)spec;
  auto *actx = static_cast<AttrCtx *>(ctx);
  if (actx->found) return LXB_STATUS_OK;

  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto val = get_element_attr(el, actx->attr_name.c_str());
    if (!val.empty()) {
      actx->result = val;
      actx->found = true;
    }
  }
  return LXB_STATUS_OK;
}
|
||||
|
||||
// Public API: value of |attr_name| on the first element in the whole
// document matching the CSS |selector|; "" when nothing matches, the
// selector is unparsable, or the HTML fails to parse.
std::string get_attr(const std::string &html_str, const std::string &selector,
                     const std::string &attr_name) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};

  auto *css_parser = lxb_css_parser_create();
  lxb_css_parser_init(css_parser, nullptr);

  auto *selectors = lxb_selectors_create();
  lxb_selectors_init(selectors);

  auto *list = lxb_css_selectors_parse(
      css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()),
      selector.size());

  std::string result;
  if (list) {
    AttrCtx ctx{attr_name, {}, false};
    // Search from the document element (not just <body>) — attributes may
    // live on <html>/<head> descendants too.
    auto *root = lxb_dom_interface_node(
        lxb_dom_document_element(&doc->dom_document));
    lxb_selectors_find(selectors, root, list, attr_cb, &ctx);
    result = ctx.result;
    lxb_css_selector_list_destroy_memory(list);
  }

  // Tear down in reverse order of construction.
  lxb_selectors_destroy(selectors, true);
  lxb_css_parser_destroy(css_parser, true);
  lxb_html_document_destroy(doc);

  return result;
}
|
||||
|
||||
// Public API: convert HTML to Markdown via html2md, refusing oversized input
// with a truncation notice instead of converting it.
std::string to_markdown(const std::string &html_str) {
  // Defense-in-depth: hard cap at 2 MB even if the caller forgets.
  // The enricher pipeline already caps at 512 KB, but future callers
  // may not — prevent OOM / multi-second hangs from html2md.
  static constexpr size_t MAX_HTML2MD_INPUT = 2 * 1024 * 1024;
  if (html_str.size() > MAX_HTML2MD_INPUT) {
    return "*[Content truncated: HTML too large for markdown conversion ("
           + std::to_string(html_str.size() / 1024) + " KB)]*\n";
  }
  return html2md::Convert(html_str);
}
|
||||
|
||||
} // namespace html
|
||||
1195
packages/kbot/cpp/packages/html/src/html2md.cpp
Normal file
1195
packages/kbot/cpp/packages/html/src/html2md.cpp
Normal file
File diff suppressed because it is too large
Load Diff
106
packages/kbot/cpp/packages/html/src/table.cpp
Normal file
106
packages/kbot/cpp/packages/html/src/table.cpp
Normal file
@ -0,0 +1,106 @@
|
||||
// Copyright (c) Tim Gromeyer
|
||||
// Licensed under the MIT License - https://opensource.org/licenses/MIT
|
||||
|
||||
#include "html/table.h"
|
||||
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <vector>
|
||||
|
||||
using std::string;
using std::vector;

const size_t MIN_LINE_LENGTH = 3; // Minimum length of line

// Strip leading and trailing ' ' characters from |str| in place.
// A string consisting only of spaces becomes empty.
void removeLeadingTrailingSpaces(string &str) {
  const size_t first = str.find_first_not_of(' ');
  if (first == string::npos) {
    // Entire string is spaces
    str.clear();
    return;
  }

  const size_t last = str.find_last_not_of(' ');
  str = str.substr(first, last - first + 1);
}

// Stretch a markdown header-separator cell (e.g. ":--", "--:", ":-:") to
// |length| characters of '-', preserving the leading/trailing ':' alignment
// markers. Returns "" for empty input or a width below MIN_LINE_LENGTH.
string enlargeTableHeaderLine(const string &str, size_t length) {
  if (str.empty() || length < MIN_LINE_LENGTH)
    return "";

  const size_t first = str.find_first_of(':');
  size_t last = str.find_last_of(':');

  // A lone ':' at position 0 marks left alignment only.
  if (first == 0 && last == first)
    last = string::npos;

  string line(length, '-');
  if (first == 0)
    line[0] = ':';
  if (last == str.length() - 1)
    line[length - 1] = ':';

  return line;
}
|
||||
|
||||
// Re-align a pipe-delimited markdown table: trims every cell, measures
// per-column widths, left-pads data cells, and stretches the separator row
// (row index 1) so all pipes line up. Returns "" when no cells were parsed.
string formatMarkdownTable(const string &inputTable) {
  std::istringstream iss(inputTable);
  string line;
  vector<vector<string>> tableData;

  // Parse the input table into a 2D vector
  while (std::getline(iss, line)) {
    std::istringstream lineStream(line);
    string cell;
    vector<string> rowData;

    while (std::getline(lineStream, cell, '|')) {
      removeLeadingTrailingSpaces(cell); // Trim first
      if (!cell.empty()) {               // Then check if empty
        rowData.push_back(cell);
      }
    }

    if (!rowData.empty()) {
      tableData.push_back(std::move(rowData)); // Move rowData to avoid copying
    }
  }

  if (tableData.empty()) {
    return "";
  }

  // Determine maximum width of each column
  vector<size_t> columnWidths(tableData[0].size(), 0);
  for (const auto &row : tableData) {
    // Later rows may have more cells than the first row; grow as needed.
    if (columnWidths.size() < row.size()) {
      columnWidths.resize(row.size(), 0);
    }

    for (size_t i = 0; i < row.size(); ++i) {
      columnWidths[i] = std::max(columnWidths[i], row[i].size());
    }
  }

  // Build the formatted table
  std::ostringstream formattedTable;
  for (size_t rowNumber = 0; rowNumber < tableData.size(); ++rowNumber) {
    const auto &row = tableData[rowNumber];

    formattedTable << "|";

    for (size_t i = 0; i < row.size(); ++i) {
      // Row 1 is treated as the header separator ("---" cells): stretch it
      // to the column width plus the two padding spaces of data cells.
      if (rowNumber == 1) {
        formattedTable << enlargeTableHeaderLine(row[i], columnWidths[i] + 2)
                       << "|";
        continue;
      }
      // Data cell: left-aligned, padded to the column width.
      formattedTable << " " << std::setw(columnWidths[i]) << std::left << row[i]
                     << " |";
    }
    formattedTable << "\n";
  }

  return formattedTable.str();
}
|
||||
48
packages/kbot/cpp/packages/http/CMakeLists.txt
Normal file
48
packages/kbot/cpp/packages/http/CMakeLists.txt
Normal file
@ -0,0 +1,48 @@
|
||||
include(FetchContent)

# Work around curl's old cmake_minimum_required for CMake 4.x
# NOTE(review): CMAKE_POLICY_VERSION_MINIMUM is a global escape hatch; drop it
# once the pinned curl release raises its own minimum.
set(CMAKE_POLICY_VERSION_MINIMUM 3.5 CACHE STRING "" FORCE)

# NOTE(review): release tarball is fetched without URL_HASH — add a SHA256 so
# the download is integrity-checked and reproducible.
FetchContent_Declare(
  CURL
  URL https://github.com/curl/curl/releases/download/curl-8_12_1/curl-8.12.1.tar.xz
  DOWNLOAD_EXTRACT_TIMESTAMP TRUE
)

# Minimal curl build — static, SChannel TLS, no optional deps
# NOTE(review): BUILD_SHARED_LIBS / BUILD_TESTING are build-wide cache knobs;
# FORCE-setting them here affects every project in the superbuild, not just curl.
set(BUILD_CURL_EXE OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)

# TLS backend: platform-appropriate
# Windows uses the OS-native SChannel (no extra dependency); everything else
# links OpenSSL.
if(WIN32)
  set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE)
  set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE)
else()
  set(CURL_USE_SCHANNEL OFF CACHE BOOL "" FORCE)
  set(CURL_USE_OPENSSL ON CACHE BOOL "" FORCE)
endif()

# Disable optional compression/protocol deps
set(CURL_ZLIB OFF CACHE BOOL "" FORCE)
set(CURL_BROTLI OFF CACHE BOOL "" FORCE)
set(CURL_ZSTD OFF CACHE BOOL "" FORCE)
set(USE_NGHTTP2 OFF CACHE BOOL "" FORCE)
set(CURL_USE_LIBSSH2 OFF CACHE BOOL "" FORCE)
set(CURL_USE_LIBPSL OFF CACHE BOOL "" FORCE)
set(CURL_DISABLE_LDAP ON CACHE BOOL "" FORCE)
set(CURL_DISABLE_LDAPS ON CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(CURL)

# Thin wrapper library around libcurl; public API in include/http/http.h.
add_library(http STATIC
  src/http.cpp
)

target_include_directories(http
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# PUBLIC because src/postgres.cpp (a downstream consumer) includes
# <curl/curl.h> directly and relies on the transitive usage requirements.
target_link_libraries(http
  PUBLIC CURL::libcurl
)
|
||||
40
packages/kbot/cpp/packages/http/include/http/http.h
Normal file
40
packages/kbot/cpp/packages/http/include/http/http.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once

#include <string>

namespace http {

/// Result of an HTTP request.
/// On transport failure the implementation sets status_code to -1 and places
/// the curl error string in body; otherwise status_code is the HTTP response
/// code and body is the (decoded) response payload.
struct Response {
  long status_code = 0;  // default-initialized so a plain `Response r;` is never garbage
  std::string body;
};

/// Options for customisable HTTP GET requests.
struct GetOptions {
  std::string user_agent = "Mozilla/5.0 (compatible; PolymechBot/1.0)";
  int timeout_ms = 10000;       // total transfer timeout; <= 0 disables it
  bool follow_redirects = true;
};

/// Perform an HTTP GET request. Returns the response body and status code.
Response get(const std::string &url);

/// Perform an HTTP GET request with custom options.
Response get(const std::string &url, const GetOptions &opts);

/// Perform an HTTP POST request with a body. Returns the response and status.
Response post(const std::string &url, const std::string &body,
              const std::string &content_type = "application/json");

/// Options for customisable HTTP POST requests.
struct PostOptions {
  std::string content_type = "application/json";
  std::string bearer_token;  // when non-empty, sent as Authorization: Bearer <token>
  int timeout_ms = 30000;    // total transfer timeout; <= 0 disables it
};

/// Perform an HTTP POST request with custom options.
Response post(const std::string &url, const std::string &body,
              const PostOptions &opts);

} // namespace http
|
||||
216
packages/kbot/cpp/packages/http/src/http.cpp
Normal file
216
packages/kbot/cpp/packages/http/src/http.cpp
Normal file
@ -0,0 +1,216 @@
|
||||
#include "http/http.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
#include <mutex>
|
||||
#include <chrono>
|
||||
|
||||
namespace http {
|
||||
|
||||
// One-shot, thread-safe global curl initialization. curl_global_init is not
// itself thread-safe, so it must run exactly once before any easy handle is
// created; every ThreadLocalCurl constructor routes through here first.
static std::once_flag curl_init_flag;
static void ensure_curl_init() {
  std::call_once(curl_init_flag, []() {
    curl_global_init(CURL_GLOBAL_ALL);
  });
}
|
||||
|
||||
// RAII owner of one CURL easy handle per thread. The handle is reused across
// requests on the same thread and reset to default options before each use.
// NOTE(review): curl_global_cleanup is never called — acceptable for a
// process-lifetime singleton, but confirm that is intentional.
struct ThreadLocalCurl {
  CURL *handle;  // nullptr if curl_easy_init failed
  ThreadLocalCurl() {
    ensure_curl_init();
    handle = curl_easy_init();
  }
  ~ThreadLocalCurl() {
    if (handle) curl_easy_cleanup(handle);
  }
  // Return the per-thread handle with all options reset, or nullptr on failure.
  CURL *get() {
    if (handle) curl_easy_reset(handle);
    return handle;
  }
};

thread_local ThreadLocalCurl tl_curl;
|
||||
|
||||
struct ProgressData {
|
||||
std::chrono::steady_clock::time_point start_time;
|
||||
int timeout_ms;
|
||||
};
|
||||
|
||||
static int progress_cb(void *clientp, curl_off_t dltotal, curl_off_t dlnow,
|
||||
curl_off_t ultotal, curl_off_t ulnow) {
|
||||
auto *pd = static_cast<ProgressData *>(clientp);
|
||||
if (pd->timeout_ms <= 0) return 0;
|
||||
|
||||
auto now = std::chrono::steady_clock::now();
|
||||
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - pd->start_time).count();
|
||||
if (elapsed > pd->timeout_ms) {
|
||||
return 1; // Return non-zero to abort the transfer
|
||||
}
|
||||
return 0; // Continue
|
||||
}
|
||||
|
||||
// curl write callback: append the received chunk to the std::string passed
// via CURLOPT_WRITEDATA. Returning the full byte count signals success.
static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) {
  const size_t total = size * nmemb;
  static_cast<std::string *>(userp)->append(static_cast<char *>(contents), total);
  return total;
}
|
||||
|
||||
/// Convenience overload: GET with default GetOptions.
Response get(const std::string &url) {
  return get(url, GetOptions{});
}
|
||||
|
||||
/// GET with explicit options (timeout, redirect policy, user agent).
/// On transport failure returns status_code == -1 with the curl error string
/// in body; otherwise status_code is the HTTP response code.
Response get(const std::string &url, const GetOptions &opts) {
  Response resp{};

  CURL *curl = tl_curl.get();
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init (thread_local) failed";
    return resp;
  }

  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, opts.follow_redirects ? 1L : 0L);

  // Belt-and-braces timeout: curl's own TIMEOUT_MS plus a progress-callback
  // watchdog with a 1 s grace period in case libcurl's timer stalls.
  // prog_data must outlive curl_easy_perform below — it does (same scope).
  ProgressData prog_data;
  if (opts.timeout_ms > 0) {
    curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
    prog_data.start_time = std::chrono::steady_clock::now();
    prog_data.timeout_ms = opts.timeout_ms + 1000;
    curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
    curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
  }

  // Fail fast on dead sites (TCP SYN timeout)
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT_MS, 5000L);

  // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);

  // Prevent signal handlers from breaking in multithreaded environments
  curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);

  if (!opts.user_agent.empty()) {
    curl_easy_setopt(curl, CURLOPT_USERAGENT, opts.user_agent.c_str());
  }

  // Accept-Encoding for compressed responses ("" = every encoding curl supports)
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");

  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }

  return resp;
}
|
||||
|
||||
/// POST with only a Content-Type override.
///
/// Delegates to the PostOptions overload so the curl setup (timeouts, stall
/// detection, error handling) lives in exactly one place instead of being
/// duplicated. The explicit 10 s timeout preserves this overload's historical
/// hard-coded CURLOPT_TIMEOUT of 10 seconds (the watchdog grace period of
/// +1000 ms in the delegate matches the old 11000 ms value too).
Response post(const std::string &url, const std::string &body,
              const std::string &content_type) {
  PostOptions opts;
  opts.content_type = content_type;
  opts.timeout_ms = 10000;  // keep the old 10 s limit rather than PostOptions' 30 s default
  return post(url, body, opts);
}
|
||||
|
||||
/// POST with explicit options (content type, bearer token, timeout).
/// On transport failure returns status_code == -1 with the curl error string
/// in body; otherwise status_code is the HTTP response code.
Response post(const std::string &url, const std::string &body,
              const PostOptions &opts) {
  Response resp{};

  CURL *curl = tl_curl.get();
  if (!curl) {
    resp.status_code = -1;
    resp.body = "curl_easy_init failed";
    return resp;
  }

  struct curl_slist *headers = nullptr;
  headers =
      curl_slist_append(headers, ("Content-Type: " + opts.content_type).c_str());
  if (!opts.bearer_token.empty()) {
    // NOTE(review): the same token is sent under two header names
    // (Authorization: Bearer and x-api-token) — confirm both are required by
    // the target service; otherwise drop one.
    headers = curl_slist_append(
        headers, ("Authorization: Bearer " + opts.bearer_token).c_str());
    headers = curl_slist_append(
        headers, ("x-api-token: " + opts.bearer_token).c_str());
  }

  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
  curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
  curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);

  // Timeout + watchdog (1 s grace), mirroring get(). When timeout_ms <= 0
  // there is no total-time limit — only the low-speed abort below applies.
  ProgressData prog_data;
  if (opts.timeout_ms > 0) {
    curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
    prog_data.start_time = std::chrono::steady_clock::now();
    prog_data.timeout_ms = opts.timeout_ms + 1000;
    curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
    curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
  }

  // Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
  curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
  curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);

  CURLcode res = curl_easy_perform(curl);
  if (res != CURLE_OK) {
    resp.status_code = -1;
    resp.body = curl_easy_strerror(res);
  } else {
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
  }

  curl_slist_free_all(headers);
  return resp;
}
|
||||
|
||||
} // namespace http
|
||||
11
packages/kbot/cpp/packages/ipc/CMakeLists.txt
Normal file
11
packages/kbot/cpp/packages/ipc/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# IPC framing library: length-prefixed JSON messages (see include/ipc/ipc.h).
add_library(ipc STATIC
  src/ipc.cpp
)

target_include_directories(ipc
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# json provides the RapidJSON include path used by src/ipc.cpp;
# logger is used for error reporting in read_message.
target_link_libraries(ipc
  PUBLIC json logger
)
|
||||
34
packages/kbot/cpp/packages/ipc/include/ipc/ipc.h
Normal file
34
packages/kbot/cpp/packages/ipc/include/ipc/ipc.h
Normal file
@ -0,0 +1,34 @@
|
||||
#pragma once

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

namespace ipc {

/// A single IPC message: { id, type, payload (raw JSON string) }.
struct Message {
  std::string id;       // correlation id, opaque to this layer
  std::string type;     // message kind, opaque to this layer
  std::string payload;  // opaque JSON string (can be "{}" or any object)
};

/// Encode a Message into a length-prefixed binary frame.
/// Layout: [4-byte LE uint32 length][JSON bytes]
std::vector<uint8_t> encode(const Message &msg);

/// Decode a binary frame (without the 4-byte length prefix) into a Message.
/// Returns false if the JSON is invalid or missing required fields.
bool decode(const uint8_t *data, size_t len, Message &out);
bool decode(const std::vector<uint8_t> &frame, Message &out);

/// Blocking: read exactly one length-prefixed message from a FILE*.
/// Returns false on EOF or read error (the implementation also rejects
/// zero-length and oversized frames).
bool read_message(Message &out, FILE *in = stdin);

/// Write one length-prefixed message to a FILE*. Flushes after write.
/// Returns false on write error.
bool write_message(const Message &msg, FILE *out = stdout);

} // namespace ipc
|
||||
158
packages/kbot/cpp/packages/ipc/src/ipc.cpp
Normal file
158
packages/kbot/cpp/packages/ipc/src/ipc.cpp
Normal file
@ -0,0 +1,158 @@
|
||||
#include "ipc/ipc.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
#include "json/json.h"
|
||||
#include "logger/logger.h"
|
||||
|
||||
// We use RapidJSON directly for structured serialization
|
||||
#include <rapidjson/document.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
|
||||
#ifdef _WIN32
|
||||
#include <fcntl.h>
|
||||
#include <io.h>
|
||||
#endif
|
||||
|
||||
namespace ipc {
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// Serialize val into dst[0..3] in little-endian byte order.
static void write_u32_le(uint8_t *dst, uint32_t val) {
  for (int i = 0; i < 4; ++i) {
    dst[i] = static_cast<uint8_t>((val >> (8 * i)) & 0xFF);
  }
}
|
||||
|
||||
// Reconstruct a uint32 from 4 little-endian bytes at src.
static uint32_t read_u32_le(const uint8_t *src) {
  uint32_t val = 0;
  for (int i = 3; i >= 0; --i) {
    val = (val << 8) | src[i];
  }
  return val;
}
|
||||
|
||||
// Read exactly n bytes from f into buf, looping over short reads.
// Returns false if EOF or a stream error occurs before n bytes arrive.
static bool read_exact(FILE *f, uint8_t *buf, size_t n) {
  for (size_t done = 0; done < n;) {
    const size_t got = std::fread(buf + done, 1, n - done, f);
    if (got == 0) return false;  // EOF or error — no further progress possible
    done += got;
  }
  return true;
}
|
||||
|
||||
// ── encode ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Serialize msg into a length-prefixed frame:
///   [4-byte LE uint32 length][{"id":...,"type":...,"payload":...}]
/// If msg.payload parses as JSON it is embedded verbatim as a JSON value;
/// otherwise (including the empty string) it is embedded as a plain JSON
/// string, so decode() round-trips it either way.
std::vector<uint8_t> encode(const Message &msg) {
  // Build JSON: { "id": "...", "type": "...", "payload": ... }
  // payload is stored as a raw JSON string, so we parse it first
  rapidjson::StringBuffer sb;
  rapidjson::Writer<rapidjson::StringBuffer> w(sb);

  w.StartObject();
  w.Key("id");
  // Length-aware String() calls keep embedded NULs intact.
  w.String(msg.id.c_str(), static_cast<rapidjson::SizeType>(msg.id.size()));
  w.Key("type");
  w.String(msg.type.c_str(),
           static_cast<rapidjson::SizeType>(msg.type.size()));
  w.Key("payload");

  // If payload is valid JSON, embed it as-is; otherwise embed as string
  rapidjson::Document pd;
  if (!msg.payload.empty() &&
      !pd.Parse(msg.payload.c_str()).HasParseError()) {
    pd.Accept(w);
  } else {
    w.String(msg.payload.c_str(),
             static_cast<rapidjson::SizeType>(msg.payload.size()));
  }

  w.EndObject();

  const char *json_str = sb.GetString();
  uint32_t json_len = static_cast<uint32_t>(sb.GetSize());

  // Frame = 4-byte little-endian length prefix followed by the JSON bytes.
  std::vector<uint8_t> frame(4 + json_len);
  write_u32_le(frame.data(), json_len);
  std::memcpy(frame.data() + 4, json_str, json_len);

  return frame;
}
|
||||
|
||||
// ── decode ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Parse one JSON frame body (without the length prefix) into out.
/// Requires a top-level object with string "id" and "type". "payload" may be
/// any JSON value: strings are copied as-is, other values are re-serialized
/// into out.payload, and a missing payload becomes "{}".
bool decode(const uint8_t *data, size_t len, Message &out) {
  rapidjson::Document doc;
  doc.Parse(reinterpret_cast<const char *>(data), len);

  if (doc.HasParseError() || !doc.IsObject()) return false;

  if (!doc.HasMember("id") || !doc["id"].IsString()) return false;
  if (!doc.HasMember("type") || !doc["type"].IsString()) return false;

  out.id = doc["id"].GetString();
  out.type = doc["type"].GetString();

  if (doc.HasMember("payload")) {
    if (doc["payload"].IsString()) {
      out.payload = doc["payload"].GetString();
    } else {
      // Re-serialize non-string payload back to JSON string
      rapidjson::StringBuffer sb;
      rapidjson::Writer<rapidjson::StringBuffer> w(sb);
      doc["payload"].Accept(w);
      out.payload = sb.GetString();
    }
  } else {
    out.payload = "{}";
  }

  return true;
}
|
||||
|
||||
/// Convenience overload: decode a whole frame buffer (no length prefix).
bool decode(const std::vector<uint8_t> &frame, Message &out) {
  return decode(frame.data(), frame.size(), out);
}
|
||||
|
||||
// ── read_message ─────────────────────────────────────────────────────────────
|
||||
|
||||
/// Blocking read of one length-prefixed message from `in`.
/// Returns false on EOF, short read, zero/oversized frame length, or invalid
/// JSON (via decode).
bool read_message(Message &out, FILE *in) {
#ifdef _WIN32
  // Ensure binary mode on Windows to prevent \r\n translation
  // NOTE(review): re-applied on every call; harmless, but could be set once
  // at process startup instead.
  _setmode(_fileno(in), _O_BINARY);
#endif

  uint8_t len_buf[4];
  if (!read_exact(in, len_buf, 4)) return false;

  uint32_t msg_len = read_u32_le(len_buf);
  if (msg_len == 0 || msg_len > 10 * 1024 * 1024) { // sanity: max 10 MB
    logger::error("ipc::read_message: invalid length " +
                  std::to_string(msg_len));
    return false;
  }

  std::vector<uint8_t> buf(msg_len);
  if (!read_exact(in, buf.data(), msg_len)) return false;

  return decode(buf, out);
}
|
||||
|
||||
// ── write_message ────────────────────────────────────────────────────────────
|
||||
|
||||
bool write_message(const Message &msg, FILE *out) {
|
||||
#ifdef _WIN32
|
||||
_setmode(_fileno(out), _O_BINARY);
|
||||
#endif
|
||||
|
||||
auto frame = encode(msg);
|
||||
size_t written = std::fwrite(frame.data(), 1, frame.size(), out);
|
||||
if (written != frame.size()) return false;
|
||||
|
||||
std::fflush(out);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace ipc
|
||||
28
packages/kbot/cpp/packages/json/CMakeLists.txt
Normal file
28
packages/kbot/cpp/packages/json/CMakeLists.txt
Normal file
@ -0,0 +1,28 @@
|
||||
include(FetchContent)

# RapidJSON — use master for CMake 4.x compatibility (v1.1.0 is from 2016)
# NOTE(review): an unpinned `master` makes builds non-reproducible; pin
# GIT_TAG to a specific commit hash once a known-good one is chosen.
FetchContent_Declare(
  rapidjson
  GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
  GIT_TAG master
  GIT_SHALLOW TRUE
)

# Header-only usage: disable everything RapidJSON would otherwise build.
set(RAPIDJSON_BUILD_DOC OFF CACHE BOOL "" FORCE)
set(RAPIDJSON_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(RAPIDJSON_BUILD_TESTS OFF CACHE BOOL "" FORCE)

# Download only — we deliberately skip add_subdirectory() and consume the
# headers directly. NOTE(review): this FetchContent_Populate form is
# deprecated as of CMake 3.30; revisit when bumping the CMake minimum.
FetchContent_GetProperties(rapidjson)
if(NOT rapidjson_POPULATED)
  FetchContent_Populate(rapidjson)
  # Don't add_subdirectory — just use the headers
endif()

# Thin wrapper library; public API in include/json/json.h.
add_library(json STATIC
  src/json.cpp
)

# RapidJSON include dir is PUBLIC because dependents (e.g. ipc) include
# <rapidjson/...> directly.
target_include_directories(json
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
  PUBLIC ${rapidjson_SOURCE_DIR}/include
)
|
||||
23
packages/kbot/cpp/packages/json/include/json/json.h
Normal file
23
packages/kbot/cpp/packages/json/include/json/json.h
Normal file
@ -0,0 +1,23 @@
|
||||
#pragma once

#include <string>
#include <vector>

namespace json {

/// Parse a JSON string and return a pretty-printed version.
/// Returns "" if the input does not parse.
std::string prettify(const std::string &json_str);

/// Extract a string value by key from a JSON object (top-level only).
/// Returns "" if the input is not an object, the key is missing, or the
/// value is not a string.
std::string get_string(const std::string &json_str, const std::string &key);

/// Extract an int value by key from a JSON object (top-level only).
/// Returns 0 on any failure (note: 0 is indistinguishable from a stored 0).
int get_int(const std::string &json_str, const std::string &key);

/// Check if a JSON string is valid.
bool is_valid(const std::string &json_str);

/// Get all top-level keys from a JSON object.
/// Returns an empty vector if the input is not a valid JSON object.
std::vector<std::string> keys(const std::string &json_str);

} // namespace json
|
||||
62
packages/kbot/cpp/packages/json/src/json.cpp
Normal file
62
packages/kbot/cpp/packages/json/src/json.cpp
Normal file
@ -0,0 +1,62 @@
|
||||
#include "json/json.h"
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
#include <rapidjson/prettywriter.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
|
||||
namespace json {
|
||||
|
||||
std::string prettify(const std::string &json_str) {
|
||||
rapidjson::Document doc;
|
||||
doc.Parse(json_str.c_str());
|
||||
if (doc.HasParseError()) {
|
||||
return {};
|
||||
}
|
||||
|
||||
rapidjson::StringBuffer buffer;
|
||||
rapidjson::PrettyWriter<rapidjson::StringBuffer> writer(buffer);
|
||||
doc.Accept(writer);
|
||||
return std::string(buffer.GetString(), buffer.GetSize());
|
||||
}
|
||||
|
||||
// Top-level string lookup: "" on parse error, non-object input, missing key,
// or non-string value.
std::string get_string(const std::string &json_str, const std::string &key) {
  rapidjson::Document doc;
  if (doc.Parse(json_str.c_str()).HasParseError() || !doc.IsObject()) {
    return {};
  }

  const auto member = doc.FindMember(key.c_str());
  if (member == doc.MemberEnd() || !member->value.IsString()) {
    return {};
  }
  return std::string(member->value.GetString(), member->value.GetStringLength());
}
|
||||
|
||||
// Top-level int lookup: 0 on parse error, non-object input, missing key, or
// a value that is not a (32-bit) int.
int get_int(const std::string &json_str, const std::string &key) {
  rapidjson::Document doc;
  if (doc.Parse(json_str.c_str()).HasParseError() || !doc.IsObject()) {
    return 0;
  }

  const auto member = doc.FindMember(key.c_str());
  return (member != doc.MemberEnd() && member->value.IsInt())
             ? member->value.GetInt()
             : 0;
}
|
||||
|
||||
bool is_valid(const std::string &json_str) {
|
||||
rapidjson::Document doc;
|
||||
doc.Parse(json_str.c_str());
|
||||
return !doc.HasParseError();
|
||||
}
|
||||
|
||||
// Collect the names of every top-level member of a JSON object.
// Returns an empty vector for invalid JSON or non-object input.
std::vector<std::string> keys(const std::string &json_str) {
  rapidjson::Document doc;
  if (doc.Parse(json_str.c_str()).HasParseError() || !doc.IsObject()) {
    return {};
  }

  std::vector<std::string> names;
  names.reserve(doc.MemberCount());
  for (auto it = doc.MemberBegin(); it != doc.MemberEnd(); ++it) {
    names.emplace_back(it->name.GetString(), it->name.GetStringLength());
  }
  return names;
}
|
||||
|
||||
} // namespace json
|
||||
21
packages/kbot/cpp/packages/logger/CMakeLists.txt
Normal file
21
packages/kbot/cpp/packages/logger/CMakeLists.txt
Normal file
@ -0,0 +1,21 @@
|
||||
include(FetchContent)

# spdlog pinned to a release tag; shallow clone keeps the download small.
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG v1.15.1
  GIT_SHALLOW TRUE
)
FetchContent_MakeAvailable(spdlog)

# Thin wrapper library; public API in include/logger/logger.h.
add_library(logger STATIC
  src/logger.cpp
)

target_include_directories(logger
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# NOTE(review): logger.h exposes no spdlog types, so this could be PRIVATE
# unless some consumer includes spdlog headers directly — verify before changing.
target_link_libraries(logger
  PUBLIC spdlog::spdlog
)
|
||||
22
packages/kbot/cpp/packages/logger/include/logger/logger.h
Normal file
22
packages/kbot/cpp/packages/logger/include/logger/logger.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once

#include <string>

namespace logger {

/// Initialize the default logger (call once at startup).
/// log_level accepts "debug", "warn", "error"; anything else means "info".
void init(const std::string &app_name = "polymech", const std::string &log_level = "info");

/// Initialize logger with stderr sink (use in worker/IPC mode).
void init_stderr(const std::string &app_name = "polymech-worker", const std::string &log_level = "info");

/// Initialize logger with stderr and file sink (use in UDS worker mode).
/// The log file's parent directory is created if missing.
void init_uds(const std::string &app_name = "polymech-worker", const std::string &log_level = "info", const std::string &log_file = "logs/uds.json");

/// Log at various levels (forwarded to the spdlog default logger).
void info(const std::string &msg);
void warn(const std::string &msg);
void error(const std::string &msg);
void debug(const std::string &msg);

} // namespace logger
|
||||
57
packages/kbot/cpp/packages/logger/src/logger.cpp
Normal file
57
packages/kbot/cpp/packages/logger/src/logger.cpp
Normal file
@ -0,0 +1,57 @@
|
||||
#include "logger/logger.h"
|
||||
|
||||
#include <spdlog/sinks/stdout_color_sinks.h>
|
||||
#include <spdlog/sinks/basic_file_sink.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <filesystem>
|
||||
|
||||
|
||||
namespace logger {
|
||||
|
||||
static void apply_log_level(const std::string& level) {
|
||||
if (level == "debug") spdlog::set_level(spdlog::level::debug);
|
||||
else if (level == "warn") spdlog::set_level(spdlog::level::warn);
|
||||
else if (level == "error") spdlog::set_level(spdlog::level::err);
|
||||
else spdlog::set_level(spdlog::level::info);
|
||||
}
|
||||
|
||||
/// Set up a colored stdout logger as the spdlog default.
/// NOTE(review): spdlog registers loggers by name — calling init() twice with
/// the same app_name throws spdlog_ex; confirm callers initialize only once.
void init(const std::string &app_name, const std::string &log_level) {
  auto console = spdlog::stdout_color_mt(app_name);
  spdlog::set_default_logger(console);
  apply_log_level(log_level);
  spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}
|
||||
|
||||
/// Set up a colored stderr logger as the spdlog default — presumably so
/// stdout stays clean for IPC frames in worker mode (see header comment);
/// verify against the worker entry point.
void init_stderr(const std::string &app_name, const std::string &log_level) {
  auto console = spdlog::stderr_color_mt(app_name);
  spdlog::set_default_logger(console);
  apply_log_level(log_level);
  spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}
|
||||
|
||||
/// UDS worker logger: tee output to stderr and an append-mode log file, with
/// aggressive flushing so the file is readable while the worker runs.
void init_uds(const std::string &app_name, const std::string &log_level, const std::string &log_file) {
  auto console_sink = std::make_shared<spdlog::sinks::stderr_color_sink_mt>();

  // Create the log file's parent directory; the error_code overload swallows
  // failures so a bad path surfaces in the file-sink constructor instead.
  std::filesystem::path log_path(log_file);
  std::error_code ec;
  std::filesystem::create_directories(log_path.parent_path(), ec);

  auto file_sink = std::make_shared<spdlog::sinks::basic_file_sink_mt>(log_file, false); // false = append (truncate disabled)

  std::vector<spdlog::sink_ptr> sinks {console_sink, file_sink};
  auto multi_logger = std::make_shared<spdlog::logger>(app_name, sinks.begin(), sinks.end());

  spdlog::set_default_logger(multi_logger);
  apply_log_level(log_level);
  spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
  // Ensure logs are flushed immediately to file
  spdlog::flush_every(std::chrono::seconds(1));
  spdlog::flush_on(spdlog::level::info);
}
|
||||
|
||||
// Level-specific forwarding helpers onto the spdlog default logger.
void info(const std::string &msg) { spdlog::info(msg); }
void warn(const std::string &msg) { spdlog::warn(msg); }
void error(const std::string &msg) { spdlog::error(msg); }
void debug(const std::string &msg) { spdlog::debug(msg); }
|
||||
|
||||
} // namespace logger
|
||||
9
packages/kbot/cpp/packages/polymech/CMakeLists.txt
Normal file
9
packages/kbot/cpp/packages/polymech/CMakeLists.txt
Normal file
@ -0,0 +1,9 @@
|
||||
# Domain-level helpers over the postgres package (see polymech.h).
add_library(polymech STATIC
  src/polymech.cpp
)

target_include_directories(polymech
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# postgres → query helpers used by fetch_pages; logger → debug tracing.
target_link_libraries(polymech PUBLIC postgres logger)
|
||||
@ -0,0 +1,16 @@
|
||||
#pragma once

#include <string>
#include <vector>

namespace polymech {

/// Fetch all rows from the "pages" table.
/// Returns raw JSON array string from Supabase.
std::string fetch_pages();

/// Fetch pages with a specific select clause and optional filter.
/// @param select comma-separated columns or "*"
/// @param filter PostgREST filter expression (e.g. "id=eq.abc"), "" for none
/// @param limit  max rows to return; 0 means no limit
std::string fetch_pages(const std::string &select,
                        const std::string &filter = "", int limit = 0);

} // namespace polymech
|
||||
17
packages/kbot/cpp/packages/polymech/src/polymech.cpp
Normal file
17
packages/kbot/cpp/packages/polymech/src/polymech.cpp
Normal file
@ -0,0 +1,17 @@
|
||||
#include "polymech/polymech.h"
|
||||
#include "logger/logger.h"
|
||||
#include "postgres/postgres.h"
|
||||
|
||||
|
||||
namespace polymech {
|
||||
|
||||
/// Fetch every column of every row from the "pages" table.
std::string fetch_pages() { return fetch_pages("*"); }

/// Fetch from "pages" with an explicit select list, optional PostgREST
/// filter, and optional row limit (0 = unlimited). Returns the raw JSON body
/// produced by postgres::query.
std::string fetch_pages(const std::string &select, const std::string &filter,
                        int limit) {
  logger::debug("polymech::fetch_pages → select=" + select +
                " filter=" + filter + " limit=" + std::to_string(limit));
  return postgres::query("pages", select, filter, limit);
}
|
||||
|
||||
} // namespace polymech
|
||||
11
packages/kbot/cpp/packages/postgres/CMakeLists.txt
Normal file
11
packages/kbot/cpp/packages/postgres/CMakeLists.txt
Normal file
@ -0,0 +1,11 @@
|
||||
# Supabase/PostgREST client library (see include/postgres/postgres.h).
add_library(postgres STATIC
  src/postgres.cpp
)

target_include_directories(postgres
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# http carries the transitive CURL::libcurl link that src/postgres.cpp needs
# for its direct <curl/curl.h> usage; logger and json support diagnostics and
# response handling.
target_link_libraries(postgres
  PUBLIC logger http json
)
|
||||
@ -0,0 +1,46 @@
|
||||
#pragma once

#include <string>
#include <vector>

namespace postgres {

/// Supabase connection configuration.
struct Config {
  std::string supabase_url;  // base URL, no trailing slash (REST path is appended)
  std::string supabase_key;  // API key; sent as both apikey and bearer token
};

/// Initialize the Supabase client with URL and API key.
/// Must be called before any other function in this namespace.
void init(const Config &config);

/// Ping the Supabase REST API. Returns "ok" on success, error message on
/// failure.
std::string ping();

/// Query a table via the PostgREST API.
/// Returns the raw JSON response body.
/// @param table Table name (e.g. "profiles")
/// @param select Comma-separated columns (e.g. "id,username"), or "*"
/// @param filter PostgREST filter (e.g. "id=eq.abc"), or "" for no filter
///               (appended to the query string verbatim — caller must ensure
///               it is properly URL-encoded)
/// @param limit Max rows (0 = no limit)
std::string query(const std::string &table, const std::string &select = "*",
                  const std::string &filter = "", int limit = 0);

/// Insert a row into a table. Body is a JSON object string.
/// Returns the created row as JSON.
std::string insert(const std::string &table, const std::string &json_body);

/// Upsert a row into a table. Body is a JSON array or object string.
/// Returns the upserted array as JSON.
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict = "");

/// Update rows in a table. Body is a JSON object string.
/// Returns the updated rows as JSON.
std::string update(const std::string &table, const std::string &json_body, const std::string &filter);

/// Delete rows from a table.
/// Returns the deleted rows as JSON.
std::string del(const std::string &table, const std::string &filter);

} // namespace postgres
|
||||
236
packages/kbot/cpp/packages/postgres/src/postgres.cpp
Normal file
236
packages/kbot/cpp/packages/postgres/src/postgres.cpp
Normal file
@ -0,0 +1,236 @@
|
||||
#include "postgres/postgres.h"
|
||||
#include "http/http.h"
|
||||
#include "logger/logger.h"
|
||||
#include "json/json.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
#include <stdexcept>
|
||||
|
||||
namespace postgres {
|
||||
|
||||
// Process-wide Supabase connection state, populated once via init().
// NOTE(review): not synchronized — confirm init() completes before any worker
// thread issues queries.
static Config s_config;
static bool s_initialized = false;

/// Store the Supabase URL/key used by every subsequent request.
void init(const Config &config) {
  s_config = config;
  s_initialized = true;
  logger::debug("postgres::init → " + config.supabase_url);
}
|
||||
|
||||
/// Guard: fail loudly if a request helper runs before init().
static void ensure_init() {
  if (!s_initialized) {
    throw std::runtime_error("postgres::init() must be called first");
  }
}
|
||||
|
||||
/// Build the REST URL for a table query.
|
||||
static std::string build_url(const std::string &table,
|
||||
const std::string &select,
|
||||
const std::string &filter, int limit) {
|
||||
std::string url = s_config.supabase_url + "/rest/v1/" + table;
|
||||
url += "?select=" + select;
|
||||
if (!filter.empty()) {
|
||||
url += "&" + filter;
|
||||
}
|
||||
if (limit > 0) {
|
||||
url += "&limit=" + std::to_string(limit);
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
/// Make an authenticated GET request to the Supabase REST API.
|
||||
static http::Response supabase_get(const std::string &url) {
|
||||
// We need custom headers, so we use curl directly
|
||||
CURL *curl = curl_easy_init();
|
||||
http::Response resp{};
|
||||
if (!curl) {
|
||||
resp.status_code = -1;
|
||||
resp.body = "curl_easy_init failed";
|
||||
return resp;
|
||||
}
|
||||
|
||||
struct curl_slist *headers = nullptr;
|
||||
headers =
|
||||
curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str());
|
||||
headers = curl_slist_append(
|
||||
headers, ("Authorization: Bearer " + s_config.supabase_key).c_str());
|
||||
|
||||
auto write_cb = [](void *contents, size_t size, size_t nmemb, void *userp) {
|
||||
auto *out = static_cast<std::string *>(userp);
|
||||
out->append(static_cast<char *>(contents), size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||
curl_easy_setopt(
|
||||
curl, CURLOPT_WRITEFUNCTION,
|
||||
static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb));
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
resp.status_code = -1;
|
||||
resp.body = curl_easy_strerror(res);
|
||||
} else {
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
|
||||
}
|
||||
|
||||
curl_slist_free_all(headers);
|
||||
curl_easy_cleanup(curl);
|
||||
return resp;
|
||||
}
|
||||
|
||||
/// Make an authenticated request with a JSON body (POST, PATCH, DELETE).
|
||||
static http::Response supabase_request(const std::string &method,
|
||||
const std::string &url,
|
||||
const std::string &body,
|
||||
const std::string &prefer_header) {
|
||||
CURL *curl = curl_easy_init();
|
||||
http::Response resp{};
|
||||
if (!curl) {
|
||||
resp.status_code = -1;
|
||||
resp.body = "curl_easy_init failed";
|
||||
return resp;
|
||||
}
|
||||
|
||||
struct curl_slist *headers = nullptr;
|
||||
if (!body.empty()) {
|
||||
headers = curl_slist_append(headers, "Content-Type: application/json");
|
||||
}
|
||||
if (!prefer_header.empty()) {
|
||||
headers = curl_slist_append(headers, ("Prefer: " + prefer_header).c_str());
|
||||
}
|
||||
headers =
|
||||
curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str());
|
||||
headers = curl_slist_append(
|
||||
headers, ("Authorization: Bearer " + s_config.supabase_key).c_str());
|
||||
|
||||
auto write_cb = [](void *contents, size_t size, size_t nmemb, void *userp) {
|
||||
auto *out = static_cast<std::string *>(userp);
|
||||
out->append(static_cast<char *>(contents), size * nmemb);
|
||||
return size * nmemb;
|
||||
};
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||
if (!body.empty()) {
|
||||
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
|
||||
}
|
||||
curl_easy_setopt(
|
||||
curl, CURLOPT_WRITEFUNCTION,
|
||||
static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb));
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
resp.status_code = -1;
|
||||
resp.body = curl_easy_strerror(res);
|
||||
} else {
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
|
||||
}
|
||||
|
||||
curl_slist_free_all(headers);
|
||||
curl_easy_cleanup(curl);
|
||||
return resp;
|
||||
}
|
||||
|
||||
std::string ping() {
|
||||
ensure_init();
|
||||
// Lightweight check: query profiles with limit=0 to verify connectivity
|
||||
auto resp = supabase_get(s_config.supabase_url +
|
||||
"/rest/v1/profiles?select=id&limit=0");
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
logger::info("postgres::ping → ok (HTTP " +
|
||||
std::to_string(resp.status_code) + ")");
|
||||
return "ok";
|
||||
}
|
||||
logger::error("postgres::ping → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return "error: HTTP " + std::to_string(resp.status_code);
|
||||
}
|
||||
|
||||
std::string query(const std::string &table, const std::string &select,
|
||||
const std::string &filter, int limit) {
|
||||
ensure_init();
|
||||
auto url = build_url(table, select, filter, limit);
|
||||
logger::debug("postgres::query → " + url);
|
||||
|
||||
auto resp = supabase_get(url);
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
return resp.body;
|
||||
}
|
||||
logger::error("postgres::query → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return resp.body;
|
||||
}
|
||||
|
||||
std::string insert(const std::string &table, const std::string &json_body) {
|
||||
ensure_init();
|
||||
auto url = s_config.supabase_url + "/rest/v1/" + table;
|
||||
logger::debug("postgres::insert → " + url);
|
||||
|
||||
auto resp = supabase_request("POST", url, json_body, "return=representation");
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
return resp.body;
|
||||
}
|
||||
logger::error("postgres::insert → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return resp.body;
|
||||
}
|
||||
|
||||
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict) {
|
||||
ensure_init();
|
||||
auto url = s_config.supabase_url + "/rest/v1/" + table;
|
||||
if (!on_conflict.empty()) {
|
||||
url += "?on_conflict=" + on_conflict;
|
||||
}
|
||||
logger::debug("postgres::upsert → " + url);
|
||||
|
||||
auto resp = supabase_request("POST", url, json_body, "return=minimal, resolution=merge-duplicates");
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
return resp.body;
|
||||
}
|
||||
logger::error("postgres::upsert → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return resp.body;
|
||||
}
|
||||
|
||||
std::string update(const std::string &table, const std::string &json_body, const std::string &filter) {
|
||||
ensure_init();
|
||||
auto url = s_config.supabase_url + "/rest/v1/" + table;
|
||||
if (!filter.empty()) {
|
||||
url += "?" + filter;
|
||||
}
|
||||
logger::debug("postgres::update → " + url);
|
||||
|
||||
auto resp = supabase_request("PATCH", url, json_body, "return=representation");
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
return resp.body;
|
||||
}
|
||||
logger::error("postgres::update → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return resp.body;
|
||||
}
|
||||
|
||||
std::string del(const std::string &table, const std::string &filter) {
|
||||
ensure_init();
|
||||
auto url = s_config.supabase_url + "/rest/v1/" + table;
|
||||
if (!filter.empty()) {
|
||||
url += "?" + filter;
|
||||
}
|
||||
logger::debug("postgres::del → " + url);
|
||||
|
||||
auto resp = supabase_request("DELETE", url, "", "return=representation");
|
||||
if (resp.status_code >= 200 && resp.status_code < 300) {
|
||||
return resp.body;
|
||||
}
|
||||
logger::error("postgres::del → HTTP " + std::to_string(resp.status_code) +
|
||||
": " + resp.body);
|
||||
return resp.body;
|
||||
}
|
||||
|
||||
} // namespace postgres
|
||||
7
packages/kbot/cpp/packages/search/CMakeLists.txt
Normal file
7
packages/kbot/cpp/packages/search/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
# Static library: SerpAPI Google Maps search + TOML config loading.
add_library(search STATIC src/search.cpp)

# Anchor the public include path so this file also works when the directory
# is added as a subproject from elsewhere in the tree.
target_include_directories(search PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include")

# Depends on http (curl) and json (RapidJSON wrapper)
target_link_libraries(search PUBLIC http json)
# toml++ is an implementation detail of load_config() only.
target_link_libraries(search PRIVATE tomlplusplus::tomlplusplus)
|
||||
93
packages/kbot/cpp/packages/search/include/search/search.h
Normal file
93
packages/kbot/cpp/packages/search/include/search/search.h
Normal file
@ -0,0 +1,93 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace search {

// ── Result types ────────────────────────────────────────────────────────────

/// WGS84 point parsed from SerpAPI's `gps_coordinates` object.
struct GpsCoordinates {
  double lat = 0;
  double lng = 0;
};

/// One Google Maps listing parsed from a SerpAPI response.
struct MapResult {
  std::string title;
  std::string place_id;            // Google place identifier
  std::string data_id;
  std::string address;
  std::string phone;
  std::string website;
  std::string type;                // primary category
  std::vector<std::string> types;  // all categories
  double rating = 0;
  int reviews = 0;
  GpsCoordinates gps;
  std::string thumbnail;
  std::string raw_json;  // original result object, serialized verbatim
  std::string geo_json;  // reverse-geocode payload filled by resolve_geo_batch
};

/// Aggregate outcome of a search run.
struct SearchResult {
  std::vector<MapResult> results;
  int apiCalls = 0;    // number of SerpAPI HTTP requests issued
  std::string error;   // empty on success
};

// ── Config ──────────────────────────────────────────────────────────────────

/// Runtime tuning knobs, loaded from the [system] TOML table.
struct SystemTuningOptions {
  int executor_threads = 0; // 0 = hardware concurrency
  int max_concurrent_jobs_per_user = 10;
  int http_concurrency_throttle = 50;
  int queue_depth_max = 10000;
  int bulk_dequeue_size = 1;
  int ipc_timeout_ms = 300000;
  int max_ipc_connections = 100;
  int buffer_size_max = 50 * 1024 * 1024;
};

/// Keys and endpoints parsed from the TOML config (see load_config()).
struct Config {
  SystemTuningOptions system;
  std::string serpapi_key;          // [services].SERPAPI_KEY
  std::string geocoder_key;         // [services].GEO_CODER_KEY
  std::string bigdata_key;          // [services].BIG_DATA_KEY
  std::string scrapeless_key;       // [services].SCRAPELESS_KEY
  std::string postgres_url;         // [postgres].url
  std::string supabase_url;         // [supabase].url
  std::string supabase_service_key; // [supabase].service_key
  // [enricher]
  std::string enricher_meta_scraper;
  int enricher_meta_concurrency = 5;
  int enricher_meta_idle_timeout = 60;
  int enricher_location_concurrency = 1;
};

/// Load config from a TOML file (e.g. config/postgres.toml)
/// Missing keys keep their struct defaults; a parse error is reported on
/// stderr and a default-constructed Config is returned.
Config load_config(const std::string &path = "config/postgres.toml");

// ── Search API ──────────────────────────────────────────────────────────────

/// Parameters for one Google Maps search.
struct SearchOptions {
  std::string query;
  double lat = 0;   // map-centre latitude; lat==0 && lng==0 omits the ll param
  double lng = 0;
  int zoom = 13;    // Google Maps zoom level used in the ll parameter
  int limit = 20;   // max results returned across all pages
  std::string engine = "google_maps";
  std::string hl = "en";
  std::string google_domain = "google.com";
};

/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts);

/// Resolve geo coordinate to place info
/// Returns the raw reverse-geocode JSON body, or "{}" on failure / empty key.
std::string resolve_geo(double lat, double lng, const std::string &key,
                        int timeout_ms = 3000);

/// Reverse-geocode every result with non-zero GPS coordinates, filling
/// MapResult::geo_json; lookups run on up to `concurrency` threads.
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
                       int concurrency = 10, int timeout_ms = 3000);

} // namespace search
|
||||
311
packages/kbot/cpp/packages/search/src/search.cpp
Normal file
311
packages/kbot/cpp/packages/search/src/search.cpp
Normal file
@ -0,0 +1,311 @@
|
||||
#include "search/search.h"
|
||||
#include "http/http.h"
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
#include <toml++/toml.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <cstdio>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
#include <sstream>
|
||||
#include <thread>
|
||||
|
||||
namespace search {
|
||||
|
||||
// ── URL encoding (minimal) ──────────────────────────────────────────────────
|
||||
|
||||
/// Percent-encode `val`: RFC 3986 unreserved characters (alnum, '-', '_',
/// '.', '~') pass through; every other byte becomes an uppercase %XX escape.
static std::string url_encode(const std::string &val) {
  static const char *kHex = "0123456789ABCDEF";
  std::string out;
  out.reserve(val.size() * 2);
  for (unsigned char ch : val) {
    const bool unreserved = isalnum(ch) || ch == '-' || ch == '_' ||
                            ch == '.' || ch == '~';
    if (unreserved) {
      out.push_back(static_cast<char>(ch));
    } else {
      out.push_back('%');
      out.push_back(kHex[ch >> 4]);
      out.push_back(kHex[ch & 0x0F]);
    }
  }
  return out;
}
|
||||
|
||||
// ── Config loading ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Parse `path` as TOML and populate a Config.
/// Each key is copied only when present with a convertible value (toml++'s
/// value<T>() returns an empty optional otherwise), so missing keys keep the
/// struct defaults. A parse failure is reported on stderr and the defaults
/// are returned unchanged.
Config load_config(const std::string &path) {
  Config cfg;
  try {
    auto tbl = toml::parse_file(path);

    // [postgres]
    if (auto v = tbl["postgres"]["url"].value<std::string>())
      cfg.postgres_url = *v;

    // [supabase]
    if (auto v = tbl["supabase"]["url"].value<std::string>())
      cfg.supabase_url = *v;
    if (auto v = tbl["supabase"]["service_key"].value<std::string>())
      cfg.supabase_service_key = *v;

    // [services] — third-party API keys (note: TOML key names are UPPER_CASE)
    if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>())
      cfg.serpapi_key = *v;
    if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>())
      cfg.geocoder_key = *v;
    if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>())
      cfg.bigdata_key = *v;
    if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>())
      cfg.scrapeless_key = *v;

    // [enricher]
    if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>())
      cfg.enricher_meta_scraper = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>())
      cfg.enricher_meta_concurrency = *v;
    if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>())
      cfg.enricher_meta_idle_timeout = *v;
    if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>())
      cfg.enricher_location_concurrency = *v;

    // [system] — tuning knobs mirrored into SystemTuningOptions
    if (auto v = tbl["system"]["executor_threads"].value<int>())
      cfg.system.executor_threads = *v;
    if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>())
      cfg.system.max_concurrent_jobs_per_user = *v;
    if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>())
      cfg.system.http_concurrency_throttle = *v;
    if (auto v = tbl["system"]["queue_depth_max"].value<int>())
      cfg.system.queue_depth_max = *v;
    if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>())
      cfg.system.bulk_dequeue_size = *v;
    if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>())
      cfg.system.ipc_timeout_ms = *v;
    if (auto v = tbl["system"]["max_ipc_connections"].value<int>())
      cfg.system.max_ipc_connections = *v;
    if (auto v = tbl["system"]["buffer_size_max"].value<int>())
      cfg.system.buffer_size_max = *v;

  } catch (const toml::parse_error &err) {
    // parse_file throws before any key is read, so cfg is still all-defaults.
    std::cerr << "[config] TOML parse error in " << path << ": " << err.what()
              << "\n";
  }
  return cfg;
}
|
||||
|
||||
// ── SerpAPI URL builder ─────────────────────────────────────────────────────
|
||||
|
||||
static std::string build_serpapi_url(const Config &cfg,
|
||||
const SearchOptions &opts, int start) {
|
||||
std::ostringstream url;
|
||||
url << "https://serpapi.com/search.json"
|
||||
<< "?engine=" << url_encode(opts.engine)
|
||||
<< "&q=" << url_encode(opts.query)
|
||||
<< "&api_key=" << url_encode(cfg.serpapi_key)
|
||||
<< "&hl=" << url_encode(opts.hl)
|
||||
<< "&google_domain=" << url_encode(opts.google_domain);
|
||||
|
||||
if (opts.lat != 0 || opts.lng != 0) {
|
||||
char llBuf[128];
|
||||
snprintf(llBuf, sizeof(llBuf), "@%.7f,%.7f,%dz", opts.lat, opts.lng,
|
||||
opts.zoom);
|
||||
url << "&ll=" << url_encode(std::string(llBuf));
|
||||
}
|
||||
|
||||
if (start > 0) {
|
||||
url << "&start=" << start;
|
||||
}
|
||||
|
||||
return url.str();
|
||||
}
|
||||
|
||||
// ── JSON result parser ──────────────────────────────────────────────────────
|
||||
|
||||
/// Append every well-formed object element of `arr` to `out` as a MapResult.
/// Non-array input is a no-op; non-object elements are skipped silently.
/// Every field is copied only when present AND of the expected JSON type,
/// so partial SerpAPI records degrade to the MapResult defaults.
static void parse_results(const rapidjson::Value &arr,
                          std::vector<MapResult> &out) {
  if (!arr.IsArray())
    return;

  for (rapidjson::SizeType i = 0; i < arr.Size(); ++i) {
    const auto &obj = arr[i];
    if (!obj.IsObject())
      continue;

    MapResult r;

    // Capture raw JSON string — the result object re-serialized verbatim,
    // kept so downstream consumers can access fields not modeled here.
    rapidjson::StringBuffer buf;
    rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
    obj.Accept(writer);
    r.raw_json = std::string(buf.GetString(), buf.GetSize());

    // Scalar string fields
    if (obj.HasMember("title") && obj["title"].IsString())
      r.title = obj["title"].GetString();
    if (obj.HasMember("place_id") && obj["place_id"].IsString())
      r.place_id = obj["place_id"].GetString();
    if (obj.HasMember("data_id") && obj["data_id"].IsString())
      r.data_id = obj["data_id"].GetString();
    if (obj.HasMember("address") && obj["address"].IsString())
      r.address = obj["address"].GetString();
    if (obj.HasMember("phone") && obj["phone"].IsString())
      r.phone = obj["phone"].GetString();
    if (obj.HasMember("website") && obj["website"].IsString())
      r.website = obj["website"].GetString();
    if (obj.HasMember("type") && obj["type"].IsString())
      r.type = obj["type"].GetString();
    // Numeric fields — rating accepts any number, reviews only exact ints
    if (obj.HasMember("rating") && obj["rating"].IsNumber())
      r.rating = obj["rating"].GetDouble();
    if (obj.HasMember("reviews") && obj["reviews"].IsInt())
      r.reviews = obj["reviews"].GetInt();
    if (obj.HasMember("thumbnail") && obj["thumbnail"].IsString())
      r.thumbnail = obj["thumbnail"].GetString();

    // Nested gps_coordinates object → GpsCoordinates
    if (obj.HasMember("gps_coordinates") && obj["gps_coordinates"].IsObject()) {
      const auto &gps = obj["gps_coordinates"];
      if (gps.HasMember("latitude") && gps["latitude"].IsNumber())
        r.gps.lat = gps["latitude"].GetDouble();
      if (gps.HasMember("longitude") && gps["longitude"].IsNumber())
        r.gps.lng = gps["longitude"].GetDouble();
    }

    // Category list — non-string entries are dropped
    if (obj.HasMember("types") && obj["types"].IsArray()) {
      for (rapidjson::SizeType j = 0; j < obj["types"].Size(); ++j) {
        if (obj["types"][j].IsString())
          r.types.push_back(obj["types"][j].GetString());
      }
    }

    out.push_back(std::move(r));
  }
}
|
||||
|
||||
// ── Main search function ────────────────────────────────────────────────────
|
||||
|
||||
/// Run a paginated SerpAPI Google Maps search.
/// Pages of PAGE_SIZE (20) are fetched until opts.limit results are
/// collected, a page comes back empty or partial (last page), or an
/// HTTP / JSON error occurs. On error, `result.error` is set and whatever
/// results were gathered so far are returned. The final list is trimmed to
/// opts.limit.
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts) {
  SearchResult result;

  // Preconditions: a key and a non-empty query are required.
  if (cfg.serpapi_key.empty()) {
    result.error = "No SerpAPI key configured";
    return result;
  }

  if (opts.query.empty()) {
    result.error = "Empty search query";
    return result;
  }

  // SerpAPI returns up to 20 local results per page; `start` is the offset.
  const int PAGE_SIZE = 20;
  int start = 0;

  while (static_cast<int>(result.results.size()) < opts.limit) {
    std::string url = build_serpapi_url(cfg, opts, start);
    auto resp = http::get(url);
    result.apiCalls++;  // counted even for failed requests

    if (resp.status_code != 200) {
      result.error = "SerpAPI HTTP " + std::to_string(resp.status_code);
      break;
    }

    rapidjson::Document doc;
    doc.Parse(resp.body.c_str());
    if (doc.HasParseError()) {
      result.error = "Failed to parse SerpAPI response";
      break;
    }

    // Track how many results this page contributes.
    size_t beforeCount = result.results.size();

    // local_results (main listing)
    if (doc.HasMember("local_results") && doc["local_results"].IsArray()) {
      parse_results(doc["local_results"], result.results);
    }

    // place_results (single result or array) — a lone object is wrapped in a
    // temporary array so parse_results can handle both shapes uniformly.
    if (doc.HasMember("place_results")) {
      if (doc["place_results"].IsArray()) {
        parse_results(doc["place_results"], result.results);
      } else if (doc["place_results"].IsObject()) {
        rapidjson::Document arr;
        arr.SetArray();
        arr.PushBack(rapidjson::Value(doc["place_results"], arr.GetAllocator()),
                     arr.GetAllocator());
        parse_results(arr, result.results);
      }
    }

    size_t pageCount = result.results.size() - beforeCount;

    if (pageCount == 0)
      break; // No more results
    if (static_cast<int>(pageCount) < PAGE_SIZE)
      break; // Last page (partial)

    start += PAGE_SIZE;
  }

  // Trim to limit — the last full page may have overshot opts.limit.
  if (static_cast<int>(result.results.size()) > opts.limit) {
    result.results.resize(opts.limit);
  }

  return result;
}
|
||||
|
||||
// ── Geo enrichment ──────────────────────────────────────────────────────────
|
||||
|
||||
/// Reverse-geocode a WGS84 point via the BigDataCloud API.
/// @param key API key; an empty key short-circuits to "{}".
/// @param timeout_ms HTTP timeout forwarded to http::get.
/// @return The raw JSON response body on HTTP 200, otherwise "{}".
std::string resolve_geo(double lat, double lng, const std::string &key,
                        int timeout_ms) {
  if (key.empty())
    return "{}";
  char url[512];
  // NOTE(review): key is interpolated unencoded — assumes API keys are
  // URL-safe tokens; confirm against the provider's key format.
  int written = snprintf(
      url, sizeof(url),
      "https://api.bigdatacloud.net/data/"
      "reverse-geocode?latitude=%.7f&longitude=%.7f&localityLanguage=en&key=%s",
      lat, lng, key.c_str());
  // An oversized key would silently truncate the URL and send a request with
  // a mangled key; treat truncation (or encoding failure) as a lookup miss.
  if (written < 0 || written >= static_cast<int>(sizeof(url)))
    return "{}";

  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  auto resp = http::get(url, opts);
  if (resp.status_code == 200 && !resp.body.empty()) {
    return resp.body;
  }
  return "{}";
}
|
||||
|
||||
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
|
||||
int concurrency, int timeout_ms) {
|
||||
if (key.empty() || results.empty())
|
||||
return;
|
||||
|
||||
std::atomic<size_t> current_idx{0};
|
||||
std::vector<std::thread> threads;
|
||||
|
||||
int num_threads =
|
||||
std::min<int>(concurrency, static_cast<int>(results.size()));
|
||||
|
||||
for (int i = 0; i < num_threads; ++i) {
|
||||
threads.emplace_back([&]() {
|
||||
while (true) {
|
||||
size_t idx = current_idx.fetch_add(1);
|
||||
if (idx >= results.size())
|
||||
break;
|
||||
|
||||
auto &r = results[idx];
|
||||
if (r.gps.lat != 0 || r.gps.lng != 0) {
|
||||
r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
for (auto &t : threads) {
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace search
|
||||
320
packages/kbot/cpp/polymech.md
Normal file
320
packages/kbot/cpp/polymech.md
Normal file
@ -0,0 +1,320 @@
|
||||
# Polymech C++ Gridsearch Worker — Design
|
||||
|
||||
## Goal
|
||||
|
||||
Port the [gridsearch-worker.ts](../src/products/locations/gridsearch-worker.ts) pipeline to native C++, running as a **CLI subcommand** (`polymech-cli gridsearch`) while keeping all logic in internal libraries under `packages/`. The worker communicates progress via the [IPC framing protocol](./packages/ipc/) and writes results to Supabase via the existing [postgres](./packages/postgres/) package.
|
||||
|
||||
---
|
||||
|
||||
## Status
|
||||
|
||||
| Package | Status | Tests | Assertions |
|
||||
|---------|--------|-------|------------|
|
||||
| `geo` | ✅ Done | 23 | 77 |
|
||||
| `gadm_reader` | ✅ Done | 18 | 53 |
|
||||
| `grid` | ✅ Done | 13 | 105 |
|
||||
| `search` | ✅ Done | 8 | 13 |
|
||||
| CLI `gridsearch` | ✅ Done | — | dry-run verified (3ms) |
|
||||
| IPC `gridsearch` | ✅ Done | 1 | 30 |
|
||||
| **Total** | | **63** | **278** |
|
||||
|
||||
---
|
||||
|
||||
## Existing C++ Inventory
|
||||
|
||||
| Package | Provides |
|
||||
|---------|----------|
|
||||
| `ipc` | Length-prefixed JSON over stdio |
|
||||
| `postgres` | Supabase PostgREST: `query`, `insert` |
|
||||
| `http` | libcurl `GET`/`POST` |
|
||||
| `json` | RapidJSON validate/prettify |
|
||||
| `logger` | spdlog (stdout or **stderr** in worker mode) |
|
||||
| `html` | HTML parser |
|
||||
|
||||
---
|
||||
|
||||
## TypeScript Pipeline (Reference)
|
||||
|
||||
```
|
||||
GADM Resolve → Grid Generate → SerpAPI Search → Enrich → Supabase Upsert
|
||||
```
|
||||
|
||||
| Phase | Input | Output | Heavy work |
|
||||
|-------|-------|--------|------------|
|
||||
| **1. GADM Resolve** | GID list + target level | `GridFeature[]` (GeoJSON polygons with GHS props) | Read pre-cached JSON files from `cache/gadm/boundary_{GID}_{LEVEL}.json` |
|
||||
| **2. Grid Generate** | `GridFeature[]` + settings | `GridSearchHop[]` (waypoints: lat/lng/radius) | Centroid, bbox, distance, area, point-in-polygon, cell sorting |
|
||||
| **3. Search** | Waypoints + query + SerpAPI key | Place results (JSON) | HTTP calls to `serpapi.com`, per-waypoint caching |
|
||||
| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping |
|
||||
| **5. Persist** | Enriched places | Supabase `places` + `grid_search_runs` | PostgREST upsert |
|
||||
|
||||
---
|
||||
|
||||
## Implemented Packages
|
||||
|
||||
### 1. `packages/geo` — Geometry primitives ✅
|
||||
|
||||
Header + `.cpp`, no external deps. Implements the **turf.js subset** used by the grid generator.
|
||||
|
||||
```cpp
|
||||
namespace geo {
|
||||
|
||||
struct Coord { double lon, lat; };
|
||||
struct BBox { double minLon, minLat, maxLon, maxLat; };
|
||||
|
||||
BBox bbox(const std::vector<Coord>& ring);
|
||||
Coord centroid(const std::vector<Coord>& ring);
|
||||
double area_sq_m(const std::vector<Coord>& ring);
|
||||
double distance_km(Coord a, Coord b);
|
||||
bool point_in_polygon(Coord pt, const std::vector<Coord>& ring);
|
||||
|
||||
std::vector<BBox> square_grid(BBox extent, double cellSizeKm);
|
||||
std::vector<BBox> hex_grid(BBox extent, double cellSizeKm);
|
||||
std::vector<Coord> buffer_circle(Coord center, double radiusKm, int steps = 6);
|
||||
} // namespace geo
|
||||
```
|
||||
|
||||
**Rationale**: ~200 lines avoids pulling GEOS/Boost.Geometry. Adopts `pip.h` ray-casting pattern from `packages/gadm/cpp/` without the GDAL/GEOS/PROJ dependency (~700MB).
|
||||
|
||||
---
|
||||
|
||||
### 2. `packages/gadm_reader` — Boundary resolver ✅
|
||||
|
||||
Reads pre-cached GADM boundary JSON from disk. No network calls.
|
||||
|
||||
```cpp
|
||||
namespace gadm {
|
||||
|
||||
struct Feature {
|
||||
std::string gid, name;
|
||||
int level;
|
||||
std::vector<std::vector<geo::Coord>> rings;
|
||||
double ghsPopulation, ghsBuiltWeight;
|
||||
geo::Coord ghsPopCenter, ghsBuiltCenter;
|
||||
std::vector<std::array<double, 3>> ghsPopCenters; // [lon, lat, weight]
|
||||
std::vector<std::array<double, 3>> ghsBuiltCenters;
|
||||
double areaSqKm;
|
||||
};
|
||||
|
||||
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
|
||||
const std::string& cacheDir = "cache/gadm");
|
||||
} // namespace gadm
|
||||
```
|
||||
|
||||
Handles `Polygon`/`MultiPolygon`, GHS enrichment fields, fallback resolution by country code prefix.
|
||||
|
||||
---
|
||||
|
||||
### 3. `packages/grid` — Grid generator ✅
|
||||
|
||||
Direct port of [grid-generator.ts](../../shared/src/products/places/grid-generator.ts).
|
||||
|
||||
```cpp
|
||||
namespace grid {
|
||||
|
||||
struct Waypoint { int step; double lng, lat, radius_km; };
|
||||
struct GridOptions {
|
||||
std::string gridMode; // "hex", "square", "admin", "centers"
|
||||
double cellSize; // km
|
||||
double cellOverlap, centroidOverlap;
|
||||
int maxCellsLimit;
|
||||
double maxElevation, minDensity, minGhsPop, minGhsBuilt;
|
||||
std::string ghsFilterMode; // "AND" | "OR"
|
||||
bool allowMissingGhs, bypassFilters;
|
||||
std::string pathOrder; // "zigzag", "snake", "spiral-out", "spiral-in", "shortest"
|
||||
bool groupByRegion;
|
||||
};
|
||||
struct GridResult { std::vector<Waypoint> waypoints; int validCells, skippedCells; std::string error; };
|
||||
|
||||
GridResult generate(const std::vector<gadm::Feature>& features, const GridOptions& opts);
|
||||
} // namespace grid
|
||||
```
|
||||
|
||||
**4 modes**: `admin` (centroid + radius), `centers` (GHS deduplicated), `hex`, `square` (tessellation + PIP)
|
||||
**5 sort algorithms**: `zigzag`, `snake`, `spiral-out`, `spiral-in`, `shortest` (greedy NN)
|
||||
|
||||
---
|
||||
|
||||
### 4. `packages/search` — SerpAPI client + config ✅
|
||||
|
||||
```cpp
|
||||
namespace search {
|
||||
|
||||
struct Config {
|
||||
std::string serpapi_key, geocoder_key, bigdata_key;
|
||||
std::string postgres_url, supabase_url, supabase_service_key;
|
||||
};
|
||||
|
||||
Config load_config(const std::string& path = "config/postgres.toml");
|
||||
|
||||
struct SearchOptions {
|
||||
std::string query;
|
||||
double lat, lng;
|
||||
int zoom = 13, limit = 20;
|
||||
std::string engine = "google_maps", hl = "en", google_domain = "google.com";
|
||||
};
|
||||
|
||||
struct MapResult {
|
||||
std::string title, place_id, data_id, address, phone, website, type;
|
||||
std::vector<std::string> types;
|
||||
double rating; int reviews;
|
||||
GpsCoordinates gps;
|
||||
};
|
||||
|
||||
SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts);
|
||||
} // namespace search
|
||||
```
|
||||
|
||||
Reads `[services].SERPAPI_KEY`, `GEO_CODER_KEY`, `BIG_DATA_KEY` from `config/postgres.toml`. HTTP pagination via `http::get()`, JSON parsing with RapidJSON.
|
||||
|
||||
---
|
||||
|
||||
## CLI Subcommands ✅
|
||||
|
||||
### 1. `gridsearch` (One-shot execution)
|
||||
|
||||
```
|
||||
polymech-cli gridsearch <GID> <QUERY> [OPTIONS]
|
||||
|
||||
Positionals:
|
||||
GID GADM GID (e.g. ESP.1.1_1) — ignored when --settings is used
|
||||
QUERY Search query — ignored when --settings is used
|
||||
|
||||
Options:
|
||||
-l, --level INT Target GADM level (default: 0)
|
||||
-m, --mode TEXT Grid mode: hex|square|admin|centers (default: hex)
|
||||
-s, --cell-size FLOAT Cell size in km (default: 5.0)
|
||||
--limit INT Max results per area (default: 20)
|
||||
-z, --zoom INT Google Maps zoom (default: 13)
|
||||
--sort TEXT Path order: snake|zigzag|spiral-out|spiral-in|shortest
|
||||
-c, --config TEXT TOML config path (default: config/postgres.toml)
|
||||
--cache-dir TEXT GADM cache directory (default: cache/gadm)
|
||||
--settings TEXT JSON settings file (matches TypeScript GuidedPreset shape)
|
||||
--enrich Run enrichment pipeline (meta + email) after search
|
||||
--persistence-postgres Persist run data natively via Postgres
|
||||
-o, --output TEXT Output JSON file (default: gridsearch-HH-MM.json in cwd)
|
||||
--dry-run Generate grid only, skip SerpAPI search
|
||||
```
|
||||
|
||||
### 2. `worker` (IPC Daemon execution)
|
||||
|
||||
```
|
||||
polymech-cli worker [OPTIONS]
|
||||
|
||||
Options:
|
||||
--daemon Run persistent daemon pool (tier-based)
|
||||
-c, --config TEXT TOML config path (default: config/postgres.toml)
|
||||
--user-uid TEXT User ID to bind this daemon to (needed for place owner)
|
||||
--uds TEXT Run over Unix Domain Socket / Named Pipe (TCP on Windows) at the given path
|
||||
```
|
||||
|
||||
### Execution flow
|
||||
|
||||
```
|
||||
1. load_config(configPath) → Config (TOML)
|
||||
2. gadm::load_boundary(gid, level) → features[]
|
||||
3. grid::generate(features, opts) → waypoints[]
|
||||
4. --dry-run → output JSON array and exit
|
||||
5. For each waypoint → search::search_google_maps(cfg, sopts)
|
||||
6. Stream JSON summary to stdout
|
||||
```
|
||||
|
||||
### Example
|
||||
|
||||
```bash
|
||||
polymech-cli gridsearch ABW "recycling" --dry-run
|
||||
# → [{"step":1,"lat":12.588582,"lng":-70.040465,"radius_km":3.540}, ...]
|
||||
# [info] Dry-run complete in 3ms
|
||||
```
|
||||
|
||||
### IPC worker mode
|
||||
|
||||
The `worker` subcommand routes multiplexed asynchronous `gridsearch` payloads. When launched via `--uds <path>`, it provisions an Asio streaming server (AF_UNIX sockets on POSIX, TCP sockets on Windows). Event frames (`grid-ready`, `waypoint-start`, `location`, `node`, etc.) flow bi-directionally over the IPC bridging protocol without coarse-grained locking.
|
||||
|
||||
---
|
||||
|
||||
## Exposed Configuration / Tuning Parameters
|
||||
|
||||
As we integrate deeper with the core business logic, the Node orchestrator and internal services should configure and enforce limits on the underlying C++ concurrent engine. Relevant configuration surfaces we need to expose for the primary ecosystem libraries include:
|
||||
|
||||
### 1. Taskflow (`https://github.com/taskflow/taskflow`)
|
||||
- **`executor_threads` (`num_workers`)**: The size of the `tf::Executor` thread pool. As Gridsearch is heavily I/O network bound (HTTP calls for search/enrichment), setting this significantly higher than `std::thread::hardware_concurrency()` may aggressively improve HTTP ingestion throughput globally.
|
||||
- **`max_concurrent_jobs_per_user`**: A structural limit dictating how many concurrent gridsearch invocation graphs a single tenant/user can enqueue and run actively to prevent monopolization.
|
||||
- **`http_concurrency_throttle`**: Task limits enforced upon node scraping or SerpAPI requests per-pipeline graph to avoid widespread `429 Too Many Requests` bans.
|
||||
|
||||
### 2. Moodycamel ConcurrentQueue (`https://github.com/cameron314/concurrentqueue`)
|
||||
- **`queue_depth_max` / `backpressure`**: Since Moodycamel queue memory allocates dynamically and lock-free to any capacity, we must mandate a hard software ceiling/backpressure limit over the Node-to-C++ IPC layer. If Node blindly streams jobs faster than Taskflow can execute them, the daemon will eventually OOM.
|
||||
- **`bulk_dequeue_size`**: A tuning parameter for the dispatch thread controlling how many queued IPC tasks are dequeued in a single bulk operation.
|
||||
|
||||
### 3. Boost.Asio (`https://github.com/chriskohlhoff/asio`)
|
||||
- **`ipc_timeout_ms` (Read/Write)**: Mandatory timeouts for the IPC socket layer. If the orchestrator stalls, crashes, or goes silent, Asio must reap the connection and automatically GC the in-flight tasks to prevent Zombie worker processes.
|
||||
- **`max_ipc_connections`**: Absolute limit on simultaneous orchestration pipelines dialing into a single Worker Pod.
|
||||
- **`buffer_size_max`**: Soft constraints on async payload allocations so that a malformed 200MB JSON frame from Node.js cannot cause a sudden memory spike inside the `asio::read` operations.
|
||||
|
||||
---
|
||||
|
||||
## Build Integration
|
||||
|
||||
### Dependency graph
|
||||
|
||||
```
|
||||
┌──────────┐
|
||||
│ polymech │ (the lib)
|
||||
│ -cli │ (the binary)
|
||||
└────┬─────┘
|
||||
┌────────────┼────────────────┐
|
||||
▼ ▼ ▼
|
||||
┌──────────┐ ┌──────────┐ ┌──────────┐
|
||||
│ search │ │ grid │ │ ipc │
|
||||
└────┬─────┘ └────┬─────┘ └──────────┘
|
||||
│ │
|
||||
▼ ▼
|
||||
┌──────────┐ ┌───────────────┐
|
||||
│ http │ │ gadm_reader │
|
||||
└──────────┘ └────┬──────────┘
|
||||
▼
|
||||
┌──────────┐
|
||||
│ geo │ ← no deps (math only)
|
||||
└──────────┘
|
||||
┌──────────┐
|
||||
│ json │ ← RapidJSON
|
||||
└──────────┘
|
||||
```
|
||||
|
||||
All packages depend on `logger` and `json` implicitly.
|
||||
|
||||
---
|
||||
|
||||
## Testing
|
||||
|
||||
### Unit tests (Catch2) — 62 tests, 248 assertions ✅
|
||||
|
||||
| Test file | Tests | Assertions | Validates |
|
||||
|-----------|-------|------------|-----------|
|
||||
| `test_geo.cpp` | 23 | 77 | Haversine, area, centroid, PIP, hex/square grid |
|
||||
| `test_gadm_reader.cpp` | 18 | 53 | JSON parsing, GHS props, fallback resolution |
|
||||
| `test_grid.cpp` | 13 | 105 | All 4 modes × 5 sorts, GHS filtering, PIP clipping |
|
||||
| `test_search.cpp` | 8 | 13 | Config loading, key validation, error handling |
|
||||
|
||||
### Integration test (Node.js)
|
||||
|
||||
- Existing `orchestrator/test-ipc.mjs` validates spawn/lifecycle/ping/job
|
||||
- `orchestrator/test-gridsearch-ipc.mjs` validates full pipeline via IPC (8 event types + job result)
|
||||
- `orchestrator/test-gridsearch-ipc-uds.mjs` validates high-throughput Unix Domain Sockets mapping, backpressure boundaries, and soft cancellation injections utilizing `action: cancel` frames mid-flight.
|
||||
|
||||
---
|
||||
|
||||
## IPC Cancellation & Dynamic Job Tuning
|
||||
|
||||
The high-performance UDS daemon now natively tracks and intercepts JSON `action: cancel` frames referencing specific `jobId`s to gracefully exit Taskflow jobs mid-flight.
|
||||
Dynamic tuning limits, such as memory buffering boundaries or threading capacities, are inherently validated and bound by hard ceilings established inside the `[system]` constraint block of `config/postgres.toml`.
|
||||
|
||||
---
|
||||
|
||||
## Deferred (Phase 2)
|
||||
|
||||
| Item | Reason |
|
||||
|------|--------|
|
||||
| SerpAPI response caching | State store managed by orchestrator for now |
|
||||
| Protobuf framing | JSON IPC sufficient for current throughput |
|
||||
| Multi-threaded search | Sequential is fine for SerpAPI rate limits |
|
||||
| GEOS integration | Custom geo is sufficient for grid math |
|
||||
65
packages/kbot/cpp/src/cmd_gridsearch-filters.h
Normal file
65
packages/kbot/cpp/src/cmd_gridsearch-filters.h
Normal file
@ -0,0 +1,65 @@
|
||||
#pragma once
|
||||
|
||||
#include "search/search.h"
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
#include "geo/geo.h"
|
||||
|
||||
#include <functional>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace polymech {

// ── Filter context ──────────────────────────────────────────────────────────
// All runtime data a filter predicate may need. Passed by const-ref so filters
// are pure read-only functions with no side-effects.

// Minimal waypoint snapshot a filter can inspect (position, search radius,
// and the GADM area the waypoint was generated from).
struct WaypointCtx {
  double lat;                     // waypoint latitude (degrees)
  double lng;                     // waypoint longitude (degrees)
  double radius_km;               // search radius around the waypoint
  std::string area_gid; // e.g. "ESP.6.1.10.2_1"
};

// Bundle of read-only references handed to every filter invocation.
// NOTE(review): all members are references — the caller must keep the
// referenced objects alive for the lifetime of the FilterContext.
struct FilterContext {
  const WaypointCtx& waypoint;
  const std::vector<std::string>& filter_types; // must-match list
  const std::vector<std::string>& exclude_types; // deny list
  // Country GID → parsed boundary features, used by filter_country_boundary.
  const std::map<std::string, std::vector<gadm::Feature>>& country_boundaries;
};

// ── Predicate type ──────────────────────────────────────────────────────────
// Returns true → KEEP the result.
// Returns false → DISCARD the result.
using LocationFilter = std::function<bool(const search::MapResult&, const FilterContext&)>;

// ── Individual filters ──────────────────────────────────────────────────────

/// Discard results that have no website (non-actionable leads).
bool filter_requires_website(const search::MapResult& r, const FilterContext& ctx);

/// Discard results whose type matches any entry in ctx.exclude_types.
bool filter_exclude_types(const search::MapResult& r, const FilterContext& ctx);

/// If ctx.filter_types is non-empty, keep only results that match ≥1 type.
bool filter_match_types(const search::MapResult& r, const FilterContext& ctx);

/// Keep only results inside the country-level boundary polygon (L0) of the
/// waypoint's country. Falls back to radius-based overlap (1.5 × radius_km)
/// to gracefully handle legitimate border-proximity results.
bool filter_country_boundary(const search::MapResult& r, const FilterContext& ctx);

// ── Filter set builder ──────────────────────────────────────────────────────

/// Return the ordered list of default filters applied to every SerpAPI batch.
/// Filters are evaluated left-to-right; the first false short-circuits.
std::vector<LocationFilter> default_location_filters();

/// Run `filters` against `result`. Returns true (keep) only if every
/// filter passes.
bool apply_filters(const search::MapResult& result,
                   const FilterContext& ctx,
                   const std::vector<LocationFilter>& filters);

} // namespace polymech
|
||||
28
packages/kbot/cpp/src/cmd_gridsearch-postgres.h
Normal file
28
packages/kbot/cpp/src/cmd_gridsearch-postgres.h
Normal file
@ -0,0 +1,28 @@
|
||||
#pragma once
|
||||
|
||||
#include "cmd_gridsearch.h"
|
||||
#include "search/search.h"
|
||||
#include "enrichers/enrichers.h"
|
||||
#include <set>
|
||||
|
||||
namespace polymech {

// Persists gridsearch run state to Postgres/Supabase.
// When `enabled` is false, methods are presumably no-ops — implementation is
// not visible in this header; confirm in cmd_gridsearch-postgres.cpp.
struct PostgresStateStore {
  std::string run_id;    // identifier of the current run row
  std::string user_id;   // owning user for the run
  std::string parent_id; // optional: parent run ID for expand jobs
  bool enabled = false;  // gate: only persist when persistence is requested

  // Create/register the run from the pipeline options.
  void init_run(const PipelineOptions &opts);
  // Update the run's status string (e.g. progress state).
  void update_status(const std::string &status);
  // Mark the run complete, storing the serialized result JSON.
  void complete_run(const std::string &result_json);
  // Mark the run failed with the given error message.
  void fail_run(const std::string &error_msg);
  // Insert-or-update the discovered places for this run.
  void upsert_places(const std::vector<search::MapResult> &places);
  // Attach enrichment data to an already-stored place.
  void update_place_enrichment(const enrichers::EnrichedNode &enode);

  /// Query places table in chunks to find place_ids that already have meta (enriched).
  /// Returns set of place_ids that should be skipped during enrichment.
  std::set<std::string> filter_already_enriched(const std::vector<std::string> &place_ids);
};

} // namespace polymech
|
||||
88
packages/kbot/cpp/src/cmd_gridsearch.h
Normal file
88
packages/kbot/cpp/src/cmd_gridsearch.h
Normal file
@ -0,0 +1,88 @@
|
||||
#pragma once
|
||||
|
||||
#include <CLI/CLI.hpp>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include <atomic>
|
||||
#include "search/search.h"
|
||||
#include "grid/grid.h"
|
||||
#include <vector>
|
||||
|
||||
namespace polymech {
|
||||
|
||||
std::string json_escape(const std::string &s);
|
||||
|
||||
struct AreaDef {
|
||||
std::string gid;
|
||||
std::string name;
|
||||
int level;
|
||||
};
|
||||
|
||||
struct AccumulatedResult {
|
||||
search::MapResult result;
|
||||
std::string grid_area;
|
||||
std::string grid_gid;
|
||||
};
|
||||
|
||||
struct PipelineOptions {
|
||||
std::vector<AreaDef> areas;
|
||||
grid::GridOptions grid_opts;
|
||||
std::string search_query;
|
||||
std::string search_domain = "google.com";
|
||||
std::string search_language = "en";
|
||||
std::string search_country;
|
||||
int search_limit = 20;
|
||||
int search_zoom = 13;
|
||||
bool dry_run = false;
|
||||
bool enrich = false;
|
||||
std::string config_path = "config/postgres.toml";
|
||||
std::string cache_dir = "cache/gadm";
|
||||
bool persistence_postgres = false;
|
||||
bool daemon_mode = false;
|
||||
std::string job_id;
|
||||
std::string default_user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
|
||||
search::SystemTuningOptions tuning;
|
||||
std::shared_ptr<std::atomic<bool>> cancel_token;
|
||||
std::vector<std::string> filter_types; // if non-empty, only locations matching ≥1 type pass
|
||||
std::vector<std::string> exclude_types; // if non-empty, drop locations matching any
|
||||
bool no_cache = false; // skip pre-enrich dedup — force re-enrichment
|
||||
std::string parent_id; // if set, this run is an "expand" child of another run
|
||||
};
|
||||
|
||||
std::string json_escape(const std::string &s);
|
||||
|
||||
/// Optional callbacks for streaming progress events (used in IPC mode).
|
||||
/// When nullptr / empty, the pipeline runs silently (CLI mode).
|
||||
struct GridsearchCallbacks {
|
||||
/// Emit a progress event. `type` is one of:
|
||||
/// grid-ready, waypoint-start, area, location,
|
||||
/// enrich-start, node, node-error, nodePage
|
||||
/// `json` is the raw JSON payload string.
|
||||
std::function<void(const std::string& type, const std::string& json)> onEvent;
|
||||
};
|
||||
|
||||
CLI::App* setup_cmd_gridsearch(CLI::App& app);
|
||||
|
||||
/// CLI entry point (standalone mode — reads static vars set by CLI11).
|
||||
int run_cmd_gridsearch();
|
||||
|
||||
/// IPC entry point — parse `payload` JSON, run the pipeline, emit events via `cb`.
|
||||
/// Returns 0 on success.
|
||||
int run_cmd_gridsearch_ipc(const std::string& payload,
|
||||
const std::string& jobId,
|
||||
const GridsearchCallbacks& cb,
|
||||
bool daemon_mode = false,
|
||||
const std::string& daemon_uid = "");
|
||||
|
||||
/// Core Pipeline
|
||||
int run_pipeline(const PipelineOptions &opts, std::ostream *file_out,
|
||||
const GridsearchCallbacks &cb);
|
||||
|
||||
/// UDS entry point — starts a persistent AF_UNIX / Named Pipe server that processes
|
||||
/// concurrent jobs using Moodycamel ConcurrentQueue and Taskflow executor.
|
||||
int run_cmd_gridsearch_uds(const std::string& pipe_path,
|
||||
bool daemon_mode,
|
||||
const std::string& daemon_uid);
|
||||
|
||||
} // namespace polymech
|
||||
60
packages/kbot/cpp/src/gridsearch_serialize.h
Normal file
60
packages/kbot/cpp/src/gridsearch_serialize.h
Normal file
@ -0,0 +1,60 @@
|
||||
#pragma once
|
||||
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "grid/grid.h"
|
||||
#include "search/search.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
// Forward declaration only — full definition lives in cmd_gridsearch.h;
// serialize functions take it by const-ref so the header stays light.
namespace polymech {
struct PipelineOptions;
}

// JSON payload builders for the gridsearch event stream. Each function
// returns the raw JSON string for one event type emitted over IPC.
namespace polymech::serialize {

/// grid-ready event payload
std::string grid_ready(const std::vector<grid::Waypoint>& waypoints);

/// waypoint-start event payload
std::string waypoint_start(const grid::Waypoint& wp, int index, int total);

/// location event payload (per search result)
std::string location(const search::MapResult& r, int step);

/// waypoint-finish event payload (waypoint done)
std::string waypoint_finish(const grid::Waypoint& wp, int results, int apiCalls);

/// area-start event payload
std::string area_start(const std::string& area_gid, const std::string& area_name);

/// area-finish event payload
std::string area_finish(const std::string& area_gid);

/// enrich-start event payload
std::string enrich_start(int locationCount);

/// nodePage event payload (per page error)
std::string node_page(const enrichers::PageError& pe, const std::string& placeId);

/// node-error event payload
std::string node_error(const enrichers::EnrichedNode& node);

/// node event payload (enriched location)
std::string node(const enrichers::EnrichedNode& node);

/// job_result summary (with enrichment)
std::string job_result(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t enrichMs, int64_t totalMs,
                       int totalEmails, int totalPagesScraped, int freshApiCalls,
                       int waypointCount, int validCells, int skippedCells,
                       int totalResults, const std::vector<std::string>& enrichResults,
                       double totalScannedSqKm, double totalPopulation);

/// job_result summary (search only, no enrichment)
std::string job_result_search_only(const polymech::PipelineOptions& opts, int64_t enumMs, int64_t searchMs, int64_t totalMs,
                                   int freshApiCalls, int waypointCount, int validCells,
                                   int skippedCells, int totalResults, const std::vector<std::string>& enrichResults,
                                   double totalScannedSqKm, double totalPopulation);

} // namespace polymech::serialize
|
||||
269
packages/kbot/cpp/src/main.cpp
Normal file
269
packages/kbot/cpp/src/main.cpp
Normal file
@ -0,0 +1,269 @@
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
#include <chrono>
|
||||
#include <set>
|
||||
#include <ctime>
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <rapidjson/document.h>
|
||||
|
||||
#include <CLI/CLI.hpp>
|
||||
#include <toml++/toml.hpp>
|
||||
|
||||
#include "html/html.h"
|
||||
#include "http/http.h"
|
||||
#include "ipc/ipc.h"
|
||||
#include "logger/logger.h"
|
||||
#include "postgres/postgres.h"
|
||||
#include "json/json.h"
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
#include "grid/grid.h"
|
||||
#include "search/search.h"
|
||||
#include "enrichers/enrichers.h"
|
||||
#include "cmd_gridsearch.h"
|
||||
|
||||
#ifndef PROJECT_VERSION
|
||||
#define PROJECT_VERSION "0.1.0"
|
||||
#endif
|
||||
|
||||
// Entry point: builds the CLI11 command tree, parses argv, then dispatches
// to exactly one subcommand handler. Worker mode runs a blocking IPC loop
// over stdin/stdout; all other subcommands execute once and return.
int main(int argc, char *argv[]) {
  CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"};
  app.set_version_flag("-v,--version", PROJECT_VERSION);

  // Global option shared by every subcommand.
  std::string log_level = "info";
  app.add_option("--log-level", log_level, "Set log level (debug/info/warn/error)")->default_val("info");

  // Subcommand: parse HTML
  std::string html_input;
  auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements");
  parse_cmd->add_option("html", html_input, "HTML string to parse")->required();

  // Subcommand: select from HTML
  std::string select_input;
  std::string selector;
  auto *select_cmd =
      app.add_subcommand("select", "CSS-select elements from HTML");
  select_cmd->add_option("html", select_input, "HTML string")->required();
  select_cmd->add_option("selector", selector, "CSS selector")->required();

  // Subcommand: config — read a TOML file
  std::string config_path;
  auto *config_cmd =
      app.add_subcommand("config", "Read and display a TOML config file");
  config_cmd->add_option("file", config_path, "Path to TOML file")->required();

  // Subcommand: fetch — HTTP GET a URL
  std::string fetch_url;
  auto *fetch_cmd =
      app.add_subcommand("fetch", "HTTP GET a URL and print the response");
  fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required();

  // Subcommand: json — prettify JSON
  std::string json_input;
  auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string");
  json_cmd->add_option("input", json_input, "JSON string")->required();

  // Subcommand: db — connect to Supabase and query
  std::string db_config_path = "config/postgres.toml";
  std::string db_table;
  int db_limit = 10;
  auto *db_cmd =
      app.add_subcommand("db", "Connect to Supabase and query a table");
  db_cmd->add_option("-c,--config", db_config_path, "TOML config path")
      ->default_val("config/postgres.toml");
  db_cmd->add_option("table", db_table, "Table to query (optional)");
  db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);

  // Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
  bool daemon_mode = false;
  std::string daemon_uid;
  std::string worker_config = "config/postgres.toml";
  std::string uds_path;

  auto *worker_cmd = app.add_subcommand(
      "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
  worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)");
  worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml");
  worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)");
  worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path");

  // Subcommand: gridsearch — Run a full gridsearch pipeline
  auto* gs_cmd = polymech::setup_cmd_gridsearch(app);

  CLI11_PARSE(app, argc, argv);

  // Worker mode uses stderr for logs to keep stdout clean for IPC frames
  if (worker_cmd->parsed()) {
    if (!uds_path.empty()) {
      // NOTE(review): log path is relative to the CWD ("../logs/uds.json") —
      // confirm the daemon is always launched from a directory where this resolves.
      logger::init_uds("polymech-uds", log_level, "../logs/uds.json");
    } else {
      logger::init_stderr("polymech-worker", log_level);
    }
  } else {
    logger::init("polymech-cli", log_level);
  }

  // ── worker mode ─────────────────────────────────────────────────────────
  if (worker_cmd->parsed()) {
    logger::info("Worker mode: listening on stdin");

    // Daemon mode pre-warms the Postgres connection so the first job does
    // not pay the connection cost.
    if (daemon_mode) {
      logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid));
      auto cfg = search::load_config(worker_config);
      postgres::Config pcfg;
      pcfg.supabase_url = cfg.supabase_url;
      pcfg.supabase_key = cfg.supabase_service_key;
      postgres::init(pcfg);
    }

    // UDS mode hands control to the socket server and never reaches the
    // stdin loop below.
    if (!uds_path.empty()) {
      logger::info("Worker mode: UDS Server active on " + uds_path);
      int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid);
      return rc;
    }

    // Send a "ready" message so the orchestrator knows we're alive
    ipc::write_message({"0", "ready", "{}"});

    // Blocking request/response loop: one IPC message in, one or more out.
    // Exits when stdin closes or a "shutdown" message arrives.
    while (true) {
      ipc::Message req;
      if (!ipc::read_message(req)) {
        logger::info("Worker: stdin closed, exiting");
        break;
      }

      logger::debug("Worker recv: type=" + req.type + " id=" + req.id);

      if (req.type == "ping") {
        ipc::write_message({req.id, "pong", "{}"});

      } else if (req.type == "gridsearch") {
        logger::info("Worker: gridsearch job received");

        // Build callbacks that emit IPC events.
        // Progress events use id "0" (unmatched → event for orchestrator).
        // The final job_result uses the original req.id so the promise resolves.
        std::string req_id = req.id;
        polymech::GridsearchCallbacks cb;
        cb.onEvent = [&req_id](const std::string& type, const std::string& json) {
          if (type == "job_result") {
            ipc::write_message({req_id, "job_result", json});
          } else {
            ipc::write_message({"0", type, json});
          }
        };

        int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid);
        if (rc != 0) {
          ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"});
        }

      } else if (req.type == "job") {
        // Stub: echo the payload back as job_result
        ipc::write_message({req.id, "job_result", req.payload});

      } else if (req.type == "shutdown") {
        ipc::write_message({req.id, "shutdown_ack", "{}"});
        logger::info("Worker: shutdown requested, exiting");
        break;

      } else {
        // Unknown type — respond with error
        ipc::write_message(
            {req.id, "error",
             "{\"message\":\"unknown type: " + req.type + "\"}"});
      }
    }

    return 0;
  }

  // ── existing subcommands ────────────────────────────────────────────────
  if (parse_cmd->parsed()) {
    auto elements = html::parse(html_input);
    logger::info("Parsed " + std::to_string(elements.size()) + " elements");
    for (const auto &el : elements) {
      std::cout << "<" << el.tag << "> " << el.text << "\n";
    }
    return 0;
  }

  if (select_cmd->parsed()) {
    auto matches = html::select(select_input, selector);
    logger::info("Matched " + std::to_string(matches.size()) + " elements");
    for (const auto &m : matches) {
      std::cout << m << "\n";
    }
    return 0;
  }

  if (config_cmd->parsed()) {
    try {
      auto tbl = toml::parse_file(config_path);
      logger::info("Loaded config: " + config_path);
      std::cout << tbl << "\n";
    } catch (const toml::parse_error &err) {
      logger::error("TOML parse error: " + std::string(err.what()));
      return 1;
    }
    return 0;
  }

  if (fetch_cmd->parsed()) {
    auto resp = http::get(fetch_url);
    logger::info("HTTP " + std::to_string(resp.status_code) + " from " +
                 fetch_url);
    // Pretty-print the body only when it is valid JSON.
    if (json::is_valid(resp.body)) {
      std::cout << json::prettify(resp.body) << "\n";
    } else {
      std::cout << resp.body << "\n";
    }
    return 0;
  }

  if (json_cmd->parsed()) {
    if (!json::is_valid(json_input)) {
      logger::error("Invalid JSON input");
      return 1;
    }
    std::cout << json::prettify(json_input) << "\n";
    return 0;
  }

  if (db_cmd->parsed()) {
    try {
      auto cfg = toml::parse_file(db_config_path);
      postgres::Config pg_cfg;
      pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
      pg_cfg.supabase_key =
          cfg["supabase"]["publishable_key"].value_or(std::string(""));
      postgres::init(pg_cfg);

      auto status = postgres::ping();
      logger::info("Supabase: " + status);

      // Table name is optional: without it we only ping the connection.
      if (!db_table.empty()) {
        auto result = postgres::query(db_table, "*", "", db_limit);
        if (json::is_valid(result)) {
          std::cout << json::prettify(result) << "\n";
        } else {
          std::cout << result << "\n";
        }
      }
    } catch (const std::exception &e) {
      logger::error(std::string("db error: ") + e.what());
      return 1;
    }
    return 0;
  }

  // ── gridsearch subcommand ──────────────────────────────────────────────
  if (gs_cmd->parsed()) {
    return polymech::run_cmd_gridsearch();
  }

  // No subcommand — show help
  std::cout << app.help() << "\n";
  return 0;
}
|
||||
8
packages/kbot/cpp/src/sys_metrics.h
Normal file
8
packages/kbot/cpp/src/sys_metrics.h
Normal file
@ -0,0 +1,8 @@
|
||||
#pragma once
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
|
||||
namespace polymech {
// Current resident set size of this process, in megabytes.
size_t get_current_rss_mb();
// Cumulative CPU time consumed by this process, in milliseconds.
uint64_t get_cpu_time_ms();
}
|
||||
80
packages/kbot/cpp/tests/CMakeLists.txt
Normal file
80
packages/kbot/cpp/tests/CMakeLists.txt
Normal file
@ -0,0 +1,80 @@
|
||||
# ── Test targets ──────────────────────────────────────────────────────────────
# One Catch2 executable per package. Tests that read fixtures or config from
# the repo root register with WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}.
include(CTest)
include(Catch)

# pthread is required on Linux for Catch2 tests
find_package(Threads REQUIRED)

# Unit tests — one per package
add_executable(test_logger unit/test_logger.cpp)
target_link_libraries(test_logger PRIVATE Catch2::Catch2WithMain logger Threads::Threads)
catch_discover_tests(test_logger)

add_executable(test_html unit/test_html.cpp)
target_link_libraries(test_html PRIVATE Catch2::Catch2WithMain html Threads::Threads)
catch_discover_tests(test_html)

add_executable(test_postgres unit/test_postgres.cpp)
target_link_libraries(test_postgres PRIVATE Catch2::Catch2WithMain postgres Threads::Threads)
catch_discover_tests(test_postgres)

add_executable(test_json unit/test_json.cpp)
target_link_libraries(test_json PRIVATE Catch2::Catch2WithMain json Threads::Threads)
catch_discover_tests(test_json)

add_executable(test_http unit/test_http.cpp)
target_link_libraries(test_http PRIVATE Catch2::Catch2WithMain http Threads::Threads)
catch_discover_tests(test_http)

# Functional test — end-to-end CLI
add_executable(test_functional functional/test_cli.cpp)
target_link_libraries(test_functional PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json Threads::Threads)
catch_discover_tests(test_functional)

# E2E test — real Supabase connection (requires config/postgres.toml + network)
add_executable(test_supabase e2e/test_supabase.cpp)
target_link_libraries(test_supabase PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres json Threads::Threads)
catch_discover_tests(test_supabase WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_postgres_live functional/test_postgres_live.cpp)
target_link_libraries(test_postgres_live PRIVATE Catch2::Catch2WithMain postgres search json logger tomlplusplus::tomlplusplus Threads::Threads)
catch_discover_tests(test_postgres_live WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_polymech unit/test_polymech.cpp)
target_link_libraries(test_polymech PRIVATE Catch2::Catch2WithMain polymech postgres Threads::Threads)
catch_discover_tests(test_polymech)

# E2E test — polymech fetch_pages from live Supabase
add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp)
target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json Threads::Threads)
catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

# Gridsearch IPC test compiles the command sources directly (they are not a
# library target), so it also needs the header-only deps' include dirs and
# the standalone-Asio compile definitions the main binary uses.
add_executable(test_gridsearch_ipc e2e/test_gridsearch_ipc.cpp ../src/cmd_gridsearch.cpp ../src/cmd_gridsearch-filters.cpp ../src/cmd_gridsearch-uds.cpp ../src/cmd_gridsearch-postgres.cpp ../src/gridsearch_serialize.cpp ../src/sys_metrics.cpp)
target_link_libraries(test_gridsearch_ipc PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers Threads::Threads)
target_include_directories(test_gridsearch_ipc PRIVATE ${CMAKE_SOURCE_DIR}/src ${asio_SOURCE_DIR}/asio/include ${taskflow_SOURCE_DIR} ${concurrentqueue_SOURCE_DIR})
target_compile_definitions(test_gridsearch_ipc PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
catch_discover_tests(test_gridsearch_ipc WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_ipc unit/test_ipc.cpp)
target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc Threads::Threads)
catch_discover_tests(test_ipc)

add_executable(test_geo unit/test_geo.cpp)
target_link_libraries(test_geo PRIVATE Catch2::Catch2WithMain geo Threads::Threads)
catch_discover_tests(test_geo)

add_executable(test_gadm_reader unit/test_gadm_reader.cpp)
target_link_libraries(test_gadm_reader PRIVATE Catch2::Catch2WithMain gadm_reader Threads::Threads)
catch_discover_tests(test_gadm_reader WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_grid unit/test_grid.cpp)
target_link_libraries(test_grid PRIVATE Catch2::Catch2WithMain grid Threads::Threads)
catch_discover_tests(test_grid WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_search unit/test_search.cpp)
target_link_libraries(test_search PRIVATE Catch2::Catch2WithMain search Threads::Threads)
catch_discover_tests(test_search WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})

add_executable(test_enrichers unit/test_enrichers.cpp)
target_link_libraries(test_enrichers PRIVATE Catch2::Catch2WithMain enrichers Threads::Threads)
catch_discover_tests(test_enrichers)
|
||||
144
packages/kbot/cpp/tests/e2e/test_gridsearch_ipc.cpp
Normal file
144
packages/kbot/cpp/tests/e2e/test_gridsearch_ipc.cpp
Normal file
@ -0,0 +1,144 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <rapidjson/document.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <rapidjson/writer.h>
|
||||
|
||||
#include "../../src/cmd_gridsearch.h"
|
||||
#include "logger/logger.h"
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/// Slurp an entire file into a string. Returns "" when the file cannot be
/// opened (callers treat an empty result as "missing/unreadable").
static std::string read_file_contents(const std::string &path) {
  std::ifstream in(path);
  if (!in.is_open())
    return std::string();
  std::ostringstream buffer;
  buffer << in.rdbuf();
  return buffer.str();
}
|
||||
|
||||
/// Read a JSON config file and inject test-safe overrides:
|
||||
/// - configPath = "config/postgres.toml"
|
||||
/// - enrich = false (no live HTTP / thread-pool in tests)
|
||||
/// - persistencePostgres = false
|
||||
static std::string load_test_payload(const std::string &config_path) {
|
||||
std::string raw = read_file_contents(config_path);
|
||||
if (raw.empty())
|
||||
return "";
|
||||
|
||||
rapidjson::Document doc;
|
||||
doc.Parse(raw.c_str());
|
||||
if (doc.HasParseError())
|
||||
return "";
|
||||
|
||||
auto &alloc = doc.GetAllocator();
|
||||
|
||||
// Remove-then-add ensures no double-add assertion from rapidjson
|
||||
auto inject_bool = [&](const char *key, bool val) {
|
||||
if (doc.HasMember(key))
|
||||
doc.RemoveMember(key);
|
||||
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val), alloc);
|
||||
};
|
||||
auto inject_str = [&](const char *key, const char *val) {
|
||||
if (doc.HasMember(key))
|
||||
doc.RemoveMember(key);
|
||||
doc.AddMember(rapidjson::Value(key, alloc), rapidjson::Value(val, alloc),
|
||||
alloc);
|
||||
};
|
||||
|
||||
inject_str("configPath", "config/postgres.toml");
|
||||
inject_str("cacheDir", "../../packages/gadm/cache/gadm"); // server/cache/gadm
|
||||
inject_bool("enrich", false); // no live enrichment in tests
|
||||
inject_bool("persistencePostgres", false);
|
||||
|
||||
rapidjson::StringBuffer buf;
|
||||
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
|
||||
doc.Accept(writer);
|
||||
return buf.GetString();
|
||||
}
|
||||
|
||||
// ── Tests
|
||||
// ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("E2E: Gridsearch Country Boundary Filter (Lamu/KEN)",
          "[e2e][gridsearch][boundary]") {
  REQUIRE_NOTHROW(logger::init("test-gridsearch"));

  // Lamu, Kenya — SerpAPI often returns US results for obscure African
  // regions; boundary_KEN_0.json should filter those out.
  const std::string payload = load_test_payload("config/gridsearch-lamu.json");
  REQUIRE(!payload.empty());

  std::vector<std::string> locations;
  int errors = 0;

  polymech::GridsearchCallbacks cb;
  cb.onEvent = [&](const std::string &type, const std::string &json) {
    if (type == "error") {
      ++errors;
      std::cout << "[ERROR EVENT]: " << json << "\n";
    } else if (type == "location") {
      locations.push_back(json);
    }
  };

  const int rc =
      polymech::run_cmd_gridsearch_ipc(payload, "test-lamu-job", cb, false, "");
  REQUIRE(rc == 0);
  REQUIRE(errors == 0);

  // Coarse longitude sanity check: Kenya sits at roughly 34–42°E, while the
  // spurious US hits land around -130..-60°E. (Latitude is not checked here.)
  int outside_kenya = 0;
  for (const auto &raw : locations) {
    rapidjson::Document loc;
    loc.Parse(raw.c_str());
    if (loc.HasParseError())
      continue;
    if (loc.HasMember("gps") && loc["gps"].IsObject()) {
      const double lng =
          loc["gps"].HasMember("lng") ? loc["gps"]["lng"].GetDouble() : 0;
      if (lng < 20.0 || lng > 55.0)
        ++outside_kenya;
    }
  }

  CHECK(outside_kenya == 0);
  std::cout << "Lamu boundary test: " << locations.size()
            << " locations kept, " << outside_kenya << " outside Kenya.\n";
}
|
||||
|
||||
TEST_CASE("E2E: Gridsearch Type Filter (Sample/ABW)",
          "[e2e][gridsearch][filter]") {
  const std::string payload = load_test_payload("config/gridsearch-sample.json");
  REQUIRE(!payload.empty());

  std::vector<std::string> locations;
  int errors = 0;

  // Collect location events; count (but don't print) error events.
  polymech::GridsearchCallbacks cb;
  cb.onEvent = [&](const std::string &type, const std::string &json) {
    if (type == "location")
      locations.push_back(json);
    else if (type == "error")
      ++errors;
  };

  const int rc = polymech::run_cmd_gridsearch_ipc(payload, "test-sample-job",
                                                  cb, false, "");
  REQUIRE(rc == 0);
  REQUIRE(errors == 0);

  std::cout << "Sample (ABW) type filter test: " << locations.size()
            << " locations.\n";
}
|
||||
34
packages/kbot/cpp/tests/e2e/test_polymech_e2e.cpp
Normal file
34
packages/kbot/cpp/tests/e2e/test_polymech_e2e.cpp
Normal file
@ -0,0 +1,34 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <toml++/toml.hpp>
|
||||
|
||||
#include "logger/logger.h"
|
||||
#include "polymech/polymech.h"
|
||||
#include "postgres/postgres.h"
|
||||
#include "json/json.h"
|
||||
|
||||
// ── E2E: fetch pages from live Supabase ─────────────────────────────────
|
||||
|
||||
TEST_CASE("E2E: fetch all pages", "[e2e]") {
  logger::init("e2e-polymech");

  // Supabase credentials come from config/postgres.toml (CWD = project root).
  auto cfg = toml::parse_file("config/postgres.toml");
  postgres::Config pg_cfg;
  pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
  pg_cfg.supabase_key =
      cfg["supabase"]["publishable_key"].value_or(std::string(""));

  REQUIRE(!pg_cfg.supabase_url.empty());
  REQUIRE(!pg_cfg.supabase_key.empty());

  postgres::init(pg_cfg);

  auto result = polymech::fetch_pages();

  // Should return valid JSON
  REQUIRE(json::is_valid(result));

  // Should be an array (even if empty). Guard the length first: calling
  // front()/back() on an empty std::string is undefined behavior.
  REQUIRE(result.size() >= 2);
  REQUIRE(result.front() == '[');
  REQUIRE(result.back() == ']');
}
|
||||
50
packages/kbot/cpp/tests/e2e/test_supabase.cpp
Normal file
50
packages/kbot/cpp/tests/e2e/test_supabase.cpp
Normal file
@ -0,0 +1,50 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
|
||||
#include <toml++/toml.hpp>
|
||||
|
||||
#include "logger/logger.h"
|
||||
#include "postgres/postgres.h"
|
||||
#include "json/json.h"
|
||||
|
||||
|
||||
// ── E2E: Supabase connect via config/postgres.toml ──────────────────────────
|
||||
|
||||
TEST_CASE("E2E: connect to Supabase and ping", "[e2e][postgres]") {
|
||||
logger::init("e2e-test");
|
||||
|
||||
// Read config — path relative to CWD (project root)
|
||||
auto cfg = toml::parse_file("config/postgres.toml");
|
||||
postgres::Config pg_cfg;
|
||||
pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
|
||||
pg_cfg.supabase_key =
|
||||
cfg["supabase"]["publishable_key"].value_or(std::string(""));
|
||||
|
||||
REQUIRE(!pg_cfg.supabase_url.empty());
|
||||
REQUIRE(!pg_cfg.supabase_key.empty());
|
||||
|
||||
postgres::init(pg_cfg);
|
||||
|
||||
auto status = postgres::ping();
|
||||
logger::info("E2E ping result: " + status);
|
||||
CHECK(status == "ok");
|
||||
}
|
||||
|
||||
TEST_CASE("E2E: query profiles table", "[e2e][postgres]") {
|
||||
logger::init("e2e-test");
|
||||
|
||||
auto cfg = toml::parse_file("config/postgres.toml");
|
||||
postgres::Config pg_cfg;
|
||||
pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
|
||||
pg_cfg.supabase_key =
|
||||
cfg["supabase"]["publishable_key"].value_or(std::string(""));
|
||||
|
||||
postgres::init(pg_cfg);
|
||||
|
||||
auto result = postgres::query("profiles", "id,username", "", 3);
|
||||
logger::info("E2E query result: " + result);
|
||||
|
||||
// Should be valid JSON array
|
||||
CHECK(json::is_valid(result));
|
||||
}
|
||||
74
packages/kbot/cpp/tests/functional/test_cli.cpp
Normal file
74
packages/kbot/cpp/tests/functional/test_cli.cpp
Normal file
@ -0,0 +1,74 @@
|
||||
#include <catch2/catch_test_macros.hpp>

#include <cstdio>
#include <fstream>
#include <sstream>

#include <toml++/toml.hpp>

#include "html/html.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
|
||||
|
||||
// ── Functional: full pipeline tests ─────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Full pipeline: parse HTML and select", "[functional]") {
  const std::string input =
      "<html><body>"
      "<h1>Title</h1>"
      "<ul><li class=\"item\">A</li><li class=\"item\">B</li></ul>"
      "</body></html>";

  // Parsing must surface at least one element.
  REQUIRE_FALSE(html::parse(input).empty());

  // A class selector must match both list items, in document order.
  const auto items = html::select(input, ".item");
  REQUIRE(items.size() == 2);
  CHECK(items[0] == "A");
  CHECK(items[1] == "B");
}
|
||||
|
||||
TEST_CASE("Full pipeline: TOML config round-trip", "[functional]") {
  // Write a small config to a temp file in the CWD.
  const std::string toml_content = "[server]\n"
                                   "host = \"localhost\"\n"
                                   "port = 8080\n"
                                   "\n"
                                   "[database]\n"
                                   "name = \"test_db\"\n";

  const std::string tmp_path = "test_config_tmp.toml";
  {
    std::ofstream out(tmp_path);
    REQUIRE(out.is_open());
    out << toml_content;
  } // scope closes the stream so the parser sees the full contents

  // Parse and verify every key survived.
  auto tbl = toml::parse_file(tmp_path);
  CHECK(tbl["server"]["host"].value_or("") == std::string("localhost"));
  CHECK(tbl["server"]["port"].value_or(0) == 8080);
  CHECK(tbl["database"]["name"].value_or("") == std::string("test_db"));

  // Serialize back and spot-check the output.
  std::ostringstream ss;
  ss << tbl;
  CHECK(ss.str().find("localhost") != std::string::npos);

  // Cleanup — std::remove is declared in <cstdio>, included explicitly
  // instead of relying on transitive includes.
  std::remove(tmp_path.c_str());
}
|
||||
|
||||
TEST_CASE("Full pipeline: logger + postgres integration", "[functional]") {
  REQUIRE_NOTHROW(logger::init("functional-test"));

  // Dummy credentials — no real connection is attempted here.
  // NOTE(review): assumes postgres::init only stores config; confirm it does
  // not reach out to the network.
  postgres::Config cfg;
  cfg.supabase_url = "https://example.supabase.co";
  cfg.supabase_key = "test-key";
  REQUIRE_NOTHROW(postgres::init(cfg));

  REQUIRE_NOTHROW(logger::info("Functional test: postgres init ok"));
}
|
||||
81
packages/kbot/cpp/tests/functional/test_postgres_live.cpp
Normal file
81
packages/kbot/cpp/tests/functional/test_postgres_live.cpp
Normal file
@ -0,0 +1,81 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include "postgres/postgres.h"
|
||||
#include "search/search.h"
|
||||
#include "json/json.h"
|
||||
#include "logger/logger.h"
|
||||
|
||||
#include <toml++/toml.h>
|
||||
|
||||
// Note: This test requires a valid config/postgres.toml pointing to a Supabase instance.
|
||||
// We test against an arbitrary table 'test_items' or standard table.
|
||||
// In this case we'll test against `grid_search_runs` since we know it exists,
|
||||
// using a dummy uuid for testing.
|
||||
// DO NOT RUN UNLESS CONFIGURED.
|
||||
|
||||
TEST_CASE("Postgres Live Operations", "[postgres_live]") {
|
||||
// Load config
|
||||
std::string supabase_url;
|
||||
std::string supabase_key;
|
||||
try {
|
||||
auto config = toml::parse_file("config/postgres.toml");
|
||||
supabase_url = config["supabase"]["url"].value_or("");
|
||||
supabase_key = config["supabase"]["service_key"].value_or("");
|
||||
} catch (const std::exception &e) {
|
||||
WARN("Skipping postgres live tests. Config missing or invalid: " << e.what());
|
||||
return;
|
||||
}
|
||||
|
||||
if (supabase_url.empty() || supabase_key.empty()) {
|
||||
WARN("Skipping postgres live tests. Supabase credentials missing.");
|
||||
return;
|
||||
}
|
||||
|
||||
postgres::Config pg_cfg;
|
||||
pg_cfg.supabase_url = supabase_url;
|
||||
pg_cfg.supabase_key = supabase_key;
|
||||
postgres::init(pg_cfg);
|
||||
|
||||
REQUIRE(postgres::ping() == "ok");
|
||||
|
||||
std::string test_id = "00000000-0000-0000-0000-0000000000cc";
|
||||
std::string user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
|
||||
|
||||
SECTION("Insert, Query, Update, Upsert, Delete") {
|
||||
// 1. Clean up first just in case
|
||||
postgres::del("grid_search_runs", "id=eq." + test_id);
|
||||
|
||||
// 2. Insert
|
||||
std::string insert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "test_run", "status": "searching", "request": {}})";
|
||||
std::string res1 = postgres::insert("grid_search_runs", insert_body);
|
||||
|
||||
// 3. Query
|
||||
std::string res2 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
|
||||
WARN("Insert Result: " << res1);
|
||||
WARN("Query Result: " << res2);
|
||||
REQUIRE(json::is_valid(res2));
|
||||
REQUIRE(res2.find("test_run") != std::string::npos);
|
||||
|
||||
// 4. Update
|
||||
std::string update_body = R"({"status": "enriching"})";
|
||||
std::string res3 = postgres::update("grid_search_runs", update_body, "id=eq." + test_id);
|
||||
REQUIRE(json::is_valid(res3));
|
||||
REQUIRE(res3.find("error") == std::string::npos);
|
||||
|
||||
// 5. Upsert
|
||||
std::string upsert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "upsert_run", "status": "complete", "request": {}})";
|
||||
std::string res4 = postgres::upsert("grid_search_runs", upsert_body, "id");
|
||||
REQUIRE(res4.find("error") == std::string::npos);
|
||||
|
||||
// Query again to verify upsert
|
||||
std::string res5 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
|
||||
REQUIRE(res5.find("upsert_run") != std::string::npos);
|
||||
|
||||
// 6. Delete
|
||||
std::string res6 = postgres::del("grid_search_runs", "id=eq." + test_id);
|
||||
REQUIRE(json::is_valid(res6));
|
||||
|
||||
// Verify deleted
|
||||
std::string res7 = postgres::query("grid_search_runs", "*", "id=eq." + test_id);
|
||||
REQUIRE(res7 == "[]");
|
||||
}
|
||||
}
|
||||
115
packages/kbot/cpp/tests/unit/test_enrichers.cpp
Normal file
115
packages/kbot/cpp/tests/unit/test_enrichers.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include "enrichers/enrichers.h"
|
||||
|
||||
using namespace enrichers;
|
||||
|
||||
// ── is_likely_email ─────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("is_likely_email: valid emails", "[enrichers]") {
|
||||
CHECK(is_likely_email("info@example.com"));
|
||||
CHECK(is_likely_email("john.doe@company.co.uk"));
|
||||
CHECK(is_likely_email("contact@recycling-firm.de"));
|
||||
CHECK(is_likely_email("hello@my-domain.org"));
|
||||
}
|
||||
|
||||
TEST_CASE("is_likely_email: rejects non-emails", "[enrichers]") {
|
||||
CHECK_FALSE(is_likely_email(""));
|
||||
CHECK_FALSE(is_likely_email("not-an-email"));
|
||||
CHECK_FALSE(is_likely_email("@no-user.com"));
|
||||
CHECK_FALSE(is_likely_email("user@"));
|
||||
}
|
||||
|
||||
TEST_CASE("is_likely_email: rejects asset extensions", "[enrichers]") {
|
||||
CHECK_FALSE(is_likely_email("logo@site.png"));
|
||||
CHECK_FALSE(is_likely_email("icon@site.svg"));
|
||||
CHECK_FALSE(is_likely_email("style@site.css"));
|
||||
CHECK_FALSE(is_likely_email("script@site.js"));
|
||||
CHECK_FALSE(is_likely_email("photo@site.jpg"));
|
||||
CHECK_FALSE(is_likely_email("photo@site.webp"));
|
||||
}
|
||||
|
||||
TEST_CASE("is_likely_email: rejects placeholder/hash patterns", "[enrichers]") {
|
||||
CHECK_FALSE(is_likely_email("user@example.com"));
|
||||
CHECK_FALSE(is_likely_email("test@test.com"));
|
||||
CHECK_FALSE(is_likely_email("a3f2b@hash.com"));
|
||||
CHECK_FALSE(is_likely_email("your@email.com"));
|
||||
CHECK_FALSE(is_likely_email("email@email.com"));
|
||||
CHECK_FALSE(is_likely_email("name@domain.com"));
|
||||
}
|
||||
|
||||
// ── extract_emails ──────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("extract_emails: finds emails in text", "[enrichers]") {
|
||||
auto emails = extract_emails("Contact us at info@example.org or sales@company.com");
|
||||
CHECK(emails.size() >= 2);
|
||||
|
||||
bool found_info = false, found_sales = false;
|
||||
for (auto& e : emails) {
|
||||
if (e == "info@example.org") found_info = true;
|
||||
if (e == "sales@company.com") found_sales = true;
|
||||
}
|
||||
CHECK(found_info);
|
||||
CHECK(found_sales);
|
||||
}
|
||||
|
||||
TEST_CASE("extract_emails: deduplicates", "[enrichers]") {
|
||||
auto emails = extract_emails("info@acme.org info@acme.org info@acme.org");
|
||||
CHECK(emails.size() == 1);
|
||||
}
|
||||
|
||||
TEST_CASE("extract_emails: empty text returns empty", "[enrichers]") {
|
||||
auto emails = extract_emails("");
|
||||
CHECK(emails.empty());
|
||||
}
|
||||
|
||||
TEST_CASE("extract_emails: filters out asset emails", "[enrichers]") {
|
||||
auto emails = extract_emails("logo@site.png info@real-company.de");
|
||||
CHECK(emails.size() == 1);
|
||||
CHECK(emails[0] == "info@real-company.de");
|
||||
}
|
||||
|
||||
// ── resolve_url ─────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("resolve_url: absolute stays absolute", "[enrichers]") {
|
||||
CHECK(resolve_url("https://example.com", "https://other.com/page") == "https://other.com/page");
|
||||
}
|
||||
|
||||
TEST_CASE("resolve_url: relative path", "[enrichers]") {
|
||||
auto r = resolve_url("https://example.com/page", "/contact");
|
||||
CHECK(r == "https://example.com/contact");
|
||||
}
|
||||
|
||||
TEST_CASE("resolve_url: protocol-relative", "[enrichers]") {
|
||||
auto r = resolve_url("https://example.com", "//other.com/foo");
|
||||
CHECK(r == "https://other.com/foo");
|
||||
}
|
||||
|
||||
TEST_CASE("resolve_url: relative without slash", "[enrichers]") {
|
||||
auto r = resolve_url("https://example.com/dir/page", "about.html");
|
||||
CHECK(r == "https://example.com/dir/about.html");
|
||||
}
|
||||
|
||||
// ── status_string ───────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("status_string: covers all statuses", "[enrichers]") {
|
||||
CHECK(std::string(status_string(EnrichStatus::OK)) == "OK");
|
||||
CHECK(std::string(status_string(EnrichStatus::NO_EMAIL)) == "NO_EMAIL");
|
||||
CHECK(std::string(status_string(EnrichStatus::META_TIMEOUT)) == "META_TIMEOUT");
|
||||
CHECK(std::string(status_string(EnrichStatus::EMAIL_TIMEOUT)) == "EMAIL_TIMEOUT");
|
||||
CHECK(std::string(status_string(EnrichStatus::FETCH_ERROR)) == "FETCH_ERROR");
|
||||
CHECK(std::string(status_string(EnrichStatus::NO_PAGES)) == "NO_PAGES");
|
||||
CHECK(std::string(status_string(EnrichStatus::ERROR)) == "ERROR");
|
||||
}
|
||||
|
||||
// ── EnrichConfig defaults ───────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("EnrichConfig: default values", "[enrichers]") {
|
||||
EnrichConfig cfg;
|
||||
CHECK(cfg.meta_timeout_ms == 20000);
|
||||
CHECK(cfg.email_timeout_ms == 30000);
|
||||
CHECK(cfg.email_page_timeout_ms == 10000);
|
||||
CHECK(cfg.email_max_pages == 8);
|
||||
CHECK(cfg.email_abort_after == 1);
|
||||
CHECK_FALSE(cfg.contact_patterns.empty());
|
||||
CHECK_FALSE(cfg.probe_paths.empty());
|
||||
}
|
||||
163
packages/kbot/cpp/tests/unit/test_gadm_reader.cpp
Normal file
163
packages/kbot/cpp/tests/unit/test_gadm_reader.cpp
Normal file
@ -0,0 +1,163 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
||||
#include "gadm_reader/gadm_reader.h"
|
||||
#include <cmath>
|
||||
|
||||
using namespace gadm;
|
||||
using Catch::Matchers::WithinAbs;
|
||||
using Catch::Matchers::WithinRel;
|
||||
|
||||
// ── Helper: fixtures path ───────────────────────────────────────────────────
|
||||
// Tests are run with WORKING_DIRECTORY = CMAKE_SOURCE_DIR (server/cpp)
|
||||
|
||||
// Boundary fixtures relative to the test CWD (CMAKE_SOURCE_DIR per CMakeLists).
static const std::string CACHE_DIR = "cache/gadm";
|
||||
|
||||
// ── country_code ────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("country_code: simple ISO3", "[gadm][util]") {
|
||||
REQUIRE(country_code("ABW") == "ABW");
|
||||
}
|
||||
|
||||
TEST_CASE("country_code: dotted GID", "[gadm][util]") {
|
||||
REQUIRE(country_code("AFG.1.1_1") == "AFG");
|
||||
REQUIRE(country_code("ESP.6.1_1") == "ESP");
|
||||
}
|
||||
|
||||
// ── infer_level ─────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("infer_level: level 0 (country)", "[gadm][util]") {
|
||||
REQUIRE(infer_level("ABW") == 0);
|
||||
REQUIRE(infer_level("AFG") == 0);
|
||||
}
|
||||
|
||||
TEST_CASE("infer_level: level 1", "[gadm][util]") {
|
||||
REQUIRE(infer_level("AFG.1_1") == 1);
|
||||
}
|
||||
|
||||
TEST_CASE("infer_level: level 2", "[gadm][util]") {
|
||||
REQUIRE(infer_level("AFG.1.1_1") == 2);
|
||||
}
|
||||
|
||||
TEST_CASE("infer_level: level 3", "[gadm][util]") {
|
||||
REQUIRE(infer_level("ESP.6.1.4_1") == 3);
|
||||
}
|
||||
|
||||
// ── load_boundary_file: ABW level 0 ────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Load ABW level 0: basic structure", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
REQUIRE(res.features.size() == 1);
|
||||
|
||||
const auto& f = res.features[0];
|
||||
REQUIRE(f.gid == "ABW");
|
||||
REQUIRE(f.name == "Aruba");
|
||||
REQUIRE(f.level == 0);
|
||||
REQUIRE(f.isOuter == true);
|
||||
}
|
||||
|
||||
TEST_CASE("Load ABW level 0: has rings", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
REQUIRE(f.rings.size() >= 1);
|
||||
REQUIRE(f.rings[0].size() > 10); // ABW has ~55 coords
|
||||
}
|
||||
|
||||
TEST_CASE("Load ABW level 0: GHS population data", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
REQUIRE_THAT(f.ghsPopulation, WithinRel(104847.0, 0.01));
|
||||
REQUIRE(f.ghsPopCenters.size() == 5);
|
||||
// First pop center: [-70.04183, 12.53341, 104.0]
|
||||
REQUIRE_THAT(f.ghsPopCenters[0][0], WithinAbs(-70.04183, 0.0001));
|
||||
REQUIRE_THAT(f.ghsPopCenters[0][1], WithinAbs(12.53341, 0.0001));
|
||||
REQUIRE_THAT(f.ghsPopCenters[0][2], WithinAbs(104.0, 0.1));
|
||||
}
|
||||
|
||||
TEST_CASE("Load ABW level 0: GHS built data", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
REQUIRE_THAT(f.ghsBuiltWeight, WithinRel(22900682.0, 0.01));
|
||||
REQUIRE(f.ghsBuiltCenters.size() == 5);
|
||||
REQUIRE_THAT(f.ghsBuiltCenter.lon, WithinAbs(-69.99304, 0.001));
|
||||
REQUIRE_THAT(f.ghsBuiltCenter.lat, WithinAbs(12.51234, 0.001));
|
||||
}
|
||||
|
||||
TEST_CASE("Load ABW level 0: computed bbox", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
// ABW bbox should be roughly in the Caribbean
|
||||
REQUIRE(f.bbox.minLon < -69.8);
|
||||
REQUIRE(f.bbox.maxLon > -70.1);
|
||||
REQUIRE(f.bbox.minLat > 12.4);
|
||||
REQUIRE(f.bbox.maxLat < 12.7);
|
||||
}
|
||||
|
||||
TEST_CASE("Load ABW level 0: computed area", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
// Aruba is ~180 km²
|
||||
REQUIRE_THAT(f.areaSqKm, WithinRel(180.0, 0.15)); // 15% tolerance
|
||||
}
|
||||
|
||||
// ── load_boundary_file: AFG level 2 ────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Load AFG.1.1_1 level 2: basic structure", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
|
||||
REQUIRE(res.error.empty());
|
||||
REQUIRE(res.features.size() == 1);
|
||||
|
||||
const auto& f = res.features[0];
|
||||
REQUIRE(f.gid == "AFG.1.1_1");
|
||||
REQUIRE(f.name == "Baharak");
|
||||
REQUIRE(f.level == 2);
|
||||
}
|
||||
|
||||
TEST_CASE("Load AFG.1.1_1 level 2: has GHS data", "[gadm][file]") {
|
||||
auto res = load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
|
||||
REQUIRE(res.error.empty());
|
||||
const auto& f = res.features[0];
|
||||
|
||||
REQUIRE(f.ghsPopCenters.size() == 5);
|
||||
REQUIRE(f.ghsBuiltCenters.size() == 5);
|
||||
REQUIRE(f.ghsPopulation > 0);
|
||||
}
|
||||
|
||||
// ── load_boundary: path resolution ──────────────────────────────────────────
|
||||
|
||||
TEST_CASE("load_boundary: direct GID match", "[gadm][resolve]") {
|
||||
auto res = load_boundary("ABW", 0, CACHE_DIR);
|
||||
REQUIRE(res.error.empty());
|
||||
REQUIRE(res.features.size() == 1);
|
||||
REQUIRE(res.features[0].gid == "ABW");
|
||||
}
|
||||
|
||||
TEST_CASE("load_boundary: sub-region GID", "[gadm][resolve]") {
|
||||
auto res = load_boundary("AFG.1.1_1", 2, CACHE_DIR);
|
||||
REQUIRE(res.error.empty());
|
||||
REQUIRE(res.features[0].gid == "AFG.1.1_1");
|
||||
}
|
||||
|
||||
TEST_CASE("load_boundary: missing file returns error", "[gadm][resolve]") {
|
||||
auto res = load_boundary("DOESNOTEXIST", 0, CACHE_DIR);
|
||||
REQUIRE(!res.error.empty());
|
||||
REQUIRE(res.features.empty());
|
||||
}
|
||||
|
||||
// ── Error handling ──────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("load_boundary_file: nonexistent file", "[gadm][error]") {
|
||||
auto res = load_boundary_file("nonexistent.json");
|
||||
REQUIRE(!res.error.empty());
|
||||
REQUIRE(res.features.empty());
|
||||
}
|
||||
209
packages/kbot/cpp/tests/unit/test_geo.cpp
Normal file
209
packages/kbot/cpp/tests/unit/test_geo.cpp
Normal file
@ -0,0 +1,209 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
||||
#include "geo/geo.h"
|
||||
#include <cmath>
|
||||
|
||||
using namespace geo;
|
||||
using Catch::Matchers::WithinAbs;
|
||||
using Catch::Matchers::WithinRel;
|
||||
|
||||
// ── Distance ────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Haversine distance: known reference values", "[geo][distance]") {
|
||||
// London to Paris: ~343 km
|
||||
Coord london{-0.1278, 51.5074};
|
||||
Coord paris{2.3522, 48.8566};
|
||||
double d = distance_km(london, paris);
|
||||
REQUIRE_THAT(d, WithinRel(343.5, 0.02)); // 2% tolerance
|
||||
|
||||
// Same point should be zero
|
||||
REQUIRE_THAT(distance_km(london, london), WithinAbs(0.0, 1e-10));
|
||||
|
||||
// Equatorial points 1 degree apart: ~111.32 km
|
||||
Coord eq0{0, 0};
|
||||
Coord eq1{1, 0};
|
||||
REQUIRE_THAT(distance_km(eq0, eq1), WithinRel(111.32, 0.01));
|
||||
}
|
||||
|
||||
TEST_CASE("Haversine distance: antipodal points", "[geo][distance]") {
|
||||
// North pole to south pole: ~20015 km (half circumference)
|
||||
Coord north{0, 90};
|
||||
Coord south{0, -90};
|
||||
double d = distance_km(north, south);
|
||||
REQUIRE_THAT(d, WithinRel(20015.0, 0.01));
|
||||
}
|
||||
|
||||
// ── BBox ────────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("BBox of a simple triangle", "[geo][bbox]") {
|
||||
std::vector<Coord> triangle = {{0, 0}, {10, 5}, {5, 10}};
|
||||
BBox b = bbox(triangle);
|
||||
REQUIRE(b.minLon == 0.0);
|
||||
REQUIRE(b.minLat == 0.0);
|
||||
REQUIRE(b.maxLon == 10.0);
|
||||
REQUIRE(b.maxLat == 10.0);
|
||||
}
|
||||
|
||||
TEST_CASE("BBox center", "[geo][bbox]") {
|
||||
BBox b{-10, -20, 10, 20};
|
||||
Coord c = b.center();
|
||||
REQUIRE(c.lon == 0.0);
|
||||
REQUIRE(c.lat == 0.0);
|
||||
}
|
||||
|
||||
TEST_CASE("BBox union", "[geo][bbox]") {
|
||||
std::vector<BBox> boxes = {{0, 0, 5, 5}, {3, 3, 10, 10}};
|
||||
BBox u = bbox_union(boxes);
|
||||
REQUIRE(u.minLon == 0.0);
|
||||
REQUIRE(u.minLat == 0.0);
|
||||
REQUIRE(u.maxLon == 10.0);
|
||||
REQUIRE(u.maxLat == 10.0);
|
||||
}
|
||||
|
||||
TEST_CASE("BBox of empty ring returns zeros", "[geo][bbox]") {
|
||||
std::vector<Coord> empty;
|
||||
BBox b = bbox(empty);
|
||||
REQUIRE(b.minLon == 0.0);
|
||||
REQUIRE(b.maxLon == 0.0);
|
||||
}
|
||||
|
||||
// ── Centroid ────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Centroid of a square", "[geo][centroid]") {
|
||||
std::vector<Coord> square = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
|
||||
Coord c = centroid(square);
|
||||
REQUIRE_THAT(c.lon, WithinAbs(5.0, 1e-10));
|
||||
REQUIRE_THAT(c.lat, WithinAbs(5.0, 1e-10));
|
||||
}
|
||||
|
||||
TEST_CASE("Centroid handles closed ring (duplicate first/last)", "[geo][centroid]") {
|
||||
// Closed triangle — first and last point are the same
|
||||
std::vector<Coord> closed = {{0, 0}, {6, 0}, {3, 6}, {0, 0}};
|
||||
Coord c = centroid(closed);
|
||||
// Average of 3 unique points: (0+6+3)/3 = 3, (0+0+6)/3 = 2
|
||||
REQUIRE_THAT(c.lon, WithinAbs(3.0, 1e-10));
|
||||
REQUIRE_THAT(c.lat, WithinAbs(2.0, 1e-10));
|
||||
}
|
||||
|
||||
// ── Area ────────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Area of an equatorial 1x1 degree square", "[geo][area]") {
|
||||
// ~111.32 km × ~110.57 km ≈ ~12,308 km²
|
||||
std::vector<Coord> sq = {{0, 0}, {1, 0}, {1, 1}, {0, 1}, {0, 0}};
|
||||
double a = area_sq_km(sq);
|
||||
REQUIRE_THAT(a, WithinRel(12308.0, 0.05)); // 5% tolerance
|
||||
}
|
||||
|
||||
TEST_CASE("Area of a zero-size polygon is zero", "[geo][area]") {
|
||||
std::vector<Coord> pt = {{5, 5}};
|
||||
REQUIRE(area_sq_km(pt) == 0.0);
|
||||
}
|
||||
|
||||
// ── Point-in-polygon ────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("PIP: point inside a square", "[geo][pip]") {
|
||||
std::vector<Coord> sq = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
|
||||
REQUIRE(point_in_polygon({5, 5}, sq) == true);
|
||||
REQUIRE(point_in_polygon({1, 1}, sq) == true);
|
||||
}
|
||||
|
||||
TEST_CASE("PIP: point outside a square", "[geo][pip]") {
|
||||
std::vector<Coord> sq = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
|
||||
REQUIRE(point_in_polygon({-1, 5}, sq) == false);
|
||||
REQUIRE(point_in_polygon({15, 5}, sq) == false);
|
||||
}
|
||||
|
||||
TEST_CASE("PIP: point on edge is indeterminate but consistent", "[geo][pip]") {
|
||||
std::vector<Coord> sq = {{0, 0}, {10, 0}, {10, 10}, {0, 10}, {0, 0}};
|
||||
// Edge behavior is implementation-defined but should not crash
|
||||
(void)point_in_polygon({0, 5}, sq);
|
||||
(void)point_in_polygon({5, 0}, sq);
|
||||
}
|
||||
|
||||
// ── Bearing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Bearing: due north", "[geo][bearing]") {
|
||||
Coord a{0, 0};
|
||||
Coord b{0, 10};
|
||||
REQUIRE_THAT(bearing_deg(a, b), WithinAbs(0.0, 0.1));
|
||||
}
|
||||
|
||||
TEST_CASE("Bearing: due east", "[geo][bearing]") {
|
||||
Coord a{0, 0};
|
||||
Coord b{10, 0};
|
||||
REQUIRE_THAT(bearing_deg(a, b), WithinAbs(90.0, 0.5));
|
||||
}
|
||||
|
||||
// ── Destination ─────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Destination: 100km north from equator", "[geo][destination]") {
|
||||
Coord start{0, 0};
|
||||
Coord dest = destination(start, 0.0, 100.0); // due north
|
||||
REQUIRE_THAT(dest.lat, WithinRel(0.899, 0.02)); // ~0.9 degrees
|
||||
REQUIRE_THAT(dest.lon, WithinAbs(0.0, 0.01));
|
||||
}
|
||||
|
||||
TEST_CASE("Destination roundtrip: go 100km then measure distance", "[geo][destination]") {
|
||||
Coord start{2.3522, 48.8566}; // Paris
|
||||
Coord dest = destination(start, 45.0, 100.0); // 100km northeast
|
||||
double d = distance_km(start, dest);
|
||||
REQUIRE_THAT(d, WithinRel(100.0, 0.01)); // should be ~100km back
|
||||
}
|
||||
|
||||
// ── Square grid ─────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Square grid: generates cells within bbox", "[geo][grid]") {
|
||||
BBox extent{0, 0, 1, 1}; // ~111km x ~110km
|
||||
auto cells = square_grid(extent, 50.0); // 50km cells → ~4 cells
|
||||
REQUIRE(cells.size() >= 4);
|
||||
for (const auto& c : cells) {
|
||||
REQUIRE(c.lon >= extent.minLon);
|
||||
REQUIRE(c.lon <= extent.maxLon);
|
||||
REQUIRE(c.lat >= extent.minLat);
|
||||
REQUIRE(c.lat <= extent.maxLat);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Square grid: zero cell size returns empty", "[geo][grid]") {
|
||||
BBox extent{0, 0, 10, 10};
|
||||
auto cells = square_grid(extent, 0.0);
|
||||
REQUIRE(cells.empty());
|
||||
}
|
||||
|
||||
// ── Hex grid ────────────────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Hex grid: generates cells within bbox", "[geo][grid]") {
|
||||
BBox extent{0, 0, 1, 1};
|
||||
auto cells = hex_grid(extent, 50.0);
|
||||
REQUIRE(cells.size() >= 4);
|
||||
for (const auto& c : cells) {
|
||||
REQUIRE(c.lon >= extent.minLon);
|
||||
REQUIRE(c.lon <= extent.maxLon);
|
||||
REQUIRE(c.lat >= extent.minLat);
|
||||
REQUIRE(c.lat <= extent.maxLat);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_CASE("Hex grid: has offset rows", "[geo][grid]") {
|
||||
BBox extent{0, 0, 2, 2}; // large enough for multiple rows
|
||||
auto cells = hex_grid(extent, 30.0);
|
||||
// Find first and second row Y values
|
||||
if (cells.size() >= 3) {
|
||||
// Just verify we got some cells (hex pattern is complex to validate)
|
||||
REQUIRE(cells.size() > 2);
|
||||
}
|
||||
}
|
||||
|
||||
// ── Viewport estimation ─────────────────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Viewport estimation at equator zoom 14", "[geo][viewport]") {
|
||||
double sq = estimate_viewport_sq_km(0.0, 14);
|
||||
// At zoom 14, equator: ~9.55 m/px → ~9.78 * 7.33 ≈ 71.7 km²
|
||||
REQUIRE_THAT(sq, WithinRel(71.7, 0.15)); // 15% tolerance
|
||||
}
|
||||
|
||||
TEST_CASE("Viewport estimation: higher zoom = smaller area", "[geo][viewport]") {
|
||||
double z14 = estimate_viewport_sq_km(40.0, 14);
|
||||
double z16 = estimate_viewport_sq_km(40.0, 16);
|
||||
REQUIRE(z16 < z14);
|
||||
}
|
||||
235
packages/kbot/cpp/tests/unit/test_grid.cpp
Normal file
235
packages/kbot/cpp/tests/unit/test_grid.cpp
Normal file
@ -0,0 +1,235 @@
|
||||
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include "grid/grid.h"
#include "gadm_reader/gadm_reader.h"

#include <cmath>
#include <set>

using Catch::Matchers::WithinAbs;
using Catch::Matchers::WithinRel;

static const std::string CACHE_DIR = "cache/gadm";

// ── Helper: load ABW boundary ───────────────────────────────────────────────

// Loads the cached Aruba (ABW) level-0 boundary; aborts the test on failure.
static gadm::Feature load_abw() {
    auto loaded = gadm::load_boundary_file(CACHE_DIR + "/boundary_ABW_0.json");
    REQUIRE(loaded.error.empty());
    REQUIRE(loaded.features.size() == 1);
    return loaded.features[0];
}

// Loads the cached Afghanistan (AFG) boundary; aborts the test on failure.
static gadm::Feature load_afg() {
    auto loaded = gadm::load_boundary_file(CACHE_DIR + "/boundary_AFG.1.1_1_2.json");
    REQUIRE(loaded.error.empty());
    REQUIRE(loaded.features.size() == 1);
    return loaded.features[0];
}

// ── Admin mode ──────────────────────────────────────────────────────────────

TEST_CASE("Grid admin: single feature → one waypoint", "[grid][admin]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "admin";
    options.pathOrder = "zigzag";

    auto result = grid::generate({feature}, options);
    REQUIRE(result.error.empty());
    REQUIRE(result.validCells == 1);
    REQUIRE(result.waypoints.size() == 1);

    const auto& wp = result.waypoints[0];
    REQUIRE(wp.step == 1);
    REQUIRE(wp.radius_km > 0);
    // The ABW centroid should sit near [-70.0, 12.5].
    REQUIRE_THAT(wp.lng, WithinAbs(-70.0, 0.1));
    REQUIRE_THAT(wp.lat, WithinAbs(12.5, 0.1));
}

TEST_CASE("Grid admin: multiple features", "[grid][admin]") {
    auto aruba = load_abw();
    auto afghanistan = load_afg();
    grid::GridOptions options;
    options.gridMode = "admin";

    auto result = grid::generate({aruba, afghanistan}, options);
    REQUIRE(result.error.empty());
    REQUIRE(result.validCells == 2);
    REQUIRE(result.waypoints.size() == 2);
    // Steps are assigned sequentially per feature.
    REQUIRE(result.waypoints[0].step == 1);
    REQUIRE(result.waypoints[1].step == 2);
}

TEST_CASE("Grid admin: empty features → error", "[grid][admin]") {
    grid::GridOptions options;
    options.gridMode = "admin";

    auto result = grid::generate({}, options);
    REQUIRE(!result.error.empty());
}

// ── Centers mode ────────────────────────────────────────────────────────────

TEST_CASE("Grid centers: ABW generates waypoints from GHS centers", "[grid][centers]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "centers";
    options.cellSize = 5.0;
    options.centroidOverlap = 0.5;

    auto result = grid::generate({feature}, options);
    REQUIRE(result.error.empty());
    REQUIRE(result.validCells > 0);
    REQUIRE(result.waypoints.size() == static_cast<size_t>(result.validCells));

    // Every waypoint must fall within a loose bounding box around Aruba.
    for (const auto& wp : result.waypoints) {
        REQUIRE(wp.lng > -70.2);
        REQUIRE(wp.lng < -69.8);
        REQUIRE(wp.lat > 12.4);
        REQUIRE(wp.lat < 12.7);
    }
}

TEST_CASE("Grid centers: centroid overlap filters nearby centers", "[grid][centers]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "centers";
    options.cellSize = 20.0; // big cells
    options.centroidOverlap = 0.0; // no overlap allowed → aggressive dedup

    auto aggressive = grid::generate({feature}, options);

    options.centroidOverlap = 0.9; // allow almost full overlap → more centers pass
    auto relaxed = grid::generate({feature}, options);

    REQUIRE(relaxed.validCells >= aggressive.validCells);
}

// ── Hex grid mode ───────────────────────────────────────────────────────────

TEST_CASE("Grid hex: ABW at 3km cells", "[grid][hex]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;

    auto result = grid::generate({feature}, options);
    REQUIRE(result.error.empty());
    REQUIRE(result.validCells > 0);
    // Aruba is ~30x10 km, so 3km cells should give roughly 20-60 cells.
    REQUIRE(result.validCells > 5);
    REQUIRE(result.validCells < 200);
}

TEST_CASE("Grid square: ABW at 5km cells", "[grid][square]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "square";
    options.cellSize = 5.0;

    auto result = grid::generate({feature}, options);
    REQUIRE(result.error.empty());
    REQUIRE(result.validCells > 0);
    REQUIRE(result.validCells < 50); // island is small
}

TEST_CASE("Grid hex: too many cells returns error", "[grid][hex]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 0.01; // tiny cell → huge grid
    options.maxCellsLimit = 100;

    auto result = grid::generate({feature}, options);
    REQUIRE(!result.error.empty());
}

// ── Sorting ─────────────────────────────────────────────────────────────────

TEST_CASE("Grid sort: snake vs zigzag differ for multi-row grid", "[grid][sort]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;

    options.pathOrder = "zigzag";
    auto zigzag = grid::generate({feature}, options);

    options.pathOrder = "snake";
    auto snake = grid::generate({feature}, options);

    REQUIRE(zigzag.validCells == snake.validCells);
    // Snake order reverses every other row, so at least one waypoint must
    // land at a different position in the sequence.
    if (zigzag.validCells > 5) {
        bool differs = false;
        for (size_t i = 0; i < zigzag.waypoints.size(); ++i) {
            if (std::abs(zigzag.waypoints[i].lng - snake.waypoints[i].lng) > 1e-6) {
                differs = true;
                break;
            }
        }
        REQUIRE(differs);
    }
}

TEST_CASE("Grid sort: spiral-out starts near center", "[grid][sort]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;
    options.pathOrder = "spiral-out";

    auto result = grid::generate({feature}, options);
    REQUIRE(result.validCells > 3);

    // Mean position of all waypoints.
    double meanLon = 0, meanLat = 0;
    for (const auto& wp : result.waypoints) {
        meanLon += wp.lng;
        meanLat += wp.lat;
    }
    meanLon /= result.waypoints.size();
    meanLat /= result.waypoints.size();

    // Spiral-out ordering: the path begins near the centroid and ends farther away.
    double firstDist = std::hypot(result.waypoints.front().lng - meanLon,
                                  result.waypoints.front().lat - meanLat);
    double lastDist = std::hypot(result.waypoints.back().lng - meanLon,
                                 result.waypoints.back().lat - meanLat);
    REQUIRE(firstDist < lastDist);
}

TEST_CASE("Grid sort: steps are sequential after sorting", "[grid][sort]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "hex";
    options.cellSize = 3.0;
    options.pathOrder = "shortest";

    auto result = grid::generate({feature}, options);
    // Steps must be renumbered 1..N after the sort pass.
    for (size_t i = 0; i < result.waypoints.size(); ++i) {
        REQUIRE(result.waypoints[i].step == static_cast<int>(i + 1));
    }
}

// ── GHS Filtering ───────────────────────────────────────────────────────────

TEST_CASE("Grid admin: GHS pop filter skips low-pop features", "[grid][filter]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "admin";
    options.minGhsPop = 999999999; // impossibly high

    auto result = grid::generate({feature}, options);
    REQUIRE(result.validCells == 0);
    REQUIRE(result.skippedCells == 1);
}

TEST_CASE("Grid admin: bypass filters passes everything", "[grid][filter]") {
    auto feature = load_abw();
    grid::GridOptions options;
    options.gridMode = "admin";
    options.minGhsPop = 999999999;
    options.bypassFilters = true; // overrides the population threshold above

    auto result = grid::generate({feature}, options);
    REQUIRE(result.validCells == 1);
}
|
||||
452
packages/kbot/cpp/tests/unit/test_html.cpp
Normal file
452
packages/kbot/cpp/tests/unit/test_html.cpp
Normal file
@ -0,0 +1,452 @@
|
||||
#include <catch2/catch_test_macros.hpp>
#include <string>
#include <thread>
#include <vector>

#include "html/html.h"
#include "html/html2md.h"

// ═══════════════════════════════════════════════════════
// html::parse / html::select (existing)
// ═══════════════════════════════════════════════════════

TEST_CASE("html::parse returns elements from valid HTML", "[html]") {
    auto parsed =
        html::parse("<html><body><h1>Title</h1><p>Body</p></body></html>");

    REQUIRE(parsed.size() >= 2);

    bool saw_h1 = false;
    bool saw_p = false;
    for (const auto &el : parsed) {
        if (el.tag == "h1" && el.text == "Title")
            saw_h1 = true;
        if (el.tag == "p" && el.text == "Body")
            saw_p = true;
    }
    CHECK(saw_h1);
    CHECK(saw_p);
}

TEST_CASE("html::parse returns empty for empty input", "[html]") {
    auto parsed = html::parse("");
    REQUIRE(parsed.empty());
}

TEST_CASE("html::parse handles nested elements", "[html]") {
    auto parsed = html::parse("<div><span>Nested</span></div>");

    bool saw_span = false;
    for (const auto &el : parsed) {
        if (el.tag == "span" && el.text == "Nested") {
            saw_span = true;
        }
    }
    CHECK(saw_span);
}

TEST_CASE("html::select finds elements by CSS selector", "[html][select]") {
    auto hits = html::select("<ul><li>A</li><li>B</li><li>C</li></ul>", "li");

    REQUIRE(hits.size() == 3);
    CHECK(hits[0] == "A");
    CHECK(hits[1] == "B");
    CHECK(hits[2] == "C");
}

TEST_CASE("html::select returns empty for no matches", "[html][select]") {
    auto hits = html::select("<p>Hello</p>", "h1");
    REQUIRE(hits.empty());
}

TEST_CASE("html::select works with class selector", "[html][select]") {
    auto hits = html::select(
        R"(<div><span class="a">X</span><span class="b">Y</span></div>)", ".a");

    REQUIRE(hits.size() == 1);
    CHECK(hits[0] == "X");
}

// ═══════════════════════════════════════════════════════
// html2md — conversion & large-chunk robustness
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md basic conversion", "[html2md]") {
    std::string out = html2md::Convert("<h1>Hello</h1><p>World</p>");
    CHECK(out.find("Hello") != std::string::npos);
    CHECK(out.find("World") != std::string::npos);
}

TEST_CASE("html2md empty input", "[html2md]") {
    std::string out = html2md::Convert("");
    CHECK(out.empty());
}

TEST_CASE("html2md whitespace-only input", "[html2md]") {
    std::string out = html2md::Convert(" \n\t ");
    // Should return empty or whitespace — must not crash.
    CHECK(out.size() < 20);
}

// ---------- large payload stress tests ----------

// Builds `count` numbered <p> paragraphs.
static std::string make_paragraphs(size_t count) {
    std::string doc;
    doc.reserve(count * 40);
    for (size_t i = 0; i < count; ++i) {
        doc += "<p>Paragraph number ";
        doc += std::to_string(i);
        doc += " with some filler text.</p>\n";
    }
    return doc;
}

// Builds an HTML document of at least `target_bytes` by repeating one row.
static std::string make_large_html(size_t target_bytes) {
    const std::string row = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.</p>\n";
    std::string doc;
    doc.reserve(target_bytes + 256);
    doc += "<html><body>";
    while (doc.size() < target_bytes) {
        doc += row;
    }
    doc += "</body></html>";
    return doc;
}

TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") {
    auto doc = make_large_html(64 * 1024);
    REQUIRE(doc.size() >= 64 * 1024);
    std::string out = html2md::Convert(doc);
    CHECK(!out.empty());
    CHECK(out.find("Lorem ipsum") != std::string::npos);
}

TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") {
    auto doc = make_large_html(512 * 1024);
    std::string out = html2md::Convert(doc);
    CHECK(!out.empty());
}

TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") {
    auto doc = make_large_html(1024 * 1024);
    std::string out = html2md::Convert(doc);
    CHECK(!out.empty());
}

TEST_CASE("html2md 10K paragraphs", "[html2md][large]") {
    auto doc = make_paragraphs(10000);
    std::string out = html2md::Convert(doc);
    CHECK(!out.empty());
    // The last paragraph must survive conversion.
    CHECK(out.find("Paragraph number 9999") != std::string::npos);
}

// ---------- deeply nested HTML ----------

TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") {
    const int depth = 500;
    std::string doc;
    for (int i = 0; i < depth; ++i) doc += "<div>";
    doc += "deep content";
    for (int i = 0; i < depth; ++i) doc += "</div>";

    std::string out = html2md::Convert(doc);
    CHECK(out.find("deep content") != std::string::npos);
}

// ---------- wide table ----------

TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") {
    std::string doc = "<table><tr>";
    for (int col = 0; col < 200; ++col) {
        doc += "<td>C" + std::to_string(col) + "</td>";
    }
    doc += "</tr></table>";

    std::string out = html2md::Convert(doc);
    CHECK(!out.empty());
    CHECK(out.find("C0") != std::string::npos);
    CHECK(out.find("C199") != std::string::npos);
}

// ---------- concurrent conversion ----------

TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") {
    const int num_threads = 8;
    const std::string doc = make_large_html(32 * 1024); // 32KB each
    std::vector<std::string> outputs(num_threads);
    std::vector<std::thread> workers;

    // Each worker converts the same document into its own output slot.
    for (int t = 0; t < num_threads; ++t) {
        workers.emplace_back([&outputs, &doc, t]() {
            outputs[t] = html2md::Convert(doc);
        });
    }

    for (auto &w : workers) w.join();

    for (int t = 0; t < num_threads; ++t) {
        CHECK(!outputs[t].empty());
        CHECK(outputs[t].find("Lorem ipsum") != std::string::npos);
    }
}

// ═══════════════════════════════════════════════════════
// html2md — malformed / faulty HTML robustness
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md unclosed tags", "[html2md][faulty]") {
    std::string out = html2md::Convert("<p>Hello <b>bold <i>italic");
    CHECK(out.find("Hello") != std::string::npos);
    CHECK(out.find("bold") != std::string::npos);
}

TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") {
    std::string out = html2md::Convert("<b>bold <i>both</b> italic</i>");
    CHECK(out.find("bold") != std::string::npos);
}

TEST_CASE("html2md broken attributes", "[html2md][faulty]") {
    std::string out = html2md::Convert(R"(<a href="http://example.com class="bad>Link</a>)");
    // Must not crash — output may vary.
    (void)out;
}

TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") {
    std::string out = html2md::Convert("Just plain text, no HTML at all.");
    CHECK(out.find("Just plain text") != std::string::npos);
}

TEST_CASE("html2md random binary noise", "[html2md][faulty]") {
    // Full 0-255 byte range — previously crashed on MSVC debug builds due to
    // signed char passed to isspace() without unsigned cast. Fixed in html2md.cpp.
    std::string noise(4096, '\0');
    for (size_t i = 0; i < noise.size(); ++i) {
        noise[i] = static_cast<char>((i * 131 + 17) % 256);
    }
    std::string out = html2md::Convert(noise);
    // No assertion on content — just survival.
    (void)out;
}

TEST_CASE("html2md truncated document", "[html2md][faulty]") {
    // Abruptly ends mid-table.
    std::string doc = "<html><body><table><tr><td>Cell1</td><td>Cell2";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Cell1") != std::string::npos);
}

TEST_CASE("html2md script and style tags", "[html2md][faulty]") {
    std::string doc = R"(
<p>Before</p>
<script>alert('xss');</script>
<style>.foo { color: red; }</style>
<p>After</p>
)";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Before") != std::string::npos);
    CHECK(out.find("After") != std::string::npos);
    // script/style content should be stripped.
    CHECK(out.find("alert") == std::string::npos);
}

TEST_CASE("html2md null bytes in input", "[html2md][faulty]") {
    std::string doc = "<p>Hello";
    doc += '\0';
    doc += "World</p>";
    // html2md may stop at null or handle it — must not crash.
    std::string out = html2md::Convert(doc);
    (void)out;
}

// ═══════════════════════════════════════════════════════
// html2md — web scraper real-world edge cases
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") {
    std::string doc =
        "<h1>日本語テスト</h1>"
        "<p>مرحبا بالعالم</p>"
        "<p>Ñoño señor über straße</p>"
        "<p>Emoji: 🚀🔥💀👻 and 中文混合English</p>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Emoji") != std::string::npos);
}

TEST_CASE("html2md BOM prefix", "[html2md][scraper]") {
    // UTF-8 BOM (EF BB BF) prepended — common from Windows-origin pages.
    std::string doc = "\xEF\xBB\xBF<html><body><p>Content after BOM</p></body></html>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Content after BOM") != std::string::npos);
}

TEST_CASE("html2md entity soup", "[html2md][scraper]") {
    std::string doc =
        "<p>Price: €10 & <20> items</p>"
        "<p> indented — dashes – more</p>"
        "<p>Bad entity: ¬real; and 󴈿 and &#xZZZZ;</p>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Price") != std::string::npos);
}

TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") {
    std::string doc =
        "<p>Before</p>"
        "<!-- <script>alert('xss')</script> -->"
        "<![CDATA[This is raw <data> & stuff]]>"
        "<!-- multi\nline\ncomment -->"
        "<p>After</p>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Before") != std::string::npos);
    CHECK(out.find("After") != std::string::npos);
}

TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") {
    // Real pages sometimes have insanely nested spans from WYSIWYG editors.
    std::string doc = "<p>";
    for (int i = 0; i < 100; ++i) doc += "<span><b><i><em><strong>";
    doc += "deep text";
    for (int i = 0; i < 100; ++i) doc += "</strong></em></i></b></span>";
    doc += "</p>";
    std::string out = html2md::Convert(doc);
    // 100 layers of bold/italic produce tons of ** and * markers —
    // just verify no crash and non-empty output.
    CHECK(!out.empty());
}

TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") {
    // Minified HTML — one giant line, 200KB.
    std::string doc;
    doc.reserve(200 * 1024);
    doc += "<html><body>";
    for (int i = 0; i < 5000; ++i) {
        doc += "<div><span class=\"c" + std::to_string(i) + "\">item" +
               std::to_string(i) + "</span></div>";
    }
    doc += "</body></html>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("item0") != std::string::npos);
    CHECK(out.find("item4999") != std::string::npos);
}

TEST_CASE("html2md data URI in img src", "[html2md][scraper]") {
    std::string doc =
        "<p>Before image</p>"
        "<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSU"
        "hEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwAD"
        "hgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"pixel\">"
        "<p>After image</p>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Before image") != std::string::npos);
    CHECK(out.find("After image") != std::string::npos);
}

TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") {
    // Latin-1 encoded chars (0x80-0xFF) that are NOT valid UTF-8.
    // Common when scraping pages with wrong charset declaration.
    std::string doc = "<p>caf\xe9 na\xefve r\xe9sum\xe9</p>"; // café naïve résumé in Latin-1
    std::string out = html2md::Convert(doc);
    CHECK(out.find("caf") != std::string::npos);
}

TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") {
    // Sometimes raw HTTP responses leak into scraper output.
    std::string doc =
        "HTTP/1.1 200 OK\r\n"
        "Content-Type: text/html; charset=utf-8\r\n"
        "Content-Length: 42\r\n"
        "\r\n"
        "<html><body><p>Real content</p></body></html>";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Real content") != std::string::npos);
}

TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") {
    // Simplified version of real Google Places HTML with data attributes,
    // inline styles, aria labels, and deeply nested structure.
    std::string doc = R"(
<div class="section-result" data-result-index="0" jsaction="pane.resultSection.click">
  <div class="section-result-title">
    <span><span>Müller's Büro & Café</span></span>
  </div>
  <div class="section-result-details">
    <span class="section-result-location">Königstraße 42, München</span>
    <span class="section-result-rating">
      <span aria-label="4.5 stars">★★★★☆</span>
      <span>(1,234)</span>
    </span>
  </div>
  <div style="display:none" aria-hidden="true">
    <script type="application/ld+json">{"@type":"LocalBusiness","name":"test"}</script>
  </div>
</div>
)";
    std::string out = html2md::Convert(doc);
    CHECK(out.find("Café") != std::string::npos);
    CHECK(out.find("München") != std::string::npos);
}

// ═══════════════════════════════════════════════════════
// html2md — output amplification & pathological input
// ═══════════════════════════════════════════════════════

TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") {
    // Each <blockquote> nesting adds a ">" prefix per line in markdown.
    // 50 deep = each line gets 50 ">" prefixes — tests that output doesn't
    // explode exponentially.
    std::string doc;
    for (int i = 0; i < 50; ++i) doc += "<blockquote>";
    doc += "<p>deep quote</p>";
    for (int i = 0; i < 50; ++i) doc += "</blockquote>";
    auto out = html2md::Convert(doc);
    // Output size should be reasonable — not exponential.
    // 50 levels * "> " prefix = ~100 chars + text < 1 KB.
    CHECK(out.size() < 4096);
    CHECK(!out.empty());
}

TEST_CASE("html2md very long attribute value", "[html2md][amplification]") {
    // 1 MB href — tests ExtractAttributeFromTagLeftOf won't choke.
    std::string long_url(1024 * 1024, 'A');
    std::string doc = "<a href=\"" + long_url + "\">Click</a>";
    auto out = html2md::Convert(doc);
    // Must survive without crash.
    CHECK(!out.empty());
}

TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") {
    // Each unclosed <p> generates "\n\n" — tests that md_ doesn't
    // grow beyond reasonable bounds.
    std::string doc;
    doc.reserve(50000);
    for (int i = 0; i < 10000; ++i) doc += "<p>text";
    auto out = html2md::Convert(doc);
    CHECK(!out.empty());
    // Should contain the text; output gets big but not catastrophic.
    CHECK(out.find("text") != std::string::npos);
}

TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") {
    // Verify that for normal, representative HTML, output is smaller
    // than input (html2md strips tags, so markdown should be leaner).
    std::string doc;
    doc.reserve(100 * 1024);
    doc += "<html><body>";
    for (int i = 0; i < 1000; ++i) {
        doc += "<div class=\"wrapper\"><p class=\"content\">Paragraph " +
               std::to_string(i) + " with some text.</p></div>\n";
    }
    doc += "</body></html>";
    auto out = html2md::Convert(doc);
    // Markdown should be smaller than HTML (we stripped all the divs/classes).
    CHECK(out.size() < doc.size());
    CHECK(out.size() > 0);
}

TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") {
    // Incomplete tags: lots of "<" without closing ">" — stresses tag parser.
    std::string doc(8192, '<');
    auto out = html2md::Convert(doc);
    // Must not infinite-loop — just survive.
    (void)out;
}
|
||||
17
packages/kbot/cpp/tests/unit/test_http.cpp
Normal file
17
packages/kbot/cpp/tests/unit/test_http.cpp
Normal file
@ -0,0 +1,17 @@
|
||||
#include <catch2/catch_test_macros.hpp>

#include "http/http.h"

// Tests use an unroutable endpoint (port 1 on 0.0.0.0) so no real network
// traffic occurs; we only verify the error path of the HTTP client.

TEST_CASE("http::get returns a response", "[http]") {
  // This test requires network, so we test error handling for invalid URL
  auto resp = http::get("http://0.0.0.0:1/nonexistent");
  // Should fail gracefully with status -1
  CHECK(resp.status_code == -1);
  // On failure the body carries an error description, never empty.
  CHECK(!resp.body.empty());
}

TEST_CASE("http::post returns a response", "[http]") {
  // Same unreachable endpoint; POST with a small JSON payload must also
  // fail gracefully rather than throw or hang.
  auto resp = http::post("http://0.0.0.0:1/nonexistent", R"({"test": true})");
  CHECK(resp.status_code == -1);
  CHECK(!resp.body.empty());
}
|
||||
89
packages/kbot/cpp/tests/unit/test_ipc.cpp
Normal file
89
packages/kbot/cpp/tests/unit/test_ipc.cpp
Normal file
@ -0,0 +1,89 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include "ipc/ipc.h"
|
||||
|
||||
#include <cstring>
|
||||
|
||||
TEST_CASE("ipc::encode produces a 4-byte LE length prefix", "[ipc]") {
|
||||
ipc::Message msg{"1", "ping", "{}"};
|
||||
auto frame = ipc::encode(msg);
|
||||
|
||||
REQUIRE(frame.size() > 4);
|
||||
|
||||
// First 4 bytes are the LE length of the JSON body
|
||||
uint32_t body_len = static_cast<uint32_t>(frame[0]) |
|
||||
(static_cast<uint32_t>(frame[1]) << 8) |
|
||||
(static_cast<uint32_t>(frame[2]) << 16) |
|
||||
(static_cast<uint32_t>(frame[3]) << 24);
|
||||
|
||||
REQUIRE(body_len == frame.size() - 4);
|
||||
}
|
||||
|
||||
TEST_CASE("ipc::encode → decode round-trip", "[ipc]") {
|
||||
ipc::Message original{"42", "job", R"({"action":"resize","width":800})"};
|
||||
auto frame = ipc::encode(original);
|
||||
|
||||
// Strip the 4-byte length prefix for decode
|
||||
ipc::Message decoded;
|
||||
bool ok = ipc::decode(frame.data() + 4, frame.size() - 4, decoded);
|
||||
|
||||
REQUIRE(ok);
|
||||
REQUIRE(decoded.id == "42");
|
||||
REQUIRE(decoded.type == "job");
|
||||
// payload should round-trip (may be compacted)
|
||||
REQUIRE(decoded.payload.find("resize") != std::string::npos);
|
||||
REQUIRE(decoded.payload.find("800") != std::string::npos);
|
||||
}
|
||||
|
||||
TEST_CASE("ipc::decode rejects invalid JSON", "[ipc]") {
|
||||
std::string garbage = "this is not json";
|
||||
ipc::Message out;
|
||||
bool ok = ipc::decode(reinterpret_cast<const uint8_t *>(garbage.data()),
|
||||
garbage.size(), out);
|
||||
REQUIRE_FALSE(ok);
|
||||
}
|
||||
|
||||
TEST_CASE("ipc::decode rejects JSON missing required fields", "[ipc]") {
|
||||
// Valid JSON but missing "id" and "type"
|
||||
std::string json = R"({"foo":"bar"})";
|
||||
ipc::Message out;
|
||||
bool ok = ipc::decode(reinterpret_cast<const uint8_t *>(json.data()),
|
||||
json.size(), out);
|
||||
REQUIRE_FALSE(ok);
|
||||
}
|
||||
|
||||
TEST_CASE("ipc::decode handles missing payload gracefully", "[ipc]") {
  // "payload" is omitted entirely; decode is expected to default it to "{}".
  const std::string doc = R"({"id":"1","type":"ping"})";
  ipc::Message out;
  const bool ok = ipc::decode(
      reinterpret_cast<const uint8_t *>(doc.data()), doc.size(), out);

  REQUIRE(ok);
  REQUIRE(out.id == "1");
  REQUIRE(out.type == "ping");
  REQUIRE(out.payload == "{}");
}
|
||||
|
||||
TEST_CASE("ipc::encode with empty payload", "[ipc]") {
  // Edge case: an empty payload string must still yield a well-formed frame.
  ipc::Message msg{"0", "ready", ""};
  auto frame = ipc::encode(msg);

  // The frame must carry the 4-byte LE length prefix plus a JSON body,
  // and the prefix must stay consistent with the body size even for the
  // empty-payload case (previously unchecked here).
  REQUIRE(frame.size() > 4);
  uint32_t body_len = static_cast<uint32_t>(frame[0]) |
                      (static_cast<uint32_t>(frame[1]) << 8) |
                      (static_cast<uint32_t>(frame[2]) << 16) |
                      (static_cast<uint32_t>(frame[3]) << 24);
  REQUIRE(body_len == frame.size() - 4);

  ipc::Message decoded;
  bool ok = ipc::decode(frame.data() + 4, frame.size() - 4, decoded);

  REQUIRE(ok);
  REQUIRE(decoded.id == "0");
  REQUIRE(decoded.type == "ready");
}
|
||||
|
||||
TEST_CASE("ipc::decode with vector overload", "[ipc]") {
  const std::string doc = R"({"id":"99","type":"shutdown","payload":{}})";
  const std::vector<uint8_t> bytes(doc.begin(), doc.end());

  // Same contract as the pointer/length overload, driven by a vector.
  ipc::Message out;
  const bool ok = ipc::decode(bytes, out);

  REQUIRE(ok);
  REQUIRE(out.id == "99");
  REQUIRE(out.type == "shutdown");
  REQUIRE(out.payload == "{}");
}
|
||||
46
packages/kbot/cpp/tests/unit/test_json.cpp
Normal file
46
packages/kbot/cpp/tests/unit/test_json.cpp
Normal file
@ -0,0 +1,46 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include "json/json.h"
|
||||
|
||||
TEST_CASE("json::is_valid accepts valid JSON", "[json]") {
  // Objects, arrays, and bare scalar documents all validate.
  CHECK(json::is_valid(R"({"key": "value"})"));
  CHECK(json::is_valid("[]"));
  CHECK(json::is_valid("123"));
  CHECK(json::is_valid("\"hello\""));
}

TEST_CASE("json::is_valid rejects invalid JSON", "[json]") {
  // Unquoted keys/values are not legal JSON.
  CHECK_FALSE(json::is_valid("{invalid}"));
  CHECK_FALSE(json::is_valid("{key: value}"));
}
|
||||
|
||||
TEST_CASE("json::get_string extracts string values", "[json]") {
  const auto name =
      json::get_string(R"({"name": "polymech", "version": "1.0"})", "name");
  CHECK(name == "polymech");
}

TEST_CASE("json::get_string returns empty for missing key", "[json]") {
  // Absent keys yield an empty string rather than an error.
  const auto missing = json::get_string(R"({"name": "polymech"})", "missing");
  CHECK(missing.empty());
}

TEST_CASE("json::get_int extracts int values", "[json]") {
  const auto port = json::get_int(R"({"port": 8080, "name": "test"})", "port");
  CHECK(port == 8080);
}
|
||||
|
||||
TEST_CASE("json::keys lists top-level keys", "[json]") {
  const auto names = json::keys(R"({"a": 1, "b": 2, "c": 3})");

  // NOTE(review): this assumes keys() preserves document order — holds for
  // the current assertions, but confirm if the backing parser changes.
  REQUIRE(names.size() == 3);
  CHECK(names[0] == "a");
  CHECK(names[1] == "b");
  CHECK(names[2] == "c");
}
|
||||
|
||||
TEST_CASE("json::prettify formats JSON", "[json]") {
  const auto pretty = json::prettify(R"({"a":1})");
  REQUIRE(!pretty.empty());
  // A newline in the output is the minimal evidence of pretty-printing.
  CHECK(pretty.find('\n') != std::string::npos);
}
|
||||
22
packages/kbot/cpp/tests/unit/test_logger.cpp
Normal file
22
packages/kbot/cpp/tests/unit/test_logger.cpp
Normal file
@ -0,0 +1,22 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include "logger/logger.h"
|
||||
|
||||
TEST_CASE("logger::init does not throw", "[logger]") {
  REQUIRE_NOTHROW(logger::init("test"));
}

TEST_CASE("logger functions do not throw after init", "[logger]") {
  logger::init("test");

  // Every severity level must be safe to call once initialized.
  REQUIRE_NOTHROW(logger::info("info message"));
  REQUIRE_NOTHROW(logger::warn("warn message"));
  REQUIRE_NOTHROW(logger::error("error message"));
  REQUIRE_NOTHROW(logger::debug("debug message"));
}

TEST_CASE("logger::init can be called multiple times", "[logger]") {
  // Re-initialization must be tolerated, and logging must keep working.
  REQUIRE_NOTHROW(logger::init("first"));
  REQUIRE_NOTHROW(logger::init("second"));
  REQUIRE_NOTHROW(logger::info("after re-init"));
}
|
||||
10
packages/kbot/cpp/tests/unit/test_polymech.cpp
Normal file
10
packages/kbot/cpp/tests/unit/test_polymech.cpp
Normal file
@ -0,0 +1,10 @@
|
||||
#include "polymech/polymech.h"
|
||||
#include "postgres/postgres.h"
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
|
||||
// Unit test — no network required
|
||||
TEST_CASE("polymech::fetch_pages throws without init", "[polymech]") {
  // postgres::init has never run in this test binary, so there is no
  // backing connection and the fetch must throw.
  REQUIRE_THROWS(polymech::fetch_pages());
}
|
||||
9
packages/kbot/cpp/tests/unit/test_postgres.cpp
Normal file
9
packages/kbot/cpp/tests/unit/test_postgres.cpp
Normal file
@ -0,0 +1,9 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
|
||||
#include "postgres/postgres.h"
|
||||
|
||||
// Unit tests use a no-op init — no network required
|
||||
TEST_CASE("postgres::ping throws without init", "[postgres]") {
  // No prior postgres::init — ping has no connection and must throw.
  CHECK_THROWS(postgres::ping());
}
|
||||
60
packages/kbot/cpp/tests/unit/test_search.cpp
Normal file
60
packages/kbot/cpp/tests/unit/test_search.cpp
Normal file
@ -0,0 +1,60 @@
|
||||
#include <catch2/catch_test_macros.hpp>
|
||||
#include <catch2/matchers/catch_matchers_floating_point.hpp>
|
||||
#include "search/search.h"
|
||||
|
||||
// ── Config loading ──────────────────────────────────────────────────────────
|
||||
|
||||
// NOTE(review): these config tests read live credentials from
// config/postgres.toml, a file that is gitignored — they will fail on a
// clean checkout or CI runner without that secrets file. Consider guarding
// them with a file-existence skip.
TEST_CASE("Config: loads SERPAPI_KEY from postgres.toml", "[search][config]") {
  const auto cfg = search::load_config("config/postgres.toml");
  REQUIRE(!cfg.serpapi_key.empty());
  REQUIRE(cfg.serpapi_key.size() > 20);  // real keys are long opaque strings
}

TEST_CASE("Config: loads GEO_CODER_KEY from postgres.toml", "[search][config]") {
  const auto cfg = search::load_config("config/postgres.toml");
  REQUIRE(!cfg.geocoder_key.empty());
}

TEST_CASE("Config: loads BIG_DATA_KEY from postgres.toml", "[search][config]") {
  const auto cfg = search::load_config("config/postgres.toml");
  REQUIRE(!cfg.bigdata_key.empty());
}
|
||||
|
||||
TEST_CASE("Config: loads postgres URL", "[search][config]") {
  const auto cfg = search::load_config("config/postgres.toml");
  // The configured database is expected to be Supabase-hosted.
  REQUIRE(cfg.postgres_url.find("supabase.com") != std::string::npos);
}

TEST_CASE("Config: loads supabase URL and service key", "[search][config]") {
  const auto cfg = search::load_config("config/postgres.toml");
  REQUIRE(cfg.supabase_url.find("supabase.co") != std::string::npos);
  REQUIRE(!cfg.supabase_service_key.empty());
}
|
||||
|
||||
TEST_CASE("Config: missing file returns empty config", "[search][config]") {
  // A nonexistent path must yield a default (empty) config, not a throw.
  const auto cfg = search::load_config("nonexistent.toml");
  REQUIRE(cfg.serpapi_key.empty());
  REQUIRE(cfg.postgres_url.empty());
}
|
||||
|
||||
// ── Search validation (no network) ──────────────────────────────────────────
|
||||
|
||||
TEST_CASE("Search: empty key returns error", "[search][validate]") {
|
||||
search::Config cfg; // all empty
|
||||
search::SearchOptions opts;
|
||||
opts.query = "plumbers";
|
||||
|
||||
auto res = search::search_google_maps(cfg, opts);
|
||||
REQUIRE(!res.error.empty());
|
||||
REQUIRE(res.error.find("key") != std::string::npos);
|
||||
}
|
||||
|
||||
TEST_CASE("Search: empty query returns error", "[search][validate]") {
|
||||
search::Config cfg;
|
||||
cfg.serpapi_key = "test_key";
|
||||
search::SearchOptions opts; // empty query
|
||||
|
||||
auto res = search::search_google_maps(cfg, opts);
|
||||
REQUIRE(!res.error.empty());
|
||||
REQUIRE(res.error.find("query") != std::string::npos);
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user