This commit is contained in:
lovebird 2026-03-28 13:11:29 +01:00
parent 966cca2bf1
commit 84f26184d4
58 changed files with 5899 additions and 1014 deletions

2
.gitignore vendored
View File

@ -27,3 +27,5 @@ Thumbs.db
# Logs
*.log
cache/

View File

@ -6,6 +6,12 @@ project(polymech-cli
LANGUAGES CXX C
)
# Collect all runtime artifacts (executables) in <source-root>/dist for every
# configuration, so multi-config generators (VS, Xcode) do not scatter the
# binaries into per-config subdirectories.
# NOTE(review): this writes build output into the SOURCE tree and uses
# CMAKE_SOURCE_DIR (top of the whole build) rather than PROJECT_SOURCE_DIR —
# both break if this project is ever consumed as a subproject; confirm that
# is never the case.
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/dist")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG "${CMAKE_SOURCE_DIR}/dist")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE "${CMAKE_SOURCE_DIR}/dist")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO "${CMAKE_SOURCE_DIR}/dist")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL "${CMAKE_SOURCE_DIR}/dist")
# C++ standard: require C++17 exactly; no silent fallback to an older standard.
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@ -35,7 +41,30 @@ FetchContent_Declare(
GIT_SHALLOW TRUE
)
# Third-party dependencies fetched at configure time.  All pinned to release
# tags with shallow clones to keep the fetch small and reproducible.
FetchContent_Declare(
  asio
  GIT_REPOSITORY https://github.com/chriskohlhoff/asio.git
  GIT_TAG asio-1-28-0
  GIT_SHALLOW TRUE
)
FetchContent_Declare(
  concurrentqueue
  GIT_REPOSITORY https://github.com/cameron314/concurrentqueue.git
  GIT_TAG v1.0.4
  GIT_SHALLOW TRUE
)
FetchContent_Declare(
  taskflow
  GIT_REPOSITORY https://github.com/taskflow/taskflow.git
  GIT_TAG v3.6.0
  GIT_SHALLOW TRUE
)
# Disable taskflow's own tests/examples before it is made available.
# No FORCE: pre-setting the cache entry is enough to pre-empt taskflow's
# option() defaults, while still respecting an explicit -DTF_BUILD_TESTS=ON
# supplied by the user on the command line.
set(TF_BUILD_TESTS OFF CACHE BOOL "Build taskflow tests")
set(TF_BUILD_EXAMPLES OFF CACHE BOOL "Build taskflow examples")
# One MakeAvailable for all declared dependencies (the earlier, pre-change
# duplicate call covering only cli11/tomlplusplus/Catch2 was redundant).
FetchContent_MakeAvailable(cli11 tomlplusplus Catch2 asio concurrentqueue taskflow)
# Packages
# In-tree package libraries; each subdirectory defines a library target that
# the executable links against below.
add_subdirectory(packages/logger)
@ -49,13 +78,29 @@ add_subdirectory(packages/geo)
add_subdirectory(packages/gadm_reader)
add_subdirectory(packages/grid)
add_subdirectory(packages/search)
add_subdirectory(packages/enrichers)

# Sources
# Explicit source list (no file(GLOB)) so that adding a file shows up in the
# diff and reliably triggers a reconfigure.
add_executable(${PROJECT_NAME}
  src/main.cpp
  src/cmd_gridsearch.cpp
  src/cmd_gridsearch-uds.cpp
  src/cmd_gridsearch-postgres.cpp
  src/gridsearch_serialize.cpp
  src/sys_metrics.cpp
)

# Link third-party imported targets plus the in-tree package libraries.
# (The stale pre-change duplicate of this call, without `enrichers`, was
# removed — a single call avoids re-listing every library twice.)
target_link_libraries(${PROJECT_NAME} PRIVATE CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json polymech ipc geo gadm_reader grid search enrichers)

# asio / taskflow / concurrentqueue are consumed header-only via their fetched
# source trees, so add their include roots directly.  PRIVATE: these headers
# do not appear in any interface this executable exposes.
target_include_directories(${PROJECT_NAME} PRIVATE
  ${asio_SOURCE_DIR}/asio/include
  ${taskflow_SOURCE_DIR}
  ${concurrentqueue_SOURCE_DIR}
)

# Define standalone ASIO (non-Boost build) and opt out of its deprecated APIs.
target_compile_definitions(${PROJECT_NAME} PRIVATE ASIO_STANDALONE=1 ASIO_NO_DEPRECATED=1)
# Compiler warnings
if(MSVC)

View File

@ -30,4 +30,10 @@ polymech-cli --version
## License
BSD-3-Clause
## Requirements
- [taskflow](https://github.com/taskflow/taskflow)
- [concurrentqueue](https://github.com/cameron314/concurrentqueue)
- [asio](https://github.com/chriskohlhoff/asio)

6
build-linux.sh Normal file
View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
# Out-of-source Release build of the polymech CLI.
# Usage: build-linux.sh   (paths overridable via SRC_DIR / BUILD_DIR env vars)
#
# Fail fast: abort on any command failure, unset variable, or pipe failure,
# so a failed configure never silently proceeds to the build step.
set -euo pipefail

# Build tree lives in /tmp so the (possibly shared/VM-mounted) source tree is
# never written to.  Uncomment the rm for a fully clean rebuild.
#rm -rf /tmp/polymech-build
BUILD_DIR="${BUILD_DIR:-/tmp/polymech-build}"
# NOTE(review): machine-specific VMware hgfs path — override with SRC_DIR.
SRC_DIR="${SRC_DIR:-/mnt/hgfs/Desktop/polymech/pm-pics/server/cpp}"

mkdir -p "$BUILD_DIR"
# Prefer a snap-installed cmake if one is present.
export PATH="/snap/bin:$PATH"

cmake -S "$SRC_DIR" -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release
cmake --build "$BUILD_DIR"

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_0":"ABW","NAME_0":"Aruba","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_1":"","NAME_1":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_2":"","NAME_2":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[-69.9782,12.46986],[-69.98736,12.48097],[-69.99792,12.47847],[-70.00208,12.48486],[-70.0107,12.48875],[-70.0107,12.49347],[-70.02847,12.50319],[-70.03208,12.51347],[-70.04292,12.51875],[-70.06347,12.53931],[-70.05514,12.55458],[-70.05597,12.55986],[-70.04792,12.56875],[-70.04486,12.58069],[-70.05375,12.6107],[-70.05875,12.61625],[-70.05625,12.62319],[-70.05125,12.62403],[-70.04181,12.61403],[-70.02375,12.60347],[-70.01347,12.58681],[-69.98764,12.55958],[-69.9782,12.55931],[-69.97903,12.55653],[-69.9707,12.55208],[-69.9707,12.54792],[-69.96958,12.55153],[-69.96291,12.54625],[-69.95819,12.54653],[-69.95597,12.5368],[-69.94681,12.5407],[-69.92819,12.52486],[-69.92709,12.51514],[-69.92431,12.51542],[-69.91764,12.50597],[-69.9093,12.50264],[-69.89625,12.48569],[-69.88958,12.48486],[-69.88458,12.47847],[-69.88153,12.46375],[-69.87347,12.44764],[-69.87375,12.43875],[-69.86625,12.4157],[-69.87347,12.41236],[-69.88403,12.41292],[-69.88736,12.42042],[-69.89569,12.42069],[-69.9082,12.43097],[-69.92709,12.43236],[-69.9257,12.43931],[-69.94041,12.4418],[-69.95403,12.45042],[-69.97598,12.46875],[-69.97486,12.47458],[-69.9782,12.46986]]],"type":"Polygon"},"properties":{"GID_5":"","NAME_5":"","ghsBuiltCenter":[-69.99304,12.51234],"ghsBuiltCenters":[[-70.01503,12.50648,8970.0],[-70.05108,12.53423,8710.0],[-69.99892,12.48281,8660.0],[-69.9548,12.45505,7461.0],[-69.89409,12.42486,7435.0]],"ghsBuiltMax":8970.0,"ghsBuiltWeight":22900682.0,"ghsPopCenter":[-69.99866,12.51683],"ghsPopCenters":[[-70.04183,12.53341,104.0],[-69.90443,12.4322,98.0],[-70.01465,12.51627,81.0],[-69.98646,12.52933,51.0],[-69.96467,12.46566,51.0]],"ghsPopMaxDensity":104.0,"ghsPopulation":104847.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496]
,[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_2":"AFG.1.1_1","NAME_2":"Baharak","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496]
,[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496]
,[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.41149,36.55717],[71.40954,36.55237],[71.37395,36.55474],[71.36436,36.55226],[71.31843,36.53446],[71.3019,36.52355],[71.28774,36.52113],[71.28183,36.51721],[71.27595,36.49977],[71.25977,36.48325],[71.24686,36.47709],[71.23186,36.47865],[71.22168,36.4843],[71.20222,36.48003],[71.1881,36.48441],[71.18169,36.49196],[71.1856,36.49435],[71.20061,36.52153],[71.20232,36.53118],[71.19587,36.54671],[71.16729,36.56328],[71.14429,36.58761],[71.12424,36.5924],[71.11617,36.59662],[71.11034,36.60459],[71.10903,36.66565],[71.02873,36.79067],[71.01995,36.80966],[71.01716,36.83664],[71.0187,36.86476],[71.01316,36.88173],[70.99757,36.9066],[70.95702,36.93212],[70.89158,36.95346],[70.85869,36.96816],[70.83394,36.98849],[70.8073,37.00198],[70.78426,37.02771],[70.75385,37.05461],[70.71236,37.07621],[70.7171,37.09387],[70.73001,37.104],[70.8381,37.13387],[70.85306,37.14125],[70.86099,37.15163],[70.85707,37.16642],[70.86703,37.17764],[70.86687,37.18637],[70.85607,37.19467],[70.84901,37.20512],[70.82961,37.2114],[70.80228,37.25319],[70.78971,37.2621],[70.78635,37.27388],[70.78272,37.27678],[70.80682,37.2773],[70.81486,37.28134],[70.81709,37.29346],[70.82154,37.29682],[70.83209,37.29845],[70.84579,37.29279],[70.8627,37.29437],[70.87193,37.29905],[70.87048,37.30951],[70.87566,37.31231],[70.8936,37.31329],[70.90681,37.30573],[70.92742,37.30981],[70.93779,37.32669],[70.95306,37.33867],[70.96316,37.35409],[71.01808,37.36797],[71.02711,37.37658],[71.03407,37.39542],[71.04252,37.40275],[71.05182,37.40542],[71.06761,37.40209],[71.09549,37.38773],[71.1464,37.38029],[71.17564,37.38212],[71.21194,37.39499],[71.23927,37.39724],[71.28146,37.39539],[71.30579,37.39853],[71.31545,37.39558],[71.3167,37.39105],[71.31016,37.38471],[71.29498,37.38023],[71.28695,37.37336],[71.29104,37.36458],[71.28975,37.36018],[71.27915,37.35599],[71.25114,37.35282],[71.22665,37.34412],[71.2082,37.33265],[71.1979,37.31739],[71.19634,37.30693],[71.20239,37.28747],[71.24394,37.26496]
,[71.30699,37.25485],[71.31807,37.24994],[71.30641,37.21708],[71.31552,37.19207],[71.30919,37.16819],[71.30914,37.13688],[71.32497,37.114],[71.33672,37.08014],[71.34899,37.0579],[71.36018,37.02248],[71.39261,36.98609],[71.3889,36.98077],[71.36919,36.97119],[71.35229,36.95037],[71.32862,36.94063],[71.32713,36.93637],[71.35377,36.91397],[71.35461,36.90298],[71.35136,36.88972],[71.33064,36.8632],[71.31949,36.85485],[71.30568,36.85132],[71.25354,36.85461],[71.2429,36.84789],[71.24265,36.83884],[71.24941,36.82845],[71.26183,36.81963],[71.26368,36.81425],[71.25796,36.80269],[71.24998,36.79636],[71.22235,36.78543],[71.21882,36.77991],[71.21972,36.77044],[71.26667,36.74249],[71.27195,36.73764],[71.27148,36.73246],[71.25753,36.73465],[71.25262,36.73069],[71.25707,36.72251],[71.25055,36.71885],[71.23594,36.72063],[71.23033,36.71722],[71.23382,36.70341],[71.22665,36.68107],[71.24262,36.65906],[71.26329,36.64516],[71.29496,36.61458],[71.34565,36.58645],[71.3614,36.58263],[71.37926,36.56792],[71.41149,36.55717]]],"type":"Polygon"},"properties":{"GID_5":"","NAME_5":"","ghsBuiltCenter":[71.10105,37.04904],"ghsBuiltCenters":[[71.13941,37.07362,2693.0],[71.09857,37.04895,2582.0],[71.04391,37.03397,2090.0],[71.06012,36.9142,1765.0],[71.17729,37.056,1322.0]],"ghsBuiltMax":2693.0,"ghsBuiltWeight":229321.0,"ghsPopCenter":[71.09335,37.02337],"ghsPopCenters":[[71.06012,36.9142,1843.0],[71.13941,37.07362,542.0],[71.09857,37.04895,519.0],[71.04391,37.03397,420.0],[71.07823,36.87635,323.0]],"ghsPopMaxDensity":1843.0,"ghsPopulation":56538.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035],[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.954
33,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_2":"AFG.1.2_1","NAME_2":"Darwaz","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0,"isOuter":true},"type":"Feature"}],"type":"FeatureCollect
ion"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035],[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.954
33,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_3":"","NAME_3":"","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1 +0,0 @@
{"features":[{"geometry":{"coordinates":[[[71.2762,38.00465],[71.26561,38.006],[71.25337,38.01428],[71.22517,38.01237],[71.19342,38.01883],[71.17726,38.02659],[71.17789,38.03534],[71.17218,38.03826],[71.16112,38.02494],[71.15066,38.01851],[71.12814,38.01732],[71.11626,38.01307],[71.10431,38.01876],[71.09132,38.01694],[71.08074,38.01983],[71.07189,38.01053],[71.05569,38.00447],[71.03897,37.98],[71.01289,37.96274],[71.00059,37.95081],[71.00089,37.95954],[70.99486,37.96888],[70.95911,37.98344],[70.91583,38.00888],[70.80695,38.05487],[70.78855,38.05737],[70.76753,38.0549],[70.70683,38.03009],[70.68048,38.02772],[70.66789,38.03206],[70.6513,38.04594],[70.63457,38.05439],[70.58826,38.06125],[70.56323,38.07679],[70.54028,38.08434],[70.51607,38.08412],[70.4374,38.06392],[70.41875,38.07549],[70.41838,38.08248],[70.42033,38.08901],[70.43102,38.09893],[70.46798,38.10495],[70.49106,38.12076],[70.505,38.12257],[70.50031,38.14607],[70.50352,38.1644],[70.51794,38.19432],[70.53645,38.20976],[70.53726,38.2244],[70.54602,38.24455],[70.56711,38.26815],[70.60484,38.2833],[70.60899,38.29649],[70.60512,38.30585],[70.61463,38.3245],[70.61066,38.34573],[70.61833,38.35136],[70.6309,38.35353],[70.64323,38.34887],[70.65401,38.3495],[70.65928,38.35102],[70.66563,38.3586],[70.69146,38.36913],[70.69441,38.38095],[70.69186,38.38648],[70.68796,38.38932],[70.67838,38.38737],[70.67419,38.39047],[70.67468,38.40339],[70.67855,38.40888],[70.69831,38.41975],[70.7148,38.4145],[70.74165,38.41995],[70.7566,38.428],[70.77225,38.45615],[70.78256,38.45518],[70.78762,38.45091],[70.80948,38.44415],[70.82104,38.44799],[70.82435,38.45286],[70.82961,38.45212],[70.84317,38.4463],[70.84344,38.44035],[70.8501,38.43874],[70.86002,38.45011],[70.85907,38.45964],[70.86411,38.46035],[70.87345,38.46884],[70.89499,38.46514],[70.90042,38.45963],[70.9007,38.44741],[70.90692,38.44351],[70.9077,38.43891],[70.92131,38.43476],[70.92443,38.4303],[70.93446,38.43529],[70.9354,38.44066],[70.94362,38.44265],[70.94785,38.43847],[70.954
33,38.43819],[70.95847,38.4418],[70.96202,38.45662],[70.94379,38.46688],[70.9472,38.47688],[70.9696,38.47609],[70.98946,38.49041],[70.99661,38.48767],[71.00134,38.47703],[71.00946,38.47116],[71.02091,38.46896],[71.0309,38.46125],[71.0294,38.45271],[71.03682,38.44218],[71.04692,38.41021],[71.05642,38.39895],[71.06659,38.40005],[71.06855,38.40467],[71.06647,38.41275],[71.07633,38.41411],[71.09161,38.42328],[71.10434,38.42241],[71.10783,38.41936],[71.10534,38.40898],[71.11013,38.40584],[71.12121,38.4063],[71.13288,38.39707],[71.14374,38.40255],[71.14958,38.39739],[71.15116,38.38928],[71.16306,38.38721],[71.17526,38.36986],[71.18422,38.34589],[71.1974,38.34302],[71.22577,38.32113],[71.2439,38.31803],[71.2527,38.31032],[71.27956,38.31457],[71.29659,38.31296],[71.31345,38.30438],[71.32798,38.30544],[71.334,38.29334],[71.32713,38.28508],[71.33293,38.28072],[71.33456,38.27015],[71.34693,38.27131],[71.36104,38.26833],[71.37412,38.25563],[71.36751,38.22331],[71.37737,38.21522],[71.37899,38.21031],[71.36478,38.19609],[71.3663,38.17847],[71.37576,38.1608],[71.36604,38.15059],[71.34839,38.14753],[71.33983,38.13181],[71.33657,38.11368],[71.3302,38.11209],[71.32637,38.1029],[71.31956,38.0998],[71.3212,38.07051],[71.31136,38.06038],[71.30648,38.04686],[71.30022,38.04295],[71.29439,38.04479],[71.28365,38.04029],[71.28629,38.03307],[71.29458,38.0294],[71.29507,38.01832],[71.2762,38.00465]]],"type":"Polygon"},"properties":{"GID_4":"","NAME_4":"","ghsBuiltCenter":[70.7694,38.39001],"ghsBuiltCenters":[[70.69373,38.40057,3959.0],[70.81617,38.44315,3473.0],[70.54108,38.13021,1843.0],[70.84193,38.31455,1670.0],[70.69396,38.36864,1491.0]],"ghsBuiltMax":3959.0,"ghsBuiltWeight":140314.0,"ghsPopCenter":[70.79892,38.38974],"ghsPopCenters":[[71.04928,38.40678,3822.0],[70.81617,38.44315,2720.0],[70.69373,38.40057,2541.0],[70.84193,38.31455,1308.0],[70.54108,38.13021,1183.0]],"ghsPopMaxDensity":3822.0,"ghsPopulation":100450.0},"type":"Feature"}],"type":"FeatureCollection"}

View File

@ -1,459 +0,0 @@
# Running Products via Native Node.js Worker Threads
Moving heavy queues (like `ImagesProduct` crunching images via `sharp`, or `LocationsProduct` running grid searches) out of the main Event Loop is essential to preserve API performance and maintain a high Event Loop FPS.
We orchestrate this entirely within Node.js using the native `worker_threads` module, driven by a centralized JSON configuration. No PM2 dependency is required.
---
## Architecture: Config-Driven Worker Spawning
The application topology is defined in `server/config/products.json`. The main thread reads this file on boot. If a product has `"workers" > 0`, the main thread spawns dedicated native `Worker` threads to handle its `pg-boss` background jobs — while still registering the product's HTTP routes on the main thread.
### 1. The Configuration Format (`config/products.json`)
Each product entry specifies:
- **`name`** — maps to a key in `PRODUCT_IMPORTS` in `registry.ts`
- **`enabled`** — whether to load the product at all
- **`workers`** — how many native Worker threads to spawn (0 = run everything on the main thread)
- **`deps`** — informational dependency list
```json
{
"products": [
{ "name": "images", "enabled": true, "workers": 1, "deps": ["serving", "storage"] },
{ "name": "videos", "enabled": true, "workers": 0, "deps": ["serving", "storage"] },
{ "name": "locations", "enabled": true, "workers": 0, "deps": ["serving", "storage"] },
{ "name": "serving", "enabled": true, "workers": 0, "deps": ["images"] },
{ "name": "email", "enabled": true, "workers": 0, "deps": [] },
{ "name": "openai", "enabled": true, "workers": 0, "deps": [] },
{ "name": "analytics", "enabled": true, "workers": 0, "deps": [] },
{ "name": "storage", "enabled": true, "workers": 0, "deps": [] },
{ "name": "ecommerce", "enabled": true, "workers": 0, "deps": ["images"] },
{ "name": "contacts", "enabled": true, "workers": 0, "deps": [] },
{ "name": "campaigns", "enabled": true, "workers": 0, "deps": ["contacts"] },
{ "name": "mcp", "enabled": true, "workers": 0, "deps": ["serving"] }
]
}
```
### 2. Main Thread: The Orchestrator (`src/products/registry.ts`)
Boot-up is split into two phases:
**Phase 1 — `registerProductRoutes(app)`:** Reads `products.json`, lazy-imports only the enabled product modules via a `PRODUCT_IMPORTS` map (avoids importing everything on boot), instantiates them, and registers their HTTP routes on the Hono app.
**Phase 2 — `startProducts(boss)`:** For each product:
- If `workers > 0`, spawns native Worker threads (see §3).
- Always calls `product.start(boss)` on the main thread so the product can register pg-boss queue names and perform local init.
```typescript
// Lazy imports — only loaded when the product is enabled
const PRODUCT_IMPORTS: Record<string, () => Promise<any>> = {
'images': () => import('./images/index.js'),
'videos': () => import('./videos/index.js'),
'locations': () => import('./locations/index.js'),
// ... all 12 products
};
export const startProducts = async (boss?: any) => {
for (const product of instances) {
const pConfig = product.__config;
if (pConfig && pConfig.workers > 0) {
const isDev = process.env.NODE_ENV !== 'production';
// Dev: uses vite-node wrapper to load TS directly
// Prod: uses pre-bundled worker.cjs
let workerEntry = isDev
? path.resolve(process.cwd(), 'src', 'worker_wrapper.mjs')
: path.resolve(process.cwd(), 'worker.cjs');
// The TS entry point the dev wrapper loads via vite-node (see §4)
const workerScript = path.resolve(process.cwd(), 'src', 'worker.ts');
for (let i = 0; i < pConfig.workers; i++) {
const worker = new Worker(workerEntry, {
workerData: { productName: pConfig.name, workerScript }
});
nativeWorkers.push({ id: product.id, worker });
// Forward EventBus events from worker → main thread
worker.on('message', (msg) => {
if (msg?.type === 'event' && msg.name) {
EventBus.emit(msg.name, msg.data);
}
});
}
}
// Main-thread init (HTTP deps, caching, boss queue creation)
await product.start(boss);
}
};
```
### 3. Worker Entrypoint (`src/worker.ts`)
When a Worker thread boots, `worker.ts` is loaded. It reads `workerData.productName`, instantiates the matching product class, and starts only its pg-boss consumers. It does **not** start an HTTP server.
Key responsibilities:
- **PG-Boss queue consumers** — the product's `onStart(boss)` registers workers for its queues.
- **IPC health checks** — responds to `{ type: 'ping' }` messages with `{ type: 'pong', activeJobs, ... }`.
- **IPC job dispatch** — handles `{ type: 'job' }` messages for synchronous request-response via `dispatchToWorker()`.
- **EventBus bridging** — forwards `job:progress`, `job:complete`, and `job:error` events to the parent thread via `parentPort.postMessage()`.
```typescript
// worker.ts (runs inside the Worker thread)
import { workerData, isMainThread, parentPort } from 'worker_threads';
if (isMainThread) throw new Error('Must run inside a Worker thread.');
const ProductClass = PRODUCT_CLASSES[workerData.productName];
const instance = new ProductClass();
// IPC: ping/pong + job dispatch
parentPort.on('message', async (msg) => {
if (msg.type === 'ping') return parentPort.postMessage({ type: 'pong', ... });
if (msg.type === 'job') { /* handleJob → postMessage result */ }
});
// Bridge internal events to parent thread
EventBus.on('job:progress', (data) => parentPort.postMessage({ type: 'event', name: 'job:progress', data }));
EventBus.on('job:complete', (data) => parentPort.postMessage({ type: 'event', name: 'job:complete', data }));
// Start isolated PG-Boss and bind the product
const workerBoss = await startBoss();
await instance.start(workerBoss);
```
### 4. Dev Mode: `worker_wrapper.mjs`
In dev, Worker threads can't inherit `tsx` hooks from the parent process. To support TypeScript directly, a plain `.mjs` bootstrap uses `vite-node`'s programmatic API to load and execute `worker.ts` with full TS resolution:
```javascript
// worker_wrapper.mjs
import { workerData } from 'node:worker_threads';
import { createServer } from 'vite';
import { ViteNodeRunner } from 'vite-node/client';
const server = await createServer({ /* hmr: false, @-alias setup */ });
const runner = new ViteNodeRunner({ root, base, fetchModule, resolveId });
await runner.executeFile(workerData.workerScript);
```
### 5. Smart Consumer Skipping
Products that support pg-boss workers (like `LocationsProduct`) use this pattern in `onStart()` to avoid double-consuming:
```typescript
async onStart(boss?: PgBoss) {
const { isMainThread } = await import('node:worker_threads');
const workersConfig = this.__config?.workers ?? 0;
const shouldConsume = !isMainThread || workersConfig === 0;
for (const WorkerClass of this.workers) {
const worker = new WorkerClass();
await boss.createQueue(worker.queueName);
if (shouldConsume) {
await boss.work(worker.queueName, options, worker.handler.bind(worker));
}
}
}
```
If the product is running with dedicated Worker threads (`workers > 0`), the main thread skips consuming from pg-boss queues — only the Worker threads will consume them.
### 6. IPC Job Dispatch (`src/commons/worker-ipc.ts`)
For synchronous request-response between the main thread and worker threads (e.g., image processing called from an HTTP handler), there is a utility:
```typescript
import { dispatchToWorker, hasWorker } from '@/commons/worker-ipc.js';
// Check if a live worker exists
if (await hasWorker('images')) {
const result = await dispatchToWorker('images', 'process_image', { buffer, ... }, [buffer]);
}
```
- Uses round-robin across multiple worker threads for the same product.
- Supports zero-copy `ArrayBuffer` transfers via the `transferList` parameter.
- Has a configurable timeout (default 30s).
---
## Base Classes
### `AbstractProduct` (`src/products/AbstractProduct.ts`)
All products extend this. Provides:
- `start(boss)` / `stop()` lifecycle hooks
- `handleJob(action, msg)` — for IPC job dispatch from worker threads
- `handleStream()` — SSE streaming helper with cache-checking
- `generateHash()` — deterministic deep-sorted SHA-256 hashing
### `AbstractWorker` (`src/jobs/boss/AbstractWorker.ts`)
PG-Boss queue consumers extend this. Provides:
- `queueName` — the pg-boss queue to consume
- `process(job)` — override with business logic
- `calculateCost(job, result)` — usage metering
- `handler()` — wraps `process()` with error handling and emits `job:complete` / `job:failed`
Worker classes use the `@Worker(queueName)` decorator for registration.
---
## Case Study: `ImagesProduct` — The Canonical Worker-Offloaded Product
`ImagesProduct` (`src/products/images/index.ts`) is currently the **only product running with `workers: 1`** in production. It demonstrates the full IPC lifecycle — from HTTP request through worker dispatch to cached response. It does **not** use `AbstractWorker` or pg-boss queues; instead, it uses the synchronous IPC dispatch pattern via `worker-ipc.ts`.
### The Hybrid Pattern: `hasWorker` + Inline Fallback
Every image processing path checks whether a live worker thread exists. If yes, the heavy `sharp` work is offloaded. If no (e.g., during tests, or if `workers: 0` in config), it falls back to inline processing on the main thread:
```typescript
// src/products/images/index.ts — _ensureCachedImage()
if (await hasWorker('images')) {
// Zero-copy transfer: copy Buffer into a transferable ArrayBuffer
const arrayBuffer = new ArrayBuffer(inputBuffer.length);
new Uint8Array(arrayBuffer).set(inputBuffer);
await dispatchToWorker('images', 'process_image', {
buffer: arrayBuffer, width, height, format, fit
}, [arrayBuffer]); // ← transfer list: moves memory, doesn't clone
} else {
// Inline fallback (same thread)
const pipeline = sharp(inputBuffer).resize({ width, height, fit }).toFormat(format);
await fs.writeFile(filepath, await pipeline.toBuffer());
}
```
This pattern is used in three HTTP handlers:
- **`handlePostImage`** — file upload → resize → cache (or forward to Supabase Storage)
- **`handleRenderImage`** — URL → fetch → resize → serve as binary (used by lazy srcset URLs)
- **`handlePostResponsive`** / **`handleGetResponsive`** — generate multi-format, multi-size srcset variants
### Worker-Side: `handleJob()` Actions
Inside the worker thread, the `ImagesProduct` instance receives IPC job messages and routes them by `action`:
```typescript
// src/products/images/index.ts — handleJob()
async handleJob(action: string, msg: any): Promise<any> {
if (action === 'process_image') {
// Reconstruct Buffer from transferred ArrayBuffer
const inputBuffer = Buffer.from(msg.buffer);
await this.performProcessImage(inputBuffer, filepath, { width, height, format, fit });
return { filename };
}
if (action === 'render_image') {
// Supports square crop, contain fit, etc.
await this.performRenderImage(inputBuffer, filepath, { width, height, format, square, contain });
return { filename };
}
return super.handleJob(action, msg); // Throws for unknown actions
}
```
Both actions write the processed image to the shared `cache/` directory on disk. The main thread then reads the file to serve or forward the response.
### The Responsive Image Pipeline
The responsive endpoint generates multiple width × format variants (e.g., `[180, 640, 1024, 2048] × [avif, webp]`). It splits work between **eager** and **lazy** generation:
| Variant Width | Strategy | What Happens |
|--------------|----------|--------------|
| ≤ 600px | **Eager** | Processed immediately (via worker or inline) and cached to disk. Returns direct cache URL. |
| > 600px | **Lazy** | Returns a dynamic `/api/images/render?url=...&width=...&format=...` URL. Processed on-demand when the browser requests it. |
This avoids eagerly generating large, rarely-used variants for every upload while ensuring small thumbnails are always instant.
### Request Coalescing
When multiple concurrent requests reference the same source URL, `fetchImageCoalesced()` deduplicates them using an in-flight `Map<string, Promise<Buffer>>`. Only one HTTP fetch goes out; all callers share the same Promise.
### Data Flow Summary
```
HTTP Request (main thread)
→ hasWorker('images')? ──yes──→ dispatchToWorker()
│ │
│ ├─ postMessage({ type:'job', action:'render_image', buffer }, [buffer])
│ │ ↓ (zero-copy ArrayBuffer transfer)
│ │ Worker Thread: handleJob('render_image', msg)
│ │ ↓
│ │ sharp(buffer).resize().toFormat().toFile(filepath)
│ │ ↓ (streams directly to disk)
│ └─ postMessage({ type:'job_result', result: { filename } })
│ ↓
│ main thread: fs.readFile(cache/hash.format)
│ ↓
│ return c.redirect() or c.body()
└──no──→ Inline: sharp().resize().toFile(filepath) → serve
```
---
## Why this Pattern is Powerful
1. **Zero PM2 Dependency:** Entirely native to Node.js. Containerization, Nexe builds — nothing changes.
2. **True Multi-Core Utilization:** `worker_threads` run on distinct OS threads. Setting `workers: 2` for `images` dedicates two CPU cores to Sharp.
3. **API Immunity:** Workers have their own V8 heap and Event Loop. A massive image resize will have zero impact on the main API's Event Loop FPS.
4. **EventBus Bridging:** Worker events (progress, completion) are forwarded to the main thread via IPC `postMessage`, enabling real-time SSE streams to API clients.
5. **Dev/Prod Parity:** The `worker_wrapper.mjs` + vite-node setup means TypeScript runs natively in dev worker threads, while production uses pre-bundled JS — same behavior in both environments.
6. **Round-Robin Dispatch:** The `worker-ipc.ts` utility distributes synchronous job requests across multiple threads, enabling true horizontal scaling within a single process.
---
## Constraints & Gotchas (Lessons from Inngest + Our Benchmarks)
Node.js worker threads have real constraints that Go/Rust/Python developers would never expect. The [Inngest post on worker threads](https://www.inngest.com/blog/node-worker-threads) formalizes these well. Here's how each constraint applies to **our** architecture:
### 1. Workers Are NOT Lightweight
Each worker thread is a **full V8 isolate** — its own heap, its own event loop. ~10 MB memory overhead per worker, with tens-of-milliseconds startup cost. This is why our `products.json` caps workers at 1-2 per product, and workers are spawned **once at boot** and persist for the process lifetime. We never create/destroy workers per-job.
### 2. You Can't Pass Logic — Only Messages
Unlike Go goroutines or Rust threads, you can't pass a function to `new Worker()`. The structured clone algorithm can't serialize functions. This is why:
- Our `EventBus` listeners live on the **main thread** — worker threads post `{ type: 'event' }` messages that get bridged to the main-thread EventBus
- Pino `logger` instances can't cross the boundary — worker threads use their own logger
- `pg-boss` connections are per-thread — each worker establishes its own
### 3. Bundler Discovery Is Fragile
Bundlers (webpack) can't statically analyze `new Worker(path)`. Our approach:
- **Dev:** `worker_wrapper.mjs` uses vite-node's `ViteNodeRunner` to resolve TypeScript at runtime
- **Prod:** `build.sh` compiles `worker.ts``worker.cjs` as a separate webpack entry point, and the registry uses `__dirname + '/worker.cjs'` — a plain string the bundler can't trace
Both paths are hardcoded and tested — no dynamic path construction that could break silently.
### 4. Dev-Mode vite-node Overhead (CRITICAL)
Benchmarked 2024-03-24, same 386KB JPEG source at 800px webp:
| Path | Encode Time | Notes |
|------|-------------|-------|
| Worker thread (vite-node) | **3.265s** | IPC + vite-node module transform overhead |
| Main thread (inline) | **0.140s** (140ms) | Direct sharp call, no IPC |
**~23× slower in dev mode via worker thread.** The vite-node `ViteNodeRunner` inside the worker's V8 isolate adds massive overhead for module resolution and transformation. Sharp itself (native C++ addon) runs at the same speed — the cost is entirely in the JS wrapper.
In **production** with pre-bundled `worker.cjs`, the worker thread runs at near-native speed. The overhead is a **dev-only artifact**.
> **Practical implication:** Consider setting `"workers": 0` for `images` during local development to avoid the vite-node penalty. The main thread handles 140ms encodes without impacting dev-server responsiveness.
### 5. No Respawning (Current Gap)
Inngest implements exponential backoff respawning — if a worker thread crashes (unhandled exception, OOM), the main thread detects the `exit` event and spins up a replacement with increasing delay.
**We don't do this yet.** If a worker thread dies, it's gone until a full server restart. The `registry.ts` spawner doesn't watch for `exit` events. This is acceptable for now because:
- Workers are simple (sharp pipeline, no external connections beyond pg-boss)
- Crashes are rare in production
- The inline fallback (`hasWorker() === false`) means the main thread picks up the work
But for robustness, adding respawn-with-backoff to the worker spawner in `registry.ts` would be a good future improvement.
### 6. Elastic Autospawning & Tier-Based Limits (Grid Searches)
Monolithic jobs that process tens of thousands of items (e.g., massive Grid Searches) expose a flaw in static worker pools: **head-of-line blocking**. If all workers are occupied by a massive Enterprise search, Free/Pro users starve.
To solve this we use an **Elastic Autospawn / Fan-Out Architecture**:
1. **Fan-Out (Map-Reduce):** Instead of processing 10,000 grid cells in a single Node.js worker loop, an *Orchestrator* job enumerates the area and splits it into 10,000 individual `gridsearch-cell` jobs pushed to PG-Boss.
2. **Tier-Based Queue Routing/Throttling:** We use PG-Boss `singletonKey` (tied to `userId`) and tier-based concurrency limits (e.g., `teamConcurrency: 5` for Pro vs `20` for Enterprise) to ensure fairness at the database queue level.
3. **Distributed SSE (Pub/Sub):** Because micro-jobs fan out across multiple elastic workers, tying SSE to a local `EventBus` via `parentPort` fails. Instead, workers emit progress via **Postgres `NOTIFY`** or Supabase Realtime channels. The main API process (handling the SSE route) uses `LISTEN` to receive events from any worker on any machine, bridging them back to the user's HTTP stream.
---
## Exploring Native (Rust/C++) Replacements
Given the constraints of V8 Isolates (10MB overhead, slow startup, lack of shared memory serialization), a viable future replacement for CPU-bound or massively concurrent products (like `images` or `locations` grid searches) is replacing Node.js `worker_threads` with **Per-Product Rust or C++ implementations (Binaries or N-API)**.
If a Native (Rust/C++) worker is implemented:
- **Fast Autospawn:** Native binaries spawn in under 1ms. If compiled as an N-API native module (via `napi-rs` or `node-addon-api` for C++), worker execution becomes an effectively instantaneous function call, avoiding V8 Isolate boot entirely.
- **IPC Performance:**
- Subprocesses communicating via raw UNIX socket or `stdout` streams provide near-native memory transfer without the overhead and limits of structured-clone serialization.
- N-API bindings allow direct zero-copy memory (SharedArrayBuffer) access between the main Thread JavaScript and native execution.
- **Memory Efficiency:** A single Native concurrency pipeline scaling to 10,000 asynchronous grid cells uses a fraction of the RAM of dozens of isolated Node.js context engines.
### Side-By-Side Comparison
| Feature | Node.js `worker_threads` | Rust (N-API / Subprocess) | C++ (N-API / Subprocess) |
| :--- | :--- | :--- | :--- |
| **Startup Time** | ~30-50ms (V8 Isolate boot) | **<1ms** (Native / Binary spawn) | **<1ms** (Native / Binary spawn) |
| **Memory per Instance** | High (~10-30MB baseline) | **Minimal** (<2MB) | **Minimal** (<2MB) |
| **IPC Performance** | Slow (`postMessage` Structured Clone) | **High** (Zero-Copy SharedArrayBuffer or MsgPack UDS) | **High** (Zero-Copy SharedArrayBuffer or MsgPack UDS) |
| **Autospawning** | Poor (spawn spikes cause OOM) | **Excellent** | **Excellent** |
| **Development Speed** | Fastest | Slower (Strict compiler, borrow checker) | Slower (Manual compilation, header management) |
| **Memory Safety** | High (V8 Engine) | **High** (Compiler-enforced lifetimes) | Lower (Prone to segfaults / memory leaks) |
| **Ecosystem (Parallelism)** | Limited (libuv threadpool) | **Best-in-class** (Tokio, Rayon) | Strong (std::thread, Boost) |
---
## 7. Messaging: Internal & External Workers (Protobuf)
When moving to an Elastic Autospawn architecture with Native workers, the serialization format and communication transport become the most crucial factors for performance and system integrity.
### Why Protobuf?
While MessagePack over Unix Domain Sockets works, **Protocol Buffers (Protobuf)** offers several distinct advantages, especially when scaling from "Internal Subprocesses" to "External Distributed Workers":
1. **Strict Type Contracts:** Both Node.js (TypeScript) and Native (Rust/C++) share the exact same `.proto` schema. If a payload field is required, the compiler ensures it exists. If the Node.js API changes a field structure, the Native worker fails to compile, preventing silent production parsing errors.
2. **Backwards Compatibility:** Protobuf is inherently designed for evolving APIs without breaking older workers.
3. **RPC Native (gRPC):** As we expand from *Internal Workers* on the same machine to *External Workers* on entirely different physical servers, Protobuf naturally upgrades into gRPC with zero serialization changes.
### The "Dual Model" Architecture
The beauty of standardizing on Protobuf is that the *exact same serialization code* is used regardless of where the worker lives.
#### 1. Internal Workers (Local IPC via Subprocesses)
- **The Scenario:** The main Node.js API process spawns a native Rust/C++ executable as a child process on the **same machine**.
- **The Transport:** Unix Domain Sockets (UDS) / Named Pipes or Standard I/O (stdio). UDS is preferred because it's full-duplex and avoids Node's `stdout` buffering constraints.
- **How it works:**
1. Node.js encodes the `JobPayload` message using the compiled `protobufjs` TypeScript library.
2. Node.js writes the binary payload to the local UNIX Domain Socket (e.g., `/tmp/worker_grid_123.sock`). Because UDS is a TCP-like stream, payloads must be **length-prefixed** (e.g., 4 bytes for length, followed by the Protobuf bytes) so the receiver knows when the message ends.
3. The Rust/C++ subprocess reads the length prefix, reads the exact byte count, and uses `prost` (Rust) or the Google Protobuf C++ library to deserialize instantly.
4. The worker executes the CPU-heavy logic, serializes the `JobResult`, prefixes the length, and streams it back.
#### 2. External Workers (Distributed Execution)
- **The Scenario:** Fanning out 10,000 Grid Search cells across dozens of physical worker nodes to prevent local CPU exhaustion.
- **The Transport:** Pg-Boss / Postgres (or gRPC).
- **How it works:**
1. **The Queue:** The main Node.js process encodes the job payload via Protobuf and saves the raw bytes (or Base64-encoded bytes) into the `pgboss.job` table.
2. **The Fleet:** Hundreds of external Rust/C++ worker nodes connect directly to the database layer (or via a gRPC interface) pulling jobs.
3. **The Decoding:** The remote execution node pulls the binary payload and deserializes the Protobuf bytes. Since the schema is strict, all external workers instantly understand the payload, ensuring perfect schema synchronization across the heterogeneous distributed fleet.
---
## 8. Storage & Database Integrations for Native Workers
Transitioning to Native Autospawning workers heavily impacts how the database and storage layers scale, specifically around connection pooling, payload limits, and blob storage.
### Connection Limits (Supavisor)
If 5,000 autospawned native processes all open distinct `libpq` connections to Postgres, the database will instantly lock up with `FATAL: too many clients`.
**The Rule:** All native workers (whether internal executables or external nodes) *must* connect to Postgres via a connection pooler like **Supavisor** or **PgBouncer**, which transparently multiplexes thousands of transient client connections onto a handful of persistent database connections.
### Event Bus Limits (Postgres NOTIFY)
As established, we use `LISTEN / NOTIFY` to bridge Server-Sent Events (SSE) from the Native workers back to the Node.js API stream.
**The Constraint:** Postgres `NOTIFY` string payloads are hard-limited to **8000 bytes**. You cannot emit massive JSON/Protobuf result arrays over `NOTIFY`. It must only contain progression percentages or tiny metadata.
### Returning Artifacts & Large Results
When a Native worker finishes crunching data, it needs to save the result.
1. **Small Results (JSON/Protobuf < 1MB):**
- The native worker calls the `pg-boss.complete(jobId, protobuf_bytes)` equivalent, storing the payload back in the `pgboss.job` table.
2. **Tabular Results (Big Data):**
- e.g., 50,000 scraped locations from a massive grid cell. The native worker uses the incredibly fast SQL `COPY` command (bulk insert) to slam the data directly into a dedicated Postgres table (e.g., `places`), and completes the `pg-boss` job with an empty payload.
3. **Huge Blobs (Images / Videos / AI Models):**
- The native worker *does not touch Postgres for blobs*. The Node API orchestrator pre-signs a **Supabase Storage Upload URL** and embeds it in the job payload. The Native worker generates the 50MB file and streams it via `libcurl` directly to S3/Supabase Storage, completely bypassing the database stack.
---
## 9. Next-level Abstracting: Embedded Scripting (Lua/WASM)
While writing the *infrastructure layer* (UDS reading, Protobuf decoding, Postgres connection pooling) in strictly-typed Native code (Rust/C++) is essential for performance, writing volatile *business logic* (like search heuristics) in C++ hurts developer velocity and requires constant recompilations.
To solve this we use the **Native Host + Embedded Scripting** pattern:
1. **The Architecture:** We compile a standalone Native Executable (the "Host") in Rust or C++. This host statically embeds a lightweight scripting engine (like **LuaJIT** or a **WASM** runtime like Wasmtime).
2. **Execution:** The Native Host safely handles all the heavy lifting—reading Unix Domain Sockets, managing DB connections, and parsing Protobuf. Once the payload is ready, it passes it into the embedded Lua state or WASM function instance.
3. **The Benefit:** Developers write the actual product logic in high-level Lua (or AssemblyScript for WASM). It executes wildly faster than Node.js (LuaJIT approaches raw C speed) while maintaining the tiny `<2MB` memory footprint, but allows for instant hot-reloading of the scripts without ever running a C++ compiler.

BIN
image.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

60
install-lnx.sh Normal file
View File

@ -0,0 +1,60 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────────────────────
# install-lnx.sh Install build dependencies for polymech-cli on Linux
#
# Tested on: Ubuntu 20.04+ / Debian 11+
# Usage: sudo bash install-lnx.sh
# ─────────────────────────────────────────────────────────────────────────────
# Abort on the first error, on use of unset variables, and on any failure
# inside a pipeline.
set -euo pipefail
echo "── polymech-cli Linux dependency installer ──"
# ── 1. System packages (apt) ─────────────────────────────────────────────────
echo ""
echo "[1/3] Installing system packages via apt …"
# -qq keeps the index refresh quiet; --no-install-recommends keeps the
# footprint minimal. snapd is required for the CMake snap in step 2.
apt-get update -qq
apt-get install -y --no-install-recommends \
build-essential \
gcc \
g++ \
git \
libssl-dev \
pkg-config \
snapd
# ── 2. CMake ≥ 3.20 via snap ────────────────────────────────────────────────
# The project requires cmake_minimum_required(VERSION 3.20).
# Ubuntu 20.04 ships cmake 3.16, so we use the snap package instead.
echo ""
echo "[2/3] Installing CMake via snap (≥ 3.20 required) …"
# Probe the snap binary by absolute path so an older apt-installed cmake
# earlier on $PATH does not mask the check.
if command -v /snap/bin/cmake &>/dev/null; then
echo " cmake snap already installed: $(/snap/bin/cmake --version | head -1)"
else
snap install cmake --classic
echo " Installed: $(/snap/bin/cmake --version | head -1)"
fi
# ── 3. Node.js (for npm run build:linux) ──────────────────────────────────────
echo ""
echo "[3/3] Checking for Node.js / npm …"
# Node is only detected, never installed automatically — the script prints
# installation hints instead so the user can pick nvm or nodesource.
if command -v node &>/dev/null; then
echo " node $(node --version) already installed"
else
echo " Node.js not found. Install via nvm or nodesource, e.g.:"
echo " curl -fsSL https://deb.nodesource.com/setup_20.x | sudo -E bash -"
echo " sudo apt-get install -y nodejs"
fi
# ── Summary ──────────────────────────────────────────────────────────────────
echo ""
echo "── Done! ──"
echo ""
echo "All C++ dependencies (CLI11, tomlplusplus, Catch2, asio, concurrentqueue,"
echo "taskflow, curl, lexbor, rapidjson) are fetched automatically by CMake"
echo "FetchContent at build time — no manual installation needed."
echo ""
echo "To build:"
# NOTE(review): $(dirname "$0") is relative to the invocation path and prints
# "." when the script is run from its own directory — confirm this is intended.
echo " cd $(dirname "$0")"
echo " npm run build:linux"
echo ""
echo "The binary will be placed in: dist/polymech-cli"

View File

@ -6,8 +6,7 @@
*
* Usage:
* import { spawnWorker } from './spawn.mjs';
* const w = await spawnWorker('./build/dev/Debug/polymech-cli.exe');
* const res = await w.request({ type: 'ping' });
* const w = await spawnWorker('./dist/polymech-cli.exe');
* console.log(res); // { id: '...', type: 'pong', payload: {} }
* await w.shutdown();
*/
@ -71,6 +70,9 @@ export function spawnWorker(exePath, args = ['worker']) {
// Pending request map: id → { resolve, reject, timer }
const pending = new Map();
// Event handler for unmatched messages (progress events, etc.)
let eventHandler = null;
let readyResolve;
const ready = new Promise((resolve) => { readyResolve = resolve; });
@ -97,8 +99,12 @@ export function spawnWorker(exePath, args = ['worker']) {
return;
}
// Unmatched message (event, broadcast, etc.)
console.log('[orchestrator] unmatched message:', msg);
// Unmatched message (progress event, broadcast, etc.)
if (eventHandler) {
eventHandler(msg);
} else {
console.log('[orchestrator] unmatched message:', msg);
}
});
proc.stdout.on('data', feedData);
@ -148,5 +154,6 @@ export function spawnWorker(exePath, args = ['worker']) {
kill: () => proc.kill(),
process: proc,
ready,
onEvent: (handler) => { eventHandler = handler; },
};
}

View File

@ -0,0 +1,204 @@
/**
* orchestrator/test-gridsearch-ipc.mjs
*
* E2E test: spawn the C++ worker, send a gridsearch request
* matching `npm run gridsearch:enrich` defaults, collect IPC events,
* and verify the full event sequence.
*
* Run: node orchestrator/test-gridsearch-ipc.mjs
* Needs: npm run build-debug (or npm run build)
*/
import { spawnWorker } from './spawn.mjs';
import { resolve, dirname } from 'node:path';
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// Resolve the path of the compiled worker binary relative to this script:
// <repo>/dist/polymech-cli(.exe on Windows).
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Fail fast with a build hint if the binary has not been produced yet.
if (!fs.existsSync(EXE)) {
console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
process.exit(1);
}
console.log(`Binary: ${EXE}\n`);
// Load the sample settings (same as gridsearch:enrich)
const sampleConfig = JSON.parse(
readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);
let passed = 0;
let failed = 0;
function assert(condition, label) {
if (condition) {
console.log(`${label}`);
passed++;
} else {
console.error(`${label}`);
failed++;
}
}
// ── Event collector ─────────────────────────────────────────────────────────
// Event types the gridsearch pipeline is expected to emit, roughly in order.
// NOTE(review): this constant is not referenced anywhere else in this file —
// it appears to be documentation-only (or leftover); confirm before removing.
const EXPECTED_EVENTS = [
'grid-ready',
'waypoint-start',
'area',
'location',
'enrich-start',
'node',
'nodePage',
// 'node-error' — may or may not occur, depends on network
];
/**
 * Build an event collector for the worker's IPC stream.
 *
 * Returns `{ events, handler }`: `events` maps every known event type to an
 * array (pre-seeded empty so assertions can read `.length` safely), and
 * `handler(msg)` buckets each incoming message by `msg.type` — unknown types
 * get a fresh bucket — then paints a one-line progress indicator for the
 * types that carry user-visible progress.
 */
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = Object.fromEntries(KNOWN_TYPES.map((t) => [t, []]));

  // Side-effect only: live single-line progress on stdout (\r overwrites).
  const showProgress = (type, d) => {
    switch (type) {
      case 'waypoint-start':
        process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
        break;
      case 'node':
        process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
        break;
      case 'node-error':
        process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
        break;
    }
  };

  return {
    events,
    handler(msg) {
      (events[msg.type] ??= []).push(msg);
      showProgress(msg.type, msg.payload ?? {});
    },
  };
}
// ── Main test ───────────────────────────────────────────────────────────────
/**
 * E2E driver: spawns the worker over stdio IPC, runs a full gridsearch with
 * enrichment, then verifies the final job_result payload, the event stream
 * collected during the run, and a clean shutdown. Exits the process with
 * code 1 if any assertion failed.
 */
async function run() {
console.log('🧪 Gridsearch IPC E2E Test\n');
// ── 1. Spawn worker ───────────────────────────────────────────────────
console.log('1. Spawn worker in daemon mode');
// NOTE(review): the --user-uid value is a hard-coded test UUID — confirm it
// matches a fixture the worker accepts.
const worker = spawnWorker(EXE, ['worker', '--daemon', '--user-uid', '3bb4cfbf-318b-44d3-a9d3-35680e738421']);
const readyMsg = await worker.ready;
assert(readyMsg.type === 'ready', 'Worker sends ready signal');
// ── 2. Register event collector ───────────────────────────────────────
// Unmatched (non-request) messages are routed to the collector via onEvent.
const collector = createCollector();
worker.onEvent(collector.handler);
// ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
const t0 = Date.now();
// Very long timeout — enrichment can take minutes
const result = await worker.request(
{
type: 'gridsearch',
payload: {
...sampleConfig,
enrich: true,
},
},
5 * 60 * 1000 // 5 min timeout
);
const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);
// ── 4. Verify final result ────────────────────────────────────────────
console.log('3. Verify job_result');
assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);
const summary = result.payload ?? null;
assert(summary !== null, 'job_result payload is present');
if (summary) {
assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
}
// ── 5. Verify event sequence ──────────────────────────────────────────
console.log('4. Verify event stream');
const e = collector.events;
assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
// Every waypoint search must complete with a matching area summary.
assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);
// node-error is network-dependent, so ok + error together must be > 0.
const totalNodes = e['node'].length + e['node-error'].length;
assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);
// Validate grid-ready payload
if (e['grid-ready'].length > 0) {
const gr = e['grid-ready'][0].payload ?? {};
assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
}
// Validate location events have required fields
if (e['location'].length > 0) {
const loc = e['location'][0].payload ?? {};
assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
assert(typeof loc.areaName === 'string', 'location event has areaName');
}
assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);
// Validate node payloads
if (e['node'].length > 0) {
const nd = e['node'][0].payload ?? {};
assert(typeof nd.placeId === 'string', 'node event has placeId');
assert(typeof nd.title === 'string', 'node event has title');
assert(Array.isArray(nd.emails), 'node event has emails array');
assert(typeof nd.status === 'string', 'node event has status');
}
// ── 6. Print event summary ────────────────────────────────────────────
console.log('\n5. Event summary');
for (const [type, arr] of Object.entries(e)) {
if (arr.length > 0) console.log(` ${type}: ${arr.length}`);
}
// ── 7. Shutdown ───────────────────────────────────────────────────────
console.log('\n6. Graceful shutdown');
const shutdownRes = await worker.shutdown();
assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');
// Give the child process a moment to actually exit before reading exitCode.
await new Promise(r => setTimeout(r, 500));
assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);
// ── Summary ───────────────────────────────────────────────────────────
console.log(`\n────────────────────────────────`);
console.log(` Passed: ${passed} Failed: ${failed}`);
console.log(`────────────────────────────────\n`);
process.exit(failed > 0 ? 1 : 0);
}
// Surface any unhandled error from the async driver and fail the run.
run().catch((err) => {
console.error('Test runner error:', err);
process.exit(1);
});

View File

@ -0,0 +1,218 @@
/**
* orchestrator/test-gridsearch-ipc-uds-meta.mjs
*
* E2E test for Unix Domain Sockets / Windows Named Pipes (Meta Enrichment)!
* Spawns the worker in `--uds` mode and tests direct high-throughput
* lock-free JSON binary framing over a net.Socket.
*/
import { spawn } from 'node:child_process';
import { resolve, dirname, join } from 'node:path';
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import net from 'node:net';
import { tmpdir } from 'node:os';
// Resolve the compiled worker binary path: <repo>/dist/polymech-cli(.exe).
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// NOTE(review): TEST_CANCEL is never referenced in this file — likely copied
// from the non-meta UDS test; confirm before removing.
const TEST_CANCEL = false;
if (!existsSync(EXE)) {
console.error(`❌ Binary not found at ${EXE}`);
process.exit(1);
}
// On Windows the worker gets a TCP port; on POSIX it gets a socket path in
// the temp dir. A stale socket file from a previous run is removed first.
const PIPE_NAME = 'polymech-test-uds-meta';
const CPP_UDS_ARG = IS_WIN ? '4001' : join(tmpdir(), `${PIPE_NAME}.sock`);
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
unlinkSync(CPP_UDS_ARG);
}
console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
// ── Event collector ─────────────────────────────────────────────────────────
/**
 * Build an event collector for the UDS stream.
 *
 * Returns `{ events, onComplete, handler }`: `events` maps each known event
 * type to an array (pre-seeded empty), `onComplete` is an optional callback
 * the caller may set, and `handler(msg)` buckets messages by `msg.type`
 * (unknown types get a fresh bucket), paints a progress line, and — when a
 * `job_result` arrives — invokes `this.onComplete`, so `handler` must be
 * called as a method of the returned object.
 */
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
    'enrich-start', 'node', 'node-error', 'nodePage', 'job_result'];
  const events = Object.fromEntries(KNOWN_TYPES.map((t) => [t, []]));
  return {
    events,
    // Set by the caller; fired once when the pipeline reports job_result.
    onComplete: null,
    handler(msg) {
      (events[msg.type] ??= []).push(msg);
      const d = msg.data ?? {};
      switch (msg.type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
        case 'job_result':
          console.log(`\n 🏁 Pipeline complete!`);
          if (this.onComplete) this.onComplete(msg);
          break;
      }
    },
  };
}
let passed = 0;
let failed = 0;
function assert(condition, label) {
if (condition) { console.log(`${label}`); passed++; }
else { console.error(`${label}`); failed++; }
}
/**
 * E2E driver for the UDS meta-enrichment path: spawns the worker daemon in
 * --uds mode, connects a net.Socket, streams the length-prefixed gridsearch
 * payload, waits for job_result (or a 5-minute timeout), then asserts on the
 * collected event stream, social-profile and markdown-site enrichment, and
 * exits 1 if any assertion failed.
 */
async function run() {
console.log('🧪 Gridsearch UDS Meta E2E Test\n');
// 1. Spawn worker in UDS mode
console.log('1. Spawning remote C++ Taskflow Daemon');
const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });
// Give the daemon a moment to boot
console.log('2. Connecting net.Socket with retries...');
// Retry up to 15 times at 500ms intervals; re-throw the last error only.
let socket;
for (let i = 0; i < 15; i++) {
try {
await new Promise((resolve, reject) => {
if (IS_WIN) {
socket = net.connect({ port: 4001, host: '127.0.0.1' });
} else {
socket = net.connect(CPP_UDS_ARG);
}
socket.once('connect', resolve);
socket.once('error', reject);
});
console.log(' ✅ Socket Connected to UDS!');
break;
} catch (e) {
if (i === 14) throw e;
await new Promise(r => setTimeout(r, 500));
}
}
const collector = createCollector();
let buffer = Buffer.alloc(0);
// Buffer framing logic (length-prefixed streaming): each message is a
// 4-byte little-endian length followed by that many bytes of UTF-8 JSON.
socket.on('data', (chunk) => {
buffer = Buffer.concat([buffer, chunk]);
while (buffer.length >= 4) {
const len = buffer.readUInt32LE(0);
if (buffer.length >= 4 + len) {
const payload = buffer.toString('utf8', 4, 4 + len);
buffer = buffer.subarray(4 + len);
try {
const msg = JSON.parse(payload);
collector.handler(msg);
} catch (e) {
console.error("JSON PARSE ERROR:", e, payload);
}
} else {
break; // Wait for more chunks
}
}
});
// 3. Send Gridsearch payload
// USE gridsearch-sample.json instead of gridsearch-bcn-universities.json
const sampleConfig = JSON.parse(
readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);
sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
sampleConfig.jobId = 'uds-meta-test-abc';
sampleConfig.noCache = true; // force re-enrichment even if cached
console.log('3. Writing serialized IPC Payload over pipe...');
// Outbound messages use the same 4-byte-length + JSON framing.
const jsonStr = JSON.stringify(sampleConfig);
const lenBuf = Buffer.alloc(4);
lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
socket.write(lenBuf);
socket.write(jsonStr);
// 4. Wait for pipeline completion (job_result event) or timeout
console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');
await new Promise((resolve) => {
collector.onComplete = () => {
// Send stop command to gracefully shut down the daemon
console.log(' 📤 Sending stop command to daemon...');
const stopPayload = JSON.stringify({ action: 'stop' });
const stopLen = Buffer.alloc(4);
stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
socket.write(stopLen);
socket.write(stopPayload);
setTimeout(resolve, 1000); // Give daemon a moment to ack
};
// Safety timeout
setTimeout(() => {
console.log('\n ⏰ Timeout reached (300s) — forcing shutdown.');
resolve();
}, 300000); // Wait up to 5 minutes
});
console.log('\n\n5. Event summary');
for (const [k, v] of Object.entries(collector.events)) {
console.log(` ${k}: ${v.length}`);
}
// Assertions
const ev = collector.events;
assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
assert(ev['location'].length > 0, 'location events received');
assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
assert(ev['job_result'].length === 1, 'job_result emitted once');
// Verify social profiles and md body
const nodes = ev['node'];
let foundSocial = false;
let foundSiteMd = false;
for (const n of nodes) {
const d = n.data;
if (!d) continue;
if (d.socials && d.socials.length > 0) {
foundSocial = true;
}
if (d.sites && Array.isArray(d.sites) && d.sites.length > 0) {
foundSiteMd = true;
}
}
// Social discovery is data-dependent, so its absence is a warning, not a
// failure; markdown site mapping is asserted unconditionally.
if (foundSocial) {
assert(foundSocial, 'At least one enriched node has social media profiles discovered');
} else {
console.log(' ⚠️ No social media profiles discovered in this run (data-dependent), but pipeline completed.');
}
assert(foundSiteMd, 'At least one enriched node has markdown sites mapped');
console.log('6. Cleanup');
socket.destroy();
worker.kill('SIGTERM');
console.log(`\n────────────────────────────────`);
console.log(` Passed: ${passed} Failed: ${failed}`);
console.log(`────────────────────────────────`);
process.exit(failed > 0 ? 1 : 0);
}
// Surface any unhandled error from the async driver and fail the run.
run().catch(e => {
console.error(e);
process.exit(1);
});

View File

@ -0,0 +1,255 @@
/**
* orchestrator/test-gridsearch-ipc-uds.mjs
*
* E2E test for Unix Domain Sockets / Windows Named Pipes!
* Spawns the worker in `--uds` mode and tests direct high-throughput
* lock-free JSON binary framing over a net.Socket.
*/
import { spawn } from 'node:child_process';
import { resolve, dirname, join } from 'node:path';
import { readFileSync, existsSync, unlinkSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import net from 'node:net';
import { tmpdir } from 'node:os';
// Resolve the compiled worker binary path: <repo>/dist/polymech-cli(.exe).
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// When true, a cancel command is sent 5s into the run to exercise dynamic
// job cancellation (see the TEST_CANCEL branch inside run()).
const TEST_CANCEL = false;
if (!existsSync(EXE)) {
console.error(`❌ Binary not found at ${EXE}`);
process.exit(1);
}
// On Windows the worker gets a TCP port; on POSIX it gets a socket path in
// the temp dir. A stale socket file from a previous run is removed first.
const PIPE_NAME = 'polymech-test-uds';
const CPP_UDS_ARG = IS_WIN ? '4000' : join(tmpdir(), `${PIPE_NAME}.sock`);
if (!IS_WIN && existsSync(CPP_UDS_ARG)) {
unlinkSync(CPP_UDS_ARG);
}
console.log(`Binary: ${EXE}`);
console.log(`C++ Arg: ${CPP_UDS_ARG}\n`);
// ── Event collector ─────────────────────────────────────────────────────────
/**
 * Build an event collector for the UDS stream.
 *
 * The returned object exposes `events` (a map of event type → message
 * array, pre-seeded empty for all known types), a caller-settable
 * `onComplete` hook, and `handler(msg)` which buckets messages by type
 * (creating buckets for unknown types), emits a live progress line, and
 * fires `this.onComplete` upon receiving `job_result`. Because `handler`
 * reads `this.onComplete`, it must be invoked as `collector.handler(msg)`.
 */
function createCollector() {
  const buckets = {};
  ['grid-ready', 'waypoint-start', 'area', 'location',
   'enrich-start', 'node', 'node-error', 'nodePage', 'job_result']
    .forEach((t) => { buckets[t] = []; });
  return {
    events: buckets,
    // Optional completion callback, assigned by the caller.
    onComplete: null,
    handler(msg) {
      (buckets[msg.type] ??= []).push(msg);
      const d = msg.data ?? {};
      switch (msg.type) {
        case 'waypoint-start':
          process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
          break;
        case 'node':
          process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
          break;
        case 'node-error':
          process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
          break;
        case 'job_result':
          console.log(`\n 🏁 Pipeline complete!`);
          if (this.onComplete) this.onComplete(msg);
          break;
      }
    },
  };
}
let passed = 0;
let failed = 0;
function assert(condition, label) {
if (condition) { console.log(`${label}`); passed++; }
else { console.error(`${label}`); failed++; }
}
/**
 * E2E driver for the UDS/Named-Pipe path: spawns the worker daemon in --uds
 * mode, connects a net.Socket with retries, streams the length-prefixed
 * gridsearch payload (optionally followed by a cancel command), waits for
 * job_result or a 2-minute timeout, then asserts on the event stream and
 * the filterTypes post-conditions. Exits 1 if any assertion failed.
 */
async function run() {
console.log('🧪 Gridsearch UDS / Named Pipe E2E Test\n');
// 1. Spawn worker in UDS mode
console.log('1. Spawning remote C++ Taskflow Daemon');
const worker = spawn(EXE, ['worker', '--uds', CPP_UDS_ARG, '--daemon'], { stdio: 'inherit' });
// Give the daemon a moment to boot
console.log('2. Connecting net.Socket with retries...');
// Retry up to 15 times at 500ms intervals; re-throw the last error only.
let socket;
for (let i = 0; i < 15; i++) {
try {
await new Promise((resolve, reject) => {
if (IS_WIN) {
socket = net.connect({ port: 4000, host: '127.0.0.1' });
} else {
socket = net.connect(CPP_UDS_ARG);
}
socket.once('connect', resolve);
socket.once('error', reject);
});
console.log(' ✅ Socket Connected to UDS!');
break;
} catch (e) {
if (i === 14) throw e;
await new Promise(r => setTimeout(r, 500));
}
}
const collector = createCollector();
let buffer = Buffer.alloc(0);
// Buffer framing logic (length-prefixed streaming): each message is a
// 4-byte little-endian length followed by that many bytes of UTF-8 JSON.
socket.on('data', (chunk) => {
buffer = Buffer.concat([buffer, chunk]);
while (buffer.length >= 4) {
const len = buffer.readUInt32LE(0);
if (buffer.length >= 4 + len) {
const payload = buffer.toString('utf8', 4, 4 + len);
buffer = buffer.subarray(4 + len);
try {
const msg = JSON.parse(payload);
collector.handler(msg);
} catch (e) {
console.error("JSON PARSE ERROR:", e, payload);
}
} else {
break; // Wait for more chunks
}
}
});
// 3. Send Gridsearch payload
const sampleConfig = JSON.parse(
readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-bcn-universities.json'), 'utf8')
);
sampleConfig.configPath = resolve(__dirname, '..', 'config', 'postgres.toml');
sampleConfig.jobId = 'uds-test-cancel-abc';
console.log('3. Writing serialized IPC Payload over pipe...');
// Outbound messages use the same 4-byte-length + JSON framing.
const jsonStr = JSON.stringify(sampleConfig);
const lenBuf = Buffer.alloc(4);
lenBuf.writeUInt32LE(Buffer.byteLength(jsonStr));
socket.write(lenBuf);
socket.write(jsonStr);
// Send cancellation after 5 seconds
if (TEST_CANCEL) {
setTimeout(() => {
console.log('\n\n--> Testing Dynamic Cancellation (Sending cancel event for uds-test-cancel-abc)...');
const cancelPayload = JSON.stringify({ action: "cancel", jobId: "uds-test-cancel-abc" });
const cancelLenBuf = Buffer.alloc(4);
cancelLenBuf.writeUInt32LE(Buffer.byteLength(cancelPayload));
socket.write(cancelLenBuf);
socket.write(cancelPayload);
}, 5000);
}
// 4. Wait for pipeline completion (job_result event) or timeout
console.log('\n4. Awaiting multi-threaded Execution Pipeline (can take minutes)...\n');
await new Promise((resolve) => {
collector.onComplete = () => {
// Send stop command to gracefully shut down the daemon
console.log(' 📤 Sending stop command to daemon...');
const stopPayload = JSON.stringify({ action: 'stop' });
const stopLen = Buffer.alloc(4);
stopLen.writeUInt32LE(Buffer.byteLength(stopPayload));
socket.write(stopLen);
socket.write(stopPayload);
setTimeout(resolve, 1000); // Give daemon a moment to ack
};
// Safety timeout
setTimeout(() => {
console.log('\n ⏰ Timeout reached (120s) — forcing shutdown.');
resolve();
}, 120000);
});
console.log('\n\n5. Event summary');
for (const [k, v] of Object.entries(collector.events)) {
console.log(` ${k}: ${v.length}`);
}
// Assertions
const ev = collector.events;
assert(ev['grid-ready'].length === 1, 'grid-ready emitted once');
assert(ev['waypoint-start'].length > 0, 'waypoint-start events received');
assert(ev['location'].length > 0, 'location events received');
assert(ev['enrich-start'].length === 1, 'enrich-start emitted once');
assert(ev['job_result'].length === 1, 'job_result emitted once');
// Check enrichment skip log (if present in log events)
// NOTE(review): 'log' buckets only exist if the worker actually emitted
// log-type messages (the collector creates buckets for unknown types).
const logEvents = ev['log'] ?? [];
const skipLog = logEvents.find(l =>
typeof l.data === 'string' && l.data.includes('already enriched')
);
const nodeCount = ev['node'].length + ev['node-error'].length;
if (skipLog) {
console.log(` Pre-enrich skip detected: ${skipLog.data}`);
assert(nodeCount === 0, 'no enrichment needed (all skipped)');
} else {
console.log(' No pre-enrich skips (all locations are new or unenriched)');
assert(nodeCount > 0, 'enrichment node events received');
}
// Check filterTypes assertions: all locations must have website + matching type
const FILTER_TYPE = 'Recycling center';
const locations = ev['location'];
const badWebsite = locations.filter(l => {
const loc = l.data?.location;
return !loc?.website;
});
assert(badWebsite.length === 0, `all locations have website (${badWebsite.length} missing)`);
// A location matches if FILTER_TYPE appears in its types array OR equals
// its primary type string.
const badType = locations.filter(l => {
const loc = l.data?.location;
const types = loc?.types ?? [];
const type = loc?.type ?? '';
return !types.includes(FILTER_TYPE) && type !== FILTER_TYPE;
});
if (badType.length > 0) {
console.log(` ❌ Mismatched locations:`);
badType.slice(0, 3).forEach(l => console.log(JSON.stringify(l.data?.location, null, 2)));
}
assert(badType.length === 0, `all locations match type "${FILTER_TYPE}" (${badType.length} mismatched)`);
const filterLog = logEvents.find(l =>
typeof l.data === 'string' && l.data.includes('locations removed')
);
if (filterLog) {
console.log(` Filter applied: ${filterLog.data}`);
}
const filterTypesLog = logEvents.filter(l =>
typeof l.data === 'string' && (l.data.includes('filterTypes:') || l.data.includes(' - '))
);
if (filterTypesLog.length > 0) {
console.log(` Parsed filterTypes in C++:`);
filterTypesLog.forEach(l => console.log(` ${l.data}`));
}
console.log(` Locations after filter: ${locations.length}`);
console.log('6. Cleanup');
socket.destroy();
worker.kill('SIGTERM');
console.log(`\n────────────────────────────────`);
console.log(` Passed: ${passed} Failed: ${failed}`);
console.log(`────────────────────────────────`);
process.exit(failed > 0 ? 1 : 0);
}
// Surface any unhandled error from the async driver and fail the run.
run().catch(e => {
console.error(e);
process.exit(1);
});

View File

@ -0,0 +1,204 @@
/**
* orchestrator/test-gridsearch-ipc.mjs
*
* E2E test: spawn the C++ worker, send a gridsearch request
* matching `npm run gridsearch:enrich` defaults, collect IPC events,
* and verify the full event sequence.
*
* Run: node orchestrator/test-gridsearch-ipc.mjs
* Needs: npm run build-debug (or npm run build)
*/
import { spawnWorker } from './spawn.mjs';
import { resolve, dirname } from 'node:path';
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import fs from 'node:fs';
// Resolve the worker binary relative to this script: <repo>/dist/polymech-cli(.exe).
const __dirname = dirname(fileURLToPath(import.meta.url));
const IS_WIN = process.platform === 'win32';
const EXE_NAME = IS_WIN ? 'polymech-cli.exe' : 'polymech-cli';
const EXE = resolve(__dirname, '..', 'dist', EXE_NAME);
// Fail fast with a build hint when the binary has not been produced yet.
if (!fs.existsSync(EXE)) {
  console.error(`❌ No ${EXE_NAME} found in dist. Run npm run build first.`);
  process.exit(1);
}
console.log(`Binary: ${EXE}\n`);
// Load the sample settings (same as gridsearch:enrich)
const sampleConfig = JSON.parse(
  readFileSync(resolve(__dirname, '..', 'config', 'gridsearch-sample.json'), 'utf8')
);
// Running tallies maintained by assert(); the process exit code is
// derived from `failed` at the end of run().
let passed = 0;
let failed = 0;
// Record one pass/fail check: log the label (stdout on pass, stderr on
// fail) and bump the matching module-level counter.
function assert(condition, label) {
  const ok = Boolean(condition);
  const sink = ok ? console.log : console.error;
  sink(`${label}`);
  if (ok) {
    passed++;
  } else {
    failed++;
  }
}
// ── Event collector ─────────────────────────────────────────────────────────
// Event types the worker is expected to emit during a gridsearch run.
// NOTE(review): this constant is not referenced anywhere else in this
// module — the assertions in run() enumerate the types directly. Either
// wire it into the verification loop or remove it; confirm before deleting.
const EXPECTED_EVENTS = [
  'grid-ready',
  'waypoint-start',
  'area',
  'location',
  'enrich-start',
  'node',
  'nodePage',
  // 'node-error' — may or may not occur, depends on network
];
// Build an event collector: `events` maps each IPC event type to the list
// of messages received for it (unknown types get a bucket on first sight),
// and `handler` is the callback to register with worker.onEvent(). Also
// writes a one-line live progress indicator for the noisy event types.
function createCollector() {
  const KNOWN_TYPES = ['grid-ready', 'waypoint-start', 'area', 'location',
                       'enrich-start', 'node', 'node-error', 'nodePage'];
  const events = {};
  KNOWN_TYPES.forEach((t) => { events[t] = []; });
  const handler = (msg) => {
    const t = msg.type;
    if (!events[t]) events[t] = [];
    events[t].push(msg);
    // Live progress indicator
    const d = msg.payload ?? {};
    if (t === 'waypoint-start') {
      process.stdout.write(`\r 🔍 Searching waypoint ${(d.index ?? 0) + 1}/${d.total ?? '?'}...`);
    } else if (t === 'node') {
      process.stdout.write(`\r 📧 Enriched: ${d.title?.substring(0, 40) ?? ''} `);
    } else if (t === 'node-error') {
      process.stdout.write(`\r ⚠️ Error: ${d.node?.title?.substring(0, 40) ?? ''} `);
    }
  };
  return { events, handler };
}
// ── Main test ───────────────────────────────────────────────────────────────
/**
 * Full E2E flow: spawn the C++ worker, send one enriched gridsearch
 * request, then verify (a) the final job_result summary shape and
 * (b) the streamed event sequence collected while it ran. Exits the
 * process with 0 iff every assert() passed.
 */
async function run() {
  console.log('🧪 Gridsearch IPC E2E Test\n');
  // ── 1. Spawn worker ───────────────────────────────────────────────────
  console.log('1. Spawn worker');
  const worker = spawnWorker(EXE);
  const readyMsg = await worker.ready;
  assert(readyMsg.type === 'ready', 'Worker sends ready signal');
  // ── 2. Register event collector ───────────────────────────────────────
  // Must be registered before the request so no early events are missed.
  const collector = createCollector();
  worker.onEvent(collector.handler);
  // ── 3. Send gridsearch request (matching gridsearch:enrich) ────────────
  console.log('2. Send gridsearch request (Aruba / recycling / --enrich)');
  const t0 = Date.now();
  // Very long timeout — enrichment can take minutes
  const result = await worker.request(
    {
      type: 'gridsearch',
      payload: {
        ...sampleConfig,
        enrich: true,
      },
    },
    5 * 60 * 1000 // 5 min timeout
  );
  const elapsed = ((Date.now() - t0) / 1000).toFixed(1);
  console.log(`\n\n ⏱️ Completed in ${elapsed}s\n`);
  // ── 4. Verify final result ────────────────────────────────────────────
  // The summary payload mirrors the C++ gridsearch serializer: timing,
  // API-call counts, and nested grid/search statistics.
  console.log('3. Verify job_result');
  assert(result.type === 'job_result', `Response type is "job_result" (got "${result.type}")`);
  const summary = result.payload ?? null;
  assert(summary !== null, 'job_result payload is present');
  if (summary) {
    assert(typeof summary.totalMs === 'number', `totalMs is number (${summary.totalMs})`);
    assert(typeof summary.searchMs === 'number', `searchMs is number (${summary.searchMs})`);
    assert(typeof summary.enrichMs === 'number', `enrichMs is number (${summary.enrichMs})`);
    assert(typeof summary.freshApiCalls === 'number', `freshApiCalls is number (${summary.freshApiCalls})`);
    assert(typeof summary.waypointCount === 'number', `waypointCount is number (${summary.waypointCount})`);
    assert(summary.gridStats && typeof summary.gridStats.validCells === 'number', 'gridStats.validCells present');
    assert(summary.searchStats && typeof summary.searchStats.totalResults === 'number', 'searchStats.totalResults present');
    assert(typeof summary.enrichedOk === 'number', `enrichedOk is number (${summary.enrichedOk})`);
    assert(typeof summary.enrichedTotal === 'number', `enrichedTotal is number (${summary.enrichedTotal})`);
  }
  // ── 5. Verify event sequence ──────────────────────────────────────────
  console.log('4. Verify event stream');
  const e = collector.events;
  assert(e['grid-ready'].length === 1, `Exactly 1 grid-ready event (got ${e['grid-ready'].length})`);
  assert(e['waypoint-start'].length > 0, `At least 1 waypoint-start event (got ${e['waypoint-start'].length})`);
  assert(e['area'].length > 0, `At least 1 area event (got ${e['area'].length})`);
  // Each searched waypoint must produce exactly one area summary.
  assert(e['waypoint-start'].length === e['area'].length, `waypoint-start count (${e['waypoint-start'].length}) === area count (${e['area'].length})`);
  assert(e['enrich-start'].length === 1, `Exactly 1 enrich-start event (got ${e['enrich-start'].length})`);
  // node + node-error together account for every enrichment attempt.
  const totalNodes = e['node'].length + e['node-error'].length;
  assert(totalNodes > 0, `At least 1 node event (got ${totalNodes}: ${e['node'].length} ok, ${e['node-error'].length} errors)`);
  // Validate grid-ready payload
  if (e['grid-ready'].length > 0) {
    const gr = e['grid-ready'][0].payload ?? {};
    assert(Array.isArray(gr.areas), 'grid-ready.areas is array');
    assert(typeof gr.total === 'number' && gr.total > 0, `grid-ready.total > 0 (${gr.total})`);
  }
  // Validate location events have required fields
  if (e['location'].length > 0) {
    const loc = e['location'][0].payload ?? {};
    assert(loc.location && typeof loc.location.title === 'string', 'location event has location.title');
    assert(loc.location && typeof loc.location.place_id === 'string', 'location event has location.place_id');
    assert(typeof loc.areaName === 'string', 'location event has areaName');
  }
  assert(e['location'].length > 0, `At least 1 location event (got ${e['location'].length})`);
  // Validate node payloads
  if (e['node'].length > 0) {
    const nd = e['node'][0].payload ?? {};
    assert(typeof nd.placeId === 'string', 'node event has placeId');
    assert(typeof nd.title === 'string', 'node event has title');
    assert(Array.isArray(nd.emails), 'node event has emails array');
    assert(typeof nd.status === 'string', 'node event has status');
  }
  // ── 6. Print event summary ────────────────────────────────────────────
  console.log('\n5. Event summary');
  for (const [type, arr] of Object.entries(e)) {
    if (arr.length > 0) console.log(` ${type}: ${arr.length}`);
  }
  // ── 7. Shutdown ───────────────────────────────────────────────────────
  // Grace period after the ack so the child can actually exit before we
  // read its exit code.
  console.log('\n6. Graceful shutdown');
  const shutdownRes = await worker.shutdown();
  assert(shutdownRes.type === 'shutdown_ack', 'Shutdown acknowledged');
  await new Promise(r => setTimeout(r, 500));
  assert(worker.process.exitCode === 0, `Worker exited with code 0 (got ${worker.process.exitCode})`);
  // ── Summary ───────────────────────────────────────────────────────────
  console.log(`\n────────────────────────────────`);
  console.log(` Passed: ${passed} Failed: ${failed}`);
  console.log(`────────────────────────────────\n`);
  process.exit(failed > 0 ? 1 : 0);
}
// Entry point: execute the suite; any unexpected rejection fails the run.
run().catch(function (err) {
  console.error('Test runner error:', err);
  process.exit(1);
});

View File

@ -12,7 +12,7 @@ import { resolve, dirname } from 'node:path';
import { fileURLToPath } from 'node:url';
const __dirname = dirname(fileURLToPath(import.meta.url));
const EXE = resolve(__dirname, '..', 'build', 'dev', 'Debug', 'polymech-cli.exe');
const EXE = resolve(__dirname, '..', 'dist', 'polymech-cli.exe');
let passed = 0;
let failed = 0;

View File

@ -8,18 +8,26 @@
"scripts": {
"config": "cmake --preset dev",
"config:release": "cmake --preset release",
"build": "cmake --build --preset dev",
"build:release": "cmake --build --preset release",
"build": "cmake --preset dev && cmake --build --preset dev",
"build:release": "cmake --preset release && cmake --build --preset release",
"build:linux": "bash build-linux.sh",
"test": "ctest --test-dir build/dev -C Debug --output-on-failure",
"test:release": "ctest --test-dir build/release -C Release --output-on-failure",
"clean": "cmake -E rm -rf build/dev",
"clean:release": "cmake -E rm -rf build/release",
"clean:all": "cmake -E rm -rf build",
"rebuild": "npm run clean && npm run config && npm run build",
"run": ".\\build\\dev\\Debug\\polymech-cli.exe --help",
"worker": ".\\build\\dev\\Debug\\polymech-cli.exe worker",
"test:ipc": "node orchestrator/test-ipc.mjs",
"gridsearch": ".\\build\\Debug\\polymech-cli.exe gridsearch ABW recycling --dry-run"
"clean": "cmake -E rm -rf build dist",
"rebuild": "npm run clean && npm run build",
"run": ".\\dist\\polymech-cli.exe --help",
"worker": ".\\dist\\polymech-cli.exe worker",
"test:ipc": "node orchestrator/test-gridsearch-ipc.mjs",
"gridsearch": ".\\dist\\polymech-cli.exe gridsearch ABW recycling --dry-run",
"gridsearch:settings": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --dry-run",
"gridsearch:settings:live": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json",
"gridsearch:enrich": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-sample.json --enrich",
"gridsearch:enrich-test": ".\\dist\\polymech-cli.exe gridsearch --settings config/gridsearch-test-bcn.json --enrich --persistence-postgres",
"test:gridsearch-ipc": "node orchestrator/test-gridsearch-ipc.mjs",
"test:ipc:daemon": "node orchestrator/test-gridsearch-ipc-daemon.mjs",
"test:ipc:uds": "node orchestrator/test-gridsearch-ipc-uds.mjs",
"test:ipc:uds-meta": "node orchestrator/test-gridsearch-ipc-uds-meta.mjs",
"test:html": "cmake --preset release && cmake --build --preset release --target test_html && .\\dist\\test_html.exe"
},
"repository": {
"type": "git",

View File

@ -0,0 +1,4 @@
# ── enrichers ────────────────────────────────────────────────────────────────
# Static library implementing the website-enrichment pipeline
# (meta scraping, email extraction, Scrapeless fallback).
add_library(enrichers STATIC src/enrichers.cpp)

# Namespaced alias so consumers may link `enrichers::enrichers`; the `::`
# form makes CMake error on a typo instead of passing a bogus library
# name through to the linker.
add_library(enrichers::enrichers ALIAS enrichers)

# Public headers live under include/ (enrichers/enrichers.h); use an
# explicit source-dir path so the intent is unambiguous.
target_include_directories(enrichers PUBLIC
  "${CMAKE_CURRENT_SOURCE_DIR}/include"
)

# Implementation dependencies. Kept PUBLIC to preserve the existing
# propagation of usage requirements to consumers of this target.
target_link_libraries(enrichers PUBLIC http html json logger)

View File

@ -0,0 +1,162 @@
#pragma once
#include <map>
#include <string>
#include <vector>
namespace enrichers {
// ── Status codes ────────────────────────────────────────────────────────────
// Outcome of the enrichment pipeline for one location (see status_string()).
enum class EnrichStatus {
  OK,            // at least one email was found
  NO_EMAIL,      // pipeline completed but found no email
  META_TIMEOUT,  // meta scrape hit (or got close to) its timeout
  EMAIL_TIMEOUT, // email phase timed out
  FETCH_ERROR,   // no website given, or the initial fetch failed
  NO_PAGES,      // no contact-like internal pages were discovered
  ERROR,         // unclassified failure
};
/// Stable upper-case name for a status ("OK", "NO_EMAIL", ...), suitable
/// for logs and serialized output. Unknown values map to "UNKNOWN".
const char *status_string(EnrichStatus s);
// ── Data types ──────────────────────────────────────────────────────────────
/// Per-page outcome recorded while hunting for emails on contact pages.
struct PageError {
  std::string url;
  std::string status; // "SEARCHED_EMAIL", "FAILED", ...
  std::string method; // "GET", "SCRAPELESS", ...
  std::string error;  // human-readable failure detail; empty on success
  int http_status = 0;
  std::vector<std::string> emails; // emails found on this page, if any
};
/// A social-media profile link discovered on a site.
struct SocialLink {
  std::string platform; // "instagram", "facebook", "linkedin", ...
  std::string url;
};
/// Everything extracted from a single fetched page.
struct SiteMeta {
  std::string title;
  std::string description;
  std::string og_image;
  std::string canonical;
  std::vector<SocialLink> socials;
  std::vector<std::string> internal_pages; // discovered internal hrefs
  std::vector<std::string> emails;         // from body text + mailto: links
  std::string body_text;                   // visible text content
  std::string body_html;                   // raw HTML as fetched
  std::map<std::string, std::string> sites; // url -> body_md
  int http_status = 0;     // status of the fetch; treated as failure when < 0
  std::string fetch_error; // non-empty when the fetch itself failed
  std::vector<std::string> json_ld; // raw JSON-LD blocks found in the page
};
/// Final result of enriching one location (meta + email phases).
struct EnrichedNode {
  int idx = 0;
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  int pages_found = 0;   // contact-candidate pages discovered
  int pages_scraped = 0; // pages actually fetched in the email phase
  std::vector<std::string> emails;
  std::vector<SocialLink> socials;
  int meta_ms = 0;  // wall time of the meta phase
  int email_ms = 0; // wall time of the email phase
  int total_ms = 0; // wall time of the whole pipeline
  EnrichStatus status = EnrichStatus::NO_EMAIL;
  std::string error;
  std::map<std::string, std::string> pages; // "home" → body text
  std::vector<std::string> meta_pages;      // internal pages seen during meta
  std::vector<PageError> page_errors;       // per-page fetch diagnostics
  std::string enricher_hash;
  std::string geo_json;
  std::map<std::string, std::string> sites; // url -> body_md
};
// ── Configuration ───────────────────────────────────────────────────────────
struct EnrichConfig {
  bool enable_homepage_md = true; // convert the homepage HTML to markdown
  int meta_timeout_ms = 10000;
  int email_timeout_ms = 15000;
  int email_page_timeout_ms = 10000;
  int email_max_pages = 8;   // cap on contact pages scraped per site
  int email_abort_after = 1;
  /// Scrapeless API key — if set, pages that yield no emails via plain
  /// HTTP GET will be re-fetched through the Scrapeless Universal Scraping
  /// API (JS rendering). Leave empty to disable the fallback.
  std::string scrapeless_key;
  std::string bigdata_key;
  /// URL substrings (case-insensitive) that mark a page as contact-like.
  std::vector<std::string> contact_patterns = {
      "contact", "kontakt", "contacto", "contacta", "impression",
      "about",   "impress", "impressum", "datenschutz", "privacy",
      "legal",   "team",    "nosotros", "empresa", "sobre",
  };
  std::vector<std::string> probe_paths = {
      "/contact", "/contacto", "/kontakt", "/contacta",
      "/about",   "/about-us", "/impressum",
  };
  std::string meta_scraper; // "SCRAPELESS" selects the Scrapeless meta path
  int meta_concurrency = 5;
  int meta_idle_timeout = 60;
};
// ── Location input ──────────────────────────────────────────────────────────
/// Minimal description of a location to enrich.
struct LocationInput {
  std::string title;
  std::string place_id;
  std::string website;
  std::string address;
  std::string type;
  std::string grid_area;
  std::string grid_gid;
  double lat = 0;
  double lng = 0;
};
// ── Core API ────────────────────────────────────────────────────────────────
/// Check if a candidate string looks like a real email address.
bool is_likely_email(const std::string &candidate);
/// Extract all email addresses from a text body.
std::vector<std::string> extract_emails(const std::string &text);
/// Scrape metadata from a website URL (static HTML via libcurl + lexbor).
SiteMeta scrape_meta(const std::string &url, int timeout_ms = 10000);
/// Scrape emails from a single page URL.
/// NOTE(review): confirm a definition for this two-argument form exists;
/// the implementation file defines the three-argument overload below.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms = 10000);
/// Scrape emails from a single page URL, additionally reporting the HTTP
/// status of the fetch through `out_status_code` (this overload matches
/// the definition in src/enrichers.cpp).
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code);
/// Fetch a page via Scrapeless Universal Scraping API (JS rendering),
/// then extract emails from the rendered HTML. Returns empty if key is
/// blank or the API call fails.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms = 15000);
/// Scrape metadata from a website URL via Scrapeless Universal API (JS
/// rendering).
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key,
                                int timeout_ms = 15000);
/// Full enrichment pipeline for a single location: meta → email.
EnrichedNode enrich_location(const LocationInput &loc,
                             const EnrichConfig &cfg = {});
/// Resolve a URL relative to a base URL.
std::string resolve_url(const std::string &base, const std::string &href);
} // namespace enrichers

View File

@ -0,0 +1,800 @@
#include "enrichers/enrichers.h"

#include "html/html.h"
#include "http/http.h"
#include "json/json.h"
#include "logger/logger.h"

#include <algorithm>
#include <cctype>  // std::tolower / std::isalnum / std::isdigit
#include <chrono>
#include <future>
#include <regex>
#include <set>
#include <sstream>
#include <thread>  // std::thread (enrich_location's contact-page workers)
namespace enrichers {
// ── Status string ───────────────────────────────────────────────────────────
// Map an EnrichStatus to its stable wire/log name; values outside the
// enum (e.g. produced by a bad cast) fall through to "UNKNOWN".
const char *status_string(EnrichStatus s) {
  switch (s) {
  case EnrichStatus::OK:            return "OK";
  case EnrichStatus::NO_EMAIL:      return "NO_EMAIL";
  case EnrichStatus::META_TIMEOUT:  return "META_TIMEOUT";
  case EnrichStatus::EMAIL_TIMEOUT: return "EMAIL_TIMEOUT";
  case EnrichStatus::FETCH_ERROR:   return "FETCH_ERROR";
  case EnrichStatus::NO_PAGES:      return "NO_PAGES";
  case EnrichStatus::ERROR:         return "ERROR";
  }
  return "UNKNOWN";
}
// ── Timing helper ───────────────────────────────────────────────────────────
// Whole milliseconds elapsed since `t0`, truncated to int.
static int elapsed_ms(std::chrono::steady_clock::time_point t0) {
  using namespace std::chrono;
  const auto delta = steady_clock::now() - t0;
  return static_cast<int>(duration_cast<milliseconds>(delta).count());
}
// ── Email extraction ────────────────────────────────────────────────────────
// Pre-compiled email pattern.
// NOTE(review): EMAIL_RE is not referenced anywhere in this translation
// unit — extract_emails() below scans around '@' manually instead. It is
// `static`, so no other TU can use it either; candidate for removal.
static const std::regex
    EMAIL_RE(R"([a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})",
             std::regex::optimize);
// Asset extensions that disqualify an email-like string
// (e.g. "logo@2x.png" matches the email shape but is a filename).
static const std::vector<std::string> ASSET_EXTS = {
    ".png", ".jpg", ".jpeg", ".gif", ".svg", ".webp",
    ".avif", ".css", ".js", ".woff", ".woff2", ".ttf",
    ".eot", ".mp4", ".mp3", ".pdf", ".zip", ".ico",
};
// Return a lower-cased copy of `s` (byte-wise std::tolower; behavior for
// bytes >= 0x80 follows the C locale, same as the original transform).
static std::string to_lower(const std::string &s) {
  std::string result(s);
  for (auto &ch : result) {
    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
  }
  return result;
}
// Heuristic filter deciding whether `candidate` is plausibly a real email
// address: sane length, exactly-placed '@', no asset extension, no known
// placeholder/tracking substring, no hash-like local part, and a domain
// with a >= 2 character TLD whose first label is not digits-only.
bool is_likely_email(const std::string &candidate) {
  const size_t len = candidate.size();
  // RFC 5321 caps addresses at 254 characters; anything under 5 ("a@b.c"
  // minus one) cannot be valid either.
  if (len < 5 || len > 254)
    return false;
  if (candidate.find("..") != std::string::npos)
    return false;
  const auto at = candidate.find('@');
  if (at == std::string::npos || at == 0 || at == len - 1)
    return false;
  const std::string lower = to_lower(candidate);
  // Reject asset-like filenames (logo@2x.png etc.).
  auto ends_with = [&lower](const std::string &suffix) {
    return lower.size() >= suffix.size() &&
           lower.compare(lower.size() - suffix.size(), suffix.size(),
                         suffix) == 0;
  };
  for (const auto &ext : ASSET_EXTS) {
    if (ends_with(ext))
      return false;
  }
  // Reject common placeholder / tracking addresses.
  auto contains = [&lower](const char *needle) {
    return lower.find(needle) != std::string::npos;
  };
  if (contains("example") || contains("sentry") || contains("test") ||
      contains("placeholder") || contains("wixpress.com"))
    return false;
  // A long, purely-hexadecimal local part is almost certainly a content
  // hash, not a mailbox.
  if (at >= 8) {
    const std::string local = lower.substr(0, at);
    const bool hex_only = std::all_of(local.begin(), local.end(), [](char c) {
      return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
    });
    if (hex_only)
      return false;
  }
  // Domain must contain a dot, with at least 2 chars after the first dot.
  const std::string domain = lower.substr(at + 1);
  const auto dot = domain.find('.');
  if (dot == std::string::npos)
    return false;
  if (domain.length() - dot <= 2)
    return false; // Minimum 2 chars for TLD
  // First domain label must not be digits-only (e.g. 1234@5678.xx).
  const std::string label = domain.substr(0, dot);
  const bool digits_only =
      !label.empty() &&
      std::all_of(label.begin(), label.end(),
                  [](unsigned char c) { return std::isdigit(c); });
  if (digits_only)
    return false;
  return true;
}
// Characters the extract_emails() scanner accepts inside an address:
// alphanumerics plus . _ % + -
static bool is_valid_email_char(char c) {
  if (std::isalnum(static_cast<unsigned char>(c)))
    return true;
  switch (c) {
  case '.':
  case '_':
  case '%':
  case '+':
  case '-':
    return true;
  default:
    return false;
  }
}
// Scan `text` for email addresses without using a regex: locate each '@',
// expand left/right over valid email characters, trim stray leading or
// trailing dots/hyphens, then validate with is_likely_email(). Results
// are lower-cased and de-duplicated, in order of first appearance.
std::vector<std::string> extract_emails(const std::string &text) {
  std::vector<std::string> results;
  if (text.empty())
    return results;
  std::set<std::string> seen; // lower-cased emails already emitted
  size_t pos = 0;
  while ((pos = text.find('@', pos)) != std::string::npos) {
    // An '@' at either end of the text cannot be part of an address.
    if (pos == 0 || pos == text.length() - 1) {
      pos++;
      continue;
    }
    // Scan backwards
    size_t start = pos;
    while (start > 0 && is_valid_email_char(text[start - 1])) {
      start--;
    }
    // Scan forwards
    size_t end = pos;
    while (end < text.length() - 1 && is_valid_email_char(text[end + 1])) {
      end++;
    }
    // Both sides must contribute at least one character.
    if (start < pos && end > pos) {
      std::string candidate = text.substr(start, end - start + 1);
      // Strip trailing dots/hyphens eagerly grabbed
      // (also rolls `end` back so the resume point stays in sync).
      while (!candidate.empty() &&
             (candidate.back() == '.' || candidate.back() == '-')) {
        candidate.pop_back();
        end--;
      }
      // Strip leading dots/hyphens
      size_t local_start = 0;
      while (local_start < candidate.length() &&
             (candidate[local_start] == '.' || candidate[local_start] == '-')) {
        local_start++;
      }
      if (local_start > 0) {
        candidate = candidate.substr(local_start);
      }
      std::string lower = to_lower(candidate);
      if (is_likely_email(lower)) {
        if (seen.insert(lower).second) {
          results.push_back(lower);
        }
      }
    }
    // Resume searching after the last character this candidate consumed.
    pos = end + 1;
  }
  return results;
}
// ── URL resolution ──────────────────────────────────────────────────────────
// Resolve `href` against `base`. Absolute and protocol-relative hrefs are
// returned (the latter inheriting the base scheme, defaulting to https).
// mailto:/tel:/javascript: links and pure fragments resolve to "", as
// does any base without a parseable scheme.
std::string resolve_url(const std::string &base, const std::string &href) {
  if (href.empty())
    return {};
  auto starts_with = [&href](const char *prefix) {
    return href.rfind(prefix, 0) == 0;
  };
  // Absolute URLs pass through untouched.
  if (starts_with("http://") || starts_with("https://"))
    return href;
  // Protocol-relative: inherit the base scheme.
  if (starts_with("//")) {
    const auto scheme_end = base.find("//");
    if (scheme_end == std::string::npos)
      return "https:" + href;
    return base.substr(0, scheme_end) + href;
  }
  // Non-navigational schemes and fragment-only links resolve to nothing.
  if (starts_with("mailto:") || starts_with("tel:") ||
      starts_with("javascript:") || href[0] == '#')
    return {};
  // Need a parseable origin (scheme://host) from the base.
  const auto scheme_pos = base.find("://");
  if (scheme_pos == std::string::npos)
    return {};
  const auto path_start = base.find('/', scheme_pos + 3);
  const std::string origin =
      (path_start == std::string::npos) ? base : base.substr(0, path_start);
  // Root-relative path.
  if (href[0] == '/')
    return origin + href;
  // Path-relative: append to the base's directory when the base has one.
  if (path_start != std::string::npos) {
    const auto dir_end = base.rfind('/');
    if (dir_end > scheme_pos + 2)
      return base.substr(0, dir_end + 1) + href;
  }
  return origin + "/" + href;
}
// ── Social link classification ──────────────────────────────────────────────
// Return the platform slug for a known social-media URL, or "" when the
// URL matches no known platform. Matching is case-insensitive substring
// search, checked in a fixed priority order.
static std::string classify_social(const std::string &url) {
  struct Platform {
    const char *needle;
    const char *slug;
  };
  static const Platform PLATFORMS[] = {
      {"instagram.com", "instagram"}, {"facebook.com", "facebook"},
      {"linkedin.com", "linkedin"},   {"twitter.com", "twitter"},
      {"x.com", "twitter"},           {"youtube.com", "youtube"},
      {"tiktok.com", "tiktok"},       {"pinterest.com", "pinterest"},
      {"github.com", "github"},
  };
  const std::string lower = to_lower(url);
  for (const auto &p : PLATFORMS) {
    if (lower.find(p.needle) != std::string::npos)
      return p.slug;
  }
  return {};
}
// ── Same-origin check ───────────────────────────────────────────────────────
// Extract "scheme://host[:port]" from a URL; "" when no scheme separator
// is present. A URL with no path component is returned whole.
static std::string get_origin(const std::string &url) {
  const auto scheme = url.find("://");
  if (scheme == std::string::npos)
    return {};
  const auto slash = url.find('/', scheme + 3);
  if (slash == std::string::npos)
    return url;
  return url.substr(0, slash);
}
static bool is_same_origin(const std::string &base_url,
const std::string &href) {
auto bo = to_lower(get_origin(base_url));
auto ho = to_lower(get_origin(href));
if (bo.empty() || ho.empty())
return false;
// Strip www. for comparison
auto strip_www = [](std::string &s) {
auto pos = s.find("://www.");
if (pos != std::string::npos) {
s = s.substr(0, pos + 3) + s.substr(pos + 7);
}
};
strip_www(bo);
strip_www(ho);
return bo == ho;
}
// ── Contact page matching ───────────────────────────────────────────────────
static bool matches_contact_pattern(const std::string &url,
const std::vector<std::string> &patterns) {
auto lower = to_lower(url);
for (auto &pat : patterns) {
if (lower.find(to_lower(pat)) != std::string::npos)
return true;
}
return false;
}
// ── Shared HTML parsing logic for Meta ──────────────────────────────────────
// Build a SiteMeta from an already-fetched HTML document:
//  * title / description / og:image / canonical / body text / JSON-LD via
//    the html:: helpers, with og:title / og:description as fallbacks;
//  * emails from the visible body text plus mailto: anchors;
//  * remaining links classified into social profiles vs. same-origin
//    internal pages (fragments stripped, de-duplicated).
// A non-empty `fetch_error` short-circuits: only http_status and
// fetch_error are filled in.
static SiteMeta parse_meta_html(const std::string &url, int http_status,
                                const std::string &html_body,
                                const std::string &fetch_error) {
  SiteMeta meta;
  meta.http_status = http_status;
  if (!fetch_error.empty()) {
    meta.fetch_error = fetch_error;
    return meta;
  }
  meta.body_html = html_body;
  // Parse with lexbor helpers
  meta.title = html::get_title(html_body);
  meta.description = html::get_meta(html_body, "description");
  meta.og_image = html::get_meta(html_body, "og:image");
  meta.canonical = html::get_canonical(html_body);
  meta.body_text = html::get_body_text(html_body);
  meta.json_ld = html::get_json_ld(html_body);
  // OG fallbacks
  if (meta.description.empty())
    meta.description = html::get_meta(html_body, "og:description");
  if (meta.title.empty())
    meta.title = html::get_meta(html_body, "og:title");
  // Links — classify into social / internal / mailto
  auto links = html::get_links(html_body);
  std::set<std::string> seen_pages; // de-dupes internal_pages
  // Extract emails from body text (much smaller than raw HTML)
  meta.emails = extract_emails(meta.body_text);
  for (auto &lk : links) {
    // mailto: anchors may carry addresses absent from the visible text.
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string email = lk.href.substr(7);
      // Strip anything after ? (like ?subject=...)
      auto q = email.find('?');
      if (q != std::string::npos)
        email = email.substr(0, q);
      // Clean it
      email = to_lower(email);
      if (is_likely_email(email)) {
        // Linear find keeps first-seen order; emails lists stay small.
        if (std::find(meta.emails.begin(), meta.emails.end(), email) ==
            meta.emails.end()) {
          meta.emails.push_back(email);
        }
      }
      continue;
    }
    auto resolved = resolve_url(url, lk.href);
    if (resolved.empty())
      continue;
    auto social = classify_social(resolved);
    if (!social.empty()) {
      meta.socials.push_back({social, resolved});
      continue;
    }
    // Only same-origin links count as crawlable internal pages.
    if (is_same_origin(url, resolved)) {
      // Strip fragment (#) from URL
      auto hash_pos = resolved.find('#');
      if (hash_pos != std::string::npos) {
        resolved = resolved.substr(0, hash_pos);
      }
      if (!resolved.empty() && seen_pages.insert(resolved).second) {
        meta.internal_pages.push_back(resolved);
      }
    }
  }
  return meta;
}
// ── scrape_meta ─────────────────────────────────────────────────────────────
// Fetch `url` over plain HTTP(S) with a desktop-browser User-Agent and
// run the response through parse_meta_html(). Statuses >= 400 (and the
// negative codes http::get reports on transport failures) surface the
// response body as SiteMeta::fetch_error.
SiteMeta scrape_meta(const std::string &url, int timeout_ms) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";
  const auto resp = http::get(url, opts);
  const bool failed = resp.status_code < 0 || resp.status_code >= 400;
  const std::string fetch_err = failed ? resp.body : std::string{};
  return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                         fetch_err);
}
// ── scrape_emails_from_page ─────────────────────────────────────────────────
// Fetch one page and return the de-duplicated, lower-cased emails found
// in its visible text plus its mailto: anchors. The HTTP status of the
// fetch is reported through `out_status_code`; on transport failure or a
// status >= 400 the result is empty.
std::vector<std::string> scrape_emails_from_page(const std::string &url,
                                                 int timeout_ms,
                                                 int &out_status_code) {
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  opts.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                    "AppleWebKit/537.36 (KHTML, like Gecko) "
                    "Chrome/120.0.0.0 Safari/537.36";
  const auto resp = http::get(url, opts);
  out_status_code = static_cast<int>(resp.status_code);
  if (resp.status_code < 0 || resp.status_code >= 400)
    return {};
  // Visible text first — far smaller than the raw HTML.
  auto found = extract_emails(html::get_body_text(resp.body));
  std::set<std::string> unique(found.begin(), found.end());
  // Then mailto: anchors, which may hold addresses absent from the text.
  for (const auto &link : html::get_links(resp.body)) {
    if (link.href.length() <= 7 ||
        to_lower(link.href).find("mailto:") != 0)
      continue;
    std::string addr = link.href.substr(7);
    // Drop any ?subject=... query part.
    const auto query = addr.find('?');
    if (query != std::string::npos)
      addr = addr.substr(0, query);
    addr = to_lower(addr);
    if (is_likely_email(addr) && unique.insert(addr).second) {
      found.push_back(addr);
    }
  }
  return found;
}
// Pull the rendered HTML out of a Scrapeless JSON response; the document
// travels in the top-level "data" field. Falls back to the raw body when
// that field is missing/empty (e.g. a non-JSON error response).
static std::string extract_scrapeless_html(const std::string &json_body) {
  const std::string rendered = json::get_string(json_body, "data");
  return rendered.empty() ? json_body : rendered;
}
// Scrape metadata via the Scrapeless Universal Scraping API (JS
// rendering). Returns a SiteMeta whose fetch_error is set when the key is
// missing or the API call fails; on success the rendered HTML is parsed
// exactly like a plain fetch.
SiteMeta scrape_meta_scrapeless(const std::string &url,
                                const std::string &api_key, int timeout_ms) {
  if (api_key.empty())
    return parse_meta_html(url, 0, "", "missing api key");
  // Escape the URL before splicing it into the JSON request body — an
  // unescaped quote or backslash would otherwise produce malformed JSON
  // and a spurious API error for such URLs.
  auto json_escape = [](const std::string &s) {
    std::string out;
    out.reserve(s.size());
    for (char c : s) {
      if (c == '"' || c == '\\')
        out.push_back('\\');
      out.push_back(c);
    }
    return out;
  };
  // Ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        json_escape(url) +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  std::string fetch_err;
  if (resp.status_code < 0 || resp.status_code >= 400) {
    fetch_err = resp.body;
    logger::error("[meta:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  fetch_err);
    return parse_meta_html(url, static_cast<int>(resp.status_code), resp.body,
                           fetch_err);
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  return parse_meta_html(url, static_cast<int>(resp.status_code), rendered_html,
                         "");
}
// Fetch `url` through the Scrapeless Universal Scraping API (JS
// rendering) and extract emails from the rendered document (body text
// plus mailto: anchors). Returns {} when the key is blank or the API call
// fails — callers treat this as a silent fallback.
std::vector<std::string> scrape_emails_scrapeless(const std::string &url,
                                                  const std::string &api_key,
                                                  int timeout_ms) {
  if (api_key.empty())
    return {};
  // Escape the URL before splicing it into the JSON request body — an
  // unescaped quote or backslash would otherwise produce malformed JSON
  // and a spurious API error for such URLs.
  auto json_escape = [](const std::string &s) {
    std::string out;
    out.reserve(s.size());
    for (char c : s) {
      if (c == '"' || c == '\\')
        out.push_back('\\');
      out.push_back(c);
    }
    return out;
  };
  // Build the Scrapeless Universal Scraping API request body.
  // We ask for the fully-rendered HTML of the target URL.
  std::string payload = R"({"actor":"unlocker.webunlocker","input":{"url":")" +
                        json_escape(url) +
                        R"(","jsRender":{"enabled":true,"headless":true}}})";
  http::PostOptions opts;
  opts.content_type = "application/json";
  opts.bearer_token = api_key;
  opts.timeout_ms =
      std::max(timeout_ms, 45000); // Scrapeless needs generous timeout
  auto resp = http::post("https://api.scrapeless.com/api/v2/unlocker/request",
                         payload, opts);
  if (resp.status_code < 0 || resp.status_code >= 400) {
    logger::error("[email:scrapeless] API Error HTTP " +
                  std::to_string(resp.status_code) + " for " + url + " : " +
                  resp.body);
    return {}; // API error — silent fallback
  }
  std::string rendered_html = extract_scrapeless_html(resp.body);
  // Parse and extract emails from the rendered HTML
  auto text = html::get_body_text(rendered_html);
  auto from_text = extract_emails(text);
  // Fast mailto extraction instead of HTML regex
  auto links = html::get_links(rendered_html);
  std::set<std::string> seen(from_text.begin(), from_text.end());
  for (auto &lk : links) {
    if (lk.href.length() > 7 && to_lower(lk.href).find("mailto:") == 0) {
      std::string m = lk.href.substr(7);
      auto q = m.find('?');
      if (q != std::string::npos)
        m = m.substr(0, q);
      m = to_lower(m);
      if (is_likely_email(m)) {
        if (seen.insert(m).second) {
          from_text.push_back(m);
        }
      }
    }
  }
  return from_text;
}
// ── enrich_location ─────────────────────────────────────────────────────────
EnrichedNode enrich_location(const LocationInput &loc,
const EnrichConfig &cfg) {
auto t0 = std::chrono::steady_clock::now();
EnrichedNode node;
node.title = loc.title;
node.place_id = loc.place_id;
node.website = loc.website;
node.address = loc.address;
node.type = loc.type;
node.grid_area = loc.grid_area;
node.grid_gid = loc.grid_gid;
node.status = EnrichStatus::NO_EMAIL;
if (loc.website.empty()) {
node.status = EnrichStatus::FETCH_ERROR;
node.error = "no website";
node.total_ms = elapsed_ms(t0);
return node;
}
// ── Phase 1: Meta scrape ────────────────────────────────────────────────
auto meta_t0 = std::chrono::steady_clock::now();
SiteMeta meta;
bool meta_timed_out = false;
try {
if (cfg.meta_scraper == "SCRAPELESS" && !cfg.scrapeless_key.empty()) {
logger::debug("[meta:scrapeless] Fetching " + loc.website);
meta = scrape_meta_scrapeless(loc.website, cfg.scrapeless_key,
cfg.meta_timeout_ms);
} else {
logger::debug("[meta:http] Fetching " + loc.website);
meta = scrape_meta(loc.website, cfg.meta_timeout_ms);
}
} catch (...) {
meta.fetch_error = "exception during meta scrape";
meta_timed_out = true;
}
node.meta_ms = elapsed_ms(meta_t0);
// Check if meta took too long (within threshold of timeout)
if (node.meta_ms >= cfg.meta_timeout_ms - 1000) {
meta_timed_out = true;
}
// logger::info("[" + std::string(loc.title.empty() ? loc.website : loc.title)
// + "] Meta fetch took " + std::to_string(node.meta_ms) + "ms. Links found: "
// + std::to_string(meta.internal_pages.size()));
if (!meta.body_text.empty())
node.pages["home"] = meta.body_text;
if (cfg.enable_homepage_md && !meta.body_html.empty()) {
// Cap HTML body at 512 KB to prevent stack overflow in recursive html2md
// parser
static constexpr size_t MAX_HTML_BYTES = 512 * 1024;
if (meta.body_html.size() > MAX_HTML_BYTES) {
logger::warn("[" + loc.title + "] body_html too large (" +
std::to_string(meta.body_html.size() / 1024) +
" KB), skipping markdown conversion");
} else {
try {
node.sites[loc.website] = html::to_markdown(meta.body_html);
} catch (const std::exception &e) {
logger::warn("[" + loc.title +
"] html::to_markdown failed: " + e.what());
} catch (...) {
logger::warn("[" + loc.title +
"] html::to_markdown crashed (unknown exception)");
}
}
}
node.meta_pages = meta.internal_pages;
node.pages_found = static_cast<int>(meta.internal_pages.size());
node.socials = meta.socials;
if (!meta.fetch_error.empty()) {
node.error = meta.fetch_error;
node.status = EnrichStatus::FETCH_ERROR;
node.total_ms = elapsed_ms(t0);
return node;
}
// If meta already found emails, we're done (early exit like TS)
if (!meta.emails.empty()) {
node.emails = meta.emails;
node.status = EnrichStatus::OK;
node.total_ms = elapsed_ms(t0);
return node;
}
// ── Build contact page list ─────────────────────────────────────────────
std::vector<std::string> contact_pages;
std::set<std::string> seen_urls;
for (auto &page_url : meta.internal_pages) {
if (matches_contact_pattern(page_url, cfg.contact_patterns)) {
if (seen_urls.insert(page_url).second) {
contact_pages.push_back(page_url);
}
}
}
// No more probe paths. If we found 0 contact pages, we just give up or time
// out.
node.pages_found = static_cast<int>(contact_pages.size());
if (contact_pages.empty()) {
logger::debug("[" +
std::string(loc.title.empty() ? loc.website : loc.title) +
"] No contact pages found.");
node.status =
meta_timed_out ? EnrichStatus::META_TIMEOUT : EnrichStatus::NO_PAGES;
node.total_ms = elapsed_ms(t0);
return node;
}
logger::debug("[" + std::string(loc.title.empty() ? loc.website : loc.title) +
"] Contact pages to scrape: " +
std::to_string(contact_pages.size()) + " (parallel)");
// ── Phase 2: Email scrape per contact page ──────────────────────────────
struct AsyncResult {
std::string url;
std::vector<PageError> errors;
std::vector<std::string> emails;
int ms;
};
int pages_to_scrape =
std::min(static_cast<int>(contact_pages.size()), cfg.email_max_pages);
std::vector<std::thread> contact_threads;
std::vector<AsyncResult> contact_results(pages_to_scrape);
auto email_t0 = std::chrono::steady_clock::now();
for (int i = 0; i < pages_to_scrape; ++i) {
auto page_url = contact_pages[i];
contact_threads.emplace_back([i, &contact_results, page_url, cfg, loc]() {
auto start = std::chrono::steady_clock::now();
AsyncResult res;
res.url = page_url;
PageError pe1;
pe1.url = page_url;
pe1.method = "GET";
int http_status = 0;
try {
// logger::debug("[email:http] Fetching " + page_url);
auto page_emails = scrape_emails_from_page(
page_url, cfg.email_page_timeout_ms, http_status);
pe1.emails = page_emails;
logger::debug("[" +
std::string(loc.title.empty() ? loc.website : loc.title) +
"] HTTP fetch finished code " +
std::to_string(http_status) + " for " + page_url);
if (page_emails.empty()) {
if (http_status == 404 || http_status == 400 || http_status == 500) {
pe1.status = "NOT_FOUND";
pe1.error = "HTTP " + std::to_string(http_status);
} else {
pe1.status = "AXIOS_NO_EMAIL";
res.errors.push_back(pe1); // pushed before scrapeless
if (cfg.meta_scraper == "SCRAPELESS" &&
!cfg.scrapeless_key.empty()) {
PageError pe2;
pe2.url = page_url;
pe2.method = "SCRAPELESS";
try {
logger::debug("[email:scrapeless] Fallback scraping " +
page_url);
auto s_emails =
scrape_emails_scrapeless(page_url, cfg.scrapeless_key,
cfg.email_page_timeout_ms + 5000);
pe2.emails = s_emails;
pe2.status = s_emails.empty() ? "FAILED" : "SEARCHED_EMAIL";
if (!s_emails.empty())
res.emails = s_emails;
logger::debug(
"[" +
std::string(loc.title.empty() ? loc.website : loc.title) +
"] Scrapeless fallback finished for " + page_url);
} catch (...) {
pe2.status = "FAILED";
pe2.error = "scrapeless exception";
}
res.errors.push_back(pe2);
}
res.ms = elapsed_ms(start);
contact_results[i] = res;
return;
}
} else {
pe1.status = "SEARCHED_EMAIL";
res.emails = page_emails;
}
} catch (...) {
pe1.status = "AXIOS_FAILED";
pe1.error = "exception";
}
// Only insert pe1 if we didn't already push it during fallback
if (res.errors.empty() || res.errors[0].method != "GET") {
res.errors.insert(res.errors.begin(), pe1);
}
res.ms = elapsed_ms(start);
contact_results[i] = res;
});
}
for (auto &t : contact_threads) {
if (t.joinable())
t.join();
}
std::set<std::string> all_emails;
int pages_scraped = 0;
for (auto &res : contact_results) {
pages_scraped++;
for (auto &pe : res.errors) {
node.page_errors.push_back(std::move(pe));
}
for (auto &e : res.emails) {
all_emails.insert(e);
}
}
node.email_ms = elapsed_ms(email_t0);
node.pages_scraped = pages_scraped;
// Merge emails
node.emails.assign(all_emails.begin(), all_emails.end());
// Final status
bool email_timed_out = node.email_ms >= cfg.email_timeout_ms - 1000;
if (!node.emails.empty()) {
node.status = EnrichStatus::OK;
} else if (email_timed_out) {
node.status = EnrichStatus::EMAIL_TIMEOUT;
} else if (meta_timed_out) {
node.status = EnrichStatus::META_TIMEOUT;
} else {
node.status = EnrichStatus::NO_EMAIL;
}
node.total_ms = elapsed_ms(t0);
return node;
}
} // namespace enrichers

View File

@ -210,18 +210,18 @@ BoundaryResult load_boundary_file(const std::string& filepath) {
BoundaryResult load_boundary(const std::string& gid, int targetLevel,
const std::string& cacheDir) {
// Try: cacheDir/boundary_{gid}_{level}.json
std::string path = cacheDir + "/boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";
std::string cc = country_code(gid);
std::string filename = "boundary_" + gid + "_" + std::to_string(targetLevel) + ".json";
// Primary: cacheDir/{countryCode}/boundary_{gid}_{level}.json
std::string path = cacheDir + "/" + cc + "/" + filename;
auto result = load_boundary_file(path);
if (result.error.empty()) return result;
// Fallback: cacheDir/boundary_{countryCode}_{level}.json
std::string cc = country_code(gid);
if (cc != gid) {
path = cacheDir + "/boundary_" + cc + "_" + std::to_string(targetLevel) + ".json";
result = load_boundary_file(path);
if (result.error.empty()) return result;
}
// Fallback (flat): cacheDir/boundary_{gid}_{level}.json
path = cacheDir + "/" + filename;
result = load_boundary_file(path);
if (result.error.empty()) return result;
// Both failed
result.error = "No boundary file found for gid=" + gid + " level=" + std::to_string(targetLevel) + " in " + cacheDir;

View File

@ -16,6 +16,8 @@ struct Waypoint {
double lng = 0;
double lat = 0;
double radius_km = 0;
std::string area_gid;
std::string area_name;
};
struct GridOptions {

View File

@ -168,7 +168,9 @@ static GridResult generate_admin(const std::vector<gadm::Feature>& features,
static_cast<int>(res.waypoints.size() + 1),
std::round(center.lon * 1e6) / 1e6,
std::round(center.lat * 1e6) / 1e6,
std::round(radiusKm * 100.0) / 100.0
std::round(radiusKm * 100.0) / 100.0,
f.gid,
f.name
});
res.validCells++;
} else {
@ -248,7 +250,9 @@ static GridResult generate_centers(const std::vector<gadm::Feature>& features,
static_cast<int>(res.waypoints.size() + 1),
std::round(pt.lon * 1e6) / 1e6,
std::round(pt.lat * 1e6) / 1e6,
std::round((opts.cellSize / 2.0) * 100.0) / 100.0
std::round((opts.cellSize / 2.0) * 100.0) / 100.0,
f.gid,
f.name
});
res.validCells++;
} else {
@ -322,7 +326,9 @@ static GridResult generate_polygon_grid(const std::vector<gadm::Feature>& featur
static_cast<int>(res.waypoints.size() + 1),
std::round(gc.lon * 1e6) / 1e6,
std::round(gc.lat * 1e6) / 1e6,
std::round(cellRadiusKm * 100.0) / 100.0
std::round(cellRadiusKm * 100.0) / 100.0,
regionFeat.gid,
regionFeat.name
});
res.validCells++;
} else {
@ -356,9 +362,21 @@ GridResult generate(const std::vector<gadm::Feature>& features,
// Sort waypoints
if (result.waypoints.size() > 1) {
if (opts.groupByRegion && features.size() > 1 && opts.gridMode != "admin" && opts.gridMode != "centers") {
// Group by region index could be added, but for now sort all together
sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize);
if (opts.groupByRegion && features.size() > 1) {
std::stable_sort(result.waypoints.begin(), result.waypoints.end(),
[](const Waypoint& a, const Waypoint& b) { return a.area_gid < b.area_gid; });
auto start = result.waypoints.begin();
while (start != result.waypoints.end()) {
auto end = start;
while (end != result.waypoints.end() && end->area_gid == start->area_gid) {
++end;
}
std::vector<Waypoint> group(start, end);
sort_waypoints(group, opts.pathOrder, opts.cellSize);
std::copy(group.begin(), group.end(), start);
start = end;
}
} else {
sort_waypoints(result.waypoints, opts.pathOrder, opts.cellSize);
}

View File

@ -1,26 +1,33 @@
include(FetchContent)
FetchContent_Declare(
lexbor
GIT_REPOSITORY https://github.com/lexbor/lexbor.git
GIT_TAG v2.4.0
GIT_SHALLOW TRUE
)
# Build lexbor as static
set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(lexbor)
add_library(html STATIC
src/html.cpp
)
target_include_directories(html
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)
target_link_libraries(html
PUBLIC lexbor_static
include(FetchContent)

# Vendor the lexbor HTML/CSS engine at a pinned release; shallow clone
# keeps the first configure fast.
FetchContent_Declare(
  lexbor
  GIT_REPOSITORY https://github.com/lexbor/lexbor.git
  GIT_TAG v2.4.0
  GIT_SHALLOW TRUE
)

# Build lexbor as static
set(LEXBOR_BUILD_SHARED OFF CACHE BOOL "" FORCE)
set(LEXBOR_BUILD_STATIC ON CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(lexbor)

# The html package: lexbor-backed parsing/extraction helpers plus the
# html2md Markdown converter and its table formatter.
add_library(html STATIC
  src/html.cpp
  src/html2md.cpp
  src/table.cpp
)

# MSVC: treat source and execution charset as UTF-8
# (fixes \u200b zero-width-space mismatch in html2md tests)
if(MSVC)
  target_compile_options(html PRIVATE /utf-8)
endif()

# Public headers live under include/; consumers inherit this path.
target_include_directories(html
  PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include
)

# lexbor appears in public headers' types, hence PUBLIC linkage.
target_link_libraries(html
  PUBLIC lexbor_static
)

View File

@ -11,6 +11,13 @@ struct Element {
std::string text;
};
/// Link with href and optional attributes.
struct Link {
  std::string href; // target URL exactly as written in the markup
  std::string rel;  // e.g. "canonical", "stylesheet"
  std::string text; // anchor text (for <a> tags)
};
/// Parse an HTML string and return all elements with their text content.
std::vector<Element> parse(const std::string &html_str);
@ -18,4 +25,31 @@ std::vector<Element> parse(const std::string &html_str);
std::vector<std::string> select(const std::string &html_str,
const std::string &selector);
// ── Enricher extraction helpers ─────────────────────────────────────────────
/// Extract the <title> text.
std::string get_title(const std::string &html_str);
/// Extract a <meta name="X"> or <meta property="X"> content attribute.
std::string get_meta(const std::string &html_str, const std::string &name);
/// Extract <link rel="canonical"> href.
std::string get_canonical(const std::string &html_str);
/// Extract all <a href="..."> values (resolved links as-is from the HTML).
std::vector<Link> get_links(const std::string &html_str);
/// Extract visible body text, stripping script/style/noscript/svg/iframe.
std::string get_body_text(const std::string &html_str);
/// Extract raw JSON strings from <script type="application/ld+json">.
std::vector<std::string> get_json_ld(const std::string &html_str);
/// Extract an attribute value from the first element matching a CSS selector.
std::string get_attr(const std::string &html_str, const std::string &selector,
const std::string &attr_name);
/// Convert HTML content to Markdown.
std::string to_markdown(const std::string &html_str);
} // namespace html

View File

@ -0,0 +1,690 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT
#ifndef HTML2MD_H
#define HTML2MD_H
#include <memory>
#include <string>
#include <unordered_map>
#include <cstdint>
/*!
* \brief html2md namespace
*
* The html2md namespace provides:
* 1. The Converter class
* 2. Static wrapper around Converter class
*
* \note Do NOT try to convert HTML that contains a list in an ordered list or a
* `blockquote` in a list!\n This will be a **total** mess!
*/
namespace html2md {
/*!
* \brief Options for the conversion from HTML to Markdown
* \warning Make sure to pass valid options; otherwise, the output will be
* invalid!
*
* Example from `tests/main.cpp`:
*
* ```cpp
* auto *options = new html2md::Options();
* options->splitLines = false;
*
* html2md::Converter c(html, options);
* auto md = c.convert();
* ```
*/
struct Options {
  /*!
   * \brief Add a new line when a certain number of characters is reached
   *
   * \see softBreak
   * \see hardBreak
   */
  bool splitLines = true;

  /*!
   * \brief softBreak Wrap after ... characters when the next space is reached
   * and as long as it's not in a list, table, image or anchor (link).
   */
  int softBreak = 80;

  /*!
   * \brief hardBreak Force a break after ... characters in a line
   */
  int hardBreak = 100;

  /*!
   * \brief The char used for unordered lists
   *
   * Valid:
   * - `-`
   * - `+`
   * - `*`
   *
   * Example:
   *
   * ```markdown
   * - List
   * + Also a list
   * * And this too
   * ```
   */
  char unorderedList = '-';

  /*!
   * \brief The char used after the number of the item
   *
   * Valid:
   * - `.`
   * - `)`
   *
   * Example:
   *
   * ```markdown
   * 1. Hello
   * 2) World!
   * ```
   */
  char orderedList = '.';

  /*!
   * \brief Whether the title is added as an h1 heading at the very beginning
   * of the Markdown. Default is true.
   */
  bool includeTitle = true;

  /*!
   * \brief Whether to format Markdown tables. Default is true.
   */
  bool formatTable = true;

  /*!
   * \brief Whether to force left trim of lines in the final Markdown output.
   * Default is false.
   */
  bool forceLeftTrim = false;

  /*!
   * \brief Whether to compress whitespace (tabs, multiple spaces) into a
   * single space. Default is false.
   */
  bool compressWhitespace = false;

  /*!
   * \brief Whether to escape numbered lists (e.g. "4." -> "4\.") to prevent
   * them from being interpreted as lists in Markdown. Default is true.
   */
  bool escapeNumberedList = true;

  /*!
   * \brief Whether to keep HTML entities (e.g. `&nbsp;`) in the output
   *
   * If true, the converter will not replace HTML entities configured in the
   * internal conversion map. Default is false (current behaviour).
   */
  bool keepHtmlEntities = false;

  /*!
   * \brief Field-by-field equality of two option sets.
   * \param o The options to compare against.
   * \return true when every field matches.
   */
  // Takes the argument by const reference to avoid copying the whole struct
  // on every comparison (the unqualified name also resolves correctly both
  // inside and outside the enclosing namespace).
  inline bool operator==(const Options &o) const {
    return splitLines == o.splitLines && unorderedList == o.unorderedList &&
           orderedList == o.orderedList && includeTitle == o.includeTitle &&
           softBreak == o.softBreak && hardBreak == o.hardBreak &&
           formatTable == o.formatTable && forceLeftTrim == o.forceLeftTrim &&
           compressWhitespace == o.compressWhitespace &&
           escapeNumberedList == o.escapeNumberedList &&
           keepHtmlEntities == o.keepHtmlEntities;
  }
};
/*!
* \brief Class for converting HTML to Markdown
*
* This class converts HTML to Markdown.
* There is also a static wrapper for this class (see html2md::Convert).
*
* ## Usage example
*
* Option 1: Use the class:
*
* ```cpp
* std::string html = "<h1>example</h1>";
* html2md::Converter c(html);
* auto md = c.convert();
*
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
* std::cout << md; // # example
* ```
*
* Option 2: Use the static wrapper:
*
* ```cpp
* std::string html = "<h1>example</h1>";
*
* auto md = html2md::Convert(html);
* std::cout << md;
* ```
*
* Advanced: use Options:
*
* ```cpp
* std::string html = "<h1>example</h1>";
*
* auto *options = new html2md::Options();
* options->splitLines = false;
* options->unorderedList = '*';
*
* html2md::Converter c(html, options);
* auto md = c.convert();
* if (!c.ok()) std::cout << "There was something wrong in the HTML\n";
* std::cout << md; // # example
* ```
*/
class Converter {
public:
  /*!
   * \brief Standard initializer, takes HTML as parameter. Also prepares
   * everything. \param html The HTML as std::string. \param options Options for
   * the conversion. See html2md::Options() for more.
   *
   * \note Don't pass anything else than HTML, otherwise the output will be a
   * **mess**!
   *
   * This is the default initializer.<br>
   * You can use appendToMd() to append something to the beginning of the
   * generated output.
   */
  explicit inline Converter(const std::string &html,
                            struct Options *options = nullptr) {
    *this = Converter(&html, options);
  }

  /*!
   * \brief Convert HTML into Markdown.
   * \return Returns the converted Markdown.
   *
   * This function actually converts the HTML into Markdown.
   * It also cleans up the Markdown so you don't have to do anything.
   */
  [[nodiscard]] std::string convert();

  /*!
   * \brief Append a char to the Markdown.
   * \param ch The char to append.
   * \return Returns a copy of the instance with the char appended.
   */
  Converter *appendToMd(char ch);

  /*!
   * \brief Append a char* to the Markdown.
   * \param str The char* to append.
   * \return Returns a copy of the instance with the char* appended.
   */
  Converter *appendToMd(const char *str);

  /*!
   * \brief Append a string to the Markdown.
   * \param s The string to append.
   * \return Returns a copy of the instance with the string appended.
   */
  inline Converter *appendToMd(const std::string &s) {
    return appendToMd(s.c_str());
  }

  /*!
   * \brief Appends a ' ' in certain cases.
   * \return Copy of the instance with(maybe) the appended space.
   *
   * This function appends ' ' if:
   * - md does not end with `*`
   * - md does not end with `\n` aka newline
   */
  Converter *appendBlank();

  /*!
   * \brief Add an HTML symbol conversion
   * \param htmlSymbol The HTML symbol to convert
   * \param replacement The replacement string
   * \note This is useful for converting HTML entities to their Markdown
   * equivalents. For example, you can add a conversion for "&nbsp;" to
   * " " (space) or "&lt;" to "<" (less than).
   * \note This is not a standard feature of the Converter class, but it can
   * be added to the class to allow for more flexibility in the conversion
   * process. You can use this feature to add custom conversions for any HTML
   * symbol that you want to convert to a specific Markdown representation.
   */
  void addHtmlSymbolConversion(const std::string &htmlSymbol,
                               const std::string &replacement) {
    htmlSymbolConversions_[htmlSymbol] = replacement;
  }

  /*!
   * \brief Remove an HTML symbol conversion
   * \param htmlSymbol The HTML symbol to remove
   * \note This is useful for removing custom conversions that you have added
   * previously.
   */
  void removeHtmlSymbolConversion(const std::string &htmlSymbol) {
    htmlSymbolConversions_.erase(htmlSymbol);
  }

  /*!
   * \brief Clear all HTML symbol conversions
   * \note This is useful for clearing the conversion map (it's empty afterwards).
   */
  void clearHtmlSymbolConversions() { htmlSymbolConversions_.clear(); }

  /*!
   * \brief Checks if everything was closed properly(in the HTML).
   * \return Returns false if there is a unclosed tag.
   * \note As long as you have not called convert(), it always returns true.
   */
  [[nodiscard]] bool ok() const;

  /*!
   * \brief Reset the generated Markdown
   */
  void reset();

  /*!
   * \brief Checks if the HTML matches and the options are the same.
   * \param c The Converter object to compare with
   * \return true if the HTML and options matches otherwise false
   */
  inline bool operator==(const Converter *c) const { return *this == *c; }
  inline bool operator==(const Converter &c) const {
    return html_ == c.html_ && option == c.option;
  }

  /*!
   * \brief Returns ok().
   */
  inline explicit operator bool() const { return ok(); };

private:
  // Attributes
  static constexpr const char *kAttributeHref = "href";
  static constexpr const char *kAttributeAlt = "alt";
  static constexpr const char *kAttributeTitle = "title";
  static constexpr const char *kAttributeClass = "class";
  static constexpr const char *kAttributeSrc = "src";
  static constexpr const char *kAttrinuteAlign = "align";

  // Tag names recognized by the parser (lowercase HTML element names).
  static constexpr const char *kTagAnchor = "a";
  static constexpr const char *kTagBreak = "br";
  static constexpr const char *kTagCode = "code";
  static constexpr const char *kTagDiv = "div";
  static constexpr const char *kTagHead = "head";
  static constexpr const char *kTagLink = "link";
  static constexpr const char *kTagListItem = "li";
  static constexpr const char *kTagMeta = "meta";
  static constexpr const char *kTagNav = "nav";
  static constexpr const char *kTagNoScript = "noscript";
  static constexpr const char *kTagOption = "option";
  static constexpr const char *kTagOrderedList = "ol";
  static constexpr const char *kTagParagraph = "p";
  static constexpr const char *kTagPre = "pre";
  static constexpr const char *kTagScript = "script";
  static constexpr const char *kTagSpan = "span";
  static constexpr const char *kTagStyle = "style";
  static constexpr const char *kTagTemplate = "template";
  static constexpr const char *kTagTitle = "title";
  static constexpr const char *kTagUnorderedList = "ul";
  static constexpr const char *kTagImg = "img";
  static constexpr const char *kTagSeperator = "hr";

  // Text format
  static constexpr const char *kTagBold = "b";
  static constexpr const char *kTagStrong = "strong";
  static constexpr const char *kTagItalic = "em";
  static constexpr const char *kTagItalic2 = "i";
  static constexpr const char *kTagCitation = "cite";
  static constexpr const char *kTagDefinition = "dfn";
  static constexpr const char *kTagUnderline = "u";
  static constexpr const char *kTagStrighthrought = "del";
  static constexpr const char *kTagStrighthrought2 = "s";
  static constexpr const char *kTagBlockquote = "blockquote";

  // Header
  static constexpr const char *kTagHeader1 = "h1";
  static constexpr const char *kTagHeader2 = "h2";
  static constexpr const char *kTagHeader3 = "h3";
  static constexpr const char *kTagHeader4 = "h4";
  static constexpr const char *kTagHeader5 = "h5";
  static constexpr const char *kTagHeader6 = "h6";

  // Table
  static constexpr const char *kTagTable = "table";
  static constexpr const char *kTagTableRow = "tr";
  static constexpr const char *kTagTableHeader = "th";
  static constexpr const char *kTagTableData = "td";

  // ── Parser state (mutated as convert() scans the HTML char by char) ──
  size_t index_ch_in_html_ = 0;
  bool is_closing_tag_ = false;
  bool is_in_attribute_value_ = false;
  bool is_in_code_ = false;
  bool is_in_list_ = false;
  bool is_in_p_ = false;
  bool is_in_pre_ = false;
  bool is_in_table_ = false;
  bool is_in_table_row_ = false;
  bool is_in_tag_ = false;
  bool is_self_closing_tag_ = false;
  bool skipping_leading_whitespace_ = true;

  // relevant for <li> only, false = is in unordered list
  bool is_in_ordered_list_ = false;
  uint8_t index_ol = 0;

  // store the table start
  size_t table_start = 0;

  // number of lists
  uint8_t index_li = 0;

  uint8_t index_blockquote = 0;

  // Previously emitted Markdown chars / previously read HTML char.
  char prev_ch_in_md_ = 0, prev_prev_ch_in_md_ = 0;
  char prev_ch_in_html_ = 'x';

  std::string html_;

  uint16_t offset_lt_ = 0;
  std::string current_tag_;
  std::string prev_tag_;

  // Line which separates header from data
  std::string tableLine;

  size_t chars_in_curr_line_ = 0;

  // The Markdown output accumulated so far.
  std::string md_;

  Options option;

  // Entity → replacement map; user-extensible via addHtmlSymbolConversion().
  std::unordered_map<std::string, std::string> htmlSymbolConversions_ = {
      {"&quot;", "\""}, {"&lt;", "<"}, {"&gt;", ">"},
      {"&amp;", "&"}, {"&nbsp;", " "}, {"&rarr;", ""}};

  // Tag: base class for tag types
  // NOTE(review): no virtual destructor; instances are only held via
  // std::shared_ptr, whose type-erased deleter makes this safe — confirm
  // before storing them any other way.
  struct Tag {
    virtual void OnHasLeftOpeningTag(Converter *c) = 0;
    virtual void OnHasLeftClosingTag(Converter *c) = 0;
  };

  // Tag types

  // tags that are not printed (nav, script, noscript, ...)
  struct TagIgnored : Tag {
    void OnHasLeftOpeningTag(Converter *c) override {};
    void OnHasLeftClosingTag(Converter *c) override {};
  };

  struct TagAnchor : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
    std::string current_href_;
    std::string current_title_;
  };

  struct TagBold : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagItalic : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagUnderline : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagStrikethrought : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagBreak : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagDiv : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader1 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader2 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader3 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader4 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader5 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagHeader6 : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagListItem : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagOption : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagOrderedList : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagParagraph : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagPre : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagCode : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagSpan : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTitle : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagUnorderedList : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagImage : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagSeperator : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTable : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableRow : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableHeader : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagTableData : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  struct TagBlockquote : Tag {
    void OnHasLeftOpeningTag(Converter *c) override;
    void OnHasLeftClosingTag(Converter *c) override;
  };

  // Tag handlers keyed by tag name.
  std::unordered_map<std::string, std::shared_ptr<Tag>> tags_;

  // Private workhorse constructor the public one delegates to.
  explicit Converter(const std::string *html, struct Options *options);

  void CleanUpMarkdown();

  // Trim from start (in place)
  static void LTrim(std::string *s);

  // Trim from end (in place)
  Converter *RTrim(std::string *s, bool trim_only_blank = false);

  // Trim from both ends (in place)
  Converter *Trim(std::string *s);

  // 1. trim all lines
  // 2. reduce consecutive newlines to maximum 3
  void TidyAllLines(std::string *str);

  std::string ExtractAttributeFromTagLeftOf(const std::string &attr);

  void TurnLineIntoHeader1();

  void TurnLineIntoHeader2();

  // Current char: '<'
  void OnHasEnteredTag();

  Converter *UpdatePrevChFromMd();

  /**
   * Handle next char within <...> tag
   *
   * @param ch current character
   * @return continue surrounding iteration?
   */
  bool ParseCharInTag(char ch);

  // Current char: '>'
  bool OnHasLeftTag();

  // Heuristic: hide elements whose tag text carries hidden/invisible markers.
  inline static bool TagContainsAttributesToHide(std::string *tag) {
    using std::string;

    return (*tag).find(" aria=\"hidden\"") != string::npos ||
           (*tag).find("display:none") != string::npos ||
           (*tag).find("visibility:hidden") != string::npos ||
           (*tag).find("opacity:0") != string::npos ||
           (*tag).find("Details-content--hidden-not-important") != string::npos;
  }

  Converter *ShortenMarkdown(size_t chars = 1);

  // If the previous Markdown char equals `prev`, drop it and report true.
  inline bool shortIfPrevCh(char prev) {
    if (prev_ch_in_md_ == prev) {
      ShortenMarkdown();
      return true;
    }
    return false;
  };

  /**
   * @param ch
   * @return continue iteration surrounding this method's invocation?
   */
  bool ParseCharInTagContent(char ch);

  // Replace previous space (if any) in current markdown line by newline
  bool ReplacePreviousSpaceInLineByNewline();

  static inline bool IsIgnoredTag(const std::string &tag) {
    return (tag[0] == '-' || kTagTemplate == tag || kTagStyle == tag ||
            kTagScript == tag || kTagNoScript == tag || kTagNav == tag);

    // meta: not ignored to tolerate if closing is omitted
  }

  [[nodiscard]] bool IsInIgnoredTag() const;
}; // Converter
/*!
* \brief Static wrapper around the Converter class
* \param html The HTML passed to Converter
* \param ok Optional: Pass a reference to a local bool to store the output of
 * Converter::ok() \return Returns the Markdown generated by the Converter
*/
inline std::string Convert(const std::string &html, bool *ok = nullptr) {
Converter c(html);
auto md = c.convert();
if (ok != nullptr)
*ok = c.ok();
return md;
}
#ifndef PYTHON_BINDINGS
inline std::string Convert(const std::string &&html, bool *ok = nullptr) {
return Convert(html, ok);
}
#endif
} // namespace html2md
#endif // HTML2MD_H

View File

@ -0,0 +1,11 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT

#ifndef TABLE_H
#define TABLE_H

#include <string>

/// Format a raw Markdown table string and return the formatted table.
/// NOTE(review): exact layout rules live in table.cpp — presumably column
/// alignment/padding; confirm against the implementation.
[[nodiscard]] std::string formatMarkdownTable(const std::string &inputTable);

#endif // TABLE_H

101
packages/html/readme.md Normal file
View File

@ -0,0 +1,101 @@
# Scraper Request
## OpenAPI Specification
```yaml
openapi: 3.0.1
info:
title: ''
description: ''
version: 1.0.0
paths:
/api/v1/scraper/request:
post:
summary: Scraper Request
deprecated: false
description: ''
tags:
- Scraping API
parameters: []
requestBody:
content:
application/json:
schema:
type: object
properties:
actor:
type: string
input:
type: object
properties:
url:
type: string
required:
- url
x-apidog-orders:
- url
proxy:
type: object
properties:
country:
type: string
required:
- country
x-apidog-orders:
- country
async:
type: boolean
description: |-
If true, the task will be executed asynchronously.
If false, the task will be executed synchronously.
required:
- actor
- input
- proxy
x-apidog-orders:
- actor
- input
- proxy
- async
example:
actor: scraper.xxx
input:
url: >-
https://www.***.com/shop/us/products/stmicroelectronics/tda7265a-3074457345625542393/
proxy:
country: US
async: false
responses:
'200':
description: ''
content:
application/json:
schema:
type: object
properties: {}
x-apidog-orders: []
headers: {}
x-apidog-name: Success
security:
- apikey-header-x-api-token: []
x-apidog-folder: Scraping API
x-apidog-status: released
x-run-in-apidog: https://app.apidog.com/web/project/745098/apis/api-11949852-run
components:
schemas: {}
securitySchemes:
bearer:
type: http
scheme: bearer
description: Bearer token authentication using your Scrapeless API key
apikey-header-x-api-token:
type: apiKey
in: header
name: x-api-token
servers:
- url: https://api.scrapeless.com
description: Prod Env
security:
- apikey-header-x-api-token: []
```

View File

@ -3,6 +3,10 @@
#include <lexbor/css/css.h>
#include <lexbor/html/html.h>
#include <lexbor/selectors/selectors.h>
#include <html/html2md.h>
#include <algorithm>
#include <cstring>
namespace html {
@ -26,6 +30,35 @@ static std::string tag_name(lxb_dom_element_t *el) {
return std::string(reinterpret_cast<const char *>(name), len);
}
// Return the value of attribute `attr` on element `el`, or an empty string
// when the attribute is absent.
static std::string get_element_attr(lxb_dom_element_t *el, const char *attr) {
  size_t value_len = 0;
  const lxb_char_t *value = lxb_dom_element_get_attribute(
      el, reinterpret_cast<const lxb_char_t *>(attr), strlen(attr),
      &value_len);
  if (value == nullptr)
    return {};
  return {reinterpret_cast<const char *>(value), value_len};
}
// Parse `html_str` into a freshly created lexbor document.
// Returns nullptr on allocation or parse failure; on success the caller
// owns the returned document and must destroy it.
static lxb_html_document_t *parse_doc(const std::string &html_str) {
  lxb_html_document_t *document = lxb_html_document_create();
  if (document == nullptr)
    return nullptr;
  const auto parse_status = lxb_html_document_parse(
      document, reinterpret_cast<const lxb_char_t *>(html_str.c_str()),
      html_str.size());
  if (parse_status == LXB_STATUS_OK)
    return document;
  // Parse failed: release the half-built document rather than leaking it.
  lxb_html_document_destroy(document);
  return nullptr;
}
// ── Helper: check if a tag name matches a noise element ─────────────────────
// True for tags whose subtrees never contribute visible text.
static bool is_noise_tag(const std::string &name) {
  static const char *const kNoiseTags[] = {"script", "style", "noscript",
                                           "svg", "iframe"};
  for (const char *tag : kNoiseTags) {
    if (name == tag)
      return true;
  }
  return false;
}
// ── walk tree recursively ───────────────────────────────────────────────────
static void walk(lxb_dom_node_t *node, std::vector<Element> &out) {
@ -45,22 +78,125 @@ static void walk(lxb_dom_node_t *node, std::vector<Element> &out) {
}
}
// ── Walk for visible text only (skip noise tags) ────────────────────────────
// Recursively collect the visible text of `node`'s subtree into `out`.
// Noise subtrees (script/style/noscript/svg/iframe) are skipped entirely,
// and each text chunk is trimmed so chunks are joined by single spaces.
static void walk_text(lxb_dom_node_t *node, std::string &out) {
  if (!node) return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    auto name = tag_name(el);
    if (is_noise_tag(name)) return; // Skip noise subtrees entirely
  }
  if (node->type == LXB_DOM_NODE_TYPE_TEXT) {
    size_t len = 0;
    const lxb_char_t *data = lxb_dom_node_text_content(node, &len);
    if (data && len > 0) {
      std::string chunk(reinterpret_cast<const char *>(data), len);
      // Collapse whitespace
      bool needSpace = !out.empty() && out.back() != ' ' && out.back() != '\n';
      // Trim leading/trailing whitespace from chunk
      size_t start = chunk.find_first_not_of(" \t\n\r");
      size_t end = chunk.find_last_not_of(" \t\n\r");
      if (start != std::string::npos) {
        // Separate this chunk from the previous output with one space.
        if (needSpace) out += ' ';
        out += chunk.substr(start, end - start + 1);
      }
    }
  }
  // Descend into children (text nodes have none, so this is a no-op there).
  auto *child = node->first_child;
  while (child) {
    walk_text(child, out);
    child = child->next;
  }
}
// ── Walk <head> for meta/title/link ─────────────────────────────────────────
// Aggregated contents gathered by walk_head(): <title>, canonical link,
// meta tags and any JSON-LD script bodies encountered in the walked subtree.
struct HeadData {
  std::string title;     // text of the last <title> seen (later ones overwrite)
  std::string canonical; // href of <link rel="canonical">, if present
  std::vector<std::pair<std::string, std::string>> metas; // name/property → content
  std::vector<std::string> json_ld; // raw <script type="application/ld+json"> bodies
};
// Recursively harvest <title>, <meta name=/property=>, <link rel="canonical">
// and JSON-LD <script> elements from a subtree into `data`.
static void walk_head(lxb_dom_node_t *node, HeadData &data) {
  if (node == nullptr) return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    const std::string name = tag_name(el);
    if (name == "title") {
      data.title = node_text(node); // later titles overwrite earlier ones
    } else if (name == "meta") {
      const std::string content = get_element_attr(el, "content");
      if (!content.empty()) {
        // A meta element can be keyed by name=, property=, or both.
        const std::string name_attr = get_element_attr(el, "name");
        const std::string prop_attr = get_element_attr(el, "property");
        if (!name_attr.empty()) data.metas.emplace_back(name_attr, content);
        if (!prop_attr.empty()) data.metas.emplace_back(prop_attr, content);
      }
    } else if (name == "link") {
      if (get_element_attr(el, "rel") == "canonical")
        data.canonical = get_element_attr(el, "href");
    } else if (name == "script" &&
               get_element_attr(el, "type") == "application/ld+json") {
      std::string text = node_text(node);
      if (!text.empty()) data.json_ld.push_back(std::move(text));
    }
  }
  for (auto *child = node->first_child; child != nullptr; child = child->next)
    walk_head(child, data);
}
// ── Walk <body> for <a> links ───────────────────────────────────────────────
// Recursively collect every <a> element carrying a non-empty href.
static void walk_links(lxb_dom_node_t *node, std::vector<Link> &out) {
  if (node == nullptr) return;
  if (node->type == LXB_DOM_NODE_TYPE_ELEMENT) {
    auto *el = lxb_dom_interface_element(node);
    if (tag_name(el) == "a") {
      std::string href = get_element_attr(el, "href");
      if (!href.empty()) {
        Link link;
        link.href = std::move(href);
        link.rel = get_element_attr(el, "rel");
        link.text = node_text(node);
        out.push_back(std::move(link));
      }
    }
  }
  for (auto *child = node->first_child; child != nullptr; child = child->next)
    walk_links(child, out);
}
// ── public API ──────────────────────────────────────────────────────────────
std::vector<Element> parse(const std::string &html_str) {
auto *doc = lxb_html_document_create();
if (!doc)
return {};
auto status = lxb_html_document_parse(
doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()),
html_str.size());
auto *doc = parse_doc(html_str);
if (!doc) return {};
std::vector<Element> result;
if (status == LXB_STATUS_OK) {
auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
walk(body, result);
}
auto *body = lxb_dom_interface_node(lxb_html_document_body_element(doc));
walk(body, result);
lxb_html_document_destroy(doc);
return result;
@ -87,20 +223,9 @@ std::vector<std::string> select(const std::string &html_str,
const std::string &selector) {
std::vector<std::string> result;
// Parse document
auto *doc = lxb_html_document_create();
if (!doc)
return result;
auto *doc = parse_doc(html_str);
if (!doc) return result;
auto status = lxb_html_document_parse(
doc, reinterpret_cast<const lxb_char_t *>(html_str.c_str()),
html_str.size());
if (status != LXB_STATUS_OK) {
lxb_html_document_destroy(doc);
return result;
}
// Set up CSS parser + selectors engine
auto *css_parser = lxb_css_parser_create();
lxb_css_parser_init(css_parser, nullptr);
@ -126,4 +251,153 @@ std::vector<std::string> select(const std::string &html_str,
return result;
}
// ── Enricher extraction helpers ─────────────────────────────────────────────
std::string get_title(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
return data.title;
}
std::string get_meta(const std::string &html_str, const std::string &name) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
for (auto &[key, val] : data.metas) {
if (key == name) return val;
}
return {};
}
std::string get_canonical(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
auto *head = lxb_dom_interface_node(lxb_html_document_head_element(doc));
walk_head(head, data);
lxb_html_document_destroy(doc);
return data.canonical;
}
// Collect every <a href=…> in the document body as a Link (href/rel/text).
std::vector<Link> get_links(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (doc == nullptr) return {};
  std::vector<Link> links;
  walk_links(lxb_dom_interface_node(lxb_html_document_body_element(doc)),
             links);
  lxb_html_document_destroy(doc);
  return links;
}
// Return the visible text of <body> with noise tags skipped and
// inter-chunk whitespace collapsed (see walk_text).
std::string get_body_text(const std::string &html_str) {
  auto *doc = parse_doc(html_str);
  if (doc == nullptr) return {};
  std::string text;
  walk_text(lxb_dom_interface_node(lxb_html_document_body_element(doc)), text);
  lxb_html_document_destroy(doc);
  return text;
}
std::vector<std::string> get_json_ld(const std::string &html_str) {
auto *doc = parse_doc(html_str);
if (!doc) return {};
HeadData data;
// JSON-LD can be in head or body — walk entire document
auto *root = lxb_dom_interface_node(
lxb_dom_document_element(&doc->dom_document));
walk_head(root, data);
lxb_html_document_destroy(doc);
return data.json_ld;
}
// ── get_attr via CSS selector ───────────────────────────────────────────────
// Callback state for get_attr(): the attribute to look up on each matched
// element, and the first non-empty value found.
struct AttrCtx {
  std::string attr_name; // attribute to read on each matched element
  std::string result;    // value of the first non-empty match
  bool found;            // set once `result` has been captured
};
// Selector-match callback: capture the first matched element whose target
// attribute is non-empty; further matches are ignored once one is found.
static lxb_status_t attr_cb(lxb_dom_node_t *node,
                            lxb_css_selector_specificity_t spec, void *ctx) {
  (void)spec;
  auto *state = static_cast<AttrCtx *>(ctx);
  if (state->found || node->type != LXB_DOM_NODE_TYPE_ELEMENT)
    return LXB_STATUS_OK;
  std::string value = get_element_attr(lxb_dom_interface_element(node),
                                       state->attr_name.c_str());
  if (!value.empty()) {
    state->result = std::move(value);
    state->found = true;
  }
  return LXB_STATUS_OK;
}
// Return the value of `attr_name` on the first element matching `selector`
// that carries a non-empty value; empty string on parse failure or no match.
std::string get_attr(const std::string &html_str, const std::string &selector,
                     const std::string &attr_name) {
  auto *doc = parse_doc(html_str);
  if (!doc) return {};
  // CSS parser + selectors engine; torn down below in reverse order.
  auto *css_parser = lxb_css_parser_create();
  lxb_css_parser_init(css_parser, nullptr);
  auto *selectors = lxb_selectors_create();
  lxb_selectors_init(selectors);
  auto *list = lxb_css_selectors_parse(
      css_parser, reinterpret_cast<const lxb_char_t *>(selector.c_str()),
      selector.size());
  std::string result;
  if (list) {
    AttrCtx ctx{attr_name, {}, false};
    // Search from the document element so matches anywhere in the tree
    // (head included) are considered.
    auto *root = lxb_dom_interface_node(
        lxb_dom_document_element(&doc->dom_document));
    lxb_selectors_find(selectors, root, list, attr_cb, &ctx);
    result = ctx.result;
    lxb_css_selector_list_destroy_memory(list);
  }
  lxb_selectors_destroy(selectors, true);
  lxb_css_parser_destroy(css_parser, true);
  lxb_html_document_destroy(doc);
  return result;
}
// Convert HTML to markdown via html2md, refusing oversized inputs.
std::string to_markdown(const std::string &html_str) {
  // Defense-in-depth: hard cap at 2 MB even if the caller forgets.
  // The enricher pipeline already caps at 512 KB, but future callers
  // may not — prevent OOM / multi-second hangs from html2md.
  static constexpr size_t MAX_HTML2MD_INPUT = 2 * 1024 * 1024;
  if (html_str.size() <= MAX_HTML2MD_INPUT) {
    return html2md::Convert(html_str);
  }
  return "*[Content truncated: HTML too large for markdown conversion ("
      + std::to_string(html_str.size() / 1024) + " KB)]*\n";
}
} // namespace html

File diff suppressed because it is too large Load Diff

106
packages/html/src/table.cpp Normal file
View File

@ -0,0 +1,106 @@
// Copyright (c) Tim Gromeyer
// Licensed under the MIT License - https://opensource.org/licenses/MIT
#include "html/table.h"
#include <iomanip>
#include <iostream>
#include <sstream>
#include <vector>
using std::string;
using std::vector;

// A header-separator cell must be at least this long to be rebuilt.
const size_t MIN_LINE_LENGTH = 3;

// Strip leading and trailing space characters (only ' ', not tabs) from
// `str` in place; a string consisting solely of spaces becomes empty.
void removeLeadingTrailingSpaces(string &str) {
  const size_t begin = str.find_first_not_of(' ');
  if (begin == string::npos) {
    str.clear(); // nothing but spaces
    return;
  }
  str.erase(str.find_last_not_of(' ') + 1); // drop trailing spaces
  str.erase(0, begin);                      // drop leading spaces
}

// Rebuild a markdown header-separator cell (e.g. ":--", "--:", ":-:") as a
// dash run of exactly `length` characters, keeping the alignment colons at
// either end. Returns "" for empty input or a length below MIN_LINE_LENGTH.
string enlargeTableHeaderLine(const string &str, size_t length) {
  if (str.empty() || length < MIN_LINE_LENGTH)
    return "";
  size_t first = str.find_first_of(':');
  size_t last = str.find_last_of(':');
  // A single leading colon marks left alignment only.
  if (first == 0 && last == first)
    last = string::npos;
  string line(length, '-');
  if (first == 0)
    line.front() = ':';
  if (last == str.length() - 1)
    line.back() = ':';
  return line;
}

// Re-layout a pipe-delimited markdown table so every column is padded to the
// width of its widest cell. Row index 1 is treated as the header separator
// and rebuilt with enlargeTableHeaderLine(). Cells are trimmed and empty
// cells dropped during parsing, so each row keeps only its non-empty cells.
string formatMarkdownTable(const string &inputTable) {
  // Parse into rows of trimmed, non-empty cells.
  vector<vector<string>> rows;
  std::istringstream input(inputTable);
  for (string line; std::getline(input, line);) {
    vector<string> cells;
    std::istringstream cellStream(line);
    for (string cell; std::getline(cellStream, cell, '|');) {
      removeLeadingTrailingSpaces(cell);
      if (!cell.empty())
        cells.push_back(cell);
    }
    if (!cells.empty())
      rows.push_back(std::move(cells));
  }
  if (rows.empty())
    return "";

  // Column width = widest cell in that column across all rows.
  vector<size_t> widths(rows[0].size(), 0);
  for (const auto &row : rows) {
    if (widths.size() < row.size())
      widths.resize(row.size(), 0);
    for (size_t col = 0; col < row.size(); ++col)
      widths[col] = std::max(widths[col], row[col].size());
  }

  // Emit the padded table.
  std::ostringstream out;
  for (size_t rowIdx = 0; rowIdx < rows.size(); ++rowIdx) {
    const auto &row = rows[rowIdx];
    out << "|";
    for (size_t col = 0; col < row.size(); ++col) {
      if (rowIdx == 1) {
        // Header separator: dashes spanning the padded cell width.
        out << enlargeTableHeaderLine(row[col], widths[col] + 2) << "|";
      } else {
        // Equivalent of setw(width)/left: pad the cell on the right.
        out << " " << row[col]
            << string(widths[col] - row[col].size(), ' ') << " |";
      }
    }
    out << "\n";
  }
  return out.str();
}

View File

@ -14,9 +14,14 @@ set(BUILD_CURL_EXE OFF CACHE BOOL "" FORCE)
set(BUILD_SHARED_LIBS OFF CACHE BOOL "" FORCE)
set(BUILD_TESTING OFF CACHE BOOL "" FORCE)
# TLS backend: Windows native SChannel
set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE)
set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE)
# TLS backend: platform-appropriate
if(WIN32)
set(CURL_USE_OPENSSL OFF CACHE BOOL "" FORCE)
set(CURL_USE_SCHANNEL ON CACHE BOOL "" FORCE)
else()
set(CURL_USE_SCHANNEL OFF CACHE BOOL "" FORCE)
set(CURL_USE_OPENSSL ON CACHE BOOL "" FORCE)
endif()
# Disable optional compression/protocol deps
set(CURL_ZLIB OFF CACHE BOOL "" FORCE)

View File

@ -9,11 +9,32 @@ struct Response {
std::string body;
};
/// Options for customisable HTTP GET requests.
struct GetOptions {
  // Sent as the User-Agent header; when empty, no User-Agent is set.
  std::string user_agent = "Mozilla/5.0 (compatible; PolymechBot/1.0)";
  // Whole-request timeout in milliseconds; values <= 0 disable the timeout.
  int timeout_ms = 10000;
  // Whether to follow 3xx redirects.
  bool follow_redirects = true;
};
/// Perform an HTTP GET request. Returns the response body and status code.
Response get(const std::string &url);
/// Perform an HTTP GET request with custom options.
Response get(const std::string &url, const GetOptions &opts);
/// Perform an HTTP POST request with a body. Returns the response and status.
Response post(const std::string &url, const std::string &body,
const std::string &content_type = "application/json");
/// Options for customisable HTTP POST requests.
struct PostOptions {
  // Value of the Content-Type request header.
  std::string content_type = "application/json";
  std::string bearer_token; // Authorization: Bearer <token> (also sent as x-api-token)
  // Whole-request timeout in milliseconds; values <= 0 disable the timeout.
  int timeout_ms = 30000;
};
/// Perform an HTTP POST request with custom options.
Response post(const std::string &url, const std::string &body,
const PostOptions &opts);
} // namespace http

View File

@ -1,9 +1,53 @@
#include "http/http.h"
#include <curl/curl.h>
#include <mutex>
#include <chrono>
namespace http {
static std::once_flag curl_init_flag;
// Run curl_global_init() exactly once, even if multiple threads race here.
static void ensure_curl_init() {
  std::call_once(curl_init_flag, []() {
    curl_global_init(CURL_GLOBAL_ALL);
  });
}
// RAII owner of one CURL easy handle per thread: created lazily on the
// thread's first use, reset between requests, and cleaned up when the
// thread exits.
struct ThreadLocalCurl {
  CURL *handle; // nullptr when curl_easy_init() failed
  ThreadLocalCurl() {
    ensure_curl_init();
    handle = curl_easy_init();
  }
  ~ThreadLocalCurl() {
    if (handle) curl_easy_cleanup(handle);
  }
  // Return the handle with all options reset to defaults (nullptr on failure).
  CURL *get() {
    if (handle) curl_easy_reset(handle);
    return handle;
  }
};
// Per-thread reusable handle used by the get()/post() implementations.
thread_local ThreadLocalCurl tl_curl;
// State handed to progress_cb(): when the transfer started and the hard
// wall-clock deadline in milliseconds (<= 0 disables the deadline).
struct ProgressData {
  std::chrono::steady_clock::time_point start_time;
  int timeout_ms;
};
static int progress_cb(void *clientp, curl_off_t dltotal, curl_off_t dlnow,
curl_off_t ultotal, curl_off_t ulnow) {
auto *pd = static_cast<ProgressData *>(clientp);
if (pd->timeout_ms <= 0) return 0;
auto now = std::chrono::steady_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - pd->start_time).count();
if (elapsed > pd->timeout_ms) {
return 1; // Return non-zero to abort the transfer
}
return 0; // Continue
}
static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) {
auto *out = static_cast<std::string *>(userp);
out->append(static_cast<char *>(contents), size * nmemb);
@ -11,20 +55,50 @@ static size_t write_cb(void *contents, size_t size, size_t nmemb, void *userp) {
}
// GET with the default options (see GetOptions for the defaults).
Response get(const std::string &url) {
  GetOptions defaults;
  return get(url, defaults);
}
Response get(const std::string &url, const GetOptions &opts) {
Response resp{};
CURL *curl = curl_easy_init();
CURL *curl = tl_curl.get();
if (!curl) {
resp.status_code = -1;
resp.body = "curl_easy_init failed";
resp.body = "curl_easy_init (thread_local) failed";
return resp;
}
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, opts.follow_redirects ? 1L : 0L);
ProgressData prog_data;
if (opts.timeout_ms > 0) {
curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
prog_data.start_time = std::chrono::steady_clock::now();
prog_data.timeout_ms = opts.timeout_ms + 1000;
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
}
// Fail fast on dead sites (TCP SYN timeout)
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT_MS, 5000L);
// Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
// Prevent signal handlers from breaking in multithreaded environments
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
if (!opts.user_agent.empty()) {
curl_easy_setopt(curl, CURLOPT_USERAGENT, opts.user_agent.c_str());
}
// Accept-Encoding for compressed responses
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "");
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
@ -34,7 +108,6 @@ Response get(const std::string &url) {
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
}
curl_easy_cleanup(curl);
return resp;
}
@ -42,7 +115,7 @@ Response post(const std::string &url, const std::string &body,
const std::string &content_type) {
Response resp{};
CURL *curl = curl_easy_init();
CURL *curl = tl_curl.get();
if (!curl) {
resp.status_code = -1;
resp.body = "curl_easy_init failed";
@ -61,6 +134,73 @@ Response post(const std::string &url, const std::string &body,
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 10L);
ProgressData prog_data;
prog_data.start_time = std::chrono::steady_clock::now();
prog_data.timeout_ms = 11000;
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
// Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
resp.status_code = -1;
resp.body = curl_easy_strerror(res);
} else {
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &resp.status_code);
}
curl_slist_free_all(headers);
return resp;
}
Response post(const std::string &url, const std::string &body,
const PostOptions &opts) {
Response resp{};
CURL *curl = tl_curl.get();
if (!curl) {
resp.status_code = -1;
resp.body = "curl_easy_init failed";
return resp;
}
struct curl_slist *headers = nullptr;
headers =
curl_slist_append(headers, ("Content-Type: " + opts.content_type).c_str());
if (!opts.bearer_token.empty()) {
headers = curl_slist_append(
headers, ("Authorization: Bearer " + opts.bearer_token).c_str());
headers = curl_slist_append(
headers, ("x-api-token: " + opts.bearer_token).c_str());
}
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_cb);
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &resp.body);
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
ProgressData prog_data;
if (opts.timeout_ms > 0) {
curl_easy_setopt(curl, CURLOPT_TIMEOUT_MS, static_cast<long>(opts.timeout_ms));
prog_data.start_time = std::chrono::steady_clock::now();
prog_data.timeout_ms = opts.timeout_ms + 1000;
curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_cb);
curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &prog_data);
curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);
}
// Prevent stalling: abort if transfer speed is less than 1 byte/sec for 10 seconds
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_LIMIT, 1L);
curl_easy_setopt(curl, CURLOPT_LOW_SPEED_TIME, 10L);
curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1L);
CURLcode res = curl_easy_perform(curl);
if (res != CURLE_OK) {
resp.status_code = -1;
@ -70,7 +210,6 @@ Response post(const std::string &url, const std::string &body,
}
curl_slist_free_all(headers);
curl_easy_cleanup(curl);
return resp;
}

View File

@ -5,10 +5,10 @@
namespace logger {
/// Initialize the default logger (call once at startup).
void init(const std::string &app_name = "polymech");
void init(const std::string &app_name = "polymech", const std::string &log_level = "info");
/// Initialize logger with stderr sink (use in worker/IPC mode).
void init_stderr(const std::string &app_name = "polymech-worker");
void init_stderr(const std::string &app_name = "polymech-worker", const std::string &log_level = "info");
/// Log at various levels.
void info(const std::string &msg);

View File

@ -6,17 +6,24 @@
namespace logger {
void init(const std::string &app_name) {
static void apply_log_level(const std::string& level) {
if (level == "debug") spdlog::set_level(spdlog::level::debug);
else if (level == "warn") spdlog::set_level(spdlog::level::warn);
else if (level == "error") spdlog::set_level(spdlog::level::err);
else spdlog::set_level(spdlog::level::info);
}
// Initialize the process-wide default logger with a colored stdout sink.
// Call once at startup; `log_level` is applied via apply_log_level().
// NOTE(review): spdlog registers `app_name` globally — re-initialising with
// the same name likely throws; confirm before calling twice.
// Fix: removed the leftover hard-coded `spdlog::set_level(spdlog::level::debug)`
// line — dead residue from before the log_level parameter existed, which was
// immediately overridden by apply_log_level().
void init(const std::string &app_name, const std::string &log_level) {
  auto console = spdlog::stdout_color_mt(app_name);
  spdlog::set_default_logger(console);
  apply_log_level(log_level);
  spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}
void init_stderr(const std::string &app_name) {
void init_stderr(const std::string &app_name, const std::string &log_level) {
auto console = spdlog::stderr_color_mt(app_name);
spdlog::set_default_logger(console);
spdlog::set_level(spdlog::level::debug);
apply_log_level(log_level);
spdlog::set_pattern("[%H:%M:%S] [%^%l%$] %v");
}

View File

@ -31,4 +31,16 @@ std::string query(const std::string &table, const std::string &select = "*",
/// Returns the created row as JSON.
std::string insert(const std::string &table, const std::string &json_body);
/// Upsert a row into a table. Body is a JSON array or object string.
/// Returns the upserted array as JSON.
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict = "");
/// Update rows in a table. Body is a JSON object string.
/// Returns the updated rows as JSON.
std::string update(const std::string &table, const std::string &json_body, const std::string &filter);
/// Delete rows from a table.
/// Returns the deleted rows as JSON.
std::string del(const std::string &table, const std::string &filter);
} // namespace postgres

View File

@ -82,9 +82,11 @@ static http::Response supabase_get(const std::string &url) {
return resp;
}
/// Make an authenticated POST request.
static http::Response supabase_post(const std::string &url,
const std::string &body) {
/// Make an authenticated request with a JSON body (POST, PATCH, DELETE).
static http::Response supabase_request(const std::string &method,
const std::string &url,
const std::string &body,
const std::string &prefer_header) {
CURL *curl = curl_easy_init();
http::Response resp{};
if (!curl) {
@ -94,8 +96,12 @@ static http::Response supabase_post(const std::string &url,
}
struct curl_slist *headers = nullptr;
headers = curl_slist_append(headers, "Content-Type: application/json");
headers = curl_slist_append(headers, "Prefer: return=representation");
if (!body.empty()) {
headers = curl_slist_append(headers, "Content-Type: application/json");
}
if (!prefer_header.empty()) {
headers = curl_slist_append(headers, ("Prefer: " + prefer_header).c_str());
}
headers =
curl_slist_append(headers, ("apikey: " + s_config.supabase_key).c_str());
headers = curl_slist_append(
@ -108,8 +114,11 @@ static http::Response supabase_post(const std::string &url,
};
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, method.c_str());
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
if (!body.empty()) {
curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
}
curl_easy_setopt(
curl, CURLOPT_WRITEFUNCTION,
static_cast<size_t (*)(void *, size_t, size_t, void *)>(+write_cb));
@ -164,7 +173,7 @@ std::string insert(const std::string &table, const std::string &json_body) {
auto url = s_config.supabase_url + "/rest/v1/" + table;
logger::debug("postgres::insert → " + url);
auto resp = supabase_post(url, json_body);
auto resp = supabase_request("POST", url, json_body, "return=representation");
if (resp.status_code >= 200 && resp.status_code < 300) {
return resp.body;
}
@ -173,4 +182,55 @@ std::string insert(const std::string &table, const std::string &json_body) {
return resp.body;
}
// Upsert rows into `table` via PostgREST. `json_body` is a JSON object or
// array string; `on_conflict` optionally names the conflict-target column(s).
// Returns the response body (typically empty given return=minimal); on
// failure the error is logged and the error body returned.
std::string upsert(const std::string &table, const std::string &json_body, const std::string &on_conflict) {
  ensure_init();
  std::string url = s_config.supabase_url + "/rest/v1/" + table;
  if (!on_conflict.empty()) {
    url += "?on_conflict=" + on_conflict;
  }
  logger::debug("postgres::upsert → " + url);
  auto resp = supabase_request("POST", url, json_body, "return=minimal, resolution=merge-duplicates");
  const bool ok = resp.status_code >= 200 && resp.status_code < 300;
  if (!ok) {
    logger::error("postgres::upsert → HTTP " + std::to_string(resp.status_code) +
                  ": " + resp.body);
  }
  return resp.body;
}
// PATCH rows in `table` matching the PostgREST `filter` query string; the
// columns to set come from `json_body`. Returns the updated rows as JSON
// (return=representation); on failure the error is logged and the error
// body returned.
std::string update(const std::string &table, const std::string &json_body, const std::string &filter) {
  ensure_init();
  std::string url = s_config.supabase_url + "/rest/v1/" + table;
  if (!filter.empty()) {
    url += "?" + filter;
  }
  logger::debug("postgres::update → " + url);
  auto resp = supabase_request("PATCH", url, json_body, "return=representation");
  const bool ok = resp.status_code >= 200 && resp.status_code < 300;
  if (!ok) {
    logger::error("postgres::update → HTTP " + std::to_string(resp.status_code) +
                  ": " + resp.body);
  }
  return resp.body;
}
// DELETE rows from `table` matching the PostgREST `filter` query string.
// Returns the deleted rows as JSON (return=representation); on failure the
// error is logged and the error body returned.
std::string del(const std::string &table, const std::string &filter) {
  ensure_init();
  std::string url = s_config.supabase_url + "/rest/v1/" + table;
  if (!filter.empty()) {
    url += "?" + filter;
  }
  logger::debug("postgres::del → " + url);
  auto resp = supabase_request("DELETE", url, "", "return=representation");
  const bool ok = resp.status_code >= 200 && resp.status_code < 300;
  if (!ok) {
    logger::error("postgres::del → HTTP " + std::to_string(resp.status_code) +
                  ": " + resp.body);
  }
  return resp.body;
}
} // namespace postgres

View File

@ -25,6 +25,8 @@ struct MapResult {
int reviews = 0;
GpsCoordinates gps;
std::string thumbnail;
std::string raw_json;
std::string geo_json;
};
struct SearchResult {
@ -35,17 +37,35 @@ struct SearchResult {
// ── Config ──────────────────────────────────────────────────────────────────
// Runtime tuning knobs, populated 1:1 from the [system] table of the TOML
// config in load_config(); the initializers below are the defaults used
// when a key is absent.
struct SystemTuningOptions {
  int executor_threads = 0; // 0 = hardware concurrency
  int max_concurrent_jobs_per_user = 10;
  int http_concurrency_throttle = 50;
  int queue_depth_max = 10000;
  int bulk_dequeue_size = 1;
  int ipc_timeout_ms = 300000;
  int max_ipc_connections = 100;
  int buffer_size_max = 50 * 1024 * 1024;
};
// Application configuration; see load_config() for the TOML key mapping.
struct Config {
  SystemTuningOptions system; // [system] tuning table
  // [services] API keys
  std::string serpapi_key;
  std::string geocoder_key;
  std::string bigdata_key;
  std::string scrapeless_key;
  // Database / Supabase connection settings
  std::string postgres_url;
  std::string supabase_url;
  std::string supabase_service_key;
  // [enricher]
  std::string enricher_meta_scraper;
  int enricher_meta_concurrency = 5;
  int enricher_meta_idle_timeout = 60;
  int enricher_location_concurrency = 1;
};
/// Load config from a TOML file (e.g. config/postgres.toml)
Config load_config(const std::string& path = "config/postgres.toml");
Config load_config(const std::string &path = "config/postgres.toml");
// ── Search API ──────────────────────────────────────────────────────────────
@ -61,6 +81,13 @@ struct SearchOptions {
};
/// Execute a SerpAPI Google Maps search. Handles pagination up to opts.limit.
SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts);
SearchResult search_google_maps(const Config &cfg, const SearchOptions &opts);
/// Resolve geo coordinate to place info
std::string resolve_geo(double lat, double lng, const std::string &key,
int timeout_ms = 3000);
void resolve_geo_batch(std::vector<MapResult> &results, const std::string &key,
int concurrency = 10, int timeout_ms = 3000);
} // namespace search

View File

@ -6,6 +6,12 @@
#include <sstream>
#include <cstdio>
#include <iostream>
#include <rapidjson/stringbuffer.h>
#include <rapidjson/writer.h>
#include <thread>
#include <mutex>
#include <atomic>
namespace search {
@ -15,7 +21,7 @@ static std::string url_encode(const std::string& val) {
std::string result;
result.reserve(val.size() * 2);
for (unsigned char c : val) {
if (isalnum(c) || c == '-' || c == '_' || c == '.' || c == '~') {
if (isalnum(static_cast<unsigned char>(c)) || c == '-' || c == '_' || c == '.' || c == '~') {
result += static_cast<char>(c);
} else {
char buf[4];
@ -44,10 +50,26 @@ Config load_config(const std::string& path) {
if (auto v = tbl["services"]["SERPAPI_KEY"].value<std::string>()) cfg.serpapi_key = *v;
if (auto v = tbl["services"]["GEO_CODER_KEY"].value<std::string>()) cfg.geocoder_key = *v;
if (auto v = tbl["services"]["BIG_DATA_KEY"].value<std::string>()) cfg.bigdata_key = *v;
if (auto v = tbl["services"]["SCRAPELESS_KEY"].value<std::string>()) cfg.scrapeless_key = *v;
// [enricher]
if (auto v = tbl["enricher"]["ENRICHER_META_SCRAPER"].value<std::string>()) cfg.enricher_meta_scraper = *v;
if (auto v = tbl["enricher"]["ENRICHER_META_CONCURRENCY"].value<int>()) cfg.enricher_meta_concurrency = *v;
if (auto v = tbl["enricher"]["ENRICHER_META_IDLE_TIMEOUT"].value<int>()) cfg.enricher_meta_idle_timeout = *v;
if (auto v = tbl["enricher"]["ENRICHER_LOCATION_CONCURRENCY"].value<int>()) cfg.enricher_location_concurrency = *v;
// [system]
if (auto v = tbl["system"]["executor_threads"].value<int>()) cfg.system.executor_threads = *v;
if (auto v = tbl["system"]["max_concurrent_jobs_per_user"].value<int>()) cfg.system.max_concurrent_jobs_per_user = *v;
if (auto v = tbl["system"]["http_concurrency_throttle"].value<int>()) cfg.system.http_concurrency_throttle = *v;
if (auto v = tbl["system"]["queue_depth_max"].value<int>()) cfg.system.queue_depth_max = *v;
if (auto v = tbl["system"]["bulk_dequeue_size"].value<int>()) cfg.system.bulk_dequeue_size = *v;
if (auto v = tbl["system"]["ipc_timeout_ms"].value<int>()) cfg.system.ipc_timeout_ms = *v;
if (auto v = tbl["system"]["max_ipc_connections"].value<int>()) cfg.system.max_ipc_connections = *v;
if (auto v = tbl["system"]["buffer_size_max"].value<int>()) cfg.system.buffer_size_max = *v;
} catch (const toml::parse_error& err) {
// Config file missing or malformed — caller should check empty keys
(void)err;
std::cerr << "[config] TOML parse error in " << path << ": " << err.what() << "\n";
}
return cfg;
}
@ -86,6 +108,13 @@ static void parse_results(const rapidjson::Value& arr, std::vector<MapResult>& o
if (!obj.IsObject()) continue;
MapResult r;
// Capture raw JSON string
rapidjson::StringBuffer buf;
rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
obj.Accept(writer);
r.raw_json = std::string(buf.GetString(), buf.GetSize());
if (obj.HasMember("title") && obj["title"].IsString())
r.title = obj["title"].GetString();
if (obj.HasMember("place_id") && obj["place_id"].IsString())
@ -196,4 +225,49 @@ SearchResult search_google_maps(const Config& cfg, const SearchOptions& opts) {
return result;
}
// ── Geo enrichment ──────────────────────────────────────────────────────────
// Reverse-geocode a coordinate via the BigDataCloud API. Returns the raw
// JSON response body, or "{}" when the key is missing, formatting fails,
// the request fails, or a non-200 status is returned.
// Fix: the previous fixed 512-byte snprintf buffer silently truncated the
// URL when the API key was long, producing a malformed request. The key is
// now appended via std::string (unbounded), and only the bounded numeric
// part goes through snprintf, with an explicit truncation check.
std::string resolve_geo(double lat, double lng, const std::string& key, int timeout_ms) {
  if (key.empty()) return "{}";
  // 7 decimal places (~1 cm of latitude) comfortably fits in 128 bytes.
  char coords[128];
  int written = snprintf(coords, sizeof(coords),
                         "latitude=%.7f&longitude=%.7f", lat, lng);
  if (written < 0 || written >= static_cast<int>(sizeof(coords))) return "{}";
  std::string url =
      "https://api.bigdatacloud.net/data/reverse-geocode?" +
      std::string(coords) + "&localityLanguage=en&key=" + key;
  http::GetOptions opts;
  opts.timeout_ms = timeout_ms;
  auto resp = http::get(url, opts);
  if (resp.status_code == 200 && !resp.body.empty()) {
    return resp.body;
  }
  return "{}";
}
void resolve_geo_batch(std::vector<MapResult>& results, const std::string& key, int concurrency, int timeout_ms) {
if (key.empty() || results.empty()) return;
std::atomic<size_t> current_idx{0};
std::vector<std::thread> threads;
int num_threads = std::min<int>(concurrency, static_cast<int>(results.size()));
for (int i = 0; i < num_threads; ++i) {
threads.emplace_back([&]() {
while (true) {
size_t idx = current_idx.fetch_add(1);
if (idx >= results.size()) break;
auto& r = results[idx];
if (r.gps.lat != 0 || r.gps.lng != 0) {
r.geo_json = resolve_geo(r.gps.lat, r.gps.lng, key, timeout_ms);
}
}
});
}
for (auto& t : threads) {
if (t.joinable()) t.join();
}
}
} // namespace search

View File

@ -15,8 +15,8 @@ Port the [gridsearch-worker.ts](../src/products/locations/gridsearch-worker.ts)
| `grid` | ✅ Done | 13 | 105 |
| `search` | ✅ Done | 8 | 13 |
| CLI `gridsearch` | ✅ Done | — | dry-run verified (3ms) |
| IPC `gridsearch` | 🔧 Stub | — | routes msg, TODO: parse payload |
| **Total** | | **62** | **248** |
| IPC `gridsearch` | ✅ Done | 1 | 30 |
| **Total** | | **63** | **278** |
---
@ -44,7 +44,7 @@ GADM Resolve → Grid Generate → SerpAPI Search → Enrich → Supabase Upsert
| **1. GADM Resolve** | GID list + target level | `GridFeature[]` (GeoJSON polygons with GHS props) | Read pre-cached JSON files from `cache/gadm/boundary_{GID}_{LEVEL}.json` |
| **2. Grid Generate** | `GridFeature[]` + settings | `GridSearchHop[]` (waypoints: lat/lng/radius) | Centroid, bbox, distance, area, point-in-polygon, cell sorting |
| **3. Search** | Waypoints + query + SerpAPI key | Place results (JSON) | HTTP calls to `serpapi.com`, per-waypoint caching |
| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping **defer to Phase 2** |
| **4. Enrich** | Place results | Enriched data (emails, pages) | HTTP scraping |
| **5. Persist** | Enriched places | Supabase `places` + `grid_search_runs` | PostgREST upsert |
---
@ -168,14 +168,16 @@ Reads `[services].SERPAPI_KEY`, `GEO_CODER_KEY`, `BIG_DATA_KEY` from `config/pos
---
## CLI Subcommand: `gridsearch`
## CLI Subcommands ✅
### 1. `gridsearch` (One-shot execution)
```
polymech-cli gridsearch <GID> <QUERY> [OPTIONS]
Positionals:
GID GADM GID (e.g. ESP.1.1_1)
QUERY Search query (e.g. 'mecanizado cnc')
GID GADM GID (e.g. ESP.1.1_1) — ignored when --settings is used
QUERY Search query — ignored when --settings is used
Options:
-l, --level INT Target GADM level (default: 0)
@ -186,9 +188,25 @@ Options:
--sort TEXT Path order: snake|zigzag|spiral-out|spiral-in|shortest
-c, --config TEXT TOML config path (default: config/postgres.toml)
--cache-dir TEXT GADM cache directory (default: cache/gadm)
--settings TEXT JSON settings file (matches TypeScript GuidedPreset shape)
--enrich Run enrichment pipeline (meta + email) after search
--persistence-postgres Persist run data natively via Postgres
-o, --output TEXT Output JSON file (default: gridsearch-HH-MM.json in cwd)
--dry-run Generate grid only, skip SerpAPI search
```
### 2. `worker` (IPC Daemon execution)
```
polymech-cli worker [OPTIONS]
Options:
--daemon Run persistent daemon pool (tier-based)
-c, --config TEXT TOML config path (default: config/postgres.toml)
--user-uid TEXT User ID to bind this daemon to (needed for place owner)
--uds TEXT Run over Unix Domain Socket / Named Pipe (TCP on Windows) at the given path
```
### Execution flow
```
@ -210,7 +228,27 @@ polymech-cli gridsearch ABW "recycling" --dry-run
### IPC worker mode
The `worker` subcommand routes `gridsearch` message type (currently echoes payload — TODO: wire full pipeline from parsed JSON).
The `worker` subcommand routes multiplexed asynchronous `gridsearch` payloads. When launched via `--uds <path>`, it starts an Asio streaming server (AF_UNIX sockets on POSIX, TCP sockets on Windows). Event frames (`grid-ready`, `waypoint-start`, `location`, `node`, etc.) are emitted bidirectionally over the IPC bridging protocol without any blocking locks.
---
## Exposed Configuration / Tuning Parameters
As we integrate deeper with the core business logic, the Node orchestrator and internal services should configure and enforce limits on the underlying C++ concurrent engine. Relevant configuration surfaces we need to expose for the primary ecosystem libraries include:
### 1. Taskflow (`https://github.com/taskflow/taskflow`)
- **`executor_threads` (`num_workers`)**: The size of the `tf::Executor` thread pool. Because Gridsearch is heavily network-I/O bound (HTTP calls for search/enrichment), setting this significantly higher than `std::thread::hardware_concurrency()` can substantially improve overall HTTP throughput, since workers spend most of their time waiting on sockets rather than using the CPU.
- **`max_concurrent_jobs_per_user`**: A structural limit dictating how many concurrent gridsearch invocation graphs a single tenant/user can enqueue and run actively to prevent monopolization.
- **`http_concurrency_throttle`**: Task limits enforced upon node scraping or SerpAPI requests per-pipeline graph to avoid widespread `429 Too Many Requests` bans.
### 2. Moodycamel ConcurrentQueue (`https://github.com/cameron314/concurrentqueue`)
- **`queue_depth_max` / `backpressure`**: Since Moodycamel queue memory allocates dynamically and lock-free to any capacity, we must mandate a hard software ceiling/backpressure limit over the Node-to-C++ IPC layer. If Node blindly streams jobs faster than Taskflow can execute them, the daemon will eventually OOM.
- **`bulk_dequeue_size`**: A tuning parameter for the dispatch thread controlling how many queued IPC tasks are dequeued per bulk operation.
### 3. Boost.Asio (`https://github.com/chriskohlhoff/asio`)
- **`ipc_timeout_ms` (Read/Write)**: Mandatory timeouts for the IPC socket layer. If the orchestrator stalls, crashes, or goes silent, Asio must reap the connection and automatically GC the in-flight tasks to prevent Zombie worker processes.
- **`max_ipc_connections`**: Absolute limit on simultaneous orchestration pipelines dialing into a single Worker Pod.
- **`buffer_size_max`**: Soft constraints on async payload allocations so a malformed 200MB JSON frame from Node.js doesn't memory-spike the `asio::read` operations abruptly.
---
@ -260,7 +298,15 @@ All packages depend on `logger` and `json` implicitly.
### Integration test (Node.js)
- Existing `orchestrator/test-ipc.mjs` validates spawn/lifecycle/ping/job
- TODO: `test-gridsearch.mjs` for full pipeline via IPC
- `orchestrator/test-gridsearch-ipc.mjs` validates full pipeline via IPC (8 event types + job result)
- `orchestrator/test-gridsearch-ipc-uds.mjs` validates high-throughput Unix Domain Socket transport, backpressure limits, and graceful mid-flight cancellation via `action: cancel` frames.
---
## IPC Cancellation & Dynamic Job Tuning
The high-performance UDS daemon now natively tracks and intercepts JSON `action: cancel` frames referencing specific `jobId`s to gracefully exit Taskflow jobs mid-flight.
Dynamic tuning limits, such as memory buffering boundaries or threading capacities, are inherently validated and bound by hard ceilings established inside the `[system]` constraint block of `config/postgres.toml`.
---
@ -268,10 +314,7 @@ All packages depend on `logger` and `json` implicitly.
| Item | Reason |
|------|--------|
| Enrichment (email scraping) | Complex + browser-dependent; keep in Node.js |
| SerpAPI response caching | State store managed by orchestrator for now |
| Protobuf framing | JSON IPC sufficient for current throughput |
| Multi-threaded search | Sequential is fine for SerpAPI rate limits |
| GEOS integration | Custom geo is sufficient for grid math |
| IPC gridsearch payload parser | Currently a stub; wire full pipeline from JSON |
| Supabase upsert in CLI | Use postgres package for batch insert |

View File

@ -1,351 +1,265 @@
#include <iostream>
#include <string>
#include <chrono>
#include <CLI/CLI.hpp>
#include <toml++/toml.hpp>
#include "html/html.h"
#include "http/http.h"
#include "ipc/ipc.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
#include "json/json.h"
#include "gadm_reader/gadm_reader.h"
#include "grid/grid.h"
#include "search/search.h"
#ifndef PROJECT_VERSION
#define PROJECT_VERSION "0.1.0"
#endif
// CLI entry point. Registers all subcommands with CLI11
// (parse/select/config/fetch/json/db/worker/gridsearch), parses argv,
// then dispatches to exactly one handler. Returns 0 on success, 1 on a
// handler error. Worker mode logs to stderr so stdout stays reserved
// for length-prefixed IPC JSON frames.
int main(int argc, char *argv[]) {
  CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"};
  app.set_version_flag("-v,--version", PROJECT_VERSION);
  // Subcommand: parse HTML
  std::string html_input;
  auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements");
  parse_cmd->add_option("html", html_input, "HTML string to parse")->required();
  // Subcommand: select from HTML
  std::string select_input;
  std::string selector;
  auto *select_cmd =
      app.add_subcommand("select", "CSS-select elements from HTML");
  select_cmd->add_option("html", select_input, "HTML string")->required();
  select_cmd->add_option("selector", selector, "CSS selector")->required();
  // Subcommand: config — read a TOML file
  std::string config_path;
  auto *config_cmd =
      app.add_subcommand("config", "Read and display a TOML config file");
  config_cmd->add_option("file", config_path, "Path to TOML file")->required();
  // Subcommand: fetch — HTTP GET a URL
  std::string fetch_url;
  auto *fetch_cmd =
      app.add_subcommand("fetch", "HTTP GET a URL and print the response");
  fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required();
  // Subcommand: json — prettify JSON
  std::string json_input;
  auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string");
  json_cmd->add_option("input", json_input, "JSON string")->required();
  // Subcommand: db — connect to Supabase and query
  std::string db_config_path = "config/postgres.toml";
  std::string db_table;
  int db_limit = 10;
  auto *db_cmd =
      app.add_subcommand("db", "Connect to Supabase and query a table");
  db_cmd->add_option("-c,--config", db_config_path, "TOML config path")
      ->default_val("config/postgres.toml");
  db_cmd->add_option("table", db_table, "Table to query (optional)");
  db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);
  // Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
  auto *worker_cmd = app.add_subcommand(
      "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
  // Subcommand: gridsearch — Run a full gridsearch pipeline
  std::string gs_gid;
  int gs_level = 0;
  std::string gs_query;
  std::string gs_grid_mode = "hex";
  double gs_cell_size = 5.0;
  int gs_limit = 20;
  int gs_zoom = 13;
  std::string gs_sort = "snake";
  std::string gs_config_path = "config/postgres.toml";
  std::string gs_cache_dir = "cache/gadm";
  bool gs_dry_run = false;
  auto *gs_cmd = app.add_subcommand("gridsearch", "Run a full gridsearch pipeline (enumerate → grid → search)");
  gs_cmd->add_option("gid", gs_gid, "GADM GID (e.g. ESP.1.1_1)")->required();
  gs_cmd->add_option("query", gs_query, "Search query (e.g. 'mecanizado cnc')")->required();
  gs_cmd->add_option("-l,--level", gs_level, "Target GADM level")->default_val(0);
  gs_cmd->add_option("-m,--mode", gs_grid_mode, "Grid mode: hex|square|admin|centers")->default_val("hex");
  gs_cmd->add_option("-s,--cell-size", gs_cell_size, "Cell size in km")->default_val(5.0);
  gs_cmd->add_option("--limit", gs_limit, "Max results per area")->default_val(20);
  gs_cmd->add_option("-z,--zoom", gs_zoom, "Google Maps zoom")->default_val(13);
  gs_cmd->add_option("--sort", gs_sort, "Path order: snake|zigzag|spiral-out|spiral-in|shortest")->default_val("snake");
  gs_cmd->add_option("-c,--config", gs_config_path, "TOML config path")->default_val("config/postgres.toml");
  gs_cmd->add_option("--cache-dir", gs_cache_dir, "GADM cache directory")->default_val("cache/gadm");
  gs_cmd->add_flag("--dry-run", gs_dry_run, "Generate grid only, skip SerpAPI search");
  CLI11_PARSE(app, argc, argv);
  // Worker mode uses stderr for logs to keep stdout clean for IPC frames
  if (worker_cmd->parsed()) {
    logger::init_stderr("polymech-worker");
  } else {
    logger::init("polymech-cli");
  }
  // ── worker mode ─────────────────────────────────────────────────────────
  // Blocking read loop: one length-prefixed JSON request in, one (or more)
  // responses out, until stdin closes or a shutdown message arrives.
  if (worker_cmd->parsed()) {
    logger::info("Worker mode: listening on stdin");
    // Send a "ready" message so the orchestrator knows we're alive
    ipc::write_message({"0", "ready", "{}"});
    while (true) {
      ipc::Message req;
      if (!ipc::read_message(req)) {
        logger::info("Worker: stdin closed, exiting");
        break;
      }
      logger::debug("Worker recv: type=" + req.type + " id=" + req.id);
      if (req.type == "ping") {
        ipc::write_message({req.id, "pong", "{}"});
      } else if (req.type == "gridsearch") {
        // Parse gridsearch job from payload
        logger::info("Worker: gridsearch job received");
        // TODO: parse req.payload JSON into gs options, run pipeline, emit progress
        ipc::write_message({req.id, "job_result", req.payload});
      } else if (req.type == "job") {
        // Stub: echo the payload back as job_result
        ipc::write_message({req.id, "job_result", req.payload});
      } else if (req.type == "shutdown") {
        ipc::write_message({req.id, "shutdown_ack", "{}"});
        logger::info("Worker: shutdown requested, exiting");
        break;
      } else {
        // Unknown type — respond with error
        ipc::write_message(
            {req.id, "error",
             "{\"message\":\"unknown type: " + req.type + "\"}"});
      }
    }
    return 0;
  }
  // ── existing subcommands ────────────────────────────────────────────────
  if (parse_cmd->parsed()) {
    auto elements = html::parse(html_input);
    logger::info("Parsed " + std::to_string(elements.size()) + " elements");
    for (const auto &el : elements) {
      std::cout << "<" << el.tag << "> " << el.text << "\n";
    }
    return 0;
  }
  if (select_cmd->parsed()) {
    auto matches = html::select(select_input, selector);
    logger::info("Matched " + std::to_string(matches.size()) + " elements");
    for (const auto &m : matches) {
      std::cout << m << "\n";
    }
    return 0;
  }
  if (config_cmd->parsed()) {
    try {
      auto tbl = toml::parse_file(config_path);
      logger::info("Loaded config: " + config_path);
      std::cout << tbl << "\n";
    } catch (const toml::parse_error &err) {
      logger::error("TOML parse error: " + std::string(err.what()));
      return 1;
    }
    return 0;
  }
  if (fetch_cmd->parsed()) {
    auto resp = http::get(fetch_url);
    logger::info("HTTP " + std::to_string(resp.status_code) + " from " +
                 fetch_url);
    // Pretty-print only when the body is valid JSON; otherwise dump raw.
    if (json::is_valid(resp.body)) {
      std::cout << json::prettify(resp.body) << "\n";
    } else {
      std::cout << resp.body << "\n";
    }
    return 0;
  }
  if (json_cmd->parsed()) {
    if (!json::is_valid(json_input)) {
      logger::error("Invalid JSON input");
      return 1;
    }
    std::cout << json::prettify(json_input) << "\n";
    return 0;
  }
  if (db_cmd->parsed()) {
    try {
      auto cfg = toml::parse_file(db_config_path);
      postgres::Config pg_cfg;
      pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
      pg_cfg.supabase_key =
          cfg["supabase"]["publishable_key"].value_or(std::string(""));
      postgres::init(pg_cfg);
      auto status = postgres::ping();
      logger::info("Supabase: " + status);
      if (!db_table.empty()) {
        auto result = postgres::query(db_table, "*", "", db_limit);
        if (json::is_valid(result)) {
          std::cout << json::prettify(result) << "\n";
        } else {
          std::cout << result << "\n";
        }
      }
    } catch (const std::exception &e) {
      logger::error(std::string("db error: ") + e.what());
      return 1;
    }
    return 0;
  }
  // ── gridsearch subcommand ──────────────────────────────────────────────
  // Pipeline: load config → resolve GADM boundary → generate grid →
  // (optionally) SerpAPI-search each waypoint. Emits one JSON document on
  // stdout; all diagnostics go through the logger.
  if (gs_cmd->parsed()) {
    logger::info("Gridsearch: gid=" + gs_gid + " query=\"" + gs_query + "\" mode=" + gs_grid_mode);
    auto t0 = std::chrono::steady_clock::now();
    // 1. Load config
    auto cfg = search::load_config(gs_config_path);
    // The API key is only needed when we actually hit SerpAPI.
    if (cfg.serpapi_key.empty() && !gs_dry_run) {
      logger::error("No SERPAPI_KEY in " + gs_config_path);
      return 1;
    }
    // 2. Resolve GADM boundaries
    logger::info("Loading boundary for " + gs_gid + " level=" + std::to_string(gs_level));
    auto boundary = gadm::load_boundary(gs_gid, gs_level, gs_cache_dir);
    if (!boundary.error.empty()) {
      logger::error("Boundary error: " + boundary.error);
      return 1;
    }
    logger::info("Resolved " + std::to_string(boundary.features.size()) + " features");
    // 3. Generate grid
    // Filters are effectively disabled here (zero thresholds,
    // allowMissingGhs) — only mode/cell size/sort come from the CLI.
    grid::GridOptions grid_opts;
    grid_opts.gridMode = gs_grid_mode;
    grid_opts.cellSize = gs_cell_size;
    grid_opts.cellOverlap = 0;
    grid_opts.centroidOverlap = 0;
    grid_opts.maxCellsLimit = 10000;
    grid_opts.maxElevation = 0;
    grid_opts.minDensity = 0;
    grid_opts.minGhsPop = 0;
    grid_opts.minGhsBuilt = 0;
    grid_opts.ghsFilterMode = "OR";
    grid_opts.allowMissingGhs = true;
    grid_opts.bypassFilters = false;
    grid_opts.pathOrder = gs_sort;
    grid_opts.groupByRegion = false;
    auto grid_result = grid::generate(boundary.features, grid_opts);
    if (!grid_result.error.empty()) {
      logger::error("Grid error: " + grid_result.error);
      return 1;
    }
    logger::info("Grid: " + std::to_string(grid_result.waypoints.size()) + " waypoints, "
                 + std::to_string(grid_result.skippedCells) + " skipped");
    if (gs_dry_run) {
      // Output waypoints as JSON array
      // JSON is assembled by hand with snprintf to avoid pulling a
      // serializer in for this flat, fixed-shape record.
      std::cout << "[";
      for (size_t i = 0; i < grid_result.waypoints.size(); ++i) {
        const auto& wp = grid_result.waypoints[i];
        if (i > 0) std::cout << ",";
        char buf[256];
        snprintf(buf, sizeof(buf),
                 "{\"step\":%d,\"lat\":%.6f,\"lng\":%.6f,\"radius_km\":%.3f}",
                 wp.step, wp.lat, wp.lng, wp.radius_km);
        std::cout << buf;
      }
      std::cout << "]\n";
      auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
          std::chrono::steady_clock::now() - t0).count();
      logger::info("Dry-run complete in " + std::to_string(elapsed) + "ms");
      return 0;
    }
    // 4. Search each waypoint via SerpAPI
    // Waypoints are searched sequentially; per-waypoint summaries are
    // streamed into one JSON object closed by the summary block below.
    logger::info("Starting SerpAPI search for " + std::to_string(grid_result.waypoints.size()) + " waypoints");
    int totalResults = 0;
    int totalApiCalls = 0;
    std::cout << "{\"waypoints\":[";
    for (size_t i = 0; i < grid_result.waypoints.size(); ++i) {
      const auto& wp = grid_result.waypoints[i];
      search::SearchOptions sopts;
      sopts.query = gs_query;
      sopts.lat = wp.lat;
      sopts.lng = wp.lng;
      sopts.zoom = gs_zoom;
      sopts.limit = gs_limit;
      auto sr = search::search_google_maps(cfg, sopts);
      totalResults += static_cast<int>(sr.results.size());
      totalApiCalls += sr.apiCalls;
      if (i > 0) std::cout << ",";
      char hdr[256];
      snprintf(hdr, sizeof(hdr),
               "{\"step\":%d,\"lat\":%.6f,\"lng\":%.6f,\"results\":%zu,\"apiCalls\":%d}",
               wp.step, wp.lat, wp.lng, sr.results.size(), sr.apiCalls);
      std::cout << hdr;
      // Log progress
      // NOTE(review): the leading "" below looks like a dropped "— "
      // separator in the log line — confirm intended output.
      logger::info("Waypoint " + std::to_string(i + 1) + "/" +
                   std::to_string(grid_result.waypoints.size()) +
                   "" + std::to_string(sr.results.size()) + " results");
    }
    std::cout << "],";
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(
        std::chrono::steady_clock::now() - t0).count();
    char summary[512];
    snprintf(summary, sizeof(summary),
             "\"summary\":{\"waypoints\":%zu,\"totalResults\":%d,"
             "\"totalApiCalls\":%d,\"elapsedMs\":%lld}}",
             grid_result.waypoints.size(), totalResults, totalApiCalls,
             static_cast<long long>(elapsed));
    std::cout << summary << "\n";
    logger::info("Gridsearch done: " + std::to_string(totalResults) +
                 " results, " + std::to_string(totalApiCalls) +
                 " API calls, " + std::to_string(elapsed) + "ms");
    return 0;
  }
  // No subcommand — show help
  std::cout << app.help() << "\n";
  return 0;
}
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <set>
#include <ctime>
#include <iomanip>
#include <sstream>
#include <rapidjson/document.h>
#include <CLI/CLI.hpp>
#include <toml++/toml.hpp>
#include "html/html.h"
#include "http/http.h"
#include "ipc/ipc.h"
#include "logger/logger.h"
#include "postgres/postgres.h"
#include "json/json.h"
#include "gadm_reader/gadm_reader.h"
#include "grid/grid.h"
#include "search/search.h"
#include "enrichers/enrichers.h"
#include "cmd_gridsearch.h"
#ifndef PROJECT_VERSION
#define PROJECT_VERSION "0.1.0"
#endif
// CLI entry point. Registers all subcommands with CLI11, parses argv,
// then dispatches to exactly one handler. The gridsearch options and
// pipeline live in cmd_gridsearch.*; worker mode supports plain
// stdin/stdout IPC, a persistent daemon pool (--daemon), and a UDS/named
// pipe server (--uds). Returns 0 on success, 1 on a handler error.
int main(int argc, char *argv[]) {
  CLI::App app{"polymech-cli — Polymech C++ CLI", "polymech-cli"};
  app.set_version_flag("-v,--version", PROJECT_VERSION);
  // Global log-level flag, applied to whichever logger sink is chosen below.
  std::string log_level = "info";
  app.add_option("--log-level", log_level, "Set log level (debug/info/warn/error)")->default_val("info");
  // Subcommand: parse HTML
  std::string html_input;
  auto *parse_cmd = app.add_subcommand("parse", "Parse HTML and list elements");
  parse_cmd->add_option("html", html_input, "HTML string to parse")->required();
  // Subcommand: select from HTML
  std::string select_input;
  std::string selector;
  auto *select_cmd =
      app.add_subcommand("select", "CSS-select elements from HTML");
  select_cmd->add_option("html", select_input, "HTML string")->required();
  select_cmd->add_option("selector", selector, "CSS selector")->required();
  // Subcommand: config — read a TOML file
  std::string config_path;
  auto *config_cmd =
      app.add_subcommand("config", "Read and display a TOML config file");
  config_cmd->add_option("file", config_path, "Path to TOML file")->required();
  // Subcommand: fetch — HTTP GET a URL
  std::string fetch_url;
  auto *fetch_cmd =
      app.add_subcommand("fetch", "HTTP GET a URL and print the response");
  fetch_cmd->add_option("url", fetch_url, "URL to fetch")->required();
  // Subcommand: json — prettify JSON
  std::string json_input;
  auto *json_cmd = app.add_subcommand("json", "Prettify a JSON string");
  json_cmd->add_option("input", json_input, "JSON string")->required();
  // Subcommand: db — connect to Supabase and query
  std::string db_config_path = "config/postgres.toml";
  std::string db_table;
  int db_limit = 10;
  auto *db_cmd =
      app.add_subcommand("db", "Connect to Supabase and query a table");
  db_cmd->add_option("-c,--config", db_config_path, "TOML config path")
      ->default_val("config/postgres.toml");
  db_cmd->add_option("table", db_table, "Table to query (optional)");
  db_cmd->add_option("-l,--limit", db_limit, "Row limit")->default_val(10);
  // Subcommand: worker — IPC mode (spawned by Node.js orchestrator)
  bool daemon_mode = false;
  std::string daemon_uid;
  std::string worker_config = "config/postgres.toml";
  std::string uds_path;
  auto *worker_cmd = app.add_subcommand(
      "worker", "Run as IPC worker (stdin/stdout length-prefixed JSON)");
  worker_cmd->add_flag("--daemon", daemon_mode, "Run persistent daemon pool (tier-based)");
  worker_cmd->add_option("-c,--config", worker_config, "TOML config path")->default_val("config/postgres.toml");
  worker_cmd->add_option("--user-uid", daemon_uid, "User ID to bind this daemon to (needed for place owner)");
  worker_cmd->add_option("--uds", uds_path, "Run over Unix Domain Socket / Named Pipe at the given path");
  // Subcommand: gridsearch — Run a full gridsearch pipeline
  // (option wiring delegated to cmd_gridsearch.h to keep main lean)
  auto* gs_cmd = polymech::setup_cmd_gridsearch(app);
  CLI11_PARSE(app, argc, argv);
  // Worker mode uses stderr for logs to keep stdout clean for IPC frames
  if (worker_cmd->parsed()) {
    logger::init_stderr("polymech-worker", log_level);
  } else {
    logger::init("polymech-cli", log_level);
  }
  // ── worker mode ─────────────────────────────────────────────────────────
  if (worker_cmd->parsed()) {
    logger::info("Worker mode: listening on stdin");
    if (daemon_mode) {
      // Daemon workers pre-initialize the Postgres client with the
      // service key from the worker config so jobs can persist directly.
      logger::info("Daemon mode enabled. Pre-initializing Postgres pool and binding to User: " + (daemon_uid.empty() ? "None" : daemon_uid));
      auto cfg = search::load_config(worker_config);
      postgres::Config pcfg;
      pcfg.supabase_url = cfg.supabase_url;
      pcfg.supabase_key = cfg.supabase_service_key;
      postgres::init(pcfg);
    }
    if (!uds_path.empty()) {
      // UDS transport replaces the stdin/stdout loop entirely.
      logger::info("Worker mode: UDS Server active on " + uds_path);
      int rc = polymech::run_cmd_gridsearch_uds(uds_path, daemon_mode, daemon_uid);
      return rc;
    }
    // Send a "ready" message so the orchestrator knows we're alive
    ipc::write_message({"0", "ready", "{}"});
    while (true) {
      ipc::Message req;
      if (!ipc::read_message(req)) {
        logger::info("Worker: stdin closed, exiting");
        break;
      }
      logger::debug("Worker recv: type=" + req.type + " id=" + req.id);
      if (req.type == "ping") {
        ipc::write_message({req.id, "pong", "{}"});
      } else if (req.type == "gridsearch") {
        logger::info("Worker: gridsearch job received");
        // Build callbacks that emit IPC events.
        // Progress events use id "0" (unmatched → event for orchestrator).
        // The final job_result uses the original req.id so the promise resolves.
        // req_id is captured by reference; the callback is only invoked
        // within run_cmd_gridsearch_ipc below, so the reference stays valid.
        std::string req_id = req.id;
        polymech::GridsearchCallbacks cb;
        cb.onEvent = [&req_id](const std::string& type, const std::string& json) {
          if (type == "job_result") {
            ipc::write_message({req_id, "job_result", json});
          } else {
            ipc::write_message({"0", type, json});
          }
        };
        int rc = polymech::run_cmd_gridsearch_ipc(req.payload, req.id, cb, daemon_mode, daemon_uid);
        if (rc != 0) {
          ipc::write_message({req.id, "error", "{\"message\":\"gridsearch pipeline failed\"}"});
        }
      } else if (req.type == "job") {
        // Stub: echo the payload back as job_result
        ipc::write_message({req.id, "job_result", req.payload});
      } else if (req.type == "shutdown") {
        ipc::write_message({req.id, "shutdown_ack", "{}"});
        logger::info("Worker: shutdown requested, exiting");
        break;
      } else {
        // Unknown type — respond with error
        ipc::write_message(
            {req.id, "error",
             "{\"message\":\"unknown type: " + req.type + "\"}"});
      }
    }
    return 0;
  }
  // ── existing subcommands ────────────────────────────────────────────────
  if (parse_cmd->parsed()) {
    auto elements = html::parse(html_input);
    logger::info("Parsed " + std::to_string(elements.size()) + " elements");
    for (const auto &el : elements) {
      std::cout << "<" << el.tag << "> " << el.text << "\n";
    }
    return 0;
  }
  if (select_cmd->parsed()) {
    auto matches = html::select(select_input, selector);
    logger::info("Matched " + std::to_string(matches.size()) + " elements");
    for (const auto &m : matches) {
      std::cout << m << "\n";
    }
    return 0;
  }
  if (config_cmd->parsed()) {
    try {
      auto tbl = toml::parse_file(config_path);
      logger::info("Loaded config: " + config_path);
      std::cout << tbl << "\n";
    } catch (const toml::parse_error &err) {
      logger::error("TOML parse error: " + std::string(err.what()));
      return 1;
    }
    return 0;
  }
  if (fetch_cmd->parsed()) {
    auto resp = http::get(fetch_url);
    logger::info("HTTP " + std::to_string(resp.status_code) + " from " +
                 fetch_url);
    // Pretty-print only when the body is valid JSON; otherwise dump raw.
    if (json::is_valid(resp.body)) {
      std::cout << json::prettify(resp.body) << "\n";
    } else {
      std::cout << resp.body << "\n";
    }
    return 0;
  }
  if (json_cmd->parsed()) {
    if (!json::is_valid(json_input)) {
      logger::error("Invalid JSON input");
      return 1;
    }
    std::cout << json::prettify(json_input) << "\n";
    return 0;
  }
  if (db_cmd->parsed()) {
    try {
      auto cfg = toml::parse_file(db_config_path);
      postgres::Config pg_cfg;
      pg_cfg.supabase_url = cfg["supabase"]["url"].value_or(std::string(""));
      pg_cfg.supabase_key =
          cfg["supabase"]["publishable_key"].value_or(std::string(""));
      postgres::init(pg_cfg);
      auto status = postgres::ping();
      logger::info("Supabase: " + status);
      if (!db_table.empty()) {
        auto result = postgres::query(db_table, "*", "", db_limit);
        if (json::is_valid(result)) {
          std::cout << json::prettify(result) << "\n";
        } else {
          std::cout << result << "\n";
        }
      }
    } catch (const std::exception &e) {
      logger::error(std::string("db error: ") + e.what());
      return 1;
    }
    return 0;
  }
  // ── gridsearch subcommand ──────────────────────────────────────────────
  if (gs_cmd->parsed()) {
    return polymech::run_cmd_gridsearch();
  }
  // No subcommand — show help
  std::cout << app.help() << "\n";
  return 0;
}

36
src/sys_metrics.cpp Normal file
View File

@ -0,0 +1,36 @@
// Process self-metrics (RSS and cumulative CPU time) for worker reporting.
//
// Fix: the non-Windows branch previously stubbed both functions to 0,
// making the metrics useless on Linux. It now reads current RSS from
// /proc/self/statm (Linux) and CPU time via POSIX getrusage(); platforms
// where those are unavailable still degrade gracefully to 0.
#include "sys_metrics.h"

#ifdef _WIN32
#define NOMINMAX
#include <windows.h>
#include <psapi.h>
#pragma comment(lib, "psapi.lib")

namespace polymech {

// Current working-set size of this process, in MiB (0 on failure).
size_t get_current_rss_mb() {
  PROCESS_MEMORY_COUNTERS info;
  if (GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info))) {
    return (size_t)(info.WorkingSetSize) / (1024 * 1024);
  }
  return 0;
}

// Total CPU time (kernel + user) consumed by this process, in ms (0 on failure).
uint64_t get_cpu_time_ms() {
  FILETIME creationTime, exitTime, kernelTime, userTime;
  if (GetProcessTimes(GetCurrentProcess(), &creationTime, &exitTime, &kernelTime, &userTime)) {
    ULARGE_INTEGER kernel, user;
    kernel.LowPart = kernelTime.dwLowDateTime;
    kernel.HighPart = kernelTime.dwHighDateTime;
    user.LowPart = userTime.dwLowDateTime;
    user.HighPart = userTime.dwHighDateTime;
    // FILETIME ticks are 100 ns; 10000 ticks per millisecond.
    return (kernel.QuadPart + user.QuadPart) / 10000;
  }
  return 0;
}

}  // namespace polymech

#else  // POSIX

#include <sys/resource.h>
#include <unistd.h>
#include <cstdio>

namespace polymech {

// Current resident set size in MiB. Reads the second field of
// /proc/self/statm (resident pages) and scales by the page size.
// Returns 0 where /proc is unavailable (e.g. macOS) or on any error,
// matching the previous stub behavior.
size_t get_current_rss_mb() {
  std::FILE *f = std::fopen("/proc/self/statm", "r");
  if (!f) return 0;
  unsigned long long total_pages = 0, resident_pages = 0;
  int matched = std::fscanf(f, "%llu %llu", &total_pages, &resident_pages);
  std::fclose(f);
  if (matched != 2) return 0;
  long page_size = sysconf(_SC_PAGESIZE);
  if (page_size <= 0) return 0;
  return (size_t)(resident_pages * (unsigned long long)page_size /
                  (1024ULL * 1024ULL));
}

// Total CPU time (user + system) consumed by this process, in ms,
// via the portable POSIX getrusage(RUSAGE_SELF) interface (0 on failure).
uint64_t get_cpu_time_ms() {
  struct rusage usage;
  if (getrusage(RUSAGE_SELF, &usage) != 0) return 0;
  uint64_t user_ms = (uint64_t)usage.ru_utime.tv_sec * 1000ULL +
                     (uint64_t)usage.ru_utime.tv_usec / 1000ULL;
  uint64_t sys_ms = (uint64_t)usage.ru_stime.tv_sec * 1000ULL +
                    (uint64_t)usage.ru_stime.tv_usec / 1000ULL;
  return user_ms + sys_ms;
}

}  // namespace polymech

#endif

8
src/sys_metrics.h Normal file
View File

@ -0,0 +1,8 @@
#pragma once
#include <cstddef>
#include <cstdint>

// Lightweight process self-metrics used by the worker to report its own
// resource usage. Implementations are platform-specific (sys_metrics.cpp).
namespace polymech {
// Current resident set size (working set) of this process in MiB;
// returns 0 when the platform provides no implementation or on failure.
size_t get_current_rss_mb();
// Cumulative CPU time (user + kernel) consumed by this process in ms;
// returns 0 when the platform provides no implementation or on failure.
uint64_t get_cpu_time_ms();
}

View File

@ -1,67 +1,74 @@
# Test targets — one Catch2 executable per package, plus functional/E2E
# suites. Each executable links Catch2WithMain and the package under test;
# catch_discover_tests registers individual test cases with CTest.
include(CTest)
include(Catch)
# Unit tests one per package
add_executable(test_logger unit/test_logger.cpp)
target_link_libraries(test_logger PRIVATE Catch2::Catch2WithMain logger)
catch_discover_tests(test_logger)
add_executable(test_html unit/test_html.cpp)
target_link_libraries(test_html PRIVATE Catch2::Catch2WithMain html)
catch_discover_tests(test_html)
add_executable(test_postgres unit/test_postgres.cpp)
target_link_libraries(test_postgres PRIVATE Catch2::Catch2WithMain postgres)
catch_discover_tests(test_postgres)
add_executable(test_json unit/test_json.cpp)
target_link_libraries(test_json PRIVATE Catch2::Catch2WithMain json)
catch_discover_tests(test_json)
add_executable(test_http unit/test_http.cpp)
target_link_libraries(test_http PRIVATE Catch2::Catch2WithMain http)
catch_discover_tests(test_http)
# Functional test end-to-end CLI
add_executable(test_functional functional/test_cli.cpp)
target_link_libraries(test_functional PRIVATE Catch2::Catch2WithMain CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json)
catch_discover_tests(test_functional)
# E2E test real Supabase connection (requires config/postgres.toml + network)
# Tests that read repository fixtures (config/, cache/gadm) must run with
# WORKING_DIRECTORY at the repo root so relative paths resolve.
add_executable(test_supabase e2e/test_supabase.cpp)
target_link_libraries(test_supabase PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres json)
catch_discover_tests(test_supabase WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_polymech unit/test_polymech.cpp)
target_link_libraries(test_polymech PRIVATE Catch2::Catch2WithMain polymech postgres)
catch_discover_tests(test_polymech)
# E2E test polymech fetch_pages from live Supabase
add_executable(test_polymech_e2e e2e/test_polymech_e2e.cpp)
target_link_libraries(test_polymech_e2e PRIVATE Catch2::Catch2WithMain tomlplusplus::tomlplusplus logger postgres polymech json)
catch_discover_tests(test_polymech_e2e WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_ipc unit/test_ipc.cpp)
target_link_libraries(test_ipc PRIVATE Catch2::Catch2WithMain ipc)
catch_discover_tests(test_ipc)
add_executable(test_geo unit/test_geo.cpp)
target_link_libraries(test_geo PRIVATE Catch2::Catch2WithMain geo)
catch_discover_tests(test_geo)
add_executable(test_gadm_reader unit/test_gadm_reader.cpp)
target_link_libraries(test_gadm_reader PRIVATE Catch2::Catch2WithMain gadm_reader)
catch_discover_tests(test_gadm_reader WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_grid unit/test_grid.cpp)
target_link_libraries(test_grid PRIVATE Catch2::Catch2WithMain grid)
catch_discover_tests(test_grid WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
add_executable(test_search unit/test_search.cpp)
target_link_libraries(test_search PRIVATE Catch2::Catch2WithMain search)
catch_discover_tests(test_search WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
# Test targets — Catch2 executables registered with CTest.
#
# Fix: the previous version repeated the same three-line
# add_executable / target_link_libraries / catch_discover_tests pattern
# seventeen times. The declarations are now generated by one documented
# helper function, so linking Threads::Threads (and any future common
# setting) lives in exactly one place.
include(CTest)
include(Catch)

# pthread is required on Linux for Catch2 tests
find_package(Threads REQUIRED)

# polymech_add_test(<name> <source> [LIBS <libs...>] [REPO_WD])
#
# Declares one Catch2 test executable and registers its cases with CTest.
#   name    — target name (test_*)
#   source  — the single test source file
#   LIBS    — package libraries the test links against
#             (Catch2::Catch2WithMain and Threads::Threads are always added)
#   REPO_WD — run discovered tests from the repository root; needed by
#             tests that read fixtures such as config/postgres.toml or
#             cache/gadm relative to the source tree
function(polymech_add_test name source)
  cmake_parse_arguments(PARSE_ARGV 2 arg "REPO_WD" "" "LIBS")
  if(arg_UNPARSED_ARGUMENTS)
    message(FATAL_ERROR "polymech_add_test: unknown arguments: ${arg_UNPARSED_ARGUMENTS}")
  endif()
  add_executable(${name} ${source})
  target_link_libraries(${name} PRIVATE Catch2::Catch2WithMain ${arg_LIBS} Threads::Threads)
  if(arg_REPO_WD)
    catch_discover_tests(${name} WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
  else()
    catch_discover_tests(${name})
  endif()
endfunction()

# Unit tests — one per package
polymech_add_test(test_logger unit/test_logger.cpp LIBS logger)
polymech_add_test(test_html unit/test_html.cpp LIBS html)
polymech_add_test(test_postgres unit/test_postgres.cpp LIBS postgres)
polymech_add_test(test_json unit/test_json.cpp LIBS json)
polymech_add_test(test_http unit/test_http.cpp LIBS http)

# Functional test — end-to-end CLI
polymech_add_test(test_functional functional/test_cli.cpp
  LIBS CLI11::CLI11 tomlplusplus::tomlplusplus logger html postgres http json)

# E2E test — real Supabase connection (requires config/postgres.toml + network)
polymech_add_test(test_supabase e2e/test_supabase.cpp
  LIBS tomlplusplus::tomlplusplus logger postgres json
  REPO_WD)
polymech_add_test(test_postgres_live functional/test_postgres_live.cpp
  LIBS postgres search json logger tomlplusplus::tomlplusplus
  REPO_WD)

polymech_add_test(test_polymech unit/test_polymech.cpp LIBS polymech postgres)

# E2E test — polymech fetch_pages from live Supabase
polymech_add_test(test_polymech_e2e e2e/test_polymech_e2e.cpp
  LIBS tomlplusplus::tomlplusplus logger postgres polymech json
  REPO_WD)

polymech_add_test(test_ipc unit/test_ipc.cpp LIBS ipc)
polymech_add_test(test_geo unit/test_geo.cpp LIBS geo)
polymech_add_test(test_gadm_reader unit/test_gadm_reader.cpp LIBS gadm_reader REPO_WD)
polymech_add_test(test_grid unit/test_grid.cpp LIBS grid REPO_WD)
polymech_add_test(test_search unit/test_search.cpp LIBS search REPO_WD)
polymech_add_test(test_enrichers unit/test_enrichers.cpp LIBS enrichers)

View File

@ -0,0 +1,81 @@
#include <catch2/catch_test_macros.hpp>
#include "postgres/postgres.h"
#include "search/search.h"
#include "json/json.h"
#include "logger/logger.h"
#include <toml++/toml.h>
// Note: This test requires a valid config/postgres.toml pointing to a Supabase instance.
// We test against an arbitrary table 'test_items' or standard table.
// In this case we'll test against `grid_search_runs` since we know it exists,
// using a dummy uuid for testing.
// DO NOT RUN UNLESS CONFIGURED.
TEST_CASE("Postgres Live Operations", "[postgres_live]") {
    // Pull Supabase credentials from the repo-local TOML config. A missing or
    // malformed file means this environment is not configured for live tests,
    // so skip (via early return) rather than fail.
    std::string url;
    std::string key;
    try {
        auto config = toml::parse_file("config/postgres.toml");
        url = config["supabase"]["url"].value_or("");
        key = config["supabase"]["service_key"].value_or("");
    } catch (const std::exception &e) {
        WARN("Skipping postgres live tests. Config missing or invalid: " << e.what());
        return;
    }
    if (url.empty() || key.empty()) {
        WARN("Skipping postgres live tests. Supabase credentials missing.");
        return;
    }

    postgres::Config pg_cfg;
    pg_cfg.supabase_url = url;
    pg_cfg.supabase_key = key;
    postgres::init(pg_cfg);
    REQUIRE(postgres::ping() == "ok");

    // Fixed ids so reruns always target the same (disposable) row.
    const std::string test_id = "00000000-0000-0000-0000-0000000000cc";
    const std::string user_id = "3bb4cfbf-318b-44d3-a9d3-35680e738421";
    // PostgREST filter selecting exactly our test row.
    const std::string by_id = "id=eq." + test_id;

    SECTION("Insert, Query, Update, Upsert, Delete") {
        // Remove any leftover row from a previously aborted run.
        postgres::del("grid_search_runs", by_id);

        // Insert a fresh row, then read it back.
        const std::string insert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "test_run", "status": "searching", "request": {}})";
        const std::string inserted = postgres::insert("grid_search_runs", insert_body);
        const std::string fetched = postgres::query("grid_search_runs", "*", by_id);
        WARN("Insert Result: " << inserted);
        WARN("Query Result: " << fetched);
        REQUIRE(json::is_valid(fetched));
        REQUIRE(fetched.find("test_run") != std::string::npos);

        // Patch a single column in place.
        const std::string updated = postgres::update("grid_search_runs", R"({"status": "enriching"})", by_id);
        REQUIRE(json::is_valid(updated));
        REQUIRE(updated.find("error") == std::string::npos);

        // Upsert on the same id must overwrite the existing row.
        const std::string upsert_body = R"({"id": ")" + test_id + R"(", "user_id": ")" + user_id + R"(", "run_id": "upsert_run", "status": "complete", "request": {}})";
        const std::string upserted = postgres::upsert("grid_search_runs", upsert_body, "id");
        REQUIRE(upserted.find("error") == std::string::npos);
        // Read back to confirm the upsert took effect.
        const std::string refetched = postgres::query("grid_search_runs", "*", by_id);
        REQUIRE(refetched.find("upsert_run") != std::string::npos);

        // Delete the row and confirm it is gone.
        const std::string deleted = postgres::del("grid_search_runs", by_id);
        REQUIRE(json::is_valid(deleted));
        const std::string after_delete = postgres::query("grid_search_runs", "*", by_id);
        REQUIRE(after_delete == "[]");
    }
}

View File

@ -0,0 +1,115 @@
#include <catch2/catch_test_macros.hpp>
#include "enrichers/enrichers.h"
using namespace enrichers;
// ── is_likely_email ─────────────────────────────────────────────────────────
TEST_CASE("is_likely_email: valid emails", "[enrichers]") {
    // Plausible business addresses, incl. multi-part TLDs and hyphenated hosts.
    for (const char* addr : {"info@example.com", "john.doe@company.co.uk",
                             "contact@recycling-firm.de", "hello@my-domain.org"}) {
        CHECK(is_likely_email(addr));
    }
}
TEST_CASE("is_likely_email: rejects non-emails", "[enrichers]") {
    // Structurally broken inputs: empty, no '@', missing user, missing host.
    for (const char* addr : {"", "not-an-email", "@no-user.com", "user@"}) {
        CHECK_FALSE(is_likely_email(addr));
    }
}
TEST_CASE("is_likely_email: rejects asset extensions", "[enrichers]") {
    // Strings shaped like emails that are really image/css/js asset names.
    for (const char* addr : {"logo@site.png", "icon@site.svg", "style@site.css",
                             "script@site.js", "photo@site.jpg", "photo@site.webp"}) {
        CHECK_FALSE(is_likely_email(addr));
    }
}
TEST_CASE("is_likely_email: rejects placeholder/hash patterns", "[enrichers]") {
    // Template/demo addresses and hash-looking local parts must be filtered.
    for (const char* addr : {"user@example.com", "test@test.com", "a3f2b@hash.com",
                             "your@email.com", "email@email.com", "name@domain.com"}) {
        CHECK_FALSE(is_likely_email(addr));
    }
}
// ── extract_emails ──────────────────────────────────────────────────────────
TEST_CASE("extract_emails: finds emails in text", "[enrichers]") {
    // Both addresses embedded in prose must be extracted.
    auto emails = extract_emails("Contact us at info@example.org or sales@company.com");
    CHECK(emails.size() >= 2);
    bool found_info = false, found_sales = false;
    for (const auto& e : emails) {
        if (e == "info@example.org") found_info = true;
        if (e == "sales@company.com") found_sales = true;
    }
    CHECK(found_info);
    CHECK(found_sales);
}
TEST_CASE("extract_emails: deduplicates", "[enrichers]") {
    // The same address repeated three times collapses to one entry.
    auto emails = extract_emails("info@acme.org info@acme.org info@acme.org");
    CHECK(emails.size() == 1);
}
TEST_CASE("extract_emails: empty text returns empty", "[enrichers]") {
    auto emails = extract_emails("");
    CHECK(emails.empty());
}
TEST_CASE("extract_emails: filters out asset emails", "[enrichers]") {
    // logo@site.png is an asset reference, not a contact address.
    auto emails = extract_emails("logo@site.png info@real-company.de");
    // REQUIRE (not CHECK): with CHECK, a failed size check would continue and
    // index emails[0] on a possibly-empty vector — undefined behavior.
    REQUIRE(emails.size() == 1);
    CHECK(emails[0] == "info@real-company.de");
}
// ── resolve_url ─────────────────────────────────────────────────────────────
TEST_CASE("resolve_url: absolute stays absolute", "[enrichers]") {
    // A fully-qualified link is returned untouched.
    CHECK(resolve_url("https://example.com", "https://other.com/page") == "https://other.com/page");
}
TEST_CASE("resolve_url: relative path", "[enrichers]") {
    // A root-relative link replaces the whole path component of the base.
    CHECK(resolve_url("https://example.com/page", "/contact") == "https://example.com/contact");
}
TEST_CASE("resolve_url: protocol-relative", "[enrichers]") {
    // "//host/..." inherits only the scheme from the base URL.
    CHECK(resolve_url("https://example.com", "//other.com/foo") == "https://other.com/foo");
}
TEST_CASE("resolve_url: relative without slash", "[enrichers]") {
    // A document-relative link resolves against the base's directory.
    CHECK(resolve_url("https://example.com/dir/page", "about.html") == "https://example.com/dir/about.html");
}
// ── status_string ───────────────────────────────────────────────────────────
TEST_CASE("status_string: covers all statuses", "[enrichers]") {
    // Table-driven check: every enum value maps to its expected label.
    struct Row { EnrichStatus st; const char* label; };
    const Row rows[] = {
        {EnrichStatus::OK, "OK"},
        {EnrichStatus::NO_EMAIL, "NO_EMAIL"},
        {EnrichStatus::META_TIMEOUT, "META_TIMEOUT"},
        {EnrichStatus::EMAIL_TIMEOUT, "EMAIL_TIMEOUT"},
        {EnrichStatus::FETCH_ERROR, "FETCH_ERROR"},
        {EnrichStatus::NO_PAGES, "NO_PAGES"},
        {EnrichStatus::ERROR, "ERROR"},
    };
    for (const auto& row : rows) {
        CHECK(std::string(status_string(row.st)) == row.label);
    }
}
// ── EnrichConfig defaults ───────────────────────────────────────────────────
TEST_CASE("EnrichConfig: default values", "[enrichers]") {
    // A default-constructed config must carry the expected timeouts/limits
    // and non-empty built-in pattern/path lists.
    EnrichConfig cfg;
    CHECK(cfg.meta_timeout_ms == 20000);
    CHECK(cfg.email_timeout_ms == 30000);
    CHECK(cfg.email_page_timeout_ms == 10000);
    CHECK(cfg.email_max_pages == 8);
    CHECK(cfg.email_abort_after == 1);
    CHECK_FALSE(cfg.contact_patterns.empty());
    CHECK_FALSE(cfg.probe_paths.empty());
}

View File

@ -1,6 +1,14 @@
#include <catch2/catch_test_macros.hpp>
#include <string>
#include <thread>
#include <vector>
#include "html/html.h"
#include "html/html2md.h"
// ═══════════════════════════════════════════════════════
// html::parse / html::select (existing)
// ═══════════════════════════════════════════════════════
TEST_CASE("html::parse returns elements from valid HTML", "[html]") {
auto elements =
@ -22,15 +30,12 @@ TEST_CASE("html::parse returns elements from valid HTML", "[html]") {
TEST_CASE("html::parse returns empty for empty input", "[html]") {
    // An empty document yields no elements — the parser may produce an
    // empty body, but nothing should be reported.
    REQUIRE(html::parse("").empty());
}
TEST_CASE("html::parse handles nested elements", "[html]") {
auto elements = html::parse("<div><span>Nested</span></div>");
// Parent nodes (body, div) also get text "Nested" via node_text.
// Just verify that the span element is present among the results.
bool found_span = false;
for (const auto &el : elements) {
if (el.tag == "span" && el.text == "Nested") {
@ -61,3 +66,387 @@ TEST_CASE("html::select works with class selector", "[html][select]") {
REQUIRE(matches.size() == 1);
CHECK(matches[0] == "X");
}
// ═══════════════════════════════════════════════════════
// html2md — conversion & large-chunk robustness
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md basic conversion", "[html2md]") {
    // Heading and paragraph text must both survive into the markdown.
    const std::string markdown = html2md::Convert("<h1>Hello</h1><p>World</p>");
    CHECK(markdown.find("Hello") != std::string::npos);
    CHECK(markdown.find("World") != std::string::npos);
}
TEST_CASE("html2md empty input", "[html2md]") {
    CHECK(html2md::Convert("").empty());
}
TEST_CASE("html2md whitespace-only input", "[html2md]") {
    // Pure whitespace should produce (near-)empty output — and never crash.
    CHECK(html2md::Convert(" \n\t ").size() < 20);
}
// ---------- large payload stress tests ----------
// Builds `count` numbered <p> rows ("<p>Paragraph number N with some filler
// text.</p>\n") used as input by the stress tests below.
static std::string make_paragraphs(size_t count) {
    std::string out;
    out.reserve(count * 40);
    for (size_t n = 0; n < count; ++n) {
        out.append("<p>Paragraph number ")
           .append(std::to_string(n))
           .append(" with some filler text.</p>\n");
    }
    return out;
}
// Produces an <html><body>...</body></html> document of at least
// `target_bytes` by repeating one fixed lorem-ipsum paragraph row.
static std::string make_large_html(size_t target_bytes) {
    const std::string row = "<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor.</p>\n";
    std::string doc;
    doc.reserve(target_bytes + 256);
    doc += "<html><body>";
    // Append whole rows until the requested size is reached.
    while (doc.size() < target_bytes) {
        doc += row;
    }
    doc += "</body></html>";
    return doc;
}
TEST_CASE("html2md handles 64KB HTML", "[html2md][large]") {
    const std::string doc = make_large_html(64 * 1024);
    REQUIRE(doc.size() >= 64 * 1024);
    const std::string markdown = html2md::Convert(doc);
    CHECK(!markdown.empty());
    CHECK(markdown.find("Lorem ipsum") != std::string::npos);
}
TEST_CASE("html2md handles 512KB HTML", "[html2md][large]") {
    // Larger payloads: only survival + non-empty output is asserted.
    CHECK(!html2md::Convert(make_large_html(512 * 1024)).empty());
}
TEST_CASE("html2md handles 1MB HTML", "[html2md][large]") {
    CHECK(!html2md::Convert(make_large_html(1024 * 1024)).empty());
}
TEST_CASE("html2md 10K paragraphs", "[html2md][large]") {
    // The very last paragraph must survive — i.e. nothing was truncated.
    const std::string markdown = html2md::Convert(make_paragraphs(10000));
    CHECK(!markdown.empty());
    CHECK(markdown.find("Paragraph number 9999") != std::string::npos);
}
// ---------- deeply nested HTML ----------
TEST_CASE("html2md deeply nested divs (500 levels)", "[html2md][large]") {
    // 500 wrapper divs: converter must not blow the stack or lose the text.
    constexpr int depth = 500;
    std::string doc;
    for (int level = 0; level < depth; ++level) doc += "<div>";
    doc += "deep content";
    for (int level = 0; level < depth; ++level) doc += "</div>";
    CHECK(html2md::Convert(doc).find("deep content") != std::string::npos);
}
// ---------- wide table ----------
TEST_CASE("html2md wide table (200 columns)", "[html2md][large]") {
    // First and last cells of a 200-column row must both survive conversion.
    std::string doc = "<table><tr>";
    for (int col = 0; col < 200; ++col) {
        doc += "<td>C" + std::to_string(col) + "</td>";
    }
    doc += "</tr></table>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(!markdown.empty());
    CHECK(markdown.find("C0") != std::string::npos);
    CHECK(markdown.find("C199") != std::string::npos);
}
// ---------- concurrent conversion ----------
TEST_CASE("html2md concurrent conversions are thread-safe", "[html2md][threads]") {
    // Run the same conversion on 8 threads; every result must be complete.
    constexpr int kThreads = 8;
    const std::string doc = make_large_html(32 * 1024); // 32KB each
    std::vector<std::string> results(kThreads);
    std::vector<std::thread> workers;
    workers.reserve(kThreads);
    for (int i = 0; i < kThreads; ++i) {
        workers.emplace_back([&results, &doc, i] { results[i] = html2md::Convert(doc); });
    }
    for (auto& w : workers) w.join();
    for (const auto& r : results) {
        CHECK(!r.empty());
        CHECK(r.find("Lorem ipsum") != std::string::npos);
    }
}
// ═══════════════════════════════════════════════════════
// html2md — malformed / faulty HTML robustness
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md unclosed tags", "[html2md][faulty]") {
    // Tags left open mid-sentence must still yield the visible text.
    const std::string markdown = html2md::Convert("<p>Hello <b>bold <i>italic");
    CHECK(markdown.find("Hello") != std::string::npos);
    CHECK(markdown.find("bold") != std::string::npos);
}
TEST_CASE("html2md mismatched/overlapping tags", "[html2md][faulty]") {
    // Overlapping bold/italic ranges — the text must not be dropped.
    CHECK(html2md::Convert("<b>bold <i>both</b> italic</i>").find("bold") != std::string::npos);
}
TEST_CASE("html2md broken attributes", "[html2md][faulty]") {
    // Unterminated quote inside href — must not crash; output is unspecified.
    const std::string markdown = html2md::Convert(R"(<a href="http://example.com class="bad>Link</a>)");
    (void)markdown;
}
TEST_CASE("html2md bare text (no tags)", "[html2md][faulty]") {
    CHECK(html2md::Convert("Just plain text, no HTML at all.").find("Just plain text") != std::string::npos);
}
TEST_CASE("html2md random binary noise", "[html2md][faulty]") {
    // Full 0-255 byte range — previously crashed on MSVC debug builds due to
    // signed char passed to isspace() without unsigned cast. Fixed in html2md.cpp.
    std::string noise(4096, '\0');
    for (size_t i = 0; i < noise.size(); ++i) {
        noise[i] = static_cast<char>((i * 131 + 17) % 256);
    }
    // Survival only — no assertions on the produced content.
    (void)html2md::Convert(noise);
}
TEST_CASE("html2md truncated document", "[html2md][faulty]") {
    // Document ends abruptly mid-table; the first cell must still appear.
    CHECK(html2md::Convert("<html><body><table><tr><td>Cell1</td><td>Cell2").find("Cell1") != std::string::npos);
}
TEST_CASE("html2md script and style tags", "[html2md][faulty]") {
    // <script>/<style> bodies are non-content and must be stripped entirely,
    // while the surrounding paragraphs survive.
    const std::string doc = R"(
<p>Before</p>
<script>alert('xss');</script>
<style>.foo { color: red; }</style>
<p>After</p>
)";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.find("Before") != std::string::npos);
    CHECK(markdown.find("After") != std::string::npos);
    CHECK(markdown.find("alert") == std::string::npos);
}
TEST_CASE("html2md null bytes in input", "[html2md][faulty]") {
    // An embedded NUL may truncate processing or be skipped — the only
    // contract here is that conversion does not crash.
    std::string doc = "<p>Hello";
    doc += '\0';
    doc += "World</p>";
    (void)html2md::Convert(doc);
}
// ═══════════════════════════════════════════════════════
// html2md — web scraper real-world edge cases
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md UTF-8 multibyte (CJK, Arabic, emoji)", "[html2md][scraper]") {
// Mixed-script content: Japanese, Arabic, Latin diacritics, emoji, and
// CJK/English in one paragraph. Only the ASCII token "Emoji" is asserted,
// so the test does not depend on how non-ASCII bytes are passed through.
std::string html =
"<h1>日本語テスト</h1>"
"<p>مرحبا بالعالم</p>"
"<p>Ñoño señor über straße</p>"
"<p>Emoji: 🚀🔥💀👻 and 中文混合English</p>";
std::string md = html2md::Convert(html);
CHECK(md.find("Emoji") != std::string::npos);
}
TEST_CASE("html2md BOM prefix", "[html2md][scraper]") {
// UTF-8 BOM (EF BB BF) prepended — common from Windows-origin pages.
// The BOM must not prevent the body content from being converted.
std::string html = "\xEF\xBB\xBF<html><body><p>Content after BOM</p></body></html>";
std::string md = html2md::Convert(html);
CHECK(md.find("Content after BOM") != std::string::npos);
}
TEST_CASE("html2md entity soup", "[html2md][scraper]") {
// Valid entities (&euro; &amp; &nbsp; &mdash; ...) mixed with malformed
// ones (&notreal; &#999999; &#xZZZZ;) — must not crash; only the ASCII
// word "Price" is asserted since entity rendering may vary.
std::string html =
"<p>Price: &euro;10 &amp; &lt;20&gt; items</p>"
"<p>&nbsp;&nbsp;&nbsp;indented &mdash; dashes &ndash; more</p>"
"<p>Bad entity: &notreal; and &#999999; and &#xZZZZ;</p>";
std::string md = html2md::Convert(html);
CHECK(md.find("Price") != std::string::npos);
}
TEST_CASE("html2md CDATA and comments", "[html2md][scraper]") {
    // Comments (incl. a commented-out script and a multi-line one) plus a
    // CDATA section — surrounding paragraphs must survive conversion.
    const std::string doc =
        "<p>Before</p>"
        "<!-- <script>alert('xss')</script> -->"
        "<![CDATA[This is raw <data> & stuff]]>"
        "<!-- multi\nline\ncomment -->"
        "<p>After</p>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.find("Before") != std::string::npos);
    CHECK(markdown.find("After") != std::string::npos);
}
TEST_CASE("html2md deeply nested inline tags", "[html2md][scraper]") {
    // WYSIWYG editors can emit absurdly nested inline formatting; 100 layers
    // of span/b/i/em/strong yield piles of **/* markers — just verify no
    // crash and non-empty output.
    std::string doc = "<p>";
    for (int i = 0; i < 100; ++i) doc += "<span><b><i><em><strong>";
    doc += "deep text";
    for (int i = 0; i < 100; ++i) doc += "</strong></em></i></b></span>";
    doc += "</p>";
    CHECK(!html2md::Convert(doc).empty());
}
TEST_CASE("html2md huge single line (no newlines)", "[html2md][scraper]") {
    // Minified HTML — one giant ~200KB line containing 5000 span items;
    // first and last items must both appear in the output.
    std::string doc;
    doc.reserve(200 * 1024);
    doc += "<html><body>";
    for (int i = 0; i < 5000; ++i) {
        doc += "<div><span class=\"c" + std::to_string(i) + "\">item" +
               std::to_string(i) + "</span></div>";
    }
    doc += "</body></html>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.find("item0") != std::string::npos);
    CHECK(markdown.find("item4999") != std::string::npos);
}
TEST_CASE("html2md data URI in img src", "[html2md][scraper]") {
    // A base64 data: URI inside <img src> must not swallow surrounding text.
    const std::string doc =
        "<p>Before image</p>"
        "<img src=\"data:image/png;base64,iVBORw0KGgoAAAANSU"
        "hEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwAD"
        "hgGAWjR9awAAAABJRU5ErkJggg==\" alt=\"pixel\">"
        "<p>After image</p>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.find("Before image") != std::string::npos);
    CHECK(markdown.find("After image") != std::string::npos);
}
TEST_CASE("html2md mixed Latin-1 and UTF-8 bytes", "[html2md][scraper]") {
    // Bytes 0x80-0xFF that are NOT valid UTF-8 — typical of pages served
    // with a wrong charset declaration. Only the ASCII prefix is asserted.
    const std::string doc = "<p>caf\xe9 na\xefve r\xe9sum\xe9</p>"; // café naïve résumé in Latin-1
    CHECK(html2md::Convert(doc).find("caf") != std::string::npos);
}
TEST_CASE("html2md HTML with HTTP headers prepended", "[html2md][scraper]") {
    // A raw HTTP response leaked into scraper output — the HTML body text
    // must still be recoverable after the header block.
    const std::string doc =
        "HTTP/1.1 200 OK\r\n"
        "Content-Type: text/html; charset=utf-8\r\n"
        "Content-Length: 42\r\n"
        "\r\n"
        "<html><body><p>Real content</p></body></html>";
    CHECK(html2md::Convert(doc).find("Real content") != std::string::npos);
}
TEST_CASE("html2md Google Maps / Places markup soup", "[html2md][scraper]") {
// Simplified version of real Google Places HTML with data attributes,
// inline styles, aria labels, and deeply nested structure
std::string html = R"(
<div class="section-result" data-result-index="0" jsaction="pane.resultSection.click">
<div class="section-result-title">
<span><span>Müller's Büro & Café</span></span>
</div>
<div class="section-result-details">
<span class="section-result-location">Königstraße 42, München</span>
<span class="section-result-rating">
<span aria-label="4.5 stars"></span>
<span>(1,234)</span>
</span>
</div>
<div style="display:none" aria-hidden="true">
<script type="application/ld+json">{"@type":"LocalBusiness","name":"test"}</script>
</div>
</div>
)";
std::string md = html2md::Convert(html);
// The UTF-8 business name and city (umlauts/accents) must survive the
// attribute-heavy wrapper markup intact.
CHECK(md.find("Café") != std::string::npos);
CHECK(md.find("München") != std::string::npos);
}
// ═══════════════════════════════════════════════════════
// html2md — output amplification & pathological input
// ═══════════════════════════════════════════════════════
TEST_CASE("html2md nested blockquotes (output amplification)", "[html2md][amplification]") {
    // Each <blockquote> level adds a ">" prefix per markdown line; with 50
    // levels the output must stay linear (≈100 chars of prefixes + text,
    // well under 4 KB), never exponential.
    std::string doc;
    for (int i = 0; i < 50; ++i) doc += "<blockquote>";
    doc += "<p>deep quote</p>";
    for (int i = 0; i < 50; ++i) doc += "</blockquote>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.size() < 4096);
    CHECK(!markdown.empty());
}
TEST_CASE("html2md very long attribute value", "[html2md][amplification]") {
    // A 1MB href stresses attribute extraction — must survive without crash.
    const std::string doc = "<a href=\"" + std::string(1024 * 1024, 'A') + "\">Click</a>";
    CHECK(!html2md::Convert(doc).empty());
}
TEST_CASE("html2md 10K unclosed p tags", "[html2md][amplification]") {
    // Every unclosed <p> emits paragraph breaks — output grows, but the text
    // must be present and generation must stay within reasonable bounds.
    std::string doc;
    doc.reserve(50000);
    for (int i = 0; i < 10000; ++i) doc += "<p>text";
    const std::string markdown = html2md::Convert(doc);
    CHECK(!markdown.empty());
    CHECK(markdown.find("text") != std::string::npos);
}
TEST_CASE("html2md output-to-input ratio check", "[html2md][amplification]") {
    // For representative div/p markup, stripping tags should make the
    // markdown strictly smaller than the HTML it came from.
    std::string doc;
    doc.reserve(100 * 1024);
    doc += "<html><body>";
    for (int i = 0; i < 1000; ++i) {
        doc += "<div class=\"wrapper\"><p class=\"content\">Paragraph " +
               std::to_string(i) + " with some text.</p></div>\n";
    }
    doc += "</body></html>";
    const std::string markdown = html2md::Convert(doc);
    CHECK(markdown.size() < doc.size());
    CHECK(markdown.size() > 0);
}
TEST_CASE("html2md pathological repeated angle brackets", "[html2md][amplification]") {
    // 8192 bare "<" characters — the tag parser must terminate, not loop.
    (void)html2md::Convert(std::string(8192, '<'));
}